Plotting the Training and Validation Loss Curves for the Transformer Model


from tensorflow.keras.optimizers import Adam

from tensorflow.keras.optimizers.schedules import LearningRateSchedule

from tensorflow.keras.metrics import Mean

from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, float32, GradientTape, function

from keras.losses import sparse_categorical_crossentropy

from model import TransformerModel

from prepare_dataset import PrepareDataset

from time import time

from pickle import dump

 

 

# Define the model parameters

h = 8  # Number of self-attention heads

d_k = 64  # Dimensionality of the linearly projected queries and keys

d_v = 64  # Dimensionality of the linearly projected values

d_model = 512  # Dimensionality of the model layers' outputs

d_ff = 2048  # Dimensionality of the inner fully connected layer

n = 6  # Number of layers in the encoder stack

 

# Define the training parameters

epochs = 20

batch_size = 64

beta_1 = 0.9

beta_2 = 0.98

epsilon = 1e-9

dropout_rate = 0.1

 

 

# Implementing a learning rate scheduler

class LRScheduler(LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000, **kwargs):

        super(LRScheduler, self).__init__(**kwargs)

 

        self.d_model = cast(d_model, float32)

        self.warmup_steps = warmup_steps

 

    def __call__(self, step_num):

 

        # Linearly increasing the learning rate for the first warmup_steps, and decreasing it thereafter

        arg1 = step_num ** -0.5

        arg2 = step_num * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

 

 

# Instantiate an Adam optimizer

optimizer = Adam(LRScheduler(d_model), beta_1, beta_2, epsilon)
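# Note: the scheduler above implements the learning rate formula from "Attention Is All You Need"
# (Vaswani et al., 2017): lrate = d_model^(-0.5) * min(step_num^(-0.5), step_num * warmup_steps^(-1.5)).
# The Adam hyperparameters used here (beta_1 = 0.9, beta_2 = 0.98, epsilon = 1e-9) also follow that paper.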

 

# Prepare the training dataset

dataset = PrepareDataset()

trainX, trainY, valX, valY, train_orig, val_orig, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size = dataset('english-german.pkl')

 

print(enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

 

# Prepare the training dataset batches

train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))

train_dataset = train_dataset.batch(batch_size)

 

# Prepare the validation dataset batches

val_dataset = data.Dataset.from_tensor_slices((valX, valY))

val_dataset = val_dataset.batch(batch_size)

 

# Create model

training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

 

 

# Defining the loss function

def loss_fcn(target, prediction):

    # Create a mask so that the zero padding values are not included in the computation of loss

    padding_mask = math.logical_not(equal(target, 0))

    padding_mask = cast(padding_mask, float32)

    # Compute a sparse categorical cross-entropy loss on the unmasked values

    loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * padding_mask

    # Compute the mean loss over the unmasked values

    return reduce_sum(loss) / reduce_sum(padding_mask)

 

 

# Defining the accuracy function

def accuracy_fcn(target, prediction):

    # Create a mask so that the zero padding values are not included in the computation of accuracy

    padding_mask = math.logical_not(equal(target, 0))

    # Find equal prediction and target values, and apply the padding mask

    accuracy = equal(target, argmax(prediction, axis=2))

    accuracy = math.logical_and(padding_mask, accuracy)

    # Cast the True/False values to 32-bit-precision floating-point numbers

    padding_mask = cast(padding_mask, float32)

    accuracy = cast(accuracy, float32)

    # Compute the mean accuracy over the unmasked values

    return reduce_sum(accuracy) / reduce_sum(padding_mask)

 

 

# Include metrics monitoring

train_loss = Mean(name='train_loss')

train_accuracy = Mean(name='train_accuracy')

val_loss = Mean(name='val_loss')

 

# Create a checkpoint object and manager to manage multiple checkpoints

ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)

ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=None)

 

# Initialise dictionaries to store the training and validation losses

train_loss_dict = {}

val_loss_dict = {}

 

# Speeding up the training process

@function

def train_step(encoder_input, decoder_input, decoder_output):

    with GradientTape() as tape:

 

        # Run the forward pass of the model to generate a prediction

        prediction = training_model(encoder_input, decoder_input, training=True)

 

        # Compute the training loss

        loss = loss_fcn(decoder_output, prediction)

 

        # Compute the training accuracy

        accuracy = accuracy_fcn(decoder_output, prediction)

 

    # Retrieve gradients of the trainable variables with respect to the training loss

    gradients = tape.gradient(loss, training_model.trainable_weights)

 

    # Update the values of the trainable variables by gradient descent

    optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))

 

    train_loss(loss)

    train_accuracy(accuracy)
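# Note: the @function decorator above traces train_step into a static TensorFlow graph on its first call,
# so subsequent calls run the compiled graph rather than eager Python code, which speeds up training.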

 

 

for epoch in range(epochs):

 

    train_loss.reset_states()

    train_accuracy.reset_states()

    val_loss.reset_states()

 

    print("\nStart of epoch %d" % (epoch + 1))

 

    start_time = time()

 

    # Iterate over the dataset batches

    for step, (train_batchX, train_batchY) in enumerate(train_dataset):

 

        # Define the encoder and decoder inputs, and the decoder output

        encoder_input = train_batchX[:, 1:]

        decoder_input = train_batchY[:, :-1]

        decoder_output = train_batchY[:, 1:]

 

        train_step(encoder_input, decoder_input, decoder_output)

 

        if step % 50 == 0:

            print(f'Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

 

    # Run a validation step after every epoch of training

    for val_batchX, val_batchY in val_dataset:

 

        # Define the encoder and decoder inputs, and the decoder output

        encoder_input = val_batchX[:, 1:]

        decoder_input = val_batchY[:, :-1]

        decoder_output = val_batchY[:, 1:]

 

        # Generate a prediction

        prediction = training_model(encoder_input, decoder_input, training=False)

 

        # Compute the validation loss

        loss = loss_fcn(decoder_output, prediction)

        val_loss(loss)

 

    # Print epoch number and the accuracy and loss values at the end of every epoch

    print("Epoch %d: Training Loss %.4f, Training Accuracy %.4f, Validation Loss %.4f" % (epoch + 1, train_loss.result(), train_accuracy.result(), val_loss.result()))

 

    # Save a checkpoint after each epoch

    if (epoch + 1) % 1 == 0:

 

        save_path = ckpt_manager.save()

        print("Saved checkpoint at epoch %d" % (epoch + 1))

 

        # Save the trained model weights

        training_model.save_weights("weights/wghts" + str(epoch + 1) + ".ckpt")

 

        train_loss_dict[epoch] = train_loss.result()

        val_loss_dict[epoch] = val_loss.result()

 

# Save the training loss values

with open('./train_loss.pkl', 'wb') as file:

    dump(train_loss_dict, file)

 

# Save the validation loss values

with open('./val_loss.pkl', 'wb') as file:

    dump(val_loss_dict, file)

 

print("Total time taken: %.2fs" % (time() - start_time))
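Once training completes, the two pickle files can be read back in order to plot the training and validation loss curves that give this post its title. The following is a minimal sketch under the assumption that matplotlib is installed and that each pickled dictionary maps a zero-based epoch index to the loss value saved above; the variable names are only for illustration:

from pickle import load
import matplotlib.pyplot as plt

# Load the pickled training and validation loss dictionaries
with open('./train_loss.pkl', 'rb') as file:
    train_losses = load(file)

with open('./val_loss.pkl', 'rb') as file:
    val_losses = load(file)

# Plot both loss curves against the epoch number
epoch_numbers = [epoch + 1 for epoch in train_losses.keys()]
plt.plot(epoch_numbers, [float(loss) for loss in train_losses.values()], label='Training loss')
plt.plot(epoch_numbers, [float(loss) for loss in val_losses.values()], label='Validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()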

