Training and validation epoch logs are not the same length

I want to save each epoch's loss and accuracy and plot them with matplotlib. Instead of using TensorBoard, I log the values into two pairs of lists, one pair for training and one for validation. These lists, however, do not end up the same length. The plotting I have in mind is sketched just below, and my LightningModule code follows it.
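A minimal matplotlib sketch of the plot I'm after (it assumes the two loss lists are the same length; engine stands for a trained instance of the Engine class below):

import matplotlib.pyplot as plt

# One x value per epoch; both curves share the same axis,
# which is exactly what breaks when the lists differ in length.
epochs = range(len(engine.train_losses))
plt.plot(epochs, engine.train_losses, label='train loss')
plt.plot(epochs, engine.valid_losses, label='valid loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()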

import torch
import torch.nn.functional as F
import pytorch_lightning as pl


class Engine(pl.LightningModule):
    """
    Multi-class Classification Engine
    """
    
    learning_rate = 1e-3
    
    def __init__(self):
        super().__init__()
        # Loss function for multi-class classification
        # (note: the step methods below call F.cross_entropy directly)
        self.loss_fn = torch.nn.CrossEntropyLoss()
        # Per-epoch history, appended to in the *_epoch_end hooks
        self.train_losses = []
        self.valid_losses = []
        self.train_accuracies = []
        self.valid_accuracies = []
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        # Count correct predictions in this batch
        labels_hat = torch.argmax(y_hat, dim=1)
        n_correct_pred = torch.sum(y == labels_hat).item()
        loss = F.cross_entropy(y_hat, y.long())
        # n_correct_pred is a raw count here, not an accuracy
        tensorboard_logs = {'train_acc_step': n_correct_pred, 'train_loss_step': loss}
        return {'loss': loss, 'n_correct_pred': n_correct_pred, 'n_pred': len(y), 'log': tensorboard_logs}

    def training_epoch_end(self, outputs):
        # Aggregate the per-batch results collected over the epoch
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        train_acc = sum(x['n_correct_pred'] for x in outputs) / sum(x['n_pred'] for x in outputs)
        # Built for tensorboard but not returned here, so effectively unused
        tensorboard_logs = {'train_acc': train_acc, 'train_loss': avg_loss, 'step': self.current_epoch}
        self.train_losses.append(avg_loss.detach().cpu().item())
        self.train_accuracies.append(train_acc)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y.long())
        # Count correct predictions in this batch
        labels_hat = torch.argmax(y_hat, dim=1)
        n_correct_pred = torch.sum(y == labels_hat).item()
        return {'val_loss': loss, 'n_correct_pred': n_correct_pred, 'n_pred': len(y)}

    def validation_epoch_end(self, outputs):
        # Aggregate the per-batch validation results for the epoch
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = sum(x['n_correct_pred'] for x in outputs) / sum(x['n_pred'] for x in outputs)
        tensorboard_logs = {'val_loss': avg_loss, 'val_acc': val_acc, 'step': self.current_epoch}
        self.valid_losses.append(avg_loss.detach().cpu().item())
        self.valid_accuracies.append(val_acc)
        return {'log': tensorboard_logs}

As you can see, valid_losses always ends up longer than train_losses.

I solved it!!!
For anyone who runs into the same thing: set num_sanity_val_steps=0 on the Trainer. By default, PyTorch Lightning runs a couple of validation batches as a sanity check before training starts, so validation_epoch_end fires once before the first training epoch and appends an extra entry to the validation lists.
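A minimal sketch of the fix (the max_epochs value is a placeholder; the relevant argument is num_sanity_val_steps):

import pytorch_lightning as pl

# Disable the pre-training validation sanity check so that
# validation_epoch_end runs exactly once per real epoch.
trainer = pl.Trainer(max_epochs=10, num_sanity_val_steps=0)
trainer.fit(engine)

If you would rather keep the sanity check, you can instead skip the bookkeeping while it is running. The trainer exposes a flag for this (sanity_checking in recent Lightning versions, running_sanity_check in older ones, so check your version's docs):

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = sum(x['n_correct_pred'] for x in outputs) / sum(x['n_pred'] for x in outputs)
        # Only record history for real validation epochs,
        # not for the pre-training sanity check
        if not self.trainer.sanity_checking:
            self.valid_losses.append(avg_loss.detach().cpu().item())
            self.valid_accuracies.append(val_acc)
        return {'log': {'val_loss': avg_loss, 'val_acc': val_acc, 'step': self.current_epoch}}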
