Metrics Not Lining Up With sklearn

For the life of me I can’t get the PyTorch Lightning Metrics to give me the same answer as the sklearn.metrics.

sklearn give me:

Accuracy: 0.8274366863905326
Precision: 0.004664135377115268
Recall: 0.7605095541401274
F1: 0.009271410046357051

When getting Precision, Recall, and F1 I have used the following:

  • no parameters

    [{‘test_accuracy’: 0.8302674293518066,
    ‘test_precision’: 0.004992614965885878,
    ‘test_recall’: 0.1422896534204483,
    ‘test_F1’: 0.0090402876958251}]

  • num_classes=2

    [{‘test_accuracy’: 0.8249880075454712,
    ‘test_precision’: 0.8249880075454712,
    ‘test_recall’: 0.8249880075454712,
    ‘test_F1’: 0.8249880075454712}]

  • multilabel=True

    [{‘test_accuracy’: 0.8274366855621338,
    ‘test_precision’: 0.004944399930536747,
    ‘test_recall’: 0.14596910774707794,
    ‘test_F1’: 0.008971753530204296}]

  • num_classes=2, multilabel=True

    [{‘test_accuracy’: 0.8220030665397644,
    ‘test_precision’: 0.004781993106007576,
    ‘test_recall’: 0.1394656002521515,
    ‘test_F1’: 0.8220030665397644}]

Anyone have any thoughts?

Hi there! We try to ensure that our metrics are rigorously tested against sklearn. Would you mind sharing any code to reproduce this?

You can find these tests comparing sklearn’s metrics and ours here: pytorch-lightning/test_precision_recall.py at master · PyTorchLightning/pytorch-lightning · GitHub. Could be helpful in ensuring you are supplying the correct arguments.

unfortunately the dataset is proprietary so I can’t share that.
in essence the:

  • y (truth): a long list of 0’s and 1’s
  • y_hat (predictions): a long list of 0’s and 1’s

Here is the code:

class Model(pl.LightningModule):
def __init__(self, input_size):
    super().__init__()
    self.input_size = input_size
    
    #Layers
    self.fc1 = torch.nn.Linear(self.input_size, 10) 
    self.fc2 = torch.nn.Linear(10, 2)
    self.relu = torch.nn.ReLU()
    
def forward(self,x):
    out = self.relu(self.fc1(x))
    out = self.fc2(out)
    return out

class LitClassifier(pl.LightningModule):
def __init__(self, train_data, model, batch_size=32, learning_rate = 1e-3):
    super().__init__()
    
    self.train_data = train_data
    self.model = model       
    
    self.batch_size = batch_size # must have for batch_size tuning
    self.learning_rate = learning_rate # must have for lr tuning
    
    # Metrics
    self.train_accuracy = pl.metrics.Accuracy()
    self.test_accuracy = pl.metrics.Accuracy()
    self.test_F1 = pl.metrics.F1()
    self.test_precision = pl.metrics.Precision()
    self.test_recall = pl.metrics.Recall()
   
    # Test Truth and Predictions
    self.test_y = []
    self.test_y_hat = []        
    
def configure_optimizers(self):
    return torch.optim.Adam(self.parameters())
    
def training_step(self, train_batch, batch_idx):
    X_batch = train_batch[0]
    y_batch = train_batch[1]
    y_hat_batch = self.model(X_batch)
    loss = F.cross_entropy(y_hat_batch, y_batch) # CrossEntropyLoss
    
    #Logs
    self.log('training_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
    self.log('train_accuracy', self.train_accuracy(y_hat_batch, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)

    return loss

def test_step(self, test_batch, batch_idx):
    X_batch = test_batch[0]
    y_batch = test_batch[1]
    y_hat_batch = self.model(X_batch)
    _, y_hat_batch_tags = torch.max(y_hat_batch, dim=-1)
    
    # Metric Logs (DON'T GIVE ACCURATE VALUES)
    self.log('test_accuracy_1', self.test_accuracy(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
    self.log('test_precision_1', self.test_precision(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
    self.log('test_recall_1', self.test_recall(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
    self.log('test_F1_1', self.test_F1(y_hat_batch_tags, y_batch), on_step=False, on_epoch=True, prog_bar=True, logger=True)
   
    #Save Test Truth and Predictions
    y_batch = y_batch.cpu().numpy().tolist()
    self.test_y.extend(y_batch)
    
    y_hat_batch = y_hat_batch_tags.cpu().numpy().tolist()
    self.test_y_hat.extend(y_hat_batch)
    
    return

#######################################
# Dataset and Dataloader (multi-gpu)
#######################################
def setup(self, stage):
    if stage == 'fit':
        #Convert training data to a TorchDataset
        self.train_dataset = TorchDataset(self.train_data.X,self.train_data.Y[:,0])

def train_dataloader(self):
    # For this Experiment, the train_dataloader will be ran on every epoch
    # This give all targets and the same number of randomly sampled non-targets from the train_data
    sampled_X, sampled_Y = self.train_data.getSample(material_index=0)
    sampled_train_dataset = TorchDataset(sampled_X,sampled_Y[:,0])
    train_dataloader = DataLoader(sampled_train_dataset, batch_size=self.batch_size, num_workers=24)
    return train_dataloader

class MyCallbacks(pl.Callback):

# After the test loop is finished
def on_test_end(self, trainer, pl_module):
    
    #Log metrics to MLFlow
mlf_logger.experiment.log_metric(RUN_ID,key="test_accuracy_2",
    value=sklearn.metrics.accuracy_score(pl_module.test_y,pl_module.test_y_hat))
mlf_logger.experiment.log_metric(RUN_ID,key="test_precision_2",
    value=sklearn.metrics.precision_score(pl_module.test_y,pl_module.test_y_hat))
mlf_logger.experiment.log_metric(RUN_ID,key="test_recall_2",
    value=sklearn.metrics.recall_score(pl_module.test_y,pl_module.test_y_hat))
mlf_logger.experiment.log_metric(RUN_ID,key="test_f1_2",
    value=sklearn.metrics.f1_score(pl_module.test_y,pl_module.test_y_hat))

# TRAIN
EPOCHS = 100
BATCH_SIZE = 50

model = Model(input_size=train_data.spectra_length)
classifier = LitClassifier(train_data, model, batch_size = BATCH_SIZE)

trainer = pl.Trainer(gpus=1,
                              max_epochs=EPOCHS,
                              auto_lr_find=False, 
                              reload_dataloaders_every_epoch=True, 
                              logger=[mlf_logger, tb_logger],
                              callbacks=[MyCallbacks()]) #sample
trainer.fit(classifier)

# TEST
BATCH_SIZE = 4096
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=24)
trainer.test(classifier,test_dataloader)

The PyTorch LIghtning logging in the test_step (played around with different paramters as stated in the OP) is different than the logging in the on_test_end callback (using sklearn).