Multi-GPU Inference

Hi everyone,

I am trying to run inference (prediction) with my LightningModule on multiple GPUs. The module is as follows:

class DistilBERTRegressor(pl.LightningModule):
    """DistilBERT encoder followed by a three-layer linear head.

    The head maps the encoder's hidden state at sequence position 0 to a
    single value per input sequence, and the module is trained with a
    root-mean-squared-error loss.

    Keys read from ``config``:
        * ``config['bert']['name']`` and ``config['bert']['config']`` --
          forwarded to ``DistilBertModel.from_pretrained``.
        * ``config['dropout']`` -- dropout probability applied before the head.
        * ``config['fc']['linear1']`` / ``config['fc']['linear2']`` -- output
          widths of the first and second linear layers.
        * ``config['lr']`` / ``config['weight_decay']`` -- AdamW settings.

    The whole ``config`` is also handed to ``get_scheduler``.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        self.dbert = DistilBertModel.from_pretrained(
            config['bert']['name'],
            config=config['bert']['config'],
        )

        fc_sizes = self.config['fc']
        self.drop = nn.Dropout(p=config['dropout'])
        self.linear1 = nn.Linear(self.dbert.config.hidden_size, fc_sizes['linear1'])
        self.linear2 = nn.Linear(fc_sizes['linear1'], fc_sizes['linear2'])
        self.linear3 = nn.Linear(fc_sizes['linear2'], 1)

        # Only the first two layers get an explicit Xavier init; linear3 keeps
        # whatever nn.Linear initialises it with.
        torch.nn.init.xavier_uniform_(self.linear1.weight)
        torch.nn.init.xavier_uniform_(self.linear2.weight)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and the linear head.

        Args:
            input_ids: token ids, passed straight to DistilBERT.
            attention_mask: attention mask, passed straight to DistilBERT.

        Returns:
            Output of ``linear3``, i.e. one value per row of the batch.
        """
        dbert_out = self.dbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )

        # Hidden state at sequence position 0 (presumably the [CLS] token --
        # depends on how the inputs were tokenised).
        first_token = dbert_out.last_hidden_state[:, 0, :]

        # NOTE(review): there is no activation between the linear layers, so
        # the three of them compose into a single affine map. Left as-is to
        # keep the model's behaviour unchanged.
        hidden = self.linear1(self.drop(first_token))
        hidden = self.linear2(hidden)
        return self.linear3(hidden)

    def compute_loss(self, yhat, y):
        """Return the root-mean-squared error between ``yhat`` and ``y``.

        ``y`` is reshaped to a single column so that it lines up with the
        one-column output of ``forward``.
        """
        y = y.reshape(-1, 1)
        return torch.sqrt(F.mse_loss(yhat, y))

    def _shared_step(self, batch):
        """Forward pass plus loss for a labelled batch (train/validation)."""
        outputs = self(batch['input_ids'], batch['attention_mask'])
        # Cast targets to the dtype/device of the predictions before the loss.
        return self.compute_loss(outputs, batch['target'].type_as(outputs))

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._shared_step(batch)

        self.log("train_loss", loss, prog_bar=True, logger=True, sync_dist=True)

        return {
            'loss': loss,
        }

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._shared_step(batch)

        self.log("val_loss", loss, prog_bar=True, logger=True, sync_dist=True)

        return {
            'val_loss': loss,
        }

    def predict_step(self, batch, batch_idx):
        """Return the model output for one batch.

        Only ``'input_ids'`` and ``'attention_mask'`` are read from ``batch``;
        a ``'target'`` entry is not required, so unlabelled data can be used.

        NOTE(review): when predicting with a multi-device strategy, each
        process presumably only produces the predictions for its own shard
        of the data. Confirm against the Lightning docs; a prediction-writer
        callback is the documented way to persist per-process outputs.
        """
        return self(batch['input_ids'], batch['attention_mask'])

    def configure_optimizers(self):
        """Build the AdamW optimizer and the scheduler from ``get_scheduler``."""
        optimizer = optim.AdamW(
            self.parameters(),
            lr=self.config['lr'],
            weight_decay=self.config['weight_decay'],
        )
        scheduler = get_scheduler(optimizer, self.config)

        return dict(
            optimizer=optimizer,
            lr_scheduler=scheduler,
        )

The problem is that I cannot get the full set of prediction results from it. Could you please help me out?

Hi @Adityam_Ghosh, could you please explain more about the problem you are facing? What is the expected result, and what do you get instead? An example would help.

The inference results are split across different machines and are not gathered automatically. I think this issue has not been solved yet.