If I switch to `distributed_backend=None` when initializing the trainer and use gpu=1, the following error occurs:
Traceback (most recent call last):
File "main.py", line 49, in <module>
trainer.fit(model, train_loader, val_loader)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 439, in fit
results = self.accelerator_backend.train()
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py", line 54, in train
results = self.train_or_test()
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 66, in train_or_test
results = self.trainer.train()
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 482, in train
self.train_loop.run_training_epoch()
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 541, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 678, in run_training_batch
self.trainer.hiddens
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 760, in training_step_and_backward
result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 304, in training_step
training_step_output = self.trainer.accelerator_backend.training_step(args)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py", line 62, in training_step
output = self.__training_step(args)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py", line 68, in __training_step
batch = self.to_device(batch)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/accelerators/gpu_accelerator.py", line 113, in to_device
return self.batch_to_device(batch, gpu_id)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 72, in batch_to_device
return model.transfer_batch_to_device(batch, device)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/core/hooks.py", line 555, in transfer_batch_to_device
return move_data_to_device(batch, device)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/utilities/apply_func.py", line 125, in move_data_to_device
return apply_to_collection(batch, dtype=dtype, function=batch_to)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/utilities/apply_func.py", line 58, in apply_to_collection
return elem_type([apply_to_collection(d, dtype, function, *args, **kwargs) for d in data])
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/utilities/apply_func.py", line 58, in <listcomp>
return elem_type([apply_to_collection(d, dtype, function, *args, **kwargs) for d in data])
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/utilities/apply_func.py", line 49, in apply_to_collection
return function(data, *args, **kwargs)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/pytorch_lightning/utilities/apply_func.py", line 122, in batch_to
return data.to(device, **kwargs)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/dgl/graph.py", line 3858, in to
self.ndata[k] = F.copy_to(self.ndata[k], ctx)
File "/afs/ece.cmu.edu/usr/xujinl/anaconda3/envs/CSD/lib/python3.7/site-packages/dgl/backend/pytorch/tensor.py", line 90, in copy_to
if ctx.type == 'cpu':
AttributeError: 'int' object has no attribute 'type'
If I use the dp backend with 1 or 2 GPUs, the same "trying to backward through the graph a second time" error appears. I'm still trying to determine whether this is a bug on the DGL side or the Lightning side, but I'm a bit more inclined toward the latter.