CUDA IndexKernel error, device-side assert triggered

The following code seems to work on the CPU but fails on the GPU.
This is the first time I am using Lightning, so I might also be doing something incorrectly.

from typing import Dict, Tuple

import fire
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pl_bolts.models.detection.faster_rcnn import FasterRCNN
from pytorch_lightning.loggers import TensorBoardLogger
from torch import Tensor
from torch.utils.data import DataLoader, random_split

from <redacted> import Dataset


def collate(batch):
    # Detection models take lists of images and targets, so keep samples unstacked.
    return tuple(zip(*batch))


class DataModule(pl.LightningDataModule):
    """TorchLightning DataModule from Dataset"""

    def __init__(self, batch_size: int = 2):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = 1
        dataset = Dataset()
        ltrain = len(dataset)
        ltest = int(ltrain * 0.2)
        ltrain -= ltest
        lval = int(ltrain * 0.2)
        ltrain -= lval
        lengths = ltrain, lval, ltest
        print(f"train: {ltrain} val: {lval} test: {ltest}")
        self.train, self.val, self.test = random_split(dataset, lengths)

    def val_dataloader(self) -> DataLoader[Tuple[Tensor, Dict[str, Tensor]]]:
        val_loader: DataLoader[Tuple[Tensor, Dict[str, Tensor]]] = DataLoader(
            self.val,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate,
        )
        return val_loader

    def train_dataloader(
        self,
    ) -> DataLoader[Tuple[Tensor, Dict[str, Tensor]]]:
        train_loader: DataLoader[Tuple[Tensor, Dict[str, Tensor]]] = DataLoader(
            self.train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate,
        )
        return train_loader

    def test_dataloader(self) -> DataLoader[Tuple[Tensor, Dict[str, Tensor]]]:
        test_loader: DataLoader[Tuple[Tensor, Dict[str, Tensor]]] = DataLoader(
            self.test,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            collate_fn=collate,
        )
        return test_loader


class Train:

    @staticmethod
    def start():
        example = Dataset()[0]
        print(f"""
            \tsrc: {example[0].shape}
            \tboxes: {example[1]['boxes'].shape}
            \tlabels: {example[1]['labels'].shape}
        """
        )

        logger = TensorBoardLogger(save_dir="logs", version=1, name="training_logs")
        trainer = Trainer(
            logger=logger,
            # accelerator="ddp",
            gpus=1,
            # amp_backend="apex",
        )
        model = FasterRCNN()
        trainer.fit(model, datamodule=DataModule())


if __name__ == "__main__":
    fire.Fire(Train())

The dataset print shows

                src: torch.Size([3, 1028, 1232])
                boxes: torch.Size([26, 4])
                labels: torch.Size([26])
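
For context, the dataset returns items in the standard torchvision detection format (an image tensor plus a dict with "boxes" and "labels"). A minimal sketch of what such a __getitem__ is assumed to look like (the class and values below are illustrative, not the actual redacted Dataset):

import torch
from torch.utils.data import Dataset as TorchDataset


class ToyDetectionDataset(TorchDataset):
    """Illustrative stand-in for the redacted Dataset."""

    def __len__(self):
        return 8

    def __getitem__(self, idx):
        # Image as a float tensor [C, H, W], values in [0, 1].
        image = torch.rand(3, 1028, 1232)
        # Boxes as [N, 4] in (x1, y1, x2, y2) pixel coordinates with x2 > x1 and y2 > y1.
        boxes = torch.tensor([[10.0, 20.0, 110.0, 220.0]])
        # Labels as int64 [N]; torchvision reserves 0 for background,
        # so foreground classes should lie in [1, num_classes - 1].
        labels = torch.tensor([1], dtype=torch.int64)
        return image, {"boxes": boxes, "labels": labels}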

But it fails with

UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 20 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  warnings.warn(*args, **kwargs)
Epoch 0:   0%|                                                                         | 0/56 [00:00<?, ?it/s]/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [32,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [33,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [34,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [35,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernel.cu:142: operator(): block: [0,0,0], thread: [36,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed. 
<Continues for a while with errors where only thread number changes>
Traceback (most recent call last):
  File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "<my_path>/.vscode-server/extensions/ms-python.python-2021.6.944021595/pythonFiles/lib/python/debugpy/__main__.py", line 45, in <module>
    cli.main()
  File "<my_path>/.vscode-server/extensions/ms-python.python-2021.6.944021595/pythonFiles/lib/python/debugpy/../debugpy/server/cli.py", line 444, in main
    run()
  File "<my_path>/.vscode-server/extensions/ms-python.python-2021.6.944021595/pythonFiles/lib/python/debugpy/../debugpy/server/cli.py", line 285, in run_file
    runpy.run_path(target_as_str, run_name=compat.force_str("__main__"))
  File "/usr/lib/python3.9/runpy.py", line 268, in run_path
    return _run_module_code(code, init_globals, run_name,
  File "/usr/lib/python3.9/runpy.py", line 97, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "<my_path>/train.py", line 96, in <module>
    fire.Fire(Train())
  File "<my_path>/ENV/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
    component_trace = _Fire(component, args, parsed_flag_args, context, name)
  File "<my_path>/ENV/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
    component, remaining_args = _CallAndUpdateTrace(
  File "<my_path>/ENV/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
    component = fn(*varargs, **kwargs)
  File "<my_path>/train.py", line 92, in mgg
    trainer.fit(model, datamodule=DataModule())
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
    self._run(model)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
    self.dispatch()
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
    self.accelerator.start_training(self)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
    self._results = trainer.run_stage()
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
    return self.run_train()
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 869, in run_train
    self.train_loop.run_training_epoch()
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 499, in run_training_epoch
    batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 738, in run_training_batch
    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 434, in optimizer_step
    model_ref.optimizer_step(
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/core/lightning.py", line 1403, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 214, in step
    self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
    trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 329, in optimizer_step
    self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 336, in run_optimizer_step
    self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 193, in optimizer_step
    optimizer.step(closure=lambda_closure, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/torch/optim/optimizer.py", line 89, in wrapper
    return func(*args, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/torch/optim/sgd.py", line 87, in step
    loss = closure()
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 732, in train_step_and_backward_closure
    result = self.training_step_and_backward(
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 823, in training_step_and_backward
    result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/trainer/training_loop.py", line 290, in training_step
    training_step_output = self.trainer.accelerator.training_step(args)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 204, in training_step
    return self.training_type_plugin.training_step(*args)
  File "<my_path>/ENV/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 155, in training_step
    return self.lightning_module.training_step(*args, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/pl_bolts/models/detection/faster_rcnn/faster_rcnn_module.py", line 112, in training_step
    loss_dict = self.model(images, targets)
  File "<my_path>/ENV/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/generalized_rcnn.py", line 97, in forward
    proposals, proposal_losses = self.rpn(images, features, targets)
  File "<my_path>/ENV/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/rpn.py", line 364, in forward
    loss_objectness, loss_rpn_box_reg = self.compute_loss(
  File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/rpn.py", line 296, in compute_loss
    sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
  File "<my_path>/ENV/lib/python3.9/site-packages/torchvision/models/detection/_utils.py", line 45, in __call__
    positive = torch.where(matched_idxs_per_image >= 1)[0]
RuntimeError: CUDA error: device-side assert triggered
Exception ignored in: <function tqdm.__del__ at 0x7f01c474d550>
Traceback (most recent call last):
  File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1145, in __del__
  File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1299, in close
  File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1492, in display
  File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1148, in __str__
  File "<my_path>/ENV/lib/python3.9/site-packages/tqdm/std.py", line 1450, in format_dict
TypeError: cannot unpack non-iterable NoneType object

I have no idea what this index assertion means. Do I have an error in my dataset (unlikely, since it seems to work on the CPU), or is this some other error?
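
One way to narrow this down is to make the assert surface at its real call site and to validate the targets on the CPU before training. A rough sketch (validate_targets is a hypothetical helper, and num_classes should match whatever the FasterRCNN model was constructed with; 91 here is just an assumed COCO-style value):

import os

# Must be set before any CUDA work so the failing op raises at its real call site
# instead of at an unrelated later line.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch


def validate_targets(dataset, num_classes):
    """Sanity-check boxes and labels on the CPU before handing them to FasterRCNN."""
    for idx in range(len(dataset)):
        _, target = dataset[idx]
        boxes, labels = target["boxes"], target["labels"]
        assert labels.dtype == torch.int64, f"sample {idx}: labels must be int64"
        assert labels.numel() == 0 or (0 <= labels.min() and labels.max() < num_classes), \
            f"sample {idx}: label outside [0, {num_classes - 1}]"
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        assert (widths > 0).all() and (heights > 0).all(), \
            f"sample {idx}: degenerate box (x2 <= x1 or y2 <= y1)"


validate_targets(Dataset(), num_classes=91)  # use the num_classes the model was built with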

Update: upgrading from torch==1.8.1 to 1.9.0 seems to have fixed the issue.
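
For anyone hitting the same thing: torch 1.9.0 pairs with torchvision 0.10.0, so both should be upgraded together (e.g. pip install --upgrade torch==1.9.0 torchvision==0.10.0, adjusted for your CUDA build).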