Terminate called after throwing an instance of 'c10::CUDAError' what(): CUDA error: initialization error

I am trying to run my PyTorch Lightning code on a TPU in GCP.

import numpy as np # linear algebra
import pandas as pd 
import os
import string
from typing import Optional

# for checkpoint
# https://pytorch-lightning.readthedocs.io/en/latest/common/weights_loading.html
from torch.utils.data import DataLoader, Dataset, random_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler

import logging
logging.getLogger("lightning").setLevel(logging.ERROR)



class ModelDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.len = len(self.X)
        
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return self.len

    
class ModelDataModule(LightningDataModule):
    def __init__(self):
        super().__init__()
        self.train_root_path = './20news-bydate-train'
        self.test_root_path = './20news-bydate-test'
        self.batch_size = 64

    def get_data(self, root_path):
        data = []
        label = []

        folders = ['rec.sport.hockey', 'sci.electronics' , 'rec.autos']
        for i, folder_name in enumerate(folders):
            folder_path = os.path.join(root_path, folder_name)
            files = os.listdir(folder_path)
            for file in files:
                file_path = os.path.join(folder_path, file)
                with open(file_path, 'r', encoding='latin1') as f:
                    content = f.read()
                    data.append(content)
                    label.append(i)
        return data, label

    def preprocess(self, words):
        table = str.maketrans('', '', '\t')
        words = [word.translate(table) for word in words]
        n_table = str.maketrans('', '', '\n')
        words = [word.translate(n_table) for word in words]

        punctuations = (string.punctuation).replace("'", "")
        trans_table = str.maketrans('', '', punctuations)
        stripped_words = [word.translate(trans_table) for word in words]

        words = [word for word in stripped_words if word]

        # strip leading and/or trailing single quotes
        p_words = []
        for word in words:
            if word[0] == "'" and word[-1] == "'":
                word = word[1:-1]
            elif word[0] == "'":
                word = word[1:]
            p_words.append(word)

        # drop numbers and very short tokens, then lowercase
        words = [word for word in p_words if not word.isdigit()]
        words = [word.lower() for word in words if len(word) > 2]

        return " ".join(words)
    
    def prepare_data(self):
        ### called only once, from a single process
        # get data
        train_data, train_label = self.get_data(self.train_root_path)
        test_data, test_label = self.get_data(self.test_root_path)

        # preprocess
        for i, text in enumerate(train_data):
            train_data[i] = self.preprocess(text.strip().split())

        for i, text in enumerate(test_data):
            test_data[i] = self.preprocess(text.strip().split())

        # feature engineering
        ### data
        vectorizer = TfidfVectorizer(max_df=0.75,stop_words='english')
        X_train = vectorizer.fit_transform(train_data)
        X_test = vectorizer.transform(test_data)

        X_train = X_train.toarray()
        X_test = X_test.toarray()

        pca_1k = PCA(n_components=1024)
        X_train1k = pca_1k.fit_transform(X_train)
        X_test1k = pca_1k.transform(X_test)
        print(pca_1k.explained_variance_ratio_.cumsum()[-1])
        
        ### label
        le = LabelEncoder()
        y_train = le.fit_transform(train_label)
        y_test = le.transform(test_label)
        
        # scaling
        scaler = StandardScaler()
        scaler.fit(X_train1k)
        
        self.train_data = torch.tensor(scaler.transform(X_train1k))
        self.test_data = torch.tensor(scaler.transform(X_test1k))
        self.train_label = torch.tensor(y_train)
        self.test_label = torch.tensor(y_test)
        

    def setup(self, stage: Optional[str] = None):
        # called on every process
        train_data = ModelDataset(self.train_data, self.train_label)
        test_dataset = ModelDataset(self.test_data, self.test_label)
        train_len = (len(train_data)//10)*8
        val_len = len(train_data) - train_len
        print(train_len, val_len)
        train_dataset, val_dataset = random_split(train_data, [train_len, val_len],
                                                  generator=torch.Generator().manual_seed(42))

        self.train = train_dataset
        self.val = val_dataset
        self.test = test_dataset

    def train_dataloader(self):
        train_loader = DataLoader(dataset=self.train, 
                          batch_size=self.batch_size, shuffle=True, drop_last=True)
        return train_loader

    def val_dataloader(self):
        val_loader = DataLoader(dataset=self.val, 
                         batch_size=self.batch_size)
        return val_loader

    def test_dataloader(self):
        test_loader = DataLoader(dataset=self.test, 
                         batch_size=self.batch_size)
        return test_loader

class LightningFFModel(pl.LightningModule):
    def __init__(self, input_size, hidden_size, learning_rate):
        super().__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 3)
        
        self.criterion = nn.CrossEntropyLoss()
        self.lr = learning_rate

    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        return output

    def cross_entropy_loss(self, outputs, labels):
        return self.criterion(outputs, labels)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        outputs = self.forward(x.float())
        loss = self.cross_entropy_loss(outputs, y)
        self.log('train_loss', loss, rank_zero_only=True)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        outputs = self.forward(x.float())
        loss = self.cross_entropy_loss(outputs, y)
        self.log('val_loss', loss, rank_zero_only=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        outputs = self.forward(x.float())
        loss = self.cross_entropy_loss(outputs, y)
        self.log("test_loss", loss, rank_zero_only=True)

    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(),
                              lr=(self.lr or self.learning_rate), momentum=0.9)
        return optimizer    
    

if __name__ == '__main__':
    input_size = 1024
    hidden_size = 128
    model_dm = ModelDataModule()

    # train
    model = LightningFFModel(input_size, hidden_size, 0.005)

    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.00, patience=3, verbose=False, mode="min")
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        dirpath='pt_checkpoints',
        filename='epoch{epoch:02d}',
        auto_insert_metric_name=False)

    trainer = pl.Trainer(tpu_cores=8, max_epochs=5,
                         callbacks=[early_stop_callback, checkpoint_callback],
                         auto_lr_find=True,
                         log_every_n_steps=1)

    trainer.fit(model, datamodule=model_dm)

I am getting the error

terminate called after throwing an instance of 'c10::CUDAError' what(): CUDA error: initialization error CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1. ... torch.multiprocessing.spawn.ProcessExitedException: process 7 terminated with signal SIGABRT

The full stack trace is as follows:

GPU available: True, used: False
TPU available: True, using: 8 TPU cores
IPU available: False, using: 0 IPUs
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py:1296: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.
  "GPU available but not used. Set the gpus flag in your trainer"
Validation sanity check: 0it [00:00, ?it/s]/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:106: UserWarning: The dataloader, val dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:106: UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
Epoch 0:   0%|                                                                     | 0/3 [00:00<00:00, 2368.33it/s]terminate called after throwing an instance of 'c10::CUDAError'
  what():  CUDA error: initialization error
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Exception raised from getDevice at /opt/conda/conda-bld/pytorch_1623448265233/work/c10/cuda/impl/CUDAGuardImpl.h:38 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fbac23cfa22 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x9b5b (0x7fbac262ab5b in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x3280aaa (0x7fbac5ac9aaa in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #3: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x2a (0x7fbac5acabfa in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #4: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x53 (0x7fbb3bc9dc43 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #5: <unknown function> + 0xc9067 (0x7fbb70d75067 in /opt/conda/lib/python3.7/site-packages/pyarrow/../../../libstdc++.so.6)
frame #6: <unknown function> + 0x76db (0x7fbb8340f6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #7: clone + 0x3f (0x7fbb8278b71f in /lib/x86_64-linux-gnu/libc.so.6)

[... the same c10::CUDAError "initialization error" trace is printed again by each of the remaining spawned processes ...]

Traceback (most recent call last):
  File "pt_test.py", line 229, in <module>
    trainer.fit(model, datamodule=model_dm)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
    self._run(model)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 918, in _run
    self._dispatch()
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _dispatch
    self.accelerator.start_training(self)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/plugins/training_type/tpu_spawn.py", line 267, in start_training
    xmp.spawn(self.new_process, **self.xmp_spawn_kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch_xla/distributed/xla_multiprocessing.py", line 394, in spawn
    start_method=start_method)
  File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/opt/conda/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 136, in join
    signal_name=name
torch.multiprocessing.spawn.ProcessExitedException: process 7 terminated with signal SIGABRT

python: 3.7
pytorch: 1.9
pytorch-lightning: 1.4.4
cuda: 11.1
tpu: v2-8

This works on a single GPU; the error occurs at trainer.fit.
What does c10::CUDAError mean? Is it something related to CUDA version 10?
Any ideas what's going wrong?
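
The error message itself suggests re-running with CUDA_LAUNCH_BLOCKING=1 so that kernel errors are reported synchronously and the stack trace points at the real call. A minimal sketch of that (the shell form CUDA_LAUNCH_BLOCKING=1 python pt_test.py is equivalent; pt_test.py is the script name from the traceback above):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before CUDA is initialized

import torch  # import torch (and anything that touches CUDA) only after setting it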

Same here! Still trying to find out whether it's a PyTorch-related problem or not…

Did you get a chance to check with a plain DataLoader instead of the DataModule?
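
Something like this, roughly (a sketch based on your code above, assuming Lightning 1.4's train_dataloaders/val_dataloaders keyword arguments; untested on TPU):

model_dm = ModelDataModule()
model_dm.prepare_data()   # builds the TF-IDF / PCA tensors
model_dm.setup()          # creates the train/val/test splits
trainer.fit(model,
            train_dataloaders=model_dm.train_dataloader(),
            val_dataloaders=model_dm.val_dataloader())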

I get this error at random points during training, but the stack trace does not mention anything related to dataloaders… :frowning:

EDIT:
I think it is related to CUDA toolkit 11.1 and PyTorch 1.9. I'm currently working with the "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime" Docker image, which has the same configuration as yours.
I will try pytorch/pytorch:1.8.1-cuda10.2-cudnn7-runtime and see if the error persists. If you have any news, let me know!
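
A quick way to confirm which build a given image actually ships (expected values for the 1.9.0-cuda11.1-cudnn8 image shown in the comments):

import torch
print(torch.__version__)               # expect 1.9.0
print(torch.version.cuda)              # expect 11.1
print(torch.backends.cudnn.version())  # expect an 8xxx build
print(torch.cuda.is_available())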
