I am relatively new to CUDA. After a seemingly random amount of time, I keep getting the following error: RuntimeError: CUDA error: an illegal memory access was encountered
I have seen people suggest things such as calling torch.cuda.set_device() rather than using torch.cuda.device(), and setting torch.backends.cudnn.benchmark = False,
but I can't seem to get the error to go away. Here are some pieces of my code:
# Select GPU 0 as the current CUDA device for this process.
torch.cuda.set_device(torch.device('cuda:0'))
# Disable cuDNN benchmark mode (one of the workarounds tried for the error).
torch.backends.cudnn.benchmark = False
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of values produced by the linear head.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear head's output for the final time step of ``x``.

        ``x`` is indexed as (batch, time, feature) because the LSTM is built
        with ``batch_first=True``.
        """
        batch_size = x.size(0)
        # Allocate the zero initial states directly on the input's device and
        # dtype. The previous version built them on the CPU, copied them with
        # a hard-coded ``.cuda()`` and then detached them, which cost a
        # host-to-device copy per call and failed on any other device.
        h0 = x.new_zeros(self.num_layers, batch_size, self.hidden_dim)
        c0 = x.new_zeros(self.num_layers, batch_size, self.hidden_dim)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])

    def pred(self, x):
        """Return a boolean tensor that is True where the output logit is > 0."""
        return self(x) > 0
def train(model, loss_fn, optimizer, num_epochs, x_train, y_train, x_val, y_val, loss_stop=60):
    """Train ``model`` full-batch with early stopping on the validation loss.

    Each epoch performs one optimizer step on ``(x_train, y_train)`` and then
    evaluates ``loss_fn`` on ``(x_val, y_val)`` under ``torch.no_grad()``.
    Training stops early once the validation loss has failed to improve for
    ``loss_stop`` consecutive epochs.

    Args:
        model: Module to train; it is updated in place.
        loss_fn: Callable taking ``(predictions, targets)`` and returning a
            scalar loss tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Maximum number of epochs to run.
        x_train, y_train: Training inputs and targets.
        x_val, y_val: Validation inputs and targets.
        loss_stop: Number of consecutive non-improving epochs that triggers
            early stopping.

    Returns:
        ``model`` with the weights from the epoch that achieved the lowest
        validation loss, or ``None`` if no epoch produced a comparable
        validation loss (for example ``num_epochs == 0`` or NaN losses).
    """
    import copy  # local import so the block does not depend on the file header

    # Start from +inf rather than a magic 999 so a first-epoch loss of any
    # finite magnitude is still recorded as the best so far.
    best_val_loss = float("inf")
    best_state = None
    epochs_without_improvement = 0

    for _ in range(num_epochs):
        model.train()
        train_loss = loss_fn(model(x_train), y_train)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_loss = loss_fn(model(x_val), y_val).item()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Snapshot the weights. Keeping only a reference to ``model``
            # would silently track later (worse) epochs instead of this one.
            best_state = copy.deepcopy(model.state_dict())
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement == loss_stop:
                break

    if best_state is None:
        print("model is None.")
        return None

    # Roll the model back to its best validation-loss weights.
    model.load_state_dict(best_state)
    return model
def lstm_test(cols, df, test_percent, test_bal, initial_shares_test, max_price, last_sell_day):
    """Train 11 independent LSTM classifiers and collect their test predictions.

    The data is split by ``load_data`` using a window of 20, moved to the GPU,
    and each run trains a fresh model with a class-weighted
    ``BCEWithLogitsLoss``. Predictions for run ``i`` are stored, flattened, in
    ``y_preds_dict[i]``.

    NOTE(review): ``test_bal``, ``initial_shares_test``, ``max_price`` and
    ``last_sell_day`` are not used in the visible part of this function, and
    ``y_preds_dict`` is neither returned nor consumed here. The snippet
    appears to be an excerpt; confirm against the full source.
    """
    wdw = 20
    x_train, y_train, x_test, y_test, x_val, y_val = load_data(df, wdw, test_percent, cols)

    def _to_default_tensor(array):
        # NumPy array -> tensor of the default tensor type (still on the CPU).
        return torch.from_numpy(array).type(torch.Tensor)

    x_train = _to_default_tensor(x_train).cuda()
    x_test = _to_default_tensor(x_test).cuda()
    x_val = _to_default_tensor(x_val).cuda()

    # Compute the class ratio once, before uploading the labels. It was
    # previously recomputed on every run with three GPU -> CPU copies each.
    y_train_cpu = _to_default_tensor(y_train)
    n_samples = y_train_cpu.shape[0]
    n_nonzero = np.count_nonzero(y_train_cpu)
    # Half the zero/non-zero label ratio; raises ZeroDivisionError when the
    # training labels contain no non-zero entries.
    r = (n_samples - n_nonzero) / n_nonzero / 2

    y_train = y_train_cpu.cuda()
    y_test = _to_default_tensor(y_test).cuda()
    y_val = _to_default_tensor(y_val).cuda()

    input_dim = x_train.shape[-1]
    hidden_dim = 32
    num_layers = 2
    output_dim = 1

    # The loss depends only on the training labels, so it is shared by all runs.
    pos_w = torch.tensor([r]).cuda()
    loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w).cuda()

    y_preds_dict = {}
    for i in range(11):
        model = LSTM(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers).cuda()
        optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)
        best_model = train(model, loss_fn, optimizer, 300, x_train, y_train, x_val, y_val)
        y_test_pred = get_predictions(best_model, x_test)
        y_preds_dict[i] = y_test_pred.cpu().detach().numpy().flatten()
And here is the error message:
<ipython-input-5-c52edc2c0508> in train(model, loss_fn, optimizer, num_epochs, x_train, y_train, x_val, y_val, loss_stop)
19 model.eval()
20 with torch.no_grad():
---> 21 y_val_pred = model(x_val)
22
23 val_loss = loss_fn(y_val_pred, y_val)
~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
<ipython-input-4-9da8c811c037> in forward(self, x)
10
11 def forward(self, x):
---> 12 h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().cuda()
13 c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_().cuda()
14
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
The OP's own answer partially covers this, but the underlying problem behind the illegal memory access is that the GPU runs out of memory.
In my case, when I run a script on Windows I get the error message:
RuntimeError: CUDA out of memory. Tried to allocate 1.64 GiB (GPU 0; 4.00 GiB total capacity; 1.10 GiB already allocated; 1.27 GiB free; 1.12 GiB reserved in total by PyTorch)
but when I run it on Linux I get:
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Perhaps the message in Windows is more understandable :)
References: https://forums.fast.ai/t/runtimeerror-cuda-error-an-illegal-memory-access-was-encountered/93899
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate Us With