I am trying to implement YOLOv2 in PyTorch, but I seem to be running out of GPU memory just passing data through the network. The model is large and is shown below, yet I feel like I'm doing something stupid with the network (like not freeing memory somewhere). It works as expected on the CPU.
The test code (where memory runs out) is:
x = torch.rand(32,3,416, 416).cuda()
model = Yolov2().cuda()
y = model(x.float())
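As a quick sanity check (not part of the original post, and assuming the Yolov2 class defined below), you can run the same forward pass with gradients disabled and look at the allocator statistics; if the no_grad pass fits comfortably, most of the memory is going into the activations that autograd keeps for the backward pass rather than being leaked:

import torch

x = torch.rand(4, 3, 416, 416).cuda()        # smaller batch just for the check
model = Yolov2().cuda()

with torch.no_grad():                         # no intermediate activations saved for backward
    y = model(x)

print(torch.cuda.memory_allocated() / 1024**2, "MiB currently allocated")
print(torch.cuda.max_memory_allocated() / 1024**2, "MiB peak")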
The model:
import torch
from torch import nn
import torch.nn.functional as F
class Yolov2(nn.Module):
    def __init__(self):
        super(Yolov2, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm4 = nn.BatchNorm2d(64)
        self.conv5 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm5 = nn.BatchNorm2d(128)
        self.conv6 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm6 = nn.BatchNorm2d(256)
        self.conv7 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm7 = nn.BatchNorm2d(128)
        self.conv8 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm8 = nn.BatchNorm2d(256)
        self.conv9 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm9 = nn.BatchNorm2d(512)
        self.conv10 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm10 = nn.BatchNorm2d(256)
        self.conv11 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm11 = nn.BatchNorm2d(512)
        self.conv12 = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm12 = nn.BatchNorm2d(256)
        self.conv13 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm13 = nn.BatchNorm2d(512)
        self.conv14 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm14 = nn.BatchNorm2d(1024)
        self.conv15 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm15 = nn.BatchNorm2d(512)
        self.conv16 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm16 = nn.BatchNorm2d(1024)
        self.conv17 = nn.Conv2d(in_channels=1024, out_channels=512, kernel_size=1, stride=1, padding=0, bias=False)
        self.batchnorm17 = nn.BatchNorm2d(512)
        self.conv18 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm18 = nn.BatchNorm2d(1024)
        self.conv19 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm19 = nn.BatchNorm2d(1024)
        self.conv20 = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm20 = nn.BatchNorm2d(1024)
        self.conv21 = nn.Conv2d(in_channels=3072, out_channels=1024, kernel_size=3, stride=1, padding=1, bias=False)
        self.batchnorm21 = nn.BatchNorm2d(1024)
        self.conv22 = nn.Conv2d(in_channels=1024, out_channels=125, kernel_size=1, stride=1, padding=0)
    def reorg_layer(self, x):
        # Passthrough / space-to-depth: (B, C, H, W) -> (B, C*stride^2, H/stride, W/stride)
        stride = 2
        batch_size, channels, height, width = x.size()
        new_ht = int(height/stride)
        new_wd = int(width/stride)
        new_channels = channels * stride * stride
        # from IPython.core.debugger import Tracer; Tracer()()
        passthrough = x.permute(0, 2, 3, 1)
        passthrough = passthrough.contiguous().view(-1, new_ht, stride, new_wd, stride, channels)
        passthrough = passthrough.permute(0, 1, 3, 2, 4, 5)
        passthrough = passthrough.contiguous().view(-1, new_ht, new_wd, new_channels)
        passthrough = passthrough.permute(0, 3, 1, 2)
        return passthrough
    def forward(self, x):
        out = F.max_pool2d(F.leaky_relu(self.batchnorm1(self.conv1(x)), negative_slope=0.1), 2, stride=2)
        out = F.max_pool2d(F.leaky_relu(self.batchnorm2(self.conv2(out)), negative_slope=0.1), 2, stride=2)
        out = F.leaky_relu(self.batchnorm3(self.conv3(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm4(self.conv4(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm5(self.conv5(out)), negative_slope=0.1)
        out = F.max_pool2d(out, 2, stride=2)
        out = F.leaky_relu(self.batchnorm6(self.conv6(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm7(self.conv7(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm8(self.conv8(out)), negative_slope=0.1)
        out = F.max_pool2d(out, 2, stride=2)
        out = F.leaky_relu(self.batchnorm9(self.conv9(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm10(self.conv10(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm11(self.conv11(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm12(self.conv12(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm13(self.conv13(out)), negative_slope=0.1)
        # from IPython.core.debugger import Tracer; Tracer()()
        passthrough = self.reorg_layer(out)   # keep the 26x26 features for the passthrough connection
        out = F.max_pool2d(out, 2, stride=2)
        out = F.leaky_relu(self.batchnorm14(self.conv14(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm15(self.conv15(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm16(self.conv16(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm17(self.conv17(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm18(self.conv18(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm19(self.conv19(out)), negative_slope=0.1)
        out = F.leaky_relu(self.batchnorm20(self.conv20(out)), negative_slope=0.1)
        out = torch.cat([passthrough, out], 1)   # concatenate passthrough with the 13x13 features
        out = F.leaky_relu(self.batchnorm21(self.conv21(out)), negative_slope=0.1)
        out = self.conv22(out)
        return out
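For reference (a small shape check that is not in the original post), reorg_layer is a stride-2 space-to-depth operation, so the 512-channel 26x26 feature map saved before the last max-pool becomes 2048 channels at 13x13, which is why conv21 expects 1024 + 2048 = 3072 input channels:

m = Yolov2()
feat = torch.rand(1, 512, 26, 26)    # feature map entering the passthrough branch
print(m.reorg_layer(feat).shape)     # torch.Size([1, 2048, 13, 13])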
PyTorch version: '0.4.1.post2'

I would try using smaller batch sizes. Start from 1 and check what your maximum is. You could also try reducing the input tensor dimensions. Your network is not exactly small for your GPU.
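To see why batch size 32 at 416x416 is heavy: the output of conv1 alone is a 32x32x416x416 float tensor, roughly 0.66 GiB, and because the weights require gradients autograd keeps intermediates of this size alive for the whole forward pass, so the total quickly reaches several GiB. Below is a minimal sketch for finding the largest batch that fits; fits_on_gpu is a hypothetical helper (it assumes the Yolov2 class from the question, and relies on CUDA out-of-memory surfacing as a RuntimeError):

import torch

def fits_on_gpu(batch_size):
    """Return True if one forward pass with this batch size fits in GPU memory (sketch)."""
    model, x = None, None
    try:
        model = Yolov2().cuda()
        x = torch.rand(batch_size, 3, 416, 416).cuda()
        model(x)
        return True
    except RuntimeError as e:             # CUDA OOM is raised as a RuntimeError
        if "out of memory" not in str(e):
            raise
        return False
    finally:
        del model, x
        torch.cuda.empty_cache()          # release cached blocks before the next attempt

for bs in (1, 2, 4, 8, 16, 32):
    print(bs, fits_on_gpu(bs))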