
System hangs after first epoch training in pytorch

I am trying to train a ResNet model in PyTorch, following the ImageNet example from the GitHub repository.

Here is what my train method looks like (it is almost identical to the one in the example):

def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    args = get_args()

    # switch to train mode
    model.train()

    end = time.time()

    for i, (input, target) in enumerate(train_loader):
        print(i)
        # data loading time
        data_time.update(time.time() - end)

        if cuda:
            target = target.cuda(async=True)
            input_var = torch.autograd.Variable(input).cuda()
        else:
            input_var = torch.autograd.Variable(input)

        target_var = torch.autograd.Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))
        # top5.update(prec5.item(), input.size(0))

        # compute gradient and do optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print to console and write logs to tensorboard
        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'.format(
                epoch, i, len(train_loader), batch_time=batch_time,
                data_time=data_time, loss=losses, top1=top1, top5=top5))
            niter = epoch * len(train_loader) + i
            # writer.add_scalar('Train/Loss', losses.val, niter)
            # writer.add_scalar('Train/Prec@1', top1.val, niter)
            # writer.add_scalar('Train/Prec@5', top5.val, niter)

System information:

GPU: Nvidia Titan XP
Memory: 32 GB
PyTorch: 0.4.0

When I run this code, training starts at epoch 0:

Epoch: [0][0/108]   Time 5.644 (5.644)  Data 1.929 (1.929)  Loss 6.9052 (6.9052)    Prec@1 0.000 (0.000)

Then the remote server disconnects automatically. This has happened five times.

This is the data loader setup:

    # Load the data --> TRAIN
    traindir = 'train'
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(traindir, transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers,
        pin_memory=cuda
    )

    # Load the data --> Validation
    valdir = 'valid'
    valid_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers,
        pin_memory=cuda
    )

    if args.evaluate:
        validate(valid_loader, model, criterion, epoch=0)
        return

    # Start
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on valid
        prec1 = validate(valid_loader, model, criterion, epoch)

        # remember best prec1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict()
        }, is_best)

With these parameters for the loaders:

args.num_workers = 4
args.batch_size = 32
pin_memory = torch.cuda.is_available()

Is there something wrong with my approach?

asked Nov 18 '22 by nirvair

1 Answer

This looks like a bug in PyTorch's DataLoader when its worker processes are used.

Try setting args.num_workers = 0.
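
A minimal sketch of that change, reusing the train_dataset, args, and cuda variables from the question:

    # Workaround: load batches in the main process instead of worker subprocesses.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=0,   # no multiprocessing workers
        pin_memory=cuda
    )

If training gets past the first iterations with num_workers = 0, the hang is most likely coming from the worker processes (commonly shared-memory limits or other multiprocessing issues on the host), and you can try re-enabling workers one at a time afterwards.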

answered Dec 05 '22 by pem