I'm simply trying to train a ResNet18 model using PyTorch library. The training dataset consists of 25,000 images. Therefore, it is taking a lot of time for even the first epoch to complete. Therefore, I want to save the progress after a certain no. of batch iteration is completed. But I can't figure out how to modify my code and how to use the torch.save() and torch.load() functions in my code to save the periodic progress.
My code is given below:
# BUILD THE NETWORK
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
import torchvision.models as models
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
# DOWNLOAD PRETRAINED MODELS ON ImageNet
model_resnet18 = torch.hub.load('pytorch/vision', 'resnet18', pretrained = True)
model_resnet34 = torch.hub.load('pytorch/vision', 'resnet34', pretrained = True)
for name, param in model_resnet18.named_parameters():
if('bn' not in name):
param.requires_grad = False
for name, param in model_resnet34.named_parameters():
if('bn' not in name):
param.requires_grad = False
num_classes = 2
model_resnet18.fc = nn.Sequential(nn.Linear(model_resnet18.fc.in_features, 512),
nn.ReLU(),
nn.Dropout(),
nn.Linear(512, num_classes))
model_resnet34.fc = nn.Sequential(nn.Linear(model_resnet34.fc.in_features, 512),
nn.ReLU(),
nn.Dropout(),
nn.Linear(512, num_classes))
# FUNCTIONS FOR TRAINING AND LOADING DATA
def train(model, optimizer, loss_fn, train_loader, val_loader, epochs = 5, device = "cuda"):
print("Inside Train Function\n")
for epoch in range(epochs):
print("Epoch : {} running".format(epoch))
training_loss = 0.0
valid_loss = 0.0
model.train()
k = 0
for batch in train_loader:
optimizer.zero_grad()
inputs, targets = batch
inputs = inputs.to(device)
output = model(inputs)
loss = loss_fn(output, targets)
loss.backward()
optimizer.step()
training_loss += loss.data.item() * inputs.size(0)
print("End of batch loop iteration {} \n".format(k))
k = k + 1
training_loss /= len(train_loader.dataset)
model.eval()
num_correct = 0
num_examples = 0
for batch in val_loader:
inputs, targets = batch
inputs.to(device)
output = model(inputs)
targets = targets.to(device)
loss = loss_fn(output, targets)
valid_loss += loss.data.item() * inputs.size(0)
correct = torch.eq(torch.max(F.softmax(output, dim = 1), dim = 1)[1], targets).view(-1)
num_correct += torch.sum(correct).item()
num_examples += correct.shape[0]
valid_loss /= len(val_loader.dataset)
print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}, accuracy = {:.4f}'.format(epoch, training_loss, valid_loss, num_correct / num_examples))
batch_size = 32
img_dimensions = 224
img_transforms = transforms.Compose([ transforms.Resize((img_dimensions, img_dimensions)),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])
img_test_transforms = transforms.Compose([ transforms.Resize((img_dimensions, img_dimensions)),
transforms.ToTensor(),
transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])
def check_image(path):
try:
im = Image.open(path)
return True
except:
return False
train_data_path = "E:\Image Recognition\dogsandcats\\train\\"
train_data = torchvision.datasets.ImageFolder(root=train_data_path,transform=img_transforms, is_valid_file=check_image)
validation_data_path = "E:\\Image Recognition\\dogsandcats\\validation\\"
validation_data = torchvision.datasets.ImageFolder(root=validation_data_path,transform=img_test_transforms, is_valid_file=check_image)
test_data_path = "E:\\Image Recognition\\dogsandcats\\test\\"
test_data = torchvision.datasets.ImageFolder(root=test_data_path,transform=img_test_transforms, is_valid_file=check_image)
num_workers = 6
train_data_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
validation_data_loader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_data_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
print(torch.cuda.is_available(), "\n")
if torch.cuda.is_available():
device = torch.device("cuda")
else:
device = torch.device("cpu")
print(f'Num training images: {len(train_data_loader.dataset)}')
print(f'Num validation images: {len(validation_data_loader.dataset)}')
print(f'Num test images: {len(test_data_loader.dataset)}')
def test_model(model):
print("Inside Test Model Function\n")
correct = 0
total = 0
with torch.no_grad():
for data in test_data_loader:
images, labels = data[0].to(device), data[1].to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('correct: {:d} total: {:d}'.format(correct, total))
print('accuracy = {:f}'.format(correct / total))
model_resnet18.to(device)
optimizer = optim.Adam(model_resnet18.parameters(), lr=0.001)
if __name__ == "__main__":
train(model_resnet18, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, validation_data_loader, epochs=2, device=device)
test_model(model_resnet18)
model_resnet34.to(device)
optimizer = optim.Adam(model_resnet34.parameters(), lr=0.001)
if __name__ == "__main__":
train(model_resnet34, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, validation_data_loader, epochs=2, device=device)
test_model(model_resnet34)
import os
def find_classes(dir):
classes = os.listdir(dir)
classes.sort()
class_to_idx = {classes[i]: i for i in range(len(classes))}
return classes, class_to_idx
def make_prediction(model, filename):
labels, _ = find_classes('E:\\Image Recognition\\dogsandcats\\test\\test')
img = Image.open(filename)
img = img_test_transforms(img)
img = img.unsqueeze(0)
prediction = model(img.to(device))
prediction = prediction.argmax()
print(labels[prediction])
make_prediction(model_resnet34, 'E:\\Image Recognition\\dogsandcats\\test\\test\\3.jpg') #dog
make_prediction(model_resnet34, 'E:\\Image Recognition\\dogsandcats\\test\\test\\5.jpg') #cat
torch.save(model_resnet18.state_dict(), "./model_resnet18.pth")
torch.save(model_resnet34.state_dict(), "./model_resnet34.pth")
# Remember that you must call model.eval() to set dropout and batch normalization layers to
# evaluation mode before running inference. Failing to do this will yield inconsistent inference results.
resnet18 = torch.hub.load('pytorch/vision', 'resnet18')
resnet18.fc = nn.Sequential(nn.Linear(resnet18.fc.in_features,512),nn.ReLU(), nn.Dropout(), nn.Linear(512, num_classes))
resnet18.load_state_dict(torch.load('./model_resnet18.pth'))
resnet18.eval()
resnet34 = torch.hub.load('pytorch/vision', 'resnet34')
resnet34.fc = nn.Sequential(nn.Linear(resnet34.fc.in_features,512),nn.ReLU(), nn.Dropout(), nn.Linear(512, num_classes))
resnet34.load_state_dict(torch.load('./model_resnet34.pth'))
resnet34.eval()
# Test against the average of each prediction from the two models
models_ensemble = [resnet18.to(device), resnet34.to(device)]
correct = 0
total = 0
if __name__ == '__main__':
with torch.no_grad():
for data in test_data_loader:
images, labels = data[0].to(device), data[1].to(device)
predictions = [i(images).data for i in models_ensemble]
avg_predictions = torch.mean(torch.stack(predictions), dim=0)
_, predicted = torch.max(avg_predictions, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
if total != 0:
print('accuracy = {:f}'.format(correct / total))
print('correct: {:d} total: {:d}'.format(correct, total))
To be very precise, I want to save my progress at the end of for batch in train_loader: loop, for say k = 1500.
If anyone can guide me about modifying my code so that I can save my progress and resume it later, then it will be a great and highly appreciated.
Whenever you want to save your training progress, you need to save two things:
This can be done in the following way:
def save_checkpoint(model, optimizer, save_path, epoch):
torch.save({
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'epoch': epoch
}, save_path)
To resume training, you can restore your model and optimizer's state dict.
def load_checkpoint(model, optimizer, load_path):
checkpoint = torch.load(load_path)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
return model, optimizer, epoch
You can save your model at any point in training, wherever you need to. However, it should be ideal to save after finishing an epoch.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With