Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How do I split a custom dataset into training and test datasets?

import pandas as pd
import numpy as np
import cv2
from torch.utils.data.dataset import Dataset


class CustomDatasetFromCSV(Dataset):
    """Dataset built from a CSV file with 'emotion' and 'pixels' columns.

    The 'emotion' column is expanded into indicator (one-hot) columns; each
    'pixels' entry is a space-separated string of integer pixel values.
    """

    def __init__(self, csv_path, transform=None):
        """Read the CSV at ``csv_path`` and build the label matrix.

        ``transform`` is only stored on the instance.
        """
        self.data = pd.read_csv(csv_path)
        self.labels = pd.get_dummies(self.data['emotion']).as_matrix()
        self.height = 48
        self.width = 48
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(faces, labels)`` for the whole dataset.

        ``index` is not used: every row of the 'pixels' column is decoded on
        each call, and all faces are returned together with all labels.
        """
        def decode(pixel_sequence):
            # One face: parse the pixel string, shape it, resize it, cast it.
            values = [int(pixel) for pixel in pixel_sequence.split(' ')]
            grid = np.asarray(values).reshape(self.width, self.height)
            resized = cv2.resize(grid.astype('uint8'), (self.width, self.height))
            return resized.astype('float32')

        stacked = np.asarray(
            [decode(sequence) for sequence in self.data['pixels'].tolist()]
        )
        # Append a trailing axis of length 1 to the stacked faces.
        return np.expand_dims(stacked, -1), self.labels

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

This is what I managed to put together using references from other repositories. However, I want to split this dataset into training and test sets.

How can I do that inside this class? Or do I need to make a separate class to do that?

like image 878
nirvair Avatar asked May 26 '18 16:05

nirvair


People also ask

Which is the correct way to divide dataset in training and testing dataset?

The simplest way to split the modelling dataset into training and testing sets is to assign two-thirds of the data points to the former and the remaining one-third to the latter. We then train the model using the training set and apply the model to the test set.

How do you split data for training and evaluation?

A common strategy is to take all available labeled data, and split it into training and evaluation subsets, usually with a ratio of 70-80 percent for training and 20-30 percent for evaluation.


2 Answers

Starting in PyTorch 0.4.1 you can use random_split:

# 80% of the samples (rounded down) go to training; the remainder is the test set.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
)
like image 68
Fábio Perez Avatar answered Oct 21 '22 11:10

Fábio Perez


Using PyTorch's SubsetRandomSampler:

import cv2
import numpy as np
import pandas as pd
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets
from torchvision import transforms


class CustomDatasetFromCSV(Dataset):
    """Map-style dataset over a CSV file with 'emotion' and 'pixels' columns.

    Each 'pixels' entry is a space-separated string of integer pixel values;
    the 'emotion' column is expanded into indicator (one-hot) label rows.
    """

    def __init__(self, csv_path, transform=None):
        """Load the CSV and precompute the label matrix.

        Args:
            csv_path: Path of the CSV file to read.
            transform: Optional callable applied to each face in
                ``__getitem__``; ``None`` leaves the face unchanged.
        """
        self.data = pd.read_csv(csv_path)
        # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
        self.labels = pd.get_dummies(self.data['emotion']).values
        self.height = 48
        self.width = 48
        self.transform = transform

    def __getitem__(self, index):
        """Return the ``(face, label)`` pair stored at ``index``."""
        # This method should return only 1 sample and label
        # (according to "index"), not the whole dataset
        pixel_sequence = self.data['pixels'][index]
        face = [int(pixel) for pixel in pixel_sequence.split(' ')]
        # NumPy shapes are (rows, cols) == (height, width); cv2.resize takes
        # its target size as (width, height).
        face = np.asarray(face).reshape(self.height, self.width)
        face = cv2.resize(face.astype('uint8'), (self.width, self.height))
        if self.transform is not None:
            face = self.transform(face)
        label = self.labels[index]

        return face, label

    def __len__(self):
        """Return the number of samples (one label row per CSV row)."""
        return len(self.labels)


# `my_path` is a placeholder: set it to the path of your CSV file.
dataset = CustomDatasetFromCSV(my_path)
batch_size = 16
validation_split = .2
shuffle_dataset = True
random_seed = 42

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    # Seed before shuffling so the split is reproducible.
    np.random.seed(random_seed)
    np.random.shuffle(indices)
# The first `split` indices are held out for validation; the rest train.
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler)

# Usage Example:
num_epochs = 10
for epoch in range(num_epochs):
    # Train:
    for batch_index, (faces, labels) in enumerate(train_loader):
        ...  # training step goes here
like image 36
benjaminplanche Avatar answered Oct 21 '22 10:10

benjaminplanche