I tried to fine-tune a model on my personal information so that I can create a chatbot where people can learn about me, similar to ChatGPT.
However, I got the following error:
RuntimeError: stack expects each tensor to be equal size, but got [47] at entry 0 and [36] at entry 1
because my inputs have different lengths.
Here are two of my sample inputs:
What is the website of ABC company ? -> https://abcdef.org/
Do you know the website of ABC company ? -> It is https://abcdef.org/
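The two examples tokenize to different numbers of ids, which is what the stack error is about. A quick check with the stock GPT-2 tokenizer (the exact counts depend on the tokenizer):
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
a = tokenizer.encode("Q: What is the website of ABC company ? A: https://abcdef.org/")
b = tokenizer.encode("Q: Do you know the website of ABC company ? A: It is https://abcdef.org/")
# The two lengths differ, so the DataLoader's default collate cannot torch.stack them
print(len(a), len(b))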
Here is what I have tried so far:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader

class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Add a padding token to the tokenizer
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question = self.questions[index]
        answer = self.answers[index]
        input_text = f"Q: {question} A: {answer}"
        input_ids = self.tokenizer.encode(input_text, add_special_tokens=True, max_length=self.max_length, padding=True, truncation=True)
        if input_ids is None:
            return None
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        print(f"Input ids size: {input_ids.size()}")
        return input_ids

# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Load the question and answer data
questions = ["What is the website of ABC company ?", "Do you know the website of ABC company ?"]
answers = ["https://abcdef.org/", "It is https://abcdef.org/"]

# Create the dataset and data loader
max_length = 64
dataset = QADataset(questions, answers, tokenizer, max_length=max_length)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Fine-tune the model on the QA dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(3):
    running_loss = 0.0
    for batch in data_loader:
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss, _ = outputs[:2]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1} loss: {running_loss / len(data_loader)}")

# Save the fine-tuned model
model.save_pretrained("qa_finetuned_gpt2")
I don't have a solid background in AI; it is more a case of reading references and trying to implement them.
Yes, it seems like you didn't pad your inputs. The model expects every text in a batch to be the same size, so if a text is too short you pad it, and if it's too long it should be truncated.
Try changing how the tokenizer processes the inputs:
# Define the data loading class
class MyDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.data_path = data_path
        self.tokenizer = tokenizer
        with open(self.data_path, 'r') as f:
            self.data = f.read().split('\n')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        # padding="max_length" pads every example to the same length,
        # so the DataLoader's default collate can stack them into a batch
        inputs = self.tokenizer.encode(text, add_special_tokens=True,
                                       truncation=True, max_length=80, padding="max_length")
        return torch.tensor(inputs)
As said before, you will need to pad your input. However, the pretrained GPT-2 tokenizer vocabulary doesn't include a padding token, so you will need to set one manually for this to work. You can either add a new token or reuse an existing one, such as the end-of-text token. After adding a token you will need to resize the model's embeddings. Once you've done that, you can tokenize your data, and if padding is enabled the tokenizer will properly use the padding token. Also, since you're using a padding token, in most cases you will want to pass an attention mask as well.
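A minimal sketch of those steps, reusing the end-of-text token as the padding token (the example string and max_length=64 are just illustrative, not taken from your exact setup):
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Option 1: reuse the end-of-text token as padding (no new embeddings needed)
tokenizer.pad_token = tokenizer.eos_token
# Option 2: add a new token instead, e.g. tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Either way, keep the embedding matrix in sync with the tokenizer size
model.resize_token_embeddings(len(tokenizer))

# Tokenizing with padding now also gives you the attention mask
encoded = tokenizer(
    "Q: What is the website of ABC company ? A: https://abcdef.org/",
    padding="max_length",
    truncation=True,
    max_length=64,
    return_tensors="pt",
)

# Pass the attention mask so the model ignores the padded positions
outputs = model(
    input_ids=encoded["input_ids"],
    attention_mask=encoded["attention_mask"],
    labels=encoded["input_ids"],
)
loss = outputs.loss
In practice you may also want to set the labels at padded positions to -100 so they don't contribute to the loss, and do the padding inside your Dataset (or a collate_fn) rather than one string at a time.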