Trying to get similar results on same dataset with Keras and PyTorch.
from numpy import array
from numpy import hstack
from sklearn.model_selection import train_test_split
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
X, y = list(), list()
for i in range(len(sequences)):
# find the end of this pattern
end_ix = i + n_steps
# check if we are beyond the dataset
if end_ix > len(sequences):
break
# gather input and output parts of the pattern
seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
X.append(seq_x)
y.append(seq_y)
return array(X), array(y)
def get_data():
# define input sequence
in_seq1 = array([x for x in range(0,500,10)])/1
in_seq2 = array([x for x in range(5,505,10)])/1
out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])
# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns
dataset = hstack((in_seq1, in_seq2, out_seq))
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# convert into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X_train,x_test,Y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle=False)
return X_train,x_test,Y_train, y_test
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!!change this!!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# define model
model = Sequential()
model.add(LSTM(1024, activation='relu',
input_shape=(n_timesteps, n_features),
kernel_initializer='uniform',
recurrent_initializer='uniform'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1))
opt = keras.optimizers.Adam(lr=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=keras.optimizers.K.epsilon(),
decay=0.0,
amsgrad=False)
model.compile(optimizer=opt, loss='mse')
# fit model
model.fit(X_train, Y_train, epochs=200, verbose=1,validation_data=(x_test,y_test))
yhat = model.predict(x_test, verbose=0)
mean_squared_error(y_test, yhat)
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!! change this !!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
class MV_LSTM(torch.nn.Module):
def __init__(self,n_features,seq_length):
super(MV_LSTM, self).__init__()
self.n_features = n_features # number of parallel inputs
self.seq_len = seq_length # number of timesteps
self.n_hidden = 1024 # number of hidden states
self.n_layers = 1 # number of LSTM layers (stacked)
self.l_lstm = torch.nn.LSTM(input_size = n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
# self.l_linear1 = torch.nn.Linear(512, 512)
self.l_linear2 = torch.nn.Linear(512, 1)
def init_hidden(self, batch_size):
# even with batch_first = True this remains same as docs
hidden_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
cell_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
# lstm_out(with batch_first = True) is
# (batch_size,seq_len,num_directions * hidden_size)
# for following linear layer we want to keep batch_size dimension and merge rest
# .contiguous() -> solves tensor compatibility error
x = lstm_out.contiguous().view(batch_size,-1)
x = F.relu(x)
x = F.relu(self.l_linear(x))
# x = F.relu(self.l_linear1(x))
x = self.l_linear2(x)
return x
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss()
import keras # for epsilon constant
optimizer = torch.optim.Adam(mv_net.parameters(),
lr=1e-3,
betas=[0.9,0.999],
eps=keras.optimizers.K.epsilon(),
weight_decay=0,
amsgrad=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mv_net.to(device)
train_episodes = 200
batch_size = 32
eval_batch_size = 32
for t in range(train_episodes):
# TRAIN
mv_net.train()
for b in range(0,len(X_train),batch_size):
inpt = X_train[b:b+batch_size,:,:]
target = Y_train[b:b+batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
loss = criterion(output.view(-1), y_batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
# EVAL
mv_net.eval()
mv_net.init_hidden(eval_batch_size)
acc = 0
for b in range(0,len(x_test),eval_batch_size):
inpt = x_test[b:b+eval_batch_size,:,:]
target = y_test[b:b+eval_batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
acc += mean_squared_error(y_batch.cpu().detach().numpy(), output.view(-1).cpu().detach().numpy())
print('step:' , t , 'train loss:' , round(loss.item(),3),'eval acc:',round(acc/len(x_test),3))
mv_net.init_hidden(len(x_test))
val = torch.tensor(x_test,dtype=torch.float32).to(device)
otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().detach().numpy()))
Keras produces test MSE almost 0, but PyTorch about 6000, which is way too different
I have tried couple tweaks in PyTorch code, but none got me anywhere close to similar keras, even with identical optim params
I cant see what is wrong with (kinda tutorialic) PyTorch code
I know it is almost one year too late. But I came across the same problem and I think the problem is the following. From the keras documentation it says:
return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.
this basically means that the input shape of your self.l_linear
needs to be torch.nn.Linear(1024, 512)
instead of self.n_hidden*self.seq_len, 512
.
Now you also need to do the same as keras does and only use the last output in your forward pass:
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
x = lstm_out[:,-1]
x = torch.nn.functional.relu(x)
x = torch.nn.functional.relu(self.l_linear(x))
x = self.l_linear2(x)
return x
when I run your example (which I needed to tweak a bit to get it run) I get very similar training losses.
Keras:
38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259
PyTorch:
step: 199 train loss: 41.043 eval acc: 1142.688
I hope this helps others having a similar problem.
PS also note that keras is resetting the hidden state (stateful=False) by default.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With