Keras vs PyTorch LSTM different results

Question

I am trying to get similar results on the same dataset with Keras and PyTorch.

Data

from numpy import array
from numpy import hstack

from sklearn.model_selection import train_test_split  
 

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    """Cut a 2-D series table into overlapping windows and their targets.

    Every window covers ``n_steps`` consecutive rows. Its input part is all
    columns except the last one; its target is the last column of the
    window's final row.

    Args:
        sequences: 2-D numpy array, one row per timestep, target in the
            last column.
        n_steps: number of rows per window.

    Returns:
        Tuple ``(X, y)`` of numpy arrays holding the windows and the
        matching targets.
    """
    n_rows = len(sequences)
    # Keep only the start positions whose window still fits inside the table.
    windows = [
        (sequences[start:start + n_steps, :-1], sequences[start + n_steps - 1, -1])
        for start in range(n_rows)
        if start + n_steps <= n_rows
    ]
    inputs = [window for window, _ in windows]
    targets = [target for _, target in windows]
    return array(inputs), array(targets)

def get_data():
    """Build the toy two-input series and return a chronological train/test split.

    Two float input series (0, 10, ..., 490 and 5, 15, ..., 495) are paired
    with their element-wise sum as the target, cut into 3-step windows by
    ``split_sequences`` and split 80/20 without shuffling. The shapes of the
    windowed arrays are printed as a side effect.

    Returns:
        Tuple ``(X_train, x_test, Y_train, y_test)``.
    """
    # dtype=float gives the same float64 series as dividing integer ranges by 1
    series_a = array(range(0, 500, 10), dtype=float)
    series_b = array(range(5, 505, 10), dtype=float)
    # the target is the element-wise sum of both inputs
    target = series_a + series_b

    # one column per series -> [rows, columns] table with the target last
    dataset = hstack([column.reshape(-1, 1) for column in (series_a, series_b, target)])

    n_timesteps = 3  # this is number of timesteps

    # convert into input/output
    X, y = split_sequences(dataset, n_timesteps)
    print(X.shape, y.shape)

    # shuffle=False keeps the windows in their original order
    X_train, x_test, Y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=False
    )
    return X_train, x_test, Y_train, y_test

Keras

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense   

from sklearn.metrics import mean_squared_error
 
import testing.TimeSeries.datacreator as dc # !!!!change this!!!!     
# The imports above bind only names from keras submodules; the optimizer
# below is reached through the top-level package, so bind ``keras`` itself.
import keras

# Load the shared toy dataset (already split into train and test parts).
X_train, x_test, Y_train, y_test = dc.get_data()

n_features = 2  # this is number of parallel inputs
n_timesteps = 3  # this is number of timesteps

# define model: LSTM(1024) -> Dense(512, relu) -> Dense(1)
model = Sequential()
model.add(LSTM(1024, activation='relu',
               input_shape=(n_timesteps, n_features),
               kernel_initializer='uniform',
               recurrent_initializer='uniform'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1))

# Adam with every hyper-parameter spelled out so it can be mirrored
# one-to-one in another framework.
opt = keras.optimizers.Adam(lr=0.001,
                            beta_1=0.9,
                            beta_2=0.999,
                            epsilon=keras.optimizers.K.epsilon(),
                            decay=0.0,
                            amsgrad=False)
model.compile(optimizer=opt, loss='mse')

# fit model, reporting the loss on the held-out data after every epoch
model.fit(X_train, Y_train, epochs=200, verbose=1, validation_data=(x_test, y_test))

yhat = model.predict(x_test, verbose=0)

# Print the final test MSE; a bare expression is silently discarded when
# this runs as a script.
print(mean_squared_error(y_test, yhat))

PyTorch - module class

import numpy as np
import torch
import torch.nn.functional as F
 
from sklearn.metrics import mean_squared_error
 
import testing.TimeSeries.datacreator as dc # !!!! change this !!!!   
X_train,x_test,Y_train, y_test =   dc.get_data()  
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps    

class MV_LSTM(torch.nn.Module):
    """Multivariate LSTM regressor: LSTM -> ReLU -> Linear(512) -> ReLU -> Linear(1).

    The LSTM outputs of all timesteps are flattened into one vector per
    sample before the first linear layer, so that layer takes
    ``n_hidden * seq_length`` inputs.

    The recurrent state is kept in ``self.hidden`` between calls. Call
    ``init_hidden(batch_size)`` before a forward pass to restart from a zero
    state; until it is called for the first time the state is ``None``, which
    ``torch.nn.LSTM`` treats as a zero state.

    Args:
        n_features: number of parallel inputs per timestep.
        seq_length: number of timesteps per sample.
    """

    def __init__(self,n_features,seq_length):
        super(MV_LSTM, self).__init__()
        self.n_features = n_features # number of parallel inputs
        self.seq_len = seq_length # number of timesteps
        self.n_hidden = 1024 # number of hidden states
        self.n_layers = 1 # number of LSTM layers (stacked)

        self.l_lstm = torch.nn.LSTM(input_size = n_features,
                                 hidden_size = self.n_hidden,
                                 num_layers = self.n_layers,
                                 batch_first = True)
        # according to pytorch docs LSTM output is
        # (batch_size,seq_len, num_directions * hidden_size)
        # when considering batch_first = True
        self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
        self.l_linear2 = torch.nn.Linear(512, 1)

        # No state yet: forward() hands None to the LSTM, which then starts
        # from zeros, so calling forward() before init_hidden() is safe.
        self.hidden = None

    def init_hidden(self, batch_size):
        """Reset ``self.hidden`` to zero hidden and cell states for ``batch_size`` samples."""
        # even with batch_first = True this remains same as docs
        state_shape = (self.n_layers, batch_size, self.n_hidden)
        # allocate on the same device as the model's parameters
        device = next(self.parameters()).device
        hidden_state = torch.zeros(state_shape).to(device)
        cell_state = torch.zeros(state_shape).to(device)
        self.hidden = (hidden_state, cell_state)

    def forward(self, x):
        """Return one prediction per sample for a (batch, seq, feature) input.

        Updates ``self.hidden`` with the LSTM state after the last timestep.
        """
        # unpacking into three names rejects inputs that are not 3-D
        batch_size, _, _ = x.size()

        lstm_out, self.hidden = self.l_lstm(x,self.hidden)
        # lstm_out(with batch_first = True) is
        # (batch_size,seq_len,num_directions * hidden_size)
        # for following linear layer we want to keep batch_size dimension and merge rest
        # .contiguous() -> solves tensor compatibility error
        x = lstm_out.contiguous().view(batch_size,-1)
        x = F.relu(x)
        x = F.relu(self.l_linear(x))
        x = self.l_linear2(x)
        return x

PyTorch - init and train

# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss()
import keras # for epsilon constant
# Adam configured with the same hyper-parameters as the Keras run
optimizer = torch.optim.Adam(mv_net.parameters(),
                             lr=1e-3,
                             betas=[0.9,0.999],
                             eps=keras.optimizers.K.epsilon(),
                             weight_decay=0,
                             amsgrad=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mv_net.to(device)


train_episodes = 200
batch_size = 32
eval_batch_size = 32

for t in range(train_episodes):
    # TRAIN
    mv_net.train()
    for b in range(0,len(X_train),batch_size):
        inpt = X_train[b:b+batch_size,:,:]
        target = Y_train[b:b+batch_size]

        x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
        y_batch = torch.tensor(target,dtype=torch.float32).to(device)

        # start every batch from a zero LSTM state sized for this batch
        mv_net.init_hidden(x_batch.size(0))

        output = mv_net(x_batch)
        loss = criterion(output.view(-1), y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # EVAL
    mv_net.eval()
    squared_error_sum = 0.0
    # no gradients are needed for evaluation
    with torch.no_grad():
        for b in range(0,len(x_test),eval_batch_size):
            inpt = x_test[b:b+eval_batch_size,:,:]
            target = y_test[b:b+eval_batch_size]

            x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
            y_batch = torch.tensor(target,dtype=torch.float32).to(device)
            mv_net.init_hidden(x_batch.size(0))

            output = mv_net(x_batch)
            batch_mse = mean_squared_error(y_batch.cpu().numpy(), output.view(-1).cpu().numpy())
            # weight each batch mean by its size so that dividing by the
            # number of test samples below yields the MSE over the test set
            squared_error_sum += batch_mse * len(inpt)
    # 'train loss' is the loss of the last training batch of this episode
    print('step:' , t , 'train loss:' , round(loss.item(),3),'eval acc:',round(squared_error_sum/len(x_test),3))


# final test MSE over the whole test set in a single batch
with torch.no_grad():
    mv_net.init_hidden(len(x_test))
    val = torch.tensor(x_test,dtype=torch.float32).to(device)
    otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().numpy()))

Results

Keras produces a test MSE of almost 0, but PyTorch produces about 6000, which is far too large a difference.

I have tried a couple of tweaks in the PyTorch code, but none got me anywhere close to the Keras result, even with identical optimizer parameters.

I can't see what is wrong with the (fairly tutorial-like) PyTorch code.

KIC · Accepted Answer

I know this is almost a year late, but I came across the same problem, and I think the cause is the following. The Keras documentation says:

return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.

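The default is return_sequences=False, so the Keras LSTM layer passes only the output of the last timestep to the next layer. This basically means that your self.l_linear needs to be torch.nn.Linear(1024, 512) (that is, self.n_hidden inputs) instead of torch.nn.Linear(self.n_hidden*self.seq_len, 512).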

Now you also need to do the same as keras does and only use the last output in your forward pass:

    def forward(self, x):
        """Run the LSTM and predict from its last-timestep output only.

        Stores the LSTM state returned by this call in ``self.hidden``.
        """
        # unpacking into three names rejects inputs that are not 3-D
        _n_samples, _n_steps, _n_inputs = x.size()

        relu = torch.nn.functional.relu
        sequence_out, self.hidden = self.l_lstm(x, self.hidden)

        # keep just the final timestep of every sample
        last_step = relu(sequence_out[:, -1])
        return self.l_linear2(relu(self.l_linear(last_step)))

When I run your example (which I needed to tweak a bit to get it to run), I get very similar training losses.

Keras:

38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259

PyTorch:

step: 199 train loss: 41.043 eval acc: 1142.688

I hope this helps others having a similar problem.

PS: Also note that Keras resets the hidden state by default (stateful=False).

Keras vs PyTorch LSTM different results

Tags:

python

keras

lstm

pytorch

Data

Keras

PyTorch - module class

PyTorch - init and train

Results

Tomas Trdla

1 Answers

KIC

Recent Activity

Donate For Us

Keras vs PyTorch LSTM different results

Tags:

python

keras

lstm

pytorch

Data

Keras

PyTorch - module class

PyTorch - init and train

Results

Tomas Trdla

1 Answers

KIC

Related questions

Recent Activity

Donate For Us