The same model converges in Keras but not in TensorFlow; how is that possible?

I'm trying to work with LSTMs in TensorFlow, but I've gotten to the point where I can't get even a simple IMDB sentiment model to converge.

I took a Keras model and tried to duplicate the exact same model in TensorFlow. In Keras it trains and converges, but in TensorFlow it just gets stuck at a loss of about 0.69 (roughly ln 2, i.e. the loss of a coin-flip prediction over two classes).

I tried to make them as equal as possible. The only difference I can tell is the padding: in Keras the padding comes before the sequence ('pre'), while in TensorFlow I use 'post' padding, following TensorFlow conventions.
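For reference, this is the padding difference I mean (a tiny standalone illustration, not part of my model):

from tensorflow.contrib.keras.python.keras.preprocessing import sequence

seqs = [[5, 6, 7]]
# 'pre' puts the zeros before the tokens, 'post' puts them after
print(sequence.pad_sequences(seqs, maxlen=5, padding='pre'))   # [[0 0 5 6 7]]
print(sequence.pad_sequences(seqs, maxlen=5, padding='post'))  # [[5 6 7 0 0]]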

Any idea what's wrong with my TensorFlow model?

from __future__ import print_function

import random
import numpy as np

from tensorflow.contrib.keras.python.keras.preprocessing import sequence
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense, Dropout, Activation
from tensorflow.contrib.keras.python.keras.layers import Embedding
from tensorflow.contrib.keras.python.keras.layers import LSTM
from tensorflow.contrib.keras.python.keras.layers import Conv1D, MaxPooling1D
from tensorflow.contrib.keras.python.keras.datasets import imdb

import tensorflow as tf

# Embedding
max_features = 30000
maxlen = 2494
embedding_size = 128

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 2


class TrainData:
    def __init__(self, batch_sz=batch_size):
        (x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)


        # One-hot encode the labels: label 1 becomes [1, 0], anything else [0, 1].
        y_train = [[int(x == 1), int(x != 1)] for x in y_train]
        self._batch_size = batch_sz

        self._train_data = sequence.pad_sequences(x_train, padding='pre')

        self._train_labels = y_train

    def next_batch(self):
        if len(self._train_data) < self._batch_size:
            self.__init__()

        batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
        self._train_data = self._train_data[self._batch_size:]
        self._train_labels = self._train_labels[self._batch_size:]

        return batch_x, batch_y

    def batch_generator(self):
        while True:
            if len(self._train_data) < self._batch_size:
                self.__init__()

            batch_x, batch_y = self._train_data[:self._batch_size], self._train_labels[:self._batch_size]
            self._train_data = self._train_data[self._batch_size:]
            self._train_labels = self._train_labels[self._batch_size:]

            yield batch_x, batch_y

    def get_num_batches(self):
        return int(len(self._train_data) / self._batch_size)

def length(sequence):
    # Recover each example's true length by counting its nonzero token ids.
    used = tf.sign(tf.abs(sequence))
    length = tf.reduce_sum(used, axis=1)
    length = tf.cast(length, tf.int32)
    return length


def get_model(x, y):
    embedding = tf.get_variable("embedding", [max_features, embedding_size], dtype=tf.float32)
    embedded_x = tf.nn.embedding_lookup(embedding, x)
    print(x)
    print(embedded_x)
    print(length(x))

    cell_1 = tf.contrib.rnn.BasicLSTMCell(lstm_output_size)
    # dynamic_rnn steps each example only up to its sequence_length;
    # outputs past that point are zero vectors.
    output_1, state_1 = tf.nn.dynamic_rnn(cell_1, embedded_x, dtype=tf.float32, scope="rnn_layer1",
                                          sequence_length=length(x))

    # Select last output.
    last_index = tf.shape(output_1)[1] - 1
    # reshaping to [seq_length, batch_size, num_units]
    output = tf.transpose(output_1, [1, 0, 2])

    last = tf.gather(output, last_index)

    # Softmax layer
    with tf.name_scope('fc_layer'):
        weight = tf.get_variable(name="weights", shape=[lstm_output_size, 2])
        bias = tf.get_variable(shape=[2], name="bias")

    logits = tf.matmul(last, weight) + bias

    loss = tf.losses.softmax_cross_entropy(y, logits=logits)

    optimizer = tf.train.AdamOptimizer()
    optimize_step = optimizer.minimize(loss=loss)

    return loss, optimize_step


def tf_model():
    x_holder = tf.placeholder(tf.int32, shape=[None, maxlen])
    y_holder = tf.placeholder(tf.int32, shape=[None, 2])
    loss, opt_step = get_model(x_holder, y_holder)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        step = 0
        for epoch in range(10):
            cost_epochs = []
            train_data = TrainData()
            cost_batch = 0
            for batch in range(train_data.get_num_batches()):
                x_train, y_train = train_data.next_batch()
                _, cost_batch = sess.run([opt_step, loss],
                                         feed_dict={x_holder: x_train,
                                                    y_holder: y_train})

                cost_epochs.append(cost_batch)


                step += 1
                # if step % 100 == 0:
                print("Epoch: " + str(epoch))
                print("\tcost: " + str(np.mean(cost_epochs)))



def keras_model():
    # print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

    y_test = [[int(x == 1), int(x != 1)] for x in y_test]

    x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='pre')

    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))

    model.add(LSTM(lstm_output_size))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    data = TrainData()
    model.fit_generator(data.batch_generator(), steps_per_epoch=data.get_num_batches(),
                        epochs=epochs,
                        validation_data=(x_test, y_test))


if __name__ == '__main__':
    # keras_model()
    tf_model()

EDIT

When I limit the sequence length to 100, both models converge, so I assume there is something different in the LSTM layer.
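For completeness, this is roughly how I capped the length for that experiment (a sketch; short_maxlen is just an illustrative name, and pad_sequences truncates as well as pads when maxlen is set):

from tensorflow.contrib.keras.python.keras.preprocessing import sequence
from tensorflow.contrib.keras.python.keras.datasets import imdb

max_features = 30000
short_maxlen = 100

(x_train, y_train), (_, _) = imdb.load_data(num_words=max_features)
# With maxlen set, short reviews are padded and long ones truncated to 100 tokens.
x_train = sequence.pad_sequences(x_train, maxlen=short_maxlen, padding='pre')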


1 Answer

Check the initial values of your operations. In my case, the Adadelta optimizer in Keras had an initial learning rate of 1.0, while in tf.keras it was 0.001, so on the MNIST dataset the model converged much more slowly.
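To rule this out, pin the learning rate explicitly on both sides instead of relying on the defaults (a minimal sketch, assuming a TensorFlow version that ships tf.keras; Adadelta is just the optimizer from my case):

import tensorflow as tf
from tensorflow import keras

lr = 1.0  # Keras' Adadelta default; tf.train.AdadeltaOptimizer defaults to 0.001

# Keras / tf.keras side:
keras_opt = keras.optimizers.Adadelta(lr=lr)

# Low-level TensorFlow side:
tf_opt = tf.train.AdadeltaOptimizer(learning_rate=lr)

The same check applies to the Adam optimizer used in the question, although there Keras and tf.train happen to share the same 0.001 default.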
