Failing to implement layer normalization with Keras

I am trying to implement layer normalization in a fully connected neural network with Keras. The problem I've run into is that the loss is always NaN and the network doesn't learn. Here is my code:

# imports assumed for this Keras 1.x-style snippet
import numpy as np
from keras import activations, initializations, regularizers, constraints
from keras import backend as K
from keras.engine import Layer, InputSpec
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adadelta, Adagrad

class DenseLN(Layer):
    def __init__(self, output_dim, init='glorot_uniform', activation='linear', weights=None,
                 W_regularizer=None, b_regularizer=None, activity_regularizer=None,
                 W_constraint=None, b_constraint=None, bias=True, input_dim=None, **kwargs):
        self.init = initializations.get(init)
        self.activation = activations.get(activation)
        self.output_dim = output_dim
        self.input_dim = input_dim
        self.epsilon = 1e-5        

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=2)]

        if self.input_dim:
            kwargs['input_shape'] = (self.input_dim,)
        super(DenseLN, self).__init__(**kwargs)

    def ln(self, x):
        # layer normalization function
        m = K.mean(x, axis=0)
        std = K.sqrt(K.var(x, axis=0) + self.epsilon)
        x_normed = (x - m) / (std + self.epsilon)
        x_normed = self.gamma * x_normed + self.beta
        return x_normed

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = [InputSpec(dtype=K.floatx(),
                                     shape=(None, input_dim))]

        self.gamma = K.variable(np.ones(self.output_dim) * 0.2, name='{}_gamma'.format(self.name)) 
        self.beta = K.zeros((self.output_dim,), name='{}_beta'.format(self.name))

        self.W = self.init((input_dim, self.output_dim),
                           name='{}_W'.format(self.name))
        if self.bias:
            self.b = K.zeros((self.output_dim,),
                             name='{}_b'.format(self.name))
            self.trainable_weights = [self.W, self.gamma, self.beta, self.b]
        else:
            self.trainable_weights = [self.W, self.gamma, self.beta]
        self.regularizers = []
        if self.W_regularizer:
            self.W_regularizer.set_param(self.W)
            self.regularizers.append(self.W_regularizer)

        if self.bias and self.b_regularizer:
            self.b_regularizer.set_param(self.b)
            self.regularizers.append(self.b_regularizer)

        if self.activity_regularizer:
            self.activity_regularizer.set_layer(self)
            self.regularizers.append(self.activity_regularizer)

        self.constraints = {}
        if self.W_constraint:
            self.constraints[self.W] = self.W_constraint
        if self.bias and self.b_constraint:
            self.constraints[self.b] = self.b_constraint

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def call(self, x, mask=None):
        output = K.dot(x, self.W)
        output = self.ln(output)
        #print (theano.tensor.shape(output))
        if self.bias:
            output += self.b
        return self.activation(output)

    def get_output_shape_for(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return (input_shape[0], self.output_dim)

model = Sequential()
model.add(Dense(12, activation='sigmoid', input_dim=12))
model.add(DenseLN(98, activation='sigmoid'))
model.add(DenseLN(108, activation='sigmoid'))
model.add(DenseLN(1))
adadelta = Adadelta(lr=0.1, rho=0.95, epsilon=1e-08)
adagrad = Adagrad(lr=0.003, epsilon=1e-08)

model.compile(loss='poisson',
              optimizer=adagrad,
              metrics=['accuracy'])

model.fit(X_train_scale,
          Y_train,
          batch_size=3000,
          callbacks=[history],
          nb_epoch=300)

Do you know what's wrong here and how I can fix it? Thanks in advance!

EDIT:

I have also tried some combinations of the layers and found something weird. If the input and output layers are both normal Dense layers, the accuracy is very low, nearly zero. But if the input layer is DenseLN, i.e. my customized layer, the accuracy starts around 0.6+ and after tens of iterations drops to zero again. Indeed, I copied most of the code from the Dense layer; the only differences are the ln function and the self.ln(output) call in the call function. Besides that, I have also added gamma and beta to trainable_weights.

Any help is appreciated!

asked Oct 29 '22 by user5779223

1 Answer

It's a lot cleaner and more flexible if you implement it as a separate layer. Something like this should work:

from keras import backend as K
from keras import initializers
from keras.layers import Layer   # older Keras 2: from keras.engine.topology import Layer

class LayerNorm(Layer):
    """ Layer Normalization in the style of https://arxiv.org/abs/1607.06450 """
    def __init__(self, scale_initializer='ones', bias_initializer='zeros', **kwargs):
        super(LayerNorm, self).__init__(**kwargs)
        self.epsilon = 1e-6
        self.scale_initializer = initializers.get(scale_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

    def build(self, input_shape):
        # one learnable gain and bias per feature
        self.scale = self.add_weight(shape=(input_shape[-1],),
                                     initializer=self.scale_initializer,
                                     trainable=True,
                                     name='{}_scale'.format(self.name))
        self.bias = self.add_weight(shape=(input_shape[-1],),
                                    initializer=self.bias_initializer,
                                    trainable=True,
                                    name='{}_bias'.format(self.name))
        self.built = True

    def call(self, x, mask=None):
        # normalize each sample over its feature axis, not over the batch
        mean = K.mean(x, axis=-1, keepdims=True)
        std = K.std(x, axis=-1, keepdims=True)
        norm = (x - mean) / (std + self.epsilon)
        return norm * self.scale + self.bias

    def compute_output_shape(self, input_shape):
        return input_shape
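
Normalizing over axis=-1 (each sample's features) instead of axis=0 (the batch), as the question's ln function does, is the key difference, and it is what makes the statistics independent of the batch. As a usage sketch (my assumption, not part of the original answer; layer sizes copied from the question), you can keep plain Dense layers, leave them linear, and apply LayerNorm to the pre-activation outputs as in the paper:

# hypothetical wiring sketch; LayerNorm is the class defined above
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(98, input_dim=12))
model.add(LayerNorm())
model.add(Activation('sigmoid'))
model.add(Dense(108))
model.add(LayerNorm())
model.add(Activation('sigmoid'))
model.add(Dense(1))
model.compile(loss='poisson', optimizer='adagrad', metrics=['accuracy'])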

answered Nov 15 '22 by Alexander Measure