Difference between Batch Normalization and Self Normalized Neural Network with SELU

Question

I would like to know the difference between batch normalization and self normalized neural network. In other words, would SELU (Scaled Exponential Linear Unit) replace batch normalization and how?

Moreover, after looking at the values of the SELU activations, I found that they were in the range [-1, 1]. This is not the case with batch normalization: the values after the BN layer (before the ReLU activation) were approximately in the range [-a, a], not [-1, 1].

Here is how I printed the values after the SELU activation and after batch norm layer:

# Debug probe: tf.Print returns batch_norm_layer unchanged and logs the listed
# tensors when the op is evaluated. Note that the data is ordered (max, min)
# although the message text reads "min and max".
batch_norm_layer = tf.Print(batch_norm_layer,
                           data=[tf.reduce_max(batch_norm_layer), tf.reduce_min(batch_norm_layer)],
                           message = name_scope + ' min and max')

And similar code for the SELU activations...

Batch norm layer is defined as follows:

def batch_norm(x, n_out, phase_train, in_conv_layer = True):
    """Batch-normalize ``x`` with batch moments or their moving averages.

    Args:
        x: Tensor to normalize.
        n_out: Shape handed to ``tf.constant`` when creating ``beta`` and
            ``gamma`` (presumably the number of output channels -- TODO confirm).
        phase_train: Boolean predicate for ``tf.cond``. When true, the moments
            of the current batch are used and the moving averages are updated;
            otherwise the stored moving averages are used.
        in_conv_layer: When true, moments are reduced over axes ``[0, 1, 2]``;
            otherwise over axes ``[0, 1]``.

    Returns:
        The result of ``tf.nn.batch_normalization`` applied to ``x``.
    """

    with tf.variable_scope('bn'):
        # Learnable shift (beta, starts at 0) and scale (gamma, starts at 1).
        beta = tf.Variable(tf.constant(0.0, shape=n_out),
                                     name='beta', trainable=True)
        gamma = tf.Variable(tf.constant(1.0, shape=n_out),
                                      name='gamma', trainable=True)
        if in_conv_layer:
            batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
        else:
            # NOTE(review): for a rank-2 (batch, features) input, reducing over
            # [0, 1] yields one scalar mean/variance instead of one per feature;
            # [0] may have been intended -- confirm against the callers.
            batch_mean, batch_var = tf.nn.moments(x, [0, 1], name='moments')

        ema = tf.train.ExponentialMovingAverage(decay=0.9999)

        def mean_var_with_update():
            # Training branch: the control dependency forces the moving-average
            # update to run before the batch moments are handed back.
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        # Training -> batch moments (plus EMA update); otherwise -> EMA values.
        mean, var = tf.cond(phase_train,
                            mean_var_with_update,
                            lambda: (ema.average(batch_mean), ema.average(batch_var)))
        # 1e-3 is the variance epsilon passed to tf.nn.batch_normalization.
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
    return normed

Therefore, since batch norm outputs higher values, the loss increases dramatically, and thus I got nans.

In addition, I tried reducing the learning rate with batch norm, but that didn't help either. How can I fix this problem?

Here is the following code:

import tensorflow as tf
import numpy as np
import os
import cv2

# Number of examples per batch; used by tf.train.batch and the label reshape.
batch_size = 32
# Passed as num_epochs to the filename queue and used in the stop condition.
num_epoch = 102
# Width of the latent mu / stddev projections.
latent_dim = 100

def weight_variable(kernal_shape):
    """Get the float32 'weights' variable of the current variable scope.

    The variable has shape ``kernal_shape`` and is initialized from a
    truncated normal distribution with a standard deviation of 0.02.
    """
    initializer = tf.truncated_normal_initializer(stddev=0.02)
    return tf.get_variable(name='weights',
                           shape=kernal_shape,
                           dtype=tf.float32,
                           trainable=True,
                           initializer=initializer)

def bias_variable(shape):
    """Return a new ``tf.Variable`` of the given shape, initialized to zeros."""
    return tf.Variable(tf.constant(0.0, shape=shape))

def batch_norm(x, n_out, phase_train, convolutional = True):
    """Batch-normalize ``x`` and return the op that updates the moving averages.

    During training (``phase_train`` true) the statistics of the current batch
    are used for normalization; otherwise the exponential moving averages of
    those statistics are used. The original version had these two cases
    swapped, so training normalized with the moving averages instead of the
    batch statistics.

    Args:
        x: Tensor to normalize.
        n_out: Shape handed to ``tf.constant`` when creating ``beta`` and
            ``gamma`` (the call sites in this file pass a one-element list such
            as ``[32]``).
        phase_train: Boolean tensor used as the ``tf.cond`` predicate.
        convolutional: When true, moments are reduced over axes ``[0, 1, 2]``;
            otherwise over axis ``[0]`` only.

    Returns:
        A tuple ``(normed, update_moving_averages)``. The second element is the
        op returned by ``ExponentialMovingAverage.apply``; it is not run
        automatically, so the caller must run it (e.g. with ``sess.run``) on
        every training step.
    """
    with tf.variable_scope('bn'):
        exp_moving_avg = tf.train.ExponentialMovingAverage(decay=0.9999)

        # Learnable shift (beta, starts at 0) and scale (gamma, starts at 1).
        beta = tf.Variable(tf.constant(0.0, shape=n_out),
                           name='beta', trainable=True)
        gamma = tf.Variable(tf.constant(1.0, shape=n_out),
                            name='gamma', trainable=True)

        reduction_axes = [0, 1, 2] if convolutional else [0]
        batch_mean, batch_var = tf.nn.moments(x, reduction_axes, name='moments')

        update_moving_averages = exp_moving_avg.apply([batch_mean, batch_var])

        # Training -> current batch statistics; inference -> moving averages.
        m, v = tf.cond(
            phase_train,
            lambda: (batch_mean, batch_var),
            lambda: (exp_moving_avg.average(batch_mean),
                     exp_moving_avg.average(batch_var)))

        # 1e-3 is the variance epsilon passed to tf.nn.batch_normalization.
        normed = tf.nn.batch_normalization(x, m, v, beta, gamma, 1e-3)
        # Debug probe kept from the original: logs the shape of the output.
        normed = tf.Print(normed, data=[tf.shape(normed)], message='size of normed?')
    return normed, update_moving_averages   # Note that we should run the update_moving_averages with sess.run...

def conv_layer(x, w_shape, b_shape, padding='SAME'):
    """Stride-2 convolution, followed by batch norm and a ReLU.

    Args:
        x: Input tensor for ``tf.nn.conv2d``.
        w_shape: Shape of the convolution kernel.
        b_shape: Shape of the bias; also passed to ``batch_norm`` as ``n_out``.
        padding: Padding mode forwarded to ``tf.nn.conv2d``.

    Returns:
        A tuple ``(activations, update_moving_averages)`` where the second
        element is the update op returned by ``batch_norm``.
    """
    kernel = weight_variable(w_shape)
    tf.summary.histogram("weights", kernel)

    bias = bias_variable(b_shape)
    tf.summary.histogram("biases", bias)

    # A stride of 2 halves the spatial size, replacing a max-pool layer.
    pre_norm = tf.nn.conv2d(x, kernel, strides=[1, 2, 2, 1], padding=padding) + bias
    # batch_norm is always called with a constant-true phase_train here.
    normed, update_moving_averages = batch_norm(
        pre_norm, b_shape, phase_train=tf.cast(True, tf.bool))
    scope_name = tf.get_variable_scope().name

    # Debug probe: logs (max, min) of the normalized tensor for this scope.
    extremes = [tf.reduce_max(normed), tf.reduce_min(normed)]
    normed = tf.Print(normed, data=extremes, message=scope_name + ' min and max')

    relu_out = tf.nn.relu(normed)
    tf.summary.histogram("activations", relu_out)

    return relu_out, update_moving_averages

def deconv_layer(x, w_shape, b_shape, padding="SAME", activation='selu'):
    """Stride-2 transposed convolution, followed by batch norm and an activation.

    Args:
        x: Input tensor; its dimensions 1 and 2 are doubled in the output.
        w_shape: Kernel shape; ``w_shape[2]`` is used as the output channel count.
        b_shape: Shape of the bias; also passed to ``batch_norm`` as ``n_out``.
        padding: Padding mode forwarded to ``tf.nn.conv2d_transpose``.
        activation: ``'selu'`` selects ``tf.nn.relu`` (despite the name, no SELU
            is applied); any other value selects ``tf.nn.sigmoid``.

    Returns:
        A tuple ``(transposed_activations, update_moving_averages)`` where the
        second element is the update op returned by ``batch_norm``.
    """
    kernel = weight_variable(w_shape)
    tf.summary.histogram("weights", kernel)

    bias = bias_variable(b_shape)
    tf.summary.histogram('biases', bias)

    in_shape = tf.shape(x)
    out_shape = tf.stack([in_shape[0], in_shape[1] * 2, in_shape[2] * 2, w_shape[2]])

    # Both activation modes share the same transposed conv + batch norm.
    upsampled = tf.nn.conv2d_transpose(x, kernel, out_shape, [1, 2, 2, 1], padding=padding) + bias
    normed, update_moving_averages = batch_norm(
        upsampled, b_shape, phase_train=tf.cast(True, tf.bool))

    nonlinearity = tf.nn.relu if activation == 'selu' else tf.nn.sigmoid
    transposed_activations = nonlinearity(normed)

    tf.summary.histogram("transpose_activation", transposed_activations)
    return transposed_activations, update_moving_averages

# Queue-based input pipeline: read serialized examples from one TFRecord file,
# decode image + annotation, and assemble fixed-size batches.
tfrecords_filename_seq = ["C:/Users/user/PycharmProjects/AffectiveComputing/P16_db.tfrecords"]
# The file name is produced num_epoch times, in order (shuffle=False).
filename_queue = tf.train.string_input_producer(tfrecords_filename_seq, num_epochs=num_epoch, shuffle=False, name='queue')
reader = tf.TFRecordReader()

_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    # No defaults are given for any of the four keys parsed below.
    features={
        'height': tf.FixedLenFeature([], tf.int64),
        'width': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
        'annotation_raw': tf.FixedLenFeature([], tf.string)
    })

# Decode the raw bytes of one example into a flat uint8 tensor.
image = tf.decode_raw(features['image_raw'], tf.uint8)
# The stored height and width are needed to undo the flattening below.
height = tf.cast(features['height'], tf.int32)
width = tf.cast(features['width'], tf.int32)

# The image was flattened when it was stored as raw bytes, so it is reshaped
# back to [height, width, 3] here.
image = tf.reshape(image, [height, width, 3])

annotation = tf.cast(features['annotation_raw'], tf.string)

# min_after_dequeue only contributes to the queue capacity computed here.
min_after_dequeue = 100
num_threads = 1
capacity = min_after_dequeue + num_threads * batch_size
# Every image is declared as [112, 112, 3]; annotations are scalar strings.
label_batch, images_batch = tf.train.batch([annotation, image],
                                           shapes=[[], [112, 112, 3]],
                                           batch_size=batch_size,
                                           capacity=capacity,
                                           num_threads=num_threads)

# Each annotation is a comma-separated string; split it, regroup the pieces
# into one row per example, and convert them to floats.
label_batch_splitted = tf.string_split(label_batch, delimiter=',')
label_batch_values = tf.reshape(label_batch_splitted.values, [batch_size, -1])
label_batch_numbers = tf.string_to_number(label_batch_values, out_type=tf.float32)
# Column 2 of every row is kept as the per-example confidence (one column wide).
confidences = tf.slice(label_batch_numbers, begin=[0, 2], size=[-1, 1])

# Wrapping the batch in a list adds a leading axis during the cast; indexing
# with [0] removes that axis again.
images_batch = tf.cast([images_batch], tf.float32)[0]  # Note that casting the image will increases its rank.

# Standardize every image of the batch independently.
with tf.name_scope('image_normal'):
    images_batch = tf.map_fn(lambda img: tf.image.per_image_standardization(img), images_batch)
    #images_batch = tf.Print(images_batch, data=[tf.reduce_max(images_batch), tf.reduce_min(images_batch)],
    #                        message='min and max in images_batch')
# Encoder: four stride-2 conv layers; each uma_* is that layer's
# moving-average update op returned by conv_layer.
with tf.variable_scope('conv1'):
    conv1, uma_conv1 = conv_layer(images_batch, [4, 4, 3, 32], [32])      # image size: [56, 56]
with tf.variable_scope('conv2'):
    conv2, uma_conv2 = conv_layer(conv1, [4, 4, 32, 64], [64])     # image size: [28, 28]
with tf.variable_scope('conv3'):
    conv3, uma_conv3 = conv_layer(conv2, [4, 4, 64, 128], [128])   # image size: [14, 14]
with tf.variable_scope('conv4'):
    conv4, uma_conv4 = conv_layer(conv3, [4, 4, 128, 256], [256])  # image size: [7, 7]
    # Flatten the feature maps for the two dense projections below.
    conv4_reshaped = tf.reshape(conv4, [-1, 7 * 7 * 256], name='conv4_reshaped')

# Dense projections from the flattened features to the latent parameters.
w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu')
b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu')
w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig')
b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig')
# NOTE(review): the noise has shape [1, latent_dim], so one noise vector is
# broadcast over the whole batch -- confirm that per-example noise is not wanted.
epsilon = tf.random_normal([1, latent_dim])

tf.summary.histogram('weights_c_mu', w_c_mu)
tf.summary.histogram('biases_c_mu', b_c_mu)
tf.summary.histogram('weights_c_sig', w_c_sig)
tf.summary.histogram('biases_c_sig', b_c_sig)

with tf.variable_scope('mu'):
    mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu)
    tf.summary.histogram('mu', mu)

with tf.variable_scope('stddev'):
    stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig)
    tf.summary.histogram('stddev', stddev)

with tf.variable_scope('z'):
    # Reparameterization: the scale is sqrt(exp(stddev)), i.e. `stddev` is
    # consumed here as a log-variance despite its name.
    latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon)
    # NOTE(review): this logs `stddev` again, not latent_var -- confirm intent.
    tf.summary.histogram('features_sig', stddev)

# Dense projection from the latent vector back to a 7x7x256 feature volume.
w_dc = tf.Variable(tf.truncated_normal([latent_dim, 7 * 7 * 256], stddev=0.1), name='weights_dc')
b_dc = tf.Variable(tf.constant(0.0, shape=[7 * 7 * 256]), name='biases_dc')
tf.summary.histogram('weights_dc', w_dc)
tf.summary.histogram('biases_dc', b_dc)

with tf.variable_scope('deconv4'):
    deconv4 = tf.nn.bias_add(tf.matmul(latent_var, w_dc), b_dc)
    # convolutional=False makes batch_norm reduce over the batch axis only.
    deconv4_batch_norm, uma_deconv4 = \
        batch_norm(deconv4, [7 * 7 * 256], phase_train=tf.cast(True, tf.bool), convolutional=False)

    deconv4 = tf.nn.relu(deconv4_batch_norm)
    deconv4_reshaped = tf.reshape(deconv4, [-1, 7, 7, 256], name='deconv4_reshaped')

# Decoder: four stride-2 transposed conv layers. In deconv_layer the 'selu'
# option applies tf.nn.relu; 'sigmoid' falls into its sigmoid branch.
with tf.variable_scope('deconv3'):
    deconv3, uma_deconv3 = deconv_layer(deconv4_reshaped, [3, 3, 128, 256], [128], activation='selu')
with tf.variable_scope('deconv2'):
    deconv2, uma_deconv2 = deconv_layer(deconv3, [3, 3, 64, 128], [64], activation='selu')
with tf.variable_scope('deconv1'):
    deconv1, uma_deconv1 = deconv_layer(deconv2, [3, 3, 32, 64], [32], activation='selu')
with tf.variable_scope('deconv_image'):
    deconv_image_batch, uma_deconv = deconv_layer(deconv1, [3, 3, 3, 32], [3], activation='sigmoid')

# Loss: confidence-weighted reconstruction term plus confidence-weighted KL term.
with tf.name_scope('loss_likelihood'):
    # temp1 shape: [batch_size, 112, 112, 3]
    # Per-pixel Bernoulli log-likelihood; 1e-9 keeps tf.log away from log(0).
    # NOTE(review): images_batch was passed through per_image_standardization,
    # so its values are not confined to [0, 1] as this cross-entropy form
    # presumes -- confirm the intended target range.
    temp1 = images_batch * tf.log(deconv_image_batch + 1e-9) + (1 - images_batch) * tf.log(1 - deconv_image_batch + 1e-9)

    # confidences is [batch_size, 1]; expand it to [batch_size, 1, 1, 1] so it
    # broadcasts over height, width and channels.
    confidences_ = tf.expand_dims(tf.expand_dims(confidences, axis=1), axis=1)
    temp1 = temp1 * confidences_
    # Negative log-likelihood per example, then summed over the batch.
    log_likelihood = -tf.reduce_sum(temp1, reduction_indices=[1, 2, 3])
    log_likelihood_total = tf.reduce_sum(log_likelihood)
    #l2_loss = tf.reduce_mean(tf.abs(tf.subtract(images_batch, deconv_image_batch)))

with tf.name_scope('loss_KL'):
    # temp2 shape: [batch_size, latent_dim]
    # The sampling step uses sqrt(exp(stddev)) as the scale, i.e. `stddev`
    # holds the log-variance. The closed-form KL divergence to N(0, I) must use
    # the same parameterization: 1 + log(var) - mu^2 - var with var = exp(stddev).
    # The previous form, log(square(stddev + 1e-9)) - square(stddev), treated
    # `stddev` as a standard deviation and diverged whenever it approached 0.
    temp2 = 1 + stddev - tf.square(mu) - tf.exp(stddev)
    temp3 = temp2 * confidences     # confidences shape is [batch_size, 1]
    KL_term = - 0.5 * tf.reduce_sum(temp3, reduction_indices=1)
    KL_term_total = tf.reduce_sum(KL_term)

with tf.name_scope('total_loss'):
    # Mean over the batch of (reconstruction NLL + KL); this value is minimized.
    variational_lower_bound = tf.reduce_mean(log_likelihood + KL_term)
    tf.summary.scalar('loss', variational_lower_bound)
with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(0.00001).minimize(variational_lower_bound)

# Local variables are initialized too: the filename queue's epoch counter
# (num_epochs) lives in a local variable.
init_op = tf.group(tf.local_variables_initializer(),
                   tf.global_variables_initializer())

saver = tf.train.Saver()

model_path = 'C:/Users/user/PycharmProjects/VariationalAutoEncoder/' \
             'VariationalAutoEncoderFaces/tensorboard_logs/Graph_model/ckpt'

# Training session: restore a checkpoint if one exists, train until the stop
# condition or until the input queue is exhausted, then save and clean up.
with tf.Session() as sess:

    train_writer = tf.summary.FileWriter('C:/Users/user/PycharmProjects/VariationalAutoEncoder/'
                                         'VariationalAutoEncoderFaces/tensorboard_logs/Event_files', sess.graph)

    merged = tf.summary.merge_all()

    # init_op must run before the Coordinator and the queue-runner threads
    # start, otherwise an error is raised.
    sess.run(init_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    step = 0

    # The nine uma_* ops update the batch-norm moving averages; they and the
    # optimizer are run for their side effects, the last four are fetched.
    to_run_list = [uma_conv1, uma_conv2, uma_conv3, uma_conv4, uma_deconv1, uma_deconv2, uma_deconv3,
                   uma_deconv4, uma_deconv, optimizer, variational_lower_bound, merged,
                   deconv_image_batch, image]

    # The checkpoint state is looked up in the directory that contains
    # model_path (".../Graph_model").
    ckpt = tf.train.get_checkpoint_state(
        os.path.dirname(model_path))
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        # The message used to say "saved" although a restore just happened.
        print('checkpoints are restored!!!')
    else:
        print('No stored checkpoints')
    epoch = 0
    try:
        while not coord.should_stop():
            try:
                results = sess.run(to_run_list)
            except tf.errors.OutOfRangeError:
                # The filename queue has served all num_epoch epochs. Leave the
                # loop normally so that the model is still saved below.
                print('Input queue exhausted; stopping training.')
                break

            # Only the last four fetches carry values that are used here.
            loss, summary, reconstructed_image, original_image = results[-4:]

            print('total loss:', loss)

            # NOTE(review): `image` is fetched straight from the reader, so it is
            # presumably a different record than reconstructed_image[0] -- confirm.
            original_image = cv2.cvtColor(np.array(original_image), cv2.COLOR_RGB2BGR)
            reconstructed_image = cv2.cvtColor(np.array(reconstructed_image[0]), cv2.COLOR_RGB2BGR)

            cv2.imshow('original_image', original_image)
            cv2.imshow('reconstructed_image', reconstructed_image)
            cv2.waitKey(1)
            # Every 234 steps is counted as one epoch.
            if step % 234 == 0:
                epoch += 1
                print('epoch:', epoch)
                if epoch == num_epoch - 2:
                    coord.request_stop()

            if step % 100 == 0:
                train_writer.add_summary(summary, step)
                #print('total loss:', loss)
                #print('log_likelihood_', log_likelihood_)
                #print('KL_term', KL_term_)
            step += 1

        save_path = saver.save(sess, model_path)
    finally:
        # Always stop the queue runners and close the event file, even when
        # training ends with an exception.
        coord.request_stop()
        coord.join(threads)
        train_writer.close()

Any help is much appreciated!!

Maosi Chen · Accepted Answer

Here is some sample code that shows the trend of the means and variances over 3 SELU layers. The numbers of nodes in the layers (including the input layer) are [15, 30, 30, 8].

import tensorflow as tf
import numpy as np
import os

#-----------------------------------------------#
# https://github.com/bioinf-jku/SNNs/blob/master/selu.py
# The SELU activation function
def selu(x):
    """Apply the Scaled Exponential Linear Unit (SELU) element-wise.

    Returns ``scale * x`` where ``x >= 0`` and ``scale * alpha * tf.nn.elu(x)``
    elsewhere, using the fixed ``alpha`` and ``scale`` constants below.

    The name scope is opened with ``tf.name_scope`` because this script only
    imports ``tensorflow as tf``; the ``ops`` module used before was never
    imported and raised a NameError.
    """
    with tf.name_scope('elu'):
        alpha = 1.6732632423543772848170429916717
        scale = 1.0507009873554804934193349852946
        return scale*tf.where(x>=0.0, x, alpha*tf.nn.elu(x))

#-----------------------------------------------#
# https://github.com/bioinf-jku/SNNs/blob/master/selu.py
# alpha-dropout
def dropout_selu(x, rate, alpha= -1.7580993408473766, fixedPointMean=0.0, fixedPointVar=1.0,
                 noise_shape=None, seed=None, name=None, training=False):
    """Dropout to a value with rescaling (alpha-dropout for SELU networks).

    Dropped units are set to ``alpha`` instead of 0, and the result is then
    passed through the affine map ``a * ret + b`` computed from
    ``fixedPointMean`` and ``fixedPointVar``.

    Args:
        x: Input tensor.
        rate: Drop probability; the keep probability is ``1.0 - rate``.
        alpha: Value assigned to dropped units. The default equals
            ``-scale * alpha`` of the ``selu`` constants.
        fixedPointMean: Mean used when computing the affine correction.
        fixedPointVar: Variance used when computing the affine correction.
        noise_shape: Shape of the random keep/drop mask; defaults to the shape
            of ``x``.
        seed: Seed forwarded to the uniform random op.
        name: Optional name for the enclosing name scope.
        training: When true, dropout is applied; otherwise ``x`` is returned
            through an identity op.

    Returns:
        A tensor with the same static shape as ``x``.

    Raises:
        ValueError: If the keep probability is a real number outside ``(0, 1]``.
    """
    # Imported here because the surrounding script only imports tensorflow,
    # numpy and os; without these names the function raised a NameError.
    import numbers

    from tensorflow.python.framework import ops
    from tensorflow.python.framework import tensor_shape
    from tensorflow.python.framework import tensor_util
    from tensorflow.python.layers import utils
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import math_ops
    from tensorflow.python.ops import random_ops

    def dropout_selu_impl(x, rate, alpha, noise_shape, seed, name):
        keep_prob = 1.0 - rate
        x = ops.convert_to_tensor(x, name="x")
        if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
            raise ValueError("keep_prob must be a scalar tensor or a float in the "
                                             "range (0, 1], got %g" % keep_prob)
        keep_prob = ops.convert_to_tensor(keep_prob, dtype=x.dtype, name="keep_prob")
        keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())

        alpha = ops.convert_to_tensor(alpha, dtype=x.dtype, name="alpha")
        alpha.get_shape().assert_is_compatible_with(tensor_shape.scalar())

        # Nothing is dropped when the keep probability is statically 1.
        if tensor_util.constant_value(keep_prob) == 1:
            return x

        noise_shape = noise_shape if noise_shape is not None else array_ops.shape(x)
        # keep_prob + U[0, 1), floored, is 1 with probability keep_prob, else 0.
        random_tensor = keep_prob
        random_tensor += random_ops.random_uniform(noise_shape, seed=seed, dtype=x.dtype)
        binary_tensor = math_ops.floor(random_tensor)
        # Kept units pass through; dropped units take the value alpha.
        ret = x * binary_tensor + alpha * (1-binary_tensor)

        # Affine correction (a, b) computed from the fixed-point mean/variance.
        a = math_ops.sqrt(fixedPointVar / (keep_prob *((1-keep_prob) * math_ops.pow(alpha-fixedPointMean,2) + fixedPointVar)))

        b = fixedPointMean - a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha)
        ret = a * ret + b
        ret.set_shape(x.get_shape())
        return ret

    with ops.name_scope(name, "dropout", [x]) as name:
        return utils.smart_cond(training,
            lambda: dropout_selu_impl(x, rate, alpha, noise_shape, seed, name),
            lambda: array_ops.identity(x))

#-----------------------------------------------#
# build a 3-layer dense network with SELU activation and alpha-dropout
sess = tf.InteractiveSession()


def _dense_params(fan_in, fan_out):
    """Draw constant weights (std sqrt(1/fan_in)) and biases (std 0.5)."""
    weights = tf.constant(np.random.normal(loc=0.0, scale=np.sqrt(1.0/float(fan_in)),
                                           size=[fan_in, fan_out]))
    biases = tf.constant(np.random.normal(loc=0.0, scale=0.5, size=[fan_out]))
    return weights, biases


# Layer 1: 15 -> 30, fed with 200 standard-normal samples.
w1, b1 = _dense_params(15, 30)
x1 = tf.constant(np.random.normal(loc=0.0, scale=1.0, size=[200, 15]))
y1 = tf.add(tf.matmul(x1, w1), b1)
y1_selu = selu(y1)
y1_selu_dropout = dropout_selu(y1_selu, 0.05, training=True)

# Layer 2: 30 -> 30, fed with the output of layer 1.
w2, b2 = _dense_params(30, 30)
x2 = y1_selu_dropout
y2 = tf.add(tf.matmul(x2, w2), b2)
y2_selu = selu(y2)
y2_selu_dropout = dropout_selu(y2_selu, 0.05, training=True)

# Layer 3: 30 -> 8, fed with the output of layer 2.
w3, b3 = _dense_params(30, 8)
x3 = y2_selu_dropout
y3 = tf.add(tf.matmul(x3, w3), b3)
y3_selu = selu(y3)
y3_selu_dropout = dropout_selu(y3_selu, 0.05, training=True)

# Evaluate every layer's input and output in a single run.
(x1_v, y1_selu_dropout_v,
 x2_v, y2_selu_dropout_v,
 x3_v, y3_selu_dropout_v) = sess.run(
    [x1, y1_selu_dropout, x2, y2_selu_dropout, x3, y3_selu_dropout])

# Per layer: print the mean and standard deviation of the input, then of the output.
layer_report = (
    ("Layer 1", x1_v, y1_selu_dropout_v),
    ("Layer 2", x2_v, y2_selu_dropout_v),
    ("Layer 3", x3_v, y3_selu_dropout_v),
)
for layer_label, layer_in, layer_out in layer_report:
    print(layer_label)
    print(np.mean(layer_in), np.std(layer_in))
    print(np.mean(layer_out), np.std(layer_out))

Here is one possible output. Over 3 layers, the mean and standard deviation are still close to 0 and 1, respectively.

Layer 1
-0.0101213033749 1.01375071842
0.0106228883975 1.09375593322
Layer 2
0.0106228883975 1.09375593322
-0.027910206754 1.12216643393
Layer 3
-0.027910206754 1.12216643393
-0.131790078631 1.09698413493

Difference between Batch Normalization and Self Normalized Neural Network with SELU

Tags:

neural-network

batch-normalization

I. A

1 Answers

Maosi Chen

Recent Activity

Donate For Us

Difference between Batch Normalization and Self Normalized Neural Network with SELU

Tags:

neural-network

batch-normalization

I. A

1 Answers

Maosi Chen

Related questions

Recent Activity

Donate For Us