I have tried several versions of batch normalization in TensorFlow, but none of them worked: the results were incorrect whenever I set batch_size = 1 at inference time.
Version 1: directly use the official version in tensorflow.contrib
from tensorflow.contrib.layers.python.layers.layers import batch_norm
use like this:
output = lrelu(batch_norm(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
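Note that with this layer's default updates_collections (tf.GraphKeys.UPDATE_OPS), the moving mean and variance are only updated when the collected update ops are actually run. A minimal sketch of the usual training-loop wiring, where loss and the optimizer are stand-ins for whatever the model defines:
# Sketch only: run the batch-norm update ops together with the training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)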
Version 2: from the answer to "How could I use Batch Normalization in TensorFlow?"
def batch_norm_layer(x, train_phase, scope_bn='bn'):
    bn_train = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
                          updates_collections=None,
                          is_training=True,
                          reuse=None,  # is this right?
                          trainable=True,
                          scope=scope_bn)
    bn_inference = batch_norm(x, decay=0.999, epsilon=1e-3, center=True, scale=True,
                              updates_collections=None,
                              is_training=False,
                              reuse=True,  # is this right?
                              trainable=True,
                              scope=scope_bn)
    z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
    return z
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training is a placeholder that is True at training time and False at inference time.
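For completeness, a minimal sketch of how such a placeholder is typically created and fed; the tensor names here are illustrative:
is_training = tf.placeholder(tf.bool, name='is_training')
# ... build the network with batch_norm_layer(..., is_training) ...
# sess.run(train_op, feed_dict={..., is_training: True})    # training
# sess.run(output, feed_dict={..., is_training: False})     # inference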
Version 3: from slim, https://github.com/tensorflow/models/blob/master/inception/inception/slim/ops.py
def batch_norm_layer(inputs,
                     is_training=True,
                     scope='bn'):
    decay = 0.999
    epsilon = 0.001
    inputs_shape = inputs.get_shape()
    with tf.variable_scope(scope) as t_scope:
        axis = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]
        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        beta = tf.Variable(tf.zeros_initializer(params_shape),
                           name='beta',
                           trainable=True)
        gamma = tf.Variable(tf.ones_initializer(params_shape),
                            name='gamma',
                            trainable=True)
        moving_mean = tf.Variable(tf.zeros_initializer(params_shape),
                                  name='moving_mean',
                                  trainable=False)
        moving_variance = tf.Variable(tf.ones_initializer(params_shape),
                                      name='moving_variance',
                                      trainable=False)
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = tf.nn.moments(inputs, axis)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
        else:
            # Just use the moving_mean and moving_variance.
            mean = moving_mean
            variance = moving_variance
        # Normalize the activations.
        outputs = tf.nn.batch_normalization(
            inputs, mean, variance, beta, gamma, epsilon)
        outputs.set_shape(inputs.get_shape())
        return outputs
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
Version 4: like version 3, but wrapped in tf.control_dependencies so that the moving-average update ops actually run
def batch_norm_layer(inputs,
                     decay=0.999,
                     center=True,
                     scale=True,
                     epsilon=0.001,
                     moving_vars='moving_vars',
                     activation=None,
                     is_training=True,
                     trainable=True,
                     restore=True,
                     scope='bn',
                     reuse=None):
    inputs_shape = inputs.get_shape()
    with tf.variable_op_scope([inputs], scope, 'BatchNorm', reuse=reuse):
        axis = list(range(len(inputs_shape) - 1))
        params_shape = inputs_shape[-1:]
        # Allocate parameters for the beta and gamma of the normalization.
        beta = tf.Variable(tf.zeros(params_shape), name='beta')
        gamma = tf.Variable(tf.ones(params_shape), name='gamma')
        # Create moving_mean and moving_variance and add them to the
        # GraphKeys.MOVING_AVERAGE_VARIABLES collection.
        moving_mean = tf.Variable(tf.zeros(params_shape), name='moving_mean',
                                  trainable=False)
        moving_variance = tf.Variable(tf.ones(params_shape), name='moving_variance',
                                      trainable=False)
        control_inputs = []
        if is_training:
            # Calculate the moments based on the individual batch.
            mean, variance = tf.nn.moments(inputs, axis)
            update_moving_mean = moving_averages.assign_moving_average(
                moving_mean, mean, decay)
            update_moving_variance = moving_averages.assign_moving_average(
                moving_variance, variance, decay)
            control_inputs = [update_moving_mean, update_moving_variance]
        else:
            # Just use the moving_mean and moving_variance.
            mean = moving_mean
            variance = moving_variance
        # Normalize the activations.
        with tf.control_dependencies(control_inputs):
            return tf.nn.batch_normalization(
                inputs, mean, variance, beta, gamma, epsilon)
use like this:
output = lrelu(batch_norm_layer(tf.nn.bias_add(conv, biases), is_training), 0.5, name=scope.name)
is_training = True at training time and False at inference time.
None of these four versions of batch normalization is correct. So, how do I use batch normalization correctly?
Another strange phenomenon is that if I reduce batch_norm_layer to an identity function like this, the inference results are all the same.
def batch_norm_layer(inputs, is_training):
    return inputs
Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. Importantly, batch normalization works differently during training and during inference.
Batch normalization (BN) is a technique to normalize activations in the intermediate layers of deep neural networks. Its tendency to improve accuracy and speed up training has established BN as a favorite technique in deep learning.
In practice, batch normalization is added either before or after a layer's activation function; many practitioners report good results when it is placed after the activation.
Batch normalization can significantly speed up the training of a neural network and lead to higher performance.
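To make the train/inference difference concrete, here is a small NumPy-only sketch of the two modes (gamma = 1 and beta = 0 for brevity); it also shows why relying on batch statistics at inference breaks with batch_size = 1, since the variance of a single example is zero:
import numpy as np

def bn_train(x, moving_mean, moving_var, decay=0.99, eps=1e-3):
    # Training mode: normalize with the current batch statistics and
    # update the moving statistics as a side effect.
    mean, var = x.mean(axis=0), x.var(axis=0)
    moving_mean = decay * moving_mean + (1 - decay) * mean
    moving_var = decay * moving_var + (1 - decay) * var
    return (x - mean) / np.sqrt(var + eps), moving_mean, moving_var

def bn_inference(x, moving_mean, moving_var, eps=1e-3):
    # Inference mode: normalize with the accumulated moving statistics,
    # so the output does not depend on the batch size.
    return (x - moving_mean) / np.sqrt(moving_var + eps)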
I have tested that the following simplified implementation of batch normalization gives the same result as tf.contrib.layers.batch_norm as long as the settings are the same.
import tensorflow as tf
from tensorflow.python.training import moving_averages

def initialize_batch_norm(scope, depth):
    with tf.variable_scope(scope) as bnscope:
        gamma = tf.get_variable("gamma", depth, initializer=tf.constant_initializer(1.0))
        beta = tf.get_variable("beta", depth, initializer=tf.constant_initializer(0.0))
        moving_avg = tf.get_variable("moving_avg", depth, initializer=tf.constant_initializer(0.0), trainable=False)
        moving_var = tf.get_variable("moving_var", depth, initializer=tf.constant_initializer(1.0), trainable=False)
        bnscope.reuse_variables()

def BatchNorm_layer(x, scope, train, epsilon=0.001, decay=.99):
    # Perform a batch normalization after a conv layer or a fc layer
    # gamma: a scale factor
    # beta: an offset
    # epsilon: the variance epsilon - a small float number to avoid dividing by 0
    with tf.variable_scope(scope, reuse=True):
        with tf.variable_scope('BatchNorm', reuse=True) as bnscope:
            gamma, beta = tf.get_variable("gamma"), tf.get_variable("beta")
            moving_avg, moving_var = tf.get_variable("moving_avg"), tf.get_variable("moving_var")
            shape = x.get_shape().as_list()
            control_inputs = []
            if train:
                avg, var = tf.nn.moments(x, list(range(len(shape) - 1)))
                update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
                update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
                control_inputs = [update_moving_avg, update_moving_var]
            else:
                avg = moving_avg
                var = moving_var
            with tf.control_dependencies(control_inputs):
                output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
    return output
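A possible way to wire these two functions together; the scope name, tensor name, and depth below are illustrative, not part of the original answer:
# Illustrative usage: create the variables once, then reuse them in the layer.
with tf.variable_scope('conv1'):
    initialize_batch_norm('BatchNorm', depth=64)   # depth = number of channels
bn_out = BatchNorm_layer(conv1_out, scope='conv1', train=True)   # training graph
# ...and train=False when building the inference graph.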
The main tips for using the official implementation of batch normalization in tf.contrib.layers.batch_norm are: (1) set is_training=True for training time and is_training=False for validation and testing time; (2) set updates_collections=None to make sure that moving_variance and moving_mean are updated in place; (3) be careful with the scope setting; (4) set decay to a smaller value (decay=0.9 or decay=0.99) than the default (0.999) if your dataset is small or your total number of training updates/steps is not that large.
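Put together, a sketch of a contrib-based layer that follows these tips; net is a stand-in for whatever tensor precedes the layer:
is_training = tf.placeholder(tf.bool, name='is_training')
net = tf.contrib.layers.batch_norm(net,
                                   decay=0.9,                 # tip (4): smaller decay for small datasets
                                   center=True,
                                   scale=True,
                                   updates_collections=None,  # tip (2): update moving stats in place
                                   is_training=is_training,   # tip (1): True for training, False otherwise
                                   scope='bn')                # tip (3): mind the scope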
I found Zhongyu Kuang's code really useful, but I was stuck on how to dynamically switch between train and test ops, i.e. how to move from a Python boolean is_training to a TensorFlow boolean placeholder is_training. I need this functionality to be able to test the network on the validation set during training.
Starting from his code and inspired by this, I wrote the following code:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import variables
from tensorflow.python.training import moving_averages

def batch_norm(x, scope, is_training, epsilon=0.001, decay=0.99):
    """
    Returns a batch normalization layer that automatically switches between train and test phases based on the
    tensor is_training

    Args:
        x: input tensor
        scope: scope name
        is_training: boolean tensor or variable
        epsilon: epsilon parameter - see batch_norm_layer
        decay: decay parameter - see batch_norm_layer

    Returns:
        The correct batch normalization layer based on the value of is_training
    """
    assert isinstance(is_training, (ops.Tensor, variables.Variable)) and is_training.dtype == tf.bool

    return tf.cond(
        is_training,
        lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=True, reuse=None),
        lambda: batch_norm_layer(x=x, scope=scope, epsilon=epsilon, decay=decay, is_training=False, reuse=True),
    )

def batch_norm_layer(x, scope, is_training, epsilon=0.001, decay=0.99, reuse=None):
    """
    Performs a batch normalization layer

    Args:
        x: input tensor
        scope: scope name
        is_training: python boolean value
        epsilon: the variance epsilon - a small float number to avoid dividing by 0
        decay: the moving average decay

    Returns:
        The ops of a batch normalization layer
    """
    with tf.variable_scope(scope, reuse=reuse):
        shape = x.get_shape().as_list()
        # gamma: a trainable scale factor
        gamma = tf.get_variable("gamma", shape[-1], initializer=tf.constant_initializer(1.0), trainable=True)
        # beta: a trainable shift value
        beta = tf.get_variable("beta", shape[-1], initializer=tf.constant_initializer(0.0), trainable=True)
        moving_avg = tf.get_variable("moving_avg", shape[-1], initializer=tf.constant_initializer(0.0), trainable=False)
        moving_var = tf.get_variable("moving_var", shape[-1], initializer=tf.constant_initializer(1.0), trainable=False)
        if is_training:
            # tf.nn.moments computes the mean and the variance of the tensor x
            avg, var = tf.nn.moments(x, list(range(len(shape) - 1)))
            update_moving_avg = moving_averages.assign_moving_average(moving_avg, avg, decay)
            update_moving_var = moving_averages.assign_moving_average(moving_var, var, decay)
            control_inputs = [update_moving_avg, update_moving_var]
        else:
            avg = moving_avg
            var = moving_var
            control_inputs = []
        with tf.control_dependencies(control_inputs):
            output = tf.nn.batch_normalization(x, avg, var, offset=beta, scale=gamma, variance_epsilon=epsilon)
    return output
Then I use the batch_norm layer in this way:
fc1_weights = tf.Variable(...)
fc1 = tf.matmul(x, fc1_weights)
fc1 = batch_norm(fc1, 'fc1_bn', is_training=is_training)
fc1 = tf.nn.relu(fc1)
Where is_training is a boolean placeholder. Note that the bias addition is not needed because it is replaced by the beta parameter, as explained in the Batch Normalization paper.
During execution:
# Training phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: True})
# Testing phase
sess.run(loss, feed_dict={x: bx, y: by, is_training: False})
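Note that tf.cond builds both branches of the graph, so the inference branch must reuse the variables created by the training branch; that is why the first call to batch_norm_layer uses reuse=None and the second uses reuse=True within the same variable scope.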