I am trying to implement a CNN using the function tf.contrib.layers.batch_norm on the MNIST dataset.
When I train and evaluate the model, I see that the loss is decreasing (good!), but the accuracy on the test dataset remains at chance level (~10%) (bad!).
If I use the same model without batch normalization, the test accuracy increases as expected.
You can see in the code below how I use the batch normalization function. If I set is_training=True for the test dataset I get good results, so the problem is with the is_training=False mode of the batch normalization function.
Please help me with this. Thanks in advance for all.
# BLOCK2 - Layer 1
# Four parallel stride-1 'SAME' convolutions over the same input, each
# followed by batch normalization. The element-wise maximum across the four
# normalized branches (reduce_max over the stacking axis) is the layer output.
# updates_collections=None is passed so that batch_norm updates its moving
# statistics in place (see the tf.contrib.layers.batch_norm documentation).
# NOTE(review): no scope/reuse is passed to batch_norm, so presumably every
# invocation of this code builds fresh BatchNorm variables - confirm that the
# caller shares variables between the training and evaluation graphs.
conv1 = tf.nn.conv2d(output, block2_layer1_1_weights, [1, 1, 1, 1], padding='SAME')
conv2 = tf.nn.conv2d(output, block2_layer1_2_weights, [1, 1, 1, 1], padding='SAME')
conv3 = tf.nn.conv2d(output, block2_layer1_3_weights, [1, 1, 1, 1], padding='SAME')
conv4 = tf.nn.conv2d(output, block2_layer1_4_weights, [1, 1, 1, 1], padding='SAME')
conv_normed1 = tf.contrib.layers.batch_norm(conv1, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
conv_normed2 = tf.contrib.layers.batch_norm(conv2, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
conv_normed3 = tf.contrib.layers.batch_norm(conv3, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
conv_normed4 = tf.contrib.layers.batch_norm(conv4, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
after_stack = tf.stack([conv_normed1, conv_normed2, conv_normed3, conv_normed4])
after_maxout = tf.reduce_max(after_stack, 0)
# BLOCK2 - Layer 2
# Same pattern with two branches, fed by the previous layer's maximum.
conv1 = tf.nn.conv2d(after_maxout, block2_layer2_1_weights, [1, 1, 1, 1], padding='SAME')
conv2 = tf.nn.conv2d(after_maxout, block2_layer2_2_weights, [1, 1, 1, 1], padding='SAME')
conv_normed1 = tf.contrib.layers.batch_norm(conv1, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
conv_normed2 = tf.contrib.layers.batch_norm(conv2, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
after_stack = tf.stack([conv_normed1, conv_normed2])
after_maxout = tf.reduce_max(after_stack, 0)
# BLOCK2 - Layer 3
# Two more branches, then a 3x3 max-pool with stride 3.
conv1 = tf.nn.conv2d(after_maxout, block2_layer3_1_weights, [1, 1, 1, 1], padding='SAME')
conv2 = tf.nn.conv2d(after_maxout, block2_layer3_2_weights, [1, 1, 1, 1], padding='SAME')
conv_normed1 = tf.contrib.layers.batch_norm(conv1, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
conv_normed2 = tf.contrib.layers.batch_norm(conv2, scale=True, decay=batch_norm_decay, center=True, is_training=is_training, updates_collections=None)
after_stack = tf.stack([conv_normed1, conv_normed2])
after_maxout = tf.reduce_max(after_stack, 0)
pooled = tf.nn.max_pool(after_maxout, [1, 3, 3, 1], [1, 3, 3, 1], 'SAME')
# Dropout must only be active while training: applied unconditionally it also
# zeroes half of the activations at validation/test time. is_training is
# supplied through feed_dict, so it is a graph tensor that tf.cond can branch on.
output = tf.cond(is_training,
                 lambda: tf.nn.dropout(pooled, 0.5),
                 lambda: pooled)
# Training computation.
# model() is invoked once per dataset below. batch_norm creates its
# beta/gamma/moving-average variables inside the call, so without a shared
# variable scope the validation and test graphs would presumably get their own
# BatchNorm variables that are never trained or updated - which gives chance
# accuracy as soon as is_training=False makes them use the moving statistics.
# Building the training graph inside a named scope lets the evaluation graphs
# re-enter it with reuse=True and pick up the trained variables.
with tf.variable_scope('model'):
    logits = model(tf_train_dataset)
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
# L2 weight decay on every trainable variable except the BatchNorm ones.
l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'BatchNorm' not in v.name])
loss += LAMBDA * l2_loss

# Optimizer.
# Keep a handle on the training op: the session loop runs it by this name.
optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
# Each evaluation graph re-enters the scope separately so that the
# default-named BatchNorm sub-scopes are numbered from the start again and
# line up with the variables created by the training call.
with tf.variable_scope('model', reuse=True):
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
#print(valid_prediction.shape)
with tf.variable_scope('model', reuse=True):
    test_prediction = tf.nn.softmax(model(tf_test_dataset))
# Train for num_steps minibatches; every 50 steps report the minibatch
# loss/accuracy and evaluate on the test set with is_training=False.
# NOTE(review): the original indentation was lost; the nesting below (test
# evaluation inside the `step % 50 == 0` branch) is reconstructed - confirm.
num_steps = 6000
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        # Slide a batch_size window over the training data, wrapping around.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, is_training: True}
        _, batch_loss, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 50 == 0:
            print('Minibatch loss at step %d: %f' % (step, batch_loss))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            # Evaluate the test set slice by slice. Predictions are collected
            # in a list and stacked once; calling np.vstack inside the loop
            # re-copies the growing array on every iteration.
            batch_preds = []
            for i in range(1, 10001):
                test_batch = test_dataset[((i - 1) * test_batch_size):(i * test_batch_size), :, :, :]
                batch_preds.append(
                    test_prediction.eval(feed_dict={tf_test_dataset: test_batch, is_training: False}))
            stacked_pred = np.vstack(batch_preds)
            print(np.argmax(stacked_pred, 1))
            print('test accuracy: %.1f%%' % accuracy(stacked_pred, test_labels))
During training, batch-norm uses statistics based on the batch. During evaluation/testing (whenever is_training is False), it uses population statistics.
Internally, the population statistics are updated via implicitly created update ops which, by default, are added to the tf.GraphKeys.UPDATE_OPS collection - but you have to force TensorFlow to run these operations. A simple way of doing this is to introduce control_dependencies on your optimization op.
# Fetch the ops registered in the tf.GraphKeys.UPDATE_OPS collection.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
# Creating the training op under these control dependencies makes every
# training step run the update ops first.
# NOTE(review): `optimizer` must be an Optimizer instance here and `step` is
# presumably the global-step variable - confirm against the surrounding code.
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss, step)
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With