
Adding multiple layers to a TensorFlow network causes the loss function to become NaN

I'm writing a neural-network classifier in TensorFlow/Python for the notMNIST dataset. I've implemented L2 regularization and dropout on the hidden layers. It works fine as long as there is only one hidden layer, but when I add more layers (to improve accuracy), the loss increases rapidly at each step, becoming NaN by step 5. I tried temporarily disabling dropout and L2 regularization, but I get the same behavior whenever there are two or more layers. I even rewrote my code from scratch (refactoring it to be more flexible), with the same results. The number and size of the layers are controlled by hidden_layer_spec. What am I missing?

import math
import numpy as np
import tensorflow as tf

# train_dataset, train_labels, valid_*, test_*, image_size, num_labels,
# and accuracy() are defined earlier in the notebook.

# works for np.array([1024]) with about 96.1% accuracy
hidden_layer_spec = np.array([1024, 300])
num_hidden_layers = hidden_layer_spec.shape[0]
batch_size = 256
beta = 0.0005

epochs = 100
stepsPerEpoch = float(train_dataset.shape[0]) / batch_size
num_steps = int(math.ceil(float(epochs) * stepsPerEpoch))

l2Graph = tf.Graph()
with l2Graph.as_default():
  #with tf.device('/cpu:0'):
      # Input data. For the training data, we use a placeholder that will be fed
      # at run time with a training minibatch.
      tf_train_dataset = tf.placeholder(tf.float32,
                                        shape=(batch_size, image_size * image_size))
      tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
      tf_valid_dataset = tf.constant(valid_dataset)
      tf_test_dataset = tf.constant(test_dataset)

      weights = []
      biases = []
      for hi in range(0, num_hidden_layers + 1):
        width = image_size * image_size if hi == 0 else hidden_layer_spec[hi - 1]
        height = num_labels if hi == num_hidden_layers else hidden_layer_spec[hi]
        weights.append(tf.Variable(tf.truncated_normal([width, height]), name = "w" + str(hi + 1)))
        biases.append(tf.Variable(tf.zeros([height]), name = "b" + str(hi + 1)))
        print(str(width) + 'x' + str(height))

      def logits(input, addDropoutLayer = False):
        previous_layer = input
        for hi in range(0, hidden_layer_spec.shape[0]):
          previous_layer = tf.nn.relu(tf.matmul(previous_layer, weights[hi]) + biases[hi])
          if addDropoutLayer:
            previous_layer = tf.nn.dropout(previous_layer, 0.5)
        return tf.matmul(previous_layer, weights[num_hidden_layers]) + biases[num_hidden_layers]

      # Training computation.
      train_logits = logits(tf_train_dataset, True)

      l2 = tf.nn.l2_loss(weights[0])
      for hi in range(1, len(weights)):
        l2 = l2 + tf.nn.l2_loss(weights[hi])
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(train_logits, tf_train_labels)) + beta * l2

      # Optimizer.
      global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
      learning_rate = tf.train.exponential_decay(0.5, global_step, int(stepsPerEpoch) * 2, 0.96, staircase = True)
      optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

      # Predictions for the training, validation, and test data.
      train_prediction = tf.nn.softmax(train_logits)
      valid_prediction = tf.nn.softmax(logits(tf_valid_dataset))
      test_prediction = tf.nn.softmax(logits(tf_test_dataset))
      saver = tf.train.Saver()

with tf.Session(graph=l2Graph) as session:
  tf.initialize_all_variables().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Learning rate: " % learning_rate)
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
  save_path = saver.save(session, "l2_degrade.ckpt")
  print("Model save to " + `save_path`)
Asked Apr 12 '16 by Nimrand


2 Answers

Turns out this was not so much a coding issue as a deep-learning issue. The extra layer made the gradients too unstable, which led to the loss function quickly devolving to NaN. The best way to fix this is to use Xavier initialization; otherwise, the variance of the initial weights tends to be too high, causing instability. Decreasing the learning rate may also help.
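
As a minimal sketch in terms of the question's code (TF 1.x-era API; the helper name xavier_weight is mine, and the stddev formula is the standard Glorot/Xavier heuristic rather than anything from the original post):

import math
import tensorflow as tf

def xavier_weight(fan_in, fan_out, name):
  # Glorot/Xavier initialization: draw initial weights with
  # stddev = sqrt(2 / (fan_in + fan_out)) so that activation variance
  # stays roughly constant from layer to layer.
  stddev = math.sqrt(2.0 / (fan_in + fan_out))
  return tf.Variable(tf.truncated_normal([fan_in, fan_out], stddev=stddev), name=name)

# In the weight-building loop above, this would replace the unscaled call:
#   weights.append(xavier_weight(width, height, "w" + str(hi + 1)))

TensorFlow versions of that era also shipped tf.contrib.layers.xavier_initializer(), which implements the same idea.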

Answered by Nimrand


I had the same problem and reducing the batch size and learning rate worked for me.
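
In terms of the question's code, that amounts to shrinking two numbers: the batch_size constant and the starting rate of the exponential-decay schedule. The values below are illustrative only, not taken from the answer:

batch_size = 128   # was 256; smaller batches can smooth out unstable updates
learning_rate = tf.train.exponential_decay(
  0.1,             # initial learning rate, was 0.5
  global_step, int(stepsPerEpoch) * 2, 0.96, staircase = True)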

Answered by user3192082