ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy

Please find below the code for classifying images into two classes, which I am trying to run on Kaggle TPUs. Could you please help me figure out the problem? I have followed the guidelines on the Kaggle website for using the TPU, but still no luck.

Below are the code and the error stack it generates:

import tensorflow as tf
# Resolve the TPU cluster and build the distribution strategy whose scope
# define_model() enters when creating and compiling the model.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
# Model-building and data-loading imports.
# NOTE(review): these come from the standalone `keras` package, while
# define_model() passes a `tf.keras` optimizer to compile(). The traceback
# crosses from keras/ into tensorflow_core/ -- presumably this mix is related
# to the strategy-scope mismatch; confirm.
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.preprocessing.image import ImageDataGenerator

# define cnn model
def define_model():
    """Build and compile a VGG16-based binary classifier under the TPU strategy scope.

    The VGG16 base (no top, 224x224x3 input) has every layer marked
    non-trainable. A Flatten layer, a 128-unit ReLU Dense layer and a single
    sigmoid output unit are stacked on top of it.

    Returns:
        The compiled model (SGD with Nesterov momentum, binary cross-entropy
        loss, accuracy metric).
    """
    with tpu_strategy.scope():
        # load model
        model = VGG16(include_top=False, input_shape=(224, 224, 3))
        # mark loaded layers as not trainable
        for layer in model.layers:
            layer.trainable = False
        # add new classifier layers
        flat1 = Flatten()(model.layers[-1].output)
        class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
        output = Dense(1, activation='sigmoid')(class1)
        # define new model
        model = Model(inputs=model.inputs, outputs=output)
        # compile model
        # NOTE(review): the optimizer is a tf.keras object, but Model is the
        # standalone-keras class imported at the top of the snippet. The
        # traceback shows the momentum slot variables being created later,
        # during fit_generator() -- presumably outside this scope; confirm.
        model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True), 
                      loss='binary_crossentropy', metrics=['accuracy'])
        return model

# run the test harness for evaluating a model
def train():
    """Build the model and fit it on images read from /kaggle/working/train/.

    Images are loaded with an ImageDataGenerator in batches of 64, resized to
    224x224, with binary labels, and the model is fitted for 10 epochs.
    Returns nothing.
    """
    # define model
    model = define_model()
    # create data generator
    datagen = ImageDataGenerator(featurewise_center=True)
    # specify imagenet mean values for centering
    datagen.mean = [123.68, 116.779, 103.939]
    # prepare iterator
    train_it = datagen.flow_from_directory('/kaggle/working/train/',
                                           class_mode='binary', batch_size=64, target_size=(224, 224))
    # fit model
    # NOTE(review): this call is not inside tpu_strategy.scope(); it is the
    # frame in the traceback from which the ValueError is raised.
    model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
    # save model
    # (the traceback's listing shows model.save('final_model.h5') here; the
    # call itself is missing from this snippet)

# entry point, run the test harness

Error stack:

Found 25000 images belonging to 2 classes.
ValueError                                Traceback (most recent call last)
<ipython-input-13-b7b93eb12fab> in <module>
     43 # entry point, run the test harness
---> 44 train()

<ipython-input-13-b7b93eb12fab> in train()
     37                                            class_mode='binary', batch_size=64, target_size=(224, 224))
     38     # fit model
---> 39     model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
     40     # save model
     41     model.save('final_model.h5')

/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1730             use_multiprocessing=use_multiprocessing,
   1731             shuffle=shuffle,
-> 1732             initial_epoch=initial_epoch)
   1734     @interfaces.legacy_generator_methods_support

/opt/conda/lib/python3.7/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
     41     do_validation = bool(validation_data)
---> 42     model._make_train_function()
     43     if do_validation:
     44         model._make_test_function()

/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
    314                     training_updates = self.optimizer.get_updates(
    315                         params=self._collected_trainable_weights,
--> 316                         loss=self.total_loss)
    317                 updates = self.updates + training_updates

/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
     73         if _SYMBOLIC_SCOPE.value:
     74             with get_graph().as_default():
---> 75                 return func(*args, **kwargs)
     76         else:
     77             return func(*args, **kwargs)

/opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
    760     def get_updates(self, loss, params):
    761         if isinstance(self.optimizer, tf.keras.optimizers.Optimizer):
--> 762             return self.optimizer.get_updates(loss, params)
    763         else:
    764             grads = self.optimizer.compute_gradients(loss, var_list=params)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_updates(self, loss, params)
    507         if g is not None and v.dtype != dtypes.resource
    508     ])
--> 509     return [self.apply_gradients(grads_and_vars)]
    511   def _set_hyper(self, name, value):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in apply_gradients(self, grads_and_vars, name)
    432         _ = self.iterations
    433         self._create_hypers()
--> 434         self._create_slots(var_list)
    436       if not grads_and_vars:

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/gradient_descent.py in _create_slots(self, var_list)
     98     if self._momentum:
     99       for var in var_list:
--> 100         self.add_slot(var, "momentum")
    102   def _prepare_local(self, var_device, var_dtype, apply_state):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in add_slot(self, var, slot_name, initializer)
    590             "variables are created under the same strategy scope. This may "
    591             "happen if you're restoring from a checkpoint outside the scope"
--> 592             .format(strategy, var))
    594       with strategy.extended.colocate_vars_with(var):

ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7f3e60656050>), which is different from the scope used for the original variable (TPUMirroredVariable:{
  0 /job:worker/replica:0/task:0/device:TPU:0: <tf.Variable 'dense_17/kernel:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>,
  1 /job:worker/replica:0/task:0/device:TPU:1: <tf.Variable 'dense_17/kernel/replica_1:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>,
  7 /job:worker/replica:0/task:0/device:TPU:7: <tf.Variable 'dense_17/kernel/replica_7:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>
}). Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope
1 Answer

The error looks a bit like the one described here: https://github.com/tensorflow/tensorflow/issues/32561

Looking at your error trace, it is complaining (message abridged):

ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy, which is different from the scope used for the original variable TPUMirroredVariable

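so it looks like something you call in your train() function might need to be within the tpu_strategy scope. The most likely candidate is the model.fit_generator(...) call, which is where the traceback shows the optimizer's slot variables being created.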

