ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy

Please find below the code for an image classification task with 2 classes, which I am trying to run on Kaggle TPUs. Could you please help me figure out the problem here? I have followed the guidelines from the Kaggle website for using the TPU, but still no luck.

Below are the code and the error stack it generates.

import tensorflow as tf
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
print(tpu_strategy)
# save the final model to file
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.preprocessing.image import ImageDataGenerator

# define cnn model
def define_model():
    with tpu_strategy.scope():
        # load model
        model = VGG16(include_top=False, input_shape=(224, 224, 3))
        # mark loaded layers as not trainable
        for layer in model.layers:
            layer.trainable = False
        # add new classifier layers
        flat1 = Flatten()(model.layers[-1].output)
        class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
        output = Dense(1, activation='sigmoid')(class1)
        # define new model
        model = Model(inputs=model.inputs, outputs=output)
        # compile model
        model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True), 
                      loss='binary_crossentropy', metrics=['accuracy'])
        return model

# run the test harness for evaluating a model
def train():
    # define model
    model = define_model()
    # create data generator
    datagen = ImageDataGenerator(featurewise_center=True)
    # specify imagenet mean values for centering
    datagen.mean = [123.68, 116.779, 103.939]
    # prepare iterator
    train_it = datagen.flow_from_directory('/kaggle/working/train/',
                                           class_mode='binary', batch_size=64, target_size=(224, 224))
    # fit model
    model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
    # save model
    model.save('final_model.h5')

# entry point, run the test harness
train()

Error Stack:

Found 25000 images belonging to 2 classes.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-b7b93eb12fab> in <module>
     42 
     43 # entry point, run the test harness
---> 44 train()

<ipython-input-13-b7b93eb12fab> in train()
     37                                            class_mode='binary', batch_size=64, target_size=(224, 224))
     38     # fit model
---> 39     model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
     40     # save model
     41     model.save('final_model.h5')

/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1730             use_multiprocessing=use_multiprocessing,
   1731             shuffle=shuffle,
-> 1732             initial_epoch=initial_epoch)
   1733 
   1734     @interfaces.legacy_generator_methods_support

/opt/conda/lib/python3.7/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
     40 
     41     do_validation = bool(validation_data)
---> 42     model._make_train_function()
     43     if do_validation:
     44         model._make_test_function()

/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
    314                     training_updates = self.optimizer.get_updates(
    315                         params=self._collected_trainable_weights,
--> 316                         loss=self.total_loss)
    317                 updates = self.updates + training_updates
    318 

/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
     73         if _SYMBOLIC_SCOPE.value:
     74             with get_graph().as_default():
---> 75                 return func(*args, **kwargs)
     76         else:
     77             return func(*args, **kwargs)

/opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
    760     def get_updates(self, loss, params):
    761         if isinstance(self.optimizer, tf.keras.optimizers.Optimizer):
--> 762             return self.optimizer.get_updates(loss, params)
    763         else:
    764             grads = self.optimizer.compute_gradients(loss, var_list=params)

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_updates(self, loss, params)
    507         if g is not None and v.dtype != dtypes.resource
    508     ])
--> 509     return [self.apply_gradients(grads_and_vars)]
    510 
    511   def _set_hyper(self, name, value):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in apply_gradients(self, grads_and_vars, name)
    432         _ = self.iterations
    433         self._create_hypers()
--> 434         self._create_slots(var_list)
    435 
    436       if not grads_and_vars:

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/gradient_descent.py in _create_slots(self, var_list)
     98     if self._momentum:
     99       for var in var_list:
--> 100         self.add_slot(var, "momentum")
    101 
    102   def _prepare_local(self, var_device, var_dtype, apply_state):

/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in add_slot(self, var, slot_name, initializer)
    590             "variables are created under the same strategy scope. This may "
    591             "happen if you're restoring from a checkpoint outside the scope"
--> 592             .format(strategy, var))
    593 
    594       with strategy.extended.colocate_vars_with(var):

ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7f3e60656050>), which is different from the scope used for the original variable (TPUMirroredVariable:{
  0 /job:worker/replica:0/task:0/device:TPU:0: <tf.Variable 'dense_17/kernel:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       ...,
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>,
  1 /job:worker/replica:0/task:0/device:TPU:1: <tf.Variable 'dense_17/kernel/replica_1:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       ...,
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>,
  7 /job:worker/replica:0/task:0/device:TPU:7: <tf.Variable 'dense_17/kernel/replica_7:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335,  0.00986688, ...,  0.01155722,
        -0.01016544,  0.00725855],
       [ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
         0.00883604,  0.01337762],
       [ 0.00195401,  0.01383564,  0.01248195, ..., -0.01159664,
         0.01150718,  0.00515156],
       ...,
       [-0.00288643,  0.00719406, -0.01046378, ...,  0.00476515,
         0.00593644,  0.00830421],
       [ 0.01492004, -0.00584323,  0.00321727, ..., -0.00236337,
        -0.01540608,  0.01260902],
       [-0.01198301,  0.00917004,  0.0068699 , ..., -0.00365373,
        -0.00087463,  0.01179958]], dtype=float32)>
}). Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope
Asked by Nitin on Oct 26 '22 21:10


1 Answer

The error looks a bit like the one described here: https://github.com/tensorflow/tensorflow/issues/32561

Looking at your error trace, it is complaining:

ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy, which is different from the scope used for the original variable TPUMirroredVariable

so it looks like something called from your train() method needs to run within the tpu_strategy scope: the optimizer's slot variables (the SGD momentum accumulators) are being created under the default strategy, while the model's variables were created as TPU-mirrored variables under tpu_strategy.
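
To illustrate that idea, here is a minimal sketch (my own illustration, not the answerer's verified fix). It assumes the underlying issue is mixing the standalone keras package (keras.models, keras.layers, keras/engine/training.py in the trace) with tf.keras and the TPU strategy, so it switches all imports to tensorflow.keras and keeps every variable-creating step, including compile() with the optimizer, inside tpu_strategy.scope():

# minimal sketch, assuming tensorflow.keras is used consistently instead of standalone keras
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator

tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

with tpu_strategy.scope():
    # everything that creates variables (layers and the optimizer) lives in the scope,
    # so the SGD momentum slot variables end up TPU-mirrored like the model weights
    base = VGG16(include_top=False, input_shape=(224, 224, 3))
    for layer in base.layers:
        layer.trainable = False
    flat = Flatten()(base.layers[-1].output)
    hidden = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat)
    output = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=base.inputs, outputs=output)
    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9, nesterov=True),
                  loss='binary_crossentropy', metrics=['accuracy'])

# the input pipeline and the fit/save calls can stay outside the scope
datagen = ImageDataGenerator(featurewise_center=True)
datagen.mean = [123.68, 116.779, 103.939]
train_it = datagen.flow_from_directory('/kaggle/working/train/',
                                       class_mode='binary', batch_size=64,
                                       target_size=(224, 224))
model.fit(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
model.save('final_model.h5')

Note that this sketch only addresses the import/scope issue; feeding a Python generator to a Kaggle TPU may still be slow, and a tf.data pipeline (ideally reading from GCS) is often recommended instead.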

Answered by kmt on Nov 15 '22 05:11