Below is my code for image classification into two classes, which I am trying to run on Kaggle TPUs. Could you please help me figure out the problem? I have followed the guidelines from the Kaggle website for using the TPU, but still no luck. The error stack generated by the code is included after it.
import tensorflow as tf

# detect the TPU, connect to it, and build a distribution strategy for it
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
print(tpu_strategy)
# save the final model to file
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.preprocessing.image import ImageDataGenerator
# define cnn model
def define_model():
    with tpu_strategy.scope():
        # load model
        model = VGG16(include_top=False, input_shape=(224, 224, 3))
        # mark loaded layers as not trainable
        for layer in model.layers:
            layer.trainable = False
        # add new classifier layers
        flat1 = Flatten()(model.layers[-1].output)
        class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
        output = Dense(1, activation='sigmoid')(class1)
        # define new model
        model = Model(inputs=model.inputs, outputs=output)
        # compile model
        model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True),
                      loss='binary_crossentropy', metrics=['accuracy'])
    return model
# run the test harness for evaluating a model
def train():
    # define model
    model = define_model()
    # create data generator
    datagen = ImageDataGenerator(featurewise_center=True)
    # specify imagenet mean values for centering
    datagen.mean = [123.68, 116.779, 103.939]
    # prepare iterator
    train_it = datagen.flow_from_directory('/kaggle/working/train/',
                                           class_mode='binary', batch_size=64, target_size=(224, 224))
    # fit model
    model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
    # save model
    model.save('final_model.h5')

# entry point, run the test harness
train()
Error stack:
Found 25000 images belonging to 2 classes.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-b7b93eb12fab> in <module>
42
43 # entry point, run the test harness
---> 44 train()
<ipython-input-13-b7b93eb12fab> in train()
37 class_mode='binary', batch_size=64, target_size=(224, 224))
38 # fit model
---> 39 model.fit_generator(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
40 # save model
41 model.save('final_model.h5')
/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1730 use_multiprocessing=use_multiprocessing,
1731 shuffle=shuffle,
-> 1732 initial_epoch=initial_epoch)
1733
1734 @interfaces.legacy_generator_methods_support
/opt/conda/lib/python3.7/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
40
41 do_validation = bool(validation_data)
---> 42 model._make_train_function()
43 if do_validation:
44 model._make_test_function()
/opt/conda/lib/python3.7/site-packages/keras/engine/training.py in _make_train_function(self)
314 training_updates = self.optimizer.get_updates(
315 params=self._collected_trainable_weights,
--> 316 loss=self.total_loss)
317 updates = self.updates + training_updates
318
/opt/conda/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
/opt/conda/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py in symbolic_fn_wrapper(*args, **kwargs)
73 if _SYMBOLIC_SCOPE.value:
74 with get_graph().as_default():
---> 75 return func(*args, **kwargs)
76 else:
77 return func(*args, **kwargs)
/opt/conda/lib/python3.7/site-packages/keras/optimizers.py in get_updates(self, loss, params)
760 def get_updates(self, loss, params):
761 if isinstance(self.optimizer, tf.keras.optimizers.Optimizer):
--> 762 return self.optimizer.get_updates(loss, params)
763 else:
764 grads = self.optimizer.compute_gradients(loss, var_list=params)
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in get_updates(self, loss, params)
507 if g is not None and v.dtype != dtypes.resource
508 ])
--> 509 return [self.apply_gradients(grads_and_vars)]
510
511 def _set_hyper(self, name, value):
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in apply_gradients(self, grads_and_vars, name)
432 _ = self.iterations
433 self._create_hypers()
--> 434 self._create_slots(var_list)
435
436 if not grads_and_vars:
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/gradient_descent.py in _create_slots(self, var_list)
98 if self._momentum:
99 for var in var_list:
--> 100 self.add_slot(var, "momentum")
101
102 def _prepare_local(self, var_device, var_dtype, apply_state):
/opt/conda/lib/python3.7/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py in add_slot(self, var, slot_name, initializer)
590 "variables are created under the same strategy scope. This may "
591 "happen if you're restoring from a checkpoint outside the scope"
--> 592 .format(strategy, var))
593
594 with strategy.extended.colocate_vars_with(var):
ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.distribute_lib._DefaultDistributionStrategy object at 0x7f3e60656050>), which is different from the scope used for the original variable (TPUMirroredVariable:{
0 /job:worker/replica:0/task:0/device:TPU:0: <tf.Variable 'dense_17/kernel:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335, 0.00986688, ..., 0.01155722,
-0.01016544, 0.00725855],
[ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
0.00883604, 0.01337762],
[ 0.00195401, 0.01383564, 0.01248195, ..., -0.01159664,
0.01150718, 0.00515156],
...,
[-0.00288643, 0.00719406, -0.01046378, ..., 0.00476515,
0.00593644, 0.00830421],
[ 0.01492004, -0.00584323, 0.00321727, ..., -0.00236337,
-0.01540608, 0.01260902],
[-0.01198301, 0.00917004, 0.0068699 , ..., -0.00365373,
-0.00087463, 0.01179958]], dtype=float32)>,
1 /job:worker/replica:0/task:0/device:TPU:1: <tf.Variable 'dense_17/kernel/replica_1:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335, 0.00986688, ..., 0.01155722,
-0.01016544, 0.00725855],
[ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
0.00883604, 0.01337762],
[ 0.00195401, 0.01383564, 0.01248195, ..., -0.01159664,
0.01150718, 0.00515156],
...,
[-0.00288643, 0.00719406, -0.01046378, ..., 0.00476515,
0.00593644, 0.00830421],
[ 0.01492004, -0.00584323, 0.00321727, ..., -0.00236337,
-0.01540608, 0.01260902],
[-0.01198301, 0.00917004, 0.0068699 , ..., -0.00365373,
-0.00087463, 0.01179958]], dtype=float32)>,
7 /job:worker/replica:0/task:0/device:TPU:7: <tf.Variable 'dense_17/kernel/replica_7:0' shape=(25088, 128) dtype=float32, numpy=
array([[-0.01303554, -0.00389335, 0.00986688, ..., 0.01155722,
-0.01016544, 0.00725855],
[ 0.00431045, -0.01050912, -0.00490532, ..., -0.00075989,
0.00883604, 0.01337762],
[ 0.00195401, 0.01383564, 0.01248195, ..., -0.01159664,
0.01150718, 0.00515156],
...,
[-0.00288643, 0.00719406, -0.01046378, ..., 0.00476515,
0.00593644, 0.00830421],
[ 0.01492004, -0.00584323, 0.00321727, ..., -0.00236337,
-0.01540608, 0.01260902],
[-0.01198301, 0.00917004, 0.0068699 , ..., -0.00365373,
-0.00087463, 0.01179958]], dtype=float32)>
}). Make sure the slot variables are created under the same strategy scope. This may happen if you're restoring from a checkpoint outside the scope
The error looks similar to the one described here: https://github.com/tensorflow/tensorflow/issues/32561

Looking at your error trace, it is complaining:

ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy, which is different from the scope used for the original variable TPUMirroredVariable

In other words, the model's weights were created under the TPUStrategy scope, but the SGD optimizer's momentum slot variables are only created later, when fit_generator builds the training function, and by then execution is back under the default strategy. So it looks like whatever you call in your train() method that triggers training needs to run within the tpu_strategy scope as well. It is also worth noting that the traceback runs through the standalone keras package (/opt/conda/lib/python3.7/site-packages/keras/...), which is not distribution-strategy-aware; for TPUStrategy, importing everything from tensorflow.keras is the supported route.
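As a minimal sketch of that restructuring (untested here, and assuming the TF 2.x tf.keras API available in Kaggle's TPU environment; learning_rate replaces the deprecated lr argument, and the decay term is dropped for simplicity), every Keras symbol comes from tensorflow.keras, the model is built and compiled inside the strategy scope, and the deprecated fit_generator is replaced by model.fit, which accepts the generator directly:

import tensorflow as tf
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# connect to the TPU and build the distribution strategy, as before
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

def define_model():
    # everything that creates variables -- the model and the compiled
    # optimizer -- lives inside the strategy scope, so the momentum slot
    # variables end up under the same strategy as the weights
    with tpu_strategy.scope():
        base = VGG16(include_top=False, input_shape=(224, 224, 3))
        for layer in base.layers:
            layer.trainable = False
        flat1 = Flatten()(base.layers[-1].output)
        class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
        output = Dense(1, activation='sigmoid')(class1)
        model = Model(inputs=base.inputs, outputs=output)
        model.compile(
            optimizer=tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9, nesterov=True),
            loss='binary_crossentropy',
            metrics=['accuracy'])
    return model

def train():
    model = define_model()
    datagen = ImageDataGenerator(featurewise_center=True)
    datagen.mean = [123.68, 116.779, 103.939]
    train_it = datagen.flow_from_directory('/kaggle/working/train/',
                                           class_mode='binary', batch_size=64,
                                           target_size=(224, 224))
    # tf.keras's fit() accepts generators directly; fit_generator is deprecated
    model.fit(train_it, steps_per_epoch=len(train_it), epochs=10, verbose=0)
    model.save('final_model.h5')

train()

One further caveat: Kaggle's TPUs run on remote workers, so an input pipeline reading from the local /kaggle/working directory may not be reachable from the TPU hosts; Kaggle's TPU documentation recommends serving training data from GCS, typically through a tf.data pipeline.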