I am using a Colab Pro TPU instance for patch image classification, with TensorFlow version 2.3.0.
When calling model.fit I get the following error: InvalidArgumentError: Unable to find the relevant tensor remote_handle: Op ID: 14738, Output num: 0
with the following trace:
--------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-20-5fd2ec1ce2f9> in <module>()
15 steps_per_epoch=STEPS_PER_EPOCH,
16 validation_data=dev_ds,
---> 17 validation_steps=VALIDATION_STEPS
18 )
6 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1084 data_handler._initial_epoch = ( # pylint: disable=protected-access
1085 self._maybe_load_initial_epoch_from_ckpt(initial_epoch))
-> 1086 for epoch, iterator in data_handler.enumerate_epochs():
1087 self.reset_metrics()
1088 callbacks.on_epoch_begin(epoch)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/data_adapter.py in enumerate_epochs(self)
1140 if self._insufficient_data: # Set by `catch_stop_iteration`.
1141 break
-> 1142 if self._adapter.should_recreate_iterator():
1143 data_iterator = iter(self._dataset)
1144 yield epoch, data_iterator
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/data_adapter.py in should_recreate_iterator(self)
725 # each epoch.
726 return (self._user_steps is None or
--> 727 cardinality.cardinality(self._dataset).numpy() == self._user_steps)
728
729 def _validate_args(self, y, sample_weights, steps):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in numpy(self)
1061 """
1062 # TODO(slebedev): Consider avoiding a copy for non-CPU or remote tensors.
-> 1063 maybe_arr = self._numpy() # pylint: disable=protected-access
1064 return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr
1065
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in _numpy(self)
1029 return self._numpy_internal()
1030 except core._NotOkStatusException as e: # pylint: disable=protected-access
-> 1031 six.raise_from(core._status_to_exception(e.code, e.message), None) # pylint: disable=protected-access
1032
1033 @property
/usr/local/lib/python3.6/dist-packages/six.py in raise_from(value, from_value)
InvalidArgumentError: Unable to find the relevant tensor remote_handle: Op ID: 14738, Output num: 0
I have two dataset zip files containing >300,000 training examples and <100,000 validation examples, which I download from my Google Drive using !gdown and unzip on the Colab VM. For the data pipeline I use the tf.data.Dataset API: I feed it a list of file paths and then use the .map method to load the images from disk. Please keep in mind that my training dataset cannot fit into memory.
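The download-and-unzip step is just shell commands run in a notebook cell; roughly like this (the Drive file IDs and archive names below are placeholders, not the real ones):

!gdown https://drive.google.com/uc?id=TRAIN_FILE_ID -O train.zip   # placeholder ID
!gdown https://drive.google.com/uc?id=DEV_FILE_ID -O dev.zip       # placeholder ID
!unzip -q train.zip -d /content/content/Data
!unzip -q dev.zip -d /content/content/Data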
Here is the code for creating the Dataset:
import tensorflow as tf

train_dir = '/content/content/Data/train'
dev_dir = '/content/content/Data/dev'

def create_dataset(dir, label_dic, is_training=True):
    filepaths = list(tf.data.Dataset.list_files(dir + '/*.jpg'))
    labels = []
    for f in filepaths:
        # Look up each label by the file name (without extension).
        ind = f.numpy().decode().split('/')[-1].split('.')[0]
        labels.append(label_dic[ind])
    ds = tf.data.Dataset.from_tensor_slices((filepaths, labels))
    ds = ds.map(load_images, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.cache()
    if is_training:
        ds = ds.shuffle(len(filepaths), reshuffle_each_iteration=True)
        ds = ds.repeat(EPOCHS)
    ds = ds.batch(BATCH_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

train_ds = create_dataset(train_dir, train_label)
dev_ds = create_dataset(dev_dir, dev_label, False)
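load_images itself is defined elsewhere in the notebook; it is essentially a read-decode-resize mapping along these lines (the resize target and scaling here are illustrative, not the exact values):

def load_images(filepath, label):
    # Illustrative sketch -- the actual load_images is defined elsewhere
    # in the notebook and is not shown in this question.
    img = tf.io.read_file(filepath)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [256, 256])  # placeholder size; should match INPUT_SHAPE
    img = tf.cast(img, tf.float32) / 255.0
    return img, label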
And here is the code for creating and compiling my model and fitting the datasets; I use a custom Keras model with a VGG16 backbone:
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, Flatten

def create_model(input_shape, batch_size):
    VGG16 = keras.applications.VGG16(include_top=False, input_shape=input_shape, weights='imagenet')
    # Freeze the pretrained backbone.
    for layer in VGG16.layers:
        layer.trainable = False
    input_layer = keras.Input(shape=input_shape, batch_size=batch_size)
    VGG_out = VGG16(input_layer)
    x = Flatten(name='flatten', input_shape=(512, 8, 8))(VGG_out)
    x = Dense(256, activation='relu', name='fc1')(x)
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid', name='fc2')(x)
    model = Model(input_layer, x)
    model.summary()
    return model
with strategy.scope():
    model = create_model(INPUT_SHAPE, BATCH_SIZE)
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),  # fc2 already applies a sigmoid
                  metrics=['accuracy'])

model.fit(train_ds,
          epochs=5,
          steps_per_epoch=STEPS_PER_EPOCH,
          validation_data=dev_ds,
          validation_steps=VALIDATION_STEPS
          )
For TPU initialization and the distribution strategy I use strategy = tf.distribute.TPUStrategy(resolver). The initialization code is shown below:
import os

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
A copy of the whole notebook with outputs can be reached at: Colab IPython Notebook
@Pooya448
I know this is quite late, but this may be useful for anyone stuck here. Following is the function I use to connect to TPUs.
import tensorflow as tf

def connect_to_tpu(tpu_address: str = None):
    if tpu_address is not None:  # When using GCP
        cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=tpu_address)
        if tpu_address not in ("", "local"):
            tf.config.experimental_connect_to_cluster(cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
        strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
        print("Running on TPU ", cluster_resolver.master())
        print("REPLICAS: ", strategy.num_replicas_in_sync)
        return cluster_resolver, strategy
    else:  # When using Colab or Kaggle
        try:
            cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
            strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
            print("Running on TPU ", cluster_resolver.master())
            print("REPLICAS: ", strategy.num_replicas_in_sync)
            return cluster_resolver, strategy
        except ValueError:  # no TPU found on the VM
            print("WARNING: No TPU detected.")
            mirrored_strategy = tf.distribute.MirroredStrategy()
            return None, mirrored_strategy
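Usage in Colab would look something like this (create_model and the compile arguments are taken from the question above):

cluster_resolver, strategy = connect_to_tpu()  # auto-detects the Colab TPU

with strategy.scope():
    model = create_model(INPUT_SHAPE, BATCH_SIZE)
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=['accuracy'])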