I'm trying to train a VGG19 model for a binary image classification problem. My dataset doesn't fit into memory, so I use batches and the model's .fit_generator function.
However, even when trying to train with batches, I get the following error:
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 392.00MiB. See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:975] Resource exhausted: OOM when allocating tensor with shape
Here's the console output about my GPU when starting the training script:
Using TensorFlow backend.
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so locally
Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties:
name: GeForce GT 750M
major: 3 minor: 0 memoryClockRate (GHz) 1.085
pciBusID 0000:01:00.0
Total memory: 1.95GiB
Free memory: 1.74GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0: Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0)
I may be wrong, but I'd think 1.5+ GB of free memory should be enough to train on small batches, right?
The full output of the script is quite long, so I've pasted a piece of it to this pastebin.
Below is the code for my model:
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau


class VGG19(object):
    def __init__(self, weights_path=None, train_folder='data/train', validation_folder='data/val'):
        self.weights_path = weights_path
        self.model = self._init_model()

        if weights_path:
            self.model.load_weights(weights_path)
        else:
            self.datagen = self._datagen()
            self.train_folder = train_folder
            self.validation_folder = validation_folder

        self.model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )

    def fit(self, batch_size=32, nb_epoch=10):
        train_generator = self.datagen.flow_from_directory(
            self.train_folder, target_size=(224, 224),
            color_mode='rgb', class_mode='binary',
            batch_size=2
        )
        validation_generator = self.datagen.flow_from_directory(
            self.validation_folder, target_size=(224, 224),
            color_mode='rgb', class_mode='binary',
            batch_size=2
        )

        self.model.fit_generator(
            train_generator,
            samples_per_epoch=16,
            nb_epoch=1,
            verbose=1,
            validation_data=validation_generator,
            callbacks=[
                TensorBoard(log_dir='./logs', write_images=True),
                ModelCheckpoint(filepath='weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss'),
                ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.001)
            ],
            nb_val_samples=8
        )

    def evaluate(self, X, y, batch_size=32):
        return self.model.evaluate(
            X, y,
            batch_size=batch_size,
            verbose=1
        )

    def predict(self, X, batch_size=4, verbose=1):
        return self.model.predict(X, batch_size=batch_size, verbose=verbose)

    def predict_proba(self, X, batch_size=4, verbose=1):
        return self.model.predict_proba(X, batch_size=batch_size, verbose=verbose)

    def _init_model(self):
        model = Sequential()

        # Block 1
        model.add(ZeroPadding2D((1, 1), input_shape=(224, 224, 3)))
        model.add(Convolution2D(64, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(64, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        # Block 2
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(128, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(128, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        # Block 3
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(256, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        # Block 4
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        # Block 5
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(512, 3, 3, activation='relu'))
        model.add(MaxPooling2D((2, 2), strides=(2, 2)))

        # Classifier
        model.add(Flatten())
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(4096, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='softmax'))

        return model

    def _datagen(self):
        return ImageDataGenerator(
            featurewise_center=True,
            samplewise_center=False,
            featurewise_std_normalization=True,
            samplewise_std_normalization=False,
            zca_whitening=False,
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True,
            vertical_flip=True
        )
I run the model the following way:
vgg19 = VGG19(train_folder='data/train/train', validation_folder='data/val/val')
vgg19.fit(nb_epoch=1)
and my data/train/train and data/val/val folders each contain two subdirectories, cats and dogs, so that ImageDataGenerator.flow_from_directory() can separate the classes correctly.
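In other words, the directory layout is:

data/
    train/
        train/
            cats/
            dogs/
    val/
        val/
            cats/
            dogs/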
What am I doing wrong here? Is it just that VGG19 is too big for my machine, or is it a problem with the batch sizes?
What can I do to train the model on my machine?
PS: if I don't interrupt the training script (even though it outputs lots of errors similar to the one in the pastebin above), the last lines of the output are the following:
W tensorflow/core/common_runtime/bfc_allocator.cc:274] *****************************************************************************************xxxxxxxxxxx
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 392.00MiB. See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:975] Resource exhausted: OOM when allocating tensor with shape[25088,4096]
Traceback (most recent call last):
File "train.py", line 6, in <module>
vgg19.fit(nb_epoch=1)
File "/home/denis/WEB/DeepLearning/CatsVsDogs/model/vgg19.py", line 84, in fit
nb_val_samples=8
File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 907, in fit_generator
pickle_safe=pickle_safe)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1378, in fit_generator
callbacks._set_model(callback_model)
File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 32, in _set_model
callback._set_model(model)
File "/usr/local/lib/python2.7/dist-packages/keras/callbacks.py", line 493, in _set_model
self.sess = KTF.get_session()
File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 111, in get_session
_initialize_variables()
File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 200, in _initialize_variables
sess.run(tf.variables_initializer(uninitialized_variables))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 766, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 964, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1014, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1034, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[4096]
[[Node: Variable_43/Assign = Assign[T=DT_FLOAT, _class=["loc:@Variable_43"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](Variable_43, Const_59)]]
Caused by op u'Variable_43/Assign', defined at:
File "train.py", line 6, in <module>
vgg19.fit(nb_epoch=1)
File "/home/denis/WEB/DeepLearning/CatsVsDogs/model/vgg19.py", line 84, in fit
nb_val_samples=8
File "/usr/local/lib/python2.7/dist-packages/keras/models.py", line 907, in fit_generator
pickle_safe=pickle_safe)
File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1351, in fit_generator
self._make_train_function()
File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 696, in _make_train_function
self.total_loss)
File "/usr/local/lib/python2.7/dist-packages/keras/optimizers.py", line 387, in get_updates
ms = [K.zeros(shape) for shape in shapes]
File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 278, in zeros
dtype, name)
File "/usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py", line 182, in variable
v = tf.Variable(value, dtype=_convert_string_dtype(dtype), name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 224, in __init__
expected_shape=expected_shape)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/variables.py", line 360, in _init_from_args
validate_shape=validate_shape).op
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_state_ops.py", line 47, in assign
use_locking=use_locking, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 759, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2240, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1128, in __init__
self._traceback = _extract_stack()
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[4096]
[[Node: Variable_43/Assign = Assign[T=DT_FLOAT, _class=["loc:@Variable_43"], use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/gpu:0"](Variable_43, Const_59)]]
Following @rmeertens's advice, I've made the last Dense layers smaller; the last block is now:
model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='softmax'))
and the error changed a bit. It's still an OOM error though: pastebin.com/SamkUbJA
If you told me that this model was working, I'd be very surprised.
Having a softmax activation on 1 output (your last layer) doesn't make sense. The softmax normalizes the outputs of a layer so that they sum to 1; if you only have one output, it will be 1 all the time! So if you want a binary probability, either use a sigmoid on 1 output or a softmax on 2 outputs.
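For example (a rough sketch against the Keras 1.x API you're using; whichever option you pick, the loss and the generator's class_mode have to match):

# Option A: one sigmoid output, binary labels
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# keep class_mode='binary' in flow_from_directory

# Option B: two softmax outputs, categorical labels
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# and switch to class_mode='categorical' in flow_from_directory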
In this case the OOM error appears because your graph is too large. What is the shape of the tensor it was trying to allocate when everything went down?
Anyway, the first thing you could try is allocating the model without having any data in memory. Is something else still running (another Jupyter notebook, some other model serving in the background)?
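If it turns out the model alone barely fits, you can also stop TensorFlow from grabbing most of the GPU memory up front and let it allocate on demand instead. This is a sketch assuming the TensorFlow 1.x-era backend your logs show, run before the model is built:

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.allow_growth = True   # grow GPU memory usage as needed instead of pre-allocating it all
set_session(tf.Session(config=config))   # make Keras use this session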
Also, maybe you can save space in the last layers:
model.add(Dense(4096, activation='relu'))
model.add(Dense(4096, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
A 4096x4096 weight matrix is pretty big (and going from 4096 units straight back down to 1 is a bad idea anyway ;) )
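To put rough numbers on it (float32 arithmetic, matching the shapes in your log):

# The Flatten output of this VGG19 is 7 * 7 * 512 = 25088 features, so the first
# Dense(4096) layer alone holds a [25088, 4096] weight matrix:
25088 * 4096 * 4 / 1024.0**2   # = 392.0 MiB -- exactly the 392.00MiB in the OOM message
4096 * 4096 * 4 / 1024.0**2    # = 64.0 MiB for the 4096x4096 Dense layer
# On top of that, Adam keeps two extra moment buffers per weight (the K.zeros calls
# in your traceback), so these layers need roughly three times that much memory
# before training even starts -- most of the ~1.7 GiB your GPU has free.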