Here is a wrapper for a TensorFlow .pb frozen model (ImageNet classification):
import tensorflow as tf
import numpy as np
import cv2
from numba import cuda


class ModelWrapper():
    def __init__(self, model_filepath):
        self.graph_def = self.load_graph_def(model_filepath)
        self.graph = self.load_graph(self.graph_def)
        self.set_inputs_and_outputs()
        self.sess = tf.Session(graph=self.graph)
        print(self.__class__.__name__, 'call __init__')  #

    def load_graph_def(self, model_filepath):
        # Expects frozen graph in .pb format
        with tf.gfile.GFile(model_filepath, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
        return graph_def

    def load_graph(self, graph_def):
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def, name="")
        return graph

    def set_inputs_and_outputs(self):
        input_list = []
        for op in self.graph.get_operations():  # tensorflow.python.framework.ops.Operation
            if op.type == "Placeholder":
                input_list.append(op.name)
        print('Inputs:', input_list)

        all_name_list = []
        input_name_list = []
        for node in self.graph_def.node:  # tensorflow.core.framework.node_def_pb2.NodeDef
            all_name_list.append(node.name)
            input_name_list.extend(node.input)
        output_list = list(set(all_name_list) - set(input_name_list))
        print('Outputs:', output_list)

        self.inputs = []
        self.input_tensor_names = [name + ":0" for name in input_list]
        for input_tensor_name in self.input_tensor_names:
            self.inputs.append(self.graph.get_tensor_by_name(input_tensor_name))
        self.outputs = []
        self.output_tensor_names = [name + ":0" for name in output_list]
        for output_tensor_name in self.output_tensor_names:
            self.outputs.append(self.graph.get_tensor_by_name(output_tensor_name))

        input_dim_list = []
        for op in self.graph.get_operations():  # tensorflow.python.framework.ops.Operation
            if op.type == "Placeholder":
                bs = op.get_attr('shape').dim[0].size
                h = op.get_attr('shape').dim[1].size
                w = op.get_attr('shape').dim[2].size
                c = op.get_attr('shape').dim[3].size
                input_dim_list.append([bs, h, w, c])
        assert len(input_dim_list) == 1
        _, self.input_img_h, self.input_img_w, _ = input_dim_list[0]

    def predict(self, img):
        h, w, c = img.shape
        if h != self.input_img_h or w != self.input_img_w:
            img = cv2.resize(img, (self.input_img_w, self.input_img_h))
        batch = img[np.newaxis, ...]
        feed_dict = {self.inputs[0]: batch}
        outputs = self.sess.run(self.outputs, feed_dict=feed_dict)  # (1, 1001)
        output = outputs[0]
        return output

    def __del__(self):
        print(self.__class__.__name__, 'call __del__')  #
        import time  #
        time.sleep(3)  #
        cuda.close()
What I'm trying to do is to clean up GPU memory once I no longer need a model. In this example I just create and delete the model in a loop, but in real life it could be several different models.
wget https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz
tar -xvzf inception_v3_2016_08_28_frozen.pb.tar.gz
rm -f imagenet_slim_labels.txt
rm -f inception_v3_2016_08_28_frozen.pb.tar.gz
import os
import time

import tensorflow as tf
import numpy as np

from model_wrapper import ModelWrapper

MODEL_FILEPATH = './inception_v3_2016_08_28_frozen.pb'

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


def create_and_delete_in_loop():
    for i in range(10):
        print('-' * 60)
        print('i:', i)
        model = ModelWrapper(MODEL_FILEPATH)
        input_batch = np.zeros((model.input_img_h, model.input_img_w, 3), np.uint8)
        y_pred = model.predict(input_batch)
        print('y_pred.shape', y_pred.shape)
        print('np.argmax(y_pred)', np.argmax(y_pred))
        del model


if __name__ == "__main__":
    create_and_delete_in_loop()

    print('START WAITING')
    time.sleep(10)
    print('END OF THE PROGRAM!')
Output:
------------------------------------------------------------
i: 0
Inputs: ['input']
Outputs: ['InceptionV3/Predictions/Reshape_1']
ModelWrapper call __init__
y_pred.shape (1, 1001)
np.argmax(y_pred) 112
ModelWrapper call __del__
------------------------------------------------------------
i: 1
Inputs: ['input']
Outputs: ['InceptionV3/Predictions/Reshape_1']
ModelWrapper call __init__
Segmentation fault (core dumped)
What is the proper way of releasing GPU memory?
TL;DR Run your function as a new process+.

tf.reset_default_graph() is not guaranteed to release memory#. When a process dies, all the memory it was given (including your GPU memory) will be released. Not only does this help keep things neatly organized, but you can also analyze how much CPU, GPU, RAM, and GPU memory each process consumes.
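For example, to confirm that the GPU memory really is freed after each child process exits, you can query nvidia-smi from the parent between runs. This is a minimal sketch of my own (the helper gpu_memory_used_mb is not part of the original code), assuming nvidia-smi is on the PATH:

import subprocess

def gpu_memory_used_mb():
    """Return the GPU memory currently in use (MiB) as reported by nvidia-smi."""
    out = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits'])
    # One line per GPU; this sketch only looks at the first GPU.
    return int(out.decode().splitlines()[0])

# Usage: call it before starting a child process and again after join() to compare.
print('GPU memory used (MiB):', gpu_memory_used_mb())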
For example, if you had these functions,
def train_model(x, y, params):
    model = ModelWrapper(params.filepath)
    model.fit(x, y, epochs=params.epochs)

def predict_model(x, params):
    model = ModelWrapper(params.filepath)
    y_pred = model.predict(x)
    print(y_pred.shape)
You can use it like,
import multiprocessing

for i in range(8):
    print(f"Training Model {i} from {params.filepath}")
    process_train = multiprocessing.Process(target=train_model, args=(x_train, y_train, params))
    process_train.start()
    process_train.join()

print("Predicting")
process_predict = multiprocessing.Process(target=predict_model, args=(x_train, params))
process_predict.start()
process_predict.join()
This way Python fires up a new process for each task, and each process runs with its own memory.
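One caveat: predict_model above only prints the result inside the child process. If the parent needs the prediction back, one option (my own sketch, not part of the original answer) is to pass a multiprocessing.Queue, reusing ModelWrapper, x_train, and params from the example above:

import multiprocessing

def predict_model_to_queue(x, params, result_queue):
    # Runs in the child process; put the prediction on the queue for the parent.
    model = ModelWrapper(params.filepath)
    result_queue.put(model.predict(x))

result_queue = multiprocessing.Queue()
p = multiprocessing.Process(target=predict_model_to_queue,
                            args=(x_train, params, result_queue))
p.start()
y_pred = result_queue.get()   # read before join() so the child is not blocked on a full queue
p.join()
print(y_pred.shape)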
Bonus Tip: You can also choose to run the processes in parallel if you have many CPUs and GPUs available; in that case you just need to call process_train.join() after the loop instead of inside it. If you had eight GPUs, you could use this parent script to serve parameters, while each of the individual processes runs on a different GPU, as sketched below.
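A minimal sketch of that parallel variant, reusing train_model, x_train, y_train, and params from the example above. Pinning each child to one GPU via CUDA_VISIBLE_DEVICES is my addition, not something the original answer shows:

import os
import multiprocessing

def train_model_on_gpu(gpu_id, x, y, params):
    # Restrict this child process to a single GPU before TensorFlow initializes.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    train_model(x, y, params)

processes = []
for gpu_id in range(8):                       # one process per GPU
    p = multiprocessing.Process(target=train_model_on_gpu,
                                args=(gpu_id, x_train, y_train, params))
    p.start()
    processes.append(p)

for p in processes:                           # join after the loop, as in the Bonus Tip
    p.join()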
# I tried a variety of things, separately and together, before I started using processes,
tf.reset_default_graph()
K.clear_session()
cuda.select_device(0); cuda.close()
model = get_new_model() # overwrite
model = None
del model
gc.collect()
+ I also considered using threads and subprocess.Popen, but I was satisfied with multiprocessing since it offered full decoupling, which made it a lot easier to manage and allocate resources.
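For completeness, the subprocess.Popen alternative would look roughly like the sketch below. The worker script predict_worker.py is hypothetical; the idea is the same, since all GPU memory is released when the separate interpreter exits:

import subprocess
import sys

# Launch a separate Python interpreter for one inference job.
proc = subprocess.Popen([sys.executable, 'predict_worker.py',
                         './inception_v3_2016_08_28_frozen.pb'])
proc.wait()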