Nested Gradient Tape in function (TF2.0)

Question

I am trying to implement MAML (Model-Agnostic Meta-Learning). For that I need a copy of my model (model_copy) that is trained for one step; then my meta_model has to be trained with the loss of that model_copy.

I would like to do the training of the model_copy inside a function. But if I move my code into a function, I don't get proper gradients_meta (they are all None).

It seems that the graphs are unconnected. How can I connect them?

Any idea what I am doing wrong? I watch a lot of variables, but that doesn't seem to make a difference.

Here is the code to reproduce this issue:

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend


def copy_model(model):
    """Return a new model with the same architecture and weight values as ``model``.

    The architecture is cloned from ``model`` itself instead of being rebuilt
    from hard-coded layer sizes, so the copy stays valid when the layers of the
    source model change.

    Args:
        model: A built Keras model to duplicate.

    Returns:
        A newly constructed Keras model whose weights have been set to the
        current weight values of ``model``.
    """
    # clone_model instantiates fresh layers from the source model's config;
    # their freshly initialised weights are overwritten on the next line.
    copied_model = keras.models.clone_model(model)
    # Transfer the current weight values from the source model to the copy.
    copied_model.set_weights(model.get_weights())
    return copied_model


def compute_loss(model, x, y):
    """Evaluate ``model`` on ``x`` and score its output against ``y``.

    Args:
        model: Callable model; invoked as ``model(x)``.
        x: Model input.
        y: Label / ground truth the output is compared with.

    Returns:
        A ``(mse, logits)`` tuple: the mean of the mean-squared-error between
        ``y`` and the model output, and the model output itself.
    """
    predictions = model(x)  # forward pass of the model
    squared_error = keras.losses.mean_squared_error(y, predictions)
    # Average the loss between prediction and label/truth.
    return keras_backend.mean(squared_error), predictions


# Meta-model whose weights are learned through the outer gradient tape.
meta_model = keras.Sequential([
    keras.layers.Dense(5, input_shape=(1,)),
    keras.layers.Dense(1),
])

# Optimizer used for the training steps.
optimizer = keras.optimizers.Adam()


# function to calculate model_copys params
def do_calc(x, y, meta_model):
    """Copy ``meta_model`` and train the copy for one optimizer step on (x, y).

    This is the version from the question: the returned copy is NOT connected
    to ``meta_model`` in the outer gradient tape (see the notes below).

    Args:
        x: Input passed to the copied model.
        y: Label the copied model's output is compared with.
        meta_model: Model whose weights are copied via ``copy_model``.

    Returns:
        The copied model after one ``optimizer.apply_gradients`` step.
    """
    with tf.GradientTape() as gg:
        # NOTE(review): copy_model transfers the weights with
        # get_weights()/set_weights(); presumably this hands over plain values
        # only, so no recorded operation links the copy to meta_model.
        model_copy = copy_model(meta_model)
        gg.watch(x)
        # Watching meta_model's variables does not help: the loss below is
        # computed from model_copy only.
        gg.watch(meta_model.trainable_variables)
        gg.watch(model_copy.trainable_variables)
        loss, _ = compute_loss(model_copy, x, y)
        # Inner gradient, taken w.r.t. the copy's own variables.
        gradient = gg.gradient(loss, model_copy.trainable_variables)
        # The optimizer updates the copy's variables in place.
        # NOTE(review): this update is likely not seen by the outer tape as a
        # function of meta_model's variables -- confirm against the
        # GradientTape documentation.
        optimizer.apply_gradients(zip(gradient, model_copy.trainable_variables))
        return model_copy


# inputs for training
x = tf.constant(3.0, shape=(1, 1, 1))
y = tf.constant(3.0, shape=(1, 1, 1))

# Outer tape: intended to differentiate the adapted copy's loss w.r.t. the
# meta_model's variables.
with tf.GradientTape() as g:

    g.watch(x)
    g.watch(y)

    # One training step on a copy of meta_model (see do_calc).
    model_copy = do_calc(x, y, meta_model)
    g.watch(model_copy.trainable_variables)
    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)
    # build gradients for meta_model update
    gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
    # The gradients are always None here. Likely cause: test_loss is computed
    # only from model_copy's variables, and do_calc fills those by copying
    # weight values and applying an optimizer step, so nothing recorded on
    # this tape involves meta_model's variables.
    optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))

Thank you in advance for any help.

janbolle · Accepted Answer

I found a solution: I needed to "connect" meta-model and model-copy somehow.

Can anybody explain why this works and how I would achieve that using a "proper" optimizer?

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as keras_backend


def copy_model(model):
    """Create a duplicate of ``model`` with the same layers and weight values.

    The layer structure is taken from ``model`` through
    ``keras.models.clone_model`` rather than being hard-coded, so this helper
    keeps working if the source model's architecture is changed.

    Args:
        model: A built Keras model to duplicate.

    Returns:
        A newly constructed Keras model whose weights have been set to the
        current weight values of ``model``.
    """
    # Rebuild the architecture from the source model's own configuration.
    duplicate = keras.models.clone_model(model)
    # Overwrite the freshly initialised weights with the source model's values.
    duplicate.set_weights(model.get_weights())
    return duplicate


def compute_loss(model, x, y):
    """Compute the mean-squared-error loss of ``model`` on one (x, y) pair.

    Args:
        model: Callable model; invoked as ``model(x)``.
        x: Model input.
        y: Label / ground truth.

    Returns:
        A ``(mse, logits)`` tuple holding the averaged loss and the model
        output it was computed from.
    """
    # Prediction of the model for the given input.
    output = model(x)
    # Loss between prediction and label/truth, averaged by keras_backend.mean.
    error = keras.losses.mean_squared_error(y, output)
    averaged_error = keras_backend.mean(error)
    return averaged_error, output


# Meta-model that is trained from the outer gradient tape.
meta_model = keras.Sequential(
    [
        keras.layers.Dense(5, input_shape=(1,)),
        keras.layers.Dense(1),
    ]
)

# Optimizer that applies the meta-gradients.
optimizer = keras.optimizers.Adam()


# function to calculate model_copys params
def do_calc(meta_model, x, y, gg, alpha=0.01):
    """Build a copy of ``meta_model`` adapted by one gradient-descent step.

    The copy's kernels and biases are replaced by tensors computed directly
    from the corresponding ``meta_model`` parameters, so the adapted copy is
    expressed in terms of ``meta_model``'s variables.

    Args:
        meta_model: Model providing the starting parameters (theta).
        x: Input used for the adaptation step.
        y: Label used for the adaptation step.
        gg: Gradient tape used to differentiate the copy's loss; the caller is
            expected to invoke this function while the tape is recording.
        alpha: Step size of the gradient-descent update.

    Returns:
        The copied model whose layers hold the adapted parameters
        theta' = theta - alpha * gradient.
    """
    model_copy = copy_model(meta_model)
    loss, _ = compute_loss(model_copy, x, y)
    gradients = gg.gradient(loss, model_copy.trainable_variables)
    for index, copy_layer in enumerate(model_copy.layers):
        meta_layer = meta_model.layers[index]
        # Assumes every layer contributes exactly two entries to `gradients`,
        # ordered (kernel, bias).
        kernel_grad = gradients[2 * index]
        bias_grad = gradients[2 * index + 1]
        # Calculate adapted parameters with gradient descent:
        #   theta_i' = theta - alpha * gradients
        copy_layer.kernel = tf.subtract(meta_layer.kernel,
                                        tf.multiply(alpha, kernel_grad))
        copy_layer.bias = tf.subtract(meta_layer.bias,
                                      tf.multiply(alpha, bias_grad))
    return model_copy


# Outer tape: differentiates the adapted copy's loss w.r.t. the meta_model's
# variables.
with tf.GradientTape() as g:
    # inputs for training
    x = tf.constant(3.0, shape=(1, 1, 1))
    y = tf.constant(3.0, shape=(1, 1, 1))
    adapted_models = []  # NOTE(review): never read in the visible code

    # model_copy = meta_model
    # Inner tape: do_calc uses it to take the gradient of the copy's loss and
    # then rebuilds the copy's parameters from meta_model's parameters.
    with tf.GradientTape() as gg:
        model_copy = do_calc(meta_model, x, y, gg)

    # calculate loss of model_copy
    test_loss, _ = compute_loss(model_copy, x, y)
    # build gradients for meta_model update
    gradients_meta = g.gradient(test_loss, meta_model.trainable_variables)
    # gradients work. Why???
    # Likely explanation: do_calc sets each copied layer's kernel/bias to
    # tf.subtract(meta_layer.kernel / .bias, alpha * gradient). Those
    # operations read meta_model's variables while this tape is recording, so
    # test_loss is a recorded function of meta_model's variables.
    optimizer.apply_gradients(zip(gradients_meta, meta_model.trainable_variables))

Nested Gradient Tape in function (TF2.0)

Tags:

python

machine-learning

tensorflow

deep-learning

tensorflow2.0

janbolle

1 Answer

janbolle

Recent Activity

Donate For Us

Nested Gradient Tape in function (TF2.0)

Tags:

python

machine-learning

tensorflow

deep-learning

tensorflow2.0

janbolle

1 Answer

janbolle

Related questions

Recent Activity

Donate For Us