I am trying to implement the resilient backpropagation (Rprop) optimizer for Keras. The challenging part is performing the update on each individual parameter depending on whether its corresponding gradient is positive, negative or zero. I wrote the code below as a start towards implementing the Rprop optimizer. However, I can't seem to find a way to access the parameters individually. Looping over params (as in the code below) returns p, g, g_old, s and wChangeOld at each iteration, all of which are matrices.
Is there a way I could iterate over the individual parameters and update them? It would also work if I could index the parameter vector based on the sign of its gradients.
class Rprop(Optimizer):
    """Draft resilient-backpropagation (Rprop) optimizer for Keras.

    For every parameter tensor this keeps three same-shaped state tensors:
    a step size, the previous gradient and the previous weight change. The
    update is meant to be chosen from the sign of ``g * g_old``.

    NOTE(review): this is a work-in-progress attempt reproduced as posted.
    The branching in ``get_updates`` is written with plain Python ``if`` on
    a tensor, so it does not act element-wise; see the inline notes.
    """

    def __init__(self, init_step=0.01, **kwargs):
        """Create the optimizer state that does not depend on the model.

        Args:
            init_step: Initial value of every per-parameter step size.
            **kwargs: Forwarded unchanged to ``Optimizer.__init__``.
        """
        super(Rprop, self).__init__(**kwargs)
        self.init_step = K.variable(init_step, name='init_step')
        self.iterations = K.variable(0., name='iterations')
        # Factor applied to the step size in get_updates.
        self.posStep = 1.2
        # NOTE(review): negStep and minStep are assigned here but never read
        # anywhere in this class.
        self.negStep = 0.5
        self.minStep = 1e-6
        # Bound passed to K.minimum / K.maximum in get_updates.
        self.maxStep = 50.

    def get_updates(self, params, constraints, loss):
        """Build and return the list of update ops for one training step.

        Args:
            params: Parameter tensors to optimize.
            constraints: Mapping looked up by parameter; a matching entry is
                called on the new parameter value before it is assigned.
            loss: Loss tensor passed to ``self.get_gradients``.

        Returns:
            ``self.updates``, the list of ops built in the loop below.
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        shapes = [K.get_variable_shape(p) for p in params]
        # Per-parameter state: step sizes, previous weight changes and
        # previous gradients, each shaped like its parameter.
        stepList = [K.ones(shape)*self.init_step for shape in shapes]
        wChangeOldList = [K.zeros(shape) for shape in shapes]
        grads_old = [K.zeros(shape) for shape in shapes]
        self.weights = stepList + grads_old + wChangeOldList
        # NOTE(review): this reassignment discards the iterations update
        # that was placed in self.updates a few lines above.
        self.updates = []
        for p, g, g_old, s, wChangeOld in zip(params, grads, grads_old,
                                              stepList, wChangeOldList):
            # Sign of the product of current and previous gradient.
            change = K.sign(g * g_old)
            # NOTE(review): `change` is a tensor, so this Python-level
            # comparison is not the element-wise test it is meant to be.
            if change > 0:
                s_new = K.minimum(s * self.posStep, self.maxStep)
                wChange = s_new * K.sign(g)
                g_new = g
            elif change < 0:
                # NOTE(review): this branch reuses posStep and maxStep;
                # negStep and minStep look like the intended values.
                s_new = K.maximum(s * self.posStep, self.maxStep)
                # Revert the previous weight change.
                wChange = - wChangeOld
                # NOTE(review): stores 0 as the "previous gradient".
                g_new = 0
            else:
                s_new = s
                wChange = s_new * K.sign(g)
                # NOTE(review): stores the parameter itself, not the
                # gradient, as the "previous gradient".
                g_new = p
            self.updates.append(K.update(g_old, g_new))
            self.updates.append(K.update(wChangeOld, wChange))
            self.updates.append(K.update(s, s_new))
            new_p = p - wChange
            # Apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return the base optimizer config extended with ``init_step``.

        ``init_step`` is read back from its backend variable and converted
        to a Python float.
        """
        config = {'init_step': float(K.get_value(self.init_step))}
        base_config = super(Rprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
I was looking for an RProp algorithm in Keras as well and found this question. I took the liberty of adapting your code to my purpose and am posting it back here. So far it seems to work quite well, but I haven't tested it extensively.
Disclaimer: I'm very new to Keras but have a lot of experience with Theano (and Blocks). Furthermore, I tested this only with Theano as a backend, not with TensorFlow.
class RProp(Optimizer):
    """Sign-based resilient backpropagation optimizer.

    Each parameter tensor gets a same-shaped tensor of step sizes and a copy
    of its previous gradient. Where the current and previous gradients have
    a positive product, the step size is multiplied by ``scale_up`` (capped
    at ``max_alpha``); everywhere else it is multiplied by ``scale_down``
    (floored at ``min_alpha``). Every parameter then moves against the sign
    of its gradient by its own step size.
    """

    def __init__(self, init_alpha=1e-3, scale_up=1.2, scale_down=0.5,
                 min_alpha=1e-6, max_alpha=50., **kwargs):
        """Store each hyperparameter as a named backend variable.

        Args:
            init_alpha: Initial step size for every parameter element.
            scale_up: Factor applied where the gradient product is positive.
            scale_down: Factor applied everywhere else.
            min_alpha: Lower bound for a scaled-down step size.
            max_alpha: Upper bound for a scaled-up step size.
            **kwargs: Forwarded unchanged to ``Optimizer.__init__``.
        """
        super(RProp, self).__init__(**kwargs)
        hyperparameters = (
            ('init_alpha', init_alpha),
            ('scale_up', scale_up),
            ('scale_down', scale_down),
            ('min_alpha', min_alpha),
            ('max_alpha', max_alpha),
        )
        for attr_name, value in hyperparameters:
            setattr(self, attr_name, K.variable(value, name=attr_name))

    def get_updates(self, params, constraints, loss):
        """Build and return the update ops for one training step.

        Args:
            params: Parameter tensors to optimize.
            constraints: Mapping looked up by parameter; a matching entry is
                called on the new parameter value before it is assigned.
            loss: Loss tensor passed to ``self.get_gradients``.

        Returns:
            ``self.updates``: for each parameter, the parameter update, the
            step-size update and the stored-gradient update, in that order.
        """
        gradients = self.get_gradients(loss, params)
        param_shapes = [K.get_variable_shape(p) for p in params]
        step_sizes = [K.variable(numpy.ones(shape) * self.init_alpha)
                      for shape in param_shapes]
        previous_grads = [K.zeros(shape) for shape in param_shapes]
        self.weights = step_sizes + previous_grads
        self.updates = []

        state = zip(params, gradients, previous_grads, step_sizes)
        for weight, gradient, previous, step in state:
            # Positive product: the gradient kept its sign since last step.
            same_direction = K.greater(gradient * previous, 0)
            grown = K.minimum(step * self.scale_up, self.max_alpha)
            shrunk = K.maximum(step * self.scale_down, self.min_alpha)
            next_step = K.switch(same_direction, grown, shrunk)

            # Only the sign of the gradient is used for the move.
            move = K.sign(gradient) * next_step
            next_weight = weight - move

            # Apply constraints
            if weight in constraints:
                next_weight = constraints[weight](next_weight)

            self.updates.extend([
                K.update(weight, next_weight),
                K.update(step, next_step),
                K.update(previous, gradient),
            ])
        return self.updates

    def get_config(self):
        """Return the base optimizer config plus this class's settings.

        Each hyperparameter is read back from its backend variable and
        converted to a Python float.
        """
        names = ('init_alpha', 'scale_up', 'scale_down',
                 'min_alpha', 'max_alpha')
        own_config = {
            name: float(K.get_value(getattr(self, name))) for name in names
        }
        merged = dict(super(RProp, self).get_config())
        merged.update(own_config)
        return merged
Important notes:
A few comments about your code (referring to your original variable names):
- wChange is never used across iterations, so you don't need to store it in permanent variables.
- change > 0 does not do what you think it does, because change is a tensor variable. What you want here is an element-wise comparison; use K.switch() instead.
- You used maxStep twice instead of using minStep the other time.
- The case where change is zero is negligible, since that almost never happens in practice.
- g_new = 0 and g_new = p are both completely bogus and should be g_new = g, as in the first if branch.

I'm new to Keras and Python, but I modified the code above a bit for my purposes.
It is an incredibly fast and simple algorithm because it uses full-batch learning and partial derivatives. In my tests it outperformed all other backpropagation algorithms, including Adam. I tested it with TensorFlow and CNTK as backends.
Modified Rprop without Weight-Backtracking: https://pdfs.semanticscholar.org/df9c/6a3843d54a28138a596acc85a96367a064c2.pdf
class iRprop_(Optimizer):
    """Modified Rprop without weight-backtracking.

    Each parameter tensor gets a same-shaped tensor of step sizes and a
    stored gradient sign. Per element, using the product of the current
    gradient sign and the stored one:

    * positive product: step size times ``scale_up``, capped at
      ``max_alpha``;
    * negative product: step size times ``scale_down``, floored at
      ``min_alpha``, and the gradient sign is replaced by zero so that the
      element is not moved this step;
    * zero product: step size is left unchanged.
    """

    def __init__(self, init_alpha=0.01, scale_up=1.2, scale_down=0.5,
                 min_alpha=0.00001, max_alpha=50., **kwargs):
        """Store each hyperparameter as a named backend variable.

        Args:
            init_alpha: Initial step size for every parameter element.
            scale_up: Factor applied where the sign product is positive.
            scale_down: Factor applied where the sign product is negative.
            min_alpha: Lower bound for a scaled-down step size.
            max_alpha: Upper bound for a scaled-up step size.
            **kwargs: Forwarded unchanged to ``Optimizer.__init__``.
        """
        super(iRprop_, self).__init__(**kwargs)
        hyperparameters = (
            ('init_alpha', init_alpha),
            ('scale_up', scale_up),
            ('scale_down', scale_down),
            ('min_alpha', min_alpha),
            ('max_alpha', max_alpha),
        )
        for attr_name, value in hyperparameters:
            setattr(self, attr_name, K.variable(value, name=attr_name))

    def get_updates(self, params, loss):
        """Build and return the update ops for one training step.

        Args:
            params: Parameter tensors to optimize. A parameter's
                ``constraint`` attribute, when present and not ``None``, is
                called on the new value before it is assigned.
            loss: Loss tensor passed to ``self.get_gradients``.

        Returns:
            ``self.updates``: for each parameter, the parameter update, the
            step-size update and the stored-sign update, in that order.
        """
        gradients = self.get_gradients(loss, params)
        param_shapes = [K.get_variable_shape(p) for p in params]
        step_sizes = [K.variable(K.ones(shape) * self.init_alpha)
                      for shape in param_shapes]
        stored_signs = [K.zeros(shape) for shape in param_shapes]
        self.weights = step_sizes + stored_signs
        self.updates = []

        state = zip(params, gradients, stored_signs, step_sizes)
        for weight, gradient, stored_sign, step in state:
            grad_sign = K.sign(gradient)
            # > 0: same direction as last step, < 0: direction flipped,
            # == 0: either sign is zero.
            agreement = grad_sign * stored_sign

            grown = K.minimum(step * self.scale_up, self.max_alpha)
            shrunk = K.maximum(step * self.scale_down, self.min_alpha)
            shrink_or_keep = K.switch(K.less(agreement, 0), shrunk, step)
            next_step = K.switch(K.greater(agreement, 0), grown,
                                 shrink_or_keep)

            # On a flip the sign is zeroed: no move now, and the stored
            # zero makes the next product zero as well.
            used_sign = K.switch(K.less(agreement, 0),
                                 K.zeros_like(grad_sign), grad_sign)
            next_weight = weight - used_sign * next_step

            # Apply constraints.
            constraint = getattr(weight, 'constraint', None)
            if constraint is not None:
                next_weight = constraint(next_weight)

            self.updates.extend([
                K.update(weight, next_weight),
                K.update(step, next_step),
                K.update(stored_sign, used_sign),
            ])
        return self.updates

    def get_config(self):
        """Return the base optimizer config plus this class's settings.

        Each hyperparameter is read back from its backend variable and
        converted to a Python float.
        """
        names = ('init_alpha', 'scale_up', 'scale_down',
                 'min_alpha', 'max_alpha')
        own_config = {
            name: float(K.get_value(getattr(self, name))) for name in names
        }
        merged = dict(super(iRprop_, self).get_config())
        merged.update(own_config)
        return merged
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With