I am learning Artificial Neural Network (ANN) recently and have got a code working and running in Python for the same based on mini-batch training. I followed the book of Michael Nilson's Neural Networks and Deep Learning where there is step by step explanation of each and every algorithm for the beginners. There is also a fully working code for handwritten digit recognition which works fine for me too.
However, I am trying to tweak the code a bit by means of passing the whole mini-batch together to train by backpropagation in the matrix form. I have also developed a working code for that, but the code performs real slow when run. Is there any way I can implement a full matrix based approach to mini-batch learning of the network based on back propagation algorithm?
import numpy as np
import pandas as pd
class Network:
def __init__(self, sizes):
self.layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]
def feed_forward(self, a):
for w, b in zip(self.weights, self.biases):
a = sigmoid(np.dot(w,a) + b)
return a
# Calculate the cost derivative (Gradient of C w.r.t. 'a' - Nabla C(a))
def cost_derivative(self, output_activation, y):
return (output_activation - y)
def update_mini_batch(self, mini_batch, eta):
from scipy.linalg import block_diag
n = len(mini_batch)
xs = [x for x, y in mini_batch]
features = block_diag(*xs)
ys = [y for x, y in mini_batch]
responses = block_diag(*ys)
ws = [a for a in self.weights for i in xrange(n)]
new_list = []
k = 0
while (k < len(ws)):
new_list.append(ws[k: k + n])
k += n
weights = [block_diag(*elems) for elems in new_list]
bs = [b for b in self.biases for i in xrange(n)]
new_list2 = []
j = 0
while (j < len(bs)):
new_list2.append(bs[j : j + n])
j += n
biases = [block_diag(*elems) for elems in new_list2]
baises_dim_1 = [np.dot(np.ones((n*b.shape[0], b.shape[0])), b) for b in self.biases]
biases_dim_2 = [np.dot(b, np.ones((b.shape[1], n*b.shape[1]))) for b in baises_dim_1]
weights_dim_1 = [np.dot(np.ones((n*w.shape[0], w.shape[0])), w) for w in self.weights]
weights_dim_2 = [np.dot(w, np.ones((w.shape[1], n*w.shape[1]))) for w in weights_dim_1]
nabla_b = [np.zeros(b.shape) for b in biases_dim_2]
nabla_w = [np.zeros(w.shape) for w in weights_dim_2]
delta_b = [np.zeros(b.shape) for b in self.biases]
delta_w = [np.zeros(w.shape) for w in self.weights]
zs = []
activation = features
activations = [features]
for w, b in zip(weights, biases):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
delta = self.cost_derivative(activations[-1], responses) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in xrange(2, self.layers):
z = zs[-l] # the weighted input for that layer
activation_prime = sigmoid_prime(z) # the derivative of activation for the layer
delta = np.dot(weights[-l + 1].transpose(), delta) * activation_prime # calculate the adjustment term (delta) for that layer
nabla_b[-l] = delta # calculate the bias adjustments - by means of using eq-BP3.
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose()) # calculate the weight adjustments - by means of using eq-BP4.
delta_b = [self.split_cases(b, n) for b in nabla_b]
delta_w = [self.split_cases(w, n) for w in nabla_w]
self.weights = [w - (eta/n) * nw for w, nw in zip(self.weights, delta_w)]
self.biases = [b - (eta/ n) * nb for b, nb in zip(self.biases, delta_b)]
def split_cases(self, mat, mini_batch_size):
i = 0
j = 0
dim1 = mat.shape[0]/mini_batch_size
dim2 = mat.shape[1]/mini_batch_size
sum_samples = np.zeros((dim1, dim2))
while i < len(mat):
sum_samples = sum_samples + mat[i: i + dim1, j : j + dim2]
i += dim1
j += dim2
return sum_samples
"""Stochastic Gradient Descent for training in epochs"""
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data = None):
n = len(training_data)
if test_data:
n_test = len(test_data)
for j in xrange(epochs):
np.random.shuffle(training_data) # for each epochs the mini-batches are selected randomly
mini_batches = [training_data[k: k+mini_batch_size] for k in xrange(0, n, mini_batch_size)] # select equal sizes of mini-batches for the epochs (last mini_batch size might differ however)
c = 1
for mini_batch in mini_batches:
print "Updating mini-batch {0}".format(c)
self.update_mini_batch(mini_batch, eta)
c += 1
if test_data:
print "Epoch {0}: {1}/{2}".format(j, self.evaluate(test_data), n_test)
else:
print "Epoch {0} completed.".format(j)
def evaluate(self, test_data):
test_results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
return (sum(int(x == y) for x, y in test_results))
def export_results(self, test_data):
results = [(np.argmax(self.feed_forward(x)), y) for (x, y) in test_data]
k = pd.DataFrame(results)
k.to_csv('net_results.csv')
# Global functions
## Activation function (sigmoid)
@np.vectorize
def sigmoid(z):
return 1.0/(1.0 + np.exp(-z))
## Activation derivative (sigmoid_prime)
@np.vectorize
def sigmoid_prime(z):
return sigmoid(z)*(1 - sigmoid(z))
Here is my code. The time taken to iterate 30 epochs reduces from 800+ seconds to 200+ seconds on my machine.
As I am new to python, I use what is readily available. This snippet only requires numpy to run.
Give it a try.
def feedforward2(self, a):
zs = []
activations = [a]
activation = a
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
return (zs, activations)
def update_mini_batch2(self, mini_batch, eta):
batch_size = len(mini_batch)
# transform to (input x batch_size) matrix
x = np.asarray([_x.ravel() for _x, _y in mini_batch]).transpose()
# transform to (output x batch_size) matrix
y = np.asarray([_y.ravel() for _x, _y in mini_batch]).transpose()
nabla_b, nabla_w = self.backprop2(x, y)
self.weights = [w - (eta / batch_size) * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - (eta / batch_size) * nb for b, nb in zip(self.biases, nabla_b)]
return
def backprop2(self, x, y):
nabla_b = [0 for i in self.biases]
nabla_w = [0 for i in self.weights]
# feedforward
zs, activations = self.feedforward2(x)
# backward pass
delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
nabla_b[-1] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in xrange(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
nabla_b[-l] = delta.sum(1).reshape([len(delta), 1]) # reshape to (n x 1) matrix
nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
return (nabla_b, nabla_w)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With