ANN regression, linear function approximation

I have built a regular ANN-BP setup with one unit in the input and output layers and 4 nodes in the hidden layer, with sigmoid activation. The task is simple: approximate the linear function f(n) = n for n in the range 0-100.

PROBLEM: Regardless of the number of layers, the number of units in the hidden layer, or whether or not I use biases in the node values, it learns to approximate f(n) = Average(dataset), like so:

(plot: the network's output is a flat line at the average of the target values)

The code is written in JavaScript as a proof of concept. I have defined three classes: Net, Layer and Connection, where a Layer holds arrays of input, bias and output values, and a Connection holds 2D arrays of weights and delta weights. Here is the Layer code, where all the important calculations happen:

Ann.Layer = function(nId, oNet, oConfig, bUseBias, aInitBiases) {
var _oThis = this;

var _initialize = function() {
        _oThis.id        = nId;
        _oThis.length    = oConfig.nodes;
        _oThis.outputs   = new Array(oConfig.nodes);
        _oThis.inputs    = new Array(oConfig.nodes);
        _oThis.gradients = new Array(oConfig.nodes);
        _oThis.biases    = new Array(oConfig.nodes);

        _oThis.outputs.fill(0);
        _oThis.inputs.fill(0);
        _oThis.biases.fill(0);

        if (bUseBias) {
            for (var n=0; n<oConfig.nodes; n++) {
                _oThis.biases[n] = Ann.random(aInitBiases[0], aInitBiases[1]);
            }
        }
    };

/****************** PUBLIC ******************/

this.id;
this.length;
this.inputs;
this.outputs;
this.gradients;
this.biases;
this.next;
this.previous;

this.inConnection;
this.outConnection;

this.isInput  = function() { return !this.previous;     }
this.isOutput = function() { return !this.next;         }

this.calculateGradients = function(aTarget) {
    var n, n1, nOutputError,
        fDerivative = Ann.Activation.Derivative[oConfig.activation];

    if (this.isOutput()) {
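        // Output layer: gradient = (output - target) * f'(output), i.e. the usual delta for a squared-error loss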
        for (n=0; n<oConfig.nodes; n++) {
            nOutputError = this.outputs[n] - aTarget[n];
            this.gradients[n] = nOutputError * fDerivative(this.outputs[n]);
        }
    } else {
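        // Hidden layer: accumulate the next layer's gradients through the outgoing weights, then apply f'(output)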
        for (n=0; n<oConfig.nodes; n++) {
            nOutputError = 0.0;
            for (n1=0; n1<this.outConnection.weights[n].length; n1++) {
                nOutputError += this.outConnection.weights[n][n1] * this.next.gradients[n1];
            }
            // console.log(this.id, nOutputError, this.outputs[n], fDerivative(this.outputs[n]));
            this.gradients[n] = nOutputError * fDerivative(this.outputs[n]);
        }
    }
}

this.updateInputWeights = function() {
    if (!this.isInput()) {
        var nY,
            nX,
            nOldDeltaWeight,
            nNewDeltaWeight;

        for (nX=0; nX<this.previous.length; nX++) {
            for (nY=0; nY<this.length; nY++) {
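                // Gradient-descent step: delta_w = -learningRate * previous output * gradient (momentum is added below)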
                nOldDeltaWeight = this.inConnection.deltaWeights[nX][nY];
                nNewDeltaWeight =
                    - oNet.learningRate
                    * this.previous.outputs[nX]
                    * this.gradients[nY]
                    // Add momentum, a fraction of old delta weight
                    + oNet.learningMomentum
                    * nOldDeltaWeight;

                if (nNewDeltaWeight == 0 && nOldDeltaWeight != 0) {
                    console.log('Double overflow');
                }

                this.inConnection.deltaWeights[nX][nY] = nNewDeltaWeight;
                this.inConnection.weights[nX][nY]     += nNewDeltaWeight;
            }
        }
    }
}

this.updateInputBiases = function() {
    if (bUseBias && !this.isInput()) {
        var n,
            nNewDeltaBias;

        for (n=0; n<this.length; n++) {
            nNewDeltaBias = 
                - oNet.learningRate
                * this.gradients[n];

            this.biases[n] += nNewDeltaBias;
        }
    }
}

this.feedForward = function(a) {
    var fActivation = Ann.Activation[oConfig.activation];

    this.inputs = a;

    if (this.isInput()) {
        this.outputs = this.inputs;
    } else {
        for (var n=0; n<a.length; n++) {
            this.outputs[n] = fActivation(a[n] + this.biases[n]);
        }
    }
    if (!this.isOutput()) {
        this.outConnection.feedForward(this.outputs);
    }
}

_initialize();
}

The main feedForward and backProp functions are defined like so:

this.feedForward = function(a) {
    this.layers[0].feedForward(a);
    this.netError = 0;
}

this.backPropagate = function(aExample, aTarget) {
    this.target = aTarget;

    if (aExample.length != this.getInputCount())  { throw "Wrong input count in training data"; }
    if (aTarget.length  != this.getOutputCount()) { throw "Wrong output count in training data"; }

    this.feedForward(aExample);
    _calculateNetError(aTarget);

    var oLayer = null,
        nLast  = this.layers.length-1,
        n;

    for (n=nLast; n>0; n--) {
        if (n === nLast) {
            this.layers[n].calculateGradients(aTarget);
        } else {
            this.layers[n].calculateGradients();
        }
    }

    for (n=nLast; n>0; n--) {
        this.layers[n].updateInputWeights();
        this.layers[n].updateInputBiases();
    }
}

Connection code is rather simple:

Ann.Connection = function(oNet, oConfig, aInitWeights) {
var _oThis = this;

var _initialize = function() {
        var nX, nY, nIn, nOut;

        _oThis.from = oNet.layers[oConfig.from];
        _oThis.to   = oNet.layers[oConfig.to];

        nIn  = _oThis.from.length;
        nOut = _oThis.to.length;

        _oThis.weights      = new Array(nIn);
        _oThis.deltaWeights = new Array(nIn);

        for (nX=0; nX<nIn; nX++) {
            _oThis.weights[nX]      = new Array(nOut);
            _oThis.deltaWeights[nX] = new Array(nOut);
            _oThis.deltaWeights[nX].fill(0);
            for (nY=0; nY<nOut; nY++) {
                _oThis.weights[nX][nY] = Ann.random(aInitWeights[0], aInitWeights[1]);
            }
        }
    };

/****************** PUBLIC ******************/

this.weights;
this.deltaWeights;
this.from;
this.to;

this.feedForward = function(a) {
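    // Plain weighted sum of the 'from' layer's outputs for each 'to' node; biases are added later in Layer.feedForward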
    var n, nX, nY, aOut = new Array(this.to.length);

    for (nY=0; nY<this.to.length; nY++) {
        n = 0;
        for (nX=0; nX<this.from.length; nX++) {
            n += a[nX] * this.weights[nX][nY];
        }
        aOut[nY] = n;
    }

    this.to.feedForward(aOut);
}

_initialize();
}

And my activation functions and derivatives are defined like so:

Ann.Activation = {
    linear : function(n) { return n; },
    sigma  : function(n) { return 1.0 / (1.0 + Math.exp(-n)); },
    tanh   : function(n) { return Math.tanh(n); }
}

Ann.Activation.Derivative = {
    // Note: these take the activation's OUTPUT, not its raw input,
    // e.g. d/dx sigma(x) = sigma(x) * (1 - sigma(x)), so n here is already sigma(x).
    linear : function(n) { return 1.0; },
    sigma  : function(n) { return n * (1.0 - n); },
    tanh   : function(n) { return 1.0 - n * n; }
}

And the configuration JSON for the network is as follows:

var Config = {
    id : "Config1",

    learning_rate     : 0.01,
    learning_momentum : 0,
    init_weight       : [-1, 1],
    init_bias         : [-1, 1],
    use_bias          : false,

    layers: [
        {nodes : 1},
        {nodes : 4, activation : "sigma"},
        {nodes : 1, activation : "linear"}
    ],

    connections: [
        {from : 0, to : 1},
        {from : 1, to : 2}
    ]
}

Perhaps your experienced eye can spot the problem in my calculations?

See example in JSFiddle

asked Nov 08 '16 by Lex


2 Answers

I did not look extensively at the code (it is a lot of code to go through, I would need more time for that later, and I am not 100% familiar with JavaScript). Either way, I believe Stephen introduced some changes in how the weights are calculated, and his code seems to give correct results, so I'd recommend looking at that.

Here are a few points that are not necessarily about the correctness of the computations, but may still help:

  • How many examples are you showing the network during training, and how many times do you present each one? You should present every example you have multiple times; showing each example only once is not enough for gradient-descent-based algorithms to learn, since each update only moves the weights a little bit in the right direction. It is possible that all of your code is correct and you simply have to give it more time to train (see the first sketch after this list).
  • Introducing more hidden layers like Stephen did may help to speed up training, or it may be detrimental; this is typically something to experiment with for your specific case, and it definitely shouldn't be necessary for this simple problem. I suspect a more important difference between your configuration and Stephen's is the activation function used in the hidden layer(s). You used a sigmoid, which means that all of the input values get squashed to lie below 1.0 in the hidden layer, so you then need very large weights to transform these numbers back up to the desired output (which can be as large as 100). Stephen used linear activation functions for all layers, which in this specific case is likely to make training much easier, because you are actually trying to learn a linear function. In many other cases, introducing non-linearities is desirable.
  • It may be beneficial to transform (normalize) both your input and your desired output to lie in [0, 1] instead of [0, 100]. This makes it more likely that your sigmoid layer produces good results (though I'm still not sure it would be enough, because you are still introducing a nonlinearity where you intend to learn a linear function, and you may need more hidden nodes to correct for that). In real-world cases with multiple different input variables this is also typically done, because it ensures that all input variables are initially treated as equally important. You could always add a preprocessing step that normalizes the input to [0, 1], feed that to the network, train it to produce output in [0, 1], and add a postprocessing step that transforms the output back to the original range (see the second sketch after this list).
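
To illustrate the first point, here is a rough sketch of the kind of training driver I mean. Only the backPropagate call follows the question's API; the loop structure and the names (nEpochs, nSamples, oNet) are purely illustrative:

// Illustrative only: present samples of f(n) = n many times over,
// rather than a single pass through the data.
var nEpochs = 5000, nSamples = 100, e, i, x;

for (e = 0; e < nEpochs; e++) {
    for (i = 0; i < nSamples; i++) {
        x = Math.random() * 100;       // pick n somewhere in [0, 100]
        oNet.backPropagate([x], [x]);  // target is f(n) = n
    }
}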
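
And for the third point, a minimal sketch of the pre/postprocessing step, assuming the input/output range is known to be [0, 100]. The helper names are made up, backPropagate and feedForward are from the question's code, and I'm assuming the net exposes its output layer via getOutputLayer() as in the other answer's snippet:

// Map values from [0, 100] into [0, 1] before training, and back afterwards.
function normalize(n)   { return n / 100.0; }
function denormalize(n) { return n * 100.0; }

// Train on normalized input/target pairs:
oNet.backPropagate([normalize(42)], [normalize(42)]);

// After feeding a normalized input forward, map the prediction back:
oNet.feedForward([normalize(42)]);
var nPrediction = denormalize(oNet.getOutputLayer().outputs[0]);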
answered Oct 08 '22 by Dennis Soemers


First... I really like this code. I know very little about NNs (just getting started), so pardon any gaps in my reasoning.

Here is a summary of the changes I made:

// updateInputWeights has this in the middle now:

nNewDeltaWeight =
    oNet.learningRate
    * this.gradients[nY]
    / this.previous.outputs[nX]
    // Add momentum, a fraction of old delta weight
    + oNet.learningMomentum
    * nOldDeltaWeight;

// updateInputWeights has this at the bottom now:

this.inConnection.deltaWeights[nX][nY] += nNewDeltaWeight; // += added
this.inConnection.weights[nX][nY]      += nNewDeltaWeight;

// I modified the following:

	_calculateNetError2 = function(aTarget) {
		var oOutputLayer = _oThis.getOutputLayer(),
			nOutputCount = oOutputLayer.length,
			nError = 0.0,
			nDelta = 0.0,
			n;

		for (n=0; n<nOutputCount; n++) {
			nDelta = aTarget[n] - oOutputLayer.outputs[n];
			nError += nDelta;
		}

		_oThis.netError = nError;
	};

The config section looks like this now:

var Config = {
    id : "Config1",

    learning_rate     : 0.001,
    learning_momentum : 0.001,
    init_weight       : [-1.0, 1.0],
    init_bias         : [-1.0, 1.0],
    use_bias          : false,

    /*
    layers: [
        {nodes : 1, activation : "linear"},
        {nodes : 5, activation : "linear"},
        {nodes : 1, activation : "linear"}
    ],

    connections: [
        {from : 0, to : 1}
        ,{from : 1, to : 2}
    ]
    */

    layers: [
        {nodes : 1, activation : "linear"},
        {nodes : 2, activation : "linear"},
        {nodes : 2, activation : "linear"},
        {nodes : 2, activation : "linear"},
        {nodes : 2, activation : "linear"},
        {nodes : 1, activation : "linear"}
    ],

    connections: [
         {from : 0, to : 1}
        ,{from : 1, to : 2}
        ,{from : 2, to : 3}
        ,{from : 3, to : 4}
        ,{from : 4, to : 5}
    ]
}

These were my resulting images (not reproduced here).

answered Oct 08 '22 by Stephen Mathews