I have a network with two inputs, two hidden nodes in a single layer, and an output node.

I am trying to solve the XOR problem:

| i0 | i1 | desired output |
|----|----|----------------|
| 0  | 0  | 0              |
| 1  | 0  | 1              |
| 0  | 1  | 1              |
| 1  | 1  | 0              |
With my current code, I run all 4 records above in a single epoch and repeat the epoch 20,000 times. I calculate the error after each record, not after each epoch, and back-propagate it at the same time (i.e. per-record updates).
I use only sigmoid in the output layer, as I understand I want a result between 0 and 1.
My network, most of the time, converges. Other times, it doesn't.
I have tried using both sigmoid and tanh in the hidden layer, but neither seems to guarantee convergence.
I have tried randomly generating weights between 0 and 1 as well as between -1 and 1 using a uniform distribution. I have also tried Xavier initialisation, in both its uniform and normal variants. None of these reliably prevents the network from failing to converge, and neither do different combinations of activation function and weight generation.
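For reference, the weight-generation schemes I tried look roughly like this as standalone distributions (a sketch only, not my exact code, which is further down; fanIn stands for the number of inputs to a neuron):
#include <cmath>
#include <random>
int main()
{
    const float fanIn = 2.f; // each hidden neuron has 2 inputs
    std::mt19937 mt(std::random_device{}());
    std::uniform_real_distribution<float> uniform01(0.f, 1.f);  // weights in [0, 1]
    std::uniform_real_distribution<float> uniform11(-1.f, 1.f); // weights in [-1, 1]
    // Xavier-style uniform: limit = sqrt(6 / (fanIn + 1))
    const float limit = std::sqrt(6.f / (fanIn + 1.f));
    std::uniform_real_distribution<float> xavierUniform(-limit, limit);
    // Xavier-style normal: stddev = sqrt(2 / (fanIn + 1))
    std::normal_distribution<float> xavierNormal(0.f, std::sqrt(2.f / (fanIn + 1.f)));
    float w = xavierNormal(mt); // e.g. draw a single weight
    (void)w;
    return 0;
}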
Here is my complete code:
#include <iostream>
#include <array>
#include <cmath>
#include <random>
#include <chrono>
#include <iomanip>
#include <fstream>
#include <algorithm>
typedef float DataType;
typedef DataType (*ActivationFuncPtr)(const DataType&);
const DataType learningRate = std::sqrt(2.f);
const DataType momentum = 0.25f;
const std::size_t numberEpochs = 20000;
DataType sigmoid(const DataType& x)
{
return DataType(1) / (DataType(1) + std::exp(-x));
}
DataType sigmoid_derivative(const DataType& x)
{
return x * (DataType(1) - x);
}
DataType relu(const DataType& x)
{
return x <= 0 ? 0 : x;
}
DataType relu_derivative(const DataType& x)
{
return x <= 0 ? 0 : 1;
}
DataType tanh(const DataType& x)
{
return std::tanh(x);
}
DataType tanh_derivative(const DataType& x)
{
return DataType(1) - x * x;
}
DataType leaky_relu(const DataType& x)
{
return x <= 0 ? DataType(0.01) * x : x;
}
DataType leaky_relu_derivative(const DataType& x)
{
return x <= 0 ? DataType(0.01) : 1;
}
template<std::size_t NumInputs>
class Neuron
{
public:
Neuron(ActivationFuncPtr activationFunction, ActivationFuncPtr derivativeFunc)
:
m_activationFunction(activationFunction),
m_derivativeFunction(derivativeFunc)
{
RandomiseWeights();
}
void RandomiseWeights()
{
std::generate(m_weights.begin(),m_weights.end(),[&]()
{
return m_xavierNormalDis(m_mt);
});
m_biasWeight = m_xavierNormalDis(m_mt);
for(std::size_t i = 0; i < NumInputs+1; ++i)
m_previousWeightUpdates[i] = 0;
}
void FeedForward(const std::array<DataType,NumInputs>& inputValues)
{
DataType sum = m_biasWeight;
for(std::size_t i = 0; i < inputValues.size(); ++i)
sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const
{
return m_output;
}
DataType GetNetInput() const
{
return m_netInput;
}
std::array<DataType,NumInputs> Backpropagate(const DataType& error,
const std::array<DataType,NumInputs>& inputValues,
std::array<DataType,NumInputs+1>& weightAdjustments)
{
DataType errorOverOutput = error;
DataType outputOverNetInput = m_derivativeFunction(m_output);
std::array<DataType,NumInputs> netInputOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
netInputOverWeight[i] = inputValues[i];
}
DataType netInputOverBias = DataType(1);
std::array<DataType,NumInputs> errorOverWeight;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
}
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for(std::size_t i = 0; i < NumInputs; ++i)
{
weightAdjustments[i] = errorOverWeight[i];
}
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType,NumInputs> errorWeights;
for(std::size_t i = 0; i < NumInputs; ++i)
{
errorWeights[i] = errorOverNetInput * m_weights[i];
}
return errorWeights;
}
void AdjustWeights(const std::array<DataType,NumInputs+1>& adjustments)
{
for(std::size_t i = 0; i < NumInputs; ++i)
{
m_weights[i] = m_weights[i] - learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
m_previousWeightUpdates[i] = learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
}
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
m_previousWeightUpdates[NumInputs] = learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
}
const std::array<DataType,NumInputs>& GetWeights() const { return m_weights; }
const DataType& GetBiasWeight() const { return m_biasWeight; }
protected:
static std::mt19937 m_mt;
static std::uniform_real_distribution<DataType> m_uniformDisRandom;
static std::uniform_real_distribution<DataType> m_xavierUniformDis;
static std::normal_distribution<DataType> m_xavierNormalDis;
std::array<DataType,NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
ActivationFuncPtr m_derivativeFunction;
DataType m_output;
DataType m_netInput;
std::array<DataType,NumInputs+1> m_previousWeightUpdates;
};
template<std::size_t NumInputs>
std::mt19937 Neuron<NumInputs>::m_mt(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count());
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_uniformDisRandom(-1,1);
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_xavierUniformDis(-std::sqrt(6.f / NumInputs+1),std::sqrt(6.f / NumInputs+1));
template<std::size_t NumInputs>
std::normal_distribution<DataType> Neuron<NumInputs>::m_xavierNormalDis(0,std::sqrt(2.f / NumInputs+1));
int main()
{
std::ofstream file("error_out.csv", std::ios::out | std::ios::trunc);
if(!file.is_open())
{
std::cout << "couldn't open file" << std::endl;
return 0;
}
file << std::fixed << std::setprecision(80);
std::array<std::array<DataType,2>,4> inputData = {{{0,0},{0,1},{1,0},{1,1}}};
std::array<std::array<DataType,1>,4> desiredOutputs = {{{0},{1},{1},{0}}};
std::array<Neuron<2>*,2> hiddenLayer1 =
{{
new Neuron<2>(tanh, tanh_derivative),
new Neuron<2>(tanh, tanh_derivative)
}};
std::array<Neuron<2>*,1> outputLayer =
{{
new Neuron<2>(sigmoid, sigmoid_derivative)
}};
std::cout << std::fixed << std::setprecision(80);
std::cout << "Initial Weights: " << std::endl;
const std::array<DataType,2>& outputWeights = outputLayer[0]->GetWeights();
const DataType& outputBias = outputLayer[0]->GetBiasWeight();
const std::array<DataType,2>& hidden1Weights = hiddenLayer1[0]->GetWeights();
const DataType& hidden1Bias = hiddenLayer1[0]->GetBiasWeight();
const std::array<DataType,2>& hidden2Weights = hiddenLayer1[1]->GetWeights();
const DataType& hidden2Bias = hiddenLayer1[1]->GetBiasWeight();
std::cout << "W0: " << hidden1Weights[0] << "\n"
<< "W1: " << hidden1Weights[1] << "\n"
<< "B0: " << hidden1Bias << "\n"
<< "W2: " << hidden2Weights[0] << "\n"
<< "W3: " << hidden2Weights[1] << "\n"
<< "B1: " << hidden2Bias << "\n"
<< "W4: " << outputWeights[0] << "\n"
<< "W5: " << outputWeights[1] << "\n"
<< "B2: " << outputBias << "\n" << std::endl;
DataType finalMSE = 0;
std::size_t epochNumber = 0;
while(epochNumber < numberEpochs)
{
DataType epochMSE = 0;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0,2.f);
epochMSE += totalError * totalError;
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType,3> weightAdjustmentsOutput;
std::array<DataType,2> outputError = outputLayer[0]->Backpropagate(propagateError,
{output0,output1},
weightAdjustmentsOutput);
std::array<DataType,3> weightAdjustmentsHidden1;
hiddenLayer1[0]->Backpropagate(outputError[0],dataRow,weightAdjustmentsHidden1);
std::array<DataType,3> weightAdjustmentsHidden2;
hiddenLayer1[1]->Backpropagate(outputError[1],dataRow,weightAdjustmentsHidden2);
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
hiddenLayer1[0]->AdjustWeights(weightAdjustmentsHidden1);
hiddenLayer1[1]->AdjustWeights(weightAdjustmentsHidden2);
}
epochMSE *= DataType(1) / inputData.size();
file << epochNumber << "," << epochMSE << std::endl;
finalMSE = epochMSE;
++epochNumber;
}
std::cout << std::fixed << std::setprecision(80)
<< "\n\n====================================\n"
<< " TRAINING COMPLETE"
<< "\n\n====================================" << std::endl;
std::cout << "Final Error: " << finalMSE << std::endl;
std::cout << "Number epochs: " << epochNumber << "/" << numberEpochs << std::endl;
// output tests
std::cout << std::fixed << std::setprecision(2)
<< "\n\n====================================\n"
<< " FINAL TESTS"
<< "\n\n====================================" << std::endl;
for(std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType,2>& dataRow = inputData[row];
const std::array<DataType,1>& outputRow = desiredOutputs[row];
std::cout << dataRow[0] << "," << dataRow[1] << " (" << outputRow[0] << ") : ";
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
outputLayer[0]->FeedForward({output0,output1});
DataType finalOutput0 = outputLayer[0]->GetOutput();
std::cout << finalOutput0 << std::endl;
}
file.close();
return 0;
}
When things are working, I get an output like:
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.00
0.00,1.00 (1.00) : 0.99
1.00,0.00 (1.00) : 0.99
1.00,1.00 (0.00) : 0.00
When it's not working I get an output like:
====================================
FINAL TESTS
====================================
0.00,0.00 (0.00) : 0.57
0.00,1.00 (1.00) : 0.57
1.00,0.00 (1.00) : 1.00
1.00,1.00 (0.00) : 0.00
When it's working, the error for each epoch looks like this: [error-per-epoch plot not included]
The initial weights were:
W0: -0.47551780939102172851562500000000000000000000000000000000000000000000000000000000
W1: 0.40949764847755432128906250000000000000000000000000000000000000000000000000000000
B0: 2.33756542205810546875000000000000000000000000000000000000000000000000000000000000
W2: 2.16713166236877441406250000000000000000000000000000000000000000000000000000000000
W3: -2.74766492843627929687500000000000000000000000000000000000000000000000000000000000
B1: 0.34863436222076416015625000000000000000000000000000000000000000000000000000000000
W4: -0.53460156917572021484375000000000000000000000000000000000000000000000000000000000
W5: 0.04940851405262947082519531250000000000000000000000000000000000000000000000000000
B2: 0.97842389345169067382812500000000000000000000000000000000000000000000000000000000
But when it doesn't work, the error for each epoch looks like this: [error-per-epoch plot not included]
The initial weights in this particular run were:
W0: 1.16670060157775878906250000000000000000000000000000000000000000000000000000000000
W1: -2.37987256050109863281250000000000000000000000000000000000000000000000000000000000
B0: 0.41097882390022277832031250000000000000000000000000000000000000000000000000000000
W2: -0.23449644446372985839843750000000000000000000000000000000000000000000000000000000
W3: -1.99990248680114746093750000000000000000000000000000000000000000000000000000000000
B1: 1.77582693099975585937500000000000000000000000000000000000000000000000000000000000
W4: 1.98818421363830566406250000000000000000000000000000000000000000000000000000000000
W5: 2.71223402023315429687500000000000000000000000000000000000000000000000000000000000
B2: -0.79067271947860717773437500000000000000000000000000000000000000000000000000000000
I see nothing obviously telling about these weights that would help me generate good starting weights (which is where I believe the problem lies, regardless of the activation function used).
Question: What can I do to ensure convergence occurs?
Do I need to change the weight initialisation? Do I need to use different activation functions? Do I need more layers or a different number of nodes?
I haven't read all your code because it is quite long, but a couple of general suggestions first:

- You could write a NeuralNetwork class and a Connection class eventually, to avoid putting all the logic in main.
- You already have an ActivationFuncPtr typedef, which you could use to try mixing different activation functions across Neurons (maybe with a genetic algorithm)?

Now, to answer your question: there are really no definitive answers, but I can give you a few pieces of advice. One is to use a steeper sigmoid, e.g. 1/(1+exp(-4*x)), 4 being arbitrary, for instance.
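For illustration, a steeper sigmoid and a matching derivative could look something like this (a sketch only; the factor 4 is arbitrary, it assumes the DataType typedef and <cmath> from your code, and the derivative follows your existing convention of taking the neuron's output y rather than the net input):
DataType sigmoid_steep(const DataType& x)
{
    // Still squashes into (0, 1), but with a larger gradient around 0.
    return DataType(1) / (DataType(1) + std::exp(DataType(-4) * x));
}
DataType sigmoid_steep_derivative(const DataType& y)
{
    // For y = sigmoid_steep(x), dy/dx = 4 * y * (1 - y).
    return DataType(4) * y * (DataType(1) - y);
}
You could then plug these in the same way as the other pairs, e.g. new Neuron<2>(sigmoid_steep, sigmoid_steep_derivative).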
I know this is an old question, but the code you provided compiled and ran, so I thought I'd take a look :)

With your code, I would suggest the following (these are the changes made in the updated code below):

- Use four neurons in the hidden layer instead of two.
- Use sigmoid in the hidden layer as well as in the output layer.
- Add an "error threshold" so training can stop early once convergence is met.
- Threshold the final output to 0 or 1 when printing the test results.

The reason convergence isn't guaranteed with your code is the "vanishing gradient problem": when the hidden units saturate, their derivatives are close to zero, so the weight updates become tiny and the network can get stuck.

I have updated your code with these changes. With them, it should usually converge in under 2,000 epochs, but there is still no guarantee.
#include <iostream>
#include <fstream>
#include <array>
#include <cmath>
#include <random>
#include <chrono>
#include <algorithm>
#include <iomanip>
typedef float DataType;
typedef DataType (*ActivationFuncPtr)(const DataType&);
const DataType learningRate = std::sqrt(2.f);
const DataType momentum = 0.25f;
const std::size_t numberEpochs = 20000;
const DataType convergence_threshold = 1e-6;
DataType sigmoid(const DataType& x)
{
return DataType(1) / (DataType(1) + std::exp(-x));
}
DataType sigmoid_derivative(const DataType& x)
{
return x * (DataType(1) - x);
}
DataType tanh(const DataType& x)
{
return std::tanh(x);
}
DataType tanh_derivative(const DataType& x)
{
return DataType(1) - x * x;
}
DataType get_binary_result(const DataType& x)
{
return x < 0.5 ? 0 : 1;
}
template<std::size_t NumInputs>
class Neuron
{
public:
Neuron(ActivationFuncPtr activationFunction, ActivationFuncPtr derivativeFunc)
: m_activationFunction(activationFunction),
m_derivativeFunction(derivativeFunc)
{
RandomiseWeights();
}
void RandomiseWeights()
{
std::generate(m_weights.begin(), m_weights.end(), [&]()
{
return m_xavierNormalDis(m_mt);
});
m_biasWeight = m_xavierNormalDis(m_mt);
for (std::size_t i = 0; i < NumInputs + 1; ++i)
m_previousWeightUpdates[i] = 0;
}
void FeedForward(const std::array<DataType, NumInputs>& inputValues)
{
DataType sum = m_biasWeight;
for (std::size_t i = 0; i < inputValues.size(); ++i)
sum += inputValues[i] * m_weights[i];
m_output = m_activationFunction(sum);
m_netInput = sum;
}
DataType GetOutput() const
{
return m_output;
}
const std::array<DataType, NumInputs>& GetWeights() const { return m_weights; }
const DataType& GetBiasWeight() const { return m_biasWeight; }
std::array<DataType, NumInputs> Backpropagate(const DataType& error,
const std::array<DataType, NumInputs>& inputValues,
std::array<DataType, NumInputs + 1>& weightAdjustments)
{
DataType errorOverOutput = error;
DataType outputOverNetInput = m_derivativeFunction(m_output);
std::array<DataType, NumInputs> netInputOverWeight;
for (std::size_t i = 0; i < NumInputs; ++i)
netInputOverWeight[i] = inputValues[i];
DataType netInputOverBias = DataType(1);
std::array<DataType, NumInputs> errorOverWeight;
for (std::size_t i = 0; i < NumInputs; ++i)
errorOverWeight[i] = errorOverOutput * outputOverNetInput * netInputOverWeight[i];
DataType errorOverBias = errorOverOutput * outputOverNetInput * netInputOverBias;
for (std::size_t i = 0; i < NumInputs; ++i)
weightAdjustments[i] = errorOverWeight[i];
weightAdjustments[NumInputs] = errorOverBias;
DataType errorOverNetInput = errorOverOutput * outputOverNetInput;
std::array<DataType, NumInputs> errorWeights;
for (std::size_t i = 0; i < NumInputs; ++i)
errorWeights[i] = errorOverNetInput * m_weights[i];
return errorWeights;
}
void AdjustWeights(const std::array<DataType, NumInputs + 1>& adjustments)
{
for (std::size_t i = 0; i < NumInputs; ++i)
{
m_weights[i] = m_weights[i] - learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
m_previousWeightUpdates[i] = learningRate * adjustments[i] + momentum * m_previousWeightUpdates[i];
}
m_biasWeight = m_biasWeight - learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
m_previousWeightUpdates[NumInputs] = learningRate * adjustments[NumInputs] + momentum * m_previousWeightUpdates[NumInputs];
}
protected:
static std::mt19937 m_mt;
static std::uniform_real_distribution<DataType> m_uniformDisRandom;
static std::uniform_real_distribution<DataType> m_xavierUniformDis;
static std::normal_distribution<DataType> m_xavierNormalDis;
std::array<DataType, NumInputs> m_weights;
DataType m_biasWeight;
ActivationFuncPtr m_activationFunction;
ActivationFuncPtr m_derivativeFunction;
DataType m_output;
DataType m_netInput;
std::array<DataType, NumInputs + 1> m_previousWeightUpdates;
};
template<std::size_t NumInputs>
std::mt19937 Neuron<NumInputs>::m_mt(std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count());
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_uniformDisRandom(-1, 1);
template<std::size_t NumInputs>
std::uniform_real_distribution<DataType> Neuron<NumInputs>::m_xavierUniformDis(-std::sqrt(6.f / (NumInputs + 1)), std::sqrt(6.f / (NumInputs + 1)));
template<std::size_t NumInputs>
std::normal_distribution<DataType> Neuron<NumInputs>::m_xavierNormalDis(0, std::sqrt(2.f / (NumInputs + 1)));
int main()
{
std::ofstream file("error_out.csv", std::ios::out | std::ios::trunc);
if (!file.is_open())
{
std::cout << "couldn't open file" << std::endl;
return 0;
}
file << std::fixed << std::setprecision(80);
std::array<std::array<DataType, 2>, 4> inputData = {{{0, 0}, {0, 1}, {1, 0}, {1, 1}}};
std::array<std::array<DataType, 1>, 4> desiredOutputs = {{{0}, {1}, {1}, {0}}};
// define hidden layer with 4 neurons instead of 2
std::array<Neuron<2>*, 4> hiddenLayer1 =
{{
new Neuron<2>(sigmoid, sigmoid_derivative),
new Neuron<2>(sigmoid, sigmoid_derivative),
new Neuron<2>(sigmoid, sigmoid_derivative),
new Neuron<2>(sigmoid, sigmoid_derivative)
}};
// Output layer now takes 4 inputs from the hidden layer
std::array<Neuron<4>*, 1> outputLayer =
{{
new Neuron<4>(sigmoid, sigmoid_derivative)
}};
std::cout << std::fixed << std::setprecision(80);
std::cout << "Initial Weights: " << std::endl;
const std::array<DataType, 4>& outputWeights = outputLayer[0]->GetWeights();
const DataType& outputBias = outputLayer[0]->GetBiasWeight();
const std::array<DataType, 2>& hidden1Weights = hiddenLayer1[0]->GetWeights();
const DataType& hidden1Bias = hiddenLayer1[0]->GetBiasWeight();
const std::array<DataType, 2>& hidden2Weights = hiddenLayer1[1]->GetWeights();
const DataType& hidden2Bias = hiddenLayer1[1]->GetBiasWeight();
const std::array<DataType, 2>& hidden3Weights = hiddenLayer1[2]->GetWeights();
const DataType& hidden3Bias = hiddenLayer1[2]->GetBiasWeight();
const std::array<DataType, 2>& hidden4Weights = hiddenLayer1[3]->GetWeights();
const DataType& hidden4Bias = hiddenLayer1[3]->GetBiasWeight();
std::cout << "W0: " << hidden1Weights[0] << "\n"
<< "W1: " << hidden1Weights[1] << "\n"
<< "B0: " << hidden1Bias << "\n"
<< "W2: " << hidden2Weights[0] << "\n"
<< "W3: " << hidden2Weights[1] << "\n"
<< "B1: " << hidden2Bias << "\n"
<< "W4: " << hidden3Weights[0] << "\n"
<< "W5: " << hidden3Weights[1] << "\n"
<< "B2: " << hidden3Bias << "\n"
<< "W6: " << hidden4Weights[0] << "\n"
<< "W7: " << hidden4Weights[1] << "\n"
<< "B3: " << hidden4Bias << "\n"
<< "W8: " << outputWeights[0] << "\n"
<< "W9: " << outputWeights[1] << "\n"
<< "W10: " << outputWeights[2] << "\n"
<< "W11: " << outputWeights[3] << "\n"
<< "B4: " << outputBias << "\n" << std::endl;
DataType finalMSE = 0;
std::size_t epochNumber = 0;
while (epochNumber < numberEpochs)
{
DataType epochMSE = 0;
for (std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType, 2>& dataRow = inputData[row];
const std::array<DataType, 1>& outputRow = desiredOutputs[row];
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
hiddenLayer1[2]->FeedForward(dataRow);
hiddenLayer1[3]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
DataType output2 = hiddenLayer1[2]->GetOutput();
DataType output3 = hiddenLayer1[3]->GetOutput();
outputLayer[0]->FeedForward({output0, output1, output2, output3});
DataType finalOutput0 = outputLayer[0]->GetOutput();
// if there was more than 1 output neuron these errors need to be summed together first to create total error
DataType totalError = 0.5 * std::pow(outputRow[0] - finalOutput0, 2.f);
epochMSE += totalError * totalError;
// Backpropagation
DataType propagateError = -(outputRow[0] - finalOutput0);
std::array<DataType, 5> weightAdjustmentsOutput;
std::array<DataType, 4> outputError = outputLayer[0]->Backpropagate(propagateError,
{output0, output1, output2, output3},
weightAdjustmentsOutput);
std::array<DataType, 3> weightAdjustmentsHidden1_0;
std::array<DataType, 3> weightAdjustmentsHidden1_1;
std::array<DataType, 3> weightAdjustmentsHidden1_2;
std::array<DataType, 3> weightAdjustmentsHidden1_3;
hiddenLayer1[0]->Backpropagate(outputError[0], dataRow, weightAdjustmentsHidden1_0);
hiddenLayer1[1]->Backpropagate(outputError[1], dataRow, weightAdjustmentsHidden1_1);
hiddenLayer1[2]->Backpropagate(outputError[2], dataRow, weightAdjustmentsHidden1_2);
hiddenLayer1[3]->Backpropagate(outputError[3], dataRow, weightAdjustmentsHidden1_3);
// Adjust weights
outputLayer[0]->AdjustWeights(weightAdjustmentsOutput);
hiddenLayer1[0]->AdjustWeights(weightAdjustmentsHidden1_0);
hiddenLayer1[1]->AdjustWeights(weightAdjustmentsHidden1_1);
hiddenLayer1[2]->AdjustWeights(weightAdjustmentsHidden1_2);
hiddenLayer1[3]->AdjustWeights(weightAdjustmentsHidden1_3);
}
epochMSE *= DataType(1) / inputData.size();
file << epochNumber << "," << epochMSE << std::endl;
finalMSE = epochMSE;
// Let's exit if the error is less than the convergence error threshold.
if (epochMSE < convergence_threshold) {
std::cout << "Exiting, as error level less than " << convergence_threshold << "." << std::endl;
break;
}
++epochNumber;
}
std::cout << std::fixed << std::setprecision(80)
<< "\n\n====================================\n"
<< " TRAINING COMPLETE"
<< "\n\n====================================" << std::endl;
std::cout << "Final Error: " << finalMSE << std::endl;
std::cout << "Number epochs: " << epochNumber << "/" << numberEpochs << std::endl;
if (!(finalMSE < convergence_threshold)) {
std::cout << "*** FAILED TO CONVERGE ***" << std::endl;
return 1;
}
// output tests
std::cout << std::fixed << std::setprecision(2)
<< "\n\n====================================\n"
<< " FINAL TESTS"
<< "\n\n====================================" << std::endl;
for (std::size_t row = 0; row < inputData.size(); ++row)
{
const std::array<DataType, 2>& dataRow = inputData[row];
const std::array<DataType, 1>& outputRow = desiredOutputs[row];
std::cout << dataRow[0] << "," << dataRow[1] << " (" << outputRow[0] << ") : ";
// Feed the values through to the output layer
hiddenLayer1[0]->FeedForward(dataRow);
hiddenLayer1[1]->FeedForward(dataRow);
hiddenLayer1[2]->FeedForward(dataRow);
hiddenLayer1[3]->FeedForward(dataRow);
DataType output0 = hiddenLayer1[0]->GetOutput();
DataType output1 = hiddenLayer1[1]->GetOutput();
DataType output2 = hiddenLayer1[2]->GetOutput();
DataType output3 = hiddenLayer1[3]->GetOutput();
outputLayer[0]->FeedForward({output0, output1, output2, output3});
DataType finalOutput0 = get_binary_result(outputLayer[0]->GetOutput());
std::cout << finalOutput0 << std::endl;
}
file.close();
// Clean up dynamically allocated memory
for (auto& neuron : hiddenLayer1) delete neuron;
for (auto& neuron : outputLayer) delete neuron;
return 0;
}
EDIT: I stand corrected. Even with 4 neurons in the hidden layer, the network will sometimes fail to converge (although this happens far less often than with the 2-hidden-neuron model). Failure can be confirmed using:
while ./a.out > /dev/null; do ((counter++)); echo "Counter: $counter"; done
which keeps re-running the program until a run exits with a non-zero status, i.e. until one prints *** FAILED TO CONVERGE *** and returns 1. On my laptop, the failure usually happens after 1000+ executions. So it looks like the "solution" is the dirty one mentioned in the answer above: if you detect that convergence has failed, generate new random weights and start training the network again from scratch.
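In code, that restart strategy could look roughly like the sketch below (illustrative only, built on the classes and constants above; runSingleTraining is a hypothetical helper that would wrap the epoch loop currently in main() and return the final MSE, and maxRestarts is an arbitrary cap):
// Hypothetical helper: runs the epoch loop from main() on the given network
// and returns the final mean squared error.
DataType runSingleTraining(std::array<Neuron<2>*, 4>& hiddenLayer1,
                           std::array<Neuron<4>*, 1>& outputLayer);

bool trainWithRestarts(std::array<Neuron<2>*, 4>& hiddenLayer1,
                       std::array<Neuron<4>*, 1>& outputLayer)
{
    const std::size_t maxRestarts = 10; // arbitrary cap (assumption)
    for (std::size_t attempt = 0; attempt < maxRestarts; ++attempt)
    {
        // Fresh random weights for every attempt.
        for (auto& n : hiddenLayer1) n->RandomiseWeights();
        for (auto& n : outputLayer) n->RandomiseWeights();
        if (runSingleTraining(hiddenLayer1, outputLayer) < convergence_threshold)
            return true; // converged on this attempt
    }
    return false; // give up after maxRestarts attempts
}
main() would then call trainWithRestarts(...) instead of running the epoch loop directly.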