I am making a neural network that should be able to recognize handwritten digits using the MNIST database, which can be downloaded here. The network handles 1 to 5 examples perfectly, but somewhere past 10 examples it starts getting unsure. With a standard sample of 5000 examples, the program stagnates at a cost of roughly 0.42 (it starts at around 1.2). The outputs of all 10 neurons in the last layer also tend toward 0.1, so the network is clearly never very certain of its guess (the output for the guessed digit usually lands somewhere around 0.1 to 0.2, with a few exceptions).
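For reference, that plateau matches the squared-error cost used below (calc_cost) evaluated at exactly this all-0.1 state. With a one-hot target, ten outputs of roughly 0.1 give

Cost = 1/2 * ((1.0 - 0.1)^2 + 9 * (0.1 - 0.0)^2) = 1/2 * (0.81 + 0.09) = 0.45

which is right around where the cost settles, so the outputs appear to have collapsed to near-uniform values rather than the descent just being noisy.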
An example of a guess and the last layer's outputs after 5000 training iterations:
Example: 5000
index: 2435
Cost: 0.459006
Expected: 0
Out_value 0: 0.0900279
Out_value 1: 0.104657
Out_value 2: 0.0980369
Out_value 3: 0.0990471
Out_value 4: 0.101716
Out_value 5: 0.0937537
Out_value 6: 0.0933432
Out_value 7: 0.114351
Out_value 8: 0.10058
Out_value 9: 0.0924466
Guess: 7
Guess certainty: 0.114351
false
I have already tried adjusting the number and size of the hidden layers as well as the learning rate, but the result is always the same (the cost keeps bouncing around roughly 0.42). Naturally, I suspected that my backpropagation or my math simply didn't check out, but when testing with a small network based on a backpropagation guide (linked here), my weights adjusted exactly as the article said they should, down to the decimal. So I don't know what to do to stop my network from stagnating and get it learning past this point. Does anyone know why it stalls like this?
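In case it helps, this is the kind of check I mean: a minimal, self-contained sketch (illustrative only, not my actual network code) that compares the analytic gradient of a single sigmoid neuron under the same squared-error cost against a central finite difference. The values in, w, target and eps are made up for the example:

#include <cmath>
#include <iostream>

int main() {
    //one input, one weight, sigmoid activation, cost = (target - out)^2 / 2,
    //i.e. the same activation and cost as the network below uses
    const double in = 0.5, target = 1.0, eps = 1e-6;
    const double w = 0.3;

    //forward pass as a function of the weight, returning the cost
    auto cost = [&](double weight) {
        const double out = 1.0 / (1.0 + std::exp(-weight * in));
        return std::pow(target - out, 2) / 2.0;
    };

    //analytic gradient: dC/dw = (out - target) * out * (1 - out) * in
    const double out = 1.0 / (1.0 + std::exp(-w * in));
    const double analytic = (out - target) * out * (1.0 - out) * in;

    //numerical gradient via a central difference
    const double numerical = (cost(w + eps) - cost(w - eps)) / (2.0 * eps);

    std::cout << "analytic:\t" << analytic << '\n'
        << "numerical:\t" << numerical << '\n'; //the two should agree to ~1e-9
}

When the analytic and numerical values agree to that many decimal places for every weight, the gradient math itself checks out, which is why I don't think the backprop formulas are the problem.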
The relevant code from the neural network's cpp file:
#define _USE_MATH_DEFINES
#include <cmath>
#include <utility>
#include "neural_network.h"
#include <ctime>
#include <string>
#include <iostream>
#include <cstdlib>
#include <stdexcept>
namespace nn{
double fRand(const double& f_min, const double& f_max){ //generate a random double from f_min to f_max
    const auto f = static_cast<double>(rand()) / RAND_MAX;
    return f_min + f * (f_max - f_min);
}

double sigmoid(const double& net) { //sigmoid function for the out-value
    const double result = 1.0 / static_cast<double>(1.0 + pow(M_E, -net));
    return result;
}

double xavier(int layer_from_size) { //function used to initialize the initial weights
    const double val = sqrt(1.0 / static_cast<double>(layer_from_size));
    return val;
}

double out_net_derivative(const double& out){ //derivative of the out-value with respect to the net-value
    const double val = out * (1 - out);
    return val;
}

double cost_out_derivative(const double& out, const double& target)
//derivative of the cost with respect to the out-value for the neurons in the last layer
{
    const double val = out - target;
    return val;
}

double calc_cost(const Layer& layer, std::vector<double> target){ //calculate the total cost, mainly for logging
    double cost = 0;
    for(int i = 0; i < target.size(); i++){
        cost += pow(target[i] - layer.get_neurons()[i].get_out(), 2) / 2;
    }
    return cost;
}

double delta(const double& cost_out_derivative, const double& out)
//derivative of the cost with respect to the current neuron's out-value, multiplied by out_net_derivative
{
    const double val = cost_out_derivative * out_net_derivative(out);
    return val;
}
Weight::Weight(double weight, int neuron_from_index)
    :weight_{ weight }, neuron_from_index_{ neuron_from_index }
{}

Neuron::Neuron(int pos, int layer) //create an empty neuron
    : net_{ 0.0 }, out_{ 0.0 }, error_gradient_{ 0.0 }, pos_{ pos }, layer_{ layer }
{}

Neuron::Neuron(int pos, double out) //create a neuron in the first layer with a pre-assigned out-value
    : net_{ 0.0 }, out_{ out }, error_gradient_{ 0.0 }, pos_{ pos }, layer_{ 0 }
{}

void Neuron::update_weights(const Layer& layer_from, const double& learning_rate){
    for (Weight& weight : weights_to_) {
        //derivative of the net-value with respect to the weight
        double neuron_from_out = layer_from.get_neurons()[weight.get_neuron_from_index()].get_out();
        //derivative of the cost with respect to the weight
        double val = delta(error_gradient_, out_) * neuron_from_out;
        weight.update_weight(val, learning_rate);
    }
}

void Layer::update_error_gradient(Layer& layer_from)
//update all the error gradients (derivative of the cost with respect to the neuron's out-value) in the previous layer (layer_from)
{
    for (Neuron& neuron : layer_from.neurons_) neuron.set_error_gradient(0); //reset all previous error gradients
    for (int i = 0; i < neurons_.size(); i++) {
        for (int j = 0; j < layer_from.get_neurons().size(); j++) {
            double delta_val = delta(neurons_[i].get_error_gradient(), neurons_[i].get_out());
            //partial derivative of the cost with respect to the previous layer's neuron in position j
            double val = neurons_[i].get_weights_to()[j].get_weight() * delta_val;
            layer_from.neurons_[j].update_error_gradient(val);
        }
    }
}

void Layer::update_bias(const double& learning_rate){
    for(const Neuron& neuron: neurons_){
        //derivative of the cost with respect to the layer bias
        double val = out_net_derivative(neuron.get_out()) * neuron.get_error_gradient();
        bias_ -= learning_rate * val;
    }
}

void Neuron::set_weights(const int& layer_from_size){ //set the initial weights for the neuron
    for(int i = 0; i < layer_from_size; i++){
        //get a random weight using Xavier weight initialization
        double v_val = fRand(-xavier(layer_from_size), xavier(layer_from_size));
        Weight weight{ v_val, i };
        weights_to_.push_back(weight);
    }
}

void Layer::set_weights(const int& layer_from_size){ //set the initial weights for the layer
    for (Neuron& neuron : neurons_) neuron.set_weights(layer_from_size);
}

void Network::set_weights(){ //set the initial weights for the network
    //srand(time(NULL));
    for(int i = 1; i < layers_.size(); i++){
        layers_[i].set_weights(layers_[i - 1].get_neurons().size());
    }
}

Layer::Layer(int pos, int size) //make a layer of any size
    : pos_{ pos }, bias_{ 0.0 }
{
    for (int i = 0; i < size; i++) //fill with neurons according to the desired size
    {
        Neuron neuron{ i, pos };
        neurons_.push_back(neuron);
    }
}

Layer::Layer(std::vector<Neuron> first_layer) //set the first layer of the network from pre-acquired neurons
    :pos_{ 0 }, bias_{ 0.0 }, neurons_{std::move(first_layer)}
{}
void Layer::forward_pass(const Layer& layer_from){ //calculate the net- and out-value of each neuron in the layer
    for(Neuron& neuron : neurons_){
        double val = calc_net(layer_from, neuron, bias_);
        neuron.set_net(val);
        neuron.set_out(sigmoid(val));
    }
}

void Network::forward_pass(){ //calculate the net- and out-value of each neuron in the network
    for (int i = 1; i < layers_.size(); i++)
        layers_[i].forward_pass(layers_[i - 1]);
}

void Layer::backprop(const Layer& layer_from, const double& learning_rate){ //backprop and thus update the weights in the layer
    for (Neuron& neuron : neurons_)
        neuron.update_weights(layer_from, learning_rate);
}

void Network::backprop(const std::vector<double>& target){ //backprop the entire network and thus update the weights and biases
    forward_pass();
    set_last_layer_error_grads(target);
    for(int i = layers_.size() - 1; i > 0; i--){
        //update the error gradients for the previous layer in the network
        layers_[i].update_error_gradient(layers_[i - 1]);
        layers_[i].backprop(layers_[i - 1], learning_rate_);
        layers_[i].update_bias(learning_rate_);
    }
}

Network::Network(std::vector<int> structure, double learning_rate) //create a network skeleton
    :learning_rate_{learning_rate}
{
    for(int i = 0; i < structure.size(); i++){ //fill the network with layers of various sizes according to structure
        Layer layer{ i, structure[i] };
        layers_.push_back(layer);
    }
}

void Network::set_last_layer_error_grads(std::vector<double> target){
    for (int i = 0; i < layers_[layers_.size() - 1].get_neurons().size(); i++) {
        double val = cost_out_derivative(layers_[layers_.size() - 1].get_neurons()[i].get_out(), target[i]);
        layers_[layers_.size() - 1].set_neuron_error_grad(i, val);
    }
}

int Network::get_guess() const{ //get the network's guess for each example (image)
    int guess = 0;
    for (int i = 0; i < layers_[layers_.size() - 1].get_neurons().size(); i++) {
        if (layers_[layers_.size() - 1].get_neurons()[guess].get_out() < layers_[layers_.size() - 1].get_neurons()[i].get_out())
            guess = i;
        //std::cout << "Guess certainty " << i << ":\t" << layers[layers.size() - 1].get_neurons()[i].get_out_value() << '\n';
        std::cout << "Out_value " << i << ":\t" << layers_[layers_.size() - 1].get_neurons()[i].get_out() << '\n';
    }
    std::cout << "Guess:\t" << guess << '\n'
        << "Guess certainty:\t" << layers_[layers_.size() - 1].get_neurons()[guess].get_out() << "\n\n";
    return guess;
}

int Network::get_weight_amount() const //get the number of weights
{
    int amount = 0;
    for (int i = 1; i < layers_.size(); i++) {
        amount += layers_[i - 1].get_neurons().size() * layers_[i].get_neurons().size();
    }
    return amount;
}

double calc_net(const Layer& layer_from, const Neuron& neuron, const double& bias){ //calculate the net-value for a specific neuron
    const std::vector<Neuron>& neurons_from = layer_from.get_neurons();
    const std::vector<Weight>& weights = neuron.get_weights_to();
    if (neurons_from.size() != weights.size())
        throw std::runtime_error("there is not strictly one weight for each neuron in the layer from.");
    double net = 0;
    //calculate the net-value from the previous layer's neurons and the weights connecting them
    for (int i = 0; i < neurons_from.size(); i++)
        net += neurons_from[i].get_out() * weights[i].get_weight();
    net += bias;
    return net;
}
void Network::train(std::ifstream& practice_file, const int& sample_size, const int& practise_loops)
//train the network on a sample of a specific size, a specific number of times according to practise_loops,
//getting the necessary data for the first layer from a practice file
{
    //srand(time(NULL));
    std::vector<Layer> images;
    std::vector<std::vector<double>> targets;
    for(int i = 0; i < sample_size; i++){ //get and store all images and the targets for the images in separate vectors
        std::vector<double> image = get_image(practice_file);
        images.push_back(get_flayer(image));
        targets.push_back(get_target(image, layers_[layers_.size() - 1].get_neurons().size()));
    }
    //backprop through random examples taken from the sample
    for(int i = 0; i < practise_loops; i++){
        int index = rand() % images.size();
        layers_[0] = images[index];
        backprop(targets[index]);
        std::cout << "Example:\t" << i << '\n' << //logging
            "index:\t" << index << '\n'
            << "Cost:\t" << calc_cost(layers_[layers_.size() - 1], targets[index]) << '\n';
        if (correct_guess(targets[index]))
            std::cout << "true\n";
        else std::cout << "false\n";
    }
}

double Network::test(std::ifstream& test_file, const int& sample_size){ //test the network's accuracy
    int correct = 0;
    std::vector<Layer> images;
    std::vector<std::vector<double>> targets;
    for (int i = 0; i < sample_size; i++) {
        std::vector<double> image = get_image(test_file);
        images.push_back(get_flayer(image));
        targets.push_back(get_target(image, layers_[layers_.size() - 1].get_neurons().size()));
    }
    for(int i = 0; i < sample_size; i++)
    {
        layers_[0] = images[i];
        forward_pass();
        if (correct_guess(targets[i])) correct++; //keep track of correct guesses
    }
    double accuracy = 100.0 * correct / sample_size; //calculate the accuracy in percent (100.0 avoids integer division)
    return accuracy;
}

std::vector<double> get_image(std::ifstream& ifs) { //get the data of one image from a file (specifically from the mnist files)
    std::vector<double> values; //all data converted to the relevant format
    std::string value; //one piece of data in string format
    std::string line; //all data in string format
    std::getline(ifs, line); //get the image
    //convert the image string to the relevant grey-scale and target doubles and store them to be returned
    for (const char& ch : line) {
        switch (ch) {
        case '0': case '1':
        case '2': case '3':
        case '4': case '5':
        case '6': case '7':
        case '8': case '9':
        case '.':
            value += ch;
            break;
        default:
            values.push_back(std::stod(value));
            value.clear();
            break;
        }
    }
    values.push_back(std::stod(value)); //store the last piece of data
    return values;
}

std::vector<double> get_target(const std::vector<double>& image, int last_layer_size){ //get the target for an image
    std::vector<double> target(last_layer_size);
    //make sure that every neuron that is not the correct answer isn't lit up, and do the opposite for the correct-answer neuron
    for(int i = 0; i < last_layer_size; i++){
        //according to the file layout, the first piece of data in the image is the target, hence image[0]
        if (i == static_cast<int>(image[0])) target[i] = 1.0; //0.99
    }
    return target;
}

Layer get_flayer(std::vector<double> image) { //build the first layer from an image
    std::vector<Neuron> neurons;
    image.erase(image.begin()); //throw away the target
    for (int i = 0; i < image.size(); i++) {
        Neuron neuron{ i, image[i] };
        neurons.push_back(neuron);
    }
    Layer layer{ neurons };
    return layer;
}
bool Network::correct_guess(const std::vector<double>& target) const{ //check whether a guess by the network is correct
    int expected = 0;
    for (int i = 0; i < target.size(); i++)
        if (target[i] == 1.0) expected = i; //the correct answer is the position of the one fully lit neuron of the bunch
    std::cout << "Expected:\t" << expected << "\n\n";
    return expected == get_guess();
}
}
Link to the full code on GitHub, including some extra functions in the cpp file, h file and main file, for more context: full code