I am trying to understand the intuition behind how a neural network learns. I understand the math behind it and have tried to work it out analytically. While coding a Multilayer Perceptron from scratch in Python, I am running into the problem of an increasing total error. I have commented my code to explain the operations, and I have posted the resulting outputs and plots for three different training scenarios. Also, I have used NumPy vector operations wherever possible to keep the code short.
Summary:
- The main method contains the code that generates the data for the binary classification task, the stochastic-gradient-descent style training loop, and the Dense class objects with their methods for training the model (a small shape sketch of the data follows this list)
- The network has three layers: input (4 nodes), hidden (6 nodes) and output (2 nodes)
- The Dense class is the implementation of a layer of the network
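To make the data setup concrete, here is a minimal, standalone sketch of the shapes involved (it only restates what the full script further down does with scikit-learn's make_classification and OneHotEncoder; the values themselves are random):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import OneHotEncoder

# 10 samples, 4 features, binary target -- same call as in the main script below
X, y = make_classification(n_samples=10, n_features=4, n_classes=2)

# One-hot encode the targets; sparse=False as in my script
# (newer scikit-learn versions spell this sparse_output=False)
targets = OneHotEncoder(sparse=False).fit_transform(y.reshape(-1, 1))

print(X.shape)        # (10, 4) -> the 4 features feed the 4 input nodes
print(targets.shape)  # (10, 2) -> one column per output node
```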
The Dense class represents a single layer of the MLP network and contains the following:
- Constructor: randomly initializes the weights and biases
- sigmoid method: activates the layer's linear combinations, i.e. the activation potentials
- d_sigmoid method: computes the first derivative of the sigmoid
- forward_pass method: performs the forward propagation of the layer
- backward_pass method: performs the backward propagation of the layer (the gradient formulas it is meant to implement are sketched right after this list)
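For reference, these are the local-gradient formulas backward_pass is meant to implement, written out on made-up toy arrays (the variable names here are only illustrative and do not appear in my code):

```python
import numpy as np

# Toy values: a hidden layer with 6 neurons feeding an output layer with 2 neurons
out        = np.array([0.7, 0.4])            # sigmoid outputs of the output layer
target     = np.array([1.0, 0.0])            # one-hot target
hidden_out = np.random.uniform(size=6)       # sigmoid outputs of the hidden layer
w_out      = np.random.uniform(size=(6, 2))  # weights hidden -> output, same layout as in my Dense class

# Output layer: delta_k = (o_k - t_k) * o_k * (1 - o_k)
delta_out = (out - target) * out * (1 - out)

# Hidden layer: delta_j = o_j * (1 - o_j) * sum_k (delta_k * w_jk)
# (w_out @ delta_out is the vectorized form of the sum loop in backward_pass)
delta_hidden = hidden_out * (1 - hidden_out) * (w_out @ delta_out)

print(delta_out.shape)     # (2,)
print(delta_hidden.shape)  # (6,)
```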
#Importing numpy for vector operations
import numpy as np

np.random.seed(78)

class Dense():
    def __init__(self, n_inputs, n_nodes):
        #Weights associated with all n_nodes for n_inputs synaptic inputs
        #Each column in the weight matrix is the weight vector associated with one neuron
        self.weights = np.random.uniform(low=0, high=1, size=n_inputs*n_nodes).reshape(n_inputs, n_nodes)
        #Biases associated with each neuron, shape (n_nodes,)
        #There are n_nodes entries and each one represents the bias associated with one neuron in the layer
        #This one-dimensional array will be added to the linear combinations of all neurons,
        #assuming the synaptic connection associated with the biases is '1'
        self.biases = np.ones(n_nodes)

    def sigmoid(self):
        #Activates the activation_potentials -> ndarray
        #Save the activated ndarray to outputs, as I will need this later
        self.outputs = 1 / (1 + np.exp(-self.activation_potentials))
        return self.outputs

    def d_sigmoid(self):
        #Derivative of the activation function at the activation potential -> ndarray
        #(for all neurons, the value of the first derivative of the activation function at activation_potential)
        #Will be used in the local gradient calculation
        #Vector of shape (n_nodes,)
        return self.outputs * (1 - self.outputs)

    def forward_pass(self, inputs):
        #Calculate the activation potentials of the current layer's neurons -> ndarray
        #Which is inputs times weights plus a bias
        #Stores them in activation_potentials
        self.activation_potentials = np.dot(self.weights.T, inputs) + self.biases
        #Return the outputs of the layer by calling sigmoid(), which activates the activation potentials
        return self.sigmoid()

    def backward_pass(self, learning_rate, inputs_to_layer, target,
                      prev_loc_grads=[], prev_weights=[]):
        #The term "previous layer" in this method means the layer next to the current one (the caller),
        #because the backward signal propagates from the output layer towards the input layer
        #inputs_to_layer: ndarray of inputs to the current layer (the one that called this method)
        #target: ndarray of the target after one-hot encoding
        #prev_loc_grads: local gradients of the layer next to the current one; I call it "previous" because
        #this is backpropagation and the flow is propagated from the output towards the input layer
        #prev_weights: weight matrix of the layer next to the current one; I call it "previous" because of the backward signal flow
        #No previous local gradients means backward_pass was called on the output layer object
        if not len(prev_loc_grads):
            #At the output layer the local gradient of each node is the error at that node * derivative of the activation function
            #The error at a node is (predicted - actual value)
            #The next line performs an element-wise subtraction of the two arrays (predicted and desired)
            self.error_at_end_nodes = self.outputs - target
            #Calculate the local gradients
            self.loc_gradients = self.error_at_end_nodes * self.d_sigmoid()
        else:
            #Local gradients of nodes in a hidden layer are (derivative of activation * sum of all (local gradients
            #of the previous layer's neurons * weights associated with the synaptic connections to those neurons))
            #Calculating the sum of all (local gradients of the previous layer's neurons * the weights of those connections)
            temp = np.zeros(prev_weights.shape[0])
            for i in range(prev_loc_grads.size):
                temp += prev_loc_grads[i] * prev_weights[:, i]
            #Local gradients of the hidden layer
            self.loc_gradients = self.d_sigmoid() * temp
        #Update the weights, based on the learning rate, local gradients and inputs to the layer
        self.weights = self.weights + (learning_rate * np.outer(inputs_to_layer, self.loc_gradients))
        #inputs_to_layer is omitted here, as the bias is (conceptually) multiplied by an input from a neuron with a fixed activation of 1
        self.biases = self.biases + learning_rate * self.loc_gradients
        return self.weights, self.biases
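Before the full training loop, here is a minimal usage sketch of the class on a single sample, just as a shape sanity check for the 4-6-2 stack (it assumes the Dense class above is saved as dense.py, as in the main script, and the sample values are made up):

```python
import numpy as np
from dense import Dense  # the class listed above

hidden = Dense(4, 6)   # 4 inputs -> 6 hidden nodes
output = Dense(6, 2)   # 6 inputs -> 2 output nodes

x = np.array([0.1, -0.2, 0.3, 0.05])   # one 4-feature sample (made-up values)
t = np.array([1.0, 0.0])               # its one-hot target

h = hidden.forward_pass(x)   # shape (6,)
o = output.forward_pass(h)   # shape (2,)

# Backward pass: output layer first (no prev_loc_grads), then the hidden layer,
# passing the output layer's local gradients and weights, as in the main loop below
output.backward_pass(0.0001, h, t)
hidden.backward_pass(0.0001, x, t, output.loc_gradients, output.weights)

print(hidden.weights.shape, output.weights.shape)  # (4, 6) (6, 2)
```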
The main code, containing the training loop and the training data:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from dense import Dense
import numpy as np
import random

def main():
    print("In Main .. Starting ..")
    #Generating data with 10 samples containing 4 features and one target (binary classification)
    data = make_classification(n_samples=10, n_features=4, n_classes=2)
    X_train = data[0]
    y_train = data[1]
    #Normalizing the data
    X_train = X_train/X_train.max()
    print('Training Data: (X) : ', X_train)
    print('Taraining Data: (y): ', y_train)
    #Encoding the targets using the one-hot encoding technique
    enc = OneHotEncoder(sparse = False)
    desired = enc.fit_transform(y_train.reshape(-1,1))
    print("Targets after One Hot Encoding: ", desired)
    #-------------------------------Experimental Code-----------------------------------------
    #network is a list of all layers in the network; each time a layer is created as a Dense() object it is appended to network
    network = []
    print("----------------------------Layer 1-------------------------")
    network.append(Dense(4,6))
    print('Created Only Hidden Layer: N_nodes: {} , N_inputs: {}'.format(6,4))
    print("----------------------------Layer 2-------------------------")
    network.append(Dense(6,2))
    print('Created Output Layer: N_nodes: {} , N_inputs: {}'.format(2,6))
    epoch = 1
    error = []
    #The main training loop, exits after 10 epochs
    while epoch <= 10:
        #Random shuffling of the training data
        temp = list(zip(X_train,y_train))
        random.shuffle(temp)
        X_train,y_train=zip(*temp)
        #Variables to store the temporary state of the operations
        weights = 0
        biases = 0
        #List to store the intermediate weights and biases for the mean square error calculation at the end of each epoch
        wb = []
        print('--------------------------Epoch: {}-------------------------'.format(epoch), '\n')
        #Select one feature vector from the feature matrix and the corresponding target vector from the desired matrix (obtained from one-hot encoding)
        for x,y in zip(X_train, desired):
            #previous_inputs keeps track of the inputs from layer to layer in each epoch
            previous_inputs = []
            #At the start, the list contains only the inputs from the input nodes, which are the features of the current training sample
            previous_inputs.append(x)
            #This loop iterates over all layers in the network and performs the forward pass
            for layer in network:
                #forward_pass performs the forward propagation of a layer on the last element of the previous_inputs list,
                #and returns the output of the layer, which is stored as an ndarray in the list to be used as inputs to the next layer
                previous_inputs.append(layer.forward_pass(previous_inputs[-1]))
            #Ignore the output of the last layer, as I'm using the previous_inputs list in reverse order in backward_pass in the next loop
            previous_inputs = previous_inputs[:-1]
            #The next loop reverses the network and previous_inputs lists to perform the backward propagation of all layers, from the output layer all the way back to the input layer
            for layer, inputs in zip(network[::-1], previous_inputs[::-1]):
                #If the layer is not the output layer, perform the backward propagation using the code inside the if statement
                if layer != network[-1]:
                    #Call backward_pass with learning rate = 0.0001, the inputs to the current layer, the target vector 'y',
                    #prev_loc_gradients (local gradients of the layer next to the current layer),
                    #and prev_weights (weights of the layer next to the current layer)
                    #Store the updated weights and biases for the mean square error calculation at the end of the epoch
                    weights, biases = layer.backward_pass(0.0001, inputs, y, prev_loc_gradients, prev_weights)
                #Otherwise, perform the backward pass for the output layer using the code in the else block
                else:
                    weights, biases = layer.backward_pass(0.0001, inputs, y)
                #Store the local gradients and weights of the current layer for the next layer's backward pass
                prev_loc_gradients = layer.loc_gradients
                prev_weights = layer.weights
            #Add the updated weights and biases to wb; they will be used in the next loop
            wb.append((weights, biases))
            #error_i is the sum of errors over all training examples with the updated weights and biases
            error_i = 0
            #This loop calculates the total error with the new weights and biases, considering the whole training data
            for x_val, y_val in zip(X_train, desired):
                previous_inputs = []
                previous_inputs.append(x_val)
                #Perform a forward pass with the new weights and biases
                for layer in network:
                    #Forward pass
                    previous_inputs.append(layer.forward_pass(previous_inputs[-1]))
                #Add the prediction error of the current training sample to the previous errors
                error_i += np.power((previous_inputs[-1] - y_val), 2).sum()
            #Append the total error to the error list and repeat the process for the next sample; do this for all samples
            error.append(error_i)
        #Increase epoch by one; perform forward and backward passes sample by sample and calculate the error over all samples until the while condition is false
        epoch += 1
    #Plot the errors after training completes
    plt.plot(error)
    plt.show()
    #-------------------------------Experimental Code-----------------------------------------

if __name__ == "__main__":
    main()
Finally, depending on the number of epochs and the training-data size, I get the following outputs and plots:
- For 1 epoch and 10 training samples
In Main .. Starting ..
Training Data: (X) : [[-0.26390333 -0.12430637 -0.38741338 0.20075948]
[ 0.63580037 -1.05223163 -0.58551008 0.68911107]
[-0.54448011 0.08334418 -0.4174701 0.11937366]
[ 0.22123838 -0.54513245 -0.40486294 0.39508491]
[-0.3489578 -0.2067747 -0.55992358 0.30225496]
[ 0.46346633 0.29702914 0.76883225 -0.42087526]
[ 0.05631264 0.04373764 0.10200898 -0.05777301]
[ 0.19738736 -0.26007568 -0.10694419 0.15615838]
[ 0.12548086 -0.17220663 -0.07570972 0.10523554]
[-0.52398487 1. 0.63178402 -0.68315832]]
Taraining Data: (y): [0 1 0 1 0 1 1 1 0 0]
Targets after One Hot Encoding: [[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]]
----------------------------Layer 1-------------------------
Created Only Hidden Layer: N_nodes: 6 , N_inputs: 4
----------------------------Layer 2-------------------------
Created Output Layer: N_nodes: 2 , N_inputs: 6
--------------------------Epoch: 1-------------------------
- For 10 epochs and 10 training samples
In Main .. Starting ..
Training Data: (X) : [[-0.26390333 -0.12430637 -0.38741338 0.20075948]
[ 0.63580037 -1.05223163 -0.58551008 0.68911107]
[-0.54448011 0.08334418 -0.4174701 0.11937366]
[ 0.22123838 -0.54513245 -0.40486294 0.39508491]
[-0.3489578 -0.2067747 -0.55992358 0.30225496]
[ 0.46346633 0.29702914 0.76883225 -0.42087526]
[ 0.05631264 0.04373764 0.10200898 -0.05777301]
[ 0.19738736 -0.26007568 -0.10694419 0.15615838]
[ 0.12548086 -0.17220663 -0.07570972 0.10523554]
[-0.52398487 1. 0.63178402 -0.68315832]]
Taraining Data: (y): [0 1 0 1 0 1 1 1 0 0]
Targets after One Hot Encoding: [[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]]
----------------------------Layer 1-------------------------
Created Only Hidden Layer: N_nodes: 6 , N_inputs: 4
----------------------------Layer 2-------------------------
Created Output Layer: N_nodes: 2 , N_inputs: 6
--------------------------Epoch: 1-------------------------
--------------------------Epoch: 2-------------------------
--------------------------Epoch: 3-------------------------
--------------------------Epoch: 4-------------------------
--------------------------Epoch: 5-------------------------
--------------------------Epoch: 6-------------------------
--------------------------Epoch: 7-------------------------
--------------------------Epoch: 8-------------------------
--------------------------Epoch: 9-------------------------
--------------------------Epoch: 10-------------------------
- For 50 epochs and 1000 samples
In Main .. Starting ..
Training Data: (X) : [[ 0.10845729 0.03110484 -0.10935314 -0.01435112]
[-0.27863109 -0.17048214 -0.04769305 0.04802046]
[-0.10521553 -0.07933533 -0.07228399 0.01997508]
...
[-0.25583767 -0.24504791 -0.36494096 0.0549903 ]
[ 0.06933997 -0.29438308 -1.21018002 0.02951967]
[-0.02084834 0.06847175 0.29115171 -0.00640819]]
Taraining Data: (y): [1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0
1 1 0 0 1 0 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0
1 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 0 1 0 1
0 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 0 0 0
0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1
1 0 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0
0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1
0 0 0 1 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 0 0
1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 0 0 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0
1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0
0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0
1 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 0 1 1 0 1 1 0 1 1 0
1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 0 0 1
0 0 0 1 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 0 0 1 0 1
0 0 1 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1
1 1 1 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 1 0
0 1 1 1 1 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0
1 1 0 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 0
1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 1 1
1 1 1 1 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 1 0
1 0 1 0 1 0 0 1 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0
1 0 0 1 1 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 0 0
0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 1 1 0
0 1 1 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 1 1 0 0 0 1 1 0 1
0 1 0 0 0 1 1 1 0 0 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1
1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1
1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1 0 0
0]
Targets after One Hot Encoding: [[0. 1.]
[1. 0.]
[1. 0.]
...
[1. 0.]
[1. 0.]
[1. 0.]]
----------------------------Layer 1-------------------------
Created Only Hidden Layer: N_nodes: 6 , N_inputs: 4
----------------------------Layer 2-------------------------
Created Output Layer: N_nodes: 2 , N_inputs: 6
--------------------------Epoch: 1-------------------------
--------------------------Epoch: 2-------------------------
--------------------------Epoch: 3-------------------------
--------------------------Epoch: 4-------------------------
--------------------------Epoch: 5-------------------------
--------------------------Epoch: 6-------------------------
--------------------------Epoch: 7-------------------------
--------------------------Epoch: 8-------------------------
--------------------------Epoch: 9-------------------------
--------------------------Epoch: 10-------------------------
--------------------------Epoch: 11-------------------------
--------------------------Epoch: 12-------------------------
--------------------------Epoch: 13-------------------------
--------------------------Epoch: 14-------------------------
--------------------------Epoch: 15-------------------------
--------------------------Epoch: 16-------------------------
--------------------------Epoch: 17-------------------------
--------------------------Epoch: 18-------------------------
--------------------------Epoch: 19-------------------------
--------------------------Epoch: 20-------------------------
--------------------------Epoch: 21-------------------------
--------------------------Epoch: 22-------------------------
--------------------------Epoch: 23-------------------------
--------------------------Epoch: 24-------------------------
--------------------------Epoch: 25-------------------------
--------------------------Epoch: 26-------------------------
--------------------------Epoch: 27-------------------------
--------------------------Epoch: 28-------------------------
--------------------------Epoch: 29-------------------------
--------------------------Epoch: 30-------------------------
--------------------------Epoch: 31-------------------------
--------------------------Epoch: 32-------------------------
--------------------------Epoch: 33-------------------------
--------------------------Epoch: 34-------------------------
--------------------------Epoch: 35-------------------------
--------------------------Epoch: 36-------------------------
--------------------------Epoch: 37-------------------------
--------------------------Epoch: 38-------------------------
--------------------------Epoch: 39-------------------------
--------------------------Epoch: 40-------------------------
--------------------------Epoch: 41-------------------------
--------------------------Epoch: 42-------------------------
--------------------------Epoch: 43-------------------------
--------------------------Epoch: 44-------------------------
--------------------------Epoch: 45-------------------------
--------------------------Epoch: 46-------------------------
--------------------------Epoch: 47-------------------------
--------------------------Epoch: 48-------------------------
--------------------------Epoch: 49-------------------------
--------------------------Epoch: 50-------------------------
I don't understand what is causing the error to increase, when the goal is to reduce it. What am I missing?