
I'm working on a school project and am stuck on how to implement backpropagation in NumPy with the forward-prop structure I currently have. The goal of the script is to create a simple dynamic (meaning any number of layers and nodes) fully connected network using only numpy.

I think I have to find the derivative of the activation function and multiply it by the original error, as well as by the derivative of each activation function I encounter moving backwards.

However, I can't figure out how to implement this correctly in my script.

Given the complexity of the setup here, it would be a huge help if someone could explain in plain English exactly what I need to do, or even recommend a video/post that deals with dynamically sized backpropagation.

Right now, all of the weights and biases are stored in lists for future backprop, and I'm able to get the error of each output with the small amount of code currently in the backprop function.
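
If my understanding above is right, I think the error term I should be pushing back from the output layer looks something like the line below (just my guess, not code from my script; Y_hat stands for the sigmoid predictions, and the sigmoid derivative is written in terms of the sigmoid output itself):

#dMSE/dY_hat = 2 * (Y_hat - Y), then multiply by the sigmoid derivative Y_hat * (1 - Y_hat)
delta_out = 2 * (Y_hat - Y) * Y_hat * (1 - Y_hat)

rather than the plain squared error my backprop function returns right now.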

This code block:

#initialize a test model w/ a batch size of 128 and lr of 0.01
model = Model(128, 0.01)

#simple x data input
X = np.array([[1,1],[0,0],[12,5]])
Y = np.array([[1],[0],[-1]])
#adding 2 layers
z = model.add(X, 3, "sigmoid")
z = model.add(z, 1, "sigmoid", output=True)

#this is a full forward pass through the layers
z = model.predict(X)
print(z)

#this is the error of the predictions
print(model.backprop(z, Y))

outputs the following vectors:

[[0.50006457]
 [0.50006459]
 [0.50006431]]

[[0.24993544]
 [0.2500646 ]
 [2.25019293]]

Like I said, I have no idea how to move forward (or backward ;) ) from here.

Below is the full script needed to run the example:

import math
import numpy as np

#everything below is defining activation functions
#--------------------------------------------------------------------------------------------

def b_relu(input):
  #np.vectorize passes one scalar at a time, so just compare it against 0
  return max(0, input)

def bd_relu(input):
  #derivative of relu: 0 for inputs <= 0, otherwise 1
  if input <= 0:
    return 0
  else:
    return 1

def b_sigmoid(x):
  return 1 / (1 + math.exp(-x))

def bd_sigmoid(input):
  #derivative of sigmoid: s(x) * (1 - s(x)), using the scalar sigmoid directly
  s = b_sigmoid(input)
  return s * (1 - s)

def b_tanh(input):
  top = (math.exp(input) - math.exp(-input))
  bottom = (math.exp(input) + math.exp(-input))
  return (top/bottom)

#helper functions for tanh
def cosh(input):
  return ((math.exp(input) + math.exp(-input)) / 2)
def sinh(input):
  return ((math.exp(input) - math.exp(-input)) / 2) 

def bd_tanh(input):
  #derivative of tanh: (cosh^2 - sinh^2) / cosh^2, which simplifies to 1 / cosh^2
  top = (math.pow(cosh(input), 2) - math.pow(sinh(input), 2))
  bottom = math.pow(cosh(input), 2)
  return (top / bottom)

def b_softmax(z):
  # subtracting the max adds numerical stability
  shiftx = z - np.max(z,axis=1)[:,np.newaxis]
  exps = np.exp(shiftx)
  return exps / np.sum(exps,axis=1)[:,np.newaxis]

def bd_softmax(Y_hat, Y):
  return Y_hat - Y

def b_linear(input):
  return input
def bd_linear(input):
  return 1

#vectorizing the activation and deriv. activation functions
relu = np.vectorize(b_relu)
d_relu = np.vectorize(bd_relu)

sigmoid = np.vectorize(b_sigmoid)
d_sigmoid = np.vectorize(bd_sigmoid)

tanh = np.vectorize(b_tanh)
d_tanh = np.vectorize(bd_tanh)

#softmax already operates on whole rows, so it is not vectorized elementwise
softmax = b_softmax
d_softmax = bd_softmax

linear = np.vectorize(b_linear)
d_linear = np.vectorize(bd_linear)
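
#look up an activation function by its string name and apply it (forward() calls this)
def activate(Z, name):
  acts = {"relu": relu, "sigmoid": sigmoid, "tanh": tanh, "softmax": softmax, "linear": linear}
  return acts[name](Z)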

class Model:
  def __init__(self, batch, lr):
    #initializing self lists to keep track of stuff for batches, forward prop & backprop
    self.batch = batch
    self.lr = lr
    self.W = []
    self.B = []
    self.A = []
    self.Z = []
    self.X = []
    self.layers = []
    self.tempW = []
    self.tempB = []

    #store error for backprop
    self.output_error = []

  #initialize the weights during 'model.add' so we can test our network shapes dynamically w/out model.compile
  #added an output bool here so we can make sure the shape of the output network is (1,n)
  def initial_weights(self, input_data, output_shape, output=False):
    B = np.zeros((1, output_shape))
    #assigning the shape 
    W = np.random.uniform(-1e-3, 1e-3, size = (input_data.shape[len(input_data.shape) - 1], output_shape))
    self.B.append(B)
    self.W.append(W)

  def add(self, input_data, output_shape, activation, output=False):
    #append a placeholder to layers so len(self.layers) gives the correct index (the value itself is never used)
    self.layers.append(69)

    #making sure our data is a numpy array
    if (type(input_data) == np.ndarray):
      X = input_data
    else:
      X = np.asarray(input_data)

    #adding data and activations to self lists
    self.X.append(X)
    self.A.append(activation)


    #keep track of our index & initialize random weights for dynamic compatibility testing
    index = len(self.layers)-1
    self.initial_weights(input_data, output_shape, output=output)

    X2 = self.forward(input_data, index)
    #printing layer info 
    print("Layer:", index)
    print("Input Shape: ", X.shape)
    print("Weight Shape: ", self.W[index].shape)
    print("Output Shape: ", X2.shape)
    print(" ")
    return(X2)

  def forward(self, input_data, index):
    #pulling weights and biases from  main lists for operations
    B = self.B[index]
    W = self.W[index]

    #matmul of data @ weights + bias (this already sums the weighted inputs going into each node)
    Z = np.matmul(input_data, W) + B
    #pulling activation from index 
    act = str(self.A[index])
    #activating 
    Z = activate(Z, act)
    #keeping track of Z for backprop
    self.Z.append(Z)
    return(Z)

  def predict(self, input_data):
    for x in range(len(self.layers)):
      z = self.forward(input_data, x)
      input_data = z
    return z

  def backprop(self, model_output, ground_truth):
    #------------------------------
    #now begins the backprop portion
    #let's start with finding the error between predictions and actual values

    #gonna do MSE to keep it simple
    self.output_error = (ground_truth - model_output) ** 2
    #so now we have the error of the output layer, this tells us two things, how wrong we were, and in which direction we should update
    #the outputs of these nodes

    '''
    What to do if this was linear regression (for m & b)
    1. Take the error and multiply it by the transpose of the last layer weights 
    (I think the error in this case is where the prime activation function should be if we had activations)
    2. The last layer bias is just the error
    3. The second to last layer inputs is the bias times the transpose of the second layer's weights
    4. Then I have no idea
    '''
    return self.output_error
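
For reference, this is the rough shape of the dynamic backward loop I think the function above needs to become (only a sketch under my own assumptions: MSE loss, all-sigmoid layers, and a forward pass that also records each layer's input in layer_inputs and its pre-activation in pre_acts, which my current forward() doesn't do yet):

def backprop_sketch(W, B, layer_inputs, pre_acts, Y_hat, Y, lr):
  #derivative of MSE w.r.t. the prediction, times the derivative of the output activation
  delta = 2 * (Y_hat - Y) * d_sigmoid(pre_acts[-1])
  for i in reversed(range(len(W))):
    #gradients of the loss w.r.t. this layer's weights and biases
    dW = np.matmul(layer_inputs[i].T, delta)
    dB = np.sum(delta, axis=0, keepdims=True)
    #push the error back through this layer's weights and the previous layer's activation
    if i > 0:
      delta = np.matmul(delta, W[i].T) * d_sigmoid(pre_acts[i - 1])
    #gradient descent update
    W[i] = W[i] - lr * dW
    B[i] = B[i] - lr * dB

Is that the right general idea for walking backwards through self.W and self.B, or am I misunderstanding how the delta should be propagated?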
