import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0) # number of samples

Ni = np.size(input,1) # dimension of the samples of input

No = 1 # dimension of the sample of output

Nh = 10 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
print(Ws)

Wo = 1/4*np.random.rand(No,Nh)
print(Wo)

alpha = 0.05 # Learning rate

t_ = []

loss_ = []

def ReLU(x):
    return np.maximum(0,x)

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,3000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo
        
        x = np.append(input[id_],1)
        
        Z_1 = np.dot(Ws,x)
        Z_2 = np.dot(Wo,ReLU(Z_1))
        y = sigmoid(Z_2)
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                if Z_1[j] >= 0:
                    dWo[i,j] = dWo[i,j] + (y[i]-d)*Z_1[j]
                    #dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                else:
                    dWo[i,j] += 0

        Wo = Wo - alpha*dWo
        
        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    if Z_1[j] >= 0:
                        dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*(y[i]-d)
                        #dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d)              
                    else:
                        dWs[j,k] += 0
                        
        Ws = Ws - alpha*dWs
        
        loss = loss + 1/2*np.linalg.norm(y-d)

    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
            
        t_ = np.append(t_,epoch)
            
        loss_ = np.append(loss_,loss)
            
        fig = plt.figure(num=0,figsize=[10,5])
        plt.plot(t_,loss_,marker="")
        plt.title('Loss decay')
        plt.xlabel('epoch', fontsize=20)
        plt.ylabel('Loss', fontsize=20)
        plt.show()
            
        ## figure out the function shape the model========================================== 
        xn = np.linspace(0,1,20)
        yn = np.linspace(0,1,20)
        xm, ym = np.meshgrid(xn, yn)
        xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
        yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))
        Z = []
        for id__ in range(0,np.size(xm)):
            x = np.append([xx[id__],yy[id__]],[1,1])
            Z_1 = np.dot(Ws,x)
            y_ = sigmoid(np.dot(Wo,ReLU(Z_1)))
            Z = np.append(Z,y_)
                
        fig = plt.figure(num=1,figsize=[10,5])
        ax = fig.add_subplot(projection='3d')
        surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)            
        print("====================================================================")
        plt.show()       
        
## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
        
    Z_1 = np.dot(Ws,x)
        
    y = sigmoid(np.dot(Wo,ReLU(Z_1)))
    
    print(y)

If I use the sigmoid activation function it works fine, but when I implement the ReLU activation function the program doesn't learn anything.

Hand calculation of the MLNN

The NN consists of three layers (input, hidden, and output), with a sigmoid activation function on the output. The hand calculation seems fine, but I can't find the flaw.
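
For reference, this is the general chain-rule form I am checking the hand calculation against (sketched here from the standard backprop equations, not copied from the derivation itself). With $Z_1 = W_s x$, $A_1 = \mathrm{ReLU}(Z_1)$, $Z_2 = W_o A_1$, $y = \sigma(Z_2)$:

$$\frac{\partial L}{\partial W_o[i,j]} = \delta_i \, A_1[j], \qquad \frac{\partial L}{\partial W_s[j,k]} = \sum_i \delta_i \, W_o[i,j] \, [Z_1[j] > 0] \, x_k,$$

where $\delta_i = \partial L / \partial Z_2[i]$ (equal to $(y_i - d)\,y_i(1-y_i)$ for the squared-error loss logged in the code, or simply $y_i - d$ for a cross-entropy loss), and $[Z_1[j] > 0]$ is the ReLU derivative.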

The code below, with the sigmoid activation function, works fine.

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

input = [[0,0,1],[0,1,1],[1,0,1],[1,1,1]]
output = [0,1,1,0]

N = np.size(input,0) # number of samples

Ni = np.size(input,1) # dimension of the samples of input

No = 1 # dimension of the sample of output

Nh = 5 # number of hidden units

Ws = 1/4*np.random.rand(Nh,Ni+1)
#print(Ws)

Wo = 1/4*np.random.rand(No,Nh)
#print(Wo)

alpha = 0.1 # Learning rate

t_ = []

loss_ = []

def sigmoid(x):
    return 1/(1+np.exp(-x))

## train the model ====================================================================
for epoch in range(0,5000):
    loss = 0
    for id_ in range(0,N):
        dWs = 0*Ws
        dWo = 0*Wo
        
        x = np.append(input[id_],1)
        
        Z_1 = np.dot(Ws,x)
        
        A_1 = sigmoid(Z_1)
        
        Z_2 = np.dot(Wo,A_1)

        y = sigmoid(Z_2)
        
        d = output[id_]

        for j in range(0,Nh):
            for i in range(0,No):
                dWo[i,j] = dWo[i,j] + sigmoid(Z_1[j])*(y[i]-d)
                
        Wo = Wo - alpha*dWo
        
        for k in range(0,Ni+1):
            for j in range(0,Nh):
                for i in range(0,No):
                    dWs[j,k] = dWs[j,k] + x[k]*Wo[i,j]*sigmoid(Z_1[j])*(1-sigmoid(Z_1[j]))*(y[i]-d) 
        
        Ws = Ws - alpha*dWs
        
        loss = loss + 1/2*np.linalg.norm(y-d)
        
    if np.mod(epoch,50) == 0:
        print(epoch,"-th epoch trained")
            
        t_ = np.append(t_,epoch)
            
        loss_ = np.append(loss_,loss)
            
        fig = plt.figure(num=0,figsize=[10,5])
        plt.plot(t_,loss_,marker="")
        plt.title('Loss decay')
        plt.xlabel('epoch', fontsize=20)
        plt.ylabel('Loss', fontsize=20)
        plt.show()
            
        ## figure out the function shape the model========================================== 
        xn = np.linspace(0,1,20)
        yn = np.linspace(0,1,20)
        xm, ym = np.meshgrid(xn, yn)
        xx = np.reshape(xm,np.size(xm,0)*np.size(xm,1))
        yy = np.reshape(ym,np.size(xm,0)*np.size(xm,1))
        Z = []
        for id__ in range(0,np.size(xm)):
            x = np.append([xx[id__],yy[id__]],[1,1])
            Z_1 = np.dot(Ws,x)
            y_ = sigmoid(np.dot(Wo,sigmoid(Z_1)))
            Z = np.append(Z,y_)
                
        fig = plt.figure(num=1,figsize=[10,5])
        ax = fig.add_subplot(projection='3d')
        surf = ax.plot_surface(xm,ym,np.reshape(Z,(np.size(xm,0),np.size(xm,1))),cmap='coolwarm',linewidth=0,antialiased=False)            
        print("====================================================================")
        plt.show()
        
        
## test the trained model ====================================================================
for id_ in range(0,N):
    x = np.append(input[id_],1)
        
    Z_1 = np.dot(Ws,x)
        
    y = sigmoid(np.dot(Wo,sigmoid(Z_1)))
    
    print(y)

1 Answer


I came across a similar case on Quora and tested it in my own networks, which involve modeling logic functions against some noisy cost functions.

I found that the ReLU outputs tend to blow up: by the third layer of the MLP, the values feeding the output had already accumulated into the thousands or even millions. Because of that, I prefer sigmoid with MLPs. Don't forget that sigmoid bounds its output to 1, but ReLU does not.
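
As a minimal illustration of that blow-up (my own sketch, not one of the networks I tested; the layer sizes and positive [0, 1) weights are arbitrary assumptions that mirror the question's initialization):

import numpy as np

rng = np.random.default_rng(0)

def relu(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

a_relu = rng.random(10)   # random input in [0, 1)
a_sig = a_relu.copy()
for layer in range(5):
    W = rng.random((10, 10))    # positive random weights, as in the question's init
    a_relu = relu(W @ a_relu)   # unbounded: magnitudes multiply layer after layer
    a_sig = sigmoid(W @ a_sig)  # bounded: every activation stays below 1
    print(layer, "max ReLU act:", a_relu.max(), "max sigmoid act:", a_sig.max())

After only a few layers the ReLU activations are already in the hundreds and still growing, while the sigmoid activations never leave (0, 1).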

The intuition behind ReLU is that it filters out unwanted information through the max(0, x) function and forwards the rest to the next layer for processing. For the same reason you see it used in convolutional problems. Note: normalization layers are used in those cases so that the node outputs do not explode.

But in the case of your MLP, you have not implemented any normalization layer after the ReLU, so it struggles to model even a simple function like XOR. In short, I would not recommend ReLU without a normalization layer, although in some cases it can still work fine.
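
As a rough sketch of what I mean (my own illustration, not tested on your exact code): a simple per-layer normalization inserted after the ReLU keeps the hidden activations on a bounded scale. This only shows the forward pass; the normalization would also have to be accounted for in the backward pass.

import numpy as np

def ReLU(x):
    return np.maximum(0, x)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def layer_norm(a, eps=1e-8):
    # normalize the hidden activations to zero mean / unit variance
    return (a - a.mean()) / (a.std() + eps)

def forward(Ws, Wo, x):
    Z_1 = np.dot(Ws, x)
    A_1 = layer_norm(ReLU(Z_1))   # norm layer inserted after the ReLU
    Z_2 = np.dot(Wo, A_1)
    return sigmoid(Z_2)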

answered 2022-01-25T04:55:36.800