I made 3 neural networks for Joel's FizzBuzz implementation, one each in NumPy, TensorFlow, and PyTorch. With the same hyperparameters and 1k epochs, my NumPy script converges to a loss of 0.002, but my PyTorch and TensorFlow versions are still bouncing around 0.6. Could someone help me figure out what is going on? I don't believe Google and [Facebook + Nvidia] would have built something that does worse than NumPy just for the sake of GPU acceleration. My code is below.
NumPy
import numpy as np
input_size = 10
epochs = 1000
batches = 64
lr = 0.01
def sig(val):
    return 1 / (1 + np.exp(-val))

def sig_d(val):
    sig_val = sig(val)
    return sig_val * (1 - sig_val)

def binary_enc(num):
    ret = [int(i) for i in '{0:b}'.format(num)]
    return [0] * (input_size - len(ret)) + ret

def binary_dec(array):
    ret = 0
    for i in array:
        ret = ret * 2 + int(i)
    return ret

def training_test_gen(x, y):
    assert len(x) == len(y)
    indices = np.random.permutation(range(len(x)))
    split_size = int(0.9 * len(indices))
    trX = x[indices[:split_size]]
    trY = y[indices[:split_size]]
    teX = x[indices[split_size:]]
    teY = y[indices[split_size:]]
    return trX, trY, teX, teY

def x_y_gen():
    x = []
    y = []
    for i in range(1000):
        x.append(binary_enc(i))
        if i % 15 == 0:
            y.append([1, 0, 0, 0])
        elif i % 5 == 0:
            y.append([0, 1, 0, 0])
        elif i % 3 == 0:
            y.append([0, 0, 1, 0])
        else:
            y.append([0, 0, 0, 1])
    return training_test_gen(np.array(x), np.array(y))

def check_fizbuz(i):
    if i % 15 == 0:
        return 'fizbuz'
    elif i % 5 == 0:
        return 'buz'
    elif i % 3 == 0:
        return 'fiz'
    else:
        return 'number'
trX, trY, teX, teY = x_y_gen()
w1 = np.random.randn(10, 100)
w2 = np.random.randn(100, 4)
b1 = np.zeros((1, 100))
b2 = np.zeros((1, 4))
no_of_batches = int(len(trX) / batches)
for epoch in range(epochs):
    for batch in range(no_of_batches):
        # forward
        start = batch * batches
        end = start + batches
        x = trX[start:end]
        y = trY[start:end]
        a2 = x.dot(w1) + b1
        h2 = sig(a2)
        a3 = h2.dot(w2) + b2
        hyp = sig(a3)
        error = hyp - y
        loss = (error ** 2).mean()
        # backward
        outerror = error
        outgrad = outerror * sig_d(a3)
        outdelta = h2.T.dot(outgrad)
        outbiasdelta = np.ones([1, batches]).dot(outgrad)
        hiddenerror = outerror.dot(w2.T)
        hiddengrad = hiddenerror * sig_d(a2)
        hiddendelta = x.T.dot(hiddengrad)
        hiddenbiasdelta = np.ones([1, batches]).dot(hiddengrad)
        w1 -= hiddendelta * lr
        b1 -= hiddenbiasdelta * lr
        w2 -= outdelta * lr
        b2 -= outbiasdelta * lr
    print(epoch, loss)
# test
a2 = teX.dot(w1) + b1
h2 = sig(a2)
a3 = h2.dot(w2) + b2
hyp = sig(a3)
outli = ['fizbuz', 'buz', 'fiz', 'number']
for i in range(len(teX)):
    num = binary_dec(teX[i])
    print(
        'Number: {} -- Actual: {} -- Prediction: {}'.format(
            num, check_fizbuz(num), outli[hyp[i].argmax()]))
print('Test loss: ', np.mean(teY - hyp))
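For reference, a quick sanity check of the encoding helpers (the same binary_enc / binary_dec / check_fizbuz functions are reused verbatim in the PyTorch and TensorFlow scripts below); this snippet is only illustrative and not part of the training script:

# illustrative check of the helpers defined above
print(binary_enc(15))                  # [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
print(binary_dec(binary_enc(15)))      # 15
print(check_fizbuz(15), check_fizbuz(5), check_fizbuz(3), check_fizbuz(7))
# fizbuz buz fiz number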
PyTorch
import numpy as np
import torch as th
from torch.autograd import Variable
input_size = 10
epochs = 1000
batches = 64
lr = 0.01
def binary_enc(num):
    ret = [int(i) for i in '{0:b}'.format(num)]
    return [0] * (input_size - len(ret)) + ret

def binary_dec(array):
    ret = 0
    for i in array:
        ret = ret * 2 + int(i)
    return ret

def training_test_gen(x, y):
    assert len(x) == len(y)
    indices = np.random.permutation(range(len(x)))
    split_size = int(0.9 * len(indices))
    trX = x[indices[:split_size]]
    trY = y[indices[:split_size]]
    teX = x[indices[split_size:]]
    teY = y[indices[split_size:]]
    return trX, trY, teX, teY

def x_y_gen():
    x = []
    y = []
    for i in range(1000):
        x.append(binary_enc(i))
        if i % 15 == 0:
            y.append([1, 0, 0, 0])
        elif i % 5 == 0:
            y.append([0, 1, 0, 0])
        elif i % 3 == 0:
            y.append([0, 0, 1, 0])
        else:
            y.append([0, 0, 0, 1])
    return training_test_gen(np.array(x), np.array(y))

def check_fizbuz(i):
    if i % 15 == 0:
        return 'fizbuz'
    elif i % 5 == 0:
        return 'buz'
    elif i % 3 == 0:
        return 'fiz'
    else:
        return 'number'
trX, trY, teX, teY = x_y_gen()
if th.cuda.is_available():
    dtype = th.cuda.FloatTensor
else:
    dtype = th.FloatTensor
x = Variable(th.from_numpy(trX).type(dtype), requires_grad=False)
y = Variable(th.from_numpy(trY).type(dtype), requires_grad=False)
w1 = Variable(th.randn(10, 100).type(dtype), requires_grad=True)
w2 = Variable(th.randn(100, 4).type(dtype), requires_grad=True)
b1 = Variable(th.zeros(1, 100).type(dtype), requires_grad=True)
b2 = Variable(th.zeros(1, 4).type(dtype), requires_grad=True)
no_of_batches = int(len(trX) / batches)
for epoch in range(epochs):
    for batch in range(no_of_batches):
        start = batch * batches
        end = start + batches
        x_ = x[start:end]
        y_ = y[start:end]
        a2 = x_.mm(w1)
        a2 = a2.add(b1.expand_as(a2))
        h2 = a2.sigmoid()
        a3 = h2.mm(w2)
        a3 = a3.add(b2.expand_as(a3))
        hyp = a3.sigmoid()
        error = hyp - y_
        loss = error.pow(2).sum()
        loss.backward()
        w1.data -= lr * w1.grad.data
        w2.data -= lr * w2.grad.data
        b1.data -= lr * b1.grad.data
        b2.data -= lr * b2.grad.data
        w1.grad.data.zero_()
        w2.grad.data.zero_()
    print(epoch, error.mean().data[0])
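Note: the per-epoch value printed here is the plain mean of error (hyp - y_), whereas the NumPy script prints the mean squared error, so the two printed numbers measure different quantities; a minimal sketch of logging the same MSE metric inside the loop above (assuming the error Variable is in scope) would be:

# illustrative sketch: report mean squared error, the metric the NumPy script prints
mse = error.pow(2).mean()
print(epoch, mse.data[0])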
TensorFlow
import tensorflow as tf
import numpy as np
input_size = 10
epochs = 1000
batches = 64
learning_rate = 0.01
def binary_enc(num):
    ret = [int(i) for i in '{0:b}'.format(num)]
    return [0] * (input_size - len(ret)) + ret

def binary_dec(array):
    ret = 0
    for i in array:
        ret = ret * 2 + int(i)
    return ret

def training_test_gen(x, y):
    assert len(x) == len(y)
    indices = np.random.permutation(range(len(x)))
    split_size = int(0.9 * len(indices))
    trX = x[indices[:split_size]]
    trY = y[indices[:split_size]]
    teX = x[indices[split_size:]]
    teY = y[indices[split_size:]]
    return trX, trY, teX, teY

def x_y_gen():
    x = []
    y = []
    for i in range(1000):
        x.append(binary_enc(i))
        if i % 15 == 0:
            y.append([1, 0, 0, 0])
        elif i % 5 == 0:
            y.append([0, 1, 0, 0])
        elif i % 3 == 0:
            y.append([0, 0, 1, 0])
        else:
            y.append([0, 0, 0, 1])
    return training_test_gen(np.array(x), np.array(y))

def check_fizbuz(i):
    if i % 15 == 0:
        return 'fizbuz'
    elif i % 5 == 0:
        return 'buz'
    elif i % 3 == 0:
        return 'fiz'
    else:
        return 'number'
trX, trY, teX, teY = x_y_gen()
x = tf.placeholder(tf.float32, [None, 10], name='x')
y = tf.placeholder(tf.float32, [None, 4], name='y')
lr = tf.placeholder(tf.float32, [], name='lr')
w1 = tf.Variable(tf.truncated_normal([10, 100]))
w2 = tf.Variable(tf.truncated_normal([100, 4]))
b1 = tf.Variable(tf.zeros(100))
b2 = tf.Variable(tf.zeros(4))
a2 = tf.sigmoid(tf.add(tf.matmul(x, w1), b1))
hyp = tf.sigmoid(tf.add(tf.matmul(a2, w2), b2))
cost = tf.reduce_mean(tf.square(hyp - y))
optmizer = tf.train.GradientDescentOptimizer(lr).minimize(cost)
prediction = tf.argmax(hyp, 1)
no_of_batches = int(len(trX) / batches)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        p = np.random.permutation(range(len(trX)))
        trX = trX[p]
        trY = trY[p]
        for batch in range(no_of_batches):
            start = batch * batches
            end = start + batches
            input_batch = trX[start: end]
            target_batch = trY[start: end]
            sess.run(
                optmizer, feed_dict={x: input_batch, y: target_batch, lr: learning_rate})
        if epoch % 100 == 0:
            a = np.argmax(teY, axis=1)
            b = sess.run(prediction, feed_dict={x: teX})
            acc = np.mean(a == b)
            out_cost = sess.run(
                cost, feed_dict={x: input_batch, y: target_batch, lr: learning_rate})
            print('cost - {} --- accuracy - {}'.format(out_cost.mean(), acc))