事实证明,当对每个参数分别调用 T.grad 时,Theano 并不会复用先前已经计算出的梯度来计算计算图中较低层的梯度。下面是一个包含 3 个隐藏层和 1 个输出层的神经网络的简单演示示例。不过,这根本算不上什么大问题,因为构建梯度表达式是一次性的操作,除非您必须在每次迭代中都重新构建梯度。Theano 以计算图的形式返回导数的符号表达式,此后您可以直接把它当作函数来使用:我们只需用 Theano 推导出的函数计算数值,并用这些数值更新权重。
import time

import numpy as np
import theano.tensor as T
from theano import shared
class neuralNet(object):
    """Symbolic feed-forward classifier with three sigmoid hidden layers.

    The parameters are stored as Theano shared variables (``w``..``w4`` and
    ``b``..``b4``).  The forward pass is built symbolically at construction
    time and exposed as ``forwardProp`` (softmax output) and ``predict``
    (arg-max over axis 1 of that output).
    """

    def __init__(self, examples, num_features=16384, num_classes=40):
        """Create the parameters and build the symbolic forward pass.

        Args:
            examples: Symbolic input that is multiplied with the first weight
                matrix.  Presumably a (batch, num_features) matrix -- confirm
                against the caller.
            num_features: Number of rows of the first weight matrix.
            num_classes: Number of columns of the last weight matrix, i.e.
                the width of the softmax output.

        The defaults (16384 and 40) are the sizes that were previously
        hard-coded, so ``neuralNet(x)`` builds the same network as before
        while callers that pass both sizes now actually get them applied.
        """
        floatX = T.config.floatX

        # Weights come from np.random.random and are cast to floatX; the
        # hidden-layer widths (5000, 3000, 512) stay fixed.
        self.w = shared(np.random.random((num_features, 5000)).astype(floatX), borrow=True, name='w')
        self.w2 = shared(np.random.random((5000, 3000)).astype(floatX), borrow=True, name='w2')
        self.w3 = shared(np.random.random((3000, 512)).astype(floatX), borrow=True, name='w3')
        self.w4 = shared(np.random.random((512, num_classes)).astype(floatX), borrow=True, name='w4')

        # Biases start as all ones, one entry per unit of the layer.
        self.b = shared(np.ones(5000, dtype=floatX), borrow=True, name='b')
        self.b2 = shared(np.ones(3000, dtype=floatX), borrow=True, name='b2')
        self.b3 = shared(np.ones(512, dtype=floatX), borrow=True, name='b3')
        self.b4 = shared(np.ones(num_classes, dtype=floatX), borrow=True, name='b4')

        self.x = examples

        # Three sigmoid layers followed by an affine layer fed into softmax.
        L1 = T.nnet.sigmoid(T.dot(self.x, self.w) + self.b)
        L2 = T.nnet.sigmoid(T.dot(L1, self.w2) + self.b2)
        L3 = T.nnet.sigmoid(T.dot(L2, self.w3) + self.b3)
        L4 = T.dot(L3, self.w4) + self.b4

        self.forwardProp = T.nnet.softmax(L4)
        self.predict = T.argmax(self.forwardProp, axis=1)

    def loss(self, y):
        """Return the negated mean log-probability picked out by ``y``.

        For row ``i`` the entry in column ``y[i]`` of ``log(forwardProp)`` is
        selected; the result is minus the mean of those entries.  ``y`` is
        therefore expected to hold one integer column index per row.
        """
        return -T.mean(T.log(self.forwardProp)[T.arange(y.shape[0]), y])
# Symbolic placeholders for the inputs (matrix) and the labels (ivector).
x = T.matrix('x')
y = T.ivector('y')
# Pass the sizes explicitly so the call matches the constructor's
# (examples, num_features, num_classes) signature; 16384 and 40 are the
# input width and the number of output units of the network.
nnet = neuralNet(x, num_features=16384, num_classes=40)
# Symbolic scalar loss that the timing loops differentiate.
loss = nnet.loss(y)
# Time 100 repetitions of a single T.grad call that derives the gradients
# with respect to all eight parameters at once.
diffrentiationTime = []
for i in range(100):
    t1 = time.time()
    # Every parameter belongs to `nnet`; the former `logReg.w3` referred to
    # a name that is not defined anywhere and raised NameError.
    gw, gw2, gw3, gw4, gb, gb2, gb3, gb4 = T.grad(
        loss,
        [nnet.w, nnet.w2, nnet.w3, nnet.w4, nnet.b, nnet.b2, nnet.b3, nnet.b4],
    )
    diffrentiationTime.append(time.time() - t1)
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print('Efficient Method: Took %f seconds with std %f' % (np.mean(diffrentiationTime), np.std(diffrentiationTime)))
# Time 100 repetitions of eight separate T.grad calls, one per parameter.
diffrentiationTime = []
for i in range(100):
    t1 = time.time()
    # Each call receives a one-element list, so each result is a
    # one-element list holding that parameter's gradient expression.
    gw = T.grad(loss, [nnet.w])
    gw2 = T.grad(loss, [nnet.w2])
    gw3 = T.grad(loss, [nnet.w3])
    gw4 = T.grad(loss, [nnet.w4])
    gb = T.grad(loss, [nnet.b])
    gb2 = T.grad(loss, [nnet.b2])
    gb3 = T.grad(loss, [nnet.b3])
    gb4 = T.grad(loss, [nnet.b4])
    diffrentiationTime.append(time.time() - t1)
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print('Inefficient Method: Took %f seconds with std %f' % (np.mean(diffrentiationTime), np.std(diffrentiationTime)))
这将打印出以下内容:
Efficient Method: Took 0.061056 seconds with std 0.013217
Inefficient Method: Took 0.305081 seconds with std 0.026024
这表明,在高效方法中(即通过一次 T.grad 调用同时求出所有参数的梯度),Theano 采用了动态规划的方式来计算梯度。