0

从 TensorFlow 网站教程开始,我正在尝试创建一个非常简单的神经网络来预测基于Adience Benchmark的性别。按照我两次尝试的代码:

def model1(x, y_, data_size):
    W = tf.Variable(tf.zeros([data_size, 1]))
    b = tf.Variable(tf.zeros([1]))
    y = tf.matmul(x, W) + b

    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    return tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy), y, cross_entropy

def model2(x, y_):
    def fully_connected(input, size):
        weights = tf.get_variable( 'weights', 
            shape = [input.get_shape()[1], size],
            initializer = tf.contrib.layers.xavier_initializer()
          )
        biases = tf.get_variable( 'biases',
            shape = [size],
            initializer=tf.constant_initializer(0.0)
          )
        return tf.matmul(input, weights) + biases
    def model_pass(input):
        with tf.variable_scope('hidden'):
            hidden = fully_connected(input, size = 100)
        relu_hidden = tf.nn.relu(hidden)
        with tf.variable_scope('out'):
            prediction = fully_connected(relu_hidden, size = 1)
        return prediction   

    predictions = model_pass(x)  
    loss = tf.reduce_mean(tf.square(predictions - y_))
    optimizer = tf.train.MomentumOptimizer(
        learning_rate = 0.01, 
        momentum = 0.9, 
        use_nesterov = True
    ).minimize(loss)

    return optimizer, predictions, loss

这里是训练模型的代码(批量大小为 128,图像已加载并按照本文建议裁剪为 227x227 ):

graph = tf.Graph()
with graph.as_default():
    path_batch, label_batch = input_pipeline(people_path, batch_size, None, True)
    label_batch = extract_feature(label_batch,1)
    label_batch = tf.reshape(label_batch,[batch_size,1])

    data_batch = path_to_image_crop(path_batch, os.path.dirname(people_path), image_prefix, image_dimension)
    data_batch = tf.reshape(data_batch,[batch_size, data_size])

    x = tf.placeholder(tf.float32, [None, data_size])
    y_ = tf.placeholder(tf.float32, [None, 1])

    train_step, y, loss = model1(x, y_, data_size)
    #train_step, y, loss = model2(x, y_)

with tf.Session(graph = graph) as session:
    session.run(tf.global_variables_initializer())
    session.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator ()
    threads = tf.train.start_queue_runners (coord = coord)
    for i in range(num_epochs):
        batch_xs, batch_ys = session.run([data_batch, label_batch])
        batch_xs = images_as_float(batch_xs, batch_size, data_size)
        p,l,_ = session.run([y, loss, train_step], feed_dict={x: batch_xs, y_: batch_ys})
        print('%d: %s -> %s %s' % (i, l, p[i % batch_size], batch_ys[i % batch_size])) # 1 -> male, 2 -> female
        if (i == 0): print(batch_xs)
    coord.request_stop ()
    coord.join (threads)

第一个模型为每次迭代产生等于零的损失,第二个模型:

   0: 0.0 -> [ 0.] [2]
[[ 0.09019608  0.0745098   0.07058824 ...,  0.14509804  0.08627451
   0.05882353]
 [ 0.03529412  0.03137255  0.02352941 ...,  0.05882353  0.04313725
   0.03921569]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.27843137  0.2627451   0.32156863 ...,  0.17647059  0.21176471
   0.24705882]
 [ 0.19607843  0.03137255  0.07058824 ...,  0.19215686  0.0745098
   0.08235294]
 [ 0.29803922  0.20392157  0.15686275 ...,  0.70588235  0.45490196
   0.41568627]]
1: 0.0 -> [ 8368.69335938] [1]
2: 0.0 -> [ 9435.07910156] [1]
3: 0.0 -> [ 6342.55175781] [2]
4: 0.0 -> [ 24146.79492188] [2]
5: 0.0 -> [ 38010.859375] [2]
6: 0.0 -> [ 28421.10546875] [1]
7: 0.0 -> [ 38900.63671875] [2]
8: 0.0 -> [ 12061.45605469] [2]
9: 0.0 -> [ 62396.5390625] [2]
10: 0.0 -> [ 44290.84765625] [2]
11: 0.0 -> [ 26015.90234375] [2]
12: 0.0 -> [ 57388.23046875] [1]
13: 0.0 -> [ 119108.203125] [1]
14: 0.0 -> [ 131051.671875] [1]
15: 0.0 -> [ 131854.78125] [2]
16: 0.0 -> [ 159839.875] [2]
17: 0.0 -> [ 128897.90625] [1]
18: 0.0 -> [ 61369.3359375] [1]
19: 0.0 -> [ 190607.71875] [1]
....

第二个模型在一些迭代后发散:

0: 2.14689 -> [ 0.14331065] [1]
[[ 0.03921569  0.07843137  0.04705882 ...,  0.34117647  0.29803922
   0.36078431]
 [ 0.16078431  0.11764706  0.13333333 ...,  0.17254902  0.11372549
   0.10196078]
 [ 0.18823529  0.15294118  0.1254902  ...,  0.90196078  0.84705882
   0.84313725]
 ..., 
 [ 0.50196078  0.36078431  0.29803922 ...,  0.6         0.40784314
   0.34901961]
 [ 0.58039216  0.40392157  0.38039216 ...,  0.6745098   0.61176471
   0.55294118]
 [ 0.17254902  0.29803922  0.14509804 ...,  0.16470588  0.15686275
   0.23921569]]
1: 1.38878e+06 -> [ 1075.10534668] [1]
2: 17212.8 -> [-68.56524658] [1]
3: 2431.18 -> [-46.70772934] [2]
4: 4.38701e+11 -> [ 670822.0625] [2]
5: 5.75069e+08 -> [-23979.0625] [1]
6: 1.10681e+09 -> [-33267.28515625] [1]
7: 1.66428e+09 -> [-40794.125] [1]
8: 2.17327e+09 -> [-46616.8828125] [1]
9: 2.58284e+09 -> [-50820.1875] [2]
10: 2.86359e+09 -> [-53511.0703125] [2]
11: 3.00476e+09 -> [-54814.17578125] [1]
12: 3.01057e+09 -> [-54867.203125] [2]

我知道卷积模型比我实现的模型效果更好,但我的直觉是模型 1 和模型 2 的收敛精度不高,而不是这个奇怪的结果。是我的假设错误还是我实现模型或训练步骤的方式存在错误?

完整代码在github上

感谢您的任何建议!

此致

詹卢卡

4

0 回答 0