The original size of my image is 3900 x 6000 x 3. I extract overlapping patches of shape (232024, 28, 28, 3) from it and then form batches of size 1000; a rough sketch of the patch extraction is shown right below.
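This is only a minimal sketch of the kind of patch extraction I mean, not my actual preprocessing code: the 28 x 28 window and VALID padding come from the shapes above, while the stride of 10 is an assumption (it happens to reproduce the 232024 patch count).

import numpy as np
import tensorflow as tf

image = np.zeros((1, 3900, 6000, 3), dtype=np.float32)        # stand-in for the real image
patches = tf.extract_image_patches(images=tf.constant(image),
                                   ksizes=[1, 28, 28, 1],
                                   strides=[1, 10, 10, 1],     # assumed stride
                                   rates=[1, 1, 1, 1],
                                   padding="VALID")
patches = tf.reshape(patches, [-1, 28, 28, 3])                 # -> (232024, 28, 28, 3)

My CNN model for semantic segmentation is as follows: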
def conv_layer(inputs, filters, kernel_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "conv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], input_shape[3], filters], dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")

        conv2d = tf.nn.conv2d(input = tf.cast(inputs, dtype = tf.float32), filter = filter, strides = [1, strides, strides, 1], padding = padding)
        activation = tf.nn.relu(conv2d + bias)

        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)

        return tf.cast(activation, dtype = tf.float16)
def deconv_layer(inputs, filters, kernel_size, output_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "deconv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        deconv_shape = tf.stack([tf.shape(inputs)[0], output_size[0], output_size[1], filters])

        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], filters, input_shape[3]], dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        print("bias:")
        print(bias)

        conv2d_transpose = tf.nn.conv2d_transpose(value = tf.cast(inputs, dtype = tf.float32),
                                                  filter = filter,
                                                  strides = [1, strides, strides, 1],
                                                  output_shape = deconv_shape,
                                                  padding = padding)
        activation = tf.nn.relu(conv2d_transpose + bias)

        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)

        return tf.cast(activation, dtype = tf.float16)
def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [20, 50, 90]
    conv_sizes = []

    tf.summary.image('input', features, batch_size)

    """Model function for CNN."""
    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs = features,
                      filters = conv_filters[0],
                      kernel_size = [5, 5],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [5, 5],
                      strides = 2,
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 3
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[2],
                      kernel_size = [5, 5],
                      bias_constant = bias_constant,
                      strides = 2,
                      name = "conv3")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 3
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[1],
                          kernel_size = [5, 5],
                          bias_constant = bias_constant,
                          strides = 2,
                          output_size = [conv_sizes[1][1], conv_sizes[1][2]],
                          name = "deconv3")
    print(deconv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = deconv,
                          filters = conv_filters[0],
                          kernel_size = [5, 5],
                          bias_constant = bias_constant,
                          strides = 2,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)

    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [5, 5],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)

    return deconv
epochs = 1000
learning_rate = 1e-50

image, label = tf.train.slice_input_producer([features, labels], shuffle = False)

BATCH_SIZE = 1000
THREAD_NUM = 5
MIN_AFTER_DEQUEUE = 10000
queue_capacity = MIN_AFTER_DEQUEUE + THREAD_NUM * BATCH_SIZE

image_batch, label_batch = tf.train.batch(tensors = [image, label],
                                          batch_size = BATCH_SIZE,
                                          capacity = queue_capacity,
                                          num_threads = THREAD_NUM,
                                          allow_smaller_final_batch = True)

output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)

# cost
with tf.name_scope("cross_entropy"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = output, labels = label_batch)
    cost = tf.reduce_mean(cross_entropy)
    # return cost, optimizer, accr
    tf.summary.scalar("xent", cost)

# optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# Accuracy
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(output, 1))
    accr = tf.reduce_mean(tf.cast(correct_prediction, tf.float16))
    tf.summary.scalar("accuracy", accr)

merged_summary = tf.summary.merge_all()

# Session configs
config = tf.ConfigProto()
config.log_device_placement = True
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.8

# Initialize session
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

coord = tf.train.Coordinator()
enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)

try:
    for epoch in range(epochs):
        if coord.should_stop():
            break

        epoch_loss = 0
        train_loss = []; train_accuracy = []

        s = sess.run(merged_summary)
        writer.add_summary(s, epoch)

        for batch in range(math.ceil(features.shape.as_list()[0]/BATCH_SIZE)):
            _, sess_cost, sess_accuracy = sess.run([optimizer, cost, accr])
            train_loss.append(sess_cost)
            train_accuracy.append(sess_accuracy)

        train_loss = np.mean(train_loss)
        train_accuracy = np.mean(train_accuracy)

        saver.save(sess, "./semantic_seg_model_1", global_step=epoch)

        print("[%02d/%02d] trainLoss: %.4f trainAcc: %.2f"
              % (epoch + 1, epochs, sess_cost, sess_accuracy))
except Exception as e:
    # Report exceptions to the coordinator.
    coord.request_stop(e)
finally:
    # Terminate as usual. It is safe to call `coord.request_stop()` twice.
    coord.request_stop()
    coord.join(enqueue_threads)

sess.close()
I get the following error when the training session starts:
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
INFO:tensorflow:Error reported to Coordinator: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

Caused by op 'deconv2/biases', defined at:
  ... (IPython / tornado / zmq launcher frames omitted) ...
  File "", line 1, in <module>
    output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)
  File "", line 107, in semantic_seg_model
    name = "deconv2")
  File "", line 78, in deconv_layer
    tf.summary.histogram("biases", bias)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\summary\summary.py", line 192, in histogram
    tag=tag, values=values, name=scope)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 187, in _histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  ... (tensorflow op_def_library.py / ops.py op-creation frames omitted) ...

InvalidArgumentError (see above for traceback): Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recv[...]]]

Number of iterations completed in this epoch: 0
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:
  ... (tensorflow session.py / errors_impl.py frames omitted) ...

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[...](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recv[...]]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
 in <module>()
     40     # Terminate as usual. It is safe to call `coord.request_stop()` twice.
     41     coord.request_stop()
---> 42     coord.join(enqueue_threads)
     43 
     44 sess.close()

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\training\coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
    387       self._registered_threads = set()
    388       if self._exc_info_to_raise:
--> 389         six.reraise(*self._exc_info_to_raise)
    390       elif stragglers:
    391         if ignore_live_threads:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

 in <module>()
     13         train_loss = []; train_accuracy = []
     14 
---> 15         s = sess.run(merged_summary)
     16         writer.add_summary(s, epoch)
     17 
  ... (tensorflow session.py run / _run / _do_run / _do_call frames omitted) ...

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recv[...]]]

Caused by op 'deconv2/biases', defined at:
  ... (same stack as above, ending in deconv_layer -> tf.summary.histogram("biases", bias)) ...

InvalidArgumentError (see above for traceback): Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recv[...]]]
Someone on the tensorflow GitHub issues suggested lowering the learning rate when the model diverges, but that did not help. Another person suggested changing the dtype from float16 to float32, since float16 is problematic. When I changed the dtype of the data to float32, I got the following error in the Python log console:
[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.
[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.
The same error occurs when I try to increase the width and height of the overlapping patches. I have also tried reducing BATCH_SIZE, but that did not help.
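For scale, here is simple back-of-the-envelope arithmetic on the full patch array in the two dtypes (just element counts, nothing measured from my run):

# Rough in-memory size of the full (232024, 28, 28, 3) patch array.
num_elements = 232024 * 28 * 28 * 3            # 545,720,448 values
print(num_elements * 2 / 1024 ** 3)            # float16: ~1.02 GiB
print(num_elements * 4 / 1024 ** 3)            # float32: ~2.03 GiB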
I have a 4GB NVIDIA GeForce GTX 960M dedicated graphics card and 16GB of RAM, with an Intel Core i7-6700HQ CPU @ 2.60 GHz. The Python version is 3.6.4 and the TensorFlow version is 1.4 with GPU support.
Update 1: Updated model:
def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [10, 25, 90]
    conv_sizes = []

    tf.summary.image('input', features, batch_size)

    """Model function for CNN."""
    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs = features,
                      filters = conv_filters[0],
                      kernel_size = [2, 2],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [2, 2],
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[0],
                          kernel_size = [2, 2],
                          bias_constant = bias_constant,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)

    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [2, 2],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)

    return tf.cast(deconv, dtype = tf.float16)