尝试让 LinearClassifier 与 Colab TPU 一起运行。 https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/estimator/LinearClassifier
TensorFlow 2.0 Estimator(LinearClassifier) 支持 TPUStrategy https://www.tensorflow.org/beta/guide/distribute_strategy#whats_supported_now_2
在没有 tpu_strategy 的情况下,LinearClassifier 按预期工作。 https://www.tensorflow.org/beta/guide/distribute_strategy#tpustrategy
将 tpu_strategy 添加为 LinearClassifier 的配置时,出现以下错误:
InvalidArgumentError:没有注册 OpKernel 以支持{{node input0}} 使用的 Op 'TPUReplicatedInput' 具有以下属性:[T=DT_DOUBLE,N=8] 注册设备:[CPU,XLA_CPU] 注册内核:[[input0]] https ://www.tensorflow.org/beta/guide/distribute_strategy#using_tfdistributetestrategy_with_estimator
这几天一直在吵架,这是怎么回事?
!pip install tensorflow==2.0.0-beta0
import tensorflow.feature_column as fc
import tensorflow as tf
import os
print(tf.__version__)
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER)
tf.config.experimental_connect_to_host(cluster_resolver.master())
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
config = tf.estimator.RunConfig(train_distribute=tpu_strategy, eval_distribute=tpu_strategy)
batch_size = 1
def make_input_fn(X, y):
def input_fn():
dataset = tf.data.Dataset.from_tensor_slices((dict(X), y)).batch(batch_size)
return dataset
return input_fn
input_fn = make_input_fn(estimator_train_attributes_dictionary,labels_train)
linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)
linear_est.train(input_fn=input_fn)
Colab 中的完全例外:
W0618 18:08:10.280844 140506166175616 estimator.py:1811] Using temporary folder as model directory: /tmp/tmp2xc1fixj
2.0.0-beta0
W0618 18:09:00.986362 140506166175616 tpu.py:218] 3 unsupported operations found:
ScalarSummary (bias)
ScalarSummary (fraction_of_zero_weights)
ScalarSummary (loss)
W0618 18:09:43.578035 140506166175616 tpu_strategy_util.py:57] TPU system %s has already been initialized. Reinitializing the TPU can cause previously created variables on TPU to be lost.
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1355 try:
-> 1356 return fn(*args)
1357 except errors.OpError as e:
20 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1338 # Ensure any changes to the graph are reflected in the runtime.
-> 1339 self._extend_graph()
1340 return self._call_tf_sessionrun(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _extend_graph(self)
1373 with self._graph._session_run_lock(): # pylint: disable=protected-access
-> 1374 tf_session.ExtendSession(self._session)
1375
InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by {{node input0}}with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
<no registered kernels>
[[input0]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-23-66caf93d8677> in <module>()
25
26 linear_est = tf.estimator.LinearClassifier(feature_columns=attibute_columns,config=config)#feature_columns=featureNames,,config=config
---> 27 linear_est.train(input_fn=input_fn)#,max_steps=100
28
29 #train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=1000)
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
365
366 saving_listeners = _check_listeners_type(saving_listeners)
--> 367 loss = self._train_model(input_fn, hooks, saving_listeners)
368 logging.info('Loss for final step: %s.', loss)
369 return self
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1154 def _train_model(self, input_fn, hooks, saving_listeners):
1155 if self._train_distribution:
-> 1156 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1157 else:
1158 return self._train_model_default(input_fn, hooks, saving_listeners)
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_model_distributed(self, input_fn, hooks, saving_listeners)
1217 self._config._train_distribute.configure(self._config.session_config)
1218 return self._actual_train_model_distributed(
-> 1219 self._config._train_distribute, input_fn, hooks, saving_listeners)
1220 # pylint: enable=protected-access
1221
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _actual_train_model_distributed(self, strategy, input_fn, hooks, saving_listeners)
1327 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
1328 hooks, global_step_tensor,
-> 1329 saving_listeners)
1330
1331 def _train_with_estimator_spec_distributed(self, estimator_spec, worker_hooks,
/usr/local/lib/python3.6/dist-packages/tensorflow_estimator/python/estimator/estimator.py in _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks, global_step_tensor, saving_listeners)
1478 save_summaries_steps=save_summary_steps,
1479 config=self._session_config,
-> 1480 log_step_count_steps=log_step_count_steps) as mon_sess:
1481 loss = None
1482 any_step_done = False
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in MonitoredTrainingSession(master, is_chief, checkpoint_dir, scaffold, hooks, chief_only_hooks, save_checkpoint_secs, save_summaries_steps, save_summaries_secs, config, stop_grace_period_secs, log_step_count_steps, max_wait_secs, save_checkpoint_steps, summary_dir)
582 session_creator=session_creator,
583 hooks=all_hooks,
--> 584 stop_grace_period_secs=stop_grace_period_secs)
585
586
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, stop_grace_period_secs)
1005 hooks,
1006 should_recover=True,
-> 1007 stop_grace_period_secs=stop_grace_period_secs)
1008
1009
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, session_creator, hooks, should_recover, stop_grace_period_secs)
723 stop_grace_period_secs=stop_grace_period_secs)
724 if should_recover:
--> 725 self._sess = _RecoverableSession(self._coordinated_creator)
726 else:
727 self._sess = self._coordinated_creator.create_session()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in __init__(self, sess_creator)
1198 """
1199 self._sess_creator = sess_creator
-> 1200 _WrappedSession.__init__(self, self._create_session())
1201
1202 def _create_session(self):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in _create_session(self)
1203 while True:
1204 try:
-> 1205 return self._sess_creator.create_session()
1206 except _PREEMPTION_ERRORS as e:
1207 logging.info(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
869 """Creates a coordinated session."""
870 # Keep the tf_sess for unit testing.
--> 871 self.tf_sess = self._session_creator.create_session()
872 # We don't want coordinator to suppress any exception.
873 self.coord = coordinator.Coordinator(clean_stop_exception_types=[])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/monitored_session.py in create_session(self)
645 init_op=self._scaffold.init_op,
646 init_feed_dict=self._scaffold.init_feed_dict,
--> 647 init_fn=self._scaffold.init_fn)
648
649
/usr/local/lib/python3.6/dist-packages/tensorflow/python/training/session_manager.py in prepare_session(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)
294 "init_fn or local_init_op was given")
295 if init_op is not None:
--> 296 sess.run(init_op, feed_dict=init_feed_dict)
297 if init_fn:
298 init_fn(sess)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
948 try:
949 result = self._run(None, fetches, feed_dict, options_ptr,
--> 950 run_metadata_ptr)
951 if run_metadata:
952 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1171 if final_fetches or final_targets or (handle and feed_dict_tensor):
1172 results = self._do_run(handle, final_targets, final_fetches,
-> 1173 feed_dict_tensor, options, run_metadata)
1174 else:
1175 results = []
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1348 if handle is None:
1349 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1350 run_metadata)
1351 else:
1352 return self._do_call(_prun_fn, handle, feeds, fetches)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1368 pass
1369 message = error_interpolation.interpolate(message, self._graph)
-> 1370 raise type(e)(node_def, op, message)
1371
1372 def _extend_graph(self):
InvalidArgumentError: No OpKernel was registered to support Op 'TPUReplicatedInput' used by node input0 (defined at <ipython-input-23-66caf93d8677>:27) with these attrs: [T=DT_DOUBLE, N=8]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
<no registered kernels>
[[input0]]