I have been trying to run TensorFlow Data Validation following the Google documentation,
using the same steps as https://www.tensorflow.org/tfx/data_validation/install:
>pip install tensorflow-data-validation
>git clone https://github.com/tensorflow/data-validation
>cd data-validation
>pip install strip-hints
>python tensorflow_data_validation/tools/strip_type_hints.py tensorflow_data_validation/
>sudo docker-compose build manylinux2010
>sudo docker-compose run -e PYTHON_VERSION=${PYTHON_VERSION} manylinux2010
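Before submitting, I also run a quick sanity check (my own addition, not part of the install guide) to confirm that the wheel tag the docker build produces (cp27-cp27mu or cp37-cp37m) is compatible with the interpreter that submits the job. It assumes the packaging library is available:

import sys
from packaging import tags  # pip install packaging

print("Submitting interpreter:", sys.version.split()[0])
# The wheel's <python>-<abi>-<platform> triple must appear among these tags
# for pip to accept the wheel on this interpreter.
for tag in list(tags.sys_tags())[:5]:
    print("{}-{}-{}".format(tag.interpreter, tag.abi, tag.platform))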
I then updated the snippet below with the correct paths:
import tensorflow_data_validation as tfdv
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
PROJECT_ID = ''
JOB_NAME = ''
GCS_STAGING_LOCATION = ''
GCS_TMP_LOCATION = ''
GCS_DATA_LOCATION = ''
# GCS_STATS_OUTPUT_PATH is the file path to which to output the data statistics
# result.
GCS_STATS_OUTPUT_PATH = ''
PATH_TO_WHL_FILE = 'tensorflow_data_validation-0.13.1-cp27-cp27mu-manylinux1_x86_64.whl'
# Create and set your PipelineOptions.
options = PipelineOptions()
# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT_ID
google_cloud_options.job_name = JOB_NAME
google_cloud_options.staging_location = GCS_STAGING_LOCATION
google_cloud_options.temp_location = GCS_TMP_LOCATION
options.view_as(StandardOptions).runner = 'DataflowRunner'
setup_options = options.view_as(SetupOptions)
# PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
setup_options.extra_packages = [PATH_TO_WHL_FILE]
tfdv.generate_statistics_from_csv(GCS_DATA_LOCATION,
                                  output_path=GCS_STATS_OUTPUT_PATH,
                                  pipeline_options=options)
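For reference, this is how I intend to read the result back once the Dataflow job succeeds (just a sketch, assuming the same tfdv version is installed locally):

# Load the statistics proto written to GCS_STATS_OUTPUT_PATH and infer a
# schema from it; in a notebook, display_schema renders it as a table.
stats = tfdv.load_statistics(GCS_STATS_OUTPUT_PATH)
schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema)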
I also followed the same procedure with Python 3, which gave me the .whl below:
tensorflow_data_validation-0.23.0.dev0-cp37-cp37m-manylinux2010_x86_64.whl
I get two kinds of errors.
When using cp27:
Error message from worker: Traceback (most recent call last):
  File "/usr/local/lib/python2.7/site-packages/dataflow_worker/batchworker.py", line 647, in do_work
    work_executor.execute()
  File "/usr/local/lib/python2.7/site-packages/dataflow_worker/executor.py", line 153, in execute
    test_shuffle_sink=self._test_shuffle_sink)
  File "/usr/local/lib/python2.7/site-packages/dataflow_worker/executor.py", line 118, in create_operation
    is_streaming=False)
  File "apache_beam/runners/worker/operations.py", line 1050, in apache_beam.runners.worker.operations.create_operation
    op = create_pgbk_op(name_context, spec, counter_factory, state_sampler)
  File "apache_beam/runners/worker/operations.py", line 856, in apache_beam.runners.worker.operations.create_pgbk_op
    return PGBKCVOperation(step_name, spec, counter_factory, state_sampler)
  File "apache_beam/runners/worker/operations.py", line 914, in apache_beam.runners.worker.operations.PGBKCVOperation.__init__
    fn, args, kwargs = pickler.loads(self.spec.combine_fn)[:3]
  File "/usr/local/lib/python2.7/site-packages/apache_beam/internal/pickler.py", line 287, in loads
    return dill.loads(s)
  File "/usr/local/lib/python2.7/site-packages/dill/_dill.py", line 275, in loads
    return load(file, ignore, **kwds)
  File "/usr/local/lib/python2.7/site-packages/dill/_dill.py", line 270, in load
    return Unpickler(file, ignore=ignore, **kwds).load()
  File "/usr/local/lib/python2.7/site-packages/dill/_dill.py", line 472, in load
    obj = StockUnpickler.load(self)
  File "/usr/local/lib/python2.7/pickle.py", line 864, in load
    dispatch[key](self)
  File "/usr/local/lib/python2.7/pickle.py", line 1139, in load_reduce
    value = func(*args)
  File "/usr/local/lib/python2.7/site-packages/dill/_dill.py", line 827, in _import_module
    return getattr(__import__(module, None, None, [obj]), obj)
  File "/usr/local/lib/python2.7/site-packages/tensorflow_data_validation/__init__.py", line 18, in <module>
    from tensorflow_data_validation.api.stats_api import GenerateStatistics
  File "/usr/local/lib/python2.7/site-packages/tensorflow_data_validation/api/stats_api.py", line 50, in <module>
    from tensorflow_data_validation import types
ImportError: cannot import name types
When using cp37:
Error message from worker: Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/apache_beam/internal/pickler.py", line 283, in loads
    return dill.loads(s)
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 275, in loads
    return load(file, ignore, **kwds)
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 270, in load
    return Unpickler(file, ignore=ignore, **kwds).load()
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 472, in load
    obj = StockUnpickler.load(self)
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 462, in find_class
    return StockUnpickler.find_class(self, module, name)
  File "/usr/local/lib/python3.7/site-packages/tensorflow_data_validation/statistics/stats_impl.py", line 31, in <module>
    from tensorflow_data_validation import constants
  File "/usr/local/lib/python3.7/site-packages/tensorflow_data_validation/__init__.py", line 39, in <module>
    from tensorflow_data_validation.statistics.generators.lift_stats_generator import LiftStatsGenerator
  File "/usr/local/lib/python3.7/site-packages/tensorflow_data_validation/statistics/generators/lift_stats_generator.py", line 68, in <module>
    ('y', _YType)])
  File "/usr/local/lib/python3.7/typing.py", line 1448, in __new__
    return _make_nmtuple(typename, fields)
  File "/usr/local/lib/python3.7/typing.py", line 1341, in _make_nmtuple
    types = [(n, _type_check(t, msg)) for n, t in types]
  File "/usr/local/lib/python3.7/typing.py", line 1341, in <listcomp>
    types = [(n, _type_check(t, msg)) for n, t in types]
  File "/usr/local/lib/python3.7/typing.py", line 142, in _type_check
    raise TypeError(f"{msg} Got {arg!r:.100}.")
TypeError: NamedTuple('Name', [(f0, t0), (f1, t1), ...]); each t must be a type Got Any.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 647, in do_work
    work_executor.execute()
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 153, in execute
    test_shuffle_sink=self._test_shuffle_sink)
  File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 118, in create_operation
    is_streaming=False)
  File "apache_beam/runners/worker/operations.py", line 1050, in apache_beam.runners.worker.operations.create_operation
  File "apache_beam/runners/worker/operations.py", line 856, in apache_beam.runners.worker.operations.create_pgbk_op
  File "apache_beam/runners/worker/operations.py", line 914, in apache_beam.runners.worker.operations.PGBKCVOperation.__init__
  File "/usr/local/lib/python3.7/site-packages/apache_beam/internal/pickler.py", line 287, in loads
    return dill.loads(s)
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 275, in loads
    return load(file, ignore, **kwds)
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 270, in load
    return Unpickler(file, ignore=ignore, **kwds).load()
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 472, in load
    obj = StockUnpickler.load(self)
  File "/usr/local/lib/python3.7/site-packages/dill/_dill.py", line 462, in find_class
    return StockUnpickler.find_class(self, module, name)
  File "/usr/local/lib/python3.7/site-packages/tensorflow_data_validation/statistics/stats_impl.py", line 31, in <module>
    from tensorflow_data_validation import constants
  File "/usr/local/lib/python3.7/site-packages/tensorflow_data_validation/__init__.py", line 39, in <module>
    from tensorflow_data_validation.statistics.generators.lift_stats_generator import LiftStatsGenerator
  File "/usr/local/lib/python3.7/site-packages/tensorflow_data_validation/statistics/generators/lift_stats_generator.py", line 68, in <module>
    ('y', _YType)])
  File "/usr/local/lib/python3.7/typing.py", line 1448, in __new__
    return _make_nmtuple(typename, fields)
  File "/usr/local/lib/python3.7/typing.py", line 1341, in _make_nmtuple
    types = [(n, _type_check(t, msg)) for n, t in types]
  File "/usr/local/lib/python3.7/typing.py", line 1341, in <listcomp>
    types = [(n, _type_check(t, msg)) for n, t in types]
  File "/usr/local/lib/python3.7/typing.py", line 142, in _type_check
    raise TypeError(f"{msg} Got {arg!r:.100}.")
TypeError: NamedTuple('Name', [(f0, t0), (f1, t1), ...]); each t must be a type Got Any.
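In case it helps, these are the versions in the environment that submits the job (a quick check I added myself, to rule out a mismatch between the local environment, the wheel tag, and what the Dataflow workers run):

import sys
import apache_beam as beam
import tensorflow_data_validation as tfdv

print("Python:", sys.version.split()[0])                 # should match the wheel's cp27 / cp37 tag
print("apache-beam:", beam.__version__)                  # workers must support this SDK version
print("tensorflow-data-validation:", tfdv.__version__)   # should match the wheel passed via extra_packages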