  • OS platform and distribution: Ubuntu 20.04
  • TensorFlow version: 2.1.0
  • Python version: 3.7.6

I want to write a simple layer to handle the output of tf.data.experimental.make_csv_dataset: impute missing values in the numeric dtypes with the batch mean (keeping a moving average of those means to use at test time), and create embedding columns for the categoricals, with an embedding dimension that depends on a predefined list of unique values.

Here is the code I have written:

import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow import feature_column
class NUM_TO_DENSE(layers.Layer):
    def __init__(self,num_cols):
        super().__init__()
        self.keys = num_cols
        self.keys_all = self.keys+[str(i)+'__nullcol' for i in self.keys]
    def build(self,input_shape):
        def create_moving_mean_vars():
            return tf.Variable(initial_value=0.,shape=(),dtype=tf.float32,trainable=False)
        self.moving_means_total = {t:create_moving_mean_vars() for t in self.keys}
        self.layer_global_counter = tf.Variable(initial_value=0.,shape=(),dtype=tf.float32,trainable=False)

    def call(self,inputs, training = True):
        null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}
        current_means = {}
        def compute_update_current_means(t):
            current_mean = tf.math.divide_no_nan(tf.reduce_sum(tf.where(null_cols[t],inputs[t],0.),axis=0),\
                                  tf.reduce_sum(tf.cast(tf.math.is_finite(inputs[t]),tf.float32),axis=0))
            self.moving_means_total[t].assign_add(current_mean)
            return current_mean

        if training:
            current_means = {t:compute_update_current_means(t) for t in self.keys}
            outputs = {t:tf.where(null_cols[t],inputs[t],current_means[t]) for t in self.keys}
            outputs.update({str(k)+'__nullcol':tf.cast(null_cols[k],tf.float32) for k in self.keys})
            self.layer_global_counter.assign_add(1.)
        else:
            outputs = {t:tf.where(null_cols[t],inputs[t],(self.moving_means_total[t]/self.layer_global_counter))\
                       for t in self.keys}
            outputs.update({str(k)+'__nullcol':tf.cast(null_cols[k],tf.float32) for k in self.keys})
        return outputs


class PREPROCESS_MONSOON(layers.Layer):
    def __init__(self,cat_cols_with_unique_values,num_cols):
        '''cat_cols_with_unique_values: (dict) {'col_cat':[unique_values_list]}
        num_cols: (list) [num_cols_name_list]'''
        super().__init__()
        self.cat_cols = cat_cols_with_unique_values
        self.num_cols = num_cols
    def build(self,input_shape):
        self.ntd = NUM_TO_DENSE(self.num_cols)
        self.num_colnames = self.ntd.keys_all
        self.ctd = {k:layers.DenseFeatures\
                    (feature_column.embedding_column\
                     (feature_column.categorical_column_with_vocabulary_list\
                      (k,v),tf.cast(tf.math.ceil(tf.math.log(tf.cast(len(self.cat_cols[k]),tf.float32))),tf.int32).numpy()))\
                   for k,v in self.cat_cols.items()}
        self.cat_colnames = [i for i in self.cat_cols]
        self.dense_colnames = self.num_colnames+self.cat_colnames
    def call(self,inputs,training=True):
        dense_num_d = self.ntd(inputs,training=training)
        dense_cat_d = {k:self.ctd[k](inputs) for k in self.cat_colnames}

        dense_num = tf.stack([dense_num_d[k] for k in self.num_colnames],axis=1)
        dense_cat = tf.concat([dense_cat_d[k] for k in self.cat_colnames],axis=1)
        dense_all = tf.concat([dense_num,dense_cat],axis=1)
        return dense_all

Creating data to test this:

    mnist = tf.keras.datasets.mnist

    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    x_train_ = pd.DataFrame(x_train.reshape(60000,-1),columns = ['col_'+str(i) for i in range(28*28)])
    x_test_ = pd.DataFrame(x_test.reshape(10000,-1),columns = ['col_'+str(i) for i in range(28*28)])
    x_train_['col_cat1'] = [np.random.choice(['a','b','c','d','e','f','g','h','i']) for i in range(x_train_.shape[0])]
    x_test_['col_cat1'] = [np.random.choice(['a','b','c','d','e','f','g','h','i','j']) for i in range(x_test_.shape[0])]
    x_train_['col_cat2'] = [np.random.choice(['a','b','c','d','e','f','g','h','i']) for i in range(x_train_.shape[0])]
    x_test_['col_cat2'] = [np.random.choice(['a','b','c','d','e','f','g','h','i','j']) for i in range(x_test_.shape[0])]
    x_train_[np.random.choice([True,False],size = x_train_.shape,p=[0.05,0.95]).reshape(x_train_.shape)] = np.nan
    x_test_[np.random.choice([True,False],size = x_test_.shape,p=[0.05,0.95]).reshape(x_test_.shape)] = np.nan
    x_train_.to_csv('data/x_train.csv',index=False)
    x_test_.to_csv('data/x_test.csv',index=False)

Create a batch in RAM:

cdtypes = pd.read_csv('data/x_train.csv',nrows=2).dtypes
xtb = tf.data.experimental.make_csv_dataset('data/x_train.csv',32,header=True,prefetch_buffer_size=1,
                                           column_defaults=[np.nan if i == (float or int) else '__missing__' for i in cdtypes])
for i in xtb:
    break
dd = pd.read_csv('data/x_train.csv',nrows=2).head()
num_cols = [i for i in dd.columns if i not in ['col_cat1','col_cat2']]
cat_cols = [i for i in dd.columns if i in ['col_cat1','col_cat2']]

col_cat1_unique = ['a','b','c','d','e','f','g','h','i']
col_cat2_unique = ['a','b','c','d','e','f','g','h','i']

col_cat_unique = [col_cat1_unique,col_cat2_unique]

catcoldict = {k:v for k,v in zip(cat_cols,col_cat_unique)}

Testing it: this works:

pm = PREPROCESS_MONSOON(catcoldict,num_cols)
pm(i)

This also works, but with warnings asking to file a bug report:

pm = PREPROCESS_MONSOON(catcoldict,num_cols)
@tf.function
def p(i):
    return pm(i)

p(i)
Output (along with the expected preprocessed batch):
WARNING:tensorflow:AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING: AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING:tensorflow:AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING: AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)

This fails:

pm = PREPROCESS_MONSOON(catcoldict,num_cols)

inputs = tf.keras.Input(shape=(None,786))
x = pm(inputs)
Output:
WARNING:tensorflow:AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458aa3a90>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING: AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458aa3a90>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-78-64c553138beb> in <module>
      2 
      3 inputs = tf.keras.Input(shape=(None,786))
----> 4 x = pm(inputs)
      5 # x = tf.keras.layers.Dense(500,tf.keras.layers.ReLU(100.,0.01,0.))
      6 # output = tf.keras.layers.Dense(10,tf.keras.layers.Softmax())

~/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
    771                     not base_layer_utils.is_in_eager_or_tf_function()):
    772                   with auto_control_deps.AutomaticControlDependencies() as acd:
--> 773                     outputs = call_fn(cast_inputs, *args, **kwargs)
    774                     # Wrap Tensors in `outputs` in `tf.identity` to avoid
    775                     # circular dependencies.

~/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py in wrapper(*args, **kwargs)
    235       except Exception as e:  # pylint:disable=broad-except
    236         if hasattr(e, 'ag_error_metadata'):
--> 237           raise e.ag_error_metadata.to_exception(e)
    238         else:
    239           raise

TypeError: in converted code:

    <ipython-input-66-936477fe8a70>:62 call  *
        dense_num_d = self.ntd(inputs,training=training)
    /home/nitin/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:773 __call__
        outputs = call_fn(cast_inputs, *args, **kwargs)
    <ipython-input-66-936477fe8a70>:20 call
        null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}
    <ipython-input-66-936477fe8a70>:20 <dictcomp>
        null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}
    /home/nitin/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py:862 _slice_helper
        _check_index(s)
    /home/nitin/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py:752 _check_index
        raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx))

    TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'col_0'

Could someone help me understand what is going on here and how to achieve the intended behavior?


1 Answer


I don't think the issue is related to AutoGraph; that warning only means the function is run without being transformed. From the stack trace, you can see that you are trying to index inputs with string keys inside the call method of NUM_TO_DENSE:

null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}

However, slicing a tensor in TensorFlow only works with integers, slices, ellipsis, tf.newaxis, or scalar int32/int64 tensors, never string keys. The eager call pm(i) works because a batch from make_csv_dataset is an OrderedDict keyed by column name, so inputs[k] is an ordinary dictionary lookup. tf.keras.Input(shape=(None,786)) instead produces a single symbolic tensor, so inputs['col_0'] is interpreted as a tensor slice with a string index, which raises the TypeError.
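
One way to use the layer with the functional API is to give it a dict of symbolic inputs, one Input per CSV column, mirroring what make_csv_dataset yields. A minimal sketch, assuming the num_cols, cat_cols and catcoldict built in the question; other graph-mode quirks in the layer (for example around the training flag) may still need attention, but the string-index error itself comes from feeding a single tensor instead of a dict:

pm = PREPROCESS_MONSOON(catcoldict, num_cols)

# One scalar symbolic Input per column, keyed by column name, so that
# inputs[k] inside the layer is a dict lookup rather than a tensor slice.
inputs = {name: tf.keras.Input(shape=(), name=name, dtype=tf.float32) for name in num_cols}
inputs.update({name: tf.keras.Input(shape=(), name=name, dtype=tf.string) for name in cat_cols})

x = pm(inputs)  # symbolic preprocessed batch
model = tf.keras.Model(inputs=inputs, outputs=x)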

Answered 2020-05-02T00:41:54.793