- 操作系统平台和发行版:ubuntu 20.04
- TensorFlow版本:2.1.0
- Python版本:3.7.6
我想写一个简单的层来处理 tf.experiment.make_csv_dataset 的输出,我可以用批量平均值来估算数字 dtypes 中的缺失值,保持在测试时使用的移动平均值,为分类创建嵌入列并保持维度依赖于预定义的唯一值列表。
下面是我写的代码:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras import layers
from tensorflow import feature_column
class NUM_TO_DENSE(layers.Layer):
def __init__(self,num_cols):
super().__init__()
self.keys = num_cols
self.keys_all = self.keys+[str(i)+'__nullcol' for i in self.keys]
def build(self,input_shape):
def create_moving_mean_vars():
return tf.Variable(initial_value=0.,shape=(),dtype=tf.float32,trainable=False)
self.moving_means_total = {t:create_moving_mean_vars() for t in self.keys}
self.layer_global_counter = tf.Variable(initial_value=0.,shape=(),dtype=tf.float32,trainable=False)
def call(self,inputs, training = True):
null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}
current_means = {}
def compute_update_current_means(t):
current_mean = tf.math.divide_no_nan(tf.reduce_sum(tf.where(null_cols[t],inputs[t],0.),axis=0),\
tf.reduce_sum(tf.cast(tf.math.is_finite(inputs[t]),tf.float32),axis=0))
self.moving_means_total[t].assign_add(current_mean)
return current_mean
if training:
current_means = {t:compute_update_current_means(t) for t in self.keys}
outputs = {t:tf.where(null_cols[t],inputs[t],current_means[t]) for t in self.keys}
outputs.update({str(k)+'__nullcol':tf.cast(null_cols[k],tf.float32) for k in self.keys})
self.layer_global_counter.assign_add(1.)
else:
outputs = {t:tf.where(null_cols[t],inputs[t],(self.moving_means_total[t]/self.layer_global_counter))\
for t in self.keys}
outputs.update({str(k)+'__nullcol':tf.cast(null_cols[k],tf.float32) for k in self.keys})
return outputs
class PREPROCESS_MONSOON(layers.Layer):
def __init__(self,cat_cols_with_unique_values,num_cols):
'''cat_cols_with_unqiue_values: (dict) {'col_cat':[unique_values_list]}
num_cols: (list) [num_cols_name_list]'''
super().__init__()
self.cat_cols = cat_cols_with_unique_values
self.num_cols = num_cols
def build(self,input_shape):
self.ntd = NUM_TO_DENSE(self.num_cols)
self.num_colnames = self.ntd.keys_all
self.ctd = {k:layers.DenseFeatures\
(feature_column.embedding_column\
(feature_column.categorical_column_with_vocabulary_list\
(k,v),tf.cast(tf.math.ceil(tf.math.log(tf.cast(len(self.cat_cols[k]),tf.float32))),tf.int32).numpy()))\
for k,v in self.cat_cols.items()}
self.cat_colnames = [i for i in self.cat_cols]
self.dense_colnames = self.num_colnames+self.cat_colnames
def call(self,inputs,training=True):
dense_num_d = self.ntd(inputs,training=training)
dense_cat_d = {k:self.ctd[k](inputs) for k in self.cat_colnames}
dense_num = tf.stack([dense_num_d[k] for k in self.num_colnames],axis=1)
dense_cat = tf.concat([dense_cat_d[k] for k in self.cat_colnames],axis=1)
dense_all = tf.concat([dense_num,dense_cat],axis=1)
return dense_all
创建数据来测试这个
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
x_train_ = pd.DataFrame(x_train.reshape(60000,-1),columns = ['col_'+str(i) for i in range(28*28)])
x_test_ = pd.DataFrame(x_test.reshape(10000,-1),columns = ['col_'+str(i) for i in range(28*28)])
x_train_['col_cat1'] = [np.random.choice(['a','b','c','d','e','f','g','h','i']) for i in range(x_train_.shape[0])]
x_test_['col_cat1'] = [np.random.choice(['a','b','c','d','e','f','g','h','i','j']) for i in range(x_test_.shape[0])]
x_train_['col_cat2'] = [np.random.choice(['a','b','c','d','e','f','g','h','i']) for i in range(x_train_.shape[0])]
x_test_['col_cat2'] = [np.random.choice(['a','b','c','d','e','f','g','h','i','j']) for i in range(x_test_.shape[0])]
x_train_[np.random.choice([True,False],size = x_train_.shape,p=[0.05,0.95]).reshape(x_train_.shape)] = np.nan
x_test_[np.random.choice([True,False],size = x_test_.shape,p=[0.05,0.95]).reshape(x_test_.shape)] = np.nan
x_train_.to_csv('data/x_train.csv',index=False)
x_test_.to_csv('data/x_test.csv',index=False)
在 ram 中创建一批
cdtypes = pd.read_csv('data/x_train.csv',nrows=2).dtypes
xtb = tf.data.experimental.make_csv_dataset('data/x_train.csv',32,header=True,prefetch_buffer_size=1,
column_defaults=[np.nan if i == (float or int) else '__missing__' for i in cdtypes])
for i in xtb:
break
dd = pd.read_csv('data/x_train.csv',nrows=2).head()
num_cols = [i for i in dd.columns if i not in ['col_cat1','col_cat2']]
cat_cols = [i for i in dd.columns if i in ['col_cat1','col_cat2']]
col_cat1_unique = ['a','b','c','d','e','f','g','h','i']
col_cat2_unique = ['a','b','c','d','e','f','g','h','i']
col_cat_unique = [col_cat1_unique,col_cat2_unique]
catcoldict = {k:v for k,v in zip(cat_cols,col_cat_unique)}
测试它:这有效:
pm = PREPROCESS_MONSOON(catcoldict,num_cols)
pm(i)
这适用于错误报告
pm = PREPROCESS_MONSOON(catcoldict,num_cols)
@tf.function
def p(i):
return pm(i)
p(i)
output: (along with the expected preprocessed batch)
WARNING:tensorflow:AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING: AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING:tensorflow:AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING: AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458a0ec50>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
这失败了
pm = PREPROCESS_MONSOON(catcoldict,num_cols)
inputs = tf.keras.Input(shape=(None,786))
x = pm(inputs)
output:
WARNING:tensorflow:AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458aa3a90>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
WARNING: AutoGraph could not transform <bound method NUM_TO_DENSE.call of <__main__.NUM_TO_DENSE object at 0x7f6458aa3a90>> and will run it as-is.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unexpected indent (<unknown>, line 10)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-78-64c553138beb> in <module>
2
3 inputs = tf.keras.Input(shape=(None,786))
----> 4 x = pm(inputs)
5 # x = tf.keras.layers.Dense(500,tf.keras.layers.ReLU(100.,0.01,0.))
6 # output = tf.keras.layers.Dense(10,tf.keras.layers.Softmax())
~/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
771 not base_layer_utils.is_in_eager_or_tf_function()):
772 with auto_control_deps.AutomaticControlDependencies() as acd:
--> 773 outputs = call_fn(cast_inputs, *args, **kwargs)
774 # Wrap Tensors in `outputs` in `tf.identity` to avoid
775 # circular dependencies.
~/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/autograph/impl/api.py in wrapper(*args, **kwargs)
235 except Exception as e: # pylint:disable=broad-except
236 if hasattr(e, 'ag_error_metadata'):
--> 237 raise e.ag_error_metadata.to_exception(e)
238 else:
239 raise
TypeError: in converted code:
<ipython-input-66-936477fe8a70>:62 call *
dense_num_d = self.ntd(inputs,training=training)
/home/nitin/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/base_layer.py:773 __call__
outputs = call_fn(cast_inputs, *args, **kwargs)
<ipython-input-66-936477fe8a70>:20 call
null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}
<ipython-input-66-936477fe8a70>:20 <dictcomp>
null_cols = {k:tf.math.is_finite(inputs[k]) for k in self.keys}
/home/nitin/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py:862 _slice_helper
_check_index(s)
/home/nitin/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/ops/array_ops.py:752 _check_index
raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx))
TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'col_0'
有人可以帮助我了解这里发生了什么以及如何实现预期的行为