I have created a model using TensorFlow 2.0-beta1. It performs regression on the input data using the Keras functional API. The data needs the categorical features one-hot encoded and the numeric inputs normalized. In the past, using the Estimators API in TF 1.11, this could be handled with feature columns, applying the engineering to the features inside a ServingInputReceiver. Is there a way to do something similar when exporting a model from Keras?
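For reference, this is roughly the TF 1.x pattern I am coming from (a minimal sketch with illustrative feature names, vocabularies, and statistics, not my actual Estimator code):

import tensorflow as tf  # TF 1.x

# The feature columns own the preprocessing: one-hot via an indicator
# column, normalization via normalizer_fn.
feature_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'Department', ['Mens', 'Womens'])),                # illustrative vocabulary
    tf.feature_column.numeric_column(
        'RRP_GBP', normalizer_fn=lambda x: (x - 25.0) / 10.0),  # illustrative stats
]

def serving_input_receiver_fn():
    # Raw placeholders matching the request payload; the feature columns
    # apply the engineering when the model's input layer consumes them.
    receiver_tensors = {
        'Department': tf.placeholder(tf.string, [None]),
        'RRP_GBP': tf.placeholder(tf.float32, [None]),
    }
    return tf.estimator.export.ServingInputReceiver(
        features=receiver_tensors, receiver_tensors=receiver_tensors)

estimator = tf.estimator.DNNRegressor(
    feature_columns=feature_columns, hidden_units=[100, 50])
estimator.export_savedmodel('export_dir', serving_input_receiver_fn)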
import os
import datetime
import pickle

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

tf.keras.backend.clear_session()  # For easy reset of notebook state.
VERSION = tf.__version__
CWD = os.getcwd()
PARENT_DIR = os.path.split(CWD)[0]
DATETIME = datetime.datetime.utcnow()
DATA_DIR = os.path.join(PARENT_DIR, 'data')
train_file_path = os.path.join(DATA_DIR, 'traindf.csv')
test_file_path = os.path.join(DATA_DIR, 'testdf.csv')

# Load the CSVs into pandas as well; traindf/testdf are referenced later
# (shuffle buffer size and test labels).
traindf = pd.read_csv(train_file_path)
testdf = pd.read_csv(test_file_path)
CATEGORIES_PATH = os.path.join(DATA_DIR, "CATEGORIES")
with open(CATEGORIES_PATH, 'rb') as fileObject:
    CATEGORIES = pickle.load(fileObject)

NUMERICSTATS_PATH = os.path.join(DATA_DIR, "NUMERICSTATS")
with open(NUMERICSTATS_PATH, 'rb') as fileObject:
    NUMERICSTATS = pickle.load(fileObject)
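The two pickles are lookup tables computed from the training set. Inferred from how they are used in preprocess below, their assumed shape is (values illustrative):

# CATEGORIES maps each categorical feature to its vocabulary list:
#   {'Department': ['Mens', 'Womens', 'Kids'], ...}
# NUMERICSTATS maps each numeric feature to its training-set statistics:
#   {'RRP_GBP': {'mean': 24.99, 'std': 10.5}, ...}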
# CSV columns in the input file.
with open(train_file_path, 'r') as f:
names_row = f.readline()
CSV_COLUMNS = names_row.rstrip('\n').split(',')
print(CSV_COLUMNS)
drop_columns = ['SubSilo','Year','StockID', 'QuickRef', 'sumUKQuantity', 'sumNonUKQuantity']
columns_to_use = [col for col in CSV_COLUMNS if col not in drop_columns]
print(columns_to_use)
LABEL_COLUMN = 'totalqty'
FEATURE_COLUMNS = [column for column in columns_to_use if column != LABEL_COLUMN]
test_labels = testdf[LABEL_COLUMN]
COLUMN_DEFAULTS = [tf.dtypes.string, #ProductBrand
tf.dtypes.string, #Department
tf.dtypes.string, #ProductType
tf.dtypes.string, #ProductSubType
tf.dtypes.string, #Silo
tf.dtypes.string, #Level
tf.dtypes.string, #BaseColour
tf.dtypes.string, #Sport
tf.dtypes.string, #UKSize
tf.dtypes.float32, #UnitCostPrice
tf.dtypes.float32, #ExVatSalesValue
tf.dtypes.float32, #RRP_GBP
tf.dtypes.string, #Week
tf.dtypes.int32] #totalqty
def get_dataset(file_path):
dataset = tf.data.experimental.make_csv_dataset(
file_path,
batch_size=60, # Artificially small to make examples easier to show.
label_name=LABEL_COLUMN,
select_columns=columns_to_use ,
column_defaults=COLUMN_DEFAULTS,
num_epochs=1,
ignore_errors=True,
shuffle=False)
return dataset
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)
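To sanity-check the pipeline, one can peek at a single raw batch; make_csv_dataset yields an (OrderedDict of column tensors, label tensor) pair:

# Inspect one raw batch: features maps column name -> tensor of shape (batch,).
for features, labels in raw_train_data.take(1):
    for name, values in features.items():
        print(name, values.dtype, values.shape)
    print('labels', labels.dtype, labels.shape)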
def process_categorical_data(data, categories):
"""Returns a one-hot encoded tensor representing categorical values."""
# Remove leading ' '.
data = tf.strings.regex_replace(data, '^ ', '')
# Remove trailing '.'.
data = tf.strings.regex_replace(data, r'\.$', '')
# ONE HOT ENCODE
# Reshape data from 1d (a list) to a 2d (a list of one-element lists)
data = tf.reshape(data, [-1, 1])
# For each element, create a new list of boolean values the length of categories,
# where the truth value is element == category label
data = tf.equal(categories, data)
# Cast booleans to floats.
data = tf.cast(data, tf.float32)
# The entire encoding can fit on one line:
# data = tf.cast(tf.equal(categories, tf.reshape(data, [-1, 1])), tf.float32)
return data
def process_continuous_data(data, mean, std):
# Normalize data
data = (tf.cast(data, tf.float32) - mean) / std
return tf.reshape(data, [-1, 1])
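A quick illustration of what the two helpers produce, with a made-up vocabulary and made-up statistics:

# Made-up vocabulary of three colours, batch of two values.
print(process_categorical_data(tf.constant(['green', 'red']),
                               tf.constant(['red', 'green', 'blue'])))
# -> [[0., 1., 0.], [1., 0., 0.]]

# Made-up stats: mean 10, std 2.
print(process_continuous_data(tf.constant([8.0, 14.0]), 10.0, 2.0))
# -> [[-1.], [ 2.]]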
def preprocess(features, labels):
    # Process categorical features.
for feature in CATEGORIES.keys():
features[feature] = process_categorical_data(features[feature], CATEGORIES[feature])
# Process continuous features.
for feature in NUMERICSTATS.keys():
features[feature] = process_continuous_data(features[feature],
NUMERICSTATS[feature]['mean'],
NUMERICSTATS[feature]['std']
)
# Assemble features into a single tensor.
features = tf.concat([features[column] for column in FEATURE_COLUMNS], 1)
return features, labels
train_data = raw_train_data.map(preprocess).shuffle(len(traindf))
test_data = raw_test_data.map(preprocess)
def get_model(input_dim):
"""Create a Keras model with layers.
Args:
input_dim: (int) The shape of an item in a batch.
Returns:
A Keras model.
"""
inputs = tf.keras.Input(shape=(input_dim,))
x = tf.keras.layers.Dense(244, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(inputs)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(200, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(100, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(50, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)
return model
input_shape, output_shape = train_data.output_shapes
input_dimension = input_shape.dims[1] # [0] is the batch size
model = get_model(input_dimension)
optimizer = tf.keras.optimizers.Adam(0.001)
model.compile(loss='mse',
optimizer=optimizer,
metrics=['mae', 'mse', tf.keras.metrics.RootMeanSquaredError()])
# The patience parameter is the amount of epochs to check for improvement
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)
# Display training progress by printing a single dot for each completed epoch
class PrintDot(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 100 == 0:
            print('')
        print('.', end='')
tensor_board = tf.keras.callbacks.TensorBoard(log_dir=os.path.join(PARENT_DIR, 'tensorBoardLogs'))
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
factor=0.2,
patience=4,
verbose=1,
min_lr=0.00001)
history = model.fit(train_data,
validation_data=test_data,
epochs=100,
verbose=1,
callbacks=[early_stop,
PrintDot(),
tensor_board,
reduce_lr]
)
tf.keras.experimental.export_saved_model(model, saved_model_path=os.path.join(PARENT_DIR, 'models/1'))
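After the export, the signature can be inspected with the saved_model_cli tool that ships with TensorFlow; it shows that the current export expects the already-preprocessed dense tensor rather than the 13 raw columns:

saved_model_cli show --dir models/1 --tag_set serve --signature_def serving_default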
What I want is a model that I can serve with TensorFlow Serving, one that takes the features as they appear in the training data, all 13 of them, and preprocesses them inside the model itself, so that nothing like Flask is needed as a middleman.
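Concretely, this is roughly the serving contract I am after (a sketch of the idea, reusing the preprocessing helpers above; the output key and export path are illustrative, and I do not have this working):

# Build one raw TensorSpec per feature: strings for the categorical
# columns, float32 for the numeric ones.
raw_signature = [{
    name: tf.TensorSpec([None],
                        tf.string if name in CATEGORIES else tf.float32,
                        name=name)
    for name in FEATURE_COLUMNS
}]

@tf.function(input_signature=raw_signature)
def serve_raw(features):
    # Apply the same training-time preprocessing inside the graph.
    parts = []
    for name in FEATURE_COLUMNS:
        if name in CATEGORIES:
            parts.append(process_categorical_data(features[name], CATEGORIES[name]))
        else:
            parts.append(process_continuous_data(features[name],
                                                 NUMERICSTATS[name]['mean'],
                                                 NUMERICSTATS[name]['std']))
    return {'totalqty': model(tf.concat(parts, axis=1))}

tf.saved_model.save(model, os.path.join(PARENT_DIR, 'models/2'),  # illustrative path
                    signatures={'serving_default': serve_raw})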