1

我使用 TensorFlow-2.0-beta1 创建了一个模型,它使用 Keras 函数式 API(Functional API)对输入数据执行回归。数据需要对分类特征进行独热编码(one-hot encoding),并对数值输入进行归一化。过去使用 TF1.11 的 Estimators API 时,可以通过特征列(feature columns)并在 ServingInputReceiver 中对特征做特征工程来解决。从 Keras 导出模型时,有没有办法做类似的事情?

import datetime
import os
import pickle

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

tf.keras.backend.clear_session()  # For easy reset of notebook state.

# Environment and path constants. The data directory is expected to sit next
# to (not inside) the current working directory.
VERSION = tf.__version__
CWD = os.getcwd()
PARENT_DIR = os.path.split(CWD)[0]
# NOTE(review): utcnow() returns a naive datetime; kept as-is because the
# consumers of DATETIME are not visible here.
DATETIME = datetime.datetime.utcnow()
DATA_DIR = os.path.join(PARENT_DIR, 'data')
train_file_path = os.path.join(DATA_DIR, 'traindf.csv')
test_file_path = os.path.join(DATA_DIR, 'testdf.csv')

# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only load these artefacts if they were produced by a trusted process.

# CATEGORIES is later indexed per feature name (CATEGORIES[feature]) to get
# the category labels used for one-hot encoding.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open(os.path.join(DATA_DIR, "CATEGORIES"), 'rb') as fileObject:
    CATEGORIES = pickle.load(fileObject)

# NUMERICSTATS is later indexed as NUMERICSTATS[feature]['mean'] / ['std']
# to normalise the continuous features.
with open(os.path.join(DATA_DIR, "NUMERICSTATS"), 'rb') as fileObject:
    NUMERICSTATS = pickle.load(fileObject)


# Discover the CSV column names by reading only the header row of the
# training file.
with open(train_file_path, 'r') as f:
    names_row = f.readline()

CSV_COLUMNS = names_row.rstrip('\n').split(',')
print(CSV_COLUMNS)


# Columns that are present in the file but are not fed to the model.
drop_columns = [
    'SubSilo',
    'Year',
    'StockID',
    'QuickRef',
    'sumUKQuantity',
    'sumNonUKQuantity',
]
columns_to_use = list(filter(lambda name: name not in drop_columns, CSV_COLUMNS))

columns_to_use  # notebook-style echo; has no effect when run as a script


# The regression target, and every remaining column treated as a feature.
LABEL_COLUMN = 'totalqty'
FEATURE_COLUMNS = [name for name in columns_to_use if name != LABEL_COLUMN]
# NOTE(review): `testdf` is not defined in the visible code -- presumably a
# DataFrame loaded from test_file_path in an earlier cell; confirm.
test_labels = testdf[LABEL_COLUMN]


# One dtype per selected CSV column, passed to make_csv_dataset as
# `column_defaults`. The column names in the comments are the ones the
# original listing annotated each entry with, in the same order.
COLUMN_DEFAULTS = (
    # ProductBrand, Department, ProductType, ProductSubType, Silo, Level,
    # BaseColour, Sport, UKSize
    [tf.dtypes.string] * 9
    # UnitCostPrice, ExVatSalesValue, RRP_GBP
    + [tf.dtypes.float32] * 3
    # Week
    + [tf.dtypes.string]
    # totalqty
    + [tf.dtypes.int32]
)

def get_dataset(file_path, batch_size=60):
    """Build a batched tf.data dataset of (features, label) from a CSV file.

    Only the columns listed in ``columns_to_use`` are read, typed according
    to ``COLUMN_DEFAULTS``; ``LABEL_COLUMN`` is split off as the label.
    The file is read exactly once (``num_epochs=1``) and in file order
    (``shuffle=False``). ``ignore_errors=True`` is passed so that, per the
    make_csv_dataset documentation, rows that fail to parse are skipped
    rather than aborting the read.

    Args:
        file_path: Path (or file pattern) of the CSV file(s) to read.
        batch_size: Number of rows per batch. Defaults to 60, which is
            artificially small to make examples easier to show.

    Returns:
        The dataset produced by ``tf.data.experimental.make_csv_dataset``.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name=LABEL_COLUMN,
        select_columns=columns_to_use,
        column_defaults=COLUMN_DEFAULTS,
        num_epochs=1,
        ignore_errors=True,
        shuffle=False)
    return dataset

# Un-preprocessed train / test datasets, built with the same CSV settings.
raw_train_data, raw_test_data = (
    get_dataset(csv_path) for csv_path in (train_file_path, test_file_path)
)


def process_categorical_data(data, categories):
    """One-hot encode a batch of categorical string values.

    Args:
        data: String tensor of raw category values.
        categories: The category labels to compare against
            (assumed to be a flat list of strings -- TODO confirm).

    Returns:
        A float32 tensor with one row per input value, holding 1.0 where the
        cleaned value equals the category label and 0.0 elsewhere.
    """
    # Strip a single leading space, then a single trailing period.
    cleaned = tf.strings.regex_replace(data, '^ ', '')
    cleaned = tf.strings.regex_replace(cleaned, r'\.$', '')

    # Turn the values into a column (one value per row) so that comparing
    # against `categories` yields one boolean per (value, category) pair.
    as_column = tf.reshape(cleaned, [-1, 1])
    is_match = tf.equal(categories, as_column)

    # Booleans -> 0.0 / 1.0.
    return tf.cast(is_match, tf.float32)


def process_continuous_data(data, mean, std):
    """Standardise numeric values and return them as a single column.

    Args:
        data: Tensor of numeric values; it is cast to float32 first.
        mean: Value subtracted from every element.
        std: Value every centred element is divided by.

    Returns:
        A float32 tensor reshaped to ``[-1, 1]``.
    """
    as_float = tf.cast(data, tf.float32)
    standardised = (as_float - mean) / std
    return tf.reshape(standardised, [-1, 1])


def preprocess(features, labels):
    """Convert a batch of raw CSV features into one dense feature tensor.

    Every feature named in ``CATEGORIES`` is one-hot encoded, every feature
    named in ``NUMERICSTATS`` is normalised with its stored mean / std, and
    the results are concatenated along axis 1 in ``FEATURE_COLUMNS`` order.

    Args:
        features: Mapping from column name to a batch tensor for that column.
            It is not modified.
        labels: Label tensor for the batch; passed through unchanged.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is the single
        concatenated tensor.
    """
    # Work on a shallow copy so the caller's mapping is left untouched.
    processed = dict(features)

    # Process categorical features.
    for feature in CATEGORIES:
        processed[feature] = process_categorical_data(
            processed[feature], CATEGORIES[feature])

    # Process continuous features.
    for feature in NUMERICSTATS:
        stats = NUMERICSTATS[feature]
        processed[feature] = process_continuous_data(
            processed[feature], stats['mean'], stats['std'])

    # Assemble features into a single tensor.
    combined = tf.concat([processed[column] for column in FEATURE_COLUMNS], 1)

    return combined, labels


# Apply the feature preprocessing to both datasets; only the training data
# is shuffled.
# NOTE(review): `traindf` is not defined in the visible code -- presumably
# the training DataFrame, so the shuffle buffer equals its row count; confirm.
train_data = raw_train_data.map(preprocess)
train_data = train_data.shuffle(len(traindf))
test_data = raw_test_data.map(preprocess)


def get_model(input_dim):
    """Build the (uncompiled) Keras regression network.

    The network is a stack of four ReLU Dense layers (244, 200, 100 and 50
    units), each with an L2(0.001) kernel regulariser and followed by
    Dropout(0.5), ending in a single-unit Dense output with no activation.

    Args:
        input_dim: (int) The shape of an item in a batch.

    Returns:
        A Keras model.
    """
    inputs = tf.keras.Input(shape=(input_dim,))

    hidden = inputs
    for units in (244, 200, 100, 50):
        dense = tf.keras.layers.Dense(
            units,
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.001),
        )
        hidden = dense(hidden)
        hidden = tf.keras.layers.Dropout(0.5)(hidden)

    outputs = tf.keras.layers.Dense(1)(hidden)

    return tf.keras.Model(inputs, outputs)


# The dataset yields (features, labels) pairs, so its output shapes unpack
# into one shape per component.
input_shape, output_shape = train_data.output_shapes
# Dimension 0 is the batch size; dimension 1 is the feature width.
input_dimension = input_shape.dims[1]

model = get_model(input_dimension)


optimizer = tf.keras.optimizers.Adam(0.001)

# Train on mean squared error; additionally report MAE, MSE and RMSE.
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mae', 'mse', tf.keras.metrics.RootMeanSquaredError()],
)


# Early stopping watches val_loss; `patience` is the number of epochs to
# check for improvement.
early_stop = tf.keras.callbacks.EarlyStopping(patience=20, monitor='val_loss')

# Display training progress by printing a single dot for each completed epoch
class PrintDot(tf.keras.callbacks.Callback):
    """Keras callback that prints a compact one-dot-per-epoch progress trail."""

    def on_epoch_end(self, epoch, logs=None):
        """Print a dot for the finished epoch.

        Args:
            epoch: Index of the epoch that just ended.
            logs: Metrics passed by Keras; unused. Defaults to None so the
                signature matches the base ``Callback.on_epoch_end``.
        """
        # Start a fresh line every 100 epochs (including epoch 0).
        if epoch % 100 == 0:
            print('')
        print('.', end='')

# Write TensorBoard logs to <PARENT_DIR>/tensorBoardLogs.
tensor_board = tf.keras.callbacks.TensorBoard(
    log_dir=os.path.join(PARENT_DIR, 'tensorBoardLogs'))

# Learning-rate schedule driven by val_loss (factor 0.2, patience 4,
# never below 1e-5).
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=4,
    min_lr=0.00001,
    verbose=1,
)

# Train for up to 100 epochs, validating against the test dataset.
history = model.fit(
    train_data,
    epochs=100,
    validation_data=test_data,
    callbacks=[early_stop, PrintDot(), tensor_board, reduce_lr],
    verbose=1,
)


# Export the trained model as a SavedModel under <PARENT_DIR>/models/1.
tf.keras.experimental.export_saved_model(
    model, saved_model_path=os.path.join(PARENT_DIR, 'models/1'))

我想要的是一个可以用 TensorFlow Serving 提供服务的模型:它接收训练数据中的 13 个原始特征,并在模型内部对它们进行预处理。这样就不需要使用 Flask 之类的东西作为中间层。

4

1 回答 1

0

您可以考虑使用 TensorFlow Transform,它在 Serving 期间应用的转换与您在 Training 期间应用的转换相同。

您可以使用以下代码替换您的函数 process_categorical_data、process_continuous_data 和 preprocess:

def preprocessing_fn(inputs):
    """Map the raw input columns to the transformed columns.

    Args:
        inputs: Mapping that provides the raw columns 'x', 'y' and 's'.

    Returns:
        A dict with four entries: 'x' minus its ``tft.mean``, 'y' passed
        through ``tft.scale_to_0_1``, 's' passed through
        ``tft.compute_and_apply_vocabulary``, and the product of the first
        two.
    """
    x, y, s = inputs['x'], inputs['y'], inputs['s']

    outputs = {}
    outputs['x_centered'] = x - tft.mean(x)
    outputs['y_normalized'] = tft.scale_to_0_1(y)
    outputs['s_integerized'] = tft.compute_and_apply_vocabulary(s)
    outputs['x_centered_times_y_normalized'] = (
        outputs['x_centered'] * outputs['y_normalized']
    )
    return outputs

# Ignore the warnings
# Run the analyze-and-transform step inside a tft_beam Context that uses a
# freshly created temporary directory.
# NOTE(review): raw_data and raw_data_metadata are not defined in the
# visible code -- they must be supplied before this runs.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    )

# The transformed dataset is a (data, metadata) pair.
transformed_data, transformed_metadata = transformed_dataset

# Show the data before and after the transformation.
print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))

更多细节请参考 TF Transform 指南、Tutorial1 和 Tutorial2。

于 2020-04-10T11:22:27.997 回答