I am trying to integrate MLRun with my Keras code for classifying toxic comments, but I cannot figure out how to hook it in.
Can anyone tell me where to modify my code so that MLRun is integrated? I have attached my code below.
MLRun - link
MLRun is open-source software for tracking models and deployments. I am able to train the model, but I cannot integrate it with MLRun and deploy it, and I find the documentation hard to follow. Please help.
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, MaxPool1D, Dropout, Dense, GlobalMaxPooling1D, Embedding, Activation
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
train_data = pd.read_csv('/content/toxic_train.csv')
test_data = pd.read_csv('/content/toxic_test.csv')
# drop unnamed column
train_data = train_data.drop(columns=['Unnamed: 0'])
train_data.head()
test_data = test_data.drop(columns=['Unnamed: 0'])
test_data.head()
def preprocess_text(sen):
    # lowercase the text
    sentence = sen.lower()
    # remove punctuation and numbers (from `sentence`, not `sen`, so the lowercasing is kept)
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    # drop stopwords word by word (str.replace would also mangle substrings inside other words)
    stops = set(stopwords.words('english'))
    sentence = ' '.join(word for word in sentence.split() if word not in stops)
    return sentence
# make sure the NLTK stopword list is available before preprocessing runs
nltk.download('stopwords')
# preprocess data
train_data['comment_text'] = train_data['comment_text'].apply(preprocess_text)
test_data['comment_text'] = test_data['comment_text'].apply(preprocess_text)
# tokenize the data
token = Tokenizer(num_words=28164)  # keep the 28,164 most frequent words
token.fit_on_texts(train_data['comment_text'])
text = token.texts_to_sequences(train_data['comment_text'])
text = pad_sequences(text, maxlen=100)
y = train_data['toxic'].values
# split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(text, y, test_size=0.2, random_state=1, stratify=y)
# build the model
max_features = 28164
embedding_dim = 32
model = Sequential()
model.add(Embedding(max_features, embedding_dim))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))
model.add(GlobalMaxPooling1D())  # collapse the per-timestep outputs to a single vector
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()
# compile and train model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=1024, validation_data=(X_test, y_test), epochs=5)
# mlrun -- this is the part I cannot get to work: `context` is not defined anywhere here
from cloudpickle import dumps
model_data = dumps(model)
context.log_model(key='my_model', body=model_data, model_file='my_model.pkl')
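From the docs, I think my mistake is that `context` has to come from MLRun itself: either as the first argument of a handler function, or via mlrun.get_or_create_ctx when the script runs standalone. This is my best guess at a minimal fix (the context name and the logged result are my own additions, not tested):

import mlrun

# obtain a run context so log_result / log_model have somewhere to record to
context = mlrun.get_or_create_ctx('train-toxic-comments')
# log the final training accuracy next to the model
context.log_result('accuracy', float(history.history['accuracy'][-1]))
context.log_model(key='my_model', body=dumps(model), model_file='my_model.pkl')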
# the example from the MLRun docs that I was trying to adapt
from sklearn import linear_model
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
# as far as I can tell, these helpers live in mlrun.mlutils
from mlrun.mlutils.data import get_sample
from mlrun.mlutils.models import eval_model_v2

def train_iris(context: MLClientCtx, dataset: DataItem, label_column: str = "labels"):
    raw, labels, header = get_sample(dataset, sample=-1, label=label_column)
    # basic scikit-learn model on the Iris dataset
    X_train, X_test, y_train, y_test = train_test_split(raw, labels, test_size=0.2, random_state=42)
    context.log_dataset('train_set',
                        df=pd.concat([X_train, y_train.to_frame()], axis=1),
                        format='csv', index=False,
                        artifact_path=context.artifact_subpath('data'))
    context.log_dataset('test_set',
                        df=pd.concat([X_test, y_test.to_frame()], axis=1),
                        format='csv', index=False,
                        labels={"data-type": "held-out"},
                        artifact_path=context.artifact_subpath('data'))
    model = linear_model.LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)
    # evaluate the model and collect the evaluation metrics
    eval_metrics = eval_model_v2(context, X_test, y_test, model)
    # log the model together with its metrics
    context.log_model("model",
                      body=dumps(model),
                      artifact_path=context.artifact_subpath("models"),
                      extra_data=eval_metrics,
                      model_file="model.pkl",
                      metrics=context.results,
                      labels={"class": "sklearn.linear_model.LogisticRegression"})