0

我正在尝试将 MLRun 集成到我用 Keras 编写的有毒评论分类代码中,但一直没能成功集成。

谁能告诉我应该修改代码的哪些地方,才能把 MLRun 集成进去?我的代码附在下面。

MLRun -链接

MLRun 是一个用于模型跟踪和部署的开源软件。我能够训练模型,但无法将其与 MLRun 集成并部署,而且我觉得文档很难理解。请帮帮忙。

import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix

from sklearn import preprocessing

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, MaxPool1D, Dropout, Dense, GlobalMaxPooling1D, Embedding, Activation
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Load the raw train/test CSV files and discard the leftover 'Unnamed: 0'
# column from each frame.
train_data = pd.read_csv('/content/toxic_train.csv').drop(columns=['Unnamed: 0'])
test_data = pd.read_csv('/content/toxic_test.csv').drop(columns=['Unnamed: 0'])

# Peek at the first rows of each frame; the result is discarded unless an
# interactive environment displays it.
train_data.head()
test_data.head()

# Cache for the default stop-word set so the NLTK corpus is read only once,
# not once per comment.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(sen, stops=None):
    """Normalize a raw comment for tokenization.

    The text is lower-cased, every non-letter character is replaced by a
    space, single letters surrounded by whitespace are dropped, whitespace is
    collapsed, and stop words are removed.

    Args:
        sen: The raw comment text.
        stops: Optional iterable of lower-case stop words to remove. When
            omitted, the NLTK English stop-word list is used.

    Returns:
        The cleaned text as single-space-separated lower-case words.
    """
    stop_set = _english_stopwords() if stops is None else set(stops)

    # Lower-case first and keep working on the lowered text; otherwise
    # capitalized stop words ("The") would never match the lower-case list.
    sentence = sen.lower()

    # Remove punctuations and numbers
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    # Filter whole words only: a substring replace would also strip stop
    # words from inside longer words (e.g. "it" inside "bitter").
    return ' '.join(word for word in sentence.split() if word not in stop_set)

import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English entries in a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Clean the comment text of both frames with preprocess_text.
train_data['comment_text'] = train_data['comment_text'].apply(preprocess_text)
test_data['comment_text'] = test_data['comment_text'].apply(preprocess_text)

# Fit a tokenizer (num_words=28164) on the training comments, convert them to
# integer sequences and pad them with maxlen=100.
token = Tokenizer(num_words=28164)
token.fit_on_texts(train_data['comment_text'])
text = pad_sequences(
    token.texts_to_sequences(train_data['comment_text']),
    maxlen=100,
)

# Target labels come from the 'toxic' column.
y = train_data['toxic'].values

# Hold out 20% of the samples, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# build the model

max_features = 28164
embedding_dim = 32

model = Sequential()
model.add(Embedding(max_features, embedding_dim))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=True))
# The LSTM returns one vector per timestep; pool over the time axis so the
# network emits a single prediction per comment, matching the one-label-per-
# comment targets in y_train / y_test.
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.summary()

# compile and train model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, batch_size=1024, validation_data=(X_test, y_test), epochs=5)

#mlrun
from cloudpickle import dumps
from mlrun import get_or_create_ctx

# No MLRun handler supplies `context` at module level, so obtain (or create)
# a run context explicitly before logging; without this the log_model call
# fails with NameError.
context = get_or_create_ctx('toxic-comments')

# NOTE(review): pickling a Keras model may not round-trip on every TensorFlow
# version -- confirm the logged artifact can be loaded back.
model_data = dumps(model)
context.log_model(key='my_model', body=model_data, model_file='my_model.pkl')

def train_iris(context: MLClientCtx, dataset: DataItem, label_column: str = "labels"):
    """Train a logistic-regression model on ``dataset`` and log the results.

    The sample is split 80/20 into train and test sets, both sets are logged
    as CSV datasets, a ``LogisticRegression`` model is fitted and evaluated,
    and the pickled model is logged together with its evaluation output.

    Args:
        context: Run context used to log datasets and the model.
        dataset: Input data item passed to ``get_sample``.
        label_column: Name of the label column passed to ``get_sample``.
    """
    raw, labels, _ = get_sample(dataset, sample=-1, label=label_column)
    X_train, X_test, y_train, y_test = train_test_split(
        raw, labels, test_size=0.2, random_state=42
    )

    # Log both splits the same way; only the test set carries extra labels.
    splits = (
        ('train_set', X_train, y_train, {}),
        ('test_set', X_test, y_test, {'labels': {"data-type": "held-out"}}),
    )
    for key, features, target, extra_kwargs in splits:
        context.log_dataset(
            key,
            df=pd.concat([features, target.to_frame()], axis=1),
            format='csv',
            index=False,
            artifact_path=context.artifact_subpath('data'),
            **extra_kwargs,
        )

    classifier = linear_model.LogisticRegression(max_iter=10000)
    classifier.fit(X_train, y_train)

    # Evaluate on the held-out split; the result is attached to the model
    # artifact as extra data.
    evaluation = eval_model_v2(context, X_test, y_test, classifier)

    context.log_model(
        "model",
        body=dumps(classifier),
        artifact_path=context.artifact_subpath("models"),
        extra_data=evaluation,
        model_file="model.pkl",
        metrics=context.results,
        labels={"class": "sklearn.linear_model.LogisticRegression"},
    )
4

1 回答 1

1

您的代码看起来可以使用 MLRun 执行。根据 MLRun 的安装方式,您必须先配置环境,使其能够找到 MLRun API。请参阅:https://docs.mlrun.org/en/latest/install.html

这是一个可用于测试的简短脚本。

from mlrun import code_to_function

# Wrap the Python file as an MLRun job whose handler is `train_iris`.
fn = code_to_function('train_iris', handler='train_iris', kind='job', filename="<YOUR PYTHON FILE PATH>")
# RUN LOCAL
fn.run(project='iris', local=True)

# RUN IN KUBERNETES (if you are running MLRUN in Kubernetes)
# local=False submits the run to the cluster instead of executing it in the
# current process.
fn.run(project='iris', local=False)

于 2021-07-07T16:50:07.783 回答