有没有人有一个将 Hyperopt 集成到 Spark 的 MlLib 中的好例子?我一直在尝试在 Databricks 上这样做并继续遇到同样的错误。我不确定这是否是我的目标函数的问题,或者是否与 pyspark 上的 Spark ML 以及它如何与 Databricks 挂钩。
import itertools
from pyspark.sql import functions as f
from pyspark.sql import DataFrame
from pyspark.sql.types import *
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, Imputer, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
from itertools import product
from hyperopt import fmin, hp, tpe, STATUS_OK, SparkTrials
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
search_space ={'maxDepth' : hp.choice("maxDepth", np.arange(3, 8, dtype=int)),
'maxIter' : hp.uniform("maxIter", 200,800),
'featureSubsetStrategy' : str(hp.quniform("featureSubsetStrategy", .5,1,.1)),
'minInstancesPerNode' : hp.uniform("min_child_weight", 1,10),
'stepSize' : hp.loguniform('stepSize', np.log(0.01), np.log(0.1)),
'subsamplingRate' : hp.quniform("featureSubsetStrategy", .5,1,.1)
}
evaluator = BinaryClassificationEvaluator(labelCol="positive")
def train(params):
gbtModel = GBTClassifier(labelCol="positive", featuresCol="features").fit(train)
predictions_val = gbtModel.predict(val.map(lambda x: x.features))
labelsAndPredictions = val.map(lambda lp: lp.label).zip(predictions_val)
ROC = evaluator.evaluate(predictions_val, {evaluator.metricName: "areaUnderROC"})
return {'ROC': ROC, 'status': STATUS_OK}
N_HYPEROPT_PROBES = 1000 #can increase, keep small for testing
EARLY_STOPPING = 50
HYPEROPT_ALGO = tpe.suggest
NB_CV_FOLDS = 5 # for testing, can increase
obj_call_count = 0
cur_best_score = 1000000
spark_trials = SparkTrials(parallelism=4)
best = fmin(fn=train,
space=search_space,
algo=HYPEROPT_ALGO,
max_evals=N_HYPEROPT_PROBES,
trials=spark_trials,
verbose=1)
运行后,我收到以下错误:
Total Trials: 0: 0 succeeded, 0 failed, 0 cancelled. py4j.Py4JException: Method __getstate__([]) does not exist