
I'm running into a strange error: my code fails when run with GridSearchCV, but works fine when I run sklearn's MLPRegressor on its own.

The following code:

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd
import numpy as np

def str_to_num(arr):
    le = preprocessing.LabelEncoder()
    new_arr = le.fit_transform(arr)
    return new_arr

def compare_values(arr1, arr2):
    thediff = 0
    thediffs = []
    for thing1, thing2 in zip(arr1, arr2):
        thediff = abs(thing1 - thing2)
        thediffs.append(thediff)

    return thediffs

def print_to_file(filepath, arr):
    with open(filepath, 'w') as f:
        for item in arr:
            f.write("%s\n" % item)

data = pd.read_csv('data2.csv')

# create the labels, or field we are trying to estimate
label = data['TOTAL']
# remove the header
label = label[1:]

# create the data, or the data that is to be estimated
data = data.drop('TOTAL', axis=1)
data = data.drop('SERIALNUM', axis=1)
# remove the header
data = data[1:]

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.2)

mlp = MLPRegressor(activation = 'relu', solver = 'lbfgs', verbose=False)
mlp.fit(X_train, y_train)
mlp_predictions = mlp.predict(X_test)
mlp_differences = compare_values(y_test, mlp_predictions)
mlp_Avg = np.average(mlp_differences)
print(mlp_Avg)

prints the following:

32.92041129078561 (yes, I know the average error is terrible)
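(As an aside, the average absolute difference computed by compare_values is just the mean absolute error. A minimal equivalent using scikit-learn's built-in metric, assuming the y_test and mlp_predictions variables from the script above are still in scope:)

# Same quantity via sklearn's built-in metric (assumes y_test and
# mlp_predictions from the script above are in scope)
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, mlp_predictions)
print(mae)  # matches np.average(compare_values(y_test, mlp_predictions))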

However, when I try to optimize the parameters, the same settings produce an error:

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
import pandas as pd
import numpy as np


def str_to_num(arr):
    le = preprocessing.LabelEncoder()
    new_arr = le.fit_transform(arr)
    return new_arr

def compare_values(arr1, arr2):
    thediff = 0
    thediffs = []
    for thing1, thing2 in zip(arr1, arr2):
        thediff = abs(thing1 - thing2)
        thediffs.append(thediff)

    return thediffs

def print_to_file(filepath, arr):
    with open(filepath, 'w') as f:
        for item in arr:
            f.write("%s\n" % item)

data = pd.read_csv('data2.csv')

# create the labels, or field we are trying to estimate
label = data['TOTAL_DAYS_TO_COMPLETE']
# remove the header
label = label[1:]

# create the data, or the data that is to be estimated
data = data.drop('TOTAL_DAYS_TO_COMPLETE', axis=1)
data = data.drop('SERIALNUM', axis=1)
# remove the header
data = data[1:]

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.2)

param_grid = {
    #'hidden_layer_sizes': [(1,),(2,),(3,),(10,),(15,),(20,),(25,)],
    'activation': ['identity', 'logistic', 'relu'],
    #'activation': ['relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    #'solver': ['adam']
    #'alpha': [0.0001, 0.0005, 0.0009],
    #'learning_rate': ['constant', 'invscaling', 'adaptive'],
    #'learning_rate_init': [0.001, 0.01, 0.99],
    #'warm_start': [True, False]
    #'momentum': [0.1, 0.9, 0.99]
    # Did not include solver-specific params... yet
}

# Create the base model
mlp = MLPRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print()
print(grid_search.best_params_)
print(grid_search.best_score_)
print()
print("Grid scores on development set: ")
print()
answers = grid_search.predict(X_test)
results = compare_values(answers, y_test)
print("Accuracy: ", np.average(results))
print()

produces the following:

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[CV] activation=identity, solver=lbfgs ...............................
[CV] activation=identity, solver=lbfgs ...............................
[CV] activation=identity, solver=sgd .................................
C:\Python367-64\lib\site-packages\sklearn\neural_network\_base.py:195: RuntimeWarning: overflow encountered in square
  return ((y_true - y_pred) ** 2).mean() / 2
[CV] activation=identity, solver=adam ................................
[CV] activation=identity, solver=lbfgs ...............................
[CV] activation=identity, solver=sgd .................................
[CV] activation=identity, solver=sgd .................................

<removed extra lines that worked fine>

!!! This is where it starts to fail !!!

[CV] .................... activation=relu, solver=lbfgs, total=   0.5s

joblib.externals.loky.process_executor._RemoteTraceback:
"""
Traceback (most recent call last):
  File "C:\Python367-64\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
    r = call_item()
  File "C:\Python367-64\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Python367-64\lib\site-packages\joblib\_parallel_backends.py", line 567, in __call__
    return self.func(*args, **kwargs)
  File "C:\Python367-64\lib\site-packages\joblib\parallel.py", line 225, in __call__
  ..., in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "C:\Python367-64\lib\site-packages\sklearn\base.py", line 410, in score
    y_type, _, _, _ = _check_reg_targets(y, y_pred, None)
  File "C:\Python367-64\lib\site-packages\sklearn\metrics\regression.py", line 79, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False)
  File "C:\Python367-64\lib\site-packages\sklearn\utils\validation.py", line 542, in check_array
    allow_nan=force_all_finite == 'allow-nan')
  File "C:\Python367-64\lib\site-packages\sklearn\utils\validation.py", line 56, in _assert_all_finite
    raise ValueError(msg_err.format(type_err, X.dtype))
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
"""

<the same traceback is repeated for the other failed fits>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "mlp_optimizer.py", line 93, in <module>
    grid_search.fit(X_train, y_train)
  File "C:\Python367-64\lib\site-packages\sklearn\model_selection\_search.py", line 687, in fit
    self._run_search(evaluate_candidates)
  File "C:\Python367-64\lib\site-packages\sklearn\model_selection\_search.py", line 1148, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "C:\Python367-64\lib\site-packages\sklearn\model_selection\_search.py", line 666, in evaluate_candidates
    cv.split(X, y, groups)))
  File "C:\Python367-64\lib\site-packages\joblib\parallel.py", line 934, in __call__
    self.retrieve()
  File "C:\Python367-64\lib\site-packages\joblib\parallel.py", line 833, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "C:\Python367-64\lib\site-packages\joblib\_parallel_backends.py", line 521, in wrap_future_result
    return future.result(timeout=timeout)
  File "C:\Python367-64\lib\concurrent\futures\_base.py", line 432, in result
    return self.__get_result()
  File "C:\Python367-64\lib\concurrent\futures\_base.py", line 384, in __get_result
    raise self._exception
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Why does it work without GridSearchCV, but fail as soon as GridSearchCV is used?


1 Answer


The problem is related to this line:

'solver': ['lbfgs', 'sgd', 'adam'],

According to the documentation, the sgd option requires certain other parameters to be kept within specific thresholds.

Simply changing

'solver': ['lbfgs', 'sgd', 'adam'],

to

'solver': ['lbfgs', 'adam'],

solved the problem.
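For reference, a minimal sketch of the adjusted search (same X_train/y_train as in the question). Dropping 'sgd' from the grid is the actual fix; the StandardScaler pipeline is an optional extra, not part of the original answer, that tends to keep MLPRegressor from overflowing in the first place:

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The fix: leave 'sgd' out of the solver grid
param_grid = {
    'mlp__activation': ['identity', 'logistic', 'relu'],
    'mlp__solver': ['lbfgs', 'adam'],
}

# Optional extra (not part of the original answer): scale the inputs so the
# network is less likely to overflow, whichever solver is chosen
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('mlp', MLPRegressor()),
])

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)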

answered 2019-08-14T14:29:42.733