
I am working on a Kaggle problem: https://www.kaggle.com/c/forest-cover-type-prediction/data. I am using hyperopt to find the best hyperparameters for a Random Forest, but I am stuck: on almost every iteration it reports best loss: nan.

My full code:

import pandas as pd
import numpy as np

# Let's load the dataset

train = pd.read_csv(r"D:\Study Material\Py_Programs\Data Sets\forest-cover-type-prediction\train.csv")
test =  pd.read_csv(r"D:\Study Material\Py_Programs\Data Sets\forest-cover-type-prediction\test.csv")

# Let's append train and test together so that we can study them as one dataset
# Let's not include test_2 for a while

test['Cover_Type'] = np.nan
data = train.append(test,ignore_index = True)
del train,test

# Let's now do feature engineering
# We could do manual feature engineering, but let's not
# Let's use Featuretools instead
# Let's first create a simple new attribute that can later be the index for the Soil entity

data['Id_Soil'] = np.arange(len(data))

import featuretools as ft
es = ft.EntitySet(id = 'forest')
es = es.entity_from_dataframe(entity_id = 'Forest_Pred',dataframe = data,index = 'Id')

>>> es
Entityset: forest
  Entities:
    Forest_Pred [Rows: 581012, Columns: 63]
  Relationships:
    No relationships

# Let's make a separate entity for Soil
Additional_Variable = data.columns[data.columns.str.startswith('Soil')]
Additional_Variable
es = es.normalize_entity(base_entity_id = 'Forest_Pred',new_entity_id = 'Soil',index = 'Id_Soil',additional_variables = 
                        list(Additional_Variable))

>>> es
Entityset: forest
  Entities:
    Forest_Pred [Rows: 581012, Columns: 23]
    Soil [Rows: 581012, Columns: 41]
  Relationships:
    Forest_Pred.Id_Soil -> Soil.Id_Soil

# Let's run DFS
feature_matrix,feature_defs = ft.dfs(entityset = es,target_entity = 'Forest_Pred')

drop_cols = []
for col in feature_matrix:
    if col == 'Cover_Type':
        pass
    else:
        if 'Cover_Type' in col:
            drop_cols.append(col)

feature_matrix = feature_matrix[[x for x in feature_matrix if x not in drop_cols]]         
feature_matrix.head()


# Create correlation matrix
corr_matrix = feature_matrix.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] >= 0.95)]

print('There are {} columns with >= 0.95 correlation.'.format(len(to_drop)))

There are 83 columns with >= 0.95 correlation.

>>> to_drop

# These are the redundant columns

['New2',      # I manually created New2, Hill1 and Hill3
 'Hill1',
 'Hill3',
 'Soil.SUM(Forest_Pred.wild1)',
 'Soil.SUM(Forest_Pred.Vertical_Distance_To_Hydrology)',
 'Soil.SUM(Forest_Pred.Hillshade_Noon)',
 'Soil.SUM(Forest_Pred.Horizontal_Distance_To_Roadways)',
 'Soil.SUM(Forest_Pred.Slope)',
 'Soil.SUM(Forest_Pred.Wilderness_Area4)',
 'Soil.SUM(Forest_Pred.New4)',
 'Soil.SUM(Forest_Pred.Hill2)',
 'Soil.SUM(Forest_Pred.New2)',
 'Soil.SUM(Forest_Pred.Wilderness_Area2)',
 'Soil.SUM(Forest_Pred.Horizontal_Distance_To_Hydrology)',
 'Soil.SUM(Forest_Pred.Hillshade_9am)',
 'Soil.SUM(Forest_Pred.Aspect)',
 'Soil.SUM(Forest_Pred.Hillshade_3pm)',
 'Soil.SUM(Forest_Pred.Hill1)',
 'Soil.SUM(Forest_Pred.Hill3)',
 'Soil.SUM(Forest_Pred.Elevation)',
 'Soil.SUM(Forest_Pred.Wilderness_Area3)',
 'Soil.SUM(Forest_Pred.Horizontal_Distance_To_Fire_Points)',
 'Soil.SUM(Forest_Pred.Wilderness_Area1)',
 'Soil.MAX(Forest_Pred.wild1)',
 'Soil.MAX(Forest_Pred.Vertical_Distance_To_Hydrology)',
 'Soil.MAX(Forest_Pred.Hillshade_Noon)',
 'Soil.MAX(Forest_Pred.Horizontal_Distance_To_Roadways)',
 'Soil.MAX(Forest_Pred.Slope)',
 'Soil.MAX(Forest_Pred.Wilderness_Area4)',
 'Soil.MAX(Forest_Pred.New4)',
 'Soil.MAX(Forest_Pred.Hill2)',
 'Soil.MAX(Forest_Pred.New2)',
 'Soil.MAX(Forest_Pred.Wilderness_Area2)',
 'Soil.MAX(Forest_Pred.Horizontal_Distance_To_Hydrology)',
 'Soil.MAX(Forest_Pred.Hillshade_9am)',
 'Soil.MAX(Forest_Pred.Aspect)',
 'Soil.MAX(Forest_Pred.Hillshade_3pm)',
 'Soil.MAX(Forest_Pred.Hill1)',
 'Soil.MAX(Forest_Pred.Hill3)',
 'Soil.MAX(Forest_Pred.Elevation)',
 'Soil.MAX(Forest_Pred.Wilderness_Area3)',
 'Soil.MAX(Forest_Pred.Horizontal_Distance_To_Fire_Points)',
 'Soil.MAX(Forest_Pred.Wilderness_Area1)',
 'Soil.MIN(Forest_Pred.wild1)',
 'Soil.MIN(Forest_Pred.Vertical_Distance_To_Hydrology)',
 'Soil.MIN(Forest_Pred.Hillshade_Noon)',
 'Soil.MIN(Forest_Pred.Horizontal_Distance_To_Roadways)',
 'Soil.MIN(Forest_Pred.Slope)',
 'Soil.MIN(Forest_Pred.Wilderness_Area4)',
 'Soil.MIN(Forest_Pred.New4)',
 'Soil.MIN(Forest_Pred.Hill2)',
 'Soil.MIN(Forest_Pred.New2)',
 'Soil.MIN(Forest_Pred.Wilderness_Area2)',
 'Soil.MIN(Forest_Pred.Horizontal_Distance_To_Hydrology)',
 'Soil.MIN(Forest_Pred.Hillshade_9am)',
 'Soil.MIN(Forest_Pred.Aspect)',
 'Soil.MIN(Forest_Pred.Hillshade_3pm)',
 'Soil.MIN(Forest_Pred.Hill1)',
 'Soil.MIN(Forest_Pred.Hill3)',
 'Soil.MIN(Forest_Pred.Elevation)',
 'Soil.MIN(Forest_Pred.Wilderness_Area3)',
 'Soil.MIN(Forest_Pred.Horizontal_Distance_To_Fire_Points)',
 'Soil.MIN(Forest_Pred.Wilderness_Area1)',
 'Soil.MEAN(Forest_Pred.wild1)',
 'Soil.MEAN(Forest_Pred.Vertical_Distance_To_Hydrology)',
 'Soil.MEAN(Forest_Pred.Hillshade_Noon)',
 'Soil.MEAN(Forest_Pred.Horizontal_Distance_To_Roadways)',
 'Soil.MEAN(Forest_Pred.Slope)',
 'Soil.MEAN(Forest_Pred.Wilderness_Area4)',
 'Soil.MEAN(Forest_Pred.New4)',
 'Soil.MEAN(Forest_Pred.Hill2)',
 'Soil.MEAN(Forest_Pred.New2)',
 'Soil.MEAN(Forest_Pred.Wilderness_Area2)',
 'Soil.MEAN(Forest_Pred.Horizontal_Distance_To_Hydrology)',
 'Soil.MEAN(Forest_Pred.Hillshade_9am)',
 'Soil.MEAN(Forest_Pred.Aspect)',
 'Soil.MEAN(Forest_Pred.Hillshade_3pm)',
 'Soil.MEAN(Forest_Pred.Hill1)',
 'Soil.MEAN(Forest_Pred.Hill3)',
 'Soil.MEAN(Forest_Pred.Elevation)',
 'Soil.MEAN(Forest_Pred.Wilderness_Area3)',
 'Soil.MEAN(Forest_Pred.Horizontal_Distance_To_Fire_Points)',
 'Soil.MEAN(Forest_Pred.Wilderness_Area1)']


# Let's get the features first
# Let's now look at the null values

Null_Values = pd.DataFrame(train.isnull().sum()).rename(columns = {0 : 'Total'})
Null_Values['Percentage'] = Null_Values['Total']/len(train)
Null_Values.sort_values('Percentage',ascending = False)
Fully_Null_Columns = Null_Values.loc[Null_Values['Percentage'] == 1.0]
To_Remove = Fully_Null_Columns.index


Feature = list(train.columns)
for Val in To_Remove:
    Feature.remove(Val)

>>> len(Feature)
58

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, MinMaxScaler

Pipe = Pipeline([
    ('impute',Imputer(strategy = 'median')),
    ('scaler',MinMaxScaler())
])
train = Pipe.fit_transform(train)
test = Pipe.transform(test)


######################## Hyperopt Part Begins From Here ###############################


# Let's apply hyperopt to optimize the two models that we think may do well: Random Forest and MLP
# Let's first do it for Random Forest
# Let's define the objective function for it

from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def Objective_Forest(params):
    classifier = RandomForestClassifier(**params)
    score = cross_val_score(classifier,train,Target,cv = 10,scoring = scorer)
    Best_Score = 1 - np.mean(score)
    return {'loss': Best_Score,'params':params,'status':STATUS_OK}

# Let's define the parameter space for the RandomForestClassifier
from hyperopt import hp
Param_grid = {
    'n_estimators': hp.choice('n_estimators',range(10,1000)),
    'max_depth' : hp.choice('max_depth',range(1,20)),
    'min_samples_split': hp.choice('min_samples_split',range(2,20)),
    'min_samples_leaf': hp.choice('min_samples_leaf',range(1,11)),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf',0.0,1.0),
    'max_features':hp.choice('max_features',["sqrt", "log2","None",0.2,0.5,0.8]),
    'max_leaf_nodes':hp.choice('max_leaf_nodes',range(10,150)),
    'min_impurity_decrease':hp.uniform('min_impurity_decrease',0.0,1.0),
    'class_weight':hp.choice('class_weight',[None,'balanced']),
    'max_samples':hp.uniform('max_samples',0.0,1.0)
}

from hyperopt import tpe
tpe_algo = tpe.suggest


from hyperopt import Trials
bayes_trials = Trials()


from hyperopt import fmin
MAX_EVALS = 100
# Optimize
best = fmin(fn = Objective_Forest,space = Param_grid,algo = tpe_algo,max_evals = MAX_EVALS,trials = bayes_trials)

>>> [print(t['result'],end = '\n\n\n') for t in bayes_trials.trials]

{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 1, 'max_features': 'None', 'max_leaf_nodes': 33, 'max_samples': 0.4660771469206677, 'min_impurity_decrease': 0.45511437833393464, 'min_samples_leaf': 10, 'min_samples_split': 8, 'min_weight_fraction_leaf': 0.9339453161850745, 'n_estimators': 972}, 'status': 'ok'}

{'loss': nan, 'params': {'class_weight': None, 'max_depth': 11, 'max_features': 'log2', 'max_leaf_nodes': 49, 'max_samples': 0.14947278280397347, 'min_impurity_decrease': 0.2358674422658822, 'min_samples_leaf': 9, 'min_samples_split': 16, 'min_weight_fraction_leaf': 0.5935700756502073, 'n_estimators': 436}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': None, 'max_depth': 16, 'max_features': 'None', 'max_leaf_nodes': 64, 'max_samples': 0.008126252217055763, 'min_impurity_decrease': 0.5860665211910298, 'min_samples_leaf': 3, 'min_samples_split': 14, 'min_weight_fraction_leaf': 0.7589621329866701, 'n_estimators': 544}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': None, 'max_depth': 13, 'max_features': 'None', 'max_leaf_nodes': 88, 'max_samples': 0.8342507642254701, 'min_impurity_decrease': 0.29169826447891134, 'min_samples_leaf': 9, 'min_samples_split': 14, 'min_weight_fraction_leaf': 0.5732868446872494, 'n_estimators': 759}, 'status': 'ok'}


{'loss': 0.514714207538852, 'params': {'class_weight': None, 'max_depth': 4, 'max_features': 'sqrt', 'max_leaf_nodes': 104, 'max_samples': 0.10435155448150135, 'min_impurity_decrease': 0.024801820935633656, 'min_samples_leaf': 5, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.09350127980207612, 'n_estimators': 739}, 'status': 'ok'}


{'loss': 0.9642857142857143, 'params': {'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'log2', 'max_leaf_nodes': 86, 'max_samples': 0.029032222646389272, 'min_impurity_decrease': 0.4459819146508117, 'min_samples_leaf': 5, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.16673304793166255, 'n_estimators': 419}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 1, 'max_features': 'None', 'max_leaf_nodes': 18, 'max_samples': 0.4913763122828826, 'min_impurity_decrease': 0.35382231135300235, 'min_samples_leaf': 3, 'min_samples_split': 18, 'min_weight_fraction_leaf': 0.7421569901774066, 'n_estimators': 354}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 4, 'max_features': 'sqrt', 'max_leaf_nodes': 69, 'max_samples': 0.27201985914939086, 'min_impurity_decrease': 0.486936153640398, 'min_samples_leaf': 8, 'min_samples_split': 15, 'min_weight_fraction_leaf': 0.7310520866089266, 'n_estimators': 142}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': None, 'max_depth': 12, 'max_features': 'sqrt', 'max_leaf_nodes': 36, 'max_samples': 0.9771715541709761, 'min_impurity_decrease': 0.1971412468087903, 'min_samples_leaf': 9, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.8200016570398415, 'n_estimators': 34}, 'status': 'ok'}


{'loss': 0.9642857142857143, 'params': {'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': 73, 'max_samples': 0.45641569744506405, 'min_impurity_decrease': 0.8403030256419523, 'min_samples_leaf': 7, 'min_samples_split': 9, 'min_weight_fraction_leaf': 0.0701815156303528, 'n_estimators': 873}, 'status': 'ok'}


{'loss': 0.9642857142857143, 'params': {'class_weight': None, 'max_depth': 17, 'max_features': 'sqrt', 'max_leaf_nodes': 46, 'max_samples': 0.15866300388832533, 'min_impurity_decrease': 0.9297347852530089, 'min_samples_leaf': 7, 'min_samples_split': 6, 'min_weight_fraction_leaf': 0.18404233693328886, 'n_estimators': 121}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 7, 'max_features': 'None', 'max_leaf_nodes': 104, 'max_samples': 0.0367072640631847, 'min_impurity_decrease': 0.12910648344978914, 'min_samples_leaf': 2, 'min_samples_split': 15, 'min_weight_fraction_leaf': 0.3161712810846662, 'n_estimators': 767}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 3, 'max_features': 'None', 'max_leaf_nodes': 124, 'max_samples': 0.16440865223966705, 'min_impurity_decrease': 0.391904635576072, 'min_samples_leaf': 1, 'min_samples_split': 7, 'min_weight_fraction_leaf': 0.0811356314154057, 'n_estimators': 347}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 12, 'max_features': 'log2', 'max_leaf_nodes': 68, 'max_samples': 0.8502406812728349, 'min_impurity_decrease': 0.7058978690401395, 'min_samples_leaf': 2, 'min_samples_split': 16, 'min_weight_fraction_leaf': 0.7016784424128134, 'n_estimators': 938}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'log2', 'max_leaf_nodes': 99, 'max_samples': 0.23705851369580344, 'min_impurity_decrease': 0.20836965887913506, 'min_samples_leaf': 7, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.7453528956610014, 'n_estimators': 468}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': None, 'max_depth': 15, 'max_features': 'None', 'max_leaf_nodes': 114, 'max_samples': 0.7084444118326696, 'min_impurity_decrease': 0.986092424730284, 'min_samples_leaf': 3, 'min_samples_split': 14, 'min_weight_fraction_leaf': 0.30715124274867167, 'n_estimators': 743}, 'status': 'ok'}


{'loss': 0.9642857142857143, 'params': {'class_weight': 'balanced', 'max_depth': 10, 'max_features': 'sqrt', 'max_leaf_nodes': 97, 'max_samples': 0.9199683481619908, 'min_impurity_decrease': 0.34148971488668467, 'min_samples_leaf': 5, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.006984816385200432, 'n_estimators': 386}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': None, 'max_depth': 13, 'max_features': 'None', 'max_leaf_nodes': 20, 'max_samples': 0.38036460187991084, 'min_impurity_decrease': 0.8852038598514178, 'min_samples_leaf': 5, 'min_samples_split': 11, 'min_weight_fraction_leaf': 0.06166031048348186, 'n_estimators': 635}, 'status': 'ok'}


{'loss': nan, 'params': {'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'None', 'max_leaf_nodes': 52, 'max_samples': 0.8640312159272309, 'min_impurity_decrease': 0.16823848137945396, 'min_samples_leaf': 1, 'min_samples_split': 9, 'min_weight_fraction_leaf': 0.24162088495434908, 'n_estimators': 564}, 'status': 'ok'}


{'status': 'new'}


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

I ran fmin for the full number of iterations, but the result was the same. What am I doing wrong here?


1 Answer


Your loss in Objective_Forest is defined as 1 - np.mean(score), and score is evaluated via cross-validation with cross_val_score, so the loss depends entirely on the output of that call. You use scorer as the evaluation metric in cross_val_score, but you do not define it anywhere in the code you posted (it is probably defined elsewhere). Your NaN values are most likely due to the type of scoring you are using in the cross-validation.

def Objective_Forest(params):
    classifier = RandomForestClassifier(**params)
    score = cross_val_score(classifier,train,Target,cv = 10,scoring = scorer)
    Best_Score = 1 - np.mean(score)
    return {'loss': Best_Score,'params':params,'status':STATUS_OK}
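
For example, here is a minimal sketch of how the objective could be made easier to debug. It assumes plain accuracy as the metric (swap in whichever metric you actually intend) and passes error_score='raise' so that cross_val_score re-raises the underlying exception instead of silently turning it into nan scores; train and Target are taken to be the preprocessed features and labels from your question.

import numpy as np
from hyperopt import STATUS_OK
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

# Define the scorer explicitly instead of relying on one defined elsewhere.
scorer = make_scorer(accuracy_score)

def Objective_Forest(params):
    # train and Target are the preprocessed features and labels from the question.
    classifier = RandomForestClassifier(**params)
    # error_score='raise' surfaces any fit/score error instead of returning nan.
    score = cross_val_score(classifier, train, Target, cv = 10,
                            scoring = scorer, error_score = 'raise')
    Best_Score = 1 - np.mean(score)
    return {'loss': Best_Score, 'params': params, 'status': STATUS_OK}

With error_score='raise', any fold where RandomForestClassifier rejects the sampled parameter combination will raise a descriptive error, which is far easier to act on than a nan loss.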
answered 2020-07-04T14:29:28.807