0

我是这方面的新手,也看到其他人遇到过同样的错误,但不清楚该如何应用他们的解决方案。我正在尝试用 scikit-learn 的随机搜索(RandomizedSearchCV)为随机森林模型调参。使用标准网格搜索(GridSearchCV)时一切正常,但改用随机搜索后,scikit-learn 的 fit 函数会报出一个奇怪的错误并失败。如果有人能就如何解决这个问题给出建议,我将不胜感激。

这是显示错误的示例。

import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py

from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold

# Load the data set and replace missing values with the per-column mean.
data = pd.read_csv("data.csv", sep=",")
# The original called fillna()/mean() on ``SubFeAll``, a name that is never
# defined in this script (NameError). The frame that was just loaded is the
# one to fill. numeric_only=True keeps non-numeric columns (the first column
# is later used as the names column) out of the mean computation.
data = data.fillna(data.mean(numeric_only=True))
header = data.columns.values  # Use the column headers as the descriptor labels

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Split the data into: names labels of the molecules ; y the True results ;
# X the descriptors for each data point
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)
print(X.shape)

# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# The ``with`` statement closes each file on exit, so no explicit close()
# call is needed afterwards.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Ten-fold cross validation: for every fold, log the held-out targets, run a
# randomized hyper-parameter search for a random forest regressor on the
# training split, and append the best parameters to the training log.
# NOTE: this is the failing reproduction discussed in the question; the code
# is intentionally left as posted.
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
    # ``train`` / ``test`` are used to index X and y for the current fold.
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
    ytestdim = yTest.shape[0]
    i = 0
    # Append every test-split target, rounded to two decimals, one per line.
    with open (train_name, 'a') as ftrain:
        while i< ytestdim :
                 ftrain.write(str(round(yTest[i],2))+',\n')
                 i += 1
    # Redundant: the ``with`` block above has already closed the file.
    ftrain.close()

    print "\n"
    # Random forest search space for the randomized search
    print "------------------- Begining Random Forest Grid Search -------------------"
    # NOTE(review): scipy.stats.expon is a continuous distribution, so the
    # values sampled for n_estimators (and max_depth) are floats. The
    # traceback quoted after this script shows forest.py calling range() on
    # the estimator count and rejecting a float -- this line is the source of
    # the reported TypeError. An integer-valued distribution or a list of
    # integers is needed here.
    rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
    rf = RandomForestRegressor(random_state=0,n_jobs=2)
    # 20 parameter settings are sampled from the distributions above.
    RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20)
    start = time()
    # This is the call that raises in the quoted traceback.
    RfGridSearch.fit(XTrain,yTrain)

    # Report the search duration and the best random forest parameters
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
    # Because of the comma, RFtime is a 2-tuple: (elapsed seconds, number of
    # candidate settings); it is written to the log through str() below.
    RFtime = time() - start,len(RfGridSearch.grid_scores_)
    # NOTE(review): report() is not defined or imported anywhere in the
    # visible script -- presumably a helper defined elsewhere; confirm.
    report(RfGridSearch.grid_scores_)
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
    ne = RfGridSearch.best_params_['n_estimators']
    print("max_features = %s " % RfGridSearch.best_params_['max_features'])
    mf = RfGridSearch.best_params_['max_features']
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
    md = RfGridSearch.best_params_['max_depth']
    # Append the search summary for this fold to the training log.
    with open (train_name, 'a') as ftrain:
           ftrain.write("Random Forest")
           ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
           ftrain.write("Number of Trees, %s ,\n" % str(ne))
           ftrain.write("Number of feature at split, %s ,\n" % str(mf))
           ftrain.write("Max depth of tree, %s ,\n" % str(md))
     # NOTE(review): the close() below is indented by five spaces, which
     # matches neither the loop body (four) nor module level (zero); it is
     # also redundant after the ``with`` block.
     ftrain.close()

给出的错误如下

Traceback (most recent call last):
  File "rgscv.py", line 81, in <module>
    RfGridSearch.fit(XTrain,yTrain)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit
    return self._fit(X, y, sampled_params)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
    for parameters in parameter_iterable
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
    while self.dispatch_one_batch(iterator):
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
    self._dispatch(tasks)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
    job = ImmediateComputeBatch(batch)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
    self.results = batch()
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit
    for i in range(n_more_estimators):
TypeError: range() integer end argument expected, got float.

起初我以为只是漏掉了某个参数,但用完全相同的方法做普通网格搜索(GridSearchCV)却没有任何问题,代码如下。有谁能告诉我是什么导致了这个错误吗?

import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py

from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold

# Load the data set and replace missing values with the per-column mean.
data = pd.read_csv("data.csv", sep=",")
# The original took the mean of ``SubFeAll``, a name that is never defined in
# this script (NameError). The means must come from the frame being filled.
# numeric_only=True keeps non-numeric columns (the first column is later used
# as the names column) out of the mean computation.
data = data.fillna(data.mean(numeric_only=True))
header = data.columns.values  # Use the column headers as the descriptor labels

# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)

# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []

# Predictions results initialised
RFpredictions = []

metcount = 0

# Give the array from pandas to numpy
npArray = np.array(data)
print(header.shape)
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape

# Split the data into: names labels of the molecules ; y the True results ;
# X the descriptors for each data point
names = npArray[:, 0]
X = npArray[:, 1:-1].astype(float)
y = npArray[:, -1].astype(float)
X = preprocessing.scale(X)
print(X.shape)

# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"

# The ``with`` statement closes each file on exit, so no explicit close()
# call is needed afterwards.
with open(train_name, 'w') as ftrain:
    ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
    ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
    ftrain.write("Fold %d ,\n" % (metcount + 1))

with open(fi_name, 'w') as ffeatimp:
    ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")

# Ten-fold cross validation: for every fold, log the held-out targets, run an
# exhaustive grid search for a random forest regressor on the training split,
# and append the best parameters to the training log.
kf = KFold(datax, n_folds=10)
print("------------------- Begining Ten Fold Cross Validation -------------------")
for train, test in kf:
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]

    # Append every test-split target, rounded to two decimals, one per line.
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(train_name, 'a') as ftrain:
        for target in yTest:
            ftrain.write(str(round(target, 2)) + ',\n')

    print("\n")
    print("------------------- Begining Random Forest Grid Search -------------------")
    # Explicit candidate lists: every n_estimators / max_depth value is an
    # integer, which is what the forest implementation requires.
    rfparamgrid = {
        "n_estimators": [10, 20, 25, 50, 100, 1000],
        "max_features": ["auto", "sqrt", "log2"],
        "max_depth": [1, 2, 3, 5, 7, 10],
    }
    rf = RandomForestRegressor(random_state=0, n_jobs=2)
    RfGridSearch = GridSearchCV(rf, param_grid=rfparamgrid, scoring='mean_squared_error')
    start = time()
    RfGridSearch.fit(XTrain, yTrain)

    # Report the search duration and the best random forest parameters.
    # (The original indented everything from here on by five spaces instead
    # of four, which is an IndentationError inside the loop body.)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" % (time() - start, len(RfGridSearch.grid_scores_)))
    # RFtime is a 2-tuple: (elapsed seconds, number of candidate settings);
    # it is written to the log through str() below.
    RFtime = time() - start, len(RfGridSearch.grid_scores_)
    # NOTE(review): report() is not defined or imported anywhere in the
    # visible script -- presumably a helper defined elsewhere; confirm.
    report(RfGridSearch.grid_scores_)

    ne = RfGridSearch.best_params_['n_estimators']
    mf = RfGridSearch.best_params_['max_features']
    md = RfGridSearch.best_params_['max_depth']
    print("n_estimators = %d " % ne)
    print("max_features = %s " % mf)
    print("max_depth = %d " % md)

    # Append the search summary for this fold to the training log.
    with open(train_name, 'a') as ftrain:
        ftrain.write("Random Forest")
        ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
        ftrain.write("Number of Trees, %s ,\n" % str(ne))
        ftrain.write("Number of feature at split, %s ,\n" % str(mf))
        ftrain.write("Max depth of tree, %s ,\n" % str(md))
4

1 回答 1

1

估计器的数量(n_estimators)必须是整数,而您的代码(scipy.stats.expon)产生的是浮点数。为 n_estimators 提供一个只包含整数的候选值列表(或改用只产生整数的分布,例如 scipy.stats.randint),就可以了。

于 2016-05-03T23:23:28.597 回答