我对此很陌生,并且看到其他人有同样的错误,但看不到我如何实施解决方案。我正在尝试使用来自 scikit learn 的随机网格搜索来编写随机森林机器学习方法。它适用于标准网格搜索,但当我使用随机网格搜索时,scikit learn 的拟合函数出现奇怪错误而失败。关于如何解决这个问题的任何建议都会很棒
这是显示错误的示例。
import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold
data = pd.read_csv("data.csv", sep=",")
data = SubFeAll.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor
header = data.columns.values # Ues the column headers as the descriptor labels
data.head()
# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)
# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []
# Predictions results initialised
RFpredictions = []
metcount = 0
# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
print X.shape
# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"
with open(train_name,'w') as ftrain:
ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
ftrain.write("Fold %d ,\n" %(metcount+1))
ftrain.close()
with open(fi_name,'w') as ffeatimp:
ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
ffeatimp.close()
# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
ytestdim = yTest.shape[0]
i = 0
with open (train_name, 'a') as ftrain:
while i< ytestdim :
ftrain.write(str(round(yTest[i],2))+',\n')
i += 1
ftrain.close()
print "\n"
# random forest grid search parameters
print "------------------- Begining Random Forest Grid Search -------------------"
rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
rf = RandomForestRegressor(random_state=0,n_jobs=2)
RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20)
start = time()
RfGridSearch.fit(XTrain,yTrain)
# Get best random forest parameters
print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
RFtime = time() - start,len(RfGridSearch.grid_scores_)
report(RfGridSearch.grid_scores_)
print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
ne = RfGridSearch.best_params_['n_estimators']
print("max_features = %s " % RfGridSearch.best_params_['max_features'])
mf = RfGridSearch.best_params_['max_features']
print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
md = RfGridSearch.best_params_['max_depth']
with open (train_name, 'a') as ftrain:
ftrain.write("Random Forest")
ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
ftrain.write("Number of Trees, %s ,\n" % str(ne))
ftrain.write("Number of feature at split, %s ,\n" % str(mf))
ftrain.write("Max depth of tree, %s ,\n" % str(md))
ftrain.close()
给出的错误如下
Traceback (most recent call last):
File "rgscv.py", line 81, in <module>
RfGridSearch.fit(XTrain,yTrain)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit
return self._fit(X, y, sampled_params)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit
for parameters in parameter_iterable
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch
job = ImmediateComputeBatch(batch)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__
self.results = batch()
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit
for i in range(n_more_estimators):
TypeError: range() integer end argument expected, got float.
起初我以为我只是错过了一个参数,但这种直接网格搜索的精确方法似乎没有问题。代码如下。任何人都可以向我建议是什么导致了这个错误?
import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import KFold
data = pd.read_csv("data.csv", sep=",")
data = data.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor
header = data.columns.values # Ues the column headers as the descriptor labels
data.head()
# Set the numpy global random number seed (similar effect to random_state)
np.random.seed(1)
# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []
# Predictions results initialised
RFpredictions = []
metcount = 0
# Give the array from pandas to numpy
npArray = np.array(data)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay = npArray.shape
# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
print X.shape
# Open output files
train_name = "Training.csv"
fi_name = "Feature_importance.csv"
with open(train_name,'w') as ftrain:
ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n")
ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n")
ftrain.write("Fold %d ,\n" %(metcount+1))
ftrain.close()
with open(fi_name,'w') as ffeatimp:
ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n")
ffeatimp.close()
# Begin the K-fold cross validation over ten folds
kf = KFold(datax, n_folds=10)
print "------------------- Begining Ten Fold Cross Validation -------------------"
for train, test in kf:
XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
ytestdim = yTest.shape[0]
i = 0
with open (train_name, 'a') as ftrain:
while i< ytestdim :
ftrain.write(str(round(yTest[i],2))+',\n')
i += 1
ftrain.close()
print "\n"
# random forest grid search parameters
print "------------------- Begining Random Forest Grid Search -------------------"
#rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)}
rfparamgrid = {"n_estimators": [10, 20, 25, 50, 100, 1000], "max_features": ["auto", "sqrt", "log2"], "max_depth": [1,2,3,5,7,10]}
rf = RandomForestRegressor(random_state=0,n_jobs=2)
RfGridSearch = GridSearchCV(rf,param_grid=rfparamgrid,scoring='mean_squared_error')
start = time()
RfGridSearch.fit(XTrain,yTrain)
# Get best random forest parameters
print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
RFtime = time() - start,len(RfGridSearch.grid_scores_)
report(RfGridSearch.grid_scores_)
print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
ne = RfGridSearch.best_params_['n_estimators']
print("max_features = %s " % RfGridSearch.best_params_['max_features'])
mf = RfGridSearch.best_params_['max_features']
print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
md = RfGridSearch.best_params_['max_depth']
with open (train_name, 'a') as ftrain:
ftrain.write("Random Forest")
ftrain.write("RF search time, %s ,\n" % (str(RFtime)))
ftrain.write("Number of Trees, %s ,\n" % str(ne))
ftrain.write("Number of feature at split, %s ,\n" % str(mf))
ftrain.write("Max depth of tree, %s ,\n" % str(md))
ftrain.close()