I'm currently reading Peter Harrington's "Machine Learning in Action". I'm trying to adapt some of the book's AdaBoost code to implement Gradient Boosting, following the pseudocode for Friedman's gradient boosting algorithm in "The Elements of Statistical Learning" by Trevor Hastie et al. I've put a lot of time and effort into implementing this in Python, so I would really appreciate it if you could point out where my mistake is.
stumpReg(), buildStumpReg() and gradBoostPredict() are helper functions adapted from the book. I'm using absolute error loss as the loss function. Here is the code:
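Two things the listing below assumes: the book's usual from numpy import * at the top, and an err() helper for the loss. Since I'm using absolute error loss, err() is essentially the element-wise absolute difference, something like:

from numpy import *

def err(pred, actual):
    # element-wise absolute error |actual - pred| (this is what my err() amounts to)
    return abs(actual - pred)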
"""
stumpReg
Description: Creates a simple stump by taking mean values of target
variable(classLabel) in each of 2 branches
Parameters:
dataSet - training data
classLabel - target for the prediction (dependent variable)
dim - dimension of the feature vector
thresh - threshold value
ineq - inequality('less than', 'greater than')
Returns:
retArr - the resulting array after splitting
select - boolean array that defines values in 2 branches
"""
def stumpReg(dataSet, dim, thresh, ineq):
    retArr = ones((dataSet.shape[0], 1))
    if ineq == 'lt':
        select = dataSet[:, dim] <= thresh
        retArr[select] = mean(dataSet[:, -1][select])
        retArr[~select] = mean(dataSet[:, -1][~select])
    else:
        select = dataSet[:, dim] > thresh
        retArr[select] = mean(dataSet[:, -1][~select])
        retArr[~select] = mean(dataSet[:, -1][select])
    return retArr, select
def buildStumpReg(dataSet, classLabel):
    dataSet = mat(dataSet); classLabel = mat(classLabel).T
    m, n = shape(dataSet)
    stepNum = 10.0; bestClassEst = mat(zeros((m, 1))); bestStump = {}
    minError = inf
    for i in range(n):
        minRange = dataSet[:, i].min(); maxRange = dataSet[:, i].max()
        stepSize = (maxRange - minRange) / stepNum
        for j in range(int(stepNum)):
            for ineq in ['lt', 'gt']:
                thresh = (minRange + float(j) * stepSize)
                classArr, selected = stumpReg(dataSet, i, thresh, ineq)
                errArr = err(classLabel, classArr)
                totalErr = errArr.sum()
                if totalErr < minError:
                    minError = totalErr
                    bestClassEst = classArr.copy()
                    bestSelect = selected.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = thresh
                    bestStump['ineq'] = ineq
    return bestStump, minError, bestClassEst, bestSelect
def findMinGamma(sub, x, y):
    gamma = inf
    for error in [sub.max(), sub.min(), sub.mean()]:
        if err(x + gamma, y).sum() > err(x + error, y).sum():
            gamma = error
    return gamma
def TreeBoost(dataset, classLab, numIt=19):
    N, numFeat = dataset.shape  # N is the number of training entries
    weakPredictors = []
    stump, error, classEst, sel = buildStumpReg(dataset, classLab.T)
    weakPredictors.append(classEst.T)
    gradLoss = zeros((N, 1))
    for m in range(numIt):
        gradLoss = sign(classLab.T - classEst)  # gradient of the absolute error loss function
        bestFittedStump, fittedError, f, selected = buildStumpReg(dataset, gradLoss)  # fitting a tree to target *gradLoss*
        f = mat(f)
        left = f[selected]
        yLeft = gradLoss[selected]
        right = f[~selected]
        yRight = gradLoss[~selected]
        subLeft = yLeft - left
        subRight = yRight - right
        gammaLeft = findMinGamma(subLeft, left, yLeft)
        gammaRight = findMinGamma(subRight, right, yRight)
        gamma = selected * gammaLeft + ~selected * gammaRight
        bestFittedStump['gamma'] = gamma
        weakPredictors.append(bestFittedStump)
        f += multiply(f, gamma)
    return f, weakPredictors
def gradBoostPredict(testData, weakPredictors):
    testData = mat(testData)
    m = testData.shape[0]
    pred = mat(zeros((m, 1)))
    classEst = weakPredictors[0]
    for i in range(1, len(weakPredictors)):
        classEst, select = stumpReg(testData, weakPredictors[i]['dim'],
                                    weakPredictors[i]['thresh'], weakPredictors[i]['ineq'])
        classEst += multiply(classEst, weakPredictors[i]['gamma'])
        classEst = classEst.T
    return pred
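In case it matters, I'm invoking it along these lines (the data here is synthetic, purely to illustrate the call):

# illustrative only: random features and a made-up continuous target
dataMat = random.rand(100, 3)        # 100 samples, 3 features
target = mat(dataMat.sum(axis=1))    # a continuous target as a row matrix

f, weakPredictors = TreeBoost(dataMat, target)
preds = gradBoostPredict(dataMat, weakPredictors)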
Something is very wrong here, because the predictions are way off from the actual values. I'll gladly provide more details if needed. Thanks in advance.