I am currently reading "Machine Learning in Action" by Peter Harrington. I tried to implement Gradient Boosting by reusing some of the book's AdaBoost code, following the pseudocode in "The Elements of Statistical Learning" by Trevor Hastie et al. and Friedman's gradient boosting algorithm. I have spent a lot of time and effort implementing this algorithm in Python, so I would be very grateful if you could point out where my mistakes are.

stumpReg(), buildStumpReg(), gradBoostPredict()

are helper functions adapted from the book. I am using absolute-error loss as the loss function. Here is the code:
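(The listing relies on the book's usual "from numpy import *", and it calls an err helper that I have not pasted; it is essentially just the element-wise absolute error I use as the loss:)

from numpy import *

def err(a, b):
    # element-wise absolute error |a - b|; callers sum it to get the total loss
    return abs(a - b)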

"""
stumpReg
Description: Creates a simple stump by taking mean values of target 
        variable(classLabel) in each of 2 branches
Parameters: 
dataSet - training data
classLabel - target for the prediction (dependent variable)
dim - dimension of the feature vector
thresh - threshold value
ineq - inequality('less than', 'greater than') 
Returns:
retArr - the resulting array after splitting
select - boolean array that defines values in 2 branches
"""
def stumpReg(dataSet,dim,thresh,ineq):
    retArr = ones((dataSet.shape[0],1))
    if ineq == 'lt':
        select = dataSet[:,dim] <= thresh
    else:
        select = dataSet[:,dim] > thresh
    # each branch predicts the mean target value of the rows that fall into it
    retArr[select] = mean(dataSet[:,-1][select])
    retArr[~select] = mean(dataSet[:,-1][~select])
    return retArr, select
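
(For a concrete picture of what one stump does, here is a toy call; the numbers are made up:)

toyData = mat([[1.0, 3.0],
               [2.0, 5.0],
               [3.0, 9.0],
               [4.0, 11.0]])   # last column is the target
preds, leftMask = stumpReg(toyData, 0, 2.0, 'lt')
# rows with feature 0 <= 2.0 predict mean(3, 5) = 4.0, the others predict mean(9, 11) = 10.0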

# Exhaustively searches every feature and a coarse grid of thresholds for the single
# stump whose predictions have the lowest total error against classLabel.
def buildStumpReg(dataSet,classLabel):
    dataSet = mat(dataSet); classLabel = mat(classLabel).T
    m,n = shape(dataSet)
    stepNum = 10.0; bestClassEst = mat(zeros((m,1))); bestStump = {}
    minError = inf
    for i in range(n):
        minRange = dataSet[:,i].min(); maxRange = dataSet[:,i].max()
        stepSize = (maxRange - minRange) / stepNum
        for j in range(int(stepNum)):
            for ineq in ['lt','gt']:
                thresh = (minRange + float(j) * stepSize)
                classArr, selected = stumpReg(dataSet,i,thresh,ineq) 
                errArr = err(classLabel,classArr)
                totalErr = errArr.sum()
                if totalErr < minError:
                    minError = totalErr
                    bestClassEst = classArr.copy()
                    bestSelect = selected.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = thresh
                    bestStump['ineq'] = ineq
    return bestStump, minError, bestClassEst, bestSelect

# Crude line search for the leaf constant gamma: tries the max, min and mean of the
# residuals sub and keeps whichever gives the lowest total error of x + gamma against y.
def findMinGamma(sub,x,y):
    gamma = inf
    for error in [sub.max(), sub.min(), sub.mean()]:
        if err(x+gamma,y).sum() > err(x+error,y).sum():
            gamma = error
    return gamma
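
(For what it is worth, my reading of the absolute-error case in ESL is that this line search has a closed form: the best constant to add in a leaf is the median of the residuals in that leaf. A sketch of that alternative, with a helper name I made up, would be:)

def leafGamma(y, F, leafMask):
    # for absolute-error loss, the gamma minimising sum(|y - (F + gamma)|) over a
    # leaf is the median of the residuals y - F restricted to that leaf
    residuals = asarray(y - F).ravel()
    return median(residuals[asarray(leafMask).ravel()])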

# Main boosting loop: my attempt at ESL's gradient tree boosting, with the stumps
# above as the base learners.
def TreeBoost(dataset, classLab, numIt=19):
    N, numFeat = dataset.shape # N is the number of training entries
    weakPredictors = []
    stump, error, classEst,sel = buildStumpReg(dataset,classLab.T)
    weakPredictors.append(classEst.T)
    gradLoss = zeros((N,1))
    for m in range(numIt):
        gradLoss = sign(classLab.T - classEst) # pseudo-residuals: the negative gradient of the absolute-error loss
        bestFittedStump,fittedError,f,selected = buildStumpReg(dataset,gradLoss) # fitting a tree to target *gradLoss*
        f=mat(f)
        left = f[selected]
        yLeft = gradLoss[selected]
        right = f[~selected]
        yRight = gradLoss[~selected]
        subLeft=yLeft-left
        subRight=yRight-right
        gammaLeft = findMinGamma(subLeft,left,yLeft)
        gammaRight = findMinGamma(subRight,right,yRight)
        gamma = selected*gammaLeft + ~selected*gammaRight
        bestFittedStump['gamma'] = gamma
        weakPredictors.append(bestFittedStump)
        f += multiply(f,gamma)
    return f,weakPredictors
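
(To make clear what I am aiming for, here is my reading of ESL's gradient tree boosting specialised to absolute-error loss and depth-1 trees, written as a standalone, untested sketch with plain numpy. All names are mine, X is a 2-D numpy array of features and y a 1-D array of targets, and this is deliberately independent of the code above:)

import numpy as np

def fit_stump(X, r):
    # exhaustive search for the split (feature j, threshold t) whose two leaf
    # means fit the pseudo-residuals r best in the least-squares sense
    best, best_err = None, np.inf
    for j in range(X.shape[1]):
        for t in np.unique(X[:, j]):
            left = X[:, j] <= t
            if left.all():                  # skip splits that leave the right leaf empty
                continue
            pred = np.where(left, r[left].mean(), r[~left].mean())
            sq_err = ((r - pred) ** 2).sum()
            if sq_err < best_err:
                best_err, best = sq_err, (j, t)
    return best                             # (feature index, threshold)

def lad_boost_fit(X, y, n_rounds=20, learn_rate=0.1):
    f0 = np.median(y)                       # F_0: the constant minimising absolute error
    F = np.full(len(y), f0)
    stumps = []
    for _ in range(n_rounds):
        r = np.sign(y - F)                  # pseudo-residuals of |y - F|
        j, t = fit_stump(X, r)
        left = X[:, j] <= t
        # line search: for absolute-error loss the optimal value for each leaf
        # is the median of the current residuals in that leaf
        g_left = np.median((y - F)[left])
        g_right = np.median((y - F)[~left])
        F = F + learn_rate * np.where(left, g_left, g_right)
        stumps.append((j, t, g_left, g_right))
    return f0, stumps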

# Intended to apply each stored stump to the test data and accumulate the boosted prediction.
def gradBoostPredict(testData, weakPredictors):
    testData = mat(testData)
    m = testData.shape[0]
    pred = mat(zeros((m,1)))
    classEst = weakPredictors[0]
    for i in range(1,len(weakPredictors)):
        classEst, select = stumpReg(testData, weakPredictors[i]['dim'], \
                                    weakPredictors[i]['thresh'], weakPredictors[i]['ineq'])
        classEst += multiply(classEst,weakPredictors[i]['gamma'])
        classEst = classEst.T
    return pred
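
(Continuing the standalone sketch from after TreeBoost: prediction would just replay the stored stumps with the same shrinkage, for example:)

import numpy as np

def lad_boost_predict(X, f0, stumps, learn_rate=0.1):
    # replay the fitted stumps: start from the constant f0 and add each stump's
    # leaf value, scaled by the same learning rate used during fitting
    F = np.full(X.shape[0], f0)
    for j, t, g_left, g_right in stumps:
        F = F + learn_rate * np.where(X[:, j] <= t, g_left, g_right)
    return F

# usage, with made-up train/test arrays:
# f0, stumps = lad_boost_fit(X_train, y_train, n_rounds=50)
# y_hat = lad_boost_predict(X_test, f0, stumps)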

Something is going very wrong, because the predictions are far off from the actual values. I will gladly provide more details if needed. Thanks in advance.
