这是一个有点小众的问题,但我真的不明白。
当我运行 Tweedie GLM 时,可以通过计算 exp(link) 由 link(线性预测值)得到预测值。但是对于 Tweedie GBM(xgboost),我必须计算 exp(link)/2 才能由 link 得到预测值。我不明白为什么需要除以 2。
下面的最小可重复示例,灵感来自https://github.com/dmlc/xgboost/blob/master/R-package/demo/tweedie_regression.R上的 tweedie 回归演示
library(xgboost)
library(data.table)
library(cplm) # for insurance data
library(statmod) # for tweedie glm
data(AutoClaim)
# Auto insurance dataset analyzed by Yip and Yau (2005).
dt <- data.table(AutoClaim)
# Columns to leave out of the model matrix (identifiers, dates and the
# claim-related columns, including the response CLM_AMT5).
exclude <- c(
  "POLICYNO", "PLCYDATE", "CLM_FREQ5", "CLM_AMT5", "CLM_FLAG", "IN_YY"
)
# Build the sparse design matrix while keeping rows that contain missing
# values: na.action is switched to "na.pass" only for this call and the
# previous setting is restored right after (R's default is "na.omit").
old_opts <- options(na.action = "na.pass")
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
options(old_opts)
# Response: claim amount.
y <- dt[, CLM_AMT5]
# Training matrix for xgboost; NA entries are flagged as missing.
d_train <- xgb.DMatrix(data = x, label = y, missing = NA)
# tweedie_variance_power controls the shape of the assumed distribution:
#   - near 1: more Poisson-like, mass concentrated close to zero
#   - near 2: more gamma-like, mass spread to the right with less
#     concentration near zero
params <- list(
  objective = "reg:tweedie",
  eval_metric = "rmse",
  tweedie_variance_power = 1.4,
  max_depth = 2,
  eta = 1
)
# Fix the RNG seed before training so the run is repeatable.
set.seed(42)
# Three boosting rounds of depth-2 trees, reporting RMSE on the training set.
bst <- xgb.train(
  params = params,
  data = d_train,
  nrounds = 3,
  watchlist = list(train = d_train),
  maximize = FALSE
)
# Plot the fitted trees so the leaf values can be read off by hand.
xgb.plot.tree(model = bst)
# ---- Reproduce the GBM prediction by hand ----
# Manually reproduce the GBM prediction for the first record.
x[1, ]
# Leaf reached by record 1 in each tree (values read off the tree plot):
#   tree 1: TRAVTIME < 102, BLUEBOOK < 61645            -> 2.49922585
#   tree 2: REVOLKEDYes < -9.53674316e-07, NPOLICY < 5.5 -> 2.48586464
#   tree 3: REVOLKEDYes < -9.53674316e-07,
#           AREAUrban > -9.53674316e-07                 -> 2.36028123
leaf_values <- c(2.49922585, 2.48586464, 2.36028123)
# Sum of the leaf values only (this is NOT yet the full margin).
link_gbm <- sum(leaf_values)
link_gbm # 7.345372
# xgboost starts boosting from an initial prediction `base_score`; with a log
# link the initial margin is log(base_score). The trees are added on top of
# that, so
#   prediction = exp(log(base_score) + sum(leaves)) = base_score * exp(sum).
# NOTE(review): assumes base_score was left at 0.5 (no base_score is set in
# the training params), which matches the observed factor of 1/2 -- confirm
# for the installed xgboost version.
base_score <- 0.5
margin_gbm <- log(base_score) + link_gbm
margin_gbm # 6.652225
# This is where the "divide by 2" comes from: exp(link_gbm) * 0.5.
exp(margin_gbm) # 774.5053
# Compare with the margin and the prediction returned by the booster itself.
predict(bst, d_train, outputmargin = TRUE)[1] # expected to match margin_gbm
predict(bst, d_train)[1] # 774.5053
# Fit the comparable Tweedie GLM: same variance power (1.4) as the GBM and
# link.power = 0, i.e. a log link.
# `exclude` also drops CLM_AMT5, so the response is added back explicitly.
dt2 <- dt[, -exclude, with = FALSE]
dt2$CLM_AMT5 <- dt$CLM_AMT5
tweedie_fit <- glm(
  CLM_AMT5 ~ .,
  family = tweedie(var.power = 1.4, link.power = 0),
  data = dt2
)
summary(tweedie_fit)
# Manually compute the link (linear predictor) for the first record.
dt2[1, ]
beta <- coef(tweedie_fit)
# Design-row values of record 1, keyed by coefficient name. Numeric covariates
# carry their observed value; dummy variables that are "on" for this record
# carry 1. Coefficients not listed here contribute 0.
first_row <- c(
  "(Intercept)" = 1,
  TRAVTIME = 14,
  BLUEBOOK = 14230,
  RETAINED = 11,
  NPOLICY = 1,
  CAR_TYPESedan = 1,
  RED_CARyes = 1,
  MVR_PTS = 3,
  AGE = 60,
  YOJ = 11,
  INCOME = 67349,
  GENDERM = 1,
  JOBCLASSProfessional = 1,
  MAX_EDUCPhD = 1,
  SAMEHOME = 18,
  AREAUrban = 1
)
# A misspelled name would otherwise index to NA and silently poison the sum.
stopifnot(all(names(first_row) %in% names(beta)))
link_glm <- sum(first_row * beta[names(first_row)])
link_glm # 8.299899
# For the GLM the prediction is simply exp(link_glm): no extra factor.
exp(link_glm) # 4023.466
# Compare with the link and the prediction from glm: they are identical.
predict(tweedie_fit, type = "link")[1]
predict(tweedie_fit, type = "response")[1] # 4023.466