样本数据:
pp.inc <- structure(list(has.di.rec.pp = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), m.dist.km2 = c(-34.4150009155273, 6.80600023269653, -6.55499982833862,
-61.7700004577637, 15.6840000152588, -11.2869997024536, -26.9729995727539,
0, 81.9940032958984, -35.1459999084473, -12.5179996490479, 0,
21.5919990539551, 81.9940032958984, -20.7770004272461, 85.9469985961914,
-15.2959995269775, -75.5879974365234, 81.9940032958984, 3.04999995231628,
-17.1490001678467, -25.806999206543, -16.0060005187988, -14.91100025177,
-12.9020004272461, -16.0060005187988, 5.44000005722046, -34.4150009155273,
81.9940032958984, 3.61400008201599, 13.7379999160767, 2.71300005912781,
4.31300020217896), treated = c(0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
1, 1)), .Names = c("has.di.rec.pp", "m.dist.km2", "treated"), row.names = c(NA,
-33L), class = c("data.table", "data.frame"))
代码:
library(data.table)
library(ggplot2)
rddplot <- function(data, outcome, runvar, treatment = treated, span, bw, ...){
data <- data.table(data)
data.span <- data[abs(runvar) <= span, ]
data.span <- data.span[ , bins := cut(runvar,
seq(-span, span, by = bw),
include.lowest = TRUE, right = FALSE)]
data.span.plot <- data.span[ , list(avg.outcome = mean(outcome),
avg.runvar = mean(runvar),
treated = max(treatment),
n.iid = length(outcome)), keyby = bins]
data.span.plot <- data.span.plot[ , runvar := head(seq(-span, span, by = bw), -1)]
bp <- ggplot(data = data.span.plot, aes(x = runvar, y = avg.outcome))
bp <- bp + geom_point(aes(colour = n.iid))
bp <- bp + stat_smooth(data = data.span, aes(x = runvar, y = outcome,
group = factor(treatment)), ...)
bp
return(bp)
}
rddplot(pp.inc, has.di.rec.pp, m.dist.km2, treated, 50, 5)
如果我不将其包装在函数中,则此代码运行完美。我是 R 的新手,只是很少使用它。我究竟做错了什么?我是否遗漏了一些明显的东西,还是与data.table
or有关ggplot2
?我认为这可能与 ggplot 有关,因为其他问题提到存在问题并且aes_string
应该使用。我可以重写data.table
部分以使用基本功能。但我认为错误已经发生在此之前,在第二行。我该如何进行这项工作?
编辑:
[原标题:R函数在eval(expr,envir,enclos)中返回错误:找不到对象'name']
我有一些时间再次查看并制定了解决方案,因此我也稍微修改了标题。使用eval()
对我来说并没有真正奏效,所以我选择了[['columname']]
选择路线。我已经放弃了data.table
(也放弃plyr
了),因此这只使用base
了 ggplot2 以外的函数。我很高兴对如何改进它提出任何意见。如果有一些基本缺陷,请告诉我。如果不是,我稍后会在我的解决方案中添加答案。
我已经更改了 bin 计算,以便在零处始终有一个断点,这是必要的。默认 binwidth 由 Silverman 规则确定。我正在考虑单独计算模型拟合并返回它,因为 ggplot 中的模型选择是有限的,但是我想不出一个很好的方法来将它用于各种不同的模型,例如 lm 或 loess,而且它并不严格必要的。我实际上想覆盖一个细条形图,显示每个 bin 中的观察次数,但发现这在 ggplot 中是不可能的(我知道这通常是一个坏主意,但有几篇发表良好的论文使用了类似的图表)。我不觉得size
这里有什么吸引人的地方,但这些确实是次要的抱怨。
感谢您让我走上正确的道路。
我的解决方案:
rddplot <- function(data, outcome, runvar, treatment = treated,
span, bw = bw.nrd0(data[[runvar]]), ...){
breaks <- c(sort(-seq(0, span, by = bw)[-1]), seq(0, span, by = bw))
data.span <- data[abs(data[[runvar]]) <= max(breaks), ]
data.span$bins <- cut(data.span[[runvar]], breaks,
include.lowest = TRUE, right = FALSE)
data.span.plot <- as.data.frame(cbind(tapply(data.span[[outcome]], data.span$bins, mean),
tapply(data.span[[runvar]], data.span$bins, mean),
tapply(data.span[[treatment]], data.span$bins, max),
tapply(data.span[[outcome]], data.span$bins, length),
tapply(data.span[[outcome]], data.span$bins, sum)))
colnames(data.span.plot) <- c("avg.outcome", "avg.runvar", "treated", "n.iid", "n.rec")
data.span.plot$runvar <- head(breaks, -1)
print(data.span.plot)
bp <- ggplot(data = data.span.plot, aes(x = runvar, y = avg.outcome))
bp <- bp + geom_point(aes(size = n.iid))
bp <- bp + stat_smooth(data = data.span, aes_string(x = runvar, y = outcome,
group = treatment), ...)
print(bp)
}
称呼:
rddplot(pp.inc, "has.di.rec.pp", "m.dist.km2", "treated", 50,
method = lm, formula = y ~ poly(x, 4, raw = TRUE))