2

我想定义一个通用函数

# Build a box plot of `yvar` against `xvar`, with one box per `groupvar`,
# whose whiskers are meant to reach min(y)/max(y) even when a group's IQR is 0.
#
# Arguments:
#   tmp      - data frame holding the columns to plot
#   xvar     - unquoted column name mapped to the x axis
#   yvar     - unquoted column name mapped to the y axis
#   groupvar - unquoted column name defining the box groups
# Returns the ggplot object `p`.
#
# NOTE: this is the version from the question that fails with
# "Subscript `date` has size 60 but must be size 1".
func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # Capture the unquoted column arguments as quosures so they can be
  # unquoted with `!!` inside dplyr / ggplot2 calls below
  xvar <- enquo(xvar)
  yvar <- enquo(yvar)
  groupvar <- enquo(groupvar)

  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  # (the threshold is hard-coded as 85 here; yield is rounded before the comparison)
  if( "yield" %in% names(tmp) )
  {
    tmp <- tmp %>%
      mutate_at(vars(!!yvar), ~ifelse(round(yield, 0) < 85, NA, .))
  }
  
  # Compute IQR for each year
  # (one row per `groupvar` level, column `iqr`)
  tmp_iqr <- tmp %>%
    group_by(!!groupvar) %>%
    summarise(iqr=IQR(!!yvar, na.rm = TRUE))
  
  # Intent: add a tiny random offset to `yvar` for rows whose group has IQR == 0.
  # NOTE(review): `tmp_iqr[[!!groupvar]]` unquotes a column quosure inside `[[`,
  # which expects a single name/index; inside the data mask it presumably
  # evaluates to the whole column, which matches the reported error - confirm.
  # NOTE(review): `runif(1, ...)` draws a single offset, not one per row.
  p <- ggplot(data = tmp %>% mutate_at(vars(!!yvar), ~ifelse(tmp_iqr[which(tmp_iqr[[!!groupvar]] %in% (!!xvar)),]$iqr == 0, . + runif(1, -0.01, 0.01), . )), aes(x = !!xvar, y = !!yvar))
  p <- p + stat_boxplot(aes(group = !!groupvar), na.rm = TRUE, coef = 10000)   # Trick (large unrealistic coef value) so whiskers end at min(y) & max(y)
  # Second box layer with outlier points hidden (outlier.shape = NA)
  p <- p + geom_boxplot(na.rm = TRUE, outlier.shape = NA)

  return(p)
}

该函数即使在 IQR 为 0 时,也应能绘制须线延伸到最小值/最大值的箱线图。为此,我尝试向出问题的数据添加微小的随机数(低于显著性水平),以避免 IQR = 0。

但是,我一定错过了关于 quosure 的语法,因为运行这个函数

func_boxplot2(data, date, days, date)

与数据集

structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 
7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
10L, 10L, 10L, 10L, 10L), .Label = c("2010", "2011", "2012", 
"2013", "2014", "2015", "2016", "2017", "2018", "2019"), class = c("ordered", 
"factor")), station = c("41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", 
"41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", 
"41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", 
"41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", 
"41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1"
), days = c(16, 15, 45, 26, 14, 14, 32, 7, 87, 42, 24, 23, 25, 
25, 55, 29, 29, 16, 11, 14, 58, 21, 19, 10, 10, 14, 33, 18, 10, 
7, 9, 10, 19, 7, 8, 7, 1, 5, 15, 8, 1, 4, 5, 6, 14, 6, 5, 5, 
3, 5, 19, 8, 4, 5, 3, 4, 16, 3, 1, 3), yield = c(98.4817351598173, 
49.4520547945205, 95.8561643835616, 97.6712328767123, 98.2648401826484, 
95.1598173515982, 97.8767123287671, 27.9109589041096, 98.310502283105, 
98.972602739726, 97.203196347032, 96.2100456621005, 98.7818761384335, 
96.7554644808743, 97.4954462659381, 98.8046448087432, 98.747723132969, 
98.3037340619308, 99.0525114155251, 96.1986301369863, 97.1004566210046, 
96.4954337899543, 96.3698630136986, 98.2077625570776, 96.62100456621, 
98.3675799086758, 95.6963470319635, 96.8835616438356, 93.5844748858447, 
87.8196347031963, 91.2328767123288, 92.5570776255708, 81.5182648401827, 
82.7739726027397, 90.1826484018265, 87.1461187214612, 87.2153916211293, 
92.9986338797814, 94.6948998178506, 85.5760473588342, 92.3611111111111, 
96.2204007285975, 86.3698630136986, 86.4269406392694, 87.796803652968, 
93.2762557077626, 96.6438356164384, 95.6164383561644, 71.3812785388128, 
93.7442922374429, 96.3698630136986, 97.2602739726027, 95.7876712328767, 
94.7146118721461, 87.6141552511416, 43.0821917808219, 88.6872146118722, 
92.6826484018265, 90.365296803653, 86.541095890411), environ = structure(c(5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 
3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 
3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L), .Label = c("Urbain avec très forte influence du trafic", 
"Urbain avec forte influence du trafic", "Urbain avec influence modérée du trafic", 
"Urbain avec faible influence du trafic", "Urbain avec très faible influence du trafic", 
"Industriel avec influence modérée du trafic"), class = "factor")), row.names = c(NA, 
-60L), class = c("tbl_df", "tbl", "data.frame"))

给我以下错误

 Error: Problem with `mutate()` input `days`.
x Must extract column with a single valid subscript.
x Subscript `date` has size 60 but must be size 1.
ℹ Input `days` is `(structure(function (..., .x = ..1, .y = ..2, . = ..1) ...`.

请问我的语法有什么问题?

非常感谢,

A.

======== 更新 ==========

使用建议的更新功能

# Second attempt from the question, rewritten with `{{ }}` and `across()`.
# Same interface as before: `tmp` is the data frame; `xvar`, `yvar` and
# `groupvar` are unquoted column names. Returns a ggplot object.
#
# NOTE: this version runs, but produces the wrong plot described just below:
# every row of a group ends up with the same `yvar` value.
func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, put concentrations to NA for all yields < annual_yield_thres
  # (the threshold is hard-coded as 85 here; yield is rounded before the comparison)
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  # Per group: intended to add a tiny random offset to `yvar` when the
  # group's IQR is 0
  tmp <-
    tmp %>%
    group_by({{groupvar}}) %>%
    mutate(
      across({{yvar}}, function (x) {
        # NOTE(review): the `ifelse()` test is a single value (one IQR per group),
        # and `ifelse()` returns a result shaped like its test, so only one
        # value comes back and is recycled over the whole group - this matches
        # the "first value of the year" symptom reported in the question.
        # `runif(1, ...)` likewise draws a single offset rather than one per row.
        ifelse(
          IQR({{yvar}}, na.rm = TRUE) == 0,
          x + runif(1, -0.01,0.01),
          x
        )
      })
    )

  # coef = 10000 is the large-coefficient trick so whiskers end at min(y) / max(y);
  # the second box layer hides outlier points (outlier.shape = NA)
  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    stat_boxplot(aes(group = {{groupvar}}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}

结果如下图

在此处输入图像描述

正如我在评论中所说,处理 tmp 的那段代码似乎把同一年所有行的输出都设成了该年 yvar 的第一个值,这解释了上面的图。事实上,把这段代码注释掉之后会得到下图

在此处输入图像描述

4

2 回答 2

0

ifelse 中的条件长度不正确。您可以将 ggplot 的 data 参数改为如下内容。

# Replacement for the `data` argument of ggplot(): within each group, add a
# tiny random offset to `yvar` when the group's IQR is 0.
data =
  tmp %>%
    group_by(!!groupvar) %>%
    mutate_at(
        vars(!!yvar),
        # The transformation must be a lambda (`~`): a bare expression would be
        # evaluated once, with `.` bound to the piped data frame, not the column.
        # isTRUE() guards all-NA groups, where IQR() is NA and `if` would error.
        ~ if (isTRUE(IQR(., na.rm = TRUE) == 0)) {
          # One draw per row: adding a single constant would leave the IQR at 0
          . + runif(length(.), -0.01, 0.01)
        } else {
          .
        }
    )

您对 quosure 和 !! 的用法是正确的,但建议改用更新的 {{ 运算符。

这是更新的功能

#' Box plot whose whiskers reach the data extremes, even when the IQR is 0
#'
#' Masks `yvar` where the (rounded) `yield` is below 85, then adds a tiny
#' random offset to `yvar` in every group whose IQR is exactly 0, so that a
#' box can still be drawn for that group.
#'
#' @param tmp A data frame. If it has a `yield` column, `yvar` is set to `NA`
#'   on rows where `round(yield, 0) < 85`.
#' @param xvar Unquoted column of `tmp` mapped to the x axis.
#' @param yvar Unquoted column of `tmp` mapped to the y axis.
#' @param groupvar Unquoted column of `tmp` defining one box per level.
#' @return A ggplot object.
func_boxplot2 <- function(tmp, xvar, yvar, groupvar) {
  # If variable yield exists, put yvar to NA for all yields below the threshold
  if ("yield" %in% names(tmp)) {
    tmp <- tmp %>%
      mutate(across({{ yvar }}, ~ ifelse(round(yield, 0) < 85, NA, .)))
  }

  tmp <- tmp %>%
    group_by({{ groupvar }}) %>%
    mutate(
      across({{ yvar }}, function(x) {
        # IQR() is NA when every value of the group is NA (e.g. after the yield
        # masking above); isTRUE() keeps `if` from failing on that NA.
        if (isTRUE(IQR(x, na.rm = TRUE) == 0)) {
          # One draw per row: a single shared offset would leave the IQR at 0
          x + runif(length(x), -0.01, 0.01)
        } else {
          x
        }
      })
    ) %>%
    # Do not leak the grouping into the plot data
    ungroup()

  # Large unrealistic coef so the whiskers end at min(y) and max(y);
  # the second layer draws the boxes without outlier points.
  ggplot(tmp, aes(x = {{ xvar }}, y = {{ yvar }})) +
    stat_boxplot(aes(group = {{ groupvar }}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}
func_boxplot2(data, date, days, date)

绘图结果

于 2020-11-13T16:39:17.927 回答
0

我认为你的问题得到了很好的回答。这是解决问题的一种完全不同的方法,回答了一个您可能应该问的问题:如何修改用于生成箱线图的统计数据,而不是修改数据。

通常 stat 是StatBoxplot,并且该ggproto对象的重要部分是方法compute_group。在绘图中使用时,它返回包含列的单行数据框

[1] "ymin"        "lower"       "middle"      "upper"      
[5] "ymax"        "outliers"    "notchupper"  "notchlower" 
[9] "x"           "relvarwidth" "flipped_aes" 

这些大多有一种明显的含义;唯一不明显的是outliers,它是一个列表模式列,包含一个包含异常值的数字向量。

因此,要完全不绘制异常值,您可以创建一个继承自 StatBoxplot 的 stat:它与 StatBoxplot 的行为相同,只是会修改 compute_group 的结果

# Stat derived from StatBoxplot: the whiskers are stretched to cover any
# values the parent stat flagged as outliers, and the outlier list is emptied.
NoOutlierStatBoxplot <- ggproto(
  "NoOutlierStatBoxplot", ggplot2::StatBoxplot,
  compute_group = function(..., self) {
    # Let the parent stat compute the regular box-plot summary first
    stats <- ggproto_parent(StatBoxplot, self)$compute_group(...)
    flagged <- stats$outliers[[1]]
    # Widen the whisker ends so they include the flagged values
    stats$ymin <- min(stats$ymin, flagged)
    stats$ymax <- max(stats$ymax, flagged)
    # Nothing is left to be drawn as an outlier point
    stats$outliers <- list(numeric())
    stats
  }
)

(这与您所做的并不完全相同:它仍然会在移除异常值后计算上四分位数和下四分位数。如果这对您很重要,您可能需要进行更广泛的修改。)

通过此修改,您可以从 func_boxplot2 中删除很多代码,包括对 stat_boxplot() 的调用

# Box plot of `yvar` against `xvar`, one box per `groupvar`, drawn with
# NoOutlierStatBoxplot so the whiskers cover the outliers instead of
# plotting them as points.
#   tmp      - data frame; when it has a `yield` column, `yvar` is set to NA
#              on rows where round(yield, 0) < 85
#   xvar, yvar, groupvar - unquoted column names of `tmp`
# Returns a ggplot object.
func_boxplot3 <- function(tmp, xvar, yvar, groupvar) {
  has_yield <- "yield" %in% names(tmp)
  if (has_yield) {
    # Mask the plotted variable wherever the rounded yield is below 85
    tmp <- mutate(
      tmp,
      across({{ yvar }}, function(v) ifelse(round(yield, 0) < 85, NA, v))
    )
  }

  box_layer <- geom_boxplot(
    aes(group = {{ groupvar }}),
    stat = NoOutlierStatBoxplot,
    na.rm = TRUE,
    outlier.shape = NA
  )

  ggplot(tmp, aes(x = {{ xvar }}, y = {{ yvar }})) + box_layer
}

func_boxplot3(mydf, date, days, date)

在此处输入图像描述

于 2020-11-15T14:25:46.507 回答