0

我试图在条形图中可视化我的方差分析和事后 Tukey 的统计测量。到目前为止,这已经解决了,但是我的字母顺序错误,“mit Downsampling”(2)栏应该是与其他人不同的,而不是第一个“ohne Sampling”。 示例可视化

我正在使用的代码:

library(ggplot2)
library(multcompView)
library(reshape2)
a.lda <- read.table(header = TRUE, text = "  rowname ohne_Sampling mit_Downsampling mit_Upsampling Gewichtung
                          1   Fold1     0.6732673        0.8390805      0.7192982  0.6732673
                          2   Fold2     0.7227723        0.8181818      0.7105263  0.7227723
                          3   Fold3     0.7100000        0.7586207      0.6842105  0.7100000
                          4   Fold4     0.6633663        0.8295455      0.7105263  0.6633663
                          5   Fold5     0.7128713        0.8750000      0.7017544  0.7128713")

#Transformation of the dataframe to get a format ggplot2 is able to use    
a.lda <- melt(a.lda, id.vars="rowname")
   
#data_summary Function
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE),
      minimum = min(x[[col]], na.rm=TRUE))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

a.sd.lda <- data_summary(a.lda, varname = "value", groupnames = "variable")

#ANOVA+Tuckey
a.anova <- aov(data=a.lda, value ~ variable)
tukey <- TukeyHSD(a.anova)
cld <- as.data.frame.list((multcompLetters4(a.anova,tukey))$variable)
#The wrong letters do already appear here
a.sd.lda$cld <- cld$Letters

因此,通过查看a.sd.lda表格,您已经可以看到错误的字母为 a、b、b、b 而不是 a、b、a、a。同样通过检查 tukey 结果,ohne Sampling、mit Upsampling 和 Gewichtung 之间没有显着差异。所以我猜这个multcompLetters4()功能导致了错误。

我会非常感谢任何建议!!!

寻找答案时,我发现了这个 stackoverflow 条目(R multcompView 包中的错误 Tukey-letter ordering),但没有一个答案能解决我的问题。

只是为了总结一下,这是可视化的代码,尽管我的代码中的错误必须在上面

#Visualization
ldaplot <- ggplot(a.sd.lda, aes(variable,value,fill=variable))+
      labs(title="LDA")+
      scale_x_discrete(guide = guide_axis(n.dodge=2))+
      coord_cartesian(ylim=c(y_min,1))+
      geom_bar(stat="identity", color="black", 
               position=position_dodge()) +
      scale_fill_brewer(palette="YlOrBr")+
      geom_text(data = a.sd.lda, aes(x = variable, y = value, label = cld), size = 5, vjust=-.5, hjust=-.7)+
      geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                    position=position_dodge(.9))+ 
      labs(x="", y="Accuracy")+
      geom_abline(aes(intercept=Akzeptanzwert,slope=0), color="red")

multcompLetter4() 函数和相关的源代码可以在这里找到:https ://rdrr.io/cran/multcompView/f/

4

2 回答 2

1

嗯,我看到您已经解决了自己的问题,但无论如何这是我的解决方案:

a.lda <- tibble::tribble(
     ~Fold,               ~Var,      ~Val,
   "Fold1",    "ohne_Sampling", 0.6732673,
   "Fold2",    "ohne_Sampling", 0.7227723,
   "Fold3",    "ohne_Sampling",      0.71,
   "Fold4",    "ohne_Sampling", 0.6633663,
   "Fold5",    "ohne_Sampling", 0.7128713,
   "Fold1", "mit_Downsampling", 0.8390805,
   "Fold2", "mit_Downsampling", 0.8181818,
   "Fold3", "mit_Downsampling", 0.7586207,
   "Fold4", "mit_Downsampling", 0.8295455,
   "Fold5", "mit_Downsampling",     0.875,
   "Fold1",   "mit_Upsampling", 0.7192982,
   "Fold2",   "mit_Upsampling", 0.7105263,
   "Fold3",   "mit_Upsampling", 0.6842105,
   "Fold4",   "mit_Upsampling", 0.7105263,
   "Fold5",   "mit_Upsampling", 0.7017544,
   "Fold1",       "Gewichtung", 0.6732673,
   "Fold2",       "Gewichtung", 0.7227723,
   "Fold3",       "Gewichtung",      0.71,
   "Fold4",       "Gewichtung", 0.6633663,
   "Fold5",       "Gewichtung", 0.7128713
  )


library(tidyverse)
library(dlookr)
library(emmeans)
library(multcomp)
library(multcompView)
library(scales)

# data summary ------------------------------------------------------------
a.sd.lda <- a.lda %>%
  group_by(Var) %>%
  dlookr::describe(Val) %>%
  ungroup()

a.sd.lda
#> # A tibble: 4 x 27
#>   variable Var            n    na  mean     sd se_mean     IQR skewness kurtosis
#>   <chr>    <chr>      <int> <int> <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
#> 1 Val      Gewichtung     5     0 0.696 0.0264 0.0118  0.0396    -0.536    -2.63
#> 2 Val      mit_Downs~     5     0 0.824 0.0423 0.0189  0.0209    -0.798     1.79
#> 3 Val      mit_Upsam~     5     0 0.705 0.0133 0.00595 0.00877   -1.12      1.46
#> 4 Val      ohne_Samp~     5     0 0.696 0.0264 0.0118  0.0396    -0.536    -2.63
#> # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
#> #   p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
#> #   p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
#> #   p100 <dbl>


# compact letter display --------------------------------------------------
emmcld <- lm(Val ~ Var, data = a.lda) %>%
  emmeans(~ Var) %>%
  cld(Letters = letters, adjust = "Tukey")

我认为这是从 Tukey 测试中获取描述性统计和紧凑型字母显示所需的所有代码。

请注意,对于绘图,我仅使用原始数据a.lda和结果输出emmcld

重现你的情节

ggplot() +
  # y-axis
  scale_y_continuous(
    name = "Accuracy",
    limits = c(NA, 1),
    breaks = pretty_breaks(),
    expand = expansion(mult = c(0, 0))
  ) +
  # x-axis
  scale_x_discrete(name = NULL) +
  # general layout
  theme_classic() +
  # colors
  scale_fill_brewer(palette = "YlOrBr") +
  # horizontal line
  geom_hline(yintercept = 0.9, color = "red") +
  # bars
  geom_bar(
    data = emmcld,
    aes(y = emmean, x = Var, fill = Var),
    color = "black",
    stat = "identity"
  ) +
  # errorbars
  geom_errorbar(data = emmcld,
                aes(
                  ymin = emmean - SE,
                  ymax = emmean + SE,
                  x = Var
                ),
                width = 0.1) +
  # letters
  geom_text(
    data = emmcld,
    aes(
      y = emmean + SE,
      x = Var,
      label = str_trim(.group)
    ),
    hjust = 0.5,
    vjust = -0.5
  ) +
  # caption
  labs(
    caption = str_wrap(
      "Bars with errorbars represent (estimated marginal) means ± standard error. Means not sharing any letter are significantly different by the Tukey-test at the 5% level of significance.",
      width = 90
    )
  )

替代情节建议

..因为有些人不喜欢这篇博文中描述的“炸药情节”

# optional: sort factor levels of groups column according to highest mean
# ...in means table
emmcld <- emmcld %>%
  mutate(Var = fct_reorder(Var, emmean))

# ...in data table
a.lda <- a.lda %>%
  mutate(Var = fct_relevel(Var, levels(emmcld$group)))

# base plot setup
ggplot() +
  # y-axis
  scale_y_continuous(limits = c(NA, 1),
                     name = "Accuracy",
                     breaks = pretty_breaks()) +
  # x-axis
  scale_x_discrete(name = NULL) +
  # general layout
  theme_classic() +
  # colors
  scale_fill_brewer(palette = "YlOrBr", guide = "none") +
  scale_color_brewer(palette = "YlOrBr", guide = "none") +
  # horizontal line
  geom_hline(yintercept = 0.9,
             color = "red",
             linetype = "dashed") +
  # black data points
  geom_point(
    data = a.lda,
    aes(y = Val, x = Var, fill = Var),
    shape = 21,
    alpha = 0.9,
    position = position_nudge(x = -0.2)
  ) +
  # black boxplot
  geom_boxplot(
    data = a.lda,
    aes(y = Val, x = Var, fill = Var),
    width = 0.05,
    outlier.shape = NA,
    position = position_nudge(x = -0.1)
  ) +
  # red mean value
  geom_point(data = emmcld,
             aes(y = emmean, x = Var),
             size = 2,
             color = "red") +
  # red mean errorbar
  geom_errorbar(
    data = emmcld,
    aes(ymin = lower.CL, ymax = upper.CL, x = Var),
    width = 0.05,
    color = "red"
  ) +
  # red letters
  geom_text(
    data = emmcld,
    aes(
      y = emmean,
      x = Var,
      label = str_trim(.group)
    ),
    position = position_nudge(x = 0.1),
    hjust = 0,
    color = "red"
  ) +
  # caption
  labs(
    caption = str_wrap(
      "Black dots represent raw data. Red dots and error bars represent (estimated marginal) means ± 95% confidence interval per group. Means not sharing any letter are significantly different by the Tukey-test at the 5% level of significance.",
      width = 90
    )
  )

reprex 包于 2022-01-27 创建(v2.0.1)

于 2022-01-27T13:27:21.127 回答
0

好吧,在尝试了很多之后,我找到了一种通过另一个包解决这个问题的方法:

multcomp

所以这是我获得正确结果和绘图的代码:

library(ggplot2)
library(multcomp)
library(reshape2)


a.lda <- read.table(header = TRUE, text = "  rowname ohne_Sampling mit_Downsampling mit_Upsampling Gewichtung
                          1   Fold1     0.6732673        0.8390805      0.7192982  0.6732673
                          2   Fold2     0.7227723        0.8181818      0.7105263  0.7227723
                          3   Fold3     0.7100000        0.7586207      0.6842105  0.7100000
                          4   Fold4     0.6633663        0.8295455      0.7105263  0.6633663
                          5   Fold5     0.7128713        0.8750000      0.7017544  0.7128713")

#Transformation of the dataframe to get a format ggplot2 is able to use    
a.lda <- melt(a.lda, id.vars="rowname")

#data_summary Function
data_summary <- function(data, varname, groupnames){
  require(plyr)
  summary_func <- function(x, col){
    c(mean = mean(x[[col]], na.rm=TRUE),
      sd = sd(x[[col]], na.rm=TRUE),
      minimum = min(x[[col]], na.rm=TRUE))
  }
  data_sum<-ddply(data, groupnames, .fun=summary_func,
                  varname)
  data_sum <- rename(data_sum, c("mean" = varname))
  return(data_sum)
}

a.sd.lda <- data_summary(a.lda, varname = "value", groupnames = "variable")


#ANOVA+Tuckey **NEW**
a.anova <- aov(data=a.lda, value ~ variable)
tukey <- glht(a.anova, linfct = mcp(variable = "Tukey"))
tuk.cld <- cld(tukey)
a.sd.lda$cld <- tuk.cld$mcletters$Letters

#Visualization
ldaplot <- ggplot(a.sd.lda, aes(variable,value,fill=variable))+
  labs(title="LDA")+
  scale_x_discrete(guide = guide_axis(n.dodge=2))+
  coord_cartesian(ylim=c(y_min,1))+
  geom_bar(stat="identity", color="black", 
           position=position_dodge()) +
  scale_fill_brewer(palette="YlOrBr")+
  geom_text(data = a.sd.lda, aes(x = variable, y = value, label = cld), size = 5, vjust=-.5, hjust=-.7)+
  geom_errorbar(aes(ymin=value-sd, ymax=value+sd), width=.2,
                position=position_dodge(.9))+ 
  labs(x="", y="Accuracy")+
  geom_abline(aes(intercept=Akzeptanzwert,slope=0), color="red")

此解决方案的想法来自:https ://stats.stackexchange.com/questions/31547/how-to-obtain-the-results-of-a-tukey-hsd-post-hoc-test-in-a-table -显示组

希望它也可以帮助其他 R 用户:)

于 2022-01-27T08:47:41.887 回答