r - R ggplot2：将 kruskal Wallis 和成对 Wilcoxon 检验添加到每组和方面内具有多个组/子组的箱线图中

Question

我正在尝试将 kruskal Wallis 和成对 Wilcoxon 检验添加到图中以显示哪些组显着不同，但我在每个组和方面都有多个组/子组，这使得它变得复杂。

这是以 iris 数据集为例的 R 代码，其想法是针对不同的变量（Sepal.Length、Sepal.Width、Petal.Length、Petal.Width）在不同的处理（A、B、C）中执行 Kruskal.test ) 每个物种，以及它们之间的 wilcox.test 成对测试：

rm(list=ls(all=TRUE)); cat('\014') # clear workspace
library(tidyverse)
library(ggplot2)
library(viridis) 
library(rstatix)

data(iris)
iris$treatment <- rep(c("A","B","C"), length(iris$Species)/3)
mydf <- gather(iris,Variable,value,Sepal.Length:Petal.Width)
# change number to create more difference
mydf[mydf$treatment=="B",]$value <- mydf[mydf$treatment=="B",]$value*1.2
#mydf[mydf$treatment=="C",]$value <- mydf[mydf$treatment=="C",]$value+0.3

# do pairwise Wilcoxon test for pairwise comparisons between groups
df_wilcox <- mydf %>%
  group_by(Species,Variable) %>%
  pairwise_wilcox_test(value ~ treatment) %>%
  add_y_position(step.increase = 0.02)

# do Kruskal Wallis test to see whether or not there is statistically significant difference between three or more groups
df_kw <- compare_means(value ~ treatment, mydf, group.by = c("Species","Variable"), method="kruskal") 

# plot boxplot with wilcoxon and kruskal test results 
P <- ggplot(data=mydf,
            aes(x=treatment, y=value, fill=Variable))+
  stat_boxplot(geom = "errorbar")+geom_boxplot(outlier.shape = NA)+
  facet_wrap(~Species,nrow=1)+
  theme_bw()+
  theme(axis.text=element_text(size=12),axis.title=element_text(size=16),plot.title=element_text(size=20)) +
  theme(strip.text = element_text(size=14))+
  scale_fill_viridis(discrete = TRUE) +
  guides(fill=guide_legend(title="Variable"))+
  stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02)
  #stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02,hide.ns=T) #hide non-significant

# change legend title and wilcoxon test color  
ggpar(P,legend.title = "Wilcoxon test",palette = c("#440154FF","#3B528BFF","#21908CFF","#FDE725FF"))

这将产生以下图：

为了改善这个数字，我想：

自动将“df_kw”中的 Kruskal 测试结果作为文本添加到图中，并且仅显示显着的 p 值（例如 KW（petal.length）p = 0.003）
使不同变量（例如花瓣/花瓣长度/宽度）的处理（例如“A”、“B”、“C”）之间的威尔克森线看起来整齐（例如，所有在箱线图的顶部，具有一致的行距）
使 wilcoxon 测试线的颜色与箱线图的颜色相同（当 wilcoxon 测试变量小于实际变量时，如果我隐藏非显着性，现在 'ggpar' 并不总是有效）

我被困在这里，想知道有人有解决方案吗？非常感谢！

score 0 · Accepted Answer

我可以回答您关于如何自动将 pvalues 标签添加到图中的问题的第一部分。一种方法是合并mydf，df_kw以便df_kw包括所有与mydf. 在这里，我使用这样的data.table包来做到这一点：

setDT(mydf); setDT(df_kw) # convert to data.tables by reference

df_kw <- mydf[df_kw, mult = "first", on = c("Variable", "Species"), nomatch=0L] #creates data table with the same columns as mydf
df_kw <- df_kw[df_kw$p < 0.05,] #removes non-significant values

然后您可以使用自动添加标签geom_text。我会生成一个值的字符向量来首先像这样定位标签：

y_lab_placement <-  c(sort(rep(seq(max(mydf$value)*1.25, by = -0.35, length.out = length(unique(mydf$Variable))),
                                length(unique(mydf$Species))), decreasing = T)) # creates y values of where to place the labels
y_lab_placement <- y_lab_placement[1:nrow(df_kw)] # adjusts length of placements to the length of significant values

然后我会将此行添加到您的 ggplot 以添加标签：

geom_text(data = df_kw, aes(x = 2 , y = y_lab_placement, label = c(paste(Variable, "KW p ~" , round(p, 5)))))+ #adds labels to the plot based on your data

这是您的整个代码块，包括这些版本。

rm(list=ls(all=TRUE)); cat('\014') # clear workspace
library(tidyverse)
library(ggplot2)
library(viridis) 
library(rstatix)
library(data.table) # used in creating combined data table

data(iris)
iris$treatment <- rep(c("A","B","C"), length(iris$Species)/3)
mydf <- gather(iris,Variable,value,Sepal.Length:Petal.Width)
# change number to create more difference
mydf[mydf$treatment=="B",]$value <- mydf[mydf$treatment=="B",]$value*1.2
#mydf[mydf$treatment=="C",]$value <- mydf[mydf$treatment=="C",]$value+0.3

# do pairwise Wilcoxon test for pairwise comparisons between groups
df_wilcox <- mydf %>%
  group_by(Species,Variable) %>%
  pairwise_wilcox_test(value ~ treatment) %>%
  add_y_position(step.increase = 0.02)

# do Kruskal Wallis test to see whether or not there is statistically significant difference between three or more groups
df_kw <- compare_means(value ~ treatment, mydf, group.by = c("Species","Variable"), method="kruskal") 

setDT(mydf); setDT(df_kw) # convert to data.tables by reference

df_kw <- mydf[df_kw, mult = "first", on = c("Variable", "Species"), nomatch=0L] #creates data table with the same columns as mydf
df_kw <- df_kw[df_kw$p < 0.05,] #removes non-significant values 

# plot boxplot with wilcoxon and kruskal test results 
y_lab_placement <-  c(sort(rep(seq(max(mydf$value)*1.25, by = -0.35, length.out = length(unique(mydf$Variable))),
                                length(unique(mydf$Species))), decreasing = T)) # creates y values of where to place the labels
y_lab_placement <- y_lab_placement[1:nrow(df_kw)] # adjusts length of placements to the length of significant values

P <- ggplot(data=mydf,
            aes(x=treatment, y=value, fill=Variable))+
  stat_boxplot(geom = "errorbar")+geom_boxplot(outlier.shape = NA)+
  facet_wrap(~Species,nrow=1)+
  theme_bw()+
  theme(axis.text=element_text(size=12),axis.title=element_text(size=16),plot.title=element_text(size=20)) +
  theme(strip.text = element_text(size=14))+
  scale_fill_viridis(discrete = TRUE) +
  guides(fill=guide_legend(title="Variable"))+
  geom_text(data = df_kw, aes(x = 2 , y = y_lab_placement, label = c(paste(Variable, "KW p ~" , round(p, 5)))))+ #adds labels to the plot based on your data
  stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02)
#stat_pvalue_manual(df_wilcox,color ="Variable",step.group.by="Variable",tip.length = 0,step.increase = 0.02,hide.ns=T) #hide non-significant

# change legend title and wilcoxon test color  
ggpar(P,legend.title = "Wilcoxon test",palette = c("#440154FF","#3B528BFF","#21908CFF","#FDE725FF"))

r - R ggplot2：将 kruskal Wallis 和成对 Wilcoxon 检验添加到每组和方面内具有多个组/子组的箱线图中

1 回答 1

Related

Reference