0

我有一个这样的数据集(但有数百个样本):

# Example data in long format: one row per (sample, class) pair.
# Twelve samples with thirteen rows each; "j" is a declared level of
# `class` that never occurs in the rows.
data <- structure(
  list(
    sample = rep(
      c("C001", "C002", "C003", "C004", "C007", "C009",
        "C011", "C012", "C014", "C015", "C016", "C018"),
      each = 13L
    ),
    count = c(
      # C001
      0L, 130L, 0L, 10L, 0L, 20L, 568L, 23L, 6L, 77L, 616L, 230734L, 177L,
      # C002
      10L, 6396L, 0L, 5747L, 0L, 208L, 115189L, 13130L, 1L, 38L, 200L,
      2604L, 3104L,
      # C003
      0L, 95476L, 0L, 3591L, 0L, 7L, 26359L, 83L, 5L, 1L, 1521L, 36004L,
      9779L,
      # C004
      12L, 852L, 0L, 13L, 5L, 329L, 152053L, 288L, 2L, 0L, 0L, 530L, 1023L,
      # C007
      57L, 84L, 98060L, 122L, 0L, 8552L, 668L, 209L, 7L, 0L, 155L, 10159L,
      4934L,
      # C009
      15L, 47L, 83L, 1L, 0L, 54L, 462L, 89L, 43L, 0L, 127476L, 2614L, 3659L,
      # C011
      12L, 1L, 1L, 1061L, 0L, 84199L, 845L, 898L, 0L, 29L, 10L, 63L, 1834L,
      # C012
      87L, 36L, 7L, 407L, 20167L, 39969L, 1429L, 51072L, 0L, 0L, 27L, 9560L,
      3643L,
      # C014
      2899L, 10L, 0L, 380L, 0L, 82L, 1543L, 55L, 765L, 25172L, 29791L,
      39805L, 922L,
      # C015
      6L, 843L, 5L, 110L, 0L, 174L, 134582L, 575L, 15L, 65L, 37L, 19240L,
      830L,
      # C016
      1L, 1L, 0L, 0L, 0L, 63L, 156446L, 22L, 1L, 15L, 76L, 9710L, 793L,
      # C018
      128L, 4L, 1L, 2L, 0L, 1904L, 199L, 98779L, 0L, 0L, 11436L, 91L, 1813L
    ),
    # Every sample lists the same 13 classes in the same order (a-i, k-n).
    class = factor(
      rep(letters[c(1:9, 11:14)], times = 12L),
      levels = letters[1:14]
    )
  ),
  row.names = c(NA, -156L),
  class = c("tbl_df", "tbl", "data.frame")
)

我想为这些数据绘制堆叠条形图:

library(tidyverse)
# Stacked bars normalised to proportions: one bar per sample, segments
# filled by class and outlined in black.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(position = "fill", colour = "black")

在此处输入图像描述

但正如您所看到的,条形图不是按顺序排列的,因此比较不同的样本并不容易。

所以我手动重新排列了样本的顺序,让它(在某种程度上)更“漂亮”:

# Hand-picked sample order; the factor levels determine the left-to-right
# order of the bars on the x axis.
data$sample <- factor(
  data$sample,
  levels = c(
    "C001", "C014", "C009", "C018", "C012", "C004",
    "C016", "C002", "C015", "C011", "C003", "C007"
  )
)

# Same proportional stacked bar chart, now drawn in the manual order.
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(position = "fill", colour = "black")

在此处输入图像描述

这可能不是最好的顺序,但比较相似样本之间的比例更容易。

最终,我想制作这样的图(使用 facet_grid),但让我们先从头开始。

在此处输入图像描述 资源

4

1 回答 1

3

并没有一种公认的最佳方法来做到这一点。您首先要做的是定义样本之间的某种相异度(差异性)度量。“1 减去相关系数”是(众多)可能的候选之一。然后您可以考虑如何根据这个度量对样本进行排序。层次聚类可以为您提供一种可能的顺序。

在以下代码中,我利用了您的示例数据有序且完整这一点。否则您可能需要做些调整。

# Order the samples by the similarity of their class profiles.

# unique samples, in order of first appearance (as plain strings so the
# result does not depend on whether data$sample is already a factor)
samples <- as.character(unique(data$sample))

# class x sample matrix of counts. Aligning by class (instead of by row
# position) keeps the comparison correct when rows are not sorted
# identically within each sample or when a (sample, class) row is absent;
# absent combinations count as 0. factor(data$class) drops class levels
# that never occur, so they do not add an all-zero row to every profile.
profile <- tapply(
  data$count,
  list(factor(data$class), factor(data$sample, levels = samples)),
  sum,
  default = 0
)

## dissimilarity measure: one minus the Pearson correlation between the
## count profiles of each pair of samples, computed in a single cor() call
dm <- 1 - cor(profile)
if (anyNA(dm)) {
  # cor() is NA for a sample whose counts are constant or contain NA;
  # fail here with a clear message rather than inside hclust().
  stop(
    "cannot compute a correlation for every pair of samples ",
    "(constant or missing counts)",
    call. = FALSE
  )
}

# single linkage clustering; hc$order is the left-to-right leaf order
hc <- hclust(as.dist(dm), method = "single")

# reorder the factor levels so that similar samples end up next to each other
data$sample <- factor(data$sample, levels = samples[hc$order])
# plot: proportional stacked bars, one per sample in the current factor
# level order, segments filled by class and outlined in black
ggplot(data, aes(x = sample, y = count, fill = class)) +
  geom_col(position = "fill", colour = "black")

有序图

于 2017-08-18T10:07:07.377 回答