2

So I have some data - gene expression in several samples - that I want to plot as a histogram, binned in a way that makes sense, and then overlay with a density curve. Something along the lines of this plot: Plotting distributions (ggplot2)

# Reference example: histogram of `rating` drawn on the density scale (not
# counts) so that a density curve can share the same y-axis.
# NOTE(review): `..density..` is the legacy notation; ggplot2 >= 3.4.0
# deprecates it in favour of after_stat(density).
ggplot(data = df, mapping = aes(x = rating)) +
  geom_histogram(
    mapping = aes(y = ..density..),  # y = density instead of count
    colour = "black",
    fill = "white",
    binwidth = 0.5
  ) +
  # Semi-transparent density curve layered over the bars.
  geom_density(fill = "#FF6666", alpha = 0.2)

enter image description here

but with the bins being set by using cut(). This is what my data looks like:

# Reproducible sample of the expression table, written as a dput()-style
# literal: 20 rows (one per gene_id) by 7 columns.
# The first ten rows carry sample_name code 8L ("GFP") and the last ten carry
# code 7L ("SRSF7"); the factor still declares all eight levels.
# `.Names` and `.Label` are the legacy deparse spellings that structure()
# maps onto the `names` and `levels` attributes.
# Row names jump from 10 to 185871 - presumably this is an excerpt (first and
# last rows) of a much larger table; confirm against the full data.
df <- structure(list(gene_id = c("0610005C13Rik", "0610007C21Rik", 
"0610007L01Rik", "0610007N19Rik", "0610007P08Rik", "0610007P14Rik", 
"0610007P22Rik", "0610008F07Rik", "0610009B14Rik", "0610009B22Rik", 
"Zwilch", "Zwint", "Zxda", "Zxdb", "Zxdc", "Zyg11a", "Zyg11b", 
"Zyx", "Zzef1", "Zzz3"), sample_name = structure(c(8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L), .Label = c("SRSF1", "SRSF2", "SRSF3", "SRSF4", "SRSF5", 
"SRSF6", "SRSF7", "GFP"), class = "factor"), fpkm = c(0, 21.7863, 
27.9083, 3.83815, 8.65554, 40.7533, 20.2477, 0, 0.670717, 41.5058, 
33.6888, 17.7709, 0.756213, 1.38326, 4.07163, 5.53123, 8.50196, 
40.8762, 2.79376, 14.6917), conf_hi = c(0, 28.6113, 33.396, 5.43846, 
10.5892, 50.3002, 25.7052, 0, 1.27673, 52.502, 40.9057, 21.8586, 
1.0982, 1.95332, 5.68713, 7.53352, 10.3355, 49.5294, 3.49622, 
17.7908), conf_lo = c(0, 14.9613, 22.4206, 2.23784, 6.72186, 
31.2064, 14.7903, 0, 0.0647031, 30.5095, 26.4719, 13.6831, 0.414229, 
0.813193, 2.45612, 3.52894, 6.66839, 32.2231, 2.0913, 11.5925
), quant_status = c("OK", "OK", "OK", "OK", "OK", "OK", "OK", 
"OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", "OK", 
"OK", "OK"), stdev = c(0, 3.4125, 2.74385, 0.800155, 0.96683, 
4.77345, 2.72875, 0, 0.3030065, 5.4981, 3.60845, 2.04385, 0.1709935, 
0.28503, 0.80775, 1.001145, 0.91677, 4.3266, 0.35123, 1.54955
)), .Names = c("gene_id", "sample_name", "fpkm", "conf_hi", "conf_lo", 
"quant_status", "stdev"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 185871L, 185872L, 185873L, 185874L, 185875L, 
185876L, 185877L, 185878L, 185879L, 185880L), class = "data.frame")

Which I bin with cut():

# Add an `expression_bin` factor that sorts every gene into an
# order-of-magnitude FPKM category. cut() closes intervals on the right by
# default, so "0.01-0.1" means (0.01, 0.1] and an fpkm of 0 lands in "<0.01".
# local() keeps the helper vectors out of the calling environment.
df.bin <- local({
  fpkm_breaks <- c(-Inf, 0.01, 0.1, 1, 10, 100, 1000, 10000, Inf)
  fpkm_labels <- c(
    "<0.01", "0.01-0.1", "0.1-1", "1-10",
    "10-100", "100-1000", "1000-10.000", ">10.000"
  )
  transform(
    df,
    expression_bin = cut(fpkm, breaks = fpkm_breaks, labels = fpkm_labels)
  )
})

and the histogram plotting is also straightforward:

# Bar chart of the number of genes per expression category, one panel per
# sample, with bars coloured by category.
p.fpkm.bin <- ggplot(df.bin, aes(x = expression_bin)) +
  # `expression_bin` is a factor (it comes from cut()), so the bars are plain
  # per-level counts. geom_bar() does exactly that; geom_histogram(stat = "bin")
  # is rejected by ggplot2 >= 2.0.0 because stat_bin needs a continuous x.
  geom_bar(aes(fill = expression_bin)) +
  # Disabled attempt at the overlay the question asks about:
  #           geom_density(aes(x=fpkm),alpha=.2, fill="#FF6666")  # Overlay with transparent density plot
  facet_wrap(~ sample_name) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  xlab("Expression category (fpkm)") +
  ylab("Number of genes") +
  scale_fill_brewer()
# Auto-print the plot when run at top level.
p.fpkm.bin

my nice plot

The question is: how do I overlay the density line? Or, alternatively, how do I define the bins inside the ggplot2 code to emulate the example plot?

4

0 回答 0