1

我有一个由 OTU 表组成的数据集,其中行 = 样本,列 = OTU,如下所示:

Otus <- data.frame(OTU_1 = c(0, 0, 1), OTU_2 = c(12, 0, 5), OTU_3 = c(0, 5, 3), row.names = c("S_1", "S_2", "S_3"))

我把它从上面phyloseq移到DESeq2使用phyloseq_to_deseq2命令没有问题。现在它是一个DESeq2对象,我想通过以下命令使用方差稳定转换来规范化 OTU 表:

vsd <- varianceStabilizingTransformation(DESeq2_Object, blind = TRUE)

但是,我不断收到此错误:

estimateSizeFactorsForMatrix(counts(object), locfunc = locfunc, 中的错误:每个基因至少包含一个零,无法计算对数几何平均值

在进行了广泛的研究以了解此错误后,我发现这意味着如果我的每一列(OTU_1、OTU_2、OTU_3 等)都包含零,则 DESeq2 无法计算其几何平均值。这确实是我的情况,因为我的所有列确实包含至少 1 个零,对于每一列。但是,我发现使用用于计算微分表达式的不同命令可以解决完全相同的错误。在这种情况下,解决方法是sfType = poscountsDESeq命令内应用,如下所示:

Diffs <- DESeq(DESeq2_Object, test = "Wald", fitType = "parametric", sfType = "poscounts")

但是,此命令不会填充我所追求的方差稳定矩阵,它仅计算原始 OTU 表中的微分表达式。

我已经查看了小插图并阅读了varianceStabilizingTransformation命令(使用R),但它没有sfType克服这个问题的标志。

有没有办法解决这个错误,所以我可以获得一个方差稳定的矩阵?

4

1 回答 1

1

您对方差稳定矩阵的预期目的是什么?然后你打算使用矩阵来计算差异丰度吗?在本教程中,作者(Susan Holmes 和 Joey McMurdie)参考本教程使用改变的几何平均公式进行差异丰度测试,例如

#BiocManager::install("phyloseq")
#BiocManager::install("DESeq2")
library(phyloseq)
library(DESeq2)
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> Loading required package: IRanges
#> Loading required package: GenomicRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: SummarizedExperiment
#> Loading required package: MatrixGenerics
#> Loading required package: matrixStats


# Load example data
otufile = system.file("extdata", "GP_otu_table_rand_short.txt.gz", package = "phyloseq")
mapfile = system.file("extdata", "master_map.txt", package = "phyloseq")
trefile = system.file("extdata", "GP_tree_rand_short.newick.gz", package = "phyloseq")
qiimex = import_qiime(otufile, mapfile, trefile, showProgress = FALSE)
#> Processing map file...
#> Processing otu/tax file...
#> Reading file into memory prior to parsing...
#> Detecting first header line...
#> Header is on line 2  
#> Converting input file to a table...
#> Defining OTU table... 
#> Parsing taxonomy table...
#> Processing phylogenetic tree...
#>  /Library/Frameworks/R.framework/Versions/4.1/Resources/library/phyloseq/extdata/GP_tree_rand_short.newick.gz ...
#> Warning in import_qiime(otufile, mapfile, trefile, showProgress = FALSE):
#> treefilename failed import. It will not be included.
qiimex
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 500 taxa and 26 samples ]
#> sample_data() Sample Data:       [ 26 samples by 7 sample variables ]
#> tax_table()   Taxonomy Table:    [ 500 taxa by 7 taxonomic ranks ]

# Select samples
qiimex@sam_data$SampleType
#>  [1] "Freshwater (creek)" "Freshwater (creek)" "Freshwater (creek)"
#>  [4] "Soil"               "Soil"               "Mock"              
#>  [7] "Mock"               "Mock"               "Skin"              
#> [10] "Freshwater"         "Feces"              "Skin"              
#> [13] "Tongue"             "Feces"              "Skin"              
#> [16] "Tongue"             "Ocean"              "Ocean"             
#> [19] "Ocean"              "Freshwater"         "Soil"              
#> [22] "Sediment (estuary)" "Sediment (estuary)" "Sediment (estuary)"
#> [25] "Feces"              "Feces"
qiimex_subset <- subset_samples(qiimex, SampleType == "Soil" | SampleType == "Mock")

# Convert to DESeq2 object
DESeq2_Object <- phyloseq_to_deseq2(qiimex_subset, ~ SampleType)
#> converting counts to integer mode
#> Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
#> design formula are characters, converting to factors

# calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm=TRUE){
  exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
geoMeans = apply(counts(DESeq2_Object), 1, gm_mean)
DESeq2_dds = estimateSizeFactors(DESeq2_Object, geoMeans = geoMeans)
DESeq2_dds = DESeq(DESeq2_dds, fitType="local")
#> using pre-existing size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testing

# Explore results
res = results(DESeq2_dds)
res = res[order(res$padj, na.last=NA), ]
alpha = 0.01
sigtab = res[(res$padj < alpha), ]
sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(qiimex_subset)[rownames(sigtab), ], "matrix"))
head(sigtab)
#>         baseMean log2FoldChange    lfcSE      stat       pvalue         padj
#> 184403 3316.1856     -11.604876 1.446460 -8.022948 1.032372e-15 1.827298e-13
#> 193343  820.0846     -10.147175 1.317754 -7.700354 1.356898e-14 1.200855e-12
#> 10113   316.3938      -9.216756 1.313276 -7.018143 2.248365e-12 1.326535e-10
#> 234421  238.0599     -11.167192 1.657220 -6.738510 1.600195e-11 7.080861e-10
#> 526081  259.6722      -9.346145 1.471139 -6.352999 2.111570e-10 7.474959e-09
#> 313166 1322.7565      -9.911298 1.593579 -6.219519 4.986800e-10 1.471106e-08
#>         Kingdom         Phylum               Class             Order
#> 184403 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 193343 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 10113  Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales
#> 234421 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 526081 Bacteria     Firmicutes          Clostridia     Clostridiales
#> 313166 Bacteria  Bacteroidetes         Bacteroidia     Bacteroidales
#>                    Family        Genus            Species
#> 184403    Lachnospiraceae         <NA>               <NA>
#> 193343    Lachnospiraceae    Roseburia Eubacteriumrectale
#> 10113  Enterobacteriaceae         <NA>               <NA>
#> 234421    Lachnospiraceae Ruminococcus               <NA>
#> 526081    Lachnospiraceae Oribacterium               <NA>
#> 313166     Bacteroidaceae  Bacteroides               <NA>
posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ]
posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")]
posigtab
#>          baseMean log2FoldChange    lfcSE         padj          Phylum
#> 303295 201.335216       6.506674 1.204792 1.305775e-06  Proteobacteria
#> 5552   196.834531       6.309543 1.178371 1.518862e-06  Proteobacteria
#> 236550  69.552022       8.770931 1.815173 1.840288e-05  Proteobacteria
#> 113626 844.163976       7.207704 1.503844 2.078799e-05   Acidobacteria
#> 97627  171.891263       6.169004 1.307597 2.812963e-05  Proteobacteria
#> 89763   52.875334       8.375284 1.795633 3.426524e-05  Actinobacteria
#> 306684 149.608143       5.964643 1.346820 9.322811e-05   Bacteroidetes
#> 218985  50.742401       8.315832 1.882803 9.334941e-05     Chloroflexi
#> 102382 335.108981       6.747414 1.555112 1.267530e-04  Proteobacteria
#> 206632 332.777997       7.486585 1.747926 1.553274e-04 Verrucomicrobia
#> 573607 178.082806       6.729167 1.614098 2.360541e-04   Acidobacteria
#> 593006 146.954357       7.392336 1.781983 2.469451e-04  Proteobacteria
#> 279590 151.624630       6.576054 1.637287 3.873396e-04   Acidobacteria
#> 89337   56.919233       6.692225 1.663102 3.873396e-04  Actinobacteria
#> 218035  43.876643       7.133246 1.793490 4.373432e-04  Actinobacteria
#> 368027  18.296878       6.844054 1.791678 7.384186e-04  Actinobacteria
#> 572242  62.103316       6.140545 1.616489 7.802034e-04  Planctomycetes
#> 588604 211.108277       6.589139 1.738856 7.862979e-04  Proteobacteria
#> 212596 183.109699       6.931159 1.846832 8.501324e-04   Acidobacteria
#> 237963 113.390704       7.549096 2.013483 8.501324e-04  Proteobacteria
#> 548077  23.508540       7.205060 1.921985 8.501324e-04  Actinobacteria
#> 222914 348.580255       7.393221 2.040394 1.319435e-03 Verrucomicrobia
#> 265094 519.413992       5.885380 1.636179 1.389556e-03  Actinobacteria
#> 341901  26.562097       7.381503 2.068411 1.512002e-03  Actinobacteria
#> 104310  11.372452       6.157420 1.746257 1.715256e-03  Actinobacteria
#> 561842  51.066519       5.459834 1.549686 1.715256e-03  Proteobacteria
#> 248299  16.577945       6.700539 1.906851 1.736686e-03  Actinobacteria
#> 166835 868.093822       6.189342 1.766318 1.762950e-03  Proteobacteria
#> 369734  32.758200       6.711075 1.919352 1.775080e-03  Proteobacteria
#> 295422  15.101303       6.565460 1.913842 2.221501e-03  Proteobacteria
#> 137099  25.876135       6.367122 1.867180 2.254420e-03 Verrucomicrobia
#> 254706  13.039735       6.358471 1.862435 2.254420e-03   Acidobacteria
#> 204462 137.941401       6.613294 1.963268 2.572456e-03   Acidobacteria
#> 160337  18.708823       5.893400 1.783191 3.172144e-03  Actinobacteria
#> 144065  23.284353       7.191156 2.184408 3.260211e-03  Proteobacteria
#> 113959 894.081156       6.136411 1.870146 3.325954e-03   Acidobacteria
#> 223948  10.185674       6.003017 1.833057 3.341251e-03   Acidobacteria
#> 160135   8.669323       5.764444 1.764057 3.366660e-03  Actinobacteria
#> 244840   8.505259       5.736708 1.759948 3.404914e-03  Planctomycetes
#> 240591 179.115203       5.662464 1.749499 3.628632e-03  Proteobacteria
#> 547752  22.387044       6.156040 1.918045 3.921972e-03  Proteobacteria
#> 163176  17.574738       5.803532 1.823396 4.230077e-03  Proteobacteria
#> 321885  24.312098       6.270669 2.013227 5.172611e-03  Planctomycetes
#> 329327  16.600143       5.717837 1.839840 5.212980e-03   Fibrobacteres
#> 262115  25.392319       7.316622 2.362343 5.319945e-03             SC4
#> 512616  15.661404       5.632695 1.854032 6.385192e-03  Proteobacteria
#> 261663  25.664154       4.953787 1.643615 6.812189e-03     Chloroflexi
#> 257151  16.884984       4.909367 1.658737 8.015407e-03   Bacteroidetes
#> 238929  25.265076       4.509092 1.544313 8.856346e-03 Verrucomicrobia
#> 322045   6.504135       5.355636 1.834044 8.856346e-03  Proteobacteria
#>                      Class                      Family                Genus
#> 303295 Deltaproteobacteria               Haliangiaceae                 <NA>
#> 5552   Alphaproteobacteria            Caulobacteraceae     Phenylobacterium
#> 236550  Betaproteobacteria            Oxalobacteraceae             Massilia
#> 113626  Chloracidobacteria                        <NA>                 <NA>
#> 97627   Betaproteobacteria              Comamonadaceae          Hylemonella
#> 89763       Actinobacteria                        <NA>                 <NA>
#> 306684     Sphingobacteria                        <NA>                 <NA>
#> 218985              SOGA31                        <NA>                 <NA>
#> 102382  Betaproteobacteria            Oxalobacteraceae             Massilia
#> 206632      Spartobacteria          Spartobacteriaceae                 MC18
#> 573607  Chloracidobacteria                        <NA>                 <NA>
#> 593006 Gammaproteobacteria             Sinobacteraceae                 <NA>
#> 279590       Acidobacteria                        <NA>                 <NA>
#> 89337       Actinobacteria          Pseudonocardiaceae    Actinoalloteichus
#> 218035      Actinobacteria           Microbacteriaceae                 <NA>
#> 368027      Actinobacteria                        <NA>                 <NA>
#> 572242        Planctomycea                 Gemmataceae                 <NA>
#> 588604 Alphaproteobacteria           Rhodospirillaceae                 <NA>
#> 212596        Solibacteres             Solibacteraceae CandidatusSolibacter
#> 237963 Deltaproteobacteria                        <NA>                 <NA>
#> 548077      Actinobacteria                        <NA>                 <NA>
#> 222914    Verrucomicrobiae Verrucomicrobiasubdivision3                 <NA>
#> 265094      Actinobacteria          Micromonosporaceae       Micromonospora
#> 341901      Actinobacteria             Nocardioidaceae                 <NA>
#> 104310      Actinobacteria                 Frankiaceae                 <NA>
#> 561842 Alphaproteobacteria                        <NA>                 <NA>
#> 248299      Actinobacteria                        <NA>                 <NA>
#> 166835  Betaproteobacteria            Burkholderiaceae         Burkholderia
#> 369734 Alphaproteobacteria           Rhodospirillaceae                 <NA>
#> 295422  Betaproteobacteria              Alcaligenaceae           Sutterella
#> 137099    Verrucomicrobiae                        <NA>                 <NA>
#> 254706  Chloracidobacteria                        <NA>                 <NA>
#> 204462        Solibacteres             Solibacteraceae CandidatusSolibacter
#> 160337      Actinobacteria                 Frankiaceae                 <NA>
#> 144065 Alphaproteobacteria            Acetobacteraceae                 <NA>
#> 113959                <NA>                        <NA>                 <NA>
#> 223948  Chloracidobacteria                        <NA>                 <NA>
#> 160135      Actinobacteria           Streptomycetaceae         Streptomyces
#> 244840        Planctomycea               Pirellulaceae            Pirellula
#> 240591 Alphaproteobacteria          Phyllobacteriaceae                 <NA>
#> 547752  Betaproteobacteria            Burkholderiaceae         Burkholderia
#> 163176  Betaproteobacteria              Rhodocyclaceae    Methyloversatilis
#> 321885        Planctomycea                 Gemmataceae                 <NA>
#> 329327       Fibrobacteres            Fibrobacteraceae          Fibrobacter
#> 262115                <NA>                        <NA>                 <NA>
#> 512616 Gammaproteobacteria             Sinobacteraceae                 <NA>
#> 261663             Bljii12                     Dolo_23                 <NA>
#> 257151     Sphingobacteria                        <NA>                 <NA>
#> 238929    Verrucomicrobiae                        <NA>                 <NA>
#> 322045  Betaproteobacteria            Oxalobacteraceae             Massilia

# Plot the results
library(ggplot2)
theme_set(theme_bw())
sigtabgen = subset(sigtab, !is.na(Genus))
# Phylum order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x))
# Genus order
x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x = sort(x, TRUE)
sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x))
ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) + 
  geom_vline(xintercept = 0.0, color = "gray", size = 0.5) +
  geom_point(size=6) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))

reprex 包于 2021-10-08 创建(v2.0.1)

该教程于 2021 年 5 月发布,因此它是最新的,它们基本上跳过了与您的问题相关的“中间步骤”,但如果您出于其他原因需要方差稳定矩阵,则可能的解决方案是使用Varistran应用 Anscombe 的方差稳定变换(它还提供了一些不错的绘图功能),例如

#devtools::install_github("MonashBioinformaticsPlatform/varistran")
#BiocManager::install("edgeR")
library(varistran)
#> Loading required package: grid

counts <- DESeq2_Object@assays@data$counts
design <- model.matrix(~ qiimex_subset@sam_data$SampleType)

y <- vst(counts, design=design)
#> Dispersion estimated as 0.5672283

# "y" contains the transformed counts
y
#>               CC1        CL3      Even1      Even2      Even3        SV1
#> 338272 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927  0.7855066
#> 289855 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927  0.7855066
#> 287759  0.2967411 -0.3192927 -0.3192927 -0.3192927  1.5903105  4.0624033
#> 334839 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 144065  3.2103181  6.8071310 -0.3192927 -0.3192927 -0.3192927  0.7855066
#> 18643  -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 218985  6.1359268  3.1727994 -0.3192927 -0.3192927 -0.3192927  8.2069561
#> 238997 -0.3192927  0.5037921 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 245893 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 131765 -0.3192927 -0.3192927 -0.3192927 -0.3192927  1.5903105  0.7855066
#> 536982 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 351629  5.0182785  4.5937241  5.4609218  4.0571176  2.8729479  5.3692361
#> 321069 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> 512157 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927 -0.3192927
#> ...
#> attr(,"lib.size")
#>       CC1       CL3     Even1     Even2     Even3       SV1 
#> 84395.682 57904.079  1560.595  4272.039 15659.007 38299.983 
#> attr(,"true.lib.size")
#>   CC1   CL3 Even1 Even2 Even3   SV1 
#> 31125 37942 10184  7169  6597 34353 
#> attr(,"lib.size.method")
#> [1] "TMM normalization"
#> attr(,"cpm")
#> [1] FALSE
#> attr(,"method")
#> [1] "anscombe.nb"
#> attr(,"dispersion")
#> [1] 0.5672283
plot_stability(y, counts, design=design)
#> Warning: Transformation introduced infinite values in continuous x-axis
#> Warning: Transformation introduced infinite values in continuous x-axis

plot_biplot(y)

plot_heatmap(y, n=50, cluster_samples = TRUE)
#> Registered S3 method overwritten by 'seriation':
#>   method         from 
#>   reorder.hclust vegan

reprex 包于 2021-10-08 创建(v2.0.1)

于 2021-10-08T05:55:12.577 回答