-1

我正在尝试编写一个 rscript,它将使用 bioconductor 包中的 biomaRt 找到 ChIP-seq 峰的注释。我正在从这里调整注释代码,https://sites.google.com/a/brown.edu/bioinformatics-in-biomed/spp-r-from-chip-seq我需要在 2.5 到 5kb 内找到 TSS 站点的结合峰。与示例网站不同,我必须在整个基因组中进行。

我知道代码适用于注释 - 目前,我将代码块复制了 22 次,而不是循环。

如果正在迭代的染色体上没有峰,我还需要设法防止脚本退出。

#!/usr/bin/Rscript --vanilla --slave

# Change to data directory
setwd("/data/met/bowtie_out/tAfiles/MLE15/2/");

# send the output to a file AND the Terminal
sink("09June2013_spp.txt", append=FALSE, split=TRUE);

# Load Libraries
library(biomaRt);
library(plyr);

load("MLE15_pooled_2.Rdata");

# y equals the SPP score. I have to truncate it after IDR analysis.

bp <- llply(region.peaks$npl, subset, y > 12.0633)

print(paste("After filtering",sum(unlist(lapply(bp,function(d) length(d$x)))),"peaks remain"));

save.image(file="09Jun13_1.RData");

# begin collecting annotation have to use mm10.

ensembl= useMart('ensembl', dataset='mmusculus_gene_ensembl', host="apr2013.archive.ensembl.org");

# need to make a for loop that will loop through all of the chromosomes and not error out if no peaks are on that chromosome.

# So for this 'for' loop in R do I need to make a list? like for (z in (c('1'....etc?

for ( z in ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'X', 'Y', 'M' ) {

# To insert the variable which I am looping on with other variables, is there something I should add like the $ in bash for loops that is specific to R? can I put the variable in quotes - like for the input to biomaRt? ex. values= "z"?

genes.chrz = getBM(attributes = c("chromosome_name", "start_position", "end_position", "strand", "description", "entrezgene"), filters = "chromosome_name", values= "z", mart = ensembl);

overlap = function(bs, ts, l)
{
    if ((bs > ts - l) && (bs < ts + l)) {
        TRUE;
    } else {
        FALSE;
    }
}

fivePrimeGenes = function(bs, ts, te, s, l, n, c, d)
{
    fivePrimeVec = logical();
    for (i in 1:length(ts)) {
            fivePrime = FALSE;
            for (j in 1:length(bs)) {
                if (s[i] == 1) {
                    fivePrime = fivePrime || overlap(bs[j], ts[i], l);
                } else {
                    fivePrime = fivePrime || overlap(bs[j], te[i], l);
                }
             }
            fivePrimeVec = c(fivePrimeVec, fivePrime);
    }
     fivePrimeVec;
}

fivePrimeGenesLogical = fivePrimeGenes(bp$chrz$x, genes.chrz$start_position, genes.chrz$end_position, genes.chrz$strand, 5000, genes.chrz$entrezgene, genes.chrz$chromosome_name, genes.chrz$description);
fivePrimeStartsPlus = genes.chrz$start_position[fivePrimeGenesLogical & genes.chrz$strand == 1]
fivePrimeStartsMinus = genes.chrz$end_position[fivePrimeGenesLogical & genes.chrz$strand == -1]
fivePrimeStarts = sort(c(fivePrimeStartsPlus, fivePrimeStartsMinus));

entrezgene <- data.frame(genes.chrz$entrezgene[fivePrimeGenesLogical]);
chromosome_name <- data.frame(genes.chrz$chromosome_name[fivePrimeGenesLogical]);
start_pos <- data.frame(genes.chrz$start_position[fivePrimeGenesLogical]);
end_pos <- data.frame(genes.chrz$end_position[fivePrimeGenesLogical]);
strand <- data.frame(genes.chrz$strand[fivePrimeGenesLogical]);
description <- data.frame(genes.chrz$description[fivePrimeGenesLogical]);


AnnotationData <- cbind(chromosome_name, entrezgene, start_pos, end_pos, strand, description);
write.table(AnnotationData, file="chrz_annotation_data.csv", row.names=FALSE, col.names=FALSE, sep="\t");

}

save.image(file="09Jun13_2.RData");

# close the output file
sink()

# clean all
rm(list=ls(all=TRUE));

quit("yes");

很抱歉提出这样一个凌乱的问题,但我认为将这些行放在那里将有助于其他试图注释其峰值文件的人。非常感谢任何帮助或建议!

谢谢!

4

1 回答 1

4

让我们获取注释;在这个阶段不需要循环染色体

library(biomaRt)
ensembl <- useMart('ENSEMBL_MART_ENSEMBL', dataset='mmusculus_gene_ensembl',
                   host="apr2013.archive.ensembl.org")
genes.chrz <- getBM(attributes = c("chromosome_name", "start_position",
                      "end_position", "strand", "description", "entrezgene"),
                    mart = ensembl)

使用GenomicRanges包中的函数会很方便,特别是将我们的注释(和我们的数据)表示为一个GRanges对象。

library(GenomicRanges)
genes <- with(genes.chrz, {
    GRanges(chromosome_name, IRanges(start_position, end_position), strand, 
            entrezgene=entrezgene, description=description)
})

看起来你对每个基因开始的 5000 nt 侧翼(5')感兴趣

flanks <- flank(genes, 5000)

看起来您在数据框中有一些峰值数据,您可以将其制成GRanges对象;从您提供的内容中很难判断。然后很容易,例如,计算与每个侧翼区域重叠的峰的数量,或将峰子集以仅包含与侧翼区域重叠的峰。

gr <- with(df, GRanges(chrom, IRanges(start, end), strand))
countOverlaps(flanks, gr)
subsetByOverlaps(gr, flanks)

GenomicRangesIRanges登陆页面提供了大量的小插曲;Bioconductor邮件列表是一个有用的资源。

但是,如果有人真的想使用马特提供的代码......我首先要删除所有这些';' (在 R 中不需要)并将代码包装到 80 列以提高可读性。然后我将常量函数定义拉到循环外并修改overlap为对向量ts而不是标量进行操作

overlap = function(bs, ts, l)
{
    ## suppose ts is a vector, bs and l scalars, then use single '&';
    ## result is a logical(length(ts)) with appropriate values TRUE or
    ## FALSE
    (bs > ts - l) & (bs < ts + l)
}

因为fivePrimeGenes我试图利用矢量化overlap- 我不必迭代ts,只需为每个调用overlap一次bs[j]。有更有效的方法可以做到这一点......我使用seq_along了这样我就不会被1:0bs长度为 0 时被抓住,并删除了函数中未使用的参数

fivePrimeGenes = function(bs, ts, te, s, l)
{
    ## use 'pre-allocate and fill' rather than copy-and-append
    fivePrimeVec <- logical(length(ts)) # initially all FALSE
    ## choose strand for all elements of ts, te;
    se <- ifelse(s == 1, ts, te)
    ## overlap is vectorized, no need to iterate over elements of se
    ## use seq_along() to avoid edge case of 0-length ts
    for (j in seq_along(bs))
        ## calculate overlaps for each element of bs
        fivePrimeVec <- fivePrimeVec | overlap(bs[j], se, l)
    fivePrimeVec
}

好的,对于主循环,让 R 创建序列 1:19,并强制转换为字符。z在查询 biomaRt 以及创建要写入数据的文件名时使用变量而不是字符串“z”。使用with是一种方便/邪恶;在这种情况下,它有助于整理代码,所以我选择使用它。fivePrimeStarts后面的计算中没有用到,所以我把它去掉了。似乎可以将 AnnotationData 创建为现有数据框的子集,而不是从子集部件中进行容易出错的组装。

for ( z in c( 1:19, "X", "Y", "M" ) )  {
    ## use the variable z, rather than the string "z"; order
    ## attributes as wanted in final representation
    attributes <- c("entrezgene", "chromosome_name", "start_position",
                    "end_position", "strand", "description",
                    "entrezgene")
    genes.chrz = getBM(attributes = attributes,
      filters = "chromosome_name", values=z, mart = ensembl)

    ## use 'with' to simplify common column selection
    fivePrimeGenesLogical = with(genes.chrz, {
        fivePrimeGenes(bp$chrz$x, start_position, end_position, strand, 5000)
    })

    ## AnnotationData as subset, rather than assembling
    AnnotationData <- genes.chrz[fivePrimeGenesLogical,, drop=FALSE]
    ## as a matrix? as.matrix(AnnotationData)

    ## use 'sprintf' to create a file name
    write.table(AnnotationData,
                file=sprintf("chr%s_annotation_data.csv", z),
                row.names=FALSE, col.names=FALSE, sep="\t")
}

我希望这会有所帮助;可能我介绍了很多错误!

于 2013-06-10T21:39:37.127 回答