我正在尝试编写一个 rscript,它将使用 bioconductor 包中的 biomaRt 找到 ChIP-seq 峰的注释。我正在从这里调整注释代码,https://sites.google.com/a/brown.edu/bioinformatics-in-biomed/spp-r-from-chip-seq我需要在 2.5 到 5kb 内找到 TSS 站点的结合峰。与示例网站不同,我必须在整个基因组中进行。
我知道代码适用于注释 - 目前,我将代码块复制了 22 次,而不是循环。
如果正在迭代的染色体上没有峰,我还需要设法防止脚本退出。
#!/usr/bin/Rscript --vanilla --slave
# Change to data directory
setwd("/data/met/bowtie_out/tAfiles/MLE15/2/");
# send the output to a file AND the Terminal
sink("09June2013_spp.txt", append=FALSE, split=TRUE);
# Load Libraries
library(biomaRt);
library(plyr);
load("MLE15_pooled_2.Rdata");
# y equals the SPP score. I have to truncate it after IDR analysis.
bp <- llply(region.peaks$npl, subset, y > 12.0633)
print(paste("After filtering",sum(unlist(lapply(bp,function(d) length(d$x)))),"peaks remain"));
save.image(file="09Jun13_1.RData");
# begin collecting annotation have to use mm10.
ensembl= useMart('ensembl', dataset='mmusculus_gene_ensembl', host="apr2013.archive.ensembl.org");
# need to make a for loop that will loop through all of the chromosomes and not error out if no peaks are on that chromosome.
# So for this 'for' loop in R do I need to make a list? like for (z in (c('1'....etc?
for ( z in ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'X', 'Y', 'M' ) {
# To insert the variable which I am looping on with other variables, is there something I should add like the $ in bash for loops that is specific to R? can I put the variable in quotes - like for the input to biomaRt? ex. values= "z"?
genes.chrz = getBM(attributes = c("chromosome_name", "start_position", "end_position", "strand", "description", "entrezgene"), filters = "chromosome_name", values= "z", mart = ensembl);
overlap = function(bs, ts, l)
{
if ((bs > ts - l) && (bs < ts + l)) {
TRUE;
} else {
FALSE;
}
}
fivePrimeGenes = function(bs, ts, te, s, l, n, c, d)
{
fivePrimeVec = logical();
for (i in 1:length(ts)) {
fivePrime = FALSE;
for (j in 1:length(bs)) {
if (s[i] == 1) {
fivePrime = fivePrime || overlap(bs[j], ts[i], l);
} else {
fivePrime = fivePrime || overlap(bs[j], te[i], l);
}
}
fivePrimeVec = c(fivePrimeVec, fivePrime);
}
fivePrimeVec;
}
fivePrimeGenesLogical = fivePrimeGenes(bp$chrz$x, genes.chrz$start_position, genes.chrz$end_position, genes.chrz$strand, 5000, genes.chrz$entrezgene, genes.chrz$chromosome_name, genes.chrz$description);
fivePrimeStartsPlus = genes.chrz$start_position[fivePrimeGenesLogical & genes.chrz$strand == 1]
fivePrimeStartsMinus = genes.chrz$end_position[fivePrimeGenesLogical & genes.chrz$strand == -1]
fivePrimeStarts = sort(c(fivePrimeStartsPlus, fivePrimeStartsMinus));
entrezgene <- data.frame(genes.chrz$entrezgene[fivePrimeGenesLogical]);
chromosome_name <- data.frame(genes.chrz$chromosome_name[fivePrimeGenesLogical]);
start_pos <- data.frame(genes.chrz$start_position[fivePrimeGenesLogical]);
end_pos <- data.frame(genes.chrz$end_position[fivePrimeGenesLogical]);
strand <- data.frame(genes.chrz$strand[fivePrimeGenesLogical]);
description <- data.frame(genes.chrz$description[fivePrimeGenesLogical]);
AnnotationData <- cbind(chromosome_name, entrezgene, start_pos, end_pos, strand, description);
write.table(AnnotationData, file="chrz_annotation_data.csv", row.names=FALSE, col.names=FALSE, sep="\t");
}
save.image(file="09Jun13_2.RData");
# close the output file
sink()
# clean all
rm(list=ls(all=TRUE));
quit("yes");
很抱歉提出这样一个凌乱的问题,但我认为将这些行放在那里将有助于其他试图注释其峰值文件的人。非常感谢任何帮助或建议!
谢谢!