r - 如何修改 R 程序以支持 RHadoop

Question

我是 RHadoop 和 R 的新手。我有一个普通的 R 程序，它有一个库（Methylkit）。我想知道有人可以就如何在 hadoop 上运行这个 R 程序提供一些见解。我需要在原始 R 程序中修改什么？如果有人给我一些想法，那将真的很有帮助。

编码：

library(methylKit)
file.list=list( "new_sample1.txt","new_sample2.txt","n_sample3.txt")
myobj=read(file.list,sample.id=list("test1","test2","ctrl1"),assembly="hg19",treatment=c(1,1,0),context="CpG", pipeline=list(fraction=TRUE,chr.col=1,start.col=2,end.col=2,
coverage.col=6,strand.col=3,freqC.col=5 ))
getMethylationStats(myobj[[1]],plot=F,both.strands=F)
pdf("sample1_statistics.pdf")
getMethylationStats(myobj[[1]],plot=T,both.strands=F)
dev.off()
getMethylationStats(myobj[[2]],plot=F,both.strands=F)
pdf("sample2_statistics.pdf")
getMethylationStats(myobj[[2]],plot=T,both.strands=F)
dev.off()
getCoverageStats(myobj[[3]],plot=F,both.strands=F)
pdf("sample3_statistics.pdf")
getMethylationStats(myobj[[3]],plot=T,both.strands=F)
dev.off()
library("graphics")
pdf("sample1_coverage.pdf")
getCoverageStats(myobj[[1]], plot = T, both.strands = F)
dev.off()
pdf("sample2_coverage.pdf")
getCoverageStats(myobj[[2]], plot = T, both.strands = F)
dev.off()
pdf("sample3_coverage.pdf")
getCoverageStats(myobj[[3]], plot = T, both.strands = F)
dev.off()
meth=unite(myobj, destrand=FALSE)
pdf("correlation.pdf")
getCorrelation(meth,plot=T)
dev.off()
pdf("cluster.pdf")
clusterSamples(meth, dist="correlation",method="ward", plot=TRUE)
dev.off()
hc <- clusterSamples(meth, dist = "correlation", method = "ward",plot = FALSE)
pdf("pca.pdf")
PCASamples(meth, screeplot = TRUE)
PCASamples(meth)
myDiff=calculateDiffMeth(meth)
write.table(myDiff, "mydiff.txt", sep='\t')
myDiff25p.hyper <-get.methylDiff(myDiff,differenc=25,qvalue=0.01,type="hyper")
myDiff25p.hyper
write.table(myDiff25p.hyper,"hyper_methylated.txt",sep='\t')
myDiff25p.hypo <-get.methylDiff(myDiff,differenc=25,qvalue=0.01,type="hypo")
myDiff25p.hypo
write.table(myDiff25p.hypo,"hypo_methylated.txt",sep='\t')
myDiff25p <-get.methylDiff(myDiff,differenc=25,qvalue=0.01)
myDiff25p
write.table(myDiff25p,"differentialy_methylated.txt",sep='\t')
diffMethPerChr(myDiff,plot=FALSE,qvalue.cutoff=0.01,meth.cutoff=25)
pdf("diffMethPerChr.pdf")
diffMethPerChr(myDiff,plot=TRUE,qvalue.cutoff=0.01,meth.cutoff=25)
dev.off()
gene.obj <- read.transcript.features(system.file("extdata","refseq.hg18.bed.txt", package = "methylKit"))
write.table(gene.obj,"gene_obj.txt", sep='\t')
annotate.WithGenicParts(myDiff25p, gene.obj)
cpg.obj <- read.feature.flank(system.file("extdata","cpgi.hg18.bed.txt", package = "methylKit"),feature.flank.name = c("CpGi","shores"))
write.table(cpg.obj,"cpg_obj.txt", sep='\t')
diffCpGann <- annotate.WithFeature.Flank(myDiff25p,cpg.obj$CpGi, cpg.obj$shores, feature.name = "CpGi",flank.name = "shores")
write.table(diffCpGann,"diffCpCann.txt", sep='\t')
diffCpGann 
promoters <- regionCounts(myobj, gene.obj$promoters)
head(promoters[[1]])
write.table(promoters,"promoters.txt", sep='\t')
diffAnn <- annotate.WithGenicParts(myDiff25p, gene.obj)
head(getAssociationWithTSS(diffAnn))
diffAnn
write.table(getAssociationWithTSS(diffAnn),"diff_ann.txt", sep='\t')
getTargetAnnotationStats(diffAnn, percentage = TRUE,precedence = TRUE)
pdf("piechart1.pdf")
plotTargetAnnotation(diffAnn, precedence = TRUE, main ="differential methylation annotation")
dev.off()
pdf("piechart2.pdf")
plotTargetAnnotation(diffCpGann, col = c("green","gray", "white"), main = "differential methylation annotation")
dev.off()
getFeatsWithTargetsStats(diffAnn, percentage = TRUE)

score 0 · Accepted Answer

*.txt 文件是否位于 hdfs 中？如果没有，请放。您可以使用 hadoop 流从 hadoop 读取数据。

line1 <- file('stdin')
open(line1)
while(length(line <- readLines(line1,n=1)) > 0) {
}

'stdin' 是来自 hadoop 流 jar 的 R 程序的输入参数。'line' 每次循环迭代时都会获取新的数据行。在 while 循环内部写下如何处理 line 的逻辑。

用于hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-streaming.jar -input hdfs_input_file1, file2,n-files -output hdfs_output_dir -file mapper_file -file reducer_file -mapper mapper.R -reducer reducer.R运行程序。

-input接受 n 输入文件。Hadoop流媒体jar一一读取并提供给stdin

r - 如何修改 R 程序以支持 RHadoop

1 回答 1

Related

Reference