我有一个大型数据集,我想编写一个自定义合并函数以与 apply 一起使用,但我无法解决某个问题。我不能使用循环,因为它会花费太长时间。数据大致是这样的;
# [ Name, Strand, Start, End ]
R1 = c( 'GeneA', '+', 1000, 1500 )
R2 = c( 'GeneA', '+', 1510, 2000 )
R3 = c( 'GeneA', '+', 2001, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )
数据是一个 data.frame 行 R1:R4
到目前为止,我可以得到一个比较 Ri 和 Rj (j = i +1) 的函数,如果 Name 相同,Strand 相同,并且它们之间的差距小于 100,则合并它们。
GAP = Ri[End] - Rj[Start]
如果我将该函数应用于每一行并建立输出。然后应该创建函数输出
R1 = c( 'GeneA', '+', 1000, 2500 )
R4 = c( 'GeneB', '-', 3100, 4000 )
我可以获得一个工作函数,可以使用 apply 将两个连续的元素合并为一个,但我不知道如何合并三个连续的元素。一个丑陋的解决方案是运行两个连续的元素合并功能,直到没有进行额外的合并,但这不是有效的。我对想法有点坚持,任何见解都将不胜感激。
编辑:澄清一下,数据按连续染色体上的起始位置排序(未显示),每个基因名称在数据集中的不同位置多次出现(即 GeneA 可以是 1000-2500,然后再次出现在 4000-5000,而且我不想合并这两个基因,只有连续的基因)。
编辑 2:我在下面使用了 Tim P 解决方案。合并的效率出现了问题。还有一种方法可以将文本文件发布到堆栈溢出,这样我就可以显示数据的真实样子,然后发布我的脚本?
# Let ValidMerge be a logical vector of MergeDown corresponding to the rows in the data
# as defined by TimP below
# The data.frame is called RMDB
# TeMerge function is previously defined to merge two rows Ri and Rj into one entry
# which spans the start of Ri to the end of Rj (with same name and strand)
COUNT = 0
RMDB.OUT = 0
while (COUNT < nrow(RMDB)) { # Cycle through all rows of RMDB
COUNT = COUNT + 1
# Is this position a merger start?
# If yes, then returns position in startpt
# which will be the end position in endpt
# If no, returns 0
Merge = match(COUNT,startpt,nomatch=0)
if ( Merge == 0 ){
# No merge starts at this position
RMDB.OUT = rbind(RMDB.OUT,RMDB[COUNT,]) # Append COUNT Row to output
}else{
# Merge COUNT row in RMDB to its endpoint
RMDB.OUT = rbind(RMDB.OUT,
TeMerge(RMDB[COUNT,],RMDB[endpt[Merge],]))
#print(paste('Merging Row',COUNT,' and Row',endpt[Merge]))
COUNT = endpt[Merge] # Move Count to the endpoint
}
}
这个脚本给出了我感兴趣的结果,唯一的问题是我的 data.frame 有 5 000 000 个条目,我想使用 GAP 大小的几个参数来运行分析以比较结果。有没有办法可以重写这部分代码以提高效率?到目前为止,一切都在合理的时间内运行(大约几分钟)。这部分已经在 700 000 个数据子集上运行了 3 个多小时。
编辑 3:所有案例都位于顶部的尖峰数据(MIR3)。忽略列 1:4,8,11:15。
dput(RMDB) 结构(列表(V1 = c(3612L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 318L, 741L, 444L, 741L, 407L, 2 407L, 656L, 2058L, 257L, 4051L, 456L, 351L, 850L, 335L, 1000L, 1566L, 236L, 588L, 3877L, 750L, 2292L, 783L, 747L, 666L, 260L, 1118L, 341L, 7010L, 320L, 7010L, 249L, 458L, 24L, 631L, 631L, 875L), V2 = c(11.4, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 18.7, 11.6, 24.9 , 28.8, 14.1, 28.8, 23.6, 18.3, 25, 7, 8.2, 23.6, 24.9, 29.5, 13.5, 19.4, 34.8, 17.4, 22.9, 27.6, 12, 26.6, 30.4, 39.2, 37..5, 19.2, 37.8, 1 , 0, 17.3, 21.2, 19.3, 0, 3.9, 26.6, 22.6), V3 = c(27, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 4.5, 0, 0, 7.3, 0.3, 7.3, 12.2, 4.9, 0, 0.4, 0, 1.8, 15.1, 12.7, 0, 3.6, 3, 7.5, 14, 9.4, 0, 14.1, 4.1, 4, 1.4, 9.4, 5.9, 2.7, 0, 9.3,1, 0, 0, 0, 1.8, 9.5), V4 = c(1.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.1, 1.1 , 3, 0.3, 3, 3, 0, 0, 0, 0, 3.6, 0.8, 2.7, 0, 2.8, 0.4, 0.8, 1.6, 6.4, 0, 3.8, 3.7, 0, 0, 2.6, 1.5, 2.5 , 2.4, 1.4, 2, 5, 0, 0, 0.6, 0.5), V5 = 结构(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), V6 = c(10469L, 20001L, 20210L, 21000L , 21201L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 24205L, 24405L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L, 32841L, 33048L, 33466L, 33529L, 34048L, 34451L, 34565L, 35217L, 35367L , 37045L, 37733L, 38059L, 38256L, 39465L, 39624L, 39925L, 40333L, 40629L,40736L, 41380L, 42370L, 43243L, 44836L, 44877L, 45887L, 46079L, 46217L, 46416L, 46553L, 46893L), V7 = c(11447L, 20200L, 20400L, 21200L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L , 24200L, 24400L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L, 33037L, 33456L, 33509L, 34041L, 34108L, 34560L, 34921L, 35366L, 35499L, 37431L, 37861L, 38191L, 39464L, 39623L, 39924L, 40294L , 40626L, 40729L, 40878L, 42285L, 42504L, 44835L, 44876L, 45753L, 45987L, 46198L, 46240L, 46493L, 46722L, 47092L), V8 = structure(c(38L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 17L, 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .标签= c(“(249203529)”,“(249203899)”,“(249204128)”,“(249204381)”,“(249204423)”,“(249204634)”,“(249204868)”,“(249205745) ”、“(249205786)”、“(249208117)”、“(249208336)”、“(249209743)”、“(249209892)”、“(249209995)”、“(249210327)”、“(249210697)”、 “(249210998)”, “(249211157)”, “(249212430)”, “(249212760)”, “(249213190)”, “(249215122)”, “(249215255)”, “(249215700)”, “( 249216061)"、"(249216513)"、"(249216580)"、"(249217112)"、"(249217165)"、"(249217584)"、"(249218867)"、"(249218888)”, “(249219186)”, “(249219490)”, “(249219669)”, “(249219773)”, “(249235266)”, “(249239174)”), 类 = “因子”), V9 =结构(c(2L,1L,1L,2L,2L,2L,2L,2L,1L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L , 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L , 1L, 2L), .Label = c("+", "C"), class = "factor"), V10 = structure(c(32L, 23L, 23L, 23L, 23L, 23L, 24L, 23L, 23L , 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L, 22L, 12L, 5L, 14L, 13L, 16L, 30L, 25L, 2L, 7L, 16L, 15L, 29L, 28L , 3L, 28L, 15L, 4L, 18L, 8L, 19L, 10L, 31L, 10L, 9L, 11L, 6L, 17L, 20L, 21L), .Label = c("AluJo",“AluJr”、“AluSx”、“AluSz6”、“AluYc”、“AT_rich”、“Charlie5”、“ERVL-E-int”、“L1M5”、“L1MA8”、“L1MA9”、“L1MB5”、“L1P1” ”、“L1PA6”、“L2a”、“L2c”、“LTR12F”、“LTR16C”、“MamRep1527”、“MER45A”、“MER58A”、“MIR”、“MIR3”、“MIR3a”、“MIRb”、 “MIRc”、“MLT1A”、“MLT1E1A”、“MLT1E1A-int”、“MLT1J2”、“(TAAA)n”、“TAR1”、“(TC)n”、“XXXXX”),类 =“因子” ), V11 = 结构(c(10L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 11L, 9L, 13L, 12L, 13L, 13L, 3升,12L, 3L, 3L, 4L, 9L, 13L, 12L, 1L, 4L, 4L, 9L, 9L, 12L, 9L, 4L, 12L, 8L, 8L, 6L, 3L, 11L, 3L, 3L, 3L, 5L, 7L, 2L, 1L), .Label = c("DNA/hAT-Charlie", "DNA/hAT-Tip100", "LINE/L1", "LINE/L2", "Low_complexity", "LTR", "LTR /ERV1"、"LTR/ERVL"、"LTR/ERVL-MaLR"、"Satellite/telo"、"Simple_repeat"、"SINE/Alu"、"SINE/MIR"、"XXXXXXXX"),class = "factor" ), V12 = 结构(c(19L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 2L, 9L, 8L, 25L, 2L, 10L, 9L, 23L, 2L, 1L, 15L, 12L, 1L, 6L, 2L, 11L, 16L, 13L, 7L, 2L, 2L, 8L, 14L, 1L, 3L, 26L, 17L, 18L, 9L, 21L, 22L, 24L, 2L, 4L, 27L, 20L), .Label = c("(0)", "1", "(113)”、“(115)”、“(119)”、“(13)”、“131”、“173”、“2”、“218”、“2234”、“(231)”、“ 2705”、“2923”、“2970”、“3242”、“359”、“3715”、“(399)”、“(4)”、“5306”、“5334”、“5746”、“6167” , "67", "(685)", "7"), class = "factor"), V13 = c(1712L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L, 143L ,143L,143L,162L,96L,349L,217L,298L,238L,216L,6174L,44L,44L,6154L,3030L,3156L,448L,448L,255L,133L,133L,133L,2623L,2623L,3375L,3375L,3375L,2846L,148.6L,1489016661616,302,302,302,302,302,302,302元,302,302元,302; , 312L, 376L, 4982L, 499L, 5305L, 41L, 6290L, 5433L, 6280L, 24L, 404L, 178L, 220L),V14 =结构(c(23L,24L,24L,24L,24L,24L,24L,24L,24L,24L,24L,24L,24L,24L,8L,1L,10L,25L,4L,13L,21L,1L, 11L, 26L, 15L, 14L, 20L, 30L, 5L, 2L, 1L, 27L, 1L, 18L, 3L, 4L, 7L, 6L, 9L, 19L, 22L, 29L, 1L, 2L, 28L, 16L, 1L, 17L, 1L, 12L), .Label = c(“(0)”, “(1)”, “(11)”, “(14)”, “(179)”, “208”, “(209) ”、“(212)”、“232”、“(25)”、“(255)”、“3”、“(30)”、“3049”、“(3116)”、“(32)”、 “327”、“(388)”、“4016”、“41”、“(46)”、“(470)”、“483”、“49”、“(51)”、“5640”、“( 573)"、"(691)"、"(838)"、"91")、类 = "因子"), V15 = c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 22L, 23L, 22L, 24L, 25L, 24L , 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 38L, 39L, 38L, 37L, 40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L , 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", “V9”、“V10”、“V11”、“V12”、“V13”、“V14”、“V15”),类 = “data.frame”,row.names = c(NA,-50L))40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", " V5”、“V6”、“V7”、“V8”、“V9”、“V10”、“V11”、“V12”、“V13”、“V14”、“V15”),class =“data.frame” ", row.names = c(NA, -50L))40L, 41L, 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", " V5”、“V6”、“V7”、“V8”、“V9”、“V10”、“V11”、“V12”、“V13”、“V14”、“V15”),class =“data.frame” ", row.names = c(NA, -50L))
这是应用 ValidMerge 后的输出。
dput(RMDB.OUT) 结构(列表(V1 = c(3612L, NA, NA, 318L, 318L, 318L, 318L, 318L, 318L, NA, 741L, 444L, 741L, 407L, 2059L, 407L, 656L, 2058L, 257L, 4051L, 456L, 351L, 850L, 335L, 1000L, 1566L, 236L, 588L, 3877L, 750L, 2292L, 783L, 747L, 666L, 260L, 1118L, 341L, 7010L, 320L, 7010L, 249L, 458L, 24L, 631L, 631L, 875L), V2 = c(11.4, NA, NA, 23, 23, 23, 23, 23, 23, NA, 18.7, 11.6, 24.9, 28.8, 14.1, 28.8, 23.6, 18.3, 25, 7 , 8.2, 23.6, 24.9, 29.5, 13.5, 19.4, 34.8, 17.4, 22.9, 27.6, 12, 26.6, 30.4, 12.9, 38.5, 35.4, 27.8, 19.2, 0, 17.3, 6, 21.2, 9, 19, 3.. , 22.6), V3 = c(27, NA, NA, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8, NA, 4.5, 0, 0, 7.3, 0.3, 7.3, 12.2, 4.9, 0, 0.4, 0, 1.8, 15.1, 12.7, 0, 3.6, 3, 7.5, 14, 9.4, 0, 14.1, 4.1, 4, 1.4, 9.4, 5.9, 2.7, 0, 9.3, 1, 0, 0, 0, 1.8, 9.5) , V4 = c(1.3, NA, NA, 0, 0, 0, 0, 0, 0, NA,0, 3.1, 1.1, 3, 0.3, 3, 3, 0, 0, 0, 0, 3.6, 0.8, 2.7, 0, 2.8, 0.4, 0.8, 1.6, 6.4, 0, 3.8, 3.7, 0, 0, 2.6, 1.5, 2.5, 2.4, 1.4, 2, 5, 0, 0, 0.6, 0.5), V5 = 结构(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L , 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L , 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr1", class = "factor"), V6 = c(10469L, 20001L, 21000L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 0L, 30855L, 30953L, 31293L, 31436L, 31734L, 32841L, 33048L, 33466L, 33529L, 34048L, 34451L, 34565L, 35217L, 35367L, 37045L, 37733L, 38059L, 38256L, 39465L, 39624L, 39925L, 40333L, 40629L, 40736L, 41380L, 42370L, 43243L, 44836L, 44877L, 45887L, 46079L, 46217L, 46416L, 46553L, 46893L), V7 = c(11447L, 20400L, 21400L, 22200L,22400L, 23200L, 23400L, 20001L, 20200L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L, 33037L, 33456L, 33509L, 34041L, 34108L, 34560L, 34921L, 35366L, 35499L, 37431L, 37861L, 38191L, 39464L, 39623L, 39924L, 40294L, 40626L, 40729L, 40878L, 42285L, 42504L, 44835L, 44876L, 45753L, 45987L, 46198L, 46240L, 46493L, 46722L, 47092L), V8 = structure(c(38L, 37L, 37L, 37L, 37L , 37L, 37L, 37L, 37L, 37L, 36L, 35L, 34L, 33L, 32L, 31L, 30L, 29L, 28L, 27L, 26L, 25L, 24L, 23L, 22L, 21L, 20L, 19L, 18L, 17L , 16L, 15L, 14L, 13L, 12L, 11L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("(249203529)", "(249203899)" , “(249204128)”, “(249204381)”, “(249204423)”, “(249204634)”, “(249204868)”, “(249205745)”, “(249205786)”, “(249208117)"、"(249208336)"、"(249209743)"、"(249209892)"、"(249209995)"、"(249210327)"、"(249210697)"、"(249210998)"、"(249211157) )”、“(249212430)”、“(249212760)”、“(249213190)”、“(249215122)”、“(249215255)”、“(249215700)”、“(249216061)”、“(249216513)” , “(249216580)”, “(249217112)”, “(249217165)”, “(249217584)”, “(249218867)”, “(249218888)”, “(249219186)”, “(249219490)”, “ (249219669)”, “(249219773)”, “(249235266)”, “(249239174)”), 类 = “因子”), V9 = 结构(c(2L, 1L, 2L, 2L, 2L, 2L, 1L , 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,1L,1L,2L,1L,2L,2L,2L,1L,1L,1L,1L,1L,1L,1L,1L,1L,2L,2L,2L,1L,1L,1L,1L,1L,1L, 1L, 2L, 1L, 2L), .Label = c("+", "C"), class = "factor"), V10 = structure(c(32L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L, 22L, 12L, 5L, 14L, 13L, 16L, 30L, 25L, 2L, 7L, 16L, 15L, 29L, 28L, 3L, 28L, 15L, 4L, 18L, 8L, 19L, 10L, 31L, 10L, 9L, 11L, 6L, 17L, 20L, 21L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", “AluYc”、“AT_rich”、“Charlie5”、“ERVL-E-int”、“L1M5”、“L1MA8”、“L1MA9”、“L1MB5”、“L1P1”、“L1PA6”、“L2a”、“L2c” ", "LTR12F", "LTR16C", "MamRep1527", "MER45A”、“MER58A”、“MIR”、“MIR3”、“MIR3a”、“MIRb”、“MIRc”、“MLT1A”、“MLT1E1A”、“MLT1E1A-int”、“MLT1J2”、“(TAAA)n ", "TAR1", "(TC)n", "XXXXX"), class = "factor"), V11 = structure(c(10L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L , 14L, 11L, 9L, 13L, 12L, 13L, 13L, 3L, 12L, 3L, 3L, 4L, 9L, 13L, 12L, 1L, 4L, 4L, 9L, 9L, 12L, 9L, 4L, 12L, 8L , 8L, 6L, 3L, 11L, 3L, 3L, 3L, 5L, 7L, 2L, 1L), .Label = c("DNA/hAT-Charlie", "DNA/hAT-Tip100", "LINE/L1" 、“LINE/L2”、“Low_complexity”、“LTR”、“LTR/ERV1”、“LTR/ERVL”、“LTR/ERVL-MaLR”、“卫星/telo”、“Simple_repeat”、“SINE/Alu", "SINE/MIR", "XXXXXXXX"), class = "factor"), V12 = structure(c(19L, NA, NA, 5L, 5L, 5L, 5L, 5L, 5L, NA, 2L , 9L, 8L, 25L, 2L, 10L, 9L, 23L, 2L, 1L, 15L, 12L, 1L, 6L, 2L, 11L, 16L, 13L, 7L, 2L, 2L, 8L, 14L, 1L, 3L, 26L , 17L, 18L, 9L, 21L, 22L, 24L, 2L, 4L, 27L, 20L), .Label = c("(0)", "1", "(113)", "(115)", " (119)”、“(13)”、“131”、“173”、“2”、“218”、“2234”、“(231)”、“2705”、“2923”、“2970”、“ 3242”、“359”、“3715”、“(399)”、“(4)”、“5306”、“5334”、“5746”、“6167”、“67”、“(685)”、“ 7"), 类 = "因子"),V13 = c(1712L, NA, NA, 143L, 143L, 143L, 143L, 143L, 143L, NA, 162L, 96L, 349L, 217L, 298L, 238L, 216L, 6174L, 44L, 6154L, 3030L, 36, 4 255L, 133L, 2623L, 3375L, 2846L, 1489L, 172L, 301L, 666L, 3217L, 312L, 376L, 4982L, 499L, 5305L, 41L, 6290L, 5433L, 6280L, 24L, 404L, 178L, 220L), V14 = structure (c(23L, NA, NA, 24L, 24L, 24L, 24L, 24L, 24L, NA, 8L, 1L, 10L, 25L, 4L, 13L, 21L, 1L, 11L, 26L, 15L, 14L, 20L, 30L , 5L, 2L, 1L, 27L, 1L, 18L, 3L, 4L, 7L, 6L, 9L, 19L, 22L, 29L, 1L, 2L, 28L, 16L, 1L, 17L, 1L, 12L), .Label = c (“(0)”、“(1)”、“(11)”、“(14)”、“(179)”、“208”、“(209)”、“(212)”、“232” , “(25)”, “(255)”, “3”, “(30)”, “3049”, “(3116)”, “(32)”, “327”、“(388)”、“4016”、“41”、“(46)”、“(470)”、“483”、“49”、“(51)”、“5640”、“(573) )", "(691)", "(838)", "91"), 类 = "因子"), V15 = c(2L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L , 22L, 23L, 22L, 24L, 25L, 24L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 38L, 39L, 38L, 37L, 40L, 41L , 42L, 43L, 44L, 45L, 44L, 46L, 47L, 48L, 49L, 50L, 51L)), .Names = c("V1", "V2", "V3", "V4", "V5", “V6”、“V7”、“V8”、“V9”、“V10”、“V11”、“V12”、“V13”、“V14”、“V15”),class =“data.frame”,行.names = c(NA,-46L))
编辑 4:对不起,这是简化版本:Initial Data.frame
输入(RMDB.cut)
结构(列表(染色体=结构(c(1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L) , .Label = "chr1", class = "factor"), Start = c(10469L, 20001L, 20210L, 21000L, 21201L, 22000L, 22201L, 23000L, 23201L, 20000L, 20001L, 24001L, 24205L, 24201L, 24205L, , 30953L, 31293L, 31436L, 31734L), End = c(11447L, 20200L, 20400L, 21200L, 21400L, 22200L, 22400L, 23200L, 23400L, 20001L, 20200L, 24200L, 24400L, 24600L, 0L, 30952L, 31131L, 31435L, 31733L, 31754L),
(Left)
=结构(c(38L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,37L,36L,35L,34L,33L,32L,31L),.Label = c (“(249203529)”、“(249203899)”、“(249204128)”、“(249204381)”、“(249204423)”、“(249204634)”、“(249204868)”、“(249205745)”、“ (249205786)"、"(249208117)"、"(249208336)"、"(249209743)"、"(249209892)"、"(249209995)"、"(249210327)"、"(249210697)"、"(249210998) )”、“(249211157)”、“(249212430)”、“(249212760)”、“(249213190)”、“(249215122)”、“(249215255)”、“(249215700)”、“(249216061)” , "(249216513)", "(249216580)"、"(249217112)"、"(249217165)"、"(249217584)"、"(249218867)"、"(249218888)"、"(249219186)"、"(249219490)"、"(249219669) )”, “(249219773)”, “(249235266)”, “(249239174)”), 类 = “因子”), 链 = 结构 (c(2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L , 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("+", "C"), class = "factor"), repName = 结构(c(32L, 23L, 23L, 23L, 23L, 23L, 24L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L), .Label = c(" AluJo”、“AluJr”、“AluSx”、“AluSz6”、“AluYc”、“AT_rich”、“Charlie5”、“ERVL-E-int”、“L1M5”、“L1MA8”、“L1MA9”、“L1MB5”、“L1P1”、“L1PA6”、“L2a”、“L2c”、“LTR12F”、“LTR16C”、“MamRep1527”、“MER45A”、“MER58A” "、"MIR"、"MIR3"、"MIR3a"、"MIRb"、"MIRc"、"MLT1A"、"MLT1E1A"、"MLT1E1A-int"、"MLT1J2"、"(TAAA)n"、"TAR1" , "(TC)n", "XXXXX"), class = "factor"), ValidMerge = c(FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE,假,假,假,假,假,假,假)),.Names = c(“染色体”,“开始”,“结束”,“(左)”,“链”,“repName", "ValidMerge"), row.names = c(NA, 20L), class = "data.frame")
以及合并后的输出
输入(RMDB.out.cut)
结构(列表(染色体=结构(c(1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L),.Label =“chr1”,类=“因子”),开始= c(“10469”,“20001”,“21000”,“22000”,“22201”,“23000”,“23201”,“20000”,“20001”,“24001” ,“0”,“30855”,“30953”,“31293”,“31436”,“31734”),结束= c(“11447”,“20400”,“21400”,“22200”,“22400”, “23200”、“23400”、“20001”、“20200”、“24600”、“0”、“30952”、“31131”、“31435”、“31733”、“31754”),
(Left)
=结构(c(38L,37L,37L,37L,37L,37L,37L,37L,37L,37L,36L,35L,34L,33L,32L,31L),.Label = c(“(249203529)”,“ (249203899)"、"(249204128)"、"(249204381)"、"(249204423)"、"(249204634)"、"(249204868)"、"(249205745)"、"(249205786)"、"(249208117) )”、“(249208336)”、“(249209743)”、“(249209892)”、“(249209995)”、“(249210327)”、“(249210697)”、“(249210998)”、“(249211157)” , “(249212430)”, “(249212760)”, “(249213190)”, “(249215122)”, “(249215255)”, “(249215700)”, “(249216061)”, “(249216513)”, “ (249216580)", "(249217112)"、"(249217165)"、"(249217584)"、"(249218867)"、"(249218888)"、"(249219186)"、"(249219490)"、"(249219669)"、"(249219773) )", "(249235266)", "(249239174)" ), class = "factor"), Strand = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L , 1L, 1L, 1L, 1L, 1L), .Label = c("+", "C"), class = "factor"), repName = structure(c(32L, 23L, 23L, 23L, 24L, 23L , 23L, 23L, 23L, 23L, 34L, 33L, 27L, 26L, 1L, 26L), .Label = c("AluJo", "AluJr", "AluSx", "AluSz6", "AluYc", "AT_rich" ,“查理5”,“ERVL-E-int”,“L1M5”,“L1MA8”,“L1MA9”,“L1MB5”、“L1P1”、“L1PA6”、“L2a”、“L2c”、“LTR12F”、“LTR16C”、“MamRep1527”、“MER45A”、“MER58A”、“MIR”、“MIR3”、“MIR3a” 、“MIRb”、“MIRc”、“MLT1A”、“MLT1E1A”、“MLT1E1A-int”、“MLT1J2”、“(TAAA)n”、“TAR1”、“(TC)n”、“XXXXX”)、类 = “因子”),ValidMerge = c(假,真,假,真,假,假,假,假,假,假,假,真,真,假,假,假)),.Names = c( “染色体”,“开始”,“结束”,“(左)”,“链”,“repName”,“ValidMerge”),row.names = c(“2”,“3”,“4”,“6”、“7”、“8”、“9”、“10”、“11”、“111”、“15”、“16”、“17”、“18”、“19”、“20” " ), class = "data.frame")