1

我有一个文件,第一行有 20 个字段作为标题。其余行的字段数不相等,有些行的列数比标题多。当我尝试使用 read.delim() 读取它时,读取过程没有报错,但得到的总行数超过了原始文件的行数。

以下是文件的几行:

Chromosome   Position    SNPid   Reference   Alternate   QUAL    Homozygosity    Tool    Depth   MappingQuality  EFFECT  IMPACT  FUNCTIONAL_CLASS    CODON_CHANGE    AMINO_ACID_CHANGE   GENE_NAME   GENE_BIOTYPE    GENE_CODING     TRANSCRIPT_ID   EXON_ID    
chr1    403111  .   G   A   24  het SAM 20  55  INTERGENIC  MODIFIER    _   _   _   _   _   _   _   _   _
chr1    602567  rs21953190  A   G   3265.77 hom GATKSAM 91  58.46   SYNONYMOUS_CODING   LOW SILENT  gaT/gaC D1034   ADNP2   protein_coding  CODING  ENSCAFT00000000008  5   _
chr1    604894  rs21953191  A   G   2869.77 hom GATKSAM 77  59.70   NON_SYNONYMOUS_CODING   MODERATE    MISSENSE    Ttt/Ctt F259L   ADNP2   protein_coding  CODING  ENSCAFT00000000008  5   _
chr1    758630  .   T   TC  1531.73 hom GATKSAM 38  46.20   INTRON  MODIFIER    _   _   _   PQLC1   protein_coding  CODING  ENSCAFT00000000011  2   _
chr1    800715  .   C   CT  514.73  hom GATKSAM 13  60.00   INTRON  MODIFIER    _   _   _   PQLC1   protein_coding  CODING  ENSCAFT00000000011  6   ,SPLICE_SITE_ACCEPTOR   HIGH    _   _   _   PQLC1   protein_coding  CODING  ENSCAFT00000000011  7   ,SPLICE_SITE_DONOR  HIGH    _   _   _   PQLC1   protein_coding  CODING  ENSCAFT00000000011  6   _
chr1    1104035 rs21966859  G   A   3803.77 hom GATKSAM 97  57.97   INTRON  MODIFIER    _   _   _   NFATC1  protein_coding  CODING  ENSCAFT00000000013  2   ,INTRON MODIFIER    _   _   _   NFATC1  protein_coding  CODING  ENSCAFT00000036234  2   _
chr1    1120994 .   CGCG    C   604.73  hom GATKSAM 21  56.55   INTERGENIC  MODIFIER    _   _   _   _   _   _   _   _   ,UPSTREAM   MODIFIER    _   _   _   NFATC1  protein_coding  CODING  ENSCAFT00000000013  _   ,UPSTREAM   MODIFIER    _   _   _   NFATC1  protein_coding  CODING  ENSCAFT00000036234  _   _
chr1    1136916 rs21935602  G   A   3899.77 hom GATKSAM 101 59.17   DOWNSTREAM  MODIFIER    _   _   _   ATP9B   protein_coding  CODING  ENSCAFT00000000014  _   ,DOWNSTREAM MODIFIER    _   _   _   ATP9B   protein_coding  CODING  ENSCAFT00000042968  _   ,UTR_3_PRIME    MODIFIER    _   _   _   ATP9B   protein_coding  CODING  ENSCAFT00000046825  29  _

文件中有 9 行。但是当在 R 中读取它并计算行数时,它显示为 12。

read.delim("test.txt",header=T,sep='\t')->data
nrow(data)

有人可以帮忙正确读取数据吗?

以下是 dput(data) 的输出

> dput(data)
structure(list(Chromosome = structure(c(3L, 3L, 3L, 3L, 3L, 1L, 
3L, 2L, 3L, 2L, 3L, 2L), .Label = c("HIGH", "MODIFIER", "chr1"
), class = "factor"), Position = structure(c(4L, 5L, 6L, 7L, 
8L, 9L, 1L, 9L, 2L, 9L, 3L, 9L), .Label = c("1104035", "1120994", 
"1136916", "403111", "602567", "604894", "758630", "800715", 
"_"), class = "factor"), SNPid = structure(c(1L, 4L, 5L, 1L, 
1L, 2L, 6L, 2L, 1L, 2L, 3L, 2L), .Label = c(".", "_", "rs21935602", 
"rs21953190", "rs21953191", "rs21966859"), class = "factor"), 
Reference = structure(c(4L, 1L, 1L, 5L, 2L, 6L, 4L, 6L, 3L, 
6L, 4L, 6L), .Label = c("A", "C", "CGCG", "G", "T", "_"), class = "factor"), 
Alternate = structure(c(1L, 5L, 5L, 8L, 4L, 7L, 1L, 6L, 3L, 
6L, 1L, 2L), .Label = c("A", "ATP9B", "C", "CT", "G", "NFATC1", 
"PQLC1", "TC"), class = "factor"), QUAL = structure(c(2L, 
4L, 3L, 1L, 7L, 9L, 5L, 9L, 8L, 9L, 6L, 9L), .Label = c("1531.73", 
"24", "2869.77", "3265.77", "3803.77", "3899.77", "514.73", 
"604.73", "protein_coding"), class = "factor"), Homozygosity = structure(c(2L, 
3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("CODING", 
"het", "hom"), class = "factor"), Tool = structure(c(6L, 
5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 2L, 5L, 4L), .Label = c("ENSCAFT00000000011", 
"ENSCAFT00000000013", "ENSCAFT00000036234", "ENSCAFT00000042968", 
"GATKSAM", "SAM"), class = "factor"), Depth = structure(c(4L, 
9L, 8L, 6L, 2L, 7L, 10L, 3L, 5L, 11L, 1L, 11L), .Label = c("101", 
"13", "2", "20", "21", "38", "7", "77", "91", "97", "_"), class = "factor"), 
MappingQuality = structure(c(5L, 8L, 10L, 4L, 11L, 1L, 7L, 
12L, 6L, 2L, 9L, 3L), .Label = c(",SPLICE_SITE_DONOR", ",UPSTREAM", 
",UTR_3_PRIME", "46.20", "55", "56.55", "57.97", "58.46", 
"59.17", "59.70", "60.00", "_"), class = "factor"), EFFECT = structure(c(4L, 
8L, 7L, 5L, 5L, 3L, 5L, 1L, 4L, 6L, 2L, 6L), .Label = c("", 
"DOWNSTREAM", "HIGH", "INTERGENIC", "INTRON", "MODIFIER", 
"NON_SYNONYMOUS_CODING", "SYNONYMOUS_CODING"), class = "factor"), 
IMPACT = structure(c(4L, 2L, 3L, 4L, 4L, 5L, 4L, 1L, 4L, 
5L, 4L, 5L), .Label = c("", "LOW", "MODERATE", "MODIFIER", 
"_"), class = "factor"), FUNCTIONAL_CLASS = structure(c(4L, 
3L, 2L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L), .Label = c("", 
"MISSENSE", "SILENT", "_"), class = "factor"), CODON_CHANGE = structure(c(3L, 
4L, 2L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L), .Label = c("", 
"Ttt/Ctt", "_", "gaT/gaC"), class = "factor"), AMINO_ACID_CHANGE = structure(c(7L, 
3L, 4L, 7L, 7L, 6L, 7L, 1L, 7L, 5L, 7L, 2L), .Label = c("", 
"ATP9B", "D1034", "F259L", "NFATC1", "PQLC1", "_"), class = "factor"), 
GENE_NAME = structure(c(6L, 2L, 2L, 5L, 5L, 7L, 4L, 1L, 6L, 
7L, 3L, 7L), .Label = c("", "ADNP2", "ATP9B", "NFATC1", "PQLC1", 
"_", "protein_coding"), class = "factor"), GENE_BIOTYPE = structure(c(3L, 
4L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 4L, 2L), .Label = c("", 
"CODING", "_", "protein_coding"), class = "factor"), GENE_CODING = structure(c(6L, 
2L, 2L, 2L, 2L, 3L, 2L, 1L, 6L, 4L, 2L, 5L), .Label = c("", 
"CODING", "ENSCAFT00000000011", "ENSCAFT00000036234", "ENSCAFT00000046825", 
"_"), class = "factor"), TRANSCRIPT_ID = structure(c(8L, 
4L, 4L, 5L, 5L, 3L, 6L, 1L, 8L, 8L, 7L, 2L), .Label = c("", 
"29", "6", "ENSCAFT00000000008", "ENSCAFT00000000011", "ENSCAFT00000000013", 
"ENSCAFT00000000014", "_"), class = "factor"), EXON_ID = structure(c(5L, 
3L, 3L, 2L, 4L, 5L, 2L, 1L, 5L, 5L, 5L, 5L), .Label = c("", 
"2", "5", "6", "_"), class = "factor"), X = structure(c(6L, 
6L, 6L, 6L, 4L, 1L, 3L, 1L, 5L, 1L, 2L, 1L), .Label = c("", 
",DOWNSTREAM", ",INTRON", ",SPLICE_SITE_ACCEPTOR", ",UPSTREAM", 
"_"), class = "factor")), .Names = c("Chromosome", "Position", 
"SNPid", "Reference", "Alternate", "QUAL", "Homozygosity", "Tool", 
"Depth", "MappingQuality", "EFFECT", "IMPACT", "FUNCTIONAL_CLASS", 
"CODON_CHANGE", "AMINO_ACID_CHANGE", "GENE_NAME", "GENE_BIOTYPE", 
"GENE_CODING", "TRANSCRIPT_ID", "EXON_ID", "X"), class = "data.frame", row.names = c(NA, 
-12L))
4

2 回答 2

2

R 认为您每行有 21 个而不是 20 个字段(也许每行都有一个尾随制表符?),并且您的第 6-9 行还有额外的字段:

 count.fields("test.txt",sep="\t")
## [1] 21 21 21 21 21 41 31 41 41

这把 read.delim 弄糊涂了,它试图根据前 5 行来猜测文件的结构(它本不该这样做,但事实就是如此)。你可能以为用 fill=TRUE 就能解决这个问题,但其实不行。

我尝试将 colClasses 与 fill=TRUE 一起使用来指定字段类型(我用的是 colClasses=rep("character",41),但您或许能给出比这更准确的类型),但它似乎不起作用,可能是因为您的标题只有 21 列。

data.table 包中的 fread 函数可以做得更好一些,但前提是你告诉它不要尝试根据第 5 行之后的行来猜测格式,而且它会丢弃第 21 列之后的数据。

library(data.table)
nrow(fread("test.txt",autostart=5))  ## 9

嗯,即使这样也不能像预期的那样工作(即使我设置了 header=TRUE,它也不能正确识别标题,可能是因为第 21 列没有标题字段)……归根结底,你可能必须弄清楚那些额外的字段是什么,并对它们做一些更明确的处理(例如添加标题字段……)。

基本上,R 期望您的数据非常干净。把这个示例发送给 data.table 包的维护者可能是值得的,他们正努力使 fread 尽可能健壮……这将是一个挑战。

于 2013-06-13T20:29:17.467 回答
2

查看数据,您可以看到它高度“变形”,有许多行被拼接在了一起。在许多情况下,这种拼接以逗号的出现为标志。我认为这些数据的格式与您预期的不同。dput 输出中的第一个元素是 Chromosome 因子,其取值为 c("HIGH", "MODIFIER", "chr1")。这不是一个合理的结果,说明您对原始数据的组织方式缺乏了解。您应该把原始文本文件发布到可以通过 Internet 访问的位置,以便检查其原始布局。特别是,您认为是分隔符的制表符要么并不存在,要么没有被 SO 的界面保留下来。

在看到您给出的数据样本之后(您应该通过编辑把它放进问题正文中),可以尝试删除逗号后面的注释:

 datL <- readLines("~/Downloads/test.txt")
 datLred <- gsub("[,].+$", "", datL)
 read.delim(text=datLred)

> str(read.delim(text=datLred) )
'data.frame':   8 obs. of  21 variables:
 $ Chromosome       : Factor w/ 1 level "chr1": 1 1 1 1 1 1 1 1
 $ Position         : int  403111 602567 604894 758630 800715 1104035 1120994 1136916
 $ SNPid            : Factor w/ 5 levels ".","rs21935602",..: 1 3 4 1 1 5 1 2
 $ Reference        : Factor w/ 5 levels "A","C","CGCG",..: 4 1 1 5 2 4 3 4
 $ Alternate        : Factor w/ 5 levels "A","C","CT","G",..: 1 4 4 5 3 1 2 1
(其余列已省略)
于 2013-06-13T21:00:10.777 回答