我有一个文件,第一行是标题行,共有 20 个字段。其余各行的字段数并不相同,其中有些行的字段数比标题行还多。当我用 read.delim() 读取它时,读取过程没有报错,但得到的数据框行数比原始文件的行数多。
以下是文件的几行:
Chromosome Position SNPid Reference Alternate QUAL Homozygosity Tool Depth MappingQuality EFFECT IMPACT FUNCTIONAL_CLASS CODON_CHANGE AMINO_ACID_CHANGE GENE_NAME GENE_BIOTYPE GENE_CODING TRANSCRIPT_ID EXON_ID
chr1 403111 . G A 24 het SAM 20 55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ _
chr1 602567 rs21953190 A G 3265.77 hom GATKSAM 91 58.46 SYNONYMOUS_CODING LOW SILENT gaT/gaC D1034 ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 604894 rs21953191 A G 2869.77 hom GATKSAM 77 59.70 NON_SYNONYMOUS_CODING MODERATE MISSENSE Ttt/Ctt F259L ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 758630 . T TC 1531.73 hom GATKSAM 38 46.20 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 2 _
chr1 800715 . C CT 514.73 hom GATKSAM 13 60.00 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 ,SPLICE_SITE_ACCEPTOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 7 ,SPLICE_SITE_DONOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 _
chr1 1104035 rs21966859 G A 3803.77 hom GATKSAM 97 57.97 INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 2 ,INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 2 _
chr1 1120994 . CGCG C 604.73 hom GATKSAM 21 56.55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 _ _
chr1 1136916 rs21935602 G A 3899.77 hom GATKSAM 101 59.17 DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000000014 _ ,DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000042968 _ ,UTR_3_PRIME MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000046825 29 _
文件中共有 9 行(1 行标题加 8 行数据)。但是在 R 中读取它并用 nrow() 统计行数时,结果却显示为 12。
read.delim("test.txt",header=T,sep='\t')->data
nrow(data)
有人可以帮忙正确读取数据吗?
以下是 dput(data) 的输出:
> dput(data)
structure(list(Chromosome = structure(c(3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 2L, 3L, 2L), .Label = c("HIGH", "MODIFIER", "chr1"
), class = "factor"), Position = structure(c(4L, 5L, 6L, 7L,
8L, 9L, 1L, 9L, 2L, 9L, 3L, 9L), .Label = c("1104035", "1120994",
"1136916", "403111", "602567", "604894", "758630", "800715",
"_"), class = "factor"), SNPid = structure(c(1L, 4L, 5L, 1L,
1L, 2L, 6L, 2L, 1L, 2L, 3L, 2L), .Label = c(".", "_", "rs21935602",
"rs21953190", "rs21953191", "rs21966859"), class = "factor"),
Reference = structure(c(4L, 1L, 1L, 5L, 2L, 6L, 4L, 6L, 3L,
6L, 4L, 6L), .Label = c("A", "C", "CGCG", "G", "T", "_"), class = "factor"),
Alternate = structure(c(1L, 5L, 5L, 8L, 4L, 7L, 1L, 6L, 3L,
6L, 1L, 2L), .Label = c("A", "ATP9B", "C", "CT", "G", "NFATC1",
"PQLC1", "TC"), class = "factor"), QUAL = structure(c(2L,
4L, 3L, 1L, 7L, 9L, 5L, 9L, 8L, 9L, 6L, 9L), .Label = c("1531.73",
"24", "2869.77", "3265.77", "3803.77", "3899.77", "514.73",
"604.73", "protein_coding"), class = "factor"), Homozygosity = structure(c(2L,
3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("CODING",
"het", "hom"), class = "factor"), Tool = structure(c(6L,
5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 2L, 5L, 4L), .Label = c("ENSCAFT00000000011",
"ENSCAFT00000000013", "ENSCAFT00000036234", "ENSCAFT00000042968",
"GATKSAM", "SAM"), class = "factor"), Depth = structure(c(4L,
9L, 8L, 6L, 2L, 7L, 10L, 3L, 5L, 11L, 1L, 11L), .Label = c("101",
"13", "2", "20", "21", "38", "7", "77", "91", "97", "_"), class = "factor"),
MappingQuality = structure(c(5L, 8L, 10L, 4L, 11L, 1L, 7L,
12L, 6L, 2L, 9L, 3L), .Label = c(",SPLICE_SITE_DONOR", ",UPSTREAM",
",UTR_3_PRIME", "46.20", "55", "56.55", "57.97", "58.46",
"59.17", "59.70", "60.00", "_"), class = "factor"), EFFECT = structure(c(4L,
8L, 7L, 5L, 5L, 3L, 5L, 1L, 4L, 6L, 2L, 6L), .Label = c("",
"DOWNSTREAM", "HIGH", "INTERGENIC", "INTRON", "MODIFIER",
"NON_SYNONYMOUS_CODING", "SYNONYMOUS_CODING"), class = "factor"),
IMPACT = structure(c(4L, 2L, 3L, 4L, 4L, 5L, 4L, 1L, 4L,
5L, 4L, 5L), .Label = c("", "LOW", "MODERATE", "MODIFIER",
"_"), class = "factor"), FUNCTIONAL_CLASS = structure(c(4L,
3L, 2L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L), .Label = c("",
"MISSENSE", "SILENT", "_"), class = "factor"), CODON_CHANGE = structure(c(3L,
4L, 2L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L), .Label = c("",
"Ttt/Ctt", "_", "gaT/gaC"), class = "factor"), AMINO_ACID_CHANGE = structure(c(7L,
3L, 4L, 7L, 7L, 6L, 7L, 1L, 7L, 5L, 7L, 2L), .Label = c("",
"ATP9B", "D1034", "F259L", "NFATC1", "PQLC1", "_"), class = "factor"),
GENE_NAME = structure(c(6L, 2L, 2L, 5L, 5L, 7L, 4L, 1L, 6L,
7L, 3L, 7L), .Label = c("", "ADNP2", "ATP9B", "NFATC1", "PQLC1",
"_", "protein_coding"), class = "factor"), GENE_BIOTYPE = structure(c(3L,
4L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 4L, 2L), .Label = c("",
"CODING", "_", "protein_coding"), class = "factor"), GENE_CODING = structure(c(6L,
2L, 2L, 2L, 2L, 3L, 2L, 1L, 6L, 4L, 2L, 5L), .Label = c("",
"CODING", "ENSCAFT00000000011", "ENSCAFT00000036234", "ENSCAFT00000046825",
"_"), class = "factor"), TRANSCRIPT_ID = structure(c(8L,
4L, 4L, 5L, 5L, 3L, 6L, 1L, 8L, 8L, 7L, 2L), .Label = c("",
"29", "6", "ENSCAFT00000000008", "ENSCAFT00000000011", "ENSCAFT00000000013",
"ENSCAFT00000000014", "_"), class = "factor"), EXON_ID = structure(c(5L,
3L, 3L, 2L, 4L, 5L, 2L, 1L, 5L, 5L, 5L, 5L), .Label = c("",
"2", "5", "6", "_"), class = "factor"), X = structure(c(6L,
6L, 6L, 6L, 4L, 1L, 3L, 1L, 5L, 1L, 2L, 1L), .Label = c("",
",DOWNSTREAM", ",INTRON", ",SPLICE_SITE_ACCEPTOR", ",UPSTREAM",
"_"), class = "factor")), .Names = c("Chromosome", "Position",
"SNPid", "Reference", "Alternate", "QUAL", "Homozygosity", "Tool",
"Depth", "MappingQuality", "EFFECT", "IMPACT", "FUNCTIONAL_CLASS",
"CODON_CHANGE", "AMINO_ACID_CHANGE", "GENE_NAME", "GENE_BIOTYPE",
"GENE_CODING", "TRANSCRIPT_ID", "EXON_ID", "X"), class = "data.frame", row.names = c(NA,
-12L))