我正在尝试使用此 R 脚本从 NCBI 获取一些信息:
require(rentrez)
require(magrittr)
# A single dbSNP identifier used for the one-off lookup further down.
rs <- "rs16891982"
# A batch of dbSNP identifiers to annotate in one go.
rss <- c(
  "rs16891982", "rs12203592", "rs1408799", "rs10756819",
  "rs35264875", "rs1393350", "rs12821256", "rs17128291",
  "rs1800407", "rs12913832", "rs1805008", "rs4911414"
)
# Given an rs number, query the dbSNP database through rentrez and return a
# one-row data.frame with the columns snp, chrpos, EA, fEA and genes.
#
# rs: the search term handed to entrez_search(), e.g. "rs16891982".
#
# Returns invisible(NULL), with a warning, when nothing is found for `rs`.
annotateGeneName <- function(rs) {
  ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]
  # Check for hits before requesting a summary, so that an empty search result
  # reaches the "not found" warning instead of being sent on to
  # entrez_summary() with no ids.
  if (length(ids) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }
  anno <- rentrez::entrez_summary(db = "snp", id = ids)
  # Original guard kept for the case where the summary itself comes back empty.
  if (length(anno) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }
  # There might be multiple entries: if "snp_id" is not among the names of the
  # summary, multiple SNPs have been returned for this search (per the original
  # author's note), so just take the first hit.
  if (!"snp_id" %in% names(anno)) {
    anno <- anno[[1]]
  }
  chrpos <- anno[["chrpos"]]
  # Drop everything from the first "(" onwards.
  EA <- gsub("\\(.*", "", anno$allele_origin)
  # Drop everything from the first "/" onwards, then everything up to the
  # last remaining "=".
  fEA <- gsub("^.*=", "", gsub("/.*", "", anno$global_maf))
  genes <- dplyr::first(anno$genes, default = NA)
  data.frame(snp = rs, chrpos = chrpos, EA = EA, fEA = fEA, genes = genes)
}
# Annotate every rs number in `rss` with annotateGeneName() and row-bind the
# per-SNP results into a single object.
annotateGeneNames <- function(rss) {
  per_snp <- lapply(rss, annotateGeneName)
  do.call(rbind, per_snp)
}
# Fetch the XML record of the first dbSNP hit for `rs`, convert it to a nested
# R list and print it.
ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]
x <- rentrez::entrez_fetch(db = "snp", id = ids[1], rettype = "xml")
# xmlParse() and xmlToList() belong to the XML package, which the visible part
# of this script never attaches; qualify them so the calls resolve either way.
snp1xml <- XML::xmlParse(x)
snp1list <- XML::xmlToList(snp1xml)
print(snp1list)
当您打印结果时,您可以看到如下内容:
...
$Rs$Sequence$.attrs
exemplarSs ancestralAllele
"285153617" "C,C,C,C,C,C"
$Rs$Ss$.attrs
ssId handle batchId locSnpId subSnpClass orient
"23456916" "PERLEGEN" "12309" "afd3693051" "snp" "forward"
strand molType buildId methodClass validated
"bottom" "genomic" "123" "hybridize" "by-cluster"
$Rs$Ss$.attrs
ssId handle
"28510204" "MGC_GENOME_DIFF"
batchId locSnpId
"12314" "BC064405x37550355-C16403799G"
subSnpClass orient
"snp" "forward"
strand molType
"bottom" "cDNA"
buildId methodClass
"126" "computed"
$Rs$Ss
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "TTCCCTTTCATTTTCCAGAGAAACTTGATCAGGAACCCACTGATTCCAAGAGCAAAGTAATCAGTGAGGAAATGACACCTAGAATTCATGATGAAAAAAGGATGCTTTATATGGTCCTTTTTAAGGTGATAGTTTTTCCTGACGTCCATAGATTTATTAAGAATCTGGTATTTTAAACAGTAGGAAATACACATAGAAATATCAAATCCAAGTTGTGCTAGACCAGAAACTTTTAGAAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Ss$.attrs
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "AAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc mrnaVer protAcc protVer
"51151" "SLC45A2" "NM_016180" "4" "NP_057264" "3"
fxnClass readingFrame allele residue aaPosition
"reference" "3" "C" "F" "373"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc
"51151" "SLC45A2" "NM_016180"
mrnaVer protAcc protVer
"4" "NP_057264" "3"
fxnClass readingFrame allele
"missense" "3" "G"
residue aaPosition soTerm
"L" "373" "non_synonymous_codon"
这个列表中有很多 .attrs 条目,而且它们经常是重复的。还有其他重复的条目,例如:
$Rs$Ss$Sequence$Seq5
$Rs$Assembly$Component$MapLoc$FxnSet
等等
.attrs 是什么意思,我应该如何理解这些数据?我不明白一个列表中怎么会有两个同名的条目。