1

我正在尝试使用此 R 脚本从 NCBI 获取一些信息:

require(rentrez)
require(magrittr)
# Single example SNP; used by the one-off search at the bottom of the script.
rs <- "rs16891982"
# Example batch of rs identifiers.
rss <- c(
    "rs16891982", "rs12203592", "rs1408799", "rs10756819",
    "rs35264875", "rs1393350", "rs12821256", "rs17128291",
    "rs1800407", "rs12913832", "rs1805008", "rs4911414"
)
# Given an rs number, look the SNP up in dbSNP and return its chromosome
# position, allele, allele frequency and gene name.
#
# Args:
#   rs: a single rs identifier, e.g. "rs16891982".
#
# Returns:
#   A data.frame with the columns snp, chrpos, EA, fEA and genes. When the
#   search or the summary comes back empty, a warning is raised and NULL is
#   returned invisibly.
annotateGeneName <- function(rs) {
    ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]
    # Check for "no hits" before asking for a summary: with no ids there is
    # nothing to summarise, and the request must not go out with an empty id.
    if (length(ids) < 1) {
        warning(sprintf("%s not found in dbSNP!", rs))
        return(invisible(NULL))
    }
    anno <- rentrez::entrez_summary(db = "snp", id = ids)
    if (length(anno) < 1) {
        warning(sprintf("%s not found in dbSNP!", rs))
        return(invisible(NULL))
    }
    # There might be multiple entries: if "snp_id" is not among the names,
    # several SNPs have been returned for this search, so take the first hit.
    if (!"snp_id" %in% names(anno)) {
        anno <- anno[[1]]
    }
    chrpos <- anno[["chrpos"]]
    # Drop everything from the first "(" onwards.
    EA <- gsub("\\(.*", "", anno$allele_origin)
    # Drop everything from the first "/" onwards, then everything up to and
    # including the last remaining "=".
    fEA <- gsub("^.*=", "", gsub("/.*", "", anno$global_maf))
    genes <- dplyr::first(anno$genes, default = NA)
    data.frame(snp = rs, chrpos = chrpos, EA = EA, fEA = fEA, genes = genes)
}
# Annotate several rs numbers by calling annotateGeneName() on each element of
# `rss` and row-binding the per-SNP results into one object.
annotateGeneNames <- function(rss) {
    per_snp <- lapply(rss, annotateGeneName)
    do.call(rbind, per_snp)
}
# Search dbSNP for the example SNP and keep the ids the search returns.
ids <- rentrez::entrez_search(db = "snp", term = rs)[["ids"]]
# Fetch the raw XML record for the first hit only.
x <- rentrez::entrez_fetch(db = "snp", id = ids[1], rettype = "xml")
# xmlParse() and xmlToList() belong to the XML package, which is not attached
# in the visible part of this script, so call them through the namespace.
snp1xml <- XML::xmlParse(x)
snp1list <- XML::xmlToList(snp1xml)
print(snp1list)

当您打印结果时,您可以看到如下内容:

...
$Rs$Sequence$.attrs
     exemplarSs ancestralAllele 
    "285153617"   "C,C,C,C,C,C" 


$Rs$Ss$.attrs
        ssId       handle      batchId     locSnpId  subSnpClass       orient 
  "23456916"   "PERLEGEN"      "12309" "afd3693051"        "snp"    "forward" 
      strand      molType      buildId  methodClass    validated 
    "bottom"    "genomic"        "123"  "hybridize" "by-cluster" 


$Rs$Ss$.attrs
                          ssId                         handle 
                    "28510204"              "MGC_GENOME_DIFF" 
                       batchId                       locSnpId 
                       "12314" "BC064405x37550355-C16403799G" 
                   subSnpClass                         orient 
                         "snp"                      "forward" 
                        strand                        molType 
                      "bottom"                         "cDNA" 
                       buildId                    methodClass 
                         "126"                     "computed" 


$Rs$Ss
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "TTCCCTTTCATTTTCCAGAGAAACTTGATCAGGAACCCACTGATTCCAAGAGCAAAGTAATCAGTGAGGAAATGACACCTAGAATTCATGATGAAAAAAGGATGCTTTATATGGTCCTTTTTAAGGTGATAGTTTTTCCTGACGTCCATAGATTTATTAAGAATCTGGTATTTTAAACAGTAGGAAATACACATAGAAATATCAAATCCAAGTTGTGCTAGACCAGAAACTTTTAGAAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"

$Rs$Ss$.attrs

$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "AAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"

$Rs$Assembly$Component$MapLoc$FxnSet
      geneId       symbol      mrnaAcc      mrnaVer      protAcc      protVer 
     "51151"    "SLC45A2"  "NM_016180"          "4"  "NP_057264"          "3" 
    fxnClass readingFrame       allele      residue   aaPosition 
 "reference"          "3"          "C"          "F"        "373" 

$Rs$Assembly$Component$MapLoc$FxnSet
                geneId                 symbol                mrnaAcc 
               "51151"              "SLC45A2"            "NM_016180" 
               mrnaVer                protAcc                protVer 
                   "4"            "NP_057264"                    "3" 
              fxnClass           readingFrame                 allele 
            "missense"                    "3"                    "G" 
               residue             aaPosition                 soTerm 
                   "L"                  "373" "non_synonymous_codon" 

这个列表中有很多 .attrs 条目,而且它们经常是重复的。还有其他重复的条目,例如:

$Rs$Ss$Sequence$Seq5
$Rs$Assembly$Component$MapLoc$FxnSet

等等

.attrs 是什么意思,我该如何理解这些数据?我不明白一个列表中怎么会有两个同名条目。

4

1 回答 1

2

在 R 中,`attributes` 和 `attr` 是用于设置或提取属性的函数,但据我所知,“.attrs”只是列表中某个元素的名称。它的含义基本上取决于作者的定义,也就是在您的代码解析 XML 并将其转换为 R 列表之后,由执行转换的那段代码决定。它不是 R 语言定义的一部分,因此请查阅相应的文档。

我现在明白了,您是对具有相同名称的列表项感到困惑。这在 R 中是允许的。按名称使用“[”和“[[”时,只会取回与该名称匹配的第一个项目。要访问其余的同名项目,需要使用数字索引,或者借助 lapply 或 sapply 遍历树的上层,从而避免歧义。

> mylist=vector("list", length=2)
> mylist
[[1]]
NULL

[[2]]
NULL

> names(mylist) <- c("a","a")
> mylist
$a
NULL

$a
NULL

> mylist[['a']]
NULL
> mylist['a']
$a
NULL

> lapply( mylist , "[[", "a")
$a
NULL

$a
NULL

(我也没有看到在提取和处理该数据的过程中使用了这些函数定义中的任何一个。)

于 2015-04-08T15:47:08.433 回答