这是一个解决方案,涉及过滤TxDb.Hsapiens.UCSC.hg19.knownGene
最长的转录本gene_id
(它确实删除了没有 的基因gene_id
):
suppressPackageStartupMessages({
invisible(lapply(c("ggbio", "biovizBase", "data.table",
"TxDb.Hsapiens.UCSC.hg19.knownGene",
"org.Hs.eg.db"),
require, character.only = TRUE))})
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# retrieve transcript lengths
txlen <- transcriptLengths(txdb, with.utr5_len=TRUE, with.utr3_len=TRUE)
setDT(txlen)
txlen$len <- rowSums(as.matrix(txlen[, .(tx_len, utr5_len, utr3_len)]))
setkey(txlen, gene_id, len, tx_id)
# filter longesttranscript by gene_id
ltx <- txlen[!is.na(gene_id)][, tail(.SD,1), by=gene_id]$tx_id
# filter txdb object
txb <- as.list(txdb)
txb$transcripts <- txb$transcripts[txb$transcripts$tx_id %in% ltx, ]
txb$splicings <- txb$splicings[txb$splicings$tx_id %in% ltx,]
txb$genes <- txb$genes[txb$genes$tx_id %in% ltx,]
txb <- do.call(makeTxDb, txb)
# plot according to vignette, chapter 2.2.5
range <- GRanges("chr10", IRanges(start = 78000000 , end = 79000000))
gr.txdb <- crunch(txb, which = range)
#> Parsing transcripts...
#> Parsing exons...
#> Parsing cds...
#> Parsing utrs...
#> ------exons...
#> ------cdss...
#> ------introns...
#> ------utr...
#> aggregating...
#> Done
colnames(values(gr.txdb))[4] <- "model"
grl <- split(gr.txdb, gr.txdb$gene_id)
symbols <- select(org.Hs.eg.db, keys=names(grl), columns="SYMBOL", keytype="ENTREZID")
#> 'select()' returned 1:1 mapping between keys and columns
names(grl) <- symbols[match(symbols$ENTREZID, names(grl), nomatch=0),"SYMBOL"]
autoplot(grl, aes(type = "model"), gap.geom="chevron")
#> Constructing graphics...
由reprex 包(v0.3.0)于 2020-05-29 创建
编辑:
要获取基因符号而不是基因(或转录本)ID,只需将 grl 的名称替换为相关的基因符号,例如 viaorg.Hs.eg.db
或任何其他匹配它们的资源。