I have a list of PubMed entries along with their PubMed IDs. I would like to create a Python script, or use Python, that accepts a PubMed ID number as input and then fetches the abstract from the PubMed website.
So far I have come across NCBI Eutilities and the importurl library in Python, but I don't know how I should go about writing the template.
Any pointers would be appreciated.
Thanks,
Using Biopython's module called Entrez, you can get the abstract along with all the other metadata quite easily. This will print the abstract:
from Bio import Entrez
from Bio.Entrez import efetch

Entrez.email = 'your.email@example.com'  # NCBI asks for a contact address

def print_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    print(handle.read())
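For example, with an arbitrary PMID (any valid ID works; this one is just for illustration):

print_abstract('19304878')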
Here is a function that fetches the XML and returns only the abstract:
from Bio.Entrez import efetch, read

def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)
    try:
        # recent Biopython versions key the parsed result by 'PubmedArticle'
        article = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):  # no such record, or the article has no abstract
        return None
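Since the question starts from a whole list of PMIDs, note that efetch also accepts a comma-separated list of IDs, so one request can cover the list. A minimal sketch along the lines of fetch_abstract above (the 'PubmedArticle' key reflects how recent Biopython versions structure the parsed XML):

def fetch_abstracts(pmids):
    handle = efetch(db='pubmed', id=','.join(map(str, pmids)), retmode='xml')
    records = read(handle)
    abstracts = []
    for rec in records['PubmedArticle']:
        article = rec['MedlineCitation']['Article']
        try:
            abstracts.append(article['Abstract']['AbstractText'][0])
        except (IndexError, KeyError):
            abstracts.append(None)  # this article has no abstract
    return abstracts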
P.S. I actually needed to do this kind of thing for a real task, so I organized the code into a class; see this gist.
Wow, I was working on a similar project myself just a week ago!
Edit: I recently updated the code to take advantage of BeautifulSoup. I have it in my own virtualenv, but you can install it with pip.
Basically, my program takes a PubMed ID, a DOI, or a text file with lines of PubMed IDs and/or DOIs, and fetches information about each article. It could easily be tweaked for your own needs to grab the abstract, but here is my code:
import re
import sys
import traceback

import requests
from bs4 import BeautifulSoup


class PubMedObject(object):
    soup = None
    url = None

    # pmid is a PubMed ID
    # url is the url of the PubMed web page
    # search_term is the string used in the search box on the PubMed website
    def __init__(self, pmid=None, url='', search_term=''):
        if pmid:
            pmid = pmid.strip()
            url = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % pmid
        if search_term:
            url = "http://www.ncbi.nlm.nih.gov/pubmed/?term=%s" % search_term
        page = requests.get(url).text
        self.soup = BeautifulSoup(page, "html.parser")
        # set the url to be the fixed one with the PubMed ID instead of the search_term
        if search_term:
            try:
                url = "http://www.ncbi.nlm.nih.gov/pubmed/%s" % self.soup.find("dl", class_="rprtid").find("dd").text
            except AttributeError:  # NoneType has no find method
                print("Error on search_term=%s" % search_term)
        self.url = url

    def get_title(self):
        return self.soup.find(class_="abstract").find("h1").text

    # "auths" is the class of the page element that holds the author list
    def get_authors(self):
        result = []
        author_list = [a.text for a in self.soup.find(class_="auths").find_all("a")]
        for author in author_list:
            lname, remainder = author.rsplit(' ', 1)
            # add periods after each letter in the first name
            fname = ".".join(remainder) + "."
            result.append(lname + ', ' + fname)
        return ', '.join(result)

    def get_citation(self):
        return self.soup.find(class_="cit").text

    def get_external_url(self):
        url = None
        doi_string = self.soup.find(text=re.compile("doi:"))
        if doi_string:
            # take the token after "doi:" and drop the trailing period
            doi = doi_string.split("doi:")[-1].strip().split(" ")[0][:-1]
            if doi:
                url = "http://dx.doi.org/%s" % doi
        else:
            doi_string = self.soup.find(class_="portlet")
            if doi_string:
                doi_string = doi_string.find("a")['href']
                if doi_string:
                    return doi_string
        return url or self.url

    def render(self):
        with open('template.html', 'r') as template_file:
            template_text = template_file.read()
        try:
            template_text = template_text.replace("{{ external_url }}", self.get_external_url())
            template_text = template_text.replace("{{ citation }}", self.get_citation())
            template_text = template_text.replace("{{ title }}", self.get_title())
            template_text = template_text.replace("{{ authors }}", self.get_authors())
            template_text = template_text.replace("{{ error }}", '')
        except AttributeError:  # one of the lookups above found nothing on the page
            template_text = template_text.replace("{{ external_url }}", '')
            template_text = template_text.replace("{{ citation }}", '')
            template_text = template_text.replace("{{ title }}", '')
            template_text = template_text.replace("{{ authors }}", '')
            template_text = template_text.replace("{{ error }}", '<!-- Error -->')
        return template_text


def start_table(f):
    f.write('\t\t\t\t\t\t\t\t\t<div class="resourcesTable">\n')
    f.write('\t\t\t\t\t\t\t\t\t\t<table border="0" cellspacing="0" cellpadding="0">\n')


def end_table(f):
    f.write('\t\t\t\t\t\t\t\t\t\t</table>\n')
    f.write('\t\t\t\t\t\t\t\t\t</div>\n')


def start_accordion(f):
    f.write('\t\t\t\t\t\t\t\t\t<div class="accordion">\n')


def end_accordion(f):
    f.write('\t\t\t\t\t\t\t\t\t</div>\n')


def main(args):
    try:
        print("Parsing pmids.txt...")
        # echo the input file into an HTML comment at the top of the output
        with open('result.html', 'w') as sum_file:
            sum_file.write('<!--\n')
        with open('pmids.txt', 'r') as pmid_file:
            with open('result.html', 'a') as sum_file:
                for pmid in pmid_file:
                    sum_file.write(pmid)
                sum_file.write('\n-->\n')
        with open('pmids.txt', 'r') as pmid_file:
            h3 = False
            h4 = False
            table_mode = False
            accordion_mode = False
            with open('result.html', 'a') as sum_file:
                for pmid in pmid_file:
                    if pmid[:4] == "####":
                        if h3 and not accordion_mode:
                            start_accordion(sum_file)
                            accordion_mode = True
                        sum_file.write('\t\t\t\t\t\t\t\t\t<h4><a href="#">%s</a></h4>\n' % pmid[4:].strip())
                        h4 = True
                    elif pmid[:3] == "###":
                        if h4:
                            if table_mode:
                                end_table(sum_file)
                                table_mode = False
                            end_accordion(sum_file)
                            h4 = False
                            accordion_mode = False
                        elif h3:
                            end_table(sum_file)
                            table_mode = False
                        sum_file.write('\t\t\t\t\t\t\t\t<h3><a href="#">%s</a></h3>\n' % pmid[3:].strip())
                        h3 = True
                    elif pmid.strip():
                        if (h3 or h4) and not table_mode:
                            start_table(sum_file)
                            table_mode = True
                        if pmid[:4] == "http":
                            if pmid[:18] == "http://dx.doi.org/":
                                sum_file.write(PubMedObject(search_term=pmid[18:]).render())
                            else:
                                print("url=%s" % pmid)
                                p = PubMedObject(url=pmid).render()
                                sum_file.write(p)
                                print(p)
                        elif pmid.isdigit():
                            sum_file.write(PubMedObject(pmid).render())
                        else:
                            sum_file.write(PubMedObject(search_term=pmid).render())
                # close any sections still open at the end of the file
                if h3:
                    if h4:
                        end_table(sum_file)
                        end_accordion(sum_file)
                    else:
                        end_table(sum_file)
        print("Done!")
    except BaseException as e:
        print(traceback.format_exc())
        print("Error: %s %s" % (sys.exc_info()[0], e.args))
        return 1  # exit on error
    else:
        input("Press enter to exit.")
        return 0  # exit errorlessly


if __name__ == '__main__':
    sys.exit(main(sys.argv))
It now produces an HTML file based on the downloaded information. Here is template.html:
<tr>{{ error }}
<td valign="top" class="resourcesICO"><a href="{{ external_url }}" target="_blank"><img src="/image/ico_sitelink.gif" width="24" height="24" /></a></td>
<td><a href="{{ external_url }}">{{ title }}</a><br />
{{ authors }}<br />
<em>{{ citation }}</em></td>
</tr>
When you run it, the program will ask for a DOI or a PubMed ID. If you don't provide one, it will read pmids.txt. Feel free to use the code as you see fit.
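For reference, a hypothetical pmids.txt (the headings, ID, and DOI below are made up) matching what main() parses: lines starting with ### become <h3> sections, lines starting with #### become accordion <h4> headings, bare numbers are treated as PubMed IDs, dx.doi.org URLs are looked up as search terms, and other URLs are fetched directly:

### Reviews
#### Neurotoxins
23456789
http://dx.doi.org/10.1000/xyz123
http://www.ncbi.nlm.nih.gov/pubmed/23456789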
PubMed articles have URLs of the form: http://www.ncbi.nlm.nih.gov/pubmed/?Id
If you know the id, you can fetch the above and you will have access to the article. The abstract is contained in a structure like:
<div class="abstr"><h3>Abstract</h3><div class=""><p>α-latrotoxin and snake presynaptic phospholipases A2 neurotoxins target the presynaptic membrane of axon terminals of the neuromuscular junction....</p></div></div>
You then need a tool to extract it. I would suggest using: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
You will still need a tool to actually fetch the HTML. For that I would use phantom.js, or the ever-popular requests module.
Your workflow would look something like:
import requests
from bs4 import BeautifulSoup

pubmed_ids = [1, 2, 3]
abstracts = []

for pmid in pubmed_ids:
    html_for_id = requests.get('http://www.ncbi.nlm.nih.gov/pubmed/{0}'.format(pmid)).text
    soup = BeautifulSoup(html_for_id, 'html.parser')
    # the abstract sits in the <div class="abstr"> structure shown above
    abstract = soup.find('div', class_='abstr')
    abstracts.append(abstract.get_text() if abstract else None)
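If the list of IDs is long, it is polite to pause between page fetches. A minimal sketch wrapping the loop above in a function (the 0.5-second delay is an arbitrary courtesy value, not a documented NCBI limit):

import time

import requests
from bs4 import BeautifulSoup

def fetch_abstracts(pubmed_ids, delay=0.5):
    abstracts = []
    for pmid in pubmed_ids:
        html = requests.get('http://www.ncbi.nlm.nih.gov/pubmed/{0}'.format(pmid)).text
        soup = BeautifulSoup(html, 'html.parser')
        node = soup.find('div', class_='abstr')  # the structure shown above
        abstracts.append(node.get_text() if node else None)
        time.sleep(delay)  # pause so we do not hammer the server
    return abstracts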
The metapub library was built exactly for this. Metapub has been tested on more than one third of the PubMed database (as of 2019).
from metapub import PubMedFetcher

fetch = PubMedFetcher()

pmids = [<your list of ids>]
for pmid in pmids:
    article = fetch.article_by_pmid(pmid)
    print(article.abstract)
And if you want to get the full text of each article, you can do:
from metapub import FindIt

pmids = [<yourlist>]
for pmid in pmids:
    src = FindIt(pmid)
    print(src.doi)
    print(src.url)
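Not every article resolves to a free full-text link; when it does not, src.url comes back as None and, if I recall metapub's interface correctly (treat the attribute name as an assumption), src.reason records why:

for pmid in pmids:
    src = FindIt(pmid)
    if src.url is None:
        print('%s: %s' % (pmid, src.reason))  # why no free full text was found (assumed attribute)
    else:
        print(src.url)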
I have tested this library against millions of articles, to the point where the Medline XML (i.e., Entrez) parsers are about 99% robust. And trust me, this data is messy.
Source: I am the author.
It seems the "pattern" module can do this easily:
import requests
from pattern import web

pmid = 27523945
url = "http://www.ncbi.nlm.nih.gov/pubmed/{0}".format(pmid)
page = requests.get(url).text
dom = web.Element(page)
# the abstract is inside an <abstracttext> element on the page
print(dom.by_tag("abstracttext")[0].content)