以下 Python 3 代码将解析一个 HTML 文件(“dict.html”),并生成一个包含单词及其定义的 dict 对象。此代码假定 HTML 文件的格式与您的示例中一样,即 `<p><b>Some word</b>Word definition</p>`。
from html.parser import HTMLParser
dictionary = {}


# Custom HTML parser which adds word/definition pairs
# to the module-level dict 'dictionary'
class html_to_dict_parser(HTMLParser):
    """Collect ``<p><b>word</b>definition</p>`` entries into ``dictionary``.

    The text inside a ``<b>`` tag is taken as the word (stored lowercased);
    the text that follows it inside the enclosing ``<p>`` is taken as its
    definition. Text split into several chunks by inline tags (for example
    ``<i>``) is concatenated, so the whole definition is kept rather than
    only the first chunk.
    """

    def __init__(self):
        super().__init__()
        # These flags tell the parser what it is reading at the moment.
        self.in_word_label = False   # currently inside <b>...</b>
        self.in_definition = False   # currently inside <p>...</p>
        self.has_definition = False  # a word was read and awaits its text
        self.latest_word = None      # most recent word seen inside <b>
        # True once the first definition chunk of the current <p> has been
        # stored, so that later chunks are appended to the same entry.
        self._continuing = False

    def handle_starttag(self, tag, attrs):
        """Called every time the parser encounters an opening tag."""
        if tag == 'b':
            self.in_word_label = True
        elif tag == 'p':
            self.in_definition = True
            # A new paragraph never continues the previous definition,
            # even if the previous <p> was left unclosed.
            self._continuing = False

    def handle_endtag(self, tag):
        """Called every time the parser encounters a closing tag."""
        if tag == 'b':
            self.in_word_label = False
        elif tag == 'p':
            self.in_definition = False
            self._continuing = False

    def handle_data(self, data):
        """Called with the text content found between tags.

        e.g. 'Some word' in '<p>Some word</p>'.
        """
        if self.in_word_label:
            # Inside a <b> tag: this is the word itself.
            self.latest_word = data.lower()
            self.has_definition = True
            self._continuing = False
        elif self.in_definition and self.has_definition:
            # First text chunk of a <p> which also contained a <b> tag.
            dictionary[self.latest_word] = data
            self.has_definition = False
            self._continuing = True
        elif self.in_definition and self._continuing:
            # Further chunks of the same definition (after inline markup).
            dictionary[self.latest_word] += data
# Run the parser: read the whole file, feed it to the parser, then close
# the parser so any buffered trailing text is processed before printing.
parser = html_to_dict_parser()
with open('dict.html') as html_file:
    parser.feed(html_file.read())
parser.close()
print(dictionary)
使用上面的 html 创建的 dict 的示例输出:
{'abbajare': ', per dimostrar gridando. Inf. VII, 43.', 'abbandonare': ', per lasciare una impresa difficile: Par.\nXVIII, 9.', 'abbagliato': ' (l’), sanese, uomo goloso che consu'}
现在应该很容易在字典中搜索您选择的单词。例如,如果您想通过命令行参数把要查询的单词传给脚本(例如 `$ python3 dict_parser.py abbandonare`),可以扩展上面的程序以搜索传入的单词:
import sys

# Look up every word given on the command line. Keys in 'dictionary' are
# stored lowercased, so lowercase the query before the lookup; the word is
# echoed back exactly as the user typed it.
for word in sys.argv[1:]:
    key = word.lower()
    if key in dictionary:
        print(word, ':', dictionary[key])
    else:
        print(word, "not found in dictionary.")
更多信息:
HTML 解析器模块的文档:https://docs.python.org/3/library/html.parser.html