正如 Chris-Top 所说,使用 BeautifulSoup 是解决这个问题的正确方法:
import re

# Beautiful Soup 4 is required: this script relies on bs4-only methods
# (new_string, replace_with, insert_after) that the legacy ``BeautifulSoup``
# (version 3) module does not provide.
from bs4 import BeautifulSoup, Tag, NavigableString

# Sample fragment to annotate. Some words are already wrapped in <a> tags and
# "Dog" also appears inside an attribute value (the <img> title).
html = """
<div>
<p>The quick brown <a href='http://en.wikipedia.org/wiki/Dog'>fox</a> jumped over the lazy Dog</p>
<p>The <a href='http://en.wikipedia.org/wiki/Dog'>dog</a>, who was, in reality, not so lazy, gave chase to the fox.</p>
<p>See image for reference:</p>
<img src='dog_chasing_fox.jpg' title='Dog chasing fox'/>
</div>
"""

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (lxml, for example, would wrap the fragment
# in <html><body>).
soup = BeautifulSoup(html, "html.parser")

# (search term, URL the term should link to)
keywords = [
    ("dog", "http://en.wikipedia.org/wiki/Dog"),
    ("fox", "http://en.wikipedia.org/wiki/Fox"),
]
def insertLinks(string_value, string_href):
    """Turn occurrences of ``string_value`` in the global ``soup`` into links.

    Every text node of the module-level ``soup`` that contains
    ``string_value`` (case-insensitively) and whose direct parent is not
    already an ``<a>`` tag is split around each occurrence; each occurrence
    is replaced by ``<a href=string_href>string_value</a>``.

    Args:
        string_value: Literal search term. It is escaped before being used
            as a regular expression and is also used verbatim as the link
            text, so the casing of the matched text is replaced by the
            casing of ``string_value``.
        string_href: Value for the ``href`` attribute of the created links.

    Returns:
        None. ``soup`` is modified in place.
    """
    # Compile once; escape so terms such as "c++" or "a.b" match literally.
    pattern = re.compile(re.escape(string_value), re.IGNORECASE)

    for t in soup.findAll(text=pattern):
        if t.parent.name == 'a':
            # Already linked text is left untouched.
            continue

        pieces = pattern.split(t)
        if len(pieces) < 2:
            continue

        # Rebuild the node sequence: piece, link, piece, link, ..., piece.
        # Using every piece (not just the first two) keeps the text that
        # follows a second or later occurrence instead of dropping it.
        nodes = []
        for index, piece in enumerate(pieces):
            if index:
                a = soup.new_tag('a')
                a['href'] = string_href
                a.string = string_value
                nodes.append(a)
            if piece:
                nodes.append(soup.new_string(piece))

        # Swap the original text node for the first new node, then chain the
        # remaining nodes after it in document order.
        previous = nodes[0]
        t.replace_with(previous)
        for node in nodes[1:]:
            previous.insert_after(node)
            previous = node
# Link every configured search term, then emit the resulting markup.
for term, href in keywords:
    insertLinks(term, href)

# The call form works on Python 3 and, with a single argument, prints the
# same thing on Python 2 as the old ``print soup`` statement.
print(soup)
运行后将输出:
<div>
<p>The quick brown <a href="http://en.wikipedia.org/wiki/Dog">fox</a> jumped over the lazy <a href="http://en.wikipedia.org/wiki/Dog">dog</a></p>
<p>The <a href="http://en.wikipedia.org/wiki/Dog">dog</a>, who was, in reality, not so lazy, gave chase to the <a href="http://en.wikipedia.org/wiki/Fox">fox</a>.</p>
<p>See image for reference:</p>
<img src="dog_chasing_fox.jpg" title="Dog chasing fox"/>
</div>