对于仍在使用 Python 2.7 和 BeautifulSoup 3 的人来说,下面这个函数的目标是尽可能去除多余的标记(注释、CDATA、声明、处理指令和标签),只保留文本内容:
from django.utils.html import escape
from django.utils.text import Truncator
from BeautifulSoup import MinimalSoup, CData, Comment, Declaration, ProcessingInstruction, \
BeautifulStoneSoup
...
def get_text(self):
    """Return the text content of ``self.htmlxml``, escaped and truncated.

    The markup is parsed with ``MinimalSoup`` (all entities converted).
    Comments, CDATA sections, declarations and processing instructions are
    removed from the tree. The remaining document is rendered, tags and
    newlines are stripped, the result is passed through ``escape`` and then
    shortened with ``Truncator(...).chars(limit)``.

    NOTE(review): ``limit`` is not defined inside this function; it is
    presumably provided by an enclosing or module-level scope -- confirm.
    """
    # Local import: no ``import re`` is visible alongside this function, and
    # a function-scope import is harmless if the module already imports it.
    import re

    soup = MinimalSoup(self.htmlxml, convertEntities=BeautifulStoneSoup.ALL_ENTITIES)

    # One traversal collects every non-content text node; findAll returns a
    # materialized list, so extracting while looping over it is safe.
    non_content = (Comment, CData, Declaration, ProcessingInstruction)
    for node in soup.findAll(text=lambda text: isinstance(text, non_content)):
        node.extract()

    # ``[^>]*`` also matches line breaks, so a tag that contains a newline
    # (e.g. inside an attribute value) is removed whole; ``.`` would stop at
    # the newline and leave the tag in the output.
    text = re.sub(r'<[^>]*>', '', soup.renderContents())
    text = text.replace('\n', '')
    return Truncator(escape(text)).chars(limit)