尝试将 node() 与 Join() 结合使用:
loader.get_xpath('//div[@class="short-description"]/node()', Join())
结果类似如下:
>>> from scrapy.contrib.loader import XPathItemLoader
>>> from scrapy.contrib.loader.processor import Join
>>> from scrapy.http import HtmlResponse
>>>
>>> body = """
... <html>
... <div class="short-description">
... {some mess with text, <br>, other html tags, etc}
... <div>
... <p>{some mess with text, <br>, other html tags, etc}</p>
... </div>
... <p>{some mess with text, <br>, other html tags, etc}</p>
... </div>
... </html>
... """
>>> response = HtmlResponse(url='http://example.com/', body=body)
>>>
>>> loader = XPathItemLoader(response=response)
>>>
>>> print loader.get_xpath('//div[@class="short-description"]/node()', Join())
{some mess with text, <br> , other html tags, etc}
<div>
<p>{some mess with text, <br>, other html tags, etc}</p>
</div>
<p>{some mess with text, <br>, other html tags, etc}</p>
>>>
>>> loader.get_xpath('//div[@class="short-description"]/node()', Join())
u'\n {some mess with text, <br> , other html tags, etc}\n <div>\n <p>{some mess with text, <br>, other html tags, etc}</p>\n </div> \n <p>{some mess with text, <br>, other html tags, etc}</p> \n'