from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class DmozSpider(BaseSpider):
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//ul/li')
for site in sites:
title = site.select('a/text()').extract()
link = site.select('a/@href').extract()
desc = site.select('text()').extract()
print title, link, desc
这是我的代码。我想要大量的 URL 来使用循环抓取。那么我应该怎么做这些呢?我确实在那里放了多个网址,但我没有从所有网址中获得输出。某些 URL 停止响应。那么如何使用此代码确定获取数据呢?