这是我的代码,有人可以帮忙看一下吗?不知道为什么,爬虫(spider)可以运行,但实际上并没有抓取到论坛帖子。我想从 start_urls 中指定的那个论坛出发,提取其中所有论坛主题帖里的全部文本。
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from xbox.items import xboxItem
from scrapy.item import Item
from scrapy.conf import settings
class xboxSpider(CrawlSpider):
    """Crawl one forums.xbox.com forum and scrape the posts of its threads.

    The crawl starts at the forum index listed in ``start_urls``.  Links are
    picked up by the two entries in ``rules``:

    * links matching ``/t/<digits>`` are downloaded and handed to
      :meth:`parse_thread`;
    * links matching ``/t/new?new_start=<digits>`` have no callback, so they
      are only followed to discover further links (per the Scrapy ``Rule``
      documentation, ``follow`` defaults to True when no callback is given).
    """

    name = "xbox"
    allowed_domains = ["forums.xbox.com"]
    start_urls = [
        "http://forums.xbox.com/xbox_forums/xbox_360_games/e_k/gearsofwar3/default.aspx",
    ]
    # Raw strings keep the regex escapes (\d, \?) out of Python's own
    # string-escape processing; the pattern values are unchanged.
    rules = [
        Rule(SgmlLinkExtractor(allow=[r'/t/\d+']), callback='parse_thread'),
        Rule(SgmlLinkExtractor(allow=(r'/t/new\?new_start=\d+',))),
    ]

    def parse_thread(self, response):
        """Build one ``xboxItem`` from a downloaded thread page.

        :param response: the response Scrapy downloaded for a thread URL.
        :returns: an ``xboxItem`` whose ``content`` field is the list of text
            nodes found in ``<p>`` children of the post-content ``<div>``
            elements, and whose ``date`` field is the list of text nodes of
            ``<span class="value">`` elements.
        """
        hxs = HtmlXPathSelector(response)
        item = xboxItem()
        # The original called ``hxs.selec`` (typo), which raised
        # AttributeError before any item could be returned.
        item['content'] = hxs.select(
            "//div[@class='post-content user-defined-markup']/p/text()"
        ).extract()
        item['date'] = hxs.select("//span[@class='value']/text()").extract()
        return item
日志输出:
2013-03-13 11:22:18-0400 [scrapy] DEBUG: Enabled item pipelines:
2013-03-13 11:22:18-0400 [xbox] INFO: Spider opened
2013-03-13 11:22:18-0400 [xbox] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-03-13 11:22:18-0400 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2013-03-13 11:22:18-0400 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2013-03-13 11:22:20-0400 [xbox] DEBUG: Crawled (200) <GET forums.xbox.com/xbox_forums/xbox_360_games/e_k/gearsofwar3/f/…> (referer: None)
2013-03-13 11:22:20-0400 [xbox] DEBUG: Filtered offsite request to 'forums.xbox.com': <GET forums.xbox.com/xbox_forums/xbox_360_games/e_k/gearsofwar3/f/…>
2013-03-13 11:22:20-0400 [xbox] INFO: Closing spider (finished)
2013-03-13 11:22:20-0400 [xbox] INFO: Dumping spider stats