试图刮一个Y!组和我可以从一页获取数据,仅此而已。我有一些基本规则,但显然它们是不正确的。有人已经解决了这个问题吗?
class YgroupSpider(CrawlSpider):
name = "yahoo.com"
allowed_domains = ["launch.groups.yahoo.com"]
start_urls = [
"http://launch.groups.yahoo.com/group/random_public_ygroup/post"
]
rules = (
Rule(SgmlLinkExtractor(allow=('message','messages' ), deny=('mygroups', ))),
Rule(SgmlLinkExtractor(), callback='parse_item'),
)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('/html')
item = Item()
for site in sites:
item = YgroupItem()
item['title'] = site.select('//title').extract()
item['pubDate'] = site.select('//abbr[@class="updated"]/text()').extract()
item['desc'] = site.select("//div[contains(concat(' ',normalize-space(@class),' '),' entry-content ')]/text()").extract()
return item