我正在尝试抓取这个网站: http: //stats.swehockey.se/ScheduleAndResults/Schedule/3940
我已经(感谢 alecxe)检索日期和团队。
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
date = Field()
teams = Field()
class SchemaSpider(BaseSpider):
name = "schema"
allowed_domains = ["http://stats.swehockey.se/"]
start_urls = [
"http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//table[@class="tblContent"]/tr')
for row in rows:
item = SchemaItem()
item['date'] = row.select('.//td[2]/div/span/text()').extract()
item['teams'] = row.select('.//td[3]/text()').extract()
yield item
所以,我的下一步是过滤掉任何不是“AIK”或“Djurgårdens IF”主场比赛的东西。之后,我需要重新格式化为可以添加到 Google 日历的 .ics 文件。
编辑:所以我已经解决了一些事情,但还有很多事情要做。我的代码现在看起来像这样..
# -*- coding: UTF-8 -*-
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class SchemaItem(Item):
date = Field()
teams = Field()
class SchemaSpider(BaseSpider):
name = "schema"
allowed_domains = ["http://stats.swehockey.se/"]
start_urls = [
"http://stats.swehockey.se/ScheduleAndResults/Schedule/3940"
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//table[@class="tblContent"]/tr')
for row in rows:
item = SchemaItem()
item['date'] = row.select('.//td[2]/div/span/text()').extract()
item['teams'] = row.select('.//td[3]/text()').extract()
for string in item['teams']:
teams = string.split('-') #split it
home_team = teams[0]#.split(' ') #only the first name, e.g. just 'Djurgårdens' out of 'Djurgårdens IF'
away_team = teams[1]
#home_team[0] = home_team[0].replace(" ", "") #remove whitespace
#home_team = home_team[0]
if "AIK" in home_team:
for string in item['date']:
year = string[0:4]
month = string[5:7]
day = string[8:10]
hour = string[11:13]
minute = string[14:16]
print year, month, day, hour, minute, home_team, away_team
elif u"Djurgårdens" in home_team:
for string in item['date']:
year = string[0:4]
month = string[5:7]
day = string[8:10]
hour = string[11:13]
minute = string[14:16]
print year, month, day, hour, minute, home_team, away_team
该代码打印出“AIK”、“Djurgårdens IF”和“Skellefteå AIK”的游戏。所以我的问题显然是如何过滤掉“Skellefteå AIK”游戏,以及是否有任何简单的方法可以让这个程序变得更好。对此有什么想法?
此致!