我正在运行Scrapy
(Mac OSX Lion 10.7.5
以防万一)
以下是我的爬虫:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from BoxOfficeMojo.items import BoxofficemojoItem
from BoxOfficeMojo.items import ActorItem
class MojoSpider(BaseSpider):
name = 'MojoSpider'
allowed_domains = ['boxofficemojo.com']
start_urls = ['http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&p=.htm']
def parse(self, response):
items = []
movie = BoxofficemojoItem()
hxs = HtmlXPathSelector(response)
print ('hxs:', hxs)
links = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/@href').extract() #was previously
titles = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/b/text()').extract()
gross = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[3]/font/text()').extract()
opening = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[7]/font//text()').extract()
for item in gross:
if 'Total' in item:
gross.remove(item)
items = []
for i in range(len(links)):
movie['title'] = titles[i]
movie['link'] = 'http://www.boxofficemojo.com' + links[i]
movie['gross'] = gross[i]
movie['release_date'] = opening[i]
items.append(movie)
return items
这是我的MySQL
管道:
import sys; sys.path.append("/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages")
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class BoxofficemojoPipeline(object):
def __init__(self):
self.conn = MySQLdb.connect(user='testuser', passwd='test', db='testdb', host='localhost', charset='utf8', use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO example_movie (title, link, gross, release_date) VALUES (%s, %s, %s, %s)""", (item['title'], item['link'], item['gross'], item['release_date']))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
当我查看页面中的条目时,MySQL Database
页面中应该存在的电影数量,但它们都是同一部电影,Act of Worship
,这是页面上的最后一部电影。欢迎任何和所有建议!感谢您的关注!