我正在使用 Django 和 Scrapy 制作一个在线商家目录项目。Scrapy 只用于抓取数据并将其存入数据库，Django 部分与 Scrapy 没有直接关联。我需要从商家目录 http://directory.thesun.co.uk/find/uk/computer-repair 中抓取商家详细信息。问题是它只抓取了其中一个页面的详细信息，而没有抓取其他页面的。请帮忙解决这个问题。
请查看我的代码并帮助我解决它
蜘蛛代码::
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
class ProjectSpider(BaseSpider):
    """Scrape business listings from the computer-repair directory pages.

    The spider visits the ten result pages listed in ``start_urls`` and
    builds one ``Project2Item`` per listing container found on each page.
    """

    name = "project2spider"
    # Scrapy expects bare domain names here, not URLs. A value carrying a
    # scheme and trailing slash never matches a request's host name, so the
    # offsite filter would reject requests for links followed from a page.
    allowed_domains = ["directory.thesun.co.uk"]
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/2',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/3',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/4',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/5',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/6',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/7',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/8',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/9',
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/10',
    ]

    def parse(self, response):
        """Extract one item per listing container in ``response``.

        Every field is stored as the list returned by ``extract()``, so a
        field is an empty list when its XPath matches nothing.

        Returns:
            A list of ``Project2Item`` objects, one per matched container.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        # NOTE: the class value includes a trailing space; it is kept exactly
        # as written because @class= is an exact string comparison.
        for site in hxs.select('//div[@class="abTbl "]'):
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            items.append(item)
        return items
我的 items.py 代码如下::
from scrapy.item import Item, Field
class Project2Item(Item):
    """Container for one scraped business listing.

    Field names are spelled exactly as the spider and the storage pipeline
    reference them, so they must not be renamed here in isolation.
    """

    # Fields written to the crawlerapp_directory table by the pipeline.
    Catogory = Field()
    Bussiness_name = Field()
    Description = Field()
    Number = Field()
    Web_url = Field()

    # Field written to the crawlerapp_adress table by the pipeline.
    adress_name = Field()

    # Fields written to the crawlerapp_photos table by the pipeline.
    Photo_name = Field()
    Photo_path = Field()
我的 settings.py 是:::
# Scrapy settings for the project2 bot.
BOT_NAME = 'project2'

# The same package is used for locating spiders and for new spiders.
NEWSPIDER_MODULE = 'project2.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Pipelines applied to every scraped item.
# NOTE(review): newer Scrapy releases expect a dict mapping each pipeline
# path to an integer order instead of a sequence -- confirm against the
# installed Scrapy version before changing the type.
ITEM_PIPELINES = ('project2.pipelines.MySQLStorePipeline',)
我的管道代码是::
from scrapy import log
#from scrapy.core.exceptions import DropItem
from twisted.enterprise import adbapi
import MySQLdb.cursors
class MySQLStorePipeline(object):
    """Item pipeline that stores each scraped listing in MySQL.

    Inserts run through a Twisted ``adbapi`` connection pool, so the
    database work happens in a pool thread rather than blocking the crawl.
    """

    def __init__(self):
        # hardcoded db settings
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            db='project',
            user='root',
            passwd='',
            host='127.0.0.1',
            port=3306,  # MySQLdb requires an integer port, not a string
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True,
        )

    @staticmethod
    def _first(values, default=''):
        """Return the first element of ``values``, or ``default`` if empty.

        The spider stores every field as the list returned by ``extract()``,
        which is empty when the XPath matched nothing on a listing.
        """
        return values[0] if values else default

    def process_item(self, item, spider):
        """Schedule the database insert for ``item`` and pass the item on.

        The insert is asynchronous; failures are reported through
        ``handle_error`` and do not stop the item from being returned.
        """
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        """Insert the listing, its address and its photo in one interaction.

        Args:
            tx: the adbapi transaction object; attribute access is forwarded
                to the underlying database cursor.
            item: the scraped item whose fields are lists of extracted values.
        """
        first = self._first
        # Missing values are stored as '' instead of raising IndexError,
        # which previously aborted the whole interaction for that listing.
        # NOTE(review): assumes the columns accept an empty string -- confirm
        # against the table definitions.
        tx.execute(
            "insert into crawlerapp_directory (Catogory, Bussiness_name, Description, Number, Web_url) "
            "values (%s, %s, %s, %s, %s)",
            (
                first(item['Catogory']),
                first(item['Bussiness_name']),
                first(item['Description']),
                first(item['Number']),
                first(item['Web_url']),
            ),
        )
        # execute() returns the affected-row count; the generated primary key
        # of the row just inserted is exposed by the cursor as lastrowid.
        insert_id = tx.lastrowid
        tx.execute(
            "insert into crawlerapp_adress (directory_id, adress_name) "
            "values (%s, %s)",
            (insert_id, first(item['adress_name'])),
        )
        tx.execute(
            "insert into crawlerapp_photos (directory_id, Photo_path, Photo_name) "
            "values (%s, %s, %s)",
            (
                insert_id,
                first(item['Photo_path']),
                first(item['Photo_name']),
            ),
        )
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        """Log a failure raised by the asynchronous insert."""
        log.err(e)
请帮助我把其他页面的数据也抓取下来。