0

I am building a project with Scrapy in which I need to scrape business details from a business directory: http://directory.thesun.co.uk/find/uk/computer-repair
The problem is that it is not fetching proper results.

Please see my code and help me to solve it

spider code::

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item

class ProjectSpider(BaseSpider):
    """Spider that scrapes computer-repair listings from The Sun directory.

    It visits the first ten result pages and produces one ``Project2Item``
    per listing container found on each page.
    """

    name = "project2spider"
    # Scrapy expects bare domain names here, not URLs; a value with a scheme
    # or trailing slash never matches a request's host, so any followed link
    # would be treated as off-site.
    allowed_domains = ["directory.thesun.co.uk"]
    # Page 1 has no "/page/N" suffix; pages 2-10 do.
    start_urls = ['http://directory.thesun.co.uk/find/uk/computer-repair'] + [
        'http://directory.thesun.co.uk/find/uk/computer-repair/page/%d' % page
        for page in range(2, 11)
    ]

    def parse(self, response):
        """Extract one item per listing block in ``response``.

        Every field is stored as the list returned by ``extract()``; a field
        whose XPath matches nothing is therefore an empty list.

        Returns:
            list of ``Project2Item``.
        """
        hxs = HtmlXPathSelector(response)
        # NOTE: the class value contains a trailing space ("abTbl "), exactly
        # as the selector was originally written.
        sites = hxs.select('//div[@class="abTbl "]')
        items = []
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            items.append(item)
        return items

My items.py code is as follows::

from scrapy.item import Item, Field

class Project2Item(Item):
    """Container for one business listing scraped from the directory.

    The field names (including their spelling) are the keys used by the
    spider when filling the item and by the pipeline when reading it.
    """

    # Listing details.
    Catogory = Field()
    Bussiness_name = Field()
    Description = Field()
    Number = Field()
    Web_url = Field()
    # Address text fragments.
    adress_name = Field()
    # Listing image: alt text and source URL.
    Photo_name = Field()
    Photo_path = Field()

My settings.py is:

# Scrapy settings for the project2 crawler.
BOT_NAME = 'project2'

# The spiders live in a single package, which is also the target for newly
# generated spiders.
NEWSPIDER_MODULE = 'project2.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Pipelines applied to every scraped item.
ITEM_PIPELINES = ('project2.pipelines.MySQLStorePipeline',)

My Pipeline code is::

from scrapy import log
#from scrapy.core.exceptions import DropItem
from twisted.enterprise import adbapi

import MySQLdb.cursors


class MySQLStorePipeline(object):
    """Scrapy item pipeline that stores each scraped listing in MySQL.

    Inserts run through a Twisted ``adbapi`` connection pool. Each item
    produces one row in ``crawlerapp_directory`` plus one child row in
    ``crawlerapp_adress`` and one in ``crawlerapp_photos``.
    """

    def __init__(self):
        # Hardcoded db settings.
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            db='project',
            user='root',
            passwd='',
            host='127.0.0.1',
            # MySQLdb requires an integer port; passing the string '3306'
            # makes connect() fail with "TypeError: an integer is required".
            port=3306,
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True,
        )

    def process_item(self, item, spider):
        """Schedule the inserts for ``item`` and return the item unchanged.

        The database work is started via ``runInteraction``; failures are
        passed to ``handle_error`` rather than raised here.
        """
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)

        return item

    @staticmethod
    def _first(values):
        """Return the first extracted value, or None if nothing was extracted."""
        return values[0] if values else None

    def _conditional_insert(self, tx, item):
        """Insert the directory row and its address/photo child rows.

        Args:
            tx: adbapi transaction (cursor-like) supplied by runInteraction.
            item: scraped item whose fields are lists of extracted strings.

        Fields with no extracted value are stored as NULL instead of raising
        IndexError, since some listings have no phone number or photo.
        """
        first = self._first

        tx.execute(
            "insert into crawlerapp_directory (Catogory, Bussiness_name, Description, Number, Web_url) "
            "values (%s, %s, %s, %s, %s)",
            (first(item['Catogory']),
             first(item['Bussiness_name']),
             first(item['Description']),
             first(item['Number']),
             first(item['Web_url']),
             )
            )
        # execute() returns the number of affected rows, not the new primary
        # key; the generated id of the row just inserted is in lastrowid.
        insert_id = tx.lastrowid

        tx.execute(
            "insert into crawlerapp_adress (directory_id, adress_name) "
            "values (%s, %s)",
            (insert_id,
             first(item['adress_name'])
             )
            )

        tx.execute(
            "insert into crawlerapp_photos (directory_id, Photo_path, Photo_name) "
            "values (%s, %s, %s)",
            (insert_id,
             first(item['Photo_path']),
             first(item['Photo_name'])
             )
            )
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        """Log a failure raised by the database interaction."""
        log.err(e)

When I run the spider from the command line, I get the following exception:

""--- <exception caught here> ---
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/threadpool.py", line 191, in _worker
    result = context.call(ctx, function, *args, **kwargs)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/context.py", line 118, in callWithContext
    return self.currentContext().callWithContext(ctx, func, *args, **kw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/context.py", line 81, in callWithContext
    return func(*args,**kw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 445, in _runInteraction
    conn = self.connectionFactory(self)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 38, in __init__
    self.reconnect()
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 75, in reconnect
    self._connection = self._pool.connect()
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 414, in connect
    conn = self.dbapi.connect(*self.connargs, **self.connkw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/MySQLdb/__init__.py", line 81, in Connect
    return Connection(*args, **kwargs)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/MySQLdb/connections.py", line 187, in __init__
    super(Connection, self).__init__(*args, **kwargs2)
exceptions.TypeError: an integer is required

2013-07-23 10:33:08+0530 [project2spider] DEBUG: Crawled (200) <GET     http://directory.thesun.co.uk/find/uk/computer-repair/page/2> (referer: None)
2013-07-23 10:33:08+0530 [project2spider] DEBUG: Scraped from <200 http://directory.thesun.co.uk/find/uk/computer-repair/page/2>
{'Bussiness_name': [u'Camden Laptop Repair Center Ltd'],
 'Catogory': [u'Computer Repair and Maintenance Services'],
 'Description': [u'Welcome to Camden Laptop Repair Centre Ltd. We specialize in laptop and desktop repairs.\nIf you h...'],
 'Number': [u'phone_11072240'],
 'Photo_name': [u'Hands On IT'],
 'Photo_path': [u'http://asset04.scoot.co.uk/S/94/20121127154348aa03d70cb8175721fe5ba4b02b1278c0.jpg'],
 'Web_url': [u'/email/company/11072240/contact/10840012'],
 'adress_name': [u'16B Pratt Street, Camden Town', u'LONDON', u'NW1 0AB']}
 2013-07-23 10:33:08+0530 [project2spider] DEBUG: Scraped from <200 http://directory.thesun.co.uk/find/uk/computer-repair/page/2>
{'Bussiness_name': [u'Apple Mac Mechanic'],
 'Catogory': [u'Computer Support & Services'],
 'Description': [u'Apple Mac Mechanic\r\n\r\nMac Repairs and Mac Technical Support based in Oxfordshire.\r\nWe are happy t...'],
 'Number': [u'phone_12495342'],
 'Photo_name': [u''],
 'Photo_path': [u'http://asset04.scoot.co.uk/L/7/2010102400000012495342ca09b8e6254b29ac5fae555ec5c838f1.jpg'],
 'Web_url': [u'/email/company/12495342/contact/5566054',
             u'http://www.applemacmechanic.co.uk'],
 'adress_name': [u'Lane Farm, Warpsgrove Lane, Chalgrove',
                 u'OXFORD',
                 u'OX44 7RW']}
 2013-07-23 10:33:08+0530 [project2spider] DEBUG: Scraped from <200 http://directory.thesun.co.uk/find/uk/computer-repair/page/2>
{'Bussiness_name': [u'Pentagon Computer Solutions Ltd'],
 'Catogory': [u'Computer Repair and Maintenance Services'],
 'Description': [u'IT Support, Maintenance and Computer Repair in the North East, we specialise in supporting and ma...'],
 'Number': [u'phone_16421610'],
 'Photo_name': [u'Pentagon Computer Solutions Limited'],
 'Photo_path': [u'http://asset04.scoot.co.uk/S/50/201301171657414b074f64efbae41b3f17e6dc5f02b01f.jpg'],
 'Web_url': [u'/email/company/16421610/contact/11420654'],
 'adress_name': [u'Clavering House , Clavering Place',
                 u'NEWCASTLE UPON TYNE',
                 u'Tyne and Wear',
                 u'NE1 3NG']}
 2013-07-23 10:33:08+0530 [project2spider] DEBUG: Scraped from <200 http://directory.thesun.co.uk/find/uk/computer-repair/page/2>
{'Bussiness_name': [u'One It Support ', u'Read More'],
 'Catogory': [u'Computer Services'],
 'Description': [u'\xa0'],
 'Number': [],
 'Photo_name': [],
 'Photo_path': [],
 'Web_url': [u'/email/company/16729910/contact/'],
 'adress_name': [u'1b St Oswins Place',
                 u'NORTH SHIELDS',
                 u'Tyne and Wear',
                 u'NE30 4RQ']}
   2013-07-23 10:33:08+0530 [project2spider] DEBUG: Scraped from <200 http://directory.thesun.co.uk/find/uk/computer-repair/page/2>
{'Bussiness_name': [u'Uk Green Angels Cic'],
 'Catogory': [u'Computer Consumables Suppliers'],
 'Description': [u'Located in the busy Walton area of Liverpool, UK Green Angels Recycling CIC was formed in January...'],
 'Number': [u'phone_14525460'],
 'Photo_name': [u'Logo'],
 'Photo_path': [u'http://asset03.scoot.co.uk/C/60/20120518112439f121d135f39f03e48da5fe5e8ced5b0a.jpg'],
 'Web_url': [u'/email/company/14525460/contact/8700320',
             u'http://www.green-angels-recycling.org.uk'],
 'adress_name': [u'Suite G4, Walton Cornerstone, 2 Liston Street. Liverpool',
                 u'LIVERPOOL',
                 u'Merseyside',
                 u'L4 5RT']}
   2013-07-23 10:33:09+0530 [project2spider] DEBUG: Scraped from <200 http://directory.thesun.co.uk/find/uk/computer-repair/page/2>
{'Bussiness_name': [u'One IT Support'],
 'Catogory': [u'Computer Repair and Maintenance Services'],
 'Description': [u'We specialise in the following services:\r\n\r\n    * Data Recovery\r\n    * PC and Laptop Repairs\r\n   ...'],
 'Number': [u'phone_11995160'],
 'Photo_name': [u''],
 'Photo_path': [u'http://asset02.scoot.co.uk/E/74/201010240000001199516065eb50eb908e910c748ab6964ede6d4e.png'],
 'Web_url': [u'/email/company/11995160/contact/3829492'],
 'adress_name': [u'1b St Oswins Place, Tynemouth',
                 u'NORTH SHIELDS',
                 u'NE30 4RQ']}
   2013-07-23 10:33:09+0530 [scrapy] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib64/python2.7/threading.py", line 524, in __bootstrap
    self.__bootstrap_inner()
  File "/usr/lib64/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "/usr/lib64/python2.7/threading.py", line 504, in run
    self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/threadpool.py", line 191, in _worker
    result = context.call(ctx, function, *args, **kwargs)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/context.py", line 118, in callWithContext
    return self.currentContext().callWithContext(ctx, func, *args, **kw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/context.py", line 81, in callWithContext
    return func(*args,**kw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 445, in _runInteraction
    conn = self.connectionFactory(self)
    --- <exception caught here> ---
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/threadpool.py", line 191, in _worker
    result = context.call(ctx, function, *args, **kwargs)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/context.py", line 118, in callWithContext
    return self.currentContext().callWithContext(ctx, func, *args, **kw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/python/context.py", line 81, in callWithContext
    return func(*args,**kw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 445, in _runInteraction
    conn = self.connectionFactory(self)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 38, in __init__
    self.reconnect()
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 75, in reconnect
    self._connection = self._pool.connect()
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/twisted/enterprise/adbapi.py", line 414, in connect
    conn = self.dbapi.connect(*self.connargs, **self.connkw)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/MySQLdb/__init__.py", line 81, in Connect
    return Connection(*args, **kwargs)
  File "/home/pixlie/abhi/local/lib/python2.7/site-packages/MySQLdb/connections.py", line 187, in __init__
    super(Connection, self).__init__(*args, **kwargs2)
exceptions.TypeError: an integer is required

2013-07-23 10:33:09+0530 [project2spider] DEBUG: Crawled (200) <GET   http://directory.thesun.co.uk/find/uk/computer-repair/page/9> (referer: None)
2013-07-23 10:33:09+0530 [project2spider] DEBUG: Crawled (200) <GET http://directory.thesun.co.uk/find/uk/computer-repair/page/10> (referer: None)
2013-07-23 10:33:09+0530 [project2spider] INFO: Closing spider (finished)
2013-07-23 10:33:09+0530 [project2spider] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2574,
 'downloader/request_count': 10,
 'downloader/request_method_count/GET': 10,
 'downloader/response_bytes': 88619,
 'downloader/response_count': 10,
 'downloader/response_status_count/200': 10,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2013, 7, 23, 5, 3, 9, 949186),
 'item_scraped_count': 36,
 'log_count/DEBUG': 52,
 'log_count/ERROR': 36,
 'log_count/INFO': 4,
 'response_received_count': 10,
 'scheduler/dequeued': 10,
 'scheduler/dequeued/memory': 10,
 'scheduler/enqueued': 10,
 'scheduler/enqueued/memory': 10,
 'start_time': datetime.datetime(2013, 7, 23, 5, 3, 7, 251684)}
 2013-07-23 10:33:09+0530 [project2spider] INFO: Spider closed (finished)""

Please help me solve this and get proper results.

4

0 回答 0