0

我想将我抓取的数据发送到 MySQL 数据库。这是我的pipelines.py

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        item['image_paths'] = image_paths
        return item


class MySQLPipeline(object):
    
    def __init__(self):
        self.conn = MySQLdb.connect(user='testuser', passwd='megablock:333', db='crawl', host='localhost', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
        
    
    def process_item(self, item, spider):    
        
        guid = self._get_guid(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        
        try:
            self.cursor.execute("""INSERT INTO produits (guid, now, product, link, price, description, image_urls, image_paths, brand, couleur, gamme, largeur, profondeur, hauteur, longueur, diametre)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
             (guid, now, item['product'], item['link'], item['price'], item['description'], item['image_urls'], item['image_paths'], item['brand'], item['couleur'], item['gamme'], item['largeur'], item['profondeur'], item['hauteur'], item['longueur'], item['diametre']))

            self.conn.commit()
            print "Data sent to db"

        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])


        return item
        
    def _get_guid(self, item):
        """Generates an unique identifier for a given item."""
        # hash based solely in the url field
        return md5(item['link']).hexdigest()

这是我的item.py

class IkeacrawlItem(scrapy.Item):
    product = scrapy.Field()
    link = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    brand = scrapy.Field()
    couleur = scrapy.Field()
    gamme = scrapy.Field()
    largeur = scrapy.Field()
    profondeur = scrapy.Field()
    hauteur = scrapy.Field()
    longueur = scrapy.Field()
    diametre = scrapy.Field()
    image_paths = scrapy.Field()

并且,如果需要,我的数据库结构:

CREATE TABLE produits (
  guid CHAR(32) PRIMARY KEY,
  product VARCHAR(30),
  link TEXT,
  price FLOAT(5,2) UNSIGNED,
  description VARCHAR(100),
  image_urls TEXT,
  image_paths TEXT,
  brand VARCHAR(30),
  couleur VARCHAR(150),
  gamme TEXT,
  largeur FLOAT(5,2) UNSIGNED,
  profondeur FLOAT(5,2) UNSIGNED,
  hauteur FLOAT(5,2) UNSIGNED,
  longueur FLOAT(5,2) UNSIGNED,
  diametre FLOAT(5,2) UNSIGNED,
  now DATETIME
) DEFAULT CHARSET=utf8;

刮擦本身效果很好。我已经成功单独发送了前三个元素,它也很好,表明它不是MySQL连接的问题。我认为它可能与并发问题有关,但即使抓取速度很慢,它也不起作用。不过,这是我收到的错误消息:

2016-11-08 11:16:41 [scrapy] DEBUG: Crawled (200) <GET http://www.ikea.com/fr/fr/catalog/products/30262095/> (referer: http://www.ikea.com/fr/fr/catalog/productsaz/6/)
2016-11-08 11:16:41 [scrapy] DEBUG: File (uptodate): Downloaded image from <GET http://www.ikea.com/fr/fr/images/products/gunnern-gueridon-blanc__0242398_PE381794_S4.JPG> referred in <None>
Error 1241: Operand should contain 1 column(s)
2016-11-08 11:16:41 [scrapy] DEBUG: Scraped from <200 http://www.ikea.com/fr/fr/catalog/products/30262095/>
{'brand': 'IKEA',
 'couleur': [u'blanc', u'gris'],
 'description': u'Gu\xe9ridon',
 'diametre': 0.0,
 'gamme': u'\xc9tag\xe8res',
 'hauteur': u'74.7',
 'image_paths': 'images',
 'image_urls': [u'http://www.ikea.com/fr/fr/images/products/gunnern-gueridon-blanc__0242398_PE381794_S4.JPG'],
 'images': [{'checksum': '70fce09525b1489155a34cbc55ce6729',
             'path': 'full/275b007d3103babddff8e46fdde102d42fcae8a1.jpg',
             'url': 'http://www.ikea.com/fr/fr/images/products/gunnern-gueridon-blanc__0242398_PE381794_S4.JPG'}],
 'largeur': 0.0,
 'link': 'http://www.ikea.com/fr/fr/catalog/products/30262095/',
 'longueur': 0.0,
 'price': u'35',
 'product': u'GUNNERN',
 'profondeur': 0.0}
2016-11-08 11:16:45 [scrapy] DEBUG: Crawled (200) <GET http://www.ikea.com/fr/fr/catalog/products/10288195/> (referer: http://www.ikea.com/fr/fr/catalog/productsaz/6/)
2016-11-08 11:16:45 [scrapy] DEBUG: File (uptodate): Downloaded image from <GET http://www.ikea.com/fr/fr/images/products/gunnern-armoire-verrouillable-rouge__0275675_PE413909_S4.JPG> referred in <None>
Error 1064: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '), 'Meubles à miroir', '32', '10.4', '32', 0, 0)' at line 2
2016-11-08 11:16:45 [scrapy] DEBUG: Scraped from <200 http://www.ikea.com/fr/fr/catalog/products/10288195/>
{'brand': 'IKEA',
 'couleur': [],
 'description': u'Armoire verrouillable',
 'diametre': 0.0,
 'gamme': u'Meubles \xe0 miroir',
 'hauteur': u'32',
 'image_paths': 'images',
 'image_urls': [u'http://www.ikea.com/fr/fr/images/products/gunnern-armoire-verrouillable-rouge__0275675_PE413909_S4.JPG'],
 'images': [{'checksum': 'c95f7095c0b6645b5780bff39b47e3d2',
             'path': 'full/20c2afef8324d798baac384970e9efb4557f9f19.jpg',
             'url': 'http://www.ikea.com/fr/fr/images/products/gunnern-armoire-verrouillable-rouge__0275675_PE413909_S4.JPG'}],
 'largeur': u'32',
 'link': 'http://www.ikea.com/fr/fr/catalog/products/10288195/',
 'longueur': 0.0,
 'price': u'25',
 'product': u'GUNNERN',
 'profondeur': u'10.4'}

我有两个不同的错误消息,我不明白为什么。在整个抓取过程中,它们交替出现。你能帮我么 ?

谢谢

4

0 回答 0