我想将我抓取的数据发送到 MySQL 数据库。这是我的pipelines.py
:
class MyImagesPipeline(ImagesPipeline):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
item['image_paths'] = image_paths
return item
class MySQLPipeline(object):
def __init__(self):
self.conn = MySQLdb.connect(user='testuser', passwd='megablock:333', db='crawl', host='localhost', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
guid = self._get_guid(item)
now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
try:
self.cursor.execute("""INSERT INTO produits (guid, now, product, link, price, description, image_urls, image_paths, brand, couleur, gamme, largeur, profondeur, hauteur, longueur, diametre)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
(guid, now, item['product'], item['link'], item['price'], item['description'], item['image_urls'], item['image_paths'], item['brand'], item['couleur'], item['gamme'], item['largeur'], item['profondeur'], item['hauteur'], item['longueur'], item['diametre']))
self.conn.commit()
print "Data sent to db"
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
def _get_guid(self, item):
"""Generates an unique identifier for a given item."""
# hash based solely in the url field
return md5(item['link']).hexdigest()
这是我的item.py
:
class IkeacrawlItem(scrapy.Item):
product = scrapy.Field()
link = scrapy.Field()
price = scrapy.Field()
description = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
brand = scrapy.Field()
couleur = scrapy.Field()
gamme = scrapy.Field()
largeur = scrapy.Field()
profondeur = scrapy.Field()
hauteur = scrapy.Field()
longueur = scrapy.Field()
diametre = scrapy.Field()
image_paths = scrapy.Field()
并且,如果需要,我的数据库结构:
CREATE TABLE produits (
guid CHAR(32) PRIMARY KEY,
product VARCHAR(30),
link TEXT,
price FLOAT(5,2) UNSIGNED,
description VARCHAR(100),
image_urls TEXT,
image_paths TEXT,
brand VARCHAR(30),
couleur VARCHAR(150),
gamme TEXT,
largeur FLOAT(5,2) UNSIGNED,
profondeur FLOAT(5,2) UNSIGNED,
hauteur FLOAT(5,2) UNSIGNED,
longueur FLOAT(5,2) UNSIGNED,
diametre FLOAT(5,2) UNSIGNED,
now DATETIME
) DEFAULT CHARSET=utf8;
刮擦本身效果很好。我已经成功单独发送了前三个元素,它也很好,表明它不是MySQL连接的问题。我认为它可能与并发问题有关,但即使抓取速度很慢,它也不起作用。不过,这是我收到的错误消息:
2016-11-08 11:16:41 [scrapy] DEBUG: Crawled (200) <GET http://www.ikea.com/fr/fr/catalog/products/30262095/> (referer: http://www.ikea.com/fr/fr/catalog/productsaz/6/)
2016-11-08 11:16:41 [scrapy] DEBUG: File (uptodate): Downloaded image from <GET http://www.ikea.com/fr/fr/images/products/gunnern-gueridon-blanc__0242398_PE381794_S4.JPG> referred in <None>
Error 1241: Operand should contain 1 column(s)
2016-11-08 11:16:41 [scrapy] DEBUG: Scraped from <200 http://www.ikea.com/fr/fr/catalog/products/30262095/>
{'brand': 'IKEA',
'couleur': [u'blanc', u'gris'],
'description': u'Gu\xe9ridon',
'diametre': 0.0,
'gamme': u'\xc9tag\xe8res',
'hauteur': u'74.7',
'image_paths': 'images',
'image_urls': [u'http://www.ikea.com/fr/fr/images/products/gunnern-gueridon-blanc__0242398_PE381794_S4.JPG'],
'images': [{'checksum': '70fce09525b1489155a34cbc55ce6729',
'path': 'full/275b007d3103babddff8e46fdde102d42fcae8a1.jpg',
'url': 'http://www.ikea.com/fr/fr/images/products/gunnern-gueridon-blanc__0242398_PE381794_S4.JPG'}],
'largeur': 0.0,
'link': 'http://www.ikea.com/fr/fr/catalog/products/30262095/',
'longueur': 0.0,
'price': u'35',
'product': u'GUNNERN',
'profondeur': 0.0}
2016-11-08 11:16:45 [scrapy] DEBUG: Crawled (200) <GET http://www.ikea.com/fr/fr/catalog/products/10288195/> (referer: http://www.ikea.com/fr/fr/catalog/productsaz/6/)
2016-11-08 11:16:45 [scrapy] DEBUG: File (uptodate): Downloaded image from <GET http://www.ikea.com/fr/fr/images/products/gunnern-armoire-verrouillable-rouge__0275675_PE413909_S4.JPG> referred in <None>
Error 1064: You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '), 'Meubles à miroir', '32', '10.4', '32', 0, 0)' at line 2
2016-11-08 11:16:45 [scrapy] DEBUG: Scraped from <200 http://www.ikea.com/fr/fr/catalog/products/10288195/>
{'brand': 'IKEA',
'couleur': [],
'description': u'Armoire verrouillable',
'diametre': 0.0,
'gamme': u'Meubles \xe0 miroir',
'hauteur': u'32',
'image_paths': 'images',
'image_urls': [u'http://www.ikea.com/fr/fr/images/products/gunnern-armoire-verrouillable-rouge__0275675_PE413909_S4.JPG'],
'images': [{'checksum': 'c95f7095c0b6645b5780bff39b47e3d2',
'path': 'full/20c2afef8324d798baac384970e9efb4557f9f19.jpg',
'url': 'http://www.ikea.com/fr/fr/images/products/gunnern-armoire-verrouillable-rouge__0275675_PE413909_S4.JPG'}],
'largeur': u'32',
'link': 'http://www.ikea.com/fr/fr/catalog/products/10288195/',
'longueur': 0.0,
'price': u'25',
'product': u'GUNNERN',
'profondeur': u'10.4'}
我有两个不同的错误消息,我不明白为什么。在整个抓取过程中,它们交替出现。你能帮我么 ?
谢谢