I have 2 spiders and I run them like this:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
settings = get_project_settings()
process1 = CrawlerProcess(settings)
process1.crawl('spider1')
process1.crawl('spider2')
process1.start()
and I want these spiders to write to a common file.
This is the pipeline class:
import codecs
import json
from collections import OrderedDict

class FilePipeline(object):

    def __init__(self):
        self.file = codecs.open('data.txt', 'w', encoding='utf-8')
        self.spiders = []

    def open_spider(self, spider):
        self.spiders.append(spider.name)

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        self.spiders.remove(spider.name)
        if len(self.spiders) == 0:
            self.file.close()
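For reference, the pipeline is enabled for both spiders via ITEM_PIPELINES in settings.py in the usual way; the dotted path below is only a placeholder for the real project module:

ITEM_PIPELINES = {
    'myproject.pipelines.FilePipeline': 300,  # placeholder path, adjust to the project layout
}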
Although I don't get any error message, when all spiders are done, the common file has fewer lines (items) than the scrapy log reports; a few lines are cut off. Is there a recommended practice for writing to one file simultaneously from two spiders?
UPDATE:
Thanks, everybody! I implemented it this way:
import codecs
import json
import threading
from collections import OrderedDict
# VehicleItem is imported from the project's items module.

class FilePipeline1(object):
    # Class-level lock and file handle, shared by every pipeline instance
    # (Scrapy creates one pipeline instance per crawler/spider).
    lock = threading.Lock()
    datafile = codecs.open('myfile.txt', 'w', encoding='utf-8')

    def __init__(self):
        pass

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        try:
            FilePipeline1.lock.acquire()
            if isinstance(item, VehicleItem):
                FilePipeline1.datafile.write(line)
        except:
            pass
        finally:
            FilePipeline1.lock.release()
        return item

    def spider_closed(self, spider):
        pass
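A slightly tidier sketch of process_item would use the lock as a context manager, so it is always released even if the write fails, instead of the manual acquire/release with a bare except; this assumes VehicleItem is the project's item class:

    def process_item(self, item, spider):
        line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
        # 'with' acquires the shared class-level lock and guarantees it is
        # released even if the write raises an exception.
        with FilePipeline1.lock:
            if isinstance(item, VehicleItem):
                FilePipeline1.datafile.write(line)
        return item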