我正在使用 scrapy 来抓取网站数据,但我想将其存储在数据库中。
我当前的代码是这样的:
def start_requests(self):
yield scrapy.Request(self.start_urls, callback=self.parse
meta={"use_splash": False})
def parse(self, response):
for sel in response.xpath('//li'):
item = ProjectjomkerjaItem()
item['title'] = sel.xpath('a/div[@class="position"]/div[@id="job-title-job-listing"]/strong/text()').extract()
item['company'] = sel.xpath('a/div[@class="position"]/div[@class="company"]/strong/text()').extract()
item['link'] = sel.xpath('a/@href').extract()
item['job_type'] = sel.xpath('a/ul[@class="meta"]/li/text()').extract()
for link in item['link']:
yield scrapy.Request(link, meta={'item': item},
callback=self.description)
def description(self, response):
sel = HtmlXPathSelector(response)
item = response.meta['item']
item['location'] = sel.xpath('//h2[@class="page-subtitle"]/span[@class="job-location"]/text()').extract()
item['salary'] = sel.xpath('//h2[@class="page-subtitle"]/span[@class="company-social-title"]/text()').extract()
item['job_description'] = sel.xpath('//div[@class="job-desc"]/div[@class="show-more-inner"]/span[@class="no_translate"]/ul/li/text()').extract()
item['more_job_description'] = sel.xpath('//div[@class="job-desc"]/div[@class="show-more-inner"]/span[@class="no_translate"]/p/text()').extract()
item['others'] = sel.xpath('//div/span[@class="no_translate"]/text()').extract()
item['about_company'] = sel.xpath('//div/span[@class="no_translate"]/span[@id="job_detail_"]/text()').extract()
yield item
像“about_company”和“company”这样的一些项目将属于一个table2,其余的属于另一个table1。Table2 将有一个 job_id,它是 table1 的唯一 ID。
我如何实现这一目标?
(PS - 我正在使用带有 sqlalchemy 的 postgresql。
编辑: 这是我在 pipelines.py 中所做的另一次尝试:
from sqlalchemy.orm import sessionmaker
from models import Jobs, db_connect, create_jobs_table
from models import CompanyDetails, db_connect, create_company_details_table
from projectjomkerja.items import ProjectjomkerjaMainItem
from projectjomkerja.items import ProjectjomkerjaSecondaryItem
class ProjectjomkerjaPipeline(object):
def __init__(self):
"""
Initializes database connection and sessionmaker.
Creates deals table.
"""
engine = db_connect()
create_jobs_table(engine)
create_company_details_table(engine)
self.Session = sessionmaker(bind=engine)
def process_item(self, item, spider):
if isinstance(item, ProjectjomkerjaMainItem):
session = self.Session()
jobs = Jobs(**item)
try:
session.add(jobs)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
if isinstance(item, ProjectjomkerjaSecondaryItem):
session = self.Session()
company_details = CompanyDetails(**item)
try:
session.add(company_details)
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
def process_item2(self, item, spider):
item1 = item
item2 = item1['item2']
job_id = Jobs(item1)
CompanyDetails(item2, job_id)
不过,我没有得到我的job_id。