我正在从 2 个不同的表中抓取数据,并希望将数据合并到 MongoDB
现在我想刮的第二张桌子有问题。该表有 1 个表头和 5 个表行我如何抓取 MongoDB 字段具有表的所有元素(列)的表
我想刮的桌子看起来像这样 https://codepen.io/linkslegend/pen/JjPrqLq
这是我到目前为止的代码
import scrapy
import pymongo
from ..items import testItem
class IssSpider(scrapy.Spider):
name = "test_spider"
start_urls = ["https://de.iss.fst.com/dichtungen/radialwellendichtringe/rwdr-mit-geschlossenem-kafig/ba"]
def parse(self, response):
self.log("I just visted:" + response.url)
urls = response.css('.details-button > a::attr(href)').extract()
for url in urls:
yield scrapy.Request(url=url, callback=self.parse_details)
# follow paignation link
next_page_url = response.css('li.item > a.next::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(url=next_page_url, callback=self.parse)
def parse_details(self, response):
item = testItem()
item['Artikelnummer'] = response.css('td[data-th="Artikelnummer"]::text').extract_first().strip(),
item['Hersteller'] = response.css('td[data-th="Hersteller"]::text').extract_first().strip(),
item['Materialvariante'] = response.css('td[data-th="Materialvariante"]::text').extract_first().strip(),
item['Material'] = response.css('td[data-th="Material"]::text').extract_first().strip(),
item['Gewicht_Gramm'] = response.css('td[data-th="Gewicht (Gramm)"]::text').extract_first().strip(),
item['Gehaeusedurchmesser'] = response.css('td[data-th="Gehäusedurchmesser"]::text').extract_first().strip(),
item['Breite'] = response.css('td[data-th="Breite"]::text').extract_first().strip(),
item['Innendurchmesser'] = response.css('td[data-th="Innendurchmesser"]::text').extract_first().strip(),
item['Wellendurchmesser'] = response.css('td[data-th="Wellendurchmesser"]::text').extract_first().strip(),
item['Außendurchmesser'] = response.css('td[data-th="Außendurchmesser"]::text').extract_first().strip(),
for row in response.css('tr.offer'):
item['Lieferant'] = row.css('td.vendor > span.offer-vendor::text').extract_first().strip(),
item['Anforderungsmenge'] = row.css('td.item-no > span.offer-item-no::text').extract_first().strip(),
item['Lieferzeit'] = row.css('td.replenishment-time > span.offer-replenishment-time::text').extract_first().strip(),
item['PreisproStueck'] = row.css('td.cell.price-per-item > span.offer-price-per-item > span.price::text').extract_first().strip()
yield item
这是 mongodb 的管道
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class testPipeline(object):
def __init__(self):
self.conn = pymongo.MongoClient(
"localhost",
27017
)
db = self.conn["test_db"]
self.collection = db["test_tb"]
def process_item(self, item, spider):
self.collection.insert(dict(item))
return item
谢谢你的帮助!