我正在使用 Scrapy 从网站上抓取数据。下面是 Scrapy 项目的 spiders 文件夹中 spider.py 文件的代码:
class ThumbSpider(scrapy.Spider):
    """Follow user-configured thumbnail links and scrape the linked article text.

    For every start URL, ``parse`` selects the ``href`` of each element matched
    by the CSS selectors listed under ``userInput['cssThumb']`` and schedules a
    request for it; ``parse_details`` then extracts the article text, appends it
    to ``result/page_content.txt`` and yields it as an item.

    Args:
        start_urls: optional keyword argument (e.g. passed through
            ``CrawlerProcess.crawl``) with the listing-page URLs to crawl.
            Either an iterable of URL strings or a single URL string.
    """

    # Read once, when the class body executes (i.e. at import time).
    # parse() expects a 'cssThumb' key holding an iterable of CSS selectors.
    userInput = readInputData('input/user_input.json')
    name = 'thumb'
    # start_urls = ['https://vietnamnews.vn/politics-laws', 'https://vietnamnews.vn/society']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and normalise ``start_urls`` to a list."""
        super(ThumbSpider, self).__init__(*args, **kwargs)
        start_urls = kwargs.get('start_urls')
        if start_urls is None:
            # No kwarg given: keep whatever is already set on the class/instance
            # instead of overwriting it with None (which made start_requests
            # fail with "TypeError: 'NoneType' object is not iterable").
            start_urls = getattr(self, 'start_urls', None) or []
        elif isinstance(start_urls, str):
            # A bare string would otherwise be iterated character by character.
            start_urls = [start_urls]
        self.start_urls = list(start_urls)

    def start_requests(self):
        """Yield one request per start URL, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield a detail-page request for every thumbnail link on a listing page."""
        for cssThumb in self.userInput['cssThumb']:  # each selector the user provided
            links = response.css('{0}::attr(href)'.format(cssThumb)).getall()
            for link in links:
                # hrefs may be relative; resolve them against the page URL
                yield scrapy.Request(url=response.urljoin(link),
                                     callback=self.parse_details)

    def parse_details(self, response):
        """Extract the article text, append it to the result file and yield an item."""
        data = response.css('div.vnnews-text-post p span::text').getall()
        with open('result/page_content.txt', 'a') as outfile:
            json.dump(data, outfile)
            # One JSON document per line; without a separator the appended
            # arrays run together and the file cannot be parsed back.
            outfile.write('\n')
        # A spider callback must yield a Request, an item (dict/Item) or None;
        # a bare list is rejected by Scrapy, so wrap the text in a dict item.
        yield {'url': response.url, 'content': data}
我在文件 main.py 中调用 ThumbSpider 类,并在终端中运行该文件:
import json
import os
import modules.misc as msc
from scrapy.crawler import CrawlerProcess
from week_7.spiders.spider import NaviSpider, ThumbSpider
# Listing pages handed to ThumbSpider as its start_urls keyword argument.
section_urls = [
    'https://vietnamnews.vn/politics-laws',
    'https://vietnamnews.vn/society',
]

# No settings object is passed, so CrawlerProcess is created with its defaults.
process2 = CrawlerProcess()
process2.crawl(ThumbSpider, start_urls=section_urls)
# Runs the scheduled crawl; per the Scrapy docs this call blocks until it finishes.
process2.start()
我的程序没有从这 2 个 URL 中抓取到任何内容。但是,当我取消注释 start_urls = ['https://vietnamnews.vn/politics-laws', 'https://vietnamnews.vn/society'],
删除 ThumbSpider 类中的 __init__ 和 start_requests 方法,并在文件 main.py 中把
process2.crawl(ThumbSpider, start_urls=msc.getUserChoices())
改为
process2.crawl(ThumbSpider)
之后,程序就能正常运行。我不知道发生了什么。有人可以帮帮我吗?非常感谢。