
I am trying to use scrapy-playwright with Scrapy to scrape a site whose content is loaded dynamically with JavaScript. The minimal goal is to extract the link for each item from the first page. However, I am getting a mix of response 200 and response 204, and I end up with the following error:

playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.

Perhaps I have not implemented Playwright correctly; if so, I would welcome some help finding the right approach.

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy_playwright.page import PageCoroutine
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class EtsyItem(scrapy.Item):
    items = Field(output_processor = TakeFirst())


class EtsySpider(scrapy.Spider):
    name = 'Etsy_test'
    start_urls = ['https://www.etsy.com/search/clothing/womens-clothing?q=30s&explicit=1&locationQuery=2635167&ship_to=GB']
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url = url,
                callback = self.parse,
                meta= dict(
                    playwright = True,
                    playwright_include_page = True,
                    playwright_page_coroutines = [
                        PageCoroutine('wait_for_selector', 'div#content')
                        ]
                )
            )

    async def parse(self, response):
        stuff = response.xpath("//ul[@class='wt-grid wt-grid--block wt-pl-xs-0 tab-reorder-container']")
        for items in stuff:
            loaders = ItemLoader(EtsyItem(), selector = items)
            loaders.add_xpath('items', '//li//a//@href')
            yield loaders.load_item()

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }, })
    process.crawl(EtsySpider)
    process.start()
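
One thing I noticed while re-reading the scrapy-playwright README: with playwright_include_page = True, the Playwright page object is exposed as response.meta["playwright_page"] and, as far as I understand, the callback is then responsible for closing it. I do not know whether that is related to the timeout, but for completeness this is how I would adjust parse (the selector logic is unchanged):

    async def parse(self, response):
        # Only available because playwright_include_page = True in the request meta
        page = response.meta["playwright_page"]
        try:
            stuff = response.xpath("//ul[@class='wt-grid wt-grid--block wt-pl-xs-0 tab-reorder-container']")
            for items in stuff:
                loaders = ItemLoader(EtsyItem(), selector = items)
                loaders.add_xpath('items', '//li//a//@href')
                yield loaders.load_item()
        finally:
            # Close the page so Playwright contexts are not leaked
            await page.close()

The full error from the crawl log: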
2022-01-04 15:02:59 [scrapy.core.scraper] ERROR: Error downloading <GET https://www.etsy.com/search/clothing/womens-clothing?q=30s&explicit=1&locationQuery=2635167&ship_to=GB>
Traceback (most recent call last):
...
playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.
=========================== logs ===========================
waiting for selector "div#content" to be visible
============================================================
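
In case it matters, one variant I have been considering is giving the wait an explicit, longer timeout and waiting for the element to be attached to the DOM rather than visible. The 60000 ms value and the 'attached' state below are guesses on my part, not something I have confirmed works; this snippet would replace the playwright_page_coroutines list in start_requests above:

                    playwright_page_coroutines = [
                        # Guesses: double the default 30 s timeout and do not
                        # require the element to be visible, only attached
                        PageCoroutine('wait_for_selector', 'div#content',
                                      timeout = 60000, state = 'attached')
                    ]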