我正在尝试抓取使用 Ajax 分页的 https://wegotthiscovered.com/reviews/ 。我尝试了各种方法,但请求要么没有返回任何数据,要么返回 HTTP 状态码 400。有人能帮忙解决这个问题吗?
import json
import scrapy
from..items import xyzItem
class MySpider(scrapy.Spider):
    """Crawl the Ajax-paginated review listing via WordPress ``admin-ajax.php``.

    Each page is fetched with a form-encoded POST; ``parse`` yields one item
    per ``<h4>`` heading found in the response and then requests the next
    page until a page produces no items.
    """

    name = 'abc'

    # Base form fields sent with every request. Treated as read-only: each
    # request gets its own copy so pagination state never leaks between
    # requests through this class-level dict.
    #
    # NOTE(review): admin-ajax.php answers HTTP 400 when the request carries
    # no ``action`` form field. The action name used by this site's
    # infinite-scroll handler is not visible here -- copy it from the
    # browser's network tab and add it to this dict.
    data = {
        "id": "infinite_scroll_1",
        "order": "",
        "orderby": "",
        "catnames": "reviews",
        "postnotin": "900303,899404,898188,897386,896672,893944,895290,895136,892571,892412,891795,887847",
        "timestampbefore": '1589354802',
    }

    # admin-ajax.php reads its parameters from the form-encoded request body;
    # a JSON body is not parsed into request parameters, so the endpoint sees
    # an empty request and replies with 400.
    headers = {"content-type": "application/x-www-form-urlencoded; charset=UTF-8"}
    url = 'https://wegotthiscovered.com/wp-admin/admin-ajax.php'

    def start_requests(self):
        """Yield the request for the first page (index 0)."""
        yield self._page_request(0)

    def _page_request(self, page):
        """Build the form POST for *page*; the page number travels in ``meta['index']``."""
        payload = dict(self.data)
        if page:
            # Form values must be strings. The first request carries no
            # ``index`` field, matching the original payload.
            payload['index'] = str(page)
        return scrapy.FormRequest(
            url=self.url,
            method='POST',
            formdata=payload,
            headers=self.headers,
            meta={'index': page},
        )

    def parse(self, response):
        """Yield one item per ``<h4>`` heading, then request the following page.

        Title and link are taken from the same heading so they cannot get
        out of step with each other. Pagination stops as soon as a page
        yields no titled heading, otherwise the crawl would never terminate.
        """
        # NOTE(review): assumes the endpoint returns an HTML fragment; if it
        # returns JSON wrapping the HTML, decode that first -- TODO confirm.
        found_any = False
        for heading in response.css('h4'):
            title = ''.join(heading.css('::text').getall()).strip()
            if not title:
                continue
            found_any = True

            # NOTE(review): xyzItem is the only item class imported by this
            # module; it must declare ``page_title`` and ``review_link``
            # fields -- verify against items.py.
            outputs = xyzItem()
            outputs['page_title'] = title
            outputs['review_link'] = heading.css('::attr(href)').get()
            yield outputs

        if not found_any:
            return

        yield self._page_request(response.meta['index'] + 1)