from scrapy import FormRequest
url = "https://stackoverflow.com/users/login"
fetch(url)
req = FormRequest.from_response(
response,
formid='login-form',
formdata={'email': 'test@test.com',
'password': 'testpw'},
clickdata={'id': 'submit-button'},
)
fetch(req)
在scrapy shell中使用上面的代码,我可以登录stackoverflow。但是,我想执行此活动而不是作为命令行参数。所以,我试图在子进程中使用上述命令登录。
import subprocess
import scrapy
from scrapy import FormRequest
from subprocess import run
from bs4 import BeautifulSoup
class QuoteSpider(scrapy.Spider):
name = 'stackover'
start_urls = ['https://stackoverflow.com/users/login']
run(["scrapy","fetch", start_urls[0]], capture_output=True, text=True)
def parse(self, response):
req = FormRequest.from_response(
response,
formid='login-form',
formdata={'email': 'test@test.com',
'password': 'testpw'},
clickdata={'id': 'submit-button'},
)
run(["scrapy","fetch", req], shell=True)
但它给了我这样的错误:
TypeError:“FormRequest”类型的参数不可迭代
我还尝试将响应保存在 html 文件中并将该文件作为响应读取并得到与上面相同的错误消息。
with open("output.html","w") as f:
response = call(["scrapy","fetch", url], stdout=f, shell=True)
with open("output.html", encoding="utf-8") as f:
data = f.read()
response = BeautifulSoup(data, 'lxml')
我也尝试获取文本响应并再次收到上述错误消息。
r = run(["scrapy","fetch", start_urls[0]], capture_output=True)
response = r.stdout.decode()
我还尝试在调用解析函数之前创建请求,例如:
class QuoteSpider(scrapy.Spider):
name = 'stackover'
start_urls = ['https://stackoverflow.com/users/login']
r = run(["scrapy","fetch", start_urls[0]], capture_output=True)
response = r.stdout.decode()
req = FormRequest.from_response(
response,
formid='login-form',
formdata={'email': 'test@test.com',
'password': 'testpw'},
clickdata={'id': 'submit-button'},
)
run(["scrapy","fetch", req], shell=True)
def parse(self, response):
print(response)
而且,我得到了新的错误。
AttributeError:“str”对象没有属性“encoding”
那么,我如何使用子进程运行scrapy shell 命令来登录stackoverflow。scrapy 中 Formrequest 中的响应究竟是什么作为输入?
我正在学习scrapy和各种登录stackoverflow的方法来练习网页抓取。