我正在尝试使用 Pyppeteer,但不确定为什么 headless = True 会触发机器人检测,而 headless = False 则不会。
这是我的代码(请注意 --proxy-server 参数需要删除或替换为您的代理服务器):
我假设在进行无头操作时需要调整一些设置?谢谢你。
import glob
import re
import datetime
import time
import random
import logging
import requests_html
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
import lxml
import asyncio
import pandas as pd
from pyppeteer import launch
import pyppeteer
# URL template for the Seeking Alpha earnings-call-transcripts pages. The `{}`
# placeholder is filled with str.format(); sa_test() passes a random integer.
S_ALPHA_URL = 'https://seekingalpha.com/earnings/earnings-call-transcripts/{}'
async def make_request(url, *, headless=False, retries=3):
    """Fetch ``url`` in a pyppeteer-driven browser and return the page HTML.

    Each attempt launches a fresh browser through the configured proxy, sets
    a desktop Chrome user agent, navigates to ``url``, reloads the page once
    and reads the resulting document.  The browser is always closed before
    the attempt ends.

    Args:
        url: Address of the page to load.
        headless: Forwarded to ``pyppeteer.launch``.  Defaults to ``False``,
            the value the original code hard-coded.
        retries: Maximum number of attempts.  Defaults to ``3``.

    Returns:
        The page content as returned by ``page.content()``, or ``None`` when
        every attempt failed with a ``PageError`` or ``TimeoutError``.

    Raises:
        Any exception other than ``pyppeteer.errors.PageError`` and
        ``pyppeteer.errors.TimeoutError`` propagates to the caller.
    """
    args = ['--proxy-server=xx.xxxx.']  # xxx replaced by your proxy server
    for _ in range(retries):
        # Reset per attempt so ``finally`` never touches an unbound name (when
        # launch() itself fails) or a browser already closed by a prior attempt.
        browser = None
        try:
            browser = await launch(headless=headless, args=args)
            page = await browser.newPage()
            await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
            await page.goto(url, {'waitUntil': 'domcontentloaded'})
            # NOTE(review): the reason for the extra reload is not evident
            # from this file; it is kept to preserve the original behavior.
            await page.reload()
            content = await page.content()
            print('return')
            return content
        except (pyppeteer.errors.PageError, pyppeteer.errors.TimeoutError):
            # Navigation failed or timed out: try again with a new browser.
            continue
        finally:
            if browser is not None:
                await browser.close()
    # All attempts failed; make the previously implicit ``None`` explicit.
    return None
def sa_test():
    """Fetch two randomly numbered Seeking Alpha pages and print each result.

    For each of two iterations a random integer in ``[1, 100]`` is formatted
    into ``S_ALPHA_URL``, the page is fetched with ``make_request`` on the
    current asyncio event loop, and the parsed document is printed together
    with the elapsed wall-clock time.  When ``make_request`` returns ``None``
    (all of its attempts failed) a failure line is printed instead of
    handing ``None`` to the parser.
    """
    # The event loop does not change between iterations; look it up once.
    loop = asyncio.get_event_loop()
    for _ in range(2):
        start = time.time()
        num = random.randint(1, 100)
        url = S_ALPHA_URL.format(num)
        content = loop.run_until_complete(make_request(url))
        if content is None:
            # make_request yields None once its retries are exhausted.
            print('request failed:', url)
        else:
            # NOTE(review): BeautifulSoup is not imported in the visible part
            # of this file; it presumably needs `from bs4 import BeautifulSoup`
            # at the top of the module -- confirm and add.
            soup = BeautifulSoup(content, 'html.parser')
            print(soup)
        print('time taken:', time.time() - start)
# Run the smoke test only when the file is executed as a script. The comparison
# must use `==`; a single `=` here is a SyntaxError that stops the module loading.
if __name__ == '__main__':
    sa_test()