0

我正在尝试使用 Pyppeteer,但不确定为什么 headless = True 会导致被检测为机器人,而 headless = False 则不会。

这是我的代码(请注意 --proxy-server 参数需要删除或替换为您的代理服务器):

我假设在进行无头操作时需要调整一些设置?谢谢你。

import glob
import re
import datetime
import time
import random
import logging
import requests_html
from  concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
import lxml
import asyncio
import pandas as pd
from pyppeteer import launch
import pyppeteer


# URL template for a Seeking Alpha earnings-call-transcripts listing; the `{}`
# placeholder is filled with an integer via str.format (sa_test uses a random
# value in 1..100 -- presumably a page number; confirm against the site).
S_ALPHA_URL = 'https://seekingalpha.com/earnings/earnings-call-transcripts/{}'


async def make_request(url, *, headless=False, retries=3):
    """Load *url* in a freshly launched browser and return the page HTML.

    A new browser is launched for every attempt and is always closed
    afterwards, whether the attempt succeeded or not.

    Args:
        url: Address of the page to load.
        headless: Passed through to ``pyppeteer.launch``; defaults to the
            original hard-coded value ``False`` (visible browser window).
        retries: Maximum number of attempts; defaults to the original ``3``.

    Returns:
        The page content as returned by ``page.content()``, or ``None`` when
        every attempt failed with a ``PageError`` or ``TimeoutError``.
    """
    args = ['--proxy-server=xx.xxxx.']  # xxx replaced by your proxy server
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/66.0.3359.181 Safari/537.36'
    )

    for _ in range(retries):
        # Reset per attempt so ``finally`` never touches an unbound name or a
        # browser that a previous attempt already closed.
        browser = None
        try:
            browser = await launch(headless=headless, args=args)

            page = await browser.newPage()
            await page.setUserAgent(user_agent)
            await page.goto(url, {'waitUntil': 'domcontentloaded'})
            await page.reload()
            content = await page.content()

            print('return')

            return content

        except (pyppeteer.errors.PageError, pyppeteer.errors.TimeoutError):
            # Navigation failed or timed out: try again with a new browser.
            continue

        finally:
            # Only close a browser that was actually launched.
            if browser is not None:
                await browser.close()

    # All attempts failed.
    return None
    

def sa_test():
    """Manually exercise ``make_request`` against two random listing URLs.

    For each of two iterations a random integer in 1..100 is formatted into
    ``S_ALPHA_URL``, the page is fetched, and the parsed HTML plus the elapsed
    wall-clock time are printed.
    """
    # The file never imports BeautifulSoup at module level, so the original
    # call raised NameError; import it here, where it is used.
    from bs4 import BeautifulSoup

    # The loop object does not change between iterations; look it up once.
    loop = asyncio.get_event_loop()

    for _ in range(2):

        start = time.time()

        num = random.randint(1, 100)
        url = S_ALPHA_URL.format(num)
        content = loop.run_until_complete(make_request(url))

        if content is None:
            # make_request returns None when all retries failed; the parser
            # cannot handle None, so report the failure instead.
            print('request failed:', url)
        else:
            soup = BeautifulSoup(content, 'html.parser')
            print(soup)

        print('time taken:', time.time() - start)

    
# Script entry point: run the manual test only when executed directly.
# (`==` is required here; a single `=` is a SyntaxError.)
if __name__ == '__main__':

    sa_test()
4

0 回答 0