0

我正在尝试为一个小组项目抓取 “The North Face” 网站,想找一种能更快得到输出的方法。有没有办法在每次获取页面 HTML 时不必打开 Chrome 网页,从而更快一些?我不能使用 requests 库,因为它无法返回完整的页面源代码(页面内容是动态加载的)。感谢您的帮助。这是我目前的代码:

import requests
from bs4 import BeautifulSoup
from helium import *
import time

# Scrape the men's jackets listing and the price of the first few products.
#
# A single headless Chrome session is started once and reused for every page
# (via Helium's go_to) instead of launching and killing a new browser per
# product, which was the dominant cost of the original script.

# NOTE: this User-Agent header is defined but never passed to the Helium
# browser session below; it would only matter for a plain `requests` call.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

products_names = []
products_links = []
products_prices = []

# Open Chrome in the background (headless) on the listing page.
browser = start_chrome("https://www.thenorthface.com/shop/mens-jackets-vests-en-ca#facet=&beginIndex=0", headless=True)
try:
    # Click the "LOAD MORE" button until it disappears so that every product
    # card is present in the page source.
    while Text("LOAD MORE").exists():
        click("LOAD MORE")
        time.sleep(2.0)

    # Parse the fully expanded listing page.
    soup = BeautifulSoup(browser.page_source, "html.parser")
    products_cards = soup.find_all("div", {"class": "product-block-info info info-js"})

    # Collect the title and link of every product card.
    for card in products_cards:
        for name in card.find_all("div", {"class": "product-block-name name name-js"}):
            for link in name.find_all("a", class_="product-block-name-link"):
                products_names.append(link.get("title"))
                products_links.append(link.get("href"))

    # Visit the first few product pages in the SAME browser session.
    for jacket_url in products_links[:3]:
        if not jacket_url:
            # An anchor without an href cannot be navigated to.
            continue
        go_to(jacket_url)
        product_soup = BeautifulSoup(browser.page_source, "html.parser")
        price_info = product_soup.find_all("div", class_="product-content-info-price product-price product-price-js")
        for info in price_info:
            price = info.find("span", "product-content-info-offer-price offer-price offer-price-js product-price-amount-js")
            if price is None:
                # find() returns None when the span is absent; iterating over
                # it (as the original did) would raise TypeError.
                continue
            # Store the visible price text rather than the tag's child nodes.
            products_prices.append(price.get_text(strip=True))
finally:
    # Always shut the browser down, even if scraping fails part-way.
    kill_browser()


print(len(products_prices))
print(len(products_names))
print(len(products_links))
4

0 回答 0