0

我正在使用 PyQT4(第一次)来抓取一些页面。因为我尝试抓取多个页面,所以我使用 QEventloop。但是我无法将 loadFinished 信号添加到代码中。这是我的代码的样子:

   # Imports
import requests
from bs4 import BeautifulSoup
import sys  
from PyQt4.QtGui import *  
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4.QtNetwork import QNetworkRequest
from PyQt4.QtGui import *
from lxml import html
import csv
import win_unicode_console
import time
# Main setting
DIR = "data"
URL = "https://addons.mozilla.org"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"}

def Render(url):
    page = QWebPage()
    loop = QEventLoop() # Create event loop
    page.mainFrame().loadFinished.connect(loop.quit) # Connect loadFinished to loop quit
    page.mainFrame().load(QUrl(url))
    loop.exec_() # Run event loop, it will end on loadFinished
    return page.mainFrame().toHtml()

app = QApplication(sys.argv)

def pagination(page):
    page_url = "https://addons.mozilla.org/en-US/firefox/extensions/?sort=users&page=" + str(page)
    response = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    items = soup.findAll("div", class_="item addon")
    for item in items:
        time.sleep(2)
        item = URL + item.h3.select('a')[0].get('href')
        print(item)
        addon_scraper(item)

def addon_scraper(url):
    time.sleep(7)
    result = Render(url)
    print(result)
    soup = BeautifulSoup(result, "lxml")
    addon_name = soup.select("#addon > hgroup > h1 > span")[0].get_text()
    print(addon_name)
    addon_author = soup.select("#addon > hgroup > h4 > a")[0].get_text()
    category = soup.select("#related > ul")[0].get_text().strip()
    with open("category_list.csv", "a", newline="", encoding="utf-16") as f:
        writer = csv.writer(f, dialect="excel-tab")
        writer.writerow([addon_name, addon_author, category])


# Run the scraper
if __name__ == "__main__":
    win_unicode_console.enable() # Enable unicode support in command line interface
    for i in range(1, 100):
        print(i)
        pagination(i)
        app.exit()

最后它只是重新启动脚本并且什么都不做。我试图在这里实现用户 Mip 提供的解决方案:Web Scraping Multiple Links with PyQt / QtWebkit 我认为将用户代理添加到上述应用程序和隐式睡眠(类似于 selenium 案例)将解决我的问题。但我无法做到。现在我收到以下错误。我认为这是因为 PyQt4 在加载源内容之前退出循环:

Traceback (most recent call last): File "main.py", line 56, in <module> pagination(i) File "mozilla_file.py", line 36, in pagination addon_scraper(item) File "mozilla_file.py", line 46, in addon_scraper category = soup.select("#related > ul")[0].get_text().strip() IndexError: list index out of range
4

0 回答 0