python - 如何使用 Python 截取网站的屏幕截图/图像？

Question

我想要实现的是从 python 中的任何网站获取网站截图。

环境：Linux

score 52 · Accepted Answer

这是一个使用 webkit 的简单解决方案： http ://webscraping.com/blog/Webpage-screenshots-with-webkit/

import sys
import time
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *

class Screenshot(QWebView):
    def __init__(self):
        self.app = QApplication(sys.argv)
        QWebView.__init__(self)
        self._loaded = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        self.load(QUrl(url))
        self.wait_load()
        # set to webpage size
        frame = self.page().mainFrame()
        self.page().setViewportSize(frame.contentsSize())
        # render image
        image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
        painter = QPainter(image)
        frame.render(painter)
        painter.end()
        print 'saving', output_file
        image.save(output_file)

    def wait_load(self, delay=0):
        # process app events until page loaded
        while not self._loaded:
            self.app.processEvents()
            time.sleep(delay)
        self._loaded = False

    def _loadFinished(self, result):
        self._loaded = True

s = Screenshot()
s.capture('http://webscraping.com', 'website.png')
s.capture('http://webscraping.com/blog', 'blog.png')

score 39 · Accepted Answer

这是我从各种来源获取帮助的解决方案。它需要完整的网页屏幕截图并裁剪它（可选）并从裁剪的图像中生成缩略图。以下是要求：

要求：

安装 NodeJS
使用 Node 的包管理器安装 phantomjs：npm -g install phantomjs
安装 selenium（在你的 virtualenv 中，如果你正在使用它）
安装 imageMagick
将 phantomjs 添加到系统路径（在 Windows 上）

import os
from subprocess import Popen, PIPE
from selenium import webdriver

abspath = lambda *p: os.path.abspath(os.path.join(*p))
ROOT = abspath(os.path.dirname(__file__))


def execute_command(command):
    result = Popen(command, shell=True, stdout=PIPE).stdout.read()
    if len(result) > 0 and not result.isspace():
        raise Exception(result)


def do_screen_capturing(url, screen_path, width, height):
    print "Capturing screen.."
    driver = webdriver.PhantomJS()
    # it save service log file in same directory
    # if you want to have log file stored else where
    # initialize the webdriver.PhantomJS() as
    # driver = webdriver.PhantomJS(service_log_path='/var/log/phantomjs/ghostdriver.log')
    driver.set_script_timeout(30)
    if width and height:
        driver.set_window_size(width, height)
    driver.get(url)
    driver.save_screenshot(screen_path)


def do_crop(params):
    print "Croping captured image.."
    command = [
        'convert',
        params['screen_path'],
        '-crop', '%sx%s+0+0' % (params['width'], params['height']),
        params['crop_path']
    ]
    execute_command(' '.join(command))


def do_thumbnail(params):
    print "Generating thumbnail from croped captured image.."
    command = [
        'convert',
        params['crop_path'],
        '-filter', 'Lanczos',
        '-thumbnail', '%sx%s' % (params['width'], params['height']),
        params['thumbnail_path']
    ]
    execute_command(' '.join(command))


def get_screen_shot(**kwargs):
    url = kwargs['url']
    width = int(kwargs.get('width', 1024)) # screen width to capture
    height = int(kwargs.get('height', 768)) # screen height to capture
    filename = kwargs.get('filename', 'screen.png') # file name e.g. screen.png
    path = kwargs.get('path', ROOT) # directory path to store screen

    crop = kwargs.get('crop', False) # crop the captured screen
    crop_width = int(kwargs.get('crop_width', width)) # the width of crop screen
    crop_height = int(kwargs.get('crop_height', height)) # the height of crop screen
    crop_replace = kwargs.get('crop_replace', False) # does crop image replace original screen capture?

    thumbnail = kwargs.get('thumbnail', False) # generate thumbnail from screen, requires crop=True
    thumbnail_width = int(kwargs.get('thumbnail_width', width)) # the width of thumbnail
    thumbnail_height = int(kwargs.get('thumbnail_height', height)) # the height of thumbnail
    thumbnail_replace = kwargs.get('thumbnail_replace', False) # does thumbnail image replace crop image?

    screen_path = abspath(path, filename)
    crop_path = thumbnail_path = screen_path

    if thumbnail and not crop:
        raise Exception, 'Thumnail generation requires crop image, set crop=True'

    do_screen_capturing(url, screen_path, width, height)

    if crop:
        if not crop_replace:
            crop_path = abspath(path, 'crop_'+filename)
        params = {
            'width': crop_width, 'height': crop_height,
            'crop_path': crop_path, 'screen_path': screen_path}
        do_crop(params)

        if thumbnail:
            if not thumbnail_replace:
                thumbnail_path = abspath(path, 'thumbnail_'+filename)
            params = {
                'width': thumbnail_width, 'height': thumbnail_height,
                'thumbnail_path': thumbnail_path, 'crop_path': crop_path}
            do_thumbnail(params)
    return screen_path, crop_path, thumbnail_path


if __name__ == '__main__':
    '''
        Requirements:
        Install NodeJS
        Using Node's package manager install phantomjs: npm -g install phantomjs
        install selenium (in your virtualenv, if you are using that)
        install imageMagick
        add phantomjs to system path (on windows)
    '''

    url = 'http://stackoverflow.com/questions/1197172/how-can-i-take-a-screenshot-image-of-a-website-using-python'
    screen_path, crop_path, thumbnail_path = get_screen_shot(
        url=url, filename='sof.png',
        crop=True, crop_replace=False,
        thumbnail=True, thumbnail_replace=False,
        thumbnail_width=200, thumbnail_height=150,
    )

这些是生成的图像：

score 27 · Accepted Answer

可以使用 Selenium

from selenium import webdriver

DRIVER = 'chromedriver'
driver = webdriver.Chrome(DRIVER)
driver.get('https://www.spotify.com')
screenshot = driver.save_screenshot('my_screenshot.png')
driver.quit()

https://sites.google.com/a/chromium.org/chromedriver/getting-started

score 19 · Accepted Answer

在 Mac 上，有webkit2png，在 Linux+KDE 上，你可以使用khtml2png。我试过前者，效果很好，听说后者投入使用。

我最近遇到了声称是跨平台的QtWebKit （我猜是 Qt 将 WebKit 滚动到了他们的库中）。但是我从来没有尝试过，所以我不能告诉你更多。

QtWebKit 链接显示了如何从 Python 访问。您至少应该能够使用 subprocess 与其他人做同样的事情。

score 9 · Accepted Answer

使用Rendertron是一种选择。在底层，这是一个暴露以下端点的无头 Chrome：

/render/:urlrequests.get：如果您对 DOM 感兴趣，请访问此路线。
/screenshot/:url：如果您对屏幕截图感兴趣，请访问此路线。

您可以使用 npm 安装 rendertron，rendertron在一个终端中运行，访问http://localhost:3000/screenshot/:url并保存文件，但在render-tron.appspot.com上提供了一个演示，可以在不安装 npm 包的情况下在本地运行这个 Python3 片段：

import requests

BASE = 'https://render-tron.appspot.com/screenshot/'
url = 'https://google.com'
path = 'target.jpg'
response = requests.get(BASE + url, stream=True)
# save file, see https://stackoverflow.com/a/13137873/7665691
if response.status_code == 200:
    with open(path, 'wb') as file:
        for chunk in response:
            file.write(chunk)

score 7 · Accepted Answer

11 年后...使用and
截取网站截图：Python3.6Google PageSpeedApi Insights v5

import base64
import requests
import traceback
import urllib.parse as ul

# It's possible to make requests without the api key, but the number of requests is very limited  

url = "https://duckgo.com"
urle = ul.quote_plus(url)
image_path = "duckgo.jpg"

key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
strategy = "desktop" # "mobile"
u = f"https://www.googleapis.com/pagespeedonline/v5/runPagespeed?key={key}&strategy={strategy}&url={urle}"

try:
    j = requests.get(u).json()
    ss_encoded = j['lighthouseResult']['audits']['final-screenshot']['details']['data'].replace("data:image/jpeg;base64,", "")
    ss_decoded = base64.b64decode(ss_encoded)
    with open(image_path, 'wb+') as f:
        f.write(ss_decoded) 
except :
    print(traceback.format_exc())
    exit(1)

笔记：

现场演示
优点：免费
缺点：低分辨率
获取 API 密钥
文档
限制：
- 每天查询 = 25,000
- 每 100 秒的查询次数 = 400

score 6 · Accepted Answer

我无法评论 ars 的答案，但实际上我使用 QtWebkit 运行了Roland Tapken 的代码，并且运行良好。

只是想确认 Roland 在他的博客上发布的内容在 Ubuntu 上运行良好。我们的生产版本最终没有使用他写的任何东西，但我们使用 PyQt/QtWebKit 绑定取得了很大成功。

注意：以前的 URL 是： http: //www.blogs.uni-osnabrueck.de/rotapken/2008/12/03/create-screenshots-of-a-web-page-using-python-and-qtwebkit/我已经用工作副本更新了它。

score 4 · Accepted Answer

这是一个古老的问题，大多数答案都有些过时了。目前，我会做两件事中的一件。

1.创建一个截图程序

我会使用Pyppeteer来截取网站的截图。这在Puppeteer包上运行。Puppeteer 启动了一个无头 chrome 浏览器，因此屏幕截图看起来与普通浏览器中的完全一样。

这取自 pyppeteer 文档：

import asyncio
from pyppeteer import launch

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('https://example.com')
    await page.screenshot({'path': 'example.png'})
    await browser.close()

asyncio.get_event_loop().run_until_complete(main())

2.使用截图API

你也可以使用一个截图 API，比如这个。好处是您不必自己设置所有内容，只需调用 API 端点即可。

这取自屏幕截图 API 的文档：

import urllib.parse
import urllib.request
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# The parameters.
token = "YOUR_API_TOKEN"
url = urllib.parse.quote_plus("https://example.com")
width = 1920
height = 1080
output = "image"

# Create the query URL.
query = "https://screenshotapi.net/api/v1/screenshot"
query += "?token=%s&url=%s&width=%d&height=%d&output=%s" % (token, url, width, height, output)

# Call the API.
urllib.request.urlretrieve(query, "./example.png")

score 2 · Accepted Answer

您可以使用 Google Page Speed API 轻松完成您的任务。在我当前的项目中，我使用了用 Python 编写的 Google Page Speed API 查询来捕获提供的任何 Web URL 的屏幕截图并将其保存到某个位置。看一看。

import urllib2
import json
import base64
import sys
import requests
import os
import errno

#   The website's URL as an Input
site = sys.argv[1]
imagePath = sys.argv[2]

#   The Google API.  Remove "&strategy=mobile" for a desktop screenshot
api = "https://www.googleapis.com/pagespeedonline/v1/runPagespeed?screenshot=true&strategy=mobile&url=" + urllib2.quote(site)

#   Get the results from Google
try:
    site_data = json.load(urllib2.urlopen(api))
except urllib2.URLError:
    print "Unable to retreive data"
    sys.exit()

try:
    screenshot_encoded =  site_data['screenshot']['data']
except ValueError:
    print "Invalid JSON encountered."
    sys.exit()

#   Google has a weird way of encoding the Base64 data
screenshot_encoded = screenshot_encoded.replace("_", "/")
screenshot_encoded = screenshot_encoded.replace("-", "+")

#   Decode the Base64 data
screenshot_decoded = base64.b64decode(screenshot_encoded)

if not os.path.exists(os.path.dirname(impagepath)):
    try:
        os.makedirs(os.path.dirname(impagepath))
        except  OSError as exc:
            if exc.errno  != errno.EEXIST:
                raise

#   Save the file
with open(imagePath, 'w') as file_:
    file_.write(screenshot_decoded)

不幸的是，以下是缺点。如果这些都不重要，您可以继续使用 Google Page Speed API。它运作良好。

最大宽度为 320px
根据 Google API Quota，每天有 25,000 个请求的限制

score 2 · Accepted Answer

使用网络服务 s-shot.ru（所以它不是那么快），但通过链接配置很容易设置需要的内容。并且可以轻松截取整页截图

import requests
import urllib.parse

BASE = 'https://mini.s-shot.ru/1024x0/JPEG/1024/Z100/?' # you can modify size, format, zoom
url = 'https://stackoverflow.com/'#or whatever link you need
url = urllib.parse.quote_plus(url) #service needs link to be joined in encoded format
print(url)

path = 'target1.jpg'
response = requests.get(BASE + url, stream=True)

if response.status_code == 200:
    with open(path, 'wb') as file:
        for chunk in response:
            file.write(chunk)

score 1 · Accepted Answer

你没有提到你在什么环境中运行，这有很大的不同，因为没有一个能够呈现 HTML 的纯 Python Web 浏览器。

但是，如果您使用的是 Mac，我使用webkit2png取得了巨大成功。如果没有，正如其他人指出的那样，有很多选择。

score 1 · Accepted Answer

我创建了一个名为 pywebcapture 的库，它包装了 selenium，它可以做到这一点：

pip install pywebcapture

使用 pip 安装后，您可以执行以下操作以轻松获取全尺寸屏幕截图：

# import modules
from pywebcapture import loader, driver

# load csv with urls
csv_file = loader.CSVLoader("csv_file_with_urls.csv", has_header_bool, url_column, optional_filename_column)
uri_dict = csv_file.get_uri_dict()

# create instance of the driver and run
d = driver.Driver("path/to/webdriver/", output_filepath, delay, uri_dict)
d.run()

享受！

https://pypi.org/project/pywebcapture/

score 0 · Accepted Answer

import subprocess

def screenshots(url, name):
    subprocess.run('webkit2png -F -o {} {} -D ./screens'.format(name, url), 
      shell=True)

score -1 · Accepted Answer

试试这个..

#!/usr/bin/env python

import gtk.gdk

import time

import random

while 1 :
    # generate a random time between 120 and 300 sec
    random_time = random.randrange(120,300)

    # wait between 120 and 300 seconds (or between 2 and 5 minutes)
    print "Next picture in: %.2f minutes" % (float(random_time) / 60)

    time.sleep(random_time)

    w = gtk.gdk.get_default_root_window()
    sz = w.get_size()

    print "The size of the window is %d x %d" % sz

    pb = gtk.gdk.Pixbuf(gtk.gdk.COLORSPACE_RGB,False,8,sz[0],sz[1])
    pb = pb.get_from_drawable(w,w.get_colormap(),0,0,0,0,sz[0],sz[1])

    ts = time.time()
    filename = "screenshot"
    filename += str(ts)
    filename += ".png"

    if (pb != None):
        pb.save(filename,"png")
        print "Screenshot saved to "+filename
    else:
        print "Unable to get the screenshot."

python - 如何使用 Python 截取网站的屏幕截图/图像？

14 回答 14

Related

Reference