
I'm trying to use BeautifulSoup to find a random image from Google Images. My code looks like this:

import urllib, bs4, random
from urllib import request
urlname = "https://www.google.com/search?hl=en&q=" + str(random.randrange(999999)) + "&ion=1&bav=on.2,or.r_gc.r_pw.r_cp.r_qf.&bvm=bv.42553238,d.dmg&biw=1354&bih=622&um=1&ie=UTF-8&tbm=isch&source=og&sa=N&tab=wi&ei=sNEfUf-fHvLx0wG7uoG4DQ"

page = bs4.BeautifulSoup(urllib.request.urlopen(urlname))

But whenever I try to get the HTML from the page object, I get:

urllib.error.HTTPError: HTTP Error 403: Forbidden

I tested the generated URLs by pasting them into my web browser, and the browser doesn't return this error. What's going on?


2 Answers


I'm pretty sure Google is telling you: "Please don't do this." See this explanation of the HTTP 403 error.

What's happening is that your Python script, or more specifically urllib, is sending headers that tell Google this is some kind of programmatic request, not one coming from a browser.
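You can see this for yourself without any scraping: a short sketch (not part of the original answer) showing the `User-Agent` that urllib sends by default, and how a `Request` object lets you override it with a browser-like value:

```python
import urllib.request

# urllib identifies itself with a "Python-urllib/x.y" User-Agent by default;
# this is exactly the header that server-side bot filters key on.
opener = urllib.request.build_opener()
default_ua = dict(opener.addheaders)['User-agent']
print(default_ua)  # e.g. Python-urllib/3.11

# Overriding it with a browser-like string is why the same URL works when
# pasted into a browser but returns 403 from the script.
req = urllib.request.Request(
    "https://www.google.com/search?q=test&tbm=isch",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
)
print(req.get_header("User-agent"))
```

Whether Google accepts the spoofed header is a separate question (see the options below), but this is the mechanism behind the 403.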

Google is right to do this; otherwise many people would simply scrape their site and present Google's results as their own.

So far I can see two solutions.

1) Use the Google Custom Search API. It supports image search and has a free quota of 100 queries per day; for more queries you have to pay.
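As a rough sketch of that route (the endpoint and parameter names are from the Custom Search JSON API; the credentials are placeholders you get from the Google Cloud console and the Programmable Search Engine panel):

```python
import json
from urllib.parse import urlencode
from urllib.request import urlopen

# Placeholder credentials -- substitute your own.
API_KEY = "YOUR_API_KEY"
CX = "YOUR_SEARCH_ENGINE_ID"

params = urlencode({
    "key": API_KEY,
    "cx": CX,
    "q": "random image",
    "searchType": "image",  # restricts results to image search
    "num": 10,              # up to 10 results per request
})
url = "https://www.googleapis.com/customsearch/v1?" + params
print(url)

# Uncomment with real credentials; each item carries a "link" to the image:
# results = json.load(urlopen(url))
# for item in results.get("items", []):
#     print(item["link"])
```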

2) Tools like mechanize mislead websites into believing they are talking to a browser rather than a scraping bot, e.g. by sending manipulated headers. The common problem here is that if your crawler is too greedy (too many requests in a short time), Google will permanently block your IP address...
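If you go that route, at minimum pace your requests. A tiny politeness sketch (my own illustration, not from mechanize; the intervals are arbitrary, not a documented Google threshold -- several seconds between requests is a sensible real-world starting point):

```python
import random
import time

# Sleep a randomized interval between requests so the crawler
# doesn't fire at machine speed.
def polite_delay(base=2.0, jitter=4.0):
    delay = base + random.uniform(0, jitter)
    time.sleep(delay)
    return delay

for query in ["123456", "654321"]:
    # ... fetch one search-results page for `query` here ...
    waited = polite_delay(base=0.1, jitter=0.2)  # tiny values just to keep the demo fast
    print(f"waited {waited:.2f}s before the next request")
```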

answered 2013-04-16T05:21:01.643

That's because no user-agent is specified. The default requests user-agent is python-requests, so Google blocks the request because it knows it's a bot rather than a "real" user visit; specifying a browser-like user-agent fakes the user visit.
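A quick sketch of both halves of that claim -- the default user-agent string requests would send, and a prepared request with a browser-like header substituted (the Mozilla string is just an example value):

```python
import requests

# requests advertises itself as "python-requests/<version>"
# unless you override the User-Agent header yourself.
print(requests.utils.default_user_agent())  # e.g. python-requests/2.31.0

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
prepared = requests.Request(
    "GET", "https://www.google.com/search",
    params={"q": "test", "tbm": "isch"},
    headers=headers,
).prepare()
print(prepared.headers["User-Agent"])  # the browser-like string
print(prepared.url)                    # query string built from params
```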


To scrape Google Images, both the thumbnails and the full-resolution URLs, you need to parse the data from the page source inside <script> tags:

# find all <script> tags:
all_script_tags = soup.select('script')

# match images data via regex:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
# match desired images (full res size) via regex:

# https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
# if you try to json.loads() without json.dumps() it will throw an error:
# "Expecting property name enclosed in double quotes"
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                    matched_images_data_json)
# Extract and decode them using bytes() and decode():
for fixed_full_res_image in matched_google_full_resolution_images:
    original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
    original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
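The two decode passes above may look redundant; a tiny sketch (with a hand-made sample string, not real Google data) shows why each pass is needed:

```python
# In the raw page source, URLs inside the inline JSON are double-escaped:
# "=" appears as the seven characters \\u003d. The first unicode-escape
# pass collapses \\ into \, and the second pass turns \u003d into "=".
double_escaped = r"https://example.com/img?id\\u003d42"  # sample, not real Google data
first_pass = bytes(double_escaped, "ascii").decode("unicode-escape")
second_pass = bytes(first_pass, "ascii").decode("unicode-escape")
print(first_pass)   # https://example.com/img?id\u003d42
print(second_pass)  # https://example.com/img?id=42
```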

Code and a full example in the online IDE that downloads Google Images:

import requests, lxml, re, json, shutil, urllib.request
from bs4 import BeautifulSoup
from py_random_words import RandomWords

random_word = RandomWords().get_word()

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": random_word,
    "tbm": "isch", 
    "hl": "en",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')


def get_images_data():

    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')

    # these steps could be refactored into something more compact
    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
    
    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps() it will throw an error:
    # "Expecting property name enclosed in double quotes"
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')

        # after first decoding, Unicode characters are still present. After the second iteration, they were decoded.
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)


    print('\nFull Resolution Images:')  # in order
    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

        # ------------------------------------------------
        # Download original images

        print(f'Downloading {index} image...')
        
        opener=urllib.request.build_opener()
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(original_size_img, f'Bs4_Images/original_size_img_{index}.jpg')

    
get_images_data()

Alternatively, you can achieve the same thing using the Google Images API from SerpApi. It's a paid API with a free plan.

The difference in your case is that you don't have to deal with regex to match and extract the needed data from the page source; instead, you only need to iterate over structured JSON to quickly get what you want, and you don't have to maintain the parser over time.

Code to integrate to achieve your goal:

import os, urllib.request, json # json for pretty output
from serpapi import GoogleSearch
from py_random_words import RandomWords


random_word = RandomWords().get_word()

def get_google_images():
    params = {
      "api_key": os.getenv("API_KEY"),
      "engine": "google",
      "q": random_word,
      "tbm": "isch"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))

    # -----------------------
    # Downloading images

    for index, image in enumerate(results['images_results']):

        print(f'Downloading {index} image...')
        
        opener=urllib.request.build_opener()
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        urllib.request.install_opener(opener)

        urllib.request.urlretrieve(image['original'], f'SerpApi_Images/original_size_img_{index}.jpg')


get_google_images()

PS - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web-scraping search engines.

Disclaimer, I work for SerpApi.

answered 2021-09-24T06:11:54.057