python - 如何从 Google Scholar 搜索结果（Python）中抓取全文引用？

Question

我正在尝试从 Google Scholar 上抓取一些关于学术论文的有用数据。到目前为止，我在获取标题、出版年份、引文计数和“引用者”URL 方面没有问题。

我现在想获得完整的引文，包括完整的作者列表、期刊、页码（如果有）等，也就是单击双引号图标（下方截图中用红色圈出）时弹出的完整 APA 引文。

我使用 ScraperAPI 来处理代理和验证码（它们免费提供 5000 个请求）。

下面是我的代码（我知道它很重而且根本不是最佳的，但现在可以完成工作）：

import requests
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

# ScraperAPI key (placeholder value here).
APIKEY = "????????????????????"
# Prefix of every proxied request: the target page URL is appended after "url=".
BASE_URL = "http://api.scraperapi.com?api_key=" + APIKEY + "&url="

def _first_int(pattern, text):
    """Return the first match of *pattern* in *text* as an int, or 0 if none."""
    found = re.search(pattern, text)
    return int(found.group()) if found else 0


def _extract_title(paper):
    """Return the title text of one result block, or "" if it has no heading.

    The linked title (<a>) is preferred; results without a link fall back to
    the first <span> of the heading, then to the heading's own text.
    """
    heading = paper.find("h3", class_="gs_rt")
    if heading is None:
        return ""
    node = heading.find("a")
    if node is None:
        node = heading.find("span")
    if node is None:
        node = heading
    return node.text


def _reshape_cited_by_url(cited_url):
    """Rebuild a relative "cited by" href into an absolute, paginable URL.

    The href is split right after its "?" and again right after the first
    "&" that follows. The result is assembled from, in order: the fixed
    host, the part up to and including "?", the fixed "start=00&hl=en&",
    the first 23 characters of the trailing parameters, the leading
    parameter including its "&", and a final "scipsc=".
    NOTE(review): the 23-character slice presumably covers
    "as_sdt=....&sciodt=...&" -- confirm against live hrefs.

    Raises:
        ValueError: if the href has no "?" or no "&" after it.
    """
    query_start = cited_url.index("?") + 1
    path, query_string = cited_url[:query_start], cited_url[query_start:]
    first_amp = query_string.index("&") + 1
    leading_param, trailing_params = query_string[:first_amp], query_string[first_amp:]
    return (
        "https://scholar.google.com.tw" + path + "start=00&hl=en&"
        + trailing_params[:23] + leading_param + "scipsc="
    )


def scraper_api(query, n_pages):
    """Scrape Google Scholar result pages through ScraperAPI.

    Parameters:
        query: search terms in the form "automation+container+terminal".
        n_pages: number of result pages to scrape (10 results per page).

    Returns:
        A pandas DataFrame with one row per result and the columns
        "title" (paper title, "" if no heading is found),
        "year" (first 4-digit number in the author/venue line, 0 if none),
        "citations" (number in the "cited by" link text, 0 if no such link),
        "cited_by_url" (reshaped "cited by" URL, or "no citations").
    """
    papers = []
    for page in range(0, n_pages * 10, 10):
        print(f"Scraping page {page // 10 + 1}")
        webpage = f"https://scholar.google.com/scholar?start={page}&q={query}&hl=fr&as_sdt=0,5"
        response = requests.get(BASE_URL + webpage)
        soup = BeautifulSoup(response.content, "html.parser")

        for paper in soup.find_all("div", class_="gs_ri"):
            title = _extract_title(paper)

            # Year of publication: first 4-digit run in the author/venue line
            byline = paper.find("div", class_="gs_a")
            year = _first_int(r"[0-9]{4}", byline.text) if byline is not None else 0

            # Locate the "cited by" link by its href rather than by position:
            # results without citations have fewer footer links, so a fixed
            # index would either fail or read another link's number.
            footer = paper.find("div", class_="gs_fl")
            links = footer.find_all("a", href=True) if footer is not None else []
            cited_link = next((a for a in links if "cites=" in a["href"]), None)

            if cited_link is None:
                citations = 0
                new_url = "no citations"
            else:
                citations = _first_int(r"[0-9]+", cited_link.get_text())
                # The href does not follow the normal pagination pattern, so
                # it is reshuffled to allow later scraping of citing papers.
                new_url = _reshape_cited_by_url(cited_link["href"])

            papers.append({'title': title, 'year': year, 'citations': citations, 'cited_by_url': new_url})

    # Convert the list of dicts to a DataFrame
    return pd.DataFrame(papers)

我想检索完整的 APA 引用，但似乎它不在同一个 HTML 页面上并且没有href关联。

如果你有任何线索对我有很大帮助！！谢谢：）

score 0 · Accepted Answer

data-cid 属性是每篇出版物的唯一 ID。您需要先从页面中解析出所有这些 ID，然后如 ce.teuf 所述，使用解析到的 data-cid 向引用 URL 再发出一次请求。

下面的示例大约能正常处理 10-20 个请求，之后 Google 就会弹出验证码，或者您会触发速率限制。理想的解决方案是同时使用验证码（CAPTCHA）破解服务和代理。

示例代码：

from bs4 import BeautifulSoup
import requests, lxml

# Query-string parameters of the Google Scholar search request.
params = dict(
    q="automated container terminal",  # search query
    hl="en",                           # language
)

# HTTP headers sent with the requests; the referer is derived from `params`.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.3538.102 Safari/537.36 Edge/18.19582"
headers['accept-language'] = 'en-US,en'
headers['accept'] = 'text/html,application/xhtml+xml,application/xml'
headers["server"] = "scholar"
headers["referer"] = "https://scholar.google.com/scholar?hl={hl}&q={q}".format(**params)


def cite_ids() -> list[str]:
    """Fetch the Scholar search page and collect each result's ``data-cid``.

    The request uses the module-level ``params`` and ``headers``. Every
    element matching the ``.gs_or`` selector contributes its ``data-cid``
    attribute, in page order.
    """
    page = requests.get("https://scholar.google.com/scholar", params=params, headers=headers)
    parsed = BeautifulSoup(page.text, "lxml")

    publication_ids = []
    for entry in parsed.select(".gs_or"):
        # A publication ID looks like U8bh6Ca9uwQJ
        publication_ids.append(entry["data-cid"])
    return publication_ids

def scrape_cite_results() -> list[dict[str, str]]:
    """Fetch the citation table of every publication ID and keep the APA row.

    For each ID returned by ``cite_ids()`` one extra request is made to the
    ``output=cite`` URL, and every table row whose header cell contains
    "APA" is recorded.

    Returns:
        One dict per APA row found, with the keys ``"title"`` (text of the
        row's ``<th>`` cell, i.e. the citation-style label) and
        ``"cited_authors"`` (text of the row's ``<td>`` cell, i.e. the
        formatted citation).
    """
    cited_authors = []

    for cite_id in cite_ids():
        response = requests.get(
            f"https://scholar.google.com/scholar?output=cite&q=info:{cite_id}:scholar.google.com",
            headers=headers,
        )
        soup = BeautifulSoup(response.text, "lxml")

        for result in soup.select("tr"):
            style_cell = result.select_one("th")
            citation_cell = result.select_one("td")
            # Skip rows missing either cell instead of failing on None
            if style_cell is None or citation_cell is None:
                continue
            # Compare against the cell's text, not the Tag object itself
            if "APA" in style_cell.text:
                cited_authors.append({"title": style_cell.text, "cited_authors": citation_cell.text})

    return cited_authors

或者，您可以使用来自 SerpApi 的Google Scholar Organic Results API来实现它。这是一个带有免费计划的付费 API。

这种方式的不同之处在于：您不必反复调整选择器来找到合适的那一个，也不必在发送大量请求后触发 IP 速率限制或验证码时，自己想办法绕过 Google 的封锁。

要集成的代码：

import os, json
from serpapi import GoogleSearch


def organic_results() -> list[str]:
    """Run the Google Scholar search through SerpApi and return result IDs.

    The API key is read from the ``API_KEY`` environment variable. The
    ``result_id`` of every entry under ``"organic_results"`` in the response
    is returned, in response order.
    """
    search = GoogleSearch({
        "api_key": os.getenv("API_KEY"),
        "engine": "google_scholar",
        "q": "automated container terminal",  # search query
        "hl": "en",                           # language
    })
    response = search.get_dict()

    result_ids = []
    for entry in response["organic_results"]:
        result_ids.append(entry["result_id"])
    return result_ids


def cite_results() -> list[dict[str, str]]:
    """Look up every organic result in the SerpApi cite engine, keep APA rows.

    For each ID returned by ``organic_results()`` a ``google_scholar_cite``
    search is run, and every entry under ``"citations"`` whose title contains
    "APA" is recorded. A response without a ``"citations"`` key contributes
    nothing, so one failed lookup does not discard the others.

    Returns:
        One dict per APA entry, with the keys ``"institution"`` (the entry's
        ``"title"``, i.e. the citation-style label) and ``"authors"`` (the
        entry's ``"snippet"``, i.e. the formatted citation).
    """
    api_key = os.getenv("API_KEY")  # read once instead of once per request
    citation_results = []

    for citation_id in organic_results():
        params = {
            "api_key": api_key,
            "engine": "google_scholar_cite",
            "q": citation_id,
        }

        search = GoogleSearch(params)
        results = search.get_dict()

        for result in results.get("citations", []):
            if "APA" in result["title"]:
                citation_results.append({
                    "institution": result["title"],
                    "authors": result["snippet"],
                })

    return citation_results

# Pretty-print the list returned by cite_results() as JSON with 2-space indent.
print(json.dumps(cite_results(), indent=2))

'''
[
  {
    "institution": "APA",
    "authors": "Vis, I. F., & Harika, I. (2004). Comparison of vehicle types at an automated container terminal. OR Spectrum, 26(1), 117-143."
  },
  {
    "institution": "APA",
    "authors": "Vis, I. F., De Koster, R., Roodbergen, K. J., & Peeters, L. W. (2001). Determination of the number of automated guided vehicles required at a semi-automated container terminal. Journal of the Operational research Society, 52(4), 409-417."
  },
  {
    "institution": "APA",
    "authors": "Zhen, L., Lee, L. H., Chew, E. P., Chang, D. F., & Xu, Z. X. (2011). A comparative study on two types of automated container terminal systems. IEEE Transactions on Automation Science and Engineering, 9(1), 56-69."
  },
  {
    "institution": "APA",
    "authors": "Liu, C. I., Jula, H., & Ioannou, P. A. (2002). Design, simulation, and evaluation of automated container terminals. IEEE Transactions on intelligent transportation systems, 3(1), 12-26."
  },
  {
    "institution": "APA",
    "authors": "Park, T., Choe, R., Kim, Y. H., & Ryu, K. R. (2011). Dynamic adjustment of container stacking policy in an automated container terminal. International Journal of Production Economics, 133(1), 385-392."
  },
  {
    "institution": "APA",
    "authors": "Bae, H. Y., Choe, R., Park, T., & Ryu, K. R. (2011). Comparison of operations of AGVs and ALVs in an automated container terminal. Journal of Intelligent Manufacturing, 22(3), 413-426."
  },
  {
    "institution": "APA",
    "authors": "Luo, J., Wu, Y., & Mendes, A. B. (2016). Modelling of integrated vehicle scheduling and container storage problems in unloading process at an automated container terminal. Computers & Industrial Engineering, 94, 32-44."
  },
  {
    "institution": "APA",
    "authors": "Zhu, M., Fan, X., Cheng, H., & He, Q. (2010). Modeling and Simulation of Automated Container Terminal Operation. J. Comput., 5(6), 951-957."
  },
  {
    "institution": "APA",
    "authors": "Luo, J., & Wu, Y. (2020). Scheduling of container-handling equipment during the loading process at an automated container terminal. Computers & Industrial Engineering, 149, 106848."
  },
  {
    "institution": "APA",
    "authors": "Yang, X., Mi, W., Li, X., An, G., Zhao, N., & Mi, C. (2015). A simulation study on the design of a novel automated container terminal. IEEE Transactions on Intelligent Transportation Systems, 16(5), 2889-2899."
  }
]
'''

免责声明，我为 SerpApi 工作。

score 0 · Accepted Answer

打开 F12，进入网络选项卡下，然后单击“引文符号”。您应该会看到一个请求出现。请求的 url 如下：

“https://scholar.google.com/scholar?q=info:dgGDGDdf5:scholar.google.com/&output=cite&scirp=0&hl=fr”

其中“dgGDGDdf5”是“data-cid”，可以在主页面每条结果的 div 上找到。每个“data-cid”对应唯一的一篇文章。

因此，先提取这个“data-cid”，再用上面的 url 发起一次子请求，然后从响应中提取 APA 或其他格式的引用。

实现示例：

import requests as rq
from bs4 import BeautifulSoup as bs
from urllib.parse import urlencode

# Headers passed to every rq.get() call below; sets a Firefox User-Agent string.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
def google_scholar(query, n_pages, since_year):
    """Scrape Google Scholar results together with their APA citation.

    Parameters:
        query: free-text search query; it is URL-encoded here.
        n_pages: number of result pages to fetch (10 results per page).
        since_year: value of the ``as_ylo`` URL parameter.

    Returns:
        A list with one dict per result, with the keys "data_cid", "title",
        "infos" (text of the ``gs_a`` line) and "apa" (text of the APA row
        of the citation table, or None if no such row was found).
    """
    data = []
    encoded_query = urlencode({"q": query})
    for start in range(0, n_pages*10, 10):
        url = "https://scholar.google.com/scholar?as_ylo=%s&%s&hl=fr&start=%s" % (since_year, encoded_query, start)
        resp = rq.get(url, headers=headers)
        soup = bs(resp.content, "lxml")

        main_div = soup.find('div', {'id': 'gs_res_ccl_mid'})
        if main_div is None:
            # No results container on this page: stop paging and return
            # what was collected instead of failing with an IndexError.
            print("No results container found at start=%s, stopping" % start)
            break

        for div in main_div.find_all('div', {'class': 'gs_r gs_or gs_scl'}):
            data_cid = div['data-cid']
            title = div.find_all('h3', {'class': 'gs_rt'})[0].text
            infos = div.find_all('div', {'class': 'gs_a'})[0].text

            # APA citation: one sub-request per result, keyed by data-cid
            url_cite = "https://scholar.google.com/scholar?q=info:%s:scholar.google.com/&output=cite&scirp=0&hl=fr" % (data_cid)
            resp2 = rq.get(url_cite, headers=headers)
            cite_soup = bs(resp2.content, "lxml")

            # Each table row holds a style label (<th>) and the citation (<td>)
            apa = None
            for row in cite_soup.find_all("tr"):
                label = row.find("th")
                citation = row.find("td")
                if label is not None and citation is not None and "APA" in label.text:
                    apa = citation.text
                    break

            data.append({"data_cid": data_cid, "title": title, "infos": infos, "apa": apa})

    return data

python - 如何从 Google Scholar 搜索结果（Python）中抓取全文引用？

2 回答 2

Related

Reference