-1

我是编程新手。这段代码分两步工作。第一步,用户输入想要查找的文章名称,然后点击“ResearchGate”按钮,该网站会给出文章的 DOI 号。第二步,我们点击“SCI-HUB”按钮搜索这个 DOI 号。仅此而已。

我想做的是从解析出的文本中提取这个 DOI 号,然后在 Sci-Hub 中搜索该 DOI。这样,我认为只用一个文本框和一个按钮就能完成:用户输入文章名称后,程序直接在 Sci-Hub 中打开对应的结果。

我在下面还写了一小段网页解析代码。

from PyQt5.QtWidgets import *
from PyQt5.QtWidgets import QApplication, QMainWindow, QMessageBox
from bs4 import BeautifulSoup
import sys
import webbrowser


def doi(event):
    """Open the Sci-Hub page for the DOI typed into the text box.

    Used as the slot for the "SCI-HUB" button's ``clicked`` signal. If the
    text box is empty (or contains only whitespace) a notification dialog is
    shown instead of opening the browser.

    Args:
        event: Value delivered by the signal; not used.
    """
    # Strip surrounding whitespace so a blank-looking entry is treated as
    # empty and stray spaces from copy/paste do not end up inside the URL.
    number = window.textbox.text().strip()
    if not number:
        QMessageBox.about(window, "Notification", "Please type the DOI of the article you want to find")
        return

    webbrowser.open_new("https://sci-hub.se/" + number)

def research():
    """Open a ResearchGate publication search for the title in the text box.

    Used as the slot for the "ResearchGate" button. If the text box is empty
    (or contains only whitespace) a notification dialog is shown instead of
    opening the browser.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import quote_plus

    title = window.textbox.text().strip()
    if not title:
        QMessageBox.about(window, "Notification", "Please type the name of the article")
        return

    # Percent-encode the title: characters such as '&', '#', '+' or spaces
    # would otherwise truncate or corrupt the ``q`` query parameter.
    url = "https://www.researchgate.net/search/publication?q=" + quote_plus(title)
    webbrowser.open_new(url)

def quit_window() -> None:
    """Close the main window; used as the slot for the "Exit" button."""
    window.close()


# Create the Qt application and its single top-level window.
app = QApplication(sys.argv)
window = QMainWindow()
window.setWindowTitle("Publication Search")
window.setGeometry(500, 300, 300, 300)

# The input field is stored on the window object so the button handlers
# (doi / research) can read it through the global ``window``.
window.textbox = QLineEdit(window)
window.textbox.resize(270, 30)
window.textbox.move(20, 70)
window.textbox.setPlaceholderText("Please type the name of the article you want to find")

# Three buttons stacked vertically below the text box; each is wired to its
# handler function.
button1 = QPushButton("ResearchGate", window)
button1.move(100, 130)
button1.clicked.connect(research)

button2 = QPushButton("SCI-HUB", window)
button2.move(100, 170)
button2.clicked.connect(doi)

button3 = QPushButton("Exit", window)
button3.move(100, 210)
button3.clicked.connect(quit_window)

# Show the window and hand control to the Qt event loop; its return value
# becomes the process exit status.
window.show()
sys.exit(app.exec_())

下面是我从网站上获取并解析文本的代码。

import requests 
import bs4
from bs4 import BeautifulSoup
  
# Search ResearchGate for a publication title and print the page's plain text.
text = "Mobile TV: a new form of entertainment?"
url = 'https://www.researchgate.net/search/publication'

# Pass the title through ``params`` so requests URL-encodes it (spaces, '?',
# '&', '#', ...) instead of splicing raw text into the query string.
# ``timeout`` is set because requests otherwise waits indefinitely.
result = requests.get(url, params={'q': text}, timeout=10)

soup = bs4.BeautifulSoup(result.text, "html.parser")
print(soup.get_text())
# For debugging, print(soup.prettify()) shows the parsed markup instead.
4

1 回答 1

1

我看到 DOI 位于 `<span class="">` 中,所以您可以尝试 `soup.find_all('span')`,然后检查其文本是否以 `DOI:` 开头:

我使用 `get(url, params={'q': text})` 而不是 `url?q=text`,这样文本中的空格会被替换为特殊编码(URL 编码)。有些服务器可能需要这样做。

import requests 
from bs4 import BeautifulSoup
import webbrowser
  
# Search ResearchGate for a title, then open every DOI found on the result
# page in Sci-Hub (one browser tab per DOI).
text = "Mobile TV: a new form of entertainment?"

# Sent via ``params`` so requests URL-encodes the title.
payload = {
    'q': text,
}

count = 0  # number of DOIs found

url = 'https://www.researchgate.net/search/publication'
# ``timeout`` is set because requests otherwise waits indefinitely.
result = requests.get(url, params=payload, timeout=10)
# For debugging, print(result.request.url) shows the final encoded URL.

soup = BeautifulSoup(result.text, "html.parser")
items = soup.find_all('span')

for span in items:
    doi = span.get_text(strip=True)
    if not doi.startswith('DOI:'):
        continue

    print(doi)
    count += 1

    # Skip the `DOI:` prefix; strip so a label written as "DOI: 10..." does
    # not leave a leading space inside the Sci-Hub URL.
    number = doi[len('DOI:'):].strip()
    url_sci_hub = "https://sci-hub.se/" + number

    webbrowser.open_new(url_sci_hub)

print('count:', count)

用于处理 researchgate.net 上多页搜索结果的代码:

import requests 
from bs4 import BeautifulSoup
import webbrowser
  
# Search ResearchGate for a title across several result pages and open every
# DOI found in Sci-Hub (one browser tab per DOI).
text = "Mobile TV: a new form of entertainment?"

url = 'https://www.researchgate.net/search/publication'

# Sent via ``params`` so requests URL-encodes the title; 'page' is
# overwritten for each request in the loop below.
payload = {
    'q': text,
    'page': 0,
}
count = 0  # number of DOIs found over all pages

for page in range(1, 5):
    print('--- page:', page, '---')

    payload['page'] = page
    # ``timeout`` is set because requests otherwise waits indefinitely.
    result = requests.get(url, params=payload, timeout=10)
    # For debugging, print(result.request.url) shows the final encoded URL.

    soup = BeautifulSoup(result.text, "html.parser")
    items = soup.find_all('span')

    for span in items:
        # Use a separate name here: reusing ``text`` would overwrite the
        # search title defined at the top of the script.
        label = span.get_text(strip=True)
        if not label.startswith('DOI:'):
            continue

        print(label)
        count += 1

        # Skip the `DOI:` prefix; strip so a label written as "DOI: 10..."
        # does not leave a leading space inside the Sci-Hub URL.
        number = label[len('DOI:'):].strip()
        url_sci_hub = "https://sci-hub.se/" + number

        webbrowser.open_new(url_sci_hub)

print('count:', count)
于 2020-12-17T01:21:05.500 回答