1

为了一个项目,我正在尝试创建一个 API 来与我学校的课程查找器进行交互,并且想在不使用 Selenium 的情况下,从存储数据的 HTML 表格中获取数据。我最初可以用 Selenium 提取 HTML 数据,但我的老师说他更希望我使用 BeautifulSoup4 和 MechanicalSoup 库。我已经提交了一次搜索,并抓取到了存储数据的 HTML 表格,但我不确定如何像下面的 Selenium 代码那样遍历该表格中的数据。

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options

# Configure Chrome to run headless so the scrape does not open a visible window.
Chrome_Options = Options()
Chrome_Options.add_argument("--headless")

# The options object must be handed to the driver; constructing it alone
# (as the original did) leaves the --headless flag unused.
driver = webdriver.Chrome(options=Chrome_Options)
driver.get("https://winnet.wartburg.edu/coursefinder/")  # load the course finder search page

# Term dropdown and its available options.
# TODO: loop over term_options (select.select_by_index) to scrape every term.
select = Select(driver.find_element_by_id("ctl00_ContentPlaceHolder1_FormView1_DropDownList_Term"))
term_options = select.options

# Flat list of every cell's text, in document order; it is split into rows later.
lst = []

# Pick the department to search for.
DeptSelect = Select(driver.find_element_by_id("ctl00_ContentPlaceHolder1_FormView1_DropDownList_Department"))
DeptSelect.select_by_visible_text("History")

# Submit the search form.
search = driver.find_element_by_name("ctl00$ContentPlaceHolder1$FormView1$Button_FindNow")
search.click()

# Walk the results table row by row and collect the text of each <td> cell.
# Rows without <td> children (e.g. a header row built from <th>) add nothing.
table_id = driver.find_element_by_id("ctl00_ContentPlaceHolder1_GridView1")
rows = table_id.find_elements_by_tag_name("tr")
for row in rows:
    lst.extend(cell.text for cell in row.find_elements_by_tag_name("td"))

def chunk(l, n):
    """Yield consecutive slices of ``l`` that are ``n`` items long.

    This is a generator function: nothing runs (including the progress
    message) until the first item is requested. The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    print("chunking...")
    total = len(l)
    starts = range(0, total, n)
    yield from (l[start:start + n] for start in starts)

# Number of cells per result row (per the original author, every row of the
# results table has 16 cells regardless of the search).
n = 16
uberlist = list(chunk(lst, n))  # regroup the flat cell list into one list per row

# Write one row (as its list representation) per line.
with open('class_data.txt', 'w') as handler:
    print("writing file...")
    for listitem in uberlist:
        handler.write('%s\n' % listitem)

# The original wrote `driver.close` without parentheses, which only references
# the method and never calls it. quit() ends the WebDriver session and closes
# the browser.
driver.quit()

这是我使用 MechanicalSoup 的代码,我想知道如何以与上面 Selenium 代码类似的方式从 HTML 中获取数据。

import mechanicalsoup
import requests
from lxml import html
from lxml import etree
import pandas as pd

def text(elt):
    """Return ``elt.text_content()`` with non-breaking spaces turned into plain spaces."""
    raw = elt.text_content()
    # Splitting on U+00A0 and re-joining with a space replaces every occurrence.
    return u' '.join(raw.split(u'\xa0'))

# Use MechanicalSoup to grab the search form, submit it, and find the data table.
browser = mechanicalsoup.StatefulBrowser()
winnet = "http://winnet.wartburg.edu/coursefinder/"
browser.open(winnet)
Searchform = browser.select_form()
Searchform.choose_submit('ctl00$ContentPlaceHolder1$FormView1$Button_FindNow')
response1 = browser.submit_selected()  # response to the submitted search form
dataURL = browser.get_url()  # URL the browser is on after submitting
dataURL2 = 'https://winnet.wartburg.edu/coursefinder/Results.aspx'  # kept for reference only

# Parse the response to our own form submission. The original issued a fresh
# requests.get(dataURL2) instead, which shares neither the browser's session
# nor the submitted form data, so the results table may not be in that page.
pageContent = response1
tree = html.fromstring(pageContent.content)
dataTable = tree.xpath('//*[@id="ctl00_ContentPlaceHolder1_GridView1"]')
if not dataTable:
    # Fail with a clear message instead of an IndexError on dataTable[0].
    raise RuntimeError("results table 'ctl00_ContentPlaceHolder1_GridView1' not found in response")

# One list of stripped cell texts per <tr>; [1:] skips the first row
# (presumably the header row).
rows = [
    [cell.text_content().strip() for cell in row.xpath(".//td")]
    for row in dataTable[0].xpath(".//tr")[1:]
]

df = pd.DataFrame(rows)  # load the collected rows into a DataFrame
print(df)
# XPath to the table:
#//*[@id="ctl00_ContentPlaceHolder1_GridView1"]
#//*[@id="ctl00_ContentPlaceHolder1_GridView1"]/tbody
4

1 回答 1

1

原来我在使用 MechanicalSoup 时传错了东西。我把新页面的内容传给一个名为 table 的变量,并用 .find('table') 只取出表格的 HTML,而不是整个页面的 HTML。之后再用 table.get_text().split('\n') 生成一个包含所有行的大列表。

我还涉足设置表单过滤器,效果也很好。

import mechanicalsoup
from bs4 import BeautifulSoup

# Point a StatefulBrowser at the course finder and select the search form.
browser = mechanicalsoup.StatefulBrowser()
winnet = "http://winnet.wartburg.edu/coursefinder/"
browser.open(winnet)
Searchform = browser.select_form()

# Choose the submit button, then set each filter field.
# The notes on accepted values below are the original author's observations
# about this site and are not verified here.

Searchform.choose_submit('ctl00$ContentPlaceHolder1$FormView1$Button_FindNow')
# Keyword: searches by class title; setting it overrides any value already stored in the page.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$TextBox_keyword', "")
# Department: takes the course code as its value (displayed as the full name).
# Codes of the form "ACxxx" need three trailing spaces; the 'All' value does not.
Searchform.set("ctl00$ContentPlaceHolder1$FormView1$DropDownList_Department", 'All')
# Term: the value is exactly the term's display string.
Searchform.set("ctl00$ContentPlaceHolder1$FormView1$DropDownList_Term", "2020 Winter Term")
# Meeting time: weekly class time as a string. TODO: retrieve the option list from the page.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_MeetingTime', 'all')
# Essential Ed: a short requirement code, 'all' (any course with an EE), or 'none' (no filter).
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_EssentialEd', 'none')
# Cultural diversity: 'none', 'C', 'D', or 'all'.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_CulturalDiversity', 'none')
# Writing intensive: 'none' or 'WI'.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_WritingIntensive', 'none')
# Pass/fail: 'none' or 'PF'.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_PassFail', 'none')
# Open courses only: checkbox, True or False.
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$CheckBox_OpenCourses', False)
# Instructor: '0' means none selected; otherwise a numeric string (instructor ID?).
Searchform.set('ctl00$ContentPlaceHolder1$FormView1$DropDownList_Instructor', '0')

# Submit the form and pull the first <table> out of the resulting page.
browser.submit_selected()
table = browser.get_current_page().find('table')
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on table.get_text() below.
    raise RuntimeError("no <table> found in the search results page")
print(type(table))
rows = table.get_text().split('\n')  # the table's text, split into a flat list of lines
于 2020-02-06T16:49:21.400 回答