我需要从网站中提取每所大学的电话号码、网站链接、名称和国家/地区。该网站是 https://www.whed.net/results_institutions.php?Chp2=Business%20Administration 。问题是:每所大学都需要先单击一个标志打开详情弹窗,提取数据后再关闭它,然后继续处理下一所大学。
我通过 selenium 尝试了多种方法,如下所示:
import time

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Scrape institution name, country, and contact details (phone / website)
# from the WHED "Business Administration" results listing.  Each row's data
# sits behind a "More details" fancybox popup, so for every institution we
# open its popup, parse it, close it, and move on to the next row.
CHROMEDRIVER_PATH = 'C:\\Users\\albert.malhotra\\Desktop\\Web Scrapings\\Kentucky State\\chromedriver'
URL = 'https://www.whed.net/results_institutions.php?Chp2=Business%20Administration'

# Selenium 4: the driver path is passed through a Service object, and the
# find_element_by_* helpers are replaced by find_element(By..., ...).
browser = webdriver.Chrome(service=Service(CHROMEDRIVER_PATH))
wait = WebDriverWait(browser, 10)
browser.get(URL)

names = []      # institution names, one per row
countries = []  # parallel to names (the original only kept the last country)
contacts = []   # per-row list of the popup's 'contenu' span texts

try:
    for _page in range(50):
        # Parse the current results page once per page, not once per row.
        soup = BeautifulSoup(browser.page_source, 'lxml')
        rows = soup.find_all('div', {'class': 'details'})
        country_tags = soup.find_all('p', {'class': 'country'})

        # One "More details" link per institution.  Fetch them all so we can
        # click the link belonging to *this* row — the original code clicked
        # find_element_by_link_text('More details'), i.e. always the FIRST
        # link on the page, scraping the same popup for every row.
        links = browser.find_elements(By.LINK_TEXT, 'More details')

        for idx, row in enumerate(rows):
            heading = row.select('h3')
            names.append(heading[0].text.strip() if heading else '')
            countries.append(country_tags[idx].text.strip()
                             if idx < len(country_tags) else '')

            links[idx].click()

            # Wait for the fancybox popup to render before parsing it; the
            # original parsed page_source immediately and could race the popup.
            try:
                wait.until(EC.presence_of_element_located(
                    (By.CLASS_NAME, 'fancybox-close')))
            except TimeoutException:
                contacts.append([])  # popup never appeared; keep lists aligned
                continue

            popup = BeautifulSoup(browser.page_source, 'lxml')
            contacts.append([span.text.strip() for span in
                             popup.find_all('span', {'class': 'contenu'})])

            # "fancybox-item fancybox-close" is TWO classes; By.CLASS_NAME only
            # accepts one, so the original call raised.  A CSS selector that
            # requires both classes closes the popup correctly.
            browser.find_element(
                By.CSS_SELECTOR, '.fancybox-item.fancybox-close').click()
            time.sleep(1)  # let the overlay animation finish before next click

        # Advance to the next results page; the original looped 50 times over
        # the SAME page.  NOTE(review): link text assumed to be 'Next page' —
        # TODO confirm the pager's actual selector on whed.net.
        try:
            browser.find_element(By.LINK_TEXT, 'Next page').click()
        except Exception:
            break  # no further pages
        time.sleep(2)  # crude page-load wait between result pages
finally:
    browser.quit()  # always release the browser, even after an error

# Assemble the three parallel lists into a single result table.
df = pd.DataFrame({'name': names, 'country': countries, 'details': contacts})