我编写了一个脚本来保存 LinkedIn 信息,例如:姓名、姓氏、毕业大学和最重要的 LinkedIn 脚本链接。我的脚本是使用 Selenium 和 chromedriver 进入 LinkedIn 然后刮。我的问题是保存个人资料链接。链接没有正确抓取。这是我的代码:
import csv
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import parameters
import re
class LinkedIn():
def __init__(self):
self.driver = webdriver.Chrome()
self.people_ls_dic = []
self.csv_name_colums = ["name","degree_connection","zawod","region","opis","firma","link"]
def login(self):
self.driver.get("http://www.linkedin.com/login")
sleep(3)
username = self.driver.find_element_by_name('session_key')
username.send_keys(parameters.linkedin_username)
password = self.driver.find_element_by_name('session_password')
password.send_keys(parameters.linkedin_password)
sign_in_button = self.driver.find_elements_by_xpath('//*[@class="btn__primary--large from__button--floating mercado-button--primary"]')
sign_in_button[0].click()
sleep(5)
def neville_try(self):
sleep(3)
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
profiles = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/ul')
profiles = profiles.find_elements_by_css_selector('li')
profiles = [(i.text, i.find_element_by_xpath('//*[@data-control-name="entity_result"]').get_attribute('href')) for i in profiles]
print("\n\n")
info_ls = []
for profile, link in profiles:
info_ls.append( (profile.split('\n'), link) )
for iteam, link in info_ls:
if 'Learn more' in iteam:
info_ls.remove(iteam)
print(info_ls)
info_ls = [(iteam, link) for iteam, link in info_ls if iteam != ['']]
for info, link in info_ls:
if info[0] == info[1]:
info.remove(info[1])
try:
name = info[0]
degree_connection = info[2]
zawod = info[3]
region = info[4]
opis = info[5]
opis_f = opis.replace(","," ")
list_of_user_data = [name, zawod, opis_f]
for data in list_of_user_data:
try:
comp = re.findall('at ([a-zA-Z0-9]+)',data)
firma = comp[0]
break
except:
continue
if comp == []:
firma = "brak_danych"
self.people_ls_dic.append({"name":name,"degree_connection":degree_connection,"zawod":zawod,"region":region,"opis":opis,"firma":firma,"link":link})
except:
pass
def go_home(self):
home = self.driver.find_element_by_xpath('//*[@id="inbug-nav-item"]/a')
home.click()
def next_page(self):
sleep(3)
next_p = self.driver.find_element_by_xpath('//*[@aria-label="Next"]')
next_p.click()
def open_people(self):
self.driver.get("https://www.linkedin.com/search/results/people/?origin=DISCOVER_FROM_SEARCH_HOME")
sleep(2)
search_bar = self.driver.find_element_by_xpath('//*[@class="search-global-typeahead__input always-show-placeholder"]')
search_bar.send_keys(parameters.search_query)
search_bar.send_keys(Keys.ENTER)
sleep(3)
def filter_company(self):
cl = self.driver.find_element_by_xpath('//*[@aria-label="Current company filter. Clicking this button displays all Current company filter options."]')
cl.click()
for comp in parameters.list_of_comp:
text = self.driver.find_element_by_xpath('//*[@placeholder="Add a company"]')
text.send_keys(comp)
sleep(1)
filt = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]/nav/div/div[1]/div/div[2]/ul/li[5]/div/div/div/div[1]/div/form/fieldset/div[1]/div/div/div[2]/div/div[2]')
sleep(0.2)
filt.click()
sleep(1)
apply = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]/nav/div/div[1]/div/div[2]/ul/li[5]/div/div/div/div[1]/div/form/fieldset/div[2]/button[2]')
apply.click()
sleep(1)
def close(self):
self.driver.close()
def write_to_csv(self):
csv_file = "neville.csv"
with open(csv_file, 'w', encoding="utf-8", newline='') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames = self.csv_name_colums)
writer.writeheader()
for data in self.people_ls_dic:
writer.writerow(data)
scrypt = LinkedIn()
scrypt.login()
scrypt.open_people()
ls = range(parameters.ilosc_stron)
scrypt.filter_company()
for i in sorted(ls,reverse=True):
scrypt.neville_try()
if i == 1:
break
scrypt.next_page()
scrypt.write_to_csv()
scrypt.close()
Ofc 我有带参数的文件,我看起来像这样:
linkedin_username = ""
linkedin_password = ""
search_query = 'vcloud director'
list_of_comp = ['Microsoft']
ilosc_stron = 2 //number of pages to click on