0

我编写了一个脚本来保存 LinkedIn 信息,例如:姓名、姓氏、毕业大学和最重要的 LinkedIn 脚本链接。我的脚本是使用 Selenium 和 chromedriver 进入 LinkedIn 然后刮。我的问题是保存个人资料链接。链接没有正确抓取。这是我的代码:

import csv
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import parameters
import re

class LinkedIn():

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.people_ls_dic = []
        self.csv_name_colums = ["name","degree_connection","zawod","region","opis","firma","link"]

    def login(self):
        self.driver.get("http://www.linkedin.com/login")
        sleep(3)

        username = self.driver.find_element_by_name('session_key')
        username.send_keys(parameters.linkedin_username)

        password = self.driver.find_element_by_name('session_password')
        password.send_keys(parameters.linkedin_password)
        
        sign_in_button = self.driver.find_elements_by_xpath('//*[@class="btn__primary--large from__button--floating mercado-button--primary"]')
        sign_in_button[0].click()
        sleep(5)


    def neville_try(self):
        sleep(3)
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        profiles = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/ul') 
        profiles = profiles.find_elements_by_css_selector('li')
        profiles = [(i.text, i.find_element_by_xpath('//*[@data-control-name="entity_result"]').get_attribute('href')) for i in profiles]
        print("\n\n")
        
 
        info_ls = []
        for profile, link in profiles:
            info_ls.append( (profile.split('\n'), link) )


        for iteam, link in info_ls:
            if 'Learn more' in iteam:
                info_ls.remove(iteam)


        print(info_ls)
        info_ls = [(iteam, link) for iteam, link in info_ls if iteam != ['']]

  
        for info, link in info_ls:
            if info[0] == info[1]:
                info.remove(info[1])
            try:
                name = info[0]
                degree_connection = info[2]
                zawod = info[3]
                region = info[4]
                opis = info[5]

                opis_f = opis.replace(","," ")

                list_of_user_data = [name, zawod, opis_f]

                for data in list_of_user_data:
                    
                    try:
                        comp = re.findall('at ([a-zA-Z0-9]+)',data)
                        firma = comp[0]
                        break
                    except:
                        continue

                    if comp == []:
                        firma = "brak_danych"
                self.people_ls_dic.append({"name":name,"degree_connection":degree_connection,"zawod":zawod,"region":region,"opis":opis,"firma":firma,"link":link})
            except:
                pass

            
            
        
    def go_home(self):
        home = self.driver.find_element_by_xpath('//*[@id="inbug-nav-item"]/a')
        home.click()

 
    def next_page(self):
        sleep(3)
        next_p = self.driver.find_element_by_xpath('//*[@aria-label="Next"]')
        next_p.click()

    
    def open_people(self):

        self.driver.get("https://www.linkedin.com/search/results/people/?origin=DISCOVER_FROM_SEARCH_HOME")
        sleep(2)

        search_bar = self.driver.find_element_by_xpath('//*[@class="search-global-typeahead__input always-show-placeholder"]')
        search_bar.send_keys(parameters.search_query)
        search_bar.send_keys(Keys.ENTER)

        sleep(3) 

    def filter_company(self):
        cl = self.driver.find_element_by_xpath('//*[@aria-label="Current company filter. Clicking this button displays all Current company filter options."]')
        cl.click()
        for comp in parameters.list_of_comp:
            text = self.driver.find_element_by_xpath('//*[@placeholder="Add a company"]')
            text.send_keys(comp)
            sleep(1)
            filt = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]/nav/div/div[1]/div/div[2]/ul/li[5]/div/div/div/div[1]/div/form/fieldset/div[1]/div/div/div[2]/div/div[2]')
            sleep(0.2)
            filt.click()
        sleep(1)
        apply = self.driver.find_element_by_xpath('/html/body/div[7]/div[3]/div/div[1]/nav/div/div[1]/div/div[2]/ul/li[5]/div/div/div/div[1]/div/form/fieldset/div[2]/button[2]')
        apply.click()
        sleep(1)

    
    def close(self):
        self.driver.close()
    
    def write_to_csv(self):
        csv_file = "neville.csv"
        with open(csv_file, 'w', encoding="utf-8", newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames = self.csv_name_colums)
            writer.writeheader()
            for data in self.people_ls_dic:
                writer.writerow(data)
        


scrypt = LinkedIn()


scrypt.login()

scrypt.open_people()



ls = range(parameters.ilosc_stron)
scrypt.filter_company()
for i in sorted(ls,reverse=True):
    scrypt.neville_try()
    if i == 1:
        break
    scrypt.next_page()
    

scrypt.write_to_csv()
scrypt.close()

Ofc 我有带参数的文件,我看起来像这样:

linkedin_username = ""
linkedin_password = ""
search_query = 'vcloud director'
list_of_comp = ['Microsoft']
ilosc_stron = 2 //number of pages to click on 
4

0 回答 0