
I am trying to scrape Twitter follower data for an account with 80K followers, using the Selenium Chrome webdriver and BeautifulSoup. My script has two problems:

1) When scrolling to the bottom of the page to capture the full page source after all followers have loaded, my script never makes it all the way down. It stops scrolling after a random number of followers have loaded and then starts iterating over each follower profile to collect their data. I want it to load every follower on the page first, and only then start iterating over the profiles.

2) My second problem is that every run of the script scrolls to the bottom one step at a time until all followers are loaded, and then extracts the data by parsing one follower at a time. In my case that would take 4 to 5 days to collect all the follower data (80K followers). Is there a better way to do this?

Here is my script:

from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join

print "Running for chrome."

chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
	os.environ["webdriver.chrome.driver"]=chromedriver
	chromeOptions = webdriver.ChromeOptions()
	prefs = {"download.default_directory" : download_path}
	chromeOptions.add_experimental_option("prefs",prefs)
	driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
	driver.implicitly_wait(20)
	driver.maximize_window()
except Exception as err:
	# if webdriver.Chrome() raised, `driver` never got defined, so just exit
	print "Error: Failed to open chrome."
	print "Error: ", err
	sys.exit(1)
	
#opening the web page
try:
	driver.get('https://twitter.com/login')
except Exception as err:
	print "Error: Failed to open url."
	print "Error: ", err
	driver.quit()
	sys.exit(1)

username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")

username.send_keys("###########")
password.send_keys("###########")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadserver/followers')



followers_link=driver.page_source  # follower page source; ~18 profiles rendered per batch
soup=BeautifulSoup(followers_link,'html.parser')

output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")

followers_per_page = 18     # followers rendered per scroll batch
followers_count = 80000     # total followers of the target account

for _ in xrange(0, followers_count/followers_per_page + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
                followers_link=driver.page_source  # height stopped growing: grab the final page source
                soup=BeautifulSoup(followers_link,'html.parser')
                div = soup.find('div',{'class':'GridTimeline-items has-items'})
                bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
                for name in bref:
                        name_list.append(name['href'])
                break
        lastHeight = newHeight
        followers_link=''

print len(name_list)


for x in range(0,len(name_list)):
        #print name['href']
        #print name.text
        driver.stop_client()
        driver.get('https://twitter.com'+name_list[x])
        page_source=driver.page_source
        each_soup=BeautifulSoup(page_source,'html.parser')
        profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
                            
        try:
                name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
                if name:
                        output.write('"'+name.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in name:',e

        try:
                handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
                if handle:
                        output.write('"'+handle.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in handle:',e

        try:
                location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
                if location:
                        output.write('"'+location.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in location:',e

        try:
                bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
                if bio:
                        output.write('"'+bio.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in bio:',e
                        
        try:
                joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
                if joinDate:
                        output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in joindate:',e
        
        try:
                url =  [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
                if url:
                        output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
                else:
                        output.write(' '+'\n')
        except Exception as e:
                output.write(' '+'\n')
                print 'Error in url:',e
        


        
output.close()


os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")


3 Answers


A better way: use the Twitter API. Here is a quick GitHub script I found (GitHub Script). Sorry, you may feel you have already sunk a lot of time into Selenium (there are benefits to not using the API, though). A good post on automating this and understanding how it works under the hood: Twitter API.
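For illustration, a minimal sketch of the API route, assuming the tweepy library (3.x, where api.followers wraps the followers/list endpoint) and placeholder credentials you would replace with your own:

import tweepy

# Placeholder credentials from an app registered at apps.twitter.com
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
# Sleep through the 15-minute rate-limit windows automatically
api = tweepy.API(auth, wait_on_rate_limit=True)

# followers/list returns up to 200 full profiles per request, so 80K
# followers is roughly 400 API calls instead of thousands of page loads
for user in tweepy.Cursor(api.followers, screen_name='sadoperator', count=200).items():
    print user.screen_name, user.name, user.location, user.created_at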

If you do stay with Selenium, there is a way to keep scrolling, but you have to do a bit of math, or set a stop condition, to know when to stop.

driver.execute_script("window.scrollTo(0, 10000);") 

Say you have 10k followers, the initial load renders 100 of them, and every scroll after that loads 10 more: you would need to scroll roughly another 990 times.
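Putting that together, a sketch of a loop that caps the scroll count using that arithmetic but also stops early once the page height stops growing (the counts are the hypothetical ones above; `driver` and `time` are as in the question's script):

followers_count = 10000              # hypothetical total, as above
initial_batch, per_scroll = 100, 10
max_scrolls = (followers_count - initial_batch) / per_scroll   # 990

last_height = driver.execute_script("return document.body.scrollHeight")
for _ in xrange(max_scrolls):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)                    # give the next batch time to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:    # nothing new appeared: we hit the bottom
        break
    last_height = new_height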

Of course, this is exactly the approach alecxe laid out for your case :D (Quora answerer: alecxe).

Once all the followers are displayed (after the scrolling is done), grab the page source and parse it with something like BeautifulSoup:

html = driver.page_source
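For instance, reusing the ProfileCard selector from the question's script (class names as in Twitter's 2017 markup):

soup = BeautifulSoup(html, 'html.parser')
profile_links = [a['href'] for a in soup.findAll('a', {'class': 'ProfileCard-bg js-nav'})]
print len(profile_links)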

answered 2017-04-02T14:39:32.087

I implemented what alecxe described in his answer, but my script is still not parsing all of the followers. It still loads a random number of them, and I can't get to the bottom of why. Could someone try running this on their end and see whether they can load all the followers? Here is the modified script:

from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join

print "Running for chrome."

chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
	os.environ["webdriver.chrome.driver"]=chromedriver
	chromeOptions = webdriver.ChromeOptions()
	prefs = {"download.default_directory" : download_path}
	chromeOptions.add_experimental_option("prefs",prefs)
	driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
	driver.implicitly_wait(20)
	driver.maximize_window()
except Exception as err:
	# if webdriver.Chrome() raised, `driver` never got defined, so just exit
	print "Error: Failed to open chrome."
	print "Error: ", err
	sys.exit(1)
	
#opening the web page
try:
	driver.get('https://twitter.com/login')
except Exception as err:
	print "Error: Failed to open url."
	print "Error: ", err
	driver.quit()
	sys.exit(1)

username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")

username.send_keys("*****************")
password.send_keys("*****************")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadoperator/followers')



followers_link=driver.page_source  # follower page source; ~18 profiles rendered per batch
soup=BeautifulSoup(followers_link,'html.parser')

output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")

followers_link=driver.page_source  # follower page source; ~18 profiles rendered per batch
soup=BeautifulSoup(followers_link,'html.parser')

followers_per_page = 18
followers_count = 15777


for _ in xrange(0, followers_count/followers_per_page + 1):
        driver.execute_script("window.scrollTo(0, 7755000);")
        time.sleep(2)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
                followers_link=driver.page_source  # height stopped growing: grab the final page source
                soup=BeautifulSoup(followers_link,'html.parser')
                div = soup.find('div',{'class':'GridTimeline-items has-items'})
                bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
                for name in bref:
                        name_list.append(name['href'])
                break
        lastHeight = newHeight
        followers_link=''

print len(name_list)

'''
for x in range(0,len(name_list)):
        #print name['href']
        #print name.text
        driver.stop_client()
        driver.get('https://twitter.com'+name_list[x])
        page_source=driver.page_source
        each_soup=BeautifulSoup(page_source,'html.parser')
        profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
                            
        try:
                name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
                if name:
                        output.write('"'+name.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in name:',e

        try:
                handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
                if handle:
                        output.write('"'+handle.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in handle:',e

        try:
                location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
                if location:
                        output.write('"'+location.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in location:',e

        try:
                bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
                if bio:
                        output.write('"'+bio.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in bio:',e
                        
        try:
                joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
                if joinDate:
                        output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in joindate:',e
        
        try:
                url =  [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
                if url:
                        output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
                else:
                        output.write(' '+'\n')
        except Exception as e:
                output.write(' '+'\n')
                print 'Error in url:',e
        


        
output.close()
'''

os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")

answered 2017-04-03T11:26:57.587
  1. Open the developer console in Firefox or another browser and note (copy) the requests that fire as you scroll down the page - you will use these to build your own requests. A request looks like this: https://twitter.com/DiaryofaMadeMan/followers/users?include_available_features=1&include_entities=1&max_position=1584951385597824282&reset_error_state=false. Also search the HTML source for data-min-position, which looks like this: data-min-position="1584938620170076301".
  2. Load the HTML with PhantomJS and parse it with BeautifulSoup. You need the first batch of followers and the data-min-position value. Save the followers to a list and the data-min-position to a variable.
  3. Use the request saved in step 1 and the saved data-min-position to construct a new request: simply replace the number in the request's max_position with the saved data-min-position.
  4. Send the request with python requests (no webdriver any more) and receive the JSON response.
  5. Take the new followers and the new data-min-position from the response JSON.
  6. Repeat steps 3, 4 and 5 until data-min-position = 0 (a sketch of this loop follows below).

This approach is much better than the API, because you can pull large amounts of data without any rate limits.
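A minimal sketch of steps 3 to 6 using python requests. The field names min_position and items_html are assumptions based on Twitter's timeline JSON of that era; verify them against the real response in your network tab, and copy the Cookie and User-Agent headers from your logged-in browser session:

import requests
from bs4 import BeautifulSoup

URL_TPL = ('https://twitter.com/%s/followers/users'
           '?include_available_features=1&include_entities=1'
           '&max_position=%s&reset_error_state=false')
HEADERS = {
    'User-Agent': 'Mozilla/5.0',            # copy the real one from your browser
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': 'PASTE_YOUR_SESSION_COOKIE',  # placeholder: copied from your browser
}

followers = []
position = '1584938620170076301'   # the data-min-position scraped in step 2
while position and position != '0':
    data = requests.get(URL_TPL % ('DiaryofaMadeMan', position), headers=HEADERS).json()
    # the rendered follower cards arrive as an HTML fragment
    soup = BeautifulSoup(data.get('items_html', ''), 'html.parser')
    for card in soup.findAll('a', {'class': 'ProfileCard-bg js-nav'}):
        followers.append(card['href'])
    position = data.get('min_position')   # next cursor; '0' (or missing) when done

print len(followers)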

answered 2017-11-24T16:30:22.710