I am trying to scrape Twitter follower data for an account with 80K followers using the Selenium Chrome webdriver and BeautifulSoup. My script has two problems:
1) When scrolling to the bottom of the page to get the full page source after all followers have loaded, my script does not scroll all the way down. It stops scrolling after a random number of followers have loaded and then starts iterating over each follower profile to collect their data. I want it to load all of the followers on the page first and only then start iterating over the profiles (roughly the pattern sketched below).
2) My second problem is that every time I run the script, it scrolls to the bottom step by step until all followers are loaded, and then starts extracting data by parsing one follower at a time. In my case this would take 4 to 5 days to fetch all the follower data (80K followers). Is there a better way to do this?
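For reference, here is a minimal sketch of the scroll-until-stable pattern I am aiming for. The 5-second pause and the stable_rounds threshold of 3 are just guesses on my part, not values I have validated:

import time

def scroll_until_stable(driver, pause=5, stable_rounds=3):
    # Keep scrolling to the bottom until the document height stops
    # growing for `stable_rounds` consecutive checks, instead of
    # looping a fixed number of times.
    last_height = driver.execute_script("return document.body.scrollHeight")
    unchanged = 0
    while unchanged < stable_rounds:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            unchanged += 1
        else:
            unchanged = 0
            last_height = new_height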
Here is my script:
from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join
print "Running for chrome."
chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
    os.environ["webdriver.chrome.driver"]=chromedriver
    chromeOptions = webdriver.ChromeOptions()
    prefs = {"download.default_directory" : download_path}
    chromeOptions.add_experimental_option("prefs",prefs)
    driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
    driver.implicitly_wait(20)
    driver.maximize_window()
except Exception as err:
    print "Error: Failed to open chrome."
    print "Error: ",err
    driver.stop_client()
    driver.close()
#opening the web page
try:
    driver.get('https://twitter.com/login')
except Exception as err:
    print "Error: Failed to open url."
    print "Error: ",err
    driver.stop_client()
    driver.close()
username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")
username.send_keys("###########")
password.send_keys("###########")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadserver/followers')
followers_link=driver.page_source  # follower page, 18 profiles at a time
soup=BeautifulSoup(followers_link,'html.parser')
output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
followers_count = 80000        # account has roughly 80K followers (see above)
followers_per_page = 18        # followers page loads about 18 profiles per scroll
lastHeight = driver.execute_script("return document.body.scrollHeight")
for _ in xrange(0, followers_count/followers_per_page + 1):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        # page height stopped growing, so grab the page source and collect profile links
        followers_link=driver.page_source  # follower page, 18 profiles at a time
        soup=BeautifulSoup(followers_link,'html.parser')
        div = soup.find('div',{'class':'GridTimeline-items has-items'})
        bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
        for name in bref:
            name_list.append(name['href'])
        break
    lastHeight = newHeight
followers_link=''
print len(name_list)
for x in range(0,len(name_list)):
    #print name['href']
    #print name.text
    driver.stop_client()
    driver.get('https://twitter.com'+name_list[x])
    page_source=driver.page_source
    each_soup=BeautifulSoup(page_source,'html.parser')
    profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
    try:
        name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
        if name:
            output.write('"'+name.strip().encode('utf-8')+'"'+',')
        else:
            output.write(' '+',')
    except Exception as e:
        output.write(' '+',')
        print 'Error in name:',e
    try:
        handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
        if handle:
            output.write('"'+handle.strip().encode('utf-8')+'"'+',')
        else:
            output.write(' '+',')
    except Exception as e:
        output.write(' '+',')
        print 'Error in handle:',e
    try:
        location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
        if location:
            output.write('"'+location.strip().encode('utf-8')+'"'+',')
        else:
            output.write(' '+',')
    except Exception as e:
        output.write(' '+',')
        print 'Error in location:',e
    try:
        bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
        if bio:
            output.write('"'+bio.strip().encode('utf-8')+'"'+',')
        else:
            output.write(' '+',')
    except Exception as e:
        output.write(' '+',')
        print 'Error in bio:',e
    try:
        joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
        if joinDate:
            output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
        else:
            output.write(' '+',')
    except Exception as e:
        output.write(' '+',')
        print 'Error in joindate:',e
    try:
        url = [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
        if url:
            output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
        else:
            output.write(' '+'\n')
    except Exception as e:
        output.write(' '+'\n')
        print 'Error in url:',e
output.close()
os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")