我想编写一个机器人,用来点击页面上的“跳过广告”按钮。目前我已经能用 Mechanize 创建浏览器对象并连接到某个页面,但 Mechanize 模块不直接支持 JavaScript,所以如果我理解正确的话,我现在需要 Selenium 之类的工具。我还是编程初学者,所以请讲得具体一些。请问如何将 Selenium 与 Mechanize 配合使用?如果有其他更合适的解决方案,也请详细说明。
以下是我目前的源代码:
#!/usr/bin/python
# FILENAME: test.py
import mechanize
import os, time
from random import choice, randrange
# Proxy addresses available to the script; starts empty and is appended to
# by load_proxy_list().
prox_list = []

# Pool of common browser User-Agent strings. One entry is chosen at random
# for each connection attempt so that requests present themselves as
# different browsers.
user_agent_strings = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)',
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)',
]
def load_proxy_list(target):
    """Load proxy addresses from a text file into the global ``prox_list``.

    The file is read line by line; each non-blank line is treated as one
    proxy entry. Surrounding whitespace (including the trailing newline) is
    stripped so the stored value can be handed to the browser as-is, and
    blank lines are skipped so they are not counted as proxies.

    Args:
        target: Path of the proxy list file to read.

    Returns:
        None. Entries are appended to the module-level ``prox_list`` and the
        number of entries loaded is printed.

    Raises:
        IOError/OSError: If ``target`` cannot be opened.
    """
    count = 0
    # The context manager guarantees the file is closed even if reading fails.
    with open(target, 'r') as proxy_file:
        for line in proxy_file:
            entry = line.strip()
            if not entry:
                continue
            prox_list.append(entry)
            count += 1
    # Single parenthesised argument: valid on both Python 2 and Python 3.
    print("Loaded " + str(count) + " proxies!")
# Script body: load the proxy list, then make a fixed number of connection
# attempts, each through a randomly chosen proxy with a randomly chosen
# User-Agent header.
# NOTE(review): this script ignores robots.txt and generates automated ad
# views; that very likely violates the target service's terms of use.
# Confirm it is permitted before running it.
load_proxy_list('proxies.txt')

# An earlier version looped over the whole proxy list; per the original note
# it was dropped "for overloading", so a fixed 29 attempts are made instead.
for i in range(1, 30):
    # choice() raises IndexError on an empty list, which happens when the
    # file had no entries or every proxy has been blacklisted below.
    if not prox_list:
        print("No proxies left to try, stopping.")
        break

    br = mechanize.Browser()
    # pick a random UAS for this connection attempt
    br.addheaders = [('User-agent', choice(user_agent_strings))]
    print("----------------------------------------------------")
    # This is bad internet ethics: robots.txt handling is switched off
    br.set_handle_robots(False)
    # choose a proxy
    proxy = choice(prox_list)
    br.set_proxies({"http": proxy})
    br.set_debug_http(True)
    try:
        print("Trying connection with: " + str(proxy))
        # currently using: BTC CoinURL - Grooveshark Broadcast
        br.open("http://cur.lv/4czwj")
        print("Opened successfully!")
        # stay on the page for a while; randrange() needs integer bounds
        # (float arguments are rejected by newer Python versions)
        sleep_time_on_link = randrange(17, 34)
        time.sleep(sleep_time_on_link)
    except mechanize.HTTPError as e:
        # HTTPError must be caught before URLError, which is its base class.
        print("Oops Request threw " + str(e.code))
    except mechanize.URLError as e:
        print("Oops! Request was refused, blacklisting proxy!" + str(e))
        # remove() drops only the first matching entry
        prox_list.remove(proxy)
    finally:
        # close the browser explicitly instead of relying on `del`
        br.close()

    # wait between 5 and 29 seconds before the next attempt
    sleep_time = randrange(5, 30)
    print("Waiting for %.1f seconds like a good bot." % (sleep_time))
    time.sleep(sleep_time)