1

试图构建一些机器人来点击页面上的“跳过广告”按钮。到目前为止,我设法使用 Mechanize 加载 Web 驱动程序浏览器并连接到某个页面,但 Mechanize 模块不直接支持 js,所以如果我理解正确,现在我需要像 Selenium 这样的东西。我也是编程初学者,所以请具体一点。我如何将 Selenium 与 Mechanize 一起使用,或者如果有任何其他合适的解决方案,请解释详细信息。

这是我目前为止的源代码:

#!/usr/bin/python
# FILENAME: test.py

import mechanize
import os, time
from random import choice, randrange

# Pool of proxy entries; filled by load_proxy_list() and pruned by the main
# loop whenever a proxy refuses the connection.
prox_list = []

# Common browser User-Agent strings; one is picked at random for each
# connection attempt.
user_agent_strings = [ 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                       'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1',
                       'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
                       'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
                       'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
                       # The closing parenthesis was missing here, leaving a malformed header value.
                       'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)',
                       'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)',
                       'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
                       'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)',
                       'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)',
                       'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)'
                       ]

def load_proxy_list(target):
    """Read proxy entries from a text file into the global ``prox_list``.

    Every non-blank line of ``target`` becomes one entry. Surrounding
    whitespace, including the trailing newline, is stripped so the entry
    can be used directly as a proxy address. The number of entries loaded
    is printed.

    Args:
        target: Path of the text file to read, one proxy per line.

    Raises:
        IOError: If ``target`` cannot be opened (``OSError`` on Python 3).
    """
    # The context manager closes the file even if reading fails part-way.
    with open(target, 'r') as proxy_file:
        stripped_lines = (line.strip() for line in proxy_file)
        # Blank lines would otherwise end up as empty proxy entries.
        entries = [entry for entry in stripped_lines if entry]

    prox_list.extend(entries)

    # A single parenthesised argument prints the same on Python 2 and 3.
    print("Loaded " + str(len(entries)) + " proxies!")

load_proxy_list('proxies.txt')

#for i in range(1,(len(prox_list) - 1)):
# depreceated for overloading
for i in range(1, 30):
    # Refused proxies are removed further down, so the pool can run dry;
    # stop cleanly instead of letting choice() raise IndexError on [].
    if not prox_list:
        print("No proxies left to try, stopping.")
        break

    br = mechanize.Browser()
    #pick a random UAS to add some extra cover to the bot
    br.addheaders = [('User-agent', choice(user_agent_strings))]
    print("----------------------------------------------------")

    #This is bad internet ethics
    br.set_handle_robots(False)

    #choose a proxy
    # Keep the raw list entry for a later remove(); hand a whitespace-free
    # copy to mechanize, since entries may carry a trailing newline.
    proxy_entry = choice(prox_list)
    proxy = proxy_entry.strip()
    br.set_proxies({"http": proxy})
    br.set_debug_http(True)

    try:
        print("Trying connection with: " + str(proxy))
        #currently using: BTC CoinURL - Grooveshark Broadcast
        br.open("http://cur.lv/4czwj")
        print("Opened successfully!")
        #act like a nice little drone and view the ads
        # randrange() needs integer bounds; float arguments are deprecated
        # and raise TypeError on recent Python versions.
        sleep_time_on_link = randrange(17, 34)
        time.sleep(sleep_time_on_link)

    # "except X as e" is valid on Python 2.6+ and Python 3.
    except mechanize.HTTPError as e:
        print("Oops Request threw " + str(e.code))

    except mechanize.URLError as e:
        print("Oops! Request was refused, blacklisting proxy!" + str(e))
        prox_list.remove(proxy_entry)

    finally:
        # "del br" only dropped the name; close() releases the open
        # response, and it runs on every exit path.
        br.close()

    #wait between 5-30 seconds like a good little human
    sleep_time = randrange(5, 30)
    print("Waiting for %.1f seconds like a good bot." % (sleep_time))
    time.sleep(sleep_time)
4

1 回答 1

2

Splinter 应该可以解决这个问题:

http://splinter.cobrateam.info/

from splinter import Browser

# Using the browser as a context manager shuts it down when the block exits.
with Browser('firefox') as browser:
    # The URL needs an explicit scheme; a bare host name such as
    # 'www.website.com' is not a fully qualified URL for the driver.
    browser.visit('http://www.website.com')
    # Look the element up by its HTML "name" attribute and click it.
    browser.find_by_name('element_name').click()
于 2013-10-26T04:06:31.980 回答