
I have some text files containing proxy IPs.

They look like this:

    130.14.29.111:80                          
    130.14.29.120:80                          
    130.159.235.31:80                         
    14.198.198.220:8909                       
    141.105.26.183:8000                       
    160.79.35.27:80                           
    164.77.196.75:80                          
    164.77.196.78:45430                       
    164.77.196.78:80                          
    173.10.134.173:8081                       
    174.132.145.80:80                         
    174.137.152.60:8080                       
    174.137.184.37:8080                       
    174.142.125.161:80     

After checking each proxy, I want to mark the results like this:

     total number of '0' = 8
     total number of 'x' = 6
     percentage = alive 57% , dead 43%


     x  130.14.29.111:80              
     0  130.14.29.120:80              
     0  130.159.235.31:80             
     0  14.198.198.220:8909           
     0  141.105.26.183:8000           
     0  160.79.35.27:80               
     x  164.77.196.75:80              
     x  164.77.196.78:45430           
     x  164.77.196.78:80              
     0  173.10.134.173:8081           
     0  174.132.145.80:80             
     0  174.137.152.60:8080           
     x  174.137.184.37:8080           
     x  174.142.125.161:80           
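
To be concrete, once each proxy has a mark, I picture writing the list back out roughly like this (just a sketch of the output format I want, with a made-up results list and file name, not the checking logic I'm asking about):

    # hypothetical results list: '0' = alive, 'x' = dead
    results = [('x', '130.14.29.111:80'), ('0', '130.14.29.120:80')]

    # write each proxy back out with its mark in front
    marked = open('proxy_marked.txt', 'w')  # file name is just an example
    for mark, proxy in results:
        marked.write('%s  %s\n' % (mark, proxy))
    marked.close()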

How can I do this in Python? If anyone could help me with a sample or just point me in the right direction, I'd be very grateful!

EDIT:

Here is the script I have so far.

When the check finishes, the list of working proxies is saved to 'proxy_alive.txt'.

In that file, I want each proxy marked as alive or dead.

    import socket
    import urllib2
    import threading
    import sys
    import Queue
    import socks  # SocksiPy module, needed for the SOCKS branch below

    socket.setdefaulttimeout(7)

    print "Bobng's proxy checker. Using %s second timeout" % (socket.getdefaulttimeout())

    #input_file = sys.argv[1]
    #proxy_type = sys.argv[2] #options: http,s4,s5
    #output_file = sys.argv[3]
    input_file = 'proxylist.txt'
    proxy_type = 'http'
    output_file = 'proxy_alive.txt'

    url = "www.seemyip.com" # Don't put http:// in here, or any /'s

    check_queue = Queue.Queue()
    output_queue = Queue.Queue()
    threads = 20

    def writer(f, rq):
        # drain the output queue and append each live proxy to the output file
        while True:
            line = rq.get()
            f.write(line + '\n')
            f.flush()

    def checker(q, oq):
        while True:
            proxy_info = q.get()  # ip:port
            if proxy_info is None:
                print "Finished"
                return
            if proxy_type == 'http':
                try:
                    # route a request to Google through the candidate proxy
                    proxy_handler = urllib2.ProxyHandler({'http': proxy_info})
                    opener = urllib2.build_opener(proxy_handler)
                    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
                    urllib2.install_opener(opener)
                    req = urllib2.Request("http://www.google.com")
                    sock = urllib2.urlopen(req, timeout=7)
                    rs = sock.read(1000)
                    if '<title>Google</title>' in rs:
                        oq.put(proxy_info)
                        print '[+] alive proxy', proxy_info
                except urllib2.HTTPError, e:
                    print 'url open error? slow?'
                except Exception, detail:
                    print '[-] bad proxy', proxy_info
            else:
                # gotta be socks
                try:
                    s = socks.socksocket()
                    if proxy_type == "s4":
                        t = socks.PROXY_TYPE_SOCKS4
                    else:
                        t = socks.PROXY_TYPE_SOCKS5
                    ip, port = proxy_info.split(':')
                    s.setproxy(t, ip, int(port))
                    s.connect((url, 80))
                    oq.put(proxy_info)
                    print proxy_info
                except Exception, error:
                    print proxy_info

    writer_thread = threading.Thread(target=writer, args=(open(output_file, "wb"), output_queue))
    writer_thread.setDaemon(True)  # let the program exit even though writer loops forever
    writer_thread.start()
    for i in xrange(threads):
        threading.Thread(target=checker, args=(check_queue, output_queue)).start()
    for line in open(input_file).readlines():
        check_queue.put(line.strip('\n'))
    print "File reading done"
    for i in xrange(threads):
        check_queue.put(None)  # one sentinel per checker thread
    raw_input("PRESS ENTER TO QUIT")
    sys.exit(0)

1 Answer


Is this what you want?

    #!/usr/bin/env python
    import Queue
    import threading
    import urllib2
    import time

    input_file = 'proxylist.txt'
    threads = 10

    queue = Queue.Queue()
    output = []

    class ThreadUrl(threading.Thread):
        """Threaded proxy check: mark each proxy '0' (alive) or 'x' (dead)"""
        def __init__(self, queue):
            threading.Thread.__init__(self)
            self.queue = queue

        def run(self):
            while True:
                # grab a proxy (ip:port) from the queue
                proxy_info = self.queue.get()

                try:
                    proxy_handler = urllib2.ProxyHandler({'http': proxy_info})
                    opener = urllib2.build_opener(proxy_handler)
                    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
                    urllib2.install_opener(opener)
                    req = urllib2.Request("http://www.google.com")
                    sock = urllib2.urlopen(req, timeout=7)
                    rs = sock.read(1000)
                    if '<title>Google</title>' in rs:
                        output.append(('0', proxy_info))
                    else:
                        # a plain string can't be raised; use a real exception
                        raise Exception("Not Google")
                except Exception:
                    # any failure (timeout, refused connection, wrong page) means dead
                    output.append(('x', proxy_info))
                # signal to the queue that this job is done
                self.queue.task_done()

    start = time.time()
    def main():
        # spawn a pool of daemon threads and pass them the queue instance
        for i in range(threads):
            t = ThreadUrl(queue)
            t.setDaemon(True)
            t.start()
        hosts = [host.strip() for host in open(input_file).readlines()]
        # populate the queue with data
        for host in hosts:
            queue.put(host)

        # wait on the queue until everything has been processed
        queue.join()

    main()
    for proxy, host in output:
        print proxy, host

    print "Elapsed Time: %s" % (time.time() - start)
answered 2012-06-02T11:46:43.020