0
#!/usr/bin/env python

import threading
import urllib, sys,os
import Queue


concurrent = 200
queue = Queue.Queue(concurrent*2)

try:
    aim = sys.argv[1].lower()
    dic = open(sys.argv[2],'r')

except:
    print "Usage: %s url wordlist" % sys.argv[0]
    sys.exit(1)

class Scanner(threading.Thread):
    def __init__(self,queue):
        threading.Thread.__init__(self)
        self.queue=queue

    def run(self):

        while True:

            self.path = self.queue.get()
            self.geturl = urllib.urlopen(aim+'/'+self.path)
            self.status =  self.geturl.getcode()
            self.url = aim+self.path
            self.result = self.url+'=>'+str(self.status)
            print self.result
            self.writeresult(self.result)
            self.queue.task_done()



    def writeresult(self,result):

        fp = open('result.txt','a+')
        fp.write(result+'\n')
        fp.close()  


def main():         

    for i in range(concurrent):
        t = Scanner(queue)
        t.setDaemon(True)
        t.start()

    for path in dic.readlines():
        queue.put(path.strip())

    queue.join()

if __name__ == '__main__':
    main()

这是一个用于扫描网站目录的python程序,当扫描完成时,它甚至没有用ctrl+c退出我想知道它什么时候完成扫描如何自动退出程序。

当它在进行中时,也会出现这样的问题:

Exception in thread Thread-130:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 551, in __bootstrap_inner
    self.run()
  File "tt.py", line 28, in run
    self.geturl = urllib.urlopen(aim+'/'+self.path)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 86, in urlopen
    return opener.open(url)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 207, in open
    return getattr(self, name)(url)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib.py", line 344, in open_http
    h.endheaders(data)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 954, in endheaders
    self._send_output(message_body)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 814, in _send_output
    self.send(msg)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 776, in send
    self.connect()
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 757, in connect
    self.timeout, self.source_address)
  File "/usr/local/Cellar/python/2.7.3/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 553, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
IOError: [Errno socket error] [Errno 8] nodename nor servname provided, or not known
4

2 回答 2

0

该程序原样,当所有线程完成时它将关闭。但是为了轻松摆脱所有这些错误,在你的函数运行中,从类中,在 while True: claus 之后,将后面的所有内容放在 try:except: 子句中,如下所示

try:
     code
except:
    pass

它不完全是最干净的方法,但考虑到您所追求的,它会完成这项工作,并且会让您摆脱那些异常,顺便说一句,这意味着某些 URL 已超时。

于 2013-10-13T14:58:05.210 回答
0

我想要一些练习,所以我尝试了这个并改变了很多。它会给你一整套结果吗?您需要将路径替换为原始参数读数。

  • 使用这些线程,您可能会遇到未处理的异常而导致结果丢失?我添加了一种机制来捕获读取过程中的任何错误并将其传递给结果编写器。
  • 我想从多个线程附加到文件是可以的,但我添加了一个编写器线程来更干净地管理文件
  • 大多数给自己的任务都是不必要的
  • 如果您仍然遇到套接字错误,请检查结果文件中的路径并查看您希望如何处理这些结果(如果有的话)
  • 我不是专家,所以不要将此作为最佳实践

import threading
import urllib
import Queue

concurrent = 5

aim = 'http://edition.cnn.com'
paths = ['2013/10/12/opinion/kazin-tea-party/index.html?hpt=hp_t5',
         '2013/10/11/opinion/opinion-hay-nobel-opcw/index.html?hpt=hp_t5',
         '2013/10/11/opinion/rosin-women-in-charge/index.html?hpt=hp_t5',
         'some invalid path',
         '2013']  # also an invalid path


def main():
    work_q = Queue.Queue()
    result_q = Queue.Queue()

    # start the scanners and the result writer
    scanners = [Scanner(work_q, result_q) for i in range(concurrent)]
    for s in scanners:
        s.start()
    results_file_path = 'results.txt'
    result_writer = ResultWriter(result_q, 'results.txt')
    result_writer.start()
    # send all the work and wait for it to be completed
    for path in paths:
        work_q.put(path.strip())
    work_q.join()
    # tell everyone to stop
    # you could just kill the threads but you writer needs to close the file
    for s in scanners:
        work_q.put(Scanner.STOP_TOKEN)
    result_q.put(ResultWriter.STOP_TOKEN)  # make sure file gets closed
    # wait for everyone to actually stop
    for s in scanners:
        s.join()
    result_writer.join()
    print 'the scan has finished and results are in {}'.format(results_file_path)


class Scanner(threading.Thread):
    STOP_TOKEN = '<<stop>>'

    def __init__(self, work_q, result_q):
        threading.Thread.__init__(self)
        self.work_q = work_q
        self.result_q = result_q

    def run(self):
        while True:
            path = status = None  # reset in case of error
            try:
                try:
                    path = self.work_q.get(timeout=0.00001)
                except Queue.Empty:
                    continue
                if path == self.STOP_TOKEN:
                    break  # stop looking for work
                get_url = urllib.urlopen(aim + '/' + path)
                status = get_url.getcode()
            except Exception as e:
                status = 'unhandled error ({})'.format(e)
            self.result_q.put((path, status))
            self.work_q.task_done()


class ResultWriter(threading.Thread):
    STOP_TOKEN = '<<stop>>'

    def __init__(self, result_q, results_file_path):
        threading.Thread.__init__(self)
        self.result_q = result_q
        self.results_file_path = results_file_path

    def run(self):
        with open(self.results_file_path, 'w') as results_file:
            while True:
                try:
                    result = self.result_q.get(timeout=0.00001)
                except Queue.Empty:
                    continue
                if result == self.STOP_TOKEN:
                    break  # stop looking for results
                path, status = result
                results_file.write('{}=>{}\n'.format(path, status))


if __name__ == '__main__':
    main()
于 2013-10-13T17:36:53.387 回答