0

使用我正在编写一个 python 脚本来检查 url 是否有效。该脚本会将 url 和响应代码写入日志文件。为了加快检查速度,我使用了线程和队列。

如果要检查的 url 的数量很少,则该脚本运行良好,但是当将 url 的数量增加到数百个时,一些 url 会从日志文件中丢失。

有什么我需要解决的吗?

我的脚本是

#!/usr/bin/env python
import Queue
import threading
import urllib2,urllib,sys,cx_Oracle,os
import time
from urllib2 import HTTPError, URLError


queue = Queue.Queue()
##print_queue = Queue.Queue()

class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_302(self, req, fp, code, msg, headers):
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        infourl.code = code
        return infourl
    http_error_300 = http_error_302
    http_error_301 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

class ThreadUrl(threading.Thread):
    #Threaded Url Grab
##    def __init__(self, queue, print_queue):
    def __init__(self, queue,error_log):    
        threading.Thread.__init__(self)
        self.queue = queue
##        self.print_queue = print_queue
        self.error_log = error_log

    def do_something_with_exception(self,idx,url,error_log):
        exc_type, exc_value = sys.exc_info()[:2]
##        self.print_queue.put([idx,url,exc_type.__name__])
        with open( error_log, 'a') as err_log_f:
            err_log_f.write("{0},{1},{2}\n".format(idx,url,exc_type.__name__))


    def openUrl(self,pair):
        try:
            idx = pair[1]
            url = 'http://'+pair[2]

            opener = urllib2.build_opener(NoRedirectHandler())
            urllib2.install_opener(opener)
            request = urllib2.Request(url)
            request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1')


            #open urls of hosts 
            resp = urllib2.urlopen(request, timeout=10)

##            self.print_queue.put([idx,url,resp.code])
            with open( self.error_log, 'a') as err_log_f:
                err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))


        except:
            self.do_something_with_exception(idx,url,self.error_log)    


    def run(self):
        while True:
            #grabs host from queue
            pair = self.queue.get()
            self.openUrl(pair)

            #signals to queue job is done
            self.queue.task_done()

def readUrlFromDB(queue,connect_string,column_name,table_name):
    try:  
        connection = cx_Oracle.Connection(connect_string)
        cursor = cx_Oracle.Cursor(connection)
        query = 'select ' + column_name + ' from ' + table_name
        cursor.execute(query)

        #Count lines in the file
        rows = cursor.fetchall()
        total = cursor.rowcount        

        #Loop through returned urls
        for row in rows:
            #print row[1],row[2]
##            url = 'http://'+row[2]
            queue.put(row)
        cursor.close()
        connection.close()

        return total

    except cx_Oracle.DatabaseError, e:
        print e[0].context
        raise   

def main():   
    start = time.time()
    error_log = "D:\\chkWebsite_Error_Log.txt"

    #Check if error_log file exists
    #If exists then deletes it
    if os.path.isfile(error_log):  
         os.remove(error_log)

    #spawn a pool of threads, and pass them queue instance 
    for i in range(10):
        t = ThreadUrl(queue,error_log)
        t.setDaemon(True)
        t.start()

    connect_string,column_name,table_name = "user/pass@db","*","T_URL_TEST"
    tn = readUrlFromDB(queue,connect_string,column_name,table_name)


   #wait on the queue until everything has been processed     
    queue.join()
##    print_queue.join()

    print "Total retrived: {0}".format(tn)
    print "Elapsed Time: %s" % (time.time() - start)

main()
4

2 回答 2

1

由于全局解释器锁,Python 的线程模块并不是真正的多线程,http ://wiki.python.org/moin/GlobalInterpreterLock 因此你应该真正使用multiprocessing http://docs.python.org/library/multiprocessing.html如果你真的想利用多核。

此外,您似乎正在同时访问文件

with open( self.error_log, 'a') as err_log_f:
    err_log_f.write("{0},{1},{2}\n".format(idx,url,resp.code))

这真的很糟糕 AFAIK,如果两个线程试图同时或几乎同时写入同一个文件,请记住,它们不是真正的多线程,行为往往是未定义的,想象一个线程写入而另一个刚关了...

无论如何,您需要第三个队列来处理对文件的写入。

于 2012-07-13T05:52:36.893 回答
0

乍一看,这看起来像是一种竞争条件,因为许多线程都试图同时写入日志文件。有关如何锁定文件以进行写入的一些指示,请参阅此问题(因此一次只有一个线程可以访问它)。

于 2012-07-13T05:47:37.663 回答