我正在尝试用多线程来编写一个简单的网络爬虫,但它运行得很慢。
下面是我根据 IBM 资料库中的示例改写的一段代码:
# Threaded URL crawler (Python 2 style: Queue.Queue, t.setDaemon).
# Seeds `slice_size` URLs into `in_queue`, one daemon thread per URL,
# then keeps feeding one new URL per parsed result until both queues drain.
urls = []  # huge list of urls
in_queue = Queue.Queue()   # URLs waiting to be fetched
out_queue = Queue.Queue()  # fetched results waiting to be parsed
pool = ActivePool()
s = threading.Semaphore(semaphore)

# Seed the first slice: enqueue the URL and start one worker thread for it.
for url in urls[:slice_size]:
    in_queue.put(url)
    t = ThreadUrl(pool, s, url, in_queue, out_queue)
    t.setDaemon(True)
    t.start()

counter = slice_size
# NOTE(review): this condition can exit early if both queues are momentarily
# empty while worker threads are still mid-download — depends on how
# ThreadUrl signals completion; verify against ThreadUrl's implementation.
while not in_queue.empty() or not out_queue.empty():
    # BUG FIX: the original read urls[counter] unconditionally, raising
    # IndexError once counter reached len(urls) while results were still
    # pending. Only feed a new URL while some remain.
    if counter < len(urls):
        speed_new_daemon = time.time()
        url = urls[counter]
        in_queue.put(url)
        # PERF: creating and starting a fresh OS thread per URL is the
        # measured ~20% overhead at t.start(). TODO: start a fixed pool of
        # long-lived workers once (each looping on in_queue.get()) instead
        # of one thread per URL — thread creation cost then disappears.
        t = ThreadUrl(pool, s, url, in_queue, out_queue)
        t.setDaemon(True)
        t.start()
        counter += 1
        speed_new_daemon = time.time() - speed_new_daemon

    # Block for the next finished download and parse it.
    speed_parser = time.time()
    result = out_queue.get()
    my_parser(result)
    speed_parser = time.time() - speed_parser
    # speed_parser ~80% of the time, speed_new_daemon ~20% (thread startup).

# Wait until every queued item has been marked done via task_done();
# this hangs forever unless ThreadUrl calls in_queue.task_done() per item.
in_queue.join()