python - python urllib2 多下载

Question

如何让下面的脚本一次下载多个链接，而不是使用 urllib2 一次下载一个？

Python：

from BeautifulSoup import BeautifulSoup
import lxml.html as html
import urlparse
import os, sys
import urllib2
import re

# Download every page linked from links.html and save the <article> element of
# each page to <dirname>/<name>, where both parts are derived from the URL path.
print("downloading and parsing Bibles...")

# The with-block closes links.html as soon as lxml has parsed it.
with open('links.html') as links_file:
    root = html.parse(links_file)

for link in root.findall('//a'):
    url = link.get('href')
    if not url:
        # An anchor without an href has nothing to download.
        continue

    # Parse the URL once and derive both pieces from its path, e.g. for
    # ".../bible/gen.1.nmv-fas": name is "gen.1.nmv-fas", dirname is "nmv-fas".
    path = urlparse.urlparse(url).path
    name = path.split('/')[-1]
    dirname = path.split('.')[-1]

    # Close the HTTP response explicitly, even if read() fails.
    f = urllib2.urlopen(url)
    try:
        s = f.read()
    finally:
        f.close()

    if not os.path.isdir(dirname):
        os.mkdir(dirname)

    # Keep only the <article> element of the downloaded page.
    soup = BeautifulSoup(s)
    articleTag = soup.html.body.article
    converted = str(articleTag)

    # The with-block flushes and closes the output file before moving on.
    full_path = os.path.join(dirname, name)
    with open(full_path, 'w') as out:
        out.write(converted)
    print(name)
print("DOWNLOADS COMPLETE!")

links.html

<a href="http://www.youversion.com/bible/gen.1.nmv-fas">http://www.youversion.com/bible/gen.1.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.2.nmv-fas">http://www.youversion.com/bible/gen.2.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.3.nmv-fas">http://www.youversion.com/bible/gen.3.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.4.nmv-fas">http://www.youversion.com/bible/gen.4.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.5.nmv-fas">http://www.youversion.com/bible/gen.5.nmv-fas</a>

<a href="http://www.youversion.com/bible/gen.6.nmv-fas">http://www.youversion.com/bible/gen.6.nmv-fas</a>

score 1 · Accepted Answer

Blainer，试试使用线程（threading）。

这是一个很好的实际例子

http://www.ibm.com/developerworks/aix/library/au-threadingpython/

另外也可以参考 Python 标准库文档

http://docs.python.org/library/threading.html

如果您查看那个实际示例，会发现其中正好有一个用线程并发执行 urllib2 下载的版本。我在此基础上更进了一步，帮您把流程又推进了一些；您还需要在代码中标有“FIX THIS PART”的部分自行完成对 HTML 的进一步解析。

#!/usr/bin/env python

import Queue
import threading
import urllib2
import time
import htmllib, formatter

class LinksExtractor(htmllib.HTMLParser):
    # derive new HTML parser

    def __init__(self, formatter):        
        # class constructor
        htmllib.HTMLParser.__init__(self, formatter)  
        # base class constructor
        self.links = []        
        # create an empty list for storing hyperlinks

    def start_a(self, attrs) :  # override handler of <A ...>...</A> tags
        # process the attributes
        if len(attrs) > 0 :
            for attr in attrs :
                if attr[0] == "href":         
                    # ignore all non HREF attributes
                    self.links.append(attr[1]) # save the link info in the list

    def get_links(self) :     
        # return the list of extracted links
        return self.links

# The parser is used only to collect links, so a do-nothing formatter suffices.
format = formatter.NullFormatter()
htmlparser = LinksExtractor(format)

# Run the local link list through the parser. The with-block guarantees that
# links.html is closed even if reading or parsing raises.
with open("links.html") as data:
    htmlparser.feed(data.read())
htmlparser.close()

# One entry per href found in links.html; these are the URLs to download.
hosts = htmlparser.links

# Work queue handed to the downloader threads.
queue = Queue.Queue()

class ThreadUrl(threading.Thread):
    """Worker thread that downloads every URL it takes from a shared queue."""

    def __init__(self, queue):
        """Remember the queue this worker pulls its URLs from.

        queue -- object providing get() and task_done(), e.g. a Queue.Queue
        """
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Fetch queued URLs forever, acknowledging each item exactly once.

        A failed download is reported and skipped instead of killing the
        worker, and task_done() is always called, so a queue.join() in the
        caller cannot hang on an item whose download failed.
        """
        while True:
            # Grab the next URL from the queue (blocks until one is available).
            host = self.queue.get()
            try:
                ####################################
                ############FIX THIS PART###########
                #VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV#
                url = urllib2.urlopen(host)
                try:
                    # Placeholder: parse or save the page here; you are on
                    # your own with this part.
                    morehtml = url.read()
                finally:
                    # Release the connection whether or not read() succeeded.
                    url.close()
            except IOError as err:
                # urllib2.URLError (and its subclass HTTPError) derive from
                # IOError, so unreachable hosts and HTTP errors land here.
                print("Failed to download %s: %s" % (host, err))
            finally:
                # Signal to the queue that this job is done, success or not.
                self.queue.task_done()

# Reference point for the elapsed-time report printed at the end.
start = time.time()


def main():
    """Start five daemon workers, enqueue every host and wait for the queue."""
    worker_count = 5

    # Spawn the pool of threads, handing each one the shared queue instance.
    for _ in range(worker_count):
        worker = ThreadUrl(queue)
        worker.setDaemon(True)
        worker.start()

    # Populate the queue with the URLs to fetch.
    for host in hosts:
        queue.put(host)

    # Block until every queued item has been marked as done.
    queue.join()


main()
print("Elapsed Time: %s" % (time.time() - start))

python - python urllib2 多下载

1 回答 1

Related

Reference