3

新手来了 我使用 urllib2 编写了一个简单的脚本来浏览 Billboard.com,并从 1958 年到 2013 年每周抓取顶级歌曲和艺术家。问题是它非常慢 - 需要几个小时才能完成。

我想知道瓶颈在哪里,是否有办法使用 Urllib2 更有效地抓取,或者我是否需要使用更复杂的工具?

import re
import urllib2
array = []
url = 'http://www.billboard.com/charts/1958-08-09/hot-100'
date = ""
while date != '2013-07-13':
    response = urllib2.urlopen(url)
    htmlText = response.read()
    date = re.findall('\d\d\d\d-\d\d-\d\d',url)[0]
    song = re.findall('<h1>.*</h1>', htmlText)[0]
    song = song[4:-5]
    artist = re.findall('/artist.*</a>', htmlText)[1]
    artist = re.findall('>.*<', artist)[0]
    artist = artist[1:-1]
    nextWeek = re.findall('href.*>Next', htmlText)[0]
    nextWeek = nextWeek[5:-5]
    array.append([date, song, artist])
    url = 'http://www.billboard.com' + nextWeek
print array
4

2 回答 2

3

您的瓶颈几乎肯定是从网站获取数据。每个网络请求都有延迟,这会阻止其他任何事情在此期间发生。您应该考虑跨多个线程拆分请求,以便一次可以发送多个请求。基本上,您的性能是受 I/O 限制的,而不是受 CPU 限制的。

这是一个从头开始构建的简单解决方案,以便您了解爬虫通常是如何工作的。从长远来看,使用像 Scrapy 这样的东西可能是最好的,但我发现从简单而明确的东西开始总是有帮助的。

import threading
import Queue
import time
import datetime
import urllib2
import re

class Crawler(threading.Thread):
    def __init__(self, thread_id, queue):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.queue = queue

        # let's use threading events to tell the thread when to exit
        self.stop_request = threading.Event()

    # this is the function which will run when the thread is started
    def run(self):
        print 'Hello from thread %d! Starting crawling...' % self.thread_id

        while not self.stop_request.isSet():
            # main crawl loop

            try:
                # attempt to get a url target from the queue
                url = self.queue.get_nowait()
            except Queue.Empty:
                # if there's nothing on the queue, sleep and continue
                time.sleep(0.01)
                continue

            # we got a url, so let's scrape it!
            response = urllib2.urlopen(url) # might want to consider adding a timeout here
            htmlText = response.read()

            # scraping with regex blows.
            # consider using xpath after parsing the html using lxml.html module
            song = re.findall('<h1>.*</h1>', htmlText)[0]
            song = song[4:-5]
            artist = re.findall('/artist.*</a>', htmlText)[1]
            artist = re.findall('>.*<', artist)[0]
            artist = artist[1:-1]

            print 'thread %d found artist:', (self.thread_id, artist)

    # we're overriding the default join function for the thread so
    # that we can make sure it stops
    def join(self, timeout=None):
        self.stop_request.set()
        super(Crawler, self).join(timeout)

if __name__ == '__main__':
    # how many threads do you want?  more is faster, but too many
    # might get your IP blocked or even bring down the site (DoS attack)
    n_threads = 10

    # a Queue.Queue is thread-safe, so all workers can share it directly
    queue = Queue.Queue()

    # build the worker objects up front (they are not started yet)
    threads = [Crawler(i, queue) for i in range(n_threads)]

    # enqueue one chart URL for every week in the date range
    url_template = 'http://www.billboard.com/charts/%s/hot-100'
    start_date = datetime.datetime(year=1958, month=8, day=9)
    end_date = datetime.datetime(year=1959, month=9, day=5)
    delta = datetime.timedelta(weeks=1)

    date = start_date
    while date <= end_date:
        queue.put(url_template % date.strftime('%Y-%m-%d'))
        date += delta

    # start crawling!
    for worker in threads:
        worker.start()

    # poll until every queued url has been claimed by a worker
    while not queue.empty():
        time.sleep(0.01)

    # signal the workers to stop and wait for each one to finish
    for worker in threads:
        worker.join()
于 2013-07-16T19:50:31.787 回答
2

这是使用Scrapy的解决方案。看一下概述,您就会明白它是为此类任务设计的工具:

  • 它很快(基于 Twisted 异步网络框架)
  • 易于使用和理解
  • 内置的基于 XPath 的提取机制(您也可以使用 BeautifulSoup 或 lxml)
  • 内置支持将提取的项目流水线化到数据库、xml、json 等等
  • 还有更多功能

这是一个工作蜘蛛,可以提取您所要求的所有内容(在我相当旧的笔记本电脑上工作了 15 分钟):

import datetime
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector


class BillBoardItem(Item):
    # Container for one scraped chart entry. Scrapy Fields carry no type
    # info; they only declare which keys the item accepts.
    date = Field()    # chart week label extracted from the page
    song = Field()    # song title text
    artist = Field()  # artist name text


BASE_URL = "http://www.billboard.com/charts/%s/hot-100"


class BillBoardSpider(BaseSpider):
    """Crawl every weekly Hot 100 chart from 1958-08-09 until the end of
    2012 and yield one BillBoardItem per song row on each chart page."""

    name = "billboard_spider"
    allowed_domains = ["billboard.com"]

    def __init__(self):
        # Generate one start URL per chart week, stepping 7 days at a
        # time from the very first Hot 100 chart.
        date = datetime.date(year=1958, month=8, day=9)

        self.start_urls = []
        while date.year < 2013:
            self.start_urls.append(BASE_URL % date.strftime('%Y-%m-%d'))
            date += datetime.timedelta(days=7)

    def parse(self, response):
        """Extract the chart date plus every (song, artist) pair on the page."""
        hxs = HtmlXPathSelector(response)
        date = hxs.select('//span[@class="chart_date"]/text()').extract()[0]

        songs = hxs.select('//div[@class="listing chart_listing"]/article')
        for song in songs:
            item = BillBoardItem()
            item['date'] = date
            try:
                item['song'] = song.select('.//header/h1/text()').extract()[0]
                item['artist'] = song.select('.//header/p[@class="chart_info"]/a/text()').extract()[0]
            except IndexError:
                # BUG FIX: the original bare `except:` swallowed every
                # exception (even KeyboardInterrupt / SystemExit). Only a
                # missing song/artist node -- an empty extract() list,
                # which raises IndexError on [0] -- should skip the row.
                continue

            yield item

将其保存为 billboard.py,然后通过 scrapy runspider billboard.py -o output.json 运行。之后,在 output.json 中您将看到:

...
{"date": "September 20, 1958", "artist": "Domenico Modugno", "song": "Nel Blu Dipinto Di Blu (Volar\u00c3\u00a9)"}
{"date": "September 20, 1958", "artist": "The Everly Brothers", "song": "Bird Dog"}
{"date": "September 20, 1958", "artist": "The Elegants", "song": "Little Star"}
{"date": "September 20, 1958", "artist": "Tommy Edwards", "song": "It's All In The Game"}
{"date": "September 20, 1958", "artist": "Jimmy Clanton And His Rockets", "song": "Just A Dream"}
{"date": "September 20, 1958", "artist": "Poni-Tails", "song": "Born Too Late"}
{"date": "September 20, 1958", "artist": "The Olympics", "song": "Western Movies"}
{"date": "September 20, 1958", "artist": "Little Anthony And The Imperials", "song": "Tears On My Pillow"}
{"date": "September 20, 1958", "artist": "Robin Luke", "song": "Susie Darlin'"}
{"date": "September 27, 1958", "artist": "Domenico Modugno", "song": "Nel Blu Dipinto Di Blu (Volar\u00c3\u00a9)"}
{"date": "September 27, 1958", "artist": "The Everly Brothers", "song": "Bird Dog"}
{"date": "September 27, 1958", "artist": "Tommy Edwards", "song": "It's All In The Game"}
{"date": "September 27, 1958", "artist": "The Elegants", "song": "Little Star"}
{"date": "September 27, 1958", "artist": "Jimmy Clanton And His Rockets", "song": "Just A Dream"}
{"date": "September 27, 1958", "artist": "Little Anthony And The Imperials", "song": "Tears On My Pillow"}
{"date": "September 27, 1958", "artist": "Robin Luke", "song": "Susie Darlin'"}
...

另外,看看grequests作为替代工具。

希望有帮助。

于 2013-07-16T20:13:19.913 回答