I wrote a spider based on Scrapy 0.14.4, and I've run into a strange problem.

Here is my spider.

The hard-coded URL in the commented-out block in parse_origin() was originally obtained from the for loop below it. Requesting it via those commented-out statements fetches the page fine, but when I yield an HTTP request for a URL taken from the loop, the request takes a very long time and still never gets the page.

import re
import hashlib

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector


class CrawlAmazon(BaseSpider):

    name = 'amazontest'

    def __init__(self):
        self.httpheaders = {
            'Connection'     : 'keep-alive',
            'Accept'         : '*/*',
            'Accept-encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Referer'        : 'http://www.amazon.cn',
            'User-Agent'     : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) '\
                               'AppleWebKit/535.19 (KHTML, like Gecko) '\
                               'Chrome/18.0.1025.45 Safari/535.19',
        }

        self.urls = [
            'http://www.amazon.cn/gp/search/ref=sr_hi_4?rh=n%'\
            '3A852803051%2Cn%3A%21852804051%2Cn%3A111054071%2Cn%'\
            '3A2133896051%2Cn%3A2133899051&bbn=2133899051&ie=UTF8&qid=1351595402',
        ]

    def start_requests(self):
        for url in self.urls:
            headers = self.httpheaders
            headers['Referer'] = 'http://www.google.com'
            yield Request(url, headers=headers, callback=self.parse_origin)

    def parse_origin(self, response):
        hxs = HtmlXPathSelector(response)
        body = response.body
        url_cur = response.url

        # This hard-coded request (commented out here) fetches the page fine:
        # headers = self.httpheaders
        # headers['Referer'] = url_cur
        # headers['Accept'] =  'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        # url = 'http://www.amazon.cn/gp/search/other?rh=n%3A2029189051&'\
        #       'bbn=2029189051&pickerToList=brandtextbin&ie=UTF8&qid=1351597447&rd=1'
        # yield Request(url, headers=headers, callback=self.parse_brand)

        nodes_url = hxs.select('//a[contains(@href, "/gp/search/other")]')

        for node in nodes_url:
            url = node.select('@href').extract()[0]
            url = '%s%s' % ('http://www.amazon.cn', url)

            if re.search('brand', url):
                print 'YIELD NEW REQUEST'
                headers = self.httpheaders
                headers['Referer'] = url_cur
                headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                yield Request(url=url, headers=headers, callback=self.parse_brand)
                break

    def parse_brand(self, response):
        print 'IN PARSE_BRAND'
        url_cur = response.url

        body = response.body
        fn = '%s.txt' % (hashlib.md5(url_cur).hexdigest()[-8:], )
        with open(fn, 'ab+') as fp:
            fp.write(body)
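
For what it's worth, here is a minimal sketch of the same callback with two defensive changes worth trying (both are my additions, not part of the original spider): copy the header dict instead of aliasing it, since headers = self.httpheaders means every request mutates the shared instance dict, and pass dont_filter=True so Scrapy's duplicate-request filter cannot silently drop the URL. Logging the URL right before yielding also makes it easy to compare it byte-for-byte against the hard-coded URL that works.

    def parse_origin(self, response):
        hxs = HtmlXPathSelector(response)
        url_cur = response.url

        for node in hxs.select('//a[contains(@href, "/gp/search/other")]'):
            url = 'http://www.amazon.cn%s' % node.select('@href').extract()[0]

            if re.search('brand', url):
                # Copy instead of aliasing, so self.httpheaders is not
                # mutated by every outgoing request.
                headers = self.httpheaders.copy()
                headers['Referer'] = url_cur
                headers['Accept'] = ('text/html,application/xhtml+xml,'
                                     'application/xml;q=0.9,*/*;q=0.8')

                # Log the exact URL so it can be diffed against the
                # hard-coded one that does work.
                self.log('yielding request for %s' % url)

                # dont_filter=True bypasses the duplicate-request filter,
                # in case the URL was already seen and silently dropped.
                yield Request(url, headers=headers,
                              callback=self.parse_brand, dont_filter=True)
                break

The request can also be reproduced outside the spider with scrapy shell <url>, which helps tell whether the delay comes from the site or from the spider's own settings.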