-2

当我尝试运行我的代码时,我遇到了这个问题,我已经为此抓取定义了一个实时请求,但仍然无法正常工作。有谁知道如何在python中处理这个问题?在这种情况下,站点地图有多重要?提前致谢

import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
                                process_singular_item,
                                process_date_item,
                                process_array_item,
                                process_plural_texts,
                                process_external_links,
                                process_article_text)

HEADER_XPATH = ['//h1[@class="article-title"]//text()']
AUTHOR_XPATH = ['//span[@class="cnnbyline"]//text()',
            '//span[@class="byline"]//text()']
PUBDATE_XPATH = ['//span[@class="cnnDateStamp"]//text()']
TAGS_XPATH = ['']
CATEGORY_XPATH = ['']
TEXT = ['//div[@id="storytext"]//text()',
    '//div[@id="storycontent"]//p//text()']
INTERLINKS = ['//span[@class="inStoryHeading"]//a/@href']
DATE_FORMAT_STRING = '%Y-%m-%d'


class CNNnewsSpider(ModSitemapSpider):

    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]


def parse(self, response):
    items = []
    item = ContentItems()
    item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
    item['resource'] = urlparse(response.url).hostname
    item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
    item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
    item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
    item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
    item['article_text'] = process_article_text(self, response, TEXT)
    item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
    item['link'] = response.url
    items.append(item)
    return items

这是我的文本结果:

File "/home/nik/project/lib/python3.5/site-      packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET     http://edition.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50-     years-joseph/index.html> from <GET http://www.cnn.com/2016/10/15/opinions/the-     black-panthers-heirs-after-50-years-joseph/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET   http://edition.cnn.com/2016/10/15/africa/montreal-climate-change-hfc-  kigali/index.html> from <GET http://www.cnn.com/2016/10/15/africa/montreal-  climate-change-hfc-kigali/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html> from <GET http://www.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html>
2016-10-17 18:48:04 [scrapy] ERROR: Spider error processing <GET    http://edition.cnn.com/2016/10/15/politics/donald-trump-hillary-clinton-drug-    test/index.html> (referer: http://edition.cnn.com/sitemaps/sitemap-news.xml)
Traceback (most recent call last):
File "/home/nik/project/lib/python3.5/site-   packages/twisted/internet/defer.py", line 587, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/nik/project/lib/python3.5/site-   packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
4

1 回答 1

0

抛出异常是因为您的类没有CNNnewsSpider覆盖parse(). 尽管您在粘贴的代码scrapy.BaseSpider中定义了一个方法,但由于缩进,它没有被包含在内:相反,它被定义为一个独立的函数。您需要按如下方式修复缩进:parse()CNNnewsSpider

class CNNnewsSpider(ModSitemapSpider):
    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

    def parse(self, response):
        items = []
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        items.append(item)
        return items
于 2016-10-17T19:24:51.150 回答