当我尝试运行下面的代码时遇到了这个问题:我已经为这个爬虫定义了 `parse` 回调来处理实际请求,但它仍然无法正常工作。有谁知道在 Python/Scrapy 中该如何处理这个问题?另外,在这种情况下站点地图(sitemap)有多重要?提前致谢。
import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
process_singular_item,
process_date_item,
process_array_item,
process_plural_texts,
process_external_links,
process_article_text)
# ---------------------------------------------------------------------------
# Field-extraction configuration for CNN article pages.
# Each *_XPATH constant is a list of candidate XPath expressions handed to
# the process_* helpers (presumably tried in order — confirm in
# content.item_functions).
# NOTE(review): TAGS_XPATH / CATEGORY_XPATH contain only an empty string —
# looks like placeholders; verify the helpers tolerate an empty expression.
# ---------------------------------------------------------------------------
HEADER_XPATH = ['//h1[@class="article-title"]//text()']  # article headline
AUTHOR_XPATH = ['//span[@class="cnnbyline"]//text()',
'//span[@class="byline"]//text()']  # byline; two page layouts observed
PUBDATE_XPATH = ['//span[@class="cnnDateStamp"]//text()']  # publication date
TAGS_XPATH = ['']
CATEGORY_XPATH = ['']
TEXT = ['//div[@id="storytext"]//text()',
'//div[@id="storycontent"]//p//text()']  # article body; two page layouts
INTERLINKS = ['//span[@class="inStoryHeading"]//a/@href']  # in-story links
DATE_FORMAT_STRING = '%Y-%m-%d'  # strptime format expected by process_date_item
class CNNnewsSpider(ModSitemapSpider):
    """Sitemap-driven spider for CNN news articles.

    Reads article URLs from CNN's news sitemap and extracts the fields
    configured by the module-level *_XPATH constants into a ContentItems
    record.

    NOTE(review): in the original paste, ``parse`` had lost its
    class-body indentation, so Scrapy never saw it as this spider's
    callback and invoked the default ``Spider.parse`` — which raises
    ``NotImplementedError`` (exactly the traceback shown below).
    Keeping ``parse`` indented inside the class body is the fix.
    """

    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

    def parse(self, response):
        """Build one ContentItems record from an article response.

        :param response: Scrapy Response for a single article page
            reached via the sitemap.
        :returns: list containing one populated ContentItems instance
            (Scrapy accepts an iterable of items from a callback).
        """
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        # Hostname of the fetched URL, e.g. "edition.cnn.com" after the
        # www.cnn.com -> edition.cnn.com redirect seen in the logs.
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        return [item]
这是我运行时得到的错误回溯(traceback)和日志输出:
File "/home/nik/project/lib/python3.5/site-packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50-years-joseph/index.html> from <GET http://www.cnn.com/2016/10/15/opinions/the-black-panthers-heirs-after-50-years-joseph/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/15/africa/montreal-climate-change-hfc-kigali/index.html> from <GET http://www.cnn.com/2016/10/15/africa/montreal-climate-change-hfc-kigali/index.html>
2016-10-17 18:48:04 [scrapy] DEBUG: Redirecting (302) to <GET http://edition.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html> from <GET http://www.cnn.com/2016/10/14/middleeast/battle-for-mosul-hawija-iraq/index.html>
2016-10-17 18:48:04 [scrapy] ERROR: Spider error processing <GET http://edition.cnn.com/2016/10/15/politics/donald-trump-hillary-clinton-drug-test/index.html> (referer: http://edition.cnn.com/sitemaps/sitemap-news.xml)
Traceback (most recent call last):
File "/home/nik/project/lib/python3.5/site-packages/twisted/internet/defer.py", line 587, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/nik/project/lib/python3.5/site-packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError