我建议您查看我在 GitHub 上发布的 newspaper 库概述文档。该文档包含多个提取示例，以及其他可能有用的技巧。
import newspaper
from newspaper import Config
from newspaper import Article
# Script: walk the articles that newspaper discovers on MarketWatch and pull
# the publication date, author, title, keywords, tags, summary and body text
# of each one out of the page's <meta> tags.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.marketwatch.com'
# URLs that have already been handled, so each article is processed only once.
article_urls = set()

marketwatch = newspaper.build(base_url, config=config, memoize_articles=False, language='en')
for sub_article in marketwatch.articles:
    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    if article.url in article_urls:
        continue
    # Record the URL up front; without this the membership test above never
    # filters anything.
    article_urls.add(article.url)

    # meta_data and text are only populated after the page has been
    # downloaded and parsed.
    try:
        article.download()
        article.parse()
    except newspaper.ArticleException:
        # One unreachable or unparsable article should not abort the crawl.
        continue

    # Most of the fields of interest are read from the page's meta tags,
    # which newspaper exposes as a dictionary.
    article_meta_data = article.meta_data

    # A missing tag yields an empty string rather than an error.
    article_published_date = str(article_meta_data.get('parsely-pub-date', ''))
    article_author = str(article_meta_data.get('parsely-author', ''))
    article_title = str(article_meta_data.get('parsely-title', ''))

    # Keywords and tags are comma-separated strings. Strip each item (so that
    # "a, b" does not sort on the leading space) and drop empty entries.
    keywords = str(article_meta_data.get('keywords', ''))
    keywords_list = sorted(item.strip() for item in keywords.lower().split(',') if item.strip())
    article_keywords = ', '.join(keywords_list)

    tags = str(article_meta_data.get('parsely-tags', ''))
    tag_list = sorted(item.strip() for item in tags.lower().split(',') if item.strip())
    article_tags = ', '.join(tag_list)

    article_summary = str(article_meta_data.get('description', ''))

    # the replace is used to remove newlines
    article_text = article.text.replace('\n', '')
import newspaper
from newspaper import Config
from newspaper import Article
# Script: walk the articles that newspaper discovers on Euronews and pull the
# publication date, title, summary, keywords and body text of each one.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10

base_url = 'https://www.euronews.com'
# URLs that have already been handled, so each article is processed only once.
article_urls = set()

euronews = newspaper.build(base_url, config=config, memoize_articles=False, language='en')
for sub_article in euronews.articles:
    if sub_article.url in article_urls:
        continue
    # Record the URL up front; without this the membership test above never
    # filters anything.
    article_urls.add(sub_article.url)

    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    # meta_data, title and text are only populated after the page has been
    # downloaded and parsed.
    try:
        article.download()
        article.parse()
    except newspaper.ArticleException:
        # One unreachable or unparsable article should not abort the crawl.
        continue

    # Most of the fields of interest are read from the page's meta tags,
    # which newspaper exposes as a dictionary.
    article_meta_data = article.meta_data

    # A missing tag yields an empty string rather than an error.
    article_published_date = str(article_meta_data.get('date.created', ''))
    article_title = article.title
    article_summary = str(article_meta_data.get('description', ''))

    # Keywords are a comma-separated string. Strip each item (so that "a, b"
    # does not sort on the leading space) and drop empty entries.
    keywords = str(article_meta_data.get('keywords', ''))
    keywords_list = sorted(item.strip() for item in keywords.lower().split(',') if item.strip())
    article_keywords = ', '.join(keywords_list)

    # the replace is used to remove newlines
    article_text = article.text.replace('\n', '')