# Using the Python newspaper3k package, I am trying to loop over all the
# articles on a site and build a DataFrame from the article contents.
# The article meta_data comes back as a nested dict; I can extract it from a
# single article, but not while looping over them — every list ends up with
# length 0.
from rake_nltk import Rake
import readability
import newspaper
from newspaper import Config
from newspaper import Article
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# NLTK corpora/models required by the sentiment and tokenization steps.
NLTK_RESOURCES = [
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]
nltk.download(NLTK_RESOURCES)
# Identify as a desktop Chrome browser; some sites reject the default
# python-requests user agent.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'

# Shared newspaper configuration: custom UA plus a 10-second request timeout.
config = Config()
config.request_timeout = 10
config.browser_user_agent = USER_AGENT

# Build the site index for MarketWatch; memoize_articles=False forces a fresh
# crawl instead of reusing the on-disk article cache.
base_url = 'https://www.marketwatch.com'
marketwatch = newspaper.build(base_url, config=config, memoize_articles=False, language='en')

# URLs already processed, used to de-duplicate articles in the main loop.
article_urls = set()
# One accumulator list per DataFrame column; the generator expression creates
# twelve *distinct* empty lists (a chained assignment would alias one list).
(title, sitename, og_type, url, og_description, twitter_identifier,
 twitter_id, fb_id, author, section, pub_date, tags) = ([] for _ in range(12))
for sub_article in marketwatch.articles[0:10]:
    # Download/parse can legitimately fail for individual URLs; handle that
    # narrowly so a bad article is skipped with a visible message.  The
    # original wrapped the whole body in a bare `except: pass`, which
    # silently swallowed every error — including the shadowing bug fixed
    # below — and made the empty lists impossible to diagnose.
    try:
        article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
        article.download()
        article.parse()
    except Exception as exc:
        print(f'skipping {sub_article.url}: {exc}')
        continue

    # De-duplicate: process each canonical article URL only once.
    if article.url in article_urls:
        continue
    article_urls.add(article.url)

    # The majority of the article elements are located within the meta data
    # section of the page's navigational structure.
    article_meta_data = article.meta_data

    published_date = {value for (key, value) in article_meta_data.items() if key == 'parsely-pub-date'}
    article_published_date = " ".join(str(x) for x in published_date)

    authors = sorted({value for (key, value) in article_meta_data.items() if key == 'parsely-author'})
    article_author = ', '.join(authors)

    # BUG FIX: the original bound `title = {...}` and `tags = ''.join(...)`
    # here, shadowing the accumulator lists created before the loop.  The
    # later `title.append(...)` / `tags.append(...)` then raised
    # AttributeError (set/str have no .append), which `except: pass`
    # swallowed — leaving all lists at length 0.  Distinct local names fix it.
    meta_title = {value for (key, value) in article_meta_data.items() if key == 'parsely-title'}
    article_title = " ".join(str(x) for x in meta_title)

    keywords = ''.join({value for (key, value) in article_meta_data.items() if key == 'keywords'})
    article_keywords = ', '.join(sorted(keywords.lower().split(',')))

    meta_tags = ''.join({value for (key, value) in article_meta_data.items() if key == 'parsely-tags'})
    article_tags = ', '.join(sorted(meta_tags.lower().split(',')))

    summary = {value for (key, value) in article_meta_data.items() if key == 'description'}
    article_summary = " ".join(str(x) for x in summary)

    # The replace is used to remove newlines.
    article_text = article.text.replace('\n', '')

    for key, value in article_meta_data.items():
        print(key, ' : ', value)

    # Use .get() with empty-string defaults so one missing meta key cannot
    # raise mid-sequence and leave the column lists out of sync with each
    # other.
    og = article_meta_data.get('og', {})
    twitter_site = article_meta_data.get('twitter', {}).get('site', {})
    if not isinstance(twitter_site, dict):
        # Some pages emit twitter:site as a plain string rather than a
        # nested dict with identifier/id entries.
        twitter_site = {'identifier': twitter_site, 'id': ''}

    title.append(og.get('title', ''))
    sitename.append(og.get('site_name', ''))
    og_type.append(og.get('type', ''))
    url.append(og.get('url', ''))
    og_description.append(og.get('description', ''))
    twitter_identifier.append(twitter_site.get('identifier', ''))
    twitter_id.append(twitter_site.get('id', ''))
    fb_id.append(article_meta_data.get('fb', {}).get('app_id', ''))
    author.append(article_meta_data.get('author', ''))
    section.append(article_meta_data.get('parsely-section', ''))
    pub_date.append(article_meta_data.get('parsely-pub-date', ''))
    tags.append(article_meta_data.get('parsely-tags', ''))
    print()