
Using the Python newspaper3k package, I am trying to loop through all of the articles on a website and build a dataframe from the articles' content.

Each article's meta_data comes back as a nested dictionary. I can pull the values out of a single article, but not when I loop over the articles — the lists end up with a length of 0.
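For a single article the nested lookups work fine. Here is a minimal sketch of what I mean (the URL is just a placeholder for any MarketWatch story):

from newspaper import Article

story_url = 'https://www.marketwatch.com/story/some-article'  # placeholder URL
single = Article(story_url)
single.download()
single.parse()

# meta_data is a nested dict built from the page's <meta> tags,
# e.g. {'og': {'title': '...', 'site_name': '...'}, 'parsely-pub-date': '...'}
meta = single.meta_data
print(meta['og']['title'])           # og:title becomes a nested key
print(meta.get('parsely-pub-date'))  # hyphenated keys stay at the top level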

from rake_nltk import Rake
import readability
import newspaper
from newspaper import Config
from newspaper import Article
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
])

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'

config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10


base_url = 'https://www.marketwatch.com'
article_urls = set()
marketwatch = newspaper.build(base_url, config=config, memoize_articles=False, language='en')


title = []
sitename = []
og_type = []
url = []
og_description = []
twitter_identifier = []
twitter_id = []
fb_id = []
author = []
section = []
pub_date = []
tags = []

for sub_article in marketwatch.articles[0:10]:
  try:
    article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
    article.download()
    article.parse()

    if article.url not in article_urls:
      article_urls.add(article.url)

      # Most of the article's details live in the page's <meta> tags,
      # which newspaper3k exposes as the nested dict article.meta_data
      article_meta_data = article.meta_data

      published_date = {value for (key, value) in article_meta_data.items() if key == 'parsely-pub-date'}
      article_published_date = " ".join(str(x) for x in published_date)

      authors = sorted({value for (key, value) in article_meta_data.items() if key == 'parsely-author'})
      article_author = ', '.join(authors)

      title = {value for (key, value) in article_meta_data.items() if key == 'parsely-title'}
      article_title = " ".join(str(x) for x in title)

      keywords = ''.join({value for (key, value) in article_meta_data.items() if key == 'keywords'})
      keywords_list = sorted(keywords.lower().split(','))
      article_keywords = ', '.join(keywords_list)

      tags = ''.join({value for (key, value) in article_meta_data.items() if key == 'parsely-tags'})
      tag_list = sorted(tags.lower().split(','))
      article_tags = ', '.join(tag_list)

      summary = {value for (key, value) in article_meta_data.items() if key == 'description'}
      article_summary = " ".join(str(x) for x in summary)

      # the replace is used to remove newlines
      article_text = article.text.replace('\n', '')
      for key, value in article.meta_data.items():
        print(key, ' : ', value)
      
      # Trying to append the values to the lists defined above the loop
      title.append(article.meta_data['og']['title'])
      sitename.append(article.meta_data['og']['site_name'])
      og_type.append(article.meta_data['og']['type'])
      url.append(article.meta_data['og']['url'])
      og_description.append(article.meta_data['og']['description'])
      twitter_identifier.append(article.meta_data['twitter']['site']['identifier'])
      twitter_id.append(article.meta_data['twitter']['site']['id'])
      fb_id.append(article.meta_data['fb']['app_id'])
      author.append(article.meta_data['author'])
      section.append(article.meta_data['parsely-section'])
      pub_date.append(article.meta_data['parsely-pub-date'])
      tags.append(article.meta_data['parsely-tags'])

      print()

  except:
    # bare except: silently skip any article whose download/parse/lookup fails
    pass
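Once the loop finishes, the plan is to build the dataframe from those parallel lists, roughly like this (just to show what I am aiming for once the lists actually get filled):

import pandas as pd

# Intended result: one row per article, one column per list collected above
df = pd.DataFrame({
    'title': title,
    'site_name': sitename,
    'og_type': og_type,
    'url': url,
    'description': og_description,
    'twitter_identifier': twitter_identifier,
    'twitter_id': twitter_id,
    'fb_app_id': fb_id,
    'author': author,
    'section': section,
    'pub_date': pub_date,
    'tags': tags,
})
print(len(df))  # right now this is 0 because nothing ever gets appended to the lists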
