I am trying to scrape a French website with newspaper3k, but it only returns 50 articles. The site clearly has far more than 50. What am I doing wrong?
My goal is to scrape every article on the site.
This is what I tried:
import newspaper

legorafi_paper = newspaper.build('http://www.legorafi.fr/', memoize_articles=False)

# Empty list to put all urls
papers = []

for article in legorafi_paper.articles:
    papers.append(article.url)

print(legorafi_paper.size())
This prints 50 articles.
I don't understand why newspaper3k only scrapes 50 articles and not more.
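For reference, here is a minimal diagnostic sketch (using newspaper3k's documented Source API; the variable name is mine) that shows which category pages build() actually discovered and how many article links came out of them. If this also prints roughly 50, the limit would seem to come from how many links those pages expose rather than from newspaper3k itself:

import newspaper

paper = newspaper.build('http://www.legorafi.fr/', memoize_articles=False)
print(paper.category_urls())   # category pages newspaper discovered from the homepage
print(len(paper.articles))     # article links collected from those pages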
Update on what I have tried:
def Foo(firstTime=[]):
    if firstTime == []:
        WebDriverWait(driver, 30).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.CSS_SELECTOR, "div#appconsent>iframe")))
        firstTime.append('Not Empty')
    else:
        print('Cookies already accepted')
%%time

categories = ['societe', 'politique']   # overridden by the second assignment below

import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

import newspaper
import requests
from newspaper.utils import BeautifulSoup
from newspaper import Article

categories = ['people', 'sports']
papers = []
urls_set = set()   # unique article URLs

driver = webdriver.Chrome(executable_path="/Users/name/Downloads/chromedriver 4")
driver.get('http://www.legorafi.fr/')

for category in categories:
    url = 'http://www.legorafi.fr/category/' + category
    # WebDriverWait(self.driver, 10)
    driver.get(url)
    Foo()   # switch into the consent iframe on the first visit only
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button.button--filled>span.baseText"))).click()

    pagesToGet = 2
    title = []
    content = []
    for page in range(1, pagesToGet + 1):
        print('Processing page :', page)
        # url = 'http://www.legorafi.fr/category/france/politique/page/' + str(page)
        # (see the URL-based pagination sketch after this block)
        print(driver.current_url)
        # print(url)
        time.sleep(3)

        raw_html = requests.get(url)
        soup = BeautifulSoup(raw_html.text, 'html.parser')
        for articles_tags in soup.findAll('div', {'class': 'articles'}):
            for article_href in articles_tags.find_all('a', href=True):
                if not str(article_href['href']).endswith('#commentaires'):
                    urls_set.add(article_href['href'])
                    papers.append(article_href['href'])

        for paper_url in papers:   # separate name so the category url above is not overwritten
            article = Article(paper_url)
            article.download()
            article.parse()
            if article.title not in title:
                title.append(article.title)
            if article.text not in content:
                content.append(article.text)
            # print(article.title, article.text)
            time.sleep(5)

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        driver.find_element_by_xpath("//a[contains(text(),'Suivant')]").click()
        time.sleep(10)
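As an alternative to clicking 'Suivant' with Selenium, here is a hedged sketch of walking the category archives directly by URL, assuming Le Gorafi paginates them as /category/<name>/page/<n>/ (the pattern hinted at by the commented-out URL above). The page count and variable names are placeholders, not something I have verified against the site:

import requests
from bs4 import BeautifulSoup

base_url = 'http://www.legorafi.fr/category/{}/page/{}/'
collected_urls = set()

for category in ['people', 'sports']:
    for page in range(1, 3):                       # how many pages exist is an assumption
        response = requests.get(base_url.format(category, page))
        if response.status_code != 200:            # stop once the archive runs out of pages
            break
        soup = BeautifulSoup(response.text, 'html.parser')
        for articles_tag in soup.find_all('div', {'class': 'articles'}):
            for link in articles_tag.find_all('a', href=True):
                if not link['href'].endswith('#commentaires'):
                    collected_urls.add(link['href'])

print(len(collected_urls))

The collected URLs could then be fed into newspaper's Article the same way as in the loop above.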