我正在尝试为一个学校项目抓取https: //edition.cnn.com/ ...我定义了一个获取查询作为输入的方法,我必须从所有文章中获取所有文章的标题、片段等页面...我使用了for循环,但它只返回每页上第一篇文章数据的 10 倍...这是我的脚本...
我尝试向下滚动页面并使用不同的定位器定义结果。
while True:
results=driver.find_elements_by_css_selector('[class="cnn-search__result cnn-search__result--article"]')
time.sleep(5)
for result in results:
link,title,date,snippet='NA','NA','NA','NA'
try:
linkBox=driver.find_element_by_css_selector('[class="cnn-search__result-headline"]')
except NoSuchElementException:
linkBox=None
if linkBox:
link=linkBox.get_attribute('href')
try:
titleBox=driver.find_element_by_css_selector('[class="cnn-search__result-headline"]')
except NoSuchElementException:
titleBox=None
if titleBox:title=titleBox.text
try:
dateBox=driver.find_element_by_css_selector('[class="cnn-search__result-publish-date"]')
except NoSuchElementException:
dateBox=None
if dateBox:date=dateBox.text
try:
snippetBox=driver.find_element_by_css_selector('[class="cnn-search__result-body"]')
except NoSuchElementException:
snippetBox=None
if snippetBox:snippet=snippetBox.text
writer.writerow([link,title,date,snippet])
driver.execute_script("arguments[0].scrollIntoView();",result)
time.sleep(1)
try:
nextButton=driver.find_element_by_css_selector('[class="pagination-arrow pagination-arrow-right cnnSearchPageLink text-active"]')
driver.execute_script("arguments[0].click();",nextButton)
time.sleep(3)
except NoSuchElementException:
driver.quit()
break
fw.close()