0

我正在尝试为一个学校项目抓取https: //edition.cnn.com/ ...我定义了一个获取查询作为输入的方法,我必须从所有文章中获取所有文章的标题、片段等页面...我使用了for循环,但它只返回每页上第一篇文章数据的 10 倍...这是我的脚本...

我尝试向下滚动页面并使用不同的定位器定义结果。

while True:

    results=driver.find_elements_by_css_selector('[class="cnn-search__result cnn-search__result--article"]')
    time.sleep(5)

    for result in results:

        link,title,date,snippet='NA','NA','NA','NA'

        try:
            linkBox=driver.find_element_by_css_selector('[class="cnn-search__result-headline"]')
        except NoSuchElementException:
            linkBox=None

        if linkBox:
            link=linkBox.get_attribute('href')

        try:
            titleBox=driver.find_element_by_css_selector('[class="cnn-search__result-headline"]')
        except NoSuchElementException:
            titleBox=None

        if titleBox:title=titleBox.text

        try:
            dateBox=driver.find_element_by_css_selector('[class="cnn-search__result-publish-date"]')
        except NoSuchElementException:
            dateBox=None

        if dateBox:date=dateBox.text

        try:
            snippetBox=driver.find_element_by_css_selector('[class="cnn-search__result-body"]')
        except NoSuchElementException:
            snippetBox=None

        if snippetBox:snippet=snippetBox.text

        writer.writerow([link,title,date,snippet])

        driver.execute_script("arguments[0].scrollIntoView();",result)
        time.sleep(1)


    try:
        nextButton=driver.find_element_by_css_selector('[class="pagination-arrow pagination-arrow-right cnnSearchPageLink text-active"]')
        driver.execute_script("arguments[0].click();",nextButton)
        time.sleep(3)
    except NoSuchElementException:
        driver.quit()

        break

fw.close()
4

0 回答 0