1

我正在使用 Goutte(https://github.com/FriendsOfPHP/Goutte)。在 while 循环中点击分页链接时,总是得到错误的 URL。

在 while 循环的第一次迭代中,对该对象调用 selectLink 返回了正确的 URL;但到了第二次迭代,selectLink 似乎返回了错误的值。

这是代码。

/**
 * Keep the injected Goutte client on the instance as $this->client.
 *
 * @param Goutte\Client $client Client stored for later use by this object.
 */
public function __construct(Goutte\Client $client)
{
    $this->client = $client;
}

/**
 * Walk the category's pagination and dump every batch of page links.
 *
 * Requests the hard-coded start URL, collects the pagination hrefs found on
 * that page, then keeps following the trailing "N+" entry for as long as
 * getPaginationLinks() reports one. The collected batches are handed to
 * dd() at the end.
 */
public function parse()
{
    $startUrl = "https://www.nextag.com/Arts-Entertainment--zz2702147z0z1zB6c4z5---html";

    // Pagination block of the first page.
    $pagination = $this->paginationCrawler(
        $this->client->request('GET', $startUrl)
    );

    // One entry per visited page: that page's list of hrefs.
    $linkBatch = array();

    $result      = $this->getPaginationLinks($pagination);
    $linkBatch[] = $result[0];
    $nextLabel   = $result[1];

    // getPaginationLinks() yields the string 'false' once no "11+/21+/..." entry remains.
    while ('false' != $nextLabel) {
        // selectLink() looks the anchor up by its text; follow it to the next batch.
        $nextPage   = $this->client->click($pagination->selectLink($nextLabel)->link());
        $pagination = $this->paginationCrawler($nextPage);

        $result      = $this->getPaginationLinks($pagination);
        $linkBatch[] = $result[0];
        $nextLabel   = $result[1];
    }

    dd($linkBatch);
}

/**
 * Narrow a page crawler down to its pagination container.
 *
 * @param mixed $crawler Object exposing filter(); parse() passes the crawler
 *                       returned by the client.
 *
 * @return mixed Whatever $crawler->filter('#pagination') returns.
 */
public function paginationCrawler($crawler)
{
    $pagination = $crawler->filter('#pagination');

    return $pagination;
}

/**
 * Collect the numbered pagination hrefs and detect a "next batch" entry.
 *
 * @param mixed $links Crawler to search; parse() passes the `#pagination` node set.
 *
 * @return array Two elements: [0] the href attribute of every `#numbers a`
 *               match, [1] the trimmed text of the first `#numbers :last-child`
 *               match when it contains '+', otherwise the string 'false'.
 */
public function getPaginationLinks($links)
{
    $linkNodes = $links->filter('#numbers a')->each(function (Crawler $a) {
        return $a->attr('href');
    });

    $lastEntry = $links->filter('#numbers :last-child');

    // Crawler::text() throws InvalidArgumentException on an empty node list,
    // so a page without pagination entries is reported as "no next batch"
    // instead of aborting the whole crawl.
    if (count($lastEntry) === 0) {
        return array($linkNodes, 'false');
    }

    $lastPage = trim($lastEntry->text());

    // Only labels such as "11+" or "21+" point at a further batch of pages.
    if (strpos($lastPage, '+') === false) {
        $lastPage = 'false';
    }

    return array($linkNodes, $lastPage);
}

这是输出:

在此处输入图像描述

4

0 回答 0