我正在使用 Goutte(https://github.com/FriendsOfPHP/Goutte)。在 while 循环中点击分页链接时,我总是得到错误的 url。
在第一次 while 循环中,对该对象调用 selectLink 会返回正确的 url;但在第二次循环中,selectLink 似乎返回了错误的值。
这是代码。
/**
 * Keeps a reference to the Goutte client on $this->client so that parse()
 * can issue its requests through it.
 *
 * @param Goutte\Client $client Client instance to store.
 */
public function __construct(Goutte\Client $client)
{
    $this->client = $client;
}
/**
 * Walks the pagination of the hard-coded listing URL and dumps every batch
 * of page links that was collected.
 *
 * The first page is requested directly. While getPaginationLinks() reports a
 * "next batch" label (for example '11+'), the link carrying that label is
 * clicked and the pagination of the resulting page is collected in turn.
 *
 * The crawl stops when:
 *  - getPaginationLinks() returns the string sentinel 'false';
 *  - no anchor with the reported label exists in the current pagination; or
 *  - the selected link resolves to a URI that was already requested, which
 *    would otherwise make the loop run forever.
 *
 * @return void The result is passed to dd(); nothing is returned.
 */
public function parse()
{
    $url = "https://www.nextag.com/Arts-Entertainment--zz2702147z0z1zB6c4z5---html";

    // Crawl the first page and isolate its pagination block.
    $crawler = $this->client->request('GET', $url);
    $links = $this->paginationCrawler($crawler);

    // Each entry of $linkBatch is the list of hrefs found on one crawled page.
    $linkBatch = array();
    list($pageLinks, $_nextPage) = $this->getPaginationLinks($links);
    $linkBatch[] = $pageLinks;

    // URIs already requested, used to detect a pagination link that loops back.
    $visited = array($url => true);

    // getPaginationLinks() always yields a string here, so compare strictly
    // against the 'false' sentinel.
    while ($_nextPage !== 'false') {
        $candidates = $links->selectLink($_nextPage);
        if ($candidates->count() === 0) {
            // The label was not found on any anchor; link() would throw on
            // an empty node list, so treat this as the end of the pagination.
            break;
        }

        $link = $candidates->link();
        $uri = $link->getUri();
        if (isset($visited[$uri])) {
            // The "next" link points at a page that was already crawled.
            break;
        }
        $visited[$uri] = true;

        $crawler = $this->client->click($link);
        $links = $this->paginationCrawler($crawler);
        list($pageLinks, $_nextPage) = $this->getPaginationLinks($links);
        $linkBatch[] = $pageLinks;
    }

    dd($linkBatch);
}
/**
 * Narrows the given crawler down to the nodes matching the `#pagination`
 * selector.
 *
 * @param mixed $crawler Object exposing filter(); parse() passes the result
 *                       of the client's request()/click() calls.
 * @return mixed Whatever $crawler->filter('#pagination') returns.
 */
public function paginationCrawler($crawler)
{
    $pagination = $crawler->filter('#pagination');

    return $pagination;
}
/**
 * Extracts the page hrefs from a pagination crawler and determines whether a
 * further batch of pages is advertised.
 *
 * @param mixed $links Crawler positioned on the pagination block (the value
 *                     returned by paginationCrawler()).
 * @return array Two elements:
 *               [0] list of href attribute values of every `#numbers a` node;
 *               [1] trimmed text of the last direct child of `#numbers` when
 *                   that text contains '+', otherwise the string 'false'.
 */
public function getPaginationLinks($links)
{
    $linkNodes = $links->filter('#numbers a')->each(function (Crawler $a) {
        return $a->attr('href');
    });

    // Use the child combinator: '#numbers :last-child' also matches nested
    // elements that are the last child of their own parent, and text() reads
    // the first match in document order, which is not necessarily the final
    // pagination entry.
    $lastNode = $links->filter('#numbers > :last-child');

    // 'false' (a string) is the sentinel parse() checks for.
    $lastPage = 'false';

    // text() throws on an empty node list, e.g. on a page with no pagination.
    if ($lastNode->count() > 0) {
        $label = trim($lastNode->text());
        if (strpos($label, '+') !== false) {
            $lastPage = $label;
        }
    }

    return array($linkNodes, $lastPage);
}
这是输出: