这是一种在 PHP 中使用并行 cURL 的方法。此脚本会抓取 Chrome 网上商店中存在的所有扩展程序(按受欢迎程度排名)并检索以下信息:
- 用户数
- 星级数
- 文字评论数
- 每个文本评论的字符数(每个扩展最多抓取 100 条评论)
//GET URL
$url0 = "https://chrome.google.com/";
//AUTO LOOP
foreach(range(0, 705, 5) as $x) {
//Nb PAGES TO DOWNLOAD
$frompge = $x+1;
$topge = $x+5;
$nbpages = ($topge - $frompge)+1;
$zitems = $nbpages*20;
//MULTI cURL INIT
$mh = curl_multi_init();
$running = null;
//GENERATE URLs ARRAY
$urls = array();
for ($a = $frompge; $a <= $topge; $a++){
$aa = $url0 . 'webstore/list/most_popular/'. $a .'?category=ext';
$urls[] = $aa;
}
foreach ($urls as $name => $url)
{
$c[$name]=curl_init($url);
curl_setopt($c[$name], CURLOPT_HEADER, false);
curl_setopt($c[$name], CURLOPT_FAILONERROR, true);
curl_setopt($c[$name], CURLOPT_FOLLOWLOCATION, true);
curl_setopt($c[$name], CURLOPT_AUTOREFERER, true);
curl_setopt($c[$name], CURLOPT_RETURNTRANSFER, true);
curl_setopt($c[$name], CURLOPT_TIMEOUT, 10);
curl_multi_add_handle ($mh,$c[$name]);
}
// execute all queries simultaneously, and continue when all are complete
do {
curl_multi_exec($mh, $running);
} while ($running >0);
$html = array();
foreach ($urls as $name => $url)
{
$html[]=curl_multi_getcontent($c[$name]);
curl_multi_remove_handle($mh,$c[$name]);
curl_close($c[$name]);
}
curl_multi_close($mh);
for ($b = 0; $b <= $nbpages-1; $b++) {
// Parse the HTML information and return the results.
$dom = new DOMDocument();
@$dom->loadHtml($html[$b]);
$xpath = new DOMXPath($dom);
$links = $xpath->query("//a[contains(@class, 'title-a')]");
$result = array();
foreach ( $links as $item ) {
$newDom = new DOMDocument;
$newDom->appendChild($newDom->importNode($item,true));
$xpath = new DOMXPath( $newDom );
$cleaner = array(" users", " user", "(", ")", ","," ");
$data = str_replace($cleaner,"",trim($xpath->query("//script")->item(0)->nodeValue));
list($b1,$id,$b2,$b3,$b4,$name,$b5,$b6,$b7,$b8,$b9,$b10,$b11,$b12,$b13,$nbusers) = explode("\"", $data);
$label = str_replace(" ", "", strtolower(ereg_replace("[^A-Za-z0-9 ]", "", $name)));
//CATEGORIES (based on nb of users)
if($nbusers<100){$category = '1';$color = 'inherit';}
else if($nbusers>=100 && $nbusers<1000){$category = '2';$color = '#E6EEEE';}
else if($nbusers>=1000 && $nbusers<10000){$category = '3';$color = '#CDDEDE';}
else if($nbusers>=10000 && $nbusers<100000){$category = '4';$color = '#B5CDCD';}
else if($nbusers>=100000 && $nbusers<1000000){$category = '5';$color = '#9CBDBD';}
else if($nbusers == '1000000+'){$category = '6';$color = '#83ACAC';}
else{$category = '-9';}
/////////////////////////////////////////////LOOP REVIEWS
$extURL = 'http://chrome.google.com/extensions/permalink?id='.$id;
$c1 = curl_init('https://chrome.google.com/reviews/json/search');
$c1a = curl_init('https://chrome.google.com/reviews/json/search');
$c2 = curl_init('https://chrome.google.com/reviews/json/lookup');
$fields1 = http_build_query(array(
'req' => '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"sortBy":"quality","startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
));
$options1 = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_POST => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_POSTFIELDS => $fields1,
);
$fields1a = http_build_query(array(
'req' => '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
));
$options1a = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_POST => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_POSTFIELDS => $fields1a,
);
$fields2 = http_build_query(array(
'req' => '{"entities":[{"url" : "'.$extURL.'", "includeAggregateInfo" : true}],"applicationId":94}',
));
$options2 = array(
CURLOPT_RETURNTRANSFER => true,
CURLOPT_SSL_VERIFYPEER => false,
CURLOPT_POST => true,
CURLOPT_TIMEOUT => 10,
CURLOPT_POSTFIELDS => $fields2,
);
curl_setopt_array($c1, $options1);
curl_setopt_array($c1a, $options1a);
curl_setopt_array($c2, $options2);
$mh2 = curl_multi_init();
curl_multi_add_handle($mh2,$c1);
curl_multi_add_handle($mh2,$c1a);
curl_multi_add_handle($mh2,$c2);
$active = null;
do {
curl_multi_exec($mh2, $active);
} while ($active >0);
//close the handles$c1 = curl_init('https://chrome.google.com/reviews/json/search');
$json1=curl_multi_getcontent($c1);
$json1a=curl_multi_getcontent($c1a);
$json2=curl_multi_getcontent($c2);
curl_multi_remove_handle($mh2, $c1);
curl_multi_remove_handle($mh2, $c1a);
curl_multi_remove_handle($mh2, $c2);
curl_multi_close($mh2);
$data1 = json_decode(utf8_encode($json1), true);
$data1a = json_decode(utf8_encode($json1a), true);
$data2 = json_decode(utf8_encode($json2), true);
if ($data1['channelHeader']['errorCode']) return;
$nbreviews = $data1['searchResults'][0]['numAnnotations'];
if ($nbreviews > 100){$nbreviews2=100;}
else{$nbreviews2=$nbreviews;}
//Sum strings
$comments = $data1['searchResults'][0]['annotations'];
$sum =0;
foreach($comments as $comment){
$msg = preg_replace('/[\n\r\t]/', ' ', htmlspecialchars($comment['comment']));
$msg = str_replace(">", "", $msg);
$msg = str_replace(" ", "", $msg);
$strlen = strlen($msg);
$sum += $strlen;
}
$add = $sum;
$final = $add/$nbreviews2;
//Sum strings A
if ($data1a['channelHeader']['errorCode']) return;
$nbreviewsa = $data1a['searchResults'][0]['numAnnotations'];
$commentsa = $data1a['searchResults'][0]['annotations'];
$suma =0;
foreach($commentsa as $commenta){
$msga = preg_replace('/[\n\r\t]/', ' ', htmlspecialchars($commenta['comment']));
$msga = str_replace(">", "", $msga);
$msga = str_replace(" ", "", $msga);
$strlena = strlen($msga);
$suma += $strlena;
}
$adda = $suma;
$finala = $adda/$nbreviews2;
//Ratings
if ($data2['channelHeader']['errorCode']) return;
$nbratings = $data2['annotations'][0]['aggregateInfo']['numRatings'];
$nbstars = $data2['annotations'][0]['aggregateInfo']['averageRating'];
$delta = $nbratings - $nbreviews;
$ratio = $nbratings/$nbusers;
$ratio2 = $nbreviews/$nbusers;
////////////////////////////////////////////END LOOP REVIEWS
//PUT VALUES TOGETHER
$result[] = array($name,$label,$id,$category,$nbusers,$nbratings,$nbreviews,$nbreviewsa,$delta,$ratio,$ratio2,$nbstars,$nbreviews2,$add,$final,$adda,$finala);
}//END FOREACH
//print_r($result,false);
//DISPLAY RESULTS
for ($z = 0; $z <= 20; $z++) {
echo "<tr><td class=\"non\">" .$result[$z][0] . "</td><td class=\"non\">" .$result[$z][1] . "</td><td>" .$result[$z][3] . "</td><td>" .$result[$z][4] . "</td><td>" .$result[$z][5] . "</td><td>" .$result[$z][6] . "</td><td>" .$result[$z][7] . "</td><td>" .$result[$z][8] . "</td><td>" .$result[$z][9] . "</td><td>" .$result[$z][10] . "</td><td>" .$result[$z][11] . "</td><td>" .$result[$z][12] . "</td><td>" .$result[$z][13] . "</td><td>" .$result[$z][14] . "</td><td>" .$result[$z][15] . "</td><td>" .$result[$z][16] . "</td></tr>";
ob_flush();
flush();
}
}
}//END FOREACH