1

我有兴趣收集/抓取有关 Chrome Webstore 中可用的流行扩展所获得的评论的数据。

具体来说,我需要检索某个特定扩展收到的评论总数,然后检索该扩展所有公开可见的评论。我的问题是:我无法编写标准的 PHP cURL 抓取脚本,因为我感兴趣的数据只能通过 JSON 请求获得;具体而言,我需要调用 chrome.google.com/reviews/json/search(见下面的代码):

我试着写这个:

 <script src="http://code.jquery.com/jquery-latest.js"></script>
 <script type="text/javascript">
  /**
   * Posts a review-search request for one Chrome Web Store extension and
   * passes the parsed JSON response to `callback`.
   *
   * The search spec asks for public comments sorted by "quality", with
   * startIndex 10 and numResults 10.
   *
   * @param {string} extensionId - Extension id appended to the permalink URL
   *   that identifies the entity to search reviews for.
   * @param {Function} callback - Invoked with the response when the request
   *   succeeds.
   */
  function getReviews(extensionId, callback) {
    var entities = [{'url' : 'http://chrome.google.com/extensions/permalink?id=' + extensionId}];
    var param = {
      "searchSpecs": [{
        "requireComment": true,
        "entities": entities,
        "groups": ["public_comment"],
        "matchExtraGroups": true,
        "sortBy": "quality",
        "startIndex": 10,
        "numResults": 10,
        "includeNickNames": true
      }],
      "applicationId": 94
    };

    $.ajax({
      type: 'POST',
      url: 'https://chrome.google.com/reviews/json/search',
      contentType: 'application/xml',
      xhrFields: {withCredentials: true },
      dataType: 'json',
      // The JSON is appended as-is (not URL-encoded), exactly as before.
      data: 'req=' + JSON.stringify(param) + '&requestSource=widget'
    // jqXHR.success() is deprecated (jQuery 1.8) and removed in jQuery 3.0;
    // .done() is the supported equivalent and exists since jQuery 1.5.
    }).done(callback);
  }
</script>

<script type="text/javascript">
 // Pass a function to .ready(): calling getReviews(...) inline would run it
 // immediately and hand its undefined return value to .ready() instead.
 $(document).ready(function () {
   getReviews('gighmmpiobklfepjocnamgkkbiglidom', function (reviews) { console.log(reviews); });
 });
</script>

我对 jQuery/JSON(-P) 不太熟悉,上面的代码肯定有错误。

我的问题如下:

  1. 如何绕过同域策略?我尝试了YQL但没有成功...
  2. 如何构造请求的 url/'data',以便只检索由其 id 标识的特定扩展(例如 gighmmpiobklfepjocnamgkkbiglidom)的评论数量('numRatings',来自 chrome.google.com/reviews/components)和评论内容('comments',来自 chrome.google.com/reviews/json/search)?

我已经使用 PHP 为流行的 Mozilla 插件完成了这种抓取,并使用标准 curl/XPath 收集了我需要的数据。

谢谢你的帮助!

4

2 回答 2

0

1) 最简单的方法是创建一个 Chrome 扩展程序;

2) 见https://github.com/xpressyoo/MyExtensions

[...]
getComments : function() {
        var entities = [];
        //each(Ext.extensions, function(data, id) {
            entities.push({'url' : 'http://chrome.google.com/extensions/permalink?id=' + this.hash});
        //});               

        Ext.XHR['comments'] = new Ajax({
            'method'        : 'POST',
            'encodeURI'     : false,    // Needed
            'url'           : 'https://chrome.google.com/reviews/json/search',
            'headers'       : {
                'Content-type'  : 'application/xml'
            },

            'parameters'    : {
                'req'       : JSON.stringify({'searchSpecs' :  [{'entities' : entities, 'groups' : ['public_comment'], 'matchExtraGroups' : true,"sortBy":"quality", 'startIndex' : 0, 'numResults' : 80, 'includeNickNames' : true}], 'applicationId' : 94 }) + '&requestSource=widget'
            },

            'onSuccess'     : function(xhr) {
                var json = xhr.responseJSON;
                if(json && json.searchResults ) {
                    this.comments = {

'total'             : Number(json.searchResults[0].numAnnotations.toString().replace(/,/, '').toInt()),
'latest'            : json.searchResults[0].annotations ? json.searchResults[0].annotations[0] :{},
'previous'          : this.comments.total || null,
'latestPrevious'            : $merge(this.comments.latest) || null,
'new'                   : this.comments['new'] || false
                    }
Ext.XHR['comments'] = null;
                }
            }.bind(this)    
        }).send();      

        return this;
    },
 [...]

// Total number of reviews stored on this.comments.
var nbreviews = this.comments.total;

// Text of the latest comment with newlines removed; '' when there is none.
var latestcomment = '';
if (this.comments.latest0 && this.comments.latest0.comment) {
    latestcomment = this.comments.latest0.comment.replace(/\n/gi, '');
}

// Text of the nth comment with newlines removed; '' when there is none.
var nthcomment = '';
if (this.comments.latestn && this.comments.latestn.comment) {
    nthcomment = this.comments.latestn.comment.replace(/\n/gi, '');
}

其中:

// Object-literal entry (presumably added to the this.comments literal — verify):
// the annotation at index n of the first search result, or {} when the result has no annotations.
'latestn'           : json.searchResults[0].annotations ? json.searchResults[0].annotations[n] :{},
于 2011-09-10T02:18:33.840 回答
0

这是一种在 PHP 中使用并行 cURL 的方法。此脚本会抓取 Chrome 网上商店中存在的所有扩展程序(按受欢迎程度排名)并检索以下信息:

  • 用户数
  • 星级数
  • 文字评论数
  • 每个文本评论的字符数(每个扩展最多抓取 100 条评论)
//GET URL
$url0 = "https://chrome.google.com/";

// Scrapes the "most popular" extension listing five pages per iteration.
// For every extension found on a page it issues three parallel POST requests
// (two review searches and one rating lookup), derives the statistics and
// echoes one HTML table row per extension.
//
// NOTE(review): CURLOPT_SSL_VERIFYPEER is disabled on the review requests,
// as in the original; this turns off TLS certificate verification and
// should be enabled where a CA bundle is available.
// NOTE(review): utf8_encode() is kept from the original; it is deprecated as
// of PHP 8.2 and double-encodes input that is already UTF-8.

//AUTO LOOP
foreach (range(0, 705, 5) as $x) {

    //Nb PAGES TO DOWNLOAD
    $frompge = $x + 1;
    $topge   = $x + 5;

    //GENERATE URLs ARRAY
    $urls = array();
    for ($a = $frompge; $a <= $topge; $a++) {
        $urls[] = $url0 . 'webstore/list/most_popular/' . $a . '?category=ext';
    }

    //MULTI cURL INIT
    $mh = curl_multi_init();
    $c  = array();
    foreach ($urls as $idx => $url) {
        $c[$idx] = curl_init($url);
        curl_setopt($c[$idx], CURLOPT_HEADER, false);
        curl_setopt($c[$idx], CURLOPT_FAILONERROR, true);
        curl_setopt($c[$idx], CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($c[$idx], CURLOPT_AUTOREFERER, true);
        curl_setopt($c[$idx], CURLOPT_RETURNTRANSFER, true);
        curl_setopt($c[$idx], CURLOPT_TIMEOUT, 10);
        curl_multi_add_handle($mh, $c[$idx]);
    }

    // Execute all listing downloads simultaneously and continue when all are
    // complete. curl_multi_select() blocks until there is activity, so the
    // loop no longer spins at 100% CPU; the loop also stops on a multi error.
    $running = null;
    do {
        $status = curl_multi_exec($mh, $running);
        if ($running > 0 && curl_multi_select($mh, 1.0) === -1) {
            usleep(1000); // select() failed: back off briefly instead of spinning
        }
    } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

    $html = array();
    foreach ($c as $handle) {
        $html[] = curl_multi_getcontent($handle);
        curl_multi_remove_handle($mh, $handle);
        curl_close($handle);
    }
    curl_multi_close($mh);

    foreach ($html as $pageHtml) {

        // A failed download yields no content; loading an empty document
        // is an error, so skip the page.
        if (!is_string($pageHtml) || $pageHtml === '') {
            continue;
        }

        // Parse the HTML information and return the results.
        $dom = new DOMDocument();
        @$dom->loadHtml($pageHtml);

        $xpath = new DOMXPath($dom);
        $links = $xpath->query("//a[contains(@class, 'title-a')]");

        $result  = array();
        $cleaner = array(" users", " user", "(", ")", ",", " ");

        foreach ($links as $item) {
            // The extension metadata is read from the first <script> inside the link.
            $scriptNode = $xpath->query(".//script", $item)->item(0);
            if ($scriptNode === null) {
                continue;
            }

            $data = str_replace($cleaner, "", trim($scriptNode->nodeValue));

            // The fields are taken by position between double quotes:
            // 1 = id, 5 = name, 15 = number of users.
            $parts = explode("\"", $data);
            if (count($parts) < 16) {
                continue;
            }
            $id      = $parts[1];
            $name    = $parts[5];
            $nbusers = $parts[15];

            // ereg_replace() was removed in PHP 7; preg_replace() is the equivalent.
            $label = str_replace(" ", "", strtolower(preg_replace('/[^A-Za-z0-9 ]/', '', $name)));

            //CATEGORIES (based on nb of users)
            if ($nbusers < 100) { $category = '1'; $color = 'inherit'; }
            else if ($nbusers >= 100 && $nbusers < 1000) { $category = '2'; $color = '#E6EEEE'; }
            else if ($nbusers >= 1000 && $nbusers < 10000) { $category = '3'; $color = '#CDDEDE'; }
            else if ($nbusers >= 10000 && $nbusers < 100000) { $category = '4'; $color = '#B5CDCD'; }
            else if ($nbusers >= 100000 && $nbusers < 1000000) { $category = '5'; $color = '#9CBDBD'; }
            else if ($nbusers == '1000000+') { $category = '6'; $color = '#83ACAC'; }
            else { $category = '-9'; $color = 'inherit'; } // $color was left unset here before

            /////////////////////////////////////////////LOOP REVIEWS

            $extURL = 'http://chrome.google.com/extensions/permalink?id='.$id;

            // c1: reviews sorted by "quality"; c1a: same search without sortBy;
            // c2: aggregate rating lookup.
            $requests = array(
                'c1'  => array(
                    'https://chrome.google.com/reviews/json/search',
                    '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"sortBy":"quality","startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
                ),
                'c1a' => array(
                    'https://chrome.google.com/reviews/json/search',
                    '{"searchSpecs":[{"requireComment":true,"entities":[{"url":"'.$extURL.'"}],"groups":["public_comment"],"matchExtraGroups":true,"startIndex":0,"numResults":100,"includeNickNames":false}],"applicationId":94}',
                ),
                'c2'  => array(
                    'https://chrome.google.com/reviews/json/lookup',
                    '{"entities":[{"url" : "'.$extURL.'", "includeAggregateInfo" : true}],"applicationId":94}',
                ),
            );

            $mh2     = curl_multi_init();
            $handles = array();
            foreach ($requests as $key => $request) {
                $handles[$key] = curl_init($request[0]);
                curl_setopt_array($handles[$key], array(
                    CURLOPT_RETURNTRANSFER => true,
                    CURLOPT_SSL_VERIFYPEER => false,
                    CURLOPT_POST => true,
                    CURLOPT_TIMEOUT => 10,
                    CURLOPT_POSTFIELDS => http_build_query(array('req' => $request[1])),
                ));
                curl_multi_add_handle($mh2, $handles[$key]);
            }

            $active = null;
            do {
                $status = curl_multi_exec($mh2, $active);
                if ($active > 0 && curl_multi_select($mh2, 1.0) === -1) {
                    usleep(1000);
                }
            } while ($active > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

            // Collect the responses and release every handle (the original
            // removed them from the multi handle but never closed them).
            $decoded = array();
            foreach ($handles as $key => $handle) {
                $decoded[$key] = json_decode(utf8_encode((string) curl_multi_getcontent($handle)), true);
                curl_multi_remove_handle($mh2, $handle);
                curl_close($handle);
            }
            curl_multi_close($mh2);

            $data1  = $decoded['c1'];
            $data1a = $decoded['c1a'];
            $data2  = $decoded['c2'];

            // Skip this extension when any response is missing, malformed or
            // reports an error code. The original used `return`, which
            // aborted the whole scrape on the first failing extension.
            $usable = true;
            foreach ($decoded as $payload) {
                if (!is_array($payload) || !empty($payload['channelHeader']['errorCode'])) {
                    $usable = false;
                    break;
                }
            }
            if (!$usable) {
                continue;
            }

            $nbreviews  = isset($data1['searchResults'][0]['numAnnotations']) ? $data1['searchResults'][0]['numAnnotations'] : 0;
            $nbreviewsa = isset($data1a['searchResults'][0]['numAnnotations']) ? $data1a['searchResults'][0]['numAnnotations'] : 0;
            // At most 100 reviews are requested, so averages divide by at most 100.
            $nbreviews2 = ($nbreviews > 100) ? 100 : $nbreviews;

            //Sum strings (c1) and Sum strings A (c1a)
            $lengthSums = array();
            foreach (array('c1' => $data1, 'c1a' => $data1a) as $key => $payload) {
                $lengthSums[$key] = 0;
                if (!isset($payload['searchResults'][0]['annotations']) || !is_array($payload['searchResults'][0]['annotations'])) {
                    continue; // no comments returned for this search
                }
                foreach ($payload['searchResults'][0]['annotations'] as $comment) {
                    $text = isset($comment['comment']) ? $comment['comment'] : '';
                    $msg  = preg_replace('/[\n\r\t]/', ' ', htmlspecialchars($text));
                    $msg  = str_replace("&gt;", "", $msg);
                    $msg  = str_replace(" ", "", $msg);
                    $lengthSums[$key] += strlen($msg);
                }
            }
            $add  = $lengthSums['c1'];
            $adda = $lengthSums['c1a'];

            // Guard the averages: an extension without reviews used to divide by zero.
            $final  = ($nbreviews2 > 0) ? $add / $nbreviews2 : 0;
            $finala = ($nbreviews2 > 0) ? $adda / $nbreviews2 : 0;

            //Ratings
            $nbratings = isset($data2['annotations'][0]['aggregateInfo']['numRatings']) ? $data2['annotations'][0]['aggregateInfo']['numRatings'] : 0;
            $nbstars   = isset($data2['annotations'][0]['aggregateInfo']['averageRating']) ? $data2['annotations'][0]['aggregateInfo']['averageRating'] : 0;

            $delta = $nbratings - $nbreviews;
            // "1000000+" casts to 1000000; a missing or zero count yields ratio 0
            // instead of a division by zero.
            $userCount = (int) $nbusers;
            $ratio  = ($userCount > 0) ? $nbratings / $userCount : 0;
            $ratio2 = ($userCount > 0) ? $nbreviews / $userCount : 0;
            ////////////////////////////////////////////END LOOP REVIEWS

            //PUT VALUES TOGETHER
            $result[] = array($name,$label,$id,$category,$nbusers,$nbratings,$nbreviews,$nbreviewsa,$delta,$ratio,$ratio2,$nbstars,$nbreviews2,$add,$final,$adda,$finala);
        }//END FOREACH
        //print_r($result,false);

        //DISPLAY RESULTS
        // Iterate over the rows actually collected: the original looped
        // $z = 0..20 (21 rows) and read undefined offsets on every page.
        foreach ($result as $row) {
            // Values come from scraped pages and remote JSON, so escape them.
            $cells = "<td class=\"non\">" . htmlspecialchars((string) $row[0]) . "</td>"
                   . "<td class=\"non\">" . htmlspecialchars((string) $row[1]) . "</td>";
            // Column 2 (the extension id) is intentionally not displayed.
            for ($col = 3; $col <= 16; $col++) {
                $cells .= "<td>" . htmlspecialchars((string) $row[$col]) . "</td>";
            }
            echo "<tr>" . $cells . "</tr>";

            if (ob_get_level() > 0) {
                ob_flush();
            }
            flush();
        }

    }
}//END FOREACH
于 2012-04-16T12:00:44.153 回答