我正在为我的公司构建一个抄袭检查工具,但在计算唯一性因子时遇到了问题。
唯一性因子的计算方式是:通过 GSERP(下面的 g_serp 函数)在 Google 中搜索带引号的文本片段,然后由脚本根据返回的搜索结果判断该片段是否唯一。
这是我的代码:
. . .
// Build an exact-phrase query: join $limit elements of $contentArray
// (presumably words -- confirm against the elided code) starting at offset
// $start with single spaces, and wrap the result in double quotes.
$snippet = '"' . join(" ", array_slice($contentArray, $start, $limit)) . '"';
// Slide the window forward by $limit; the enclosing loop is not shown here,
// so this is presumably preparing the next snippet.
$start += $limit;
$end += $limit;
$counter++;
// An empty target URL and the language code 'en' are passed to g_serp().
$url = '';
$lang = 'en';
// Run the search and count how many result entries came back.
$gserp = (g_serp($snippet, $url, $lang));
$gserpCount = count($gserp);
. . .
. . .
// Report every error level except notices (the XOR clears the E_NOTICE bit from E_ALL).
error_reporting(E_ALL ^ E_NOTICE);
//helper function -- file_get_contents using curl
/**
 * Fetch a URL through cURL and hand back whatever curl_exec() produced.
 *
 * Response headers are excluded, redirects are followed and the transfer is
 * limited to 30 seconds. The Referer and User-Agent headers are only set
 * when a non-empty value is supplied.
 *
 * @param string $url     address to request
 * @param string $referer optional Referer header value
 * @param string $ua      optional User-Agent header value
 * @return mixed result of curl_exec() (with CURLOPT_RETURNTRANSFER enabled)
 */
function file_get_contents_curl($url, $referer = '', $ua = '') {
    // Collect the options first, then apply them one by one.
    $options = array(
        CURLOPT_HEADER         => FALSE,
        CURLOPT_RETURNTRANSFER => TRUE,
    );
    if ($referer != '') {
        $options[CURLOPT_REFERER] = $referer;
    }
    if ($ua != '') {
        $options[CURLOPT_USERAGENT] = $ua;
    }
    $options[CURLOPT_FOLLOWLOCATION] = TRUE;
    $options[CURLOPT_TIMEOUT] = 30;

    $handle = curl_init($url);
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
//this is the main function
/**
 * Query the Google AJAX Web Search endpoint for $keyword and collect the hits.
 *
 * Eight pages are requested (start = 0, 8, ..., 56). Every entry found under
 * responseData.results is appended to the returned list; pages whose download
 * or JSON decoding fails, or that carry no result list, are skipped.
 *
 * NOTE(review): the ajax.googleapis.com Web Search API is believed to have been
 * deprecated by Google -- confirm the endpoint still answers before relying on
 * an empty result meaning "snippet is unique".
 *
 * @param string $keyword search phrase (URL-encoded before it is sent)
 * @param string $url     currently unused; only referenced by the
 *                        commented-out match check kept below
 * @param string $lang    value for the "hl" query parameter
 * @return array list of arrays with the keys 'position' (1-based rank),
 *               'title' and 'url'
 */
function g_serp($keyword, $url, $lang = 'en') {
    $results = array();

    // Request metadata is read once, with guards so a missing header (CLI run,
    // no referer sent) does not raise an undefined-index error.
    // The key is REMOTE_ADDR; the former 'REMOTE_DDR' never exists, so userip
    // was always sent empty.
    $userIp = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
    $referer = isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : ''; //change this into your real domain
    $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';

    $g_url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=' . urlencode($keyword) .
        '&rsz=large&userip=' . urlencode($userIp) . '&hl=' . urlencode($lang);

    for ($start = 0; $start < 64; $start += 8) {
        $rawdata = file_get_contents_curl($g_url . '&start=' . $start, $referer, $userAgent);
        if (!is_string($rawdata)) {
            // Transfer failed; nothing to decode for this page.
            continue;
        }

        $decoded = json_decode($rawdata, TRUE); //decode as assoc array
        if (!isset($decoded['responseData']['results']) || !is_array($decoded['responseData']['results'])) {
            continue;
        }

        $pos = $start;
        foreach ($decoded['responseData']['results'] as $result) {
            //if (substr_count(strtolower($result['url']), $url)) {
            // $GLOBALS['index'] = $pos + 1;
            // }
            $results[] = array(
                'position' => $pos + 1,
                'title'    => isset($result['titleNoFormatting']) ? $result['titleNoFormatting'] : null,
                'url'      => isset($result['unescapedUrl']) ? $result['unescapedUrl'] : null,
            );
            $pos++;
        }
    }

    return $results;
}
. . .
有人知道我可能哪里做错了吗?或者有什么更好的实现方法?