注意
谷歌会设法阻止抓取:一旦怀疑存在抓取行为,发起请求的服务器就会被封锁,请求也会被丢弃。因此,这个方法只适合偶尔需要获取一些 Google 搜索结果的情况。有关基于代理的爬虫以及 Google 封锁机制的更多信息,请参阅 google-scraper.squabbel.com。此外,抓取也违反了 Google 的政策,因此是非法的。
google api 不允许超过 64 个结果,因此如果您需要更多结果,您需要自己抓取网站。因为这是一个有趣的项目,所以我创建了一个类来为你做这件事。
它需要免费的PHP Simple HTML DOM Parser,因此您还需要下载此代码。
它将输出一个数组,例如:
array(100) {
[0]=>
array(3) {
["title"]=>
string(67) "Online Tests - Online aptitude tests for interview, competitive ..."
["href"]=>
string(36) "http://www.indiabix.com/online-test/"
["description"]=>
string(168) "Online aptitude tests for competitive examination, entrance examination and
campus interview. Take various tests and find out how much you score before
you ... "
}
[1]=>
array(3) {
["title"]=>
string(37) "Test your English - Cambridge English"
["href"]=>
string(50) "http://www.cambridgeenglish.org/test-your-english/"
["description"]=>
string(179) "Test Your English. This is a quick, free online test. It will tell you which Cambridge
English exam may be best for you. Click 'Begin Test' and answer each of the ... "
}
//removed for better visibility
}
如何使用:
// Create a scraper that targets google.com (English results).
$scraper = new GoogleScraper();
// To target google.nl (Dutch results) instead, pass the base URL:
//$scraper = new GoogleScraper('https://www.google.nl');
// Tell the scraper what to search for.
$scraper->SearchQuery('online exams');
// Fetch the result pages; any integer greater than 0 is accepted.
$scraper->LoadPages(10);
// Dump the collected results. They are a plain array, so they can be
// processed in any other way as well.
$results = $scraper->GetResults();
echo '<pre>';
var_dump($results);
echo '</pre>';
?>
然后是 GoogleScraper.php 的内容:
<?php
require_once('simple_html_dom.php');
/**
 * Scrapes Google web search result pages and collects the organic results.
 *
 * Each collected result is an array with the keys 'title', 'href' and
 * 'description'. Parsing relies on the simple_html_dom class being loaded
 * and on the page markup matching the selectors used in ParseItem().
 */
class GoogleScraper
{
    /** @var array List of results collected so far (see class description). */
    private $_results;
    /** @var string Scheme and host of the search site, without trailing slash. */
    private $_baseUrl;
    /** @var string|int|float|null The search query; null until SearchQuery() is called. */
    private $_searchQuery;
    /** @var int Value sent as the "num" parameter and used as the paging step. */
    private $_resultsPerPage;

    /**
     * Sets all defaults in one place.
     *
     * @param string $baseUrl Base URL of the search site; an empty value
     *                        selects 'https://www.google.com'.
     */
    final public function __construct($baseUrl='')
    {
        $this->_results = array();
        $this->_resultsPerPage = 100;
        if (empty($baseUrl)) {
            $this->_baseUrl = 'https://www.google.com';
        } else {
            $this->_baseUrl = $baseUrl;
        }
    }

    /**
     * Releases the stored state.
     */
    final public function __destruct()
    {
        unset($this->_results);
        unset($this->_baseUrl);
        unset($this->_searchQuery);
    }

    /**
     * Sets the search query.
     *
     * @param string|int|float $searchQuery A string or numeric query.
     * @throws Exception When the query is neither a string nor numeric.
     */
    final public function SearchQuery($searchQuery)
    {
        if (!(is_string($searchQuery) || is_numeric($searchQuery))) {
            throw new Exception('Invalid query type');
        }
        $this->_searchQuery = $searchQuery;
    }

    /**
     * Sets the number of results requested per page.
     *
     * @param int $resultsPerPage Integer between 10 and 100 (inclusive).
     * @throws Exception When the value is not an integer in that range.
     */
    final public function ResultsPerPage($resultsPerPage)
    {
        if (!is_int($resultsPerPage) || $resultsPerPage<10 || $resultsPerPage>100) {
            throw new Exception('Results per page must be value between 10 and 100');
        }
        $this->_resultsPerPage = $resultsPerPage;
    }

    /**
     * Returns every result collected by LoadPages() so far.
     *
     * @return array List of arrays with 'title', 'href' and 'description'.
     */
    final public function GetResults()
    {
        return $this->_results;
    }

    /**
     * Downloads and parses the requested number of result pages, appending
     * every recognised result to the internal result list.
     *
     * @param int $pages Number of pages to load; must be an integer >= 1.
     * @throws Exception On an invalid page count, a missing query, or when a
     *                   page cannot be loaded.
     */
    final public function LoadPages($pages=1)
    {
        if (!is_int($pages) || $pages<1) {
            throw new Exception('Invalid number of pages');
        }
        // empty() would also reject the perfectly valid query "0".
        if (!isset($this->_searchQuery) || $this->_searchQuery === '') {
            throw new Exception('Missing search query');
        }

        for ($page = 0; $page < $pages; $page++) {
            $content = $this->LoadUrl($this->BuildSearchUrl($page * $this->_resultsPerPage));
            if (!$content) {
                throw new Exception('Failed to load page');
            }

            $html = new simple_html_dom();
            $html->load($content);
            foreach ($html->find('div#ires li') as $item) {
                $result = $this->ParseItem($item);
                if ($result !== null) {
                    $this->_results[] = $result;
                }
            }
            // Release the parsed DOM before the next page is loaded so that
            // memory does not accumulate across pages.
            $html->clear();
            unset($html);
        }
    }

    /**
     * Builds the search URL for the result offset $start.
     *
     * @param int $start Zero-based offset of the first result; 0 omits the
     *                   "start" parameter (first page).
     * @return string
     */
    private function BuildSearchUrl($start)
    {
        $url = $this->_baseUrl . '/search?num=' . $this->_resultsPerPage . '&q=' . urlencode($this->_searchQuery);
        if ($start > 0) {
            // Query parameters are separated by "&"; the former "$start="
            // was swallowed into the q value, so paging never advanced.
            $url .= '&start=' . $start;
        }
        return $url;
    }

    /**
     * Converts one result list item into a result array.
     *
     * @param object $item A simple_html_dom node for one list item.
     * @return array|null The result, or null when the item is not a normal
     *                    search result or has no title.
     */
    private function ParseItem($item)
    {
        // Only normal search results have exactly one div.s container;
        // special results such as images or news do not.
        if (count($item->find('div.s')) != 1) {
            return null;
        }

        $head = $item->find('h3.r a', 0);
        if (!$head) {
            return null;
        }
        $title = $head->plaintext;
        // Without a title there is no point in keeping the item.
        if (empty($title)) {
            return null;
        }

        $info = $item->find('span.st', 0);

        return array(
            'title' => $title,
            'href' => $this->ExtractTargetUrl($head->href),
            // The description element may be missing; null is kept in that case.
            'description' => $info ? $info->plaintext : null
        );
    }

    /**
     * Returns the "q" query parameter of $href when it has one, otherwise
     * $href unchanged.
     *
     * @param mixed $href The href attribute value read from the result link.
     * @return mixed
     */
    private function ExtractTargetUrl($href)
    {
        if (empty($href)) {
            return $href;
        }
        // Split at the first "?" only, so a target URL that itself contains
        // a "?" is not truncated.
        $parts = explode('?', $href, 2);
        if (!empty($parts[1])) {
            parse_str($parts[1], $query);
            if (!empty($query['q']) && is_string($query['q'])) {
                return $query['q'];
            }
        }
        return $href;
    }

    /**
     * Fetches $url and returns the body when the final response status is 200.
     *
     * Declared without "final": PHP 8 warns about final private methods.
     *
     * @param string $url
     * @return string|false The response body, or false on failure or a
     *                      non-200 status.
     * @throws Exception When $url is not a string.
     */
    private function LoadUrl($url)
    {
        if (!is_string($url)) {
            throw new Exception('Invalid url');
        }

        $options = array(
            'http' => array(
                'user_agent' => "GoogleScraper",
                'timeout' => 0.5
            )
        );
        $context = stream_context_create($options);
        // The second argument is a boolean; passing null is deprecated on PHP 8.1+.
        $content = file_get_contents($url, false, $context);
        if ($content === false || empty($http_response_header)) {
            return false;
        }

        // When redirects are followed, the header list holds one status line
        // per hop; the last one belongs to the response whose body was read.
        $status = 0;
        foreach ($http_response_header as $line) {
            if (preg_match('#^HTTP/\S+\s+(\d{3})#', $line, $match)) {
                $status = (int) $match[1];
            }
        }

        return ($status === 200) ? $content : false;
    }
}
?>
可以查看这个 PHP Fiddle 来了解它的实际运行效果。由于该服务器可能被很多人频繁使用,Google 有可能会返回 503 错误。