我正在为一个有点慢的网站编写一个页面抓取工具,但有很多信息我想用于小部件目的(在他们的许可下)。目前,执行和解析我到目前为止所抓取4-5 minutes
的所有内容都需要大致执行和解析。~150 pages
这将是一个crontab
'd 事件,并在生成时使用临时表,然后在完成时复制到“实时”表,因此从客户端的角度来看,这是一个无缝过渡,但是您能找到一种加快速度的方法我的代码,可能吗?
//mysql connection stuff here
function dnl2array($domnodelist) {
$return = array();
$nb = $domnodelist->length;
for ($i = 0; $i < $nb; ++$i) {
$return['pt'][] = utf8_decode(trim($domnodelist->item($i)->nodeValue));
$return['html'][] = utf8_decode(trim(get_inner_html($domnodelist->item($i))));
}
return $return;
}
function get_inner_html( $node ) {
$innerHTML= '';
$children = $node->childNodes;
foreach ($children as $child) {
$innerHTML .= $child->ownerDocument->saveXML( $child );
}
return $innerHTML;
}
// NEW curl instead of file_get_contents()
$c = curl_init($url);
curl_setopt($c, CURLOPT_HEADER, false);
curl_setopt($c, CURLOPT_USERAGENT, getUserAgent());
curl_setopt($c, CURLOPT_FAILONERROR, true);
curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($c, CURLOPT_AUTOREFERER, true);
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
curl_setopt($c, CURLOPT_TIMEOUT, 20);
// Grab the data.
$html = curl_exec($c);
// Check if the HTML didn't load right, if it didn't - report an error
if (!$html) {
echo "<p>cURL error number: " .curl_errno($c) . " on URL: " . $url ."</p>" .
"<p>cURL error: " . curl_error($c) . "</p>";
}
// $html = file_get_contents($url);
$doc = new DOMDocument;
// Load the html into our object
$doc->loadHTML($html);
$xPath = new DOMXPath( $doc );
// scrape initial page that contains list of everything I want to scrape
$results = $xPath->query('//div[@id="food-plan-contents"]//td[@class="product-name"]');
$test['itams'] = dnl2array($results);
foreach($test['itams']['html'] as $get_url){
$prepared_url[] = ""; // The url being scraped, modified slightly to gain access to more information -- not SO applicable data to see
}
$i = 0;
foreach($prepared_url as $url){
$c = curl_init($url);
curl_setopt($c, CURLOPT_HEADER, false);
curl_setopt($c, CURLOPT_USERAGENT, getUserAgent());
curl_setopt($c, CURLOPT_FAILONERROR, true);
curl_setopt($c, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($c, CURLOPT_AUTOREFERER, true);
curl_setopt($c, CURLOPT_RETURNTRANSFER, true);
curl_setopt($c, CURLOPT_TIMEOUT, 20);
// Grab the data.
$html = curl_exec($c);
// Check if the HTML didn't load right, if it didn't - report an error
if (!$html) {
echo "<p>cURL error number: " .curl_errno($c) . " on URL: " . $url ."</p>" .
"<p>cURL error: " . curl_error($c) . "</p>";
}
// $html = file_get_contents($url);
$doc = new DOMDocument;
$doc->loadHTML($html);
$xPath = new DOMXPath($doc);
$results = $xPath->query('//h3[@class="product-name"]');
$arr[$i]['name'] = dnl2array($results);
$results = $xPath->query('//div[@class="product-specs"]');
$arr[$i]['desc'] = dnl2array($results);
$results = $xPath->query('//p[@class="product-image-zoom"]');
$arr[$i]['img'] = dnl2array($results);
$results = $xPath->query('//div[@class="groupedTable"]/table/tbody/tr//span[@class="price"]');
$arr[$i]['price'] = dnl2array($results);
$arr[$i]['url'] = $url;
if($i % 5 == 1){
lazy_loader($arr); //lazy loader adds data to sql database
unset($arr); // keep memory footprint light (server is wimpy -- but free!)
}
$i++;
usleep(50000); // Don't be bandwith pig
}
// Get any stragglers
if(count($arr) > 0){
lazy_loader($arr);
$time = time() + (23 * 60 * 60); // Time + 23 hours for "tomorrow's date"
$tab_name = "sr_data_items_" . date("m_d_y", $time);
// and copy table now that script is finished
mysql_query("CREATE TABLE IF NOT EXISTS `{$tab_name}` LIKE `sr_data_items_skel`");
mysql_query("INSERT INTO `{$tab_name}` SELECT * FROM `sr_data_items_skel`");
mysql_query("TRUNCATE TABLE `sr_data_items_skel`");
}