您不应该使用正则表达式来解析 HTML。下面是我整理的一个简单函数,它使用 DOMDocument 和 cURL,速度也更快。
抓取示例:
查找所有 onmouseout 属性值为 “return nd();” 的 a 链接:
<?php
// Address of the band listing page; it is also sent as the referer.
$link = 'http://www.bandliste.de/Bandliste/';

// Download the page markup.
$data = curl_get($link, $link);

// Collect every <a> element whose onmouseout attribute equals "return nd();".
$info = DOMParse($data, 'a', 'onmouseout', 'return nd();');

// Dump the collected rows (sample output is shown in the comment below).
print_r($info);
/*
Array
(
[0] => Array
(
[tag] => a
[onmouseout] => return nd();
[text] => Martin und Kiehm
)
[1] => Array
(
[tag] => a
[onmouseout] => return nd();
[text] => Blues For Three
)
[2] => Array
(
[tag] => a
[onmouseout] => return nd();
[text] => Phrase Applauders
)
...
...
*/
?>
或者第二个示例:查找 class 属性值为 “bandinfo” 的 div:
<?php
// Absolute URL of the band page: site root followed by the relative path.
$link = 'http://www.bandliste.de/' . 'Bands/Falling_For_Beautiful/14469/';

// Download the page, passing its own URL as the referer.
$data = curl_get($link, $link);

// Collect the <div> elements whose class attribute equals "bandinfo".
$info = DOMParse($data, 'div', 'class', 'bandinfo');
/*
Array
(
[0] => Array
(
[tag] => div
[class] => bandinfo
[text] => What? We are Falling For Beautiful and we make music. And basically thats it. Sound? Rock. Indie. Alternative. Pop. Who? Adrianne (Vocals/Guitar) Nina (Guitar/Special Effects) Bianca (Bass) Marisa (Drums) When? Some of us started having a band in 2003 we played tons of gigs, covered tons of songs, started writing our own songs. In 2008 we decided to forget about that and founded FFB. So we started to write songs and arranged them. We made them sound simple and catchy focusing on lyrics. Our songs are about life. Booking: Bianca Untertrifallerhttp://www.fallingforbeautiful.com
)
)
*/
?>
或者提取写在 onclick 的 JavaScript 代码中的图片地址:
先获取所有带 onclick 属性的 img 标签
<?php
// Collect the <img> elements together with their onclick attribute
// ($data is the page markup downloaded beforehand with curl_get()).
$img = DOMParse($data, 'img', 'onclick');
/**
 * Find the image opened by the "Band Foto" popup among a list of onclick handlers.
 *
 * Rows without a string 'onclick' entry, and handlers that mention "Band Foto"
 * but do not have the expected window.open('...', 'Band Foto' shape, are skipped
 * instead of triggering an undefined-index warning.
 *
 * @param array $array Rows shaped like the ones DOMParse() returns when asked for
 *                     the 'onclick' attribute (each row holds an 'onclick' string).
 * @return string|null First argument of the matching window.open() call, or null
 *                     when no row matches.
 */
function parse_img($array){
    foreach($array as $value){
        // Ignore rows that carry no usable onclick handler.
        if(!isset($value['onclick']) || !is_string($value['onclick'])){
            continue;
        }
        $onclick = $value['onclick'];
        // Cheap pre-filter before running the regular expression.
        if(strpos($onclick,"Band Foto")===false){
            continue;
        }
        // The dot is escaped so only a literal "window.open(" is accepted.
        if(preg_match('#window\.open\(\'(.*?)\', \'Band Foto\'#',$onclick,$match)===1){
            return $match[1];
        }
    }
    return null;
}
//echo parse_img($img); //bandfoto-14469.jpg
?>
实际的 DOM 解析函数:
<?php
/**
 * Extract the elements of one tag name from an HTML string using DOMDocument.
 *
 * Every hit is returned as an associative array holding the tag name, the
 * requested attribute (only when one was asked for) and the element's text
 * with each run of whitespace collapsed to a single space.
 *
 * - No $attribute: elements with blank text are skipped.
 * - $attribute only: elements are skipped when both their text and the
 *   attribute are empty.
 * - $attribute and $attributeValue: only elements whose attribute equals the
 *   value are returned.
 *
 * @param string|false $source         HTML markup. Empty, blank or non-string
 *                                     input (e.g. false from a failed download)
 *                                     yields an empty array.
 * @param string       $tags           Tag name to look for, e.g. 'a' or 'div'.
 * @param string|null  $attribute      Attribute to read from each element, or
 *                                     null/'' to collect text only.
 * @param string|null  $attributeValue Value the attribute must equal, or
 *                                     null/'' to accept any value.
 * @return array List of arrays with the keys 'tag', $attribute (if given) and 'text'.
 */
function DOMParse($source,$tags,$attribute=null,$attributeValue=null){
    // Calling header() after output has started only raises a warning, so the
    // charset header is sent only while that is still possible.
    if(!headers_sent()){
        header('Content-Type: text/html; charset=utf-8');
    }

    $return = array();

    // DOMDocument::loadHTML() rejects empty input, so bail out early.
    if(!is_string($source) || trim($source)===''){
        return $return;
    }

    $dom = new DOMDocument("1.0","UTF-8");

    // Collect libxml parse errors internally instead of silencing them with @,
    // then restore whatever error mode the caller had configured.
    $previousMode = libxml_use_internal_errors(true);
    $dom->loadHTML($source);
    libxml_clear_errors();
    libxml_use_internal_errors($previousMode);

    $dom->preserveWhiteSpace = false;

    $wantsAttribute = ($attribute!==null && $attribute!=='');
    $filterByValue  = ($attributeValue!==null && $attributeValue!=='');

    foreach($dom->getElementsByTagName($tags) as $node){
        $rawText = $node->nodeValue;
        $hasText = (trim($rawText)!=='');

        if(!$wantsAttribute){
            // Text-only mode: ignore elements without visible text.
            if($hasText){
                $return[] = array('tag'=>$tags,'text'=>preg_replace('/\s+/', ' ',$rawText));
            }
            continue;
        }

        $value = $node->getAttribute($attribute);

        // Nothing useful to report when both text and attribute are empty.
        if(!$hasText && $value===''){
            continue;
        }
        // When a specific value was requested, keep exact matches only.
        if($filterByValue && $value!==(string)$attributeValue){
            continue;
        }

        $return[] = array('tag'=>$tags,$attribute=>$value,'text'=>preg_replace('/\s+/', ' ',$rawText));
    }

    return $return;
}
?>
以及 cURL 函数:
<?php
/**
 * Download a URL and return the response body.
 *
 * Uses the cURL extension with browser-like request headers when it is
 * available and falls back to file_get_contents() otherwise.
 *
 * @param string $url     Address to fetch.
 * @param string $referer Value sent as the Referer header (cURL path only).
 * @return string|false Response body, or false when the request fails.
 */
function curl_get($url, $referer){
    // Fall back to file_get_contents() only when cURL is really missing.
    // (A loose comparison of '' with false is true, so the check must be explicit.)
    if(!function_exists('curl_init')){
        return file_get_contents($url);
    }

    $curl = curl_init();
    if($curl===false){
        return false;
    }

    $header = array(
        "Accept: text/xml,application/xml,application/json,application/xhtml+xml,"
            ."text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
        "Cache-Control: max-age=0",
        "Connection: keep-alive",
        "Keep-Alive: 300",
        "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
        "Accept-Language: en-us,en;q=0.5",
        "Pragma: ",
    );

    curl_setopt_array($curl, array(
        CURLOPT_URL            => $url,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0 Firefox/5.0',
        CURLOPT_HTTPHEADER     => $header,
        CURLOPT_HEADER         => 0,
        CURLOPT_REFERER        => $referer,
        CURLOPT_ENCODING       => 'gzip,deflate',
        CURLOPT_AUTOREFERER    => true,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT        => 30,
        // NOTE(review): peer verification is switched off, so HTTPS certificates
        // are not checked - confirm this is acceptable for the sites being fetched.
        CURLOPT_SSL_VERIFYPEER => false,
    ));

    $html = curl_exec($curl);
    curl_close($curl);

    return $html;
}
?>
希望能帮助到你。