/**
 * Fetch a URL with cURL and hand back whatever curl_exec() produced.
 *
 * The request identifies itself with a legacy MSIE 6 user-agent string and
 * gives up on connecting after 5 seconds. Because CURLOPT_RETURNTRANSFER is
 * enabled, the response body is returned rather than printed.
 *
 * @param string $url Address to request.
 * @return string|bool Response body on success, false when the transfer fails.
 */
function get_url_contents($url) {
    $handle = curl_init();

    // The URL is applied last so the fixed options are always in place first.
    curl_setopt_array($handle, array(
        CURLOPT_USERAGENT      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_URL            => $url,
    ));

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
/**
 * Collect every element of a document whose class attribute contains the
 * given class name as one of its whitespace-separated tokens.
 *
 * The attribute value is split on HTML whitespace (space, tab, line feed,
 * form feed, carriage return), so tokens separated by tabs, newlines or runs
 * of spaces are recognised. Empty tokens are discarded, which means an empty
 * class name never matches. Matching is exact and case-sensitive.
 *
 * @param DOMDocument $DOMDocument Document to search.
 * @param string      $ClassName   Single class name to look for.
 * @return DOMElement[] Matching elements, in the order
 *                      getElementsByTagName('*') yields them; an empty array
 *                      when nothing matches.
 */
function getElementsByClassName(DOMDocument $DOMDocument, $ClassName)
{
    // Normalise to string so the strict comparison below still matches when a
    // caller passes a numeric class name such as 42 for class="42".
    $ClassName = (string) $ClassName;
    $Matched = array();

    foreach ($DOMDocument->getElementsByTagName('*') as $node) {
        if (!$node->hasAttribute('class')) {
            continue;
        }

        // Split on any run of HTML whitespace and drop empty pieces.
        $classes = preg_split(
            '/[ \t\n\f\r]+/',
            $node->getAttribute('class'),
            -1,
            PREG_SPLIT_NO_EMPTY
        );

        if (in_array($ClassName, $classes, true)) {
            $Matched[] = $node;
        }
    }

    return $Matched;
}
// Print "<title> - <IMDb id>" for every movie on IMDb's in-theaters page.

// Keep libxml's complaints about real-world markup in its internal buffer
// instead of raising a PHP warning for each one.
libxml_use_internal_errors(true);

$content = get_url_contents("http://www.imdb.com/movies-in-theaters/");

$dom = new DOMDocument();
if (is_string($content) && $content !== '') {
    $dom->loadHTML($content);
} else {
    // A failed download yields false; loadHTML() must not be given an empty
    // source, so report the problem and carry on with an empty document.
    trigger_error('Could not download the movies-in-theaters page', E_USER_WARNING);
}

$elemsByClassName = getElementsByClassName($dom, 'overview-top');
foreach ($elemsByClassName as $elem) {
    foreach ($elem->getElementsByTagName('a') as $a) {
        // Capture group 2 is the identifier that follows "title/" in the link.
        $matches = array();
        $id = '';
        if (preg_match('/(title\/)([0-9A-Za-z]+)(\/)?/', $a->getAttribute('href'), $matches) === 1) {
            $id = $matches[2];
        }

        // The title comes from a remote page and is written into HTML, so it
        // is escaped; $id is empty or purely alphanumeric by construction.
        echo htmlspecialchars($a->nodeValue, ENT_QUOTES, 'UTF-8') . ' - ' . $id . '<br/>';
        break; // we need only the first A tag.
    }
}
Output:
Star Trek Into Darkness (2013) - tt1408101
Frances Ha (2012) - tt2347569
Stories We Tell (2012) - tt2366450
The Expatriate (2012) - tt1645155
The English Teacher (2013) - tt2055765
Augustine (2012) - tt2098628
Black Rock (2012) - tt1930294
State 194 (2012) - tt2324918
Iron Man 3 (2013) - tt1300854
The Great Gatsby (2013) - tt1343092
Pain & Gain (2013) - tt1980209
Peeples (2013) - tt1699755
42 (2013) - tt0453562
Oblivion (2013) - tt1483013
The Croods (2013) - tt0481499
The Big Wedding (2013) - tt1931435
Mud (2012) - tt1935179
Oz the Great and Powerful (2013) - tt1623205