可以尝试下面的代码：先抓取页面，再用一系列正则替换把 HTML 精简成适合移动设备的 XHTML。
// Download the page that is going to be converted.
$content = file_get_contents('http://www.filehippo.com');
// file_get_contents() returns false on failure; stop here instead of
// running the regex pipeline on a boolean and emitting an empty document.
if ($content === false) {
    throw new \RuntimeException('Unable to fetch http://www.filehippo.com');
}
$this->html = $content;
$this->process();
/**
 * Rewrites $this->html in place through a fixed sequence of regex
 * replacements: swaps the document prologue for an XHTML Mobile 1.0
 * doctype, reduces <head> to its <title>, flattens div/p/heading/table
 * markup into <br/> and <b>, strips presentational attributes, and removes
 * comments, scripts and styles.
 *
 * Every step goes through $this->_replace(); the order of the steps matters
 * because later patterns run on the output of earlier ones.
 */
function process(){
    // header: replace everything up to and including the FIRST opening
    // <head> tag (with or without attributes) by the mobile prologue
    $this->_replace('/\A.*?<head\b[^>]*>/is', "<?xml version='1.0' encoding='UTF-8'?><!DOCTYPE html PUBLIC '-//WAPFORUM//DTD XHTML Mobile 1.0//EN' 'http://www.wapforum.org/DTD/xhtml-mobile10.dtd'><html xmlns='http://www.w3.org/1999/xhtml'><head>");
    // title: keep only the first <title>...</title> inside <head>
    $this->_replace('/<head>.*?(<title\b[^>]*>.*?<\/title>).*?<\/head>/is', '<head>$1</head>');
    // strip out divs with little content
    $this->_stripContentlessDivs();
    // divs/p (\b keeps <pre>, <param>, <picture>, ... from being treated as <p>)
    $this->_replace('/<div\b[^>]*>/i', '');
    $this->_replace('/<\/div>/i', '<br/><br/>');
    $this->_replace('/<p\b[^>]*>/i', '');
    $this->_replace('/<\/p>/i', '<br/>');
    // h tags (h1 through h6)
    $this->_replace('/<h[1-6][^>]*>(.*?)<\/h[1-6]>/is', '<br/><b>$1</b><br/><br/>');
    // remove align/height/width/style/rel/id/class attributes, including the
    // value whether it is double-quoted, single-quoted or unquoted
    $this->_replace('/\s(?:align|height|width|style|rel|id|class)=(?:"[^"]*"|\'[^\']*\'|[^\s>]*)/i', '');
    // remove comments
    $this->_replace('/<\!--.*?-->/ism', '');
    // remove script/style (require a real closing tag, tolerate "</script >")
    $this->_replace('/<script[^>]*>.*?<\/script\s*>/is', '');
    $this->_replace('/<style[^>]*>.*?<\/style\s*>/is', '');
    // collapse runs of blank lines to a single newline so that text on
    // neighbouring lines is not glued together
    $this->_replace('/(?:\r?\n){2,}/', "\n");
    // remove multiple <br/>
    $this->_replace('/(<br\s?\/?>){2}/ism', '<br/>');
    $this->_replace('/(<br\s?\/?>\s*){3,}/ism', '<br/><br/>');
    // tables: drop the table and its section wrappers, turn row/cell ends
    // into line breaks (\b keeps <track> etc. from matching tr/td/th)
    $this->_replace('/<table\b[^>]*>/i', '');
    $this->_replace('/<\/table>/i', '<br/>');
    $this->_replace('/<\/?(?:thead|tbody|tfoot)\b[^>]*>/i', '');
    $this->_replace('/<(tr|td|th)\b[^>]*>/i', '');
    $this->_replace('/<\/(tr|td|th)\b[^>]*>/i', '<br/>');
    // wrap and close
}
/**
 * Applies one regular-expression replacement to $this->html.
 *
 * @param string $pattern     PCRE pattern, delimiters and flags included.
 * @param string $replacement Replacement text; may use $1-style references.
 * @param int    $limit       Maximum number of replacements, -1 for no limit.
 */
private function _replace($pattern, $replacement, $limit=-1){
    $result = preg_replace($pattern, $replacement, $this->html, $limit);
    // preg_replace() returns null when PCRE fails (for example when the
    // backtrack limit is hit on a large page). Keep the current markup in
    // that case instead of overwriting the whole document with null.
    if ($result === null) {
        trigger_error(
            'preg_replace failed for pattern ' . $pattern . ' (PCRE error code ' . preg_last_error() . ')',
            E_USER_WARNING
        );
        return;
    }
    $this->html = $result;
}
更多信息请参见 phpMobilizer 项目主页：https://code.google.com/p/phpmobilizer/