公共服务公告:不要使用正则表达式重写格式化文档的元素。
执行此操作的正确方法是将文档加载为对象(DOMDocument 或 SimpleXMLElement),并根据节点和值进行处理。
最初的解决方案也没有处理带 src 属性的标签,也没有解析相对于站点根目录的 URL(例如 /css/style.css)。
下面是一个基本可用的解决方案,如有需要可以在此基础上扩展:
# Example URL
$url = "http://www.stackoverflow.com/";
# Get the root and current directory
$pattern = "/(.*\/\/[^\/]+\/)([^?#]*\/)?/";
/* The pattern has two groups: one for the domain (anything before
the first two slashes, the slashes, anything until the next slash,
and the next slash) and one for the current directory (anything
that isn't an anchor or query string, up to the last slash before
any anchor or query string). For a URL such as
http://stackoverflow.com/question/123412341234 this yields:
- [0]: http://stackoverflow.com/question/
- [1]: http://stackoverflow.com/
- [2]: question/
We only need [0] (the entire match, i.e. the current directory) and
[1] (just the first group, i.e. the root).
*/
$matches = array();
# The pattern requires "//", a host and a slash after the host; without a
# match there is no root or directory to resolve relative URLs against.
if(preg_match($pattern, $url, $matches) !== 1)
    throw new InvalidArgumentException("Unable to derive root and directory from URL: {$url}");
$cd = $matches[0];
$root = $matches[1];
# Normalizes the URL on the provided element's attribute, rewriting it in
# place so that it becomes absolute.
#
# Reads two globals: $root (scheme and host, ending in a slash) and $cd
# (URL of the current directory, ending in a slash).
#
# $element - DOMElement whose attribute is rewritten
# $attr    - name of the attribute that holds the URL (e.g. 'href', 'src')
#
# Returns nothing; the element is modified in place.
function normalizeAttr($element, $attr){
    global $cd, $root;
    # getAttribute() returns '' for a missing attribute; without this guard
    # an inline <script> or an <a name="..."> would gain a bogus URL.
    if(!$element->hasAttribute($attr))
        return;
    $href = $element->getAttribute($attr);
    # Leave alone anything that does not depend on the current directory:
    # values with a scheme (http:, mailto:, javascript:, data:, ...),
    # protocol-relative URLs (//host/...) and fragment-only links (#...).
    # The check is anchored so that a relative URL which merely contains
    # "//host/" in its query string is still treated as relative.
    if(preg_match('/^(?:[a-z][a-z0-9+.\-]*:|\/\/|#)/i', $href))
        return;
    # If this is a base-relative URL, prepend the base ($root already ends
    # in a slash, so the leading slash of the value is dropped)
    if(substr($href, 0, 1) === '/')
        $element->setAttribute($attr, $root . substr($href, 1));
    # Otherwise this is a relative URL: prepend the current directory
    else
        $element->setAttribute($attr, $cd . $href);
}
# Load in the data, ignoring HTML5 errors
$page = new DOMDocument();
# Buffer libxml parse errors instead of emitting a warning for each one,
# and remember the previous setting so it is restored rather than forced off.
$previousErrorMode = libxml_use_internal_errors(true);
$loaded = $page->loadHTMLFile($url);
# Drop the buffered errors so they do not linger in memory
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);
# Without a document there is nothing to normalize or render
if($loaded === false)
    throw new RuntimeException("Unable to load HTML from {$url}");
$page->normalizeDocument();
# Tag name => attribute that carries a URL on that tag:
# <link href="..."/>, <a href="...">...</a>, <img src="..."/>,
# <script src="..."></script>
$urlAttributes = array(
    'link' => 'href',
    'a' => 'href',
    'img' => 'src',
    'script' => 'src',
);
foreach($urlAttributes as $tag => $attr)
    foreach($page->getElementsByTagName($tag) as $node)
        normalizeAttr($node, $attr);
# Render normalized data
print $page->saveHTML();