I want to extract various data from URLs that will be converted to UTF-8 no matter what the encoding methods is used in original page (or at least it will work on most of the source encodings).
So, after looking and searching many discussions and answers, I finally came with the following code, with which I am parsing HTML data twice (once for detecting encoding and a second time for getting the actual data). This is working at least on all the checked URLs. But I think that the code is poorly written.
Can anyone let me know if there are any better alternatives to do the same or if I need any improvements on the code?
<?php
header('Content-Type: text/html; charset=utf-8');
require_once 'curl.php';
require_once 'curl_response.php';
$curl = new Curl;
$url = "http://" . $_GET['domain'];
$curl_response = $curl->get($url);
$header_content_type = $curl_response->headers['Content-Type'];
$dom_doc = new DOMDocument();
libxml_use_internal_errors(TRUE);
$dom_doc->loadHTML('<?xml encoding="utf-8" ?>' . $curl_response);
libxml_use_internal_errors(FALSE);
$metas = $dom_doc->getElementsByTagName('meta');
foreach ($metas as $meta) {
if (strtolower($meta->getAttribute('http-equiv')) == 'content-type') {
$meta_content_type = $meta->getAttribute('content');
}
if ($meta->getAttribute('charset') != '') {
$html5_charset = $meta->getAttribute('charset');
}
}
if (preg_match('/charset=(.+)/', $header_content_type, $m)) {
$charset = $m[1];
} elseif (preg_match('/charset=(.+)/', $meta_content_type, $m)) {
$charset = $m[1];
} elseif (!empty($html5_charset)) {
$charset = $html5_charset;
} elseif (preg_match('/encoding=(.+)/', $curl_response, $m)) {
$charset = $m[1];
} else {
// browser default charset
// $charset = 'ISO-8859-1';
}
if (!empty($charset) && $charset != "utf-8") {
$tmp = iconv($charset,'utf-8', $curl_response);
libxml_use_internal_errors(TRUE);
$dom_doc->loadHTML('<?xml encoding="utf-8" ?>' . $tmp);
libxml_use_internal_errors(FALSE);
}
$page_title = $dom_doc->getElementsByTagName('title')->item(0)->nodeValue;
$metas = $dom_doc->getElementsByTagName('meta');
foreach ($metas as $meta) {
if (strtolower($meta->getAttribute('name')) == 'description') {
$meta_description = $meta->getAttribute('content');
}
if (strtolower($meta->getAttribute('name')) == 'keywords') {
$meta_tags = $meta->getAttribute('content');
}
}
print $charset;
print "<hr>";
print $page_title;
print "<hr>";
print $meta_description;
print "<hr>";
print $meta_tags;
print "<hr>";
print "Memory Peak Usages: " . memory_get_peak_usage()/1024/1024 . " MB";
?>