我得到了这个函数来从外部 URL 获取一些信息。问题是,如果网站有 robots no-index 这个函数会崩溃,然后导致 foreach 循环崩溃。
错误信息:
警告:file_get_contents(http://webontwerp-arnhem.nl/contact):无法打开流:/var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/simple_html_dom.php 中的连接被拒绝第 79 行
致命错误:未捕获的错误:调用 /var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/xml-functions.php:60 中布尔值的成员函数 find() 堆栈跟踪:#0 /var/www/vhosts/free-sitemap-generator.com/httpdocs/crawler.php(44): crawl_site(' http://webontwe ...') #1 {main} 抛出 /var/www/vhosts /free-sitemap-generator.com/httpdocs/includes/cra/xml-functions.php 第 60 行
功能:
function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
$dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
$contents = file_get_contents($url, $use_include_path, $context, $offset);
if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
{
return false;
}
$dom->load($contents, $lowercase, $stripRN);
return $dom;
}
使用循环调用函数:
function crawl_site($u) {
$urlList = array();
global $crawled_urls, $found_urls;
$uen = urlencode($u);
if ((array_key_exists($uen, $crawled_urls) == 0 || $crawled_urls[$uen] < date("YmdHis", strtotime('-25 seconds', time())))) {
$html = file_get_html($u);
$crawled_urls[$uen] = date("YmdHis");
foreach($html -> find("a")as $li) {
$url = perfect_url($li -> href, $u);
$enurl = urlencode($url);
$str = basename($url);
$dirn = dirname($url);
if ($url != '' && substr($url, 0, 4) != "mail" && substr($url, 0, 3) != "tel" && substr($url, 0, 5) != "phone" && substr($url, 0, 5) != "skype" && substr($url, 0, 4) != "java" && array_key_exists($enurl, $found_urls) == 0) {
$found_urls[$enurl] = 1;
$pos = strpos($str[0], '#');
$ext = strpos($url, $u);
if ($ext != = false && $pos == = false) {
echo "<li><div class='url-row'>$dirn/<span class='strong'>$str</span></div></li>";
array_push($urlList, $url);
}
}
}
}
}