0

我得到了这个函数来从外部 URL 获取一些信息。问题是,如果网站有 robots no-index 这个函数会崩溃,然后导致 foreach 循环崩溃。

错误信息:

警告:file_get_contents(http://webontwerp-arnhem.nl/contact):无法打开流:/var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/simple_html_dom.php 中的连接被拒绝第 79 行

致命错误:未捕获的错误:调用 /var/www/vhosts/free-sitemap-generator.com/httpdocs/includes/cra/xml-functions.php:60 中布尔值的成员函数 find() 堆栈跟踪:#0 /var/www/vhosts/free-sitemap-generator.com/httpdocs/crawler.php(44): crawl_site(' http://webontwe ...') #1 {main} 抛出 /var/www/vhosts /free-sitemap-generator.com/httpdocs/includes/cra/xml-functions.php 第 60 行

功能:

function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
    $contents = file_get_contents($url, $use_include_path, $context, $offset);

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
    {
        return false;
    }

    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}

使用循环调用函数:

function crawl_site($u) {
    $urlList = array();
    global $crawled_urls, $found_urls;
    $uen = urlencode($u);
    if ((array_key_exists($uen, $crawled_urls) == 0 || $crawled_urls[$uen] < date("YmdHis", strtotime('-25 seconds', time())))) {
        $html = file_get_html($u);

        $crawled_urls[$uen] = date("YmdHis");
        foreach($html -> find("a")as $li) {
            $url = perfect_url($li -> href, $u);
            $enurl = urlencode($url);
            $str = basename($url);
            $dirn = dirname($url);
            if ($url != '' && substr($url, 0, 4) != "mail" && substr($url, 0, 3) != "tel" && substr($url, 0, 5) != "phone" && substr($url, 0, 5) != "skype" && substr($url, 0, 4) != "java" && array_key_exists($enurl, $found_urls) == 0) {
                $found_urls[$enurl] = 1;
                $pos = strpos($str[0], '#');
                $ext = strpos($url, $u);
                if ($ext != = false && $pos == = false) {
                    echo "<li><div class='url-row'>$dirn/<span class='strong'>$str</span></div></li>";

                    array_push($urlList, $url);
                }
            }
        }
    }
}
4

1 回答 1

0

您可以使用 CURL 代替 file_get_contents()

<?php
    $url = 'http://webontwerp-arnhem.nl/contact';         
    $ch = curl_init();        
    curl_setopt($ch, CURLOPT_URL, $url);        
    curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);        
    $contents = curl_exec ($ch);        
    curl_close ($ch); 
于 2017-03-03T18:11:51.503 回答