1

我正在尝试运行一个耗时较长的脚本,用来下载页面并解析其中的新闻条目。这个脚本需要下载大约 260 多个 HTML 页面,并从每个页面中解析出新闻条目。

这是一个需要长时间运行的脚本。

如果我把需要解析的页面数量设置得较少(也就是说执行时间较短),脚本可以正常运行:

// The University blog spans 262 listing pages; this short run only walks
// pages 2 through 21.
$listingUrl = "http://www.uvm.cl/noticias_mas.shtml?AA_SL_Session=34499aef1fc7a296fb666dcc7b9d8d05&scrl=1&scr_scr_Go=";
foreach (range(2, 21) as $pageNumber) {
    // Download one listing page and hand it to the parser, which appends
    // the news items it finds to $parsedNews.
    $page = file_get_html($listingUrl . $pageNumber);
    parse_page_for_news($page, $parsedNews);
}

如果我将页数增加到 40 页或更多,脚本就不再返回任何内容,Google Chrome 会显示一条错误消息(列在下面这段代码之后):

// The University blog spans 262 listing pages; this longer run walks
// pages 2 through 41, so it takes noticeably more time to finish.
$listingUrl = "http://www.uvm.cl/noticias_mas.shtml?AA_SL_Session=34499aef1fc7a296fb666dcc7b9d8d05&scrl=1&scr_scr_Go=";
foreach (range(2, 41) as $pageNumber) {
    // Download one listing page and hand it to the parser, which appends
    // the news items it finds to $parsedNews.
    $page = file_get_html($listingUrl . $pageNumber);
    parse_page_for_news($page, $parsedNews);
}

没有收到任何数据

无法加载网页,因为服务器未发送数据。

以下是一些建议: 稍后重新加载此网页。

错误 324 (net::ERR_EMPTY_RESPONSE):服务器关闭连接而不发送任何数据。

这是我根据搜索到的一些资料对 php.ini 所做的修改,但问题仍然没有解决。

;;;;;;;;;;;;;;;;;;;
; Resource Limits ;
;;;;;;;;;;;;;;;;;;;

max_execution_time = 0     ; Maximum execution time of each script, in seconds (0 = no time limit)
max_input_time = 60 ; Maximum amount of time each script may spend parsing request data, in seconds
memory_limit = -1      ; Maximum amount of memory a script may consume (-1 = no memory limit)

另一个奇怪的现象是,当我让脚本快速运行(迭代次数很少)时,确实可以在 access_log 文件中看到这些请求。

127.0.0.1 - - [27/Jul/2012:15:50:19 -0400] "GET /scrapernoticias/scraper.php HTTP/1.1" 200 509
127.0.0.1 - - [27/Jul/2012:15:50:23 -0400] "GET /scrapernoticias/scraper.php HTTP/1.1" 200 509
127.0.0.1 - - [27/Jul/2012:15:58:02 -0400] "GET /scrapernoticias/scraper.php HTTP/1.1" 200 500

当我把迭代次数调高(脚本运行时间变长)时,日志中就看不到新的请求,就好像它从未到达服务器一样。

这是整个脚本:

<h1>Scraper Noticias</h1>

<?php

include('simple_html_dom.php');
include('rb.php');

// Setup RedBean to work with a database.
// The DSN points at the MySQL database "noticias" on localhost; the
// connection uses the "root" account with an empty password.
R::setup('mysql:host=localhost;dbname=noticias','root','');

// Passing 0 removes PHP's execution time limit for this request, since
// downloading and parsing every listing page can take a long time.
set_time_limit(0);

/**
 * Plain data holder for a single scraped news item.
 *
 * Every field is null until its setter is called and afterwards holds
 * whatever value was last passed in; no validation or conversion is done.
 * Properties are public (as they were with the legacy `var` keyword), so
 * direct property access keeps working alongside the accessors.
 */
class News {
    /** Thumbnail image reference (the parser stores an <img> src here). */
    public $image;

    /** Publication date text, stored exactly as scraped. */
    public $fechanoticia;

    /** Headline of the news item. */
    public $title;

    /** Body/summary text of the news item. */
    public $description;

    /** URL of the full article. */
    public $sourceurl;

    /** Returns the stored thumbnail image reference, or null if unset. */
    public function get_image() {
        return $this->image;
    }

    /** Stores the thumbnail image reference. */
    public function set_image($new_image) {
        $this->image = $new_image;
    }

    /** Returns the stored publication date text, or null if unset. */
    public function get_fechanoticia() {
        return $this->fechanoticia;
    }

    /** Stores the publication date text. */
    public function set_fechanoticia($new_fechanoticia) {
        $this->fechanoticia = $new_fechanoticia;
    }

    /** Returns the stored headline, or null if unset. */
    public function get_title() {
        return $this->title;
    }

    /** Stores the headline. */
    public function set_title($new_title) {
        $this->title = $new_title;
    }

    /** Returns the stored description text, or null if unset. */
    public function get_description() {
        return $this->description;
    }

    /** Stores the description text. */
    public function set_description($new_description) {
        $this->description = $new_description;
    }

    /** Returns the stored article URL, or null if unset. */
    public function get_sourceurl() {
        return $this->sourceurl;
    }

    /** Stores the article URL. */
    public function set_sourceurl($new_sourceurl) {
        $this->sourceurl = $new_sourceurl;
    }
}

// Declare variable to hold all parsed news items.
$parsedNews = array();

// Grab page number 1, and parse that first.
$initialPage = file_get_html('http://www.uvm.cl/noticias_mas.shtml');
if ($initialPage) {
    parse_page_for_news($initialPage, $parsedNews);
    // The parser only copies strings out of the DOM, so the tree can be
    // released right away. simple_html_dom documents clear() as the way to
    // free a parsed document; without it memory grows with every page.
    $initialPage->clear();
} else {
    // file_get_html() gives back a falsy value when the download fails;
    // log it instead of letting the parser hit a fatal error.
    error_log('Scraper: could not load http://www.uvm.cl/noticias_mas.shtml');
}
unset($initialPage);

// The University blog has 262 listing pages; the upper bound below decides
// how many of them (starting at page 2) are fetched in this run.
for ($i = 2; $i <= 3; $i++) {
    $url = "http://www.uvm.cl/noticias_mas.shtml?AA_SL_Session=34499aef1fc7a296fb666dcc7b9d8d05&scrl=1&scr_scr_Go=" . $i;
    $page = file_get_html($url);

    if (!$page) {
        // Skip pages that could not be downloaded rather than aborting the
        // whole run with a fatal error.
        error_log('Scraper: could not load ' . $url);
        continue;
    }

    parse_page_for_news($page, $parsedNews);

    // Free this page's DOM before fetching the next one.
    $page->clear();
    unset($page);
}

echo "<h1>Noticias encontradas:" . count($parsedNews) . "</h1>";

// Save each parsed news to the database.
// Iterating by value is enough here: the items are objects and are only
// read, and it avoids leaving a dangling reference behind after the loop.
foreach ($parsedNews as $tmpNews) {
    $noticia = R::dispense('noticia');
    $noticia->imagen = $tmpNews->get_image();
    $noticia->fecha = $tmpNews->get_fechanoticia();
    $noticia->titulo = $tmpNews->get_title();
    $noticia->url = $tmpNews->get_sourceurl();
    $noticia->descripcion = $tmpNews->get_description();
    R::store($noticia);
}

// Disconnect from the database.
R::close();

// Function receives an HTML Dom object, and the library works against that single HTML object.
/**
 * Extracts the news items from one parsed listing page.
 *
 * Every <p> inside #cont2 is treated as one news item. For each of them a
 * News object is filled in and appended to $parsedNews.
 *
 * @param object $page       Parsed HTML document exposing find(), as returned
 *                           by file_get_html(). A non-object (e.g. the result
 *                           of a failed download) is logged and ignored.
 * @param array  $parsedNews Collected News objects; new items are appended
 *                           to this array in place.
 * @return void
 */
function parse_page_for_news ($page, &$parsedNews) {

    // A failed download does not yield a document object; calling find()
    // on it would be a fatal error, so bail out early instead.
    if (!is_object($page)) {
        error_log('parse_page_for_news: no parsed page given, nothing extracted.');
        return;
    }

    foreach ($page->find('#cont2 p') as $element) {

        $newItem = new News;

        // Look the nodes up once; they are needed both for reading the
        // values and for stripping them out of the description below.
        $images  = $element->find('img');
        $anchors = $element->find('a');

        // Thumbnail image: if there are several images, the last one wins.
        foreach ($images as $image) {
            $newItem->set_image($image->src);
        }

        // Post date.
        foreach ($element->find('span.fechanoticia') as $fecha) {
            $newItem->set_fechanoticia($fecha->innertext);
        }

        // Title and source URL both come from the anchor; if there are
        // several anchors, the last one wins for both fields.
        foreach ($anchors as $anchor) {
            $newItem->set_title($anchor->innertext);
            $newItem->set_sourceurl("http://www.uvm.cl/" . $anchor->href);
        }

        // The description is whatever remains of the paragraph once the
        // anchors, spans and images have been blanked out. This must happen
        // only after all the values above have been read.
        foreach ($anchors as $anchor) {
            $anchor->outertext = '';
        }

        foreach ($element->find('span') as $span) {
            $span->outertext = '';
        }

        foreach ($images as $image) {
            $image->outertext = '';
        }

        $newItem->set_description($element->innertext);

        // Add the newly formed news item to the caller's collection.
        $parsedNews[] = $newItem;
    }
}

?>
4

0 回答 0