0

好的,我尝试构建我的第一个适当的 cUrl 函数,我使用 Nettuts cUrl(http://net.tutsplus.com/tutorials/php/techniques-and-resources-for-mastering-curl/)wordpress 链接检查器作为基础,然后出于安全原因重做数据库访问。我不知道为什么它不起作用,因为我只重写了数据库访问部分和第 32 行的一些更改。我还将发布来自 Nettuts 的原始代码,希望对您有所帮助。该代码用于检查文档(.PDF 和 .doc)的链接是否仍然存在或是否需要更新。

任何帮助,将不胜感激!

原始代码

    // CONFIG  
$db_host = 'localhost';  
$db_user = 'root';  
$db_pass = '';  
$db_name = 'wordpress';  
$excluded_domains = array(  
    'localhost', 'www.mydomain.com');  
$max_connections = 10;  
// initialize some variables  
$url_list = array();  
$working_urls = array();  
$dead_urls = array();  
$not_found_urls = array();  
$active = null;  

// connect to MySQL  
if (!mysql_connect($db_host, $db_user, $db_pass)) {  
    die('Could not connect: ' . mysql_error());  
}  
if (!mysql_select_db($db_name)) {  
    die('Could not select db: ' . mysql_error());  
}  
// get all published posts that have links  
$q = "SELECT post_content FROM wp_posts 
    WHERE post_content LIKE '%href=%' 
    AND post_status = 'publish' 
    AND post_type = 'post'";  
$r = mysql_query($q) or die(mysql_error());  
while ($d = mysql_fetch_assoc($r)) {  

    // get all links via regex  
    if (preg_match_all("!href=\"(.*?)\"!", $d['post_content'], $matches)) {  

        foreach ($matches[1] as $url) {  

            // exclude some domains  
            $tmp = parse_url($url);  
            if (in_array($tmp['host'], $excluded_domains)) {  
                continue;  
            }  

            // store the url  
            $url_list []= $url;  
        }  
    }  
}  

// remove duplicates  
$url_list = array_values(array_unique($url_list));  

if (!$url_list) {  
    die('No URL to check');  
}  

    // 1. multi handle  
$mh = curl_multi_init();  

// 2. add multiple URLs to the multi handle  
for ($i = 0; $i < $max_connections; $i++) {  
    add_url_to_multi_handle($mh, $url_list);  
}  

// 3. initial execution  
do {  
    $mrc = curl_multi_exec($mh, $active);  
} while ($mrc == CURLM_CALL_MULTI_PERFORM);  

// 4. main loop  
while ($active && $mrc == CURLM_OK) {  

    // 5. there is activity  
    if (curl_multi_select($mh) != -1) {  

        // 6. do work  
        do {  
            $mrc = curl_multi_exec($mh, $active);  
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);  

        // 7. is there info?  
        if ($mhinfo = curl_multi_info_read($mh)) {  
            // this means one of the requests were finished  

            // 8. get the info on the curl handle  
            $chinfo = curl_getinfo($mhinfo['handle']);  

            // 9. dead link?  
            if (!$chinfo['http_code']) {  
                $dead_urls []= $chinfo['url'];  

            // 10. 404?  
            } else if ($chinfo['http_code'] == 404) {  
                $not_found_urls []= $chinfo['url'];  

            // 11. working  
            } else {  
                $working_urls []= $chinfo['url'];  
            }  

            // 12. remove the handle  
            curl_multi_remove_handle($mh, $mhinfo['handle']);  
            curl_close($mhinfo['handle']);  

            // 13. add a new url and do work  
            if (add_url_to_multi_handle($mh, $url_list)) {  

                do {  
                    $mrc = curl_multi_exec($mh, $active);  
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);  
            }  
        }  
    }  
}  

// 14. finished  
curl_multi_close($mh);  

echo "==Dead URLs==\n";  
echo implode("\n",$dead_urls) . "\n\n";  

echo "==404 URLs==\n";  
echo implode("\n",$not_found_urls) . "\n\n";  

echo "==Working URLs==\n";  
echo implode("\n",$working_urls);  

// 15. adds a url to the multi handle  
function add_url_to_multi_handle($mh, $url_list) {  
    static $index = 0;  

    // if we have another url to get  
    if ($url_list[$index]) {  

        // new curl handle  
        $ch = curl_init();  

        // set the url  
        curl_setopt($ch, CURLOPT_URL, $url_list[$index]);  
        // to prevent the response from being outputted  
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  
        // follow redirections  
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);  
        // do not need the body. this saves bandwidth and time  
        curl_setopt($ch, CURLOPT_NOBODY, 1);  

        // add it to the multi handle  
        curl_multi_add_handle($mh, $ch);  
        // increment so next url is used next time  
        $index++;  

        return true;  
    } else {  

        // we are done adding new URLs  
        return false;  
    }  
}  

我的代码

<?php
/*Config*/
/*** mysql hostname ***/
$hostname = 'localhost';

/*** mysql username ***/
$username = 'root';

/*** mysql password ***/
$password = 'root';
/*curl setup of varibles*/
$excluded_domains = array(  
'localhost', 'rollnstroll.se');
$max_connections = 10;
$url_list = array();  
$working_urls = array();  
$dead_urls = array();  
$not_found_urls = array();  
$active = null;



try {
  $dbh = new PDO("mysql:host=$hostname;dbname=blankett", $username, $password);
  $dbh->exec('SET CHARACTER SET utf8');
  /*** echo a message saying we have connected ***/


/*** fetch into an PDOStatement object ***/
$sql = "SELECT link_forms FROM forms2 WHERE id = ?";

$stmt = $dbh->query($sql);

$result = $stmt->fetch(PDO::FETCH_ASSOC);

// get all links via regex  
if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {

    foreach ($matches[1] as $url) { 

        // exclude some domains  
        $tmp = parse_url($url);  
        if (in_array($tmp['host'], $excluded_domains)) {  
            continue;  
        }
        // store the url  
        $url_list []= $url; 
    }
}

// remove duplicates  
$url_list = array_values(array_unique($url_list));

if (!$url_list) {  
die('No URL to check');  
}  


}
catch(PDOException $e)
{
echo $e->getMessage();
}

// 1. multi handle  
$mh = curl_multi_init();  

// 2. add multiple URLs to the multi handle  
for ($i = 0; $i < $max_connections; $i++) {  
add_url_to_multi_handle($mh, $url_list);  
}  

// 3. initial execution  
do {  
$mrc = curl_multi_exec($mh, $active);  
} while ($mrc == CURLM_CALL_MULTI_PERFORM);  

// 4. main loop  
while ($active && $mrc == CURLM_OK) {  

// 5. there is activity  
if (curl_multi_select($mh) != -1) {  

    // 6. do work  
    do {  
        $mrc = curl_multi_exec($mh, $active);  
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);  

    // 7. is there info?  
    if ($mhinfo = curl_multi_info_read($mh)) {  
        // this means one of the requests were finished  

        // 8. get the info on the curl handle  
        $chinfo = curl_getinfo($mhinfo['handle']);  

        // 9. dead link?  
        if (!$chinfo['http_code']) {  
            $dead_urls []= $chinfo['url'];  

        // 10. 404?  
        } else if ($chinfo['http_code'] == 404) {  
            $not_found_urls []= $chinfo['url'];  

        // 11. working  
        } else {  
            $working_urls []= $chinfo['url'];  
        }  

        // 12. remove the handle  
        curl_multi_remove_handle($mh, $mhinfo['handle']);  
        curl_close($mhinfo['handle']);  

        // 13. add a new url and do work  
        if (add_url_to_multi_handle($mh, $url_list)) {  

            do {  
                $mrc = curl_multi_exec($mh, $active);  
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);  
        }  
    }  
  }  
}  

// 14. finished  
curl_multi_close($mh);  

echo "==Dead URLs==\n";  
echo implode("\n",$dead_urls) . "\n\n";  

echo "==404 URLs==\n";  
echo implode("\n",$not_found_urls) . "\n\n";  

echo "==Working URLs==\n";  
echo implode("\n",$working_urls);  

// 15. adds a url to the multi handle  
function add_url_to_multi_handle($mh, $url_list) {  
static $index = 0;  

// if we have another url to get  
if ($url_list[$index]) {  

    // new curl handle  
    $ch = curl_init();  

    // set the url  
    curl_setopt($ch, CURLOPT_URL, $url_list[$index]);  
    // to prevent the response from being outputted  
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  
    // follow redirections  
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);  
    // do not need the body. this saves bandwidth and time  
    curl_setopt($ch, CURLOPT_NOBODY, 1);  

    // add it to the multi handle  
    curl_multi_add_handle($mh, $ch);  
    // increment so next url is used next time  
    $index++;  

    return true;  
} else {  

    // we are done adding new URLs  
    return false;  
}  
}  
?>

我从原始版本重写的是数据库连接,这意味着我必须缩进我的代码,因为我使用了 PDO。我也重写了:

if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
from
if (preg_match_all("!href=\"(.*?)\"!", $d['post_content'], $matches)) {

我认为问题就在这里,但我缺乏技能并没有让我找到答案。如果有更好的脚本来检查死链接、重定向和功能链接,请告诉我。

4

1 回答 1

1

我看不到你在用$result做什么

原始代码中有

$r = mysql_query($q) or die(mysql_error());  
while ($d = mysql_fetch_assoc($r)) {  
    if (preg_match_all("!href=\"(.*?)\"!", $d['post_content'], $matches)) { 
...

仅在您的代码中

$stmt = $dbh->query($sql);
$result = $stmt->fetch(PDO::FETCH_ASSOC);
    if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {

因此,$d并不$d['link_forms']存在!!
所以if (preg_match_all(..., $d['link_forms'], ...))返回 False。

消除

$result = $stmt->fetch(PDO::FETCH_ASSOC); 
    if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {

并将其替换为

while ($d = $stmt->fetch(PDO::FETCH_ASSOC)) {  
    if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {
...

你得到 print_r($matches);什么?
查看输出数组,您需要从$matches.

if (preg_match_all("!href=\"(.*?)\"!", $d['link_forms'], $matches)) {

    print_r($matches);

如果有更多的 url,$d['link_forms']那么这个语句是不可能的。

foreach ($matches[1] as $url) {   

然后你必须通过数组

foreach ($matches as $url) {   
    echo "part 0: " . $url[0] . "\n";
    echo "part 1: " . $url[1] . "\n";
    ...
于 2013-05-03T15:56:36.460 回答