-3

我有一个 PHP 网络爬虫,我很想给它添加 get_meta_tags() 函数。它会扫描给定网页中的所有 URL 等内容。是否可以将 get_meta_tags() 函数添加到这个爬虫中,以便从扫描到的网址中获取元数据?

 // One crawl step per request:
 //   1. fetch a page (the domain's home page on the first request of a session,
 //      otherwise the URL remembered in $_SESSION['page']),
 //   2. store every link found on it in the `pages` table if it is not there yet,
 //   3. remember the next page to crawl in $_SESSION['page'].
 //
 // NOTE(review): the mysql_* extension used here is deprecated (removed in PHP 7);
 // migrating to mysqli or PDO with prepared statements is the long-term fix.
 session_start();

 $domain = "www.ebay.com";

 // Connect once, up front. The connection settings are placeholders: fill them in
 // from private configuration and never publish real credentials with the code.
 $connect = mysql_connect("xxxxx", "xxxxx", "xxxx");

 if (!$connect)
 {
     die("MySQL could not connect!");
 }

 $DB = mysql_select_db('xxxx');

 if (!$DB)
 {
     die("MySQL could not select Database!");
 }

 // Exactly one of the two branches runs, so a set-but-empty session value can no
 // longer trigger both fetches.
 if (empty($_SESSION['page']))
 {
     // First request of the session: reset the counter and start at the home page.
     $_SESSION['i'] = 0;
     $original_file = file_get_contents("http://" . $domain . "/");
 }
 else
 {
     $PAGE = $_SESSION['page'];
     $original_file = file_get_contents("$PAGE");
 }

 if ($original_file === false)
 {
     // A failed download simply yields no links; keep going so that the next
     // page is still selected below.
     $original_file = "";
 }

 // Keep only the <a> tags, then capture the value of every double-quoted href.
 $stripped_file = strip_tags($original_file, "<a>");
 preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file,     $matches);

 foreach ($matches[1] as $value)
 {
     // strpos() returns the match offset or boolean false, so the result must be
     // compared with ===. A link is absolute only when the scheme is at offset 0.
     $is_absolute = strpos($value, "http://") === 0 || strpos($value, "https://") === 0;

     if ($is_absolute)
     {
         $New_URL = $value;
     }
     else
     {
         // NOTE(review): this assumes a root-relative link ("/path"); links such as
         // "page.html", "#top" or "mailto:..." are not resolved correctly here.
         $New_URL = "http://" . $domain . $value;
     }

     // The URL comes from a remote page, so escape it with the connection-aware
     // function rather than addslashes() before placing it in SQL.
     $New_URL = mysql_real_escape_string($New_URL, $connect);
     $Check = mysql_query("SELECT * FROM pages WHERE url='$New_URL'");

     if ($Check === false)
     {
         // The lookup itself failed: report it and move on to the next link.
         echo mysql_error();
         continue;
     }

     if (mysql_num_rows($Check) == 0)
     {
         mysql_query("INSERT INTO pages (url)
VALUES ('$New_URL')");

         $_SESSION['i']++;

         echo $_SESSION['i'] . "";
     }
     echo mysql_error();
 }

 // Pick the next page to crawl; LIMIT 0,1 returns at most one row.
 $RandQuery = mysql_query("SELECT DISTINCT * FROM pages ORDER BY rank LIMIT 0,1");
 $RandReturn = mysql_num_rows($RandQuery);
 $row1 = mysql_fetch_assoc($RandQuery);
 if ($row1)
 {
     $_SESSION['page'] = $row1['url'];
 }
 echo $RandReturn;
 echo $_SESSION['page'];
 mysql_close();

  ?>
4

2 回答 2

1

以前从外部来源读取 HTML 标签时,我也遇到过这个问题。Jstel 为我提供了一个很好的解决方案,相信您可以把她的方案整合到您的代码中。

http://www.php.net/manual/en/function.get-meta-tags.php#92197

结合您的代码,它的用法如下:

// Download the home page and collect every <meta> tag that has a double-quoted
// name="..." or http-equiv="..." attribute followed by a content="..." attribute.
$domain = "www.ebay.com";
$home_url = "http://" . $domain . "/";
$original_file = file_get_contents($home_url);

// Capture groups: 1 = attribute kind (http-equiv or name), 2 = that attribute's
// value, 3 = the content value. $result[0] holds the full matched tags.
$meta_pattern = "/<meta[^>]+(http\-equiv|name)=\"([^\"]*)\"[^>]+content=\"([^\"]*)\"[^>]*>/i";
preg_match_all($meta_pattern, $original_file, $result);

print_r($result);

您将在下面看到我从这个正则表达式得到的示例结果:

Array
(
    [0] => Array
        (
            [0] => <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
            [1] => <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
            [2] => <meta name="keywords" content="ebay, electronics, cars, clothing, apparel, collectibles, sporting goods, digital cameras, antiques, tickets, jewelry, online shopping, auction, online auction">
            [3] => <meta name="description" content="Buy and sell electronics, cars, fashion apparel, collectibles, sporting goods, digital cameras, baby items, coupons, and everything else on eBay, the world's online marketplace">
            [4] => <meta name="verify-v1" content="j6ZKbG61n+f9pUtbkf69zFRBrRSeUqyfEJ2BjiRxWDQ=">
            [5] => <meta name="y_key" content="acf32e2a69cbc2b0">
            [6] => <meta name="msvalidate.01" content="31154A785F516EC9842FC3BA2A70FB1A">
        )

    [1] => Array
        (
            [0] => http-equiv
            [1] => http-equiv
            [2] => name
            [3] => name
            [4] => name
            [5] => name
            [6] => name
        )

    [2] => Array
        (
            [0] => Content-Type
            [1] => Content-Type
            [2] => keywords
            [3] => description
            [4] => verify-v1
            [5] => y_key
            [6] => msvalidate.01
        )

    [3] => Array
        (
            [0] => text/html; charset=UTF-8
            [1] => text/html; charset=UTF-8
            [2] => ebay, electronics, cars, clothing, apparel, collectibles, sporting goods, digital cameras, antiques, tickets, jewelry, online shopping, auction, online auction
            [3] => Buy and sell electronics, cars, fashion apparel, collectibles, sporting goods, digital cameras, baby items, coupons, and everything else on eBay, the world's online marketplace
            [4] => j6ZKbG61n+f9pUtbkf69zFRBrRSeUqyfEJ2BjiRxWDQ=
            [5] => acf32e2a69cbc2b0
            [6] => 31154A785F516EC9842FC3BA2A70FB1A
        )

)
于 2012-09-04T17:43:06.120 回答
0

首先,为什么要在这一行给变量加上引号?

// The double quotes only interpolate $PAGE into a new string, so (assuming $PAGE
// already holds the URL as a string) passing $PAGE directly is equivalent.
$original_file = file_get_contents("$PAGE");

其次,可以通过以下方式检索所有元标记

// get_meta_tags() is given the page URL directly. Per the PHP manual it returns
// an array keyed by each meta tag's name attribute, with the content as value.
$tags = get_meta_tags('http://www.example.com/');

php.net

所以在你的例子中,我想你需要使用:

// $New_URL is the crawler's variable holding the link currently being processed.
// NOTE(review): call this before $New_URL is escaped for SQL, otherwise the
// escaped form of the URL is what gets requested — confirm placement in the loop.
$tags = get_meta_tags($New_URL);

并将该数组保存在您的数据库中。

于 2012-09-04T17:11:16.453 回答