我有一个 PHP 网络爬虫,它会扫描给定网页中的所有 URL 并把它们保存下来。我很想在其中使用 get_meta_tags() 函数:是否可以把 get_meta_tags() 加入这个爬虫,以便从扫描到的网址中获取元数据(meta 标签)?
<?php
/**
 * Single-step web crawler.
 *
 * Each request performs one crawl step:
 *   1. fetch the page stored in $_SESSION['page'] (or the start page of
 *      $domain when the session has no page yet),
 *   2. extract every double-quoted <a href="..."> target from it,
 *   3. insert targets that are not yet present into the `pages` table,
 *      counting new rows in $_SESSION['i'],
 *   4. store the URL of the first row of `pages` (ordered by `rank`) in
 *      $_SESSION['page'] so the next request continues from there.
 *
 * Output: the running counter after each new row, any database error text,
 * then the number of rows returned by the "next page" query followed by the
 * URL selected for the next step.
 */

/**
 * Turn an href found on a page into an absolute http(s) URL.
 *
 * @param string $pageUrl Absolute URL of the page the link was found on.
 * @param string $href    Raw value of the href attribute.
 *
 * @return string|null Absolute URL, or null when the link cannot be crawled
 *                     (empty, fragment-only, non-HTTP scheme, or $pageUrl
 *                     has no host to resolve against).
 */
function crawler_resolve_url($pageUrl, $href)
{
    $href = trim($href);

    // Empty links and same-page fragments lead nowhere new.
    if ($href === '' || $href[0] === '#') {
        return null;
    }

    // Already absolute: keep exactly as written.
    if (preg_match('~^https?://~i', $href)) {
        return $href;
    }

    $base = parse_url($pageUrl);
    if ($base === false || empty($base['host'])) {
        return null;
    }
    $scheme = isset($base['scheme']) ? $base['scheme'] : 'http';
    $origin = $scheme . '://' . $base['host'];
    if (isset($base['port'])) {
        $origin .= ':' . $base['port'];
    }

    // Protocol-relative link ("//host/path") inherits the page's scheme.
    // Must be tested before the root-relative case, which it also matches.
    if (strpos($href, '//') === 0) {
        return $scheme . ':' . $href;
    }

    // Any other scheme (mailto:, javascript:, tel:, ...) is not a page.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        return null;
    }

    // Root-relative link.
    if ($href[0] === '/') {
        return $origin . $href;
    }

    $path = isset($base['path']) ? $base['path'] : '/';
    if ($path === '' || $path[0] !== '/') {
        $path = '/';
    }

    // Query-only link ("?page=2") keeps the current path.
    if ($href[0] === '?') {
        return $origin . $path . $href;
    }

    // Path-relative link: resolve against the directory of the current page.
    // "./" and "../" segments are left in place, not collapsed.
    $directory = substr($path, 0, strrpos($path, '/') + 1);

    return $origin . $directory . $href;
}

session_start();

$domain = "www.ebay.com";

// NOTE(review): credentials are hard-coded in the source. Move them to
// configuration kept outside the repository and rotate the password.
// The original duplicated the connection code in both branches with
// different (placeholder) credentials; there is now a single connection.
$dbHost = "cust-mysql-123-05";
$dbUser = "uthe_774575_0001";
$dbPass = "rooney08";
$dbName = 'theqlickcom_774575_db1';

// The mysql_* extension was removed in PHP 7.0, so mysqli is used instead.
// PHP 8.1 made mysqli throw exceptions by default; switch that off so the
// explicit return-value checks below behave the same on every PHP version.
mysqli_report(MYSQLI_REPORT_OFF);

$connect = mysqli_connect($dbHost, $dbUser, $dbPass);
if (!$connect) {
    die("MySQL could not connect!");
}
$DB = mysqli_select_db($connect, $dbName);
if (!$DB) {
    die("MySQL could not select Database!");
}

// Decide which page this step crawls. A missing or empty session page means
// "start over" (the original ran both branches when the page was set but
// empty, then tried to fetch an empty URL).
if (empty($_SESSION['page'])) {
    $PAGE = "http://" . $domain . "/";
    $_SESSION['i'] = 0;
} else {
    $PAGE = $_SESSION['page'];
}
if (!isset($_SESSION['i'])) {
    $_SESSION['i'] = 0;
}

$original_file = file_get_contents($PAGE);
if ($original_file === false) {
    // Fetch failed (file_get_contents has already raised a warning).
    // Treat the page as empty so the step still selects the next page.
    $original_file = '';
}

// Drop everything except anchors, then capture the href of each one.
// Only double-quoted href values are recognised by this pattern.
$stripped_file = strip_tags($original_file, "<a>");
preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file, $matches);

// Scraped URLs are untrusted input: bind them as parameters instead of
// interpolating them into the SQL text.
$findStmt = mysqli_prepare($connect, "SELECT 1 FROM pages WHERE url = ? LIMIT 1");
$insertStmt = mysqli_prepare($connect, "INSERT INTO pages (url) VALUES (?)");

if ($findStmt && $insertStmt) {
    // Bound by reference: assigning $New_URL inside the loop updates the
    // value both statements send on their next execute.
    $New_URL = '';
    mysqli_stmt_bind_param($findStmt, 's', $New_URL);
    mysqli_stmt_bind_param($insertStmt, 's', $New_URL);

    foreach ($matches[1] as $value) {
        $resolved = crawler_resolve_url($PAGE, $value);
        if ($resolved === null) {
            continue;
        }
        $New_URL = $resolved;

        if (!mysqli_stmt_execute($findStmt)) {
            echo mysqli_stmt_error($findStmt);
            continue;
        }
        mysqli_stmt_store_result($findStmt);
        $Num = mysqli_stmt_num_rows($findStmt);
        mysqli_stmt_free_result($findStmt);

        if ($Num == 0) {
            if (mysqli_stmt_execute($insertStmt)) {
                // Count only rows that were actually stored.
                $_SESSION['i']++;
                echo $_SESSION['i'] . "";
            } else {
                echo mysqli_stmt_error($insertStmt);
            }
        }
    }
} else {
    echo mysqli_error($connect);
}

if ($findStmt) {
    mysqli_stmt_close($findStmt);
}
if ($insertStmt) {
    mysqli_stmt_close($insertStmt);
}

// Pick the page for the next step. `rank` is quoted because RANK is a
// reserved word as of MySQL 8.0.2.
// NOTE(review): nothing in this script changes a row's rank after it has
// been crawled, so this may keep returning the same row - confirm how rank
// is maintained.
$RandQuery = mysqli_query($connect, "SELECT DISTINCT * FROM pages ORDER BY `rank` LIMIT 0,1");
$RandReturn = 0;
if ($RandQuery) {
    $RandReturn = mysqli_num_rows($RandQuery);
    while ($row1 = mysqli_fetch_assoc($RandQuery)) {
        $_SESSION['page'] = $row1['url'];
    }
    mysqli_free_result($RandQuery);
} else {
    echo mysqli_error($connect);
}

echo $RandReturn;
if (isset($_SESSION['page'])) {
    // The URL came from a crawled page; escape it before writing it to HTML.
    echo htmlspecialchars($_SESSION['page'], ENT_QUOTES, 'UTF-8');
}

mysqli_close($connect);
?>