0

我需要根据 URL 获取购物网站页面的标题、元标记(meta)和图像标记(img)。下面是我的代码,它对亚马逊的产品链接可以正常工作,但对以下网址不起作用:

  1. http://www.alternate.de/Synology/Synology+DS413,_NAS/html/product/1028780/
  2. http://www.bonprix.de/produkt/baby-fleecejacke-hellgrau-meliert-958416/

我获取标签的代码:

// Fetch a product page and extract its <title>, its description/keywords
// meta tags and the absolute URLs of its JPEG images.
//
// Result: $domarray receives one entry shaped like
//   array('img' => string[], 'desc' => string, 'title' => string)
// where 'img' is only present when at least one JPEG image was found.
$url = "http://rads.stackoverflow.com/amzn/click/B009T9QCWI";
$timeout = 5;

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
// Bound the whole transfer as well, not only the connection phase.
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout * 4);
// The sample URL is a redirector; without this option only the redirect
// response is returned instead of the product page.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
// NOTE(review): some shops presumably reject or alter responses for clients
// that send no User-Agent -- confirm against the failing URLs.
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; ProductMetaFetcher/1.0)");
$data = curl_exec($ch);
$curl_error = curl_error($ch);
curl_close($ch);

$domarray = array();

if ($data === false || trim($data) === "") {
    // Nothing to parse: report the transport failure instead of feeding an
    // empty document to the HTML parser.
    trigger_error("Could not fetch {$url}: {$curl_error}", E_USER_WARNING);
} else {
    // Collect libxml warnings internally rather than silencing them with @;
    // real-world HTML is rarely well-formed.
    $previous_setting = libxml_use_internal_errors(true);
    $doc = new \DOMDocument();
    $doc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);

    // Title: empty string when the page has no <title> element.
    $nodes = $doc->getElementsByTagName("title");
    $title = $nodes->length > 0 ? trim($nodes->item(0)->nodeValue) : "";
    $product_title = str_replace("'", " ", $title);

    // Images: keep absolute http(s) URLs whose path ends in .jpg/.jpeg.
    $image = array();
    foreach ($doc->getElementsByTagName("img") as $img) {
        $src = trim($img->getAttribute("src"));
        $scheme = strtolower((string) parse_url($src, PHP_URL_SCHEME));
        if ($scheme !== "http" && $scheme !== "https") {
            continue;
        }
        // Look at the URL path only, so a query string after the file name
        // does not hide the extension; compare case-insensitively.
        $path = (string) parse_url($src, PHP_URL_PATH);
        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));
        if ($extension === "jpg" || $extension === "jpeg") {
            $image[] = $src;
        }
    }

    // Meta tags: match the name attribute case-insensitively; when a name
    // occurs more than once the last occurrence wins.
    $description = "";
    $keywords = "";
    foreach ($doc->getElementsByTagName("meta") as $meta) {
        $meta_name = strtolower(trim($meta->getAttribute("name")));
        if ($meta_name === "description") {
            $description = $meta->getAttribute("content");
        } elseif ($meta_name === "keywords") {
            $keywords = $meta->getAttribute("content");
        }
    }

    $entry = array('desc' => $description, 'title' => $product_title);
    if (!empty($image)) {
        // Array union keeps 'img' as the first key of the entry.
        $entry = array('img' => $image) + $entry;
    }
    $domarray[] = $entry;
}

print_r($domarray);
4

1 回答 1

0

为什么不使用 Simple HTML DOM 解析器(simple_html_dom)呢?

例子:

// Extract the page title, all named meta tags and all image sources from a
// URL with the Simple HTML DOM parser, and print them as one array.
require_once 'simple_html_dom.php';

$url ="http://rads.stackoverflow.com/amzn/click/B009T9QCWI";

// all results stored in this array; every key is always present so the
// consumer can iterate without isset() checks
$result = array(
    'title' => '',
    'meta'  => array(),
    'image' => array(),
);

// file_get_html() returns false when the page could not be loaded
$html = file_get_html( $url );

if ( !$html ) {
    trigger_error( "Could not load {$url}", E_USER_WARNING );
} else {
    // page title; find() returns null when the document has no <title>
    $title = $html->find( 'title', 0 );
    if ( $title ) {
        $result[ 'title' ] = $title->plaintext;
    }

    // get all meta tags, which have an attribute "name"
    foreach( $html->find( 'meta[name]' ) as $meta ) {
        $result[ 'meta' ][] = array(
            'name' => $meta->name,
            'content' => $meta->content
        );
    }

    // get all images
    foreach( $html->find( 'img' ) as $image ) {
        $result[ 'image' ][] = $image->src;
    }
}

print_r( $result );

输出

Array
(
    [title] => Amazon.com: Samsung Galaxy S III, Black 16GB (Verizon Wireless): Cell Phones &amp; Accessories
    [meta] => Array
        (
            [0] => Array
                (
                    [name] => description
                    [content] => Shop cell phones and accessories at Amazon.com. You&#39;ll find great prices on cases, headsets, and the latest smartphones from carriers like Verizon, AT&amp;T, and Sprint
                )

            [1] => Array
                (
                    [name] => title
                    [content] => Amazon.com: Samsung Galaxy S III, Black 16GB (Verizon Wireless): Cell Phones &amp; Accessories
                )

            [2] => Array
                (
                    [name] => keywords
                    [content] => Samsung Galaxy S III, Black 16GB (Verizon Wireless),Samsung,Galaxy S III
                )

        )

    [image] => Array
        (
            [0] => http://g-ecx.images-amazon.com/images/G/01/gno/beacon/BeaconSprite-US-01._V397411194_.png
            [1] => http://g-ecx.images-amazon.com/images/G/01/x-locale/common/transparent-pixel._V386942464_.gif
            [2] => http://g-ecx.images-amazon.com/images/G/01/x-locale/common/transparent-pixel._V386942464_.gif
            [3] => http://ecx.images-amazon.com/images/I/41%2Bh%2BUmrcRL._SY300_.jpg
            [4] => http://ecx.images-amazon.com/images/I/41%2Bh%2BUmrcRL._SL500_AA280_.jpg
            [5] => http://g-ecx.images-amazon.com/images/G/01/icons/icon-offsite-sl-7069-t4._V171196157_.png
            [6] => http://g-ecx.images-amazon.com/images/G/01/icons/icon-offsite-sl-7069-t4._V171196157_.png
            [7] => http://ecx.images-amazon.com/images/I/41FBSaIC4AL._SL500_SS100_.jpg
            [8] => http://ecx.images-amazon.com/images/I/41HGvd6-jwL._SL500_SS100_.jpg
            [9] => http://ecx.images-amazon.com/images/I/51jiU%2BiYWUL._SL500_SS100_.jpg
            [10] => http://ecx.images-amazon.com/images/I/317JogSYmkL._SL500_SS100_.jpg
            [11] => http://ecx.images-amazon.com/images/I/41d6B11BDuL._SL500_SS100_.jpg
            [12] => http://ecx.images-amazon.com/images/I/41a94BWHXbL._SL500_SS100_.jpg
            [13] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.main_SM.jpg
            [14] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.pt01_SM.jpg
            [15] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/wireless-box-logo-verizon-box.jpg
            [16] => http://g-ecx.images-amazon.com/images/G/01/th/aplus/a-plus_bottom-217._V180545591_.gif
            [17] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.pt02_SM.jpg
            [18] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/amazon_app_suite_1_sma.jpg
            [19] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/amazon_app_suite_5_sm.jpg
            [20] => http://ecx.images-amazon.com/images/I/41HGvd6-jwL._SL75_SS50_.jpg
            [21] => http://ecx.images-amazon.com/images/I/41FBSaIC4AL._SL75_SS50_.jpg
            [22] => http://ecx.images-amazon.com/images/I/51jiU%2BiYWUL._SL75_SS50_.jpg
            [23] => http://ecx.images-amazon.com/images/I/41a94BWHXbL._SL75_SS50_.jpg
            [24] => http://g-ecx.images-amazon.com/images/G/01/x-locale/communities/reputation/suggestionbox._V192249929_.gif
            [25] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif
            [26] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif
            [27] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif
            [28] => http://g-ecx.images-amazon.com/images/G/01/gno/images/general/navAmazonLogoFooter._V169459313_.gif
            [29] => /gp/uedata/unsticky/182-7026578-6696341//ntpoffrw?noscript&amp;id=158FKQCX6TYATFBQQW0V
        )

)

您可以在循环中依次传入各个 URL,对所有网址执行相同的操作。为简单起见,我省略了您对图像和元标记所做的检查。希望能帮到您。

于 2013-07-12T09:37:49.640 回答