0

我有一段 PHP 代码,它目前输出 HTML。我需要修改这段代码,使其输出 XML。对于如何实现,大家有什么建议吗?是否有现成的 XML 库可以直接完成这项工作,还是我必须手动创建每个节点?

我的PHP代码是:

<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

<style>
a {text-decoration:none; color:black;}
</style>
</head>

<body>


<?php

// Page script: reads the search form fields, fetches the matching IMDb search
// page, scrapes up to five results with regular expressions and prints them
// as an HTML table.

// Form fields. Fall back to an empty string when a field is missing or is not
// a plain string (e.g. the page was opened directly, or "title[]" was posted),
// so no notices are raised further down.
$a = (isset($_POST["title"]) && is_string($_POST["title"])) ? $_POST["title"] : "";
$b = (isset($_POST["name"]) && is_string($_POST["name"])) ? $_POST["name"] : "";

// Maximum number of result rows that are displayed.
$maxRows = 5;

$c = "http://www.imdb.com/search/title?title=".urlencode($a)."&title_type=".urlencode($b);
$d = file_get_contents($c);

// file_get_contents() returns false on failure; remember that and continue
// with an empty document so the regular expressions below simply match nothing.
$fetchFailed = ($d === false);
if ($fetchFailed) {
    $d = "";
}

preg_match_all('/<div id="main">\n(No results.)/', $d, $nore);

preg_match_all('#<img src="(.*)"#Us', $d, $img); // image

preg_match_all('/<a\s*href="\/title\/tt[0-9]*\/">((?:[a-z]*(?:&*[.]*)?\s*-*[a-z]*[0-9]*[^<])+)/i', $d, $tit); // title

preg_match_all('/<span\sclass="year_type">\s*\(([\d]*)/', $d, $ye); // movie year

preg_match_all('#<span class="credit">\n    Dir: (.*)\n(?:    With:)?#Us', $d, $dir); // director

preg_match_all('/<span class="rating-rating"><span class="value">([\w]*.[\w]*)/i', $d, $rat); // rating

preg_match_all('/<a\shref="(\/title\/tt[0-9]*\/)"\s*[title]+/i', $d, $lin); // link

// Normalise the first rows: a rating scraped as "-" and a missing or empty
// director are both shown as "N/A".
for ($i = 0; $i < $maxRows; $i++) {
    if (isset($rat[1][$i]) && $rat[1][$i] == "-") {
        $rat[1][$i] = "N/A";
    }
    if (!isset($dir[1][$i]) || $dir[1][$i] == "") {
        $dir[1][$i] = "N/A";
    }
}

// Never show more rows than there are scraped titles.
$cnt = min($maxRows, count($tit[1]));

// The search terms come straight from the request, so they must be escaped
// before being written into the page (prevents HTML/script injection).
$safeTitle = htmlspecialchars($a, ENT_QUOTES, 'UTF-8');
$safeType = htmlspecialchars($b, ENT_QUOTES, 'UTF-8');

echo "<center><b>Search Result</b></center>";
echo "<br/>";
echo "<center><b>\"$safeTitle\"of type\"$safeType\":</b></center>";
echo "<br/>";

if ($fetchFailed) {
    echo "<center><b>Could not retrieve results from IMDb.</b></center>";
} elseif (isset($nore[1][0]) && $nore[1][0] == "No results.") {
    echo "<center><b>No movies found!</b></center>";
} else {
    echo "<center><table border=1><tr><td><center>Image</center></td><td><center>Title</center></td><td><center>Year</center></td><td><center>Director</center></td><td><center>Rating(10)</center></td><td><center>Link to Movie</center></td></tr>";
    for ($j = 0; $j < $cnt; $j++) {
        // Image matches are read with an offset of two; presumably the first
        // two <img> tags on the page are not posters -- TODO confirm.
        $cover = isset($img[0][$j + 2]) ? $img[0][$j + 2] : "";
        $title = isset($tit[1][$j]) ? $tit[1][$j] : "";
        $year = isset($ye[1][$j]) ? $ye[1][$j] : "";
        $director = isset($dir[1][$j]) ? $dir[1][$j] : "";
        $rating = isset($rat[1][$j]) ? $rat[1][$j] : "";
        $link = isset($lin[1][$j]) ? $lin[1][$j] : "";

        echo "<tr>";
        echo "<td>".$cover."</td>";
        echo "<td><center>".$title."</center></td>";
        echo "<td><center>".$year."</center></td>";
        echo "<td><center>".$director."</center></td>";
        echo "<td><center>".$rating."</center></td>";
        echo '<td><center><a style="text-decoration:underline; color:blue;" href="http://www.imdb.com'.$link.'">Details</a></center></td>';
        echo "</tr>";
    }

    echo "</table></center>";
}

?>

</body>
</html>

预期的 XML 输出:

<result cover="http://ia.mediaimdb.com/images      
/M/MV5BMjMyOTM4MDMxNV5BMl5BanBnXkFtZTcwNjIyNzExOA@@._V1._SX54_
CR0,0,54,74_.jpg" title="The Amazing Spider-Man(2012)"year="2012"
director="Marc Webb" rating="7.5"
details="http://www.imdb.com/title/tt0948470"/>

<result cover="http://ia.mediaimdb.
com/images/M/MV5BMzk3MTE5MDU5NV5BMl5BanBnXkFtZTYwMjY3NTY3._V1._SX54_CR0,
0,54,74_.jpg" title="Spider-Man(2002)" year="2002"director="Sam Raimi"
rating="7.3" details="http://www.imdb.com/title/tt0145487"/>

<result cover="http://ia.mediaimdb.
com/images/M/MV5BODUwMDc5Mzc5M15BMl5BanBnXkFtZTcwNDgzOTY0MQ@@._V1._SX54_
CR0,0,54,74_.jpg" title="Spider-Man 3 (2007)" year="2007" director="Sam
Raimi" rating="6.3" details="http://www.imdb.com/title/tt0413300"/>

<result cover="http://i.mediaimdb.
com/images/SF1f0a42ee1aa08d477a576fbbf7562eed/realm/feature.gif" title="
The Amazing Spider-Man 2 (2014)" year="2014" director="Sam Raimi"
rating="6.3" details="http://www.imdb.com/title/tt1872181"/>

<result cover="http://ia.mediaimdb.
com/images/M/MV5BMjE1ODcyODYxMl5BMl5BanBnXkFtZTcwNjA1NDE3MQ@@._V1._SX54_
CR0,0,54,74_.jpg" title="Spider-Man 2 (2004)" year="2004" director="Sam
Raimi" rating="7.5" details="http://www.imdb.com/title/tt0316654"/>
</results>
4

1 回答 1

2

首先,您是在用正则表达式解析 HTML 结果,这种做法效率低、没有必要,而且……好吧,您这是在回应克苏鲁(Cthulhu)的召唤。

其次,通过解析 IMDB 的 HTML 来获取结果虽然可行,但可能并无必要。有一些简洁的第三方 API 可以替您完成这项工作,例如 http://imdbapi.org

如果您不想使用任何第 3 方 API,恕我直言,您应该改为使用 DOM 解析器/操纵器解析 HTML,例如 DOMDocument,这样更安全、更好,同时可以解决您的 HTML 到 XML 的问题。

下面是针对您所提问题的代码(根据结果构建 XML 和 HTML):

/**
 * Build an HTML table, as a DOMDocument, from a list of scraped results.
 *
 * Each result becomes one <tr> with five cells: title, year, rating, an
 * <img> built from the 'img_src' URL, and an <img> imported from the raw
 * 'img' markup. The last two cells demonstrate two alternative ways of
 * producing the image; a real page would normally keep only one of them.
 * Missing keys produce empty cells rather than notices.
 *
 * @param array $results List of arrays with optional keys 'title', 'year',
 *                       'rating', 'img_src' and 'img' (HTML of an <img> tag).
 * @return DOMDocument Document whose root element is the <table>.
 */
function resultsToHTML($results)
{
    $doc = new DOMDocument();
    $table = $doc->createElement('table');
    // Attach the table to the document so it is part of the serialised output.
    $doc->appendChild($table);

    foreach ($results as $r) {
        $row = $doc->createElement('tr');
        $table->appendChild($row);

        // Text cells. createTextNode() escapes special characters such as
        // "&", which the value argument of createElement() does not.
        foreach (array('title', 'year', 'rating') as $key) {
            $cell = $doc->createElement('td');
            $text = isset($r[$key]) ? (string) $r[$key] : '';
            $cell->appendChild($doc->createTextNode($text));
            $row->appendChild($cell);
        }

        // Alternative 1: create a fresh <img> tag from the source URL.
        $imgTD = $doc->createElement('td');
        if (isset($r['img_src'])) {
            $img = $doc->createElement('img');
            $img->setAttribute('src', (string) $r['img_src']);
            $imgTD->appendChild($img);
        }
        $row->appendChild($imgTD);

        // Alternative 2: import the <img> node from the scraped markup.
        $imgTD = $doc->createElement('td');
        if (isset($r['img']) && is_string($r['img']) && $r['img'] !== '') {
            $fauxDoc = new DOMDocument();
            // The fragment is HTML, not well-formed XML, so parse it with
            // loadHTML() and keep libxml warnings out of the output.
            $previous = libxml_use_internal_errors(true);
            $parsed = $fauxDoc->loadHTML($r['img']);
            if (!$previous) {
                libxml_clear_errors();
            }
            libxml_use_internal_errors($previous);

            $img = $parsed ? $fauxDoc->getElementsByTagName('img')->item(0) : null;
            if ($img !== null) {
                $imgTD->appendChild($doc->importNode($img, true));
            }
        }
        $row->appendChild($imgTD);
    }
    return $doc;
}

/**
 * Build an XML document from a list of scraped results.
 *
 * The output has a <results> root containing one <result> element per entry,
 * carrying the attributes cover, title, year and rating. Attribute values are
 * escaped by DOM; a key that is missing from an entry yields an empty
 * attribute instead of a notice.
 *
 * @param array $results List of arrays with optional keys 'img_src', 'title',
 *                       'year' and 'rating'.
 * @return DOMDocument Document whose root element is <results>.
 */
function resultsToXML($results)
{
    // Maps each XML attribute name to the key it is read from in a result.
    $attributeMap = array(
        'cover' => 'img_src',
        'title' => 'title',
        'year' => 'year',
        'rating' => 'rating',
    );

    $doc = new DOMDocument();
    $root = $doc->createElement('results');
    $doc->appendChild($root);

    foreach ($results as $r) {
        // Elements are always created by the document, never by another element.
        $element = $doc->createElement('result');
        foreach ($attributeMap as $attribute => $key) {
            $element->setAttribute($attribute, isset($r[$key]) ? (string) $r[$key] : '');
        }
        $root->appendChild($element);
    }
    return $doc;
}

要打印它们,您只需要

// Convert the collected results to a DOMDocument, then write the
// serialised XML to the output.
$xml = resultsToXML($results);
echo $xml->saveXML();

与 html 相同


根据您的帖子,这是使用 DOMDocument 重构的代码:

<?php
// Driver script: loads an IMDb search page with DOMDocument and collects, for
// each row of the results table, the title, year, rating, image markup and
// image URL into $results.

//Mock IMDB Link
$a = 'The Amazing Spider-Man';
$b = 'title';
$c = "http://www.imdb.com/search/title?title=".urlencode($a)."&title_type=".urlencode($b);

// HTML might be malformed so we want DOMDocument to be quiet
libxml_use_internal_errors(true);
//Initialize DOMDocument parser
$doc = new DOMDocument();

// Fetch and parse the search page straight from its URL. loadHTMLFile()
// returns false on failure, in which case there is nothing to scan.
$loaded = $doc->loadHTMLFile($c);
// Discard the parser messages collected while loading.
libxml_clear_errors();

//initialize array to store results
$results = array();

// get table of results and extract a list of rows
$rows = null;
if ($loaded) {
    $listOfTables = $doc->getElementsByTagName('table');
    $rows = getResultRows($listOfTables);
}
// No results table on the page: iterate over nothing instead of over null.
if ($rows === null) {
    $rows = array();
}

$i = 0;
//loop through all rows to retrieve information
foreach ($rows as $row) {
    if ($title = getTitle($row)) {
        $results[$i]['title'] = $title;
    }
    if ($year = getYear($row)) {
        $results[$i]['year'] = $year;
    }
    if ($rating = getRating($row)) {
        $results[$i]['rating'] = $rating;
    }
    // getImage() needs the owning document to serialise the <img> node.
    if ($img = getImage($row, $doc)) {
        $results[$i]['img'] = $img;
    }
    if ($src = getImageSrc($row)) {
        $results[$i]['img_src'] = $src;
    }
    ++$i;
}

//the first result can be a false positive due to the
// results' table header, so we remove it
if (isset($results[0])) {
    array_shift($results);
}

// Rows that yielded no data leave gaps in the keys; renumber from zero.
$results = array_values($results);

函数

/**
 * Find the results table and return its rows.
 *
 * Scans the given tables for the first one whose class attribute is exactly
 * "results" and returns all <tr> elements inside it.
 *
 * @param DOMNodeList|array $listOfTables Table elements to scan.
 * @return DOMNodeList|array The rows of the results table, or an empty array
 *                           when no such table exists, so callers can always
 *                           iterate over the return value.
 */
function getResultRows($listOfTables)
{
    foreach ($listOfTables as $table) {
        if ($table->getAttribute('class') === 'results') {
            return $table->getElementsByTagName('tr');
        }
    }

    // No results table found: hand back something iterable rather than null.
    return array();
}

/**
 * Return the "src" attribute of the first <img> found inside a row.
 *
 * @param DOMElement $row Element to search.
 * @return string|false The src attribute value, or false when the row
 *                      contains no <img> element.
 */
function getImageSrc($row)
{
    $firstImage = $row->getElementsByTagName('img')->item(0);

    return $firstImage === null ? false : $firstImage->getAttribute('src');
}

/**
 * Return the HTML markup of the first <img> found inside a row.
 *
 * @param DOMElement  $row Element to search.
 * @param DOMDocument $doc Document used to serialise the image node.
 * @return string|false The serialised <img> tag, or false when the row
 *                      contains no <img> element.
 */
function getImage($row, $doc)
{
    $firstImage = $row->getElementsByTagName('img')->item(0);
    if ($firstImage === null) {
        return false;
    }

    return $doc->saveHTML($firstImage);
}


/**
 * Return the text of the first link inside the row's title cell.
 *
 * The title cell is located with getTDInfo().
 *
 * @param DOMElement $row Row to inspect.
 * @return string|false The text of the first <a> in the title cell, or false
 *                      when the row has no title cell or that cell has no link.
 */
function getTitle($row)
{
    $tdInfo = getTDInfo($row->getElementsByTagName('td'));
    if (is_null($tdInfo)) {
        return false;
    }

    // getElementsByTagName() always returns a list (possibly empty), so the
    // first item itself has to be checked before it is dereferenced.
    $anchor = $tdInfo->getElementsByTagName('a')->item(0);
    if (is_null($anchor)) {
        return false;
    }

    return $anchor->nodeValue;
}


/**
 * Return the text of the "year_type" span in the row's title cell, with
 * parentheses removed.
 *
 * The title cell is located with getTDInfo().
 *
 * @param DOMElement $row Row to inspect.
 * @return string|null The span text without "(" and ")", or null when the
 *                     row has no title cell or no span with that exact class.
 */
function getYear($row)
{
    $titleCell = getTDInfo($row->getElementsByTagName('td'));
    if ($titleCell === null) {
        return null;
    }

    foreach ($titleCell->getElementsByTagName('span') as $candidate) {
        if ($candidate->getAttribute('class') !== 'year_type') {
            continue;
        }

        return str_replace(array('(', ')'), '', $candidate->nodeValue);
    }

    return null;
}

/**
 * Return the text of the "rating-rating" span in the row's title cell.
 *
 * The title cell is located with getTDInfo().
 *
 * @param DOMElement $row Row to inspect.
 * @return string|null The span's text content, or null when the row has no
 *                     title cell or no span with that exact class.
 */
function getRating($row)
{
    $titleCell = getTDInfo($row->getElementsByTagName('td'));
    if ($titleCell === null) {
        return null;
    }

    foreach ($titleCell->getElementsByTagName('span') as $candidate) {
        if ($candidate->getAttribute('class') !== 'rating-rating') {
            continue;
        }

        return $candidate->nodeValue;
    }

    return null;
}


/**
 * Pick the title cell out of a list of table cells.
 *
 * @param DOMNodeList|array $tds Cells to scan.
 * @return DOMElement|null The first cell whose class attribute equals
 *                         "title", or null when there is none.
 */
function getTDInfo($tds)
{
    foreach ($tds as $cell) {
        $isTitleCell = ($cell->getAttribute('class') == 'title');
        if ($isTitleCell) {
            return $cell;
        }
    }

    return null;
}
于 2012-11-11T09:40:06.663 回答