0

在下面的 processPage 函数中,我从每个被处理的 URL 的 keywords 元标记中获取关键字。我需要修改其中的 preg_split 调用,使它只提取每个关键字词组的前三个单词。

例如,给定这个关键字meta标签:

<meta name="keywords" content="this is too long, this is not, keyword three" />

我只想要第一个关键字词组中的 "this is too" 部分。

另外,如果关键字词组的总列表长于 10 个,我只想从列表中提取前 10 个关键字词组。

即,(关键字词组 1、kw 2、kw 3、kw4 等...,关键字词组 10)

非常感谢任何帮助。

<?php

/**
 * Downloads a page and extracts the phrases listed in its keywords meta tag.
 *
 * Constructing an instance immediately fetches and parses the page at the
 * given URL (see processPage()).
 */
class ResultPage
{
    /** URL handed to the constructor. */
    public $url;
    /** Not assigned anywhere in this class. */
    public $title;
    /** Raw HTML as returned by rseo_keywordSearch_scrapePage(). */
    public $html;
    /** Not assigned anywhere in this class. */
    public $plainText;
    /** Words counted by GetResults(); not assigned anywhere in this class. */
    public $wordList;
    /** Phrases taken from the comma-separated keywords meta tag. */
    public $keywords = array();

    /**
     * @param string $siteurl URL of the page to fetch and analyse.
     */
    public function __construct($siteurl)
    {
        $this->url = $siteurl;
        $this->processPage();
    }

    /**
     * Fetches the page and fills $keywords from the first
     * <meta name="keywords"> element.
     *
     * Leaves $keywords untouched when the page cannot be parsed, has no
     * keywords meta tag, or the tag's content is empty.
     */
    public function processPage()
    {
        $this->html = rseo_keywordSearch_scrapePage($this->url);

        // str_get_html() yields false for empty or oversized input; calling
        // find() on that would be a fatal error.
        $dom = str_get_html($this->html);
        if (!$dom) {
            return;
        }

        $metakws = $dom->find('meta[name=keywords]');
        if (empty($metakws)) {
            return;
        }

        // Trim first so the first and last phrases do not keep stray
        // whitespace; the pattern below only strips it around the commas.
        $content = trim((string) $metakws[0]->content);
        if ($content === '') {
            return;
        }

        $this->keywords = preg_split("/[\s]*[,][\s]*/", $content); // EDIT HERE
    }

    /**
     * Returns the occurrence count of every word in $wordList, most frequent
     * first.
     *
     * @return array Map of word => count; empty when $wordList is not an array.
     */
    public function GetResults()
    {
        // $wordList is never populated by this class, so guard against null.
        $words = is_array($this->wordList) ? $this->wordList : array();

        return rseo_keyword_getCountArray($words);
    }
}


/**
 * Fetches a remote web page with cURL and returns the raw HTML.
 *
 * Redirects are followed, response headers are not included in the result,
 * and a fixed desktop Chrome user agent string is sent.
 *
 * @param string $url      Address of the page to download.
 * @param bool   $headonly Currently ignored: the CURLOPT_NOBODY option that
 *                         would consume it is commented out below.
 *
 * @return string The response body with every `class=l` rewritten to
 *                `class='l'`, or an empty string when the request fails.
 */
function rseo_keywordSearch_scrapePage($url, $headonly = TRUE)
{
    $agents = 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16';

    $ch = curl_init();
    if ($ch === false) {
        // No handle could be created, so there is nothing to configure or run.
        return '';
    }

    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_VERBOSE, FALSE);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, $agents);
    //curl_setopt($ch, CURLOPT_NOBODY, $headonly);
    curl_setopt($ch, CURLOPT_URL, $url);

    $curlResp = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; report that as an empty page
    // explicitly instead of relying on str_replace() coercing false to ''.
    if ($curlResp === false) {
        return '';
    }

    // Quote the unquoted attribute value (presumably to help the HTML parser
    // downstream -- confirm).
    // NOTE(review): this is a plain substring replace, so `class=large` would
    // become `class='l'arge`.
    return str_replace("class=l","class='l'",$curlResp);
}

/**
 * Counts how often each value occurs in the given list.
 *
 * @param array $arr List of values to tally.
 *
 * @return array Map of value => number of occurrences, sorted by count in
 *               descending order with the value keys preserved.
 */
function rseo_keyword_getCountArray($arr)
{
    $occurrences = array_count_values($arr);

    // arsort() sorts in place, highest count first, keeping each value as key.
    arsort($occurrences);

    return $occurrences;
}
4

2 回答 2

1

这种情况下,用正则匹配比用拆分更容易实现,例如:

// Capture up to the first three words of every comma-separated phrase.
// The lookbehind anchors each match at the start of the string or right after
// a comma. Whitespace is only consumed *between* words, so a captured phrase
// never ends with a trailing space.
preg_match_all('/(?<=^|,)\s*([^\s,]+(?:\s+[^\s,]+){0,2})/', $metakw->content, $m);

// Keep at most the first ten phrases.
$this->keywords = array_slice($m[1], 0, 10);

print_r($this->keywords);

/*
Array
    (
        [0] => this is too
        [1] => this is not
        [2] => keyword three
    )
*/
于 2011-08-14T01:30:26.353 回答
0

Preg_split 不适合您尝试做的事情。

我会尝试这样的事情:

// Split the meta tag content into phrases and keep at most the first ten.
$keywords = array_slice(explode(',', $metakw->content), 0, 10);

foreach ($keywords as $keyword) {
    // Trim before splitting so a leading space after the comma is not
    // counted as a word boundary, and collapse runs of whitespace.
    $words = preg_split('/\s+/', trim($keyword), -1, PREG_SPLIT_NO_EMPTY);

    // First three words of the keyword phrase (fewer if it is shorter).
    $this->keywords[] = implode(' ', array_slice($words, 0, 3));
}
于 2011-08-14T00:54:36.423 回答