1

我正在尝试将一系列文本字符串与 PHP 上的 PCRE 进行匹配,并且无法在第一个和第二个之间获取所有匹配项。

如果有人想知道我到底为什么要这样做,那是因为 Doc Comments。哦,我多么希望 Zend 能够制作原生/插件函数来从 PHP 文件中读取 Doc Comments ......

以下示例(纯文本)将用于问题。它始终是纯 PHP 代码,文件开头只有一个开始标记,没有结束标记。您可以假设语法总是正确的。

<?php
  // Concrete type with three methods. The parent, someExample, is not declared in this sample.
  class someClass extends someExample
  {
    // No visibility keyword here, so this method is public by default.
    function doSomething($someArg = 'someValue')
    {
      // Nested code blocks...
      if($boolTest){}
    }
    private function killFurbies(){}
    protected function runSomething(){}
  }

  // The "abstract" keyword sits on its own line, so this declaration header spans two lines.
  abstract
  class anotherClass
  {
    public function __construct(){}
    // Bodiless declaration: it ends with a semicolon and has no body.
    abstract function saveTheWhales();
  }

  // Free-standing routine declared outside of any type.
  function globalFunc(){}

问题

我试图匹配一个类中的所有方法,但我的正则表达式根本找不到 killFurbies() 方法。把它设为贪婪模式,它只会匹配类中的最后一个方法;把它设为懒惰模式,它只会匹配第一个方法。

// Two alternatives for the "skip ahead" part of the pattern. Only one is meant to be
// active at a time; as written, the second assignment overwrites the first.
$part = '.*';  // Greedy
$part = '.*?'; // Lazy

// Pattern layout ('%' delimiters, flags "m" and "s"; "s" lets '.' match newlines):
//   1. the keyword "class", whitespace, then the class name          -> capture group 1
//   2. anything up to the first '{', followed by $part
//   3. an optional visibility keyword plus whitespace                -> capture group 2
//   4. the keyword "function", whitespace, then the function name    -> capture group 3
//   5. optional whitespace and the opening '('
// Every match has to begin at a "class" keyword, so each occurrence of that keyword can
// produce at most one match, and therefore at most one captured function name.
$regex = '%class(?:\\n|\\r|\\s)+([a-zA-Z_\\x7f-\\xff][a-zA-Z0-9_\\x7f-\\xff]*)'
       . '.*?\{' . $part .'(?:(public|protected|private)(?:\\n|\\r|\\s)+)?'
       . 'function(?:\\n|\\r|\\s)+([a-zA-Z_\\x7f-\\xff][a-zA-Z0-9_\\x7f-\\xff'
       . ']*)(?:\\n|\\r|\\s)*\\(%ms';

// NOTE(review): __EXAMPLE__ is not defined in this snippet; presumably a placeholder
// for the path of the sample file shown above.
// PREG_SET_ORDER groups the result per match: $matches[i] holds the full match
// followed by its capture groups.
preg_match_all($regex, file_get_contents(__EXAMPLE__), $matches, PREG_SET_ORDER);
var_dump($matches);

结果是:

// Lazy:
array(2) {
  [0]=>
  array(4) {
    [0]=>
    // Omitted.
    [1]=>
    string(9) "someClass"
    [2]=>
    string(0) ""
    [3]=>
    string(11) "doSomething"
  }
  [1]=>
  array(4) {
    [0]=>
    // Omitted.
    [1]=>
    string(12) "anotherClass"
    [2]=>
    string(6) "public"
    [3]=>
    string(11) "__construct"
  }
}

// Greedy:
array(2) {
  [0]=>
  array(4) {
    [0]=>
    // Omitted.
    [1]=>
    string(9) "someClass"
    [2]=>
    string(0) ""
    [3]=>
    string(13) "saveTheWhales"
  }
  [1]=>
  array(4) {
    [0]=>
    // Omitted.
    [1]=>
    string(12) "anotherClass"
    [2]=>
    string(0) ""
    [3]=>
    string(13) "saveTheWhales"
  }
}

我如何匹配所有?:S

任何帮助将不胜感激,因为我在打字时已经觉得这个问题很荒谬。任何试图回答这样的问题的人都比我勇敢!

4

3 回答 3

0

更好的做法是使用 token_get_all 获取 PHP 代码的令牌(token)并对其进行迭代。PHPDoc 风格的注释令牌可以通过 T_DOC_COMMENT 来识别。

于 2010-02-26T20:52:03.193 回答
0

呃,您难道不能直接用 token_get_all 解析源代码,并查找类型为 T_DOC_COMMENT 的令牌吗?(已从 T_COMMENT 更正为 T_DOC_COMMENT,请参阅 Gumbo 的帖子。)

可以在此处找到如何使用 token_get_all 函数的示例。

于 2010-02-26T20:57:30.687 回答
0

解决方案

我想出了一个类来提取文件中类和方法的文档注释。感谢所有回答这个问题的人,以及其他关于匹配代码块的人。

以下示例的平均基准在 0.00495 到 0.00505 秒之间。

<?php

// Usage example: the Tokenizer class is loaded from its own source file, and that same
// file is then parsed, so the doc comments reported below are the ones found in it.
$file = 'path/to/libraries/tokenizer.php';
include $file;
$tokenizer = new Tokenizer;
// Start Benchmarking here.
$tokenizer->load($file);
// End Benchmarking here.
// The following will output 'bool(false)'.
var_dump($tokenizer->get_doc('Tokenizer', 'get_tokens'));
// The following will output 'string(18) "/** load method */"'.
var_dump($tokenizer->get_doc('Tokenizer', 'load'));

Tokenizer(是的,我还没有想到更好的名字......)类:

<?php

// Extracts the doc comments attached to the classes and methods of a PHP source file.
//
// The token list returned by token_get_all() is flattened into a "compiled" string of
// comma-separated entries: "<token index>:<token id>" for every non-whitespace array
// token, plus bare "{", "}" and ";" entries. Declarations are then located by running
// regular expressions over that string, and the captured token indexes are used to
// look the original text up in the token list.
//
// NOTE: the explanatory comments in this class are plain "//" comments on purpose.
// When the class parses its own source file, only load() carries a doc comment, so
// get_doc('Tokenizer', 'get_tokens') is false and get_doc('Tokenizer', 'load') is
// '/** load method */'.
class Tokenizer
{

  private $compiled = false, $path = false, $tokens = false, $classes = array();

  // Parses the file at $path and indexes its classes, their methods and the doc
  // comments attached to them. Any previously loaded file is discarded.
  //
  // Returns true on success; false when $path is not a readable regular file or the
  // tokenizer extension is unavailable.
  /** load method */
  public function load($path)
  {
    $path = realpath($path);
    // realpath() yields false for a missing path; is_file() additionally rejects directories.
    if($path === false || !is_file($path) || !function_exists('token_get_all'))
    {
      return false;
    }
    $this->compiled = false;
    $this->classes = array();
    $this->path = $path;
    $this->tokens = false;

    if(!$this->get_tokens())
    {
      return false;
    }
    $this->get_classes();
    $this->class_blocks();
    $this->class_functions();
    return true;
  }

  // Resolves token constant names to their numeric ids, skipping constants that the
  // running PHP version does not define. Token ids differ between PHP versions, so
  // they must never be hard-coded.
  private function token_ids($names)
  {
    $ids = array();
    foreach($names as $name)
    {
      if(defined($name))
      {
        $ids[] = constant($name);
      }
    }
    return $ids;
  }

  // Tokenizes the loaded file and builds the compiled string described in the class
  // header. Returns false when the file cannot be read, true otherwise.
  protected function get_tokens()
  {
    $source = file_get_contents($this->path);
    if($source === false)
    {
      return false;
    }
    $tokens = token_get_all($source);
    // The openers of string interpolation ("{$" and "${") are array tokens, but the
    // brace that closes them is a bare "}". They are recorded as "{" so that brace
    // counting in get_block() stays balanced.
    $brace_openers = $this->token_ids(array('T_CURLY_OPEN', 'T_DOLLAR_OPEN_CURLY_BRACES'));
    $compiled = '';
    foreach($tokens as $k => $t)
    {
      if(is_array($t))
      {
        if(in_array($t[0], $brace_openers, true))
        {
          $compiled .= '{,';
        }
        elseif($t[0] !== T_WHITESPACE)
        {
          $compiled .= $k . ':' . $t[0] . ',';
        }
      }
      elseif($t === '{' || $t === '}' || $t === ';')
      {
        // ";" is kept so that bodiless (abstract) declarations have a visible end.
        $compiled .= $t . ',';
      }
    }
    $this->tokens = $tokens;
    $this->compiled = trim($compiled, ',');
    return true;
  }

  // Finds every named class declaration and records the token index of its name and
  // its doc comment (false when there is none) in $this->classes, keyed by class name.
  protected function get_classes()
  {
    if(!$this->compiled)
    {
      return false;
    }
    $modifiers = implode('|', $this->token_ids(array('T_ABSTRACT', 'T_FINAL', 'T_READONLY')));
    $inherits = implode('|', $this->token_ids(array('T_EXTENDS', 'T_IMPLEMENTS')));
    // A parent or interface name is one or more of these tokens. Commas between
    // interface names are not part of the compiled string.
    $names = implode('|', $this->token_ids(array('T_STRING', 'T_NS_SEPARATOR', 'T_NAME_QUALIFIED', 'T_NAME_FULLY_QUALIFIED')));
    // Group 1: index of the doc comment (optional). Group 2: index of the class name.
    $regex = '%(?:(\d+):' . T_DOC_COMMENT . ',)?'
           . '(?:\d+:(?:' . $modifiers . '),)*'
           . '\d+:' . T_CLASS . ',(\d+):' . T_STRING . ','
           . '(?:\d+:(?:' . $inherits . '),(?:\d+:(?:' . $names . '),)+)*'
           . '\{%';
    if(preg_match_all($regex, $this->compiled, $classes, PREG_SET_ORDER))
    {
      foreach($classes as $class)
      {
        $class_name = $this->tokens[$class[2]][1];
        $this->classes[$class_name] = array(
          'token' => $class[2],
          'doc' => $class[1] !== '' ? $this->tokens[$class[1]][1] : false,
        );
      }
    }
  }

  // Stores the compiled block of every known class under its 'block' key.
  private function class_blocks()
  {
    if(!$this->compiled)
    {
      return false;
    }
    foreach($this->classes as $class_name => $class)
    {
      $this->classes[$class_name]['block'] = $this->get_block($class['token']);
    }
  }

  // Returns the slice of the compiled string that starts at the token with index
  // $name_token and ends just before the "}" that closes its body. For a bodiless
  // declaration the slice ends before the terminating ";".
  // Returns false when nothing is loaded or the token index is unknown.
  protected function get_block($name_token)
  {
    if(!$this->compiled)
    {
      return false;
    }
    // Searching with a leading comma anchors the needle at an entry boundary; a plain
    // search for "5:" would also hit the tail of "15:". Because of the extra comma in
    // the haystack, the offset found is the offset of the index in the compiled string.
    $pos = strpos(',' . $this->compiled, ',' . $name_token . ':');
    if($pos === false)
    {
      return false;
    }
    $section = substr($this->compiled, $pos);
    $len = strlen($section);
    $depth = 0;
    for($i = 0; $i < $len; $i++)
    {
      $char = $section[$i];
      if($char === '{')
      {
        $depth++;
      }
      elseif($char === '}')
      {
        // Back at depth 0: this brace closes the block, and it is left out of the
        // result. A "}" seen before any "{" belongs to an enclosing block.
        if(--$depth <= 0)
        {
          break;
        }
      }
      elseif($char === ';' && $depth === 0)
      {
        // Declaration without a body.
        break;
      }
    }
    return trim(substr($section, 0, $i), ',');
  }

  // Finds the named functions declared inside each class block and records, per
  // function name: the token index of the name, the doc comment (false when absent)
  // and the compiled body block (false for bodiless declarations).
  protected function class_functions()
  {
    if(!$this->compiled)
    {
      return false;
    }
    $modifiers = implode('|', $this->token_ids(array('T_ABSTRACT', 'T_FINAL', 'T_PUBLIC', 'T_PROTECTED', 'T_PRIVATE', 'T_STATIC')));
    // Group 1: index of the doc comment (optional). Group 2: index of the function
    // name. Group 3: "{" for a function with a body, ";" for a bodiless declaration.
    // Modifiers may appear in any order and number. The entries between the name and
    // the terminator are the parameter and return-type tokens.
    $regex = '%(?:(\d+):' . T_DOC_COMMENT . ',)?'
           . '(?:\d+:(?:' . $modifiers . '),)*'
           . '\d+:' . T_FUNCTION . ',(\d+):' . T_STRING . ','
           . '(?:\d+:\d+,)*'
           . '(\{|;)%';
    foreach($this->classes as $class_name => $class)
    {
      if(!isset($class['block']) || !is_string($class['block']))
      {
        continue;
      }
      if(!preg_match_all($regex, $class['block'], $functions, PREG_SET_ORDER))
      {
        continue;
      }
      foreach($functions as $function)
      {
        $function_name = $this->tokens[$function[2]][1];
        $this->classes[$class_name]['functions'][$function_name] = array(
          'token' => $function[2],
          'doc' => $function[1] !== '' ? $this->tokens[$function[1]][1] : false,
          'block' => $function[3] === '{' ? $this->get_block($function[2]) : false,
        );
      }
    }
  }

  // Returns the doc comment of $class, or of its method $function when a method name
  // is given. Returns false when the class or method is unknown or has no doc comment.
  public function get_doc($class, $function = false)
  {
    if(!is_string($class) || !isset($this->classes[$class]))
    {
      return false;
    }
    if(!is_string($function))
    {
      return $this->classes[$class]['doc'];
    }
    if(!isset($this->classes[$class]['functions'][$function]))
    {
      return false;
    }
    return $this->classes[$class]['functions'][$function]['doc'];
  }

}

对此有何想法或评论?欢迎所有批评!

谢谢,米尼兹。

于 2010-03-01T03:37:10.470 回答