c++ - 从解扰符号中提取类

Question

我正在尝试使用 boost::regex 从 nm 输出的解扰（demangled）符号中提取（完整的）类名。下面这个示例程序

#include <vector>

// Sample code from the question: a class template nested in two namespaces.
// The question lists the demangled symbols produced for class A.
namespace Ns1
{
namespace Ns2
{
    // Class template with two type parameters: T appears only in bar()'s
    // parameter type, Cont is the type of the single data member.
    template<typename T, class Cont>
    class A
    {
    public:
        A() {}
        ~A() {}
        // Takes a container by const reference; the body is intentionally empty.
        void foo(const Cont& c) {}
        // Takes another A of the same instantiation; the body is intentionally empty.
        void bar(const A<T,Cont>& x) {}

    private:
        Cont cont;
    };
}
}

// Instantiates A<int, std::vector<int> > twice and calls foo() and bar() once each.
int main()
{
    Ns1::Ns2::A<int,std::vector<int> > a;
    Ns1::Ns2::A<int,std::vector<int> > b;
    std::vector<int> v;

    // Call both member functions once.
    a.foo(v);
    a.bar(b);
}

将为 A 类生成以下符号

Ns1::Ns2::A<int, std::vector<int, std::allocator<int> > >::A()
Ns1::Ns2::A<int, std::vector<int, std::allocator<int> > >::bar(Ns1::Ns2::A<int, std::vector<int, std::allocator<int> > > const&)
Ns1::Ns2::A<int, std::vector<int, std::allocator<int> > >::foo(std::vector<int, std::allocator<int> > const&)
Ns1::Ns2::A<int, std::vector<int, std::allocator<int> > >::~A()

我想提取类（实例）名称 Ns1::Ns2::A<int, std::vector<int, std::allocator<int> > >，最好只使用单个正则表达式模式，但在解析 <> 对中递归出现的类说明符时遇到了问题。

有谁知道如何使用（boost::regex 所支持的）正则表达式模式来做到这一点？

我的解决方案（基于David Hammen的回答，因此接受）：

我不使用（单个）正则表达式来提取类和命名空间符号。我创建了一个简单的函数，从符号字符串的尾部去除括号字符对（例如 <> 或 ()）：

// Removes the trailing bracketed group from a symbol.
//
// If `symbol` ends with `closingBracket`, the opening bracket that MATCHES
// that final closing bracket is located by scanning backwards and counting
// nesting depth. Everything from that opening bracket to the end is stored in
// `strippedPart`, and the text before it is returned.
//
// Examples:
//   ('(' , ')')  "Ns::A<int>::foo(int)"  -> "Ns::A<int>::foo", strippedPart "(int)"
//   ('<' , '>')  "Ns::A<int>::B<char>"   -> "Ns::A<int>::B",   strippedPart "<char>"
//
// Parameters:
//   openingBracket / closingBracket - the bracket pair to strip.
//   symbol       - the symbol text to process (not modified).
//   strippedPart - receives the removed tail; left untouched when nothing
//                  is stripped.
// Returns: the symbol without its trailing bracket group, or an unchanged
//          copy when the symbol is empty, does not end with `closingBracket`,
//          or has no matching opening bracket.
std::string stripBracketPair(char openingBracket,char closingBracket,const std::string& symbol, std::string& strippedPart)
{
    if(symbol.empty() || symbol[symbol.length() - 1] != closingBracket)
    {
        return symbol;
    }

    // The last character is the closing bracket we need to match, so the
    // scan starts one position before it with a nesting depth of 1.
    size_t depth = 1;
    for(size_t pos = symbol.length() - 1; pos-- > 0;)
    {
        const char current = symbol[pos];
        if(current == openingBracket)
        {
            if(--depth == 0)
            {
                strippedPart = symbol.substr(pos);
                return symbol.substr(0,pos);
            }
        }
        else if(current == closingBracket)
        {
            ++depth;
        }
    }

    // Unbalanced input (e.g. "operator>"): leave the symbol untouched.
    return symbol;
}

这用于从符号中提取名称空间/类的另外两种方法中：

// Derives the namespace portion of a demangled symbol.
//
// When extractClass() finds a class in the symbol, the namespace is built
// from the class path with its trailing template argument list removed.
// Otherwise the symbol is treated as a namespace-level function or variable:
// its trailing "(...)" and "<...>" groups are removed, the namespace is built
// from the remaining path, and if that still contains whitespace only the
// last whitespace-separated word is kept.
//
// Returns an empty string when the computed namespace is itself reported as
// a class by isClass().
std::string extractNamespace(const std::string& symbol)
{
    std::string discarded;
    std::string result;
    std::vector<std::string> pathParts;

    std::string owner = extractClass(symbol);
    if(owner.empty())
    {
        // No class found: assume a namespace-level function/variable.
        std::string freeName = stripBracketPair('(',')',symbol,discarded);
        freeName = stripBracketPair('<','>',freeName,discarded);

        boost::split(pathParts,freeName,boost::is_any_of("::"),boost::token_compress_on);
        result = buildNamespaceFromSymbolPath(pathParts);

        // Keep only the last whitespace-separated word of the namespace.
        std::vector<std::string> words;
        boost::split(words,result,boost::is_any_of(" \t"),boost::token_compress_on);
        if(words.size() > 1)
        {
            result = words.back();
        }
    }
    else
    {
        // Class member: the namespace is the class path minus the class itself.
        owner = stripBracketPair('<','>',owner,discarded);

        boost::split(pathParts,owner,boost::is_any_of("::"),boost::token_compress_on);
        result = buildNamespaceFromSymbolPath(pathParts);
    }

    // A "namespace" that is really a class (nested class case) is reported as empty.
    return isClass(result) ? std::string() : result;
}

// Extracts the qualified class name from a demangled member symbol.
//
// Steps:
//   1. Trim surrounding whitespace from the symbol.
//   2. Remove a trailing "(...)" group, then a trailing "<...>" group.
//   3. Take everything before the final "::" as the class candidate.
//   4. Reject the candidate when its name (without the trailing template
//      argument list) contains a space but no '<'.
//   5. Reject the candidate when isClass() does not confirm it.
//
// Returns the class name including its template arguments, or an empty
// string when no class could be identified.
std::string extractClass(const std::string& symbol)
{
    std::string cls;
    std::string strippedPart;
    std::string fullSymbol = symbol;
    boost::trim(fullSymbol);
    // Strip from the trimmed copy. Stripping from `symbol` would discard the
    // trim, and a symbol with trailing whitespace would never match the
    // closing-bracket check.
    fullSymbol = stripBracketPair('(',')',fullSymbol,strippedPart);
    fullSymbol = stripBracketPair('<','>',fullSymbol,strippedPart);

    size_t pos = fullSymbol.find_last_of(':');
    if(pos != std::string::npos)
    {
        // `pos` is the second ':' of the final "::", so the class name ends
        // one character earlier. Guard pos == 0 so the length cannot wrap.
        cls = fullSymbol.substr(0,pos > 0 ? pos - 1 : 0);
        std::string untemplatedClassName = stripBracketPair('<','>',cls,strippedPart);
        if(untemplatedClassName.find('<') == std::string::npos &&
           untemplatedClassName.find(' ') != std::string::npos)
        {
            cls = "";
        }
    }

    if(!cls.empty() && !isClass(cls))
    {
        cls = "";
    }
    return cls;
}

该buildNamespaceFromSymbolPath()方法只是连接有效的命名空间部分：

// Joins the leading parts of a split symbol path into a namespace string.
//
// The last element is never included. Parts are joined with "::" in order
// until a part containing '<' or '(' is met; that part and everything after
// it are left out.
//
// Returns an empty string when the path has fewer than two parts.
std::string buildNamespaceFromSymbolPath(const std::vector<std::string>& symbolPathParts)
{
    std::string joined;
    if(symbolPathParts.size() < 2)
    {
        return joined;
    }

    const std::size_t lastIndex = symbolPathParts.size() - 1;
    for(std::size_t idx = 0; idx != lastIndex; ++idx)
    {
        const std::string& part = symbolPathParts[idx];

        // Template arguments or a parameter list end the namespace path.
        if(part.find_first_of("<(") != std::string::npos)
        {
            break;
        }

        if(idx != 0)
        {
            joined += "::";
        }
        joined += part;
    }
    return joined;
}

至少 isClass() 方法使用了正则表达式：它扫描所有符号以查找构造函数（不幸的是，这似乎不适用于仅包含成员函数的类）：

// Cache of symbols that isClass() has already confirmed to be classes.
std::set<std::string> allClasses;

// Decides whether `classSymbol` names a class by looking for one of its
// constructors among the known symbol records.
//
// The constructor name is the last "::"-separated component of the symbol
// with its trailing template argument list removed. A record counts as a
// constructor when its name starts with "<classSymbol>::<constructorName>("
// and has at least one more character after that.
//
// The prefix is compared literally. Building a regular expression from the
// raw symbol is unsafe: demangled names may contain regex metacharacters
// such as [ ] { } + . ? | which would change or invalidate the pattern.
//
// Positive results are stored in `allClasses`. Classes without any
// constructor symbol are not detected.
//
// NOTE(review): assumes NmRecord::symbolName is a std::string (or converts
// to one) and that `allRecords` is declared elsewhere in this file - confirm.
bool isClass(const std::string& classSymbol)
{
    // An empty name can never be a class; skip the scan over all records.
    if(classSymbol.empty())
    {
        return false;
    }

    if(allClasses.find(classSymbol) != allClasses.end())
    {
        return true;
    }

    std::string strippedPart;
    std::string constructorName = stripBracketPair('<','>',classSymbol,strippedPart);
    std::vector<std::string> constructorPathParts;

    boost::split(constructorPathParts,constructorName,boost::is_any_of("::"),boost::token_compress_on);
    if(constructorPathParts.size() > 1)
    {
        constructorName = constructorPathParts.back();
    }

    // Built once, outside the loop: the literal text every constructor
    // symbol of this class must start with.
    const std::string constructorPrefix = classSymbol + "::" + constructorName + "(";

    for(std::vector<NmRecord>::iterator it = allRecords.begin();
        it != allRecords.end();
        ++it)
    {
        const std::string& recordName = it->symbolName;
        if(recordName.size() > constructorPrefix.size() &&
           recordName.compare(0,constructorPrefix.size(),constructorPrefix) == 0)
        {
            allClasses.insert(classSymbol);
            return true;
        }
    }
    return false;
}

如前所述，如果类不提供任何构造函数，则最后一种方法无法安全地找到类名，并且在大符号表上非常慢。但至少这似乎涵盖了你可以从 nm 的符号信息中得到什么。

我为这个问题留下了正则表达式标签，其他用户可能会发现正则表达式不是正确的方法。

score 2 · Accepted Answer

即使使用 Perl 的扩展正则表达式也很难做到这一点，而 Perl 的正则表达式比 C++ 中可用的任何正则表达式都强大得多。我建议采用不同的策略：

首先去掉数据等看起来不像函数的东西（寻找 D 指示符）。virtual thunk to this、virtual table for that 之类的内容也会妨碍您；在进行主要解析之前先把它们去掉。这种过滤正是正则表达式可以发挥作用的地方。剩下的应该都是函数。对于每个函数：

去掉最后一个右括号之后的内容。例如，Foo::Bar(int,double) const 变成 Foo::Bar(int,double)。
剥离函数参数。这里的问题是括号内可以嵌套括号，例如，以函数指针作为参数的函数，而这些函数指针本身又可能以函数指针作为参数。不要使用正则表达式，而要利用括号成对匹配这一事实。经过这一步，Foo::Bar(int,double) 变成 Foo::Bar，而 a::b::Baz<lots<of<template>, stuff>>::Baz(int, void (*)(int, void (*)(int))) 变成 a::b::Baz<lots<of<template>, stuff>>::Baz。
现在在前端工作。使用类似的方案来解析该模板内容。这样一来，那乱七八糟的a::b::Baz<lots<of<template>, stuff>>::Baz就变成了a::b::Baz::Baz。
在此阶段，您的函数看起来像 a::b:: ... ::ClassName::function_name。某些命名空间中的自由函数在这里会带来一个小问题。析构函数是类的确凿标志：如果函数名以波浪号开头，那么毫无疑问您得到了一个类名。构造函数也几乎可以说明您面对的是一个类——只要不存在在命名空间 Foo 中定义了函数 Foo 的情况。
最后，您可能想要重新插入您剪切的模板内容。

score 1 · Accepted Answer

我使用简单的 C++函数进行了提取。

完整代码见链接，背后的想法是：

存在由 :: 分隔的基础级别标记（token）。
如果有 N 个基础级别的 token，则前 N-1 个描述 className，最后一个是 function。
遇到 ( 或 < 时，我们提升一级（+1）。
遇到闭合的 ) 或 > 时，我们下降一级（-1）。
基础级别当然意味着 level == 0。

我强烈地感觉到这无法通过正则表达式完成，因为括号的嵌套层数是无限的。我的函数最多支持 255 级——可以改用 std::stack<char> 来支持无限级别。

功能：

// Splits a demangled C++ name into its top-level "::"-separated parts.
//
// "::" separators nested inside (...) or <...> are ignored, so template
// arguments and parameter lists stay attached to the part they belong to.
// The last element of the result is the text after the final top-level "::".
//
// Bracket nesting is tracked with a growable stack of expected closing
// characters, so nesting depth is not limited.
//
// Throws std::runtime_error:
//   "missing :"  - a single ':' appears at top level,
//   "Extra )>"   - a ')' or '>' appears that does not close the innermost
//                  open bracket,
//   "Missing )>" - the input ends with brackets still open.
std::vector<std::string> parseCppName(std::string line)
{
   std::vector<std::string> retVal;
   std::string expectedClosers; // stack: closing char for each open bracket

   size_t startPart = 0;
   for (size_t i = 0; i < line.length(); ++i)
   {
      const char current = line[i];
      if (current == ':' && expectedClosers.empty())
      {
          if (i + 1 >= line.length() || line[i + 1] != ':')
             throw std::runtime_error("missing :");
          retVal.push_back(line.substr(startPart, i - startPart));
          // Skip the second ':' and start the next part right after it.
          startPart = ++i + 1;
      }
      else if (current == '(') {
         expectedClosers.push_back(')');
      }
      else if (current == '<') {
         expectedClosers.push_back('>');
      }
      else if (!expectedClosers.empty() &&
               current == expectedClosers[expectedClosers.size() - 1]) {
         expectedClosers.resize(expectedClosers.size() - 1);
      }
      else if (current == '>' || current == ')') {
         throw std::runtime_error("Extra )>");
      }
   }
   if (!expectedClosers.empty())
       throw std::runtime_error("Missing )>");
   retVal.push_back(line.substr(startPart));
   return retVal;
}

c++ - 从解扰符号中提取类

2 回答 2

Related

Reference