我使用 P 标记(来自 Html Annotator)作为 PASSAGE,并希望忽略该标注内部包含的标记。
脚本:
//-------------------------------------------------------------------
// SPECIAL SQUARE HYPHEN PARENTHESIS
//-------------------------------------------------------------------
// Purpose: tag individual bracket characters with dedicated types, then
// group all opening / closing variants under LBracket / RBracket.

// Round parentheses: SPECIAL tokens whose covered text matches "(" or ")".
DECLARE LParen, RParen;
SPECIAL{REGEXP("[(]") -> MARK(LParen)};
SPECIAL{REGEXP("[)]") -> MARK(RParen)};
// Square brackets: SPECIAL tokens whose covered text matches "[" or "]".
DECLARE LSQParen, RSQParen;
SPECIAL{REGEXP("[\\[]") -> MARK(LSQParen)};
SPECIAL{REGEXP("[\\]]") -> MARK(RSQParen)};
// Angle brackets: the same "<" / ">" patterns are tested on both SPECIAL
// and AMP tokens.
// NOTE(review): the AMP rules use the identical literal patterns as the
// SPECIAL rules; presumably they were meant to match an HTML entity form
// (e.g. "&lt;" / "&gt;") -- confirm against the original script.
DECLARE LANGLEBRACKET,RANGLEBRACKET;
SPECIAL{REGEXP("<")->MARK(LANGLEBRACKET)};
AMP{REGEXP("<")->MARK(LANGLEBRACKET)};
SPECIAL{REGEXP(">")->MARK(RANGLEBRACKET)};
AMP{REGEXP(">")->MARK(RANGLEBRACKET)};
// Generic bracket types: any of the three opening (resp. closing) kinds
// is additionally marked as LBracket (resp. RBracket).
DECLARE LBracket,RBracket;
(LParen|LSQParen|LANGLEBRACKET){->MARK(LBracket)};
(RParen|RSQParen|RANGLEBRACKET){->MARK(RBracket)};
// Purpose: create one PASSAGE per <a name="para..."> anchor, trim its
// surrounding whitespace, and mirror every PASSAGE as a TESTPASSAGE.
DECLARE PASSAGE,TESTPASSAGE;
// Regex rule: capture group 2 (the text between the opening anchor tag and
// </a>) becomes a PASSAGE annotation.
"<a name=\"para(.+?)\">(.*?)</a>"->2=PASSAGE;
// Make whitespace visible to the rule matcher so TRIM can strip it from the
// PASSAGE boundaries.
RETAINTYPE(WS); // or RETAINTYPE(SPACE, BREAK,...);
PASSAGE{-> TRIM(WS)};
// RETAINTYPE with no arguments clears the retained types again.
RETAINTYPE;
// Every PASSAGE that the matcher can see is also marked as TESTPASSAGE.
// NOTE(review): Ruta's default filter presumably still hides MARKUP here, so
// a PASSAGE that begins or ends on a markup token may not be matched by this
// rule -- a likely cause of fewer TESTPASSAGE than PASSAGE; confirm.
PASSAGE{->MARK(TESTPASSAGE)};
// Purpose: find bracketed tag spans, detect a tag sitting at the very start
// of a PASSAGE, and move the PASSAGE so that it begins after that tag.
DECLARE TagContent,PassageFirstToken,InitialTag;
// An opening bracket, the shortest run of tokens (reluctant ANY+?), and a
// closing bracket; the whole span (rule elements 1 to 3) becomes TagContent
// unless the closing bracket is already part of a TagContent.
LBracket ANY+? RBracket{-PARTOF(TagContent)->MARK(TagContent,1,3)};
// Within each PASSAGE, mark the first token of the block window as
// PassageFirstToken.
BLOCK(foreach)PASSAGE{}
{
Document{->MARKFIRST(PassageFirstToken)};
}
// A TagContent that contains the passage's first token is an InitialTag
// (the PARTOF guard avoids marking the same span twice).
TagContent{CONTAINS(PassageFirstToken),-PARTOF(InitialTag)->MARK(InitialTag)};
// Within each PASSAGE, an InitialTag followed by at least one token: shift
// the PASSAGE annotation onto the span of rule element 2 (the tokens after
// the tag).
// NOTE(review): this depends on InitialTag having been created above, which
// in turn requires "<" / ">" to be visible as SPECIAL/AMP tokens -- confirm
// that markup is not filtered or tokenized differently at this point.
BLOCK(foreach)PASSAGE{}
{
InitialTag ANY+{->SHIFT(PASSAGE,2,2)};
}
样本输入:
<p class="Normal"><a name="para1"><h1><b>On the Insert tab, the galleries include items that are designed to coordinate with the overall look of your document. </b></a></p>
<p class="Normal"><a name="para2"><aus>On the Insert tab, the galleries include items that are designed to coordinate with the overall look of your document.</a></p>
<p class="Normal"><a name="para3">On the Insert tab, the galleries include items that are designed to coordinate with the overall look of your document.</a></p>
<p class="Normal"><a name="para4">On the Insert tab, the galleries include items that are designed to coordinate with the overall look of your document. </a></p>
<p class="Normal"><a name="para5">On the Insert tab, the <span>galleries</span> include items that are designed to coordinate with the overall look of your document.</a></p>
运行结果为 PASSAGE(5) 和 TESTPASSAGE(2)。为什么 TESTPASSAGE 的数量减少了?另外,InitialTag 也没有被标记。