我喜欢解析器和编译器理论,因此我手工编写了一个小型解析器,它能够将您的示例片段解析为 XML DOM 文档对象。应该可以对其进行修改,使其生成其他类型的树结构,例如自定义 AST(抽象语法树)。
以您的示例代码段作为输入,执行语句 result = new OrgModeParser().parse(input); result.xml 得到的结果如下:
<org-mode-document indentLevel="-1">
  <section indentLevel="0">
    <header indentLevel="0">This is a heading</header>
    <paragraph indentLevel="1">P1 Start a paragraph here but since it is the first indentation level the paragraph may have a lower indentation on the next line or a greater one for that matter.</paragraph>
    <list indentLevel="1">
      <list-item indentLevel="1">
        <paragraph indentLevel="2">LI1.1 I am beginning a list here</paragraph>
      </list-item>
      <list-item indentLevel="1">
        <paragraph indentLevel="2">LI1.2 Here begins another list item which continues here and also here</paragraph>
      </list-item>
    </list>
    <paragraph indentLevel="1">P2 but is broken here (this line becomes a paragraph outside of the first list).</paragraph>
    <list indentLevel="1">
      <list-item indentLevel="1">
        <paragraph indentLevel="2">LI2.1 P1 Second list item.</paragraph>
        <list indentLevel="2">
          <list-item indentLevel="2">
            <paragraph indentLevel="3">LI2.1.1 Inner list with a simple item</paragraph>
          </list-item>
          <list-item indentLevel="2">
            <paragraph indentLevel="3">LI2.1.2 P1 and with an item containing several paragraphs. Here is the second line in the item, and now</paragraph>
            <paragraph indentLevel="3">LI2.1.2 P2 I begin a new paragraph still in the same item. The indentation can be only higher</paragraph>
          </list-item>
        </list>
        <paragraph indentLevel="2">LI2.1 P2 but if the indentation is lower, it breaks the item, (and the whole list), and this is a paragraph in the LI2.1 list item</paragraph>
        <list indentLevel="2">
          <list-item indentLevel="2">
            <paragraph indentLevel="3">LI2.2.1 You get the picture</paragraph>
          </list-item>
        </list>
      </list-item>
    </list>
    <paragraph indentLevel="1">P3 Just plain text outside of the list.</paragraph>
  </section>
</org-mode-document>
/*
 * File: orgmodparser.js
 * Basic usage: var object = new OrgModeParser().parse(input);
 * Works on: JScript and JScript.Net.
 * - For other JavaScript platforms, just replace or override the .createRoot() method
 */
/**
 * Creates an org-mode parser.
 *
 * @param {Object} [options] Every enumerable property of this object is copied
 *   onto the new instance, where it shadows the prototype member of the same
 *   name (INDENT_WIDTH, LINE_SEPARATOR, re, createRoot, parse).
 */
function OrgModeParser(options) {
  var key;
  // typeof null is "object" too, so null is ruled out explicitly.
  if (typeof options === "object" && options !== null) {
    for (key in options) {
      this[key] = options[key];
    }
  }
}

OrgModeParser.prototype = {
  // Number of spaces that make up one indentation level.
  "INDENT_WIDTH" : 2,

  // Separator the input text is split on to obtain its lines.
  "LINE_SEPARATOR" : "\r\n",

  /*
   * Each line in the input will be matched against this regexp.
   * Only spaces are allowed as indentation characters.
   * The symbols '*', '+' and '-' will be recognized, but only if they are
   * followed by at least one space.
   * Add other symbols in this regexp if you want the parser to recognize them.
   * Capture groups: 1 = indentation, 2 = modifier plus trailing spaces,
   * 3 = remaining text of the line.
   */
  "re" : /^( *)([\+\-\*] +)?(.*)/,

  /**
   * Creates the document that will own every node of the result.
   * This function must return a valid XML DOM document object; override it
   * on platforms where ActiveXObject is not available.
   *
   * @returns {Object|null} The first MSXML document that could be
   *   instantiated, or null (after an alert) when none could.
   */
  createRoot : function () {
    var progIDs = ["Msxml2.DOMDocument.6.0", "Msxml2.DOMDocument.5.0", "Msxml2.DOMDocument.4.0", "Msxml2.DOMDocument.3.0", "Msxml2.DOMDocument.2.0", "Msxml2.DOMDocument.1.0", "Msxml2.DOMDocument"];
    var i;
    for (i = 0; i < progIDs.length; i++) {
      try {
        return new ActiveXObject(progIDs[i]);
      } catch (err) {
        // Deliberately ignored: this progID could not be instantiated, so
        // the next one in the list is tried.
      }
    }
    alert("Org-mode parser - Error - Failed to instantiate root object");
    return null;
  },

  /**
   * Parses org-mode style text into a DOM tree made of <org-mode-document>,
   * <section>, <header>, <list>, <list-item> and <paragraph> elements, each
   * carrying an "indentLevel" attribute.
   *
   * @param {string} text The input, with lines separated by LINE_SEPARATOR.
   * @returns {Object|null} The document returned by createRoot(), populated
   *   with the parsed tree; null when the input is not a string, when no
   *   root document could be created, or when a line does not match `re`.
   */
  parse : function (text) {
    var body;             // The <org-mode-document> element
    var content;          // The text of the current line, without its indentation and modifier
    var i;                // Index of the current line; shared with the nested helpers
    var indent;           // The indentation of the current line
    var isAfterBlankLine; // Indicates if the current line follows one or more blank lines
    var lastNode;         // The node being processed
    var level;            // The current indentation level: indent.length / this.INDENT_WIDTH. Not to be confused with the nesting level
    var line;             // The current line being processed
    var lines;            // Array. Empty lines are included.
    var match;
    var modifier;         // This can be "*", "+", "-" or ""
    var root;

    // Creates an element stamped with the current indentation level and,
    // when text is given, a text node holding it as its only child.
    function createNode(tagName, text) {
      var node = root.createElement(tagName);
      node.setAttribute("indentLevel", level);
      if (text) {
        node.appendChild(root.createTextNode(text));
      }
      return node;
    }

    // Finds the element the current line has to be attached to, walking up
    // from the last node that was created.
    function getContainer() {
      var anc;
      var ancLevel;
      // The document element is returned as well, so that text placed before
      // the first heading does not make the walk run into the document node.
      if (lastNode.tagName === "section" || lastNode.tagName === "org-mode-document") { return lastNode; }
      anc = lastNode.parentNode;
      while (anc) {
        // getAttribute yields a string; compare numerically.
        ancLevel = Number(anc.getAttribute("indentLevel"));
        // A list item joins an existing list that sits at its own level.
        if ((modifier === "+" || modifier === "-") && ancLevel === level && anc.tagName === "list") { return anc; }
        if (ancLevel < level && anc.tagName !== "paragraph") { return anc; }
        anc = anc.parentNode;
      }
      alert("Org-mode parser - Internal error at line: " + i);
      return null;
    }

    // Handles a "* heading" line: opens a new section under the document.
    function star() {
      // The '*' modifier is not allowed on an indented line
      if (indent) { alert("Org-mode parse error: unexpected '*' symbol at line " + i); return; }
      lastNode = body.appendChild(createNode("section"));
      // The section remains the current node
      lastNode.appendChild(createNode("header", content));
    }

    // Handles a "+ item" or "- item" line: adds a list item, opening a new
    // list first when the container is not already a list.
    function plus() {
      var container = getContainer();
      var tn;
      if (!container) { return; } // getContainer has already reported the error
      tn = container.tagName;
      if (tn === "section" || tn === "list-item") {
        container = container.appendChild(createNode("list"));
      } else if (tn !== "list") {
        alert("Org-mode parser - Internal error - Bad container tag name: " + tn);
        return; // leave the previous node untouched
      }
      lastNode = container.appendChild(createNode("list-item"));
      lastNode = lastNode.appendChild(createNode("paragraph", content));
      // The text of an item sits one level deeper than the item itself.
      lastNode.setAttribute("indentLevel", level + 1);
    }

    // Handles a line without modifier: either continues the current
    // paragraph or starts a new one.
    function noModifier() {
      var container;
      var lastLevel = Number(lastNode.getAttribute("indentLevel"));
      if (lastNode.tagName === "paragraph" && !isAfterBlankLine && (lastLevel === 1 || level >= lastLevel)) {
        lastNode.childNodes[0].appendData(" " + content);
      } else {
        container = getContainer();
        if (container) {
          lastNode = container.appendChild(createNode("paragraph", content));
        }
      }
    }

    // Handles a blank line: skips the whitespace-only lines that follow it
    // and remembers that the next line comes after a break.
    function dubbleLineBreak() {
      while (lines[i + 1] && /^\s*$/.test(lines[i + 1])) { i++; }
      isAfterBlankLine = true;
    }

    if (typeof text !== "string") { alert("Org-mode - Type error - Input must be of type 'string'"); return null; }

    root = this.createRoot();
    if (!root) { return null; } // createRoot() reports its own failure

    isAfterBlankLine = false;
    level = -1; // Indentation level is -1 initially; it will be 0 for the first "*"-bloc
    lines = text.split(this.LINE_SEPARATOR);
    body = root.appendChild(createNode("org-mode-document"));
    lastNode = body;

    for (i = 0; i < lines.length; i++) {
      line = lines[i];
      match = line.match(this.re);
      if (match === null) { alert("org-mode parse error at line: " + i); return null; }
      indent = match[1];
      level = indent.length / this.INDENT_WIDTH;
      // An optional group that did not take part in the match is undefined on
      // standard engines; normalise to "" so the no-modifier branch is taken.
      modifier = match[2] ? match[2].charAt(0) : "";
      content = match[3];

      // These conditions tell the parser what to do when encountering a line with a given modifier
      if (content === "") { dubbleLineBreak(); continue; }
      if (modifier === "*") { star(); }
      else if (modifier === "+" || modifier === "-") { plus(); }
      else if (modifier === "") { noModifier(); }
      isAfterBlankLine = false;
    }
    return root;
  }
};