我最近对通过使用正则表达式“清理”html字符串来处理任务的解决方案有点偏执。它们在很大程度上取决于给定正则表达式的“防弹”程度。所以,我想出了这个片段,并希望从社区中得到一些关于它的反馈。谢谢。
//
// #notags
String.prototype.notags = (function (doc) {
var mkel = doc.createElement.bind(doc);
var hasown = Function.prototype.call.bind(Object.prototype.hasOwnProperty);
// #textlike-nodes
var textlike = {
12 : "NOTATION_NODE",
3 : "TEXT_NODE",
4 : "CDATA_SECTION_NODE",
5 : "ENTITY_REFERENCE_NODE",
6 : "ENTITY_NODE",
7 : "PROCESSING_INSTRUCTION_NODE",
8 : "COMMENT_NODE"
};
// #_notags
// main function
var _notags = function (tagedstring) {
var div;
var istxt = istextnode;
var nodes;
var nodetxt = getxt;
var res;
div = mkel('div');
div.innerHTML = (''+ tagedstring);
// get the div's descendants
// and read their text content
// until all of its childern are plain
// text nodes...
nodes = descendants(div);
while (!nodes.every(istxt)) {
div.innerHTML = nodetxt(div);
nodes = descendants(div);
}
res = div.innerHTML;
// get rid of temporary div
// prevents mem. leaks
div.innerHTML = '';
delete div;
return res;
};
// #save
// String#notags
return function () {
return _notags(this);
};
////////////////
////// #helpers
// #istextnode
function istextnode (node) {
return !(3 - node.nodeType);
}
// #descendants
function descendants (startnode, _nodels) {
_nodels || (_nodels = []);
var node = startnode.firstChild;
while (node) {
_nodels.push(node);
descendants(node, _nodels);
node = node.nextSibling;
}
return _nodels;
}
// #getxt
// loop each node's descendant
// and fetch it' text content
function getxt (node) {
var _ = {
str: '',
txt: textlike
};
descendants(node)
.forEach(getnodetext, _);
return _.str;
}
// #getnodetext
function getnodetext (node) {
//this: {str, txt}
if (hasown(this.txt, node.nodeType))
this.str += (node.data || node.textContent || node.nodeValue);
}
})(document);
// /eof