javascript - 剥离 html 标签，'RegExp-free-way'

Question

我最近对那些用正则表达式来“清理” HTML 字符串的方案有些不放心，因为它们是否可靠，在很大程度上取决于所用的正则表达式是否足够“无懈可击”。所以，我写了下面这个不依赖正则表达式的代码片段，希望能从社区得到一些反馈。谢谢。

//
// #notags
/**
 * String#notags — returns the string with all HTML markup removed, without
 * using regular expressions: the DOM's own HTML parser does the work.
 *
 * How it works: the string is parsed into a scratch <div>, the character
 * data of every text-like descendant is concatenated, and the result is
 * parsed again. This repeats until the <div> contains nothing but text
 * nodes.
 *
 * Notes for callers:
 *  - The return value is the final `innerHTML` of the scratch <div>, so
 *    characters such as `&` and `<` come back entity-escaped (`&amp;`,
 *    `&lt;`).
 *  - Because the collected text is re-parsed as HTML, entity-encoded markup
 *    in the input (e.g. `&lt;b&gt;`) is decoded by one pass and stripped by
 *    the next.
 *  - The data of comment nodes is treated as text and kept.
 *
 * @this {String} the (possibly tagged) string to clean
 * @returns {string} the string with markup stripped
 */
String.prototype.notags = (function (doc) {

  // The input may be untrusted. Parsing it with an element owned by the live
  // document could fire inline handlers (e.g. <img onerror=...>) or start
  // resource loads even though the element is never attached, so all parsing
  // happens in a separate document created via createHTMLDocument().
  var inert = doc.implementation.createHTMLDocument('');

  var hasOwn = Object.prototype.hasOwnProperty;

  // #textlike-nodes
  // nodeType values whose character data is kept as text
  var textlike = {
    3  : "TEXT_NODE",
    4  : "CDATA_SECTION_NODE",
    5  : "ENTITY_REFERENCE_NODE",
    6  : "ENTITY_NODE",
    7  : "PROCESSING_INSTRUCTION_NODE",
    8  : "COMMENT_NODE",
    12 : "NOTATION_NODE"
  };

  // #istextnode
  // true only for plain text nodes (nodeType 3)
  function istextnode (node) {
    return node.nodeType === 3;
  }

  // #descendants
  // collects every descendant of `startnode` in document order;
  // `_nodels` is the accumulator used by the recursive calls
  function descendants (startnode, _nodels) {
    var list = _nodels || [];
    var node;

    for (node = startnode.firstChild; node; node = node.nextSibling) {
      list.push(node);
      descendants(node, list);
    }

    return list;
  }

  // #getxt
  // concatenates the character data of all text-like descendants of `node`
  function getxt (node) {
    var str = '';

    descendants(node).forEach(function (child) {
      if (hasOwn.call(textlike, child.nodeType)) {
        str += (child.data || child.textContent || child.nodeValue);
      }
    });

    return str;
  }

  // #_notags
  // main function
  function _notags (tagedstring) {
    var div = inert.createElement('div');

    div.innerHTML = String(tagedstring);

    // keep replacing the markup with its own text content until all of
    // the div's descendants are plain text nodes
    while (!descendants(div).every(istextnode)) {
      div.innerHTML = getxt(div);
    }

    // No manual cleanup: `div` is an unattached local and is reclaimed by
    // the garbage collector. (`delete` on a variable does nothing, and is a
    // SyntaxError in strict mode.)
    return div.innerHTML;
  }

  // #save
  // String#notags
  return function () {
    return _notags(this);
  };

})(document);

// /eof

javascript - 剥离 html 标签，'RegExp-free-way'

0 回答 0

Related

Reference