javascript - 使用 javascript 删除除 <br> 或 <br/> 标签之外的所有 html 标签

Question

我想使用 javascript 从字符串中删除除 <br> 或 <br/> 标签之外的所有 html 标签。我见过很多类似的问题，但它们的答案会删除所有 html 标签，包括 <br> 和 <br/> 标签。

有谁知道一个正则表达式来做到这一点？

score 63 · Accepted Answer

使用负前瞻（通过使用正则表达式，例如/<(?!br\s*\/?)[^>]+>/g）：

// Remove every tag the negative lookahead does not spare (tags starting with "br"),
// applying the replacement directly to the sample markup.
var html = "this is my <b>string</b> and it's pretty cool<br />isn't it?<br>Yep, it is. <strong>More HTML tags</strong>"
  .replace(/<(?!br\s*\/?)[^>]+>/g, '');

console.log(html);
// Logs: this is my string and it's pretty cool<br />isn't it?<br>Yep, it is. More HTML tags

演示

score 12 · Accepted Answer

尝试这个

 /**
  * Converts an HTML string to plain text while keeping line breaks.
  *
  * Every <br>, <br/> and <br /> (any letter case) is swapped for a
  * placeholder, the markup is parsed into a detached DIV to read its text,
  * and each placeholder is then turned back into "<br>".
  *
  * NOTE(review): assigning untrusted markup to innerHTML may have side
  * effects even on a detached element - confirm the input is trusted.
  *
  * @param {string} html - markup to strip; null/undefined yields "".
  * @returns {string} text content with "<br>" at every original line break.
  */
 function remove_tags(html)
 {
   var BR_TOKEN = "||br||";
   var source = html == null ? "" : String(html);
   // A regex with the g flag is required: a string pattern would only
   // replace the first occurrence and every later <br> would be lost.
   var guarded = source.replace(/<br\s*\/?>/gi, BR_TOKEN);
   var tmp = document.createElement("DIV");
   tmp.innerHTML = guarded;
   var text = tmp.textContent || tmp.innerText || "";
   // split/join restores every placeholder, not just the first one.
   return text.split(BR_TOKEN).join("<br>");
 }

score 7 · Accepted Answer

我在上一个建议的基础上开发了一个函数，它既可以删除所有标签，也可以只保留指定的某些标签：

/**
 * Removes HTML tags from a string, optionally keeping some of them.
 *
 * Every argument after _html names one tag to keep, written either bare
 * ("b") or in tag form ("<b>"). A kept tag survives as an opening tag, a
 * closing tag, and with attributes; matching is on the whole tag name and
 * is case-insensitive, so keeping "b" does not keep <br> or <bol>.
 *
 * @param {string|String} _html - markup to clean.
 * @returns {string} the cleaned markup, or "" when _html is not a string.
 */
function strip_tags( _html /*you can put each single tag per argument*/ )
{
    if ( typeof _html != "string" && !( _html instanceof String ) ) return "" ;

    var _tags = [] ;
    for ( var _a = 1 ; _a < arguments.length ; _a++ )
    {
        var _tag = String( arguments[_a] ).replace( /[<>\/]/g, '' ).trim() ;
        // Test the cleaned name (not the raw argument) so "<>" or " " add nothing,
        // and escape it so it cannot inject regex syntax.
        if ( _tag.length > 0 ) _tags.push( _tag.replace( /[.*+?^${}()|[\]\\]/g, "\\$&" ) );
    }

    if ( _tags.length == 0 ) return _html.replace( /<(\s*\/?)[^>]+>/g, "" ) ;

    // Backslashes are doubled because the pattern is built from a string literal.
    // The inner (?=[\s\/>]) makes the name end at whitespace, "/" or ">",
    // which is what stops "b" from also matching "br".
    var _re = new RegExp( "<(?!\\s*\\/?(?:" + _tags.join( "|" ) + ")(?=[\\s\\/>]))[^>]+>", "gi" );
    return _html.replace( _re, '' );
}

// Demo: write the sample markup, then the strip_tags result for several keep-lists.
var _html = "<b>Just</b> some <i>tags</i> and text to test <u>this code</u>" ;

( function() {
    // Each entry: [ caption written first, tags passed to strip_tags after _html ].
    var _cases = [
        [ "Now we remove all tags (plain text)<br>", [] ],
        [ "Only the bold tag is kept<br>", [ "b" ] ],
        [ "Only the underline tag is kept<br>", [ "u" ] ],
        [ "Only the italic tag is kept<br>", [ "<i>" ] ],
        [ "Keeping both italic and underline<br>", [ "i", "u" ] ]
    ];

    document.write( "This is the original html code including some tags<br>" );
    document.write( _html + "<br><br>" ); // original html code

    _cases.forEach( function( _case, _index ) {
        // Every result but the last is followed by a blank line.
        var _is_last = _index == _cases.length - 1 ;
        document.write( _case[0] );
        document.write( strip_tags.apply( null, [ _html ].concat( _case[1] ) ) + ( _is_last ? "" : "<br><br>" ) );
    } );
} )();

score 3 · Accepted Answer

要扩展 h2ooooooo 答案以包含前导空格并不区分大小写，您可以使用

// Matches every tag except those whose content starts with optional whitespace
// followed by "br": the leading \s* tolerates "< br>", and the i flag also
// spares "<BR>". The lookahead only checks the prefix, so a tag such as
// "<broken>" is spared as well.
/<(?!\s*br\s*\/?)[^>]+>/gi

score 3 · Accepted Answer

这是一个老问题，但搜索排名仍然很靠前，所以我想提供一个更通用的 ES6 解决方案。

此解决方案将去除除排除标签之外的所有标签，并简化这些标签以删除属性。

如果您想处理粘贴事件并简化 HTML，这将特别有用。

它还会剥离 HTML 注释，因为复制/粘贴的内容有时会包含 <!-- ... --> 之类的注释。

  /**
   * Strips every tag except the allowed ones and reduces the allowed tags
   * to their bare form (attributes and a trailing "/" are dropped).
   * HTML comments are removed as well, including multi-line ones.
   *
   * @param {string} html - markup to simplify.
   * @param {...string} args - tag names to keep, e.g. 'b', 'br'; compared
   *   case-insensitively.
   * @returns {string} the simplified markup.
   */
  function strip_tags(html, ...args) {
    const allowed = new Set(args.map((name) => String(name).toLowerCase()));
    return html.replace(/<(\/?)(\w+)[^>]*\/?>/g, (_, endMark, tag) => {
      return allowed.has(tag.toLowerCase()) ? '<' + endMark + tag + '>' : '';
      // [\s\S] instead of "." so that comments spanning several lines match too.
    }).replace(/<!--[\s\S]*?-->/g, '');
  }

它没有构建复杂的正则表达式，而是先做一次替换，在回调中检查标签名，然后返回简化后的开始/结束标签或空字符串。

使用示例：

// Strip all except basic formatting and paragraphs and breaks.
// `html` is not declared in this snippet; it is assumed to hold the markup to clean.
const h = strip_tags(html, 'b', 'i', 'u', 'p', 'br');

这就是我用来处理简单 HTML 编辑器的粘贴事件的方法。它并不完美，因为它不能处理嵌入在标签属性中的“>”之类的奇怪情况，但这似乎不太可能发生。

希望它对某人有用。欢迎改进！

score 1 · Accepted Answer

基于 h2ooooooo 回答您的问题的正确正则表达式是：

<((?!\/?br\s?\/?>)\s*)[^>]+>

这个解决方案甚至适用于 Wolfie 和 Olivier 提到的案例。

这是演示

score 0 · Accepted Answer

你的观点很好，那么整个方法应该改变，因为没有办法将正则表达式边界和负前瞻结合起来。因此，我重写了整个函数，并且它似乎与所提出的示例一起工作得很好。 我还扩展了嵌入属性的标签的删除。

基本上，这只是一种蛮力方法：先预扫描输入文本，提取其中出现的所有标签；凡是不在输入参数（要保留的标签）之中的标签，都会从整个输入文本中删除。我还保留了一些被注释掉的、对调试有用的行。

<SCRIPT LANGUAGE="javascript" TYPE="text/javascript">
/**
 * Removes HTML tags from a string, optionally keeping some of them.
 *
 * Every argument after _html names one tag to keep ("b" or "<b>"). With no
 * keep-list, everything of the form <...> is removed. With a keep-list, each
 * element tag (opening or closing, with any attributes) is examined once
 * and removed unless its name is in the list; names compare
 * case-insensitively. Markup that does not start with a letter after "<"
 * (comments, doctype) is left untouched in that mode.
 *
 * @param {string|String} _html - markup to clean.
 * @returns {string} the cleaned markup, or "" when _html is not a string.
 */
function strip_tags( _html /*you can put each single tag per argument*/ )
{
    if ( typeof _html != "string" && !( _html instanceof String ) ) return "" ;

    // Normalised, de-duplicated names of the tags to keep.
    var _keep = new Set();
    for ( var _a = 1 ; _a < arguments.length ; _a++ )
    {
        var _tag = String( arguments[_a] ).replace( /[<>\/]/g, '' ).trim().toLowerCase() ;
        if ( _tag.length > 0 ) _keep.add( _tag );
    }

    if ( _keep.size == 0 ) return _html.replace( /<(\s*\/?)[^>]+>/g, "" ) ;

    // One pass over all element tags. The name runs up to the first whitespace,
    // "/" or ">", so names with digits (h1) are read in full and any number of
    // attributes, in any quoting style, is covered by [^>]*.
    return _html.replace( /<\s*\/?\s*([A-Za-z][^\s\/>]*)[^>]*>/g, function( _match, _name ) {
        return _keep.has( _name.toLowerCase() ) ? _match : "" ;
    } );
}

// Demo: run the sample markup through strip_tags with different keep-lists
// and log each goal followed by the result.
var _html = "<b>Just</b> some <i STYLE=\"color:#323232;\">tags</i> and <pre>text</pre> to <b>test</b> <u>this code</u>" ;
console.log( "This is the original html code including some tag" );
console.log( _html ); // original html code
console.log( "GOAL: remove all tags (plain text)" );
console.log( strip_tags( _html ) ); // remove all tags
console.log( "GOAL: only the bold tag is kept" );
console.log( strip_tags( _html, "b" ) ); // keep <b> only
// The call keeps "b" and "i", so the message names the italic tag (it used to say underline).
console.log( "GOAL: only the bold and the italic tags are kept" );
console.log( strip_tags( _html, "b", "i" ) ); // keep <b> and <i>
console.log( "GOAL: only the italic tag is kept" );
console.log( strip_tags( _html, "i" ) ); // keep <i>
</SCRIPT>

score 0 · Accepted Answer

我调整了 Sandro Rosa 的函数，以解决 Nikita 提到的问题：

/**
 * Removes HTML tags from a string, optionally keeping some of them.
 *
 * Each extra argument names one tag to keep, bare ("b") or in tag form
 * ("<b>", "</b>"). A kept tag is preserved as an opening, closing or
 * self-closing tag, with or without attributes. Names are matched whole
 * and case-insensitively, so keeping "b" does not keep <br> or <bol>.
 *
 * @param {string|String} _html - markup to clean.
 * @param {...string} _keep - names of the tags to keep.
 * @returns {string} the cleaned markup, or "" when _html is not a string.
 */
function strip_tags( _html, ..._keep /*you can put each single tag per argument*/ )
{
    if ( typeof _html !== "string" && !( _html instanceof String ) ) return "";

    const _names = _keep
        .map( ( _tag ) => String( _tag ).replace( /[<>\/]/g, "" ).trim() )
        // Filter on the cleaned name so arguments like "<>" or " " add nothing.
        .filter( ( _name ) => _name.length > 0 )
        // Escape regex metacharacters so a name cannot alter the pattern.
        .map( ( _name ) => _name.replace( /[.*+?^${}()|[\]\\]/g, "\\$&" ) );

    if ( _names.length === 0 ) return _html.replace( /<\s*\/?[^>]+>/g, "" );

    // The inner (?=[\s\/>]) ends the name at whitespace, "/" or ">" instead of
    // demanding ">" right away, so kept tags may carry attributes.
    const _re = new RegExp( `<(?!\\s*\\/?(?:${ _names.join( "|" ) })(?=[\\s\\/>]))[^>]*>`, "gi" );
    return _html.replace( _re, "" );
}

// Demo: log the sample markup, then the strip_tags result for several keep-lists.
var _html = "<b>Just</b> some <i>tags</i> and text to test <u>this code</u>" ;
{
    // Logs a caption, then the current _html with only the listed tags kept.
    const _report = ( _caption, ..._keep ) => {
        console.log( _caption );
        console.log( strip_tags( _html, ..._keep ) );
    };

    console.log( "This is the original html code including some tags" );
    console.log( _html ); // original html code
    _report( "Now we remove all tags (plain text)" );
    _report( "Only the bold tag is kept", "b" );
    _report( "Only the underline tag is kept", "u" );
    _report( "Only the italic tag is kept", "<i>" );
    _report( "Keeping both italic and underline", "i", "u" );

    // Second sample: keeping "b" must keep just <b>, not <br> or <bol>.
    _html = "this is my <b>string</b> and it's pretty cool<br />isn't it?<br>Yep, it is.<strong>More HTML tags</strong><span></span><bol>" ;
    _report( "Keeping just the bold tag", "b" );
}

javascript - 使用 javascript 删除除 <br> 或 <br/> 标签之外的所有 html 标签

8 回答 8

Related

Reference

javascript - 使用 javascript 删除除 <br> 或 <br/> 标签之外的所有 html 标签