sql - 从 SQL Server 列中删除 HTML

Question

我们曾经使用 WYSIWYG 编辑器，并且在 SQL Server 2008 R2 表中有四个文本类型列。我想删除所有的 HTML 标签。这在 ASP（经典 ASP）页面中使用时有效：

' Returns strText with every HTML tag removed.
' A tag is anything that starts with "<" and runs up to the next ">".
Function RemoveHTML(strText)
  Dim tagMatcher
  Set tagMatcher = New RegExp

  With tagMatcher
    .Global = True          ' replace every occurrence, not just the first
    .Pattern = "<[^>]*>"    ' "<", any run of characters other than ">", then ">"
  End With

  RemoveHTML = tagMatcher.Replace(strText, "")
End Function

但是我想要一个不同的解决方案，也许是由 SQL 驱动的，非常希望能得到一些帮助。正则表达式可以像这样用在 SQL 语句中吗？任何帮助或建议都非常感谢。我想我还是希望保留换行标签（<br>），其余的标签都可以删除。

TYIA

score 3 · Accepted Answer

让我们创建一个示例表以供讨论

-- Scratch table with the sample strings used throughout this walkthrough.
create table #tmp (
    id       int identity primary key,
    sometext varchar(max)
);

-- One row per case: real tags, a stray '>' or '<', an empty string,
-- runs of blanks, NULL, paired tags, and repeated/empty brackets.
insert into #tmp (sometext) values ('This <has some> HTML in it <and some more> yeah>');
insert into #tmp (sometext) values ('No HTML here');
insert into #tmp (sometext) values ('This is > than that');
insert into #tmp (sometext) values ('And This is < than that');
insert into #tmp (sometext) values ('');
insert into #tmp (sometext) values ('I have many        blanks. Don''t lose them <y>');
insert into #tmp (sometext) values (null);
insert into #tmp (sometext) values ('<b>This is bold</b> and <i>this is in italics</i>');
insert into #tmp (sometext) values ('I <<<<stttuttter> a lot <>');
GO

接下来，如果您的数据库中还没有 Numbers（又名 Tally）表，请创建一个。

-- Tally table: one row per integer 1..1000, generated by the identity column.
-- The tag-stripping query joins on number <= string length, so characters at
-- positions beyond the largest number here are never seen; size the table to
-- at least the length of the longest string you need to process.
create table Numbers (number int identity primary key);
GO
declare @inserted int = 0;
while @inserted < 1000
begin
    insert into Numbers default values;
    set @inserted = @inserted + 1;
end
GO

最好使用多语句 T-SQL 来解决这个问题，这样可以利用索引并物化中间结果。第一步是收集所有没有被 < 和 > 括起来的字符。

-- #tmp2 holds one row per character that survives tag stripping:
--   id     = the row in #tmp the character belongs to
--   number = 1-based position of the character inside sometext
--   char   = the character itself
create table #tmp2 (id int, number int, primary key(id,number), char char(1));

insert #tmp2 (id, number, char)
select u.id, N.number, substring(u.sometext, N.number, 1) char
from #tmp u
-- len() does not count trailing blanks, so measure the text with a sentinel
-- character appended; otherwise spaces at the end of a string would never be
-- copied into #tmp2 and would be lost when the text is rebuilt.
-- Numbers must contain at least as many rows as the longest string.
join Numbers N on N.number <= len(u.sometext + '.') - 1
left join
    (
        -- every '<' position (lt) paired with the position of the first '>'
        -- after it (gt); charindex returns 0 when no '>' follows, which makes
        -- the BETWEEN below match nothing, so an unclosed '<' excludes nothing
        select t.id, N.number lt, charindex('>', t.sometext, N.number+1) gt
        from #tmp t
        join Numbers N on substring(t.sometext, N.number, 1) = '<'
    ) exclusions on u.id = exclusions.id and N.number between exclusions.lt and exclusions.gt
-- anti-join: keep only the characters that fall inside no lt..gt span
where exclusions.id is null
GO

下一步只需使用 FOR XML 把剩下的字符重新拼接起来。

-- Rebuild each tagged row from its surviving characters in #tmp2.
-- char+'' leaves the column unnamed so FOR XML PATH('') emits bare text, and
-- TYPE + .value() turns XML entities (e.g. &gt;) back into plain characters.
-- A row that consists only of tags has no rows in #tmp2, so the subquery
-- yields NULL; isnull() turns that into an empty string instead of wiping the
-- column to NULL. isnull (rather than coalesce) is used so the subquery is not
-- expanded into a CASE and evaluated twice.
update #tmp
set sometext = isnull(
        (select char+''
         from #tmp2 b
         where b.id = #tmp.id
         order by b.number
         for xml path(''), type).value('/','nvarchar(max)'),
        '')
where sometext like '%<%>%'

再次检查我们的表

-- Show the cleaned-up text for every sample row.
select id, sometext
from #tmp

id          sometext
----------- --------------------------------------------------
1           This  HTML in it  yeah>
2           No HTML here
3           This is > than that
4           And This is < than that
5           
6           I have many        blanks. Don't lose them 
7           NULL
8           This is bold and this is in italics
9           I  a lot

对于小表，像上面那样在单个查询中完成排除就足够了。对于较大的表，CTE 方法似乎更高效，因为它只遍历数据一次。创建示例表之后，可以运行下面的语句来扩大表中的数据：

-- Grow the sample to roughly 20K rows: each of the 11 runs re-inserts every
-- existing row, doubling the row count.
insert into #tmp (sometext)
select sometext
from #tmp
GO 11

-- Make every string 8 times its original length (three successive doublings).
-- Deliberately unfiltered: every row is expanded.
update #tmp
set sometext = sometext + sometext
GO 3

然后使用下面的 CTE 查询来填充 #tmp2 表

-- Single-pass alternative for filling #tmp2: walk each string one character
-- per recursion level, carrying a flag that says "currently inside a tag".
--   pos  = 1-based position of the current character
--   size = string length, computed once in the anchor (sentinel appended
--          because len() does not count trailing blanks)
--   char = the current character
--   flag = 1 while inside <...>, 0 otherwise; only flag = 0 characters are kept
;with cte(id, pos, sometext, size, char, flag) as (
    -- anchor: first character of every row that contains a '<' followed by a '>'
    select id, 1, sometext, len(sometext + '.') - 1, substring(sometext, 1, 1),
        case when substring(sometext, 1, 1) = '<' then 1 else 0 end
    from #tmp
    where sometext like '%<%>%'
    union all
    select id, pos+1, sometext, size, substring(sometext, pos+1, 1),
        -- a '<' opens a tag only if a '>' follows somewhere later; an unclosed
        -- '<' is ordinary text and must not swallow the rest of the string
        case when substring(sometext, pos+1, 1) = '<'
                  and charindex('>', sometext, pos+2) > 0 then 1
             when char = '>' then 0 -- previous character closed the tag
             else flag end -- otherwise carry the flag forward
    from cte
    where pos < size
)
insert #tmp2 (id, number, char)
select id, pos, char
from cte
where flag = 0
-- Recursion depth equals the string length, so a fixed cap would abort on
-- longer strings; 0 removes the cap. Termination is guaranteed by pos < size.
option (maxrecursion 0)
GO

耗时如下：

62 秒创建#tmp2
使用 #tmp2 更新 #tmp 需要 1 秒

该算法具有线性复杂性，因此如果您的字符串较短，或者您的记录较少，只需进行近似缩放以了解该过程需要多长时间。

score 2 · Accepted Answer

我最终使用了从这里https://www.simple-talk.com/sql/t-sql-programming/tsql-regular-expression-workbench/获得的解决方案。它工作得非常好，这对我有用。在数据库中创建函数：

IF OBJECT_ID(N'dbo.RegexReplace') IS NOT NULL 
   DROP FUNCTION dbo.RegexReplace
GO
/*
  dbo.RegexReplace
  Replaces regular-expression matches in @Subject by driving a VBScript.RegExp
  COM object through the sp_OA* OLE Automation procedures (the
  'Ole Automation Procedures' server option must be enabled).

  Parameters:
    @pattern     - regular expression, VBScript.RegExp syntax
    @replacement - replacement text
    @Subject     - text to search
    @global      - 1 = replace every match, 0 = replace only the first match
    @Multiline   - value assigned to the RegExp MultiLine property

  Matching is always case-insensitive (IgnoreCase is set to 1).

  Returns the substituted text. On failure the function does NOT raise an
  error: it returns a message starting with 'Error whilst ' as its result, so
  callers that write the result back to a table should check for that prefix.

  NOTE: the result passes through a VARCHAR(8000) variable, so a result longer
  than 8000 characters cannot be returned intact.
*/
CREATE FUNCTION dbo.RegexReplace
(
  @pattern VARCHAR(255),
  @replacement VARCHAR(255),
  @Subject VARCHAR(MAX),
  @global BIT = 1,
  @Multiline BIT = 1
)
RETURNS VARCHAR(MAX)

AS BEGIN
DECLARE @objRegexExp INT,
    @objErrorObject INT,
    @strErrorMessage VARCHAR(255),
    @Substituted VARCHAR(8000),
    @hr INT

/* Each step runs only while @hr is still 0; @strErrorMessage always names the
   step being attempted so a failure can be reported in context. */
SELECT  @strErrorMessage = 'creating a regex object'
EXEC @hr= sp_OACreate 'VBScript.RegExp', @objRegexExp OUT
IF @hr = 0 
    SELECT  @strErrorMessage = 'Setting the Regex pattern',
            @objErrorObject = @objRegexExp
IF @hr = 0 
    EXEC @hr= sp_OASetProperty @objRegexExp, 'Pattern', @pattern
IF @hr = 0 /*By default, the regular expression is case sensitive. Set the IgnoreCase property to True to make it case insensitive.*/
    SELECT  @strErrorMessage = 'Specifying the type of match' 
IF @hr = 0 
    EXEC @hr= sp_OASetProperty @objRegexExp, 'IgnoreCase', 1
IF @hr = 0 
    EXEC @hr= sp_OASetProperty @objRegexExp, 'MultiLine', @Multiline
IF @hr = 0 
    EXEC @hr= sp_OASetProperty @objRegexExp, 'Global', @global
IF @hr = 0 
    SELECT  @strErrorMessage = 'Doing a Replacement' 
IF @hr = 0 
    EXEC @hr= sp_OAMethod @objRegexExp, 'Replace', @Substituted OUT,
        @Subject, @replacement
 /*If the RegExp.Global property is False (the default), Replace will return the @subject string with the first regex match (if any) substituted with the replacement text. If RegExp.Global is true, the @Subject string will be returned with all matches replaced.*/   
IF @hr <> 0 
    BEGIN
        DECLARE @Source VARCHAR(255),
            @Description VARCHAR(255),
            @Helpfile VARCHAR(255),
            @HelpID INT

        EXECUTE sp_OAGetErrorInfo @objErrorObject, @Source OUTPUT,
            @Description OUTPUT, @Helpfile OUTPUT, @HelpID OUTPUT
        SELECT  @strErrorMessage = 'Error whilst '
                + COALESCE(@strErrorMessage, 'doing something') + ', '
                + COALESCE(@Description, '')
        /* Release the COM object on the failure path too (after the error
           details have been read from it); otherwise every failed call leaves
           an object alive until the end of the batch. */
        IF @objRegexExp IS NOT NULL
            EXEC sp_OADestroy @objRegexExp
        RETURN @strErrorMessage
    END
EXEC sp_OADestroy @objRegexExp
RETURN @Substituted
END
Go

现在简单地运行这个：

--replace all the HTML in a field example
-- The pattern can only match text containing a '<' followed later by a '>',
-- so rows failing the LIKE test cannot change. Skipping them avoids a per-row
-- COM call and keeps untouched rows from being overwritten with the
-- 'Error whilst ...' text that dbo.RegexReplace returns when it fails.
update TableName
set fieldName = dbo.RegexReplace('<(?:[^>''"]*|([''"]).*?\1)*>', '', fieldName, 1, 1)
where fieldName like '%<%>%'

它工作得非常好

sql - 从 SQL Server 列中删除 HTML

2 回答 2

Related

Reference