0

我有这个函数:

CREATE FUNCTION [dbo].[udf_StripHTML]
(@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    -- Removes every '<' ... '>' span from @HTMLText and returns the remainder
    -- with leading and trailing spaces trimmed.
    --   @HTMLText : text that may contain markup.
    --   Returns   : the text with all complete tags removed.
    -- A '<' that has no '>' after it stops the scan; the rest of the text is
    -- returned unchanged.
    DECLARE @TagOpen INT, @TagClose INT, @TagSize INT

    WHILE 1 = 1
    BEGIN
        -- Locate the first remaining tag: its '<' and the first '>' at or after it.
        SET @TagOpen  = CHARINDEX('<', @HTMLText)
        SET @TagClose = CHARINDEX('>', @HTMLText, @TagOpen)
        SET @TagSize  = (@TagClose - @TagOpen) + 1

        IF @TagOpen > 0 AND @TagClose > 0 AND @TagSize > 0
            -- Cut the whole tag, delimiters included.
            SET @HTMLText = STUFF(@HTMLText, @TagOpen, @TagSize, '')
        ELSE
            -- No complete tag left (this branch is also taken for NULL input).
            BREAK
    END

    RETURN LTRIM(RTRIM(@HTMLText))
END

我需要去除除 <em> 和 <strong> 之外的所有标签。

谢谢

托马斯

4

3 回答 3

1

通过 CLR 函数进行 XSLT 转换(例如 MDS 程序集中提供的那个函数)。

更新

这是使用 XSLT 转换方法的答案:How to remove all tags except some using Nokogiri

更新 2

那么剩下的唯一选择就是使用正则表达式,同样需要通过 CLR 函数来实现。

去除除链接之外的所有 HTML 标签

MDS 组件的安装

在没有 MDS 的情况下部署 SQL 2008 R2 MDS 函数

于 2012-04-19T19:58:59.980 回答
0

编辑:以下是我实际用来解决问题的做法,它之所以可行,是因为我的 HTML 格式非常特定。对于我最初的问题,正确答案是上面我已采纳的丹尼斯的回答。

我使用的数据非常具体。

我需要删除的标签都是大写字母,即:

我想保留的是小写,即:

所以我可以用 PATINDEX 而不是 CHARINDEX 轻松做到这一点:

ALTER FUNCTION [dbo].[udf_StripHTMLlinks]
(@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX)
AS
BEGIN
    -- Removes tags whose name is written in UPPER CASE (e.g. <P>, </DIV>,
    -- <A HREF="...">, <H1>) and keeps lower-case tags (e.g. <em>, </strong>).
    --   @HTMLText : text that may contain markup.
    --   Returns   : the text without upper-case tags, with leading and trailing
    --               spaces trimmed; NULL when @HTMLText is NULL.
    -- The match is made case-sensitive by the explicit CS collation on the
    -- searched text. The letters are enumerated one by one so the character
    -- classes never rely on range ordering.
    DECLARE @TagPattern VARCHAR(100)
    DECLARE @Start INT
    DECLARE @End INT

    -- '<', then '/' or an upper-case letter, then '/', an upper-case letter,
    -- a digit, a space or '>'. The digits make heading tags such as <H1> match;
    -- without them </H1> was removed while its opening <H1> was left behind.
    SET @TagPattern = '%<[/ABCDEFGHIJKLMNOPQRSTUVWXYZ][/ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 >]%'

    SET @Start = PATINDEX(@TagPattern, @HTMLText COLLATE SQL_Latin1_General_CP1_CS_AS)
    SET @End = CHARINDEX('>', @HTMLText, @Start)

    -- @End is searched from @Start, so whenever both are positive the span
    -- @Start..@End is a non-empty tag.
    WHILE @Start > 0
    AND @End > 0
    BEGIN
        SET @HTMLText = STUFF(@HTMLText, @Start, (@End - @Start) + 1, '')
        SET @Start = PATINDEX(@TagPattern, @HTMLText COLLATE SQL_Latin1_General_CP1_CS_AS)
        SET @End = CHARINDEX('>', @HTMLText, @Start)
    END

    RETURN LTRIM(RTRIM(@HTMLText))
END

感谢您的输入。

于 2012-04-19T20:29:25.447 回答
0

您可以使用 SQL 函数,例如:

ALTER FUNCTION [dbo].[StripOutHTML]
(
    @HTMLText VARCHAR(max),
    @stripDisallowedOnly BIT
)
returns VARCHAR(max) 
AS 
  BEGIN 
      -- Converts an HTML fragment to plain text.
      --   @HTMLText            : text that may contain entities and markup.
      --   @stripDisallowedOnly : 1 = keep tags that start with <B, </B, <U, </U,
      --                          <I or </I and remove the rest;
      --                          any other value (including NULL) = remove every tag.
      --   Returns              : the converted text, or NULL when @HTMLText is NULL.
      -- Steps, in order: decode &amp; &lt; &gt; &nbsp;, turn <P>, </P> and the
      -- three <br> spellings into CR/LF, remove tags, trim leading whitespace.
      -- Entities are decoded before tags are removed, so escaped markup such as
      -- '&lt;x&gt;' becomes '<x>' and is then handled like any other tag.
      DECLARE @Start INT
      DECLARE @End INT
      DECLARE @Length INT
      DECLARE @NewLine VARCHAR(2)
      DECLARE @trimchars VARCHAR(10)
      DECLARE @FirstKept INT

      SET @NewLine = CHAR(13) + CHAR(10)

      -- Decode '&amp;' first and repeat until none is left, so double-encoded
      -- input ('&amp;amp;', '&amp;lt;') is fully unwrapped before the other
      -- entities are handled. Each pass shortens the text, so the loop ends.
      WHILE CHARINDEX('&amp;', @HTMLText) > 0
          SET @HTMLText = REPLACE(@HTMLText, '&amp;', '&')

      -- The remaining replacements cannot create new matches of their own
      -- search string, so one REPLACE pass each is enough.
      SET @HTMLText = REPLACE(@HTMLText, '&lt;', '<')
      SET @HTMLText = REPLACE(@HTMLText, '&gt;', '>')
      SET @HTMLText = REPLACE(@HTMLText, '&nbsp;', ' ')

      -- Replace any <P>, </P> tags with a <br>, so they become a new line in the next step
      SET @HTMLText = REPLACE(@HTMLText, '<P>', '<br>')
      SET @HTMLText = REPLACE(@HTMLText, '</P>', '<br>')

      -- Replace <br>, <br/> and <br /> with a real CR/LF pair
      SET @HTMLText = REPLACE(@HTMLText, '<br>', @NewLine)
      SET @HTMLText = REPLACE(@HTMLText, '<br/>', @NewLine)
      SET @HTMLText = REPLACE(@HTMLText, '<br />', @NewLine)

      -- Remove tags: @Start/@End delimit the tag currently being examined
      SET @Start = Charindex('<', @HTMLText)
      SET @End = Charindex('>', @HTMLText, Charindex('<', @HTMLText))
      SET @Length = ( @End - @Start ) + 1

      WHILE ( @Start > 0
              AND @End > 0
              AND @Length > 0 )
        BEGIN
            -- NOTE(review): these are prefix tests, so besides <b>, <u> and <i>
            -- they also keep any tag that merely starts with those letters
            -- (for example <body>, <ul>, <img>). Add further allowed tags here
            -- using the same form.
            IF @stripDisallowedOnly = 1
               AND NOT ( ( Upper(Substring(@HTMLText, @Start, 2)) <> '<B' )
                         AND ( Upper(Substring(@HTMLText, @Start, 3)) <> '</B' )
                         AND ( Upper(Substring(@HTMLText, @Start, 2)) <> '<U' )
                         AND ( Upper(Substring(@HTMLText, @Start, 3)) <> '</U' )
                         AND ( Upper(Substring(@HTMLText, @Start, 2)) <> '<I' )
                         AND ( Upper(Substring(@HTMLText, @Start, 3)) <> '</I' ) )
                -- Allowed tag: leave it in place. A zero length makes the next
                -- search resume at this tag's '>' instead of before its '<'.
                SET @Length = 0
            ELSE
                SET @HTMLText = Stuff(@HTMLText, @Start, @Length, '')

            -- After a removal @End - @Length is the position just before the
            -- removed tag; after a kept tag it is that tag's '>'.
            SET @Start = Charindex('<', @HTMLText, @End - @Length)
            SET @End = Charindex('>', @HTMLText, Charindex('<', @HTMLText, @Start))
            SET @Length = ( @End - @Start ) + 1
        END

      -- Remove leading tab / LF / CR / space characters
      SET @trimchars = CHAR(9)+CHAR(10)+CHAR(13)+CHAR(32)
      SET @FirstKept = PATINDEX('%[^' + @trimchars + ']%', @HTMLText)

      IF @FirstKept = 0
          -- Nothing but whitespace (or already empty): the result is empty
          SET @HTMLText = ''
      ELSE IF @FirstKept > 1
          SET @HTMLText = SUBSTRING(@HTMLText, @FirstKept, LEN(@HTMLText))

      RETURN Ltrim(Rtrim(@HTMLText))
  END

要保留 <em> 标签,您需要修改下面这几行,仿照 Upper(Substring(@HTMLText, @Start, 2)) <> '<B' 的写法为 em 添加相应的条件。希望这能为您指明正确的方向:

-- Excerpt of the tag-removal loop. @HTMLText, @stripDisallowedOnly, @Start,
-- @End and @Length are declared and initialised by the enclosing function.
WHILE ( @Start > 0 AND @End > 0 AND @Length > 0 )
BEGIN
    -- The tag at @Start..@End is kept only when @stripDisallowedOnly = 1 and
    -- it starts with one of the allowed prefixes; otherwise it is removed.
    -- To allow another tag, add a matching pair of comparisons to this list.
    IF @stripDisallowedOnly = 1
       AND NOT ( ( Upper(Substring(@HTMLText, @Start, 2)) <> '<B' )
                 AND ( Upper(Substring(@HTMLText, @Start, 3)) <> '</B' )
                 AND ( Upper(Substring(@HTMLText, @Start, 2)) <> '<U' )
                 AND ( Upper(Substring(@HTMLText, @Start, 3)) <> '</U' )
                 AND ( Upper(Substring(@HTMLText, @Start, 2)) <> '<I' )
                 AND ( Upper(Substring(@HTMLText, @Start, 3)) <> '</I' ) )
        -- Allowed tag: keep it. A zero length makes the next search resume at
        -- this tag's '>' rather than before its '<'.
        SET @Length = 0
    ELSE
        SET @HTMLText = Stuff(@HTMLText, @Start, @Length, '')

    -- Move on to the next tag.
    SET @Start = Charindex('<', @HTMLText, @End - @Length)
    SET @End = Charindex('>', @HTMLText, Charindex('<', @HTMLText, @Start))
    SET @Length = ( @End - @Start ) + 1
END
于 2014-06-30T16:54:25.727 回答