sql - 用 join 替换 varchar(max) 字段中的值

Question

我有一个表，其中包含带有占位符的文本字段。像这样的东西：

Row Notes  
1.  This is some notes ##placeholder130## this ##myPlaceholder##, #oneMore#. End.
2.  Second row...just a ##test#.

（此表平均包含大约 1-5k 行。一行中占位符的平均数量为 5-15）。

现在，我有一个如下所示的查找表：

Name             Value
placeholder130    Dog
myPlaceholder     Cat
oneMore           Cow
test              Horse

（查找表将包含 10k 到 100k 条记录）

我需要找到将这些占位符从字符串连接到查找表并替换为值的最快方法。所以，我的结果应该是这样的（第一行）：

This is some notes Dog this Cat, Cow. End.

我目前想到的办法是：按占位符把每一行拆分成多行，再与查找表做连接（join），最后把这些记录用新值重新拼接回原来的行。但这样平均需要大约 10-30 秒。

score 9 · Accepted Answer

您可以尝试使用数字表（Numbers 表）拆分字符串，然后用 for xml path 把各个片段重新拼接起来。

-- For each note: cut the text at every '##' with a numbers table, swap each
-- piece that equals a Lookup name for its value, then glue the pieces back
-- together in their original order with FOR XML PATH.
SELECT (
       SELECT COALESCE(lk.Value, piece.Value)
       FROM Numbers AS num
         -- piece = the text running from this position up to the next '##'
         -- (a '##' is appended so the last piece also finds a terminator)
         CROSS APPLY (
             SELECT SUBSTRING(
                        Notes.notes,
                        num.Number,
                        CHARINDEX('##', Notes.notes + '##', num.Number) - num.Number
                    )
         ) AS piece(Value)
         LEFT JOIN Lookup AS lk
           ON lk.Name = piece.Value
       -- keep only positions that start a piece: position 1 and every
       -- position immediately after a '##' (hence the '##' prefix)
       WHERE SUBSTRING('##' + Notes.notes, num.Number, 2) = '##'
         AND num.Number <= LEN(Notes.notes)
       ORDER BY num.Number
       FOR XML PATH(''), TYPE
       ).value('text()[1]', 'varchar(max)')
FROM Notes

SQL Fiddle

我从Aaron Bertrand 的这篇博文中借用了字符串拆分

score 6 · Accepted Answer

SQL Server 在字符串操作方面不是很快，所以这可能最好在客户端完成。让客户端加载整个查找表，并在笔记到达时替换它们。

话虽如此，它当然可以在 SQL 中完成。这是一个递归 CTE 的解决方案。每个递归步骤执行一次查找：

-- Apply the lookup replacements one at a time: number the lookup rows, then
-- let each recursion level apply the replacement whose number matches that level.
;WITH NumberedLookup AS
(
    SELECT
        ROW_NUMBER() OVER (ORDER BY lk.name) AS rn,
        lk.Name,
        lk.Value
    FROM Lookup AS lk
),
Recurse AS
(
    -- level 0: the untouched notes
    SELECT
        Notes,
        0 AS rn
    FROM Notes

    UNION ALL

    -- level n: the previous level's text with lookup row n applied
    SELECT
        REPLACE(prev.Notes, '##' + nl.name + '##', nl.value),
        prev.rn + 1
    FROM Recurse AS prev
    INNER JOIN NumberedLookup AS nl
        ON nl.rn = prev.rn + 1
)
-- only the deepest level has had every replacement applied
SELECT
    Notes,
    rn
FROM Recurse
WHERE rn = (SELECT COUNT(*) FROM Lookup)
-- one recursion level per lookup row, so the default limit must be lifted
OPTION (MAXRECURSION 0)

SQL Fiddle 的示例。

另一种选择是while循环不断替换查找，直到找不到更多：

-- Work on a copy of the notes so the source table is left untouched.
declare @notes table (notes varchar(max))

insert  @notes (notes)
select  Notes
from    Notes

-- Each pass replaces one placeholder per row that still contains a known
-- placeholder; the loop ends when a pass updates no rows.
while 1=1
    begin

    update  n
    set     Notes = replace(n.Notes, '##' + l.name + '##', l.value)
    from    @notes n
    cross apply
            (
            select  top 1 Name
            ,       Value
            from    Lookup l
            -- charindex matches the placeholder literally, exactly as replace
            -- does. LIKE would treat %, _ and [ inside a name as wildcards, so
            -- a row could keep matching without replace ever changing it and
            -- the loop would never terminate.
            where   charindex('##' + l.name + '##', n.Notes) > 0
            ) l

    if @@rowcount = 0
        break
    end

select  notes
from    @notes

SQL Fiddle 的示例。

score 4 · Accepted Answer

尝试这个

-- Recursively replace placeholders: every level applies one matching lookup
-- to the current text. The final result for a note is the row at its deepest level.
-- NOTE: a note containing several placeholders is expanded along every possible
-- replacement order; DISTINCT collapses the identical end results.
;WITH CTE (org, calc, [Notes], [level]) AS
(
    -- anchor: calc and [Notes] are both converted so their types match the
    -- recursive member even when the source column is not varchar(max)
    SELECT [Notes], CONVERT(varchar(MAX),[Notes]), CONVERT(varchar(MAX),[Notes]), 0 FROM PlaceholderTable

    UNION ALL

    SELECT  CTE.org, CTE.[Notes],
        CONVERT(varchar(MAX), REPLACE(CTE.[Notes],'##' + T.[Name] + '##', T.[Value])), CTE.[level] + 1
    FROM    CTE
    -- CHARINDEX matches the placeholder literally, like REPLACE does. LIKE would
    -- treat %, _ and [ inside a name as wildcards, so a row could match without
    -- REPLACE changing it and recurse until the recursion limit is hit.
    INNER JOIN LookupTable T ON CHARINDEX('##' + T.[Name] + '##', CTE.[Notes]) > 0

)

SELECT DISTINCT org, [Notes], [level] FROM CTE
WHERE [level] = (SELECT MAX(c.[level]) FROM CTE c WHERE CTE.org = c.org)

SQL Fiddle 演示

检查下面的 devioblog 帖子以供参考

devioblog 帖子

score 4 · Accepted Answer

我赞同 tsql 不适合此操作的评论，但如果您必须在数据库中执行此操作，这里是一个使用函数来管理多个替换语句的示例。

由于每条笔记中的标记数量相对较少（5-15 个），而查找表中的标记数量非常多（10k-100k），因此我的函数首先从输入中提取出候选标记，再用这个集合去连接您的查找表（下面的 dbo.[Lookup]）。如果反过来在每条笔记中逐一查找所有标记是否出现，工作量就太大了。

我使用 50k 个标记和 5k 条笔记做了一些性能测试，这个函数运行得非常好，在 2 秒以内完成（在我的笔记本电脑上）。请反馈一下这个策略在您那里的效果。

注意：在您的示例数据中，标记的格式不一致（##_#、##_##、#_#），我猜这只是笔误，并假设所有标记都采用 ##TokenName## 的形式。

--setup: start from a clean slate, then create and fill the lookup table
    if object_id('dbo.[Lookup]') is not null
        drop table dbo.[Lookup];
    go
    if object_id('dbo.fn_ReplaceLookups') is not null
        drop function dbo.fn_ReplaceLookups;
    go

    -- names are stored together with their ## delimiters
    create table dbo.[Lookup]
    (
        LookupName  varchar(100) primary key,
        LookupValue varchar(100)
    );
    insert into dbo.[Lookup] (LookupName, LookupValue)
    values  ('##placeholder130##', 'Dog'),
            ('##myPlaceholder##',  'Cat'),
            ('##oneMore##',        'Cow'),
            ('##test##',           'Horse');
    go

    -- fn_ReplaceLookups: returns @input with every ##Token## that exists in
    -- dbo.[Lookup] replaced by its LookupValue.
    --   @input  : text that may contain ##Token## placeholders
    --   returns : the substituted text; NULL when @input is NULL
    -- Strategy: pull the candidate tokens out of @input first and join only
    -- those to dbo.[Lookup], instead of probing @input for every lookup row.
    create function [dbo].[fn_ReplaceLookups](@input varchar(max))
    returns varchar(max)
    as
    begin

        -- No delimiter means nothing to replace (also covers NULL input).
        if @input is null or charindex('##', @input) = 0
            return @input;

        -- Turn "a ##x## b" into <r><i>a </i><i>x</i><i> b</i></r>.
        -- &, < and > are escaped first so ordinary text containing them does
        -- not break the cast to xml.
        -- NOTE(review): characters that are not legal in XML (most control
        -- characters) are presumably still rejected by the cast - confirm
        -- against real data.
        declare @xml xml;
        select @xml = cast(('<r><i>'
                            + replace(
                                  replace(replace(replace(@input, '&', '&amp;'), '<', '&lt;'), '>', '&gt;'),
                                  '##', '</i><i>')
                            + '</i></r>') as xml);

        --extract the potential tokens
        declare @LookupsInString table (LookupName varchar(100) primary key);
        insert into @LookupsInString (LookupName)
            select  distinct cast('##' + d.v + '##' as varchar(100))
            from    (   select  [v] = r.n.value('(./text())[1]', 'varchar(max)'),
                                [r] = row_number() over (order by n)
                        from    @xml.nodes('r/i') r(n)
                    )d(v,r)
            -- every second piece sits between two ## delimiters
            where   d.r%2=0
            -- skip empty pieces (NULL key) and pieces that cannot fit the
            -- varchar(100) key once wrapped in ##..## (e.g. the tail after an
            -- unbalanced ##); neither can match a lookup row and either would
            -- make the insert fail
              and   datalength(d.v) between 1 and 96;

        --tokenize the input
        -- NOTE(review): this relies on the assignment being applied once per
        -- joined row; confirm this multi-row variable assignment is acceptable
        -- for your SQL Server version.
        select  @input = replace(@input, l.LookupName, l.LookupValue)
        from    dbo.[Lookup] l
        join    @LookupsInString lis on
                l.LookupName = lis.LookupName;

        return @input;
    end
    go

--usage: run the function over two sample notes
    declare @Notes table ([Id] int primary key, notes varchar(100));
    insert into @Notes ([Id], notes)
    values  (1, 'This is some notes ##placeholder130## this ##myPlaceholder##, ##oneMore##. End.'),
            (2, 'Second row...just a ##test##.');

    select  n.[Id],
            n.notes,
            dbo.fn_ReplaceLookups(n.notes)
    from    @Notes as n;

返回结果：

Tokenized
--------------------------------------------------------
This is some notes Dog this Cat, Cow. End.
Second row...just a Horse.

score 1 · Accepted Answer

为了加快速度，您可以将笔记模板预处理为更有效的形式。这将是一个片段序列，每个片段都以替换结尾。最后一个片段的替换可能为 NULL。

Notes
Id     FragSeq    Text                    SubsId
1      1          'This is some notes '   1
1      2          ' this '                2
1      3          ', '                    3
1      4          '. End.'                null
2      1          'Second row...just a '  4
2      2          '.'                     null

Subs
Id  Name               Value
1   'placeholder130'   'Dog'
2   'myPlaceholder'    'Cat'
3   'oneMore'          'Cow'
4   'test'             'Horse'

现在我们可以通过简单的连接来进行替换。

-- One row per fragment of the requested note, in fragment order:
-- the fragment text followed by its substitution ('' when it has none).
SELECT n.Text + COALESCE(s.Value, '')
FROM Notes AS n
LEFT JOIN Subs AS s
    ON n.SubsId = s.Id
WHERE n.Id = ?
ORDER BY n.FragSeq

这将生成一个已完成替换的片段列表。我不是 MSSQL 用户，但在大多数 SQL 方言中，您可以很容易地把这些片段拼接到一个变量中：

-- Concatenate the substituted fragments of one note into @Note.
-- VARCHAR(MAX) so that long notes are not silently cut off at 8000 characters.
-- NOTE(review): this depends on the rows being appended in ORDER BY order;
-- confirm your SQL Server version honours that for variable concatenation.
DECLARE @Note VARCHAR(MAX)
SELECT @Note = COALESCE(@Note, '') + Notes.Text + COALESCE(Subs.Value, '')
FROM Notes LEFT JOIN Subs
ON Notes.SubsId = Subs.Id WHERE Notes.Id = ?
ORDER BY Notes.FragSeq

使用其他帖子的字符串拆分技术将笔记模板预处理为片段将很简单。

不幸的是，我不在可以测试它的位置，但它应该可以正常工作。

score 0 · Accepted Answer

我真的不知道在 10k 以上的查找记录下它的表现会如何。不妨试试老办法——动态 SQL，看看它的性能怎么样？

-- Build one expression that nests a REPLACE call per lookup row around
-- PlaceholderTable.[Notes], then run it as dynamic SQL.
DECLARE @sqlCommand  NVARCHAR(MAX)
SELECT @sqlCommand  = N'PlaceholderTable.[Notes]'

-- Single quotes inside a name or value are doubled so they stay inside the
-- generated string literal instead of ending it (syntax error / SQL injection).
-- Rows with a NULL name or value are skipped: concatenating NULL would turn
-- the whole command into NULL.
SELECT @sqlCommand  = N'REPLACE( ' + @sqlCommand  +
                      N', ''##' + REPLACE(LookupTable.[Name], '''', '''''') + N'##'', ''' +
                      REPLACE(LookupTable.[Value], '''', '''''') + N''')'
FROM LookupTable
WHERE LookupTable.[Name] IS NOT NULL
  AND LookupTable.[Value] IS NOT NULL

SELECT @sqlCommand  = N'SELECT *, ' + @sqlCommand  + N' FROM PlaceholderTable'

EXECUTE sp_executesql @sqlCommand

SQL Fiddle 演示

score 0 · Accepted Answer

现在进行一些递归 CTE。

如果您的索引设置正确，这个查询要么非常快，要么非常慢。说到 r-CTE（递归 CTE），SQL Server 的性能总是走向极端，让我感到惊讶......

-- Walks each note from left to right, one '##'-delimited piece per recursion
-- level. Every piece is looked up in LookupTable by name; the looked-up value
-- (or the piece itself when there is no match) is appended to Result.
-- The row with the highest StartIdx per source row holds the finished text.
;WITH T AS (
  -- anchor: the text in front of the first '##'
  -- NOTE(review): when Notes contains no '##', patindex returns 0 and the
  -- length passed to substring below is -1 - confirm such rows cannot occur.
  SELECT
    Row,
    StartIdx = 1,                                  -- 1 as first starting index
    EndIdx = CAST(patindex('%##%', Notes) as int), -- first ending index
    Result = substring(Notes, 1, patindex('%##%', Notes) - 1)
                                                   -- (first) temp result bounded by indexes
  FROM PlaceholderTable -- **this is your source table**
  UNION ALL
  -- recursive step: take the next piece after the previous '##' and append it
  SELECT
    pt.Row,
    StartIdx = newstartidx,                        -- starting index (calculated in calc1)
    EndIdx = EndIdx + CAST(newendidx as int) + 1,  -- ending index (calculated in calc4 + total offset)
    Result = Result + CAST(ISNULL(newtokensub, newtoken) as nvarchar(max))
                                                   -- temp result taken from subquery or original
  FROM 
    T
    JOIN PlaceholderTable pt -- **this is your source table**
      ON pt.Row = T.Row
    -- the calcN applies below only name intermediate values so that later
    -- steps can reuse them
    CROSS APPLY(
      SELECT newstartidx = EndIdx + 2              -- new starting index moved by 2 from last end ('##')
    ) calc1
    CROSS APPLY(
      SELECT newtxt = substring(pt.Notes, newstartidx, len(pt.Notes))
                                                   -- current piece of txt we work on
    ) calc2
    CROSS APPLY(
      SELECT patidx = patindex('%##%', newtxt)     -- current index of '##'
    ) calc3
    CROSS APPLY(
      SELECT newendidx = CASE 
        WHEN patidx = 0 THEN len(newtxt) + 1
        ELSE patidx END                            -- if last piece of txt, end with its length
    ) calc4
    CROSS APPLY(
      SELECT newtoken = substring(pt.Notes, newstartidx, newendidx - 1)
                                                   -- get the new token
    ) calc5
    OUTER APPLY(
      SELECT newtokensub = Value
      FROM LookupTable
      WHERE Name = newtoken                        -- substitute the token if you can find it in **your lookup table**
    ) calc6
  WHERE newstartidx + len(newtxt) - 1  <= len(pt.Notes)  
                                                   -- keep recursing while {new starting index} + {length of txt we work on} - 1 does not exceed the total length
) 
,lastProcessed AS (
  SELECT 
    Row, 
    Result,
    rn = row_number() over(partition by Row order by StartIdx desc)
  FROM T 
)                                                  -- enumerate all (including intermediate) results
SELECT *
FROM lastProcessed
WHERE rn = 1                                       -- filter out intermediate results (display only last ones)

sql - 用 join 替换 varchar(max) 字段中的值

7 回答 7

Related

Reference