让我们创建一个示例表以供讨论
-- Sample data: one row per case worth checking (real tags, stray angle
-- brackets, an empty string, NULL, and repeated '<' characters).
create table #tmp
(
    id       int identity primary key,
    sometext varchar(max)
);

-- id comes from the identity column, so only sometext is supplied.
insert into #tmp (sometext) values ('This <has some> HTML in it <and some more> yeah>');
insert into #tmp (sometext) values ('No HTML here');
insert into #tmp (sometext) values ('This is > than that');
insert into #tmp (sometext) values ('And This is < than that');
insert into #tmp (sometext) values ('');
insert into #tmp (sometext) values ('I have many blanks. Don''t lose them <y>');
insert into #tmp (sometext) values (null);
insert into #tmp (sometext) values ('<b>This is bold</b> and <i>this is in italics</i>');
insert into #tmp (sometext) values ('I <<<<stttuttter> a lot <>');
GO
接下来,如果您的数据库中还没有 Numbers(又名 Tally)表,请创建一个。
-- Numbers (a.k.a. Tally) helper table; the identity column supplies the values.
create table Numbers
(
    number int identity primary key
);
GO
-- Each run of this batch adds one row; "GO 1000" repeats the batch 1000 times.
insert into Numbers default values;
GO 1000
最好使用多语句 T-SQL 来解决这个问题,这样可以利用索引并物化(materialize)中间结果。第一步是收集所有没有被 < 和 > 标签包围的字符。
-- #tmp2 holds one row per character that survives tag removal:
--   id = row in #tmp, number = 1-based position in sometext, char = the character.
create table #tmp2 (id int, number int, primary key(id,number), char char(1));

insert into #tmp2 (id, number, char)
select u.id,
       n.number,
       substring(u.sometext, n.number, 1) as [char]
from #tmp as u
-- One row per character position. len() ignores trailing blanks, so the string
-- is measured with a sentinel appended; otherwise trailing spaces would be lost.
-- Numbers must hold at least as many rows as the longest string has characters.
join Numbers as n
    on n.number <= len(u.sometext + 'x') - 1
-- Each '<' position (lt) paired with the position of the next '>' (gt).
-- charindex returns 0 when no '>' follows, which makes the range below empty,
-- so an unclosed '<' excludes nothing.
left join
(
    select t.id,
           p.number as lt,
           charindex('>', t.sometext, p.number + 1) as gt
    from #tmp as t
    join Numbers as p
        on substring(t.sometext, p.number, 1) = '<'
) as exclusions
    on  u.id = exclusions.id
    and n.number between exclusions.lt and exclusions.gt
-- Anti-join: keep only the positions that fall inside no <...> range.
where exclusions.id is null;
GO
下一步只需使用 FOR XML 把剩余的字符重新拼接起来。
-- Rebuild every tagged string from its surviving characters stored in #tmp2.
update #tmp
set sometext = isnull(
        (
            -- "+ ''" leaves the expression unnamed, so FOR XML PATH('') emits
            -- bare text instead of wrapping each character in an element.
            select b.[char] + ''
            from #tmp2 as b
            where b.id = #tmp.id
            order by b.number
            -- TYPE + .value() returns the text with XML entities decoded
            -- (a kept '<' or '>' would otherwise come back as &lt; / &gt;).
            for xml path(''), type
        ).value('/', 'nvarchar(max)'),
        -- A string made up entirely of tags has no rows in #tmp2, which makes
        -- the subquery NULL; store an empty string rather than NULL.
        -- isnull is used instead of coalesce so the subquery is evaluated once.
        ''
    )
-- Only rows containing a '<' followed later by a '>' are rewritten.
where sometext like '%<%>%';
再次检查我们的表:
-- Show the cleaned rows; order by id so the output order is deterministic.
select id, sometext
from #tmp
order by id;
id sometext
----------- --------------------------------------------------
1 This HTML in it yeah>
2 No HTML here
3 This is > than that
4 And This is < than that
5
6 I have many blanks. Don't lose them
7 NULL
8 This is bold and this is in italics
9 I a lot
对于小表,像上面那样在单个查询中完成排除就足够了。对于较大的表,CTE 方法似乎更高效,因为它只遍历数据一次。创建上面的示例表之后,可以运行以下语句把它扩大:
-- Double the row count 11 times (2^11 = 2048x the rows, about 20K records).
insert into #tmp (sometext)
select sometext
from #tmp;
GO 11
-- Double every string 3 times, i.e. 8x the original length.
-- Intentionally no WHERE clause: every row is expanded.
update #tmp
set sometext = sometext + sometext;
GO 3
然后使用下面的递归 CTE 来填充 #tmp2 表:
-- Walk each tagged string one character per recursion level, carrying a flag
-- that tells whether the current character lies inside a <...> section, and
-- store the characters that lie outside every section in #tmp2.
;with cte(id, pos, sometext, size, char, flag) as (
    -- Anchor: first character of every string containing a '<' ... '>' pair.
    -- That filter guarantees a '>' after position 1, so a leading '<' always
    -- opens a real section.
    select id,
           1,
           sometext,
           len(sometext + 'x') - 1,  -- len() ignores trailing blanks; the sentinel keeps them counted
           substring(sometext, 1, 1),
           case when substring(sometext, 1, 1) = '<' then 1 else 0 end
    from #tmp
    where sometext like '%<%>%'
    union all
    -- Step: move to the next character and recompute the flag.
    select id,
           pos + 1,
           sometext,
           size,
           substring(sometext, pos + 1, 1),
           case
               -- '<' opens a section only if a '>' follows somewhere; an
               -- unclosed '<' is ordinary text and must not swallow the rest.
               when substring(sometext, pos + 1, 1) = '<'
                    and charindex('>', sometext, pos + 2) > 0 then 1
               when char = '>' then 0   -- previous character closed the section
               else flag                -- otherwise keep the current state
           end
    from cte
    where pos < size                    -- reuse the length computed in the anchor
)
insert into #tmp2 (id, number, char)
select id, pos, char
from cte
where flag = 0
-- Recursion depth is capped at 500, so strings longer than about 500
-- characters make the statement fail; raise the limit for longer text.
option (maxrecursion 500);
GO
得到的耗时如下:
- 创建 #tmp2 需要 62 秒
- 使用 #tmp2 更新 #tmp 需要 1 秒
该算法具有线性复杂度,因此如果您的字符串较短,或者记录较少,按比例估算即可大致了解该过程需要多长时间。