是否可以使用全文搜索在 SQL Server 中查找 1.1.1 或 1.5.2(多级段落编号)这样的字符串?我的 SQL 如下所示:
-- Full-text CONTAINS predicate from the question: searches MyTable.MyColumn for the quoted term *5.1.1*.
contains (MyTable.MyColumn,'"*5.1.1*"')
我已经尝试从停用词列表中删除数字或完全禁用停用词列表。结果,像 5.1 或 1.1 这样的字符串工作正常(可能在内部这些字符串被处理为数字?),但对于带有 2 个点的数字,仍然没有结果。
有没有办法对这些点号/数字进行转义,或者有其他解决方案?
是否可以使用全文搜索在 SQL Server 中查找 1.1.1 或 1.5.2(多级段落编号)这样的字符串?我的 SQL 如下所示:
-- Full-text CONTAINS predicate from the question: searches MyTable.MyColumn for the quoted term *5.1.1*.
contains (MyTable.MyColumn,'"*5.1.1*"')
我已经尝试从停用词列表中删除数字或完全禁用停用词列表。结果,像 5.1 或 1.1 这样的字符串工作正常(可能在内部这些字符串被处理为数字?),但对于带有 2 个点的数字,仍然没有结果。
有没有办法对这些点号/数字进行转义,或者有其他解决方案?
句点在全文搜索中是个问题,因为它们通常被视为单词之间的硬分隔符(句号)。最简单的解决方案是用另一个字符替换句点,这样只需对应用程序做最小的改动。下面是一个相当长的脚本,它将引导您一步步识别问题并找到解决方案。如果您只想要解决办法,可以直接跳到“简答”版本。
设置全文架构
-- Session options in effect while the demo objects are created.
SET ANSI_NULLS ON;
SET QUOTED_IDENTIFIER ON;
SET ANSI_PADDING ON;

-- Demo table: an identity key plus one free-text column that will be full-text indexed.
CREATE TABLE [dbo].[FT_Test]
(
    [id]       [int] IDENTITY(1,1) NOT NULL,
    [TextData] [varchar](max)      NOT NULL,
    CONSTRAINT [PK_FT_Test] PRIMARY KEY CLUSTERED ([id] ASC) ON [PRIMARY]
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];
GO

-- Accent-sensitive catalog that holds the full-text index.
CREATE FULLTEXT CATALOG [ft_default]
    WITH ACCENT_SENSITIVITY = ON;

-- Create the index keyed on the primary key with automatic change tracking,
-- then add the text column to it and enable the index.
CREATE FULLTEXT INDEX ON [dbo].[FT_Test]
    KEY INDEX [PK_FT_Test]
    ON ([ft_default])
    WITH (CHANGE_TRACKING AUTO);

ALTER FULLTEXT INDEX ON [dbo].[FT_Test]
    ADD ([TextData]);

ALTER FULLTEXT INDEX ON [dbo].[FT_Test]
    ENABLE;
验证 Sql Server 版本
此脚本是围绕 Sql Server 2012 设计的,但应该也适用于 2008。分词器在 Sql 2008 和 Sql 2012 之间发生了很大变化(至少对于语言 id 1033 - 美国英语是这样)。主要影响是 1-2-3 会被分解为 1、2、3、1-2-3、nn1、nn2、nn3(其中保留完整的 1-2-3 这一项是新行为)。
GO
-- Print the version string that the script associates with Sql Server 2012.
PRINT 'Version 14.0.4763.1000 is Sql Server 2012';

-- List the word breaker component registered for language id 1033 so its version can be compared.
EXEC master.sys.sp_help_fulltext_system_components
    @component_type = 'wordbreaker',
    @param = 1033;
Sql Server 半智能解析关键字
不幸的是,这目前对我们不利。索引会变得臃肿,因为相同的数据被存储了多次,而且搜索结果也很糟糕。
GO
-- Start from an empty table (the unfiltered DELETE is intentional) and load one row of dotted/numeric samples.
DELETE FROM ft_test;

INSERT INTO dbo.FT_Test (TextData)
VALUES ('1.1.1 5.2.1, 7.1.1.34.69; 12.11.10.9.8 4.6 7/13/2013 15,456.345');

-- Wait 5 seconds for ft index to populate
WAITFOR DELAY '00:00:05';

-- Show every keyword the full-text index stored, next to the row it came from.
SELECT
    t.*,
    kw.display_term,
    kw.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS kw
INNER JOIN dbo.FT_Test AS t
    ON kw.document_id = t.id
ORDER BY t.id, kw.keyword;

-- Observations: the two-digit numbers are indexed, but the single-digit ones are not (default stoplist).
-- The dotted values are broken up and stored as distinct terms; 4.6 does appear because it is a decimal number.
-- The nn* display_terms are standardized numeric forms; the date 7/13/2013 is likewise stored in a
-- standardized form (dd20130713) in addition to the literal 7/13/2013.

SELECT *
FROM ft_test
WHERE CONTAINS(*, '"5.2*"');    -- No results, 5 and 2 are in default stopword list.

SELECT *
FROM ft_test
WHERE CONTAINS(*, '"12.11*"');  -- periods are hard breaks, so this doesn't work either
创建自定义停用词列表以索引单个数字
在全文搜索中,单个数字通常没有什么价值,但我们需要它们。我们将以默认的系统停用词列表为基础。
-- Custom stoplist: a copy of the system stoplist, owned by dbo.
CREATE FULLTEXT STOPLIST [no_numbers]
    FROM SYSTEM STOPLIST
    AUTHORIZATION [dbo];
GO
-- Drop the English stopwords '0' through '9' so that single digits get indexed.
-- ALTER FULLTEXT STOPLIST takes literals only, so each statement is built and executed dynamically.
DECLARE @digit int = 0;
DECLARE @drop_stmt nvarchar(200);

WHILE @digit <= 9
BEGIN
    SET @drop_stmt = N'ALTER FULLTEXT STOPLIST [no_numbers] DROP '''
                   + CAST(@digit AS nvarchar(1))
                   + N''' LANGUAGE ''English'';';
    EXEC sys.sp_executesql @drop_stmt;
    SET @digit += 1;
END;
GO
根据新的停用词列表重新创建全文索引
这有一些帮助,让我们离目标更近了一步。
-- Rebuild the full-text index so that it uses the custom [no_numbers] stoplist.
DROP FULLTEXT INDEX ON dbo.FT_Test;

CREATE FULLTEXT INDEX ON [dbo].[FT_Test] (TextData)
    KEY INDEX [PK_FT_Test]
    ON ([ft_default])
    WITH (CHANGE_TRACKING AUTO, STOPLIST = [no_numbers]);

-- Wait 5 seconds for ft index to populate
WAITFOR DELAY '00:00:05';

-- Re-inspect the stored keywords per row.
SELECT
    t.*,
    kw.display_term,
    kw.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS kw
INNER JOIN dbo.FT_Test AS t
    ON kw.document_id = t.id
ORDER BY t.id, kw.keyword;
-- Progress, now single digits are showing up
-- Add more dotted samples, one row each.
INSERT INTO dbo.FT_Test (TextData) VALUES ('1 1 14.123');
INSERT INTO dbo.FT_Test (TextData) VALUES ('5.2.1.1.14');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.3 ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('2.2.3.3');
INSERT INTO dbo.FT_Test (TextData) VALUES ('6.0 88.00.00');

-- Wait 5 seconds for ft index to populate. With CHANGE_TRACKING AUTO the new rows are indexed
-- in the background, so querying immediately after the inserts could miss them.
WAITFOR DELAY '00:00:05';

-- Every row is listed; the ct columns are NULL for rows the full-text query did not match.
-- This works in the first 3 cases, but doesn't work for 2.2
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '1.1.1*') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2.2.3.3*') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2.2.3*') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2.2*') AS ct ON ct.[key] = ft_test.id;

-- Double quoting makes it match more stuff, but still is broken.
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"1.1.1*"') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"1.1*"') AS ct ON ct.[key] = ft_test.id;
我们现在肯定更接近了,但上面的 2.2* 案例很烦人。它被解析为十进制数:
-- Look up the id of the custom stoplist, then show how the full-text parser breaks up each
-- search string for language id 1033. The variable name is spelled identically everywhere so
-- the batch also runs on servers with a case-sensitive collation.
DECLARE @stoplistId int;
SET @stoplistId = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

SELECT * FROM sys.dm_fts_parser('"1.1.1*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('1.1.1*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"1.1*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('1.1*', 1033, @stoplistId, 0);
还有哪些其他字符是潜在的分隔符?
让我们尝试一些,看看有没有跳出来。我们可以尝试类似 'XXXDOTXXX' 之类的东西,但如果可能的话,将其保留为单个字符会更简洁。
-- One row containing the same "n?n?n" pattern with many candidate separator characters.
INSERT INTO dbo.FT_Test (TextData)
VALUES
    ('1-1-1 2@2@2 3#3#3 4$4$4 5%5%5 6^6^6 7&7&7 8*8*8 9=9=9 10_10_10 11|11|11 12:12:12 12:12:12:12 13"13"13" 14~14~14 15`15`15');

-- Wait 5 seconds for ft index to populate; the new row is indexed in the background,
-- so the keyword list would otherwise still be empty for it.
WAITFOR DELAY '00:00:05';

-- Show how each candidate separator was tokenized for that row only.
SELECT
    ft_test.*,
    ft_content.display_term,
    ft_content.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS ft_content
INNER JOIN dbo.FT_Test
    ON document_id = id
WHERE textdata LIKE '1-1-1%'
ORDER BY id, keyword;

-- Remove the separator test row again.
DELETE FROM ft_test WHERE textdata LIKE '%3#3#3%';
似乎连字符、下划线或反引号都可以。让我们更详细地研究这些。
-- One sample row per remaining candidate separator: backquote, hyphen, underscore.
INSERT INTO dbo.FT_Test (TextData) VALUES ('3`3`3`4 1`2`3 6`1`2`3`4 ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('5-5-5-6 2-3-4 6-1-2-3-4-5');
INSERT INTO dbo.FT_Test (TextData) VALUES ('6_6_6_7 3_4_5 7_1_2_3_4_5_6');

-- Wait 5 seconds for ft index to populate; the new rows are indexed in the background.
WAITFOR DELAY '00:00:05';

-- Keywords stored for the three sample rows.
-- [_] matches a literal underscore; a bare _ in a LIKE pattern is a single-character wildcard.
SELECT
    ft_test.*,
    ft_content.display_term,
    ft_content.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS ft_content
INNER JOIN dbo.FT_Test
    ON document_id = id
WHERE textdata LIKE '3`3%'
   OR TextData LIKE '5-5%'
   OR textdata LIKE '6[_]6%'
ORDER BY id, keyword;
--Hyphen isn't looking good now, it gets stored 3 times, as numbers, as individual digits and as a full string.
--Let's try backquote:
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"3`3*"') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"1`2`3*"') AS ct ON ct.[key] = ft_test.id;

-- these match anything with a single 6... not good...
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6`*"') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6`*') AS ct ON ct.[key] = ft_test.id;

-- New batch, so that declaring @stoplistId cannot collide with a declaration earlier in the script.
GO
--the backquote is getting dropped when it's parsed
-- (variable name spelled consistently so this also runs under a case-sensitive collation)
DECLARE @stoplistId int;
SET @stoplistId = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

SELECT * FROM sys.dm_fts_parser('"6`*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('6`*', 1033, @stoplistId, 0);
--Underscore is just about all we have left.
-- New batch, so that declaring @stoplistId cannot collide with a declaration earlier in the script.
GO
-- Variable name spelled consistently so this also runs under a case-sensitive collation.
DECLARE @stoplistId int;
SET @stoplistId = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

-- Each search string is parsed twice: once double-quoted (phrase/prefix term) and once bare.
SELECT * FROM sys.dm_fts_parser('"2_*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"2_2*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_2*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"2_2_*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_2_*', 1033, @stoplistId, 0);
-- Extra underscore sample with multi-digit segments.
INSERT INTO dbo.FT_Test (TextData)
VALUES ('6_6_66_7 77_6_6_6');

-- Wait 5 seconds for ft index to populate; the new row is indexed in the background.
WAITFOR DELAY '00:00:05';

-- Probe prefix searches against the rows that contain a literal underscore ([_] escapes the LIKE wildcard).
-- The ct columns are NULL for rows the full-text query did not match.
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6_*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6_6*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2_3*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6_6_6_7*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_6_7*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_6_*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_6*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
用下划线替换句点
下划线是可行的方案。它被视为普通字符,而不是标点符号。Sql Server 可以在计算列上创建全文索引,这让我们可以用一个表达式来“修复”数据,并对其建立索引和查询,而无需额外的存储(开销也很小)。您需要修改应用程序,让它查询“1_2_3”而不是“1.2.3”。
--naive implementation: the original text, a space, then a copy with every period replaced by an underscore
ALTER TABLE ft_test ADD [TextData_FT1] AS ([textdata] + ' ' + REPLACE([TextData], '.', '_'));

--strip all letters (A-Z after UPPER), then turn periods into underscores.
--You can customize this to pull out only the paragraph numbers.
ALTER TABLE ft_test ADD [TextData_FT2] AS (
    REPLACE(REPLACE(
    REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
    REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
    REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(UPPER([TextData])
    , 'A', ' '), 'B', ' '), 'C', ' '), 'D', ' '), 'E', ' '), 'F', ' ')
    , 'G', ' '), 'H', ' '), 'I', ' '), 'J', ' '), 'K', ' '), 'L', ' '), 'M', ' '), 'N', ' ')
    , 'O', ' '), 'P', ' '), 'Q', ' '), 'R', ' '), 'S', ' '), 'T', ' '), 'U', ' '), 'V', ' ')
    , 'W', ' '), 'X', ' '), 'Y', ' '), 'Z', ' ')
    , '.', '_')     -- periods become underscores
    , '  ', ' ')    -- collapse double spaces left behind by the removed letters
);

--Add computed columns to FT index
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData_FT1]);
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData_FT2]);
-- Replace all test rows (the unfiltered DELETE is intentional) with a small numbered document outline.
DELETE FROM dbo.FT_Test;

INSERT INTO dbo.FT_Test (TextData) VALUES ('1 This is the chapter title');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1 Section heading');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.1 paragraph 1 is very interesting');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.2 paragraph two is better');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2 Another Section');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2.1 Foobar qwerty loren ipsum');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2.2 Foobar2 qwerty2 loren ipsum 12 items ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.2.12 Foobar2 qwerty2 loren ipsum ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('2.2.17 sql server is great. ');

-- Wait 5 seconds for ft index to populate; the new rows are indexed in the background,
-- so full-text queries issued immediately afterwards could miss them.
WAITFOR DELAY '00:00:05';
--naive implementation: search the column that holds the original text plus the underscore copy
SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft1, '"1_1*"');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft1, '1*');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft1, '2*');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft1, '"1_1_2*"');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft1, '1_1_2*');

--only index the paragraph identifiers: same searches against the letter-stripped column
SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft2, '"1_1*"');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft2, '1*');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft2, '2*');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft2, '"1_1_2*"');

SELECT *
FROM ft_Test
WHERE CONTAINS(TextData_ft2, '1_1_2*');