3

是否可以使用全文搜索在 SQL Server 中查找 1.1.1 或 1.5.2(多级段落编号)之类的字符串?我的 sql 看起来像这样:

contains (MyTable.MyColumn,'"*5.1.1*"')

我已经尝试从停用词列表中删除数字或完全禁用停用词列表。结果,像 5.1 或 1.1 这样的字符串工作正常(可能在内部这些字符串被处理为数字?),但对于带有 2 个点的数字,仍然没有结果。

有没有办法对这些点号/数字进行转义,或者有其他任何解决方案?

4

1 回答 1

2

句点在全文搜索中是有问题的,因为它们通常被视为分隔单词的标点。解决方案是用其他字符替换句点,这样只需对应用程序做最小的改动。这是一个相当长的脚本,它将引导您识别问题并找到解决方案。如果您只想要解决方法,可以直接跳到“简答”部分。

设置全文架构

-- Session options used while creating the demo objects.
SET ANSI_NULLS, QUOTED_IDENTIFIER, ANSI_PADDING ON;

-- Demo table: an identity key plus the text column that will be full-text indexed.
CREATE TABLE dbo.FT_Test
(
    id       int IDENTITY(1, 1) NOT NULL,
    TextData varchar(max)       NOT NULL,
    CONSTRAINT PK_FT_Test PRIMARY KEY CLUSTERED (id ASC) ON [PRIMARY]
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY];

GO
-- Accent-sensitive catalog that hosts the full-text index.
CREATE FULLTEXT CATALOG ft_default WITH ACCENT_SENSITIVITY = ON;

-- Create the index keyed on the primary key with automatic change tracking,
-- then add the text column to it and enable the index.
CREATE FULLTEXT INDEX ON dbo.FT_Test
    KEY INDEX PK_FT_Test ON (ft_default)
    WITH (CHANGE_TRACKING AUTO);
ALTER FULLTEXT INDEX ON dbo.FT_Test ADD (TextData);
ALTER FULLTEXT INDEX ON dbo.FT_Test ENABLE;

验证 Sql Server 版本

此脚本是针对 Sql Server 2012 设计的,但也应适用于 2008。分词器在 Sql 2008 和 Sql 2012 之间发生了很大变化(至少对于语言 id 1033 - 美国英语)。主要影响是 1-2-3 会被分解为 1、2、3、1-2-3、nn1、nn2、nn3(其中把 1-2-3 本身也作为一个词条是新增的行为)

go
-- Reminder for reading the result set below. The version text is a hard-coded literal,
-- not a value read from the server.
-- NOTE(review): confirm this version string against the word breaker version reported
-- on your own instance.
PRINT 'Version 14.0.4763.1000 is Sql Server 2012'
-- List the registered word breaker component(s) for language id 1033 (US English).
EXEC master.sys.sp_help_fulltext_system_components @component_type = 'wordbreaker', @param=1033

Sql Server 半智能解析关键字

不幸的是,这目前对我们不利。索引会变得臃肿,因为相同的数据被多次存储,而且搜索结果也很糟糕。

go
-- Start from an empty table (the unfiltered delete is intentional) and load a single
-- row that mixes multi-level numbers, a decimal, a date and a formatted number.
DELETE FROM ft_test;

INSERT INTO dbo.FT_Test (TextData)
VALUES ('1.1.1  5.2.1, 7.1.1.34.69; 12.11.10.9.8 4.6  7/13/2013  15,456.345');

-- Wait 5 seconds for ft index to populate
WAITFOR DELAY '00:00:05';

-- Show every keyword the full-text index stored for each row.
SELECT
    doc.*,
    kw.display_term,
    kw.occurrence_count
FROM dbo.FT_Test AS doc
INNER JOIN sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS kw
    ON kw.document_id = doc.id
ORDER BY doc.id, kw.keyword;
-- What to look for in the output:
--   * two-digit numbers are indexed, one-digit numbers are not (default stoplist);
--   * the multi-level numbers are broken up and stored as separate items;
--   * 4.6 does show up because it is a decimal number;
--   * the nn* display_terms are standardized numerics, and the date gets a
--     standardized dd* term in addition to 7/13/2013.

-- No results: 5 and 2 are in the default stopword list.
SELECT *
FROM ft_test
WHERE CONTAINS(*, '"5.2*"');

-- Periods are hard breaks, so this doesn't work either.
SELECT *
FROM ft_test
WHERE CONTAINS(*, '"12.11*"');

创建自定义停止列表以索引单个数字

在全文搜索中,个位数通常没有多大价值,但我们需要它们。我们将以默认的系统停用词列表作为基础。

-- Custom stoplist seeded from the system stoplist.
CREATE FULLTEXT STOPLIST [no_numbers]
FROM SYSTEM STOPLIST
AUTHORIZATION [dbo];
go
-- Remove the single digits 0 through 9 from the English stopwords so that they get indexed.
-- ALTER FULLTEXT STOPLIST takes a literal stopword, so each statement is built dynamically.
DECLARE @digit int = 0;
DECLARE @stmt nvarchar(200);

WHILE @digit <= 9
BEGIN
    SET @stmt = N'ALTER FULLTEXT STOPLIST [no_numbers] DROP '''
              + CAST(@digit AS nvarchar(1))
              + N''' LANGUAGE ''English'';';
    EXEC (@stmt);
    SET @digit += 1;
END
GO

根据新的停止列表重新创建全文索引

这会有一些帮助,让我们更接近想要的结果。

-- Rebuild the full-text index so that it uses the custom stoplist.
DROP FULLTEXT INDEX ON dbo.FT_Test;

CREATE FULLTEXT INDEX ON dbo.FT_Test (TextData)
    KEY INDEX PK_FT_Test ON (ft_default)
    WITH (CHANGE_TRACKING AUTO, STOPLIST = no_numbers);

-- Wait 5 seconds for ft index to populate
WAITFOR DELAY '00:00:05';

-- Re-inspect the stored keywords per row.
SELECT
    doc.*,
    kw.display_term,
    kw.occurrence_count
FROM dbo.FT_Test AS doc
INNER JOIN sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS kw
    ON kw.document_id = doc.id
ORDER BY doc.id, kw.keyword;

-- Progress: single digits now show up in the index.
-- Add more sample rows with multi-level numbers of different depths.
INSERT INTO dbo.FT_Test (TextData) VALUES ('1 1 14.123');
INSERT INTO dbo.FT_Test (TextData) VALUES ('5.2.1.1.14');
INSERT INTO dbo.FT_Test (TextData) VALUES ('1.1.3 ');
INSERT INTO dbo.FT_Test (TextData) VALUES ('2.2.3.3');
INSERT INTO dbo.FT_Test (TextData) VALUES ('6.0 88.00.00');

-- Wait 5 seconds for ft index to populate. The index uses automatic change tracking,
-- so the new rows only become searchable once the background population has run;
-- without this pause the queries below can miss them.
WAITFOR DELAY '00:00:05';

-- Each query returns every row, with ct.* filled in only where the full-text search matched.
-- This works in the first 3 cases, but doesn't work for 2.2
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '1.1.1*') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2.2.3.3*') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2.2.3*') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2.2*') AS ct ON ct.[key] = ft_test.id;

-- Double quoting makes it match more stuff, but still is broken.
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"1.1.1*"') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"1.1*"') AS ct ON ct.[key] = ft_test.id;

我们现在肯定更接近了,但上面的 2.2* 案例很烦人。它被解析为十进制数:

-- Look up the id of the custom stoplist and ask the full-text parser how it breaks up
-- each search string for language id 1033 (last argument 0 = accent-insensitive).
-- The variable is spelled identically everywhere: variable names are compared using the
-- instance collation, so mixed casing fails on a case-sensitive server.
DECLARE @stoplistId int = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

SELECT * FROM sys.dm_fts_parser('"1.1.1*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('1.1.1*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"1.1*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('1.1*', 1033, @stoplistId, 0);

还有哪些其他字符是潜在的分隔符?

让我们尝试几个字符,看看有没有哪个表现突出。我们也可以尝试用 'XXXDOTXXX' 之类的标记,但如果可能的话,保持为单个字符会更简洁。

-- One row that tries many candidate separator characters between digits.
INSERT INTO dbo.FT_Test (TextData)
VALUES ('1-1-1 2@2@2 3#3#3 4$4$4 5%5%5 6^6^6 7&7&7 8*8*8 9=9=9 10_10_10 11|11|11 12:12:12 12:12:12:12 13"13"13"  14~14~14 15`15`15');

-- Wait 5 seconds for ft index to populate; otherwise the keyword view below may not
-- yet contain the row that was just inserted.
WAITFOR DELAY '00:00:05';

-- Show how the candidate row was broken into keywords.
SELECT
    ft_test.*,
    ft_content.display_term,
    ft_content.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS ft_content
INNER JOIN dbo.FT_Test
    ON ft_content.document_id = ft_test.id
WHERE textdata LIKE '1-1-1%'
ORDER BY id, keyword;

-- Remove the candidate row again so it does not pollute later searches.
DELETE FROM ft_test WHERE textdata LIKE '%3#3#3%';

似乎连字符、下划线或反引号都可以。让我们更详细地研究这些。

-- One row per remaining candidate separator: backquote, hyphen, underscore.
INSERT INTO dbo.FT_Test (TextData)
VALUES ('3`3`3`4   1`2`3  6`1`2`3`4 ');
INSERT INTO dbo.FT_Test (TextData)
VALUES ('5-5-5-6 2-3-4 6-1-2-3-4-5');
INSERT INTO dbo.FT_Test (TextData)
VALUES ('6_6_6_7 3_4_5 7_1_2_3_4_5_6');

-- Wait 5 seconds for ft index to populate so the new rows are visible below.
WAITFOR DELAY '00:00:05';

-- Keywords stored for the three candidate rows. The underscore is written as [_]
-- because a bare _ in LIKE is a single-character wildcard, not a literal underscore.
SELECT
    ft_test.*,
    ft_content.display_term,
    ft_content.occurrence_count
FROM sys.dm_fts_index_keywords_by_document(DB_ID(), OBJECT_ID('ft_test')) AS ft_content
INNER JOIN dbo.FT_Test
    ON ft_content.document_id = ft_test.id
WHERE textdata LIKE '3`3%'
   OR TextData LIKE '5-5%'
   OR textdata LIKE '6[_]6%'
ORDER BY id, keyword;
-- Hyphen isn't looking good now: it gets stored 3 times, as numbers, as individual
-- digits and as a full string.

-- Let's try backquote:
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"3`3*"') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"1`2`3*"') AS ct ON ct.[key] = ft_test.id;

-- These match anything with a single 6... not good...
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6`*"') AS ct ON ct.[key] = ft_test.id;
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6`*') AS ct ON ct.[key] = ft_test.id;

-- The backquote is getting dropped when the search string is parsed.
-- Start a new batch first: @stoplistId is declared by an earlier probe, and declaring
-- the same variable twice in one batch is an error.
GO
-- One consistent spelling of the variable, so it also works on a case-sensitive instance.
DECLARE @stoplistId int = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

SELECT * FROM sys.dm_fts_parser('"6`*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('6`*', 1033, @stoplistId, 0);


-- Underscore is just about all we have left.
-- Start a new batch first: @stoplistId is declared by an earlier probe, and declaring
-- the same variable twice in one batch is an error.
GO
-- One consistent spelling of the variable, so it also works on a case-sensitive instance.
DECLARE @stoplistId int = (SELECT stoplist_id FROM sys.fulltext_stoplists WHERE name = 'no_numbers');

-- Each search string is parsed twice: as a quoted phrase and unquoted.
-- (The original repeated the unquoted form of the last two strings verbatim;
-- the quoted variants complete the pairs.)
SELECT * FROM sys.dm_fts_parser('"2_*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"2_2*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_2*', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('"2_2_*"', 1033, @stoplistId, 0);
SELECT * FROM sys.dm_fts_parser('2_2_*', 1033, @stoplistId, 0);


-- Extra underscore row with a two-digit component, to check prefix matching.
INSERT INTO dbo.FT_Test (TextData)
VALUES ('6_6_66_7 77_6_6_6');

-- Wait 5 seconds for ft index to populate so the new row takes part in the searches.
WAITFOR DELAY '00:00:05';

-- Every query lists only the rows that contain a literal underscore ([_] escapes the
-- LIKE wildcard); ct.* is filled in where the full-text search matched.
-- (An exact repeat of the '6_6*' query was dropped.)
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6_*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6_6*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '2_3*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';

-- Full identifier, unquoted and as a quoted phrase.
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '6_6_6_7*') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_6_7*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';

-- Progressively shorter quoted prefixes.
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_6_*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_6*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';
SELECT * FROM ft_test LEFT JOIN CONTAINSTABLE(ft_test, *, '"6_6_*"') AS ct ON ct.[key] = ft_test.id WHERE textdata LIKE '%[_]%';

简答

用下划线替换句点

下划线是最佳选择。它被视为普通字符,而不是标点符号。Sql Server 可以在计算列上创建全文索引。这样我们就可以用公式来“修复”数据,并对其进行索引和查询,而无需额外的存储(开销也很小)。您需要修改应用程序,使其查询“1_2_3”而不是“1.2.3”。

-- Naive implementation: the original text followed by a copy in which every period
-- is replaced by an underscore.
ALTER TABLE ft_test ADD [TextData_FT1] AS ([textdata] + ' ' + REPLACE([TextData], '.', '_'))

-- Strip all letters: upper-case the text, blank out A-Z, turn periods into underscores
-- and collapse double spaces once. You can customize this to pull out only the
-- paragraph numbers.
ALTER TABLE ft_test ADD [TextData_FT2] AS (REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(
REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(UPPER([TextData])
,'A', ' '),'B', ' '),'C', ' '),'D', ' '),'E', ' '),'F', ' '),
'G', ' '),'H', ' '),'I', ' '),'J', ' '),'K', ' '),'L', ' '),'M', ' '),'N', ' '),
'O', ' '),'P', ' '),'Q', ' '),'R', ' '),'S', ' '),'T', ' '),'U', ' '),'V', ' '),
'W', ' '),'X', ' '),'Y', ' '),'Z', ' '), '.','_') , '  ',' ')
)
-- End the batch here: statements that reference the new columns must be compiled after
-- the columns exist, otherwise they fail with an invalid-column-name error.
GO

-- Add computed columns to FT index
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData_FT1])
ALTER FULLTEXT INDEX ON [dbo].[FT_Test] ADD ([TextData_FT2])
GO


-- Reset the demo data (the unfiltered delete is intentional) and load a small outline
-- whose rows start with multi-level paragraph numbers.
DELETE FROM dbo.FT_Test

INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1  This is the chapter title' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.1  Section heading' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.1.1 paragraph 1 is very interesting' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.1.2 paragraph two is better' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.2 Another Section' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.2.1 Foobar qwerty loren ipsum' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.2.2 Foobar2 qwerty2 loren ipsum 12 items ' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '1.2.12 Foobar2 qwerty2 loren ipsum ' )
INSERT INTO dbo.FT_Test ( TextData )
VALUES  ( '2.2.17 sql server is great. ' )

-- Wait 5 seconds for ft index to populate. The index is maintained by automatic change
-- tracking in the background, so querying immediately after the inserts can return
-- nothing or stale rows.
WAITFOR DELAY '00:00:05'

-- Naive implementation: searches the column holding the original text plus the
-- underscore copy.
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft1, '"1_1*"')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft1, '1*')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft1, '2*')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft1, '"1_1_2*"')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft1, '1_1_2*')

-- Only the paragraph identifiers are indexed: searches the letter-stripped column.
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft2, '"1_1*"')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft2, '1*')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft2, '2*')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft2, '"1_1_2*"')
SELECT * FROM  ft_Test WHERE CONTAINS(TextData_ft2, '1_1_2*')
于 2013-07-08T17:21:21.033 回答