我发现 Chris 的答案非常有帮助,但我想使用 T-SQL(而不是使用 CLR)在 SQL Server 中运行它,所以我将他的代码转换为 T-SQL 代码。但后来我更进一步,将所有内容包装在执行以下操作的存储过程中:
- 使用批量插入最初导入 CSV 文件
- 使用 Chris 的代码清理行
- 以表格形式返回结果
为了满足我自己的需求,我还对每一行做了进一步清理:去掉值两端的引号,并把连续的两个双引号转换为一个双引号(我认为这是正确的做法)。
-- =============================================================================
-- SSP_CSVToTable
--
-- Imports a CSV file with BULK INSERT (one raw line per row), splits every
-- line into fields while honouring double-quoted fields that contain commas,
-- and returns the parsed rows as a result set.
--
-- Parameters:
--   @InputFile  Path of the CSV file handed to BULK INSERT.
--   @FirstLine  Number of the first line to import (BULK INSERT FIRSTROW).
--
-- Result set:
--   Col1 .. ColN, all nvarchar(100); N is the number of fields found in the
--   first imported line. Surrounding double quotes are stripped and doubled
--   double quotes ("") are collapsed to one. No result set is returned when
--   no line was imported.
--
-- Depends on dbo.iter_charlist_to_table to split a delimited string.
-- =============================================================================
CREATE PROCEDURE SSP_CSVToTable
    @InputFile nvarchar(4000)
    , @FirstLine int
AS
BEGIN
    -- SET NOCOUNT ON added to prevent extra result sets from
    -- interfering with SELECT statements.
    SET NOCOUNT ON;

    DECLARE @sql nvarchar(4000)
    DECLARE @PH1 nvarchar(50)
    DECLARE @LINECOUNT int              -- number of imported lines; RowId runs 1..@LINECOUNT
    DECLARE @CURLINE int
    DECLARE @Line nvarchar(4000)
    DECLARE @starti int                 -- 1-based position of a quoted field's opening quote
    DECLARE @endi int                   -- 1-based position of that field's last character
    DECLARE @FieldTerminatorFound bit
    DECLARE @quoteCount int
    DECLARE @source nvarchar(4000)
    DECLARE @COLCOUNT int
    DECLARE @CURCOL int
    DECLARE @ColVal nvarchar(4000)

    -- Temporary field delimiter swapped in for commas while parsing.
    -- Assumes this character never occurs in the data itself.
    SET @PH1 = N'†'

    -- single column table to hold each raw line of the file
    CREATE TABLE [#CSVLine]([line] nvarchar(4000))

    -- BULK INSERT cannot take the path from a variable, so the statement is
    -- built dynamically. Single quotes in the path are doubled so the path
    -- cannot terminate the string literal (broken statement / SQL injection).
    SET @sql = 'BULK INSERT #CSVLine
FROM ''' + REPLACE(@InputFile, '''', '''''') + '''
WITH
(
FIRSTROW=' + CAST(@FirstLine AS varchar(11)) + ',
FIELDTERMINATOR = ''\n'',
ROWTERMINATOR = ''\n''
)'

    -- run dynamic statement to populate temp table
    EXEC(@sql)

    -- Count what was actually loaded instead of relying on @@ROWCOUNT
    -- surviving the EXEC.
    SELECT @LINECOUNT = COUNT(*) FROM [#CSVLine]

    -- Add an identity column so the lines can be visited one by one.
    -- NOTE(review): the order in which identity values are assigned to
    -- existing rows is presumably file order in practice - confirm if the
    -- output row order matters.
    ALTER TABLE [#CSVLine] ADD [RowId] [int] IDENTITY(1,1) NOT NULL

    SET @CURLINE = 1
    WHILE @CURLINE <= @LINECOUNT
    BEGIN
        -- get current line
        SELECT @Line = [line]
        FROM [#CSVLine]
        WHERE [RowId] = @CURLINE

        -- Replace every comma with the placeholder; commas inside quoted
        -- fields are restored below, so only real separators stay replaced.
        SET @Line = REPLACE(@Line, ',', @PH1)

        -- Locate the first quoted field: either the line starts with a quote
        -- or a quote directly follows a delimiter.
        IF LEFT(@Line, 1) = '"'
            SET @starti = 1
        ELSE
        BEGIN
            SET @starti = CHARINDEX(@PH1 + '"', @Line)
            IF @starti > 0 SET @starti = @starti + LEN(@PH1)
        END

        -- loop through quoted fields
        WHILE @starti > 0
        BEGIN
            SET @FieldTerminatorFound = 0

            -- Candidate closing quote: a quote followed by the delimiter.
            -- Search starts after the opening quote so it cannot match itself.
            SET @endi = CHARINDEX('"' + @PH1, @Line, @starti + 1)

            WHILE @FieldTerminatorFound = 0
            BEGIN
                IF @endi < 1
                BEGIN
                    -- no further candidate: the quoted field runs to the end of the line
                    SET @endi = LEN(@Line)
                    SET @FieldTerminatorFound = 1
                END
                ELSE
                BEGIN
                    -- Count the run of quotes ending at @endi, never counting
                    -- the opening quote. An odd run ends with a real closing
                    -- quote; an even run consists only of escaped quotes ("").
                    SET @quoteCount = 1
                    WHILE @endi - @quoteCount > @starti
                          AND SUBSTRING(@Line, @endi - @quoteCount, 1) = '"'
                    BEGIN
                        SET @quoteCount = @quoteCount + 1
                    END

                    IF (@quoteCount % 2) = 1
                        SET @FieldTerminatorFound = 1
                    ELSE
                        -- keep looking
                        SET @endi = CHARINDEX('"' + @PH1, @Line, @endi + 1)
                END
            END

            -- Swap the commas back in, inside this quoted field only.
            -- Placeholder and comma are both one character, so positions
            -- after the field do not shift.
            SET @source = SUBSTRING(@Line, @starti, @endi - @starti + 1)
            SET @Line = STUFF(@Line, @starti, @endi - @starti + 1, REPLACE(@source, @PH1, ','))

            -- Find the next quoted field after the one just handled
            SET @starti = CHARINDEX(@PH1 + '"', @Line, @endi + 1)
            IF @starti > 0 SET @starti = @starti + LEN(@PH1)
        END

        -- split the current line into one row per field
        IF OBJECT_ID('tempdb..#Line') IS NOT NULL
            DROP TABLE #Line

        -- nstr (nvarchar) is read rather than str (varchar) so that
        -- Unicode characters survive.
        SELECT listpos, nstr
        INTO #Line
        FROM dbo.iter_charlist_to_table(@Line, @PH1)

        -- get number of columns in line
        SET @COLCOUNT = @@ROWCOUNT

        -- Create the output table once, shaped by the first line's field count
        IF OBJECT_ID('tempdb..#CSV') IS NULL
        BEGIN
            CREATE TABLE [#CSV]([Col1] nvarchar(100))

            -- Col1 already exists; add Col2..ColN
            SET @CURCOL = 2
            WHILE @CURCOL <= @COLCOUNT
            BEGIN
                SET @sql = 'ALTER TABLE [#CSV] ADD [Col' + CAST(@CURCOL AS varchar(11)) + '] nvarchar(100)'
                EXEC(@sql)
                SET @CURCOL = @CURCOL + 1
            END
        END

        -- build dynamic sql to insert current line into CSV table
        SET @sql = 'INSERT INTO [#CSV] VALUES('

        SET @CURCOL = 1
        WHILE @CURCOL <= @COLCOUNT
        BEGIN
            -- get current column
            SELECT @ColVal = nstr
            FROM #Line
            WHERE listpos = @CURCOL

            IF LEN(@ColVal) > 0
            BEGIN
                -- remove quote from beginning if it exists
                IF LEFT(@ColVal, 1) = '"'
                    SET @ColVal = RIGHT(@ColVal, LEN(@ColVal) - 1)
                -- remove quote from end if it exists
                IF RIGHT(@ColVal, 1) = '"'
                    SET @ColVal = LEFT(@ColVal, LEN(@ColVal) - 1)
            END

            -- Write the value as a Unicode literal: single quotes are doubled
            -- to keep the statement valid, and "" collapses to ".
            SET @sql = @sql + 'N''' + REPLACE(REPLACE(@ColVal, '''', ''''''), '""', '"') + ''''

            -- add comma separator except after the last value
            IF @CURCOL <> @COLCOUNT
                SET @sql = @sql + ','

            SET @CURCOL = @CURCOL + 1
        END

        -- close sql statement and add the line to the table
        SET @sql = @sql + ')'
        EXEC(@sql)

        -- move to next line
        SET @CURLINE = @CURLINE + 1
    END

    -- Return the parsed rows. #CSV only exists when at least one line was
    -- imported. SELECT * is intentional: the column list is dynamic.
    IF OBJECT_ID('tempdb..#CSV') IS NOT NULL
        SELECT * FROM [#CSV]
END
GO
存储过程利用这个帮助函数将字符串解析为表(感谢 Erland Sommarskog!):
-- Splits a delimited list into a table with one row per element.
--   @list       the delimited text
--   @delimiter  single-character separator (defaults to a comma)
-- Returns one row per element, in list order:
--   listpos  1-based position of the element
--   str      the element with leading/trailing spaces trimmed, as varchar
--   nstr     the same trimmed element, as nvarchar
-- The input is consumed in chunks of at most 4000 characters; text after the
-- last delimiter of a chunk is carried over and prefixed to the next chunk.
CREATE FUNCTION [dbo].[iter_charlist_to_table]
    (@list ntext,
     @delimiter nchar(1) = N',')
RETURNS @tbl TABLE (listpos int IDENTITY(1, 1) NOT NULL,
                    str varchar(4000),
                    nstr nvarchar(2000)) AS
BEGIN
    DECLARE @delimpos int               -- position of the next delimiter in @buffer
    DECLARE @offset int                 -- next character of @list to read
    DECLARE @take smallint              -- characters to read for the current chunk
    DECLARE @buffer nvarchar(4000)      -- carry-over plus the current chunk
    DECLARE @carry nvarchar(4000)       -- unterminated tail of the previous chunk
    DECLARE @item nvarchar(4000)        -- current trimmed element

    SELECT @offset = 1, @carry = ''

    WHILE @offset <= datalength(@list) / 2
    BEGIN
        -- read as much as still fits next to the carried-over tail
        SET @take = 4000 - datalength(@carry) / 2
        SET @buffer = @carry + substring(@list, @offset, @take)
        SET @offset = @offset + @take

        -- emit every complete element contained in the buffer
        WHILE 1 = 1
        BEGIN
            SET @delimpos = charindex(@delimiter, @buffer)
            IF @delimpos > 0
            BEGIN
                SET @item = ltrim(rtrim(left(@buffer, @delimpos - 1)))
                INSERT @tbl (str, nstr) VALUES (@item, @item)
                SET @buffer = substring(@buffer, @delimpos + 1, len(@buffer))
            END
            ELSE
                BREAK
        END

        -- whatever follows the last delimiter may continue in the next chunk
        SET @carry = @buffer
    END

    -- the text after the final delimiter is the last element
    SET @item = ltrim(rtrim(@carry))
    INSERT @tbl (str, nstr) VALUES (@item, @item)
    RETURN
END
这是我从 T-SQL 中调用它的方式。在这种情况下,我将结果插入到临时表中,因此我首先创建临时表:
-- Create the temp table that receives the imported file.
-- Every column is nvarchar(100) and the INSERT below binds the procedure's
-- result columns to this list by position.
CREATE TABLE #temp
(
CustomerCode nvarchar(100) NULL,
Name nvarchar(100) NULL,
[Address] nvarchar(100) NULL,
City nvarchar(100) NULL,
[State] nvarchar(100) NULL,
Zip nvarchar(100) NULL,
OrderNumber nvarchar(100) NULL,
TimeWindow nvarchar(100) NULL,
OrderType nvarchar(100) NULL,
Duration nvarchar(100) NULL,
[Weight] nvarchar(100) NULL,
Volume nvarchar(100) NULL
)
-- Convert the CSV file into a table.
-- The explicit column list documents the expected CSV field order and keeps
-- the insert correct if columns are later added to #temp.
-- @FileLocation and @FirstImportRow are not declared in this snippet; they
-- must be declared and set earlier in the same batch.
INSERT #temp
(
CustomerCode,
Name,
[Address],
City,
[State],
Zip,
OrderNumber,
TimeWindow,
OrderType,
Duration,
[Weight],
Volume
)
EXEC [dbo].[SSP_CSVToTable]
@InputFile = @FileLocation
,@FirstLine = @FirstImportRow
我没有对性能做太多测试,但它能很好地满足我的需求——导入少于 1000 行的 CSV 文件。不过,由于它是逐行、逐列循环处理的,遇到非常大的文件时可能会非常慢,甚至难以处理。
希望其他人也发现它有用。
干杯!