我做了一个简短的测试项目,并尝试了几种不同的方法。我的目标是使用顺序代码尽快构建一个包含 27 列和 (id,A,B,C,...,Z) 和 NumOfRows 约 300,000 的 DataTable。
(每行填充一个 id,其余列填充随机 5 个字母的单词)。
在我的第四次尝试中,我偶然发现了一种基于 Object 类型值数组将行添加到表中的不同语法。(见这里)。
在您的情况下,它将类似于:
cityTable.Rows.Add( new Object[] {
((City)e.DatabaseEntry).Id ,
ObjectThatGoesInColumn2 ,
ObjectThatGoesInColumn3 ,
ObjectThatGoesInLastColumn
} );
代替:
DataRow row = cityTable.NewRow();
row[0] = 100;
row["City Name"] = "Anaheim";
row["Column 7"] = ...
...
row["Column 26"] = checksum;
workTable.Rows.Add( row );
这将加快您的速度,因为您不会一次单独设置每一列,并且根据您的分析器图片,您至少有 12 个单独设置的列。
这也避免了对列名字符串进行散列以查看您正在处理的数组位置,然后再次检查数据类型是否正确。
如果您有兴趣,这是我的测试项目:
class Program
{
    // One DataSet per benchmark variant so each can be released (nulled)
    // independently right after its timing run.
    public static System.Data.DataSet dataSet;
    public static System.Data.DataSet dataSet2;
    public static System.Data.DataSet dataSet3;
    public static System.Data.DataSet dataSet4;

    // Shared RNG; all benchmarks run sequentially on the main thread,
    // so a single instance is safe here.
    public static Random rand = new Random();

    // Number of rows generated per table.
    public static int NumOfRows = 300000;

    /// <summary>
    /// Runs the enabled benchmarks (test1 and test4), timing each table
    /// build with a Stopwatch and printing the elapsed milliseconds.
    /// </summary>
    static void Main(string[] args)
    {
        #region test1
        Console.WriteLine("Starting");
        Console.WriteLine("");
        Stopwatch watch = new Stopwatch();
        watch.Start();
        MakeTable();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet = null; // release the table so later tests aren't skewed by memory pressure
        Console.WriteLine("");
        Console.WriteLine("Completed.");
        Console.WriteLine("");
        #endregion

        /*
        #region test2
        Console.WriteLine("Starting Test 2");
        Console.WriteLine("");
        watch.Reset();
        watch.Start();
        MakeTable2();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet2 = null;
        Console.WriteLine("");
        Console.WriteLine("Completed Test 2.");
        #endregion

        #region test3
        Console.WriteLine("");
        Console.WriteLine("Starting Test 3");
        Console.WriteLine("");
        watch.Reset();
        watch.Start();
        MakeTable3();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet3 = null;
        Console.WriteLine("");
        Console.WriteLine("Completed Test 3.");
        #endregion
        */

        #region test4
        Console.WriteLine("Starting Test 4");
        Console.WriteLine("");
        watch.Reset();
        watch.Start();
        MakeTable4();
        watch.Stop();
        Console.WriteLine("Elapsed Time was: " + watch.ElapsedMilliseconds + " milliseconds.");
        dataSet4 = null;
        Console.WriteLine("");
        Console.WriteLine("Completed Test 4.");
        #endregion

        //printTable();

        Console.WriteLine("");
        Console.WriteLine("Press Enter to Exit...");
        Console.ReadLine();
    }

    /// <summary>
    /// Builds the schema shared by all four benchmarks: a read-only,
    /// unique int "id" primary-key column followed by 26 string columns
    /// named "5-Letter Word A" .. "5-Letter Word Z".
    /// (Extracted helper: the original repeated this block in all four
    /// MakeTable* methods.)
    /// </summary>
    private static DataTable CreateWordTable(string tableName)
    {
        DataTable table = new DataTable(tableName);

        DataColumn idColumn = new DataColumn();
        idColumn.DataType = System.Type.GetType("System.Int32");
        idColumn.ColumnName = "id";
        idColumn.ReadOnly = true;
        idColumn.Unique = true;
        table.Columns.Add(idColumn);

        // One string column per letter: 65..90 are ASCII 'A'..'Z'.
        for (int i = 65; i <= 90; i++)
        {
            DataColumn column = new DataColumn();
            column.DataType = System.Type.GetType("System.String");
            column.ColumnName = "5-Letter Word " + (char)i;
            column.AutoIncrement = false;
            column.Caption = "Random Word " + (char)i;
            column.ReadOnly = false;
            column.Unique = false;
            table.Columns.Add(column);
        }

        table.PrimaryKey = new DataColumn[] { table.Columns["id"] };
        return table;
    }

    /// <summary>
    /// Test 1 (baseline): fill each row by setting every cell
    /// individually through the string column indexer.
    /// </summary>
    private static void MakeTable()
    {
        DataTable table = CreateWordTable("Table 1");

        dataSet = new DataSet();
        dataSet.Tables.Add(table);

        // Create NumOfRows rows and add them to the DataTable.
        for (int i = 0; i < NumOfRows; i++)
        {
            DataRow row = table.NewRow();
            row["id"] = i;
            for (int j = 65; j <= 90; j++)
            {
                row["5-Letter Word " + (char)j] = getRandomWord();
            }
            table.Rows.Add(row);
        }
    }

    /// <summary>
    /// Test 2: same as Test 1 but wraps each row's cell assignments in
    /// BeginEdit/EndEdit to suppress per-cell change events.
    /// </summary>
    private static void MakeTable2()
    {
        DataTable table = CreateWordTable("Table 2");

        dataSet2 = new DataSet();
        dataSet2.Tables.Add(table);

        // Create NumOfRows rows and add them to the DataTable.
        for (int i = 0; i < NumOfRows; i++)
        {
            DataRow row = table.NewRow();
            row.BeginEdit();
            row["id"] = i;
            for (int j = 65; j <= 90; j++)
            {
                row["5-Letter Word " + (char)j] = getRandomWord();
            }
            row.EndEdit();
            table.Rows.Add(row);
        }
    }

    /// <summary>
    /// Test 3: pre-allocates all DataRow objects up front, then fills
    /// and adds them in a second pass.
    /// </summary>
    private static void MakeTable3()
    {
        DataTable table = CreateWordTable("Table 3");

        dataSet3 = new DataSet();
        dataSet3.Tables.Add(table);

        DataRow[] newRows = new DataRow[NumOfRows];
        for (int i = 0; i < NumOfRows; i++)
        {
            newRows[i] = table.NewRow();
        }

        // Fill the pre-created rows and add them to the DataTable.
        for (int i = 0; i < NumOfRows; i++)
        {
            newRows[i]["id"] = i;
            for (int j = 65; j <= 90; j++)
            {
                newRows[i]["5-Letter Word " + (char)j] = getRandomWord();
            }
            table.Rows.Add(newRows[i]);
        }
    }

    /// <summary>
    /// Test 4 (fastest): adds each row as a single object[] via
    /// DataTable.Rows.Add(object[]), which maps values by position and
    /// avoids per-cell column-name lookups entirely.
    /// </summary>
    private static void MakeTable4()
    {
        // Fixed copy-paste bug: the original labeled this table "Table 2".
        DataTable table = CreateWordTable("Table 4");

        dataSet4 = new DataSet();
        dataSet4.Tables.Add(table);

        // Reusing one buffer is safe: Rows.Add copies the values out.
        Object[] rowValues = new Object[27];
        for (int i = 0; i < NumOfRows; i++)
        {
            rowValues[0] = i;
            for (int j = 1; j < 27; j++)
            {
                rowValues[j] = getRandomWord();
            }
            table.Rows.Add(rowValues);
        }
    }

    /// <summary>
    /// Returns a random 5-letter uppercase word (A-Z).
    /// </summary>
    private static string getRandomWord()
    {
        // Random.Next's upper bound is EXCLUSIVE, so the original
        // (65, 90) could never produce 'Z'; 91 covers the full A-Z range.
        char c0 = (char)rand.Next(65, 91);
        char c1 = (char)rand.Next(65, 91);
        char c2 = (char)rand.Next(65, 91);
        char c3 = (char)rand.Next(65, 91);
        char c4 = (char)rand.Next(65, 91);
        return "" + c0 + c1 + c2 + c3 + c4;
    }

    /// <summary>
    /// Debug helper: dumps id plus the first and last word columns of
    /// every row in dataSet's table to the console.
    /// </summary>
    private static void printTable()
    {
        foreach (DataRow row in dataSet.Tables[0].Rows)
        {
            Console.WriteLine( row["id"] + "--" + row["5-Letter Word A"] + " - " + row["5-Letter Word Z"] );
        }
    }
}
我还没有真正看过你的并行性,但有几件事。
首先,将 “ParsedFiles++;” 改为 “Interlocked.Increment(ref ParsedFiles);”,或者用锁来保护它。
其次,我建议不要使用复杂的事件驱动并行性,而是使用非常适合这种情况的流水线模式。
使用来自并发集合的并发队列(或阻塞集合)来保存阶段。
第一阶段将保存要处理的文件列表。
工作任务将从该工作列表中取出一个文件,对其进行解析,然后将其添加到第二阶段。
在第二阶段,工作任务将从第二阶段队列(刚刚完成的数据表块)中取出项目,并在它们准备好上传时将它们上传到数据库。
编辑:
我写了一个流水线版本的代码,它应该可以帮助你:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections.Concurrent;
using System.Threading.Tasks;
using System.IO;
using System.Data;
namespace dataTableTesting2
{
    class Program
    {
        // Each buffer can only contain this many elements at a time;
        // this bounds the total amount of memory held by the pipeline.
        // NOTE: "static const" does not compile in C# -- const is
        // implicitly static, so the modifier must be dropped.
        private const int BufferSize = 20;

        // Maximum number of lines (and hence parsed rows) per block.
        private const int MaxBlockSize = 100;

        // Stage buffers: filenames -> raw line blocks -> parsed row blocks.
        private static BlockingCollection<string> buffer1 = new BlockingCollection<string>(BufferSize);
        private static BlockingCollection<string[]> buffer2 = new BlockingCollection<string[]>(BufferSize);
        private static BlockingCollection<Object[][]> buffer3 = new BlockingCollection<Object[][]>(BufferSize);

        /// <summary>
        /// Start Pipelines and wait for them to finish.
        /// </summary>
        static void Main(string[] args)
        {
            TaskFactory f = new TaskFactory(TaskCreationOptions.LongRunning, TaskContinuationOptions.None);
            Task stage0 = f.StartNew(() => PopulateFilesList(buffer1));
            Task stage1 = f.StartNew(() => ReadFiles(buffer1, buffer2));
            Task stage2 = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage3 = f.StartNew(() => UploadBlocks(buffer3) );
            Task.WaitAll(stage0, stage1, stage2, stage3);

            /*
            // Note for more workers on particular stages you can make more tasks for each stage, like the following
            // which populates the file list in 1 task, reads the files into string[] blocks in 1 task,
            // then parses the string[] blocks in 4 concurrent tasks
            // and lastly uploads the info in 2 tasks

            TaskFactory f = new TaskFactory(TaskCreationOptions.LongRunning, TaskContinuationOptions.None);
            Task stage0 = f.StartNew(() => PopulateFilesList(buffer1));
            Task stage1 = f.StartNew(() => ReadFiles(buffer1, buffer2));

            Task stage2a = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage2b = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage2c = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));
            Task stage2d = f.StartNew(() => ParseStringBlocks(buffer2, buffer3));

            Task stage3a = f.StartNew(() => UploadBlocks(buffer3) );
            Task stage3b = f.StartNew(() => UploadBlocks(buffer3) );

            Task.WaitAll(stage0, stage1, stage2a, stage2b, stage2c, stage2d, stage3a, stage3b);
            */
        }

        /// <summary>
        /// Adds the filenames to process into the first pipeline stage.
        /// </summary>
        /// <param name="output">Collection receiving the filenames.</param>
        private static void PopulateFilesList( BlockingCollection<string> output )
        {
            try
            {
                // Fixed: the original wrote to the static buffer1 directly,
                // ignoring the 'output' parameter it was given.
                output.Add("file1.txt");
                output.Add("file2.txt");
                //...
                output.Add("lastFile.txt");
            }
            finally
            {
                // Always signal completion so downstream consumers can finish.
                output.CompleteAdding();
            }
        }

        /// <summary>
        /// Takes filenames out of the first pipeline, reads them into string[]
        /// blocks of at most MaxBlockSize lines, and puts them in the second pipeline.
        /// </summary>
        private static void ReadFiles( BlockingCollection<string> input, BlockingCollection<string[]> output)
        {
            try
            {
                foreach (string file in input.GetConsumingEnumerable())
                {
                    List<string> list = new List<string>(MaxBlockSize);
                    using (StreamReader sr = new StreamReader(file))
                    {
                        int countLines = 0;
                        while (!sr.EndOfStream)
                        {
                            list.Add( sr.ReadLine() );
                            countLines++;
                            // Fixed off-by-one: '>' emitted 101-line blocks;
                            // '>=' keeps blocks at exactly MaxBlockSize lines.
                            if (countLines >= MaxBlockSize)
                            {
                                output.Add(list.ToArray());
                                countLines = 0;
                                list = new List<string>(MaxBlockSize);
                            }
                        }
                        // Flush the final partial block, if any.
                        if (list.Count > 0)
                        {
                            output.Add(list.ToArray());
                        }
                    }
                }
            }
            finally
            {
                output.CompleteAdding();
            }
        }

        /// <summary>
        /// Takes string[] blocks from the second pipeline; for each line, splits
        /// it by tabs and parses the fields, storing each line as an object array
        /// into the third pipeline.
        /// </summary>
        private static void ParseStringBlocks( BlockingCollection<string[]> input, BlockingCollection< Object[][] > output)
        {
            try
            {
                foreach (string[] block in input.GetConsumingEnumerable())
                {
                    // Fixed: a fresh list per block. The original reused one
                    // list without clearing it, so every emitted block also
                    // re-contained all previously parsed rows (and those rows
                    // would have been uploaded again downstream).
                    List<Object[]> result = new List<object[]>(MaxBlockSize);
                    foreach (string line in block)
                    {
                        string[] splitLine = line.Split('\t'); //split line on tab
                        string cityName = splitLine[0];
                        int cityPop = Int32.Parse( splitLine[1] );
                        int cityElevation = Int32.Parse(splitLine[2]);
                        //...
                        result.Add(new Object[] { cityName, cityPop, cityElevation });
                    }
                    output.Add( result.ToArray() );
                }
            }
            finally
            {
                output.CompleteAdding();
            }
        }

        /// <summary>
        /// Takes the data blocks from the third pipeline, and uploads each row to SQL Database.
        /// </summary>
        private static void UploadBlocks(BlockingCollection<Object[][]> input)
        {
            /*
             * At this point 'block' is an array of object arrays.
             *
             * The block contains MaxBlockSize number of cities.
             *
             * There is one object array for each city.
             *
             * The object array for the city is in the pre-defined order from pipeline stage2
             *
             * You could do a couple of things at this point:
             *
             * 1. declare and initialize a DataTable with the correct column types
             *    then, do the dataTable.Rows.Add( rowValues )
             *    then, use a Bulk Copy Operation to upload the dataTable to SQL
             *    http://msdn.microsoft.com/en-us/library/7ek5da1a
             *
             * 2. Manually perform the sql commands/transactions similar to what
             *    Kevin recommends in this suggestion:
             *    http://stackoverflow.com/questions/1024123/sql-insert-one-row-or-multiple-rows-data/1024195#1024195
             *
             * I've demonstrated the first approach with this code.
             *
             * */

            DataTable dataTable = new DataTable();
            //set up columns of dataTable here.

            foreach (Object[][] block in input.GetConsumingEnumerable())
            {
                foreach (Object[] rowValues in block)
                {
                    dataTable.Rows.Add(rowValues);
                }
                //do bulkCopy to upload table containing MaxBlockSize number of cities right here.
                dataTable.Rows.Clear(); //Remove the rows when you are done uploading, but not the dataTable.
            }
        }
    }
}
它将工作分为 4 个部分,可以由不同的任务完成:
列出要处理的文件
从该列表中获取文件并将它们读入字符串 []
从上一部分获取字符串 [] 并解析它们,创建包含表中每一行的值的对象 []
将行上传到数据库
为每个阶段分配多个任务也很容易,如果需要,允许多个工作人员执行相同的管道阶段。
(我怀疑从文件中读取超过 1 个任务是否有用,除非您使用的是固态驱动器,因为在内存中跳转非常慢)。
此外,您可以通过执行程序来限制内存中的数据量。
每个缓冲区都是一个使用最大大小初始化的 BlockingCollection,这意味着如果缓冲区已满,并且另一个任务尝试添加另一个元素,它将阻塞该任务。
幸运的是,Task Parallel Library 很聪明,如果一个任务被阻塞,它会安排另一个未被阻塞的任务,然后检查第一个任务是否已经停止被阻塞。
目前每个 buffer 最多只能容纳 20 个块,每个块最多只有 100 行,也就是说:
所以这需要足够的内存来保存 20 个文件名、文件中的 2000 行和 2000 个城市信息。(对于局部变量等有一些额外的东西)。
您可能希望增加 BufferSize 和 MaxBlockSize 以提高效率,尽管这应该可以工作。
请注意,我没有对此进行测试,因为我没有任何输入文件,因此可能存在一些错误。