我有一个大约 25G 的大型 CSV 文件。我需要解析大约 10 列的每一行并进行一些处理,最后将其保存到包含解析数据的新文件中。
我使用字典作为我的数据结构。为了避免内存溢出,我在 500,000 条记录后写入文件并清除字典。
谁能建议这是否是一种好方法。如果没有,还有其他更好的方法吗?现在处理 25G 文件需要 30 分钟。
这是代码
private static void ReadData(string filename, FEnum fileType)
{
var resultData = new ResultsData
{
DataColumns = new List<string>(),
DataRows = new List<Dictionary<string, Results>>()
};
resultData.DataColumns.Add("count");
resultData.DataColumns.Add("userid");
Console.WriteLine("Start Processing : " + DateTime.Now);
const long processLimit = 100000;
//ProcessLimit : 500000, TimeElapsed : 30 Mins;
//ProcessLimit : 100000, TimeElaspsed - Overflow
Stopwatch stopwatch = new Stopwatch();
stopwatch.Start();
Dictionary<string, Results> parsedData = new Dictionary<string, Results>();
FileStream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
using (StreamReader streamReader = new StreamReader(fileStream))
{
string charsRead = streamReader.ReadLine();
int count = 0;
long linesProcessed = 0;
while (!String.IsNullOrEmpty(charsRead))
{
string[] columns = charsRead.Split(',');
string eventsList = columns[0] + ";" + columns[1] + ";" + columns[2] + ";" + columns[3] + ";" +
columns[4] + ";" + columns[5] + ";" + columns[6] + ";" + columns[7];
if (parsedData.ContainsKey(columns[0]))
{
Results results = parsedData[columns[0]];
results.Count = results.Count + 1;
results.Conversion = results.Count;
results.EventList.Add(eventsList);
parsedData[columns[0]] = results;
}
else
{
Results results = new Results {
Count = 1, Hash_Person_Id = columns[0], Tag_Id = columns[1], Conversion = 1,
Campaign_Id = columns[2], Inventory_Placement = columns[3], Action_Id = columns[4],
Creative_Group_Id = columns[5], Creative_Id = columns[6], Record_Time = columns[7]
};
results.EventList = new List<string> {eventsList};
parsedData.Add(columns[0], results);
}
charsRead = streamReader.ReadLine();
linesProcessed++;
if (linesProcessed == processLimit)
{
linesProcessed = 0;
SaveParsedValues(filename, fileType, parsedData);
//Clear Dictionary
parsedData.Clear();
}
}
}
stopwatch.Stop();
Console.WriteLine(@"File : {0} Batch Limit : {1} Time elapsed : {2} ", filename + Environment.NewLine, processLimit + Environment.NewLine, stopwatch.Elapsed + Environment.NewLine);
}
谢谢