5

我有一个大约 25G 的大型 CSV 文件。我需要解析大约 10 列的每一行并进行一些处理,最后将其保存到包含解析数据的新文件中。

我使用字典作为数据结构。为了避免内存溢出,每处理 500,000 条记录,我就把字典内容写入文件并清空字典。

有人能告诉我这是否是一种好方法吗?如果不是,有没有其他更好的方法?目前处理这个 25G 的文件需要 30 分钟。

这是代码

        /// <summary>
        /// Streams a comma-separated file line by line, groups the rows by their first column,
        /// and hands each batch of grouped rows to <c>SaveParsedValues</c>.
        /// </summary>
        /// <remarks>
        /// The dictionary is saved and cleared every <c>processLimit</c> lines to bound memory use, so a key
        /// whose rows fall into more than one batch is saved once per batch. Lines are split with a plain
        /// <c>Split(',')</c>; quoted fields that contain commas are not handled. A non-blank line with fewer
        /// than eight columns throws <see cref="IndexOutOfRangeException"/>.
        /// </remarks>
        /// <param name="filename">Path of the input file; also forwarded to <c>SaveParsedValues</c>.</param>
        /// <param name="fileType">Forwarded unchanged to <c>SaveParsedValues</c>.</param>
        private static void ReadData(string filename, FEnum fileType)
        {
            Console.WriteLine("Start Processing : " + DateTime.Now);

            // Timings noted by the original author:
            //   ProcessLimit : 500000, TimeElapsed : 30 Mins
            //   ProcessLimit : 100000, TimeElapsed - Overflow
            const long processLimit = 100000;

            Stopwatch stopwatch = new Stopwatch();
            stopwatch.Start();

            Dictionary<string, Results> parsedData = new Dictionary<string, Results>();

            // Both the stream and the reader are disposed even if the reader's constructor throws.
            using (FileStream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
            using (StreamReader streamReader = new StreamReader(fileStream))
            {
                long linesProcessed = 0;
                string charsRead;

                // ReadLine returns null only at end of file; a blank line must not end processing early.
                while ((charsRead = streamReader.ReadLine()) != null)
                {
                    if (charsRead.Length == 0)
                    {
                        continue;
                    }

                    string[] columns = charsRead.Split(',');
                    string key = columns[0];
                    string eventsList = columns[0] + ";" + columns[1] + ";" + columns[2] + ";" + columns[3] + ";" +
                                        columns[4] + ";" + columns[5] + ";" + columns[6] + ";" + columns[7];

                    Results results;
                    if (parsedData.TryGetValue(key, out results))
                    {
                        results.Count = results.Count + 1;
                        results.Conversion = results.Count;
                        results.EventList.Add(eventsList);

                        // Written back in case Results is a value type; harmless for a reference type.
                        parsedData[key] = results;
                    }
                    else
                    {
                        results = new Results
                        {
                            Count = 1,
                            Hash_Person_Id = columns[0],
                            Tag_Id = columns[1],
                            Conversion = 1,
                            Campaign_Id = columns[2],
                            Inventory_Placement = columns[3],
                            Action_Id = columns[4],
                            Creative_Group_Id = columns[5],
                            Creative_Id = columns[6],
                            Record_Time = columns[7]
                        };
                        results.EventList = new List<string> { eventsList };

                        parsedData.Add(key, results);
                    }

                    linesProcessed++;

                    if (linesProcessed == processLimit)
                    {
                        linesProcessed = 0;
                        SaveParsedValues(filename, fileType, parsedData);
                        parsedData.Clear();
                    }
                }
            }

            // Flush the final partial batch; without this the last (< processLimit) lines are lost.
            if (parsedData.Count > 0)
            {
                SaveParsedValues(filename, fileType, parsedData);
                parsedData.Clear();
            }

            stopwatch.Stop();
            Console.WriteLine(@"File  : {0}  Batch Limit : {1}  Time elapsed : {2} ", filename + Environment.NewLine, processLimit + Environment.NewLine, stopwatch.Elapsed + Environment.NewLine);
        }

谢谢

4

1 回答 1

1

Microsoft.VisualBasic.FileIO.TextFieldParser类看起来可以完成这项工作。试试看,它可能会加快速度。

于 2013-03-11T03:38:06.273 回答