Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Reading a large CSV file and processing in C#. Any suggestions?

I have a large CSV file around 25G. I need to parse each line which has around 10 columns and do some processing and finally save it to a new file with parsed data.

I am using a dictionary as my data structure. To avoid memory overflow I am writing the file out after every 500,000 records and clearing the dictionary.

Can anyone suggest whether this is a good way of doing it? If not, is there a better way? Right now it takes 30 minutes to process the 25G file.

Here is the code

        private static void ReadData(string filename, FEnum fileType)
    {

       var resultData = new ResultsData
                        {
                            DataColumns = new List<string>(),
                            DataRows = new List<Dictionary<string, Results>>()
                        };

                    resultData.DataColumns.Add("count");
                    resultData.DataColumns.Add("userid");

                    Console.WriteLine("Start Processing : " + DateTime.Now);
                    const long processLimit = 100000;
                        //ProcessLimit : 500000, TimeElapsed : 30 Mins;
                        //ProcessLimit : 100000, TimeElaspsed - Overflow

                    Stopwatch stopwatch = new Stopwatch();

                    stopwatch.Start();
                    Dictionary<string, Results> parsedData = new Dictionary<string, Results>();

                    FileStream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read);
                    using (StreamReader streamReader = new StreamReader(fileStream))
                    {
                        string charsRead = streamReader.ReadLine();

                        int count = 0;
                        long linesProcessed = 0;

                        while (!String.IsNullOrEmpty(charsRead))
                        {

                            string[] columns = charsRead.Split(',');
                            string eventsList = columns[0] + ";" + columns[1] + ";" + columns[2] + ";" + columns[3] + ";" +
                                                columns[4] + ";" + columns[5] + ";" + columns[6] + ";" + columns[7];
                            if (parsedData.ContainsKey(columns[0]))
                            {
                                Results results = parsedData[columns[0]];
                                results.Count = results.Count + 1;
                                results.Conversion = results.Count;

                                results.EventList.Add(eventsList);
                                parsedData[columns[0]] = results;
                            }
                            else
                            {
                                Results results = new Results {
                                                    Count = 1, Hash_Person_Id = columns[0], Tag_Id = columns[1], Conversion = 1,
                                                    Campaign_Id = columns[2], Inventory_Placement = columns[3], Action_Id = columns[4], 
                                                    Creative_Group_Id = columns[5], Creative_Id = columns[6], Record_Time = columns[7]
                                                    };
                                results.EventList = new List<string> {eventsList};

                                    parsedData.Add(columns[0], results);
                            }
                            charsRead = streamReader.ReadLine();

                            linesProcessed++;

                            if (linesProcessed == processLimit)
                            {
                                linesProcessed = 0;
                                SaveParsedValues(filename, fileType, parsedData);
//Clear Dictionary
                                parsedData.Clear();
                            }
                        }
                    }


                    stopwatch.Stop();
                    Console.WriteLine(@"File  : {0}  Batch Limit : {1}  Time elapsed : {2} ", filename + Environment.NewLine, processLimit + Environment.NewLine, stopwatch.Elapsed + Environment.NewLine);

                }

Thank you

like image 974
Think Avatar asked Nov 13 '22 09:11

Think


1 Answers

The Microsoft.VisualBasic.FileIO.TextFieldParser class looks like it could do the job. Try it, it may speed things up.

like image 80
Sam Axe Avatar answered Nov 15 '22 11:11

Sam Axe