I am trying to read a CSV file into a struct containing a vector of vectors of strings. The file contains ~2 million lines and its size on disk is ~350 MB. After the whole file has been read into the struct, `top`
shows that the program is using almost 3.5 GB of memory. I have used vector `reserve` to try to limit capacity growth on `push_back`.
#include<iomanip>
#include<stdio.h>
#include<stdlib.h>
#include<iostream>
#include<fstream>
#include<string.h>
#include<sstream>
#include<math.h>
#include<vector>
#include<algorithm>
#include<array>
#include<ctime>
#include<boost/algorithm/string.hpp>
using namespace std;
// Column-oriented table of text cells read from a CSV file.
struct datStr
{
    // Header names, one per column.
    std::vector<std::string> colNames;

    // Cell text grouped by column: data[c][r] is the cell of column c in row r.
    std::vector< std::vector<std::string> > data;
};
// Reads a comma-separated file into column-major storage.
//
// The first line is treated as the header: each comma-separated name has its
// spaces removed and is stored in ds.colNames, and one column vector is
// created in ds.data per name. Every following line is split on ',' and the
// space-stripped fields are appended to the matching columns.
//
// fileName: path of the file to read. If the file cannot be opened the reads
//           simply fail and an empty datStr is returned.
// Returns the populated datStr.
datStr readBoost(string fileName)
{
    datStr ds;
    ifstream inFile(fileName);

    string line;
    getline(inFile, line);

    stringstream ss(line);
    string item;
    while (getline(ss, item, ','))
    {
        item.erase(remove(item.begin(), item.end(), ' '), item.end());
        ds.colNames.push_back(item);

        // One column per header name; capacity is requested up front so that
        // appending rows does not trigger reallocation.
        ds.data.push_back(vector<string>());
        ds.data.back().reserve(3000000);
    }

    while (getline(inFile, line))
    {
        vector<string> rowStr;
        boost::split(rowStr, line, boost::is_any_of(","));

        // A row may contain more fields than the header declared; indexing
        // ds.data past its size would be undefined behaviour, so surplus
        // fields are ignored.
        const size_t fieldCount = std::min(rowStr.size(), ds.data.size());
        for (size_t ktr = 0; ktr < fieldCount; ++ktr)
        {
            string& field = rowStr[ktr];
            field.erase(remove(field.begin(), field.end(), ' '), field.end());
            ds.data[ktr].push_back(field);
        }
    }

    return ds;
}
// Loads "file.csv" and then keeps the process alive so that its memory usage
// can be inspected from another terminal.
int main()
{
    datStr ds = readBoost("file.csv");

    // Hold the process open until standard input reaches end-of-file.
    // An empty `while(true) {}` has no observable side effects, which the
    // optimiser is allowed to assume never happens (and may delete);
    // blocking on input keeps the process alive without spinning a core.
    while (cin.get() != EOF)
    {
    }
}
PS: The last `while` loop is just there so I can monitor the memory usage once the program has finished reading.
Is this something expected when using vectors or am I missing something here?
Another interesting fact: I started adding up the size and capacity of each string in the read loop. Surprisingly, the total is only about 1/10 of what `top` shows on Ubuntu. Could it be that `top` is misreporting, or is my compiler allocating too much space?
I tested your code with an input file that has 1886850 lines of text, with a size of 105M.
With your code, the memory consumption was about 2.5G.
Then, I started modifying how data is stored.
First test:
Change `datStr` to:
// Variant that keeps one string per entry in `lines` instead of a
// vector-of-vectors of strings.
struct datStr
{
    std::vector<std::string> colNames;  // column header names
    std::vector<std::string> lines;     // presumably one unsplit input line per element
};
This reduced the memory consumption to 206M. That's more than a 10-fold reduction in size. It's clear that the penalty for using `vector<vector<string>> data;` is rather stiff.
Second test:
Change `datStr` to:
// Variant that stores text in `lines` plus, per entry, the offsets at which
// its tokens begin (offsets held as string::size_type).
struct datStr
{
    std::vector<std::string> colNames;  // column header names
    std::vector<std::string> lines;     // text the offsets refer to
    // indices[i] lists the starting positions of the tokens within lines[i].
    std::vector< std::vector<std::string::size_type> > indices;
};
with `indices` keeping track of where the tokens in `lines` start. You can extract the tokens from each line by using `lines` and `indices`.
With this change, the memory consumption went up to 543MB, but that is still about five times smaller than the original.
Third test
Change `datStr` to:
// Variant that stores token start offsets as unsigned int rather than
// string::size_type.
struct datStr
{
    std::vector<std::string> colNames;  // column header names
    std::vector<std::string> lines;     // text the offsets refer to
    // indices[i] lists the starting positions of the tokens within lines[i].
    std::vector< std::vector<unsigned> > indices;
};
With this change, the memory consumption came down to 455MB. This works as long as none of your lines is `UINT_MAX` characters or longer.
Fourth Test
Change `datStr` to:
// Variant that stores token start offsets as unsigned short, the smallest
// offset type tried here.
struct datStr
{
    std::vector<std::string> colNames;  // column header names
    std::vector<std::string> lines;     // text the offsets refer to
    // indices[i] lists the starting positions of the tokens within lines[i].
    std::vector< std::vector<unsigned short> > indices;
};
With this change, the memory consumption came down to 278MB. This works as long as none of your lines is `USHRT_MAX` characters or longer. For this case, the overhead of `indices` is really small, only 72MB.
Here's the modified code I used for my tests.
#include<iomanip>
#include<stdio.h>
#include<stdlib.h>
#include<iostream>
#include<fstream>
#include<string.h>
#include<sstream>
#include<math.h>
#include<vector>
#include<algorithm>
#include<array>
#include<ctime>
// #include<boost/algorithm/string.hpp>
using namespace std;
// Line-oriented table: the text of each row is kept whole, alongside the
// offsets at which its comma-separated fields begin.
struct datStr
{
    // Header names, one per column.
    std::vector<std::string> colNames;

    // One entry per data row, stored as read from the file.
    std::vector<std::string> lines;

    // data[i] holds the field start offsets recorded for lines[i].
    std::vector< std::vector<unsigned short> > data;
};
// Records where each comma-separated field of `line` begins.
//
// rowStr: receives the start offset of every field, appended in order.
//         Existing contents are kept.
// line:   the text to scan; ',' is the only delimiter recognised.
//
// Every character is examined, so a leading comma and consecutive commas
// (empty fields) each produce an entry. A line ending in ',' does not get an
// entry for the empty trailing field, and an empty line yields no entries.
//
// Offsets are narrowed to unsigned short: positions above USHRT_MAX do not
// fit and wrap around, so this is only suitable for lines shorter than that.
void split(vector<unsigned short>& rowStr, string const& line)
{
    const string::size_type end = line.size();
    string::size_type begin = 0;

    for (string::size_type pos = 0; pos != end; ++pos)
    {
        if (line[pos] == ',')
        {
            rowStr.push_back(static_cast<unsigned short>(begin));
            begin = pos + 1;  // next field starts right after the comma
        }
    }

    // Final field, unless the line is empty or ends with a delimiter.
    if (begin != end)
    {
        rowStr.push_back(static_cast<unsigned short>(begin));
    }
}
// Reads a comma-separated file, keeping each data row as a single string
// together with the offsets at which its fields begin.
//
// The first line is treated as the header: each comma-separated name has its
// spaces removed and is stored in ds.colNames. Every following line is
// appended to ds.lines, and the offsets produced by split() for that line
// are appended to ds.data at the same index.
//
// fileName: path of the file to read. If the file cannot be opened the reads
//           simply fail and an empty datStr is returned.
// Returns the populated datStr.
datStr readBoost(string fileName)
{
    datStr ds;
    ifstream inFile(fileName);

    string line;
    getline(inFile, line);

    stringstream ss(line);
    string item;
    while (getline(ss, item, ','))
    {
        item.erase(remove(item.begin(), item.end(), ' '), item.end());
        ds.colNames.push_back(item);
    }

    while (getline(inFile, line))
    {
        vector<unsigned short> rowStr;
        split(rowStr, line);

        // Stored by copy so each element is allocated for its own contents
        // rather than inheriting the scratch buffers' capacity.
        ds.lines.push_back(line);
        ds.data.push_back(rowStr);
    }

    // Without this the function flows off the end of a non-void function,
    // which is undefined behaviour.
    return ds;
}
// Loads the CSV file named by the first command-line argument and then keeps
// the process alive so that its memory usage can be inspected.
//
// Returns 1 (after printing a usage line) when no file argument is given.
int main(int argc, char** argv)
{
    if (argc < 2)
    {
        // argv[1] must not be read when it was not supplied.
        cerr << "usage: " << (argc > 0 ? argv[0] : "program") << " <csv-file>\n";
        return 1;
    }

    datStr ds = readBoost(argv[1]);

    // Hold the process open until standard input reaches end-of-file.
    // An empty `while(true) {}` has no observable side effects, which the
    // optimiser is allowed to assume never happens (and may delete);
    // blocking on input keeps the process alive without spinning a core.
    while (cin.get() != EOF)
    {
    }
    return 0;
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With