I want to find a programmatic solution using C++.
I have a 900 files each of 27MB size. (just to inform about the enormity ).
Each file has 55K rows and Varying columns. But the header indicates the columns
I want to sort the rows in an order w.r.t to a Column Value.
I wrote the sorting algorithm for this (definitely my newbie attempts, you may say). This algorithm is working for few numbers, but fails for larger numbers.
Here is the code for the same: basic functions I defined to use inside the main code:
int getNumberOfColumns(const string& aline)
{
int ncols=0;
istringstream ss(aline);
string s1;
while(ss>>s1) ncols++;
return ncols;
}
vector<string> getWordsFromSentence(const string& aline)
{
vector<string>words;
istringstream ss(aline);
string tstr;
while(ss>>tstr) words.push_back(tstr);
return words;
}
bool findColumnName(vector<string> vs, const string& colName)
{
vector<string>::iterator it = find(vs.begin(), vs.end(), colName);
if ( it != vs.end())
return true;
else return false;
}
int getIndexForColumnName(vector<string> vs, const string& colName)
{
if ( !findColumnName(vs,colName) ) return -1;
else {
vector<string>::iterator it = find(vs.begin(), vs.end(), colName);
return it - vs.begin();
}
}
////////// I like the Recurssive functions - I tried to create a recursive function
///here. This worked for small values , say 20 rows. But for 55K - core dumps
void sort2D(vector<string>vn, vector<string> &srt, int columnIndex)
{
vector<double> pVals;
for ( int i = 0; i < vn.size(); i++) {
vector<string>meancols = getWordsFromSentence(vn[i]);
pVals.push_back(stringToDouble(meancols[columnIndex]));
}
srt.push_back(vn[max_element(pVals.begin(), pVals.end())-pVals.begin()]);
if (vn.size() > 1 ) {
vn.erase(vn.begin()+(max_element(pVals.begin(), pVals.end())-pVals.begin()) );
vector<string> vn2 = vn;
//cout<<srt[srt.size() -1 ]<<endl;
sort2D(vn2 , srt, columnIndex);
}
}
Now the main code:
for ( int i = 0; i < TissueNames.size() -1; i++)
{
for ( int j = i+1; j < TissueNames.size(); j++)
{
//string fname = path+"/gse7307_Female_rma"+TissueNames[i]+"_"+TissueNames[j]+".txt";
//string fname2 = sortpath2+"/gse7307_Female_rma"+TissueNames[i]+"_"+TissueNames[j]+"Sorted.txt";
string fname = path+"/gse7307_Male_rma"+TissueNames[i]+"_"+TissueNames[j]+".txt";
string fname2 = sortpath2+"/gse7307_Male_rma"+TissueNames[i]+"_"+TissueNames[j]+"4Columns.txt";
vector<string>AllLinesInFile;
BioInputStream fin(fname);
string aline;
getline(fin,aline);
replace (aline.begin(), aline.end(), '"',' ');
string headerline = aline;
vector<string> header = getWordsFromSentence(aline);
int pindex = getIndexForColumnName(header,"p-raw");
int xcindex = getIndexForColumnName(header,"xC");
int xeindex = getIndexForColumnName(header,"xE");
int prbindex = getIndexForColumnName(header,"X");
string newheaderline = "X\txC\txE\tp-raw";
BioOutputStream fsrt(fname2);
fsrt<<newheaderline<<endl;
int newpindex=3;
while ( getline(fin, aline) ){
replace (aline.begin(), aline.end(), '"',' ');
istringstream ss2(aline);
string tstr;
ss2>>tstr;
tstr = ss2.str().substr(tstr.length()+1);
vector<string> words = getWordsFromSentence(tstr);
string values = words[prbindex]+"\t"+words[xcindex]+"\t"+words[xeindex]+"\t"+words[pindex];
AllLinesInFile.push_back(values);
}
vector<string>SortedLines;
sort2D(AllLinesInFile, SortedLines,newpindex);
for ( int si = 0; si < SortedLines.size(); si++)
fsrt<<SortedLines[si]<<endl;
cout<<"["<<i<<","<<j<<"] = "<<SortedLines.size()<<endl;
}
}
can some one suggest me a better way of doing this? why it is failing for larger values. ?
The primary function of interest for this query is Sort2D function.
thanks for the time and patience.
prasad.