// // Created by joe on 5/13/15. // #include "fileDataProvider.h" #include <random> #include <ctime> using namespace std; mt19937 gen; int FileDP::getDataSet(DataList &out){ DataEntry* e; out.clear(); int fsize; fin>>fsize; LOG(INFO)<<"Feature size:"<<fsize; out.setfSize(fsize); while (!fin.eof()) { e = new DataEntry; fin>>e->rank; if (e->rank == 0) { delete e; break; } fin>>e->qid; e->feature.resize(fsize); for (int i=0;i<fsize;++i) { fin>>e->feature(i); } out.addEntry(e); } eof=true; return 0; } void RidFileDP::readEntries() { DataEntry *e; int fsize; d.clear(); fin >> fsize; LOG(INFO) << "Feature size:" << fsize; d.setfSize(fsize); while (!fin.eof()) { e = new DataEntry; fin >> e->qid; if (e->qid == "0") { delete e; break; } e->feature.resize(fsize); e->rank=-1; for (int i = 0; i < fsize; ++i) { fin >> e->feature(i); } d.addEntry(e); } pos = 0; std::vector<DataEntry*> & dat = d.getData(); while (pos<dat.size() && dat[pos]->rank!=-1 && dat[pos]->qid!="-1") ++pos; qid = 1; read = true; } int RidFileDP::getDataSet(DataList &out){ DataEntry *e; int fsize; if (!read) readEntries(); out.clear(); fsize = d.getfSize(); out.setfSize(fsize); std::vector<DataEntry*> & dat = d.getData(); for (int i=0;i<d.getSize();++i) if (i!=pos) { if (dat[i]->qid == dat[pos]->qid) { e = new DataEntry; e->rank=1; dat[i]->rank=qid; } else { e = new DataEntry; e->rank=-1; } e->feature.resize(d.getfSize()); e->qid=dat[pos]->qid; for (int j = 0; j < fsize; ++j) { e->feature(j) = fabs(dat[i]->feature(j) -dat[pos]->feature(j)); } out.addEntry(e); } ++qid; dat[pos]->rank=qid; while (pos<dat.size() && (dat[pos]->rank!=-1 || dat[pos]->qid=="-1")) ++pos; if (pos==d.getSize()) eof = true; return 0; } int RidFileDP::getpSize() { std::vector<string> p; if (!read) readEntries(); std::vector<DataEntry*> &dat = d.getData(); for (int i=0;i<dat.size();++i) { bool ext=false; for (int j=0;j<p.size();++j) if (p[j] == dat[i]->qid ) { ext=true; break; } if (!ext) p.push_back(dat[i]->qid); } return p.size(); }; void scrambler(vector<DataEntry*> &dat) { DataEntry* e; int sz=(int)dat.size(); for (int i=0;i<sz;++i) { int pos = (int)(gen()%(sz-i)); e=dat[pos]; dat[pos] = dat[sz-i-1]; dat[sz-i-1] = e; } } void RidFileDP::take(int n,vector<DataEntry*> &a,vector<DataEntry*> &b) { gen.seed(time(NULL)); DataEntry *e; if (!read) readEntries(); vector<DataEntry*> tmp; tmp.reserve(d.getSize()); a.clear(); b.clear(); std::vector<DataEntry*> &dat = d.getData(); scrambler(tmp); for (int i=0;i<dat.size();++i) tmp.push_back(dat[i]); int pos = 0; string qid; for (int i=0;i<n;++i) { while (tmp[pos]==NULL) ++pos; qid = tmp[pos]->qid; a.push_back(tmp[pos]); tmp[pos]=NULL; for (int j = pos+1; j< tmp.size();++j) if (tmp[j]!=NULL &&tmp[j]->qid==qid) { a.push_back(tmp[j]); tmp[j]=NULL; } } for (int i=0;i<tmp.size();++i) if (tmp[i]!=NULL) b.push_back(tmp[i]); scrambler(a); scrambler(b); } void RidFileDP::getAllDataSet(RidList &out){ DataEntry *e; if (!read) readEntries(); out.clear(); std::vector<DataEntry*> &dat = d.getData(); for (int i=0;i<dat.size();++i) out.addEntry(dat[i]); out.setfSize(d.getfSize()); }