summary | refs | log | tree | commit | diff
path: root/tools/fileDataProvider.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tools/fileDataProvider.cpp')
-rw-r--r-- tools/fileDataProvider.cpp 173
1 files changed, 173 insertions, 0 deletions
diff --git a/tools/fileDataProvider.cpp b/tools/fileDataProvider.cpp
new file mode 100644
index 0000000..e9b7f3d
--- /dev/null
+++ b/tools/fileDataProvider.cpp
@@ -0,0 +1,173 @@
+//
+// Created by joe on 5/13/15.
+//
+
+#include "fileDataProvider.h"
+#include <ctime>
+#include <random>
+#include <set>
+
+using namespace std;
+
+// File-scope Mersenne Twister engine. It is default-constructed here,
+// re-seeded from time(NULL) at the start of RidFileDP::take(), and drawn
+// from by scrambler().
+mt19937 gen;
+
+/**
+ * Reads every remaining entry from `fin` into `out`.
+ *
+ * Input consumed from `fin`: one integer feature count, followed by entries of
+ * the form `rank qid f_0 ... f_{fsize-1}`. Reading stops at the first entry
+ * whose rank is 0, at end of input, or at the first token that cannot be
+ * parsed. An entry that could not be read completely is discarded instead of
+ * being stored with unread fields.
+ *
+ * @param out  Cleared first, then given the feature size and one newly
+ *             allocated DataEntry per record via addEntry().
+ * @return Always 0. `eof` is set to true before returning.
+ */
+int FileDP::getDataSet(DataList &out){
+    out.clear();
+
+    int fsize = 0;
+    const bool headerOk = (fin >> fsize) && fsize >= 0;
+    if (!headerOk) {
+        // Without a usable feature count no entry can be parsed.
+        LOG(ERROR) << "Could not read a valid feature size";
+        fsize = 0;
+    }
+    LOG(INFO)<<"Feature size:"<<fsize;
+    out.setfSize(fsize);
+
+    while (headerOk) {
+        DataEntry *e = new DataEntry;
+        // Test the extraction itself: once the stream has failed, later reads
+        // leave their target untouched and eof() alone may never become true.
+        if (!(fin >> e->rank) || e->rank == 0) {
+            delete e;
+            break;
+        }
+        fin >> e->qid;
+        e->feature.resize(fsize);
+        for (int i = 0; i < fsize; ++i) {
+            fin >> e->feature(i);
+        }
+        if (!fin) {
+            // Truncated or malformed record: do not hand it to the caller.
+            LOG(ERROR) << "Incomplete entry discarded";
+            delete e;
+            break;
+        }
+        out.addEntry(e);
+    }
+    eof = true;
+    return 0;
+}
+
+/**
+ * Loads all entries from `fin` into the member list `d`.
+ *
+ * Input consumed from `fin`: one integer feature count, followed by entries of
+ * the form `qid f_0 ... f_{fsize-1}`. Every stored entry gets rank -1.
+ * Reading stops at an entry whose qid is the string "0", at end of input, or
+ * at the first token that cannot be parsed. An entry that could not be read
+ * completely is discarded, so trailing whitespace in the file does not produce
+ * an extra entry with an empty qid.
+ *
+ * On return `pos` is 0, `qid` is 1 and `read` is true.
+ */
+void RidFileDP::readEntries() {
+    d.clear();
+
+    int fsize = 0;
+    const bool headerOk = (fin >> fsize) && fsize >= 0;
+    if (!headerOk) {
+        // Without a usable feature count no entry can be parsed.
+        LOG(ERROR) << "Could not read a valid feature size";
+        fsize = 0;
+    }
+    LOG(INFO) << "Feature size:" << fsize;
+    d.setfSize(fsize);
+
+    while (headerOk) {
+        DataEntry *e = new DataEntry;
+        // Test the extraction itself: a failed string read leaves qid empty
+        // (never "0"), so checking only eof() would store a bogus entry or
+        // loop forever on a stream stuck in the fail state.
+        if (!(fin >> e->qid) || e->qid == "0") {
+            delete e;
+            break;
+        }
+        e->feature.resize(fsize);
+        e->rank = -1;
+        for (int i = 0; i < fsize; ++i) {
+            fin >> e->feature(i);
+        }
+        if (!fin) {
+            // Truncated or malformed record: do not store it.
+            LOG(ERROR) << "Incomplete entry discarded";
+            delete e;
+            break;
+        }
+        d.addEntry(e);
+    }
+    pos = 0;
+    qid = 1;
+    read = true;
+}
+
+/**
+ * Builds the pairwise data set for the entry currently at `pos`.
+ *
+ * Loads the entries first if that has not happened yet. For every stored entry
+ * other than the one at `pos` (the reference), a new DataEntry is appended to
+ * `out` with:
+ *   - feature(j) = |entry.feature(j) - reference.feature(j)|
+ *   - qid        = the reference's qid
+ *   - rank       = 1 if the entry shares the reference's qid, otherwise -1
+ *
+ * Side effects on the stored entries: each entry sharing the reference's qid
+ * gets its rank set to the current `qid` counter; afterwards the reference's
+ * qid is overwritten with the decimal string of the counter, the counter is
+ * incremented, and the reference's rank is set to the incremented value.
+ * `pos` then advances to the next entry whose rank is still -1, and `eof` is
+ * set once `pos` reaches the end of the list.
+ *
+ * @param out  Cleared and given the feature size, then filled as described.
+ * @return Always 0. If `pos` does not refer to a stored entry (nothing was
+ *         loaded, or all entries were already consumed) `out` stays empty and
+ *         `eof` is set.
+ */
+int RidFileDP::getDataSet(DataList &out){
+    if (!read)
+        readEntries();
+    out.clear();
+    const int fsize = d.getfSize();
+    out.setfSize(fsize);
+    std::vector<DataEntry*> & dat = d.getData();
+
+    // Guard the dat[pos] accesses below; a negative signed pos also fails
+    // this comparison after conversion to the unsigned size type.
+    if (!(pos < dat.size())) {
+        eof = true;
+        return 0;
+    }
+
+    DataEntry *ref = dat[pos];
+    for (int i = 0; i < d.getSize(); ++i) {
+        if (i == pos)
+            continue;
+        DataEntry *e = new DataEntry;
+        if (dat[i]->qid == ref->qid) {
+            e->rank = 1;
+            // Mark the stored entry as consumed so that pos skips it later.
+            dat[i]->rank = qid;
+        } else {
+            e->rank = -1;
+        }
+        e->feature.resize(fsize);
+        e->qid = ref->qid;
+        for (int j = 0; j < fsize; ++j) {
+            e->feature(j) = fabs(dat[i]->feature(j) - ref->feature(j));
+        }
+        out.addEntry(e);
+    }
+
+    // The reference is relabelled only after the loop, so the comparisons
+    // above used its original qid.
+    ref->qid = std::to_string(qid);
+    ++qid;
+    ref->rank = qid;
+
+    while (pos < dat.size() && dat[pos]->rank != -1)
+        ++pos;
+    if (pos == d.getSize())
+        eof = true;
+    return 0;
+}
+
+/**
+ * Counts the distinct qid values among the stored entries.
+ *
+ * Loads the entries first if that has not happened yet. The qids are compared
+ * as they are at the time of the call.
+ *
+ * @return Number of distinct qid strings.
+ */
+int RidFileDP::getpSize() {
+    if (!read)
+        readEntries();
+    std::vector<DataEntry*> &dat = d.getData();
+
+    // A set deduplicates in O(n log n) instead of rescanning a list of
+    // already-seen qids for every entry.
+    std::set<string> seen;
+    for (DataEntry *entry : dat)
+        seen.insert(entry->qid);
+    return static_cast<int>(seen.size());
+}
+
+// Randomly permutes `dat` in place, drawing one value from the file-scope
+// engine `gen` per element. The vector is walked from the back: slot `last`
+// is swapped with a slot picked from [0, last], so the tail that has already
+// been placed is never touched again.
+void scrambler(vector<DataEntry*> &dat)
+{
+    const int count = static_cast<int>(dat.size());
+    for (int last = count - 1; last >= 0; --last)
+    {
+        const int pick = static_cast<int>(gen() % (last + 1));
+        DataEntry* const held = dat[pick];
+        dat[pick] = dat[last];
+        dat[last] = held;
+    }
+}
+
+/**
+ * Randomly splits the stored entries into two pointer lists by qid group.
+ *
+ * Loads the entries first if that has not happened yet. A shuffled copy of the
+ * entry pointers is scanned from the front: up to `n` times, the first
+ * still-unassigned entry and every later entry with the same qid are moved
+ * into `a`. Everything left over goes into `b`. Both results are shuffled
+ * before returning. The order of the stored list itself is not changed, and
+ * the pointers in `a` and `b` refer to the stored entries (no copies are made).
+ *
+ * The file-scope engine `gen` is re-seeded from time(NULL) on every call.
+ *
+ * @param n  Number of qid groups to place in `a`. If fewer groups exist, all
+ *           entries end up in `a` and `b` is empty.
+ * @param a  Cleared, then receives the selected groups.
+ * @param b  Cleared, then receives the remaining entries.
+ */
+void RidFileDP::take(int n,vector<DataEntry*> &a,vector<DataEntry*> &b)
+{
+    gen.seed(time(NULL));
+    if (!read)
+        readEntries();
+    a.clear();
+    b.clear();
+
+    std::vector<DataEntry*> &dat = d.getData();
+    vector<DataEntry*> tmp(dat.begin(), dat.end());
+    // Shuffle only after the copy has been filled; shuffling the still-empty
+    // vector would leave the group selection in file order.
+    scrambler(tmp);
+
+    size_t pos = 0;
+    for (int i = 0; i < n; ++i)
+    {
+        // Skip entries already handed to `a`.
+        while (pos < tmp.size() && tmp[pos] == nullptr)
+            ++pos;
+        if (pos == tmp.size())
+            break;  // fewer than n groups available
+
+        const string qid = tmp[pos]->qid;
+        a.push_back(tmp[pos]);
+        tmp[pos] = nullptr;
+        for (size_t j = pos + 1; j < tmp.size(); ++j)
+            if (tmp[j] != nullptr && tmp[j]->qid == qid)
+            {
+                a.push_back(tmp[j]);
+                tmp[j] = nullptr;
+            }
+    }
+
+    for (size_t i = 0; i < tmp.size(); ++i)
+        if (tmp[i] != nullptr)
+            b.push_back(tmp[i]);
+
+    scrambler(a);
+    scrambler(b);
+} \ No newline at end of file