summaryrefslogtreecommitdiff
path: root/src/KaldiLib/Tokenizer.cc
diff options
context:
space:
mode:
authorJoe Zhao <ztuowen@gmail.com>2014-04-14 08:14:45 +0800
committerJoe Zhao <ztuowen@gmail.com>2014-04-14 08:14:45 +0800
commitcccccbf6cca94a3eaf813b4468453160e91c332b (patch)
tree23418cb73a10ae3b0688681a7f0ba9b06424583e /src/KaldiLib/Tokenizer.cc
downloadtnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.gz
tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.bz2
tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.zip
First commit
Diffstat (limited to 'src/KaldiLib/Tokenizer.cc')
-rw-r--r--src/KaldiLib/Tokenizer.cc53
1 files changed, 53 insertions, 0 deletions
diff --git a/src/KaldiLib/Tokenizer.cc b/src/KaldiLib/Tokenizer.cc
new file mode 100644
index 0000000..0c49050
--- /dev/null
+++ b/src/KaldiLib/Tokenizer.cc
@@ -0,0 +1,53 @@
+#include "Tokenizer.h"
+#include "string.h"
+
+namespace TNet
+{
+ //****************************************************************************
+ //****************************************************************************
+ /// Splits pString into tokens at any character of mSeparator and appends
+ /// each token to this container with push_back().
+ /// A separator directly preceded by a backslash does not split; the backslash
+ /// is kept in the token. When mSkipEmpty is set, zero-length tokens are not
+ /// added, including one after a trailing separator or from an empty string.
+ /// @param pString  NUL-terminated input; a null pointer adds nothing.
+ void
+ Tokenizer::
+ AddString(const char* pString)
+ {
+   if (!pString) return;  // std::string cannot be built from a null pointer
+
+   const std::string aux_string(pString);
+   std::string::size_type old_pos = 0;       // start of the current token
+   std::string::size_type search_start = 0;  // where the separator search resumes
+
+   while (old_pos != std::string::npos) {
+     const std::string::size_type cur_pos =
+       aux_string.find_first_of(mSeparator, search_start);
+
+     // escaped separator: keep scanning for the next one, same token
+     if (cur_pos != 0 && cur_pos != std::string::npos &&
+         aux_string[cur_pos - 1] == '\\') {
+       search_start = cur_pos + 1;
+       continue;
+     }
+
+     // the token ends at the separator, or at end of input if none is left
+     const std::string::size_type token_len =
+       (cur_pos == std::string::npos ? aux_string.length() : cur_pos) - old_pos;
+
+     // compare the length, not positions, so a trailing empty token is caught too
+     if (token_len != 0 || !mSkipEmpty) {
+       const std::string token(aux_string, old_pos, token_len);
+       this->push_back(token);
+     }
+
+     // continue behind the separator, or stop once the input is exhausted
+     old_pos = (cur_pos == std::string::npos) ? cur_pos : cur_pos + 1;
+     search_start = old_pos;
+   }
+ }
+
+
+} // namespace TNet
+