author     Joe Zhao <ztuowen@gmail.com>  2014-04-14 08:14:45 +0800
committer  Joe Zhao <ztuowen@gmail.com>  2014-04-14 08:14:45 +0800
commit     cccccbf6cca94a3eaf813b4468453160e91c332b (patch)
tree       23418cb73a10ae3b0688681a7f0ba9b06424583e
download   tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.gz
           tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.bz2
           tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.zip
First commit
-rw-r--r--NetGen/layerobj.py160
-rw-r--r--NetGen/layerobj.pyc bin 0 -> 8454 bytes
-rw-r--r--NetGen/mathobj.py32
-rw-r--r--NetGen/mathobj.pyc bin 0 -> 1177 bytes
-rw-r--r--NetGen/testscript.py72
-rw-r--r--src/.depend.mk 0
-rw-r--r--src/.depend.mk1 618
-rw-r--r--src/.depend.mk2 757
-rw-r--r--src/CuBaseLib/.depend.mk279
-rw-r--r--src/CuBaseLib/.svn/entries572
-rw-r--r--src/CuBaseLib/.svn/prop-base/Makefile.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cumath.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/curand.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base5
-rw-r--r--src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base5
-rw-r--r--src/CuBaseLib/.svn/text-base/Makefile.svn-base59
-rw-r--r--src/CuBaseLib/.svn/text-base/cucommon.h.svn-base46
-rw-r--r--src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base129
-rw-r--r--src/CuBaseLib/.svn/text-base/cudevice.h.svn-base79
-rw-r--r--src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base626
-rw-r--r--src/CuBaseLib/.svn/text-base/cukernels.h.svn-base81
-rw-r--r--src/CuBaseLib/.svn/text-base/cumath.cc.svn-base574
-rw-r--r--src/CuBaseLib/.svn/text-base/cumath.h.svn-base146
-rw-r--r--src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base199
-rw-r--r--src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base627
-rw-r--r--src/CuBaseLib/.svn/text-base/curand.h.svn-base40
-rw-r--r--src/CuBaseLib/.svn/text-base/curand.tcc.svn-base228
-rw-r--r--src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base135
-rw-r--r--src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base34
-rw-r--r--src/CuBaseLib/.svn/text-base/cuvector.h.svn-base104
-rw-r--r--src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base254
-rw-r--r--src/CuBaseLib/Makefile59
-rw-r--r--src/CuBaseLib/cucommon.h46
-rw-r--r--src/CuBaseLib/cudevice.cc129
-rw-r--r--src/CuBaseLib/cudevice.h79
-rw-r--r--src/CuBaseLib/cukernels.cu626
-rw-r--r--src/CuBaseLib/cukernels.h81
-rw-r--r--src/CuBaseLib/cumath.cc574
-rw-r--r--src/CuBaseLib/cumath.h146
-rw-r--r--src/CuBaseLib/cumatrix.h221
-rw-r--r--src/CuBaseLib/cumatrix.h~214
-rw-r--r--src/CuBaseLib/cumatrix.tcc660
-rw-r--r--src/CuBaseLib/curand.h40
-rw-r--r--src/CuBaseLib/curand.tcc228
-rw-r--r--src/CuBaseLib/curandkernels.cu135
-rw-r--r--src/CuBaseLib/curandkernels.cu~135
-rw-r--r--src/CuBaseLib/curandkernels.h34
-rw-r--r--src/CuBaseLib/cuvector.h121
-rw-r--r--src/CuBaseLib/cuvector.h~104
-rw-r--r--src/CuBaseLib/cuvector.tcc254
-rw-r--r--src/CuTNetLib/.depend.mk2250
-rw-r--r--src/CuTNetLib/.svn/entries946
-rw-r--r--src/CuTNetLib/.svn/prop-base/Makefile.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuActivation.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuActivation.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuCRBEDctFeat.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuCache.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuCache.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuComponent.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuNetwork.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuNetwork.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuRbm.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuRbm.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuRecurrent.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuRecurrent.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuSharedLinearity.cc.svn-base5
-rw-r--r--src/CuTNetLib/.svn/prop-base/cuSharedLinearity.h.svn-base5
-rw-r--r--src/CuTNetLib/.svn/text-base/Makefile.svn-base30
-rw-r--r--src/CuTNetLib/.svn/text-base/cuActivation.cc.svn-base46
-rw-r--r--src/CuTNetLib/.svn/text-base/cuActivation.h.svn-base123
-rw-r--r--src/CuTNetLib/.svn/text-base/cuBiasedLinearity.cc.svn-base123
-rw-r--r--src/CuTNetLib/.svn/text-base/cuBiasedLinearity.h.svn-base85
-rw-r--r--src/CuTNetLib/.svn/text-base/cuBlockArray.cc.svn-base139
-rw-r--r--src/CuTNetLib/.svn/text-base/cuBlockArray.h.svn-base83
-rw-r--r--src/CuTNetLib/.svn/text-base/cuCRBEDctFeat.h.svn-base310
-rw-r--r--src/CuTNetLib/.svn/text-base/cuCache.cc.svn-base203
-rw-r--r--src/CuTNetLib/.svn/text-base/cuCache.h.svn-base74
-rw-r--r--src/CuTNetLib/.svn/text-base/cuComponent.h.svn-base384
-rw-r--r--src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.cc.svn-base160
-rw-r--r--src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.h.svn-base90
-rw-r--r--src/CuTNetLib/.svn/text-base/cuNetwork.cc.svn-base380
-rw-r--r--src/CuTNetLib/.svn/text-base/cuNetwork.h.svn-base220
-rw-r--r--src/CuTNetLib/.svn/text-base/cuObjectiveFunction.cc.svn-base87
-rw-r--r--src/CuTNetLib/.svn/text-base/cuObjectiveFunction.h.svn-base166
-rw-r--r--src/CuTNetLib/.svn/text-base/cuRbm.cc.svn-base244
-rw-r--r--src/CuTNetLib/.svn/text-base/cuRbm.h.svn-base146
-rw-r--r--src/CuTNetLib/.svn/text-base/cuRbmSparse.cc.svn-base269
-rw-r--r--src/CuTNetLib/.svn/text-base/cuRbmSparse.h.svn-base134
-rw-r--r--src/CuTNetLib/.svn/text-base/cuRecurrent.cc.svn-base191
-rw-r--r--src/CuTNetLib/.svn/text-base/cuRecurrent.h.svn-base101
-rw-r--r--src/CuTNetLib/.svn/text-base/cuSharedLinearity.cc.svn-base179
-rw-r--r--src/CuTNetLib/.svn/text-base/cuSharedLinearity.h.svn-base85
-rw-r--r--src/CuTNetLib/.svn/text-base/cuSparseLinearity.cc.svn-base190
-rw-r--r--src/CuTNetLib/.svn/text-base/cuSparseLinearity.h.svn-base104
-rw-r--r--src/CuTNetLib/Makefile30
-rw-r--r--src/CuTNetLib/cuActivation.cc46
-rw-r--r--src/CuTNetLib/cuActivation.h132
-rw-r--r--src/CuTNetLib/cuBiasedLinearity.cc123
-rw-r--r--src/CuTNetLib/cuBiasedLinearity.h98
-rw-r--r--src/CuTNetLib/cuBlockArray.cc138
-rw-r--r--src/CuTNetLib/cuBlockArray.h90
-rw-r--r--src/CuTNetLib/cuCRBEDctFeat.h340
-rw-r--r--src/CuTNetLib/cuCache.cc203
-rw-r--r--src/CuTNetLib/cuCache.h94
-rw-r--r--src/CuTNetLib/cuCompDisc.cc178
-rw-r--r--src/CuTNetLib/cuCompDisc.h288
-rw-r--r--src/CuTNetLib/cuComponent.h505
-rw-r--r--src/CuTNetLib/cuConcatenate.cc138
-rw-r--r--src/CuTNetLib/cuConcatenate.h90
-rw-r--r--src/CuTNetLib/cuDiscreteLinearity.cc160
-rw-r--r--src/CuTNetLib/cuDiscreteLinearity.h97
-rw-r--r--src/CuTNetLib/cuLinearity.cc107
-rw-r--r--src/CuTNetLib/cuLinearity.h94
-rw-r--r--src/CuTNetLib/cuMisc.h555
-rw-r--r--src/CuTNetLib/cuNetwork.cc442
-rw-r--r--src/CuTNetLib/cuNetwork.h227
-rw-r--r--src/CuTNetLib/cuObjectiveFunction.cc87
-rw-r--r--src/CuTNetLib/cuObjectiveFunction.h185
-rw-r--r--src/CuTNetLib/cuRbm.cc244
-rw-r--r--src/CuTNetLib/cuRbm.h146
-rw-r--r--src/CuTNetLib/cuRbmSparse.cc269
-rw-r--r--src/CuTNetLib/cuRbmSparse.h134
-rw-r--r--src/CuTNetLib/cuRecurrent.cc191
-rw-r--r--src/CuTNetLib/cuRecurrent.h101
-rw-r--r--src/CuTNetLib/cuSharedLinearity.cc179
-rw-r--r--src/CuTNetLib/cuSharedLinearity.h94
-rw-r--r--src/CuTNetLib/cuSparseLinearity.cc190
-rw-r--r--src/CuTNetLib/cuSparseLinearity.h115
-rw-r--r--src/CuTNetLib/cuUpdatableBias.cc96
-rw-r--r--src/CuTNetLib/cuUpdatableBias.h109
-rw-r--r--src/GotoBLASLib/.svn/entries96
-rw-r--r--src/GotoBLASLib/.svn/prop-base/README.svn-base5
-rw-r--r--src/GotoBLASLib/.svn/text-base/00License.txt.svn-base32
-rw-r--r--src/GotoBLASLib/.svn/text-base/README.svn-base8
-rw-r--r--src/GotoBLASLib/00License.txt32
-rw-r--r--src/GotoBLASLib/README8
-rwxr-xr-xsrc/GotoBLASLib/libgoto2.so bin 0 -> 17870880 bytes
-rwxr-xr-xsrc/GotoBLASLib/libgoto2_64.so bin 0 -> 21633422 bytes
-rw-r--r--src/KaldiLib/.depend.mk959
-rw-r--r--src/KaldiLib/.svn/entries1116
-rw-r--r--src/KaldiLib/.svn/prop-base/Common.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Common.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Error.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Features.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Features.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Labels.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Labels.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Makefile.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/MathAux.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Matrix.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Matrix.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Matrix.tcc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/MlfStream.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/MlfStream.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/MlfStream.tcc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/StkMatch.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/StkMatch.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/StkStream.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/StkStream.tcc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Timer.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Timer.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Tokenizer.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Tokenizer.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Types.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/UserInterface.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/UserInterface.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Vector.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Vector.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/Vector.tcc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/cblas.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/clapack.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/prop-base/clapack.h.svn-base5
-rw-r--r--src/KaldiLib/.svn/text-base/Common.cc.svn-base277
-rw-r--r--src/KaldiLib/.svn/text-base/Common.h.svn-base233
-rw-r--r--src/KaldiLib/.svn/text-base/Error.h.svn-base155
-rw-r--r--src/KaldiLib/.svn/text-base/Features.cc.svn-base1798
-rw-r--r--src/KaldiLib/.svn/text-base/Features.h.svn-base597
-rw-r--r--src/KaldiLib/.svn/text-base/Labels.cc.svn-base215
-rw-r--r--src/KaldiLib/.svn/text-base/Labels.h.svn-base75
-rw-r--r--src/KaldiLib/.svn/text-base/Makefile.svn-base28
-rw-r--r--src/KaldiLib/.svn/text-base/MathAux.h.svn-base117
-rw-r--r--src/KaldiLib/.svn/text-base/Matrix.cc.svn-base295
-rw-r--r--src/KaldiLib/.svn/text-base/Matrix.h.svn-base677
-rw-r--r--src/KaldiLib/.svn/text-base/Matrix.tcc.svn-base796
-rw-r--r--src/KaldiLib/.svn/text-base/MlfStream.cc.svn-base268
-rw-r--r--src/KaldiLib/.svn/text-base/MlfStream.h.svn-base639
-rw-r--r--src/KaldiLib/.svn/text-base/MlfStream.tcc.svn-base517
-rw-r--r--src/KaldiLib/.svn/text-base/StkMatch.cc.svn-base582
-rw-r--r--src/KaldiLib/.svn/text-base/StkMatch.h.svn-base123
-rw-r--r--src/KaldiLib/.svn/text-base/StkStream.h.svn-base526
-rw-r--r--src/KaldiLib/.svn/text-base/StkStream.tcc.svn-base228
-rw-r--r--src/KaldiLib/.svn/text-base/Timer.cc.svn-base5
-rw-r--r--src/KaldiLib/.svn/text-base/Timer.h.svn-base103
-rw-r--r--src/KaldiLib/.svn/text-base/Tokenizer.cc.svn-base53
-rw-r--r--src/KaldiLib/.svn/text-base/Tokenizer.h.svn-base45
-rw-r--r--src/KaldiLib/.svn/text-base/Types.h.svn-base78
-rw-r--r--src/KaldiLib/.svn/text-base/UserInterface.cc.svn-base669
-rw-r--r--src/KaldiLib/.svn/text-base/UserInterface.h.svn-base166
-rw-r--r--src/KaldiLib/.svn/text-base/Vector.cc.svn-base110
-rw-r--r--src/KaldiLib/.svn/text-base/Vector.h.svn-base496
-rw-r--r--src/KaldiLib/.svn/text-base/Vector.tcc.svn-base638
-rw-r--r--src/KaldiLib/.svn/text-base/cblas.h.svn-base596
-rw-r--r--src/KaldiLib/.svn/text-base/clapack.cc.svn-base61
-rw-r--r--src/KaldiLib/.svn/text-base/clapack.h.svn-base149
-rw-r--r--src/KaldiLib/Common.cc277
-rw-r--r--src/KaldiLib/Common.h233
-rw-r--r--src/KaldiLib/Error.h155
-rw-r--r--src/KaldiLib/Features.cc1798
-rw-r--r--src/KaldiLib/Features.h597
-rw-r--r--src/KaldiLib/Labels.cc215
-rw-r--r--src/KaldiLib/Labels.h75
-rw-r--r--src/KaldiLib/Makefile28
-rw-r--r--src/KaldiLib/MathAux.h117
-rw-r--r--src/KaldiLib/Matrix.cc295
-rw-r--r--src/KaldiLib/Matrix.h677
-rw-r--r--src/KaldiLib/Matrix.tcc796
-rw-r--r--src/KaldiLib/MlfStream.cc268
-rw-r--r--src/KaldiLib/MlfStream.h639
-rw-r--r--src/KaldiLib/MlfStream.tcc517
-rw-r--r--src/KaldiLib/StkMatch.cc582
-rw-r--r--src/KaldiLib/StkMatch.h123
-rw-r--r--src/KaldiLib/StkStream.h526
-rw-r--r--src/KaldiLib/StkStream.tcc228
-rw-r--r--src/KaldiLib/Timer.cc5
-rw-r--r--src/KaldiLib/Timer.h103
-rw-r--r--src/KaldiLib/Tokenizer.cc53
-rw-r--r--src/KaldiLib/Tokenizer.h45
-rw-r--r--src/KaldiLib/Types.h78
-rw-r--r--src/KaldiLib/UserInterface.cc669
-rw-r--r--src/KaldiLib/UserInterface.h166
-rw-r--r--src/KaldiLib/Vector.cc110
-rw-r--r--src/KaldiLib/Vector.h496
-rw-r--r--src/KaldiLib/Vector.tcc638
-rw-r--r--src/KaldiLib/cblas.h596
-rw-r--r--src/KaldiLib/clapack.cc61
-rw-r--r--src/KaldiLib/clapack.h149
-rw-r--r--src/Makefile162
-rw-r--r--src/STKLib/.svn/entries62
-rw-r--r--src/STKLib/.svn/prop-base/Makefile.svn-base5
-rw-r--r--src/STKLib/.svn/text-base/Makefile.svn-base29
-rw-r--r--src/STKLib/Makefile29
-rw-r--r--src/TFeaCat.cc281
-rw-r--r--src/TFeaCatCu.cc283
-rw-r--r--src/TJoiner.cc342
-rw-r--r--src/TMpeCu.cc715
-rw-r--r--src/TNet.cc379
-rw-r--r--src/TNetCu.cc508
-rw-r--r--src/TNetLib/.depend.mk946
-rw-r--r--src/TNetLib/.svn/entries878
-rw-r--r--src/TNetLib/.svn/text-base/Activation.cc.svn-base138
-rw-r--r--src/TNetLib/.svn/text-base/Activation.h.svn-base104
-rw-r--r--src/TNetLib/.svn/text-base/Barrier.cc.svn-base143
-rw-r--r--src/TNetLib/.svn/text-base/Barrier.h.svn-base41
-rw-r--r--src/TNetLib/.svn/text-base/BiasedLinearity.cc.svn-base180
-rw-r--r--src/TNetLib/.svn/text-base/BiasedLinearity.h.svn-base92
-rw-r--r--src/TNetLib/.svn/text-base/BlockArray.cc.svn-base136
-rw-r--r--src/TNetLib/.svn/text-base/BlockArray.h.svn-base85
-rw-r--r--src/TNetLib/.svn/text-base/CRBEDctFeat.h.svn-base432
-rw-r--r--src/TNetLib/.svn/text-base/Cache.cc.svn-base248
-rw-r--r--src/TNetLib/.svn/text-base/Cache.h.svn-base74
-rw-r--r--src/TNetLib/.svn/text-base/Component.h.svn-base387
-rw-r--r--src/TNetLib/.svn/text-base/Makefile.svn-base29
-rw-r--r--src/TNetLib/.svn/text-base/Mutex.cc.svn-base48
-rw-r--r--src/TNetLib/.svn/text-base/Mutex.h.svn-base34
-rw-r--r--src/TNetLib/.svn/text-base/Nnet.cc.svn-base360
-rw-r--r--src/TNetLib/.svn/text-base/Nnet.h.svn-base194
-rw-r--r--src/TNetLib/.svn/text-base/ObjFun.cc.svn-base231
-rw-r--r--src/TNetLib/.svn/text-base/ObjFun.h.svn-base160
-rw-r--r--src/TNetLib/.svn/text-base/Platform.h.svn-base397
-rw-r--r--src/TNetLib/.svn/text-base/Semaphore.cc.svn-base64
-rw-r--r--src/TNetLib/.svn/text-base/Semaphore.h.svn-base26
-rw-r--r--src/TNetLib/.svn/text-base/SharedLinearity.cc.svn-base277
-rw-r--r--src/TNetLib/.svn/text-base/SharedLinearity.h.svn-base103
-rw-r--r--src/TNetLib/.svn/text-base/Thread.h.svn-base53
-rw-r--r--src/TNetLib/Activation.cc138
-rw-r--r--src/TNetLib/Activation.h104
-rw-r--r--src/TNetLib/Barrier.cc143
-rw-r--r--src/TNetLib/Barrier.h41
-rw-r--r--src/TNetLib/BiasedLinearity.cc180
-rw-r--r--src/TNetLib/BiasedLinearity.h92
-rw-r--r--src/TNetLib/BlockArray.cc136
-rw-r--r--src/TNetLib/BlockArray.h85
-rw-r--r--src/TNetLib/CRBEDctFeat.h432
-rw-r--r--src/TNetLib/Cache.cc248
-rw-r--r--src/TNetLib/Cache.h74
-rw-r--r--src/TNetLib/Component.h387
-rw-r--r--src/TNetLib/Makefile29
-rw-r--r--src/TNetLib/Mutex.cc48
-rw-r--r--src/TNetLib/Mutex.h34
-rw-r--r--src/TNetLib/Nnet.cc360
-rw-r--r--src/TNetLib/Nnet.h194
-rw-r--r--src/TNetLib/ObjFun.cc231
-rw-r--r--src/TNetLib/ObjFun.h160
-rw-r--r--src/TNetLib/Platform.h402
-rw-r--r--src/TNetLib/Platform.h~402
-rw-r--r--src/TNetLib/Semaphore.cc64
-rw-r--r--src/TNetLib/Semaphore.h26
-rw-r--r--src/TNetLib/SharedLinearity.cc277
-rw-r--r--src/TNetLib/SharedLinearity.h103
-rw-r--r--src/TNetLib/Thread.h53
-rw-r--r--src/TNorm.cc324
-rw-r--r--src/TNormCu.cc350
-rw-r--r--src/TRbmCu.cc396
-rw-r--r--src/TRecurrentCu.cc420
-rw-r--r--src/TSegmenter.cc265
-rwxr-xr-xsrc/quick_build.sh2
-rw-r--r--src/tnet.mk82
321 files changed, 63747 insertions, 0 deletions
diff --git a/NetGen/layerobj.py b/NetGen/layerobj.py
new file mode 100644
index 0000000..3cfadf0
--- /dev/null
+++ b/NetGen/layerobj.py
@@ -0,0 +1,160 @@
+'''
+Created on Mar 16, 2014
+
+@author: joe
+'''
+
+import random
+import mathobj
+
+matRandFnc=lambda :3*random.gauss(0.0,1.0)
+vecRandFnc=lambda :random.random()/5.0-4.1
+
+class SharedLinearity:
+    def __init__(self,nIn=1,nOut=1,inS=True,
+                 matRand=matRandFnc,vecRand=vecRandFnc):
+        self.nInputs=nIn
+        self.nOutputs=nOut
+        self.inScale=inS
+        self.matRand=matRand
+        self.vecRand=vecRand
+
+    def flush(self):
+        print "<biasedlinearity>", self.nOutputs, self.nInputs
+        mathobj.randomFnc=self.matRand
+        mathobj.inputScale=self.inScale
+        mathobj.PrintTransMatrix(self.nInputs, self.nOutputs)
+        mathobj.randomFnc=self.vecRand
+        mathobj.PrintVector(self.nOutputs)
+
+class Linearity:
+    def __init__(self,nIn=1,nOut=1,inS=True,
+                 matRand=matRandFnc):
+        self.nInputs=nIn
+        self.nOutputs=nOut
+        self.inScale=inS
+        self.matRand=matRand
+
+    def flush(self):
+        print "<linearity>", self.nOutputs, self.nInputs
+        mathobj.randomFnc=self.matRand
+        mathobj.inputScale=self.inScale
+        mathobj.PrintTransMatrix(self.nInputs, self.nOutputs)
+
+class UpdatableBias:
+    def __init__(self,nIn=1,vecRand=vecRandFnc):
+        self.nInputs=nIn
+        self.nOutputs=nIn
+        self.vecRand=vecRand
+
+    def flush(self):
+        print "<updatablebias>", self.nOutputs, self.nInputs
+        mathobj.randomFnc=self.vecRand
+        mathobj.PrintVector(self.nInputs)
+
+class MiscPipe:
+    def __init__(self,nIn=1,name="<pipe>"):
+        self.nInputs=nIn
+        self.nOutputs=nIn
+        self.name=name
+
+    def flush(self):
+        print self.name, self.nOutputs, self.nInputs
+
+class Distrib:
+    def __init__(self,nIn=1,size=1):
+        self.nInputs=nIn
+        self.nOutputs=nIn
+        self.size=size
+
+    def flush(self):
+        print "<distrib>", self.nOutputs, self.nInputs
+        print self.size
+
+class Combine:
+    def __init__(self,nIn=1,size=1):
+        self.nInputs=nIn
+        self.nOutputs=nIn
+        self.size=size
+
+    def flush(self):
+        print "<combine>", self.nOutputs, self.nInputs
+        print self.size
+
+class Divide:
+    def __init__(self,nIn=1,divLen=[]):
+        self.nInputs=nIn
+        self.nOutputs=divLen[0]
+        self.divLen=divLen
+
+    def push(self,nxtLen):
+        self.divLen+=[nxtLen];
+
+    def flush(self):
+        print "<divide>", self.nOutputs, self.nInputs
+        print len(self.divLen),
+        for Len in self.divLen:
+            print Len,
+        print
+
+class Merge:
+    def __init__(self,nOut=1,divLen=[]):
+        self.nOutputs=nOut
+        self.nInputs=divLen[0]
+        self.divLen=divLen
+
+    def push(self,nxtLen):
+        self.divLen+=[nxtLen];
+
+    def flush(self):
+        print "<merge>", self.nOutputs, self.nInputs
+        print len(self.divLen),
+        for Len in self.divLen:
+            print Len,
+        print
+
+class Reorder:
+    def __init__(self,nIn=1,Order=[]):
+        self.nInputs=nIn
+        self.nOutputs=nIn
+        self.Order=Order
+
+    def push(self,nxtPos):
+        self.Order+=[nxtPos];
+
+    def flush(self):
+        print "<reorder>", self.nOutputs, self.nInputs
+        print len(self.Order),
+        for Len in self.Order:
+            print Len,
+        print
+
+class Compound:
+    def __init__(self,nIn=1,nOut=1,Objs=[]):
+        self.nInputs=nIn
+        self.nOutputs=nOut
+        self.Objs=Objs
+
+    def push(self,nxtObj):
+        self.Objs+=[nxtObj];
+
+    def flush(self):
+        print "<compound>", self.nOutputs, self.nInputs
+        for Obj in self.Objs:
+            Obj.flush()
+        print "<endblock>"
+
+class Discrete:
+    def __init__(self,nIn=1,nOut=1,Objs=[]):
+        self.nInputs=nIn
+        self.nOutputs=nOut
+        self.Objs=Objs
+
+    def push(self,nxtObj):
+        self.Objs+=[nxtObj];
+
+    def flush(self):
+        print "<discrete>", self.nOutputs, self.nInputs
+        for Obj in self.Objs:
+            Obj.flush()
+        print "<endblock>"
\ No newline at end of file
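Every class in layerobj.py exposes the same small interface: flush() writes a TNet-style text component header of the form "<tag> nOutputs nInputs", followed (for the trainable layers) by randomly initialised weights and biases emitted through mathobj, so a network description is produced simply by flushing components in order to stdout. A minimal usage sketch, not part of the commit (Python 2, like the module; the layer sizes and the seed are illustrative):

    # Illustrative only: print a small 39 -> 512 -> 10 net description to stdout.
    import random
    from layerobj import SharedLinearity, MiscPipe

    random.seed(7)                      # hypothetical: make the initialisation reproducible

    SharedLinearity(39, 512).flush()    # "<biasedlinearity> 512 39" + weight matrix + bias
    MiscPipe(512, "<sigmoid>").flush()  # "<sigmoid> 512 512"
    SharedLinearity(512, 10).flush()    # "<biasedlinearity> 10 512" + weight matrix + bias
    MiscPipe(10, "<softmax>").flush()   # "<softmax> 10 10"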
diff --git a/NetGen/layerobj.pyc b/NetGen/layerobj.pyc
new file mode 100644
index 0000000..1ad80a9
--- /dev/null
+++ b/NetGen/layerobj.pyc
Binary files differ
diff --git a/NetGen/mathobj.py b/NetGen/mathobj.py
new file mode 100644
index 0000000..77dad90
--- /dev/null
+++ b/NetGen/mathobj.py
@@ -0,0 +1,32 @@
+'''
+Created on Mar 16, 2014
+
+@author: joe
+'''
+
+import math, random
+
+
+randomFnc=lambda :3*random.gauss(0.0,1.0)
+inputScale=True
+
+def PrintTransMatrix(rows,cols):
+    PrintMatrix(cols,rows)
+
+def PrintMatrix(rows,cols):
+    print 'm', rows, cols
+    for row in range(rows):
+        for col in range(cols):
+            if(inputScale):
+                print randomFnc()/math.sqrt(rows),
+            else:
+                print randomFnc(),
+        print
+
+def PrintVector(cols):
+    print 'v', cols
+    for col in range(cols):
+        print randomFnc(),
+    print
+
+    
\ No newline at end of file
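mathobj.py is the shared printing helper: PrintTransMatrix(rows, cols) swaps its arguments before calling PrintMatrix, so a layer declared as nInputs x nOutputs is written as an nOutputs x nInputs matrix, and when inputScale is set each weight is divided by the square root of the printed row count. A minimal sketch, not part of the commit (the seeding and sizes are illustrative):

    # Illustrative only: emit one small transposed matrix and one vector.
    import random
    import mathobj

    random.seed(0)                  # hypothetical: reproducible output
    mathobj.inputScale = True       # scale each weight by 1/sqrt(rows printed)
    mathobj.PrintTransMatrix(2, 3)  # prints "m 3 2" and a 3x2 random matrix
    mathobj.PrintVector(3)          # prints "v 3" and three random values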
diff --git a/NetGen/mathobj.pyc b/NetGen/mathobj.pyc
new file mode 100644
index 0000000..4963b50
--- /dev/null
+++ b/NetGen/mathobj.pyc
Binary files differ
diff --git a/NetGen/testscript.py b/NetGen/testscript.py
new file mode 100644
index 0000000..70fa005
--- /dev/null
+++ b/NetGen/testscript.py
@@ -0,0 +1,72 @@
+'''
+Created on Mar 16, 2014
+
+@author: joe
+'''
+from layerobj import *
+
+if __name__ == '__main__':
+    Discrete(1,1,[Distrib(1,3)]).flush();
+    Discrete(1,1,
+        [SharedLinearity(1,1),
+         MiscPipe(1,"<learnstop>"),
+         MiscPipe(1,"<learnstop>"),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1,"<softmax>"),
+         MiscPipe(1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [SharedLinearity(1,1),
+         MiscPipe(1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1,"<softmax>"),
+         MiscPipe(1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1,"<sigmoid>"),
+         MiscPipe(1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [Distrib(1,2),
+         MiscPipe(1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1,"<learnstop>"),
+         SharedLinearity(1,1),
+         SharedLinearity(1,1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1),
+         Combine(2),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1),
+         SharedLinearity(1,1),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [MiscPipe(1),
+         MiscPipe(1,"<sigmoid>"),
+         MiscPipe(1),
+        ]).flush();
+    Discrete(1,1,
+        [SharedLinearity(1,1),
+         SharedLinearity(1,1),
+         SharedLinearity(1,1),
+        ]).flush();
+    Discrete(1,1,
+        [Combine(3)
+        ]).flush();
+    MiscPipe(1,"<sigmoid>").flush()
+    SharedLinearity(1,1).flush();
+    MiscPipe(1,"<softmax>").flush();
+    
\ No newline at end of file
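testscript.py just flushes a fixed sequence of descriptors, so everything goes to stdout; presumably the output is redirected into a text network file for the TNet tools under src/. A hypothetical wrapper, not part of the commit (the interpreter name and output file name are illustrative):

    # Capture the generated description in a file instead of the console.
    import subprocess
    with open("nnet.init", "w") as out:            # illustrative file name
        subprocess.call(["python2", "testscript.py"], stdout=out)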
diff --git a/src/.depend.mk b/src/.depend.mk
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/.depend.mk
diff --git a/src/.depend.mk1 b/src/.depend.mk1
new file mode 100644
index 0000000..0620a4c
--- /dev/null
+++ b/src/.depend.mk1
@@ -0,0 +1,618 @@
+TNet.o: TNet.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/MlfStream.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/set \
+ /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h KaldiLib/MlfStream.tcc \
+ KaldiLib/StkMatch.h KaldiLib/UserInterface.h TNetLib/Nnet.h \
+ TNetLib/Component.h KaldiLib/Vector.h KaldiLib/Matrix.h \
+ TNetLib/BiasedLinearity.h TNetLib/SharedLinearity.h TNetLib/Activation.h \
+ TNetLib/ObjFun.h /usr/include/c++/4.6/cassert /usr/include/assert.h \
+ TNetLib/Platform.h TNetLib/Thread.h KaldiLib/Labels.h \
+ KaldiLib/MlfStream.h KaldiLib/Features.h TNetLib/Cache.h TNetLib/Nnet.h \
+ TNetLib/ObjFun.h TNetLib/Mutex.h TNetLib/Semaphore.h TNetLib/Barrier.h \
+ /usr/include/c++/4.6/iterator \
+ /usr/include/c++/4.6/bits/stream_iterator.h /usr/include/c++/4.6/numeric \
+ /usr/include/c++/4.6/bits/stl_numeric.h
+TNorm.o: TNorm.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h TNetLib/Nnet.h \
+ TNetLib/Component.h KaldiLib/Vector.h KaldiLib/Matrix.h \
+ TNetLib/BiasedLinearity.h TNetLib/SharedLinearity.h TNetLib/Activation.h \
+ /usr/include/c++/4.6/numeric /usr/include/c++/4.6/bits/stl_numeric.h
+TFeaCat.o: TFeaCat.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h TNetLib/Nnet.h \
+ TNetLib/Component.h KaldiLib/Vector.h KaldiLib/Matrix.h \
+ TNetLib/BiasedLinearity.h TNetLib/SharedLinearity.h TNetLib/Activation.h
+TSegmenter.o: TSegmenter.cc KaldiLib/Error.h \
+ /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/MlfStream.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/set \
+ /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h KaldiLib/MlfStream.tcc \
+ KaldiLib/StkMatch.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/numeric /usr/include/c++/4.6/bits/stl_numeric.h \
+ /usr/include/sys/stat.h /usr/include/bits/stat.h
+TJoiner.o: TJoiner.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/MlfStream.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/set \
+ /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h KaldiLib/MlfStream.tcc \
+ KaldiLib/StkMatch.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/numeric /usr/include/c++/4.6/bits/stl_numeric.h \
+ /usr/include/sys/stat.h /usr/include/bits/stat.h
diff --git a/src/.depend.mk2 b/src/.depend.mk2
new file mode 100644
index 0000000..28f663a
--- /dev/null
+++ b/src/.depend.mk2
@@ -0,0 +1,757 @@
+TNetCu.o: TNetCu.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Labels.h KaldiLib/MlfStream.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/set \
+ /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h KaldiLib/MlfStream.tcc \
+ KaldiLib/StkMatch.h KaldiLib/Features.h KaldiLib/Common.h \
+ KaldiLib/MlfStream.h KaldiLib/UserInterface.h \
+ CuTNetLib/cuObjectiveFunction.h /usr/include/c++/4.6/cassert \
+ /usr/include/assert.h KaldiLib/Vector.h CuBaseLib/cuvector.h \
+ CuBaseLib/cuvector.tcc /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ CuBaseLib/cucommon.h CuBaseLib/cumatrix.h KaldiLib/Matrix.h \
+ CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ CuBaseLib/cumatrix.tcc /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h CuBaseLib/cuvector.h \
+ CuBaseLib/cudevice.h CuBaseLib/cumatrix.h CuTNetLib/cuNetwork.h \
+ CuTNetLib/cuComponent.h CuTNetLib/cuBiasedLinearity.h \
+ CuTNetLib/cuActivation.h CuTNetLib/cuCRBEDctFeat.h CuBaseLib/cumath.h \
+ CuTNetLib/cuCache.h /usr/include/c++/4.6/numeric \
+ /usr/include/c++/4.6/bits/stl_numeric.h
+TNormCu.o: TNormCu.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h CuTNetLib/cuNetwork.h \
+ CuTNetLib/cuComponent.h KaldiLib/Vector.h KaldiLib/Matrix.h \
+ CuBaseLib/cumatrix.h CuBaseLib/cukernels.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h CuBaseLib/cucommon.h \
+ CuBaseLib/cuvector.h CuBaseLib/cuvector.tcc CuBaseLib/cumatrix.h \
+ CuBaseLib/cudevice.h CuTNetLib/cuBiasedLinearity.h \
+ CuTNetLib/cuActivation.h CuTNetLib/cuCRBEDctFeat.h CuBaseLib/cumath.h \
+ TNetLib/Nnet.h TNetLib/Component.h TNetLib/BiasedLinearity.h \
+ TNetLib/SharedLinearity.h TNetLib/Activation.h \
+ /usr/include/c++/4.6/numeric /usr/include/c++/4.6/bits/stl_numeric.h
+TFeaCatCu.o: TFeaCatCu.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h CuTNetLib/cuNetwork.h \
+ CuTNetLib/cuComponent.h KaldiLib/Vector.h KaldiLib/Matrix.h \
+ CuBaseLib/cumatrix.h CuBaseLib/cukernels.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h CuBaseLib/cucommon.h \
+ CuBaseLib/cuvector.h CuBaseLib/cuvector.tcc CuBaseLib/cumatrix.h \
+ CuBaseLib/cudevice.h CuTNetLib/cuBiasedLinearity.h \
+ CuTNetLib/cuActivation.h CuTNetLib/cuCRBEDctFeat.h CuBaseLib/cumath.h
+TRbmCu.o: TRbmCu.cc KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Common.h KaldiLib/UserInterface.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h CuTNetLib/cuNetwork.h \
+ CuTNetLib/cuComponent.h KaldiLib/Vector.h KaldiLib/Matrix.h \
+ CuBaseLib/cumatrix.h CuBaseLib/cukernels.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h CuBaseLib/cucommon.h \
+ CuBaseLib/cuvector.h CuBaseLib/cuvector.tcc CuBaseLib/cumatrix.h \
+ CuBaseLib/cudevice.h CuTNetLib/cuBiasedLinearity.h \
+ CuTNetLib/cuActivation.h CuTNetLib/cuCRBEDctFeat.h CuBaseLib/cumath.h \
+ CuTNetLib/cuRbm.h CuTNetLib/cuCache.h CuTNetLib/cuObjectiveFunction.h \
+ /usr/include/c++/4.6/cassert /usr/include/assert.h CuBaseLib/cuvector.h \
+ CuBaseLib/curand.h CuBaseLib/curand.tcc CuBaseLib/curandkernels.h \
+ /usr/include/c++/4.6/numeric /usr/include/c++/4.6/bits/stl_numeric.h
+TRecurrentCu.o: TRecurrentCu.cc KaldiLib/Error.h \
+ /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ KaldiLib/Timer.h KaldiLib/Error.h /usr/include/sys/time.h \
+ KaldiLib/Features.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h KaldiLib/Matrix.h \
+ KaldiLib/cblas.h KaldiLib/clapack.h KaldiLib/cblas.h KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ KaldiLib/Types.h KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ KaldiLib/Vector.h /usr/include/c++/4.6/cstddef KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring KaldiLib/StkStream.h KaldiLib/StkStream.tcc \
+ KaldiLib/Timer.h KaldiLib/Labels.h KaldiLib/MlfStream.h \
+ /usr/include/c++/4.6/map /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/set \
+ /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h KaldiLib/MlfStream.tcc \
+ KaldiLib/StkMatch.h KaldiLib/Features.h KaldiLib/Common.h \
+ KaldiLib/MlfStream.h KaldiLib/UserInterface.h \
+ CuTNetLib/cuObjectiveFunction.h /usr/include/c++/4.6/cassert \
+ /usr/include/assert.h KaldiLib/Vector.h CuBaseLib/cuvector.h \
+ CuBaseLib/cuvector.tcc /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ CuBaseLib/cucommon.h CuBaseLib/cumatrix.h KaldiLib/Matrix.h \
+ CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ CuBaseLib/cumatrix.tcc /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h CuBaseLib/cuvector.h \
+ CuBaseLib/cudevice.h CuBaseLib/cumatrix.h CuTNetLib/cuNetwork.h \
+ CuTNetLib/cuComponent.h CuTNetLib/cuBiasedLinearity.h \
+ CuTNetLib/cuActivation.h CuTNetLib/cuCRBEDctFeat.h CuBaseLib/cumath.h \
+ CuTNetLib/cuRecurrent.h /usr/include/c++/4.6/numeric \
+ /usr/include/c++/4.6/bits/stl_numeric.h
diff --git a/src/CuBaseLib/.depend.mk b/src/CuBaseLib/.depend.mk
new file mode 100644
index 0000000..0bc1b34
--- /dev/null
+++ b/src/CuBaseLib/.depend.mk
@@ -0,0 +1,279 @@
+cudevice.o: cudevice.cc cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/stl_multimap.h \
+ /usr/include/c++/4.6/bits/range_access.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/include/bits/stdio_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ /usr/local/cuda-5.0/include/cuda.h /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h cumatrix.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/Matrix.h /usr/include/c++/4.6/stdexcept ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/string.h \
+ /usr/include/bits/string3.h ../KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath ../KaldiLib/Types.h ../KaldiLib/Error.h \
+ /usr/include/execinfo.h ../KaldiLib/Matrix.tcc \
+ /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/sys_errlist.h \
+ /usr/include/bits/stdio.h /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h cukernels.h \
+ /usr/local/cuda-5.0/include/vector_types.h cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h cucommon.h ../KaldiLib/Error.h cuvector.h \
+ ../KaldiLib/Vector.h cuvector.tcc cudevice.h cumath.h
+cumath.o: cumath.cc cumath.h cumatrix.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc ../KaldiLib/Matrix.h \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/sys/types.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/select2.h \
+ /usr/include/sys/sysmacros.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/iostream ../KaldiLib/cblas.h ../KaldiLib/clapack.h \
+ ../KaldiLib/cblas.h ../KaldiLib/Common.h /usr/include/c++/4.6/cstdlib \
+ /usr/include/string.h /usr/include/bits/string3.h ../KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h cukernels.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h cucommon.h ../KaldiLib/Error.h cuvector.h \
+ ../KaldiLib/Vector.h cuvector.tcc cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h
diff --git a/src/CuBaseLib/.svn/entries b/src/CuBaseLib/.svn/entries
new file mode 100644
index 0000000..c84cfaf
--- /dev/null
+++ b/src/CuBaseLib/.svn/entries
@@ -0,0 +1,572 @@
+10
+
+dir
+117
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/CuBaseLib
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet
+
+
+
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+bda6da93-004a-4ae9-8e07-715c10848801
+
+cuvector.h
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+b3ee9f2c8cb663233c57f8409626b2b4
+2011-04-04T17:14:16.666438Z
+46
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2176
+
+cukernels.h
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+45a1ded3d70c77dd88ad829ab297aef7
+2011-09-19T11:12:27.685840Z
+69
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4100
+
+cumatrix.h
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+311eb907ea742705a8cb7cedf708a316
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5223
+
+curand.tcc
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+002df511ddd5aaaf936cc8d4188fc1ae
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5068
+
+curandkernels.h
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+63dd2817bc838424d9a7991b3bfd7653
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+897
+
+cukernels.cu
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+11ba351040079306c93d33a36a3fd663
+2011-09-19T11:12:27.685840Z
+69
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+17013
+
+cuvector.tcc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+9a81d31c3acf55bd49fd9480e68369b7
+2011-04-29T12:18:20.752880Z
+49
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6514
+
+curand.h
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+45b3049e6e49bf0d360033072c9dde18
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+654
+
+cumatrix.tcc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+5e150cb43f80d5d923c605865689c340
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+17625
+
+cumath.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+fba9157202573cb0c816b13f12b4e983
+2011-03-07T10:43:43.160610Z
+40
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+17426
+
+curandkernels.cu
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+e799b057186a7997cd3501d1d9f25341
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3765
+
+cucommon.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+f38d7325c130f751cc00c75591dbd75a
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+959
+
+cumath.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+f67afa65b4ff27708ee807b74f7cbf04
+2011-03-07T10:43:43.160610Z
+40
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5743
+
+cudevice.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+d67d39c4be979930583bc7ba52f45f3c
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3221
+
+Makefile
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+9998baf0d675da3b4da44d283c46e7d7
+2011-03-24T17:03:17.103393Z
+43
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+995
+
+cudevice.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+c87181021fd59feadcb9b72ca97c893d
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1606
+
diff --git a/src/CuBaseLib/.svn/prop-base/Makefile.svn-base b/src/CuBaseLib/.svn/prop-base/Makefile.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/Makefile.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base b/src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cucommon.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base b/src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cudevice.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base b/src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cudevice.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base b/src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cukernels.cu.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base b/src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cukernels.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cumath.h.svn-base b/src/CuBaseLib/.svn/prop-base/cumath.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cumath.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base b/src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cumatrix.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base b/src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cumatrix.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/curand.h.svn-base b/src/CuBaseLib/.svn/prop-base/curand.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/curand.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base b/src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/curand.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base b/src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/curandkernels.cu.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base b/src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/curandkernels.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base b/src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cuvector.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base b/src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuBaseLib/.svn/prop-base/cuvector.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuBaseLib/.svn/text-base/Makefile.svn-base b/src/CuBaseLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..b574c4a
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,59 @@
+
+include ../tnet.mk
+
+INCLUDE = -I. -I../ -I../KaldiLib
+
+
+CUDA_INCLUDE= -I$(CUDA_TK_BASE)/include
+CUDA_BIN=$(CUDA_TK_BASE)/bin
+
+
+CUSRC=$(wildcard *.cu)
+CUOBJ=$(patsubst %.cu, %.o, $(CUSRC))
+
+
+
+CUDA_FLAGS = -g -Xcompiler -fPIC --verbose
+ifeq ($(BITS64), true)
+ CUDA_FLAGS += --machine 64
+ BUT_FORCE_GCC64 = ln -s `which x86_64-linux-gcc` $(PWD)/gcc
+ BUT_UNLINK_GCC64 = unlink $(PWD)/gcc
+else
+ CUDA_FLAGS += --machine 32
+endif
+
+ifeq ($(DOUBLEPRECISION), true)
+ CUDA_FLAGS += --gpu-architecture compute_13 --gpu-code sm_13
+endif
+
+
+
+
+all : libCuBase.a
+
+libCuBase.a : $(CUOBJ) $(OBJ)
+ $(AR) ruv $@ $?
+ $(RANLIB) $@
+
+
+%.o : %.cu
+ $(BUT_FORCE_GCC64)
+ export PATH=$(PWD):$(CUDA_BIN):$(PATH); $(CUDA_BIN)/nvcc -c $< -o $@ -I. $(CUDA_INCLUDE) $(CUDA_FLAGS)
+ $(BUT_UNLINK_GCC64)
+
+%.o : %.cc
+ $(CXX) -c $< -o $@ $(CXXFLAGS) $(CUDA_INCLUDE) $(INCLUDE)
+
+
+
+
+.PHONY: clean depend
+
+clean :
+ rm -f *.o *.a
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) $(CUDA_INCLUDE) > .depend.mk
+
+-include .depend.mk
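+# Running 'make depend' regenerates .depend.mk from the current sources;
+# the generated rules are then pulled in by the -include above.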
+
diff --git a/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base
new file mode 100644
index 0000000..6dc7e94
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base
@@ -0,0 +1,46 @@
+#ifndef _CUCOMMON_H_
+#define _CUCOMMON_H_
+
+#include <iostream>
+#include <sstream>
+
+#include <cuda_runtime_api.h>
+
+#include "Error.h"
+
+
+
+#define cuSafeCall(fun) \
+{ \
+ int ret; \
+ if((ret = (fun)) != 0) { \
+ std::ostringstream os; \
+ os << "CUDA ERROR #" << ret << " " << __FILE__ ":" << __LINE__ << " " << __func__ << "()" << " '" << #fun << "' " << cudaGetErrorString((cudaError_t)ret); \
+ throw(MyException(os.str())); \
+ } \
+ cudaThreadSynchronize(); \
+}
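+
+// Example (illustrative only, the buffer names are placeholders):
+//   cuSafeCall(cudaMemcpy(dst, src, n_bytes, cudaMemcpyHostToDevice));
+// Any non-zero return value is converted into a MyException carrying the
+// file, line, function name and the cudaGetErrorString() description.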
+
+
+
+
+namespace TNet {
+
+ /** The edge size of the square CUDA thread block **/
+ static const int CUBLOCK = 16;
+
+ /** Number of blocks into which a task of size 'size' is split **/
+ inline int n_blocks(int size, int block_size)
+ { return size / block_size + ((size % block_size == 0)? 0 : 1); }
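+ /** e.g. n_blocks(100,16) == 7: six full blocks of 16 plus one partial
+     block for the remaining 4 elements (illustration) **/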
+
+ /** Printing dim3 output operator **/
+ inline std::ostream& operator<<(std::ostream& os, dim3 arr) {
+ os << "[" << arr.x << "," << arr.y << "," << arr.z << "]";
+ return os;
+ }
+
+}
+
+
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base
new file mode 100644
index 0000000..90c5bf3
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base
@@ -0,0 +1,129 @@
+
+#include <cudevice.h>
+#include <cublas.h>
+#include <cuda.h>
+
+///////////////////
+//DEBUG: Just make sure it compiles...
+#include "cumatrix.h"
+#include "cuvector.h"
+#include "cumath.h"
+template class TNet::CuMatrix<float>;
+template class TNet::CuVector<float>;
+template class TNet::CuMath<float>;
+///////////////////
+
+namespace TNet {
+
+
+ /**********************************************************************************
+ * CuDevice::
+ */
+ CuDevice::
+ CuDevice()
+ : mIsPresent(false), mVerbose(false)
+ {
+ //get number of devices
+ int N_GPU = 0;
+ cudaGetDeviceCount(&N_GPU);
+
+ //select device if more than one
+ if(N_GPU > 1) {
+ char name[128];
+ size_t free, total;
+ std::vector<float> free_mem_ratio;
+ //get ratios of memory use
+ std::cout << "Selecting from " << N_GPU << " GPUs\n";
+ for(int n=0; n<N_GPU; n++) {
+ std::cout << "cudaSetDevice(" << n << "): ";
+ cuSafeCall(cudaSetDevice(n));//context created by cuSafeCall(...)
+ cuDeviceGetName(name,128,n);
+ std::cout << name << "\t";
+ cuSafeCall(cuMemGetInfo(&free,&total));
+ std::cout << "free: " << free/1024/1024 << "M, "
+ << "total: "<< total/1024/1024 << "M, "
+ << "ratio: "<< free/(float)total << "\n";
+ free_mem_ratio.push_back(free/(float)total);
+ cudaThreadExit();//destroy context
+ }
+ //find GPU with max free memory
+ int max_id=0;
+ for(int n=1; n<free_mem_ratio.size(); n++) {
+ if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
+ }
+ std::cout << "Selected device: " << max_id << " (automatically)\n";
+ cuSafeCall(cudaSetDevice(max_id));
+ }
+
+ if(N_GPU > 0) {
+ //initialize the CUBLAS
+ cuSafeCall(cublasInit());
+ mIsPresent = true;
+ } else {
+ Warning("No CUDA enabled GPU is present!");
+ }
+ }
+
+ CuDevice::
+ ~CuDevice()
+ {
+ if(mIsPresent) {
+ cuSafeCall(cublasShutdown());
+ if(mVerbose) {
+ TraceLog("CUBLAS released");
+ PrintProfile();
+ }
+ } else {
+ Warning("No CUDA enabled GPU was present!");
+ }
+ }
+
+
+ void
+ CuDevice::
+ SelectGPU(int gpu_id)
+ {
+ //get number of devices
+ int N_GPU = 0;
+ cudaGetDeviceCount(&N_GPU);
+ if(gpu_id >= N_GPU) {
+ KALDI_ERR << "Cannot select GPU " << gpu_id
+ << ", detected " << N_GPU << " CUDA capable cards!";
+ }
+ //release old card
+ cuSafeCall(cublasShutdown());
+ cudaThreadExit();
+ //select new card
+ cuSafeCall(cudaSetDevice(gpu_id));
+ //initialize CUBLAS
+ cuSafeCall(cublasInit());
+ std::cout << "Selected device " << gpu_id << " (manually)\n";
+ }
+
+
+ std::string
+ CuDevice::
+ GetFreeMemory()
+ {
+ size_t mem_free, mem_total;
+ cuMemGetInfo(&mem_free, &mem_total);
+ std::ostringstream os;
+ os << "Free:" << mem_free/(1024*1024) << "MB "
+ << "Used:" << (mem_total-mem_free)/(1024*1024) << "MB "
+ << "Total:" << mem_total/(1024*1024) << "MB";
+ return os.str();
+ }
+
+
+ ////////////////////////////////////////////////
+ // Instance of the static singleton
+ //
+ CuDevice CuDevice::msDevice;
+ //
+ ////////////////////////////////////////////////
+
+
+
+}
+
+
diff --git a/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base
new file mode 100644
index 0000000..c5eeb7b
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base
@@ -0,0 +1,79 @@
+#ifndef _CUDEVICE_H_
+#define _CUDEVICE_H_
+
+#include <map>
+#include <string>
+#include <iostream>
+
+namespace TNet {
+
+ /**
+ * Singleton object which represents the CUDA device,
+ * responsible for CUBLAS initialisation
+ * and memory block registration
+ */
+ class CuDevice
+ {
+ // Singleton interface...
+ private:
+ CuDevice();
+ CuDevice(CuDevice&);
+ CuDevice& operator=(CuDevice&);
+
+ public:
+ ~CuDevice();
+ static CuDevice& Instantiate()
+ { return msDevice; }
+
+ private:
+ static CuDevice msDevice;
+
+
+ /**********************************/
+ // Instance interface
+ public:
+
+ void SelectGPU(int gpu_id);
+
+ /// Check if the CUDA device is in the system
+ bool IsPresent()
+ { return mIsPresent; }
+
+ void Verbose(bool verbose)
+ { mVerbose = verbose; }
+
+ /// Sum the IO time
+ void AccuProfile(const std::string& key,double time)
+ {
+ if(mProfileMap.find(key) == mProfileMap.end()) {
+ mProfileMap[key] = 0.0;
+ }
+ mProfileMap[key] += time;
+ }
+
+ void PrintProfile()
+ {
+ std::cout << "[cudevice profile]\n";
+ std::map<std::string, double>::iterator it;
+ for(it = mProfileMap.begin(); it != mProfileMap.end(); ++it) {
+ std::cout << it->first << "\t" << it->second << "s\n";
+ }
+ }
+
+ void ResetProfile()
+ { mProfileMap.clear(); }
+
+ std::string GetFreeMemory();
+
+
+ private:
+ std::map<std::string, double> mProfileMap;
+ bool mIsPresent;
+ bool mVerbose;
+ }; //class CuDevice
+
+
+}
+
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base
new file mode 100644
index 0000000..d6f866d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base
@@ -0,0 +1,626 @@
+
+#include <cfloat>
+#include "cukernels.h"
+
+
+
+/*****************
+ * CUDA kernels
+ */
+//CuMatrix
+template<typename T>
+__global__
+static void _set_const(T* mat, T value, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = value;
+}
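+
+// Launch convention (schematic; see e.g. CuMath<float>::Sigmoid in cumath.cc):
+// the host side uses 16x16 thread blocks,
+//   dim3 dimBlock(CUBLOCK,CUBLOCK);
+//   dim3 dimGrid(n_blocks(cols,CUBLOCK), n_blocks(rows,CUBLOCK));
+// so each (column i, row j) element of the matrix is visited exactly once.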
+
+
+
+template<typename T>
+__global__
+static void _apply_log(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = log(mat[index]);
+}
+
+
+template<typename T>
+__global__
+static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*dmat.stride;
+ int index2 = i + j*dmask.stride;
+ if ( i < dmat.cols && j < dmat.rows )
+ if(mask[index2] == 0) mat[index] = 0;
+}
+
+
+template<typename T>
+__global__
+static void _apply_l1(T* mat, T l1, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ T value = mat[index];
+ T tgt;
+ if(abs(value) < l1) {
+ tgt = 0;
+ } else {
+ tgt = (value > 0?value-l1:value+l1);
+ }
+ mat[index] = tgt;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _scale_cols(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[i];
+}
+
+
+template<typename T>
+__global__
+static void _scale_rows(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[j];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*A[index] + beta*dst[index];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+
+#if 0
+ //this does not accelerate :(
+ __shared__ T aux[16];
+ if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i];
+ __syncthreads();
+
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*aux[threadIdx.x] + beta*dst[index];
+#else
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*row[i] + beta*dst[index];
+#endif
+}
+
+
+template<typename T>
+__global__
+static void _mul_elem(T* mat, const T* A, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = mat[index] * A[index];
+}
+
+
+template<typename T>
+__global__
+static void _log_elem(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ if(mat[index] < FLT_MIN) mat[index] = FLT_MIN;
+ mat[index] = log(mat[index]);
+ }
+}
+
+
+
+
+//CuVector
+template<typename T>
+__global__
+static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ //This should be called 1-D
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j > 0) return;
+
+ if(i < d.cols) {
+ double sum = 0.0;
+ for(int k = 0; k < d.rows; k++) {
+ sum += mat[i+k*d.stride];
+ }
+ vec[i] = alpha*sum + beta*vec[i];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ //flipped x,y for reducing... x..row, y..col
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ //copy vector to shared mem
+ __shared__ T aux[512];
+ aux[threadIdx.x] = mat[i+j*d.stride];
+ __syncthreads();
+
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+ //copy out the result
+ vec[i] = alpha*sum + beta*vec[i];
+}
+
+
+
+//CuMath
+template<typename T>
+__global__
+static void _sigmoid(T*y, const T*x, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ T res = 1.0 / (1.0 + exp(-x[index]));
+ /*
+ if(res < 0.001) res = 0.001;
+ if(res > 0.999) res = 0.999;
+ */
+ y[index] = res;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows )
+ eout[index] = y[index]*(1.0-y[index]) * e[index];
+}
+
+
+template<typename T>
+__global__
+static void _softmax(T*y, const T*x, MatrixDim d) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ if(j >= d.rows) return;
+
+ //copy to output and find max...
+ double max = -1e20;
+ double sum = 0.0;
+ for(int i=0; i<d.cols; i++) {
+ if(max < x[i+j*d.stride]) max = x[i+j*d.stride];
+ y[i+j*d.stride] = x[i+j*d.stride];
+ }
+ //subtract max, apply exp, sum up...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] = exp(y[i+j*d.stride] - max);
+ sum += y[i+j*d.stride];
+ }
+ //normalize by sum...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] /= sum;
+ }
+}
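+
+// Note: subtracting the per-row maximum before exponentiation is the usual
+// numerical-stability trick; since
+//   exp(x_i - max) / sum_k exp(x_k - max) == exp(x_i) / sum_k exp(x_k),
+// the result is unchanged while overflow in exp() is avoided.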
+
+
+
+
+template<typename T>
+__device__
+static T _max_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
+
+
+
+
+template<typename T>
+__device__
+static T _sum_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = 0.0;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ buffer[threadIdx.x] += temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
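+
+// Illustration: for blockDim.x == 8 the active range shrinks as 8 -> 4 -> 2 -> 1
+// (halfPoint takes the values 4, 2, 1), and after the final pass buffer[0]
+// holds the sum of all eight entries.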
+
+
+
+template<typename T>
+__global__
+static void _softmax_reduce(T*y, const T*x, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y > 1) return;
+
+ __shared__ T row_data[256];
+ __shared__ T aux[256];
+
+ //copy the input to row_data
+ row_data[i] = x[i+j*d.stride];
+ __syncthreads();
+
+ //copy input to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the maximum value
+ T max = _max_reduce(aux);
+ __syncthreads();
+
+ //calculate exp(data-max)
+ row_data[i] = exp(row_data[i]-max);
+
+ //copy the values to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the sum
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+
+ //divide the values
+ row_data[i] /= sum;
+ //copy out
+ y[i+j*d.stride] = row_data[i];
+
+}
+
+
+
+template<typename T>
+__global__
+static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = i % d_in.cols;
+ int src_row = j + off[i / d_in.cols];
+ if(src_row < 0) src_row = 0;
+ if(src_row >= d_in.rows) src_row = d_in.rows-1;
+ y[index] = x[src_col + src_row*d_in.stride];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = copy_from[i];
+ if(src_col >= 0 && src_col < d_in.cols) {
+ y[index] = x[src_col + j*d_in.stride];
+ } else {
+ y[index] = 1.0/0.0;
+ }
+ }
+}
+
+
+template<typename T>
+__global__
+static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_row = copy_from[j];
+ y[index] = x[i + src_row*d_in.stride];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _check_class(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j>0) return;
+
+ if(i<d.rows) {
+ int out_id = -1, des_id = -2;
+ T out_max = -1e20, des_max = -1e20;
+
+ for(int k=0; k<d.cols; k++) {
+ T val = out[k + i*d.stride];
+ if(val > out_max) { out_max = val; out_id = k; }
+ }
+ for(int k=0; k<d.cols; k++) {
+ T val = des[k + i*d.stride];
+ if(val > des_max) { des_max = val; des_id = k; }
+ }
+
+ match[i] = ((out_id == des_id)?1:0);
+ }
+}
+
+
+template<typename T>
+__device__
+static int _max_id_reduce(T val[],int idx[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = val[idx[threadIdx.x + halfPoint]];
+ }
+ if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint];
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return idx[0];
+}
+
+
+
+
+
+
+template<typename T>
+__global__
+static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ __shared__ T value[256];
+ __shared__ int index[256];
+
+ value[threadIdx.x] = out[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int out_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ value[threadIdx.x] = des[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int des_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ if(threadIdx.x == 0) {
+ match[j] = ((out_max == des_max)?1:0);
+ }
+}
+
+
+
+
+/**************
+ * C wrappers around CUDA kernels
+ */
+//:FLOAT:
+//CuMatrix
+void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask)
+{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); }
+
+void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d)
+{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); }
+
+void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+}
+
+void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d)
+{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
+//:DOUBLE:
+//CuMatrix
+void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+}
+
+void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
diff --git a/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base
new file mode 100644
index 0000000..d8320b5
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base
@@ -0,0 +1,81 @@
+#ifndef _cuda_kernels_h_
+#define _cuda_kernels_h_
+
+
+extern "C" {
+
+#pragma GCC diagnostic ignored "-Wshadow";
+#include <vector_types.h>
+#pragma GCC diagnostic warning "-Wshadow";
+
+ typedef struct MatrixDim_ {
+ int rows;
+ int cols;
+ int stride;
+ } MatrixDim;
+
+ /*************
+ * Float instances
+ */
+ //CuMatrix
+ void cudaF_set_const(dim3 Gr, dim3 Bl, float*mat, float value, MatrixDim d);
+ void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+ void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask);
+ void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d);
+ void cudaF_scale_cols(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_scale_rows(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d);
+ void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d);
+ void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d);
+ void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d);
+
+ //CuVector
+ void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+ void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+
+ //CuMath
+ void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_sigmoid (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d);
+
+ void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+
+ void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d);
+ void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d);
+
+
+
+ /*************
+ * Double instances
+ */
+ //CuMatrix
+ void cudaD_set_const(dim3 Gr, dim3 Bl, double*mat, double value, MatrixDim d);
+ void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+ void cudaD_scale_cols(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_scale_rows(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d);
+ void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d);
+ void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d);
+ void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d);
+
+ //CuVector
+ void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d);
+
+ //CuMath
+ void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_sigmoid (dim3 Gr, dim3 Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d);
+
+ void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+
+ void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d);
+
+
+}
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
new file mode 100644
index 0000000..d718324
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
@@ -0,0 +1,574 @@
+
+
+
+#include "cumath.h"
+#include "cukernels.h"
+
+
+namespace TNet {
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+#if 0
+ //disable 'reduce' functions
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+#else
+ if(X.Cols() > 256) {
+ //use old implementation (can't use reduction due to
+ //limited size of shared memory)
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ //use implementation with reduction
+ dim3 dimBlock(X.Cols(),1);
+ dim3 dimGrid(1,X.Rows());
+
+ cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+#endif
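+ // (the 256-column limit matches the fixed 256-element __shared__ buffers
+ // used by _softmax_reduce in cukernels.cu)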
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasSgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ if(out.Cols() > 256) {
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(out.Cols(),1);
+ dim3 dimGrid(1,out.Rows());
+
+ cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // keep trans..., just swap the A & B arguments: A->B, B->A
+ //
+ // WARNING
+ // NO DIMENSION CHECK!!!
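+ //
+ // (this works because a row-major matrix reinterpreted as col-major storage
+ //  is its transpose: C = op(A)*op(B) in row-major terms is the same CUBLAS
+ //  call as C^T = op(B)^T * op(A)^T in col-major terms, which is why A and B
+ //  change places while the trans flags stay as given)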
+
+ //m,n,k is cublas m,n,k
+ size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols());
+ size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows());
+ size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols());
+
+ k = ((k<k1)?k:k1);
+ m = ((m<C.Cols())?m:C.Cols());
+ n = ((n<C.Rows())?n:C.Rows());
+
+#if 0
+ std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA
+ << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB
+ << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC
+ << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl;
+#endif
+
+
+ cublasSgemm(transB, transA, m, n, k,
+ alpha, B.pCUData()+offB, B.Stride(),
+ A.pCUData()+offA, A.Stride(),
+ beta, C.pCUData()+offC, C.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+/*
+
+ template<>
+ void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData(), A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ */
+
+ /**
+ * offsetY tells how many outputs of the 'Ax' multiplication are skipped at the beginning.
+ */
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ // select part of matrix for compute
+ size_t cu_offset = 0;
+ if(cu_trans == 'n') {
+ cu_offset += offsetY;
+ assert(m >= dimY+offsetY);
+ m = dimY;
+ } else {
+ cu_offset += offsetY*A.Stride();
+ assert(n >= dimY+offsetY);
+ n = dimY;
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData()+cu_offset, A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // -> switch x and y
+
+ // A = alpha * x * transpose(y) + A,
+
+ assert(dimX == A.Rows());
+ assert(dimY == A.Cols());
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Dim() % in.Dim() == 0);
+ int n_copies = out.Dim()/in.Dim();
+ CuVector<int> offsets(n_copies);
+ //offsets.SetConst(0); done implicitly!
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK));
+
+ MatrixDim dim_in = { 1, in.Dim(), in.Dim() };
+ MatrixDim dim_out = { 1, out.Dim(), out.Dim() };
+ cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(in.Dim() % out.Dim() == 0);
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Dim(),CUBLOCK);
+
+ MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() };
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim);
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasDgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}
diff --git a/src/CuBaseLib/.svn/text-base/cumath.h.svn-base b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base
new file mode 100644
index 0000000..5680082
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base
@@ -0,0 +1,146 @@
+#ifndef _CUMATH_H_
+#define _CUMATH_H_
+
+#include "cumatrix.h"
+
+#include "Timer.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+
+ /**
+ * Group of Math operations for the NN training
+ */
+ template<typename _ElemT>
+ class CuMath
+ {
+ public:
+
+ /// Y = Sigmoid(X)
+ static void Sigmoid(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+ { Error("__func__ Not implemented"); }
+
+ /// Eout = E(1-E) * Y
+ static void DiffSigmoid(CuMatrix<_ElemT>& Eout, const CuMatrix<_ElemT>& Ein, const CuMatrix<_ElemT>& Y)
+ { Error("__func__ Not implemented"); }
+
+ /// Y = Softmax(X)
+ static void Softmax(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+ { Error("__func__ Not implemented"); }
+
+ /// for DCT in FeaCat
+ static void BlockLinearity(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuMatrix<_ElemT>& block_transf)
+ { Error("__func__ Not implemented"); }
+
+ static void Expand(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& frameOffsets)
+ { Error("__func__ Not implemented"); }
+
+    /// i.e. switch cols according to copyFrom
+ static void Rearrange(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+ { Error("__func__ Not implemented"); }
+
+    /// i.e. switch rows according to copyFrom
+ static void Randomize(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+ { Error("__func__ Not implemented"); }
+
+ /// check match in the classification for Xentropy
+ static void CheckClass(const CuMatrix<_ElemT>& out, const CuMatrix<_ElemT> &des, CuVector<int>& match)
+ { Error("__func__ Not implemented"); }
+
+ /// gemm with offset for CuSharedLinearity
+ static void OffsetGemm(char transA, char transB, _ElemT alpha, const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, _ElemT beta, CuMatrix<_ElemT>& C, int offA, int offB, int offC)
+ { Error("__func__ Not implemented"); }
+
+ /// gemv with offset for CuRecurrent
+ static void OffsetGemv(char trans, _ElemT alpha, const CuMatrix<_ElemT>& A, const _ElemT* x, size_t dimX, _ElemT beta, _ElemT* y, size_t dimY, size_t offsetY)
+ { Error("__func__ Not implemented"); }
+
+ /// ger for weight updates in CuRecurrent
+ static void BlasGer(_ElemT alpha, const _ElemT* x, size_t dimX, const _ElemT* y, size_t dimY, CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// concatenate one vector several times for CuSharedLinearity
+ static void VecExpand(const CuVector<_ElemT>&in, CuVector<_ElemT>&out)
+ { Error("__func__ Not implemented"); }
+
+ /// sum the vector as if it was matrix data for CuSharedLinearity
+ static void VecAddColSum(_ElemT alpha, const CuVector<_ElemT>&in, _ElemT beta, CuVector<_ElemT>&out)
+ { Error("__func__ Not implemented"); }
+
+ }; //class CuMath::
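+
+  /*
+   * Minimal usage sketch (illustrative only; assumes the matrices were already
+   * filled on the GPU, e.g. via CuMatrix<>::CopyFrom):
+   *
+   *   CuMatrix<float> X(64, 256), Y(64, 256);
+   *   CuMath<float>::Sigmoid(Y, X);   // Y = Sigmoid(X)
+   *   CuMath<float>::Softmax(Y, X);   // Y = Softmax(X)
+   */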
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X);
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y);
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X);
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf);
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets);
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match);
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC);
+
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY);
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A);
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out);
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out);
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X);
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y);
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X);
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf);
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets);
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match);
+
+}
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base
new file mode 100644
index 0000000..4e767e3
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base
@@ -0,0 +1,199 @@
+#ifndef _CUMATRIX_H_
+#define _CUMATRIX_H_
+
+#include <sstream>
+
+#include "Matrix.h"
+#include "cukernels.h"
+
+
+
+namespace TNet {
+
+ template<typename _ElemT> class CuVector;
+
+ /**
+ * Matrix for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuMatrix
+ {
+ typedef CuMatrix<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuMatrix<_ElemT>()
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuMatrix<_ElemT>(size_t rows, size_t cols)
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+ { Init(rows, cols); }
+
+ /// Destructor
+ ~CuMatrix()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Rows() const
+ { return mRows; }
+
+ size_t Cols() const
+ { return mCols; }
+
+ size_t Stride() const
+ { return mStride; }
+
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = {
+ static_cast<int>(mRows),
+ static_cast<int>(mCols),
+ static_cast<int>(mStride)
+ };
+ return d;
+ }
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Get raw row pointer
+ const _ElemT* pCURowData(size_t r) const
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+ _ElemT* pCURowData(size_t r)
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+
+ /// Get size of matrix in bytes
+ size_t MSize() const
+ { return mRows*mStride*sizeof(_ElemT); }
+ /// Get size of matrix row in bytes
+ size_t MRowSize() const
+ { return mStride*sizeof(_ElemT); }
+
+ /// Allocate the memory
+ ThisType& Init(size_t rows, size_t cols);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Matrix<_ElemT>& rSrc);
+ Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const;
+
+ /// Copy rowCnt rows from rSrc, starting by row srcOri,
+ /// copying to memory block starting by row dstOri
+ void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+ /// Copy colCnt columns from rSrc, starting by col srcOri,
+ /// copying to memory block starting by row dstOri
+ void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+
+ // Math operations, some calling kernels
+ //
+ void SetZero();
+
+ void SetConst(_ElemT value)
+ { Error("__func__ Not implemented"); }
+
+ void ApplyLog()
+ { Error("__func__ Not implemented"); }
+
+ void ApplyMask(const CuMatrix<BaseFloat>& mask)
+ { Error("__func__ Not implemented"); }
+
+ void ApplyL1(BaseFloat l1)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th column by scale[i]
+ void ScaleCols(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th row by scale[i]
+ void ScaleRows(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * A + beta * B
+ void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * row + beta * B
+ void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// C = alpha * A(^T)*B(^T) + beta * C
+ void Gemm(char transa, char transb,
+ _ElemT alpha,
+ const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B,
+ _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// A = alpha * x*y^T + A
+ void BlasGer(_ElemT alpha,
+ const CuVector<_ElemT>& x, const CuVector<_ElemT>& y)
+ { Error("__func__ Not implemented"); }
+
+
+    /// Multiply two matrices elementwise: C = A .* C
+ void MulElem(const CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// A = log(A)
+ void LogElem()
+ { Error("__func__ Not implemented"); }
+
+ void Print() const
+ {
+ Matrix<_ElemT> mat(Rows(),Cols());
+ CopyTo(mat);
+ std::cout << mat;
+ }
+
+
+
+ void CheckData()
+ {
+ Matrix<_ElemT> mat;
+ CopyTo(mat);
+ for(size_t i=0; i<Rows(); i++) {
+ for(size_t j=0; j<Cols(); j++) {
+ if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) {
+ std::ostringstream os;
+            os << "Invalid value:" << mat(i,j) << " at row " << i << " col " << j << "\n";
+ Error(os.str());
+ }
+ }
+ }
+ }
+
+
+ private:
+ size_t mRows;
+ size_t mCols;
+ size_t mStride;
+
+ _ElemT* mpCUData;
+
+ };
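+
+  /*
+   * Illustrative round trip (a sketch; assumes the KaldiLib Matrix<> host type
+   * is default-constructible):
+   *
+   *   Matrix<float> host;            // host matrix
+   *   CuMatrix<float> dev(128, 64);  // allocated and zeroed on the GPU
+   *   dev.SetConst(1.0f);            // fill with ones (float specialization)
+   *   dev.CopyTo(host);              // device -> host, Init()s host as needed
+   */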
+
+
+ /// Prints the matrix dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat)
+ {
+ out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride()
+ << " PTR" << mat.pCUData() << "]" << std::flush;
+ return out;
+ }
+
+
+}
+
+
+#include "cumatrix.tcc"
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base
new file mode 100644
index 0000000..4582e8d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base
@@ -0,0 +1,627 @@
+
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+
+#include "Timer.h"
+#include "cucommon.h"
+#include "cuvector.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ Init(size_t rows, size_t cols)
+ {
+ if(mRows == rows && mCols == cols) {
+ //SetZero();
+ return *this;
+ }
+
+ Destroy();
+
+ size_t row_bytes = cols * sizeof(_ElemT);
+ size_t pitch;
+ cuSafeCall(cudaMallocPitch((void**)&mpCUData, &pitch, row_bytes, rows));
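+    // cudaMallocPitch pads each row to an aligned pitch (in bytes);
+    // the element stride used by the rest of the class is derived from it below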
+ mRows = rows; mCols = cols;
+ mStride = pitch/sizeof(_ElemT);
+ SetZero();
+
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ Destroy()
+ {
+ if(NULL != mpCUData) {
+ cuSafeCall(cudaFree(mpCUData));
+ mpCUData = NULL;
+ }
+ mRows = mCols = mStride = 0;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const CuMatrix<_ElemT>& rSrc)
+ {
+ Init(rSrc.Rows(),rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pCUData(), src_pitch, width, rSrc.Rows(), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromD2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const Matrix<_ElemT>& rSrc)
+ {
+ Init(rSrc.Rows(),rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pData(), src_pitch, width, rSrc.Rows(), cudaMemcpyHostToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromH2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyTo(Matrix<_ElemT>& rDst) const
+ {
+ if(rDst.Rows() != Rows() || rDst.Cols() != Cols()) {
+ rDst.Init(Rows(),Cols());
+ }
+
+ Timer tim; tim.Start();
+
+ size_t src_pitch = mStride*sizeof(_ElemT);
+ size_t dst_pitch = rDst.Stride()*sizeof(_ElemT);
+ size_t width = Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(rDst.pData(), dst_pitch, pCUData(), src_pitch, width, Rows(), cudaMemcpyDeviceToHost));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToD2H",tim.Val());
+
+ return rDst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+ assert(rowCnt+srcOri <= rSrc.Rows());
+ assert(rowCnt+dstOri <= Rows());
+ assert(Cols() == rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+
+ const _ElemT* p_src = rSrc.pCUData() + srcOri*rSrc.Stride();
+ _ElemT* p_dst = mpCUData + dstOri*mStride;
+
+ cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, rowCnt, cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Val());
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+ assert(colCnt+srcOri <= rSrc.Cols());
+ assert(colCnt+dstOri <= Cols());
+ assert(Rows() == rSrc.Rows());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = colCnt*sizeof(_ElemT);
+
+ const _ElemT* p_src = rSrc.pCUData() + srcOri;
+ _ElemT* p_dst = mpCUData + dstOri;
+
+ cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, Rows(), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyColsD2D",tim.Val());
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ SetZero()
+ {
+ Timer tim; tim.Start();
+ cuSafeCall(cudaMemset(mpCUData, 0, mRows*mStride*sizeof(_ElemT)));
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero",tim.Val());
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuMatrix:: template specializations (float)
+ ////
+ template<>
+ inline void CuMatrix<float>::SetConst(float value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_set_const(dimGrid,dimBlock,mpCUData,value,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyLog()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_log(dimGrid,dimBlock,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyMask(const CuMatrix<BaseFloat>& mask)
+ {
+ Timer tim; tim.Start();
+
+ assert(mask.Rows() == Rows());
+ assert(mask.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_mask(dimGrid,dimBlock,mpCUData,mask.pCUData(),Dim(),mask.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyL1(float l1)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_l1(dimGrid,dimBlock,mpCUData,l1,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ScaleCols(const CuVector<float>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::ScaleRows(const CuVector<float>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::AddScaled(float alpha, const CuMatrix<float>& A, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(A.Rows() == Rows());
+ assert(A.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::AddScaledRow(float alpha, const CuVector<float>& row, float beta)
+ {
+ Timer tim; tim.Start();
+
+ if(row.Dim() != Cols()) {
+ std::ostringstream os;
+ os << "Non matching dimensions: Cols:" << Cols() << " VectorDim:" << row.Dim();
+ Error(os.str());
+ }
+ assert(row.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::Gemm(char transa, char transb,
+ float alpha,
+ const CuMatrix<float>& A, const CuMatrix<float>& B,
+ float beta)
+ {
+ // CUBLAS is col major, TNet is row major
+    // keep trans..., just swap the A & B arguments: A->B, B->A
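+    // e.g. row-major C(N x M) = A(N x K) * B(K x M) with transa=transb='N' becomes
+    // cublasSgemm('N','N', M, N, K, alpha, B, B.Stride(), A, A.Stride(), beta, C, C.Stride()),
+    // since a row-major matrix reinterpreted as column-major is its transpose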
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+
+ #if 0
+ //DEBUG MESSAGE
+ std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+ alpha << " " << B << " " << B.Stride() << " " <<
+ A << " " << A.Stride() << " " << beta << " " << C << " " <<
+ C.Stride() << "\n" << std::flush;
+ #endif
+
+ Timer tim; tim.Start();
+
+ cublasSgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::BlasGer(float alpha,
+ const CuVector<float>& x, const CuVector<float>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+
+ Timer tim; tim.Start();
+
+ cublasSger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::MulElem(const CuMatrix<float>& A)
+ {
+ Timer tim; tim.Start();
+
+ assert(mCols == A.Cols());
+ assert(mRows == A.Rows());
+ assert(mStride == A.Stride());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::LogElem()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_log_elem(dimGrid,dimBlock,mpCUData, Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuMatrix:: template specializations (double)
+ ////
+ template<>
+ inline void CuMatrix<double>::SetConst(double value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_set_const(dimGrid,dimBlock,mpCUData,value,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::ApplyLog()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_apply_log(dimGrid,dimBlock,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::ScaleCols(const CuVector<double>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::ScaleRows(const CuVector<double>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::AddScaled(double alpha, const CuMatrix<double>& A, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(A.Rows() == Rows());
+ assert(A.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::AddScaledRow(double alpha, const CuVector<double>& row, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(row.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::Gemm(char transa, char transb,
+ double alpha,
+ const CuMatrix<double>& A, const CuMatrix<double>& B,
+ double beta)
+ {
+ // CUBLAS is col major, TNet is row major
+    // keep trans..., just swap the A & B arguments: A->B, B->A
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+
+ #if 0
+ //DEBUG MESSAGE
+ std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+ alpha << " " << B << " " << B.Stride() << " " <<
+ A << " " << A.Stride() << " " << beta << " " << C << " " <<
+ C.Stride() << "\n" << std::flush;
+ #endif
+
+ Timer tim; tim.Start();
+
+ cublasDgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ inline void CuMatrix<double>::BlasGer(double alpha,
+ const CuVector<double>& x, const CuVector<double>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+
+ Timer tim; tim.Start();
+
+ cublasDger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ inline void CuMatrix<double>::MulElem(const CuMatrix<double>& A)
+ {
+ Timer tim; tim.Start();
+
+ assert(mCols == A.Cols());
+ assert(mRows == A.Rows());
+ assert(mStride == A.Stride());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::LogElem()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_log_elem(dimGrid,dimBlock,mpCUData, Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+}
diff --git a/src/CuBaseLib/.svn/text-base/curand.h.svn-base b/src/CuBaseLib/.svn/text-base/curand.h.svn-base
new file mode 100644
index 0000000..8aa66d5
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curand.h.svn-base
@@ -0,0 +1,40 @@
+#ifndef _CU_RAND_H_
+#define _CU_RAND_H_
+
+
+#include "cumatrix.h"
+
+
+namespace TNet {
+
+ template<typename T>
+ class CuRand {
+ public:
+
+ CuRand(size_t rows, size_t cols)
+ { SeedGpu(rows,cols); }
+
+ ~CuRand() { }
+
+ void SeedGpu(size_t rows, size_t cols);
+ void Rand(CuMatrix<T>& tgt);
+ void GaussRand(CuMatrix<T>& tgt);
+
+ void BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states);
+ void AddGaussNoise(CuMatrix<T>& tgt, T gscale = 1.0);
+
+ private:
+ static void SeedRandom(Matrix<unsigned>& mat);
+
+ private:
+ CuMatrix<unsigned> z1, z2, z3, z4;
+ CuMatrix<T> tmp;
+ };
+
+}
+
+
+#include "curand.tcc"
+
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base
new file mode 100644
index 0000000..e337189
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base
@@ -0,0 +1,228 @@
+
+#include <cstdlib>
+#include "curandkernels.h"
+
+
+namespace TNet {
+
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedGpu(size_t rows, size_t cols)
+ {
+ Matrix<unsigned> mat(rows,cols);
+ SeedRandom(mat);
+ z1.CopyFrom(mat);
+ SeedRandom(mat);
+ z2.CopyFrom(mat);
+ SeedRandom(mat);
+ z3.CopyFrom(mat);
+ SeedRandom(mat);
+ z4.CopyFrom(mat);
+
+ /*
+ std::cout << "RANDININIT" << std::endl;
+ z1.Print();
+ z2.Print();
+ z3.Print();
+ z4.Print();
+ std::cout << "RANDININIT" << std::endl;
+ */
+
+ tmp.Init(rows,cols);
+ }
+
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedRandom(Matrix<unsigned>& mat) {
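+    // the generator state needs sufficiently large seeds,
+    // so values <= 128 are rejected and redrawn below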
+ for(size_t j=0; j<mat.Rows(); j++) {
+ for(size_t i=0; i<mat.Cols(); i++) {
+ unsigned value = 0;
+ while(value <= 128) { value = lrand48(); }
+ mat(j,i) = value;
+ }
+ }
+ }
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ AddGaussNoise(CuMatrix<T>& tgt, T gscale)
+ {
+ GaussRand(tmp);
+ tgt.AddScaled(gscale,tmp,1.0);
+ }
+
+
+
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ //// invalid general wrappers over CUDA kernels
+ template<typename T>
+ inline void
+ CuRand<T>::
+ Rand(CuMatrix<T>& tgt)
+ { Error("Unimplemented"); }
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ GaussRand(CuMatrix<T>& tgt)
+ { Error("Unimplemented"); }
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states)
+ { Error("Unimplemented"); }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ //// float specializations
+ template<>
+ inline void
+ CuRand<float>::
+ Rand(CuMatrix<float>& tgt)
+ {
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaF_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<float>::
+ GaussRand(CuMatrix<float>& tgt)
+ {
+
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaF_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<float>::
+ BinarizeProbs(const CuMatrix<float>& probs, CuMatrix<float>& states)
+ {
+ if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) {
+ Error("Non matching dims!!");
+ }
+
+ states.Init(z1.Rows(),z1.Cols());
+ Rand(tmp);
+
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK));
+
+ cudaF_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ //// double specializations
+ template<>
+ inline void
+ CuRand<double>::
+ Rand(CuMatrix<double>& tgt)
+ {
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaD_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<double>::
+ GaussRand(CuMatrix<double>& tgt)
+ {
+
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaD_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<double>::
+ BinarizeProbs(const CuMatrix<double>& probs, CuMatrix<double>& states)
+ {
+ if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) {
+ Error("Non matching dims!!");
+ }
+
+ states.Init(z1.Rows(),z1.Cols());
+ Rand(tmp);
+
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK));
+
+ cudaD_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+}
diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base
new file mode 100644
index 0000000..7e1c8dd
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base
@@ -0,0 +1,135 @@
+
+#include "curandkernels.h"
+
+
+
+//
+//Hybrid Tauss/LCG random number generator
+//
+//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
+
+
+// S1, S2, S3, and M are all constants, and z is part of the
+// private per-thread generator state.
+__device__
+static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M)
+{
+ unsigned b=(((z << S1) ^ z) >> S2);
+ return z = (((z & M) << S3) ^ b);
+}
+
+// A and C are constants
+__device__
+static unsigned LCGStep(unsigned &z, unsigned A, unsigned C)
+{
+ return z=(A*z+C);
+}
+
+template<typename T>
+__device__
+static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ // Combined period is lcm(p1,p2,p3,p4)~ 2^121
+ T randval;
+ do {
+ randval = 2.3283064365387e-10 * ( // Periods
+ TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1
+ TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1
+ TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1
+ LCGStep(z4, 1664525, 1013904223UL) // p4=2^32
+ );
+ } while (!(randval > 0.0 && randval < 1.0));
+ return randval;
+}
+
+
+
+
+template<typename T>
+__global__
+static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+/*
+float2 BoxMuller()
+{
+ float u0=HybridTaus (), u1=HybridTaus ();
+ float r=sqrt(-2 log(u0));
+ float theta=2*PI*u1;
+ return make_float2(r*sin(theta),r*cos(theta));
+}
+*/
+
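+// Box-Muller transform driven by HybridTaus; only the sine branch r*sin(theta)
+// is returned, the cosine output is discarded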
+template<typename T>
+__device__
+static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ const T M_2PI = 6.283185307179586476925286766558;
+
+ T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4);
+ T r = sqrt(-2.0 * log(u0));
+ T theta = M_2PI * u1;
+ return r*sin(theta);
+
+}
+
+
+template<typename T>
+__global__
+static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+
+template<typename T>
+__global__
+static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ states[index] = ((probs[index] > rand[index])? 1.0 : 0.0);
+ }
+}
+
+
+
+/************
+ * :FLOAT:
+ */
+void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
+
+/************
+ * :DOUBLE:
+ */
+void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base
new file mode 100644
index 0000000..69b589f
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base
@@ -0,0 +1,34 @@
+#ifndef _cuda_rand_kernels_h_
+#define _cuda_rand_kernels_h_
+
+
+#include "cukernels.h"
+
+
+extern "C" {
+ //**************
+ //float
+ //
+ void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d);
+
+ //**************
+ //double
+ //
+ void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d);
+
+}
+
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base
new file mode 100644
index 0000000..945565a
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base
@@ -0,0 +1,104 @@
+#ifndef _CUVECTOR_H_
+#define _CUVECTOR_H_
+
+#include "Vector.h"
+
+namespace TNet {
+
+ template<typename _ElemT> class CuMatrix;
+
+ /**
+   * Vector for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuVector
+ {
+ typedef CuVector<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuVector<_ElemT>()
+ : mDim(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuVector<_ElemT>(size_t dim)
+ : mDim(0), mpCUData(NULL)
+ { Init(dim); }
+
+ /// Destructor
+ ~CuVector()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Dim() const
+ { return mDim; }
+
+ /*
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = { mDim, 1, 1 }; return d; }
+ */
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Allocate the memory
+ ThisType& Init(size_t dim);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuVector<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Vector<_ElemT>& rSrc);
+ Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const;
+
+
+
+ // Math operations
+ //
+ void SetZero();
+
+ void SetConst(_ElemT value)
+ { Error("__func__ Not implemented"); }
+
+ void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ void Print() const
+ {
+ Vector<_ElemT> vec(Dim());
+ CopyTo(vec);
+ std::cout << vec << "\n";
+ }
+
+
+ private:
+ size_t mDim;
+ _ElemT* mpCUData;
+ };
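+
+  /*
+   * Illustrative usage sketch (assumes the KaldiLib Vector<> host type):
+   *
+   *   CuVector<float> v(256);   // allocated and zeroed on the GPU
+   *   v.SetConst(1.0f);         // fill with ones (float specialization)
+   *   Vector<float> host;
+   *   v.CopyTo(host);           // device -> host
+   */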
+
+
+  /// Prints the vector dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec)
+ {
+ size_t d = vec.Dim();
+ out << "[CuVector D" << d
+ << " PTR" << vec.pCUData() << "]" << std::flush;
+ return out;
+ }
+
+
+}
+
+
+#include "cuvector.tcc"
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base
new file mode 100644
index 0000000..0107859
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base
@@ -0,0 +1,254 @@
+
+#include <cuda_runtime_api.h>
+
+#include "Timer.h"
+#include "cucommon.h"
+#include "cumatrix.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ Init(size_t dim)
+ {
+ if(mDim == dim) {
+ //SetZero();
+ return *this;
+ }
+
+ Destroy();
+
+ cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT)));
+ mDim = dim;
+ SetZero();
+
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ Destroy()
+ {
+ if(NULL != mpCUData) {
+ cuSafeCall(cudaFree(mpCUData));
+ mpCUData = NULL;
+ }
+ mDim = 0;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const CuVector<_ElemT>& rSrc)
+ {
+ Init(rSrc.Dim());
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const Vector<_ElemT>& rSrc)
+ {
+ Init(rSrc.Dim());
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyHostToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyTo(Vector<_ElemT>& rDst) const
+ {
+ if(rDst.Dim() != mDim) {
+ rDst.Init(mDim);
+ }
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), mDim*sizeof(_ElemT), cudaMemcpyDeviceToHost));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val());
+
+ return rDst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ SetZero()
+ {
+ Timer tim; tim.Start();
+ cuSafeCall(cudaMemset(mpCUData, 0, mDim*sizeof(_ElemT)));
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val());
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuVector:: template specializations (float)
+ ////
+ template<>
+ inline void CuVector<float>::SetConst(float value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaF_set_const(dimGrid,dimBlock,mpCUData,value,d);
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(vec.Dim() == Dim());
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaF_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(mat.Cols() == Dim());
+
+ /**
+ * Rows()<=512 limit due to limited shared memory
+ * Cols()<=256 limit due to coalesced memory alignment:
+ * matrices with huge strides have slow access!!!
+ */
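+    // large matrices fall back to the simple per-column loop kernel,
+    // smaller ones use the shared-memory reduction kernel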
+ if(mat.Rows() > 512 || mat.Cols() > 256) {
+ size_t dimBlock = CUBLOCK*2;
+ size_t dimGrid = n_blocks(Dim(),CUBLOCK*2);
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(mat.Rows(),1);
+ dim3 dimGrid(1,Dim());
+
+ cudaF_add_col_sum_reduce(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuVector:: template specializations (double)
+ ////
+ template<>
+ inline void CuVector<double>::SetConst(double value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaD_set_const(dimGrid,dimBlock,mpCUData,value,d);
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(vec.Dim() == Dim());
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaD_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(mat.Cols() == Dim());
+
+ size_t dimBlock = CUBLOCK*2;
+ size_t dimGrid = n_blocks(Dim(),CUBLOCK*2);
+
+ cudaD_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}
+
+
+
diff --git a/src/CuBaseLib/Makefile b/src/CuBaseLib/Makefile
new file mode 100644
index 0000000..cb7c00b
--- /dev/null
+++ b/src/CuBaseLib/Makefile
@@ -0,0 +1,59 @@
+
+include ../tnet.mk
+
+INCLUDE = -I. -I../ -I../KaldiLib
+
+
+CUDA_INCLUDE= -I$(CUDA_TK_BASE)/include
+CUDA_BIN=$(CUDA_TK_BASE)/bin
+
+
+CUSRC=$(wildcard *.cu)
+CUOBJ=$(patsubst %.cu, %.o, $(CUSRC))
+
+
+
+CUDA_FLAGS = -g -Xcompiler -fPIC --verbose
+ifeq ($(BITS64), true)
+ CUDA_FLAGS += --machine 64
+ #BUT_FORCE_GCC64 = ln -s `which x86_64-linux-gcc` $(PWD)/gcc
+ #BUT_UNLINK_GCC64 = unlink $(PWD)/gcc
+else
+ CUDA_FLAGS += --machine 32
+endif
+
+ifeq ($(DOUBLEPRECISION), true)
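+  # compute capability 1.3 (sm_13) is the first CUDA architecture with double precision support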
+ CUDA_FLAGS += --gpu-architecture compute_13 --gpu-code sm_13
+endif
+
+
+
+
+all : libCuBase.a
+
+libCuBase.a : $(CUOBJ) $(OBJ)
+ $(AR) ruv $@ $?
+ $(RANLIB) $@
+
+
+%.o : %.cu
+ $(BUT_FORCE_GCC64)
+ export PATH=$(PWD):$(CUDA_BIN):$(PATH); $(CUDA_BIN)/nvcc -c $< -o $@ -I. $(CUDA_INCLUDE) $(CUDA_FLAGS)
+ $(BUT_UNLINK_GCC64)
+
+%.o : %.cc
+ $(CXX) -c $< -o $@ $(CXXFLAGS) $(CUDA_INCLUDE) $(INCLUDE)
+
+
+
+
+.PHONY: clean depend
+
+clean :
+ rm -f *.o *.a
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) $(CUDA_INCLUDE) > .depend.mk
+
+-include .depend.mk
+
diff --git a/src/CuBaseLib/cucommon.h b/src/CuBaseLib/cucommon.h
new file mode 100644
index 0000000..6dc7e94
--- /dev/null
+++ b/src/CuBaseLib/cucommon.h
@@ -0,0 +1,46 @@
+#ifndef _CUCOMMON_H_
+#define _CUCOMMON_H_
+
+#include <iostream>
+#include <sstream>
+
+#include <cuda_runtime_api.h>
+
+#include "Error.h"
+
+
+
+#define cuSafeCall(fun) \
+{ \
+ int ret; \
+ if((ret = (fun)) != 0) { \
+ std::ostringstream os; \
+ os << "CUDA ERROR #" << ret << " " << __FILE__ ":" << __LINE__ << " " << __func__ << "()" << " '" << #fun << "' " << cudaGetErrorString((cudaError_t)ret); \
+ throw(MyException(os.str())); \
+ } \
+ cudaThreadSynchronize(); \
+}
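+// Note: the trailing cudaThreadSynchronize() makes every wrapped call effectively
+// synchronous, so errors surface at the call site rather than later.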
+
+
+
+
+namespace TNet {
+
+  /** Edge length of the square CUDA thread block **/
+ static const int CUBLOCK = 16;
+
+  /** Number of blocks needed to cover a task of size 'size' **/
+ inline int n_blocks(int size, int block_size)
+ { return size / block_size + ((size % block_size == 0)? 0 : 1); }
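+  // e.g. n_blocks(1000, CUBLOCK) == 63: sixty-three 16-thread blocks cover 1000 elements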
+
+ /** Printing dim3 output operator **/
+ inline std::ostream& operator<<(std::ostream& os, dim3 arr) {
+ os << "[" << arr.x << "," << arr.y << "," << arr.z << "]";
+ return os;
+ }
+
+}
+
+
+
+#endif
diff --git a/src/CuBaseLib/cudevice.cc b/src/CuBaseLib/cudevice.cc
new file mode 100644
index 0000000..90c5bf3
--- /dev/null
+++ b/src/CuBaseLib/cudevice.cc
@@ -0,0 +1,129 @@
+
+#include <cudevice.h>
+#include <cublas.h>
+#include <cuda.h>
+
+///////////////////
+//DEBUG: Just make sure it compiles...
+#include "cumatrix.h"
+#include "cuvector.h"
+#include "cumath.h"
+template class TNet::CuMatrix<float>;
+template class TNet::CuVector<float>;
+template class TNet::CuMath<float>;
+///////////////////
+
+namespace TNet {
+
+
+ /**********************************************************************************
+ * CuDevice::
+ */
+ CuDevice::
+ CuDevice()
+ : mIsPresent(false), mVerbose(false)
+ {
+ //get number of devices
+ int N_GPU = 0;
+ cudaGetDeviceCount(&N_GPU);
+
+ //select device if more than one
+ if(N_GPU > 1) {
+ char name[128];
+ size_t free, total;
+ std::vector<float> free_mem_ratio;
+ //get ratios of memory use
+ std::cout << "Selecting from " << N_GPU << " GPUs\n";
+ for(int n=0; n<N_GPU; n++) {
+ std::cout << "cudaSetDevice(" << n << "): ";
+ cuSafeCall(cudaSetDevice(n));//context created by cuSafeCall(...)
+ cuDeviceGetName(name,128,n);
+ std::cout << name << "\t";
+ cuSafeCall(cuMemGetInfo(&free,&total));
+ std::cout << "free: " << free/1024/1024 << "M, "
+ << "total: "<< total/1024/1024 << "M, "
+ << "ratio: "<< free/(float)total << "\n";
+ free_mem_ratio.push_back(free/(float)total);
+ cudaThreadExit();//destroy context
+ }
+ //find GPU with max free memory
+ int max_id=0;
+ for(int n=1; n<free_mem_ratio.size(); n++) {
+ if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
+ }
+ std::cout << "Selected device: " << max_id << " (automatically)\n";
+ cuSafeCall(cudaSetDevice(max_id));
+ }
+
+ if(N_GPU > 0) {
+ //initialize the CUBLAS
+ cuSafeCall(cublasInit());
+ mIsPresent = true;
+ } else {
+ Warning("No CUDA enabled GPU is present!");
+ }
+ }
+
+ CuDevice::
+ ~CuDevice()
+ {
+ if(mIsPresent) {
+ cuSafeCall(cublasShutdown());
+ if(mVerbose) {
+ TraceLog("CUBLAS released");
+ PrintProfile();
+ }
+ } else {
+ Warning("No CUDA enabled GPU was present!");
+ }
+ }
+
+
+ void
+ CuDevice::
+ SelectGPU(int gpu_id)
+ {
+ //get number of devices
+ int N_GPU = 0;
+ cudaGetDeviceCount(&N_GPU);
+ if(gpu_id >= N_GPU) {
+ KALDI_ERR << "Cannot select GPU " << gpu_id
+ << ", detected " << N_GPU << " CUDA capable cards!";
+ }
+ //release old card
+ cuSafeCall(cublasShutdown());
+ cudaThreadExit();
+ //select new card
+ cuSafeCall(cudaSetDevice(gpu_id));
+ //initialize CUBLAS
+ cuSafeCall(cublasInit());
+ std::cout << "Selected device " << gpu_id << " (manually)\n";
+ }
+
+
+ std::string
+ CuDevice::
+ GetFreeMemory()
+ {
+ size_t mem_free, mem_total;
+ cuMemGetInfo(&mem_free, &mem_total);
+ std::ostringstream os;
+ os << "Free:" << mem_free/(1024*1024) << "MB "
+ << "Used:" << (mem_total-mem_free)/(1024*1024) << "MB "
+ << "Total:" << mem_total/(1024*1024) << "MB";
+ return os.str();
+ }
+
+
+ ////////////////////////////////////////////////
+ // Instance of the static singleton
+ //
+ CuDevice CuDevice::msDevice;
+ //
+ ////////////////////////////////////////////////
+
+
+
+}
+
+
diff --git a/src/CuBaseLib/cudevice.h b/src/CuBaseLib/cudevice.h
new file mode 100644
index 0000000..c5eeb7b
--- /dev/null
+++ b/src/CuBaseLib/cudevice.h
@@ -0,0 +1,79 @@
+#ifndef _CUDEVICE_H_
+#define _CUDEVICE_H_
+
+#include <map>
+#include <string>
+#include <iostream>
+
+namespace TNet {
+
+ /**
+ * Singleton object which represents CUDA device
+   * responsible for CUBLAS initialisation
+ * and memory block registration
+ */
+ class CuDevice
+ {
+ // Singleton interface...
+ private:
+ CuDevice();
+ CuDevice(CuDevice&);
+ CuDevice& operator=(CuDevice&);
+
+ public:
+ ~CuDevice();
+ static CuDevice& Instantiate()
+ { return msDevice; }
+
+ private:
+ static CuDevice msDevice;
+
+
+ /**********************************/
+ // Instance interface
+ public:
+
+ void SelectGPU(int gpu_id);
+
+ /// Check if the CUDA device is in the system
+ bool IsPresent()
+ { return mIsPresent; }
+
+ void Verbose(bool verbose)
+ { mVerbose = verbose; }
+
+ /// Sum the IO time
+    /// Accumulate the time spent under 'key'
+ {
+ if(mProfileMap.find(key) == mProfileMap.end()) {
+ mProfileMap[key] = 0.0;
+ }
+ mProfileMap[key] += time;
+ }
+
+ void PrintProfile()
+ {
+ std::cout << "[cudevice profile]\n";
+ std::map<std::string, double>::iterator it;
+ for(it = mProfileMap.begin(); it != mProfileMap.end(); ++it) {
+ std::cout << it->first << "\t" << it->second << "s\n";
+ }
+ }
+
+ void ResetProfile()
+ { mProfileMap.clear(); }
+
+ std::string GetFreeMemory();
+
+
+ private:
+ std::map<std::string, double> mProfileMap;
+ bool mIsPresent;
+ bool mVerbose;
+ }; //class CuDevice
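+
+  /*
+   * Illustrative usage of the singleton:
+   *
+   *   if(CuDevice::Instantiate().IsPresent()) {
+   *     std::cout << CuDevice::Instantiate().GetFreeMemory() << "\n";
+   *   }
+   */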
+
+
+}
+
+
+#endif
diff --git a/src/CuBaseLib/cukernels.cu b/src/CuBaseLib/cukernels.cu
new file mode 100644
index 0000000..d6f866d
--- /dev/null
+++ b/src/CuBaseLib/cukernels.cu
@@ -0,0 +1,626 @@
+
+#include <cfloat>
+#include "cukernels.h"
+
+
+
+/*****************
+ * CUDA kernels
+ */
+//CuMatrix
+template<typename T>
+__global__
+static void _set_const(T* mat, T value, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = value;
+}
+
+
+
+template<typename T>
+__global__
+static void _apply_log(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = log(mat[index]);
+}
+
+
+template<typename T>
+__global__
+static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*dmat.stride;
+ int index2 = i + j*dmask.stride;
+ if ( i < dmat.cols && j < dmat.rows )
+ if(mask[index2] == 0) mat[index] = 0;
+}
+
+
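+// soft-thresholding (L1 shrinkage): entries with |x| < l1 are zeroed,
+// the rest are moved towards zero by l1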
+template<typename T>
+__global__
+static void _apply_l1(T* mat, T l1, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ T value = mat[index];
+ T tgt;
+ if(abs(value) < l1) {
+ tgt = 0;
+ } else {
+ tgt = (value > 0?value-l1:value+l1);
+ }
+ mat[index] = tgt;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _scale_cols(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[i];
+}
+
+
+template<typename T>
+__global__
+static void _scale_rows(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[j];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*A[index] + beta*dst[index];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+
+#if 0
+ //this does not accelerate :(
+ __shared__ T aux[16];
+ if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i];
+ __syncthreads();
+
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*aux[threadIdx.x] + beta*dst[index];
+#else
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*row[i] + beta*dst[index];
+#endif
+}
+
+
+template<typename T>
+__global__
+static void _mul_elem(T* mat, const T* A, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = mat[index] * A[index];
+}
+
+
+template<typename T>
+__global__
+static void _log_elem(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ if(mat[index] < FLT_MIN) mat[index] = FLT_MIN;
+ mat[index] = log(mat[index]);
+ }
+}
+
+
+
+
+//CuVector
+template<typename T>
+__global__
+static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ //This should be called 1-D
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j > 0) return;
+
+ if(i < d.cols) {
+ double sum = 0.0;
+ for(int k = 0; k < d.rows; k++) {
+ sum += mat[i+k*d.stride];
+ }
+ vec[i] = alpha*sum + beta*vec[i];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ //flipped x,y for reducing... x..row, y..col
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ //copy vector to shared mem
+ __shared__ T aux[512];
+ aux[threadIdx.x] = mat[i+j*d.stride];
+ __syncthreads();
+
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+ //copy out the result
+ vec[i] = alpha*sum + beta*vec[i];
+}
+
+
+
+//CuMath
+template<typename T>
+__global__
+static void _sigmoid(T*y, const T*x, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ T res = 1.0 / (1.0 + exp(-x[index]));
+ /*
+ if(res < 0.001) res = 0.001;
+ if(res > 0.999) res = 0.999;
+ */
+ y[index] = res;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows )
+ eout[index] = y[index]*(1.0-y[index]) * e[index];
+}
+
+
+template<typename T>
+__global__
+static void _softmax(T*y, const T*x, MatrixDim d) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ if(j >= d.rows) return;
+
+ //copy to output and find max...
+ double max = -1e20;
+ double sum = 0.0;
+ for(int i=0; i<d.cols; i++) {
+ if(max < x[i+j*d.stride]) max = x[i+j*d.stride];
+ y[i+j*d.stride] = x[i+j*d.stride];
+ }
+ //subtract max, apply exp, sum up...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] = exp(y[i+j*d.stride] - max);
+ sum += y[i+j*d.stride];
+ }
+ //normalize by sum...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] /= sum;
+ }
+}
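Subtracting the per-row maximum before exponentiating keeps exp() in a safe range and leaves the result unchanged, since exp(x-max)/sum(exp(x-max)) equals exp(x)/sum(exp(x)). A plain CPU reference of the same per-row computation, handy for checking the kernel; illustrative, not part of the commit:

    #include <cmath>

    // reference softmax over one row of length n
    static void softmax_row(const float* x, float* y, int n) {
      float max = x[0];
      for (int i = 1; i < n; i++) if (x[i] > max) max = x[i];
      double sum = 0.0;
      for (int i = 0; i < n; i++) { y[i] = expf(x[i] - max); sum += y[i]; }
      for (int i = 0; i < n; i++) y[i] /= (float)sum;
    }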
+
+
+
+
+template<typename T>
+__device__
+static T _max_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
+
+
+
+
+template<typename T>
+__device__
+static T _sum_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = 0.0;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ buffer[threadIdx.x] += temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
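Both reduction helpers above implement the standard halving tree: in each round the first half of the active threads folds in the value held by a partner halfPoint positions away, and the ceiling division (1+nTotalThreads)>>1 keeps odd counts correct. A worked trace for blockDim.x = 8, illustrative only:

    // buffer:            b0 b1 b2 b3 b4 b5 b6 b7        (8 active threads)
    // round 1 (half=4):  b0+=b4  b1+=b5  b2+=b6  b3+=b7  -> 4 active
    // round 2 (half=2):  b0+=b2  b1+=b3                  -> 2 active
    // round 3 (half=1):  b0+=b1                          -> 1 active
    // buffer[0] now holds the sum of all 8 entries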
+
+
+
+template<typename T>
+__global__
+static void _softmax_reduce(T*y, const T*x, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y > 1) return;
+
+ __shared__ T row_data[256];
+ __shared__ T aux[256];
+
+ //copy the input to row_data
+ row_data[i] = x[i+j*d.stride];
+ __syncthreads();
+
+ //copy input to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the maximum value
+ T max = _max_reduce(aux);
+ __syncthreads();
+
+ //calculate exp(data-max)
+ row_data[i] = exp(row_data[i]-max);
+
+ //copy the values to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the sum
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+
+ //divide the values
+ row_data[i] /= sum;
+ //copy out
+ y[i+j*d.stride] = row_data[i];
+
+}
+
+
+
+template<typename T>
+__global__
+static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = i % d_in.cols;
+ int src_row = j + off[i / d_in.cols];
+ if(src_row < 0) src_row = 0;
+ if(src_row >= d_in.rows) src_row = d_in.rows-1;
+ y[index] = x[src_col + src_row*d_in.stride];
+ }
+}
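This kernel performs frame splicing: each output row j is the concatenation of d_out.cols/d_in.cols input rows taken at the offsets stored in off[], clamped at the matrix edges. A small worked example with d_in.cols = 3 and off = {-1, 0, +1}, illustrative only:

    // input rows of X (3 features each)        output row Y[j] (9 features)
    //   X[j-1] = a0 a1 a2
    //   X[j]   = b0 b1 b2        ->   Y[j] = a0 a1 a2  b0 b1 b2  c0 c1 c2
    //   X[j+1] = c0 c1 c2
    // for j = 0 the offset -1 is clamped to row 0; for the last row of X,
    // the offset +1 is clamped to the last row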
+
+
+template<typename T>
+__global__
+static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = copy_from[i];
+ if(src_col >= 0 && src_col < d_in.cols) {
+ y[index] = x[src_col + j*d_in.stride];
+ } else {
+ y[index] = 1.0/0.0;
+ }
+ }
+}
+
+
+template<typename T>
+__global__
+static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_row = copy_from[j];
+ y[index] = x[i + src_row*d_in.stride];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _check_class(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j>0) return;
+
+ if(i<d.rows) {
+ int out_id = -1, des_id = -2;
+ T out_max = -1e20, des_max = -1e20;
+
+ for(int k=0; k<d.cols; k++) {
+ T val = out[k + i*d.stride];
+ if(val > out_max) { out_max = val; out_id = k; }
+ }
+ for(int k=0; k<d.cols; k++) {
+ T val = des[k + i*d.stride];
+ if(val > des_max) { des_max = val; des_id = k; }
+ }
+
+ match[i] = ((out_id == des_id)?1:0);
+ }
+}
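The kernel marks row i with 1 when the argmax of the network output agrees with the argmax of the desired (typically one-hot) target row, which is how frame classification accuracy is counted. A plain CPU reference of the per-row test, illustrative only:

    // returns 1 if the index of the largest element agrees between out and des
    static int check_row(const float* out, const float* des, int cols) {
      int out_id = 0, des_id = 0;
      for (int k = 1; k < cols; k++) {
        if (out[k] > out[out_id]) out_id = k;
        if (des[k] > des[des_id]) des_id = k;
      }
      return (out_id == des_id) ? 1 : 0;
    }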
+
+
+template<typename T>
+__device__
+static int _max_id_reduce(T val[],int idx[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = val[idx[threadIdx.x + halfPoint]];
+ }
+ if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint];
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return idx[0];
+}
+
+
+
+
+
+
+template<typename T>
+__global__
+static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ __shared__ T value[256];
+ __shared__ int index[256];
+
+ value[threadIdx.x] = out[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int out_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ value[threadIdx.x] = des[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int des_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ if(threadIdx.x == 0) {
+ match[j] = ((out_max == des_max)?1:0);
+ }
+}
+
+
+
+
+/**************
+ * C wrappers around CUDA kernels
+ */
+//:FLOAT:
+//CuMatrix
+void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask)
+{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); }
+
+void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d)
+{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); }
+
+void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+}
+
+void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d)
+{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
+//:DOUBLE:
+//CuMatrix
+void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+}
+
+void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
diff --git a/src/CuBaseLib/cukernels.h b/src/CuBaseLib/cukernels.h
new file mode 100644
index 0000000..d8320b5
--- /dev/null
+++ b/src/CuBaseLib/cukernels.h
@@ -0,0 +1,81 @@
+#ifndef _cuda_kernels_h_
+#define _cuda_kernels_h_
+
+
+extern "C" {
+
+#pragma GCC diagnostic ignored "-Wshadow"
+#include <vector_types.h>
+#pragma GCC diagnostic warning "-Wshadow"
+
+ typedef struct MatrixDim_ {
+ int rows;
+ int cols;
+ int stride;
+ } MatrixDim;
+
+ /*************
+ * Float instances
+ */
+ //CuMatrix
+ void cudaF_set_const(dim3 Gr, dim3 Bl, float*mat, float value, MatrixDim d);
+ void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+ void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask);
+ void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d);
+ void cudaF_scale_cols(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_scale_rows(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d);
+ void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d);
+ void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d);
+ void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d);
+
+ //CuVector
+ void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+ void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+
+ //CuMath
+ void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_sigmoid (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d);
+
+ void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+
+ void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d);
+ void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d);
+
+
+
+ /*************
+ * Double instances
+ */
+ //CuMatrix
+ void cudaD_set_const(dim3 Gr, dim3 Bl, double*mat, double value, MatrixDim d);
+ void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+ void cudaD_scale_cols(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_scale_rows(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d);
+ void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d);
+ void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d);
+ void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d);
+
+ //CuVector
+ void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d);
+
+ //CuMath
+ void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_sigmoid (dim3 Gr, dim3 Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d);
+
+ void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+
+ void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d);
+
+
+}
+
+#endif
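The extern "C" wrappers keep the nvcc-compiled kernels behind a plain C interface, so the templated host code in cumatrix.tcc and cumath.cc only needs this header, not the .cu sources. A minimal sketch of a call from host code; blocks_for is a hypothetical stand-in for the n_blocks/CUBLOCK helpers from cucommon.h, and device_mat is assumed to be a pitched device allocation matching d:

    #include "cukernels.h"

    // hypothetical helper: smallest number of size-'block' blocks covering 'len'
    static inline int blocks_for(int len, int block) { return (len + block - 1) / block; }

    void set_matrix_to_one(float* device_mat, MatrixDim d) {
      const int block = 16;                            // assumed block edge, like CUBLOCK
      dim3 Bl(block, block);
      dim3 Gr(blocks_for(d.cols, block), blocks_for(d.rows, block));
      cudaF_set_const(Gr, Bl, device_mat, 1.0f, d);    // wrapper declared above
    }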
diff --git a/src/CuBaseLib/cumath.cc b/src/CuBaseLib/cumath.cc
new file mode 100644
index 0000000..d718324
--- /dev/null
+++ b/src/CuBaseLib/cumath.cc
@@ -0,0 +1,574 @@
+
+
+
+#include "cumath.h"
+#include "cukernels.h"
+
+
+namespace TNet {
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+#if 0
+ //disable 'reduce' functions
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+#else
+ if(X.Cols() > 256) {
+ //use old implementation (can't use reduction due to
+ //limited size of shared memory)
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ //use implementation with reduction
+ dim3 dimBlock(X.Cols(),1);
+ dim3 dimGrid(1,X.Rows());
+
+ cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+#endif
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasSgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ if(out.Cols() > 256) {
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(out.Cols(),1);
+ dim3 dimGrid(1,out.Rows());
+
+ cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+    //  keep trans..., just swap the A and B arguments: A->B B->A
+ //
+ // WARNING
+ // NO DIMENSION CHECK!!!
+
+ //m,n,k is cublas m,n,k
+ size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols());
+ size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows());
+ size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols());
+
+ k = ((k<k1)?k:k1);
+ m = ((m<C.Cols())?m:C.Cols());
+    n = ((n<C.Rows())?n:C.Rows());
+
+#if 0
+ std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA
+ << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB
+ << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC
+ << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl;
+#endif
+
+
+ cublasSgemm(transB, transA, m, n, k,
+ alpha, B.pCUData()+offB, B.Stride(),
+ A.pCUData()+offA, A.Stride(),
+ beta, C.pCUData()+offC, C.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
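The argument swap is valid because a row-major buffer read as column-major is the transpose of the matrix it stores. Writing M_c for "the buffer of M interpreted column-major", the no-transpose case reads

    C_c = C^T = (alpha*A*B + beta*C)^T = alpha*B^T*A^T + beta*C^T = alpha*B_c*A_c + beta*C_c

which is exactly the cublasSgemm call above, with m and n taken from B and A respectively; the trans flags travel with their matrices when the arguments are swapped.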
+
+
+/*
+
+ template<>
+ void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData(), A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ */
+
+ /**
+   * offsetY tells how many outputs of the 'A*x' multiplication are skipped at the beginning.
+ */
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ // select part of matrix for compute
+ size_t cu_offset = 0;
+ if(cu_trans == 'n') {
+ cu_offset += offsetY;
+ assert(m >= dimY+offsetY);
+ m = dimY;
+ } else {
+ cu_offset += offsetY*A.Stride();
+ assert(n >= dimY+offsetY);
+ n = dimY;
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData()+cu_offset, A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
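A hedged usage sketch of offsetY (x_dev and y_dev are hypothetical device pointers): with a row-major A of 1000x500 and trans='N', dimX must equal A.Cols()==500, and offsetY=200 with dimY=300 computes only rows 200..499 of A*x, writing the 300 results to y_dev[0..299]:

    // illustrative only
    CuMatrix<float> A(1000, 500);                   // filled elsewhere
    CuMath<float>::OffsetGemv('N', 1.0f, A,
                              x_dev, /*dimX*/ 500,
                              0.0f,  y_dev, /*dimY*/ 300,
                              /*offsetY*/ 200);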
+
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // -> switch x and y
+
+ // A = alpha * x * transpose(y) + A,
+
+ assert(dimX == A.Rows());
+ assert(dimY == A.Cols());
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Dim() % in.Dim() == 0);
+ int n_copies = out.Dim()/in.Dim();
+ CuVector<int> offsets(n_copies);
+ //offsets.SetConst(0); done implicitly!
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK));
+
+ MatrixDim dim_in = { 1, in.Dim(), in.Dim() };
+ MatrixDim dim_out = { 1, out.Dim(), out.Dim() };
+ cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(in.Dim() % out.Dim() == 0);
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Dim(),CUBLOCK);
+
+ MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() };
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim);
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasDgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}
diff --git a/src/CuBaseLib/cumath.h b/src/CuBaseLib/cumath.h
new file mode 100644
index 0000000..5680082
--- /dev/null
+++ b/src/CuBaseLib/cumath.h
@@ -0,0 +1,146 @@
+#ifndef _CUMATH_H_
+#define _CUMATH_H_
+
+#include "cumatrix.h"
+
+#include "Timer.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+
+ /**
+ * Group of Math operations for the NN training
+ */
+ template<typename _ElemT>
+ class CuMath
+ {
+ public:
+
+ /// Y = Sigmoid(X)
+ static void Sigmoid(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+ { Error("__func__ Not implemented"); }
+
+    /// Eout = Y(1-Y) * Ein (element-wise; sigmoid derivative applied to the error)
+ static void DiffSigmoid(CuMatrix<_ElemT>& Eout, const CuMatrix<_ElemT>& Ein, const CuMatrix<_ElemT>& Y)
+ { Error("__func__ Not implemented"); }
+
+ /// Y = Softmax(X)
+ static void Softmax(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+ { Error("__func__ Not implemented"); }
+
+ /// for DCT in FeaCat
+ static void BlockLinearity(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuMatrix<_ElemT>& block_transf)
+ { Error("__func__ Not implemented"); }
+
+ static void Expand(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& frameOffsets)
+ { Error("__func__ Not implemented"); }
+
+    /// i.e. switch cols according to copyFrom
+ static void Rearrange(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+ { Error("__func__ Not implemented"); }
+
+    /// i.e. switch rows according to copyFrom
+ static void Randomize(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+ { Error("__func__ Not implemented"); }
+
+ /// check match in the classification for Xentropy
+ static void CheckClass(const CuMatrix<_ElemT>& out, const CuMatrix<_ElemT> &des, CuVector<int>& match)
+ { Error("__func__ Not implemented"); }
+
+ /// gemm with offset for CuSharedLinearity
+ static void OffsetGemm(char transA, char transB, _ElemT alpha, const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, _ElemT beta, CuMatrix<_ElemT>& C, int offA, int offB, int offC)
+ { Error("__func__ Not implemented"); }
+
+ /// gemv with offset for CuRecurrent
+ static void OffsetGemv(char trans, _ElemT alpha, const CuMatrix<_ElemT>& A, const _ElemT* x, size_t dimX, _ElemT beta, _ElemT* y, size_t dimY, size_t offsetY)
+ { Error("__func__ Not implemented"); }
+
+ /// ger for weight updates in CuRecurrent
+ static void BlasGer(_ElemT alpha, const _ElemT* x, size_t dimX, const _ElemT* y, size_t dimY, CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// concatenate one vector several times for CuSharedLinearity
+ static void VecExpand(const CuVector<_ElemT>&in, CuVector<_ElemT>&out)
+ { Error("__func__ Not implemented"); }
+
+ /// sum the vector as if it was matrix data for CuSharedLinearity
+ static void VecAddColSum(_ElemT alpha, const CuVector<_ElemT>&in, _ElemT beta, CuVector<_ElemT>&out)
+ { Error("__func__ Not implemented"); }
+
+ }; //class CuMath::
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X);
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y);
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X);
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf);
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets);
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match);
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC);
+
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY);
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A);
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out);
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out);
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X);
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y);
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X);
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf);
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets);
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match);
+
+}
+
+#endif
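A minimal sketch of how these specializations are used from host code, assuming the Matrix<float> CPU class from Matrix.h and the CuMatrix interface declared elsewhere in this commit; illustrative only:

    #include "cumath.h"

    using namespace TNet;

    void forward_sigmoid(const Matrix<float>& in_host, Matrix<float>& out_host) {
      CuMatrix<float> X, Y;
      X.CopyFrom(in_host);                // host -> device (allocates X)
      Y.Init(X.Rows(), X.Cols());         // allocate the output on the device
      CuMath<float>::Sigmoid(Y, X);       // runs the cudaF_sigmoid kernel
      Y.CopyTo(out_host);                 // device -> host (resizes out_host if needed)
    }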
diff --git a/src/CuBaseLib/cumatrix.h b/src/CuBaseLib/cumatrix.h
new file mode 100644
index 0000000..887b92d
--- /dev/null
+++ b/src/CuBaseLib/cumatrix.h
@@ -0,0 +1,221 @@
+#ifndef _CUMATRIX_H_
+#define _CUMATRIX_H_
+
+#include <sstream>
+
+#include "Matrix.h"
+#include "cukernels.h"
+
+
+
+namespace TNet {
+
+ template<typename _ElemT> class CuVector;
+
+ /**
+ * \brief Matrix for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuMatrix
+ {
+ typedef CuMatrix<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuMatrix<_ElemT>()
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL),isOwn(false)
+ { }
+ /// Constructor with memory initialisation
+ CuMatrix<_ElemT>(size_t rows, size_t cols)
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL),isOwn(false)
+ { Init(rows, cols); }
+
+ /// Destructor
+ ~CuMatrix()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Rows() const
+ { return mRows; }
+
+ size_t Cols() const
+ { return mCols; }
+
+ size_t Stride() const
+ { return mStride; }
+
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = {
+ static_cast<int>(mRows),
+ static_cast<int>(mCols),
+ static_cast<int>(mStride)
+ };
+ return d;
+ }
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Get raw row pointer
+ const _ElemT* pCURowData(size_t r) const
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+ _ElemT* pCURowData(size_t r)
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+
+ /// Get size of matrix in bytes
+ size_t MSize() const
+ { return mRows*mStride*sizeof(_ElemT); }
+ /// Get size of matrix row in bytes
+ size_t MRowSize() const
+ { return mStride*sizeof(_ElemT); }
+
+ /// Allocate the memory
+ ThisType& Init(size_t rows, size_t cols);
+    /// Wrap a non-owning view of rSrc: cols columns starting at column offset x
+ ThisType& Init(CuMatrix<_ElemT>& rSrc, size_t x, size_t cols);
+    /// Wrap rSrc as a non-owning view sharing its memory
+ ThisType& Init(CuMatrix<_ElemT>& rSrc);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Matrix<_ElemT>& rSrc);
+ Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const;
+
+    /// Copy rowCnt rows from rSrc, starting at row srcOri,
+    /// into this matrix starting at row dstOri
+ void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+    /// Copy colCnt columns from rSrc, starting at column srcOri,
+    /// into this matrix starting at column dstOri
+ void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+
+ // Math operations, some calling kernels
+ //
+ void SetZero();
+
+ void SetConst(_ElemT value)
+ { Error("__func__ Not implemented"); }
+
+    /// Natural logarithm of every element
+ void ApplyLog()
+ { Error("__func__ Not implemented"); }
+
+ /// Setting values to zero if mask[i][j]==0
+ void ApplyMask(const CuMatrix<BaseFloat>& mask)
+ { Error("__func__ Not implemented"); }
+
+ /**
+ * \brief Apply Lasso function
+ *
+     * \param l1 \f$ L^1 \f$ norm threshold parameter
+ *
+ * Lasso: \f[ Y_{ij} = \left\{
+ * \begin{array}{lr}
+ * X_{ij} + l1 & , X_{ij} < -l1 \\
+ * 0 & , |X_{ij}| \le l1 \\
+     *              X_{ij} - l1 & , X_{ij} > l1
+ * \end{array}
+ * \right. \f]
+ */
+ void ApplyL1(BaseFloat l1)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th column by scale[i]
+ void ScaleCols(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th row by scale[i]
+ void ScaleRows(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * A + beta * B
+ void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * row + beta * B
+ void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// C = alpha * A(^T)*B(^T) + beta * C
+ void Gemm(char transa, char transb,
+ _ElemT alpha,
+ const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B,
+ _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// A = alpha * x*y^T + A
+ void BlasGer(_ElemT alpha,
+ const CuVector<_ElemT>& x, const CuVector<_ElemT>& y)
+ { Error("__func__ Not implemented"); }
+
+
+    /// Multiply two matrices element-wise: C = A .* C
+ void MulElem(const CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// A = log(A)
+ void LogElem()
+ { Error("__func__ Not implemented"); }
+
+ void Print() const
+ {
+ Matrix<_ElemT> mat(Rows(),Cols());
+ CopyTo(mat);
+ std::cout << mat;
+ }
+
+
+
+ /// Check if contains invalid value
+ void CheckData()
+ {
+ Matrix<_ElemT> mat;
+ CopyTo(mat);
+ for(size_t i=0; i<Rows(); i++) {
+ for(size_t j=0; j<Cols(); j++) {
+ if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) {
+ std::ostringstream os;
+            os << "Invalid value: " << mat(i,j) << " at row " << i << " col " << j << "\n";
+ Error(os.str());
+ }
+ }
+ }
+ }
+
+
+ private:
+ size_t mRows;
+ size_t mCols;
+ size_t mStride;
+
+ _ElemT* mpCUData;
+
+ bool isOwn;
+
+ };
+
+
+ /// Prints the matrix dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat)
+ {
+ out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride()
+ << " PTR" << mat.pCUData() << "]" << std::flush;
+ return out;
+ }
+
+
+}
+
+
+#include "cumatrix.tcc"
+
+#endif
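Compared with the older cumatrix.h~ that follows, this header adds the two view-style Init overloads and the isOwn flag: Init(rSrc, x, cols) wraps cols columns of rSrc starting at column offset x without allocating, and Destroy() only frees memory the object owns (see cumatrix.tcc). A hedged sketch, illustrative only:

    #include "cumatrix.h"
    using namespace TNet;

    void fill_right_half() {
      CuMatrix<float> whole(128, 200);                  // owns its device memory
      CuMatrix<float> right_half;
      right_half.Init(whole, /*x=*/100, /*cols=*/100);  // non-owning view, isOwn == false
      right_half.SetConst(0.5f);                        // writes columns 100..199 of 'whole'
    }                                                   // the view's destructor does not cudaFree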
diff --git a/src/CuBaseLib/cumatrix.h~ b/src/CuBaseLib/cumatrix.h~
new file mode 100644
index 0000000..a0ad5cd
--- /dev/null
+++ b/src/CuBaseLib/cumatrix.h~
@@ -0,0 +1,214 @@
+#ifndef _CUMATRIX_H_
+#define _CUMATRIX_H_
+
+#include <sstream>
+
+#include "Matrix.h"
+#include "cukernels.h"
+
+
+
+namespace TNet {
+
+ template<typename _ElemT> class CuVector;
+
+ /**
+ * \brief Matrix for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuMatrix
+ {
+ typedef CuMatrix<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuMatrix<_ElemT>()
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuMatrix<_ElemT>(size_t rows, size_t cols)
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+ { Init(rows, cols); }
+
+ /// Destructor
+ ~CuMatrix()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Rows() const
+ { return mRows; }
+
+ size_t Cols() const
+ { return mCols; }
+
+ size_t Stride() const
+ { return mStride; }
+
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = {
+ static_cast<int>(mRows),
+ static_cast<int>(mCols),
+ static_cast<int>(mStride)
+ };
+ return d;
+ }
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Get raw row pointer
+ const _ElemT* pCURowData(size_t r) const
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+ _ElemT* pCURowData(size_t r)
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+
+ /// Get size of matrix in bytes
+ size_t MSize() const
+ { return mRows*mStride*sizeof(_ElemT); }
+ /// Get size of matrix row in bytes
+ size_t MRowSize() const
+ { return mStride*sizeof(_ElemT); }
+
+ /// Allocate the memory
+ ThisType& Init(size_t rows, size_t cols);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Matrix<_ElemT>& rSrc);
+ Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const;
+
+    /// Copy rowCnt rows from rSrc, starting at row srcOri,
+    /// into this matrix starting at row dstOri
+ void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+    /// Copy colCnt columns from rSrc, starting at column srcOri,
+    /// into this matrix starting at column dstOri
+ void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+
+ // Math operations, some calling kernels
+ //
+ void SetZero();
+
+ void SetConst(_ElemT value)
+ { Error("__func__ Not implemented"); }
+
+    /// Natural logarithm of every element
+ void ApplyLog()
+ { Error("__func__ Not implemented"); }
+
+ /// Setting values to zero if mask[i][j]==0
+ void ApplyMask(const CuMatrix<BaseFloat>& mask)
+ { Error("__func__ Not implemented"); }
+
+ /**
+ * \brief Apply Lasso function
+ *
+     * \param l1 \f$ L^1 \f$ norm threshold parameter
+ *
+ * Lasso: \f[ Y_{ij} = \left\{
+ * \begin{array}{lr}
+ * X_{ij} + l1 & , X_{ij} < -l1 \\
+ * 0 & , |X_{ij}| \le l1 \\
+     *              X_{ij} - l1 & , X_{ij} > l1
+ * \end{array}
+ * \right. \f]
+ */
+ void ApplyL1(BaseFloat l1)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th column by scale[i]
+ void ScaleCols(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th row by scale[i]
+ void ScaleRows(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * A + beta * B
+ void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * row + beta * B
+ void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// C = alpha * A(^T)*B(^T) + beta * C
+ void Gemm(char transa, char transb,
+ _ElemT alpha,
+ const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B,
+ _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// A = alpha * x*y^T + A
+ void BlasGer(_ElemT alpha,
+ const CuVector<_ElemT>& x, const CuVector<_ElemT>& y)
+ { Error("__func__ Not implemented"); }
+
+
+    /// Multiply two matrices element-wise: C = A .* C
+ void MulElem(const CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// A = log(A)
+ void LogElem()
+ { Error("__func__ Not implemented"); }
+
+ void Print() const
+ {
+ Matrix<_ElemT> mat(Rows(),Cols());
+ CopyTo(mat);
+ std::cout << mat;
+ }
+
+
+
+ void CheckData()
+ {
+ Matrix<_ElemT> mat;
+ CopyTo(mat);
+ for(size_t i=0; i<Rows(); i++) {
+ for(size_t j=0; j<Cols(); j++) {
+ if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) {
+ std::ostringstream os;
+            os << "Invalid value: " << mat(i,j) << " at row " << i << " col " << j << "\n";
+ Error(os.str());
+ }
+ }
+ }
+ }
+
+
+ private:
+ size_t mRows;
+ size_t mCols;
+ size_t mStride;
+
+ _ElemT* mpCUData;
+
+ };
+
+
+ /// Prints the matrix dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat)
+ {
+ out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride()
+ << " PTR" << mat.pCUData() << "]" << std::flush;
+ return out;
+ }
+
+
+}
+
+
+#include "cumatrix.tcc"
+
+#endif
diff --git a/src/CuBaseLib/cumatrix.tcc b/src/CuBaseLib/cumatrix.tcc
new file mode 100644
index 0000000..7d6a136
--- /dev/null
+++ b/src/CuBaseLib/cumatrix.tcc
@@ -0,0 +1,660 @@
+
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+
+#include "Timer.h"
+#include "cucommon.h"
+#include "cuvector.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ Init(size_t rows, size_t cols)
+ {
+ if(mRows == rows && mCols == cols) {
+ //SetZero();
+ return *this;
+ }
+
+ Destroy();
+
+ size_t row_bytes = cols * sizeof(_ElemT);
+ size_t pitch;
+ cuSafeCall(cudaMallocPitch((void**)&mpCUData, &pitch, row_bytes, rows+1));
+ mRows = rows; mCols = cols;
+ mStride = pitch/sizeof(_ElemT);
+ SetZero();
+
+ isOwn=true;
+ return *this;
+ }
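cudaMallocPitch may pad each row for alignment, so the returned pitch (in bytes) is converted to an element stride and kept separately from mCols; every kernel launch and 2D memcpy in this library then steps between rows by that stride (note the call above also requests one extra pitched row). A small worked example, assuming a 64-byte pitch granularity purely for illustration:

    // rows = 10, cols = 10, _ElemT = float:
    //   row_bytes = 10 * sizeof(float) = 40 bytes
    //   cudaMallocPitch may round the pitch up, e.g. to 64 bytes
    //   mStride = 64 / sizeof(float) = 16 elements, while mCols stays 10
    //   element (r,c) is then addressed as mpCUData[c + r*16]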
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ Init(CuMatrix<_ElemT>& rSrc, size_t x, size_t cols)
+ {
+ mRows = rSrc.Rows();
+ mCols = cols;
+ mStride = rSrc.Stride();
+ mpCUData = rSrc.pCUData() + x;
+ isOwn=false;
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ Init(CuMatrix<_ElemT>& rSrc)
+ {
+ mRows = rSrc.Rows();
+ mCols = rSrc.Cols();
+ mStride = rSrc.Stride();
+ mpCUData = rSrc.pCUData();
+ isOwn=false;
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ Destroy()
+ {
+ if(NULL != mpCUData && isOwn) {
+ cuSafeCall(cudaFree(mpCUData));
+ }
+ mpCUData = NULL;
+ mRows = mCols = mStride = 0;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const CuMatrix<_ElemT>& rSrc)
+ {
+ Init(rSrc.Rows(),rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pCUData(), src_pitch, width, rSrc.Rows(), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromD2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const Matrix<_ElemT>& rSrc)
+ {
+ Init(rSrc.Rows(),rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pData(), src_pitch, width, rSrc.Rows(), cudaMemcpyHostToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromH2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyTo(Matrix<_ElemT>& rDst) const
+ {
+ if(rDst.Rows() != Rows() || rDst.Cols() != Cols()) {
+ rDst.Init(Rows(),Cols());
+ }
+
+ Timer tim; tim.Start();
+
+ size_t src_pitch = mStride*sizeof(_ElemT);
+ size_t dst_pitch = rDst.Stride()*sizeof(_ElemT);
+ size_t width = Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(rDst.pData(), dst_pitch, pCUData(), src_pitch, width, Rows(), cudaMemcpyDeviceToHost));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToD2H",tim.Val());
+
+ return rDst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+ assert(rowCnt+srcOri <= rSrc.Rows());
+ assert(rowCnt+dstOri <= Rows());
+ assert(Cols() == rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+
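+    // a row offset of k rows corresponds to k*Stride() elements in the pitched layout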
+ const _ElemT* p_src = rSrc.pCUData() + srcOri*rSrc.Stride();
+ _ElemT* p_dst = mpCUData + dstOri*mStride;
+
+ cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, rowCnt, cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Val());
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+ assert(colCnt+srcOri <= rSrc.Cols());
+ assert(colCnt+dstOri <= Cols());
+ assert(Rows() == rSrc.Rows());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = colCnt*sizeof(_ElemT);
+
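+    // a column offset only shifts the start within each row; cudaMemcpy2D below then
+    // copies a colCnt-wide strip out of every one of the Rows() rows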
+ const _ElemT* p_src = rSrc.pCUData() + srcOri;
+ _ElemT* p_dst = mpCUData + dstOri;
+
+ cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, Rows(), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyColsD2D",tim.Val());
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ SetZero()
+ {
+ Timer tim; tim.Start();
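+    // clears the whole pitched block (mRows*mStride elements), padding included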
+ cuSafeCall(cudaMemset(mpCUData, 0, mRows*mStride*sizeof(_ElemT)));
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero",tim.Val());
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuMatrix:: template specializations (float)
+ ////
+ template<>
+ inline void CuMatrix<float>::SetConst(float value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+    cudaF_set_const(dimGrid,dimBlock,mpCUData,value,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyLog()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_log(dimGrid,dimBlock,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyMask(const CuMatrix<BaseFloat>& mask)
+ {
+ Timer tim; tim.Start();
+
+ assert(mask.Rows() == Rows());
+ assert(mask.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_mask(dimGrid,dimBlock,mpCUData,mask.pCUData(),Dim(),mask.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyL1(float l1)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_l1(dimGrid,dimBlock,mpCUData,l1,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ScaleCols(const CuVector<float>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::ScaleRows(const CuVector<float>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::AddScaled(float alpha, const CuMatrix<float>& A, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(A.Rows() == Rows());
+ assert(A.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::AddScaledRow(float alpha, const CuVector<float>& row, float beta)
+ {
+ Timer tim; tim.Start();
+
+ if(row.Dim() != Cols()) {
+ std::ostringstream os;
+ os << "Non matching dimensions: Cols:" << Cols() << " VectorDim:" << row.Dim();
+ Error(os.str());
+ }
+ assert(row.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::Gemm(char transa, char transb,
+ float alpha,
+ const CuMatrix<float>& A, const CuMatrix<float>& B,
+ float beta)
+ {
+ // CUBLAS is col major, TNet is row major
+  // keep trans..., just swap the A & B arguments: A->B, B->A
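+  // (a row-major matrix viewed as column-major is its transpose, and
+  //  C^T = (A*B)^T = B^T * A^T, so exchanging A and B in the CUBLAS call
+  //  writes the row-major result directly into *this)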
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+
+ #if 0
+ //DEBUG MESSAGE
+ std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+ alpha << " " << B << " " << B.Stride() << " " <<
+                A << " " << A.Stride() << " " << beta << " " << (*this) << " " <<
+                Stride() << "\n" << std::flush;
+ #endif
+
+ Timer tim; tim.Start();
+
+ cublasSgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::BlasGer(float alpha,
+ const CuVector<float>& x, const CuVector<float>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
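+  // (row-major A += alpha*x*y^T is, viewed column-major, A^T += alpha*y*x^T,
+  //  hence y is passed as the first CUBLAS vector and x as the second)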
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+
+ Timer tim; tim.Start();
+
+ cublasSger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::MulElem(const CuMatrix<float>& A)
+ {
+ Timer tim; tim.Start();
+
+ assert(mCols == A.Cols());
+ assert(mRows == A.Rows());
+ assert(mStride == A.Stride());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::LogElem()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_log_elem(dimGrid,dimBlock,mpCUData, Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuMatrix:: template specializations (double)
+ ////
+ template<>
+ inline void CuMatrix<double>::SetConst(double value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_set_const(dimGrid,dimBlock,mpCUData,value,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::ApplyLog()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_apply_log(dimGrid,dimBlock,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::ScaleCols(const CuVector<double>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::ScaleRows(const CuVector<double>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::AddScaled(double alpha, const CuMatrix<double>& A, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(A.Rows() == Rows());
+ assert(A.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::AddScaledRow(double alpha, const CuVector<double>& row, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(row.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::Gemm(char transa, char transb,
+ double alpha,
+ const CuMatrix<double>& A, const CuMatrix<double>& B,
+ double beta)
+ {
+ // CUBLAS is col major, TNet is row major
+  // keep trans..., just swap the A & B arguments: A->B, B->A
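+  // (see the note in the float specialization above for why exchanging A and B is valid)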
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+
+ #if 0
+ //DEBUG MESSAGE
+ std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+ alpha << " " << B << " " << B.Stride() << " " <<
+                A << " " << A.Stride() << " " << beta << " " << (*this) << " " <<
+                Stride() << "\n" << std::flush;
+ #endif
+
+ Timer tim; tim.Start();
+
+ cublasDgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ inline void CuMatrix<double>::BlasGer(double alpha,
+ const CuVector<double>& x, const CuVector<double>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+
+ Timer tim; tim.Start();
+
+ cublasDger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ inline void CuMatrix<double>::MulElem(const CuMatrix<double>& A)
+ {
+ Timer tim; tim.Start();
+
+ assert(mCols == A.Cols());
+ assert(mRows == A.Rows());
+ assert(mStride == A.Stride());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::LogElem()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_log_elem(dimGrid,dimBlock,mpCUData, Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+}
diff --git a/src/CuBaseLib/curand.h b/src/CuBaseLib/curand.h
new file mode 100644
index 0000000..8aa66d5
--- /dev/null
+++ b/src/CuBaseLib/curand.h
@@ -0,0 +1,40 @@
+#ifndef _CU_RAND_H_
+#define _CU_RAND_H_
+
+
+#include "cumatrix.h"
+
+
+namespace TNet {
+
+ template<typename T>
+ class CuRand {
+ public:
+
+ CuRand(size_t rows, size_t cols)
+ { SeedGpu(rows,cols); }
+
+ ~CuRand() { }
+
+ void SeedGpu(size_t rows, size_t cols);
+ void Rand(CuMatrix<T>& tgt);
+ void GaussRand(CuMatrix<T>& tgt);
+
+ void BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states);
+ void AddGaussNoise(CuMatrix<T>& tgt, T gscale = 1.0);
+
+ private:
+ static void SeedRandom(Matrix<unsigned>& mat);
+
+ private:
+ CuMatrix<unsigned> z1, z2, z3, z4;
+ CuMatrix<T> tmp;
+ };
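+
+  // Usage sketch (assumes the CUDA device has been selected elsewhere):
+  //   CuRand<float> rnd(100, 50);   // seeds a 100x50 per-element generator state on the GPU
+  //   CuMatrix<float> m;
+  //   rnd.Rand(m);                  // m is resized to 100x50 and filled with uniforms in (0,1)
+  //   rnd.GaussRand(m);             // m is refilled with N(0,1) samples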
+
+}
+
+
+#include "curand.tcc"
+
+
+#endif
diff --git a/src/CuBaseLib/curand.tcc b/src/CuBaseLib/curand.tcc
new file mode 100644
index 0000000..e337189
--- /dev/null
+++ b/src/CuBaseLib/curand.tcc
@@ -0,0 +1,228 @@
+
+#include <cstdlib>
+#include "curandkernels.h"
+
+
+namespace TNet {
+
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedGpu(size_t rows, size_t cols)
+ {
+ Matrix<unsigned> mat(rows,cols);
+ SeedRandom(mat);
+ z1.CopyFrom(mat);
+ SeedRandom(mat);
+ z2.CopyFrom(mat);
+ SeedRandom(mat);
+ z3.CopyFrom(mat);
+ SeedRandom(mat);
+ z4.CopyFrom(mat);
+
+ /*
+ std::cout << "RANDININIT" << std::endl;
+ z1.Print();
+ z2.Print();
+ z3.Print();
+ z4.Print();
+ std::cout << "RANDININIT" << std::endl;
+ */
+
+ tmp.Init(rows,cols);
+ }
+
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedRandom(Matrix<unsigned>& mat) {
+ for(size_t j=0; j<mat.Rows(); j++) {
+ for(size_t i=0; i<mat.Cols(); i++) {
+ unsigned value = 0;
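+        // keep drawing until the seed exceeds 128; very small seeds can make the
+        // Tausworthe components degenerate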
+ while(value <= 128) { value = lrand48(); }
+ mat(j,i) = value;
+ }
+ }
+ }
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ AddGaussNoise(CuMatrix<T>& tgt, T gscale)
+ {
+ GaussRand(tmp);
+ tgt.AddScaled(gscale,tmp,1.0);
+ }
+
+
+
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+  //// generic wrappers over the CUDA kernels (not implemented;
+  //// only the float/double specializations below are valid)
+ template<typename T>
+ inline void
+ CuRand<T>::
+ Rand(CuMatrix<T>& tgt)
+ { Error("Unimplemented"); }
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ GaussRand(CuMatrix<T>& tgt)
+ { Error("Unimplemented"); }
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states)
+ { Error("Unimplemented"); }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ //// float specializations
+ template<>
+ inline void
+ CuRand<float>::
+ Rand(CuMatrix<float>& tgt)
+ {
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaF_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<float>::
+ GaussRand(CuMatrix<float>& tgt)
+ {
+
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaF_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<float>::
+ BinarizeProbs(const CuMatrix<float>& probs, CuMatrix<float>& states)
+ {
+ if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) {
+ Error("Non matching dims!!");
+ }
+
+ states.Init(z1.Rows(),z1.Cols());
+ Rand(tmp);
+
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK));
+
+ cudaF_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ //// double specializations
+ template<>
+ inline void
+ CuRand<double>::
+ Rand(CuMatrix<double>& tgt)
+ {
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaD_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<double>::
+ GaussRand(CuMatrix<double>& tgt)
+ {
+
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaD_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<double>::
+ BinarizeProbs(const CuMatrix<double>& probs, CuMatrix<double>& states)
+ {
+ if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) {
+ Error("Non matching dims!!");
+ }
+
+ states.Init(z1.Rows(),z1.Cols());
+ Rand(tmp);
+
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK));
+
+ cudaD_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+}
diff --git a/src/CuBaseLib/curandkernels.cu b/src/CuBaseLib/curandkernels.cu
new file mode 100644
index 0000000..5e42258
--- /dev/null
+++ b/src/CuBaseLib/curandkernels.cu
@@ -0,0 +1,135 @@
+
+#include "curandkernels.h"
+
+
+
+//
+//Hybrid Tausworthe/LCG random number generator
+//
+//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
+
+
+// S1, S2, S3, and M are all constants, and z is part of the
+// private per-thread generator state.
+__device__
+static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M)
+{
+ unsigned b=(((z << S1) ^ z) >> S2);
+ return z = (((z & M) << S3) ^ b);
+}
+
+// A and C are constants
+__device__
+static unsigned LCGStep(unsigned &z, unsigned A, unsigned C)
+{
+ return z=(A*z+C);
+}
+
+template<typename T>
+__device__
+static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ // Combined period is lcm(p1,p2,p3,p4)~ 2^121
+ T randval;
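+    // 2.3283064365387e-10 ~ 2^-32 maps the xor-combined 32-bit word into (0,1);
+    // the rejection loop excludes the endpoints so log(u0) in BoxMuller stays finite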
+ do {
+ randval = 2.3283064365387e-10 * ( // Periods
+ TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1
+ TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1
+ TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1
+ LCGStep(z4, 1664525, 1013904223UL) // p4=2^32
+ );
+ } while (!(randval > 0.0 && randval < 1.0));
+ return randval;
+}
+
+
+
+
+template<typename T>
+__global__
+static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+/*
+float2 BoxMuller()
+{
+ float u0=HybridTaus (), u1=HybridTaus ();
+ float r=sqrt(-2 log(u0));
+ float theta=2*PI*u1;
+ return make_float2(r*sin(theta),r*cos(theta));
+}
+*/
+
+template<typename T>
+__device__
+static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ const T M_2PI = 6.283185307179586476925286766558;
+
+ T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4);
+ T r = sqrt(-2.0 * log(u0));
+ T theta = M_2PI * u1;
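+  // only the sine half of the Box-Muller pair is returned; the cosine half is discarded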
+ return r*sin(theta);
+
+}
+
+
+template<typename T>
+__global__
+static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+
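+// samples binary states: each element becomes 1 when its probability exceeds the uniform draw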
+template<typename T>
+__global__
+static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ states[index] = ((probs[index] > rand[index])? 1.0 : 0.0);
+ }
+}
+
+
+
+/************
+ * :FLOAT:
+ */
+void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
+
+/************
+ * :DOUBLE:
+ */
+void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
diff --git a/src/CuBaseLib/curandkernels.cu~ b/src/CuBaseLib/curandkernels.cu~
new file mode 100644
index 0000000..7e1c8dd
--- /dev/null
+++ b/src/CuBaseLib/curandkernels.cu~
@@ -0,0 +1,135 @@
+
+#include "curandkernels.h"
+
+
+
+//
+//Hybrid Tausworthe/LCG random number generator
+//
+//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
+
+
+// S1, S2, S3, and M are all constants, and z is part of the
+// private per-thread generator state.
+__device__
+static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M)
+{
+ unsigned b=(((z << S1) ^ z) >> S2);
+ return z = (((z & M) << S3) ^ b);
+}
+
+// A and C are constants
+__device__
+static unsigned LCGStep(unsigned &z, unsigned A, unsigned C)
+{
+ return z=(A*z+C);
+}
+
+template<typename T>
+__device__
+static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ // Combined period is lcm(p1,p2,p3,p4)~ 2^121
+ T randval;
+ do {
+ randval = 2.3283064365387e-10 * ( // Periods
+ TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1
+ TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1
+ TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1
+ LCGStep(z4, 1664525, 1013904223UL) // p4=2^32
+ );
+ } while (!(randval > 0.0 && randval < 1.0));
+ return randval;
+}
+
+
+
+
+template<typename T>
+__global__
+static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+/*
+float2 BoxMuller()
+{
+ float u0=HybridTaus (), u1=HybridTaus ();
+ float r=sqrt(-2 log(u0));
+ float theta=2*PI*u1;
+ return make_float2(r*sin(theta),r*cos(theta));
+}
+*/
+
+template<typename T>
+__device__
+static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ const T M_2PI = 6.283185307179586476925286766558;
+
+ T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4);
+ T r = sqrt(-2.0 * log(u0));
+ T theta = M_2PI * u1;
+ return r*sin(theta);
+
+}
+
+
+template<typename T>
+__global__
+static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+
+template<typename T>
+__global__
+static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ states[index] = ((probs[index] > rand[index])? 1.0 : 0.0);
+ }
+}
+
+
+
+/************
+ * :FLOAT:
+ */
+void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
+
+/************
+ * :DOUBLE:
+ */
+void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
diff --git a/src/CuBaseLib/curandkernels.h b/src/CuBaseLib/curandkernels.h
new file mode 100644
index 0000000..69b589f
--- /dev/null
+++ b/src/CuBaseLib/curandkernels.h
@@ -0,0 +1,34 @@
+#ifndef _cuda_rand_kernels_h_
+#define _cuda_rand_kernels_h_
+
+
+#include "cukernels.h"
+
+
+extern "C" {
+ //**************
+ //float
+ //
+ void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d);
+
+ //**************
+ //double
+ //
+ void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d);
+
+}
+
+
+#endif
diff --git a/src/CuBaseLib/cuvector.h b/src/CuBaseLib/cuvector.h
new file mode 100644
index 0000000..7bfe116
--- /dev/null
+++ b/src/CuBaseLib/cuvector.h
@@ -0,0 +1,121 @@
+#ifndef _CUVECTOR_H_
+#define _CUVECTOR_H_
+
+#include "Vector.h"
+
+namespace TNet {
+
+ template<typename _ElemT> class CuMatrix;
+
+ /**
+ * \brief Vector for CUDA computing
+ *
+   * A vector is treated as a matrix consisting of a single row
+ */
+ template<typename _ElemT>
+ class CuVector
+ {
+ typedef CuVector<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuVector<_ElemT>()
+ : mDim(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuVector<_ElemT>(size_t dim)
+ : mDim(0), mpCUData(NULL)
+ { Init(dim); }
+
+ /// Destructor
+ ~CuVector()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Dim() const
+ { return mDim; }
+
+ /*
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = { mDim, 1, 1 }; return d; }
+ */
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Allocate the memory
+ ThisType& Init(size_t dim);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuVector<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Vector<_ElemT>& rSrc);
+ Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const;
+
+
+
+ // Math operations
+ /// Set to All Zeros
+ void SetZero();
+
+ /// Set to Constant
+ ///
+ /// \param[in] value Desired constant value
+ void SetConst(_ElemT value)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+ /// Add scaled vector
+ ///
+ /// \param[in] alpha
+ /// \param[in] vec
+ /// \param[in] beta
+    /// Computes \f$ A = \alpha\, vec + \beta A \f$
+ void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+    /// Add scaled column-sum of a matrix
+    ///
+    /// \param[in] alpha
+    /// \param[in] mat
+    /// \param[in] beta
+    /// Computes \f$ A = \alpha\, mat.ColSum() + \beta A \f$
+ void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+ void Print() const
+ {
+ Vector<_ElemT> vec(Dim());
+ CopyTo(vec);
+ std::cout << vec << "\n";
+ }
+
+
+ private:
+ size_t mDim;
+ _ElemT* mpCUData;
+ };
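+
+  // Usage sketch (assumes `mat` is an existing CuMatrix<float>):
+  //   CuVector<float> colsum(mat.Cols());    // allocated on the GPU and zeroed
+  //   colsum.AddColSum(1.0f, mat, 0.0f);     // colsum = per-column sums of mat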
+
+
+  /// Prints the vector dimension and data pointer to the stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec)
+ {
+ size_t d = vec.Dim();
+ out << "[CuVector D" << d
+ << " PTR" << vec.pCUData() << "]" << std::flush;
+ return out;
+ }
+
+
+}
+
+
+#include "cuvector.tcc"
+
+#endif
diff --git a/src/CuBaseLib/cuvector.h~ b/src/CuBaseLib/cuvector.h~
new file mode 100644
index 0000000..d118ec4
--- /dev/null
+++ b/src/CuBaseLib/cuvector.h~
@@ -0,0 +1,104 @@
+#ifndef _CUVECTOR_H_
+#define _CUVECTOR_H_
+
+#include "Vector.h"
+
+namespace TNet {
+
+ template<typename _ElemT> class CuMatrix;
+
+ /**
+ * \brief Vector for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuVector
+ {
+ typedef CuVector<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuVector<_ElemT>()
+ : mDim(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuVector<_ElemT>(size_t dim)
+ : mDim(0), mpCUData(NULL)
+ { Init(dim); }
+
+ /// Destructor
+ ~CuVector()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Dim() const
+ { return mDim; }
+
+ /*
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = { mDim, 1, 1 }; return d; }
+ */
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Allocate the memory
+ ThisType& Init(size_t dim);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuVector<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Vector<_ElemT>& rSrc);
+ Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const;
+
+
+
+ // Math operations
+ //
+ void SetZero();
+
+ void SetConst(_ElemT value)
+ { Error("__func__ Not implemented"); }
+
+ void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ void Print() const
+ {
+ Vector<_ElemT> vec(Dim());
+ CopyTo(vec);
+ std::cout << vec << "\n";
+ }
+
+
+ private:
+ size_t mDim;
+ _ElemT* mpCUData;
+ };
+
+
+ /// Prints the matrix dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec)
+ {
+ size_t d = vec.Dim();
+ out << "[CuVector D" << d
+ << " PTR" << vec.pCUData() << "]" << std::flush;
+ return out;
+ }
+
+
+}
+
+
+#include "cuvector.tcc"
+
+#endif
diff --git a/src/CuBaseLib/cuvector.tcc b/src/CuBaseLib/cuvector.tcc
new file mode 100644
index 0000000..0107859
--- /dev/null
+++ b/src/CuBaseLib/cuvector.tcc
@@ -0,0 +1,254 @@
+
+#include <cuda_runtime_api.h>
+
+#include "Timer.h"
+#include "cucommon.h"
+#include "cumatrix.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ Init(size_t dim)
+ {
+ if(mDim == dim) {
+ //SetZero();
+ return *this;
+ }
+
+ Destroy();
+
+ cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT)));
+ mDim = dim;
+ SetZero();
+
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ Destroy()
+ {
+ if(NULL != mpCUData) {
+ cuSafeCall(cudaFree(mpCUData));
+ mpCUData = NULL;
+ }
+ mDim = 0;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const CuVector<_ElemT>& rSrc)
+ {
+ Init(rSrc.Dim());
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const Vector<_ElemT>& rSrc)
+ {
+ Init(rSrc.Dim());
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyHostToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyTo(Vector<_ElemT>& rDst) const
+ {
+ if(rDst.Dim() != mDim) {
+ rDst.Init(mDim);
+ }
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), mDim*sizeof(_ElemT), cudaMemcpyDeviceToHost));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val());
+
+ return rDst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ SetZero()
+ {
+ Timer tim; tim.Start();
+ cuSafeCall(cudaMemset(mpCUData, 0, mDim*sizeof(_ElemT)));
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val());
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuVector:: template specializations (float)
+ ////
+ template<>
+ inline void CuVector<float>::SetConst(float value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaF_set_const(dimGrid,dimBlock,mpCUData,value,d);
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(vec.Dim() == Dim());
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaF_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(mat.Cols() == Dim());
+
+    /**
+     * The reduce kernel is only used for mat.Rows() <= 512 (limited threads and
+     * shared memory per block) and mat.Cols() <= 256 (matrices with huge strides
+     * have poorly coalesced, slow access); otherwise fall back to the plain kernel.
+     */
+ if(mat.Rows() > 512 || mat.Cols() > 256) {
+ size_t dimBlock = CUBLOCK*2;
+ size_t dimGrid = n_blocks(Dim(),CUBLOCK*2);
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(mat.Rows(),1);
+ dim3 dimGrid(1,Dim());
+
+ cudaF_add_col_sum_reduce(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuVector:: template specializations (double)
+ ////
+ template<>
+ inline void CuVector<double>::SetConst(double value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaD_set_const(dimGrid,dimBlock,mpCUData,value,d);
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(vec.Dim() == Dim());
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaD_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(mat.Cols() == Dim());
+
+ size_t dimBlock = CUBLOCK*2;
+ size_t dimGrid = n_blocks(Dim(),CUBLOCK*2);
+
+ cudaD_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}
+
+
+
diff --git a/src/CuTNetLib/.depend.mk b/src/CuTNetLib/.depend.mk
new file mode 100644
index 0000000..bcf9cb2
--- /dev/null
+++ b/src/CuTNetLib/.depend.mk
@@ -0,0 +1,2250 @@
+cuActivation.o: cuActivation.cc cuActivation.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumath.h
+cuBiasedLinearity.o: cuBiasedLinearity.cc cuBiasedLinearity.h \
+ cuComponent.h ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h
+cuBlockArray.o: cuBlockArray.cc cuBlockArray.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h cuNetwork.h cuBiasedLinearity.h \
+ cuActivation.h cuCRBEDctFeat.h ../CuBaseLib/cumath.h
+cuCache.o: cuCache.cc cuCache.h ../CuBaseLib/cumatrix.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/ios /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc ../KaldiLib/Matrix.h \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/sys/types.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/select2.h \
+ /usr/include/sys/sysmacros.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/iostream ../KaldiLib/cblas.h ../KaldiLib/clapack.h \
+ ../KaldiLib/cblas.h ../KaldiLib/Common.h /usr/include/c++/4.6/cstdlib \
+ /usr/include/string.h /usr/include/bits/string3.h ../KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../KaldiLib/Error.h \
+ ../CuBaseLib/cuvector.h ../KaldiLib/Vector.h ../CuBaseLib/cuvector.tcc \
+ ../CuBaseLib/cumatrix.h ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumath.h
+cuCompDisc.o: cuCompDisc.cc cuCompDisc.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h cuNetwork.h cuBiasedLinearity.h \
+ cuActivation.h cuCRBEDctFeat.h ../CuBaseLib/cumath.h
+cuConcatenate.o: cuConcatenate.cc cuConcatenate.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h cuNetwork.h cuBiasedLinearity.h \
+ cuActivation.h cuCRBEDctFeat.h ../CuBaseLib/cumath.h
+cuDiscreteLinearity.o: cuDiscreteLinearity.cc cuDiscreteLinearity.h \
+ cuComponent.h ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumath.h
+cuLinearity.o: cuLinearity.cc cuLinearity.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h
+cuNetwork.o: cuNetwork.cc /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/ios /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc cuNetwork.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/stdexcept /usr/include/c++/4.6/iostream \
+ ../KaldiLib/cblas.h ../KaldiLib/clapack.h ../KaldiLib/cblas.h \
+ ../KaldiLib/Common.h /usr/include/string.h /usr/include/bits/string3.h \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h cuBiasedLinearity.h \
+ cuActivation.h cuCRBEDctFeat.h ../CuBaseLib/cumath.h \
+ cuDiscreteLinearity.h cuSharedLinearity.h cuSparseLinearity.h cuRbm.h \
+ cuRbmSparse.h cuRecurrent.h cuBlockArray.h cuLinearity.h \
+ cuUpdatableBias.h cuMisc.h cuCompDisc.h
+cuObjectiveFunction.o: cuObjectiveFunction.cc cuObjectiveFunction.h \
+ /usr/include/c++/4.6/cassert /usr/include/assert.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/cmath /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/ios /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef /usr/include/c++/4.6/cstdlib \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/sys/types.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/select2.h \
+ /usr/include/sys/sysmacros.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/iostream ../KaldiLib/cblas.h ../KaldiLib/clapack.h \
+ ../KaldiLib/cblas.h ../KaldiLib/Common.h /usr/include/string.h \
+ /usr/include/bits/string3.h ../KaldiLib/MathAux.h ../KaldiLib/Types.h \
+ ../KaldiLib/Error.h /usr/include/execinfo.h ../KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring /usr/include/c++/4.6/fstream \
+ /usr/include/c++/4.6/bits/codecvt.h /usr/include/c++/4.6/cstdio \
+ /usr/include/libio.h /usr/include/_G_config.h \
+ /usr/include/bits/stdio_lim.h /usr/include/bits/sys_errlist.h \
+ /usr/include/bits/stdio.h /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../CuBaseLib/cuvector.h ../CuBaseLib/cuvector.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ ../KaldiLib/Timer.h /usr/include/sys/time.h ../CuBaseLib/cucommon.h \
+ ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h ../KaldiLib/Matrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ ../CuBaseLib/cumatrix.tcc /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cumath.h
+cuRbm.o: cuRbm.cc /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc cuRbm.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h \
+ /usr/include/c++/4.6/stdexcept /usr/include/c++/4.6/iostream \
+ ../KaldiLib/cblas.h ../KaldiLib/clapack.h ../KaldiLib/cblas.h \
+ ../KaldiLib/Common.h /usr/include/string.h /usr/include/bits/string3.h \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumath.h
+cuRbmSparse.o: cuRbmSparse.cc /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc cuRbmSparse.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h \
+ /usr/include/c++/4.6/stdexcept /usr/include/c++/4.6/iostream \
+ ../KaldiLib/cblas.h ../KaldiLib/clapack.h ../KaldiLib/cblas.h \
+ ../KaldiLib/Common.h /usr/include/string.h /usr/include/bits/string3.h \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h cuRbm.h ../CuBaseLib/cumath.h
+cuRecurrent.o: cuRecurrent.cc /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc cuRecurrent.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h \
+ /usr/include/c++/4.6/stdexcept /usr/include/c++/4.6/iostream \
+ ../KaldiLib/cblas.h ../KaldiLib/clapack.h ../KaldiLib/cblas.h \
+ ../KaldiLib/Common.h /usr/include/string.h /usr/include/bits/string3.h \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumath.h
+cuSharedLinearity.o: cuSharedLinearity.cc cuSharedLinearity.h \
+ cuComponent.h ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h ../CuBaseLib/cumath.h
+cuSparseLinearity.o: cuSparseLinearity.cc cuSparseLinearity.h \
+ cuComponent.h ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h
+cuUpdatableBias.o: cuUpdatableBias.cc cuUpdatableBias.h cuComponent.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h ../KaldiLib/Error.h ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cukernels.h /usr/local/cuda-5.0/include/vector_types.h \
+ /usr/local/cuda-5.0/include/builtin_types.h \
+ /usr/local/cuda-5.0/include/device_types.h \
+ /usr/local/cuda-5.0/include/host_defines.h \
+ /usr/local/cuda-5.0/include/driver_types.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/limits.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include-fixed/syslimits.h \
+ /usr/include/limits.h /usr/include/bits/posix1_lim.h \
+ /usr/include/bits/local_lim.h /usr/include/linux/limits.h \
+ /usr/include/bits/posix2_lim.h /usr/include/bits/xopen_lim.h \
+ /usr/local/cuda-5.0/include/surface_types.h \
+ /usr/local/cuda-5.0/include/texture_types.h \
+ /usr/local/cuda-5.0/include/vector_types.h ../CuBaseLib/cumatrix.tcc \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/cuda_device_runtime_api.h \
+ /usr/local/cuda-5.0/include/cublas.h \
+ /usr/local/cuda-5.0/include/cuda_runtime.h \
+ /usr/local/cuda-5.0/include/host_config.h \
+ /usr/local/cuda-5.0/include/channel_descriptor.h \
+ /usr/local/cuda-5.0/include/cuda_runtime_api.h \
+ /usr/local/cuda-5.0/include/driver_functions.h \
+ /usr/local/cuda-5.0/include/vector_functions.h \
+ /usr/local/cuda-5.0/include/cublas_api.h \
+ /usr/local/cuda-5.0/include/cuComplex.h ../KaldiLib/Timer.h \
+ /usr/include/sys/time.h ../CuBaseLib/cucommon.h ../CuBaseLib/cuvector.h \
+ ../CuBaseLib/cuvector.tcc ../CuBaseLib/cumatrix.h \
+ ../CuBaseLib/cudevice.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h
diff --git a/src/CuTNetLib/.svn/entries b/src/CuTNetLib/.svn/entries
new file mode 100644
index 0000000..8846e9b
--- /dev/null
+++ b/src/CuTNetLib/.svn/entries
@@ -0,0 +1,946 @@
+10
+
+dir
+117
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/CuTNetLib
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet
+
+
+
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+bda6da93-004a-4ae9-8e07-715c10848801
+
+cuCache.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+ba7400672c1166f91f51b1d02b0124ab
+2011-12-22T15:49:51.623339Z
+96
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5190
+
+cuActivation.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+a6e665ef8082542cddeb9cfb344f01d2
+2011-09-26T14:48:24.142069Z
+73
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2417
+
+cuComponent.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+410e86f76bb21318f8e348f7287de915
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+8634
+
+cuCache.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+2d53b9558d6ce6627a31f4c76399905e
+2011-04-04T17:14:16.666438Z
+46
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2021
+
+cuCRBEDctFeat.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+02cfc50560d325a7b919e321c854a888
+2011-09-26T14:48:24.142069Z
+73
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+7633
+
+cuBiasedLinearity.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+293eda5bda355fdea6417f94c3657a08
+2011-04-04T17:14:16.666438Z
+46
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3082
+
+cuRecurrent.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+45994d12f52bfb36ecc3b40d041073d0
+2011-03-22T21:01:03.678832Z
+41
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5721
+
+cuBiasedLinearity.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+df1f7b17849004a0574744a43befabc8
+2011-09-26T14:48:24.142069Z
+73
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1846
+
+cuRecurrent.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+f0005036c7deeb217115a673e6fff070
+2011-09-26T14:48:24.142069Z
+73
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2002
+
+cuSparseLinearity.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+e7121409664c29ea00a3abc0b3bd6caa
+2011-09-26T13:47:57.076756Z
+70
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4896
+
+cuObjectiveFunction.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+fbedbd40c62459261f9f694f8bf62c29
+2011-03-07T10:43:43.160610Z
+40
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2128
+
+cuRbm.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+e849f983f4533fd34b87295e4d93602f
+2011-10-11T11:00:50.704096Z
+81
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6654
+
+cuSparseLinearity.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+0d38089478e85f485e737a1f5a5e24ad
+2011-09-26T14:48:24.142069Z
+73
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2570
+
+cuObjectiveFunction.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+ba18c448975a869560b72311cefdca58
+2011-04-04T17:14:16.666438Z
+46
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3792
+
+cuRbm.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+065ff585dcb58651e1084e90a447290a
+2011-12-08T10:59:03.566125Z
+94
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3522
+
+cuDiscreteLinearity.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+d5580b34ae83cdadd40902f705edb990
+2011-12-08T10:59:03.566125Z
+94
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4235
+
+cuDiscreteLinearity.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+1bc0c4c0ab790c9585a272fb63319041
+2011-12-08T10:59:03.566125Z
+94
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1971
+
+cuSharedLinearity.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+ee6608a11be7ded2ba5c60b009e4846a
+2011-04-04T17:14:16.666438Z
+46
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5282
+
+cuNetwork.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+6bd052ddb4d464abf3dc8ef5ef17a4b3
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+9320
+
+cuBlockArray.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+641f990dfe9ed2666cf076a43982556e
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3240
+
+cuSharedLinearity.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+fa4ddac3fab6f14d678184aef599c269
+2011-09-26T14:48:24.142069Z
+73
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1776
+
+cuNetwork.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+af7fdb7f8e37a6159ed4802eba57e25b
+2011-09-19T11:12:27.685840Z
+69
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5499
+
+cuBlockArray.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+0edf09ffe051998c7f2dd04fec19914c
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1529
+
+cuRbmSparse.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+626fd92e997ff3c18dbecef8150c5e80
+2011-12-08T10:59:03.566125Z
+94
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+7681
+
+cuRbmSparse.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+a8c1d1d2a7dcf461c4180a1eb34c7991
+2011-12-08T10:59:03.566125Z
+94
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3500
+
+cuActivation.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+4c43c28a4c87e7fddad7958a3cfe0e1f
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+694
+
+Makefile
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+57c55cf8cb29bdb2590cfb9d6ba02413
+2011-03-24T17:03:17.103393Z
+43
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+383
+
diff --git a/src/CuTNetLib/.svn/prop-base/Makefile.svn-base b/src/CuTNetLib/.svn/prop-base/Makefile.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/Makefile.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuActivation.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuActivation.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuActivation.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuActivation.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuActivation.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuActivation.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuBiasedLinearity.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuCRBEDctFeat.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuCRBEDctFeat.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuCRBEDctFeat.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuCache.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuCache.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuCache.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuCache.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuCache.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuCache.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuComponent.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuComponent.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuComponent.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuNetwork.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuNetwork.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuNetwork.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuNetwork.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuNetwork.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuNetwork.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuObjectiveFunction.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuRbm.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuRbm.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuRbm.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuRbm.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuRbm.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuRbm.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuRecurrent.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuRecurrent.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuRecurrent.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuRecurrent.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuRecurrent.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuRecurrent.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuSharedLinearity.cc.svn-base b/src/CuTNetLib/.svn/prop-base/cuSharedLinearity.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuSharedLinearity.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/prop-base/cuSharedLinearity.h.svn-base b/src/CuTNetLib/.svn/prop-base/cuSharedLinearity.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/CuTNetLib/.svn/prop-base/cuSharedLinearity.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/CuTNetLib/.svn/text-base/Makefile.svn-base b/src/CuTNetLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..c7678da
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,30 @@
+
+include ../tnet.mk
+
+INCLUDE= -I. -I../ -I../KaldiLib -I../CuBaseLib -I../TNetLib -I$(CUDA_TK_BASE)/include/
+
+
+all : libCuTNet.a
+
+libCuTNet.a : $(OBJ)
+ $(AR) ruv $@ $?
+ $(RANLIB) $@
+
+
+%.o : %.cc
+ $(CXX) -c $< -o $@ $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+
+
+.PHONY: clean depend
+
+clean :
+ rm -f *.o *.a
+
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) > .depend.mk
+
+-include .depend.mk
+
diff --git a/src/CuTNetLib/.svn/text-base/cuActivation.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuActivation.cc.svn-base
new file mode 100644
index 0000000..bd57ae5
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuActivation.cc.svn-base
@@ -0,0 +1,46 @@
+
+#include "cuActivation.h"
+#include "cumath.h"
+
+
+namespace TNet {
+
+
+ void
+ CuSigmoid::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::Sigmoid(Y, X);
+ }
+
+
+ void
+ CuSigmoid::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::DiffSigmoid(Y, X, mOutput);
+ }
+
+
+
+ void
+ CuSoftmax::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::Softmax(Y,X);
+ }
+
+
+
+ void
+ CuSoftmax::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //we assume X is already dE/dSoftmax_input
+ Y.CopyFrom(X);
+ }
+
+
+
+} //namespace
+
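For reference, the elementwise math the Sigmoid/DiffSigmoid kernels called above are assumed to implement looks as follows; this is a CPU sketch on std::vector, not the CUDA code from CuBaseLib (the assumption is that DiffSigmoid(Y, X, mOutput) computes Y = X * mOutput * (1 - mOutput), i.e. the chain rule through the cached forward output). The softmax backward pass above only copies because the objective function is expected to hand over dE/d(softmax input) directly.

    // CPU sketch of the assumed elementwise sigmoid forward/backward math.
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    std::vector<float> SigmoidForward(const std::vector<float>& x) {
      std::vector<float> y(x.size());
      for (std::size_t i = 0; i < x.size(); ++i)
        y[i] = 1.0f / (1.0f + std::exp(-x[i]));           // y = sigmoid(x)
      return y;
    }

    // dE/dx = dE/dy * y * (1 - y), with y the cached forward output (mOutput)
    std::vector<float> SigmoidBackward(const std::vector<float>& dEdy,
                                       const std::vector<float>& y) {
      std::vector<float> dEdx(y.size());
      for (std::size_t i = 0; i < y.size(); ++i)
        dEdx[i] = dEdy[i] * y[i] * (1.0f - y[i]);
      return dEdx;
    }

    int main() {
      std::vector<float> y = SigmoidForward({0.0f, 2.0f});
      std::vector<float> dEdx = SigmoidBackward({1.0f, 1.0f}, y);
      std::printf("y0=%f dEdx0=%f\n", y[0], dEdx[0]);     // 0.5 and 0.25
      return 0;
    }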
diff --git a/src/CuTNetLib/.svn/text-base/cuActivation.h.svn-base b/src/CuTNetLib/.svn/text-base/cuActivation.h.svn-base
new file mode 100644
index 0000000..9fb2862
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuActivation.h.svn-base
@@ -0,0 +1,123 @@
+
+#ifndef _CUACT_FUN_I_
+#define _CUACT_FUN_I_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+namespace TNet
+{
+
+ /**
+ * Common interface for activation functions
+ */
+ class CuActivation : public CuComponent
+ {
+ public:
+ CuActivation(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+
+ protected:
+ };
+
+
+ /**
+ * Sigmoid activation function
+ */
+ class CuSigmoid : public CuActivation
+ {
+ public:
+ CuSigmoid(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ };
+
+
+ /**
+ * Softmax activation function
+ */
+ class CuSoftmax : public CuActivation
+ {
+ public:
+ CuSoftmax(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ };
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // Inline functions
+ // Activation::
+ inline
+ CuActivation::
+ CuActivation(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuComponent(nInputs,nOutputs, pPred)
+ {
+ assert(nInputs == nOutputs);
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // Inline functions
+ // Sigmoid::
+ inline
+ CuSigmoid::
+ CuSigmoid(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuActivation(nInputs,nOutputs, pPred)
+ { }
+
+ inline CuComponent::ComponentType
+ CuSigmoid::
+ GetType() const
+ {
+ return CuComponent::SIGMOID;
+ }
+
+ inline const char*
+ CuSigmoid::
+ GetName() const
+ {
+ return "<sigmoid>";
+ }
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // Inline functions
+ // Softmax::
+ inline
+ CuSoftmax::
+ CuSoftmax(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuActivation(nInputs,nOutputs, pPred)
+ { }
+
+ inline CuComponent::ComponentType
+ CuSoftmax::
+ GetType() const
+ {
+ return CuComponent::SOFTMAX;
+ }
+
+ inline const char*
+ CuSoftmax::
+ GetName() const
+ {
+ return "<softmax>";
+ }
+
+
+} //namespace
+
+
+#endif
diff --git a/src/CuTNetLib/.svn/text-base/cuBiasedLinearity.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuBiasedLinearity.cc.svn-base
new file mode 100644
index 0000000..b9ac137
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuBiasedLinearity.cc.svn-base
@@ -0,0 +1,123 @@
+
+
+#include "cuBiasedLinearity.h"
+
+
+namespace TNet
+{
+
+ void
+ CuBiasedLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mLinearity, 1.0);
+ }
+
+
+ void
+ CuBiasedLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, X, mLinearity, 0.0);
+ }
+
+
+ void
+ CuBiasedLinearity::
+ Update()
+ {
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mLinearityCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(-mLearningRate/N,GetErrorInput(),mMomentum);
+
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mLinearityCorrection.Gemm('T','N',1.0,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ BaseFloat L2_decay = -mLearningRate*mWeightcost*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.AddScaled(L2_decay, mLinearity,1.0);
+#endif
+ }
+
+
+ void
+ CuBiasedLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+ if(mLinearity.Cols() != GetNOutputs() ||
+ mLinearity.Rows() != GetNInputs() ||
+ mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+ }
+
+
+ void
+ CuBiasedLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
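A scalar sketch of the momentum-compensated step in CuBiasedLinearity::Update() (the "#if 1" branch) follows. Here grad stands for the gradient summed over the bunch; the momentum accumulator converges towards grad/(1-momentum), so dividing the step by N = frames * 1/(1-momentum) keeps the effective per-frame step close to learning_rate * average_gradient. The concrete numbers are made up for illustration only.

    // Scalar sketch of the momentum-compensated SGD step (illustrative numbers).
    #include <cstdio>

    int main() {
      double w = 0.5;                              // one weight
      double correction = 0.0;                     // mLinearityCorrection analogue
      const double lr = 0.008, momentum = 0.5, weightcost = 1e-6;
      const int n_frames = 256;                    // GetInput().Rows()

      for (int step = 0; step < 3; ++step) {
        double grad = 0.1 * n_frames;              // gradient summed over the bunch
        double N = n_frames / (1.0 - momentum);    // frames times momentum gain

        correction = grad + momentum * correction; // leaky sum of bunch gradients
        w += -(lr / N) * correction;               // scaled gradient step
        w += -lr * weightcost * w;                 // L2 decay on the actual weight
        std::printf("step %d: w = %f\n", step, w);
      }
      return 0;
    }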
diff --git a/src/CuTNetLib/.svn/text-base/cuBiasedLinearity.h.svn-base b/src/CuTNetLib/.svn/text-base/cuBiasedLinearity.h.svn-base
new file mode 100644
index 0000000..18a75ac
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuBiasedLinearity.h.svn-base
@@ -0,0 +1,85 @@
+#ifndef _CUBIASED_LINEARITY_H_
+#define _CUBIASED_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuBiasedLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuBiasedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuBiasedLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuBiasedLinearity::
+ inline
+ CuBiasedLinearity::
+ CuBiasedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs,nOutputs), mBias(nOutputs),
+ mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs)
+ {
+ mLinearityCorrection.SetConst(0.0);
+ mBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuBiasedLinearity::
+ ~CuBiasedLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuBiasedLinearity::
+ GetType() const
+ {
+ return CuComponent::BIASED_LINEARITY;
+ }
+
+ inline const char*
+ CuBiasedLinearity::
+ GetName() const
+ {
+ return "<biasedlinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/.svn/text-base/cuBlockArray.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuBlockArray.cc.svn-base
new file mode 100644
index 0000000..461bd37
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuBlockArray.cc.svn-base
@@ -0,0 +1,139 @@
+
+
+#include "cuBlockArray.h"
+#include "cuNetwork.h"
+
+
+namespace TNet
+{
+
+ void
+ CuBlockArray::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMatrix<BaseFloat> colsX;
+ CuMatrix<BaseFloat> colsY;
+
+ int X_src_ori=0, Y_tgt_ori=0;
+ for(int i=0; i<mNBlocks; i++) {
+ //copy column stripe from the input X
+ int colsX_cnt=mBlocks[i]->GetNInputs();
+ colsX.Init(X.Rows(),colsX_cnt);
+ colsX.CopyCols(colsX_cnt,X_src_ori,X,0);
+
+ //propagate through the block(network)
+ mBlocks[i]->Propagate(colsX,colsY);
+
+ //copy column stripe to the output Y
+ int colsY_cnt=mBlocks[i]->GetNOutputs();
+ Y.CopyCols(colsY_cnt,0,colsY,Y_tgt_ori);
+
+ //shift the origin coordinates
+ X_src_ori += colsX_cnt;
+ Y_tgt_ori += colsY_cnt;
+ }
+
+ assert(X_src_ori == X.Cols());
+ assert(Y_tgt_ori == Y.Cols());
+ }
+
+
+ void
+ CuBlockArray::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ CuBlockArray::
+ Update()
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ CuBlockArray::
+ ReadFromStream(std::istream& rIn)
+ {
+ if(mBlocks.size() > 0) {
+ KALDI_ERR << "Cannot read block vector, "
+              << "already filled by "
+              << mBlocks.size()
+              << " elements";
+ }
+
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ //read all the blocks
+ std::string tag;
+ int block_id;
+ for(int i=0; i<mNBlocks; i++) {
+ //read tag <block>
+ rIn >> std::ws >> tag;
+ //make it lowercase
+ std::transform(tag.begin(), tag.end(), tag.begin(), tolower);
+ //check
+ if(tag!="<block>") {
+        KALDI_ERR << "<block> keyword expected";
+ }
+
+ //read block number
+ rIn >> std::ws >> block_id;
+ if(block_id != i+1) {
+ KALDI_ERR << "Expected block number:" << i+1
+ << " read block number: " << block_id;
+ }
+
+ //read the nnet
+ CuNetwork* p_nnet = new CuNetwork;
+ p_nnet->ReadNetwork(rIn);
+ if(p_nnet->Layers() == 0) {
+ KALDI_ERR << "Cannot read empty network to a block";
+ }
+
+ //add it to the vector
+ mBlocks.push_back(p_nnet);
+ }
+
+ //check the declared dimensionality
+ int sum_inputs=0, sum_outputs=0;
+ for(int i=0; i<mNBlocks; i++) {
+ sum_inputs += mBlocks[i]->GetNInputs();
+ sum_outputs += mBlocks[i]->GetNOutputs();
+ }
+ if(sum_inputs != GetNInputs()) {
+ KALDI_ERR << "Non-matching number of INPUTS! Declared:"
+ << GetNInputs()
+                << " summed from blocks: "
+ << sum_inputs;
+ }
+ if(sum_outputs != GetNOutputs()) {
+ KALDI_ERR << "Non-matching number of OUTPUTS! Declared:"
+ << GetNOutputs()
+                << " summed from blocks: "
+ << sum_outputs;
+ }
+ }
+
+
+ void
+ CuBlockArray::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << " " << mBlocks.size() << " ";
+ for(int i=0; i<mBlocks.size(); i++) {
+ rOut << "<block> " << i+1 << "\n";
+ mBlocks[i]->WriteNetwork(rOut);
+ rOut << "<endblock>\n";
+ }
+ }
+
+
+} //namespace
+
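The column-stripe bookkeeping in CuBlockArray::PropagateFnc above can be reproduced on the CPU with a row-major array. The CopyCols helper below is a hypothetical stand-in for CuMatrix::CopyCols and only illustrates how the per-block input and output column offsets advance; the block dimensions are made up.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Copy n_cols columns of src (src_cols wide) starting at src_off into
    // dst (dst_cols wide) starting at dst_off; row-major stand-in for CopyCols.
    void CopyCols(std::vector<float>& dst, std::size_t dst_cols, std::size_t dst_off,
                  const std::vector<float>& src, std::size_t src_cols, std::size_t src_off,
                  std::size_t rows, std::size_t n_cols) {
      for (std::size_t r = 0; r < rows; ++r)
        for (std::size_t c = 0; c < n_cols; ++c)
          dst[r * dst_cols + dst_off + c] = src[r * src_cols + src_off + c];
    }

    int main() {
      const std::size_t rows = 2;
      std::vector<float> X(rows * 5, 1.0f);      // 5 input columns
      std::vector<float> Y(rows * 3, 0.0f);      // 3 output columns
      std::size_t in_dims[]  = {2, 3};           // per-block GetNInputs()
      std::size_t out_dims[] = {1, 2};           // per-block GetNOutputs()
      std::size_t x_ori = 0, y_ori = 0;
      for (int b = 0; b < 2; ++b) {
        std::vector<float> colsX(rows * in_dims[b]);
        CopyCols(colsX, in_dims[b], 0, X, 5, x_ori, rows, in_dims[b]);   // input stripe
        // ...the block's network would propagate colsX -> colsY here...
        std::vector<float> colsY(rows * out_dims[b], 0.5f);              // pretend output
        CopyCols(Y, 3, y_ori, colsY, out_dims[b], 0, rows, out_dims[b]); // output stripe
        x_ori += in_dims[b];
        y_ori += out_dims[b];
      }
      assert(x_ori == 5 && y_ori == 3);   // mirrors the asserts on X.Cols()/Y.Cols()
      return 0;
    }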
diff --git a/src/CuTNetLib/.svn/text-base/cuBlockArray.h.svn-base b/src/CuTNetLib/.svn/text-base/cuBlockArray.h.svn-base
new file mode 100644
index 0000000..aea7922
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuBlockArray.h.svn-base
@@ -0,0 +1,83 @@
+#ifndef _CUBLOCK_ARRAY_H_
+#define _CUBLOCK_ARRAY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuNetwork;
+
+ class CuBlockArray : public CuUpdatableComponent
+ {
+ public:
+
+ CuBlockArray(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuBlockArray();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ std::vector<CuNetwork*> mBlocks; ///< vector with networks, one network is one block
+ size_t mNBlocks;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuBlockArray::
+ inline
+ CuBlockArray::
+ CuBlockArray(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mNBlocks(0)
+ { }
+
+
+ inline
+ CuBlockArray::
+ ~CuBlockArray()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline CuComponent::ComponentType
+ CuBlockArray::
+ GetType() const
+ {
+ return CuComponent::BLOCK_ARRAY;
+ }
+
+ inline const char*
+ CuBlockArray::
+ GetName() const
+ {
+ return "<blockarray>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/.svn/text-base/cuCRBEDctFeat.h.svn-base b/src/CuTNetLib/.svn/text-base/cuCRBEDctFeat.h.svn-base
new file mode 100644
index 0000000..62c62f2
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuCRBEDctFeat.h.svn-base
@@ -0,0 +1,310 @@
+#ifndef _CUCRBEDCTFEATURES_H_
+#define _CUCRBEDCTFEATURES_H_
+
+
+#include "cuComponent.h"
+#include "cumath.h"
+
+
+namespace TNet {
+
+ /**
+ * Expands the time context of the input features
+ * in N, out k*N, FrameOffset o_1,o_2,...,o_k
+ * FrameOffset example 11frames: -5 -4 -3 -2 -1 0 1 2 3 4 5
+ */
+ class CuExpand : public CuComponent
+ {
+ public:
+ CuExpand(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuExpand()
+ { }
+
+ ComponentType GetType() const
+ { return EXPAND; }
+
+ const char* GetName() const
+ { return "<expand>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<int> vec; rIn >> vec; mFrameOffset.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<int> vec; mFrameOffset.CopyTo(vec); rOut << vec; }
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::Expand(Y,X,mFrameOffset); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ CuVector<int> mFrameOffset;
+ };
+
+
+
+ /**
+ * Rearrange the matrix columns according to the indices in mCopyFromIndices
+ */
+ class CuCopy : public CuComponent
+ {
+ public:
+ CuCopy(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuCopy()
+ { }
+
+ ComponentType GetType() const
+ { return COPY; }
+
+ const char* GetName() const
+ { return "<copy>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<int> vec; rIn >> vec; vec.Add(-1); mCopyFromIndices.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<int> vec; mCopyFromIndices.CopyTo(vec); vec.Add(1); rOut << vec; }
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::Rearrange(Y,X,mCopyFromIndices); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ CuVector<int> mCopyFromIndices;
+ };
+
+ class CuTranspose : public CuComponent
+ {
+ public:
+ CuTranspose(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred), mContext(0)
+ { }
+
+ ~CuTranspose()
+ { }
+
+ ComponentType GetType() const
+ { return TRANSPOSE; }
+
+ const char* GetName() const
+ { return "<transpose>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> mContext;
+
+ if(GetNInputs() != GetNOutputs()) {
+ Error("Input dim must be same as output dim");
+ }
+ if(GetNInputs() % mContext != 0) {
+ Error("Number of inputs must be divisible by context length");
+ }
+
+ Vector<int> vec(GetNInputs());
+ int channels = GetNInputs() / mContext;
+ for(int i=0, ch=0; ch<channels; ch++) {
+ for(int idx=ch; idx < (int)GetNInputs(); idx+=channels, i++) {
+ assert(i < (int)GetNInputs());
+ vec[i] = idx;
+ }
+ }
+
+ mCopyFromIndices.CopyFrom(vec);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << " " << mContext << "\n"; }
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::Rearrange(Y,X,mCopyFromIndices); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ int mContext;
+ CuVector<int> mCopyFromIndices;
+ };
+
+
+ /**
+ * CuBlockLinearity is used for the blockwise multiplication by
+ * DCT transform loaded from disk
+ */
+ class CuBlockLinearity : public CuComponent
+ {
+ public:
+ CuBlockLinearity(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuBlockLinearity()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::BLOCK_LINEARITY; }
+
+ const char* GetName() const
+ { return "<blocklinearity>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::BlockLinearity(Y,X,mBlockLinearity); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ Matrix<BaseFloat> mat;
+ rIn >> mat;
+ Matrix<BaseFloat> trans(mat,TRANS);
+ mBlockLinearity.CopyFrom(trans);
+
+ if((GetNOutputs() % mBlockLinearity.Cols() != 0) ||
+ (GetNInputs() % mBlockLinearity.Rows() != 0) ||
+ ((GetNOutputs() / mBlockLinearity.Cols()) !=
+ (GetNInputs() / mBlockLinearity.Rows())))
+ {
+ Error("BlockLinearity matrix dimensions must divide IO dims");
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ Matrix<BaseFloat> mat;
+ mBlockLinearity.CopyTo(mat);
+ Matrix<BaseFloat> trans(mat,TRANS);
+ rOut << trans;
+ }
+
+ private:
+ CuMatrix<BaseFloat> mBlockLinearity;
+ };
+
+
+
+ class CuBias : public CuComponent
+ {
+ public:
+ CuBias(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuBias()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::BIAS; }
+
+ const char* GetName() const
+ { return "<bias>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); Y.AddScaledRow(1.0, mBias, 1.0); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<BaseFloat> vec; rIn >> vec; mBias.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<BaseFloat> vec; mBias.CopyTo(vec); rOut << vec; }
+
+ private:
+ CuVector<BaseFloat> mBias;
+ };
+
+
+
+ class CuWindow : public CuComponent
+ {
+ public:
+ CuWindow(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs, nOutputs, pPred)
+ { }
+
+ ~CuWindow()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::WINDOW; }
+
+ const char* GetName() const
+ { return "<window>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); Y.ScaleCols(mWindow); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<BaseFloat> vec; rIn >> vec; mWindow.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<BaseFloat> vec; mWindow.CopyTo(vec); rOut << vec; }
+
+ private:
+ CuVector<BaseFloat> mWindow;
+ };
+
+ class CuLog : public CuComponent
+ {
+ public:
+ CuLog(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs, nOutputs, pPred)
+ { }
+
+ ~CuLog()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::LOG; }
+
+ const char* GetName() const
+ { return "<log>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); Y.ApplyLog(); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { }
+
+ void WriteToStream(std::ostream& rOut)
+ { }
+
+ };
+
+}
+
+
+#endif
+
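The CuExpand component above splices a temporal context window together according to mFrameOffset. Below is a CPU sketch of that expansion, assuming frames at the segment edges are clamped; the actual CuMath<BaseFloat>::Expand kernel may treat the boundaries differently.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Splice frames t+o for every offset o; edges are clamped (assumption).
    std::vector<std::vector<float> >
    Expand(const std::vector<std::vector<float> >& in, const std::vector<int>& offsets) {
      const int T = static_cast<int>(in.size());
      std::vector<std::vector<float> > out(in.size());
      for (int t = 0; t < T; ++t) {
        for (std::size_t k = 0; k < offsets.size(); ++k) {
          int src = std::min(std::max(t + offsets[k], 0), T - 1);   // clamp at edges
          out[t].insert(out[t].end(), in[src].begin(), in[src].end());
        }
      }
      return out;      // each output row has offsets.size() * N coefficients
    }

    int main() {
      std::vector<std::vector<float> > feats(20, std::vector<float>(13, 0.0f)); // 20 frames, N=13
      std::vector<int> offsets = {-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5};        // 11-frame context
      std::vector<std::vector<float> > expanded = Expand(feats, offsets);
      std::printf("out dim per frame: %zu\n", expanded[0].size());              // 11 * 13 = 143
      return 0;
    }

With the 11-frame example from the header comment (offsets -5 ... +5) and N coefficients per input frame, each output row therefore carries 11*N coefficients.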
diff --git a/src/CuTNetLib/.svn/text-base/cuCache.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuCache.cc.svn-base
new file mode 100644
index 0000000..f96b3b1
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuCache.cc.svn-base
@@ -0,0 +1,203 @@
+
+
+#include "cuCache.h"
+#include "cumath.h"
+
+
+
+namespace TNet {
+
+ CuCache::
+ CuCache()
+ : mState(EMPTY), mIntakePos(0), mExhaustPos(0), mDiscarded(0),
+ mRandomized(false), mTrace(0)
+ { }
+
+ CuCache::
+ ~CuCache()
+ { }
+
+ void
+ CuCache::
+ Init(size_t cachesize, size_t bunchsize)
+ {
+ if((cachesize % bunchsize) != 0) {
+ Error("Non divisible cachesize by bunchsize");
+ }
+
+ mCachesize = cachesize;
+ mBunchsize = bunchsize;
+
+ mState = EMPTY;
+
+ mIntakePos = 0;
+ mExhaustPos = 0;
+
+ mRandomized = false;
+
+ }
+
+ void
+ CuCache::
+ AddData(const CuMatrix<BaseFloat>& rFeatures, const CuMatrix<BaseFloat>& rDesired)
+ {
+ assert(rFeatures.Rows() == rDesired.Rows());
+
+ //lazy buffers allocation
+ if(mFeatures.Rows() != mCachesize) {
+ mFeatures.Init(mCachesize,rFeatures.Cols());
+ mDesired.Init(mCachesize,rDesired.Cols());
+ }
+
+ //warn if segment longer than half-cache
+ if(rFeatures.Rows() > mCachesize/2) {
+ std::ostringstream os;
+      os << "Segment is longer than half of the feature cache! "
+ << " cachesize: " << mCachesize
+ << " segmentsize: " << rFeatures.Rows();
+ Warning(os.str());
+ }
+
+ //change state
+ if(mState == EMPTY) {
+ if(mTrace&3) std::cout << "/" << std::flush;
+ mState = INTAKE; mIntakePos = 0;
+
+ //check for leftover from previous segment
+ int leftover = mFeaturesLeftover.Rows();
+ //check if leftover is not bigger than cachesize
+ if(leftover > mCachesize) {
+ std::ostringstream os;
+ os << "Too small feature cache: " << mCachesize
+ << ", truncating: "
+ << leftover - mCachesize << " frames from previous segment leftover";
+ //Error(os.str());
+ Warning(os.str());
+ leftover = mCachesize;
+ }
+ //prefill cache with leftover
+ if(leftover > 0) {
+ mFeatures.CopyRows(leftover,0,mFeaturesLeftover,0);
+ mDesired.CopyRows(leftover,0,mDesiredLeftover,0);
+ mFeaturesLeftover.Destroy();
+ mDesiredLeftover.Destroy();
+ mIntakePos += leftover;
+ }
+ }
+
+ assert(mState == INTAKE);
+ assert(rFeatures.Rows() == rDesired.Rows());
+ if(mTrace&2) std::cout << "F" << std::flush;
+
+ int cache_space = mCachesize - mIntakePos;
+ int feature_length = rFeatures.Rows();
+ int fill_rows = (cache_space<feature_length)? cache_space : feature_length;
+ int leftover = feature_length - fill_rows;
+
+ assert(cache_space > 0);
+
+ //copy the data to cache
+ mFeatures.CopyRows(fill_rows,0,rFeatures,mIntakePos);
+ mDesired.CopyRows(fill_rows,0,rDesired,mIntakePos);
+
+ //copy leftovers
+ if(leftover > 0) {
+ mFeaturesLeftover.Init(leftover,mFeatures.Cols());
+ mDesiredLeftover.Init(leftover,mDesired.Cols());
+ mFeaturesLeftover.CopyRows(leftover,fill_rows,rFeatures,0);
+ mDesiredLeftover.CopyRows(leftover,fill_rows,rDesired,0);
+ }
+
+ //update cursor
+ mIntakePos += fill_rows;
+
+ //change state
+ if(mIntakePos == mCachesize) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = FULL;
+ }
+ }
+
+
+
+ void
+ CuCache::
+ Randomize()
+ {
+ assert(mState == FULL || mState == INTAKE);
+
+ if(mTrace&3) std::cout << "R" << std::flush;
+
+    //lazy initialization of the output buffers
+ mFeaturesRandom.Init(mCachesize,mFeatures.Cols());
+ mDesiredRandom.Init(mCachesize,mDesired.Cols());
+
+ //generate random series of integers
+ Vector<int> randmask(mIntakePos);
+ for(unsigned int i=0; i<mIntakePos; i++) {
+ randmask[i]=i;
+ }
+ int* ptr = randmask.pData();
+ std::random_shuffle(ptr, ptr+mIntakePos, GenerateRandom);
+
+ CuVector<int> cu_randmask;
+ cu_randmask.CopyFrom(randmask);
+
+ //randomize
+ CuMath<BaseFloat>::Randomize(mFeaturesRandom,mFeatures,cu_randmask);
+ CuMath<BaseFloat>::Randomize(mDesiredRandom,mDesired,cu_randmask);
+
+ mRandomized = true;
+
+ }
+
+ void
+ CuCache::
+ GetBunch(CuMatrix<BaseFloat>& rFeatures, CuMatrix<BaseFloat>& rDesired)
+ {
+ if(mState == EMPTY) {
+ Error("GetBunch on empty cache!!!");
+ }
+
+ //change state if full...
+ if(mState == FULL) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ //final cache is not completely filled
+ if(mState == INTAKE) //&& mpFeatures->EndOfList()
+ {
+ if(mTrace&3) std::cout << "\\-LAST\n" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ assert(mState == EXHAUST);
+
+ //init the output
+ rFeatures.Init(mBunchsize,mFeatures.Cols());
+ rDesired.Init(mBunchsize,mDesired.Cols());
+
+ //copy the output
+ if(mRandomized) {
+ rFeatures.CopyRows(mBunchsize,mExhaustPos,mFeaturesRandom,0);
+ rDesired.CopyRows(mBunchsize,mExhaustPos,mDesiredRandom,0);
+ } else {
+ rFeatures.CopyRows(mBunchsize,mExhaustPos,mFeatures,0);
+ rDesired.CopyRows(mBunchsize,mExhaustPos,mDesired,0);
+ }
+
+ //update cursor
+ mExhaustPos += mBunchsize;
+
+ //change state to EMPTY
+ if(mExhaustPos > mIntakePos-mBunchsize) {
+ //we don't have more complete bunches...
+ mDiscarded += mIntakePos - mExhaustPos;
+
+ mState = EMPTY;
+ }
+ }
+
+
+}
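The state machine above (EMPTY -> INTAKE -> FULL -> EXHAUST) implies a particular call pattern from the training tool. The sketch below shows that pattern; only the CuCache and CuMatrix calls come from this library, while NextSegment/TrainBunch are hypothetical stand-ins for the feature reading and the network update, and the shuffle is made unconditional here for brevity.

    // Hypothetical training-loop skeleton around CuCache (sketch only).
    #include "cuCache.h"     // assumes the CuTNetLib include path
    #include "cumatrix.h"

    using namespace TNet;

    // stand-ins for the feature reading and the network update, not part of this file
    bool NextSegment(CuMatrix<BaseFloat>& feats, CuMatrix<BaseFloat>& targets);
    void TrainBunch(const CuMatrix<BaseFloat>& feats, const CuMatrix<BaseFloat>& targets);

    void TrainOneEpoch() {
      CuCache cache;
      cache.Init(/*cachesize*/ 16384, /*bunchsize*/ 256);   // cachesize % bunchsize == 0

      CuMatrix<BaseFloat> feats, targets, bunch_f, bunch_t;
      bool more_data = true;
      while (more_data) {
        // INTAKE: add segments until the cache is FULL or the data runs out
        while (!cache.Full() && (more_data = NextSegment(feats, targets))) {
          cache.AddData(feats, targets);
        }
        if (!more_data && cache.Empty()) break;   // nothing (or no full bunch) left
        cache.Randomize();                        // frame-level shuffle inside the cache
        // EXHAUST: pull fixed-size bunches until only an incomplete bunch remains
        while (!cache.Empty()) {
          cache.GetBunch(bunch_f, bunch_t);
          TrainBunch(bunch_f, bunch_t);
        }
      }
    }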
diff --git a/src/CuTNetLib/.svn/text-base/cuCache.h.svn-base b/src/CuTNetLib/.svn/text-base/cuCache.h.svn-base
new file mode 100644
index 0000000..cef2dd9
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuCache.h.svn-base
@@ -0,0 +1,74 @@
+#ifndef _CUCACHE_H_
+#define _CUCACHE_H_
+
+#include "cumatrix.h"
+
+namespace TNet {
+
+
+ /**
+ * The feature-target pair cache
+ */
+ class CuCache {
+ typedef enum { EMPTY, INTAKE, FULL, EXHAUST } State;
+ public:
+ CuCache();
+ ~CuCache();
+
+ /// Initialize the cache
+ void Init(size_t cachesize, size_t bunchsize);
+
+ /// Add data to cache, returns number of added vectors
+ void AddData(const CuMatrix<BaseFloat>& rFeatures, const CuMatrix<BaseFloat>& rDesired);
+ /// Randomizes the cache
+ void Randomize();
+ /// Get the bunch of training data
+ void GetBunch(CuMatrix<BaseFloat>& rFeatures, CuMatrix<BaseFloat>& rDesired);
+
+
+ /// Returns true if the cache was completely filled
+ bool Full()
+ { return (mState == FULL); }
+
+ /// Returns true if the cache is empty
+ bool Empty()
+ { return (mState == EMPTY || mIntakePos < mBunchsize); }
+
+ /// Number of discarded frames
+ int Discarded()
+ { return mDiscarded; }
+
+ /// Set the trace message level
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ private:
+
+ static long int GenerateRandom(int max)
+ { return lrand48() % max; }
+
+ State mState; ///< Current state of the cache
+
+ size_t mIntakePos; ///< Number of intaken vectors by AddData
+ size_t mExhaustPos; ///< Number of exhausted vectors by GetBunch
+
+ size_t mCachesize; ///< Size of cache
+ size_t mBunchsize; ///< Size of bunch
+ int mDiscarded; ///< Number of discarded frames
+
+ CuMatrix<BaseFloat> mFeatures; ///< Feature cache
+ CuMatrix<BaseFloat> mFeaturesRandom; ///< Feature cache
+ CuMatrix<BaseFloat> mFeaturesLeftover; ///< Feature cache
+
+ CuMatrix<BaseFloat> mDesired; ///< Desired vector cache
+ CuMatrix<BaseFloat> mDesiredRandom; ///< Desired vector cache
+ CuMatrix<BaseFloat> mDesiredLeftover; ///< Desired vector cache
+
+ bool mRandomized;
+
+ int mTrace;
+ };
+
+}
+
+#endif
diff --git a/src/CuTNetLib/.svn/text-base/cuComponent.h.svn-base b/src/CuTNetLib/.svn/text-base/cuComponent.h.svn-base
new file mode 100644
index 0000000..332c156
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuComponent.h.svn-base
@@ -0,0 +1,384 @@
+#ifndef _CUNETWORK_COMPONENT_I_H
+#define _CUNETWORK_COMPONENT_I_H
+
+
+#include "Vector.h"
+#include "Matrix.h"
+#include "Error.h"
+
+#include "cumatrix.h"
+
+#include <iostream>
+#include <stdexcept>
+
+
+namespace TNet {
+
+
+ /**
+ * Basic element of the network,
+ * it is a box with defined inputs and outputs,
+ * and functions to refresh outputs
+ *
+   * it is able to compute the transformation function (forward pass)
+   * and the Jacobian function (backward pass),
+   * which are to be implemented in descendants
+ */
+ class CuComponent
+ {
+ public:
+ /// Types of the net components
+ typedef enum {
+ UPDATABLE_COMPONENT = 0x0100,
+ BIASED_LINEARITY,
+ DISCRETE_LINEARITY,
+ SHARED_LINEARITY,
+ SPARSE_LINEARITY,
+ RBM,
+ RBM_SPARSE,
+ RECURRENT,
+
+ ACT_FUN = 0x0200,
+ SOFTMAX,
+ SIGMOID,
+
+ OTHER = 0x0400,
+ EXPAND,
+ COPY,
+ TRANSPOSE,
+ BLOCK_LINEARITY,
+ WINDOW,
+ BIAS,
+ LOG,
+
+ BLOCK_ARRAY,
+ } ComponentType;
+
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ CuComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ virtual ~CuComponent();
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+ /// Get Type Identification of the component
+ virtual ComponentType GetType() const = 0;
+ /// Get Type Label of the component
+ virtual const char* GetName() const = 0;
+ ///
+ virtual bool IsUpdatable() const
+ { return false; }
+
+ /// Get size of input vectors
+ size_t GetNInputs() const;
+ /// Get size of output vectors
+ size_t GetNOutputs() const;
+
+ /// IO Data getters
+ const CuMatrix<BaseFloat>& GetInput() const;
+ const CuMatrix<BaseFloat>& GetOutput() const;
+ const CuMatrix<BaseFloat>& GetErrorInput() const;
+ const CuMatrix<BaseFloat>& GetErrorOutput() const;
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(const CuMatrix<BaseFloat>& rInput);
+ /// Set error input vector (bind with the following NetworkComponent)
+ void SetErrorInput(const CuMatrix<BaseFloat>& rErrorInput);
+
+    /// Perform forward pass propagation Input->Output
+    void Propagate();
+    /// Perform backward pass propagation ErrorInput->ErrorOutput
+ void Backpropagate();
+
+ /// Reads the component parameters from stream
+ virtual void ReadFromStream(std::istream& rIn) { }
+    /// Writes the component's parameters to stream
+ virtual void WriteToStream(std::ostream& rOut) { }
+
+
+ ///////////////////////////////////////////////////////////////
+ // Nonpublic member functions used to update data outputs
+ protected:
+    /// Forward pass transformation (to be implemented by descendants...)
+    virtual void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y) = 0;
+    /// Backward pass transformation (to be implemented by descendants...)
+ virtual void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y) = 0;
+
+
+ ///////////////////////////////////////////////////////////////
+ // data members
+ protected:
+
+ size_t mNInputs; ///< Size of input vectors
+ size_t mNOutputs; ///< Size of output vectors
+
+ const CuMatrix<BaseFloat>* mpInput; ///< inputs are NOT OWNED by component
+ const CuMatrix<BaseFloat>* mpErrorInput;///< inputs are NOT OWNED by component
+
+ CuMatrix<BaseFloat> mOutput; ///< outputs are OWNED by component
+ CuMatrix<BaseFloat> mErrorOutput; ///< outputs are OWNED by component
+
+ };
+
+
+ /**
+ * Class UpdatableComponent is a box which has some
+ * parameters adjustable by learning
+ *
+ * you can set the learning rate, lock the params,
+ * and learn from each data observation
+ */
+ class CuUpdatableComponent : public CuComponent
+ {
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ CuUpdatableComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ virtual ~CuUpdatableComponent();
+
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+ ///
+ virtual bool IsUpdatable() const
+ { return true; }
+
+ /// get gradient and update the parameters in one step
+ virtual void Update() = 0;
+
+ /// Sets the learning rate of gradient descent
+ void LearnRate(BaseFloat rate);
+ /// Gets the learning rate of gradient descent
+ BaseFloat LearnRate();
+
+ void Momentum(BaseFloat mmt);
+ BaseFloat Momentum();
+
+ void Weightcost(BaseFloat cost);
+ BaseFloat Weightcost();
+
+ void GradDivFrm(bool div);
+ bool GradDivFrm();
+
+ protected:
+ BaseFloat mLearningRate;
+ BaseFloat mMomentum;
+ BaseFloat mWeightcost;
+ bool mGradDivFrm;
+
+ };
+
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuComponent::
+ inline
+ CuComponent::
+ CuComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : mNInputs(nInputs), mNOutputs(nOutputs),
+ mpInput(NULL), mpErrorInput(NULL),
+ mOutput(), mErrorOutput()
+ {
+ /* DOUBLE LINK the Components */
+ if (pPred != NULL) {
+ SetInput(pPred->GetOutput());
+ pPred->SetErrorInput(GetErrorOutput());
+ }
+ }
+
+
+ inline
+ CuComponent::
+ ~CuComponent()
+ {
+ ;
+ }
+
+ inline void
+ CuComponent::
+ Propagate()
+ {
+ //initialize output buffer
+ mOutput.Init(GetInput().Rows(),GetNOutputs());
+ //do the dimensionality test
+ if(GetNInputs() != GetInput().Cols()) {
+ KALDI_ERR << "Non-matching INPUT dim!!! Network dim: " << GetNInputs()
+ << " Data dim: " << GetInput().Cols();
+ }
+ //run transform
+ PropagateFnc(GetInput(),mOutput);
+ }
+
+
+ inline void
+ CuComponent::
+ Backpropagate()
+ {
+ //re-initialize the output buffer
+ mErrorOutput.Init(GetErrorInput().Rows(),GetNInputs());
+
+ //do the dimensionality test
+ assert(GetErrorInput().Cols() == mNOutputs);
+ assert(mErrorOutput.Cols() == mNInputs);
+ assert(mErrorOutput.Rows() == GetErrorInput().Rows());
+
+ //transform
+ BackpropagateFnc(GetErrorInput(),mErrorOutput);
+ }
+
+
+ inline void
+ CuComponent::
+ SetInput(const CuMatrix<BaseFloat>& rInput)
+ {
+ mpInput = &rInput;
+ }
+
+
+ inline void
+ CuComponent::
+ SetErrorInput(const CuMatrix<BaseFloat>& rErrorInput)
+ {
+ mpErrorInput = &rErrorInput;
+ }
+
+
+ inline const CuMatrix<BaseFloat>&
+ CuComponent::
+ GetInput() const
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ return *mpInput;
+ }
+
+ inline const CuMatrix<BaseFloat>&
+ CuComponent::
+ GetOutput() const
+ {
+ return mOutput;
+ }
+
+ inline const CuMatrix<BaseFloat>&
+ CuComponent::
+ GetErrorInput() const
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ return *mpErrorInput;
+ }
+
+ inline const CuMatrix<BaseFloat>&
+ CuComponent::
+ GetErrorOutput() const
+ {
+ return mErrorOutput;
+ }
+
+ inline size_t
+ CuComponent::
+ GetNInputs() const
+ {
+ return mNInputs;
+ }
+
+ inline size_t
+ CuComponent::
+ GetNOutputs() const
+ {
+ return mNOutputs;
+ }
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // UpdatableComponent::
+
+ inline
+ CuUpdatableComponent::
+ CuUpdatableComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuComponent(nInputs, nOutputs, pPred),
+ mLearningRate(0.0), mMomentum(0), mWeightcost(0), mGradDivFrm(true)
+ {
+ ;
+ }
+
+
+ inline
+ CuUpdatableComponent::
+ ~CuUpdatableComponent()
+ {
+ ;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ LearnRate(BaseFloat rate)
+ {
+ mLearningRate = rate;
+ }
+
+
+ inline BaseFloat
+ CuUpdatableComponent::
+ LearnRate()
+ {
+ return mLearningRate;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ Momentum(BaseFloat mmt)
+ {
+ mMomentum = mmt;
+ }
+
+
+ inline BaseFloat
+ CuUpdatableComponent::
+ Momentum()
+ {
+ return mMomentum;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ Weightcost(BaseFloat cost)
+ {
+ mWeightcost = cost;
+ }
+
+
+ inline BaseFloat
+ CuUpdatableComponent::
+ Weightcost()
+ {
+ return mWeightcost;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ GradDivFrm(bool div)
+ {
+ mGradDivFrm = div;
+ }
+
+ inline bool
+ CuUpdatableComponent::
+ GradDivFrm()
+ {
+ return mGradDivFrm;
+ }
+
+} // namespace TNet
+
+
+#endif
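The constructor comment "DOUBLE LINK the Components" is the heart of the buffer wiring: a component borrows a pointer to its predecessor's output matrix as its input, and hands its own error-output buffer to the predecessor as error input, so Propagate()/Backpropagate() never copy data between layers. A minimal CPU re-creation of that wiring (std::vector instead of CuMatrix, an identity transform instead of PropagateFnc) is sketched below.

    // Minimal stand-in showing the double-linking of components.
    #include <cassert>
    #include <vector>

    class MiniComponent {
     public:
      explicit MiniComponent(MiniComponent* pred) {
        if (pred != nullptr) {
          in_ = &pred->out_;            // SetInput(pPred->GetOutput())
          pred->err_in_ = &err_out_;    // pPred->SetErrorInput(GetErrorOutput())
        }
      }
      void Propagate() {                // mOutput = PropagateFnc(*mpInput)
        assert(in_ != nullptr);
        out_ = *in_;                    // identity "transform" for the sketch
      }
      const std::vector<float>* in_ = nullptr;      // borrowed, not owned
      const std::vector<float>* err_in_ = nullptr;  // borrowed, not owned
      std::vector<float> out_, err_out_;            // owned buffers
    };

    int main() {
      MiniComponent a(nullptr), b(&a);   // b is constructed after (and linked to) a
      std::vector<float> net_input = {1.0f, 2.0f, 3.0f};
      a.in_ = &net_input;                // the network binds the global input
      a.Propagate();
      b.Propagate();                     // reads a.out_ through the borrowed pointer
      assert(b.out_.size() == 3);
      return 0;
    }

The links point at the neighbours' CuMatrix objects themselves, so the per-bunch Init() calls in Propagate()/Backpropagate() can resize the buffers without invalidating them.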
diff --git a/src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.cc.svn-base
new file mode 100644
index 0000000..befde24
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.cc.svn-base
@@ -0,0 +1,160 @@
+
+
+#include "cuDiscreteLinearity.h"
+#include "cumath.h"
+
+namespace TNet
+{
+
+ void
+ CuDiscreteLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+
+ //precopy bias
+ Y.AddScaledRow(1.0,mBias,0.0);
+
+    //multiply by the block matrices
+ int offset_in=0, offset_out=0;
+ for (int i=0; i<mNBlocks; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N','N', 1.0, X, mLinearity[i], 1.0, Y,
+ offset_in, 0, offset_out);
+ offset_in += mLinearity[i].Rows();
+ offset_out += mLinearity[i].Cols();
+ }
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+
+ int offset_in=0, offset_out=0;
+ for(int i=0; i<mNBlocks; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N', 'T', 1.0, X, mLinearity[i], 0.0, Y,
+ offset_in, 0, offset_out);
+ offset_in += mLinearity[i].Cols();
+ offset_out += mLinearity[i].Rows();
+ }
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ Update()
+ {
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain; //compensate higher gradient estimates due to momentum
+
+ //get gradients of discrete linearities
+ int offset_in=0, offset_out=0;
+ for(int i=0; i<mNBlocks; i++) {
+ CuMath<BaseFloat>::OffsetGemm('T','N',1.0,
+ GetInput(),GetErrorInput(),
+ mMomentum, mLinearityCorrection[i],
+ offset_in,offset_out,0);
+ offset_in += mLinearity[i].Rows();
+ offset_out += mLinearity[i].Cols();
+ }
+ for(int i=0; i<mNBlocks; i++) {
+ //perform update
+ mLinearity[i].AddScaled(-mLearningRate/N,mLinearityCorrection[i],1.0);
+ //regularization weight decay
+ mLinearity[i].AddScaled(-mLearningRate*mWeightcost,mLinearity[i],1.0);
+ }
+
+ //get gradient of bias
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+ //update biases
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ mLinearity.resize(mNBlocks);
+ mLinearityCorrection.resize(mNBlocks);
+
+ int in_dim = 0, out_dim = 0;
+ for(int i=0; i<mNBlocks; i++) {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity[i].CopyFrom(BfMatrix(transpose, TRANS));
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ //allocate training buffers
+ mLinearityCorrection[i].Init(mLinearity[i].Rows(),mLinearity[i].Cols());
+ mLinearityCorrection[i].SetConst(0.0);
+
+ in_dim += transpose.Cols();
+ out_dim += transpose.Rows();
+ }
+
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+ mBiasCorrection.Init(mBias.Dim());
+ mBiasCorrection.SetConst(0.0);
+
+ if(out_dim != GetNOutputs() ||
+ in_dim != GetNInputs() ||
+ mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << in_dim
+ << "linearityRows:" << out_dim
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << mNBlocks << "\n";
+ for(int i=0; i< mNBlocks; i++) {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity[i].CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ }
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
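CuDiscreteLinearity is effectively one large block-diagonal affine transform stored as mNBlocks separate weight matrices plus a single shared bias; the OffsetGemm calls walk the input and output column offsets block by block. The dimension bookkeeping, spelled out on the CPU with made-up block sizes:

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
      // per-block (inputs, outputs), i.e. (mLinearity[i].Rows(), mLinearity[i].Cols());
      // the sizes are illustrative only
      std::vector<std::pair<std::size_t, std::size_t> > blocks =
          { {100, 50}, {100, 50}, {56, 28} };
      std::size_t offset_in = 0, offset_out = 0;
      for (std::size_t i = 0; i < blocks.size(); ++i) {
        std::cout << "block " << i << ": X[:, " << offset_in << ".."
                  << offset_in + blocks[i].first << ") * W" << i
                  << " -> Y[:, " << offset_out << ".."
                  << offset_out + blocks[i].second << ")\n";
        offset_in  += blocks[i].first;    // advance exactly like the forward-pass loop
        offset_out += blocks[i].second;
      }
      // the sums must equal the declared GetNInputs()/GetNOutputs()
      std::cout << "declared: inputs=" << offset_in << " outputs=" << offset_out << "\n";
      return 0;
    }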
diff --git a/src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.h.svn-base b/src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.h.svn-base
new file mode 100644
index 0000000..06c8d74
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuDiscreteLinearity.h.svn-base
@@ -0,0 +1,90 @@
+#ifndef _CUDISCRETE_LINEARITY_H_
+#define _CUDISCRETE_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+#include <vector>
+
+
+namespace TNet {
+
+ class CuDiscreteLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuDiscreteLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuDiscreteLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ std::vector<CuMatrix<BaseFloat> > mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ std::vector<CuMatrix<BaseFloat> > mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ size_t mNBlocks;
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuDiscreteLinearity::
+ inline
+ CuDiscreteLinearity::
+ CuDiscreteLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ //mLinearity(nInputs,nOutputs), mBias(nOutputs),
+ //mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs)
+ mNBlocks(0)
+ {
+ //mLinearityCorrection.SetConst(0.0);
+ //mBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuDiscreteLinearity::
+ ~CuDiscreteLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuDiscreteLinearity::
+ GetType() const
+ {
+ return CuComponent::DISCRETE_LINEARITY;
+ }
+
+ inline const char*
+ CuDiscreteLinearity::
+ GetName() const
+ {
+ return "<discretelinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/.svn/text-base/cuNetwork.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuNetwork.cc.svn-base
new file mode 100644
index 0000000..e245699
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuNetwork.cc.svn-base
@@ -0,0 +1,380 @@
+
+#include <algorithm>
+//#include <locale>
+#include <cctype>
+#include <list>
+#include <sstream>
+
+#include "cuNetwork.h"
+
+#include "cuDiscreteLinearity.h"
+#include "cuSharedLinearity.h"
+#include "cuSparseLinearity.h"
+#include "cuRbm.h"
+#include "cuRbmSparse.h"
+#include "cuRecurrent.h"
+#include "cuBlockArray.h"
+
+namespace TNet {
+
+
+
+
+ void
+ CuNetwork::
+ ReadNetwork(const char* pSrc)
+ {
+ std::ifstream in(pSrc);
+ if(!in.good()) {
+ Error(std::string("Error, cannot read model: ")+pSrc);
+ }
+ ReadNetwork(in);
+ in.close();
+ }
+
+
+
+ void
+ CuNetwork::
+ WriteNetwork(const char* pDst)
+ {
+ std::ofstream out(pDst);
+ if(!out.good()) {
+ Error(std::string("Error, cannot write model: ")+pDst);
+ }
+ WriteNetwork(out);
+ out.close();
+ }
+
+
+
+ void
+ CuNetwork::
+ ReadNetwork(std::istream& rIn)
+ {
+ //get the network elements from a factory
+ CuComponent *pComp;
+ while(NULL != (pComp = ComponentFactory(rIn))) {
+ mNetComponents.push_back(pComp);
+ }
+ }
+
+
+
+ void
+ CuNetwork::
+ WriteNetwork(std::ostream& rOut)
+ {
+    //dump all the components
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ ComponentDumper(rOut, **it);
+ }
+ }
+
+
+ void
+ CuNetwork::
+ SetLearnRate(BaseFloat learnRate, const char* pLearnRateFactors)
+ {
+ //parse the learn rate factors: "0.1:0.5:0.6:1.0" to std::list
+ std::list<BaseFloat> lr_factors;
+ if(NULL != pLearnRateFactors) {
+ //replace ':' by ' '
+ std::string str(pLearnRateFactors);
+ size_t pos = 0;
+      while((pos = str.find(':',pos)) != std::string::npos) str[pos] = ' ';
+      pos = 0; while((pos = str.find(',',pos)) != std::string::npos) str[pos] = ' ';
+
+ //parse to std::list
+ std::istringstream is(str);
+ is >> std::skipws;
+ BaseFloat f;
+ while(!is.eof()) {
+ if(!(is >> f).fail()) { lr_factors.push_back(f); }
+ else break;
+ }
+ }
+
+ //initialize rate factors iterator
+ BaseFloat scale = 1.0f;
+
+ //store global learning rate
+ mGlobLearnRate = learnRate;
+ mpLearnRateFactors = pLearnRateFactors;
+
+ //give scaled learning rate to components
+ LayeredType::iterator it;
+ bool stopper_given = false;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ //get next scale factor
+ if(NULL != pLearnRateFactors) {
+ if(!(lr_factors.size() > 0)) {
+            Error("Too few learning rate scale factors");
+ }
+ scale = lr_factors.front();
+ lr_factors.pop_front();
+ }
+ //set scaled learning rate to the component
+ dynamic_cast<CuUpdatableComponent*>(*it)->LearnRate(learnRate*scale);
+ //set the stopper component for backpropagation
+ if(!stopper_given && (learnRate*scale > 0.0)) {
+ mpPropagErrorStopper = *it; stopper_given = true;
+ }
+ }
+ }
+ if(lr_factors.size() > 0) {
+      Error("Too many learning rate scale factors");
+ }
+ }
+
+
+ BaseFloat
+ CuNetwork::
+ GetLearnRate()
+ {
+ return mGlobLearnRate;
+ }
+
+
+ void
+ CuNetwork::
+ PrintLearnRate()
+ {
+ assert(mNetComponents.size() > 0);
+ std::cout << "Learning rate: global " << mGlobLearnRate;
+ std::cout << " components' ";
+ for(size_t i=0; i<mNetComponents.size(); i++) {
+ if(mNetComponents[i]->IsUpdatable()) {
+ std::cout << " " << dynamic_cast<CuUpdatableComponent*>(mNetComponents[i])->LearnRate();
+ }
+ }
+ std::cout << "\n" << std::flush;
+ }
+
+
+
+ void
+ CuNetwork::
+ SetMomentum(BaseFloat momentum)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<CuUpdatableComponent*>(*it)->Momentum(momentum);
+ }
+ }
+ }
+
+ void
+ CuNetwork::
+ SetWeightcost(BaseFloat weightcost)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<CuUpdatableComponent*>(*it)->Weightcost(weightcost);
+ }
+ }
+ }
+
+ void
+ CuNetwork::
+ SetL1(BaseFloat l1)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->GetType() == CuComponent::SPARSE_LINEARITY) {
+ dynamic_cast<CuSparseLinearity*>(*it)->L1(l1);
+ }
+ }
+ }
+
+ void
+ CuNetwork::
+ SetGradDivFrm(bool div)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<CuUpdatableComponent*>(*it)->GradDivFrm(div);
+ }
+ }
+ }
+
+
+ CuComponent*
+ CuNetwork::
+ ComponentFactory(std::istream& rIn)
+ {
+ rIn >> std::ws;
+ if(rIn.eof()) return NULL;
+
+ CuComponent* pRet=NULL;
+ CuComponent* pPred=NULL;
+
+ std::string componentTag;
+ size_t nInputs, nOutputs;
+
+ rIn >> std::ws;
+ rIn >> componentTag;
+ if(componentTag == "") return NULL; //nothing left in the file
+
+ //make it lowercase
+ std::transform(componentTag.begin(), componentTag.end(),
+ componentTag.begin(), tolower);
+
+ if(componentTag[0] != '<' || componentTag[componentTag.size()-1] != '>') {
+ Error(std::string("Invalid component tag:")+componentTag);
+ }
+
+ //the 'endblock' tag terminates the network
+ if(componentTag == "<endblock>") return NULL;
+
+ rIn >> std::ws;
+ rIn >> nOutputs;
+ rIn >> std::ws;
+ rIn >> nInputs;
+ assert(nInputs > 0 && nOutputs > 0);
+
+ //make coupling with predecessor
+ if(mNetComponents.size() != 0) {
+ pPred = mNetComponents.back();
+ }
+
+ //array with list of component tags
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<discretelinearity>",
+ "<sharedlinearity>",
+ "<sparselinearity>",
+ "<rbm>",
+ "<rbmsparse>",
+ "<recurrent>",
+
+ "<softmax>",
+ "<sigmoid>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+ };
+
+ static const int n_tags = sizeof(TAGS) / sizeof(TAGS[0]);
+ int i;
+ for(i=0; i<n_tags; i++) {
+ if(componentTag == TAGS[i]) break;
+ }
+
+ //switch according to position in array TAGS
+ switch(i) {
+ case 0: pRet = new CuBiasedLinearity(nInputs,nOutputs,pPred); break;
+ case 1: pRet = new CuDiscreteLinearity(nInputs,nOutputs,pPred); break;
+ case 2: pRet = new CuSharedLinearity(nInputs,nOutputs,pPred); break;
+ case 3: pRet = new CuSparseLinearity(nInputs,nOutputs,pPred); break;
+ case 4: pRet = new CuRbm(nInputs,nOutputs,pPred); break;
+ case 5: pRet = new CuRbmSparse(nInputs,nOutputs,pPred); break;
+ case 6: pRet = new CuRecurrent(nInputs,nOutputs,pPred); break;
+
+ case 7: pRet = new CuSoftmax(nInputs,nOutputs,pPred); break;
+ case 8: pRet = new CuSigmoid(nInputs,nOutputs,pPred); break;
+
+ case 9: pRet = new CuExpand(nInputs,nOutputs,pPred); break;
+ case 10: pRet = new CuCopy(nInputs,nOutputs,pPred); break;
+ case 11: pRet = new CuTranspose(nInputs,nOutputs,pPred); break;
+ case 12: pRet = new CuBlockLinearity(nInputs,nOutputs,pPred); break;
+ case 13: pRet = new CuBias(nInputs,nOutputs,pPred); break;
+ case 14: pRet = new CuWindow(nInputs,nOutputs,pPred); break;
+ case 15: pRet = new CuLog(nInputs,nOutputs,pPred); break;
+
+ case 16: pRet = new CuBlockArray(nInputs,nOutputs,pPred); break;
+
+ default: Error(std::string("Unknown Component tag:")+componentTag);
+ }
+
+ //read the component's content
+ pRet->ReadFromStream(rIn);
+
+ //return
+ return pRet;
+ }
+
+
+ void
+ CuNetwork::
+ ComponentDumper(std::ostream& rOut, CuComponent& rComp)
+ {
+ //map the component identification codes to their tags
+ //array with the list of component type codes
+ static const CuComponent::ComponentType TYPES[] = {
+ CuComponent::BIASED_LINEARITY,
+ CuComponent::DISCRETE_LINEARITY,
+ CuComponent::SHARED_LINEARITY,
+ CuComponent::SPARSE_LINEARITY,
+ CuComponent::RBM,
+ CuComponent::RBM_SPARSE,
+ CuComponent::RECURRENT,
+
+ CuComponent::SIGMOID,
+ CuComponent::SOFTMAX,
+
+ CuComponent::EXPAND,
+ CuComponent::COPY,
+ CuComponent::TRANSPOSE,
+ CuComponent::BLOCK_LINEARITY,
+ CuComponent::BIAS,
+ CuComponent::WINDOW,
+ CuComponent::LOG,
+
+ CuComponent::BLOCK_ARRAY,
+ };
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<discretelinearity>",
+ "<sharedlinearity>",
+ "<sparselinearity>",
+ "<rbm>",
+ "<rbmsparse>",
+ "<recurrent>",
+
+ "<sigmoid>",
+ "<softmax>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+ };
+ static const int MAX = sizeof TYPES / sizeof TYPES[0];
+
+ int i;
+ for(i=0; i<MAX; ++i) {
+ if(TYPES[i] == rComp.GetType()) break;
+ }
+ if(i == MAX) Error("Unknown ComponentType");
+
+ //dump the component tag
+ rOut << TAGS[i] << " "
+ << rComp.GetNOutputs() << " "
+ << rComp.GetNInputs() << std::endl;
+
+ //write the component's content
+ rComp.WriteToStream(rOut);
+ }
+
+
+
+} //namespace
+
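+
+//Illustrative sketch of the text format written by ComponentDumper and parsed
+//back by ComponentFactory; the tags come from the TAGS array above, while the
+//dimensions below are made-up placeholders:
+//
+//  <biasedlinearity> 1024 512
+//  ...component content as written by WriteToStream (weights, biases)...
+//  <sigmoid> 1024 1024
+//  <biasedlinearity> 8 1024
+//  ...component content...
+//  <softmax> 8 8
+//  <endblock>
+//
+//Each header line holds the tag, the number of outputs and the number of
+//inputs, in that order; parsing stops at "<endblock>" or at end of stream.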
diff --git a/src/CuTNetLib/.svn/text-base/cuNetwork.h.svn-base b/src/CuTNetLib/.svn/text-base/cuNetwork.h.svn-base
new file mode 100644
index 0000000..0453376
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuNetwork.h.svn-base
@@ -0,0 +1,220 @@
+#ifndef _CUNETWORK_H_
+#define _CUNETWORK_H_
+
+#include "cuComponent.h"
+
+#include "cuBiasedLinearity.h"
+//#include "cuBlockLinearity.h"
+//#include "cuBias.h"
+//#include "cuWindow.h"
+
+#include "cuActivation.h"
+
+#include "cuCRBEDctFeat.h"
+
+#include "Vector.h"
+
+#include <vector>
+
+
+namespace TNet {
+
+ class CuNetwork
+ {
+ //////////////////////////////////////
+ // Typedefs
+ typedef std::vector<CuComponent*> LayeredType;
+
+ //////////////////////////////////////
+ // Disable copy construction, assignment and default constructor
+ private:
+ CuNetwork(CuNetwork&);
+ CuNetwork& operator=(CuNetwork&);
+
+ public:
+ CuNetwork() { }
+ CuNetwork(std::istream& rIn);
+ ~CuNetwork();
+
+ void AddLayer(CuComponent* layer);
+
+ int Layers()
+ { return mNetComponents.size(); }
+
+ CuComponent& Layer(int i)
+ { return *mNetComponents[i]; }
+
+ /// forward the data to the output
+ void Propagate(const CuMatrix<BaseFloat>& in, CuMatrix<BaseFloat>& out);
+
+ /// backpropagate the error while updating weights
+ void Backpropagate(const CuMatrix<BaseFloat>& globerr);
+
+ void ReadNetwork(const char* pSrc); ///< read the network from file
+ void WriteNetwork(const char* pDst); ///< write network to file
+
+ void ReadNetwork(std::istream& rIn); ///< read the network from stream
+ void WriteNetwork(std::ostream& rOut); ///< write network to stream
+
+ size_t GetNInputs() const; ///< Dimensionality of the input features
+ size_t GetNOutputs() const; ///< Dimensionality of the desired vectors
+
+ /// set the learning rate
+ void SetLearnRate(BaseFloat learnRate, const char* pLearnRateFactors = NULL);
+ BaseFloat GetLearnRate(); ///< get the learning rate value
+ void PrintLearnRate(); ///< log the learning rate values
+
+ void SetMomentum(BaseFloat momentum);
+ void SetWeightcost(BaseFloat weightcost);
+ void SetL1(BaseFloat l1);
+
+ void SetGradDivFrm(bool div);
+
+
+ private:
+ /// Creates a component by reading from stream
+ CuComponent* ComponentFactory(std::istream& In);
+ /// Dumps component into a stream
+ void ComponentDumper(std::ostream& rOut, CuComponent& rComp);
+
+
+
+ private:
+ LayeredType mNetComponents; ///< container with the network layers
+ CuComponent* mpPropagErrorStopper;
+ BaseFloat mGlobLearnRate; ///< The global (unscaled) learn rate of the network
+ const char* mpLearnRateFactors; ///< Per-component factors scaling the global learn rate
+
+
+ //friend class NetworkGenerator; //<< For generating networks...
+
+ };
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuNetwork::
+ inline
+ CuNetwork::
+ CuNetwork(std::istream& rSource)
+ : mpPropagErrorStopper(NULL), mGlobLearnRate(0.0), mpLearnRateFactors(NULL)
+ {
+ ReadNetwork(rSource);
+ }
+
+
+ inline
+ CuNetwork::
+ ~CuNetwork()
+ {
+ //delete all the components
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ delete *it;
+ *it = NULL;
+ }
+ mNetComponents.resize(0);
+ }
+
+
+ inline void
+ CuNetwork::
+ AddLayer(CuComponent* layer)
+ {
+ if(mNetComponents.size() > 0) {
+ if(GetNOutputs() != layer->GetNInputs()) {
+ Error("Nonmatching dims");
+ }
+ layer->SetInput(mNetComponents.back()->GetOutput());
+ mNetComponents.back()->SetErrorInput(layer->GetErrorOutput());
+ }
+ mNetComponents.push_back(layer);
+ }
+
+
+ inline void
+ CuNetwork::
+ Propagate(const CuMatrix<BaseFloat>& in, CuMatrix<BaseFloat>& out)
+ {
+ //empty network => copy input
+ if(mNetComponents.size() == 0) {
+ out.CopyFrom(in);
+ return;
+ }
+
+ //check dims
+ if(in.Cols() != GetNInputs()) {
+ std::ostringstream os;
+ os << "Nonmatching dims"
+ << " data dim is: " << in.Cols()
+ << " network needs: " << GetNInputs();
+ Error(os.str());
+ }
+ mNetComponents.front()->SetInput(in);
+
+ //propagate
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ (*it)->Propagate();
+ }
+
+ //copy the output
+ out.CopyFrom(mNetComponents.back()->GetOutput());
+ }
+
+
+
+
+ inline void
+ CuNetwork::
+ Backpropagate(const CuMatrix<BaseFloat>& globerr)
+ {
+ mNetComponents.back()->SetErrorInput(globerr);
+
+ // back-propagation
+ LayeredType::reverse_iterator it;
+ for(it=mNetComponents.rbegin(); it!=mNetComponents.rend(); ++it) {
+ //stopper component does not propagate error (no updatable predecessors)
+ if(*it != mpPropagErrorStopper) {
+ //compute errors for preceding network components
+ (*it)->Backpropagate();
+ }
+ //update weights if updatable component
+ if((*it)->IsUpdatable()) {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(**it);
+ if(rComp.LearnRate() > 0.0f) {
+ rComp.Update();
+ }
+ }
+ //stop backprop if no updatable components precede current component
+ if(mpPropagErrorStopper == *it) break;
+ }
+ }
+
+
+ inline size_t
+ CuNetwork::
+ GetNInputs() const
+ {
+ if(mNetComponents.size() == 0) return 0;
+ return mNetComponents.front()->GetNInputs();
+ }
+
+
+ inline size_t
+ CuNetwork::
+ GetNOutputs() const
+ {
+ if(mNetComponents.size() == 0) return 0;
+ return mNetComponents.back()->GetNOutputs();
+ }
+
+
+
+
+
+} //namespace
+
+#endif
+
+
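+
+//Minimal usage sketch of the interface above; the file names, the matrices
+//and the learning rate value are placeholders, not taken from TNet:
+//
+//  CuNetwork net;
+//  net.ReadNetwork("nnet.init");      //parse the components from a file
+//  net.SetLearnRate(0.008);           //global rate, no per-component factors
+//
+//  CuMatrix<BaseFloat> feats, out, globerr;   //filled by the data pipeline
+//  net.Propagate(feats, out);         //forward pass
+//  //...fill globerr with dE/d(output), e.g. via a CuObjectiveFunction...
+//  net.Backpropagate(globerr);        //backward pass, updatable layers update
+//
+//  net.WriteNetwork("nnet.iter1");    //store the updated parameters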
diff --git a/src/CuTNetLib/.svn/text-base/cuObjectiveFunction.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuObjectiveFunction.cc.svn-base
new file mode 100644
index 0000000..e2b0a1d
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuObjectiveFunction.cc.svn-base
@@ -0,0 +1,87 @@
+
+#include "cuObjectiveFunction.h"
+
+#include "Error.h"
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+
+
+ CuObjectiveFunction*
+ CuObjectiveFunction::
+ Factory(ObjFunType type) {
+ CuObjectiveFunction* ret = NULL;
+ switch(type) {
+ case MEAN_SQUARE_ERROR: ret = new CuMeanSquareError; break;
+ case CROSS_ENTROPY: ret = new CuCrossEntropy; break;
+ default: Error("Unknown ObjFun type");
+ }
+ return ret;
+ }
+
+
+ void
+ CuMeanSquareError::
+ Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError)
+ {
+ //get the global error
+ rNetError.CopyFrom(rNetOutput);
+ rNetError.AddScaled(-1.0,rDesired,1.0);
+
+ //calculate the MSE
+ mAuxMat.CopyFrom(rNetError);
+ mAuxMat.MulElem(mAuxMat);
+
+ mAuxVec.Init(mAuxMat.Cols());
+ mAuxVec.AddColSum(1.0,mAuxMat,0.0);
+ mAuxVec.CopyTo(mAuxVecHost);
+
+ mError += mAuxVecHost.Sum();
+
+ //count the frames
+ mFrames += rNetError.Rows();
+ }
+
+ void
+ CuCrossEntropy::
+ Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError)
+ {
+ if(rDesired.Cols() != rNetOutput.Cols()) {
+ std::ostringstream os;
+ os << "Non-matching dimensions of network output with training targets!!!"
+ << " Netoutput:" << rNetOutput.Cols()
+ << " Targets:" << rDesired.Cols();
+ Error(os.str());
+ }
+
+ //get the global error
+ //dXent/dSoftmax_in = y-d
+ rNetError.CopyFrom(rNetOutput);
+ rNetError.AddScaled(-1.0,rDesired,1.0);
+
+ //check classification
+ mClassifyVec.Init(rNetOutput.Rows());
+ CuMath<BaseFloat>::CheckClass(rNetOutput,rDesired,mClassifyVec);
+ mClassifyVec.CopyTo(mClassifyVecHost);
+ mCorrect += mClassifyVecHost.Sum();
+
+ //calculate Xent
+ mAuxMat.CopyFrom(rNetOutput);
+ mAuxMat.LogElem();
+ mAuxMat.MulElem(rDesired);
+
+ mAuxVec.Init(mAuxMat.Cols());
+ mAuxVec.AddColSum(-1.0,mAuxMat,0.0);
+ mAuxVec.CopyTo(mAuxVecHost);
+
+ mError += mAuxVecHost.Sum();
+
+ //count the frames
+ mFrames += rNetError.Rows();
+ }
+
+
+} // namespace TNet
diff --git a/src/CuTNetLib/.svn/text-base/cuObjectiveFunction.h.svn-base b/src/CuTNetLib/.svn/text-base/cuObjectiveFunction.h.svn-base
new file mode 100644
index 0000000..6b425e8
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuObjectiveFunction.h.svn-base
@@ -0,0 +1,166 @@
+#ifndef _CUOBJ_FUN_I_
+#define _CUOBJ_FUN_I_
+
+#include <cassert>
+#include <limits>
+#include <cmath>
+#include <sstream>
+
+#include "Vector.h"
+#include "cuvector.h"
+#include "cumatrix.h"
+
+namespace TNet
+{
+
+
+ /**
+ * General interface for objective functions
+ */
+ class CuObjectiveFunction
+ {
+ public:
+ /// Enum with objective function types
+ typedef enum {
+ OBJ_FUN_I = 0x0300,
+ MEAN_SQUARE_ERROR,
+ CROSS_ENTROPY,
+ } ObjFunType;
+
+ /// Factory for creating objective function instances
+ static CuObjectiveFunction* Factory(ObjFunType type);
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification
+ public:
+ CuObjectiveFunction()
+ { }
+
+ virtual ~CuObjectiveFunction()
+ { }
+
+ virtual ObjFunType GetTypeId() = 0;
+ virtual const char* GetTypeLabel() = 0;
+
+ /// evaluates the data, calculate global error
+ virtual void Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError) = 0;
+
+ ///get the accumulated error over all processed frames
+ virtual double GetError() = 0;
+ ///the number of processed frames
+ virtual size_t GetFrames() = 0;
+ ///get a report string with the error statistics
+ virtual std::string Report() = 0;
+ };
+
+
+
+
+ /**
+ * Mean square error, useful for autoencoders, RBMs, etc.
+ */
+ class CuMeanSquareError : public CuObjectiveFunction
+ {
+ public:
+ CuMeanSquareError()
+ : mError(0), mFrames(0)
+ { }
+
+ virtual ~CuMeanSquareError()
+ { }
+
+ ObjFunType GetTypeId()
+ { return CuObjectiveFunction::MEAN_SQUARE_ERROR; }
+
+ const char* GetTypeLabel()
+ { return "<mean_square_error>"; }
+
+ void Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError);
+
+ double GetError()
+ { return mError; }
+
+ size_t GetFrames()
+ { return mFrames; }
+
+ std::string Report()
+ {
+ std::ostringstream ss;
+ ss << "Mse:" << mError << " frames:" << mFrames
+ << " err/frm:" << mError/mFrames << "\n";
+ return ss.str();
+ }
+
+ private:
+ double mError;
+ size_t mFrames;
+
+ CuMatrix<BaseFloat> mAuxMat;
+ CuVector<BaseFloat> mAuxVec;
+ Vector<BaseFloat> mAuxVecHost;
+
+ };
+
+
+ /**
+ * Cross entropy; assumes the desired vectors hold the target output values
+ */
+ class CuCrossEntropy : public CuObjectiveFunction
+ {
+ public:
+ CuCrossEntropy()
+ : mError(0), mFrames(0), mCorrect(0)
+ { }
+
+ ~CuCrossEntropy()
+ { }
+
+ ObjFunType GetTypeId()
+ { return CuObjectiveFunction::CROSS_ENTROPY; }
+
+ const char* GetTypeLabel()
+ { return "<cross_entropy>"; }
+
+ void Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError);
+
+ double GetError()
+ { return mError; }
+
+ size_t GetFrames()
+ { return mFrames; }
+
+ std::string Report()
+ {
+ std::ostringstream ss;
+ //for compatibility with SNet
+ //ss << " correct: >> " << 100.0*mCorrect/mFrames << "% <<\n";
+
+ //current new format...
+ ss << "Xent:" << mError << " frames:" << mFrames
+ << " err/frm:" << mError/mFrames
+ << " correct[" << 100.0*mCorrect/mFrames << "%]"
+ << "\n";
+ return ss.str();
+ }
+
+ private:
+ double mError;
+ size_t mFrames;
+ size_t mCorrect;
+
+ CuMatrix<BaseFloat> mAuxMat;
+ CuVector<BaseFloat> mAuxVec;
+ Vector<BaseFloat> mAuxVecHost;
+
+ CuVector<int> mClassifyVec;
+ Vector<int> mClassifyVecHost;
+ };
+
+
+
+
+
+} //namespace TNet
+
+
+#endif
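+
+//Minimal usage sketch; the matrices are placeholders filled elsewhere:
+//
+//  CuObjectiveFunction* obj =
+//      CuObjectiveFunction::Factory(CuObjectiveFunction::CROSS_ENTROPY);
+//
+//  CuMatrix<BaseFloat> net_out, targets, glob_err;
+//  obj->Evaluate(net_out, targets, glob_err);  //glob_err := net_out - targets
+//  //glob_err can be passed directly to CuNetwork::Backpropagate()
+//
+//  std::cout << obj->Report();   //Xent, frames, err/frm, correct[%]
+//  delete obj;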
diff --git a/src/CuTNetLib/.svn/text-base/cuRbm.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuRbm.cc.svn-base
new file mode 100644
index 0000000..3d0699d
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuRbm.cc.svn-base
@@ -0,0 +1,244 @@
+
+#include <string>
+#include <sstream>
+
+#include "cuRbm.h"
+
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+ void
+ CuRbm::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mHidBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mVisHid, 1.0);
+ if(mHidType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(Y,Y);
+ }
+ }
+
+
+ void
+ CuRbm::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(X.Rows(),X.Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,X,GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(X);
+ }
+
+ Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, mBackpropErrBuf, mVisHid, 0.0);
+ }
+
+
+ void
+ CuRbm::
+ Update()
+ {
+ //THIS IS DONE TWICE BECAUSE OF THE BACKPROP STOPPER!!!
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(GetErrorInput().Rows(),GetErrorInput().Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,GetErrorInput(),GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(GetErrorInput());
+ }
+
+/*
+ std::cout << " " << GetInput().Rows()
+ << " " << GetInput().Cols()
+ << " " << mBackpropErrBuf.Rows()
+ << " " << mBackpropErrBuf.Cols()
+ << " " << mVisHidCorrection.Rows()
+ << " " << mVisHidCorrection.Cols()
+ ;
+*/
+
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,mBackpropErrBuf,mMomentum);
+
+ //regularization weight decay
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(1.0,mHidBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
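+ //compensate higher gradient estimates due to momentum
+ //(same reasoning as in CuSharedLinearity::Update)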
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mVisHidCorrection.Gemm('T','N',1.0,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(1.0,mBackpropErrBuf,mMomentum);
+
+ mVisHid.AddScaled(-mLearningRate/N,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(-mLearningRate/N,mHidBiasCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ mVisHid.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+#endif
+
+ }
+
+
+
+ void
+ CuRbm::
+ Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs)
+ {
+ if(visProbs.Cols() != GetNInputs()) {
+ std::ostringstream os;
+ os << " Nonmatching input dim, needs:" << GetNInputs()
+ << " got:" << visProbs.Cols() << "\n";
+ Error(os.str());
+ }
+
+ hidProbs.Init(visProbs.Rows(),GetNOutputs());
+
+ PropagateFnc(visProbs, hidProbs);
+ }
+
+ void
+ CuRbm::
+ Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs)
+ {
+ visProbs.Init(hidState.Rows(),mNInputs);
+ visProbs.SetConst(0.0);
+ visProbs.AddScaledRow(1.0,mVisBias,0.0);
+ visProbs.Gemm('N','T', 1.0, hidState, mVisHid, 1.0);
+ if(mVisType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(visProbs,visProbs);
+ }
+ }
+
+
+ void
+ CuRbm::
+ RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid)
+ {
+ assert(pos_vis.Rows() == pos_hid.Rows() &&
+ pos_vis.Rows() == neg_vis.Rows() &&
+ pos_vis.Rows() == neg_hid.Rows() &&
+ pos_vis.Cols() == neg_vis.Cols() &&
+ pos_hid.Cols() == neg_hid.Cols() &&
+ pos_vis.Cols() == mNInputs &&
+ pos_hid.Cols() == mNOutputs);
+
+ // UPDATE vishid matrix
+ //
+ // vishidinc = momentum*vishidinc + ...
+ // epsilonw*( (posprods-negprods)/numcases - weightcost*vishid);
+ //
+ // vishidinc[t] = -(epsilonw/numcases)*negprods + momentum*vishidinc[t-1]
+ // +(epsilonw/numcases)*posprods
+ // -(epsilonw*weightcost)*vishid[t-1]
+ //
+ BaseFloat N = static_cast<BaseFloat>(pos_vis.Rows());
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,neg_vis,neg_hid,mMomentum);
+ mVisHidCorrection.Gemm('T','N',+mLearningRate/N,pos_vis,pos_hid,1.0);
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+
+ // UPDATE visbias vector
+ //
+ // visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact);
+ //
+ mVisBiasCorrection.AddColSum(-mLearningRate/N,neg_vis,mMomentum);
+ mVisBiasCorrection.AddColSum(+mLearningRate/N,pos_vis,1.0);
+ mVisBias.AddScaled(1.0,mVisBiasCorrection,1.0);
+
+ // UPDATE hidbias vector
+ //
+ // hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact);
+ //
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,neg_hid,mMomentum);
+ mHidBiasCorrection.AddColSum(+mLearningRate/N,pos_hid,1.0);
+ mHidBias.AddScaled(1.0/*0.0*/,mHidBiasCorrection,1.0);
+
+ }
+
+
+ void
+ CuRbm::
+ ReadFromStream(std::istream& rIn)
+ {
+ //type of the units
+ std::string str;
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mVisType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mVisType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mHidType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mHidType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mVisHid.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mVisBias.CopyFrom(bias);
+ rIn >> bias;
+ mHidBias.CopyFrom(bias);
+ }
+
+
+ void
+ CuRbm::
+ WriteToStream(std::ostream& rOut)
+ {
+ //store unit type info
+ if(mVisType == BERNOULLI) {
+ rOut << " bern ";
+ } else {
+ rOut << " gauss ";
+ }
+ if(mHidType == BERNOULLI) {
+ rOut << " bern\n";
+ } else {
+ rOut << " gauss\n";
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mVisHid.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mVisBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ mHidBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
diff --git a/src/CuTNetLib/.svn/text-base/cuRbm.h.svn-base b/src/CuTNetLib/.svn/text-base/cuRbm.h.svn-base
new file mode 100644
index 0000000..c1e984b
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuRbm.h.svn-base
@@ -0,0 +1,146 @@
+#ifndef _CU_RBM_H_
+#define _CU_RBM_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuRbmBase : public CuUpdatableComponent
+ {
+ public:
+ typedef enum {
+ BERNOULLI,
+ GAUSSIAN
+ } RbmUnitType;
+
+ CuRbmBase(size_t nInputs, size_t nOutputs, CuComponent *pPred) :
+ CuUpdatableComponent(nInputs, nOutputs, pPred)
+ { }
+
+
+ virtual void Propagate(
+ const CuMatrix<BaseFloat>& visProbs,
+ CuMatrix<BaseFloat>& hidProbs
+ ) = 0;
+ virtual void Reconstruct(
+ const CuMatrix<BaseFloat>& hidState,
+ CuMatrix<BaseFloat>& visProbs
+ ) = 0;
+ virtual void RbmUpdate(
+ const CuMatrix<BaseFloat>& pos_vis,
+ const CuMatrix<BaseFloat>& pos_hid,
+ const CuMatrix<BaseFloat>& neg_vis,
+ const CuMatrix<BaseFloat>& neg_hid
+ ) = 0;
+
+ virtual RbmUnitType VisType() = 0;
+ virtual RbmUnitType HidType() = 0;
+ };
+
+
+ class CuRbm : public CuRbmBase
+ {
+ public:
+
+ CuRbm(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuRbm();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ //CuUpdatableComponent API
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ //RBM training API
+ void Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs);
+ void Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs);
+ void RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid);
+
+ RbmUnitType VisType()
+ { return mVisType; }
+
+ RbmUnitType HidType()
+ { return mHidType; }
+
+ //static void BinarizeProbs(const CuMatrix<BaseFloat>& probs, CuMatrix<BaseFloat>& states);
+
+ //I/O
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mVisHid; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mVisBias; ///< Vector with biases
+ CuVector<BaseFloat> mHidBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mVisHidCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mVisBiasCorrection; ///< Vector for bias updates
+ CuVector<BaseFloat> mHidBiasCorrection; ///< Vector for bias updates
+
+ CuMatrix<BaseFloat> mBackpropErrBuf;
+
+ RbmUnitType mVisType;
+ RbmUnitType mHidType;
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuRbm::
+ inline
+ CuRbm::
+ CuRbm(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuRbmBase(nInputs, nOutputs, pPred),
+ mVisHid(nInputs,nOutputs),
+ mVisBias(nInputs), mHidBias(nOutputs),
+ mVisHidCorrection(nInputs,nOutputs),
+ mVisBiasCorrection(nInputs), mHidBiasCorrection(nOutputs),
+ mBackpropErrBuf(),
+ mVisType(BERNOULLI),
+ mHidType(BERNOULLI)
+ {
+ mVisHidCorrection.SetConst(0.0);
+ mVisBiasCorrection.SetConst(0.0);
+ mHidBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuRbm::
+ ~CuRbm()
+ { }
+
+ inline CuComponent::ComponentType
+ CuRbm::
+ GetType() const
+ {
+ return CuComponent::RBM;
+ }
+
+ inline const char*
+ CuRbm::
+ GetName() const
+ {
+ return "<rbm>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
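+
+//Sketch of one CD-1 pre-training step built from the API above; the data
+//matrices are placeholders and the hidden probabilities are fed straight back
+//(a mean-field shortcut -- sampling of hidden states is not shown here):
+//
+//  CuMatrix<BaseFloat> pos_vis;      //one mini-batch of (visible) input data
+//  CuMatrix<BaseFloat> pos_hid, neg_vis, neg_hid;
+//
+//  rbm.Propagate(pos_vis, pos_hid);             //positive phase
+//  rbm.Reconstruct(pos_hid, neg_vis);           //reconstruction
+//  rbm.Propagate(neg_vis, neg_hid);             //negative phase
+//  rbm.RbmUpdate(pos_vis, pos_hid, neg_vis, neg_hid);  //CD-1 parameter update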
diff --git a/src/CuTNetLib/.svn/text-base/cuRbmSparse.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuRbmSparse.cc.svn-base
new file mode 100644
index 0000000..e0b7352
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuRbmSparse.cc.svn-base
@@ -0,0 +1,269 @@
+
+#include <string>
+#include <sstream>
+
+#include "cuRbmSparse.h"
+
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+ void
+ CuRbmSparse::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mHidBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mVisHid, 1.0);
+ if(mHidType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(Y,Y);
+ }
+ }
+
+
+ void
+ CuRbmSparse::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(X.Rows(),X.Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,X,GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(X);
+ }
+
+ Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, mBackpropErrBuf, mVisHid, 0.0);
+ }
+
+
+ void
+ CuRbmSparse::
+ Update()
+ {
+ //THIS IS DONE TWICE BECAUSE OF THE BACKPROP STOPPER!!!
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(GetErrorInput().Rows(),GetErrorInput().Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,GetErrorInput(),GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(GetErrorInput());
+ }
+
+/*
+ std::cout << " " << GetInput().Rows()
+ << " " << GetInput().Cols()
+ << " " << mBackpropErrBuf.Rows()
+ << " " << mBackpropErrBuf.Cols()
+ << " " << mVisHidCorrection.Rows()
+ << " " << mVisHidCorrection.Cols()
+ ;
+*/
+
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,mBackpropErrBuf,mMomentum);
+
+ //regularization weight decay
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(1.0,mHidBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mVisHidCorrection.Gemm('T','N',1.0,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(1.0,mBackpropErrBuf,mMomentum);
+
+ mVisHid.AddScaled(-mLearningRate/N,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(-mLearningRate/N,mHidBiasCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ mVisHid.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+#endif
+
+ }
+
+
+
+ void
+ CuRbmSparse::
+ Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs)
+ {
+ if(visProbs.Cols() != GetNInputs()) {
+ std::ostringstream os;
+ os << " Nonmatching input dim, needs:" << GetNInputs()
+ << " got:" << visProbs.Cols() << "\n";
+ Error(os.str());
+ }
+
+ hidProbs.Init(visProbs.Rows(),GetNOutputs());
+
+ PropagateFnc(visProbs, hidProbs);
+ }
+
+ void
+ CuRbmSparse::
+ Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs)
+ {
+ visProbs.Init(hidState.Rows(),mNInputs);
+ visProbs.SetConst(0.0);
+ visProbs.AddScaledRow(1.0,mVisBias,0.0);
+ visProbs.Gemm('N','T', 1.0, hidState, mVisHid, 1.0);
+ if(mVisType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(visProbs,visProbs);
+ }
+ }
+
+
+ void
+ CuRbmSparse::
+ RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid)
+ {
+ assert(pos_vis.Rows() == pos_hid.Rows() &&
+ pos_vis.Rows() == neg_vis.Rows() &&
+ pos_vis.Rows() == neg_hid.Rows() &&
+ pos_vis.Cols() == neg_vis.Cols() &&
+ pos_hid.Cols() == neg_hid.Cols() &&
+ pos_vis.Cols() == mNInputs &&
+ pos_hid.Cols() == mNOutputs);
+
+ //:SPARSITY:
+ if(mHidType==BERNOULLI) {
+ //get expected node activity from current batch
+ mSparsityQCurrent.AddColSum(1.0/pos_hid.Rows(),pos_hid,0.0);
+ //get smoothed expected node activity
+ mSparsityQ.AddScaled(1.0-mLambda,mSparsityQCurrent,mLambda);
+ //subtract the prior: (q-p)
+ mSparsityQCurrent.SetConst(-mSparsityPrior);
+ mSparsityQCurrent.AddScaled(1.0,mSparsityQ,1.0);
+ //get mean pos_vis
+ mVisMean.AddColSum(1.0/pos_vis.Rows(),pos_vis,0.0);
+ }
+
+ // UPDATE vishid matrix
+ //
+ // vishidinc = momentum*vishidinc + ...
+ // epsilonw*( (posprods-negprods)/numcases - weightcost*vishid)
+ // -sparsitycost*mean_posvis'*(q-p);
+ //
+ // vishidinc[t] = -(epsilonw/numcases)*negprods + momentum*vishidinc[t-1]
+ // +(epsilonw/numcases)*posprods
+ // -(epsilonw*weightcost)*vishid[t-1]
+ //
+ BaseFloat N = static_cast<BaseFloat>(pos_vis.Rows());
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,neg_vis,neg_hid,mMomentum);
+ mVisHidCorrection.Gemm('T','N',+mLearningRate/N,pos_vis,pos_hid,1.0);
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);//L2
+ if(mHidType==BERNOULLI) {
+ mVisHidCorrection.BlasGer(-mSparsityCost,mVisMean,mSparsityQCurrent);//sparsity
+ }
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+
+ // UPDATE visbias vector
+ //
+ // visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact);
+ //
+ mVisBiasCorrection.AddColSum(-mLearningRate/N,neg_vis,mMomentum);
+ mVisBiasCorrection.AddColSum(+mLearningRate/N,pos_vis,1.0);
+ mVisBias.AddScaled(1.0,mVisBiasCorrection,1.0);
+
+ // UPDATE hidbias vector
+ //
+ // hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact);
+ //
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,neg_hid,mMomentum);
+ mHidBiasCorrection.AddColSum(+mLearningRate/N,pos_hid,1.0);
+ if(mHidType==BERNOULLI) {
+ mHidBiasCorrection.AddScaled(-mSparsityCost,mSparsityQCurrent,1.0);//sparsity
+ }
+ mHidBias.AddScaled(1.0/*0.0*/,mHidBiasCorrection,1.0);
+
+ }
+
+
+ void
+ CuRbmSparse::
+ ReadFromStream(std::istream& rIn)
+ {
+ //type of the units
+ std::string str;
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mVisType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mVisType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mHidType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mHidType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mVisHid.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mVisBias.CopyFrom(bias);
+ rIn >> bias;
+ mHidBias.CopyFrom(bias);
+
+ rIn >> std::ws >> mSparsityCost;
+ std::cout << "RBM::mSparsityCost=" << mSparsityCost;
+ }
+
+
+ void
+ CuRbmSparse::
+ WriteToStream(std::ostream& rOut)
+ {
+ //store unit type info
+ if(mVisType == BERNOULLI) {
+ rOut << " bern ";
+ } else {
+ rOut << " gauss ";
+ }
+ if(mHidType == BERNOULLI) {
+ rOut << " bern\n";
+ } else {
+ rOut << " gauss\n";
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mVisHid.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mVisBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ mHidBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ //store the sparsity cost
+ rOut << mSparsityCost << std::endl;
+ }
+
+
+} //namespace
diff --git a/src/CuTNetLib/.svn/text-base/cuRbmSparse.h.svn-base b/src/CuTNetLib/.svn/text-base/cuRbmSparse.h.svn-base
new file mode 100644
index 0000000..9d7e304
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuRbmSparse.h.svn-base
@@ -0,0 +1,134 @@
+#ifndef _CU_RBM_SPARSE_H_
+#define _CU_RBM_SPARSE_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+#include "cuRbm.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuRbmSparse : public CuRbmBase
+ {
+ public:
+
+ CuRbmSparse(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuRbmSparse();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ //CuUpdatableComponent API
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ //RBM training API
+ void Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs);
+ void Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs);
+ void RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid);
+
+ RbmUnitType VisType()
+ { return mVisType; }
+
+ RbmUnitType HidType()
+ { return mHidType; }
+
+ //static void BinarizeProbs(const CuMatrix<BaseFloat>& probs, CuMatrix<BaseFloat>& states);
+
+ //I/O
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mVisHid; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mVisBias; ///< Vector with biases
+ CuVector<BaseFloat> mHidBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mVisHidCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mVisBiasCorrection; ///< Vector for bias updates
+ CuVector<BaseFloat> mHidBiasCorrection; ///< Vector for bias updates
+
+ CuMatrix<BaseFloat> mBackpropErrBuf;
+
+ RbmUnitType mVisType;
+ RbmUnitType mHidType;
+
+ ////// sparsity
+ BaseFloat mSparsityPrior; ///< sparsity target (unit activity prior)
+ BaseFloat mLambda; ///< exponential decay factor for q (observed probability of unit to be active)
+ BaseFloat mSparsityCost; ///< sparsity cost coef.
+
+ CuVector<BaseFloat> mSparsityQ;
+ CuVector<BaseFloat> mSparsityQCurrent;
+ CuVector<BaseFloat> mVisMean; ///< buffer for mean visible
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuRbmSparse::
+ inline
+ CuRbmSparse::
+ CuRbmSparse(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuRbmBase(nInputs, nOutputs, pPred),
+ mVisHid(nInputs,nOutputs),
+ mVisBias(nInputs), mHidBias(nOutputs),
+ mVisHidCorrection(nInputs,nOutputs),
+ mVisBiasCorrection(nInputs), mHidBiasCorrection(nOutputs),
+ mBackpropErrBuf(),
+ mVisType(BERNOULLI),
+ mHidType(BERNOULLI),
+
+ mSparsityPrior(0.0001),
+ mLambda(0.95),
+ mSparsityCost(1e-7),
+ mSparsityQ(nOutputs),
+ mSparsityQCurrent(nOutputs),
+ mVisMean(nInputs)
+ {
+ mVisHidCorrection.SetConst(0.0);
+ mVisBiasCorrection.SetConst(0.0);
+ mHidBiasCorrection.SetConst(0.0);
+
+ mSparsityQ.SetConst(mSparsityPrior);
+ mSparsityQCurrent.SetConst(0.0);
+ mVisMean.SetConst(0.0);
+ }
+
+
+ inline
+ CuRbmSparse::
+ ~CuRbmSparse()
+ { }
+
+ inline CuComponent::ComponentType
+ CuRbmSparse::
+ GetType() const
+ {
+ return CuComponent::RBM_SPARSE;
+ }
+
+ inline const char*
+ CuRbmSparse::
+ GetName() const
+ {
+ return "<rbmsparse>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
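+
+//The sparsity term keeps a smoothed estimate q of the hidden-unit activity
+//and penalises its deviation from the prior p (BERNOULLI hidden units only);
+//with the defaults above (p=0.0001, lambda=0.95) the per-unit recurrence is:
+//
+//  q_batch = column mean of pos_hid over the mini-batch
+//  q       = lambda*q + (1-lambda)*q_batch            (mSparsityQ)
+//  penalty = mSparsityCost * (q - p)
+//
+//The penalty is subtracted from the hidden-bias update and, weighted by the
+//mean visible activation, from the weight update (see RbmUpdate in the .cc).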
diff --git a/src/CuTNetLib/.svn/text-base/cuRecurrent.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuRecurrent.cc.svn-base
new file mode 100644
index 0000000..428df2c
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuRecurrent.cc.svn-base
@@ -0,0 +1,191 @@
+
+#include <string>
+#include <sstream>
+
+#include "cuRecurrent.h"
+
+#include "cumath.h"
+#include "cuda_runtime.h"
+
+
+namespace TNet
+{
+
+ void
+ CuRecurrent::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ assert(X.Rows() == 1);
+ assert(Y.Rows() == 1);
+ if(mInputHistory.Rows() == 0) {
+ Error("Bptt order was not set");
+ }
+
+ //pushback the history
+ CuMatrix<BaseFloat> tmp(mInputHistory.Rows()-1,mInputHistory.Cols());
+ tmp.CopyRows(tmp.Rows(),0,mInputHistory,0);
+ mInputHistory.CopyRows(tmp.Rows(),0,tmp,1);
+
+ //compose the input vector to 0th row, use input X and previous Y
+ cudaMemcpy(mInputHistory.pCUData(), X.pCUData(),
+ sizeof(BaseFloat)*X.Cols(), cudaMemcpyDeviceToDevice);
+ cudaMemcpy(mInputHistory.pCUData()+X.Cols(), Y.pCUData(),
+ sizeof(BaseFloat)*Y.Cols(), cudaMemcpyDeviceToDevice);
+
+ //extract first row
+ //CuMatrix<BaseFloat> first_row(1,mInputHistory.Cols());
+ //first_row.CopyRows(1,0,mInputHistory,0);
+
+ //calculate the output
+ Y.AddScaledRow(1.0,mBias,0.0);
+ //take 0th vector of history, propagate
+ CuMath<BaseFloat>::OffsetGemv('T',1.0,mLinearity,mInputHistory.pCUData(),mInputHistory.Cols(),1.0,Y.pCUData(),Y.Cols(),0);
+ //Y.Gemm('N','N', 1.0, first_row, mLinearity, 1.0);
+ CuMath<BaseFloat>::Sigmoid(Y,Y);
+
+ /*
+ std::cout << "-------------------------------------" << std::endl;
+ X.Print();
+ Y.Print();
+ mInputHistory.Print();
+ */
+
+ }
+
+
+ void
+ CuRecurrent::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ assert(Y.Rows() == 1);
+ assert(X.Rows() == 1);
+
+ //apply diff sigmoid
+ CuMatrix<BaseFloat> diff_sigm(1,X.Cols());
+ CuMath<BaseFloat>::DiffSigmoid(diff_sigm,X,GetOutput());
+
+ //:TODO: inefficient to calculate all the input errors!!!
+ // we need only part of them!
+ //
+ //backward-multiply by weights
+ /*
+ CuMatrix<BaseFloat> err_prev(1,mLinearity.Rows());
+ err_prev.Gemm('N', 'T', 1.0, diff_sigm, mLinearity, 0.0);
+
+ //copy out the interval
+ cudaMemcpy(Y.pCUData(),err_prev.pCUData(),
+ sizeof(BaseFloat)*Y.Cols(),cudaMemcpyDeviceToDevice);
+ */
+
+ //backward-multiply by weights
+ CuMath<BaseFloat>::OffsetGemv('N',1.0,mLinearity,diff_sigm.pCUData(),diff_sigm.Cols(),1.0,Y.pCUData(),Y.Cols(),0);
+
+ }
+
+
+ void
+ CuRecurrent::
+ Update()
+ {
+ //
+ //correction from PRESENT input x error pair
+ //
+ //apply diff sigmoid
+ CuMatrix<BaseFloat> diff_sigm(1,GetOutput().Cols());
+ CuMath<BaseFloat>::DiffSigmoid(diff_sigm,GetErrorInput(),GetOutput());
+
+ //get 0th row of history (present time)
+ CuMatrix<BaseFloat> history_row(1,mInputHistory.Cols());
+ history_row.CopyRows(1,0,mInputHistory,0);
+
+ //calculate update
+ //mLinearityCorrection.Gemm('T','N',-mLearningRate,history_row,diff_sigm,mMomentum);
+ mLinearityCorrection.SetConst(0.0); //:TODO: should be scale/momentum
+ CuMath<BaseFloat>::BlasGer(-mLearningRate,history_row.pCUData(),history_row.Cols(),diff_sigm.pCUData(),diff_sigm.Cols(),mLinearityCorrection);
+
+ mBiasCorrection.AddColSum(-mLearningRate,diff_sigm,mMomentum);
+
+ //
+ //BPTT (backprop through time)
+ //
+ CuMatrix<BaseFloat> err_prev(1,mLinearity.Rows());
+ CuMatrix<BaseFloat> err_prev_part(1,diff_sigm.Cols());
+ CuMatrix<BaseFloat> history_output(1,GetOutput().Cols());
+ for(int i=1; i<=mBpttOrder; i++) {
+ //:TODO: inefficient to calculate all the input errors!!!
+ // we need only part of them!
+ //
+ /*
+ //get previous error
+ err_prev.Gemm('N','T',1.0,diff_sigm,mLinearity,0.0);
+ //select interval
+ cudaMemcpy(err_prev_part.pCUData(),err_prev.pCUData()+GetNInputs(),
+ sizeof(BaseFloat)*err_prev_part.Cols(),cudaMemcpyDeviceToDevice);
+ */
+
+ //backward-multiply by weights
+ CuMath<BaseFloat>::OffsetGemv('N',1.0,mLinearity,diff_sigm.pCUData(),diff_sigm.Cols(),0.0,err_prev_part.pCUData(),err_prev_part.Cols(),GetInput().Cols());
+
+ //apply diff sigmoid with activations of HISTORY frame!!!
+ cudaMemcpy(history_output.pCUData(), mInputHistory.pCURowData(i-1)+GetInput().Cols(),
+ sizeof(BaseFloat)*history_output.Cols(), cudaMemcpyDeviceToDevice);
+ CuMath<BaseFloat>::DiffSigmoid(diff_sigm,err_prev_part,history_output);
+
+ //get history row
+ history_row.CopyRows(1,i,mInputHistory,0);
+
+ //accu the update
+ //mLinearityCorrection.Gemm('T','N',-mLearningRate,history_row,diff_sigm,1.0);
+ CuMath<BaseFloat>::BlasGer(-mLearningRate,history_row.pCUData(),history_row.Cols(),diff_sigm.pCUData(),diff_sigm.Cols(),mLinearityCorrection);
+ mBiasCorrection.AddColSum(-mLearningRate,diff_sigm,1.0);
+ }
+
+ //
+ //update the weights
+ //
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ //perform update
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+
+ }
+
+
+
+
+ void
+ CuRecurrent::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+ }
+
+
+ void
+ CuRecurrent::
+ WriteToStream(std::ostream& rOut)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/.svn/text-base/cuRecurrent.h.svn-base b/src/CuTNetLib/.svn/text-base/cuRecurrent.h.svn-base
new file mode 100644
index 0000000..e487b27
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuRecurrent.h.svn-base
@@ -0,0 +1,101 @@
+#ifndef _CU_RECURRENT_H_
+#define _CU_RECURRENT_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuRecurrent : public CuUpdatableComponent
+ {
+ public:
+
+ CuRecurrent(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuRecurrent();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ //CuUpdatableComponent API
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ //Recurrent training API
+ void BpttOrder(int ord) {
+ mBpttOrder = ord;
+ mInputHistory.Init(ord+1,GetNInputs()+GetNOutputs());
+ }
+ void ClearHistory() {
+ mInputHistory.SetConst(0.0);
+ if(mOutput.MSize() > 0) {
+ mOutput.SetConst(0.0);
+ }
+ }
+
+ //I/O
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity;
+ CuVector<BaseFloat> mBias;
+
+ CuMatrix<BaseFloat> mLinearityCorrection;
+ CuVector<BaseFloat> mBiasCorrection;
+
+ CuMatrix<BaseFloat> mInputHistory;
+
+ int mBpttOrder;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuRecurrent::
+ inline
+ CuRecurrent::
+ CuRecurrent(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs+nOutputs,nOutputs),
+ mBias(nOutputs),
+ mLinearityCorrection(nInputs+nOutputs,nOutputs),
+ mBiasCorrection(nOutputs)
+ { }
+
+
+ inline
+ CuRecurrent::
+ ~CuRecurrent()
+ { }
+
+ inline CuComponent::ComponentType
+ CuRecurrent::
+ GetType() const
+ {
+ return CuComponent::RECURRENT;
+ }
+
+ inline const char*
+ CuRecurrent::
+ GetName() const
+ {
+ return "<recurrent>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
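+
+//Usage sketch for the recurrent layer; frames are processed one row at a time
+//(PropagateFnc/BackpropagateFnc assert single-row matrices) and the history
+//must be sized once and cleared per utterance. Names are placeholders:
+//
+//  rec.BpttOrder(4);            //allocate 4+1 rows of input/output history
+//  for each utterance {
+//    rec.ClearHistory();        //zero the stored history and the output
+//    for each frame (a 1 x nInputs CuMatrix) {
+//      ...Propagate / Backpropagate / Update, as driven by CuNetwork...
+//    }
+//  }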
diff --git a/src/CuTNetLib/.svn/text-base/cuSharedLinearity.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuSharedLinearity.cc.svn-base
new file mode 100644
index 0000000..8d5ec09
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuSharedLinearity.cc.svn-base
@@ -0,0 +1,179 @@
+
+
+#include "cuSharedLinearity.h"
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+ void
+ CuSharedLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::VecExpand(mBias,mBiasExpand); /// [ 1 2 3 ] -> [ 1 2 3 1 2 3 ... ]
+ Y.AddScaledRow(1.0,mBiasExpand,0.0);
+
+ //mBiasExpand.Print();
+
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N','N', 1.0, X, mLinearity, 1.0, Y,
+ i*mLinearity.Rows(), 0, i*mLinearity.Cols());
+ }
+ //std::cout << CuDevice::Instantiate().GetFreeMemory();
+ //GetInput().Print();
+ //GetOutput().Print();
+ }
+
+
+ void
+ CuSharedLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N', 'T', 1.0, X, mLinearity, 0.0, Y,
+ i*mLinearity.Cols(), 0, i*mLinearity.Rows());
+ }
+ }
+
+
+ void
+ CuSharedLinearity::
+ Update()
+ {
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('T','N',-mLearningRate/(N*mNInstances),
+ GetInput(),GetErrorInput(),
+ ((i==0)?mMomentum:1.0f), mLinearityCorrection,
+ i*mLinearity.Rows(),i*mLinearity.Cols(),0);
+ }
+ mBiasCorrectionExpand.AddColSum(1.0,GetErrorInput(),0.0);
+ CuMath<BaseFloat>::VecAddColSum(-mLearningRate/(N*mNInstances),mBiasCorrectionExpand,mMomentum,mBiasCorrection);
+
+
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain; //compensate higher gradient estimates due to momentum
+
+ //compensate augmented dyn. range of gradient caused by multiple instances
+ N *= static_cast<BaseFloat>(mNInstances);
+
+ //get gradient of shared linearity
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('T','N',1.0,
+ GetInput(),GetErrorInput(),
+ ((i==0)?mMomentum:1.0f), mLinearityCorrection,
+ i*mLinearity.Rows(),i*mLinearity.Cols(),0);
+ }
+ //get gradient of shared bias
+ mBiasCorrectionExpand.AddColSum(1.0,GetErrorInput(),0.0);
+ CuMath<BaseFloat>::VecAddColSum(1.0,mBiasCorrectionExpand,mMomentum,mBiasCorrection);
+
+ //perform update
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+ //regularization weight decay
+ mLinearity.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+#endif
+
+ }
+
+
+ void
+ CuSharedLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //number of instances of shared weights in layer
+ rIn >> std::ws >> mNInstances;
+ if(mNInstances < 1) {
+ std::ostringstream os;
+ os << "Bad number of instances:" << mNInstances;
+ Error(os.str());
+ }
+ if(GetNInputs() % mNInstances != 0 || GetNOutputs() % mNInstances != 0) {
+ std::ostringstream os;
+ os << "Number of Inputs/Outputs must be divisible by number of instances"
+ << " Inputs:" << GetNInputs()
+ << " Outputs" << GetNOutputs()
+ << " Intances:" << mNInstances;
+ Error(os.str());
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+
+
+ if(mLinearity.Cols() != GetNOutputs() / mNInstances ||
+ mLinearity.Rows() != GetNInputs() / mNInstances ||
+ mBias.Dim() != GetNOutputs() / mNInstances
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+
+ mLinearityCorrection.Init(mLinearity.Rows(),mLinearity.Cols());
+ mBiasCorrection.Init(mBias.Dim());
+
+ mBiasExpand.Init(mBias.Dim()*mNInstances);
+ mBiasCorrectionExpand.Init(mBias.Dim()*mNInstances);
+ }
+
+
+ void
+ CuSharedLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << mNInstances << std::endl;
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
diff --git a/src/CuTNetLib/.svn/text-base/cuSharedLinearity.h.svn-base b/src/CuTNetLib/.svn/text-base/cuSharedLinearity.h.svn-base
new file mode 100644
index 0000000..4aa022a
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuSharedLinearity.h.svn-base
@@ -0,0 +1,85 @@
+#ifndef _CUSHARED_LINEARITY_H_
+#define _CUSHARED_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuSharedLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuSharedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuSharedLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ int mNInstances;
+ CuVector<BaseFloat> mBiasExpand;
+ CuVector<BaseFloat> mBiasCorrectionExpand;
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuSharedLinearity::
+ inline
+ CuSharedLinearity::
+ CuSharedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mNInstances(0)
+ { }
+
+
+ inline
+ CuSharedLinearity::
+ ~CuSharedLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuSharedLinearity::
+ GetType() const
+ {
+ return CuComponent::SHARED_LINEARITY;
+ }
+
+ inline const char*
+ CuSharedLinearity::
+ GetName() const
+ {
+ return "<sharedlinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
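+
+//The weights are shared block-wise: with mNInstances = K the layer input is
+//split into K consecutive slices of nInputs/K features, every slice is
+//multiplied by the same (nInputs/K x nOutputs/K) weight matrix and offset by
+//the same bias. Illustrative numbers (placeholders): for K = 3, a 300-dim
+//input and a 150-dim output the stream holds
+//
+//  3                                          //mNInstances
+//  ...the 100x50 weight matrix, stored transposed (50x100)...
+//  ...the 50-dim shared bias vector...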
diff --git a/src/CuTNetLib/.svn/text-base/cuSparseLinearity.cc.svn-base b/src/CuTNetLib/.svn/text-base/cuSparseLinearity.cc.svn-base
new file mode 100644
index 0000000..2f1159b
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuSparseLinearity.cc.svn-base
@@ -0,0 +1,190 @@
+
+
+#include "cuSparseLinearity.h"
+#include <cmath>
+#include <cstdlib>
+
+
+namespace TNet
+{
+
+ void
+ CuSparseLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.AddScaledRow(1.0,mBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mLinearity, 1.0);
+ }
+
+
+ void
+ CuSparseLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.Gemm('N', 'T', 1.0, X, mLinearity, 0.0);
+ }
+
+
+ void
+ CuSparseLinearity::
+ Update()
+ {
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mLinearityCorrection.Gemm('T','N',1.0,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+ mLinearityCorrectionAccu.AddScaled(1.0,mLinearityCorrection,1.0);
+ mLinearity.ApplyMask(mSparsityMask);
+
+ //L1 regularization (lasso)...
+ //each update? every 1000th update?
+ if(mL1Const > 0) {
+ BaseFloat L1_const = mLearningRate*mL1Const*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.ApplyL1(L1_const);
+ }
+
+ //L2 regularization weight decay (from actual weights only)
+ if(mWeightcost > 0) {
+ BaseFloat L2_decay = -mLearningRate*mWeightcost*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.AddScaled(L2_decay, mLinearity,1.0);
+ }
+
+ mNFrames += GetInput().Rows();
+
+ }
+
+
+ void
+ CuSparseLinearity::
+ UpdateMask()
+ {
+ //move data to host
+ Matrix<BaseFloat> linearity, linearity_correction_accu;
+ Matrix<BaseFloat> sparsity_mask;
+
+ mLinearity.CopyTo(linearity);
+ mLinearityCorrectionAccu.CopyTo(linearity_correction_accu);
+ mSparsityMask.CopyTo(sparsity_mask);
+
+ //decide on new sparsity mask
+ for(size_t r=0; r<sparsity_mask.Rows(); r++) {
+ for(size_t c=0; c<sparsity_mask.Cols(); c++) {
+ if(sparsity_mask(r,c) == 1.0f) { //weight active
+ if(fabs(linearity(r,c)) < mSparsifyWeightThreshold) {
+ sparsity_mask(r,c) = 0;//deactivate
+ linearity(r,c) = 0;
+ }
+ } else { //weight inactive
+ if(fabs(linearity_correction_accu(r,c))/(BaseFloat)mNFrames > mUnsparsifyAccu) {
+ sparsity_mask(r,c) = 1;//activate
+ }
+ }
+ }
+ }
+
+ //move data to the device
+ mLinearity.CopyFrom(linearity);
+ mSparsityMask.CopyFrom(sparsity_mask);
+ }
+
+
+ void
+ CuSparseLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ //sparsity mask
+ rIn >> std::ws;
+ Matrix<BaseFloat> mask_transp;
+ if(rIn.peek() == 'm') {//load from file
+ rIn >> mask_transp;
+ } else {//or set all elements active
+ mask_transp.Init(transpose.Rows(),transpose.Cols());
+ int items=transpose.Rows()*transpose.Stride();
+ BaseFloat* p = mask_transp.pData();
+ for(int i=0; i<items; i++) {//set all elements to one
+ *p++ = 1;
+ }
+ }
+ mSparsityMask.CopyFrom(BfMatrix(mask_transp,TRANS));
+
+ //dummy matrix with acumulated gradients
+ rIn >> std::ws;
+ if(rIn.peek() == 'm') {//load from file
+ BfMatrix dummy;
+ rIn >> dummy;
+ }
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+ if(mLinearity.Cols() != GetNOutputs() ||
+ mLinearity.Rows() != GetNInputs() ||
+ mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+
+ assert(mLinearity.Rows() == mSparsityMask.Rows());
+ assert(mLinearity.Cols() == mSparsityMask.Cols());
+
+ }
+
+
+ void
+ CuSparseLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ UpdateMask();
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ //store mask
+ mSparsityMask.CopyTo(tmp);
+ rOut << BfMatrix(tmp,TRANS);
+ //store accu
+ mLinearityCorrectionAccu.CopyTo(tmp);
+ rOut << BfMatrix(tmp,TRANS);
+
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/.svn/text-base/cuSparseLinearity.h.svn-base b/src/CuTNetLib/.svn/text-base/cuSparseLinearity.h.svn-base
new file mode 100644
index 0000000..c2b6d6f
--- /dev/null
+++ b/src/CuTNetLib/.svn/text-base/cuSparseLinearity.h.svn-base
@@ -0,0 +1,104 @@
+#ifndef _CUSPARSE_LINEARITY_H_
+#define _CUSPARSE_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuSparseLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuSparseLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuSparseLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+ void UpdateMask();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ void L1(BaseFloat l1) {
+ mL1Const = l1;
+ }
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+ CuMatrix<BaseFloat> mSparsityMask; ///< Mask which selects active weights
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ CuMatrix<BaseFloat> mLinearityCorrectionAccu; ///< Accumulator for linearity updates
+
+ BaseFloat mL1Const; ///< L1 regularization constant
+
+ size_t mNFrames; ///< Number of accumulated frames
+ BaseFloat mSparsifyWeightThreshold; ///< Weight magnitude below which an active weight is pruned
+ BaseFloat mUnsparsifyAccu; ///< Accumulated-gradient threshold above which a pruned weight is re-activated
+
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuSparseLinearity::
+ inline
+ CuSparseLinearity::
+ CuSparseLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs,nOutputs), mBias(nOutputs), mSparsityMask(nInputs,nOutputs),
+ mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs),
+ mLinearityCorrectionAccu(nInputs,nOutputs),
+ mNFrames(0), mSparsifyWeightThreshold(1.0e-3),
+ mUnsparsifyAccu(1e20f)
+ {
+ mLinearityCorrection.SetConst(0.0f);
+ mBiasCorrection.SetConst(0.0f);
+ mLinearityCorrectionAccu.SetConst(0.0f);
+ }
+
+
+ inline
+ CuSparseLinearity::
+ ~CuSparseLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuSparseLinearity::
+ GetType() const
+ {
+ return CuComponent::SPARSE_LINEARITY;
+ }
+
+ inline const char*
+ CuSparseLinearity::
+ GetName() const
+ {
+ return "<sparselinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
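+
+//Per-weight behaviour of the sparsification (see UpdateMask in the .cc); the
+//thresholds are the defaults from the inline constructor above:
+//
+//  if mask(r,c)==1 and |w(r,c)| < mSparsifyWeightThreshold (1e-3)
+//      -> mask(r,c)=0, w(r,c)=0                 (prune the weight)
+//  if mask(r,c)==0 and |accu(r,c)|/mNFrames > mUnsparsifyAccu (1e20)
+//      -> mask(r,c)=1                           (re-activate the weight)
+//
+//With the default mUnsparsifyAccu the re-activation branch practically never
+//fires; pruned weights stay at zero since Update() calls ApplyMask() each time.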
diff --git a/src/CuTNetLib/Makefile b/src/CuTNetLib/Makefile
new file mode 100644
index 0000000..c7678da
--- /dev/null
+++ b/src/CuTNetLib/Makefile
@@ -0,0 +1,30 @@
+
+include ../tnet.mk
+
+INCLUDE= -I. -I../ -I../KaldiLib -I../CuBaseLib -I../TNetLib -I$(CUDA_TK_BASE)/include/
+
+
+all : libCuTNet.a
+
+libCuTNet.a : $(OBJ)
+ $(AR) ruv $@ $?
+ $(RANLIB) $@
+
+
+%.o : %.cc
+ $(CXX) -c $< -o $@ $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+
+
+.PHONY: clean depend
+
+clean :
+ rm -f *.o *.a
+
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) > .depend.mk
+
+-include .depend.mk
+
diff --git a/src/CuTNetLib/cuActivation.cc b/src/CuTNetLib/cuActivation.cc
new file mode 100644
index 0000000..bd57ae5
--- /dev/null
+++ b/src/CuTNetLib/cuActivation.cc
@@ -0,0 +1,46 @@
+
+#include "cuActivation.h"
+#include "cumath.h"
+
+
+namespace TNet {
+
+
+ void
+ CuSigmoid::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::Sigmoid(Y, X);
+ }
+
+
+ void
+ CuSigmoid::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::DiffSigmoid(Y, X, mOutput);
+ }
+
+
+
+ void
+ CuSoftmax::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::Softmax(Y,X);
+ }
+
+
+
+ void
+ CuSoftmax::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //we assume X is already dE/dSoftmax_input
+ Y.CopyFrom(X);
+ }
+
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuActivation.h b/src/CuTNetLib/cuActivation.h
new file mode 100644
index 0000000..c66640c
--- /dev/null
+++ b/src/CuTNetLib/cuActivation.h
@@ -0,0 +1,132 @@
+
+#ifndef _CUACT_FUN_I_
+#define _CUACT_FUN_I_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+namespace TNet
+{
+
+ /**
+ * \brief Common interface for activation functions
+ */
+ class CuActivation : public CuComponent
+ {
+ public:
+ CuActivation(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+
+ protected:
+ };
+
+
+ /**
+ * \brief CuSigmoid activation function
+ *
+ * \ingroup CuNNActivation
+ * Implements forward pass: \f[ Y_i=\frac{1}{1+e^{-X_i}} \f]
+ * Error propagation: \f[ E_i=Y_i(1-Y_i)e_i \f]
+ */
+ class CuSigmoid : public CuActivation
+ {
+ public:
+ CuSigmoid(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ };
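+
+  // To make the formulas above concrete, a minimal CPU sketch of the sigmoid
+  // forward/backward pass over plain float arrays (illustration only; the
+  // component itself delegates to CuMath<BaseFloat>::Sigmoid / DiffSigmoid):
+  //
+  //   for(size_t i=0; i<n; i++) {
+  //     y[i]     = 1.0f / (1.0f + expf(-x[i]));    // Y_i = 1/(1+e^{-X_i})
+  //     e_out[i] = y[i] * (1.0f - y[i]) * e_in[i]; // E_i = Y_i(1-Y_i)e_i
+  //   }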
+
+
+ /**
+ * \brief CuSoftmax activation function
+ *
+ * \ingroup CuNNActivation
+ * Implements forward pass: \f[ Y_i=\frac{1}{Z} e^{X_i} \f]
+ * where \f$ Z=\Sigma_{i=0}^{i=N-1} e^{X_i} \f$
+   * Error propagation: \f[ E_i=Y_i e_i - \Sigma_{j=0}^{j=N-1} Y_i Y_j e_j \f]
+ */
+ class CuSoftmax : public CuActivation
+ {
+ public:
+ CuSoftmax(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ };
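+
+  // Likewise, a minimal CPU sketch of the softmax forward pass for one row of
+  // plain floats (illustration only; the component delegates to
+  // CuMath<BaseFloat>::Softmax, and its backward pass is a plain copy because
+  // the incoming error is assumed to be dE/dSoftmax_input already):
+  //
+  //   float Z = 0.0f;
+  //   for(size_t i=0; i<n; i++) { y[i] = expf(x[i]); Z += y[i]; }
+  //   for(size_t i=0; i<n; i++) { y[i] /= Z; }   // Y_i = e^{X_i} / Z
+  //
+  // (a production implementation would subtract the row maximum before expf
+  // for numerical stability)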
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // Inline functions
+ // Activation::
+ inline
+ CuActivation::
+ CuActivation(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuComponent(nInputs,nOutputs, pPred)
+ {
+ assert(nInputs == nOutputs);
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // Inline functions
+ // Sigmoid::
+ inline
+ CuSigmoid::
+ CuSigmoid(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuActivation(nInputs,nOutputs, pPred)
+ { }
+
+ inline CuComponent::ComponentType
+ CuSigmoid::
+ GetType() const
+ {
+ return CuComponent::SIGMOID;
+ }
+
+ inline const char*
+ CuSigmoid::
+ GetName() const
+ {
+ return "<sigmoid>";
+ }
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // Inline functions
+ // Softmax::
+ inline
+ CuSoftmax::
+ CuSoftmax(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuActivation(nInputs,nOutputs, pPred)
+ { }
+
+ inline CuComponent::ComponentType
+ CuSoftmax::
+ GetType() const
+ {
+ return CuComponent::SOFTMAX;
+ }
+
+ inline const char*
+ CuSoftmax::
+ GetName() const
+ {
+ return "<softmax>";
+ }
+
+
+} //namespace
+
+
+#endif
diff --git a/src/CuTNetLib/cuBiasedLinearity.cc b/src/CuTNetLib/cuBiasedLinearity.cc
new file mode 100644
index 0000000..b9ac137
--- /dev/null
+++ b/src/CuTNetLib/cuBiasedLinearity.cc
@@ -0,0 +1,123 @@
+
+
+#include "cuBiasedLinearity.h"
+
+
+namespace TNet
+{
+
+ void
+ CuBiasedLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mLinearity, 1.0);
+ }
+
+
+ void
+ CuBiasedLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, X, mLinearity, 0.0);
+ }
+
+
+ void
+ CuBiasedLinearity::
+ Update()
+ {
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mLinearityCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(-mLearningRate/N,GetErrorInput(),mMomentum);
+
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
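+    //compensate higher gradient estimates due to momentum (same reasoning as in cuDiscreteLinearity.cc)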
+ N *= mmt_gain;
+
+ mLinearityCorrection.Gemm('T','N',1.0,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ BaseFloat L2_decay = -mLearningRate*mWeightcost*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.AddScaled(L2_decay, mLinearity,1.0);
+#endif
+ }
+
+
+ void
+ CuBiasedLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+ if(mLinearity.Cols() != GetNOutputs() ||
+ mLinearity.Rows() != GetNInputs() ||
+ mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+ }
+
+
+ void
+ CuBiasedLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuBiasedLinearity.h b/src/CuTNetLib/cuBiasedLinearity.h
new file mode 100644
index 0000000..d3c3b78
--- /dev/null
+++ b/src/CuTNetLib/cuBiasedLinearity.h
@@ -0,0 +1,98 @@
+#ifndef _CUBIASED_LINEARITY_H_
+#define _CUBIASED_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+ /**
+   * \brief CuBiasedLinearity affine transform (linear layer with bias)
+   *
+   * \ingroup CuNNUpdatable
+   * Implements forward pass: \f[ Y_j=\Sigma_{i=0}^{i=N-1} w_{ij} X_i + \beta_j \f]
+   * Error propagation: \f[ E_i = \Sigma_{j=0}^{j=N-1} w_{ij} e_j \f]
+   *
+   * Weight adjustment: \f[ W_{ij} = (1-D)(w_{ij} - \alpha(1-\mu)x_i e_j - \mu \Delta) \f]
+   * and for the bias: \f[ B_i = \beta_i - \alpha(1-\mu)e_i - \mu \Delta \f]
+   * where
+   * - D is the weight decay => penalizes large weights
+   * - \f$ \alpha \f$ is the learning rate
+   * - \f$ \mu \f$ is the momentum => damps oscillation
+ */
+ class CuBiasedLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuBiasedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuBiasedLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ };
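+
+  // One update step from the formulas above, written out for a single weight
+  // (a CPU-style illustration only; Update() in cuBiasedLinearity.cc performs
+  // the same thing in bulk with Gemm/AddColSum on the GPU):
+  //
+  //   corr_ij = mu * corr_ij + sum_over_bunch(x_i * e_j); // momentum-smoothed gradient
+  //   w_ij   -= (alpha / N) * corr_ij;                    // N = frames in bunch (if mGradDivFrm)
+  //   w_ij   -= alpha * D * w_ij;                         // L2 weight decay
+  //
+  // and analogously beta_j is updated from the column sum of e_j.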
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuBiasedLinearity::
+ inline
+ CuBiasedLinearity::
+ CuBiasedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs,nOutputs), mBias(nOutputs),
+ mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs)
+ {
+ mLinearityCorrection.SetConst(0.0);
+ mBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuBiasedLinearity::
+ ~CuBiasedLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuBiasedLinearity::
+ GetType() const
+ {
+ return CuComponent::BIASED_LINEARITY;
+ }
+
+ inline const char*
+ CuBiasedLinearity::
+ GetName() const
+ {
+ return "<biasedlinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuBlockArray.cc b/src/CuTNetLib/cuBlockArray.cc
new file mode 100644
index 0000000..890a752
--- /dev/null
+++ b/src/CuTNetLib/cuBlockArray.cc
@@ -0,0 +1,138 @@
+
+
+#include "cuBlockArray.h"
+#include "cuNetwork.h"
+
+
+namespace TNet
+{
+
+ void
+ CuBlockArray::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMatrix<BaseFloat> colsX;
+ CuMatrix<BaseFloat> colsY;
+
+ int X_src_ori=0, Y_tgt_ori=0;
+ for(int i=0; i<mNBlocks; i++) {
+ //copy column stripe from the input X
+ int colsX_cnt=mBlocks[i]->GetNInputs();
+ colsX.Init(X.Rows(),colsX_cnt);
+ colsX.CopyCols(colsX_cnt,X_src_ori,X,0);
+
+ //propagate through the block(network)
+ mBlocks[i]->Propagate(colsX,colsY);
+
+ //copy column stripe to the output Y
+ int colsY_cnt=mBlocks[i]->GetNOutputs();
+ Y.CopyCols(colsY_cnt,0,colsY,Y_tgt_ori);
+
+ //shift the origin coordinates
+ X_src_ori += colsX_cnt;
+ Y_tgt_ori += colsY_cnt;
+ }
+
+ assert(X_src_ori == X.Cols());
+ assert(Y_tgt_ori == Y.Cols());
+ }
+
+ /// @todo CuBlockArray::BackpropagateFnc not implemented
+ void
+ CuBlockArray::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+ void
+ CuBlockArray::
+ Update()
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ CuBlockArray::
+ ReadFromStream(std::istream& rIn)
+ {
+ if(mBlocks.size() > 0) {
+ KALDI_ERR << "Cannot read block vector, "
+              << "already filled by "
+              << mBlocks.size()
+              << " elements";
+ }
+
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ //read all the blocks
+ std::string tag;
+ int block_id;
+ for(int i=0; i<mNBlocks; i++) {
+ //read tag <block>
+ rIn >> std::ws >> tag;
+ //make it lowercase
+ std::transform(tag.begin(), tag.end(), tag.begin(), tolower);
+ //check
+ if(tag!="<block>") {
+        KALDI_ERR << "<block> keyword expected";
+ }
+
+ //read block number
+ rIn >> std::ws >> block_id;
+ if(block_id != i+1) {
+ KALDI_ERR << "Expected block number:" << i+1
+ << " read block number: " << block_id;
+ }
+
+ //read the nnet
+ CuNetwork* p_nnet = new CuNetwork;
+ p_nnet->ReadNetwork(rIn);
+ if(p_nnet->Layers() == 0) {
+ KALDI_ERR << "Cannot read empty network to a block";
+ }
+
+ //add it to the vector
+ mBlocks.push_back(p_nnet);
+ }
+
+ //check the declared dimensionality
+ int sum_inputs=0, sum_outputs=0;
+ for(int i=0; i<mNBlocks; i++) {
+ sum_inputs += mBlocks[i]->GetNInputs();
+ sum_outputs += mBlocks[i]->GetNOutputs();
+ }
+ if(sum_inputs != GetNInputs()) {
+ KALDI_ERR << "Non-matching number of INPUTS! Declared:"
+ << GetNInputs()
+                << " summed from blocks: "
+ << sum_inputs;
+ }
+ if(sum_outputs != GetNOutputs()) {
+ KALDI_ERR << "Non-matching number of OUTPUTS! Declared:"
+ << GetNOutputs()
+                << " summed from blocks: "
+ << sum_outputs;
+ }
+ }
+
+
+ void
+ CuBlockArray::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << " " << mBlocks.size() << " ";
+ for(int i=0; i<mBlocks.size(); i++) {
+ rOut << "<block> " << i+1 << "\n";
+ mBlocks[i]->WriteNetwork(rOut);
+ rOut << "<endblock>\n";
+ }
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuBlockArray.h b/src/CuTNetLib/cuBlockArray.h
new file mode 100644
index 0000000..5454538
--- /dev/null
+++ b/src/CuTNetLib/cuBlockArray.h
@@ -0,0 +1,90 @@
+#ifndef _CUBLOCK_ARRAY_H_
+#define _CUBLOCK_ARRAY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuNetwork;
+ /**
+   * \brief Updatable component consisting of several networks
+   *
+   * \ingroup CuNNUpdatable
+   * Each network is individually propagated and backpropagated over its own non-overlapping slice of the input and output.
+   *
+   * This enables multipath topological structures within the network.
+ */
+ class CuBlockArray : public CuUpdatableComponent
+ {
+ public:
+
+ CuBlockArray(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuBlockArray();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ std::vector<CuNetwork*> mBlocks; ///< vector with networks, one network is one block
+ size_t mNBlocks;
+ };
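+
+  // Rough sketch of the payload read/written by ReadFromStream()/WriteToStream()
+  // in cuBlockArray.cc (the enclosing component tag and its dimensions are
+  // handled by CuNetwork and not shown; the block count and nested nets are
+  // illustrative):
+  //
+  //   2
+  //   <block> 1
+  //     ...nested network #1...
+  //   <endblock>
+  //   <block> 2
+  //     ...nested network #2...
+  //   <endblock>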
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuBlockArray::
+ inline
+ CuBlockArray::
+ CuBlockArray(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mNBlocks(0)
+ { }
+
+
+ inline
+ CuBlockArray::
+ ~CuBlockArray()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline CuComponent::ComponentType
+ CuBlockArray::
+ GetType() const
+ {
+ return CuComponent::BLOCK_ARRAY;
+ }
+
+ inline const char*
+ CuBlockArray::
+ GetName() const
+ {
+ return "<blockarray>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuCRBEDctFeat.h b/src/CuTNetLib/cuCRBEDctFeat.h
new file mode 100644
index 0000000..e74a5b6
--- /dev/null
+++ b/src/CuTNetLib/cuCRBEDctFeat.h
@@ -0,0 +1,340 @@
+#ifndef _CUCRBEDCTFEATURES_H_
+#define _CUCRBEDCTFEATURES_H_
+
+
+#include "cuComponent.h"
+#include "cumath.h"
+
+
+namespace TNet {
+
+ /**
+   * \brief Time-context expansion of the input features
+ *
+ * \ingroup CuNNMisc
+ * Expands the time context of the input features according to FrameOffset
+ *
+ * in N, out k*N, FrameOffset o_1,o_2,...,o_k
+ * FrameOffset example 11frames: -5 -4 -3 -2 -1 0 1 2 3 4 5
+ */
+ class CuExpand : public CuComponent
+ {
+ public:
+ CuExpand(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuExpand()
+ { }
+
+ ComponentType GetType() const
+ { return EXPAND; }
+
+ const char* GetName() const
+ { return "<expand>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<int> vec; rIn >> vec; mFrameOffset.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<int> vec; mFrameOffset.CopyTo(vec); rOut << vec; }
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::Expand(Y,X,mFrameOffset); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ CuVector<int> mFrameOffset;
+ };
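+
+  // Worked illustration (assuming, as the comment above implies, that output
+  // row t concatenates input rows t+o_1 ... t+o_k): with FrameOffset [-1 0 1]
+  // and 3-dimensional input features, output row t is the 9-dimensional vector
+  // [ X(t-1,:) X(t,:) X(t+1,:) ] -- in N, out k*N.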
+
+
+
+ /**
+   * \brief Column rearrangement (copy by index)
+ *
+ * \ingroup CuNNMisc
+ * Rearrange the matrix columns according to the indices in CopyFromIndices
+ */
+ class CuCopy : public CuComponent
+ {
+ public:
+ CuCopy(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuCopy()
+ { }
+
+ ComponentType GetType() const
+ { return COPY; }
+
+ const char* GetName() const
+ { return "<copy>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<int> vec; rIn >> vec; vec.Add(-1); mCopyFromIndices.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<int> vec; mCopyFromIndices.CopyTo(vec); vec.Add(1); rOut << vec; }
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::Rearrange(Y,X,mCopyFromIndices); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ CuVector<int> mCopyFromIndices;
+ };
+
+ /**
+ * \brief Perform Feature transpose
+ *
+ * \ingroup CuNNMisc
+ */
+ class CuTranspose : public CuComponent
+ {
+ public:
+ CuTranspose(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred), mContext(0)
+ { }
+
+ ~CuTranspose()
+ { }
+
+ ComponentType GetType() const
+ { return TRANSPOSE; }
+
+ const char* GetName() const
+ { return "<transpose>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> mContext;
+
+ if(GetNInputs() != GetNOutputs()) {
+ Error("Input dim must be same as output dim");
+ }
+ if(GetNInputs() % mContext != 0) {
+ Error("Number of inputs must be divisible by context length");
+ }
+
+ Vector<int> vec(GetNInputs());
+ int channels = GetNInputs() / mContext;
+ for(int i=0, ch=0; ch<channels; ch++) {
+ for(int idx=ch; idx < (int)GetNInputs(); idx+=channels, i++) {
+ assert(i < (int)GetNInputs());
+ vec[i] = idx;
+ }
+ }
+
+ mCopyFromIndices.CopyFrom(vec);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << " " << mContext << "\n"; }
+
+ protected:
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::Rearrange(Y,X,mCopyFromIndices); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ int mContext;
+ CuVector<int> mCopyFromIndices;
+ };
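+
+  // Worked example of the index construction in ReadFromStream() above,
+  // assuming Rearrange() copies column mCopyFromIndices[j] of X into column j
+  // of Y and the input is ordered frame-major (all channels of context step 0,
+  // then of step 1, ...): with mContext = 3 and 6 inputs there are 2 channels
+  // and the loop builds vec = [0 2 4 1 3 5], so
+  //   [ c0t0 c1t0 c0t1 c1t1 c0t2 c1t2 ]  becomes  [ c0t0 c0t1 c0t2 c1t0 c1t1 c1t2 ]
+  // i.e. the context-by-channel block is transposed.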
+
+
+ /**
+   * \brief Used for blockwise multiplication by a
+   * DCT transform loaded from disk
+ *
+ * \ingroup CuNNMisc
+ */
+ class CuBlockLinearity : public CuComponent
+ {
+ public:
+ CuBlockLinearity(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuBlockLinearity()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::BLOCK_LINEARITY; }
+
+ const char* GetName() const
+ { return "<blocklinearity>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { CuMath<BaseFloat>::BlockLinearity(Y,X,mBlockLinearity); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ Matrix<BaseFloat> mat;
+ rIn >> mat;
+ Matrix<BaseFloat> trans(mat,TRANS);
+ mBlockLinearity.CopyFrom(trans);
+
+ if((GetNOutputs() % mBlockLinearity.Cols() != 0) ||
+ (GetNInputs() % mBlockLinearity.Rows() != 0) ||
+ ((GetNOutputs() / mBlockLinearity.Cols()) !=
+ (GetNInputs() / mBlockLinearity.Rows())))
+ {
+ Error("BlockLinearity matrix dimensions must divide IO dims");
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ Matrix<BaseFloat> mat;
+ mBlockLinearity.CopyTo(mat);
+ Matrix<BaseFloat> trans(mat,TRANS);
+ rOut << trans;
+ }
+
+ private:
+ CuMatrix<BaseFloat> mBlockLinearity;
+ };
+
+
+ /**
+ * \brief Bias layer
+ *
+ * \ingroup CuNNMisc
+ * Implements: \f$ \vec{Y}=\vec{X}+\vec{\beta} \f$
+ */
+ class CuBias : public CuComponent
+ {
+ public:
+ CuBias(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuBias()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::BIAS; }
+
+ const char* GetName() const
+ { return "<bias>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); Y.AddScaledRow(1.0, mBias, 1.0); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<BaseFloat> vec; rIn >> vec; mBias.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<BaseFloat> vec; mBias.CopyTo(vec); rOut << vec; }
+
+ private:
+ CuVector<BaseFloat> mBias;
+ };
+
+
+ /**
+ * \brief Column Scaling
+ *
+ * \ingroup CuNNMisc
+   * Scales the input columns by the coefficients in the scaling vector mWindow
+ */
+ class CuWindow : public CuComponent
+ {
+ public:
+ CuWindow(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs, nOutputs, pPred)
+ { }
+
+ ~CuWindow()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::WINDOW; }
+
+ const char* GetName() const
+ { return "<window>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); Y.ScaleCols(mWindow); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { Vector<BaseFloat> vec; rIn >> vec; mWindow.CopyFrom(vec); }
+
+ void WriteToStream(std::ostream& rOut)
+ { Vector<BaseFloat> vec; mWindow.CopyTo(vec); rOut << vec; }
+
+ private:
+ CuVector<BaseFloat> mWindow; ///< Scaling factors
+ };
+
+ /**
+   * \brief Performs the log transform
+ *
+ * \ingroup CuNNMisc
+ * Calculate: \f[ \vec{Y}=\ln \vec{X} \f]
+ */
+ class CuLog : public CuComponent
+ {
+ public:
+ CuLog(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs, nOutputs, pPred)
+ { }
+
+ ~CuLog()
+ { }
+
+
+ ComponentType GetType() const
+ { return CuComponent::LOG; }
+
+ const char* GetName() const
+ { return "<log>"; }
+
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); Y.ApplyLog(); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { }
+
+ void WriteToStream(std::ostream& rOut)
+ { }
+
+ };
+
+}
+
+
+#endif
+
diff --git a/src/CuTNetLib/cuCache.cc b/src/CuTNetLib/cuCache.cc
new file mode 100644
index 0000000..f96b3b1
--- /dev/null
+++ b/src/CuTNetLib/cuCache.cc
@@ -0,0 +1,203 @@
+
+
+#include "cuCache.h"
+#include "cumath.h"
+
+
+
+namespace TNet {
+
+ CuCache::
+ CuCache()
+ : mState(EMPTY), mIntakePos(0), mExhaustPos(0), mDiscarded(0),
+ mRandomized(false), mTrace(0)
+ { }
+
+ CuCache::
+ ~CuCache()
+ { }
+
+ void
+ CuCache::
+ Init(size_t cachesize, size_t bunchsize)
+ {
+ if((cachesize % bunchsize) != 0) {
+      Error("cachesize must be divisible by bunchsize");
+ }
+
+ mCachesize = cachesize;
+ mBunchsize = bunchsize;
+
+ mState = EMPTY;
+
+ mIntakePos = 0;
+ mExhaustPos = 0;
+
+ mRandomized = false;
+
+ }
+
+ void
+ CuCache::
+ AddData(const CuMatrix<BaseFloat>& rFeatures, const CuMatrix<BaseFloat>& rDesired)
+ {
+ assert(rFeatures.Rows() == rDesired.Rows());
+
+ //lazy buffers allocation
+ if(mFeatures.Rows() != mCachesize) {
+ mFeatures.Init(mCachesize,rFeatures.Cols());
+ mDesired.Init(mCachesize,rDesired.Cols());
+ }
+
+ //warn if segment longer than half-cache
+ if(rFeatures.Rows() > mCachesize/2) {
+ std::ostringstream os;
+      os << "Segment is longer than half of the feature cache!"
+ << " cachesize: " << mCachesize
+ << " segmentsize: " << rFeatures.Rows();
+ Warning(os.str());
+ }
+
+ //change state
+ if(mState == EMPTY) {
+ if(mTrace&3) std::cout << "/" << std::flush;
+ mState = INTAKE; mIntakePos = 0;
+
+ //check for leftover from previous segment
+ int leftover = mFeaturesLeftover.Rows();
+ //check if leftover is not bigger than cachesize
+ if(leftover > mCachesize) {
+ std::ostringstream os;
+ os << "Too small feature cache: " << mCachesize
+ << ", truncating: "
+ << leftover - mCachesize << " frames from previous segment leftover";
+ //Error(os.str());
+ Warning(os.str());
+ leftover = mCachesize;
+ }
+ //prefill cache with leftover
+ if(leftover > 0) {
+ mFeatures.CopyRows(leftover,0,mFeaturesLeftover,0);
+ mDesired.CopyRows(leftover,0,mDesiredLeftover,0);
+ mFeaturesLeftover.Destroy();
+ mDesiredLeftover.Destroy();
+ mIntakePos += leftover;
+ }
+ }
+
+ assert(mState == INTAKE);
+ assert(rFeatures.Rows() == rDesired.Rows());
+ if(mTrace&2) std::cout << "F" << std::flush;
+
+ int cache_space = mCachesize - mIntakePos;
+ int feature_length = rFeatures.Rows();
+ int fill_rows = (cache_space<feature_length)? cache_space : feature_length;
+ int leftover = feature_length - fill_rows;
+
+ assert(cache_space > 0);
+
+ //copy the data to cache
+ mFeatures.CopyRows(fill_rows,0,rFeatures,mIntakePos);
+ mDesired.CopyRows(fill_rows,0,rDesired,mIntakePos);
+
+ //copy leftovers
+ if(leftover > 0) {
+ mFeaturesLeftover.Init(leftover,mFeatures.Cols());
+ mDesiredLeftover.Init(leftover,mDesired.Cols());
+ mFeaturesLeftover.CopyRows(leftover,fill_rows,rFeatures,0);
+ mDesiredLeftover.CopyRows(leftover,fill_rows,rDesired,0);
+ }
+
+ //update cursor
+ mIntakePos += fill_rows;
+
+ //change state
+ if(mIntakePos == mCachesize) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = FULL;
+ }
+ }
+
+
+
+ void
+ CuCache::
+ Randomize()
+ {
+ assert(mState == FULL || mState == INTAKE);
+
+ if(mTrace&3) std::cout << "R" << std::flush;
+
+    //lazy initialization of the output buffers
+ mFeaturesRandom.Init(mCachesize,mFeatures.Cols());
+ mDesiredRandom.Init(mCachesize,mDesired.Cols());
+
+ //generate random series of integers
+ Vector<int> randmask(mIntakePos);
+ for(unsigned int i=0; i<mIntakePos; i++) {
+ randmask[i]=i;
+ }
+ int* ptr = randmask.pData();
+ std::random_shuffle(ptr, ptr+mIntakePos, GenerateRandom);
+
+ CuVector<int> cu_randmask;
+ cu_randmask.CopyFrom(randmask);
+
+ //randomize
+ CuMath<BaseFloat>::Randomize(mFeaturesRandom,mFeatures,cu_randmask);
+ CuMath<BaseFloat>::Randomize(mDesiredRandom,mDesired,cu_randmask);
+
+ mRandomized = true;
+
+ }
+
+ void
+ CuCache::
+ GetBunch(CuMatrix<BaseFloat>& rFeatures, CuMatrix<BaseFloat>& rDesired)
+ {
+ if(mState == EMPTY) {
+ Error("GetBunch on empty cache!!!");
+ }
+
+ //change state if full...
+ if(mState == FULL) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ //final cache is not completely filled
+ if(mState == INTAKE) //&& mpFeatures->EndOfList()
+ {
+ if(mTrace&3) std::cout << "\\-LAST\n" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ assert(mState == EXHAUST);
+
+ //init the output
+ rFeatures.Init(mBunchsize,mFeatures.Cols());
+ rDesired.Init(mBunchsize,mDesired.Cols());
+
+ //copy the output
+ if(mRandomized) {
+ rFeatures.CopyRows(mBunchsize,mExhaustPos,mFeaturesRandom,0);
+ rDesired.CopyRows(mBunchsize,mExhaustPos,mDesiredRandom,0);
+ } else {
+ rFeatures.CopyRows(mBunchsize,mExhaustPos,mFeatures,0);
+ rDesired.CopyRows(mBunchsize,mExhaustPos,mDesired,0);
+ }
+
+ //update cursor
+ mExhaustPos += mBunchsize;
+
+ //change state to EMPTY
+ if(mExhaustPos > mIntakePos-mBunchsize) {
+ //we don't have more complete bunches...
+ mDiscarded += mIntakePos - mExhaustPos;
+
+ mState = EMPTY;
+ }
+ }
+
+
+}
diff --git a/src/CuTNetLib/cuCache.h b/src/CuTNetLib/cuCache.h
new file mode 100644
index 0000000..42d9b4d
--- /dev/null
+++ b/src/CuTNetLib/cuCache.h
@@ -0,0 +1,94 @@
+#ifndef _CUCACHE_H_
+#define _CUCACHE_H_
+
+#include "cumatrix.h"
+
+namespace TNet {
+
+
+ /**
+ * \brief The feature-target pair cache
+ *
+ * \ingroup CuNNComp
+   * Preloads mCachesize feature and label vectors into GPU memory.
+   *
+   * On every iteration, mBunchsize vectors are handed to the network.
+   *
+   * When the cache is offered more data than it can hold,
+   * the extra vectors are stored as leftovers and are moved
+   * into the cache the next time it is filled.
+   *
+   * Note:
+   * - The cache size must be divisible by the bunch size to ensure proper functionality.
+   * - Once extraction has started, the cache must be depleted before filling resumes; extraction always starts at position zero.
+   * - The cache must be filled before extraction of data begins, otherwise filling cannot restart and discarding data is harder to avoid.
+ * - @todo Why not implement CuCache as a Stack instead of a Queue?
+ * .
+ */
+ class CuCache {
+ typedef enum { EMPTY, INTAKE, FULL, EXHAUST } State;
+ public:
+ CuCache();
+ ~CuCache();
+
+ /// Initialize the cache
+ void Init(size_t cachesize, size_t bunchsize);
+
+    /// Add a segment of data to the cache (extra rows are kept as leftovers for the next fill)
+ /// \param[in] rFeatures CuNN Input features data
+ /// \param[in] rDesired CuNN features data label
+ void AddData(const CuMatrix<BaseFloat>& rFeatures, const CuMatrix<BaseFloat>& rDesired);
+ /// Randomizes the cache
+ void Randomize();
+ /// Get the bunch of training data
+ /// \param[out] rFeatures Bunchsize of CuNN Input features data
+ /// \param[out] rDesired Bunchsize of CuNN features data label
+ void GetBunch(CuMatrix<BaseFloat>& rFeatures, CuMatrix<BaseFloat>& rDesired);
+
+
+ /// Returns true if the cache was completely filled
+ bool Full()
+ { return (mState == FULL); }
+
+ /// Returns true if the cache is empty
+ bool Empty()
+ { return (mState == EMPTY || mIntakePos < mBunchsize); }
+
+ /// Number of discarded frames
+ int Discarded()
+ { return mDiscarded; }
+
+ /// Set the trace message level
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ private:
+
+ static long int GenerateRandom(int max)
+ { return lrand48() % max; }
+
+ State mState; ///< Current state of the cache
+
+ size_t mIntakePos; ///< Number of intaken vectors by AddData
+ size_t mExhaustPos; ///< Number of exhausted vectors by GetBunch
+
+ size_t mCachesize; ///< Size of cache
+ size_t mBunchsize; ///< Size of bunch
+ int mDiscarded; ///< Number of discarded frames
+
+ CuMatrix<BaseFloat> mFeatures; ///< Feature cache
+ CuMatrix<BaseFloat> mFeaturesRandom; ///< Feature cache
+ CuMatrix<BaseFloat> mFeaturesLeftover; ///< Feature cache
+
+ CuMatrix<BaseFloat> mDesired; ///< Desired vector cache
+ CuMatrix<BaseFloat> mDesiredRandom; ///< Desired vector cache
+ CuMatrix<BaseFloat> mDesiredLeftover; ///< Desired vector cache
+
+ bool mRandomized;
+
+ int mTrace;
+ };
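+
+  // Minimal usage sketch (the data source and the training step are
+  // hypothetical placeholders, not part of this class):
+  //
+  //   CuCache cache;
+  //   cache.Init(16384, 256);            // cachesize divisible by bunchsize
+  //   cache.AddData(feats, targets);     // repeat until cache.Full()
+  //   cache.Randomize();
+  //   CuMatrix<BaseFloat> f, t;
+  //   while(!cache.Empty()) {
+  //     cache.GetBunch(f, t);            // one bunch (mini-batch)
+  //     // ...forward/backward pass over f and t...
+  //   }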
+
+}
+
+#endif
diff --git a/src/CuTNetLib/cuCompDisc.cc b/src/CuTNetLib/cuCompDisc.cc
new file mode 100644
index 0000000..2336a86
--- /dev/null
+++ b/src/CuTNetLib/cuCompDisc.cc
@@ -0,0 +1,178 @@
+
+
+#include "cuCompDisc.h"
+#include "cuNetwork.h"
+
+#include "Error.h"
+
+
+namespace TNet
+{
+
+ void
+ CuDiscrete::
+ Propagate()
+ {
+ for (int i=0;i<inID.size(); i++)
+ mBlocks[inID[i].block]->SetInput(GetInput(i),inID[i].pos);
+ for (int i=0; i<mBlocks.size(); i++)
+ mBlocks[i]->Propagate();
+ }
+
+ void
+ CuDiscrete::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Error("Not applicable");
+ }
+
+ void
+ CuDiscrete::
+ Backpropagate()
+ {
+ for (int i=0;i<outID.size(); i++)
+ mBlocks[outID[i].block]->SetErrorInput(GetOutput(i),outID[i].pos);
+ for(int i=0; i<mBlocks.size(); i++)
+ mBlocks[i]->Backpropagate();
+ }
+
+ void
+ CuDiscrete::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Error("Not applicable");
+ }
+
+ void
+ CuDiscrete::
+ Update()
+ {
+ for(int i=0; i<mBlocks.size(); i++)
+ if ( mBlocks[i]->IsUpdatable() )
+ {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(*mBlocks[i]);
+ rComp.Update();
+ }
+ }
+
+
+ void
+ CuDiscrete::
+ ReadFromStream(std::istream& rIn)
+ {
+ int i;
+ for(i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ inID.clear();
+ outID.clear();
+ CuComponent* comp;
+ i=0;
+ while ( NULL != (comp=CuNetwork::ComponentReader(rIn,NULL)) )
+ {
+ mBlocks.push_back(comp);
+ for (int j=0;j<(comp->GetInSect());++j)
+ inID.push_back(posID(i,j));
+ for (int j=0;j<(comp->GetOutSect());++j)
+ outID.push_back(posID(i,j));
+ ++i;
+ }
+ }
+
+
+ void
+ CuDiscrete::
+ WriteToStream(std::ostream& rOut)
+ {
+ for(int i=0; i<mBlocks.size(); i++)
+ CuNetwork::ComponentDumper(rOut,*mBlocks[i]);
+ rOut << "<endblock>\n";
+ }
+
+ void
+ CuCompound::
+ PropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ int iLoc=0,oLoc=0;
+ CuMatrix<BaseFloat> In;
+ CuMatrix<BaseFloat> Out;
+ for(int i=0; i<mBlocks.size(); i++)
+ {
+ In.Init(X,iLoc,mBlocks[i]->GetNInputs());
+ Out.Init(Y,oLoc,mBlocks[i]->GetNOutputs());
+ mBlocks[i]->PropagateF(In,Out);
+ iLoc+=mBlocks[i]->GetNInputs();
+ oLoc+=mBlocks[i]->GetNOutputs();
+ }
+ }
+
+ void
+ CuCompound::
+ BackpropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ int iLoc=0,oLoc=0;
+ CuMatrix<BaseFloat> In;
+ CuMatrix<BaseFloat> Out;
+ for(int i=0; i<mBlocks.size(); i++)
+ {
+ In.Init(X,iLoc,mBlocks[i]->GetNOutputs());
+ Out.Init(Y,oLoc,mBlocks[i]->GetNInputs());
+ mBlocks[i]->BackpropagateF(In,Out);
+ iLoc+=mBlocks[i]->GetNOutputs();
+ oLoc+=mBlocks[i]->GetNInputs();
+ }
+ }
+
+ void
+ CuCompound::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Error("Not applicable");
+ }
+
+ void
+ CuCompound::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Error("Not applicable");
+ }
+
+ void
+ CuCompound::
+ Update()
+ {
+ for(int i=0; i<mBlocks.size(); i++)
+ if ( mBlocks[i]->IsUpdatable() )
+ {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(*mBlocks[i]);
+ rComp.Update();
+ }
+ }
+
+
+ void
+ CuCompound::
+ ReadFromStream(std::istream& rIn)
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ CuComponent* comp;
+ while ( NULL != (comp=CuNetwork::ComponentReader(rIn,NULL)) )
+ mBlocks.push_back(comp);
+ }
+
+
+ void
+ CuCompound::
+ WriteToStream(std::ostream& rOut)
+ {
+ for(int i=0; i<mBlocks.size(); i++)
+ CuNetwork::ComponentDumper(rOut,*mBlocks[i]);
+ rOut << "<endblock>\n";
+ }
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuCompDisc.h b/src/CuTNetLib/cuCompDisc.h
new file mode 100644
index 0000000..5b3232e
--- /dev/null
+++ b/src/CuTNetLib/cuCompDisc.h
@@ -0,0 +1,288 @@
+#ifndef _CUCOMPDISC_H_
+#define _CUCOMPDISC_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+#include "cuNetwork.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+#include "Error.h"
+
+
+namespace TNet {
+
+ /**
+   * \brief A layer of updatable components
+   *
+   * \ingroup CuNNUpdatable
+   * Each component is individually propagated and backpropagated with discrete inputs and outputs.
+   *
+   * This enables multipath topological structures within the network, built up layer by layer.
+ */
+
+ class CuLumpUpdatable : public CuUpdatableComponent
+ {
+ public:
+ CuLumpUpdatable(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred)
+ { }
+
+ void LearnRate(BaseFloat rate)
+ {
+ mLearningRate = rate;
+ for (int i=0;i<mBlocks.size();++i)
+ if ( mBlocks[i]->IsUpdatable() )
+ {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(*mBlocks[i]);
+ rComp.LearnRate(rate);
+ }
+ }
+
+
+ void Momentum(BaseFloat mmt)
+ {
+ mMomentum = mmt;
+ for (int i=0;i<mBlocks.size();++i)
+ if ( mBlocks[i]->IsUpdatable() )
+ {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(*mBlocks[i]);
+ rComp.Momentum(mmt);
+ }
+ }
+
+ void Weightcost(BaseFloat cost)
+ {
+ mWeightcost = cost;
+ for (int i=0;i<mBlocks.size();++i)
+ if ( mBlocks[i]->IsUpdatable() )
+ {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(*mBlocks[i]);
+ rComp.Weightcost(cost);
+ }
+ }
+
+ void GradDivFrm(bool div)
+ {
+ mGradDivFrm = div;
+ for (int i=0;i<mBlocks.size();++i)
+ if ( mBlocks[i]->IsUpdatable() )
+ {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(*mBlocks[i]);
+ rComp.GradDivFrm(div);
+ }
+ }
+
+ protected:
+ std::vector< CuComponent* > mBlocks; ///< vector with component, one component is one block
+ };
+
+ /**
+   * \brief A layer of updatable components
+   *
+   * \ingroup CuNNUpdatable
+   * Each component is individually propagated and backpropagated, with all inputs and outputs packed into one matrix to save space.
+ *
+ */
+
+ class CuDiscrete : public CuLumpUpdatable
+ {
+ public:
+
+ typedef struct posID{ int block,pos; posID(int b, int p):block(b),pos(p){}} posID;
+
+
+ CuDiscrete(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuDiscrete();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void Propagate();
+ void Backpropagate();
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ int GetInSect()
+ {
+ return inID.size();
+ }
+
+ int GetOutSect()
+ {
+ return outID.size();
+ }
+
+ CuComponent* FindInput(int &pos)
+ {
+ if (pos<0 or pos>=inID.size())
+ Error("Position out of bound");
+ int i=pos;
+ pos=inID[i].pos;
+ return mBlocks[inID[i].block];
+ }
+
+ CuComponent* FindOutput(int &pos)
+ {
+ if (pos<0 or pos>=outID.size())
+ Error("Position out of bound");
+ int i=pos;
+ pos=outID[i].pos;
+ return mBlocks[outID[i].block];
+ }
+
+ /// IO Data getters
+ CuMatrix<BaseFloat>& GetInput(int pos=0)
+ {
+ if (preComp!=NULL)
+ return preComp->GetOutput(pos);
+ return *mpInput;
+ }
+ CuMatrix<BaseFloat>& GetOutput(int pos=0)
+ {
+ CuComponent* pComp=FindOutput(pos);
+ return pComp->GetOutput(pos);
+ }
+ CuMatrix<BaseFloat>& GetErrorInput(int pos=0)
+ {
+ if (nxtComp!=NULL)
+ return nxtComp->GetErrorOutput(pos);
+ return *mpErrorInput;
+ }
+ CuMatrix<BaseFloat>& GetErrorOutput(int pos=0)
+ {
+ CuComponent* pComp=FindInput(pos);
+ return pComp->GetErrorOutput(pos);
+ }
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(CuMatrix<BaseFloat>& rInput,int pos=0)
+ {
+ if (pos==0)
+ mpInput=&rInput;
+ CuComponent* pComp=FindInput(pos);
+ pComp->SetInput(rInput,pos);
+ }
+ /// Set error input vector (bind with the following NetworkComponent)
+ void SetErrorInput(CuMatrix<BaseFloat>& rErrorInput,int pos=0)
+ {
+ if (pos==0)
+ mpErrorInput=&rErrorInput;
+ CuComponent* pComp=FindOutput(pos);
+ pComp->SetErrorInput(rErrorInput,pos);
+ }
+ private:
+ std::vector< CuComponent* > mBlocks;
+ std::vector< posID > inID,outID;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuDiscrete::
+ inline
+ CuDiscrete::
+ CuDiscrete(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuLumpUpdatable(nInputs, nOutputs, pPred)
+ { }
+
+
+ inline
+ CuDiscrete::
+ ~CuDiscrete()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline CuComponent::ComponentType
+ CuDiscrete::
+ GetType() const
+ {
+ return CuComponent::DISCRETE;
+ }
+
+ inline const char*
+ CuDiscrete::
+ GetName() const
+ {
+ return "<discrete>";
+ }
+
+ class CuCompound : public CuLumpUpdatable
+ {
+ public:
+
+ CuCompound(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuCompound();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ void PropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ protected:
+
+ std::vector< CuComponent* > mBlocks; ///< vector with component, one component is one block
+
+ };
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuLinearity::
+ inline
+ CuCompound::
+ CuCompound(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuLumpUpdatable(nInputs, nOutputs, pPred)
+ { }
+
+
+ inline
+ CuCompound::
+ ~CuCompound()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline CuComponent::ComponentType
+ CuCompound::
+ GetType() const
+ {
+ return CuComponent::COMPOUND;
+ }
+
+ inline const char*
+ CuCompound::
+ GetName() const
+ {
+ return "<compound>";
+ }
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuComponent.h b/src/CuTNetLib/cuComponent.h
new file mode 100644
index 0000000..6cc8462
--- /dev/null
+++ b/src/CuTNetLib/cuComponent.h
@@ -0,0 +1,505 @@
+#ifndef _CUNETWORK_COMPONENT_I_H
+#define _CUNETWORK_COMPONENT_I_H
+
+
+#include "Vector.h"
+#include "Matrix.h"
+#include "Error.h"
+
+#include "cumatrix.h"
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+/// \defgroup CuNNLayer CuNN Layer types
+/// \ingroup CuNNComp
+
+/// \defgroup CuNNUpdatable CuNN Updatable Layer
+/// \ingroup CuNNLayer
+
+/// \defgroup CuNNActivation CuNN Activation Func Layer
+/// \ingroup CuNNLayer
+
+/// \defgroup CuNNMisc CuNN Misc Layer
+/// \ingroup CuNNLayer
+
+namespace TNet {
+
+
+ /**
+   * \brief Neural network building block
+   *
+   * Basic element of the network:
+   * a box with defined inputs and outputs
+   * and functions to refresh the outputs.
+   *
+   * It can compute the transformation function (forward pass)
+   * and the Jacobian function (backward pass),
+   * both of which are implemented in the descendants.
+   *
+   * Components together form a doubly-linked list.
+   *
+   * Each input row is one frame of data,
+   * and the input size is the number of columns.
+ */
+ class CuComponent
+ {
+ public:
+ /// Types of the net components
+ typedef enum {
+ UPDATABLE_COMPONENT = 0x0100,
+ BIASED_LINEARITY,
+ DISCRETE_LINEARITY,
+ SHARED_LINEARITY,
+ SPARSE_LINEARITY,
+ RBM,
+ RBM_SPARSE,
+ RECURRENT,
+ LINEARITY,
+ UPDATABLEBIAS,
+ DISCRETE,
+ COMPOUND,
+
+ ACT_FUN = 0x0200,
+ SOFTMAX,
+ SIGMOID,
+
+ OTHER = 0x0400,
+ EXPAND,
+ COPY,
+ TRANSPOSE,
+ BLOCK_LINEARITY,
+ WINDOW,
+ BIAS,
+ LOG,
+ PIPE,
+ LEARNSTOP,
+ DISTRIB,
+ COMBINE,
+ DIVIDE,
+ MERGE,
+ REORDER,
+
+ BLOCK_ARRAY,
+ } ComponentType;
+
+ typedef std::vector< CuMatrix<BaseFloat>* > MatrixPtrVec;
+
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ CuComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ virtual ~CuComponent();
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+ /// Get Type Identification of the component
+ virtual ComponentType GetType() const = 0;
+ /// Get Type Label of the component
+ virtual const char* GetName() const = 0;
+ /// Return if the component is UpdatableComponent
+ virtual bool IsUpdatable() const
+ { return false; }
+
+ /// Get size of input vectors
+ size_t GetNInputs() const;
+ /// Get size of output vectors
+ size_t GetNOutputs() const;
+
+ /// Set size of input vectors
+ size_t SetNInputs(size_t nInputs);
+ /// Set size of output vectors
+ size_t SetNOutputs(size_t nOutputs);
+ /// Set the previous component
+ void SetPrevious(CuComponent* pPred);
+ /// Set the next component
+ void SetNext(CuComponent* pNxt);
+
+ /// Return the number of different inputs for complex component
+ int GetInSect();
+ /// Return the number of different outputs for complex component
+ int GetOutSect();
+
+ /// IO Data getters
+ CuMatrix<BaseFloat>& GetInput(int pos=0);
+ CuMatrix<BaseFloat>& GetOutput(int pos=0);
+ CuMatrix<BaseFloat>& GetErrorInput(int pos=0);
+ CuMatrix<BaseFloat>& GetErrorOutput(int pos=0);
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(CuMatrix<BaseFloat>& rInput,int pos=0);
+ /// Set error input vector (bind with the following NetworkComponent)
+ void SetErrorInput(CuMatrix<BaseFloat>& rErrorInput,int pos=0);
+
+    /// Perform forward pass propagation Input->Output,
+    /// wrapper for the PropagateFnc method
+    void Propagate();
+    /// Perform backward pass propagation ErrorInput->ErrorOutput,
+ /// wrapper for the BackpropagateFnc method
+ void Backpropagate();
+
+ /// Reads the component parameters from stream
+ virtual void ReadFromStream(std::istream& rIn) { }
+ /// Writes the components parameters to stream
+ virtual void WriteToStream(std::ostream& rOut) { }
+
+ /// Public wrapper for PropagateFnc
+ void PropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ /// Public wrapper for BackpropagateFnc
+ void BackpropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+
+ ///////////////////////////////////////////////////////////////
+ // Nonpublic member functions used to update data outputs
+ protected:
+    /// Forward pass transformation (to be implemented by descendants...)
+ /// \param[in] X InputMatrix (Network input or Output from last layer)
+ /// \param[out] Y OutputMatrix (Network output or input of the next layer)
+ virtual void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y) = 0;
+    /// Backward pass transformation (to be implemented by descendants...)
+ /// \param[in] X InputMatrix (Network Error, objective func output, or Error output from the next layer)
+ /// \param[out] Y OutputMatrix (Error input of the last layer)
+ virtual void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y) = 0;
+
+
+ ///////////////////////////////////////////////////////////////
+ // data members
+ protected:
+
+ size_t mNInputs; ///< Size of input vectors
+ size_t mNOutputs; ///< Size of output vectors
+
+ CuMatrix<BaseFloat>* mpInput; ///< inputs are NOT OWNED by component
+ CuMatrix<BaseFloat>* mpErrorInput;///< inputs are NOT OWNED by component
+
+ CuMatrix<BaseFloat> mOutput; ///< outputs are OWNED by component
+ CuMatrix<BaseFloat> mErrorOutput; ///< outputs are OWNED by component
+
+ CuComponent* preComp;///< The preceding component in the Network
+ CuComponent* nxtComp;///< The following component in the Network
+ };
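+
+  // Minimal chaining sketch (illustration only -- in practice CuNetwork builds
+  // and double-links the components while reading a network file, and the
+  // parameters are loaded via ReadFromStream() before propagating):
+  //
+  //   CuBiasedLinearity lin(in_dim, hid_dim, NULL);   // head of the chain
+  //   CuSigmoid         sig(hid_dim, hid_dim, &lin);  // double-linked to lin
+  //   lin.SetInput(feats);                            // feats: frames x in_dim
+  //   lin.Propagate();
+  //   sig.Propagate();                                // consumes lin's output
+  //   const CuMatrix<BaseFloat>& out = sig.GetOutput();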
+
+
+ /**
+ * \brief Class UpdatableComponent is a box which has some
+ * parameters adjustable by learning
+ *
+ * you can set the learning rate, lock the params,
+ * and learn from each data observation
+ */
+ class CuUpdatableComponent : public CuComponent
+ {
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ CuUpdatableComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ virtual ~CuUpdatableComponent();
+
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+    /// Returns true, since CuUpdatableComponent is updatable
+ virtual bool IsUpdatable() const
+ { return true; }
+
+ /// get gradient and update the parameters in one step
+ virtual void Update() = 0;
+
+ /// Sets the learning rate of gradient descent
+ void LearnRate(BaseFloat rate);
+ /// Gets the learning rate of gradient descent
+ BaseFloat LearnRate();
+
+ /// Sets the momentum
+ void Momentum(BaseFloat mmt);
+ BaseFloat Momentum();
+
+ /// Set the weight decay rate to penalize large weights
+ void Weightcost(BaseFloat cost);
+ BaseFloat Weightcost();
+
+ /// Set whether gradient is divided by frames
+ void GradDivFrm(bool div);
+ bool GradDivFrm();
+
+ protected:
+ BaseFloat mLearningRate;
+ BaseFloat mMomentum;
+ BaseFloat mWeightcost;
+ bool mGradDivFrm;
+
+ };
+
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuComponent::
+ inline
+ CuComponent::
+ CuComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : mNInputs(nInputs), mNOutputs(nOutputs),
+ mpInput(NULL), mpErrorInput(NULL),
+ mOutput(), mErrorOutput(),preComp(pPred)
+ {
+ /* DOUBLE LINK the Components */
+ if (pPred != NULL) {
+ SetPrevious(pPred);
+ pPred->SetNext(this);
+ }
+ }
+
+ inline void
+ CuComponent::
+ SetPrevious(CuComponent* pPred)
+ {
+ preComp=pPred;
+ /* DOUBLE LINK the Components */
+ if (pPred != NULL) {
+ SetInput(pPred->GetOutput());
+ }
+ }
+
+ inline void
+ CuComponent::
+ SetNext(CuComponent* pNxt)
+ {
+ nxtComp=pNxt;
+ if (pNxt != NULL) {
+ SetErrorInput(pNxt->GetErrorOutput());
+ }
+ }
+
+ inline
+ CuComponent::
+ ~CuComponent()
+ {
+ ;
+ }
+
+ inline void
+ CuComponent::
+ Propagate()
+ {
+ //initialize output buffer
+ mOutput.Init(GetInput().Rows(),GetNOutputs());
+ //do the dimensionality test
+ if(GetNInputs() != GetInput().Cols()) {
+ KALDI_ERR << "Non-matching INPUT dim!!! Network dim: " << GetNInputs()
+ << " Data dim: " << GetInput().Cols();
+ }
+ //run transform
+ PropagateF(GetInput(),mOutput);
+ }
+
+
+ inline void
+ CuComponent::
+ Backpropagate()
+ {
+ //re-initialize the output buffer
+ mErrorOutput.Init(GetErrorInput().Rows(),GetNInputs());
+
+ //do the dimensionality test
+ assert(GetErrorInput().Cols() == mNOutputs);
+ assert(mErrorOutput.Cols() == mNInputs);
+ assert(mErrorOutput.Rows() == GetErrorInput().Rows());
+
+ //transform
+ BackpropagateF(GetErrorInput(),mErrorOutput);
+ }
+
+
+ inline void
+ CuComponent::
+ SetInput(CuMatrix<BaseFloat>& rInput,int pos)
+ {
+ mpInput = &rInput;
+ }
+
+
+ inline void
+ CuComponent::
+ SetErrorInput(CuMatrix<BaseFloat>& rErrorInput,int pos)
+ {
+ mpErrorInput = &rErrorInput;
+ }
+
+ inline CuMatrix<BaseFloat>&
+ CuComponent::
+ GetInput(int pos)
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ return *mpInput;
+ }
+
+ inline CuMatrix<BaseFloat>&
+ CuComponent::
+ GetOutput(int pos)
+ {
+ return mOutput;
+ }
+
+ inline CuMatrix<BaseFloat>&
+ CuComponent::
+ GetErrorInput(int pos)
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ return *mpErrorInput;
+ }
+
+ inline CuMatrix<BaseFloat>&
+ CuComponent::
+ GetErrorOutput(int pos)
+ {
+ return mErrorOutput;
+ }
+
+ inline size_t
+ CuComponent::
+ GetNInputs() const
+ {
+ return mNInputs;
+ }
+
+ inline size_t
+ CuComponent::
+ GetNOutputs() const
+ {
+ return mNOutputs;
+ }
+
+ inline int
+ CuComponent::
+ GetInSect()
+ {
+ return 1;
+ }
+
+ inline int
+ CuComponent::
+ GetOutSect()
+ {
+ return 1;
+ }
+
+ inline size_t
+ CuComponent::
+ SetNInputs(size_t nInputs)
+ {
+    mNInputs=nInputs;
+    return mNInputs;
+ }
+
+ inline size_t
+ CuComponent::
+ SetNOutputs(size_t nOutputs)
+ {
+    mNOutputs=nOutputs;
+    return mNOutputs;
+ }
+
+ inline void
+ CuComponent::
+ PropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ PropagateFnc(X,Y);
+ }
+ inline void
+ CuComponent::
+ BackpropagateF(CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ BackpropagateFnc(X,Y);
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // UpdatableComponent::
+
+ inline
+ CuUpdatableComponent::
+ CuUpdatableComponent(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuComponent(nInputs, nOutputs, pPred),
+ mLearningRate(0.0), mMomentum(0), mWeightcost(0), mGradDivFrm(true)
+ {
+ ;
+ }
+
+
+ inline
+ CuUpdatableComponent::
+ ~CuUpdatableComponent()
+ {
+ ;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ LearnRate(BaseFloat rate)
+ {
+ mLearningRate = rate;
+ }
+
+
+ inline BaseFloat
+ CuUpdatableComponent::
+ LearnRate()
+ {
+ return mLearningRate;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ Momentum(BaseFloat mmt)
+ {
+ mMomentum = mmt;
+ }
+
+
+ inline BaseFloat
+ CuUpdatableComponent::
+ Momentum()
+ {
+ return mMomentum;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ Weightcost(BaseFloat cost)
+ {
+ mWeightcost = cost;
+ }
+
+
+ inline BaseFloat
+ CuUpdatableComponent::
+ Weightcost()
+ {
+ return mWeightcost;
+ }
+
+
+ inline void
+ CuUpdatableComponent::
+ GradDivFrm(bool div)
+ {
+ mGradDivFrm = div;
+ }
+
+ inline bool
+ CuUpdatableComponent::
+ GradDivFrm()
+ {
+ return mGradDivFrm;
+ }
+
+} // namespace TNet
+
+
+#endif
diff --git a/src/CuTNetLib/cuConcatenate.cc b/src/CuTNetLib/cuConcatenate.cc
new file mode 100644
index 0000000..f80fe9b
--- /dev/null
+++ b/src/CuTNetLib/cuConcatenate.cc
@@ -0,0 +1,138 @@
+
+
+#include "cuConcatenate.h"
+#include "cuNetwork.h"
+
+
+namespace TNet
+{
+
+ void
+ CuBlockArray::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMatrix<BaseFloat> colsX;
+ CuMatrix<BaseFloat> colsY;
+
+ int X_src_ori=0, Y_tgt_ori=0;
+ for(int i=0; i<mNBlocks; i++) {
+ //copy column stripe from the input X
+ int colsX_cnt=mBlocks[i]->GetNInputs();
+ colsX.Init(X.Rows(),colsX_cnt);
+ colsX.CopyCols(colsX_cnt,X_src_ori,X,0);
+
+ //propagate through the block(network)
+ mBlocks[i]->Propagate(colsX,colsY);
+
+ //copy column stripe to the output Y
+ int colsY_cnt=mBlocks[i]->GetNOutputs();
+ Y.CopyCols(colsY_cnt,0,colsY,Y_tgt_ori);
+
+ //shift the origin coordinates
+ X_src_ori += colsX_cnt;
+ Y_tgt_ori += colsY_cnt;
+ }
+
+ assert(X_src_ori == X.Cols());
+ assert(Y_tgt_ori == Y.Cols());
+ }
+
+ /// @todo CuBlockArray::BackpropagateFnc not implemented
+ void
+ CuBlockArray::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+ void
+ CuBlockArray::
+ Update()
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ CuBlockArray::
+ ReadFromStream(std::istream& rIn)
+ {
+ if(mBlocks.size() > 0) {
+ KALDI_ERR << "Cannot read block vector, "
+              << "already filled by "
+              << mBlocks.size()
+              << " elements";
+ }
+
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ //read all the blocks
+ std::string tag;
+ int block_id;
+ for(int i=0; i<mNBlocks; i++) {
+ //read tag <block>
+ rIn >> std::ws >> tag;
+ //make it lowercase
+ std::transform(tag.begin(), tag.end(), tag.begin(), tolower);
+ //check
+ if(tag!="<block>") {
+        KALDI_ERR << "<block> keyword expected";
+ }
+
+ //read block number
+ rIn >> std::ws >> block_id;
+ if(block_id != i+1) {
+ KALDI_ERR << "Expected block number:" << i+1
+ << " read block number: " << block_id;
+ }
+
+ //read the nnet
+ CuNetwork* p_nnet = new CuNetwork;
+ p_nnet->ReadNetwork(rIn);
+ if(p_nnet->Layers() == 0) {
+ KALDI_ERR << "Cannot read empty network to a block";
+ }
+
+ //add it to the vector
+ mBlocks.push_back(p_nnet);
+ }
+
+ //check the declared dimensionality
+ int sum_inputs=0, sum_outputs=0;
+ for(int i=0; i<mNBlocks; i++) {
+ sum_inputs += mBlocks[i]->GetNInputs();
+ sum_outputs += mBlocks[i]->GetNOutputs();
+ }
+ if(sum_inputs != GetNInputs()) {
+ KALDI_ERR << "Non-matching number of INPUTS! Declared:"
+ << GetNInputs()
+                << " summed from blocks: "
+ << sum_inputs;
+ }
+ if(sum_outputs != GetNOutputs()) {
+ KALDI_ERR << "Non-matching number of OUTPUTS! Declared:"
+ << GetNOutputs()
+                << " summed from blocks: "
+ << sum_outputs;
+ }
+ }
+
+
+ void
+ CuBlockArray::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << " " << mBlocks.size() << " ";
+ for(int i=0; i<mBlocks.size(); i++) {
+ rOut << "<block> " << i+1 << "\n";
+ mBlocks[i]->WriteNetwork(rOut);
+ rOut << "<endblock>\n";
+ }
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuConcatenate.h b/src/CuTNetLib/cuConcatenate.h
new file mode 100644
index 0000000..5454538
--- /dev/null
+++ b/src/CuTNetLib/cuConcatenate.h
@@ -0,0 +1,90 @@
+#ifndef _CUBLOCK_ARRAY_H_
+#define _CUBLOCK_ARRAY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuNetwork;
+ /**
+   * \brief Updatable component consisting of several networks
+   *
+   * \ingroup CuNNUpdatable
+   * Each network is individually propagated and backpropagated over its own non-overlapping slice of the input and output.
+   *
+   * This enables multipath topological structures within the network.
+ */
+ class CuBlockArray : public CuUpdatableComponent
+ {
+ public:
+
+ CuBlockArray(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuBlockArray();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ std::vector<CuNetwork*> mBlocks; ///< vector with networks, one network is one block
+ size_t mNBlocks;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuBlockArray::
+ inline
+ CuBlockArray::
+ CuBlockArray(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mNBlocks(0)
+ { }
+
+
+ inline
+ CuBlockArray::
+ ~CuBlockArray()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline CuComponent::ComponentType
+ CuBlockArray::
+ GetType() const
+ {
+ return CuComponent::BLOCK_ARRAY;
+ }
+
+ inline const char*
+ CuBlockArray::
+ GetName() const
+ {
+ return "<blockarray>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuDiscreteLinearity.cc b/src/CuTNetLib/cuDiscreteLinearity.cc
new file mode 100644
index 0000000..befde24
--- /dev/null
+++ b/src/CuTNetLib/cuDiscreteLinearity.cc
@@ -0,0 +1,160 @@
+
+
+#include "cuDiscreteLinearity.h"
+#include "cumath.h"
+
+namespace TNet
+{
+
+ void
+ CuDiscreteLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+
+ //precopy bias
+ Y.AddScaledRow(1.0,mBias,0.0);
+
+    //multiply with the matrices
+ int offset_in=0, offset_out=0;
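+    //each block maps its own slice of the input columns onto its own slice of the
+    //output columns, so the blocks together act as a block-diagonal weight matrix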
+ for (int i=0; i<mNBlocks; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N','N', 1.0, X, mLinearity[i], 1.0, Y,
+ offset_in, 0, offset_out);
+ offset_in += mLinearity[i].Rows();
+ offset_out += mLinearity[i].Cols();
+ }
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+
+ int offset_in=0, offset_out=0;
+ for(int i=0; i<mNBlocks; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N', 'T', 1.0, X, mLinearity[i], 0.0, Y,
+ offset_in, 0, offset_out);
+ offset_in += mLinearity[i].Cols();
+ offset_out += mLinearity[i].Rows();
+ }
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ Update()
+ {
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain; //compensate higher gradient estimates due to momentum
+
+ //get gradients of discrete linearities
+ int offset_in=0, offset_out=0;
+ for(int i=0; i<mNBlocks; i++) {
+ CuMath<BaseFloat>::OffsetGemm('T','N',1.0,
+ GetInput(),GetErrorInput(),
+ mMomentum, mLinearityCorrection[i],
+ offset_in,offset_out,0);
+ offset_in += mLinearity[i].Rows();
+ offset_out += mLinearity[i].Cols();
+ }
+ for(int i=0; i<mNBlocks; i++) {
+ //perform update
+ mLinearity[i].AddScaled(-mLearningRate/N,mLinearityCorrection[i],1.0);
+ //regularization weight decay
+ mLinearity[i].AddScaled(-mLearningRate*mWeightcost,mLinearity[i],1.0);
+ }
+
+ //get gradient of bias
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+ //update biases
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ mLinearity.resize(mNBlocks);
+ mLinearityCorrection.resize(mNBlocks);
+
+ int in_dim = 0, out_dim = 0;
+ for(int i=0; i<mNBlocks; i++) {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity[i].CopyFrom(BfMatrix(transpose, TRANS));
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ //allocate training buffers
+ mLinearityCorrection[i].Init(mLinearity[i].Rows(),mLinearity[i].Cols());
+ mLinearityCorrection[i].SetConst(0.0);
+
+ in_dim += transpose.Cols();
+ out_dim += transpose.Rows();
+ }
+
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+ mBiasCorrection.Init(mBias.Dim());
+ mBiasCorrection.SetConst(0.0);
+
+ if(out_dim != GetNOutputs() ||
+ in_dim != GetNInputs() ||
+ mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << in_dim
+ << "linearityRows:" << out_dim
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+ }
+
+
+ void
+ CuDiscreteLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << mNBlocks << "\n";
+ for(int i=0; i< mNBlocks; i++) {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity[i].CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ }
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuDiscreteLinearity.h b/src/CuTNetLib/cuDiscreteLinearity.h
new file mode 100644
index 0000000..8a70fa7
--- /dev/null
+++ b/src/CuTNetLib/cuDiscreteLinearity.h
@@ -0,0 +1,97 @@
+#ifndef _CUDISCRETE_LINEARITY_H_
+#define _CUDISCRETE_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+#include <vector>
+
+
+namespace TNet {
+ /**
+   * \brief CuDiscreteLinearity summation function
+   *
+   * \ingroup CuNNUpdatable
+   * Similar to CuSharedLinearity, except that mBias is not shared and
+   * mLinearity is kept as multiple independent block instances.
+ * \sa CuBiasedLinearity
+ */
+ class CuDiscreteLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuDiscreteLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuDiscreteLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ std::vector<CuMatrix<BaseFloat> > mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ std::vector<CuMatrix<BaseFloat> > mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ size_t mNBlocks;
+
+ };
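+
+  // Assumed on-disk layout for this component (inferred from ReadFromStream /
+  // WriteToStream in cuDiscreteLinearity.cc):
+  //   <discretelinearity> <nOutputs> <nInputs>
+  //   <nBlocks>
+  //   nBlocks weight matrices, each stored transposed as SNet does
+  //   a single bias vector covering all outputs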
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuDiscreteLinearity::
+ inline
+ CuDiscreteLinearity::
+ CuDiscreteLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ //mLinearity(nInputs,nOutputs), mBias(nOutputs),
+ //mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs)
+ mNBlocks(0)
+ {
+ //mLinearityCorrection.SetConst(0.0);
+ //mBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuDiscreteLinearity::
+ ~CuDiscreteLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuDiscreteLinearity::
+ GetType() const
+ {
+ return CuComponent::DISCRETE_LINEARITY;
+ }
+
+ inline const char*
+ CuDiscreteLinearity::
+ GetName() const
+ {
+ return "<discretelinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuLinearity.cc b/src/CuTNetLib/cuLinearity.cc
new file mode 100644
index 0000000..5fb247d
--- /dev/null
+++ b/src/CuTNetLib/cuLinearity.cc
@@ -0,0 +1,107 @@
+
+
+#include "cuLinearity.h"
+
+
+namespace TNet
+{
+
+ void
+ CuLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.Gemm('N','N', 1.0, X, mLinearity, 0.0);
+ }
+
+
+ void
+ CuLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, X, mLinearity, 0.0);
+ }
+
+
+ void
+ CuLinearity::
+ Update()
+ {
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mLinearityCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(-mLearningRate/N,GetErrorInput(),mMomentum);
+
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
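+    //compensate the higher gradient estimates caused by momentum accumulation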
+
+ mLinearityCorrection.Gemm('T','N',1.0,GetInput(),GetErrorInput(),mMomentum);
+
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ BaseFloat L2_decay = -mLearningRate*mWeightcost*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.AddScaled(L2_decay, mLinearity,1.0);
+#endif
+ }
+
+
+ void
+ CuLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+
+ /*if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }*/
+ if(mLinearity.Cols() != GetNOutputs() ||
+ mLinearity.Rows() != GetNInputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "\n";
+ Error(os.str());
+ }
+ }
+
+
+ void
+ CuLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuLinearity.h b/src/CuTNetLib/cuLinearity.h
new file mode 100644
index 0000000..050591d
--- /dev/null
+++ b/src/CuTNetLib/cuLinearity.h
@@ -0,0 +1,94 @@
+#ifndef _CULINEARITY_H_
+#define _CULINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+ /**
+ * \brief CuLinearity summation function
+ *
+ * \ingroup CuNNUpdatable
+   * Implements the forward pass: \f[ Y_j = \sum_{i=0}^{N-1} w_{ij} X_i \f]
+   * (no bias term here; a bias can be added by a separate component such as CuUpdatableBias).
+   * Error propagation: \f[ E_i = \sum_{j} w_{ij} e_j \f]
+   *
+   * Weight adjustment: \f[ w_{ij} \leftarrow (1-D)\big(w_{ij} - \alpha(1-\mu)\,x_i e_j - \mu \Delta_{ij}\big) \f]
+   * where
+   * - \f$ D \f$ is the weight decay => penalizes large weights
+   * - \f$ \alpha \f$ is the learning rate
+   * - \f$ \mu \f$ is the momentum => avoids oscillation
+ */
+ class CuLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuLinearity::
+ inline
+ CuLinearity::
+ CuLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs,nOutputs),
+ mLinearityCorrection(nInputs,nOutputs)
+ {
+ mLinearityCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuLinearity::
+ ~CuLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuLinearity::
+ GetType() const
+ {
+ return CuComponent::LINEARITY;
+ }
+
+ inline const char*
+ CuLinearity::
+ GetName() const
+ {
+ return "<linearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuMisc.h b/src/CuTNetLib/cuMisc.h
new file mode 100644
index 0000000..7319adf
--- /dev/null
+++ b/src/CuTNetLib/cuMisc.h
@@ -0,0 +1,555 @@
+#ifndef _CUMISC_H_
+#define _CUMISC_H_
+
+#include <vector>
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+#include "Error.h"
+
+
+namespace TNet {
+ /**
+   * \brief A pipe for input and error-input propagation (does not incur a copy)
+ *
+ * \ingroup CuNNMisc
+ */
+ class CuPipe : public CuComponent
+ {
+ public:
+ CuPipe(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuPipe()
+ { }
+
+ ComponentType GetType() const
+ { return PIPE; }
+
+ const char* GetName() const
+ { return "<pipe>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ { }
+
+ void WriteToStream(std::ostream& rOut)
+ { }
+
+ void Propagate()
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ mOutput.Init(*mpInput);
+ }
+ void BackPropagate()
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ mErrorOutput.Init(*mpErrorInput);
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X);}
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X); }
+
+ };
+
+ /**
+   * \brief A pipe for input propagation (does not incur a copy) that sets any error to zero
+ *
+ * \ingroup CuNNMisc
+ *
+   * @todo the error has to be set to zero on every pass (set up a common zeroed space?)
+ */
+ class CuLearnStop : public CuComponent
+ {
+ public:
+ CuLearnStop(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuLearnStop()
+ { }
+
+ ComponentType GetType() const
+ { return LEARNSTOP; }
+
+ const char* GetName() const
+ { return "<learnstop>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ { }
+
+ void WriteToStream(std::ostream& rOut)
+ { }
+
+ void Propagate()
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ mOutput.Init(*mpInput);
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X);}
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.SetZero(); }
+
+ };
+
+ /**
+   * \brief Distribute the input to several outputs
+ *
+ * \ingroup CuNNMisc
+ *
+ */
+ class CuDistrib : public CuComponent
+ {
+ public:
+ CuDistrib(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred),size(0),ErrInputVec()
+ {
+ }
+
+ ~CuDistrib()
+ { }
+
+ ComponentType GetType() const
+ { return DISTRIB; }
+
+ const char* GetName() const
+ { return "<distrib>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> size;
+ ErrInputVec.clear();
+ for (int i=0; i<size;++i)
+ ErrInputVec.push_back(NULL);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ rOut<<size<<std::endl;
+ }
+
+ void Propagate()
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ mOutput.Init(*mpInput);
+ }
+
+ int GetOutSect()
+ {
+ return size;
+ }
+
+ CuMatrix<BaseFloat>& GetErrorInput(int pos=0)
+ {
+ if (pos>=0 && pos<size)
+ return *ErrInputVec[pos];
+ return *ErrInputVec[0];
+ }
+
+ void SetErrorInput(CuMatrix<BaseFloat>& rErrorInput,int pos=0)
+ {
+ if (pos==0)
+ mpErrorInput=&rErrorInput;
+ if (pos>=0 && pos<size)
+ ErrInputVec[pos]=&rErrorInput;
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ { Y.CopyFrom(X);}
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
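+      //sum the error matrices coming back from every consumer of the distributed output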
+ Y.SetZero();
+ for (int i=0;i<size;++i)
+ Y.AddScaled(1.0,*ErrInputVec[i],1.0);
+ }
+
+ int size;
+ MatrixPtrVec ErrInputVec;
+ Vector<BaseFloat> Scale;
+ };
+
+ /**
+   * \brief Combine (add) several inputs together
+ *
+ * \ingroup CuNNMisc
+ *
+ */
+ class CuCombine : public CuComponent
+ {
+ public:
+ CuCombine(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred),size(0),InputVec()
+ {
+ }
+
+ ~CuCombine()
+ { }
+
+ ComponentType GetType() const
+ { return COMBINE; }
+
+ const char* GetName() const
+ { return "<combine>"; }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> size;
+ InputVec.clear();
+ for (int i=0; i<size;++i)
+ InputVec.push_back(NULL);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ rOut<<size<<std::endl;
+ }
+
+ void Backpropagate()
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ mErrorOutput.Init(*mpErrorInput);
+ }
+
+ int GetInSect()
+ {
+ return size;
+ }
+
+ /// IO Data getters
+ CuMatrix<BaseFloat>& GetInput(int pos=0)
+ {
+ if (pos>=0 && pos<size)
+ return *InputVec[pos];
+ return *InputVec[0];
+ }
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(CuMatrix<BaseFloat>& rInput,int pos=0)
+ {
+ if (pos==0)
+ mpInput=&rInput;
+ if (pos>=0 && pos<size)
+ InputVec[pos]=&rInput;
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.SetZero();
+ for (int i=0;i<size;++i)
+ Y.AddScaled(1.0,*InputVec[i],1.0);
+ }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.CopyFrom(X);
+ }
+
+ int size;
+ MatrixPtrVec InputVec;
+ };
+
+ /**
+   * \brief Divide the input matrix into several outputs
+ *
+ * \ingroup CuNNMisc
+ *
+ */
+ class CuDivide : public CuComponent
+ {
+ public:
+ CuDivide(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred),size(0)
+ { }
+
+ ~CuDivide()
+ { }
+
+ ComponentType GetType() const
+ { return DIVIDE; }
+
+ const char* GetName() const
+ { return "<divide>"; }
+
+ int GetOutSect()
+ {
+ return size;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ int len;
+ for (int i=0; i<size;++i)
+ delete OutputVec[i];
+ rIn >> std::ws >> size;
+ OutputVec.clear();
+ for (int i=0; i<size;++i)
+ {
+ rIn>>len;
+ OutputVec.push_back(new CuMatrix<BaseFloat>());
+ SectLen.push_back(len);
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ rOut<<size<<" ";
+ for (int i=0; i<size;++i)
+ rOut<<SectLen[i]<<" ";
+ rOut<<std::endl;
+ }
+
+ void Propagate()
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ int loc=0;
+ for (int i=0;i<size;++i)
+ {
+ OutputVec[i]->Init(*mpInput,loc,SectLen[i]);
+ loc+=SectLen[i];
+ }
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ int loc=0;
+ for (int i=0;i<size;++i)
+ {
+ Y.CopyCols(SectLen[i], 0, X, loc);
+ loc+=SectLen[i];
+ }
+ }
+
+ int size;
+ MatrixPtrVec OutputVec;
+ MatrixPtrVec ErrorInputVec;
+ std::vector<int> SectLen;
+
+ };
+
+ /**
+ * \brief Merge several input matrices to one single output
+ *
+ * \ingroup CuNNMisc
+ *
+ */
+ class CuMerge : public CuComponent
+ {
+ public:
+ CuMerge(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuMerge()
+ { }
+
+ ComponentType GetType() const
+ { return MERGE; }
+
+ const char* GetName() const
+ { return "<merge>"; }
+
+ int GetInSect()
+ {
+ return size;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ int len;
+ for (int i=0; i<size;++i)
+ delete ErrorOutputVec[i];
+ rIn >> std::ws >> size;
+ ErrorOutputVec.clear();
+ for (int i=0; i<size;++i)
+ {
+ rIn>>len;
+ ErrorOutputVec.push_back(new CuMatrix<BaseFloat>());
+ SectLen.push_back(len);
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ rOut<<size<<" ";
+ for (int i=0; i<size;++i)
+ rOut<<SectLen[i]<<" ";
+ rOut<<std::endl;
+ }
+
+ void Backpropagate()
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ int loc=0;
+ for (int i=0;i<size;++i)
+ {
+ ErrorOutputVec[i]->Init(*mpErrorInput,loc,SectLen[i]);
+ loc+=SectLen[i];
+ }
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ int loc=0;
+ for (int i=0;i<size;++i)
+ {
+ Y.CopyCols(SectLen[i], 0, X, loc);
+ loc+=SectLen[i];
+ }
+ }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ int size;
+
+ MatrixPtrVec InputVec;
+ MatrixPtrVec ErrorOutputVec;
+ std::vector<int> SectLen;
+
+ };
+
+ /**
+ * \brief Reordering several inputs
+ *
+ * \ingroup CuNNMisc
+ *
+ */
+ class CuReorder : public CuComponent
+ {
+ public:
+ CuReorder(size_t nInputs, size_t nOutputs, CuComponent* pPred)
+ : CuComponent(nInputs,nOutputs,pPred)
+ { }
+
+ ~CuReorder()
+ { }
+
+ ComponentType GetType() const
+ { return REORDER; }
+
+ const char* GetName() const
+ { return "<reorder>"; }
+
+ int GetInSect()
+ {
+ return size;
+ }
+
+ int GetOutSect()
+ {
+ return size;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ int pos;
+ for (int i=0; i<size;++i)
+ delete PipeVec[i];
+ rIn >> std::ws >> size;
+ Order.clear();
+ PipeVec.clear();
+ for (int i=0; i<size;++i)
+ {
+ rIn>>pos;
+ Order.push_back(pos);
+ PipeVec.push_back(new CuPipe(0,0,NULL));
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ rOut << size<< " ";
+ for (int i=0; i<size;++i)
+ rOut<<Order[i]<<" ";
+ rOut<<std::endl;
+ }
+
+ void Propagate()
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ for (int i=0; i<size;++i)
+ PipeVec[i]->Propagate();
+ }
+
+ void Backpropagate()
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ for (int i=0; i<size;++i)
+ PipeVec[i]->Backpropagate();
+ }
+
+ /// IO Data getters
+ CuMatrix<BaseFloat>& GetInput(int pos=0)
+ {
+ return PipeVec[pos]->GetInput();
+ }
+ CuMatrix<BaseFloat>& GetOutput(int pos=0)
+ {
+ return PipeVec[Order[pos]]->GetOutput();
+ }
+ CuMatrix<BaseFloat>& GetErrorInput(int pos=0)
+ {
+ return PipeVec[Order[pos]]->GetErrorInput();
+ }
+ CuMatrix<BaseFloat>& GetErrorOutput(int pos=0)
+ {
+ return PipeVec[pos]->GetErrorOutput();
+ }
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(CuMatrix<BaseFloat>& rInput,int pos=0)
+ {
+ PipeVec[pos]->SetInput(rInput);
+ }
+ /// Set error input vector (bind with the following NetworkComponent)
+ void SetErrorInput(CuMatrix<BaseFloat>& rErrorInput,int pos=0)
+ {
+ PipeVec[Order[pos]]->SetErrorInput(rErrorInput);
+ }
+
+ protected:
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+      { Error(std::string(__func__) + " Nonsense"); }
+
+ int size;
+
+ std::vector<int> Order;
+
+ std::vector< CuPipe* > PipeVec;
+ };
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuNetwork.cc b/src/CuTNetLib/cuNetwork.cc
new file mode 100644
index 0000000..e0d8549
--- /dev/null
+++ b/src/CuTNetLib/cuNetwork.cc
@@ -0,0 +1,442 @@
+
+#include <algorithm>
+//#include <locale>
+#include <cctype>
+#include <list>
+#include <sstream>
+
+#include "cuNetwork.h"
+
+#include "cuDiscreteLinearity.h"
+#include "cuSharedLinearity.h"
+#include "cuSparseLinearity.h"
+#include "cuRbm.h"
+#include "cuRbmSparse.h"
+#include "cuRecurrent.h"
+#include "cuBlockArray.h"
+#include "cuLinearity.h"
+#include "cuUpdatableBias.h"
+#include "cuMisc.h"
+#include "cuCompDisc.h"
+
+namespace TNet {
+
+
+
+
+ void
+ CuNetwork::
+ ReadNetwork(const char* pSrc)
+ {
+ std::ifstream in(pSrc);
+ if(!in.good()) {
+ Error(std::string("Error, cannot read model: ")+pSrc);
+ }
+ ReadNetwork(in);
+ in.close();
+ }
+
+
+
+ void
+ CuNetwork::
+ WriteNetwork(const char* pDst)
+ {
+ std::ofstream out(pDst);
+ if(!out.good()) {
+ Error(std::string("Error, cannot write model: ")+pDst);
+ }
+ WriteNetwork(out);
+ out.close();
+ }
+
+
+
+ void
+ CuNetwork::
+ ReadNetwork(std::istream& rIn)
+ {
+ //get the network elements from a factory
+ CuComponent *pComp;
+ while(NULL != (pComp = ComponentFactory(rIn))) {
+ mNetComponents.push_back(pComp);
+ }
+ }
+
+
+
+ void
+ CuNetwork::
+ WriteNetwork(std::ostream& rOut)
+ {
+    //dump all the components
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ ComponentDumper(rOut, **it);
+ }
+ }
+
+
+ void
+ CuNetwork::
+ SetLearnRate(BaseFloat learnRate, const char* pLearnRateFactors)
+ {
+ //parse the learn rate factors: "0.1:0.5:0.6:1.0" to std::list
+ std::list<BaseFloat> lr_factors;
+ if(NULL != pLearnRateFactors) {
+ //replace ':' by ' '
+ std::string str(pLearnRateFactors);
+ size_t pos = 0;
+ while((pos = str.find(':',pos)) != std::string::npos) str[pos] = ' ';
+ while((pos = str.find(',',pos)) != std::string::npos) str[pos] = ' ';
+
+ //parse to std::list
+ std::istringstream is(str);
+ is >> std::skipws;
+ BaseFloat f;
+ while(!is.eof()) {
+ if(!(is >> f).fail()) { lr_factors.push_back(f); }
+ else break;
+ }
+ }
+
+ //initialize rate factors iterator
+ BaseFloat scale = 1.0f;
+
+ //store global learning rate
+ mGlobLearnRate = learnRate;
+ mpLearnRateFactors = pLearnRateFactors;
+
+ //give scaled learning rate to components
+ LayeredType::iterator it;
+ bool stopper_given = false;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ //get next scale factor
+ if(NULL != pLearnRateFactors) {
+ if(!(lr_factors.size() > 0)) {
+            Error("Too few learning rate scale factors");
+ }
+ scale = lr_factors.front();
+ lr_factors.pop_front();
+ }
+ //set scaled learning rate to the component
+ dynamic_cast<CuUpdatableComponent*>(*it)->LearnRate(learnRate*scale);
+ //set the stopper component for backpropagation
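+        //(the first updatable component with a non-zero rate becomes the stopper;
+        // Backpropagate() will not propagate errors below it)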
+ if(!stopper_given && (learnRate*scale > 0.0)) {
+ mpPropagErrorStopper = *it; stopper_given = true;
+ }
+ }
+ }
+ if(lr_factors.size() > 0) {
+      Error("Too many learning rate scale factors");
+ }
+ }
+
+
+ BaseFloat
+ CuNetwork::
+ GetLearnRate()
+ {
+ return mGlobLearnRate;
+ }
+
+
+ void
+ CuNetwork::
+ PrintLearnRate()
+ {
+ assert(mNetComponents.size() > 0);
+ std::cout << "Learning rate: global " << mGlobLearnRate;
+ std::cout << " components' ";
+ for(size_t i=0; i<mNetComponents.size(); i++) {
+ if(mNetComponents[i]->IsUpdatable()) {
+ std::cout << " " << dynamic_cast<CuUpdatableComponent*>(mNetComponents[i])->LearnRate();
+ }
+ }
+ std::cout << "\n" << std::flush;
+ }
+
+
+
+ void
+ CuNetwork::
+ SetMomentum(BaseFloat momentum)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<CuUpdatableComponent*>(*it)->Momentum(momentum);
+ }
+ }
+ }
+
+ void
+ CuNetwork::
+ SetWeightcost(BaseFloat weightcost)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<CuUpdatableComponent*>(*it)->Weightcost(weightcost);
+ }
+ }
+ }
+
+ void
+ CuNetwork::
+ SetL1(BaseFloat l1)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->GetType() == CuComponent::SPARSE_LINEARITY) {
+ dynamic_cast<CuSparseLinearity*>(*it)->L1(l1);
+ }
+ }
+ }
+
+ void
+ CuNetwork::
+ SetGradDivFrm(bool div)
+ {
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<CuUpdatableComponent*>(*it)->GradDivFrm(div);
+ }
+ }
+ }
+
+
+ CuComponent*
+ CuNetwork::
+ ComponentFactory(std::istream& rIn)
+ {
+ rIn >> std::ws;
+ if(rIn.eof()) return NULL;
+
+ CuComponent* pRet=NULL;
+ CuComponent* pPred=NULL;
+
+ //make coupling with predecessor
+ if(mNetComponents.size() != 0) {
+ pPred = mNetComponents.back();
+ }
+
+ pRet=ComponentReader(rIn, pPred);
+
+ //return
+ return pRet;
+ }
+
+ CuComponent*
+ CuNetwork::
+ ComponentReader(std::istream& rIn, CuComponent* pPred)
+ {
+ CuComponent* pRet=NULL;
+
+ std::string componentTag;
+ size_t nInputs, nOutputs;
+
+ rIn >> std::ws;
+ rIn >> componentTag;
+ if(componentTag == "") return NULL; //nothing left in the file
+
+ //make it lowercase
+ std::transform(componentTag.begin(), componentTag.end(),
+ componentTag.begin(), tolower);
+
+ if(componentTag[0] != '<' || componentTag[componentTag.size()-1] != '>') {
+ Error(std::string("Invalid component tag:")+componentTag);
+ }
+
+ //the 'endblock' tag terminates the network
+ if(componentTag == "<endblock>") return NULL;
+
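+    //each component header has the form: <tag> <nOutputs> <nInputs>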
+ rIn >> std::ws;
+ rIn >> nOutputs;
+ rIn >> std::ws;
+ rIn >> nInputs;
+ assert(nInputs > 0 && nOutputs > 0);
+
+ //array with list of component tags
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<discretelinearity>",
+ "<sharedlinearity>",
+ "<sparselinearity>",
+ "<rbm>",
+ "<rbmsparse>",
+ "<recurrent>",
+
+ "<softmax>",
+ "<sigmoid>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+
+ "<linearity>",
+ "<updatablebias>",
+ "<pipe>",
+ "<learnstop>",
+ "<distrib>",
+ "<combine>",
+ "<compound>",
+ "<discrete>",
+ "<divide>",
+ "<merge>",
+ "<reorder>",
+
+ };
+
+ static const int n_tags = sizeof(TAGS) / sizeof(TAGS[0]);
+ int i;
+ for(i=0; i<n_tags; i++) {
+ if(componentTag == TAGS[i]) break;
+ }
+
+ //switch according to position in array TAGS
+ switch(i) {
+ case 0: pRet = new CuBiasedLinearity(nInputs,nOutputs,pPred); break;
+ case 1: pRet = new CuDiscreteLinearity(nInputs,nOutputs,pPred); break;
+ case 2: pRet = new CuSharedLinearity(nInputs,nOutputs,pPred); break;
+ case 3: pRet = new CuSparseLinearity(nInputs,nOutputs,pPred); break;
+ case 4: pRet = new CuRbm(nInputs,nOutputs,pPred); break;
+ case 5: pRet = new CuRbmSparse(nInputs,nOutputs,pPred); break;
+ case 6: pRet = new CuRecurrent(nInputs,nOutputs,pPred); break;
+
+ case 7: pRet = new CuSoftmax(nInputs,nOutputs,pPred); break;
+ case 8: pRet = new CuSigmoid(nInputs,nOutputs,pPred); break;
+
+ case 9: pRet = new CuExpand(nInputs,nOutputs,pPred); break;
+ case 10: pRet = new CuCopy(nInputs,nOutputs,pPred); break;
+ case 11: pRet = new CuTranspose(nInputs,nOutputs,pPred); break;
+ case 12: pRet = new CuBlockLinearity(nInputs,nOutputs,pPred); break;
+ case 13: pRet = new CuBias(nInputs,nOutputs,pPred); break;
+ case 14: pRet = new CuWindow(nInputs,nOutputs,pPred); break;
+ case 15: pRet = new CuLog(nInputs,nOutputs,pPred); break;
+
+ case 16: pRet = new CuBlockArray(nInputs,nOutputs,pPred); break;
+
+ case 17: pRet = new CuLinearity(nInputs,nOutputs,pPred); break;
+ case 18: pRet = new CuUpdatableBias(nInputs,nOutputs,pPred); break;
+ case 19: pRet = new CuPipe(nInputs,nOutputs,pPred); break;
+ case 20: pRet = new CuLearnStop(nInputs,nOutputs,pPred); break;
+ case 21: pRet = new CuDistrib(nInputs,nOutputs,pPred); break;
+ case 22: pRet = new CuCombine(nInputs,nOutputs,pPred); break;
+ case 23: pRet = new CuCompound(nInputs,nOutputs,pPred); break;
+ case 24: pRet = new CuDiscrete(nInputs,nOutputs,pPred); break;
+ case 25: pRet = new CuDivide(nInputs,nOutputs,pPred); break;
+ case 26: pRet = new CuMerge(nInputs,nOutputs,pPred); break;
+ case 27: pRet = new CuReorder(nInputs,nOutputs,pPred); break;
+
+
+ default: Error(std::string("Unknown Component tag:")+componentTag);
+ }
+
+ //read components content
+ pRet->ReadFromStream(rIn);
+
+ //return
+ return pRet;
+ }
+
+
+ void
+ CuNetwork::
+ ComponentDumper(std::ostream& rOut, CuComponent& rComp)
+ {
+    //map the components' identification codes to their tags
+ //array with list of component tags
+ static const CuComponent::ComponentType TYPES[] = {
+ CuComponent::BIASED_LINEARITY,
+ CuComponent::DISCRETE_LINEARITY,
+ CuComponent::SHARED_LINEARITY,
+ CuComponent::SPARSE_LINEARITY,
+ CuComponent::RBM,
+ CuComponent::RBM_SPARSE,
+ CuComponent::RECURRENT,
+ CuComponent::LINEARITY,
+ CuComponent::UPDATABLEBIAS,
+ CuComponent::COMPOUND,
+ CuComponent::DISCRETE,
+
+ CuComponent::SIGMOID,
+ CuComponent::SOFTMAX,
+
+ CuComponent::EXPAND,
+ CuComponent::COPY,
+ CuComponent::TRANSPOSE,
+ CuComponent::BLOCK_LINEARITY,
+ CuComponent::BIAS,
+ CuComponent::WINDOW,
+ CuComponent::LOG,
+ CuComponent::PIPE,
+ CuComponent::LEARNSTOP,
+ CuComponent::DISTRIB,
+ CuComponent::COMBINE,
+ CuComponent::DIVIDE,
+ CuComponent::MERGE,
+ CuComponent::REORDER,
+
+ CuComponent::BLOCK_ARRAY,
+ };
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<discretelinearity>",
+ "<sharedlinearity>",
+ "<sparselinearity>",
+ "<rbm>",
+ "<rbmsparse>",
+ "<recurrent>",
+ "<linearity>",
+ "<updatablebias>",
+ "<compound>",
+ "<discrete>",
+
+ "<sigmoid>",
+ "<softmax>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+ "<pipe>",
+ "<learnstop>",
+ "<distrib>",
+ "<combine>",
+ "<divide>",
+ "<merge>",
+ "<reorder>",
+
+ "<blockarray>",
+ };
+ static const int MAX = sizeof TYPES / sizeof TYPES[0];
+
+ int i;
+ for(i=0; i<MAX; ++i) {
+ if(TYPES[i] == rComp.GetType()) break;
+ }
+ if(i == MAX) Error("Unknown ComponentType");
+
+ //dump the component tag
+ rOut << TAGS[i] << " "
+ << rComp.GetNOutputs() << " "
+ << rComp.GetNInputs() << std::endl;
+
+ //write components content
+ rComp.WriteToStream(rOut);
+ }
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuNetwork.h b/src/CuTNetLib/cuNetwork.h
new file mode 100644
index 0000000..05e0ecb
--- /dev/null
+++ b/src/CuTNetLib/cuNetwork.h
@@ -0,0 +1,227 @@
+#ifndef _CUNETWORK_H_
+#define _CUNETWORK_H_
+
+#include "cuComponent.h"
+
+#include "cuBiasedLinearity.h"
+//#include "cuBlockLinearity.h"
+//#include "cuBias.h"
+//#include "cuWindow.h"
+
+#include "cuActivation.h"
+
+#include "cuCRBEDctFeat.h"
+
+#include "Vector.h"
+
+#include <vector>
+
+/**
+ * \file cuNetwork.h
+ * \brief CuNN manipulation class
+ */
+
+/// \defgroup CuNNComp CuNN Components
+
+namespace TNet {
+ /**
+   * \brief Neural Network Manipulator & public interfaces
+ *
+ * \ingroup CuNNComp
+ */
+ class CuNetwork
+ {
+ //////////////////////////////////////
+ // Typedefs
+ typedef std::vector<CuComponent*> LayeredType;
+
+ //////////////////////////////////////
+ // Disable copy construction, assignment and default constructor
+ private:
+ CuNetwork(CuNetwork&);
+ CuNetwork& operator=(CuNetwork&);
+
+ public:
+ CuNetwork() { }
+ CuNetwork(std::istream& rIn);
+ ~CuNetwork();
+
+ void AddLayer(CuComponent* layer);
+
+ int Layers()
+ { return mNetComponents.size(); }
+
+ CuComponent& Layer(int i)
+ { return *mNetComponents[i]; }
+
+ /// forward the data to the output
+ void Propagate(CuMatrix<BaseFloat>& in, CuMatrix<BaseFloat>& out);
+
+ /// backpropagate the error while updating weights
+ void Backpropagate(CuMatrix<BaseFloat>& globerr);
+
+ void ReadNetwork(const char* pSrc); ///< read the network from file
+ void WriteNetwork(const char* pDst); ///< write network to file
+
+ void ReadNetwork(std::istream& rIn); ///< read the network from stream
+ void WriteNetwork(std::ostream& rOut); ///< write network to stream
+
+ size_t GetNInputs() const; ///< Dimensionality of the input features
+ size_t GetNOutputs() const; ///< Dimensionality of the desired vectors
+
+ /// set the learning rate
+ void SetLearnRate(BaseFloat learnRate, const char* pLearnRateFactors = NULL);
+ BaseFloat GetLearnRate(); ///< get the learning rate value
+ void PrintLearnRate(); ///< log the learning rate values
+
+ void SetMomentum(BaseFloat momentum);
+ void SetWeightcost(BaseFloat weightcost);
+ void SetL1(BaseFloat l1);
+
+ void SetGradDivFrm(bool div);
+
+ /// Reads a component from a stream
+ static CuComponent* ComponentReader(std::istream& rIn, CuComponent* pPred);
+ /// Dumps component into a stream
+ static void ComponentDumper(std::ostream& rOut, CuComponent& rComp);
+
+
+ private:
+ /// Creates a component by reading from stream
+ CuComponent* ComponentFactory(std::istream& In);
+
+
+ private:
+ LayeredType mNetComponents; ///< container with the network layers
+ CuComponent* mpPropagErrorStopper;
+ BaseFloat mGlobLearnRate; ///< The global (unscaled) learn rate of the network
+      const char* mpLearnRateFactors; ///< Colon-separated per-component learn rate scale factors (see SetLearnRate)
+
+
+ //friend class NetworkGenerator; //<< For generating networks...
+
+ };
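+
+  // A minimal usage sketch of the interface above (assumed driver code, not part
+  // of this library; the error computation is only indicated):
+  //
+  //   CuNetwork net;
+  //   net.ReadNetwork("nnet.init");     //load the components from a file
+  //   net.SetLearnRate(0.008);          //global rate, no per-component factors
+  //   CuMatrix<BaseFloat> feats, out, err;
+  //   net.Propagate(feats, out);        //forward pass
+  //   /* fill err, e.g. with CuCrossEntropy::Evaluate(out, targets, err) */
+  //   net.Backpropagate(err);           //backward pass and weight update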
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuNetwork::
+ inline
+ CuNetwork::
+ CuNetwork(std::istream& rSource)
+ : mpPropagErrorStopper(NULL), mGlobLearnRate(0.0), mpLearnRateFactors(NULL)
+ {
+ ReadNetwork(rSource);
+ }
+
+
+ inline
+ CuNetwork::
+ ~CuNetwork()
+ {
+ //delete all the components
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ delete *it;
+ *it = NULL;
+ }
+ mNetComponents.resize(0);
+ }
+
+
+ inline void
+ CuNetwork::
+ AddLayer(CuComponent* layer)
+ {
+ if(mNetComponents.size() > 0) {
+ if(GetNOutputs() != layer->GetNInputs()) {
+ Error("Nonmatching dims");
+ }
+ layer->SetPrevious(mNetComponents.back());
+ mNetComponents.back()->SetNext(layer);
+ }
+ mNetComponents.push_back(layer);
+ }
+
+
+ inline void
+ CuNetwork::
+ Propagate(CuMatrix<BaseFloat>& in, CuMatrix<BaseFloat>& out)
+ {
+ //empty network => copy input
+ if(mNetComponents.size() == 0) {
+ out.CopyFrom(in);
+ return;
+ }
+
+ //check dims
+ if(in.Cols() != GetNInputs()) {
+ std::ostringstream os;
+ os << "Nonmatching dims"
+ << " data dim is: " << in.Cols()
+ << " network needs: " << GetNInputs();
+ Error(os.str());
+ }
+ mNetComponents.front()->SetInput(in);
+
+ //propagate
+ LayeredType::iterator it;
+ for(it=mNetComponents.begin(); it!=mNetComponents.end(); ++it) {
+ (*it)->Propagate();
+ }
+
+ //copy the output
+ out.CopyFrom(mNetComponents.back()->GetOutput());
+ }
+
+
+
+
+ inline void
+ CuNetwork::
+ Backpropagate(CuMatrix<BaseFloat>& globerr)
+ {
+ mNetComponents.back()->SetErrorInput(globerr);
+
+ // back-propagation
+ LayeredType::reverse_iterator it;
+ for(it=mNetComponents.rbegin(); it!=mNetComponents.rend(); ++it) {
+ //stopper component does not propagate error (no updatable predecessors)
+ if(*it != mpPropagErrorStopper) {
+ //compute errors for preceding network components
+ (*it)->Backpropagate();
+ }
+ //update weights if updatable component
+ if((*it)->IsUpdatable()) {
+ CuUpdatableComponent& rComp = dynamic_cast<CuUpdatableComponent&>(**it);
+ if(rComp.LearnRate() > 0.0f) {
+ rComp.Update();
+ }
+ }
+ //stop backprop if no updatable components precede current component
+ if(mpPropagErrorStopper == *it) break;
+ }
+ }
+
+
+ inline size_t
+ CuNetwork::
+ GetNInputs() const
+ {
+    if(mNetComponents.size() == 0) return 0;
+ return mNetComponents.front()->GetNInputs();
+ }
+
+
+ inline size_t
+ CuNetwork::
+ GetNOutputs() const
+ {
+    if(mNetComponents.size() == 0) return 0;
+ return mNetComponents.back()->GetNOutputs();
+ }
+
+} //namespace
+
+#endif
+
+
diff --git a/src/CuTNetLib/cuObjectiveFunction.cc b/src/CuTNetLib/cuObjectiveFunction.cc
new file mode 100644
index 0000000..e2b0a1d
--- /dev/null
+++ b/src/CuTNetLib/cuObjectiveFunction.cc
@@ -0,0 +1,87 @@
+
+#include "cuObjectiveFunction.h"
+
+#include "Error.h"
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+
+
+ CuObjectiveFunction*
+ CuObjectiveFunction::
+ Factory(ObjFunType type) {
+ CuObjectiveFunction* ret = NULL;
+ switch(type) {
+ case MEAN_SQUARE_ERROR: ret = new CuMeanSquareError; break;
+ case CROSS_ENTROPY: ret = new CuCrossEntropy; break;
+ default: Error("Unknown ObjFun type");
+ }
+ return ret;
+ }
+
+
+ void
+ CuMeanSquareError::
+ Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError)
+ {
+ //get the global error
+ rNetError.CopyFrom(rNetOutput);
+ rNetError.AddScaled(-1.0,rDesired,1.0);
+
+ //calculate the MSE
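+    //(the per-element errors are squared, summed over all dimensions and frames
+    // of the batch, and accumulated into mError)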
+ mAuxMat.CopyFrom(rNetError);
+ mAuxMat.MulElem(mAuxMat);
+
+ mAuxVec.Init(mAuxMat.Cols());
+ mAuxVec.AddColSum(1.0,mAuxMat,0.0);
+ mAuxVec.CopyTo(mAuxVecHost);
+
+ mError += mAuxVecHost.Sum();
+
+ //count the frames
+ mFrames += rNetError.Rows();
+ }
+
+ void
+ CuCrossEntropy::
+ Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError)
+ {
+ if(rDesired.Cols() != rNetOutput.Cols()) {
+ std::ostringstream os;
+ os << "Non-matching dimensions of network output with training targets!!!"
+ << " Netoutput:" << rNetOutput.Cols()
+ << " Targets:" << rDesired.Cols();
+ Error(os.str());
+ }
+
+ //get the global error
+ //dXent/dSoftmax_in = y-d
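+    //(the cross-entropy gradient taken together with a softmax output layer
+    // reduces to the difference between the posteriors and the targets)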
+ rNetError.CopyFrom(rNetOutput);
+ rNetError.AddScaled(-1.0,rDesired,1.0);
+
+ //check classification
+ mClassifyVec.Init(rNetOutput.Rows());
+ CuMath<BaseFloat>::CheckClass(rNetOutput,rDesired,mClassifyVec);
+ mClassifyVec.CopyTo(mClassifyVecHost);
+ mCorrect += mClassifyVecHost.Sum();
+
+ //calculate Xent
+ mAuxMat.CopyFrom(rNetOutput);
+ mAuxMat.LogElem();
+ mAuxMat.MulElem(rDesired);
+
+ mAuxVec.Init(mAuxMat.Cols());
+ mAuxVec.AddColSum(-1.0,mAuxMat,0.0);
+ mAuxVec.CopyTo(mAuxVecHost);
+
+ mError += mAuxVecHost.Sum();
+
+ //count the frames
+ mFrames += rNetError.Rows();
+ }
+
+
+} // namespace TNet
diff --git a/src/CuTNetLib/cuObjectiveFunction.h b/src/CuTNetLib/cuObjectiveFunction.h
new file mode 100644
index 0000000..4dd0c32
--- /dev/null
+++ b/src/CuTNetLib/cuObjectiveFunction.h
@@ -0,0 +1,185 @@
+#ifndef _CUOBJ_FUN_I_
+#define _CUOBJ_FUN_I_
+
+#include <cassert>
+#include <limits>
+#include <cmath>
+#include <sstream>
+
+#include "Vector.h"
+#include "cuvector.h"
+#include "cumatrix.h"
+
+/**
+ * \file cuObjectiveFunction.h
+ * \brief Objective Functions used to compare the model and data
+ */
+
+/**
+ * \defgroup CuModelObj CuNN Objective Functions
+ * \ingroup CuNNComp
+ */
+
+namespace TNet
+{
+
+
+ /**
+ * \brief General interface for objective functions
+ */
+ class CuObjectiveFunction
+ {
+ public:
+ /// Enum with objective function types
+ typedef enum {
+ OBJ_FUN_I = 0x0300,
+ MEAN_SQUARE_ERROR,
+ CROSS_ENTROPY,
+ } ObjFunType;
+
+ /// Factory for creating objective function instances
+ static CuObjectiveFunction* Factory(ObjFunType type);
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification
+ public:
+ CuObjectiveFunction()
+ { }
+
+ virtual ~CuObjectiveFunction()
+ { }
+
+ virtual ObjFunType GetTypeId() = 0;
+ virtual const char* GetTypeLabel() = 0;
+
+      /// evaluates the data and calculates the global error
+ /// \param[in] rNetOutput CuNN output as generated by model
+ /// \param[in] rDesired Desired output specified by data
+ /// \param[out] rNetError Derivative of the Energy Function
+ virtual void Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError) = 0;
+
+      ///get the accumulated error
+      virtual double GetError() = 0;
+      ///get the number of processed frames
+      virtual size_t GetFrames() = 0;
+      ///build a report string with the error statistics
+      virtual std::string Report() = 0;
+ };
+
+
+
+
+ /**
+   * \brief Mean square error, useful for autoencoders, RBMs, etc.
+ *
+ * \ingroup CuModelObj
+ * Calculate: \f[ ||\vec{ModelOutput}-\vec{Label}||^2 \f]
+ */
+ class CuMeanSquareError : public CuObjectiveFunction
+ {
+ public:
+ CuMeanSquareError()
+ : mError(0), mFrames(0)
+ { }
+
+ virtual ~CuMeanSquareError()
+ { }
+
+ ObjFunType GetTypeId()
+ { return CuObjectiveFunction::MEAN_SQUARE_ERROR; }
+
+ const char* GetTypeLabel()
+ { return "<mean_square_error>"; }
+
+ void Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError);
+
+ double GetError()
+ { return mError; }
+
+ size_t GetFrames()
+ { return mFrames; }
+
+ std::string Report()
+ {
+ std::ostringstream ss;
+ ss << "Mse:" << mError << " frames:" << mFrames
+ << " err/frm:" << mError/mFrames << "\n";
+ return ss.str();
+ }
+
+ private:
+ double mError;
+ size_t mFrames;
+
+ CuMatrix<BaseFloat> mAuxMat;
+ CuVector<BaseFloat> mAuxVec;
+ Vector<BaseFloat> mAuxVecHost;
+
+ };
+
+
+ /**
+   * \brief Cross entropy; assumes the desired outputs are given as target vectors
+ *
+ * \ingroup CuModelObj
+ * Calculate: \f[ -\ln(\vec{ModelOutput}) \cdot \vec{Label} \f]
+ */
+ class CuCrossEntropy : public CuObjectiveFunction
+ {
+ public:
+ CuCrossEntropy()
+ : mError(0), mFrames(0), mCorrect(0)
+ { }
+
+ ~CuCrossEntropy()
+ { }
+
+ ObjFunType GetTypeId()
+ { return CuObjectiveFunction::CROSS_ENTROPY; }
+
+ const char* GetTypeLabel()
+ { return "<cross_entropy>"; }
+
+ void Evaluate(const CuMatrix<BaseFloat>& rNetOutput, const CuMatrix<BaseFloat>& rDesired, CuMatrix<BaseFloat>& rNetError);
+
+ double GetError()
+ { return mError; }
+
+ size_t GetFrames()
+ { return mFrames; }
+
+ std::string Report()
+ {
+ std::ostringstream ss;
+ //for compatibility with SNet
+ //ss << " correct: >> " << 100.0*mCorrect/mFrames << "% <<\n";
+
+ //current new format...
+ ss << "Xent:" << mError << " frames:" << mFrames
+ << " err/frm:" << mError/mFrames
+ << " correct[" << 100.0*mCorrect/mFrames << "%]"
+ << "\n";
+ return ss.str();
+ }
+
+ private:
+ double mError;
+ size_t mFrames;
+ size_t mCorrect;
+
+ CuMatrix<BaseFloat> mAuxMat;
+ CuVector<BaseFloat> mAuxVec;
+ Vector<BaseFloat> mAuxVecHost;
+
+ CuVector<int> mClassifyVec;
+ Vector<int> mClassifyVecHost;
+ };
+
+
+
+
+
+} //namespace TNet
+
+
+#endif
diff --git a/src/CuTNetLib/cuRbm.cc b/src/CuTNetLib/cuRbm.cc
new file mode 100644
index 0000000..3d0699d
--- /dev/null
+++ b/src/CuTNetLib/cuRbm.cc
@@ -0,0 +1,244 @@
+
+#include <string>
+#include <sstream>
+
+#include "cuRbm.h"
+
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+ void
+ CuRbm::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mHidBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mVisHid, 1.0);
+ if(mHidType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(Y,Y);
+ }
+ }
+
+
+ void
+ CuRbm::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(X.Rows(),X.Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,X,GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(X);
+ }
+
+ Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, mBackpropErrBuf, mVisHid, 0.0);
+ }
+
+
+ void
+ CuRbm::
+ Update()
+ {
+ //THIS IS DONE TWICE BECAUSE OF THE BACKPROP STOPPER!!!
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(GetErrorInput().Rows(),GetErrorInput().Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,GetErrorInput(),GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(GetErrorInput());
+ }
+
+/*
+ std::cout << " " << GetInput().Rows()
+ << " " << GetInput().Cols()
+ << " " << mBackpropErrBuf.Rows()
+ << " " << mBackpropErrBuf.Cols()
+ << " " << mVisHidCorrection.Rows()
+ << " " << mVisHidCorrection.Cols()
+ ;
+*/
+
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,mBackpropErrBuf,mMomentum);
+
+ //regularization weight decay
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(1.0,mHidBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mVisHidCorrection.Gemm('T','N',1.0,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(1.0,mBackpropErrBuf,mMomentum);
+
+ mVisHid.AddScaled(-mLearningRate/N,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(-mLearningRate/N,mHidBiasCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ mVisHid.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+#endif
+
+ }
+
+
+
+ void
+ CuRbm::
+ Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs)
+ {
+ if(visProbs.Cols() != GetNInputs()) {
+ std::ostringstream os;
+ os << " Nonmatching input dim, needs:" << GetNInputs()
+ << " got:" << visProbs.Cols() << "\n";
+ Error(os.str());
+ }
+
+ hidProbs.Init(visProbs.Rows(),GetNOutputs());
+
+ PropagateFnc(visProbs, hidProbs);
+ }
+
+ void
+ CuRbm::
+ Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs)
+ {
+ visProbs.Init(hidState.Rows(),mNInputs);
+ visProbs.SetConst(0.0);
+ visProbs.AddScaledRow(1.0,mVisBias,0.0);
+ visProbs.Gemm('N','T', 1.0, hidState, mVisHid, 1.0);
+ if(mVisType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(visProbs,visProbs);
+ }
+ }
+
+
+ void
+ CuRbm::
+ RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid)
+ {
+ assert(pos_vis.Rows() == pos_hid.Rows() &&
+ pos_vis.Rows() == neg_vis.Rows() &&
+ pos_vis.Rows() == neg_hid.Rows() &&
+ pos_vis.Cols() == neg_vis.Cols() &&
+ pos_hid.Cols() == neg_hid.Cols() &&
+ pos_vis.Cols() == mNInputs &&
+ pos_hid.Cols() == mNOutputs);
+
+ // UPDATE vishid matrix
+ //
+ // vishidinc = momentum*vishidinc + ...
+ // epsilonw*( (posprods-negprods)/numcases - weightcost*vishid);
+ //
+ // vishidinc[t] = -(epsilonw/numcases)*negprods + momentum*vishidinc[t-1]
+ // +(epsilonw/numcases)*posprods
+ // -(epsilonw*weightcost)*vishid[t-1]
+ //
+ BaseFloat N = static_cast<BaseFloat>(pos_vis.Rows());
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,neg_vis,neg_hid,mMomentum);
+ mVisHidCorrection.Gemm('T','N',+mLearningRate/N,pos_vis,pos_hid,1.0);
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+
+ // UPDATE visbias vector
+ //
+ // visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact);
+ //
+ mVisBiasCorrection.AddColSum(-mLearningRate/N,neg_vis,mMomentum);
+ mVisBiasCorrection.AddColSum(+mLearningRate/N,pos_vis,1.0);
+ mVisBias.AddScaled(1.0,mVisBiasCorrection,1.0);
+
+ // UPDATE hidbias vector
+ //
+ // hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact);
+ //
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,neg_hid,mMomentum);
+ mHidBiasCorrection.AddColSum(+mLearningRate/N,pos_hid,1.0);
+ mHidBias.AddScaled(1.0/*0.0*/,mHidBiasCorrection,1.0);
+
+ }
+
+
+ void
+ CuRbm::
+ ReadFromStream(std::istream& rIn)
+ {
+ //type of the units
+ std::string str;
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mVisType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mVisType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mHidType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mHidType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mVisHid.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mVisBias.CopyFrom(bias);
+ rIn >> bias;
+ mHidBias.CopyFrom(bias);
+ }
+
+
+ void
+ CuRbm::
+ WriteToStream(std::ostream& rOut)
+ {
+ //store unit type info
+ if(mVisType == BERNOULLI) {
+ rOut << " bern ";
+ } else {
+ rOut << " gauss ";
+ }
+ if(mHidType == BERNOULLI) {
+ rOut << " bern\n";
+ } else {
+ rOut << " gauss\n";
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mVisHid.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mVisBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ mHidBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
diff --git a/src/CuTNetLib/cuRbm.h b/src/CuTNetLib/cuRbm.h
new file mode 100644
index 0000000..c1e984b
--- /dev/null
+++ b/src/CuTNetLib/cuRbm.h
@@ -0,0 +1,146 @@
+#ifndef _CU_RBM_H_
+#define _CU_RBM_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuRbmBase : public CuUpdatableComponent
+ {
+ public:
+ typedef enum {
+ BERNOULLI,
+ GAUSSIAN
+ } RbmUnitType;
+
+ CuRbmBase(size_t nInputs, size_t nOutputs, CuComponent *pPred) :
+ CuUpdatableComponent(nInputs, nOutputs, pPred)
+ { }
+
+
+ virtual void Propagate(
+ const CuMatrix<BaseFloat>& visProbs,
+ CuMatrix<BaseFloat>& hidProbs
+ ) = 0;
+ virtual void Reconstruct(
+ const CuMatrix<BaseFloat>& hidState,
+ CuMatrix<BaseFloat>& visProbs
+ ) = 0;
+ virtual void RbmUpdate(
+ const CuMatrix<BaseFloat>& pos_vis,
+ const CuMatrix<BaseFloat>& pos_hid,
+ const CuMatrix<BaseFloat>& neg_vis,
+ const CuMatrix<BaseFloat>& neg_hid
+ ) = 0;
+
+ virtual RbmUnitType VisType() = 0;
+ virtual RbmUnitType HidType() = 0;
+ };
+
+
+ class CuRbm : public CuRbmBase
+ {
+ public:
+
+ CuRbm(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuRbm();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ //CuUpdatableComponent API
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ //RBM training API
+ void Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs);
+ void Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs);
+ void RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid);
+
+ RbmUnitType VisType()
+ { return mVisType; }
+
+ RbmUnitType HidType()
+ { return mHidType; }
+
+ //static void BinarizeProbs(const CuMatrix<BaseFloat>& probs, CuMatrix<BaseFloat>& states);
+
+ //I/O
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mVisHid; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mVisBias; ///< Vector with biases
+ CuVector<BaseFloat> mHidBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mVisHidCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mVisBiasCorrection; ///< Vector for bias updates
+ CuVector<BaseFloat> mHidBiasCorrection; ///< Vector for bias updates
+
+ CuMatrix<BaseFloat> mBackpropErrBuf;
+
+ RbmUnitType mVisType;
+ RbmUnitType mHidType;
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuRbm::
+ inline
+ CuRbm::
+ CuRbm(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuRbmBase(nInputs, nOutputs, pPred),
+ mVisHid(nInputs,nOutputs),
+ mVisBias(nInputs), mHidBias(nOutputs),
+ mVisHidCorrection(nInputs,nOutputs),
+ mVisBiasCorrection(nInputs), mHidBiasCorrection(nOutputs),
+ mBackpropErrBuf(),
+ mVisType(BERNOULLI),
+ mHidType(BERNOULLI)
+ {
+ mVisHidCorrection.SetConst(0.0);
+ mVisBiasCorrection.SetConst(0.0);
+ mHidBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuRbm::
+ ~CuRbm()
+ { }
+
+ inline CuComponent::ComponentType
+ CuRbm::
+ GetType() const
+ {
+ return CuComponent::RBM;
+ }
+
+ inline const char*
+ CuRbm::
+ GetName() const
+ {
+ return "<rbm>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuRbmSparse.cc b/src/CuTNetLib/cuRbmSparse.cc
new file mode 100644
index 0000000..e0b7352
--- /dev/null
+++ b/src/CuTNetLib/cuRbmSparse.cc
@@ -0,0 +1,269 @@
+
+#include <string>
+#include <sstream>
+
+#include "cuRbmSparse.h"
+
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+ void
+ CuRbmSparse::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mHidBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mVisHid, 1.0);
+ if(mHidType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(Y,Y);
+ }
+ }
+
+
+ void
+ CuRbmSparse::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(X.Rows(),X.Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,X,GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(X);
+ }
+
+ Y.SetConst(0.0);
+ Y.Gemm('N', 'T', 1.0, mBackpropErrBuf, mVisHid, 0.0);
+ }
+
+
+ void
+ CuRbmSparse::
+ Update()
+ {
+ //THIS IS DONE TWICE BECAUSE OF THE BACKPROP STOPPER!!!
+ if(mHidType == BERNOULLI) {
+ mBackpropErrBuf.Init(GetErrorInput().Rows(),GetErrorInput().Cols());
+ CuMath<BaseFloat>::DiffSigmoid(mBackpropErrBuf,GetErrorInput(),GetOutput());
+ } else {
+ mBackpropErrBuf.CopyFrom(GetErrorInput());
+ }
+
+/*
+ std::cout << " " << GetInput().Rows()
+ << " " << GetInput().Cols()
+ << " " << mBackpropErrBuf.Rows()
+ << " " << mBackpropErrBuf.Cols()
+ << " " << mVisHidCorrection.Rows()
+ << " " << mVisHidCorrection.Cols()
+ ;
+*/
+
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,mBackpropErrBuf,mMomentum);
+
+ //regularization weight decay
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(1.0,mHidBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mVisHidCorrection.Gemm('T','N',1.0,GetInput(),mBackpropErrBuf,mMomentum);
+ mHidBiasCorrection.AddColSum(1.0,mBackpropErrBuf,mMomentum);
+
+ mVisHid.AddScaled(-mLearningRate/N,mVisHidCorrection,1.0);
+ mHidBias.AddScaled(-mLearningRate/N,mHidBiasCorrection,1.0);
+
+ //regularization weight decay (from actual weights only)
+ mVisHid.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);
+#endif
+
+ }
+
+
+
+ void
+ CuRbmSparse::
+ Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs)
+ {
+ if(visProbs.Cols() != GetNInputs()) {
+ std::ostringstream os;
+ os << " Nonmatching input dim, needs:" << GetNInputs()
+ << " got:" << visProbs.Cols() << "\n";
+ Error(os.str());
+ }
+
+ hidProbs.Init(visProbs.Rows(),GetNOutputs());
+
+ PropagateFnc(visProbs, hidProbs);
+ }
+
+ void
+ CuRbmSparse::
+ Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs)
+ {
+ visProbs.Init(hidState.Rows(),mNInputs);
+ visProbs.SetConst(0.0);
+ visProbs.AddScaledRow(1.0,mVisBias,0.0);
+ visProbs.Gemm('N','T', 1.0, hidState, mVisHid, 1.0);
+ if(mVisType == BERNOULLI) {
+ CuMath<BaseFloat>::Sigmoid(visProbs,visProbs);
+ }
+ }
+
+
+ void
+ CuRbmSparse::
+ RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid)
+ {
+ assert(pos_vis.Rows() == pos_hid.Rows() &&
+ pos_vis.Rows() == neg_vis.Rows() &&
+ pos_vis.Rows() == neg_hid.Rows() &&
+ pos_vis.Cols() == neg_vis.Cols() &&
+ pos_hid.Cols() == neg_hid.Cols() &&
+ pos_vis.Cols() == mNInputs &&
+ pos_hid.Cols() == mNOutputs);
+
+ //:SPARSITY:
+ if(mHidType==BERNOULLI) {
+ //get expected node activity from current batch
+ mSparsityQCurrent.AddColSum(1.0/pos_hid.Rows(),pos_hid,0.0);
+ //get smoothed expected node activity
+ mSparsityQ.AddScaled(1.0-mLambda,mSparsityQCurrent,mLambda);
+ //subtract the prior: (q-p)
+ mSparsityQCurrent.SetConst(-mSparsityPrior);
+ mSparsityQCurrent.AddScaled(1.0,mSparsityQ,1.0);
+ //get mean pos_vis
+ mVisMean.AddColSum(1.0/pos_vis.Rows(),pos_vis,0.0);
+ }
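+    //(i.e. mSparsityQ follows q <- (1-mLambda)*q_batch + mLambda*q, and
+    // mSparsityQCurrent ends up holding (q - mSparsityPrior), which drives
+    // the sparsity terms of the updates below)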
+
+ // UPDATE vishid matrix
+ //
+ // vishidinc = momentum*vishidinc + ...
+ // epsilonw*( (posprods-negprods)/numcases - weightcost*vishid)
+ // -sparsitycost*mean_posvis'*(q-p);
+ //
+ // vishidinc[t] = -(epsilonw/numcases)*negprods + momentum*vishidinc[t-1]
+ // +(epsilonw/numcases)*posprods
+ // -(epsilonw*weightcost)*vishid[t-1]
+ //
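+    // (illustrative mapping of the terms above onto the calls below, reading
+    //  epsilonw as mLearningRate, numcases as N and weightcost as mWeightcost:
+    //    -(epsilonw/numcases)*negprods       -> Gemm with neg_vis, neg_hid
+    //    +(epsilonw/numcases)*posprods       -> Gemm with pos_vis, pos_hid
+    //    -(epsilonw*weightcost)*vishid[t-1]  -> AddScaled with mVisHid
+    //    -sparsitycost*mean_posvis'*(q-p)    -> BlasGer with mVisMean, mSparsityQCurrent)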
+ BaseFloat N = static_cast<BaseFloat>(pos_vis.Rows());
+ mVisHidCorrection.Gemm('T','N',-mLearningRate/N,neg_vis,neg_hid,mMomentum);
+ mVisHidCorrection.Gemm('T','N',+mLearningRate/N,pos_vis,pos_hid,1.0);
+ mVisHidCorrection.AddScaled(-mLearningRate*mWeightcost,mVisHid,1.0);//L2
+ if(mHidType==BERNOULLI) {
+ mVisHidCorrection.BlasGer(-mSparsityCost,mVisMean,mSparsityQCurrent);//sparsity
+ }
+ mVisHid.AddScaled(1.0,mVisHidCorrection,1.0);
+
+ // UPDATE visbias vector
+ //
+ // visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact);
+ //
+ mVisBiasCorrection.AddColSum(-mLearningRate/N,neg_vis,mMomentum);
+ mVisBiasCorrection.AddColSum(+mLearningRate/N,pos_vis,1.0);
+ mVisBias.AddScaled(1.0,mVisBiasCorrection,1.0);
+
+ // UPDATE hidbias vector
+ //
+ // hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact);
+ //
+ mHidBiasCorrection.AddColSum(-mLearningRate/N,neg_hid,mMomentum);
+ mHidBiasCorrection.AddColSum(+mLearningRate/N,pos_hid,1.0);
+ if(mHidType==BERNOULLI) {
+ mHidBiasCorrection.AddScaled(-mSparsityCost,mSparsityQCurrent,1.0);//sparsity
+ }
+ mHidBias.AddScaled(1.0/*0.0*/,mHidBiasCorrection,1.0);
+
+ }
+
+
+ void
+ CuRbmSparse::
+ ReadFromStream(std::istream& rIn)
+ {
+ //type of the units
+ std::string str;
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mVisType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mVisType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+ rIn >> std::ws >> str;
+ if(0 == str.compare("bern")) {
+ mHidType = BERNOULLI;
+ } else if(0 == str.compare("gauss")) {
+ mHidType = GAUSSIAN;
+ } else Error(std::string("Invalid unit type: ")+str);
+
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mVisHid.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mVisBias.CopyFrom(bias);
+ rIn >> bias;
+ mHidBias.CopyFrom(bias);
+
+ rIn >> std::ws >> mSparsityCost;
+ std::cout << "RBM::mSparsityCost=" << mSparsityCost;
+ }
+
+
+ void
+ CuRbmSparse::
+ WriteToStream(std::ostream& rOut)
+ {
+ //store unit type info
+ if(mVisType == BERNOULLI) {
+ rOut << " bern ";
+ } else {
+ rOut << " gauss ";
+ }
+ if(mHidType == BERNOULLI) {
+ rOut << " bern\n";
+ } else {
+ rOut << " gauss\n";
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mVisHid.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mVisBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ mHidBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ //store the sparsity cost
+ rOut << mSparsityCost << std::endl;
+ }
+
+
+} //namespace
diff --git a/src/CuTNetLib/cuRbmSparse.h b/src/CuTNetLib/cuRbmSparse.h
new file mode 100644
index 0000000..9d7e304
--- /dev/null
+++ b/src/CuTNetLib/cuRbmSparse.h
@@ -0,0 +1,134 @@
+#ifndef _CU_RBM_SPARSE_H_
+#define _CU_RBM_SPARSE_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+#include "cuRbm.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuRbmSparse : public CuRbmBase
+ {
+ public:
+
+ CuRbmSparse(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuRbmSparse();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ //CuUpdatableComponent API
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ //RBM training API
+ void Propagate(const CuMatrix<BaseFloat>& visProbs, CuMatrix<BaseFloat>& hidProbs);
+ void Reconstruct(const CuMatrix<BaseFloat>& hidState, CuMatrix<BaseFloat>& visProbs);
+ void RbmUpdate(const CuMatrix<BaseFloat>& pos_vis, const CuMatrix<BaseFloat>& pos_hid, const CuMatrix<BaseFloat>& neg_vis, const CuMatrix<BaseFloat>& neg_hid);
+
+ RbmUnitType VisType()
+ { return mVisType; }
+
+ RbmUnitType HidType()
+ { return mHidType; }
+
+ //static void BinarizeProbs(const CuMatrix<BaseFloat>& probs, CuMatrix<BaseFloat>& states);
+
+ //I/O
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mVisHid; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mVisBias; ///< Vector with biases
+ CuVector<BaseFloat> mHidBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mVisHidCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mVisBiasCorrection; ///< Vector for bias updates
+ CuVector<BaseFloat> mHidBiasCorrection; ///< Vector for bias updates
+
+ CuMatrix<BaseFloat> mBackpropErrBuf;
+
+ RbmUnitType mVisType;
+ RbmUnitType mHidType;
+
+ ////// sparsity
+ BaseFloat mSparsityPrior; ///< sparsity target (unit activity prior)
+ BaseFloat mLambda; ///< exponential decay factor for q (observed probability of unit to be active)
+ BaseFloat mSparsityCost; ///< sparsity cost coef.
+
+ CuVector<BaseFloat> mSparsityQ;
+ CuVector<BaseFloat> mSparsityQCurrent;
+ CuVector<BaseFloat> mVisMean; ///< buffer for mean visible
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuRbmSparse::
+ inline
+ CuRbmSparse::
+ CuRbmSparse(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuRbmBase(nInputs, nOutputs, pPred),
+ mVisHid(nInputs,nOutputs),
+ mVisBias(nInputs), mHidBias(nOutputs),
+ mVisHidCorrection(nInputs,nOutputs),
+ mVisBiasCorrection(nInputs), mHidBiasCorrection(nOutputs),
+ mBackpropErrBuf(),
+ mVisType(BERNOULLI),
+ mHidType(BERNOULLI),
+
+ mSparsityPrior(0.0001),
+ mLambda(0.95),
+ mSparsityCost(1e-7),
+ mSparsityQ(nOutputs),
+ mSparsityQCurrent(nOutputs),
+ mVisMean(nInputs)
+ {
+ mVisHidCorrection.SetConst(0.0);
+ mVisBiasCorrection.SetConst(0.0);
+ mHidBiasCorrection.SetConst(0.0);
+
+ mSparsityQ.SetConst(mSparsityPrior);
+ mSparsityQCurrent.SetConst(0.0);
+ mVisMean.SetConst(0.0);
+ }
+
+
+ inline
+ CuRbmSparse::
+ ~CuRbmSparse()
+ { }
+
+ inline CuComponent::ComponentType
+ CuRbmSparse::
+ GetType() const
+ {
+ return CuComponent::RBM_SPARSE;
+ }
+
+ inline const char*
+ CuRbmSparse::
+ GetName() const
+ {
+ return "<rbmsparse>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuRecurrent.cc b/src/CuTNetLib/cuRecurrent.cc
new file mode 100644
index 0000000..428df2c
--- /dev/null
+++ b/src/CuTNetLib/cuRecurrent.cc
@@ -0,0 +1,191 @@
+
+#include <string>
+#include <sstream>
+
+#include "cuRecurrent.h"
+
+#include "cumath.h"
+#include "cuda_runtime.h"
+
+
+namespace TNet
+{
+
+ void
+ CuRecurrent::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ assert(X.Rows() == 1);
+ assert(Y.Rows() == 1);
+ if(mInputHistory.Rows() == 0) {
+ Error("Bptt order was not set");
+ }
+
+ //pushback the history
+ CuMatrix<BaseFloat> tmp(mInputHistory.Rows()-1,mInputHistory.Cols());
+ tmp.CopyRows(tmp.Rows(),0,mInputHistory,0);
+ mInputHistory.CopyRows(tmp.Rows(),0,tmp,1);
+
+ //compose the input vector to 0th row, use input X and previous Y
+ cudaMemcpy(mInputHistory.pCUData(), X.pCUData(),
+ sizeof(BaseFloat)*X.Cols(), cudaMemcpyDeviceToDevice);
+ cudaMemcpy(mInputHistory.pCUData()+X.Cols(), Y.pCUData(),
+ sizeof(BaseFloat)*Y.Cols(), cudaMemcpyDeviceToDevice);
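+    //(each row of mInputHistory therefore holds the concatenation [ x_t , y_(t-1) ]
+    // of one time step, row 0 being the current frame; Y still contains the
+    // previous frame's output at this point)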
+
+ //extract first row
+ //CuMatrix<BaseFloat> first_row(1,mInputHistory.Cols());
+ //first_row.CopyRows(1,0,mInputHistory,0);
+
+ //calculate the output
+ Y.AddScaledRow(1.0,mBias,0.0);
+ //take 0th vector of history, propagate
+ CuMath<BaseFloat>::OffsetGemv('T',1.0,mLinearity,mInputHistory.pCUData(),mInputHistory.Cols(),1.0,Y.pCUData(),Y.Cols(),0);
+ //Y.Gemm('N','N', 1.0, first_row, mLinearity, 1.0);
+ CuMath<BaseFloat>::Sigmoid(Y,Y);
+
+ /*
+ std::cout << "-------------------------------------" << std::endl;
+ X.Print();
+ Y.Print();
+ mInputHistory.Print();
+ */
+
+ }
+
+
+ void
+ CuRecurrent::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ assert(Y.Rows() == 1);
+ assert(X.Rows() == 1);
+
+ //apply diff sigmoid
+ CuMatrix<BaseFloat> diff_sigm(1,X.Cols());
+ CuMath<BaseFloat>::DiffSigmoid(diff_sigm,X,GetOutput());
+
+    //:TODO: inefficient to calculate all the input errors!!!
+    // we need only part of them!
+ //
+ //backward-multiply by weights
+ /*
+ CuMatrix<BaseFloat> err_prev(1,mLinearity.Rows());
+ err_prev.Gemm('N', 'T', 1.0, diff_sigm, mLinearity, 0.0);
+
+ //copy out the interval
+ cudaMemcpy(Y.pCUData(),err_prev.pCUData(),
+ sizeof(BaseFloat)*Y.Cols(),cudaMemcpyDeviceToDevice);
+ */
+
+ //backward-multiply by weights
+ CuMath<BaseFloat>::OffsetGemv('N',1.0,mLinearity,diff_sigm.pCUData(),diff_sigm.Cols(),1.0,Y.pCUData(),Y.Cols(),0);
+
+ }
+
+
+ void
+ CuRecurrent::
+ Update()
+ {
+ //
+ //correction from PRESENT input x error pair
+ //
+ //apply diff sigmoid
+ CuMatrix<BaseFloat> diff_sigm(1,GetOutput().Cols());
+ CuMath<BaseFloat>::DiffSigmoid(diff_sigm,GetErrorInput(),GetOutput());
+
+ //get 0th row of history (present time)
+ CuMatrix<BaseFloat> history_row(1,mInputHistory.Cols());
+ history_row.CopyRows(1,0,mInputHistory,0);
+
+ //calculate update
+ //mLinearityCorrection.Gemm('T','N',-mLearningRate,history_row,diff_sigm,mMomentum);
+ mLinearityCorrection.SetConst(0.0); //:TODO: should be scale/momentum
+ CuMath<BaseFloat>::BlasGer(-mLearningRate,history_row.pCUData(),history_row.Cols(),diff_sigm.pCUData(),diff_sigm.Cols(),mLinearityCorrection);
+
+ mBiasCorrection.AddColSum(-mLearningRate,diff_sigm,mMomentum);
+
+ //
+ //BPTT (backprop through time)
+ //
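+    //(for each of the mBpttOrder earlier time steps: back-propagate the error
+    // through the recurrent part of mLinearity, re-apply the sigmoid derivative
+    // with the activations stored for that history frame, and accumulate the
+    // corrections against that frame's history row)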
+ CuMatrix<BaseFloat> err_prev(1,mLinearity.Rows());
+ CuMatrix<BaseFloat> err_prev_part(1,diff_sigm.Cols());
+ CuMatrix<BaseFloat> history_output(1,GetOutput().Cols());
+ for(int i=1; i<=mBpttOrder; i++) {
+      //:TODO: inefficient to calculate all the input errors!!!
+      // we need only part of them!
+ //
+ /*
+ //get previous error
+ err_prev.Gemm('N','T',1.0,diff_sigm,mLinearity,0.0);
+ //select interval
+ cudaMemcpy(err_prev_part.pCUData(),err_prev.pCUData()+GetNInputs(),
+ sizeof(BaseFloat)*err_prev_part.Cols(),cudaMemcpyDeviceToDevice);
+ */
+
+ //backward-multiply by weights
+ CuMath<BaseFloat>::OffsetGemv('N',1.0,mLinearity,diff_sigm.pCUData(),diff_sigm.Cols(),0.0,err_prev_part.pCUData(),err_prev_part.Cols(),GetInput().Cols());
+
+ //apply diff sigmoid with activations of HISTORY frame!!!
+ cudaMemcpy(history_output.pCUData(), mInputHistory.pCURowData(i-1)+GetInput().Cols(),
+ sizeof(BaseFloat)*history_output.Cols(), cudaMemcpyDeviceToDevice);
+ CuMath<BaseFloat>::DiffSigmoid(diff_sigm,err_prev_part,history_output);
+
+ //get history row
+ history_row.CopyRows(1,i,mInputHistory,0);
+
+ //accu the update
+ //mLinearityCorrection.Gemm('T','N',-mLearningRate,history_row,diff_sigm,1.0);
+ CuMath<BaseFloat>::BlasGer(-mLearningRate,history_row.pCUData(),history_row.Cols(),diff_sigm.pCUData(),diff_sigm.Cols(),mLinearityCorrection);
+ mBiasCorrection.AddColSum(-mLearningRate,diff_sigm,1.0);
+ }
+
+ //
+ //update the weights
+ //
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ //perform update
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+
+ }
+
+
+
+
+ void
+ CuRecurrent::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+ }
+
+
+ void
+ CuRecurrent::
+ WriteToStream(std::ostream& rOut)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuRecurrent.h b/src/CuTNetLib/cuRecurrent.h
new file mode 100644
index 0000000..e487b27
--- /dev/null
+++ b/src/CuTNetLib/cuRecurrent.h
@@ -0,0 +1,101 @@
+#ifndef _CU_RECURRENT_H_
+#define _CU_RECURRENT_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class CuRecurrent : public CuUpdatableComponent
+ {
+ public:
+
+ CuRecurrent(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuRecurrent();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ //CuUpdatableComponent API
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ //Recurrent training API
+ void BpttOrder(int ord) {
+ mBpttOrder = ord;
+ mInputHistory.Init(ord+1,GetNInputs()+GetNOutputs());
+ }
+ void ClearHistory() {
+ mInputHistory.SetConst(0.0);
+ if(mOutput.MSize() > 0) {
+ mOutput.SetConst(0.0);
+ }
+ }
+
+ //I/O
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity;
+ CuVector<BaseFloat> mBias;
+
+ CuMatrix<BaseFloat> mLinearityCorrection;
+ CuVector<BaseFloat> mBiasCorrection;
+
+ CuMatrix<BaseFloat> mInputHistory;
+
+ int mBpttOrder;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuRecurrent::
+ inline
+ CuRecurrent::
+ CuRecurrent(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs+nOutputs,nOutputs),
+ mBias(nOutputs),
+ mLinearityCorrection(nInputs+nOutputs,nOutputs),
+ mBiasCorrection(nOutputs)
+ { }
+
+
+ inline
+ CuRecurrent::
+ ~CuRecurrent()
+ { }
+
+ inline CuComponent::ComponentType
+ CuRecurrent::
+ GetType() const
+ {
+ return CuComponent::RECURRENT;
+ }
+
+ inline const char*
+ CuRecurrent::
+ GetName() const
+ {
+ return "<recurrent>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuSharedLinearity.cc b/src/CuTNetLib/cuSharedLinearity.cc
new file mode 100644
index 0000000..8d5ec09
--- /dev/null
+++ b/src/CuTNetLib/cuSharedLinearity.cc
@@ -0,0 +1,179 @@
+
+
+#include "cuSharedLinearity.h"
+#include "cumath.h"
+
+
+namespace TNet
+{
+
+ void
+ CuSharedLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ CuMath<BaseFloat>::VecExpand(mBias,mBiasExpand); /// [ 1 2 3 ] -> [ 1 2 3 1 2 3 ... ]
+ Y.AddScaledRow(1.0,mBiasExpand,0.0);
+
+ //mBiasExpand.Print();
+
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N','N', 1.0, X, mLinearity, 1.0, Y,
+ i*mLinearity.Rows(), 0, i*mLinearity.Cols());
+ }
+ //std::cout << CuDevice::Instantiate().GetFreeMemory();
+ //GetInput().Print();
+ //GetOutput().Print();
+ }
+
+
+ void
+ CuSharedLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('N', 'T', 1.0, X, mLinearity, 0.0, Y,
+ i*mLinearity.Cols(), 0, i*mLinearity.Rows());
+ }
+ }
+
+
+ void
+ CuSharedLinearity::
+ Update()
+ {
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('T','N',-mLearningRate/(N*mNInstances),
+ GetInput(),GetErrorInput(),
+ ((i==0)?mMomentum:1.0f), mLinearityCorrection,
+ i*mLinearity.Rows(),i*mLinearity.Cols(),0);
+ }
+ mBiasCorrectionExpand.AddColSum(1.0,GetErrorInput(),0.0);
+ CuMath<BaseFloat>::VecAddColSum(-mLearningRate/(N*mNInstances),mBiasCorrectionExpand,mMomentum,mBiasCorrection);
+
+
+ //regularization weight decay
+ mLinearityCorrection.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+
+ mLinearity.AddScaled(1.0,mLinearityCorrection,1.0);
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain; //compensate higher gradient estimates due to momentum
+
+ //compensate augmented dyn. range of gradient caused by multiple instances
+ N *= static_cast<BaseFloat>(mNInstances);
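+    //(i.e. effectively N = num_frames * mNInstances / (1-mMomentum) when
+    // mGradDivFrm is set, and the updates below use the step -mLearningRate/N)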
+
+ //get gradient of shared linearity
+ for(int i=0; i<mNInstances; i++) {
+ CuMath<BaseFloat>::OffsetGemm('T','N',1.0,
+ GetInput(),GetErrorInput(),
+ ((i==0)?mMomentum:1.0f), mLinearityCorrection,
+ i*mLinearity.Rows(),i*mLinearity.Cols(),0);
+ }
+ //get gradient of shared bias
+ mBiasCorrectionExpand.AddColSum(1.0,GetErrorInput(),0.0);
+ CuMath<BaseFloat>::VecAddColSum(1.0,mBiasCorrectionExpand,mMomentum,mBiasCorrection);
+
+ //perform update
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+ //regularization weight decay
+ mLinearity.AddScaled(-mLearningRate*mWeightcost,mLinearity,1.0);
+#endif
+
+ }
+
+
+ void
+ CuSharedLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //number of instances of shared weights in layer
+ rIn >> std::ws >> mNInstances;
+ if(mNInstances < 1) {
+ std::ostringstream os;
+ os << "Bad number of instances:" << mNInstances;
+ Error(os.str());
+ }
+ if(GetNInputs() % mNInstances != 0 || GetNOutputs() % mNInstances != 0) {
+ std::ostringstream os;
+ os << "Number of Inputs/Outputs must be divisible by number of instances"
+ << " Inputs:" << GetNInputs()
+         << " Outputs:" << GetNOutputs()
+         << " Instances:" << mNInstances;
+ Error(os.str());
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+
+
+ if(mLinearity.Cols() != GetNOutputs() / mNInstances ||
+ mLinearity.Rows() != GetNInputs() / mNInstances ||
+ mBias.Dim() != GetNOutputs() / mNInstances
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+
+ mLinearityCorrection.Init(mLinearity.Rows(),mLinearity.Cols());
+ mBiasCorrection.Init(mBias.Dim());
+
+ mBiasExpand.Init(mBias.Dim()*mNInstances);
+ mBiasCorrectionExpand.Init(mBias.Dim()*mNInstances);
+ }
+
+
+ void
+ CuSharedLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << mNInstances << std::endl;
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
diff --git a/src/CuTNetLib/cuSharedLinearity.h b/src/CuTNetLib/cuSharedLinearity.h
new file mode 100644
index 0000000..76133eb
--- /dev/null
+++ b/src/CuTNetLib/cuSharedLinearity.h
@@ -0,0 +1,94 @@
+#ifndef _CUSHARED_LINEARITY_H_
+#define _CUSHARED_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ /**
+ * \brief CuSharedLinearity summation function
+ *
+ * \ingroup CuNNUpdatable
+   * The weight matrix and the bias are shared between the output blocks,
+   * i.e. the interconnections are segmented: each output block is computed
+   * from its own input block with the same linearity, and the bias is
+   * expanded mNInstances times so that all blocks are biased in one call.
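+   *
+   * Illustrative example (hypothetical sizes): with nInputs=6, nOutputs=4 and
+   * mNInstances=2, the shared mLinearity is a 3x2 matrix and mBias has 2
+   * elements; each of the 2 input blocks of 3 values is multiplied by the
+   * same 3x2 matrix, and mBias is expanded to 4 values via VecExpand.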
+ * \sa CuBiasedLinearity
+ */
+ class CuSharedLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuSharedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuSharedLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ int mNInstances; ///< Number of times the bias and weights are shared
+ CuVector<BaseFloat> mBiasExpand; ///< Bias expanded by mNInstances times
+ CuVector<BaseFloat> mBiasCorrectionExpand;///< Bias correction for the expanded bias vector
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuSharedLinearity::
+ inline
+ CuSharedLinearity::
+ CuSharedLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mNInstances(0)
+ { }
+
+
+ inline
+ CuSharedLinearity::
+ ~CuSharedLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuSharedLinearity::
+ GetType() const
+ {
+ return CuComponent::SHARED_LINEARITY;
+ }
+
+ inline const char*
+ CuSharedLinearity::
+ GetName() const
+ {
+ return "<sharedlinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuSparseLinearity.cc b/src/CuTNetLib/cuSparseLinearity.cc
new file mode 100644
index 0000000..7209630
--- /dev/null
+++ b/src/CuTNetLib/cuSparseLinearity.cc
@@ -0,0 +1,190 @@
+
+
+#include "cuSparseLinearity.h"
+#include <cmath>
+#include <cstdlib>
+
+
+namespace TNet
+{
+
+ void
+ CuSparseLinearity::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.AddScaledRow(1.0,mBias,0.0);
+ Y.Gemm('N','N', 1.0, X, mLinearity, 1.0);
+ }
+
+
+ void
+ CuSparseLinearity::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ Y.Gemm('N', 'T', 1.0, X, mLinearity, 0.0);
+ }
+
+
+ void
+ CuSparseLinearity::
+ Update()
+ {
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mLinearityCorrection.Gemm('T','N',1.0,GetInput(),GetErrorInput(),mMomentum);
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection,1.0);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+ mLinearityCorrectionAccu.AddScaled(1.0,mLinearityCorrection,1.0);
+ mLinearity.ApplyMask(mSparsityMask);
+
+ //L1 regularization lasso...
+    //each update? every 1000th update?
+ if(mL1Const > 0) {
+ BaseFloat L1_const = mLearningRate*mL1Const*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.ApplyL1(L1_const);
+ }
+
+ //L2 regularization weight decay (from actual weights only)
+ if(mWeightcost > 0) {
+ BaseFloat L2_decay = -mLearningRate*mWeightcost*(mGradDivFrm?1.0:GetInput().Rows());
+ mLinearity.AddScaled(L2_decay, mLinearity,1.0);
+ }
+
+ mNFrames += GetInput().Rows();
+
+ }
+
+
+ void
+ CuSparseLinearity::
+ UpdateMask()
+ {
+ //move data to host
+ Matrix<BaseFloat> linearity, linearity_correction_accu;
+ Matrix<BaseFloat> sparsity_mask;
+
+ mLinearity.CopyTo(linearity);
+ mLinearityCorrectionAccu.CopyTo(linearity_correction_accu);
+ mSparsityMask.CopyTo(sparsity_mask);
+
+ //decide on new sparsity mask
+ for(size_t r=0; r<sparsity_mask.Rows(); r++) {
+ for(size_t c=0; c<sparsity_mask.Cols(); c++) {
+ if(sparsity_mask(r,c) == 1.0f) { //weight active
+ if(fabs(linearity(r,c)) < mSparsifyWeightThreshold) {
+ sparsity_mask(r,c) = 0;//deactivate
+ linearity(r,c) = 0;
+ }
+ } else { //weight inactive
+        if(fabs(linearity_correction_accu(r,c))/(BaseFloat)mNFrames > mUnsparsifyAccu) {
+ sparsity_mask(r,c) = 1;//activate
+ }
+ }
+ }
+ }
+
+ //move data to the device
+ mLinearity.CopyFrom(linearity);
+ mSparsityMask.CopyFrom(sparsity_mask);
+ }
+
+
+ void
+ CuSparseLinearity::
+ ReadFromStream(std::istream& rIn)
+ {
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity.CopyFrom(BfMatrix(transpose, TRANS));
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ //sparsity mask
+ rIn >> std::ws;
+ Matrix<BaseFloat> mask_transp;
+ if(rIn.peek() == 'm') {//load from file
+ rIn >> mask_transp;
+ } else {//or set all elements active
+ mask_transp.Init(transpose.Rows(),transpose.Cols());
+ int items=transpose.Rows()*transpose.Stride();
+ BaseFloat* p = mask_transp.pData();
+ for(int i=0; i<items; i++) {//set all elements to one
+ *p++ = 1;
+ }
+ }
+ mSparsityMask.CopyFrom(BfMatrix(mask_transp,TRANS));
+
+    //dummy matrix with accumulated gradients
+ rIn >> std::ws;
+ if(rIn.peek() == 'm') {//load from file
+ BfMatrix dummy;
+ rIn >> dummy;
+ }
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+ if(mLinearity.Cols() != GetNOutputs() ||
+ mLinearity.Rows() != GetNInputs() ||
+ mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols()
+ << "linearityRows:" << mLinearity.Rows()
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+
+ assert(mLinearity.Rows() == mSparsityMask.Rows());
+ assert(mLinearity.Cols() == mSparsityMask.Cols());
+
+ }
+
+
+ void
+ CuSparseLinearity::
+ WriteToStream(std::ostream& rOut)
+ {
+ UpdateMask();
+
+ //matrix is stored transposed as SNet does
+ BfMatrix tmp;
+ mLinearity.CopyTo(tmp);
+ BfMatrix transpose(tmp, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ //store mask
+ mSparsityMask.CopyTo(tmp);
+ rOut << BfMatrix(tmp,TRANS);
+ //store accu
+ mLinearityCorrectionAccu.CopyTo(tmp);
+ rOut << BfMatrix(tmp,TRANS);
+
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuSparseLinearity.h b/src/CuTNetLib/cuSparseLinearity.h
new file mode 100644
index 0000000..3cdf078
--- /dev/null
+++ b/src/CuTNetLib/cuSparseLinearity.h
@@ -0,0 +1,115 @@
+#ifndef _CUSPARSE_LINEARITY_H_
+#define _CUSPARSE_LINEARITY_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ /**
+ * \brief CuSparseLinearity summation function
+ *
+ * \ingroup CuNNUpdatable
+   * Uses a weight mask to avoid fluctuations in the output:
+   * -Weights are masked out when their magnitude drops below a certain threshold - mSparsifyWeightThreshold
+   * -Masked weights are re-activated when their accumulated correction grows larger than a certain value - mUnsparsifyAccu
+   * -L1 (lasso) regularization drives weights towards zero
+ * .
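+   *
+   * Sketch of the masking rule (mirrors UpdateMask()): an active weight w is
+   * zeroed and masked once fabs(w) < mSparsifyWeightThreshold; a masked weight
+   * is re-activated once fabs(accumulated correction)/mNFrames > mUnsparsifyAccu.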
+ * \sa CuBiasedLinearity
+ */
+ class CuSparseLinearity : public CuUpdatableComponent
+ {
+ public:
+
+ CuSparseLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuSparseLinearity();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+ void UpdateMask();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ void L1(BaseFloat l1) {
+ mL1Const = l1;
+ }
+
+ protected:
+ CuMatrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+ CuMatrix<BaseFloat> mSparsityMask; ///< Mask which selects active weights
+
+ CuMatrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ CuMatrix<BaseFloat> mLinearityCorrectionAccu; ///< Accumulator for linearity updates
+
+ BaseFloat mL1Const; ///< L1 regularization constant
+
+ size_t mNFrames; ///< Number of accumulated frames
+    BaseFloat mSparsifyWeightThreshold; ///< Weight magnitude cutoff below which a weight gets masked
+    BaseFloat mUnsparsifyAccu; ///< Accumulated-correction threshold above which a masked weight is re-activated
+
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuSparseLinearity::
+ inline
+ CuSparseLinearity::
+ CuSparseLinearity(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mLinearity(nInputs,nOutputs), mBias(nOutputs), mSparsityMask(nInputs,nOutputs),
+ mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs),
+ mLinearityCorrectionAccu(nInputs,nOutputs),
+ mNFrames(0), mSparsifyWeightThreshold(1.0e-3),
+ mUnsparsifyAccu(1e20f)
+ {
+ mLinearityCorrection.SetConst(0.0f);
+ mBiasCorrection.SetConst(0.0f);
+ mLinearityCorrectionAccu.SetConst(0.0f);
+ }
+
+
+ inline
+ CuSparseLinearity::
+ ~CuSparseLinearity()
+ { }
+
+ inline CuComponent::ComponentType
+ CuSparseLinearity::
+ GetType() const
+ {
+ return CuComponent::SPARSE_LINEARITY;
+ }
+
+ inline const char*
+ CuSparseLinearity::
+ GetName() const
+ {
+ return "<sparselinearity>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/CuTNetLib/cuUpdatableBias.cc b/src/CuTNetLib/cuUpdatableBias.cc
new file mode 100644
index 0000000..2a9cbed
--- /dev/null
+++ b/src/CuTNetLib/cuUpdatableBias.cc
@@ -0,0 +1,96 @@
+
+
+#include "cuUpdatableBias.h"
+
+
+namespace TNet
+{
+
+ void
+ CuUpdatableBias::
+ PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.AddScaledRow(1.0,mBias,0.0);
+ Y.AddScaled(1.0,X,1.0);
+ }
+
+
+ void
+ CuUpdatableBias::
+ BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y)
+ {
+ //Y.SetConst(0.0);
+ Y.CopyFrom(X);
+ }
+
+
+ void
+ CuUpdatableBias::
+ Update()
+ {
+#if 0
+ //former implementation
+ BaseFloat N = static_cast<BaseFloat>(GetInput().Rows());
+
+ mBiasCorrection.AddColSum(-mLearningRate/N,GetErrorInput(),mMomentum);
+
+ mBias.AddScaled(1.0,mBiasCorrection,1.0);
+#endif
+
+#if 1
+ //new implementation
+ BaseFloat N = 1;
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain;
+
+ mBiasCorrection.AddColSum(1.0,GetErrorInput(),mMomentum);
+
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection,1.0);
+
+#endif
+ }
+
+
+ void
+ CuUpdatableBias::
+ ReadFromStream(std::istream& rIn)
+ {
+ //biases stored normally
+ BfVector bias;
+ rIn >> bias;
+ mBias.CopyFrom(bias);
+
+ /*if(bias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }*/
+ if( mBias.Dim() != GetNOutputs()
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << "Outputs:" << GetNOutputs()
+ << "\n"
+ << "biasDims:" << mBias.Dim()
+ << "\n";
+ Error(os.str());
+ }
+ }
+
+
+ void
+ CuUpdatableBias::
+ WriteToStream(std::ostream& rOut)
+ {
+ BfVector vec;
+ mBias.CopyTo(vec);
+ rOut << vec;
+ rOut << std::endl;
+ }
+
+
+} //namespace
+
diff --git a/src/CuTNetLib/cuUpdatableBias.h b/src/CuTNetLib/cuUpdatableBias.h
new file mode 100644
index 0000000..df8066a
--- /dev/null
+++ b/src/CuTNetLib/cuUpdatableBias.h
@@ -0,0 +1,109 @@
+#ifndef _CUUPDATABLE_BIAS_H_
+#define _CUUPDATABLE_BIAS_H_
+
+
+#include "cuComponent.h"
+#include "cumatrix.h"
+
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+ /**
+ * \brief CuUpdatableBias summation function
+ *
+ * \ingroup CuNNUpdatable
+ * Implements forward pass: \f[ Y_i=X_i +{\beta}_i \f]
+ * Error propagation: \f[ E_i = e_i \f]
+ *
+ * Weight adjust:
+   * for bias: \f[ \beta_i^{new} = \beta_i - \alpha(1-\mu)e_i - \mu\Delta_i \f]
+   * where
+   * - \f$ \Delta_i \f$ is the previous bias update (momentum term)
+   * - \f$ \alpha \f$ is the learning rate
+   * - \f$ \mu \f$ is the momentum coefficient => avoids oscillation
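+   *
+   * Worked instance of the bias update above (illustrative numbers only):
+   * with \f$ \alpha=0.01, \mu=0.5, e_i=0.2, \Delta_i=0.1 \f$ the bias changes by
+   * \f$ -0.01\cdot(1-0.5)\cdot 0.2 - 0.5\cdot 0.1 = -0.051 \f$.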
+ */
+ class CuUpdatableBias : public CuUpdatableComponent
+ {
+ public:
+
+ CuUpdatableBias(size_t nInputs, size_t nOutputs, CuComponent *pPred);
+ ~CuUpdatableBias();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+    const CuMatrix<BaseFloat>& GetErrorOutput();
+
+ void PropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+ void BackpropagateFnc(const CuMatrix<BaseFloat>& X, CuMatrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ void Backpropagate();
+
+ protected:
+ CuVector<BaseFloat> mBias; ///< Vector with biases
+
+ CuVector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // CuUpdatableBias::
+ inline
+ CuUpdatableBias::
+ CuUpdatableBias(size_t nInputs, size_t nOutputs, CuComponent *pPred)
+ : CuUpdatableComponent(nInputs, nOutputs, pPred),
+ mBias(nOutputs), mBiasCorrection(nOutputs)
+ {
+ mBiasCorrection.SetConst(0.0);
+ }
+
+
+ inline
+ CuUpdatableBias::
+ ~CuUpdatableBias()
+ { }
+
+ inline CuComponent::ComponentType
+ CuUpdatableBias::
+ GetType() const
+ {
+ return CuComponent::UPDATABLEBIAS;
+ }
+
+ inline const char*
+ CuUpdatableBias::
+ GetName() const
+ {
+ return "<updatablebias>";
+ }
+
+ inline void
+ CuUpdatableBias::
+ Backpropagate()
+ {
+ }
+
+ inline const CuMatrix<BaseFloat>&
+ CuUpdatableBias::
+ GetErrorOutput()
+ {
+ return GetErrorInput();
+ }
+
+} //namespace
+
+
+
+#endif
diff --git a/src/GotoBLASLib/.svn/entries b/src/GotoBLASLib/.svn/entries
new file mode 100644
index 0000000..29b523f
--- /dev/null
+++ b/src/GotoBLASLib/.svn/entries
@@ -0,0 +1,96 @@
+10
+
+dir
+117
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/GotoBLASLib
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet
+
+
+
+2011-05-30T00:13:43.494031Z
+57
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+bda6da93-004a-4ae9-8e07-715c10848801
+
+00License.txt
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+f5be3760860238b7f064d27c77f66e74
+2011-05-30T00:13:43.494031Z
+57
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1608
+
+README
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+318aea90056e223fda01e6408e70a774
+2011-05-30T00:13:43.494031Z
+57
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+261
+
diff --git a/src/GotoBLASLib/.svn/prop-base/README.svn-base b/src/GotoBLASLib/.svn/prop-base/README.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/GotoBLASLib/.svn/prop-base/README.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/GotoBLASLib/.svn/text-base/00License.txt.svn-base b/src/GotoBLASLib/.svn/text-base/00License.txt.svn-base
new file mode 100644
index 0000000..56a0f74
--- /dev/null
+++ b/src/GotoBLASLib/.svn/text-base/00License.txt.svn-base
@@ -0,0 +1,32 @@
+
+Copyright 2009, 2010 The University of Texas at Austin.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT AUSTIN ``AS IS''
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT
+AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation
+are those of the authors and should not be interpreted as representing
+official policies, either expressed or implied, of The University of
+Texas at Austin.
diff --git a/src/GotoBLASLib/.svn/text-base/README.svn-base b/src/GotoBLASLib/.svn/text-base/README.svn-base
new file mode 100644
index 0000000..d267034
--- /dev/null
+++ b/src/GotoBLASLib/.svn/text-base/README.svn-base
@@ -0,0 +1,8 @@
+== README ==
+The GotoBLAS library can be obtained from URL:
+http://www.tacc.utexas.edu/tacc-projects/gotoblas2/downloads/
+
+The GotoBLAS library is distributed under the BSD licence;
+for details see: 00License.txt
+
+The author of GotoBLAS is Kazushige Goto...
diff --git a/src/GotoBLASLib/00License.txt b/src/GotoBLASLib/00License.txt
new file mode 100644
index 0000000..56a0f74
--- /dev/null
+++ b/src/GotoBLASLib/00License.txt
@@ -0,0 +1,32 @@
+
+Copyright 2009, 2010 The University of Texas at Austin.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT AUSTIN ``AS IS''
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT
+AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation
+are those of the authors and should not be interpreted as representing
+official policies, either expressed or implied, of The University of
+Texas at Austin.
diff --git a/src/GotoBLASLib/README b/src/GotoBLASLib/README
new file mode 100644
index 0000000..d267034
--- /dev/null
+++ b/src/GotoBLASLib/README
@@ -0,0 +1,8 @@
+== README ==
+The GotoBLAS library can be obtained from URL:
+http://www.tacc.utexas.edu/tacc-projects/gotoblas2/downloads/
+
+The GotoBLAS library is distributed under the BSD licence;
+for details see: 00License.txt
+
+The author of GotoBLAS is Kazushige Goto...
diff --git a/src/GotoBLASLib/libgoto2.so b/src/GotoBLASLib/libgoto2.so
new file mode 100755
index 0000000..db4362c
--- /dev/null
+++ b/src/GotoBLASLib/libgoto2.so
Binary files differ
diff --git a/src/GotoBLASLib/libgoto2_64.so b/src/GotoBLASLib/libgoto2_64.so
new file mode 100755
index 0000000..fec56c0
--- /dev/null
+++ b/src/GotoBLASLib/libgoto2_64.so
Binary files differ
diff --git a/src/KaldiLib/.depend.mk b/src/KaldiLib/.depend.mk
new file mode 100644
index 0000000..48bfef8
--- /dev/null
+++ b/src/KaldiLib/.depend.mk
@@ -0,0 +1,959 @@
+Common.o: Common.cc /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/stdexcept /usr/include/c++/4.6/cmath \
+ /usr/include/math.h /usr/include/bits/huge_val.h \
+ /usr/include/bits/huge_valf.h /usr/include/bits/huge_vall.h \
+ /usr/include/bits/inf.h /usr/include/bits/nan.h \
+ /usr/include/bits/mathdef.h /usr/include/bits/mathcalls.h \
+ /usr/include/bits/mathinline.h /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h Common.h /usr/include/c++/4.6/cstdlib \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/sys/types.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/select2.h \
+ /usr/include/sys/sysmacros.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc MathAux.h
+Features.o: Features.cc /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h Features.h /usr/include/c++/4.6/queue \
+ /usr/include/c++/4.6/deque /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/stdexcept Matrix.h \
+ /usr/include/c++/4.6/iostream cblas.h clapack.h cblas.h MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h Types.h \
+ Error.h /usr/include/execinfo.h Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ Vector.h /usr/include/c++/4.6/cstddef Vector.tcc \
+ /usr/include/c++/4.6/cstring StkStream.h StkStream.tcc Timer.h \
+ /usr/include/sys/time.h Tokenizer.h StkMatch.h
+Labels.o: Labels.cc Labels.h Matrix.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h /usr/include/stdlib.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/string /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc cblas.h clapack.h cblas.h Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/bits/sstream.tcc MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h Types.h \
+ Error.h /usr/include/execinfo.h Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc Vector.h \
+ /usr/include/c++/4.6/cstddef Vector.tcc /usr/include/c++/4.6/cstring \
+ MlfStream.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/set /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h MlfStream.tcc StkMatch.h \
+ Features.h /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc \
+ /usr/include/c++/4.6/bits/stl_queue.h StkStream.h StkStream.tcc Timer.h \
+ /usr/include/sys/time.h
+Matrix.o: Matrix.cc Matrix.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h /usr/include/stdlib.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/string /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc cblas.h clapack.h cblas.h Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/bits/sstream.tcc MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h Types.h \
+ Error.h /usr/include/execinfo.h Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc Vector.h \
+ /usr/include/c++/4.6/cstddef Vector.tcc /usr/include/c++/4.6/cstring
+MlfStream.o: MlfStream.cc MlfStream.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/set /usr/include/c++/4.6/bits/stl_set.h \
+ /usr/include/c++/4.6/bits/stl_multiset.h MlfStream.tcc \
+ /usr/include/c++/4.6/algorithm /usr/include/c++/4.6/utility \
+ /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h /usr/include/c++/4.6/cstdlib \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/sys/types.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/select2.h \
+ /usr/include/sys/sysmacros.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h Common.h /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/bits/sstream.tcc /usr/include/c++/4.6/stdexcept \
+ StkMatch.h Error.h /usr/include/execinfo.h
+StkMatch.o: StkMatch.cc StkMatch.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc /usr/include/c++/4.6/stdexcept
+Timer.o: Timer.cc Timer.h Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ /usr/include/sys/time.h
+Tokenizer.o: Tokenizer.cc Tokenizer.h /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/stl_list.h \
+ /usr/include/c++/4.6/initializer_list /usr/include/c++/4.6/bits/list.tcc \
+ /usr/include/c++/4.6/string /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/string.h \
+ /usr/include/bits/string3.h
+UserInterface.o: UserInterface.cc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/string /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/xlocale.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/bits/pthreadtypes.h \
+ /usr/include/bits/setjmp.h /usr/include/unistd.h \
+ /usr/include/bits/posix_opt.h /usr/include/bits/environments.h \
+ /usr/include/bits/confname.h /usr/include/getopt.h \
+ /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/sstream.tcc UserInterface.h \
+ /usr/include/c++/4.6/iostream /usr/include/c++/4.6/cstdlib \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/sys/types.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/select2.h \
+ /usr/include/sys/sysmacros.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/map \
+ /usr/include/c++/4.6/bits/stl_tree.h /usr/include/c++/4.6/bits/stl_map.h \
+ /usr/include/c++/4.6/bits/stl_multimap.h StkStream.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_construct.h \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc /usr/include/c++/4.6/list \
+ /usr/include/c++/4.6/bits/stl_list.h /usr/include/c++/4.6/bits/list.tcc \
+ StkStream.tcc /usr/include/c++/4.6/cstring /usr/include/string.h \
+ /usr/include/bits/string3.h Common.h Features.h \
+ /usr/include/c++/4.6/queue /usr/include/c++/4.6/deque \
+ /usr/include/c++/4.6/bits/stl_deque.h \
+ /usr/include/c++/4.6/bits/deque.tcc /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_queue.h Matrix.h cblas.h clapack.h cblas.h \
+ MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h Types.h \
+ Error.h /usr/include/execinfo.h Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/iomanip /usr/include/c++/4.6/typeinfo \
+ /usr/include/c++/4.6/algorithm /usr/include/c++/4.6/utility \
+ /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h /usr/include/c++/4.6/limits \
+ Vector.h /usr/include/c++/4.6/cstddef Vector.tcc Timer.h \
+ /usr/include/sys/time.h
+Vector.o: Vector.cc /usr/include/c++/4.6/cstdlib \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/stdlib.h /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/cmath \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ /usr/include/c++/4.6/cstring /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/fstream \
+ /usr/include/c++/4.6/istream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc /usr/include/c++/4.6/ostream \
+ /usr/include/c++/4.6/bits/ostream.tcc \
+ /usr/include/c++/4.6/bits/istream.tcc \
+ /usr/include/c++/4.6/bits/codecvt.h /usr/include/c++/4.6/cstdio \
+ /usr/include/libio.h /usr/include/_G_config.h \
+ /usr/include/bits/stdio_lim.h /usr/include/bits/sys_errlist.h \
+ /usr/include/bits/stdio.h /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ Common.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/bits/sstream.tcc /usr/include/c++/4.6/stdexcept \
+ cblas.h Matrix.h /usr/include/c++/4.6/iostream clapack.h cblas.h \
+ MathAux.h Types.h Error.h /usr/include/execinfo.h Matrix.tcc \
+ /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc Vector.h \
+ /usr/include/c++/4.6/cstddef Vector.tcc
+clapack.o: clapack.cc
diff --git a/src/KaldiLib/.svn/entries b/src/KaldiLib/.svn/entries
new file mode 100644
index 0000000..b081f47
--- /dev/null
+++ b/src/KaldiLib/.svn/entries
@@ -0,0 +1,1116 @@
+10
+
+dir
+117
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/KaldiLib
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet
+
+
+
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+bda6da93-004a-4ae9-8e07-715c10848801
+
+Error.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+9c94ed7f143028a52f6c04e2dbf2fe8a
+2011-09-26T13:47:57.076756Z
+70
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3533
+
+clapack.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+55c1d7eb68c676cb8f18944f4b299888
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+8225
+
+Labels.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+1c9fe7163158c402d8132c56ace8b64c
+2012-03-06T10:47:03.584726Z
+108
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6429
+
+Vector.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+db56c6fd16aa2942730f4192c17585b4
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+12556
+
+Matrix.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+d2ac8ddb2cdc2bba4f20abde60f08295
+2011-12-08T10:59:03.566125Z
+94
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+19232
+
+MathAux.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+27359bc09626fa2dbc4701c1aa849a8f
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2410
+
+Tokenizer.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+3700af448271e89c81e06b34a45cd356
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1165
+
+Features.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+c117134be9bd27dd7c91d095d0caa10b
+2012-03-09T18:30:40.332683Z
+109
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+55834
+
+MlfStream.tcc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+89d36edc8e76da017cdc3cf59e6b8cc3
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+14778
+
+StkMatch.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+15c7044c78619f0e3fa3eb22f2d873b5
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+18030
+
+Timer.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+1d93c9c66183bed8f716c642819ee10b
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1574
+
+Makefile
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+769633138edc6544034a463c096a089b
+2011-03-24T17:03:17.103393Z
+43
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+357
+
+Common.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+375324199f017a7826a59e2d64f6952d
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6838
+
+MlfStream.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+858de52f3b15b99a646ef346daec3f1e
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+16873
+
+clapack.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+3fe83d260dcb9daf3f052f680022e663
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1748
+
+UserInterface.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+ac7e3eaa15fd5a096560ba5037e1e98b
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+19843
+
+UserInterface.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+76a8a4b9faabba54d65738c98581abe2
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4227
+
+StkStream.tcc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+80225c1e16814ab3a80faba7878b4d25
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+5938
+
+Vector.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+ee4684446df24b6f0ee8acdb2d26de51
+2012-03-06T10:47:03.584726Z
+108
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2665
+
+Labels.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+e43167ea848bd334aebf97024bd3c804
+2011-04-04T17:14:16.666438Z
+46
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2068
+
+Matrix.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+9a5d3820e044f6467cc1eefc30ad21df
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+10766
+
+Types.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+3ec1387bdfce0588fdb686691ee4fd37
+2011-03-22T21:01:03.678832Z
+41
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1642
+
+Tokenizer.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+22eccbaf72cf840566041b54592fc890
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1659
+
+StkStream.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+d845d20b56fce534768eb0f5b949a778
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+16884
+
+Timer.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+432c189d859feb846843d7471aec2d15
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+46
+
+StkMatch.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+ccbf7d599028083c6c2565caf73b4892
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4632
+
+Features.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+7db288f0abd2566709a02731d7e451fd
+2012-03-06T10:47:03.584726Z
+108
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+20490
+
+cblas.h
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+84cb6fe5a4d2cd9f7afbd6033889fc88
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+33895
+
+Vector.tcc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+43165f9a2b33e5450ebe31bb3584295d
+2011-03-22T21:01:03.678832Z
+41
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+18379
+
+Common.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+2761bdee947027a0f80d090fa95d7a8c
+2011-02-24T12:12:08.754106Z
+34
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+7100
+
+Matrix.tcc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+8a17a493ed2a0739a1aa9a738269e7c2
+2011-03-22T21:01:03.678832Z
+41
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+21580
+
+MlfStream.cc
+file
+
+
+
+
+2012-04-02T13:49:15.000000Z
+66740c851436ba1d374dc4262186fbd5
+2011-04-29T12:18:20.752880Z
+49
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6959
+
diff --git a/src/KaldiLib/.svn/prop-base/Common.cc.svn-base b/src/KaldiLib/.svn/prop-base/Common.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Common.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Common.h.svn-base b/src/KaldiLib/.svn/prop-base/Common.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Common.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Error.h.svn-base b/src/KaldiLib/.svn/prop-base/Error.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Error.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Features.cc.svn-base b/src/KaldiLib/.svn/prop-base/Features.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Features.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Features.h.svn-base b/src/KaldiLib/.svn/prop-base/Features.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Features.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Labels.cc.svn-base b/src/KaldiLib/.svn/prop-base/Labels.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Labels.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Labels.h.svn-base b/src/KaldiLib/.svn/prop-base/Labels.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Labels.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Makefile.svn-base b/src/KaldiLib/.svn/prop-base/Makefile.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Makefile.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/MathAux.h.svn-base b/src/KaldiLib/.svn/prop-base/MathAux.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/MathAux.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Matrix.cc.svn-base b/src/KaldiLib/.svn/prop-base/Matrix.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Matrix.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Matrix.h.svn-base b/src/KaldiLib/.svn/prop-base/Matrix.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Matrix.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Matrix.tcc.svn-base b/src/KaldiLib/.svn/prop-base/Matrix.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Matrix.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/MlfStream.cc.svn-base b/src/KaldiLib/.svn/prop-base/MlfStream.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/MlfStream.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/MlfStream.h.svn-base b/src/KaldiLib/.svn/prop-base/MlfStream.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/MlfStream.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/MlfStream.tcc.svn-base b/src/KaldiLib/.svn/prop-base/MlfStream.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/MlfStream.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/StkMatch.cc.svn-base b/src/KaldiLib/.svn/prop-base/StkMatch.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/StkMatch.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/StkMatch.h.svn-base b/src/KaldiLib/.svn/prop-base/StkMatch.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/StkMatch.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/StkStream.h.svn-base b/src/KaldiLib/.svn/prop-base/StkStream.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/StkStream.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/StkStream.tcc.svn-base b/src/KaldiLib/.svn/prop-base/StkStream.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/StkStream.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Timer.cc.svn-base b/src/KaldiLib/.svn/prop-base/Timer.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Timer.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Timer.h.svn-base b/src/KaldiLib/.svn/prop-base/Timer.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Timer.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Tokenizer.cc.svn-base b/src/KaldiLib/.svn/prop-base/Tokenizer.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Tokenizer.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Tokenizer.h.svn-base b/src/KaldiLib/.svn/prop-base/Tokenizer.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Tokenizer.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Types.h.svn-base b/src/KaldiLib/.svn/prop-base/Types.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Types.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/UserInterface.cc.svn-base b/src/KaldiLib/.svn/prop-base/UserInterface.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/UserInterface.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/UserInterface.h.svn-base b/src/KaldiLib/.svn/prop-base/UserInterface.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/UserInterface.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Vector.cc.svn-base b/src/KaldiLib/.svn/prop-base/Vector.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Vector.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Vector.h.svn-base b/src/KaldiLib/.svn/prop-base/Vector.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Vector.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/Vector.tcc.svn-base b/src/KaldiLib/.svn/prop-base/Vector.tcc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/Vector.tcc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/cblas.h.svn-base b/src/KaldiLib/.svn/prop-base/cblas.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/cblas.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/clapack.cc.svn-base b/src/KaldiLib/.svn/prop-base/clapack.cc.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/clapack.cc.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/prop-base/clapack.h.svn-base b/src/KaldiLib/.svn/prop-base/clapack.h.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/KaldiLib/.svn/prop-base/clapack.h.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/KaldiLib/.svn/text-base/Common.cc.svn-base b/src/KaldiLib/.svn/text-base/Common.cc.svn-base
new file mode 100644
index 0000000..40909ee
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Common.cc.svn-base
@@ -0,0 +1,277 @@
+#include <string>
+#include <stdexcept>
+#include <cmath>
+#include <cfloat>
+#include <cstdio>
+
+#include "Common.h"
+#include "MathAux.h"
+
+
+/// Defines the white chars for string trimming
+#if !defined(WHITE_CHARS)
+# define WHITE_CHARS " \t"
+#endif
+
+namespace TNet {
+
+#include <ios>
+
+  // Allocate the iostream format-word index used by the MatrixVectorIostreamControl stream modifier
+ const int MATRIX_IOS_FORMAT_IWORD = std::ios_base::xalloc();
+
+ //***************************************************************************
+ //***************************************************************************
+ int getHTKstr(char *str)
+ {
+ char termChar = '\0';
+ char *chrptr = str;
+
+ while (std::isspace(*chrptr)) ++chrptr;
+
+ if (*chrptr == '\'' || *chrptr == '"') {
+ termChar = *chrptr;
+ chrptr++;
+ }
+
+ for (; *chrptr; chrptr++) {
+ if (*chrptr == '\'' || *chrptr == '"') {
+ if (termChar == *chrptr) {
+ termChar = '\0';
+ chrptr++;
+ break;
+ }
+ }
+
+ if (std::isspace(*chrptr) && !termChar) {
+ break;
+ }
+
+ if (*chrptr == '\\') {
+ ++chrptr;
+ if (*chrptr == '\0' || (*chrptr >= '0' && *chrptr <= '7' &&
+ (*++chrptr < '0' || *chrptr > '7' ||
+ *++chrptr < '0' || *chrptr > '7'))) {
+ return -1;
+ }
+
+ if (*chrptr >= '0' && *chrptr <= '7') {
+ *chrptr = (char)((*chrptr - '0') + (chrptr[-1] - '0') * 8 + (chrptr[-2] - '0') * 64);
+ }
+ }
+ *str++ = *chrptr;
+ }
+
+ if (termChar) {
+ return -2;
+ }
+
+ *str = '\0';
+
+ return 0;
+ }
+
+
+ //*****************************************************************************
+ //*****************************************************************************
+ void
+ ParseHTKString(const std::string & rIn, std::string & rOut)
+ {
+ int ret_val;
+
+ // the new string will be at most as long as the original, so we allocate
+ // space
+ char* new_str = new char[rIn.size() + 1];
+
+ char* p_htk_str = new_str;
+
+ strcpy(p_htk_str, rIn.c_str());
+ ret_val = getHTKstr(p_htk_str);
+
+    // if parsing succeeded, copy the result out
+ if (!ret_val) {
+ rOut = p_htk_str;
+ }
+
+ delete [] new_str;
+
+ if (ret_val) {
+ throw std::runtime_error("Error parsing HTK string");
+ }
+ }
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ IsBigEndian()
+ {
+ int a = 1;
+ return (bool) ((char *) &a)[0] != 1;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ MakeHtkFileName(char* pOutFileName, const char* inFileName,
+ const char* out_dir, const char* out_ext)
+ {
+ const char* base_name;
+ const char* bname_end = NULL;
+ const char* chrptr;
+
+ // if (*inFileName == '*' && *++inFileName == '/') ++inFileName;
+
+ // we don't do anything if file is stdin/out
+ if (!strcmp(inFileName, "-"))
+ {
+ pOutFileName[0] = '-';
+ pOutFileName[1] = '\0';
+ return;
+ }
+
+ base_name = strrchr(inFileName, '/');
+ base_name = base_name != NULL ? base_name + 1 : inFileName;
+
+ if (out_ext) bname_end = strrchr(base_name, '.');
+ if (!bname_end) bname_end = base_name + strlen(base_name);
+
+
+ if ((chrptr = strstr(inFileName, "/./")) != NULL)
+ {
+      // whatever follows /./ in the path serves as the base name
+ base_name = chrptr + 3;
+ }
+ /* else if (*inFileName != '/')
+ {
+      // if inFileName isn't an absolute path, don't forget the directory structure
+ base_name = inFileName;
+ }*/
+
+ *pOutFileName = '\0';
+ if (out_dir)
+ {
+ if (*out_dir)
+ {
+ strcat(pOutFileName, out_dir);
+ strcat(pOutFileName, "/");
+ }
+ strncat(pOutFileName, base_name, bname_end-base_name);
+ }
+ else
+ {
+ strncat(pOutFileName, inFileName, bname_end-inFileName);
+ }
+
+ if (out_ext && *out_ext)
+ {
+ strcat(pOutFileName, ".");
+ strcat(pOutFileName, out_ext);
+ }
+ }
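A rough example of what MakeHtkFileName produces; the paths are hypothetical:

    char out_name[1024];
    // "data/spk1/utt01.fea" + out_dir "mfcc" + out_ext "htk"  ->  "mfcc/utt01.htk"
    TNet::MakeHtkFileName(out_name, "data/spk1/utt01.fea", "mfcc", "htk");
    // with out_dir == NULL the directory part of the input name is kept instead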
+
+
+ //****************************************************************************
+ //****************************************************************************
+ bool
+ CloseEnough(const float f1, const float f2, const float nRounds)
+ {
+ bool ret_val = (_ABS((f1 - f2) / (f2 == 0.0f ? 1.0f : f2))
+ < (nRounds * FLT_EPSILON));
+
+ return ret_val;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ bool
+ CloseEnough(const double f1, const double f2, const double nRounds)
+ {
+ bool ret_val = (_ABS((f1 - f2) / (f2 == 0.0 ? 1.0 : f2))
+ < (nRounds * DBL_EPSILON));
+
+ return ret_val;
+ }
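CloseEnough is a relative comparison scaled by the expected number of rounding operations; roughly, for floats:

    // |f1 - f2| / |f2|  <  nRounds * FLT_EPSILON   (f2 == 0 is guarded above)
    bool ok = TNet::CloseEnough(1.0000001f, 1.0000002f, 10.0f);   // true: difference is ~1 ulp, tolerance is 10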
+
+
+ //****************************************************************************
+ //****************************************************************************
+ char*
+ ExpandHtkFilterCmd(const char *command, const char *filename, const char* pFilter)
+ {
+
+ char *out, *outend;
+ const char *chrptr = command;
+ int ndollars = 0;
+ int fnlen = strlen(filename);
+
+ while (*chrptr++) ndollars += (*chrptr == *pFilter);
+
+ out = (char*) malloc(strlen(command) - ndollars + ndollars * fnlen + 1);
+
+ outend = out;
+
+ for (chrptr = command; *chrptr; chrptr++) {
+ if (*chrptr == *pFilter) {
+ strcpy(outend, filename);
+ outend += fnlen;
+ } else {
+ *outend++ = *chrptr;
+ }
+ }
+ *outend = '\0';
+ return out;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ char *
+ StrToUpper(char *str)
+ {
+ char *chptr;
+ for (chptr = str; *chptr; chptr++) {
+ *chptr = (char)toupper(*chptr);
+ }
+ return str;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ std::string&
+ Trim(std::string& rStr)
+ {
+    // WHITE_CHARS is defined at the top of this file
+ std::string::size_type pos = rStr.find_last_not_of(WHITE_CHARS);
+ if(pos != std::string::npos)
+ {
+ rStr.erase(pos + 1);
+ pos = rStr.find_first_not_of(WHITE_CHARS);
+ if(pos != std::string::npos) rStr.erase(0, pos);
+ }
+ else
+ rStr.erase(rStr.begin(), rStr.end());
+
+ return rStr;
+ }
+
+
+} // namespace TNet
+
+//#ifdef CYGWIN
+
+void assertf(const char *c, int i, const char *msg){
+ printf("Assertion \"%s\" failed: file \"%s\", line %d\n", msg?msg:"(null)", c?c:"(null)", i);
+ abort();
+}
+
+
+void assertf_throw(const char *c, int i, const char *msg){
+ char buf[2000];
+ snprintf(buf, 1999, "Assertion \"%s\" failed, throwing exception: file \"%s\", line %d\n", msg?msg:"(null)", c?c:"(null)", i);
+ throw std::runtime_error((std::string)buf);
+}
+//#endif
diff --git a/src/KaldiLib/.svn/text-base/Common.h.svn-base b/src/KaldiLib/.svn/text-base/Common.h.svn-base
new file mode 100644
index 0000000..9cd9658
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Common.h.svn-base
@@ -0,0 +1,233 @@
+#ifndef TNet_Common_h
+#define TNet_Common_h
+
+#include <cstdlib>
+#include <string.h> // C string stuff like strcpy
+#include <string>
+#include <sstream>
+#include <stdexcept>
+
+/* Alignment of critical dynamic data structure
+ *
+ * Not all platforms support memalign so we provide a stk_memalign wrapper
+ * void *stk_memalign( size_t align, size_t size, void **pp_orig )
+ * *pp_orig is the pointer that has to be freed afterwards.
+ */
+#ifdef HAVE_POSIX_MEMALIGN
+# define stk_memalign(align,size,pp_orig) \
+ ( !posix_memalign( pp_orig, align, size ) ? *(pp_orig) : NULL )
+# ifdef STK_MEMALIGN_MANUAL
+# undef STK_MEMALIGN_MANUAL
+# endif
+#elif defined(HAVE_MEMALIGN)
+ /* Some systems have memalign() but no declaration for it */
+ //void * memalign( size_t align, size_t size );
+# define stk_memalign(align,size,pp_orig) \
+ ( *(pp_orig) = memalign( align, size ) )
+# ifdef STK_MEMALIGN_MANUAL
+# undef STK_MEMALIGN_MANUAL
+# endif
+#else /* We don't have any choice but to align manually */
+# define stk_memalign(align,size,pp_orig) \
+ (( *(pp_orig) = malloc( size + align - 1 )) ? \
+ (void *)( (((unsigned long)*(pp_orig)) + 15) & ~0xFUL ) : NULL )
+# define STK_MEMALIGN_MANUAL
+#endif
+
+
+#define swap8(a) { \
+ char t=((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[7]; ((char*)&a)[7]=t;\
+ t=((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[6]; ((char*)&a)[6]=t;\
+ t=((char*)&a)[2]; ((char*)&a)[2]=((char*)&a)[5]; ((char*)&a)[5]=t;\
+ t=((char*)&a)[3]; ((char*)&a)[3]=((char*)&a)[4]; ((char*)&a)[4]=t;}
+#define swap4(a) { \
+ char t=((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[3]; ((char*)&a)[3]=t;\
+ t=((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[2]; ((char*)&a)[2]=t;}
+#define swap2(a) { \
+ char t=((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[1]; ((char*)&a)[1]=t;}
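The swap macros reverse byte order in place and are used together with IsBigEndian() (declared below) when reading binary HTK data; a small illustration:

    unsigned int v = 0x11223344;   // any 4-byte value
    swap4(v);                      // v == 0x44332211, regardless of host endianness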
+
+
+namespace TNet
+{
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Aligns a number to a specified base
+ * @param n Number of type @c _T to align
+ * @return Aligned value of type @c _T
+ */
+ template<size_t _align, typename _T>
+ inline _T
+ align(const _T n)
+ {
+ const _T x(_align - 1);
+ return (n + x) & ~(x);
+ }
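align<> rounds its argument up to the next multiple of the (power-of-two) template base; for example:

    size_t padded = TNet::align<16>(37);   // rounds up to the next multiple of 16: 48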
+
+
+ /**
+ * @brief Returns true if architecture is big endian
+ */
+ bool
+ IsBigEndian();
+
+
+ /**
+ * @brief Returns true if two numbers are close enough to each other
+ *
+ * @param f1 First operand
+ * @param f2 Second operand
+ * @param nRounds Expected number of operations prior to this comparison
+ */
+ bool
+ CloseEnough(const float f1, const float f2, const float nRounds);
+
+
+ /**
+ * @brief Returns true if two numbers are close enough to each other
+ *
+ * @param f1 First operand
+ * @param f2 Second operand
+ * @param nRounds Expected number of operations prior to this comparison
+ */
+ bool
+ CloseEnough(const double f1, const double f2, const double nRounds);
+
+
+ /**
+   * @brief Parses an HTK-style string into a plain C++ std::string
+ *
+ * @param rIn HTK input string
+ * @param rOut output parsed string
+ */
+ void
+ ParseHTKString(const std::string & rIn, std::string & rOut);
+
+
+ /**
+ * @brief Synthesize new file name based on name, path, and extension
+ *
+   * @param pOutFileName  full output file name
+ * @param pInFileName file name
+ * @param pOutDir directory
+ * @param pOutExt extension
+ */
+ void
+ MakeHtkFileName(char *pOutFileName, const char* pInFileName, const char *pOutDir,
+ const char *pOutExt);
+
+
+ /**
+ * @brief Removes the leading and trailing white chars
+ *
+   * @param rStr  Reference to the string to be processed
+   * @return Reference to the original string
+ *
+   * The white characters are determined by the @c WHITE_CHARS macro defined
+   * in Common.cc.
+ */
+ std::string&
+ Trim(std::string& rStr);
+
+
+ char*
+ StrToUpper(char* pStr);
+
+ char*
+ ExpandHtkFilterCmd(const char *command, const char *filename, const char* pFilter);
+
+
+ template <class T>
+ std::string to_string(const T& val)
+ {
+ std::stringstream ss;
+ ss << val;
+ return ss.str();
+ }
+
+ inline void
+ ExpectKeyword(std::istream &i_stream, const char *kwd)
+ {
+ std::string token;
+ i_stream >> token;
+ if (token != kwd) {
+ throw std::runtime_error(std::string(kwd) + " expected");
+ }
+ }
+
+ extern const int MATRIX_IOS_FORMAT_IWORD;
+
+ enum MatrixVectorIostreamControlBits {
+ ACCUMULATE_INPUT = 1,
+// BINARY_OUTPUT = 2
+ };
+
+ class MatrixVectorIostreamControl
+ {
+ public:
+ MatrixVectorIostreamControl(enum MatrixVectorIostreamControlBits bitsToBeSet, bool valueToBeSet)
+ : mBitsToBeSet(bitsToBeSet), mValueToBeSet(valueToBeSet) {}
+
+ static long Flags(std::ios_base &rIos, enum MatrixVectorIostreamControlBits bits)
+ { return rIos.iword(MATRIX_IOS_FORMAT_IWORD); }
+
+ long mBitsToBeSet;
+ bool mValueToBeSet;
+
+ friend std::ostream & operator <<(std::ostream &rOs, const MatrixVectorIostreamControl modifier)
+ {
+ if(modifier.mValueToBeSet) {
+ rOs.iword(MATRIX_IOS_FORMAT_IWORD) |= modifier.mBitsToBeSet;
+ } else {
+ rOs.iword(MATRIX_IOS_FORMAT_IWORD) &= ~modifier.mBitsToBeSet;
+ }
+ return rOs;
+ }
+
+ friend std::istream & operator >>(std::istream &rIs, const MatrixVectorIostreamControl modifier)
+ {
+ if(modifier.mValueToBeSet) {
+ rIs.iword(MATRIX_IOS_FORMAT_IWORD) |= modifier.mBitsToBeSet;
+ } else {
+ rIs.iword(MATRIX_IOS_FORMAT_IWORD) &= ~modifier.mBitsToBeSet;
+ }
+ return rIs;
+ }
+ };
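A usage sketch of the stream modifier, assuming the matrix/vector extraction operators later query the flag through Flags(); the stream choice is illustrative:

    // assumes <iostream>
    std::cin >> TNet::MatrixVectorIostreamControl(TNet::ACCUMULATE_INPUT, true);
    // readers may then test:
    //   MatrixVectorIostreamControl::Flags(std::cin, TNet::ACCUMULATE_INPUT) & TNet::ACCUMULATE_INPUT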
+
+
+
+
+} // namespace TNet
+
+#ifdef __ICC
+#pragma warning (disable: 383) // ICPC remark we don't want.
+#pragma warning (disable: 810) // ICPC remark we don't want.
+#pragma warning (disable: 981) // ICPC remark we don't want.
+#pragma warning (disable: 1418) // ICPC remark we don't want.
+#pragma warning (disable: 444) // ICPC remark we don't want.
+#pragma warning (disable: 869) // ICPC remark we don't want.
+#pragma warning (disable: 1287) // ICPC remark we don't want.
+#pragma warning (disable: 279) // ICPC remark we don't want.
+#pragma warning (disable: 981) // ICPC remark we don't want.
+#endif
+
+//#ifdef CYGWIN
+#if 1
+#undef assert
+#ifndef NDEBUG
+#define assert(e) ((e) ? (void)0 : assertf(__FILE__, __LINE__, #e))
+#else
+#define assert(e) ((void)0)
+#endif
+void assertf(const char *c, int i, const char *msg); // Just make it possible to break into assert on gdb-- has some kind of bug on cygwin.
+#else
+#include <cassert>
+#endif
+
+#define assert_throw(e) ((e) ? (void)0 : assertf_throw(__FILE__, __LINE__, #e))
+void assertf_throw(const char *c, int i, const char *msg);
+
+#define DAN_STYLE_IO
+
+#endif // ifndef TNet_Common_h
+
diff --git a/src/KaldiLib/.svn/text-base/Error.h.svn-base b/src/KaldiLib/.svn/text-base/Error.h.svn-base
new file mode 100644
index 0000000..873f3db
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Error.h.svn-base
@@ -0,0 +1,155 @@
+//
+// C++ Interface: %{MODULE}
+//
+// Description:
+//
+//
+// Author: %{AUTHOR} <%{EMAIL}>, (C) %{YEAR}
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+/** @file Error.h
+ * This header defines several types and functions relating to the
+ * handling of exceptions in STK.
+ */
+
+#ifndef TNET_Error_h
+#define TNET_Error_h
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <sstream>
+
+#include <cstdlib>
+#include <execinfo.h>
+
+// THESE MACROS TERRIBLY CLASH WITH STK!!!!
+// WE MUST USE SAME MACROS!
+//
+//#define Error(msg) _Error_(__func__, __FILE__, __LINE__, msg)
+//#define Warning(msg) _Warning_(__func__, __FILE__, __LINE__, msg)
+//#define TraceLog(msg) _TraceLog_(__func__, __FILE__, __LINE__, msg)
+//
+
+#ifndef Error
+ #define Error(...) _Error_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+#ifndef Warning
+ #define Warning(...) _Warning_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+#ifndef TraceLog
+ #define TraceLog(...) _TraceLog_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+
+namespace TNet {
+
+
+
+ /** MyException
+ * Custom exception class, gets the stacktrace
+ */
+ class MyException
+ : public std::runtime_error
+ {
+ public:
+ explicit MyException(const std::string& what_arg) throw();
+ virtual ~MyException() throw();
+
+ const char* what() const throw()
+ { return mWhat.c_str(); }
+
+ private:
+ std::string mWhat;
+ };
+
+ /**
+   * MyException implementation
+ */
+ inline
+ MyException::
+ MyException(const std::string& what_arg) throw()
+ : std::runtime_error(what_arg)
+ {
+ mWhat = what_arg;
+ mWhat += "\nTHE STACKTRACE INSIDE MyException OBJECT IS:\n";
+
+ void *array[10];
+ size_t size;
+ char **strings;
+ size_t i;
+
+ size = backtrace (array, 10);
+ strings = backtrace_symbols (array, size);
+
+    // the 0th string is the MyException ctor, so skip it and start at 1
+ for (i = 1; i < size; i++) {
+ mWhat += strings[i];
+ mWhat += "\n";
+ }
+
+ free (strings);
+ }
+
+
+ inline
+ MyException::
+ ~MyException() throw()
+ { }
+
+
+
+ /**
+ * @brief Error throwing function (with backtrace)
+ */
+ inline void
+ _Error_(const char *func, const char *file, int line, const std::string &msg)
+ {
+ std::stringstream ss;
+ ss << "ERROR (" << func << ':' << file << ':' << line << ") " << msg;
+ throw MyException(ss.str());
+ }
+
+ /**
+ * @brief Warning handling function
+ */
+ inline void
+ _Warning_(const char *func, const char *file, int line, const std::string &msg)
+ {
+ std::cout << "WARNING (" << func << ':' << file << ':' << line << ") " << msg << std::endl;
+ }
+
+ inline void
+ _TraceLog_(const char *func, const char *file, int line, const std::string &msg)
+ {
+ std::cout << "INFO (" << func << ':' << file << ':' << line << ") " << msg << std::endl;
+ }
+
+ /**
+ * New kaldi error handling:
+ *
+   * class KaldiErrorMessage is invoked from the KALDI_ERR macro.
+ * The destructor throws an exception.
+ */
+ class KaldiErrorMessage {
+ public:
+ KaldiErrorMessage(const char *func, const char *file, int line) {
+ this->stream() << "ERROR ("
+ << func << "():"
+ << file << ':' << line << ") ";
+ }
+ inline std::ostream &stream() { return ss; }
+ ~KaldiErrorMessage() { throw MyException(ss.str()); }
+ private:
+ std::ostringstream ss;
+ };
+ #define KALDI_ERR TNet::KaldiErrorMessage(__func__, __FILE__, __LINE__).stream()
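A typical (illustrative) use of KALDI_ERR; the variable and message text are hypothetical:

    int num_frames = -1;                                    // illustrative value
    if (num_frames < 0)
      KALDI_ERR << "Bad number of frames: " << num_frames;  // the temporary's destructor throws MyException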
+
+
+
+} // namespace TNet
+
+//#define TNET_Error_h
+#endif
diff --git a/src/KaldiLib/.svn/text-base/Features.cc.svn-base b/src/KaldiLib/.svn/text-base/Features.cc.svn-base
new file mode 100644
index 0000000..8d173bc
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Features.cc.svn-base
@@ -0,0 +1,1798 @@
+
+//enable feature repository profiling
+#define PROFILING 1
+
+#include <sstream>
+#include <map>
+#include <list>
+#include <cstdio>
+
+#include "Features.h"
+#include "Tokenizer.h"
+#include "StkMatch.h"
+#include "Types.h"
+
+
+
+namespace TNet
+{
+ const char
+ FeatureRepository::
+ mpParmKindNames[13][16] =
+ {
+ {"WAVEFORM"},
+ {"LPC"},
+ {"LPREFC"},
+ {"LPCEPSTRA"},
+ {"LPDELCEP"},
+ {"IREFC"},
+ {"MFCC"},
+ {"FBANK"},
+ {"MELSPEC"},
+ {"USER"},
+ {"DISCRETE"},
+ {"PLP"},
+ {"ANON"}
+ };
+
+ //***************************************************************************
+ //***************************************************************************
+
+ FileListElem::
+ FileListElem(const std::string & rFileName)
+ {
+ std::string::size_type pos;
+
+ mLogical = rFileName;
+ mWeight = 1.0;
+
+ // some slash-backslash replacement hack
+ for (size_t i = 0; i < mLogical.size(); i++) {
+ if (mLogical[i] == '\\') {
+ mLogical[i] = '/';
+ }
+ }
+
+ // read sentence weight definition if any ( physical_file.fea[s,e]{weight} )
+ if ((pos = mLogical.find('{')) != std::string::npos)
+ {
+ std::string tmp_weight(mLogical.begin() + pos + 1, mLogical.end());
+ std::stringstream tmp_ss(tmp_weight);
+
+ tmp_ss >> mWeight;
+ mLogical.erase(pos);
+ }
+
+ // look for "=" symbol and if found, split it
+ if ((pos = mLogical.find('=')) != std::string::npos)
+ {
+ // copy all from mLogical[pos+1] till the end to mPhysical
+ mPhysical.assign(mLogical.begin() + pos + 1, mLogical.end());
+ // erase all from pos + 1 till the end from mLogical
+ mLogical.erase(pos);
+ // trim the leading and trailing spaces
+ Trim(mPhysical);
+ Trim(mLogical);
+ }
+ else
+ {
+ // trim the leading and trailing spaces
+ Trim(mLogical);
+
+ mPhysical = mLogical;
+ }
+ }
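An illustrative feature-list entry and how the constructor above splits it (the path and weight are made up):

    // "utt01=/data/feats/utt01.fea{0.5}"
    //   -> Logical()  == "utt01"
    //   -> Physical() == "/data/feats/utt01.fea"
    //   -> stored sentence weight == 0.5
    TNet::FileListElem elem("utt01=/data/feats/utt01.fea{0.5}");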
+
+
+ //###########################################################################
+ //###########################################################################
+ // FeatureRepository section
+ //###########################################################################
+ //###########################################################################
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ ReadCepsNormFile(
+ const char * pFileName,
+ char ** pLastFileName,
+ BaseFloat ** vec_buff,
+ int sampleKind,
+ CNFileType type,
+ int coefs)
+ {
+ FILE* fp;
+ int i;
+ char s1[64];
+ char s2[64];
+ const char* typeStr = (type == CNF_Mean ? "MEAN" :
+ type == CNF_Variance ? "VARIANCE" : "VARSCALE");
+
+ const char* typeStr2 = (type == CNF_Mean ? "CMN" :
+ type == CNF_Variance ? "CVN" : "VarScale");
+
+ if (*pLastFileName != NULL && !strcmp(*pLastFileName, pFileName)) {
+ return;
+ }
+ free(*pLastFileName);
+ *pLastFileName=strdup(pFileName);
+ *vec_buff = (BaseFloat*) realloc(*vec_buff, coefs * sizeof(BaseFloat));
+
+ if (*pLastFileName == NULL || *vec_buff== NULL)
+ throw std::runtime_error("Insufficient memory");
+
+ if ((fp = fopen(pFileName, "r")) == NULL) {
+ throw std::runtime_error(std::string("Cannot open ") + typeStr2
+ + " pFileName: '" + pFileName + "'");
+ }
+
+ if ((type != CNF_VarScale
+ && (fscanf(fp, " <%64[^>]> <%64[^>]>", s1, s2) != 2
+ || strcmp(StrToUpper(s1), "CEPSNORM")
+ || ReadParmKind(s2, false) != sampleKind))
+ || fscanf(fp, " <%64[^>]> %d", s1, &i) != 2
+ || strcmp(StrToUpper(s1), typeStr)
+ || i != coefs)
+ {
+ ParmKind2Str(sampleKind, s2);
+
+ //std::cout << "[[[TADY!!!!]]]" << pFileName << "\n" << std::flush;
+
+ throw std::runtime_error(std::string("")
+ + (type == CNF_VarScale ? "" : "<CEPSNORM> <")
+ + (type == CNF_VarScale ? "" : s2)
+ + (type == CNF_VarScale ? "" : ">")
+ + " <" + typeStr + " ... expected in " + typeStr2
+ + " file " + pFileName);
+ }
+
+ for (i = 0; i < coefs; i++) {
+ if (fscanf(fp, " "FLOAT_FMT, *vec_buff+i) != 1) {
+ if (fscanf(fp, "%64s", s2) == 1) {
+ throw std::runtime_error(std::string("Decimal number expected but '")
+ + s2 + "' found in " + typeStr2 + " file " + pFileName);
+ }
+ else if (feof(fp)) {
+ throw std::runtime_error(std::string("Unexpected end of ")
+ + typeStr2 + " file "+ pFileName);
+ }
+ else {
+ throw std::runtime_error(std::string("Cannot read ") + typeStr2
+ + " file " + pFileName);
+ }
+ }
+
+ if (type == CNF_Variance)
+ (*vec_buff)[i] = BaseFloat(1 / sqrt((*vec_buff)[i]));
+ else if (type == CNF_VarScale)
+ (*vec_buff)[i] = BaseFloat(sqrt((*vec_buff)[i]));
+ }
+
+ if (fscanf(fp, "%64s", s2) == 1)
+ {
+ throw std::runtime_error(std::string("End of file expected but '")
+ + s2 + "' found in " + typeStr2 + " file " + pFileName);
+ }
+
+ fclose(fp);
+ } // ReadCepsNormFile(...)
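Schematically, the text layout of a cepstral mean (CMN) file accepted above looks like this (the numbers are placeholders; variance and var-scale files differ only in the keyword and, for var-scale, the missing first line):

    // <CEPSNORM> <MFCC_0>
    // <MEAN> 13
    // -1.234 0.567 ...   (13 decimal numbers)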
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ HtkFilter(const char* pFilter, const char* pValue, FeatureRepository& rOut)
+ {
+ std::list<FileListElem>::iterator it;
+ std::string str;
+
+ rOut.mSwapFeatures = mSwapFeatures;
+ rOut.mStartFrameExt = mStartFrameExt;
+ rOut.mEndFrameExt = mEndFrameExt;
+ rOut.mTargetKind = mTargetKind;
+ rOut.mDerivOrder = mDerivOrder;
+ rOut.mDerivWinLengths = mDerivWinLengths;
+
+ rOut.mpCvgFile = mpCvgFile;
+ rOut.mpCmnPath = mpCmnPath;
+ rOut.mpCmnMask = mpCmnMask;
+ rOut.mpCvnPath = mpCvnPath;
+ rOut.mpCvnMask = mpCvnMask;
+
+ rOut.mInputQueue.clear();
+
+ // go through all records and check the mask
+ for (it=mInputQueue.begin(); it!= mInputQueue.end(); ++it) {
+ if (pFilter == NULL
+ || (ProcessMask(it->Logical(), pFilter, str) && (str == pValue))) {
+ rOut.mInputQueue.push_back(*it);
+ }
+ }
+
+    // set the queue position to the beginning
+    rOut.mInputQueueIterator = rOut.mInputQueue.begin();
+
+ rOut.mCurrentIndexFileName = "";
+ rOut.mCurrentIndexFileDir = "";
+ rOut.mCurrentIndexFileExt = "";
+
+ mStream.close();
+ mStream.clear();
+
+ rOut.mpLastFileName = NULL;
+ rOut.mLastFileName = "";
+ rOut.mpLastCmnFile = NULL;
+ rOut.mpLastCvnFile = NULL;
+ rOut.mpLastCvgFile = NULL;
+ rOut.mpCmn = NULL;
+ rOut.mpCvn = NULL;
+ rOut.mpCvg = NULL;
+ rOut.mpA = NULL;
+ rOut.mpB = NULL;
+
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ HtkSelection(const char* pFilter, std::list< std::string >& rOut)
+ {
+ std::map< std::string, bool> aux_map;
+ std::map< std::string, bool>::iterator map_it;
+ std::list<FileListElem>::iterator it;
+ std::string str;
+
+ rOut.clear();
+
+ if(pFilter != NULL) {
+ // go through all records and check the mask
+ for (it=mInputQueue.begin(); it!= mInputQueue.end(); ++it) {
+ if (ProcessMask(it->Logical(), pFilter, str)) {
+ aux_map[str] = true;
+ }
+ }
+ } else {
+ aux_map[std::string("default speaker")] = true;
+ }
+
+ for (map_it = aux_map.begin(); map_it != aux_map.end(); ++map_it) {
+ rOut.push_back(map_it->first);
+ }
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ ParmKind2Str(unsigned parmKind, char *pOutString)
+ {
+ // :KLUDGE: Absolutely no idea what this is...
+ if ((parmKind & 0x003F) >= sizeof(mpParmKindNames)/sizeof(mpParmKindNames[0]))
+ return 0;
+
+ strcpy(pOutString, mpParmKindNames[parmKind & 0x003F]);
+
+ if (parmKind & PARAMKIND_E) strcat(pOutString, "_E");
+ if (parmKind & PARAMKIND_N) strcat(pOutString, "_N");
+ if (parmKind & PARAMKIND_D) strcat(pOutString, "_D");
+ if (parmKind & PARAMKIND_A) strcat(pOutString, "_A");
+ if (parmKind & PARAMKIND_C) strcat(pOutString, "_C");
+ if (parmKind & PARAMKIND_Z) strcat(pOutString, "_Z");
+ if (parmKind & PARAMKIND_K) strcat(pOutString, "_K");
+ if (parmKind & PARAMKIND_0) strcat(pOutString, "_0");
+ if (parmKind & PARAMKIND_V) strcat(pOutString, "_V");
+ if (parmKind & PARAMKIND_T) strcat(pOutString, "_T");
+
+ return 1;
+ }
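ParmKind2Str and ReadParmKind (defined further below) are inverse mappings between the numeric parameter-kind code and its HTK text form; a sketch, assuming repo is a constructed FeatureRepository and the PARAMKIND_* bits come from Types.h:

    char buf[64];
    int kind = repo.ReadParmKind("MFCC_E_D_A", false);   // base kind MFCC plus the _E _D _A qualifier bits
    repo.ParmKind2Str(kind, buf);                        // buf == "MFCC_E_D_A"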
+
+
+ // //***************************************************************************
+ // //***************************************************************************
+ // void
+ // AddFileListToFeatureRepositories(
+ // const char* pFileName,
+ // const char* pFilter,
+ // std::queue<FeatureRepository *> &featureRepositoryList)
+ // {
+ // IStkStream l_stream;
+ // std::string file_name;
+ // Tokenizer file_list(pFileName, ",");
+ // Tokenizer::iterator p_file_name;
+
+ // //:TODO: error if empty featureRepositoryList
+ //
+ // for (p_file_name = file_list.begin(); p_file_name != file_list.end(); ++p_file_name)
+ // {
+ // // get rid of initial and trailing blanks
+ // Trim(*p_file_name);
+
+ // // open file name
+ // l_stream.open(p_file_name->c_str(), std::ios::in, pFilter);
+ //
+ // if (!l_stream.good()) {
+ // //:TODO:
+ // // Warning or error ... Why warning? -Lukas
+  //       throw std::runtime_error(std::string("Cannot open list file ") +
+ // *p_file_name);
+ // }
+
+ // // read all lines and parse them
+ // for(;;)
+ // {
+ // l_stream >> file_name;
+ // //:TODO: if(l_stream.badl()) Error()
+ // // Reading after last token set the fail bit
+ // if(l_stream.fail())
+ // break;
+ // // we can push_back a std::string as new FileListElem object
+ // // is created using FileListElem(const std::string&) constructor
+ // // and logical and physical names are correctly extracted
+ // featureRepositoryList.front()->mInputQueue.push_back(file_name);
+ //
+ // //cycle in the featureRepositoryList
+ // featureRepositoryList.push(featureRepositoryList.front());
+ // featureRepositoryList.pop();
+ // }
+ // l_stream.close();
+ // }
+ // } // AddFileList(const std::string & rFileName)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ Init(
+ bool swap,
+ int extLeft,
+ int extRight,
+ int targetKind,
+ int derivOrder,
+ int* pDerivWinLen,
+ const char* pCmnPath,
+ const char* pCmnMask,
+ const char* pCvnPath,
+ const char* pCvnMask,
+ const char* pCvgFile)
+ {
+ mSwapFeatures = swap;
+ mStartFrameExt = extLeft;
+ mEndFrameExt = extRight;
+ mTargetKind = targetKind;
+ mDerivOrder = derivOrder;
+ mDerivWinLengths = pDerivWinLen;
+ mpCmnPath = pCmnPath;
+ mpCmnMask = pCmnMask;
+ mpCvnPath = pCvnPath;
+ mpCvnMask = pCvnMask;
+ mpCvgFile = pCvgFile;
+ } // Init()
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ AddFile(const std::string & rFileName)
+ {
+ mInputQueue.push_back(rFileName);
+ } // AddFile(const std::string & rFileName)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ AddFileList(const char* pFileName, const char* pFilter)
+ {
+ IStkStream l_stream;
+ std::string file_name;
+ Tokenizer file_list(pFileName, ",");
+ Tokenizer::iterator p_file_name;
+
+ for (p_file_name = file_list.begin(); p_file_name != file_list.end(); ++p_file_name)
+ {
+ // get rid of spaces
+ Trim(*p_file_name);
+
+ // open the file
+ l_stream.open(p_file_name->c_str(), std::ios::in, pFilter);
+
+ if (!l_stream.good())
+ {
+ //:TODO:
+ // Warning or error ... Why warning? -Lukas
+        throw std::runtime_error(std::string("Cannot open list file ") +
+ *p_file_name);
+ }
+ // read all lines and parse them
+ for(;;)
+ {
+ l_stream >> file_name;
+        //:TODO: if(l_stream.bad()) Error()
+        // Reading after the last token sets the fail bit
+ if(l_stream.fail())
+ break;
+        // we can push_back a std::string, since a new FileListElem object
+        // is created via the FileListElem(const std::string&) constructor
+        // and the logical and physical names are extracted correctly
+ mInputQueue.push_back(file_name);
+ }
+ l_stream.close();
+ }
+ } // AddFileList(const std::string & rFileName)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ MoveNext()
+ {
+ assert (mInputQueueIterator != mInputQueue.end());
+ mInputQueueIterator++;
+ } // ReadFullMatrix(Matrix<BaseFloat>& rMatrix)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ FeatureRepository::
+ ReadFullMatrix(Matrix<BaseFloat>& rMatrix)
+ {
+ // clear the matrix
+ rMatrix.Destroy();
+
+ // extract index file name
+ if (!mCurrentIndexFileDir.empty())
+ {
+ char tmp_name[mCurrentIndexFileDir.length() +
+ mCurrentIndexFileExt.length() +
+ mInputQueueIterator->Physical().length()];
+
+ MakeHtkFileName(tmp_name, mInputQueueIterator->Physical().c_str(),
+ mCurrentIndexFileDir.c_str(), mCurrentIndexFileExt.c_str());
+
+ mCurrentIndexFileName = tmp_name;
+ }
+ else
+ mCurrentIndexFileName = "";
+
+ //get the 3-letter suffix
+ int pos_last_three_chars = mInputQueueIterator->Physical().size() - 3;
+ if (pos_last_three_chars < 0) pos_last_three_chars = 0;
+ //read the gzipped ascii features
+ if (mInputQueueIterator->Physical().substr(pos_last_three_chars) == ".gz") {
+ return ReadGzipAsciiFeatures(*mInputQueueIterator, rMatrix);
+ }
+
+ // read the matrix and return the result
+ return ReadHTKFeatures(*mInputQueueIterator, rMatrix);
+ } // ReadFullMatrix(Matrix<BaseFloat>& rMatrix)
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ FeatureRepository::
+ WriteFeatureMatrix(const Matrix<BaseFloat>& rMatrix, const std::string& filename, int targetKind, int samplePeriod)
+ {
+ FILE* fp = fopen(filename.c_str(),"w");
+ if(NULL == fp) { Error(std::string("Cannot create file:") + filename); return false; }
+
+ WriteHTKFeatures(fp, samplePeriod, targetKind, mSwapFeatures, const_cast<Matrix<BaseFloat>&>(rMatrix));
+
+ fclose(fp);
+
+ return true;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ // private:
+ int
+ FeatureRepository::
+ ReadHTKHeader()
+ {
+ // TODO
+ // Change this... We should read from StkStream
+ FILE* fp = mStream.fp();
+
+ if (!fread(&mHeader.mNSamples, sizeof(INT_32), 1, fp)) return -1;
+ if (!fread(&mHeader.mSamplePeriod, sizeof(INT_32), 1, fp)) return -1;
+ if (!fread(&mHeader.mSampleSize, sizeof(INT_16), 1, fp)) return -1;
+ if (!fread(&mHeader.mSampleKind, sizeof(UINT_16), 1, fp)) return -1;
+
+ if (mSwapFeatures)
+ {
+ swap4(mHeader.mNSamples);
+ swap4(mHeader.mSamplePeriod);
+ swap2(mHeader.mSampleSize);
+ swap2(mHeader.mSampleKind);
+ }
+
+ if (mHeader.mSamplePeriod < 0
+ || mHeader.mSamplePeriod > 100000
+ || mHeader.mNSamples < 0
+ || mHeader.mSampleSize < 0)
+ {
+ return -1;
+ }
+
+ return 0;
+ }
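The fields read above follow the standard 12-byte HTK header; the actual HtkHeader struct lives elsewhere in KaldiLib, so this mirror is only illustrative:

    struct HtkHeaderSketch {     // hypothetical mirror of the reads above; integer types from Types.h
      INT_32  mNSamples;         // number of frames
      INT_32  mSamplePeriod;     // frame period (HTK stores 100 ns units)
      INT_16  mSampleSize;       // bytes per frame
      UINT_16 mSampleKind;       // parameter kind code, see ParmKind2Str
    };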
+
+
+ //***************************************************************************
+ //***************************************************************************
+ // private:
+ int
+ FeatureRepository::
+ ReadHTKFeature(
+ BaseFloat* pIn,
+ size_t feaLen,
+ bool decompress,
+ BaseFloat* pScale,
+ BaseFloat* pBias)
+ {
+ FILE* fp = mStream.fp();
+
+ size_t i;
+
+ if (decompress)
+ {
+ INT_16 s;
+      // BaseFloat pScale = (2*32767) / (xmax - xmin);
+      // BaseFloat pBias = pScale * (xmax + xmin) / 2;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ if (fread(&s, sizeof(INT_16), 1, fp) != 1)
+ return -1;
+
+ if (mSwapFeatures) swap2(s);
+ pIn[i] = ((BaseFloat)s + pBias[i]) / pScale[i];
+ }
+
+ return 0;
+ }
+
+#if !DOUBLEPRECISION
+ if (fread(pIn, sizeof(FLOAT_32), feaLen, fp) != feaLen)
+ return -1;
+
+ if (mSwapFeatures)
+ for (i = 0; i < feaLen; i++)
+ swap4(pIn[i]);
+#else
+ float f;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ if (fread(&f, sizeof(FLOAT_32), 1, fp) != 1)
+    header.mSampleSize = nCoeffs * ((targetKind & PARAMKIND_C) ? sizeof(INT_16) : sizeof(FLOAT_32));
+
+ if (mSwapFeatures)
+ swap4(f);
+
+ pIn[i] = f;
+ }
+#endif
+ return 0;
+ } // int ReadHTKFeature
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+/* bool
+ FeatureRepository::
+ ReadHTKFeatures(const std::string& rFileName, Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ std::string file_name(rFileName);
+ std::string cmn_file_name;
+ std::string cvn_file_name;
+
+ int ext_left = mStartFrameExt;
+ int ext_right = mEndFrameExt;
+ int from_frame;
+ int to_frame;
+ int tot_frames;
+ int trg_vec_size;
+ int src_vec_size;
+ int src_deriv_order;
+ int lo_src_tgz_deriv_order;
+ int i;
+ int j;
+ int k;
+ int e;
+ int coefs;
+ int trg_E;
+ int trg_0;
+ int trg_N;
+ int src_E;
+ int src_0;
+ int src_N;
+ int comp;
+ int coef_size;
+ char* chptr;
+
+
+
+ // read frame range definition if any ( physical_file.fea[s,e] )
+ if ((chptr = strrchr(file_name.c_str(), '[')) == NULL ||
+ ((i=0), sscanf(chptr, "[%d,%d]%n", &from_frame, &to_frame, &i),
+ chptr[i] != '\0'))
+ {
+ chptr = NULL;
+ }
+
+ if (chptr != NULL)
+ *chptr = '\0';
+
+ // Experimental changes...
+ // if ((strcmp(file_name.c_str(), "-"))
+ // && (mpLastFileName != NULL)
+ // && (!strcmp(mpLastFileName, file_name.c_str())))
+ // {
+ // mHeader = mLastHeader;
+ // }
+ // else
+ // {
+ // if (mpLastFileName)
+ // {
+ // //if (mpFp != stdin)
+ // // fclose(mpFp);
+ // mStream.close();
+ //
+ // free(mpLastFileName);
+ // mpLastFileName = NULL;
+ // }
+
+ if ((file_name != "-" )
+ && (!mLastFileName.empty())
+ && (mLastFileName == file_name))
+ {
+ mHeader = mLastHeader;
+ }
+ else
+ {
+ if (!mLastFileName.empty())
+ {
+ mStream.close();
+ mLastFileName = "";
+ }
+
+
+ // open the feature file
+ mStream.open(file_name.c_str(), ios::binary);
+ if (!mStream.good())
+ {
+ Error("Cannot open feature file: '%s'", file_name.c_str());
+ }
+
+
+ if (ReadHTKHeader())
+ Error("Invalid HTK header in feature file: '%s'", file_name.c_str());
+
+ if (mHeader.mSampleKind & PARAMKIND_C)
+ {
+ // File is in compressed form, scale and pBias vectors
+ // are appended after HTK header.
+
+ int coefs = mHeader.mSampleSize/sizeof(INT_16);
+ mpA = (BaseFloat*) realloc(mpA, coefs * sizeof(BaseFloat));
+ mpB = (BaseFloat*) realloc(mpB, coefs * sizeof(BaseFloat));
+ if (mpA == NULL || mpB == NULL) Error("Insufficient memory");
+
+ e = ReadHTKFeature(mpA, coefs, 0, 0, 0);
+ e |= ReadHTKFeature(mpB, coefs, 0, 0, 0);
+
+ if (e)
+ Error("Cannot read feature file: '%s'", file_name.c_str());
+
+ mHeader.mNSamples -= 2 * sizeof(FLOAT_32) / sizeof(INT_16);
+ }
+
+ // remember current settings
+ mLastFileName = file_name;
+ mLastHeader = mHeader;
+ }
+
+ if (chptr != NULL)
+ *chptr = '[';
+
+ if (chptr == NULL)
+ { // Range [s,e] was not specified
+ from_frame = 0;
+ to_frame = mHeader.mNSamples-1;
+ }
+
+ src_deriv_order = PARAMKIND_T & mHeader.mSampleKind ? 3 :
+ PARAMKIND_A & mHeader.mSampleKind ? 2 :
+ PARAMKIND_D & mHeader.mSampleKind ? 1 : 0;
+ src_E = (PARAMKIND_E & mHeader.mSampleKind) != 0;
+ src_0 = (PARAMKIND_0 & mHeader.mSampleKind) != 0;
+ src_N = ((PARAMKIND_N & mHeader.mSampleKind) != 0) * (src_E + src_0);
+ comp = PARAMKIND_C & mHeader.mSampleKind;
+
+ mHeader.mSampleKind &= ~PARAMKIND_C;
+
+ if (mTargetKind == PARAMKIND_ANON)
+ {
+ mTargetKind = mHeader.mSampleKind;
+ }
+ else if ((mTargetKind & 077) == PARAMKIND_ANON)
+ {
+ mTargetKind &= ~077;
+ mTargetKind |= mHeader.mSampleKind & 077;
+ }
+
+ trg_E = (PARAMKIND_E & mTargetKind) != 0;
+ trg_0 = (PARAMKIND_0 & mTargetKind) != 0;
+ trg_N =((PARAMKIND_N & mTargetKind) != 0) * (trg_E + trg_0);
+
+ coef_size = comp ? sizeof(INT_16) : sizeof(FLOAT_32);
+ coefs = (mHeader.mSampleSize/coef_size + src_N) /
+ (src_deriv_order+1) - src_E - src_0;
+ src_vec_size = (coefs + src_E + src_0) * (src_deriv_order+1) - src_N;
+
+ //Is coefs dividable by 1 + number of derivatives specified in header
+ if (src_vec_size * coef_size != mHeader.mSampleSize)
+ {
+ Error("Invalid HTK header in feature file: '%s'. "
+ "mSampleSize do not match with parmKind", file_name.c_str());
+ }
+
+ if (mDerivOrder < 0)
+ mDerivOrder = src_deriv_order;
+
+
+ if ((!src_E && trg_E) || (!src_0 && trg_0) || (src_N && !trg_N) ||
+ (trg_N && !trg_E && !trg_0) || (trg_N && !mDerivOrder) ||
+ (src_N && !src_deriv_order && mDerivOrder) ||
+ ((mHeader.mSampleKind & 077) != (mTargetKind & 077) &&
+ (mHeader.mSampleKind & 077) != PARAMKIND_ANON))
+ {
+ char srcParmKind[64];
+ char trgParmKind[64];
+
+ ParmKind2Str(mHeader.mSampleKind, srcParmKind);
+ ParmKind2Str(mTargetKind, trgParmKind);
+ Error("Cannot convert %s to %s", srcParmKind, trgParmKind);
+ }
+
+ lo_src_tgz_deriv_order = LOWER_OF(src_deriv_order, mDerivOrder);
+ trg_vec_size = (coefs + trg_E + trg_0) * (mDerivOrder+1) - trg_N;
+
+ i = LOWER_OF(from_frame, mStartFrameExt);
+ from_frame -= i;
+ ext_left -= i;
+
+ i = LOWER_OF(mHeader.mNSamples-to_frame-1, mEndFrameExt);
+ to_frame += i;
+ ext_right -= i;
+
+ if (from_frame > to_frame || from_frame >= mHeader.mNSamples || to_frame< 0)
+ Error("Invalid frame range for feature file: '%s'", file_name.c_str());
+
+ tot_frames = to_frame - from_frame + 1 + ext_left + ext_right;
+
+ // initialize matrix
+ rFeatureMatrix.Init(tot_frames, trg_vec_size);
+
+ // fill the matrix with features
+ for (i = 0; i <= to_frame - from_frame; i++)
+ {
+ BaseFloat* A = mpA;
+ BaseFloat* B = mpB;
+ BaseFloat* mxPtr = rFeatureMatrix[i+ext_left];
+
+ // seek to the desired position
+ fseek(mStream.fp(),
+ sizeof(HtkHeader) + (comp ? src_vec_size * 2 * sizeof(FLOAT_32) : 0)
+ + (from_frame + i) * src_vec_size * coef_size,
+ SEEK_SET);
+
+ e = ReadHTKFeature(mxPtr, coefs, comp, A, B);
+
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0 && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0 && !trg_N) mxPtr++;
+ if (src_E && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E && !trg_N) mxPtr++;
+
+ for (j = 0; j < lo_src_tgz_deriv_order; j++)
+ {
+ e |= ReadHTKFeature(mxPtr, coefs, comp, A, B);
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0) mxPtr++;
+ if (src_E) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E) mxPtr++;
+ }
+
+ if (e)
+ Error("Cannot read feature file: '%s' frame %d/%d", file_name.c_str(),
+ i, to_frame - from_frame + 1);
+ }
+
+ // From now, coefs includes also trg_0 + trg_E !
+ coefs += trg_0 + trg_E;
+
+ // If extension of the matrix to the left or to the right is required,
+ // perform it here
+ for (i = 0; i < ext_left; i++)
+ {
+ memcpy(rFeatureMatrix[i],
+ rFeatureMatrix[ext_left],
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ for (i = tot_frames - ext_right; i < tot_frames; i++)
+ {
+ memcpy(rFeatureMatrix[i],
+ rFeatureMatrix[tot_frames - ext_right - 1],
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ // Sentence cepstral mean normalization
+ if( (mpCmnPath == NULL)
+ && !(PARAMKIND_Z & mHeader.mSampleKind)
+ && (PARAMKIND_Z & mTargetKind))
+ {
+ // for each coefficient
+ for(j=0; j < coefs; j++)
+ {
+ BaseFloat norm = 0.0;
+ for(i=0; i < tot_frames; i++) // for each frame
+ {
+ norm += rFeatureMatrix[i][j - trg_N];
+ //norm += fea_mx[i*trg_vec_size - trg_N + j];
+ }
+
+ norm /= tot_frames;
+
+ for(i=0; i < tot_frames; i++) // for each frame
+ rFeatureMatrix[i][j - trg_N] -= norm;
+ //fea_mx[i*trg_vec_size - trg_N + j] -= norm;
+ }
+ }
+
+ // Compute missing derivatives
+ for (; src_deriv_order < mDerivOrder; src_deriv_order++)
+ {
+ int winLen = mDerivWinLengths[src_deriv_order];
+ BaseFloat norm = 0.0;
+
+ for (k = 1; k <= winLen; k++)
+ {
+ norm += 2 * k * k;
+ }
+
+ // for each frame
+ for (i=0; i < tot_frames; i++)
+ {
+ // for each coefficient
+ for (j=0; j < coefs; j++)
+ {
+ //BaseFloat* src = fea_mx + i*trg_vec_size + src_deriv_order*coefs - trg_N + j;
+ BaseFloat* src = &rFeatureMatrix[i][src_deriv_order*coefs - trg_N + j];
+
+ *(src + coefs) = 0.0;
+
+ if (i < winLen || i >= tot_frames-winLen)
+ { // boundaries need special treatment
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ LOWER_OF(tot_frames-1-i,k)*rFeatureMatrix.Stride()]
+ -src[-LOWER_OF(i, k)*rFeatureMatrix.Stride()]);
+ }
+ }
+ else
+ { // otherwise use more efficient code
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ k * rFeatureMatrix.Stride()]
+ -src[-k * rFeatureMatrix.Stride()]);
+ }
+ }
+ *(src + coefs) /= norm;
+ }
+ }
+ }
+
+ mHeader.mNSamples = tot_frames;
+ mHeader.mSampleSize = trg_vec_size * sizeof(FLOAT_32);
+ mHeader.mSampleKind = mTargetKind & ~(PARAMKIND_D | PARAMKIND_A | PARAMKIND_T);
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ /////////////// Cepstral mean and variance normalization ///////////////////
+ ////////////////////////////////////////////////////////////////////////////
+ //.........................................................................
+ if (mpCmnPath != NULL
+ && mpCmnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(file_name, mpCmnMask, cmn_file_name);
+ // add the path correctly
+ cmn_file_name.insert(0, "/");
+ cmn_file_name.insert(0, mpCmnPath);
+
+ // read the file
+ ReadCepsNormFile(cmn_file_name.c_str(), &mpLastCmnFile, &mpCmn,
+ mHeader.mSampleKind & ~PARAMKIND_Z, CNF_Mean, coefs);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < coefs; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] -= mpCmn[j];
+ }
+ }
+ }
+
+ mHeader.mSampleKind |= mDerivOrder==3 ? PARAMKIND_D | PARAMKIND_A | PARAMKIND_T :
+ mDerivOrder==2 ? PARAMKIND_D | PARAMKIND_A :
+ mDerivOrder==1 ? PARAMKIND_D : 0;
+
+ //.........................................................................
+ if (mpCvnPath != NULL
+ && mpCvnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(file_name, mpCvnMask, cvn_file_name);
+ // add the path correctly
+ cvn_file_name.insert(0, "/");
+ cvn_file_name.insert(0, mpCvnPath);
+
+ // read the file
+ ReadCepsNormFile(cvn_file_name.c_str(), &mpLastCvnFile, &mpCvn,
+ mHeader.mSampleKind, CNF_Variance, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvn[j];
+ }
+ }
+ }
+
+ //.........................................................................
+ // process the global covariance file
+ if (mpCvgFile != NULL)
+ {
+ ReadCepsNormFile(mpCvgFile, &mpLastCvgFile, &mpCvg,
+ -1, CNF_VarScale, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvg[j];
+ }
+ }
+ }
+
+ return true;
+ }
+*/
+
+ //***************************************************************************
+ //***************************************************************************
+
+
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ FeatureRepository::
+ ReadHTKFeatures(const FileListElem& rFileNameRecord,
+ Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ std::string file_name(rFileNameRecord.Physical());
+ std::string cmn_file_name;
+ std::string cvn_file_name;
+
+ int ext_left = mStartFrameExt;
+ int ext_right = mEndFrameExt;
+ int from_frame;
+ int to_frame;
+ int tot_frames;
+ int trg_vec_size;
+ int src_vec_size;
+ int src_deriv_order;
+ int lo_src_tgz_deriv_order;
+ int i;
+ int j;
+ int k;
+ int e;
+ int coefs;
+ int trg_E;
+ int trg_0;
+ int trg_N;
+ int src_E;
+ int src_0;
+ int src_N;
+ int comp;
+ int coef_size;
+ char* chptr;
+
+
+ TIMER_START(mTim);
+
+ // read frame range definition if any ( physical_file.fea[s,e] )
+ if ((chptr = strrchr((char*)file_name.c_str(), '[')) == NULL ||
+ ((i=0), sscanf(chptr, "[%d,%d]%n", &from_frame, &to_frame, &i),
+ chptr[i] != '\0'))
+ {
+ chptr = NULL;
+ }
+
+ if (chptr != NULL)
+ *chptr = '\0';
+
+
+ if ((file_name != "-" )
+ && (!mLastFileName.empty())
+ && (mLastFileName == file_name))
+ {
+ mHeader = mLastHeader;
+ }
+ else
+ {
+ if (!mLastFileName.empty())
+ {
+ mStream.close();
+ mLastFileName = "";
+ }
+
+
+ // open the feature file
+ mStream.open(file_name.c_str(), std::ios::binary);
+ if (!mStream.good())
+ {
+ throw std::runtime_error(std::string("Cannot open feature file: '")
+ + file_name.c_str() + "'");
+ }
+
+
+ if (ReadHTKHeader()) {
+ throw std::runtime_error(std::string("Invalid HTK header in feature file: '")
+ + file_name.c_str() + "'");
+ }
+
+ if (mHeader.mSampleKind & PARAMKIND_C)
+ {
+ // File is in compressed form, scale and pBias vectors
+ // are appended after HTK header.
+ coefs = mHeader.mSampleSize/sizeof(INT_16);
+
+ mpA = (BaseFloat*) realloc(mpA, coefs * sizeof(BaseFloat));
+ mpB = (BaseFloat*) realloc(mpB, coefs * sizeof(BaseFloat));
+
+ if (mpA == NULL || mpB == NULL) {
+ throw std::runtime_error("Insufficient memory");
+ }
+
+ e = ReadHTKFeature(mpA, coefs, 0, 0, 0);
+ e |= ReadHTKFeature(mpB, coefs, 0, 0, 0);
+
+ if (e) {
+ throw std::runtime_error(std::string("Cannot read feature file: '")
+ + file_name.c_str() + "'");
+ }
+
+ mHeader.mNSamples -= 2 * sizeof(FLOAT_32) / sizeof(INT_16);
+ }
+
+ // remember current settings
+ mLastFileName = file_name;
+ mLastHeader = mHeader;
+ }
+
+ if (chptr != NULL) {
+ *chptr = '[';
+ }
+
+ if (chptr == NULL) {
+ // Range [s,e] was not specified
+ from_frame = 0;
+ to_frame = mHeader.mNSamples-1;
+ }
+
+ src_deriv_order = PARAMKIND_T & mHeader.mSampleKind ? 3 :
+ PARAMKIND_A & mHeader.mSampleKind ? 2 :
+ PARAMKIND_D & mHeader.mSampleKind ? 1 : 0;
+ src_E = (PARAMKIND_E & mHeader.mSampleKind) != 0;
+ src_0 = (PARAMKIND_0 & mHeader.mSampleKind) != 0;
+ src_N = ((PARAMKIND_N & mHeader.mSampleKind) != 0) * (src_E + src_0);
+ comp = PARAMKIND_C & mHeader.mSampleKind;
+
+ mHeader.mSampleKind &= ~PARAMKIND_C;
+
+ if (mTargetKind == PARAMKIND_ANON)
+ {
+ mTargetKind = mHeader.mSampleKind;
+ }
+ else if ((mTargetKind & 077) == PARAMKIND_ANON)
+ {
+ mTargetKind &= ~077;
+ mTargetKind |= mHeader.mSampleKind & 077;
+ }
+
+ trg_E = (PARAMKIND_E & mTargetKind) != 0;
+ trg_0 = (PARAMKIND_0 & mTargetKind) != 0;
+ trg_N =((PARAMKIND_N & mTargetKind) != 0) * (trg_E + trg_0);
+
+ coef_size = comp ? sizeof(INT_16) : sizeof(FLOAT_32);
+ coefs = (mHeader.mSampleSize/coef_size + src_N) /
+ (src_deriv_order+1) - src_E - src_0;
+ src_vec_size = (coefs + src_E + src_0) * (src_deriv_order+1) - src_N;
+
+    // Check that coefs is divisible by 1 + the number of derivatives specified in the header
+ if (src_vec_size * coef_size != mHeader.mSampleSize)
+ {
+ throw std::runtime_error(std::string("Invalid HTK header in feature file: '")
+          + file_name + "' mSampleSize does not match parmKind");
+ }
+
+ if (mDerivOrder < 0)
+ mDerivOrder = src_deriv_order;
+
+
+ if ((!src_E && trg_E) || (!src_0 && trg_0) || (src_N && !trg_N) ||
+ (trg_N && !trg_E && !trg_0) || (trg_N && !mDerivOrder) ||
+ (src_N && !src_deriv_order && mDerivOrder) ||
+ ((mHeader.mSampleKind & 077) != (mTargetKind & 077) &&
+ (mHeader.mSampleKind & 077) != PARAMKIND_ANON))
+ {
+ char srcParmKind[64];
+ char trgParmKind[64];
+ memset(srcParmKind,0,64);
+ memset(trgParmKind,0,64);
+
+ ParmKind2Str(mHeader.mSampleKind, srcParmKind);
+ ParmKind2Str(mTargetKind, trgParmKind);
+ throw std::runtime_error(std::string("Cannot convert ") + srcParmKind
+ + " to " + trgParmKind);
+ }
+
+ lo_src_tgz_deriv_order = std::min(src_deriv_order, mDerivOrder);
+ trg_vec_size = (coefs + trg_E + trg_0) * (mDerivOrder+1) - trg_N;
+
+ i = std::min(from_frame, mStartFrameExt);
+ from_frame -= i;
+ ext_left -= i;
+
+ i = std::min(mHeader.mNSamples-to_frame-1, mEndFrameExt);
+ to_frame += i;
+ ext_right -= i;
+
+ if (from_frame > to_frame || from_frame >= mHeader.mNSamples || to_frame< 0)
+ throw std::runtime_error(std::string("Invalid frame range for feature file: '")
+ + file_name.c_str() + "'");
+
+ tot_frames = to_frame - from_frame + 1 + ext_left + ext_right;
+
+
+ TIMER_END(mTim,mTimeOpen);
+
+
+ // initialize matrix
+ rFeatureMatrix.Init(tot_frames, trg_vec_size, false);
+
+ // fill the matrix with features
+ for (i = 0; i <= to_frame - from_frame; i++)
+ {
+ BaseFloat* A = mpA;
+ BaseFloat* B = mpB;
+ BaseFloat* mxPtr = rFeatureMatrix.pRowData(i+ext_left);
+
+ TIMER_START(mTim);
+ // seek to the desired position
+ fseek(mStream.fp(),
+ sizeof(HtkHeader) + (comp ? src_vec_size * 2 * sizeof(FLOAT_32) : 0)
+ + (from_frame + i) * src_vec_size * coef_size,
+ SEEK_SET);
+ TIMER_END(mTim,mTimeSeek);
+
+ TIMER_START(mTim);
+ // read
+ e = ReadHTKFeature(mxPtr, coefs, comp, A, B);
+ TIMER_END(mTim,mTimeRead);
+
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0 && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0 && !trg_N) mxPtr++;
+ if (src_E && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E && !trg_N) mxPtr++;
+
+ for (j = 0; j < lo_src_tgz_deriv_order; j++)
+ {
+ e |= ReadHTKFeature(mxPtr, coefs, comp, A, B);
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0) mxPtr++;
+ if (src_E) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E) mxPtr++;
+ }
+
+ if (e) {
+ std::cout << mHeader.mNSamples << "\n";
+ std::cout << 2 * sizeof(FLOAT_32) / sizeof(INT_16) << "\n";
+ std::cout << "from" << from_frame << "to" << to_frame << "i" << i << "\n";
+
+ std::ostringstream s;
+        s << i << "/" << (to_frame - from_frame + 1);
+ throw std::runtime_error(std::string("Cannot read feature file: '")
+ + file_name + "' frame " + s.str());
+ }
+ }
+
+    // From now on, coefs also includes trg_0 + trg_E!
+ coefs += trg_0 + trg_E;
+
+ // If extension of the matrix to the left or to the right is required,
+ // perform it here
+ for (i = 0; i < ext_left; i++)
+ {
+ memcpy(rFeatureMatrix.pRowData(i),
+ rFeatureMatrix.pRowData(ext_left),
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ for (i = tot_frames - ext_right; i < tot_frames; i++)
+ {
+ memcpy(rFeatureMatrix.pRowData(i),
+ rFeatureMatrix.pRowData(tot_frames - ext_right - 1),
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ // Sentence cepstral mean normalization
+ if( (mpCmnPath == NULL)
+ && !(PARAMKIND_Z & mHeader.mSampleKind)
+ && (PARAMKIND_Z & mTargetKind))
+ {
+ // for each coefficient
+ for(j=0; j < coefs; j++)
+ {
+ BaseFloat norm = 0.0;
+ for(i=0; i < tot_frames; i++) // for each frame
+ {
+ norm += rFeatureMatrix[i][j - trg_N];
+ //norm += fea_mx[i*trg_vec_size - trg_N + j];
+ }
+
+ norm /= tot_frames;
+
+ for(i=0; i < tot_frames; i++) // for each frame
+ rFeatureMatrix[i][j - trg_N] -= norm;
+ //fea_mx[i*trg_vec_size - trg_N + j] -= norm;
+ }
+ }
+
+ // Compute missing derivatives
+ for (; src_deriv_order < mDerivOrder; src_deriv_order++)
+ {
+ int winLen = mDerivWinLengths[src_deriv_order];
+ BaseFloat norm = 0.0;
+
+ for (k = 1; k <= winLen; k++)
+ {
+ norm += 2 * k * k;
+ }
+
+ // for each frame
+ for (i=0; i < tot_frames; i++)
+ {
+ // for each coefficient
+ for (j=0; j < coefs; j++)
+ {
+ //BaseFloat* src = fea_mx + i*trg_vec_size + src_deriv_order*coefs - trg_N + j;
+ BaseFloat* src = &rFeatureMatrix[i][src_deriv_order*coefs - trg_N + j];
+
+ *(src + coefs) = 0.0;
+
+ if (i < winLen || i >= tot_frames-winLen)
+ { // boundaries need special treatment
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ std::min(tot_frames-1-i,k)*rFeatureMatrix.Stride()]
+ -src[-std::min(i, k)*rFeatureMatrix.Stride()]);
+ }
+ }
+ else
+ { // otherwise use more efficient code
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ k * rFeatureMatrix.Stride()]
+ -src[-k * rFeatureMatrix.Stride()]);
+ }
+ }
+ *(src + coefs) /= norm;
+ }
+ }
+ }
+
+ mHeader.mNSamples = tot_frames;
+ mHeader.mSampleSize = trg_vec_size * sizeof(FLOAT_32);
+ mHeader.mSampleKind = mTargetKind & ~(PARAMKIND_D | PARAMKIND_A | PARAMKIND_T);
+
+
+ TIMER_START(mTim);
+ ////////////////////////////////////////////////////////////////////////////
+ /////////////// Cepstral mean and variance normalization ///////////////////
+ ////////////////////////////////////////////////////////////////////////////
+ //.........................................................................
+ if (mpCmnPath != NULL
+ && mpCmnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(rFileNameRecord.Logical(), mpCmnMask, cmn_file_name);
+ // add the path correctly
+
+ if(cmn_file_name == "") {
+ throw std::runtime_error("CMN Matching failed");
+ }
+
+ cmn_file_name.insert(0, "/");
+ cmn_file_name.insert(0, mpCmnPath);
+
+ // read the file
+ ReadCepsNormFile(cmn_file_name.c_str(), &mpLastCmnFile, &mpCmn,
+ mHeader.mSampleKind & ~PARAMKIND_Z, CNF_Mean, coefs);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < coefs; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] -= mpCmn[j];
+ }
+ }
+ }
+
+ mHeader.mSampleKind |= mDerivOrder==3 ? PARAMKIND_D | PARAMKIND_A | PARAMKIND_T :
+ mDerivOrder==2 ? PARAMKIND_D | PARAMKIND_A :
+ mDerivOrder==1 ? PARAMKIND_D : 0;
+
+ //.........................................................................
+ if (mpCvnPath != NULL
+ && mpCvnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(rFileNameRecord.Logical(), mpCvnMask, cvn_file_name);
+ // add the path correctly
+ cvn_file_name.insert(0, "/");
+ cvn_file_name.insert(0, mpCvnPath);
+
+ // read the file
+ ReadCepsNormFile(cvn_file_name.c_str(), &mpLastCvnFile, &mpCvn,
+ mHeader.mSampleKind, CNF_Variance, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvn[j];
+ }
+ }
+ }
+
+ //.........................................................................
+ // process the global covariance file
+ if (mpCvgFile != NULL)
+ {
+ ReadCepsNormFile(mpCvgFile, &mpLastCvgFile, &mpCvg,
+ -1, CNF_VarScale, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvg[j];
+ }
+ }
+ }
+
+ TIMER_END(mTim,mTimeNormalize);
+
+ return true;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ ReadParmKind(const char *str, bool checkBrackets)
+ {
+ unsigned int i;
+ int parmKind =0;
+ int slen = strlen(str);
+
+ if (checkBrackets)
+ {
+ if (str[0] != '<' || str[slen-1] != '>') return -1;
+ str++; slen -= 2;
+ }
+
+ for (; slen >= 0 && str[slen-2] == '_'; slen -= 2)
+ {
+ parmKind |= str[slen-1] == 'E' ? PARAMKIND_E :
+ str[slen-1] == 'N' ? PARAMKIND_N :
+ str[slen-1] == 'D' ? PARAMKIND_D :
+ str[slen-1] == 'A' ? PARAMKIND_A :
+ str[slen-1] == 'C' ? PARAMKIND_C :
+ str[slen-1] == 'Z' ? PARAMKIND_Z :
+ str[slen-1] == 'K' ? PARAMKIND_K :
+ str[slen-1] == '0' ? PARAMKIND_0 :
+ str[slen-1] == 'V' ? PARAMKIND_V :
+ str[slen-1] == 'T' ? PARAMKIND_T : -1;
+
+ if (parmKind == -1) return -1;
+ }
+
+    for (i = 0; i < sizeof(mpParmKindNames) / sizeof(mpParmKindNames[0]); i++)
+ {
+ if (!strncmp(str, mpParmKindNames[i], slen))
+ return parmKind | i;
+ }
+ return -1;
+ }
+
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKHeader (FILE * pOutFp, HtkHeader header, bool swap)
+ {
+ int cc;
+
+ if (swap) {
+ swap4(header.mNSamples);
+ swap4(header.mSamplePeriod);
+ swap2(header.mSampleSize);
+ swap2(header.mSampleKind);
+ }
+
+ fseek (pOutFp, 0L, SEEK_SET);
+ cc = fwrite(&header, sizeof(HtkHeader), 1, pOutFp);
+
+ if (swap) {
+ swap4(header.mNSamples);
+ swap4(header.mSamplePeriod);
+ swap2(header.mSampleSize);
+ swap2(header.mSampleKind);
+ }
+
+ return cc == 1 ? 0 : -1;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKFeature(
+ FILE * pOutFp,
+ FLOAT * pOut,
+ size_t feaLen,
+ bool swap,
+ bool compress,
+ FLOAT* pScale,
+ FLOAT* pBias)
+ {
+ size_t i;
+ size_t cc = 0;
+
+
+ if (compress)
+ {
+ INT_16 s;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ s = pOut[i] * pScale[i] - pBias[i];
+ if (swap)
+ swap2(s);
+ cc += fwrite(&s, sizeof(INT_16), 1, pOutFp);
+ }
+
+ } else {
+ #if !DOUBLEPRECISION
+ if (swap)
+ for (i = 0; i < feaLen; i++)
+ swap4(pOut[i]);
+
+ cc = fwrite(pOut, sizeof(FLOAT_32), feaLen, pOutFp);
+
+ if (swap)
+ for (i = 0; i < feaLen; i++)
+ swap4(pOut[i]);
+ #else
+ FLOAT_32 f;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ f = pOut[i];
+ if (swap)
+ swap4(f);
+ cc += fwrite(&f, sizeof(FLOAT_32), 1, pOutFp);
+ }
+ #endif
+ }
+ return cc == feaLen ? 0 : -1;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKFeatures(
+ FILE * pOutFp,
+ FLOAT * pOut,
+ int nCoeffs,
+ int nSamples,
+ int samplePeriod,
+ int targetKind,
+ bool swap)
+ {
+ HtkHeader header;
+ int i, j;
+ FLOAT *pScale = NULL;
+ FLOAT *pBias = NULL;
+
+ header.mNSamples = nSamples + ((targetKind & PARAMKIND_C) ? 2 * sizeof(FLOAT_32) / sizeof(INT_16) : 0);
+ header.mSamplePeriod = samplePeriod;
+ header.mSampleSize = nCoeffs * ((targetKind & PARAMKIND_C) ? sizeof(INT_16) : sizeof(FLOAT_32));;
+ header.mSampleKind = targetKind;
+
+ WriteHTKHeader (pOutFp, header, swap);
+
+ if(targetKind & PARAMKIND_C) {
+ pScale = (FLOAT*) malloc(nCoeffs * sizeof(FLOAT));
+ pBias = (FLOAT*) malloc(nCoeffs * sizeof(FLOAT));
+ if (pScale == NULL || pBias == NULL) Error("Insufficient memory");
+
+ for(i = 0; i < nCoeffs; i++) {
+ float xmin, xmax;
+ xmin = xmax = pOut[i];
+ for(j = 1; j < nSamples; j++) {
+ if(pOut[j*nCoeffs+i] > xmax) xmax = pOut[j*nCoeffs+i];
+ if(pOut[j*nCoeffs+i] < xmin) xmin = pOut[j*nCoeffs+i];
+ }
+ pScale[i] = (2*32767) / (xmax - xmin);
+ pBias[i] = pScale[i] * (xmax + xmin) / 2;
+
+
+ }
+ if (WriteHTKFeature(pOutFp, pScale, nCoeffs, swap, false, 0, 0)
+ || WriteHTKFeature(pOutFp, pBias, nCoeffs, swap, false, 0, 0)) {
+ return -1;
+ }
+ }
+ for(j = 0; j < nSamples; j++) {
+ if (WriteHTKFeature(pOutFp, &pOut[j*nCoeffs], nCoeffs, swap, targetKind & PARAMKIND_C, pScale, pBias)) {
+ return -1;
+ }
+ }
+ return 0;
+ }
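A quick round-trip check of the compression used above, consistent with the decompression in ReadHTKFeature (the xmin/xmax values are made up):

    float xmin = -4.0f, xmax = 12.0f, x = xmax;
    float scale = (2 * 32767) / (xmax - xmin);   // as computed above
    float bias  = scale * (xmax + xmin) / 2;
    short s     = (short)(x * scale - bias);     // +32767 for x == xmax, -32767 for x == xmin
    float back  = (s + bias) / scale;            // == xmax, matching ReadHTKFeature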
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKFeatures(
+ FILE * pOutFp,
+ int samplePeriod,
+ int targetKind,
+ bool swap,
+ Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ HtkHeader header;
+ size_t i, j;
+ FLOAT *p_scale = NULL;
+ FLOAT *p_bias = NULL;
+ size_t n_samples = rFeatureMatrix.Rows();
+ size_t n_coeffs = rFeatureMatrix.Cols();
+
+ header.mNSamples = n_samples + ((targetKind & PARAMKIND_C) ? 2 * sizeof(FLOAT_32) / sizeof(INT_16) : 0);
+ header.mSamplePeriod = samplePeriod;
+ header.mSampleSize = n_coeffs * ((targetKind & PARAMKIND_C) ? sizeof(INT_16) : sizeof(FLOAT_32));
+ header.mSampleKind = targetKind;
+
+ WriteHTKHeader (pOutFp, header, swap);
+
+ if(targetKind & PARAMKIND_C) {
+ p_scale = (FLOAT*) malloc(n_coeffs * sizeof(FLOAT));
+ p_bias = (FLOAT*) malloc(n_coeffs * sizeof(FLOAT));
+ if (p_scale == NULL || p_bias == NULL) Error("Insufficient memory");
+
+ for(i = 0; i < n_coeffs; i++) {
+ float xmin, xmax;
+ xmin = xmax = rFeatureMatrix[0][i];
+
+ for(j = 1; j < n_samples; j++) {
+ if(rFeatureMatrix[j][i] > xmax) xmax = rFeatureMatrix[j][i];
+ if(rFeatureMatrix[j][i] < xmin) xmin = rFeatureMatrix[j][i];
+ }
+
+ p_scale[i] = (2*32767) / (xmax - xmin);
+ p_bias[i] = p_scale[i] * (xmax + xmin) / 2;
+ }
+
+ if (WriteHTKFeature(pOutFp, p_scale, n_coeffs, swap, false, 0, 0)
+ || WriteHTKFeature(pOutFp, p_bias, n_coeffs, swap, false, 0, 0)) {
+ free(p_scale); free(p_bias);
+ return -1;
+ }
+ }
+
+ for(j = 0; j < n_samples; j++) {
+ if (WriteHTKFeature(pOutFp, rFeatureMatrix[j].pData(), n_coeffs, swap, targetKind & PARAMKIND_C, p_scale, p_bias)) {
+ free(p_scale); free(p_bias);
+ return -1;
+ }
+ }
+
+ //release the compression scale/bias vectors (free(NULL) is a no-op)
+ free(p_scale); free(p_bias);
+ return 0;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+
+
+ bool
+ FeatureRepository::
+ ReadGzipAsciiFeatures(const FileListElem& rFileNameRecord, Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ //build the command
+ std::string cmd("gunzip -c "); cmd += rFileNameRecord.Physical();
+
+ //define buffer
+ const int buf_size=262144;
+ char buf[buf_size];
+ char vbuf[2*buf_size];
+
+ TIMER_START(mTim);
+ //open the pipe
+ FILE* fp = popen(cmd.c_str(),"r");
+ if(fp == NULL) {
+ //2nd try...
+ Warning(std::string("2nd try to open pipe: ")+cmd);
+ sleep(5);
+ fp = popen(cmd.c_str(),"r");
+ if(fp == NULL) {
+ KALDI_ERR << "Cannot open pipe: " << cmd;
+ }
+ }
+ setvbuf(fp,vbuf,_IOFBF,2*buf_size);
+ TIMER_END(mTim,mTimeOpen);
+
+ //string will stay allocated across calls
+ static std::string line; line.resize(0);
+
+ //define matrix storage
+ static int cols = 131072;
+ std::list<std::vector<BaseFloat> > matrix(1);
+ matrix.front().reserve(cols);
+
+ //read all the lines to a vector
+ int line_ctr=1;
+ while(1) {
+ TIMER_START(mTim);
+ if(NULL == fgets(buf,buf_size,fp)) break;
+ TIMER_END(mTim,mTimeRead);
+
+ line += buf;
+ if(*(line.rbegin()) == '\n' || feof(fp)) {
+ //parse the line of numbers
+ TIMER_START(mTim);
+ const char* ptr = line.c_str();
+ char* end;
+ while(1) {
+ //skip whitespace
+ while(isspace(*ptr)) ptr++;
+ if(*ptr == 0) break;
+ //check that a number follows
+ switch(*ptr) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case '.': case '+': case '-':
+ break;
+ default : KALDI_ERR << "A number was expected: " << ptr
+ << " reading from: " << cmd;
+ exit(1);
+ }
+ //read a number
+ BaseFloat val = strtof(ptr,&end); ptr=end;
+ matrix.back().push_back(val);
+ }
+ TIMER_END(mTim,mTimeNormalize);
+ //we have the line of numbers, insert empty row to matrix
+ if(matrix.back().size() > 0 && !feof(fp)) {
+ matrix.push_back(std::vector<BaseFloat>());
+ matrix.back().reserve(matrix.front().size());
+ }
+ //dispose the current line
+ line.resize(0);//but stay allocated...
+ line_ctr++;
+ }
+ }
+ if(matrix.back().size() == 0) matrix.pop_back();
+
+ //get matrix dimensions
+ int rows = matrix.size();
+ /*int*/ cols = matrix.front().size();
+
+ //define interators
+ std::list<std::vector<BaseFloat> >::iterator it_r;
+ std::vector<BaseFloat>::iterator it_c;
+
+ //check that all lines have same size
+ int i;
+ for(i=0,it_r=matrix.begin(); it_r != matrix.end(); ++i,++it_r) {
+ if(it_r->size() != cols) {
+ KALDI_ERR << "All rows must have same dimension, 1st line cols: " << cols
+ << ", " << i << "th line cols: " << it_r->size();
+ }
+ }
+
+ //copy data to matrix
+ TIMER_START(mTim);
+ rFeatureMatrix.Init(rows,cols);
+ int r,c;
+ for(r=0,it_r=matrix.begin(); it_r!=matrix.end(); ++r,++it_r) {
+ for(c=0,it_c=it_r->begin(); it_c!=it_r->end(); ++c,++it_c) {
+ rFeatureMatrix(r,c) = *it_c;
+ }
+ }
+ TIMER_END(mTim,mTimeSeek);
+
+ //close the pipe
+ if(pclose(fp) == -1) {
+ KALDI_ERR << "Cannot close pipe: " << cmd;
+ }
+
+ return true;
+ }
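+
+ // Illustrative example of the input ReadGzipAsciiFeatures expects: a gzipped
+ // ASCII file with one feature frame per line and the same number of
+ // whitespace-separated values on every line (values below are made up):
+ //
+ // 0.12 -1.30 2.05 0.70
+ // 0.10 -1.10 1.98 0.61
+ // 0.09 -1.02 1.90 0.58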
+
+
+ //***************************************************************************
+ //***************************************************************************
+
+} // namespace TNet
diff --git a/src/KaldiLib/.svn/text-base/Features.h.svn-base b/src/KaldiLib/.svn/text-base/Features.h.svn-base
new file mode 100644
index 0000000..0980ab6
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Features.h.svn-base
@@ -0,0 +1,597 @@
+//
+// C++ Interface: %{MODULE}
+//
+// Description:
+//
+//
+// Author: %{AUTHOR} <%{EMAIL}>, (C) %{YEAR}
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef TNet_Features_h
+#define TNet_Features_h
+
+//*****************************************************************************
+//*****************************************************************************
+// Standard includes
+//
+#include <list>
+#include <queue>
+#include <string>
+
+
+//*****************************************************************************
+//*****************************************************************************
+// Specific includes
+//
+#include "Common.h"
+#include "Matrix.h"
+#include "StkStream.h"
+#include "Types.h"
+#include "Timer.h"
+
+
+
+// we need these for reading and writing
+#define UINT_16 unsigned short
+#define UINT_32 unsigned
+#define INT_16 short
+#define INT_32 int
+#define FLOAT_32 float
+#define DOUBLE_64 double
+
+
+#define PARAMKIND_WAVEFORM 0
+#define PARAMKIND_LPC 1
+#define PARAMKIND_LPREFC 2
+#define PARAMKIND_LPCEPSTRA 3
+#define PARAMKIND_LPDELCEP 4
+#define PARAMKIND_IREFC 5
+#define PARAMKIND_MFCC 6
+#define PARAMKIND_FBANK 7
+#define PARAMKIND_MELSPEC 8
+#define PARAMKIND_USER 9
+#define PARAMKIND_DISCRETE 10
+#define PARAMKIND_PLP 11
+#define PARAMKIND_ANON 12
+
+#define PARAMKIND_E 0000100 /// has energy
+#define PARAMKIND_N 0000200 /// absolute energy suppressed
+#define PARAMKIND_D 0000400 /// has delta coefficients
+#define PARAMKIND_A 0001000 /// has acceleration coefficients
+#define PARAMKIND_C 0002000 /// is compressed
+#define PARAMKIND_Z 0004000 /// has zero mean static coef.
+#define PARAMKIND_K 0010000 /// has CRC checksum
+#define PARAMKIND_0 0020000 /// has 0'th cepstral coef.
+#define PARAMKIND_V 0040000 /// has VQ codebook index
+#define PARAMKIND_T 0100000 /// has triple delta coefficients
+
+
+//*****************************************************************************
+//*****************************************************************************
+// Code ...
+//
+
+namespace TNet
+{
+
+ /** **************************************************************************
+ ** **************************************************************************
+ */
+ class FileListElem
+ {
+ private:
+ std::string mLogical; ///< Logical file name representation
+ std::string mPhysical; ///< Physical file name representation
+ float mWeight;
+
+ public:
+ FileListElem(const std::string & rFileName);
+ ~FileListElem() {}
+
+ const std::string &
+ Logical() const { return mLogical; }
+
+ const std::string &
+ Physical() const { return mPhysical; }
+
+ const float&
+ Weight() const { return mWeight; }
+ };
+
+ /** *************************************************************************
+ * @brief
+ */
+ class FeatureRepository
+ {
+ public:
+ /**
+ * @brief HTK parameter file header (see HTK manual)
+ */
+ struct HtkHeader
+ {
+ int mNSamples;
+ int mSamplePeriod;
+ short mSampleSize;
+ short mSampleKind;
+
+ HtkHeader()
+ : mNSamples(0),mSamplePeriod(100000),mSampleSize(0),mSampleKind(12)
+ { }
+ };
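+
+ // Sketch of the corresponding on-disk layout (12 bytes, assuming no struct
+ // padding), matching the swap4/swap4/swap2/swap2 calls in WriteHTKHeader:
+ //
+ // bytes 0-3 mNSamples (int) number of frames in the file
+ // bytes 4-7 mSamplePeriod (int) frame period in 100ns units
+ // bytes 8-9 mSampleSize (short) bytes per frame
+ // bytes 10-11 mSampleKind (short) parameter kind plus qualifier bits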
+
+
+ /**
+ * @brief Extension of the HTK header
+ */
+ struct HtkHeaderExt
+ {
+ int mHeaderSize;
+ int mVersion;
+ int mSampSize;
+ };
+
+
+ /**
+ * @brief Normalization file type
+ */
+ enum CNFileType
+ {
+ CNF_Mean,
+ CNF_Variance,
+ CNF_VarScale
+ };
+
+
+ static int
+ ReadParmKind(const char *pStr, bool checkBrackets);
+
+ static int
+ ParmKind2Str(unsigned parmKind, char *pOutstr);
+
+ static void
+ ReadCepsNormFile(
+ const char* pFileName,
+ char** lastFile,
+ BaseFloat** vecBuff,
+ int sampleKind,
+ CNFileType type,
+ int coefs);
+
+ static const char mpParmKindNames[13][16];
+
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ // PUBLIC SECTION
+ //////////////////////////////////////////////////////////////////////////////
+ public:
+ /// Iterates through the list of feature file records
+ typedef std::list<FileListElem>::iterator ListIterator;
+
+ // some params for loading features
+ bool mSwapFeatures;
+ int mStartFrameExt;
+ int mEndFrameExt;
+ int mTargetKind;
+ int mDerivOrder;
+ int* mDerivWinLengths;
+ const char* mpCvgFile;
+ //:TODO: get rid of these
+ const char* mpCmnPath;
+ const char* mpCmnMask;
+ const char* mpCvnPath;
+ const char* mpCvnMask;
+
+ int mTrace;
+
+
+ // Constructors and destructors
+ /**
+ * @brief Default constructor that creates an empty repository
+ */
+ FeatureRepository() : mDerivWinLengths(NULL), mpCvgFile(NULL),
+ mpCmnPath(NULL), mpCmnMask(NULL), mpCvnPath(NULL), mpCvnMask(NULL),
+ mTrace(0),
+ mpLastFileName(NULL), mLastFileName(""), mpLastCmnFile (NULL),
+ mpLastCvnFile (NULL), mpLastCvgFile (NULL), mpCmn(NULL),
+ mpCvn(NULL), mpCvg(NULL), mpA(NULL), mpB(NULL),
+ mTimeOpen(0), mTimeSeek(0), mTimeRead(0), mTimeNormalize(0)
+ {
+ mInputQueueIterator = mInputQueue.end();
+ }
+
+ /**
+ * @brief Copy constructor which copies filled repository
+ */
+ FeatureRepository(const FeatureRepository& ori)
+ : mDerivWinLengths(NULL), mpCvgFile(NULL),
+ mpCmnPath(NULL), mpCmnMask(NULL), mpCvnPath(NULL), mpCvnMask(NULL),
+ mTrace(0),
+ mpLastFileName(NULL), mLastFileName(""), mpLastCmnFile (NULL),
+ mpLastCvnFile (NULL), mpLastCvgFile (NULL), mpCmn(NULL),
+ mpCvn(NULL), mpCvg(NULL), mpA(NULL), mpB(NULL),
+ mTimeOpen(0), mTimeSeek(0), mTimeRead(0), mTimeNormalize(0)
+ {
+ //copy all the data from the input queue
+ mInputQueue = ori.mInputQueue;
+
+ //initialize like the original
+ Init(
+ ori.mSwapFeatures,
+ ori.mStartFrameExt,
+ ori.mEndFrameExt,
+ ori.mTargetKind,
+ ori.mDerivOrder,
+ ori.mDerivWinLengths,
+ ori.mpCmnPath,
+ ori.mpCmnMask,
+ ori.mpCvnPath,
+ ori.mpCvnMask,
+ ori.mpCvgFile);
+
+ //set the iterator to the end
+ mInputQueueIterator = mInputQueue.end();
+ //copy default header values
+ mHeader = ori.mHeader;
+ }
+
+
+ /**
+ * @brief Destroys the repository
+ */
+ ~FeatureRepository()
+ {
+ if (NULL != mpA) {
+ free(mpA);
+ }
+
+ if (NULL != mpB) {
+ free(mpB);
+ }
+ //remove all entries
+ mInputQueue.clear();
+
+ if(mTrace&4) {
+ std::cout << "[FeatureRepository -- open:" << mTimeOpen << "s seek:" << mTimeSeek << "s read:" << mTimeRead << "s normalize:" << mTimeNormalize << "s]\n";
+ }
+
+ }
+
+
+ /**
+ * @brief Initializes the object using the given parameters
+ *
+ * @param swap Boolean value specifies whether to swap bytes
+ * when reading file or not.
+ * @param extLeft Features read from file are extended with extLeft
+ * initial frames. Normally, these frames are
+ * repetitions of the first feature frame in the file
+ * (with its derivatives, if derivatives are present in
+ * the file). However, if a segment of feature frames
+ * is extracted according to a range specification, the
+ * true feature frames from beyond the segment boundary
+ * are used wherever possible. Note that extLeft can
+ * also be negative; in that case the corresponding
+ * number of initial frames is discarded.
+ * @param extRight This parameter is complementary to extLeft and
+ * controls the extension over the last frame; the last
+ * frame of the file is repeated only when necessary.
+ * @param targetKind The parameter is used to check whether
+ * pHeader->mSampleKind matches the required targetKind
+ * and to control suppression of the 0'th cepstral or
+ * energy coefficients according to the modifiers _E, _0,
+ * and _N. Modifiers _D, _A and _T are ignored;
+ * computation of derivatives is controlled by parameters
+ * derivOrder and derivWinLen. The value PARAMKIND_ANON
+ * ensures that the function does not raise a targetKind
+ * mismatch error and performs no _E or _0 suppression.
+ * @param derivOrder Final features will be augmented with their
+ * derivatives up to order 'derivOrder'. If 'derivOrder'
+ * is a negative value, no new derivatives are appended
+ * and derivatives already present in the feature file
+ * are preserved. Straight features are considered
+ * to be of zero order. If some derivatives are already
+ * present in the feature file, they are not computed
+ * again; only higher order derivatives are appended
+ * if required. Note that an HTK feature file cannot
+ * contain higher order derivatives (e.g. double delta)
+ * without containing lower ones (e.g. delta).
+ * Derivatives present in the feature file that are of
+ * higher order than required are discarded.
+ * Derivatives are computed in the final stage from the
+ * (extracted segment of) feature frames, possibly
+ * extended by repeated frames, using the same formula
+ * that is employed by the HTK tools. Lengths of the
+ * windows used for the computation of derivatives are
+ * passed in parameter derivWinLen. To compute
+ * derivatives for frames close to the boundaries,
+ * frames before the first and after the last frame (of
+ * the extracted segment) are considered to be (yet
+ * another) repetitions of the first and the last frame,
+ * respectively. If the segment of frames is extracted
+ * according to a range specification and parameters
+ * extLeft and extRight are set to zero, the first and
+ * the last frames of the segment are considered to be
+ * repeated, even though the true feature frames from
+ * beyond the segment boundary may be available in the
+ * file. Therefore, a segment extracted from features
+ * that were augmented with derivatives beforehand will
+ * differ from the same segment augmented with
+ * derivatives by this function; the difference is of
+ * course only on the boundaries and only in the
+ * derivatives. This "incorrect" behavior was chosen to
+ * fully simulate the behavior of the HTK tools. To
+ * obtain a more correct computation of derivatives, use
+ * parameters extLeft and extRight, which correctly
+ * extend the segment with the true frames (if possible),
+ * and ignore the first extLeft and last extRight frames
+ * of the resulting feature matrix. For this purpose,
+ * both extLeft and extRight should be set to the sum of
+ * all values in the array derivWinLen.
+ * @param pDerivWinLen Array of size derivOrder specifying the lengths of
+ * the windows used for the computation of derivatives.
+ * Each value represents the one-sided context used in
+ * the computation, so each window length is twice the
+ * value from the array plus one. The value at index
+ * zero specifies the window length for first order
+ * derivatives (delta); higher indices correspond to
+ * higher order derivatives.
+ * @param pCmnPath Cepstral mean normalization path
+ * @param pCmnMask Cepstral mean normalization mask
+ * @param pCvnPath Cepstral variance normalization path
+ * @param pCvnMask Cepstral variance normalization mask
+ * @param pCvgFile Global variance file to be parsed
+ *
+ * The given parameters are necessary for proper feature extraction
+ */
+ void
+ Init(
+ bool swap,
+ int extLeft,
+ int extRight,
+ int targetKind,
+ int derivOrder,
+ int* pDerivWinLen,
+ const char* pCmnPath,
+ const char* pCmnMask,
+ const char* pCvnPath,
+ const char* pCvnMask,
+ const char* pCvgFile);
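+
+ // Minimal usage sketch (the file name and parameter values below are
+ // illustrative only, not prescribed by this header):
+ //
+ // FeatureRepository features;
+ // int deriv_win_lengths[] = {2, 2}; // delta and delta-delta windows
+ // features.Init(false, 0, 0, PARAMKIND_MFCC, 2, deriv_win_lengths,
+ // NULL, NULL, NULL, NULL, NULL);
+ // features.AddFileList("feats.scp");
+ // Matrix<BaseFloat> feats;
+ // for(features.Rewind(); !features.EndOfList(); features.MoveNext()) {
+ // features.ReadFullMatrix(feats);
+ // }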
+
+
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ /**
+ * @brief Returns a reference to the current file header
+ */
+ const HtkHeader&
+ CurrentHeader() const
+ { return mHeader; }
+
+ /**
+ * @brief Returns a reference to the current extended file header
+ */
+ const HtkHeaderExt&
+ CurrentHeaderExt() const
+ { return mHeaderExt; }
+
+ /**
+ * @brief Returns the current file details
+ *
+ * @return Reference to a class @c FileListElem
+ *
+ * Logical and physical file names are stored in @c FileListElem class
+ */
+ const std::list<FileListElem>::iterator&
+ pCurrentRecord() const
+ { return mInputQueueIterator; }
+
+
+ /**
+ * @brief Returns the following file details
+ *
+ * @return Reference to a class @c FileListElem
+ *
+ * Logical and physical file names are stored in @c FileListElem class
+ */
+ const std::list<FileListElem>::iterator&
+ pFollowingRecord() const
+ { return mInputQueueIterator; }
+
+
+ void
+ Rewind()
+ { mInputQueueIterator = mInputQueue.begin(); }
+
+
+ /**
+ * @brief Adds a single feature file to the repository
+ * @param rFileName file to read features from
+ */
+ void
+ AddFile(const std::string & rFileName);
+
+
+ /**
+ * @brief Adds a list of feature files to the repository
+ * @param rFileName feature list file to read from
+ */
+ void
+ AddFileList(const char* pFileName, const char* pFilter = "");
+
+
+ const FileListElem&
+ Current() const
+ { return *mInputQueueIterator; }
+
+
+ /**
+ * @brief Moves to the next record
+ */
+ void
+ MoveNext();
+
+ /**
+ * @brief Reads full feature matrix from a feature file
+ * @param rMatrix matrix to be created and filled with read data
+ * @return number of successfully read feature vectors
+ */
+ bool
+ ReadFullMatrix(Matrix<BaseFloat>& rMatrix);
+
+ bool
+ WriteFeatureMatrix(const Matrix<BaseFloat>& rMatrix, const std::string& filename, int targetKind, int samplePeriod);
+
+ size_t
+ QueueSize() const {return mInputQueue.size(); }
+
+ /**
+ * @brief Reads feature vectors from a feature file
+ * @param rMatrix matrix to be (only!) filled with read data.
+ * @return number of successfully read feature vectors
+ *
+ * The function tries to fill @c pMatrix with feature vectors coming from
+ * the current stream. If there are fewer vectors left in the stream,
+ * they are used and the true number of successfully read vectors is returned.
+ */
+ int
+ ReadPartialMatrix(Matrix<BaseFloat>& rMatrix);
+
+ /**
+ * @brief Filters the records of this repository based on HTK logical name
+ * masking. If pFilter equals NULL, all source repository entries are
+ * copied to the rOut repository.
+ *
+ * @param pFilter HTK mask that defines the filter
+ * @param pValue Filter value
+ * @param rOut Reference to the new FeatureRepository which will be filled
+ * with the matching records
+ */
+ void
+ HtkFilter(const char* pFilter, const char* pValue, FeatureRepository& rOut);
+
+
+ /**
+ * @brief Filters the records of this repository based on HTK logical name
+ * masking and returns a list of unique names. If pFilter equals NULL,
+ * a single name "default" is returned.
+ *
+ * @param pFilter HTK mask that defines the filter
+ * @param rOut Reference to the list of results (std::list< std::string >)
+ */
+ void
+ HtkSelection(const char* pFilter, std::list< std::string >& rOut);
+
+
+ /**
+ * @brief Returns true if there are no feature files left on input
+ */
+ bool
+ EndOfList() const
+ { return mInputQueueIterator == mInputQueue.end(); }
+
+ const std::string&
+ CurrentIndexFileName() const
+ { return mCurrentIndexFileName; }
+
+ friend
+ void
+ AddFileListToFeatureRepositories(
+ const char* pFileName,
+ const char* pFilter,
+ std::queue<FeatureRepository *> &featureRepositoryList);
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PRIVATE SECTION
+////////////////////////////////////////////////////////////////////////////////
+ private:
+ /// List (queue) of input feature files
+ std::list<FileListElem> mInputQueue;
+ std::list<FileListElem>::iterator mInputQueueIterator;
+
+ std::string mCurrentIndexFileName;
+ std::string mCurrentIndexFileDir;
+ std::string mCurrentIndexFileExt;
+
+ /// current stream
+ IStkStream mStream;
+
+ // stores feature file's HTK header
+ HtkHeader mHeader;
+ HtkHeaderExt mHeaderExt;
+
+
+ // this group of variables serves for working with the same physical
+ // file name more than once
+ char* mpLastFileName;
+ std::string mLastFileName;
+ char* mpLastCmnFile;
+ char* mpLastCvnFile;
+ char* mpLastCvgFile;
+ BaseFloat* mpCmn;
+ BaseFloat* mpCvn;
+ BaseFloat* mpCvg;
+ HtkHeader mLastHeader;
+ BaseFloat* mpA;
+ BaseFloat* mpB;
+
+
+
+ Timer mTim;
+ double mTimeOpen;
+ double mTimeSeek;
+ double mTimeRead;
+ double mTimeNormalize;
+
+
+ // Reads HTK feature file header
+ int
+ ReadHTKHeader();
+
+ int
+ ReadHTKFeature(BaseFloat* pIn,
+ size_t feaLen,
+ bool decompress,
+ BaseFloat* pScale,
+ BaseFloat* pBias);
+
+
+ bool
+ ReadHTKFeatures(const std::string& rFileName, Matrix<BaseFloat>& rFeatureMatrix);
+
+ bool
+ ReadHTKFeatures(const FileListElem& rFileNameRecord, Matrix<BaseFloat>& rFeatureMatrix);
+
+
+ int
+ WriteHTKHeader (FILE* fp_out, HtkHeader header, bool swap);
+
+ int
+ WriteHTKFeature (FILE* fp_out, FLOAT *out, size_t fea_len, bool swap, bool compress, FLOAT* pScale, FLOAT* pBias);
+
+ int
+ WriteHTKFeatures(FILE* pOutFp, FLOAT * pOut, int nCoeffs, int nSamples, int samplePeriod, int targetKind, bool swap);
+
+ int
+ WriteHTKFeatures(
+ FILE * pOutFp,
+ int samplePeriod,
+ int targetKind,
+ bool swap,
+ Matrix<BaseFloat>& rFeatureMatrix
+ );
+
+ bool
+ ReadGzipAsciiFeatures(const FileListElem& rFileNameRecord, Matrix<BaseFloat>& rFeatureMatrix);
+
+ }; // class FeatureRepository
+
+} //namespace TNet
+
+#endif // TNet_Features_h
diff --git a/src/KaldiLib/.svn/text-base/Labels.cc.svn-base b/src/KaldiLib/.svn/text-base/Labels.cc.svn-base
new file mode 100644
index 0000000..c76b72c
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Labels.cc.svn-base
@@ -0,0 +1,215 @@
+#include "Labels.h"
+#include "Timer.h"
+
+
+namespace TNet {
+
+
+ ////////////////////////////////////////////////////////////////////////
+ // Class LabelRepository::
+ void
+ LabelRepository::
+ Init(const char* pLabelMlfFile, const char* pOutputLabelMapFile, const char* pLabelDir, const char* pLabelExt)
+ {
+ assert(NULL != pLabelMlfFile);
+ assert(NULL != pOutputLabelMapFile);
+
+ // initialize the label streams
+ delete mpLabelStream; //if NULL, does nothing
+ delete _mpLabelStream;
+ _mpLabelStream = new std::ifstream(pLabelMlfFile);
+ mpLabelStream = new IMlfStream(*_mpLabelStream);
+
+ // Label stream is initialized, just test it
+ if(!mpLabelStream->good())
+ Error(std::string("Cannot open Label MLF file: ")+pLabelMlfFile);
+
+ // Index the labels (good for randomized file lists)
+ Timer tim; tim.Start();
+ mpLabelStream->Index();
+ tim.End(); mIndexTime += tim.Val();
+
+ // Read the state-label to state-id map
+ ReadOutputLabelMap(pOutputLabelMapFile);
+
+ // Store the label dir/ext
+ mpLabelDir = pLabelDir;
+ mpLabelExt = pLabelExt;
+ }
+
+
+
+ void
+ LabelRepository::
+ GenDesiredMatrix(BfMatrix& rDesired, size_t nFrames, size_t sourceRate, const char* pFeatureLogical)
+ {
+ //timer
+ Timer tim; tim.Start();
+
+ //Get the MLF stream reference...
+ IMlfStream& mLabelStream = *mpLabelStream;
+ //Build the file name of the label
+ MakeHtkFileName(mpLabelFile, pFeatureLogical, mpLabelDir, mpLabelExt);
+
+ //Find block in MLF file
+ mLabelStream.Open(mpLabelFile);
+ if(!mLabelStream.good()) {
+ Error(std::string("Cannot open label MLF record: ") + mpLabelFile);
+ }
+
+
+ //resize the matrix
+ if(nFrames < 1) {
+ KALDI_ERR << "Number of frames:" << nFrames << " is lower than 1!!!\n"
+ << pFeatureLogical;
+ }
+ rDesired.Init(nFrames, mLabelMap.size(), true); //true: Zero()
+
+ //aux variables
+ std::string line, state;
+ unsigned long long beg, end;
+ size_t state_index;
+ size_t trunc_frames = 0;
+ TagToIdMap::iterator it;
+
+ //parse the label file
+ while(!mLabelStream.eof()) {
+ std::getline(mLabelStream, line);
+ if(line == "") continue; //skip newlines/comments from MLF
+ if(line[0] == '#') continue;
+
+ std::istringstream& iss = mGenDesiredMatrixStream;
+ iss.clear();
+ iss.str(line);
+
+ //parse the line
+ //begin
+ iss >> std::ws >> beg;
+ if(iss.fail()) {
+ KALDI_ERR << "Cannot parse column 1 (begin)\n"
+ << "line: " << line << "\n"
+ << "file: " << mpLabelFile << "\n";
+ }
+ //end
+ iss >> std::ws >> end;
+ if(iss.fail()) {
+ KALDI_ERR << "Cannot parse column 2 (end)\n"
+ << "line: " << line << "\n"
+ << "file: " << mpLabelFile << "\n";
+ }
+ //state tag
+ iss >> std::ws >> state;
+ if(iss.fail()) {
+ KALDI_ERR << "Cannot parse column 3 (state_tag)\n"
+ << "line: " << line << "\n"
+ << "file: " << mpLabelFile << "\n";
+ }
+
+ //divide beg/end by sourceRate and round up to get interval of frames
+ beg = (beg+sourceRate/2)/sourceRate;
+ end = (end+sourceRate/2)/sourceRate;
+ //beg = (int)round(beg / (double)sourceRate);
+ //end = (int)round(end / (double)sourceRate);
+
+ //find the state id
+ it = mLabelMap.find(state);
+ if(mLabelMap.end() == it) {
+ Error(std::string("Unknown state tag: '") + state + "' file:'" + mpLabelFile);
+ }
+ state_index = it->second;
+
+ // Fill the desired matrix
+ for(unsigned long long frame=beg; frame<end; frame++) {
+ //don't write past the end of the matrix (the transcript may be longer than the feature file)
+ if(frame >= (int)rDesired.Rows()) { trunc_frames++; continue; }
+
+ //check that this frame has not been assigned yet:
+ if(0.0 != rDesired[frame].Sum()) {
+ //ERROR!!!
+ //find out what was previously filled!!!
+ BaseFloat max = rDesired[frame].Max();
+ int idx = -1;
+ for(int i=0; i<(int)rDesired[frame].Dim(); i++) {
+ if(rDesired[frame][i] == max) idx = i;
+ }
+ for(it=mLabelMap.begin(); it!=mLabelMap.end(); ++it) {
+ if((int)it->second == idx) break;
+ }
+ std::string state_prev = "error";
+ if(it != mLabelMap.end()) {
+ state_prev = it->first;
+ }
+ //print the error message
+ std::ostringstream os;
+ os << "Frame already assigned to other state, "
+ << " file: " << mpLabelFile
+ << " frame: " << frame
+ << " nframes: " << nFrames
+ << " sum: " << rDesired[frame].Sum()
+ << " previously assigned to: " << state_prev << "(" << idx << ")"
+ << " now should be assigned to: " << state << "(" << state_index << ")"
+ << "\n";
+ Error(os.str());
+ }
+
+ //fill the row
+ rDesired[(size_t)frame][state_index] = 1.0f;
+ }
+ }
+
+ mLabelStream.Close();
+
+ //check the desired matrix (rows sum up to 1.0)
+ for(size_t i=0; i<rDesired.Rows(); ++i) {
+ float desired_row_sum = rDesired[i].Sum();
+ if(desired_row_sum != 1.0) {
+ std::ostringstream os;
+ os << "Desired vector sum isn't 1.0, "
+ << " file: " << mpLabelFile
+ << " row: " << i
+ << " nframes: " << nFrames
+ << " content: " << rDesired[i]
+ << " sum: " << desired_row_sum << "\n";
+ Error(os.str());
+ }
+ }
+
+ //warning when truncating many frames
+ if(trunc_frames > 10) {
+ std::ostringstream os;
+ os << "Truncated frames: " << trunc_frames
+ << " Check sourcerate in features and validity of labels\n";
+ Warning(os.str());
+ }
+
+ //timer
+ tim.End(); mGenDesiredMatrixTime += tim.Val();
+ }
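+
+ // Illustrative example (tag and times are made up): assuming HTK-style label
+ // times in 100ns units and sourceRate=100000 (10ms frames), a label line
+ //
+ // 0 300000 sil
+ //
+ // covers frames 0..2, so rows 0-2 of rDesired get a 1.0 in the column that
+ // the output label map assigns to "sil".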
+
+
+
+ void
+ LabelRepository::
+ ReadOutputLabelMap(const char* file)
+ {
+ assert(mLabelMap.size() == 0);
+ int i = 0;
+ std::string state_tag;
+ std::ifstream in(file);
+ if(!in.good())
+ Error(std::string("Cannot open OutputLabelMapFile: ")+file);
+
+ in >> std::ws;
+ while(!in.eof()) {
+ in >> state_tag;
+ in >> std::ws;
+ assert(mLabelMap.find(state_tag) == mLabelMap.end());
+ mLabelMap[state_tag] = i++;
+ }
+
+ in.close();
+ assert(mLabelMap.size() > 0);
+ }
+
+
+}//namespace
diff --git a/src/KaldiLib/.svn/text-base/Labels.h.svn-base b/src/KaldiLib/.svn/text-base/Labels.h.svn-base
new file mode 100644
index 0000000..6b78d1a
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Labels.h.svn-base
@@ -0,0 +1,75 @@
+#ifndef _LABELS_H_
+#define _LABELS_H_
+
+
+#include "Matrix.h"
+#include "MlfStream.h"
+#include "Features.h"
+
+#include <map>
+#include <iostream>
+
+namespace TNet {
+
+
+ class FeaCatPool;
+
+ /**
+ * Desired matrix generation object.
+ * Supports background reading and caching, but can also be
+ * used in the foreground via GenDesiredMatrix().
+ */
+ class LabelRepository
+ {
+ typedef std::map<std::string,size_t> TagToIdMap;
+
+ public:
+ LabelRepository()
+ : _mpLabelStream(NULL), mpLabelStream(NULL), mpLabelDir(NULL), mpLabelExt(NULL), mGenDesiredMatrixTime(0), mIndexTime(0), mTrace(0)
+ { }
+
+ ~LabelRepository()
+ {
+ if(mTrace&4) {
+ std::cout << "[LabelRepository -- indexing:" << mIndexTime << "s"
+ " genDesiredMatrix:" << mGenDesiredMatrixTime << "s]" << std::endl;
+ }
+ delete mpLabelStream;
+ delete _mpLabelStream;
+ }
+
+ /// Initialize the LabelRepository
+ void Init(const char* pLabelMlfFile, const char* pOutputLabelMapFile, const char* pLabelDir, const char* pLabelExt);
+
+ /// Set trace level
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ /// Get desired matrix from labels
+ void GenDesiredMatrix(BfMatrix& rDesired, size_t nFrames, size_t sourceRate, const char* pFeatureLogical);
+
+ private:
+ /// Prepare the state-label to state-id map
+ void ReadOutputLabelMap(const char* file);
+
+ private:
+ // Streams and state-map
+ std::ifstream* _mpLabelStream; ///< Helper stream for Label stream
+ IMlfStream* mpLabelStream; ///< Label stream
+ std::istringstream mGenDesiredMatrixStream; ///< Label file parsing stream
+
+ const char* mpLabelDir; ///< Label dir in MLF
+ const char* mpLabelExt; ///< Label ext in MLF
+ char mpLabelFile[4096]; ///< Buffer for filenames in MLF
+
+ TagToIdMap mLabelMap; ///< Map of state tags to net output indices
+
+ double mGenDesiredMatrixTime;
+ float mIndexTime;
+
+ int mTrace;
+ };
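+
+ // Minimal usage sketch (file names and argument values are illustrative only):
+ //
+ // LabelRepository labels;
+ // labels.Init("train.mlf", "state_map.txt", "*", "lab");
+ // BfMatrix desired;
+ // labels.GenDesiredMatrix(desired, n_frames, 100000, feature_logical_name);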
+
+}//namespace
+
+#endif
diff --git a/src/KaldiLib/.svn/text-base/Makefile.svn-base b/src/KaldiLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..0c238f4
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,28 @@
+
+include ../tnet.mk
+
+INCLUDE = -I.
+
+all: libKaldiLib.a
+
+libKaldiLib.a: $(OBJ)
+ $(AR) ruv $@ $(OBJ)
+ $(RANLIB) $@
+
+%.o : %.cc
+ $(CXX) -o $@ -c $< $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+
+.PHONY: clean doc depend
+clean:
+ rm -f *.o *.a
+
+doc:
+ doxygen ../../doc/doxyfile_TNetLib
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) > .depend.mk
+
+-include .depend.mk
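+
+# Typical invocation (assuming ../tnet.mk defines CXX, CXXFLAGS, AR, RANLIB and
+# the OBJ list): run 'make depend' once to generate .depend.mk, then 'make' to
+# build libKaldiLib.a.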
+
diff --git a/src/KaldiLib/.svn/text-base/MathAux.h.svn-base b/src/KaldiLib/.svn/text-base/MathAux.h.svn-base
new file mode 100644
index 0000000..c08e836
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/MathAux.h.svn-base
@@ -0,0 +1,117 @@
+#ifndef TNet_MathAux_h
+#define TNet_MathAux_h
+
+#include <cmath>
+
+
+#if !defined(SQR)
+# define SQR(x) ((x) * (x))
+#endif
+
+
+#if !defined(LOG_0)
+# define LOG_0 (-1.0e10)
+#endif
+
+#if !defined(LOG_MIN)
+# define LOG_MIN (0.5 * LOG_0)
+#endif
+
+
+#ifndef DBL_EPSILON
+#define DBL_EPSILON 2.2204460492503131e-16
+#endif
+
+
+#ifndef M_PI
+# define M_PI 3.1415926535897932384626433832795
+#endif
+
+#define M_LOG_2PI 1.8378770664093454835606594728112
+
+
+#if DOUBLEPRECISION
+# define FLOAT double
+# define EPSILON DBL_EPSILON
+# define FLOAT_FMT "%lg"
+# define swapFLOAT swap8
+# define _ABS fabs
+# define _COS cos
+# define _EXP exp
+# define _LOG log
+# define _SQRT sqrt
+#else
+# define FLOAT float
+# define EPSILON FLT_EPSILON
+# define FLOAT_FMT "%g"
+# define swapFLOAT swap4
+# define _ABS fabsf
+# define _COS cosf
+# define _EXP expf
+# define _LOG logf
+# define _SQRT sqrtf
+#endif
+
+namespace TNet
+{
+ inline float frand(){ // random between 0 and 1.
+ return (float(rand()) + 1.0f) / (float(RAND_MAX)+2.0f);
+ }
+ inline float gauss_rand(){
+ return _SQRT( -2.0f * _LOG(frand()) ) * _COS(2.0f*float(M_PI)*frand());
+ }
+
+ static const double gMinLogDiff = log(DBL_EPSILON);
+
+ //***************************************************************************
+ //***************************************************************************
+ inline double
+ LogAdd(double x, double y)
+ {
+ double diff;
+
+ if (x < y) {
+ diff = x - y;
+ x = y;
+ } else {
+ diff = y - x;
+ }
+
+ double res;
+ if (x >= LOG_MIN) {
+ if (diff >= gMinLogDiff) {
+ res = x + log(1.0 + exp(diff));
+ } else {
+ res = x;
+ }
+ } else {
+ res = LOG_0;
+ }
+ return res;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ inline double
+ LogSub(double x, double y) // returns log(exp(x) - exp(y)). Throws exception if y>=x.
+ {
+
+ if(y >= x){
+ if(y==x) return LOG_0;
+ else throw std::runtime_error("LogSub: cannot subtract a larger from a smaller number.");
+ }
+
+ double diff = y - x; // Will be negative.
+
+ double res = x + log(1.0 - exp(diff));
+
+ if(res != res) // test for res==NaN.. could happen if diff ~0.0, so 1.0-exp(diff) == 0.0 to machine precision.
+ res = LOG_0;
+ return res;
+ }
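+
+ // Worked example (a sanity check, not part of the original header):
+ // LogAdd(log(2.0), log(3.0)) ~= log(5.0)
+ // LogSub(log(5.0), log(3.0)) ~= log(2.0)
+ // i.e. both functions combine probabilities while staying in the log domain;
+ // LogAdd clamps the result to LOG_0 when the larger argument is below LOG_MIN.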
+
+} // namespace TNet
+
+
+#endif
diff --git a/src/KaldiLib/.svn/text-base/Matrix.cc.svn-base b/src/KaldiLib/.svn/text-base/Matrix.cc.svn-base
new file mode 100644
index 0000000..f9d5909
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Matrix.cc.svn-base
@@ -0,0 +1,295 @@
+/**
+ * @file Matrix.cc
+ *
+ * Implementation of specialized Matrix template methods
+ */
+
+
+#include "Matrix.h"
+
+#if defined(HAVE_CLAPACK)
+#include "CLAPACK-3.1.1.1/INCLUDE/f2c.h"
+extern "C" {
+#include "CLAPACK-3.1.1.1/INCLUDE/clapack.h"
+}
+// These are some stupid clapack things that we want to get rid of
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#endif
+
+
+
+
+namespace TNet
+{
+ //***************************************************************************
+ //***************************************************************************
+#ifdef HAVE_ATLAS
+ //***************************************************************************
+ //***************************************************************************
+ template<>
+ Matrix<float> &
+ Matrix<float>::
+ Invert(float *LogDet, float *DetSign, bool inverse_needed)
+ {
+ assert(Rows() == Cols());
+
+#if defined(HAVE_CLAPACK)
+ integer* pivot = new integer[mMRows];
+ integer M = Rows();
+ integer N = Cols();
+ integer LDA = mStride;
+ integer result;
+ integer l_work = std::max<integer>(1, N);
+ float* p_work = new float[l_work];
+
+ sgetrf_(&M, &N, mpData, &LDA, pivot, &result);
+ const int pivot_offset=1;
+#else
+ int* pivot = new int[mMRows];
+ int result = clapack_sgetrf(CblasColMajor, Rows(), Cols(), mpData, mStride, pivot);
+ const int pivot_offset=0;
+#endif
+ assert(result >= 0 && "Call to CLAPACK sgetrf_ or ATLAS clapack_sgetrf called with wrong arguments");
+ if(result != 0) {
+ Error("Matrix is singular");
+ }
+ if(DetSign!=NULL){ *DetSign=1.0; for(size_t i=0;i<mMRows;i++) if(pivot[i]!=(int)i+pivot_offset) *DetSign *= -1.0; }
+ if(LogDet!=NULL||DetSign!=NULL){ // Compute log determinant...
+ assert(mMRows==mMCols); // Can't take determinant of non-square matrix.
+ *LogDet = 0.0; float prod = 1.0;
+ for(size_t i=0;i<mMRows;i++){
+ prod *= (*this)(i,i);
+ if(i==mMRows-1 || fabs(prod)<1.0e-10 || fabs(prod)>1.0e+10){
+ if(LogDet!=NULL) *LogDet += log(fabs(prod));
+ if(DetSign!=NULL) *DetSign *= (prod>0?1.0:-1.0);
+ prod=1.0;
+ }
+ }
+ }
+#if defined(HAVE_CLAPACK)
+ if(inverse_needed) sgetri_(&M, mpData, &LDA, pivot, p_work, &l_work, &result);
+ delete [] p_work;
+ delete [] pivot;
+#else
+ if(inverse_needed) result = clapack_sgetri(CblasColMajor, Rows(), mpData, mStride, pivot);
+ delete [] pivot;
+#endif
+ assert(result == 0 && "Call to CLAPACK sgetri_ or ATLAS clapack_sgetri called with wrong arguments");
+ return *this;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ template<>
+ Matrix<double> &
+ Matrix<double>::
+ Invert(double *LogDet, double *DetSign, bool inverse_needed)
+ {
+ assert(Rows() == Cols());
+
+#if defined(HAVE_CLAPACK)
+ integer* pivot = new integer[mMRows];
+ integer M = Rows();
+ integer N = Cols();
+ integer LDA = mStride;
+ integer result;
+ integer l_work = std::max<integer>(1, N);
+ double* p_work = new double[l_work];
+
+ dgetrf_(&M, &N, mpData, &LDA, pivot, &result);
+ const int pivot_offset=1;
+#else
+ int* pivot = new int[mMRows];
+ int result = clapack_dgetrf(CblasColMajor, Rows(), Cols(), mpData, mStride, pivot);
+ const int pivot_offset=0;
+#endif
+ assert(result >= 0 && "Call to CLAPACK dgetrf_ or ATLAS clapack_dgetrf called with wrong arguments");
+ if(result != 0) {
+ Error("Matrix is singular");
+ }
+ if(DetSign!=NULL){ *DetSign=1.0; for(size_t i=0;i<mMRows;i++) if(pivot[i]!=(int)i+pivot_offset) *DetSign *= -1.0; }
+ if(LogDet!=NULL||DetSign!=NULL){ // Compute log determinant...
+ assert(mMRows==mMCols); // Can't take determinant of non-square matrix.
+ *LogDet = 0.0; double prod = 1.0;
+ for(size_t i=0;i<mMRows;i++){
+ prod *= (*this)(i,i);
+ if(i==mMRows-1 || fabs(prod)<1.0e-10 || fabs(prod)>1.0e+10){
+ if(LogDet!=NULL) *LogDet += log(fabs(prod));
+ if(DetSign!=NULL) *DetSign *= (prod>0?1.0:-1.0);
+ prod=1.0;
+ }
+ }
+ }
+#if defined(HAVE_CLAPACK)
+ if(inverse_needed) dgetri_(&M, mpData, &LDA, pivot, p_work, &l_work, &result);
+ delete [] p_work;
+ delete [] pivot;
+#else
+ if(inverse_needed) result = clapack_dgetri(CblasColMajor, Rows(), mpData, mStride, pivot);
+ delete [] pivot;
+#endif
+ assert(result == 0 && "Call to CLAPACK dgetri_ or ATLAS clapack_dgetri called with wrong arguments");
+ return *this;
+ }
+
+ template<>
+ Matrix<float> &
+ Matrix<float>::
+ BlasGer(const float alpha, const Vector<float>& rA, const Vector<float>& rB)
+ {
+ assert(rA.Dim() == mMRows && rB.Dim() == mMCols);
+ cblas_sger(CblasRowMajor, rA.Dim(), rB.Dim(), alpha, rA.pData(), 1, rB.pData(), 1, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<double> &
+ Matrix<double>::
+ BlasGer(const double alpha, const Vector<double>& rA, const Vector<double>& rB)
+ {
+ assert(rA.Dim() == mMRows && rB.Dim() == mMCols);
+ cblas_dger(CblasRowMajor, rA.Dim(), rB.Dim(), alpha, rA.pData(), 1, rB.pData(), 1, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ BlasGemm(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA,
+ const Matrix<float>& rB, MatrixTrasposeType transB,
+ const float beta)
+ {
+ assert((transA == NO_TRANS && transB == NO_TRANS && rA.Cols() == rB.Rows() && rA.Rows() == Rows() && rB.Cols() == Cols())
+ || (transA == TRANS && transB == NO_TRANS && rA.Rows() == rB.Rows() && rA.Cols() == Rows() && rB.Cols() == Cols())
+ || (transA == NO_TRANS && transB == TRANS && rA.Cols() == rB.Cols() && rA.Rows() == Rows() && rB.Rows() == Cols())
+ || (transA == TRANS && transB == TRANS && rA.Rows() == rB.Cols() && rA.Cols() == Rows() && rB.Rows() == Cols()));
+
+ cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), static_cast<CBLAS_TRANSPOSE>(transB),
+ Rows(), Cols(), transA == NO_TRANS ? rA.Cols() : rA.Rows(),
+ alpha, rA.mpData, rA.mStride, rB.mpData, rB.mStride,
+ beta, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ BlasGemm(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA,
+ const Matrix<double>& rB, MatrixTrasposeType transB,
+ const double beta)
+ {
+ assert((transA == NO_TRANS && transB == NO_TRANS && rA.Cols() == rB.Rows() && rA.Rows() == Rows() && rB.Cols() == Cols())
+ || (transA == TRANS && transB == NO_TRANS && rA.Rows() == rB.Rows() && rA.Cols() == Rows() && rB.Cols() == Cols())
+ || (transA == NO_TRANS && transB == TRANS && rA.Cols() == rB.Cols() && rA.Rows() == Rows() && rB.Rows() == Cols())
+ || (transA == TRANS && transB == TRANS && rA.Rows() == rB.Cols() && rA.Cols() == Rows() && rB.Rows() == Cols()));
+
+ cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), static_cast<CBLAS_TRANSPOSE>(transB),
+ Rows(), Cols(), transA == NO_TRANS ? rA.Cols() : rA.Rows(),
+ alpha, rA.mpData, rA.mStride, rB.mpData, rB.mStride,
+ beta, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ Axpy(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA){
+ int aStride = (int)rA.mStride, stride = mStride;
+ float *adata=rA.mpData, *data=mpData;
+ if(transA == NO_TRANS){
+ assert(rA.Rows()==Rows() && rA.Cols()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata+=aStride,data+=stride)
+ cblas_saxpy(mMCols, alpha, adata, 1, data, 1);
+ } else {
+ assert(rA.Cols()==Rows() && rA.Rows()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata++,data+=stride)
+ cblas_saxpy(mMCols, alpha, adata, aStride, data, 1);
+ }
+ return *this;
+ }
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ Axpy(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA){
+ int aStride = (int)rA.mStride, stride = mStride;
+ double *adata=rA.mpData, *data=mpData;
+ if(transA == NO_TRANS){
+ assert(rA.Rows()==Rows() && rA.Cols()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata+=aStride,data+=stride)
+ cblas_daxpy(mMCols, alpha, adata, 1, data, 1);
+ } else {
+ assert(rA.Cols()==Rows() && rA.Rows()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata++,data+=stride)
+ cblas_daxpy(mMCols, alpha, adata, aStride, data, 1);
+ }
+ return *this;
+ }
+
+ template <> //non-member but friend!
+ double TraceOfProduct(const Matrix<double> &A, const Matrix<double> &B){ // tr(A B), equivalent to sum of each element of A times same element in B'
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Cols() && A.Cols()==B.Rows());
+ double ans = 0.0;
+ double *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata++)
+ ans += cblas_ddot(acols, adata, 1, bdata, bStride);
+ return ans;
+ }
+
+ template <> //non-member but friend!
+ double TraceOfProductT(const Matrix<double> &A, const Matrix<double> &B){ // tr(A B^T), equivalent to sum of each element of A times the same element in B
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Rows() && A.Cols()==B.Cols());
+ double ans = 0.0;
+ double *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata+=bStride)
+ ans += cblas_ddot(acols, adata, 1, bdata, 1);
+ return ans;
+ }
+
+
+ template <> //non-member but friend!
+ float TraceOfProduct(const Matrix<float> &A, const Matrix<float> &B){ // tr(A B), equivalent to sum of each element of A times same element in B'
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Cols() && A.Cols()==B.Rows());
+ float ans = 0.0;
+ float *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata++)
+ ans += cblas_sdot(acols, adata, 1, bdata, bStride);
+ return ans;
+ }
+
+ template <> //non-member but friend!
+ float TraceOfProductT(const Matrix<float> &A, const Matrix<float> &B){ // tr(A B^T), equivalent to sum of each element of A times the same element in B
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Rows() && A.Cols()==B.Cols());
+ float ans = 0.0;
+ float *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata+=bStride)
+ ans += cblas_sdot(acols, adata, 1, bdata, 1);
+ return ans;
+ }
+
+
+
+
+#endif //HAVE_ATLAS
+
+
+
+} //namespace TNet
diff --git a/src/KaldiLib/.svn/text-base/Matrix.h.svn-base b/src/KaldiLib/.svn/text-base/Matrix.h.svn-base
new file mode 100644
index 0000000..d33cb0c
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Matrix.h.svn-base
@@ -0,0 +1,677 @@
+#ifndef TNet_Matrix_h
+#define TNet_Matrix_h
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+ #include <clapack.h>
+}
+#endif
+
+#include "Common.h"
+#include "MathAux.h"
+#include "Types.h"
+#include "Error.h"
+
+//#define TRACE_MATRIX_OPERATIONS
+#define CHECKSIZE
+
+namespace TNet
+{
+
+
+ // class matrix_error : public std::logic_error {};
+ // class matrix_sizes_error : public matrix_error {};
+
+ // declare the class so the header knows about it
+ template<typename _ElemT> class Vector;
+ template<typename _ElemT> class SubVector;
+ template<typename _ElemT> class Matrix;
+ template<typename _ElemT> class SubMatrix;
+
+ // we need to declare the friend << operator here
+ template<typename _ElemT>
+ std::ostream & operator << (std::ostream & rOut, const Matrix<_ElemT> & rM);
+
+ // we need to declare the friend << operator here
+ template<typename _ElemT>
+ std::istream & operator >> (std::istream & rIn, Matrix<_ElemT> & rM);
+
+ // we need to declare this friend function here
+ template<typename _ElemT>
+ _ElemT TraceOfProduct(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B)
+
+ // we need to declare this friend function here
+ template<typename _ElemT>
+ _ElemT TraceOfProductT(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B^T)==tr(A^T B)
+
+
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Provides a matrix class
+ *
+ * This class provides a way to work with matrices in TNet.
+ * It encapsulates basic operations and memory optimizations.
+ *
+ */
+ template<typename _ElemT>
+ class Matrix
+ {
+ public:
+ /// HTK parameter file header (see HTK manual)
+
+ struct HtkHeader
+ {
+ INT_32 mNSamples;
+ INT_32 mSamplePeriod;
+ INT_16 mSampleSize;
+ UINT_16 mSampleKind;
+ };
+
+
+ /**
+ * @brief Extension of the HTK header
+ */
+ struct HtkHeaderExt
+ {
+ INT_32 mHeaderSize;
+ INT_32 mVersion;
+ INT_32 mSampSize;
+ };
+
+
+
+
+ /// defines a type of this
+ typedef Matrix<_ElemT> ThisType;
+
+ // Constructors
+
+ /// Empty constructor
+ Matrix<_ElemT> ():
+ mpData(NULL), mMCols(0), mMRows(0), mStride(0)
+#ifdef STK_MEMALIGN_MANUAL
+ , mpFreeData(NULL)
+#endif
+ {}
+
+ /// Copy constructor
+ Matrix<_ElemT> (const Matrix<_ElemT> & rM, MatrixTrasposeType trans=NO_TRANS):
+ mpData(NULL)
+ { if(trans==NO_TRANS){ Init(rM.mMRows, rM.mMCols); Copy(rM); } else { Init(rM.mMCols,rM.mMRows); Copy(rM,TRANS); } }
+
+ /// Copy constructor from another type.
+ template<typename _ElemU>
+ explicit Matrix<_ElemT> (const Matrix<_ElemU> & rM, MatrixTrasposeType trans=NO_TRANS):
+ mpData(NULL)
+ { if(trans==NO_TRANS){ Init(rM.Rows(), rM.Cols()); Copy(rM); } else { Init(rM.Cols(),rM.Rows()); Copy(rM,TRANS); } }
+
+ /// Basic constructor
+ Matrix(const size_t r, const size_t c, bool clear=true)
+ { mpData=NULL; Init(r, c, clear); }
+
+
+ Matrix<_ElemT> &operator = (const Matrix <_ElemT> &other) { Init(other.Rows(), other.Cols()); Copy(other); return *this; } // Needed for inclusion in std::vector
+
+ /// Destructor
+ ~Matrix()
+ { Destroy(); }
+
+
+ /// Initializes matrix (if not done by constructor)
+ ThisType &
+ Init(const size_t r,
+ const size_t c, bool clear=true);
+
+ /**
+ * @brief Deallocates the matrix from memory and resets the dimensions to (0, 0)
+ */
+ void
+ Destroy();
+
+
+ ThisType &
+ Zero();
+
+ ThisType &
+ Unit(); // set to unit.
+
+ /**
+ * @brief Copies the contents of a matrix
+ * @param rM Source data matrix
+ * @return Returns reference to this
+ */
+ template<typename _ElemU> ThisType &
+ Copy(const Matrix<_ElemU> & rM, MatrixTrasposeType Trans=NO_TRANS);
+
+
+
+ /**
+ * @brief Copies the elements of a vector row-by-row into a matrix
+ * @param rV Source vector
+ * @param nRows Number of rows of returned matrix
+ * @param nCols Number of columns of returned matrix
+ *
+ * Note that rV.Dim() must equal nRows*nCols
+ */
+ ThisType &
+ CopyVectorSplicedRows(const Vector<_ElemT> &rV, const size_t nRows, const size_t nCols);
+
+ /**
+ * @brief Returns @c true if matrix is initialized
+ */
+ bool
+ IsInitialized() const
+ { return mpData != NULL; }
+
+ /// Returns number of rows in the matrix
+ inline size_t
+ Rows() const
+ {
+ return mMRows;
+ }
+
+ /// Returns number of columns in the matrix
+ inline size_t
+ Cols() const
+ {
+ return mMCols;
+ }
+
+ /// Returns number of columns in the matrix memory
+ inline size_t
+ Stride() const
+ {
+ return mStride;
+ }
+
+
+ /**
+ * @brief Gives access to the raw data array
+ * @return Pointer to the const data array
+ */
+ inline const _ElemT* __attribute__((aligned(16)))
+ pData () const
+ {
+ return mpData;
+ }
+
+
+ /**
+ * @brief Gives access to the raw data array
+ * @return Pointer to the non-const data array
+ */
+ inline _ElemT* __attribute__((aligned(16)))
+ pData ()
+ {
+ return mpData;
+ }
+
+
+ /**
+ * @brief pData_workaround is a workaround that allows SubMatrix to get a
+ * @return pointer to non-const data even though the Matrix is const...
+ */
+ protected:
+ inline _ElemT* __attribute__((aligned(16)))
+ pData_workaround () const
+ {
+ return mpData;
+ }
+ public:
+
+
+ /// Returns size of matrix in memory
+ size_t
+ MSize() const
+ {
+ return mMRows * mStride * sizeof(_ElemT);
+ }
+
+ /// Checks the content of the matrix for nan and inf values
+ void
+ CheckData(const std::string file = "") const
+ {
+ for(size_t row=0; row<Rows(); row++) {
+ for(size_t col=0; col<Cols(); col++) {
+ if(isnan((*this)(row,col)) || isinf((*this)(row,col))) {
+ std::ostringstream os;
+ os << "Invalid value: " << (*this)(row,col)
+ << " in matrix row: " << row
+ << " col: " << col
+ << " file: " << file;
+ Error(os.str());
+ }
+ }
+ }
+ }
+
+ /**
+ * **********************************************************************
+ * **********************************************************************
+ * @defgroup RESHAPE Matrix reshaping routines
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ */
+
+ /**
+ * @brief Removes one row from the matrix. The memory is not reallocated.
+ */
+ ThisType &
+ RemoveRow(size_t i);
+
+ /** @} */
+
+ /**
+ * **********************************************************************
+ * **********************************************************************
+ * @defgroup ACCESS Access functions and operators
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ */
+
+ /**
+ * @brief Gives access to a specified matrix row without range check
+ * @return Subvector object representing the row
+ */
+ inline const SubVector<_ElemT>
+ operator [] (size_t i) const
+ {
+ assert(i < mMRows);
+ return SubVector<_ElemT>(mpData + (i * mStride), Cols());
+ }
+
+ inline SubVector<_ElemT>
+ operator [] (size_t i)
+ {
+ assert(i < mMRows);
+ return SubVector<_ElemT>(mpData + (i * mStride), Cols());
+ }
+
+ /**
+ * @brief Gives access to a specified matrix row without range check
+ * @return pointer to the first field of the row
+ */
+ inline _ElemT*
+ pRowData(size_t i)
+ {
+ assert(i < mMRows);
+ return mpData + i * mStride;
+ }
+
+ /**
+ * @brief Gives access to a specified matrix row without range check
+ * @return pointer to the first field of the row (const version)
+ */
+ inline const _ElemT*
+ pRowData(size_t i) const
+ {
+ assert(i < mMRows);
+ return mpData + i * mStride;
+ }
+
+ /**
+ * @brief Gives access to matrix elements (row, col)
+ * @return reference to the desired field
+ */
+ inline _ElemT&
+ operator () (size_t r, size_t c)
+ {
+#ifdef PARANOID
+ assert(r < mMRows && c < mMCols);
+#endif
+ return *(mpData + r * mStride + c);
+ }
+
+ /**
+ * @brief Gives access to matrix elements (row, col)
+ * @return value of the desired field (const version)
+ */
+ inline const _ElemT
+ operator () (size_t r, size_t c) const
+ {
+#ifdef PARANOID
+ assert(r < mMRows && c < mMCols);
+#endif
+ return *(mpData + r * mStride + c);
+ }
+
+ /**
+ * @brief Returns a matrix sub-range
+ * @param ro Row offset
+ * @param r Rows in range
+ * @param co Column offset
+ * @param c Columns in range
+ * See @c SubMatrix class for details
+ */
+ SubMatrix<_ElemT>
+ Range(const size_t ro, const size_t r,
+ const size_t co, const size_t c)
+ { return SubMatrix<_ElemT>(*this, ro, r, co, c); }
+
+ const SubMatrix<_ElemT>
+ Range(const size_t ro, const size_t r,
+ const size_t co, const size_t c) const
+ { return SubMatrix<_ElemT>(*this, ro, r, co, c); }
+ /** @} */
+
+
+ /**
+ * **********************************************************************
+ * **********************************************************************
+ * @defgroup MATH ROUTINES
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ **/
+
+ /**
+ * @brief Returns sum of all elements
+ */
+ _ElemT&
+ Sum() const;
+
+ ThisType &
+ DotMul(const ThisType& a);
+
+ ThisType &
+ Scale(_ElemT alpha);
+
+ ThisType &
+ ScaleCols(const Vector<_ElemT> &scale); // Equivalent to (*this) = (*this) * diag(scale).
+
+ ThisType &
+ ScaleRows(const Vector<_ElemT> &scale); // Equivalent to (*this) = diag(scale) * (*this);
+
+ /// Sum another matrix rMatrix with this matrix
+ ThisType&
+ Add(const Matrix<_ElemT>& rMatrix);
+
+
+ /// Sum scaled matrix rMatrix with this matrix
+ ThisType&
+ AddScaled(_ElemT alpha, const Matrix<_ElemT>& rMatrix);
+
+ /// Apply log to all items of the matrix
+ ThisType&
+ ApplyLog();
+
+ /**
+ * @brief Computes the determinant of this matrix
+ * @return Returns the determinant of a matrix
+ * @ingroup MATH
+ *
+ */
+ _ElemT LogAbsDeterminant(_ElemT *DetSign=NULL);
+
+
+ /**
+ * @brief Performs matrix inplace inversion
+ */
+ ThisType &
+ Invert(_ElemT *LogDet=NULL, _ElemT *DetSign=NULL, bool inverse_needed=true);
+
+ /**
+ * @brief Performs matrix inplace inversion in double precision, even if this object is not double precision.
+ */
+ ThisType &
+ InvertDouble(_ElemT *LogDet=NULL, _ElemT *DetSign=NULL, bool inverse_needed=true){
+ double LogDet_tmp, DetSign_tmp;
+ Matrix<double> dmat(*this); dmat.Invert(&LogDet_tmp, &DetSign_tmp, inverse_needed); if(inverse_needed) (*this).Copy(dmat);
+ if(LogDet) *LogDet = LogDet_tmp; if(DetSign) *DetSign = DetSign_tmp;
+ return *this;
+ }
+
+
+ /**
+ * @brief Inplace matrix transposition. Applicable only to square matrices
+ */
+ ThisType &
+ Transpose()
+ {
+ assert(Rows()==Cols());
+ size_t M=Rows();
+ for(size_t i=0;i<M;i++)
+ for(size_t j=0;j<i;j++){
+ _ElemT &a = (*this)(i,j), &b = (*this)(j,i);
+ std::swap(a,b);
+ }
+ return *this;
+ }
+
+
+
+
+
+ bool IsSymmetric(_ElemT cutoff = 1.0e-05) const;
+
+ bool IsDiagonal(_ElemT cutoff = 1.0e-05) const;
+
+ bool IsUnit(_ElemT cutoff = 1.0e-05) const;
+
+ bool IsZero(_ElemT cutoff = 1.0e-05) const;
+
+ _ElemT FrobeniusNorm() const; // sqrt of sum of square elements.
+
+ _ElemT LargestAbsElem() const; // largest absolute value.
+
+
+ friend _ElemT TNet::TraceOfProduct<_ElemT>(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B)
+ friend _ElemT TNet::TraceOfProductT<_ElemT>(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B^T)==tr(A^T B)
+ friend class SubMatrix<_ElemT>; // so it can get around const restrictions on the pointer to mpData.
+
+ /** **********************************************************************
+ * **********************************************************************
+ * @defgroup BLAS_ROUTINES BLAS ROUTINES
+ * @ingroup MATH
+ * **********************************************************************
+ * **********************************************************************
+ **/
+
+ ThisType &
+ BlasGer(const _ElemT alpha, const Vector<_ElemT>& rA, const Vector<_ElemT>& rB);
+
+ ThisType &
+ Axpy(const _ElemT alpha, const Matrix<_ElemT> &rM, MatrixTrasposeType transA=NO_TRANS);
+
+ ThisType &
+ BlasGemm(const _ElemT alpha,
+ const ThisType& rA, MatrixTrasposeType transA,
+ const ThisType& rB, MatrixTrasposeType transB,
+ const _ElemT beta = 0.0);
+
+
+ /** @} */
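+
+ // Usage sketch for BlasGemm (dimensions are illustrative): C = A * B
+ //
+ // Matrix<float> A(4, 8), B(8, 3), C(4, 3);
+ // C.BlasGemm(1.0f, A, NO_TRANS, B, NO_TRANS, 0.0f);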
+
+
+ /** **********************************************************************
+ * **********************************************************************
+ * @defgroup IO Input/Output ROUTINES
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ **/
+
+ friend std::ostream &
+ operator << <> (std::ostream & out, const ThisType & m);
+
+ void PrintOut(char *file);
+ void ReadIn(char *file);
+
+
+ bool
+ LoadHTK(const char* pFileName);
+
+ /** @} */
+
+
+ protected:
+// inline void swap4b(void *a);
+// inline void swap2b(void *a);
+
+
+ protected:
+ /// data memory area
+ _ElemT* mpData;
+
+ /// these attributes store the real matrix size as it is stored in memory,
+ /// including memory alignment
+ size_t mMCols; ///< Number of columns
+ size_t mMRows; ///< Number of rows
+ size_t mStride; ///< true number of columns for the internal matrix.
+ ///< This number may differ from mMCols as memory
+ ///< alignment might be used
+
+#ifdef STK_MEMALIGN_MANUAL
+ /// data to be freed (in case of manual memalignment use, see Common.h)
+ _ElemT* mpFreeData;
+#endif
+ }; // class Matrix
+
+ template<> Matrix<float> & Matrix<float>::Invert(float *LogDet, float *DetSign, bool inverse_needed); // state that we will implement separately for float and double.
+ template<> Matrix<double> & Matrix<double>::Invert(double *LogDet, double *DetSign, bool inverse_needed);
+
+
+
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Sub-matrix representation
+ *
+ * This class provides a way to work with matrix cutouts in STK.
+ *
+ *
+ */
+ template<typename _ElemT>
+ class SubMatrix : public Matrix<_ElemT>
+ {
+ typedef SubMatrix<_ElemT> ThisType;
+
+ public:
+ /// Constructor
+ SubMatrix(const Matrix<_ElemT>& rT, // The matrix is passed by const reference, but a SubMatrix can still modify its contents (there is no ConstSubMatrix).
+ const size_t ro,
+ const size_t r,
+ const size_t co,
+ const size_t c);
+
+
+ /// The destructor
+ ~SubMatrix<_ElemT>()
+ {
+#ifndef STK_MEMALIGN_MANUAL
+ Matrix<_ElemT>::mpData = NULL;
+#else
+ Matrix<_ElemT>::mpFreeData = NULL;
+#endif
+ }
+
+ /// Assign operator
+ ThisType& operator=(const ThisType& rSrc)
+ {
+ //std::cout << "[PERFORMing operator= SubMatrix&^2]" << std::flush;
+ this->mpData = rSrc.mpData;
+ this->mMCols = rSrc.mMCols;
+ this->mMRows = rSrc.mMRows;
+ this->mStride = rSrc.mStride;
+ this->mpFreeData = rSrc.mpFreeData;
+ return *this;
+ }
+
+
+
+ /// Initializes matrix (if not done by constructor)
+ ThisType &
+ Init(const size_t r,
+ const size_t c, bool clear=true)
+ { Error("Submatrix cannot do Init"); return *this; }
+
+ /**
+ * @brief Deallocates the matrix from memory and resets the dimensions to (0, 0)
+ */
+ void
+ Destroy()
+ { Error("Submatrix cannot do Destroy"); }
+
+
+
+ };
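+
+ // A minimal usage sketch, assuming an initialized 10x10 Matrix<float> m:
+ //   SubMatrix<float> window(m, 2, 4, 3, 5); // 4x5 view starting at row 2, col 3
+ //   window(0,0) = 1.0f;                     // writes through to m(2,3)
+ // The view shares m's storage (same stride), so no data is copied.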
+
+
+
+ //Create useful shortcuts
+ typedef Matrix<BaseFloat> BfMatrix;
+ typedef SubMatrix<BaseFloat> BfSubMatrix;
+
+ /**
+ * Function for summing matrices of different types
+ */
+ template<typename _ElemT, typename _ElemU>
+ void Add(Matrix<_ElemT>& rDst, const Matrix<_ElemU>& rSrc) {
+ assert(rDst.Cols() == rSrc.Cols());
+ assert(rDst.Rows() == rSrc.Rows());
+
+ for(size_t i=0; i<rDst.Rows(); i++) {
+ const _ElemU* p_src = rSrc.pRowData(i);
+ _ElemT* p_dst = rDst.pRowData(i);
+ for(size_t j=0; j<rDst.Cols(); j++) {
+ *p_dst++ += (_ElemT)*p_src++;
+ }
+ }
+ }
+
+ /**
+ * Function for adding a scaled matrix of a different element type
+ */
+ template<typename _ElemT, typename _ElemU>
+ void AddScaled(Matrix<_ElemT>& rDst, const Matrix<_ElemU>& rSrc, _ElemT scale) {
+ assert(rDst.Cols() == rSrc.Cols());
+ assert(rDst.Rows() == rSrc.Rows());
+
+ Vector<_ElemT> tmp(rDst[0]);
+
+ for(size_t i=0; i<rDst.Rows(); i++) {
+ tmp.Copy(rSrc[i]);
+ rDst[i].BlasAxpy(scale, tmp);
+
+ /*
+ const _ElemU* p_src = rSrc.pRowData(i);
+ _ElemT* p_dst = rDst.pRowData(i);
+ for(size_t j=0; j<rDst.Cols(); j++) {
+ *p_dst++ += (_ElemT)(*p_src++) * scale;
+ }
+ */
+ }
+ }
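+
+ // A minimal sketch of these mixed-type helpers, assuming equally sized,
+ // initialized matrices acc (double precision) and grad (single precision):
+ //   Matrix<double> acc; Matrix<float> grad;
+ //   // ... Init() and fill both ...
+ //   Add(acc, grad);             // acc += grad, element-wise cast to double
+ //   AddScaled(acc, grad, -0.1); // acc += -0.1 * grad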
+
+
+
+
+
+} // namespace STK
+
+
+
+//*****************************************************************************
+//*****************************************************************************
+// we need to include the implementation
+#include "Matrix.tcc"
+//*****************************************************************************
+//*****************************************************************************
+
+
+/******************************************************************************
+ ******************************************************************************
+ * The following section contains specialized template definitions
+ * whose implementation is in Matrix.cc
+ */
+
+
+//#ifndef TNet_Matrix_h
+#endif
diff --git a/src/KaldiLib/.svn/text-base/Matrix.tcc.svn-base b/src/KaldiLib/.svn/text-base/Matrix.tcc.svn-base
new file mode 100644
index 0000000..110abe0
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Matrix.tcc.svn-base
@@ -0,0 +1,796 @@
+
+/** @file Matrix.tcc
+ * This is an internal header file, included by other library headers.
+ * You should not attempt to use it directly.
+ */
+
+
+#ifndef TNet_Matrix_tcc
+#define TNet_Matrix_tcc
+
+//#pragma GCC system_header
+
+#include <cstdlib>
+#include <cmath>
+#include <cfloat>
+#include <fstream>
+#include <iomanip>
+#include <typeinfo>
+#include <algorithm>
+#include <limits>
+#include <vector>
+#include "Common.h"
+
+#ifndef _XOPEN_SOURCE
+ #define _XOPEN_SOURCE 600
+#endif
+
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+}
+#endif
+
+
+#include "Common.h"
+#include "Vector.h"
+namespace TNet
+{
+
+//******************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Init(const size_t rows,
+ const size_t cols,
+ bool clear)
+ {
+ if(mpData != NULL) Destroy();
+ if(rows*cols == 0){
+ assert(rows==0 && cols==0);
+ mMRows=rows;
+ mMCols=cols;
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData=NULL;
+#endif
+ mpData=NULL;
+ return *this;
+ }
+ // initialize some helping vars
+ size_t skip;
+ size_t real_cols;
+ size_t size;
+ void* data; // aligned memory block
+ void* free_data; // memory block to be really freed
+
+ // compute the size of skip and real cols
+ skip = ((16 / sizeof(_ElemT)) - cols % (16 / sizeof(_ElemT))) % (16 / sizeof(_ElemT));
+ real_cols = cols + skip;
+ size = rows * real_cols * sizeof(_ElemT);
+
+ // allocate the memory and set the right dimensions and parameters
+
+ if (NULL != (data = stk_memalign(16, size, &free_data)))
+ {
+ mpData = static_cast<_ElemT *> (data);
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData = static_cast<_ElemT *> (free_data);
+#endif
+ mMRows = rows;
+ mMCols = cols;
+ mStride = real_cols;
+ }
+ else
+ {
+ throw std::bad_alloc();
+ }
+ if(clear) Zero();
+ return *this;
+ } //
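+
+ // Worked example of the padding arithmetic above, assuming 4-byte floats:
+ // 16 / sizeof(float) = 4 elements per 16-byte block, so for cols = 10 we get
+ // skip = (4 - 10 % 4) % 4 = 2 and real_cols (the stride) = 12; each row then
+ // occupies 12 * 4 = 48 bytes even though only 10 columns are addressable.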
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ template<typename _ElemU>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Copy(const Matrix<_ElemU> & rM, MatrixTrasposeType Trans)
+ {
+ if(Trans==NO_TRANS){
+ assert(mMRows == rM.Rows() && mMCols == rM.Cols());
+ for(size_t i = 0; i < mMRows; i++)
+ (*this)[i].Copy(rM[i]);
+ return *this;
+ } else {
+ assert(mMCols == rM.Rows() && mMRows == rM.Cols());
+ for(size_t i = 0; i < mMRows; i++)
+ for(size_t j = 0; j < mMCols; j++)
+ (*this)(i,j) = rM(j,i);
+ return *this;
+ }
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ CopyVectorSplicedRows(const Vector<_ElemT> &rV, const size_t nRows, const size_t nCols) {
+ assert(rV.Dim() == nRows*nCols);
+ mMRows = nRows;
+ mMCols = nCols;
+
+ for(size_t r=0; r<mMRows; r++)
+ for(size_t c=0; c<mMCols; c++)
+ (*this)(r,c) = rV(r*mMCols + c);
+
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ RemoveRow(size_t i)
+ {
+ assert(i < mMRows && "Access out of matrix");
+ for(size_t j = i + 1; j < mMRows; j++)
+ (*this)[j - 1].Copy((*this)[j]);
+ mMRows--;
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // Deallocation of the data block
+ template<typename _ElemT>
+ void
+ Matrix<_ElemT>::
+ Destroy()
+ {
+ // we need to free the data block if it was defined
+#ifndef STK_MEMALIGN_MANUAL
+ if (NULL != mpData) free(mpData);
+#else
+ if (NULL != mpData) free(mpFreeData);
+ mpFreeData = NULL;
+#endif
+
+ mpData = NULL;
+ mMRows = mMCols = 0;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+// template<typename _ElemT>
+// void
+// Matrix<_ElemT>::
+// VectorizeRows(Vector<_ElemT> &rV) {
+//#ifdef PARANOID
+// assert(rV.Dim() == mMRows*mMCols);
+//#endif
+// for(size_t r=0; r<mMRows; r++) {
+// rV.Range((r-1)*mMCols, mMCols).Copy((*this)[r]);
+// }
+// }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ bool
+ Matrix<_ElemT>::
+ LoadHTK(const char* pFileName)
+ {
+ HtkHeader htk_hdr;
+
+ FILE *fp = fopen(pFileName, "rb");
+ if(!fp)
+ {
+ return false;
+ }
+
+ read(fileno(fp), &htk_hdr, sizeof(htk_hdr));
+
+ swap4(htk_hdr.mNSamples);
+ swap4(htk_hdr.mSamplePeriod);
+ swap2(htk_hdr.mSampleSize);
+ swap2(htk_hdr.mSampleKind);
+
+ Init(htk_hdr.mNSamples, htk_hdr.mSampleSize / sizeof(float));
+
+ size_t i;
+ size_t j;
+ if (typeid(_ElemT) == typeid(float))
+ {
+ for (i=0; i< Rows(); ++i) {
+ read(fileno(fp), (*this).pRowData(i), Cols() * sizeof(float));
+
+ for(j = 0; j < Cols(); j++) {
+ swap4(((*this)(i,j)));
+ }
+ }
+ }
+ else
+ {
+ float *pmem = new (std::nothrow) float[Cols()];
+ if (!pmem)
+ {
+ fclose(fp);
+ return false;
+ }
+
+ for(i = 0; i < Rows(); i++) {
+ read(fileno(fp), pmem, Cols() * sizeof(float));
+
+ for (j = 0; j < Cols(); ++j) {
+ swap4(pmem[j]);
+ (*this)(i,j) = static_cast<_ElemT>(pmem[j]);
+ }
+ }
+ delete [] pmem;
+ }
+
+ fclose(fp);
+
+ return true;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ DotMul(const ThisType& a)
+ {
+ size_t i;
+ size_t j;
+
+ for (i = 0; i < mMRows; ++i) {
+ for (j = 0; j < mMCols; ++j) {
+ (*this)(i,j) *= a(i,j);
+ }
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ Sum() const
+ {
+ double sum = 0.0;
+
+ for (size_t i = 0; i < Rows(); ++i) {
+ for (size_t j = 0; j < Cols(); ++j) {
+ sum += (*this)(i,j);
+ }
+ }
+
+ return (_ElemT)sum;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ Scale(_ElemT alpha)
+ {
+#if 0
+ for (size_t i = 0; i < Rows(); ++i)
+ for (size_t j = 0; j < Cols(); ++j)
+ (*this)(i,j) *= alpha;
+#else
+ for (size_t i = 0; i < Rows(); ++i) {
+ _ElemT* p_data = pRowData(i);
+ for (size_t j = 0; j < Cols(); ++j) {
+ *p_data++ *= alpha;
+ }
+ }
+#endif
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ ScaleRows(const Vector<_ElemT>& scale) // scales each row by scale[i].
+ {
+ assert(scale.Dim() == Rows());
+ size_t M = Rows(), N = Cols();
+
+ for (size_t i = 0; i < M; i++) {
+ _ElemT this_scale = scale(i);
+ for (size_t j = 0; j < N; j++) {
+ (*this)(i,j) *= this_scale;
+ }
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ ScaleCols(const Vector<_ElemT>& scale) // scales each column by scale[i].
+ {
+ assert(scale.Dim() == Cols());
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ _ElemT this_scale = scale(j);
+ (*this)(i,j) *= this_scale;
+ }
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ Add(const Matrix<_ElemT>& rMatrix)
+ {
+ assert(rMatrix.Cols() == Cols());
+ assert(rMatrix.Rows() == Rows());
+
+#if 0
+ //this can be slow
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ (*this)(i,j) += rMatrix(i,j);
+ }
+ }
+#else
+ //this will be faster (raw row pointers, no bounds checking)
+ for(size_t i=0; i<Rows(); i++) {
+ const _ElemT* p_src = rMatrix.pRowData(i);
+ _ElemT* p_dst = pRowData(i);
+ for(size_t j=0; j<Cols(); j++) {
+ *p_dst++ += *p_src++;
+ }
+ }
+#endif
+ return *this;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ AddScaled(_ElemT alpha, const Matrix<_ElemT>& rMatrix)
+ {
+ assert(rMatrix.Cols() == Cols());
+ assert(rMatrix.Rows() == Rows());
+
+#if 0
+ //this can be slow
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ (*this)(i,j) += rMatrix(i,j) * alpha;
+ }
+ }
+#else
+ /*
+ //this will be faster (but less secure)
+ for(size_t i=0; i<Rows(); i++) {
+ const _ElemT* p_src = rMatrix.pRowData(i);
+ _ElemT* p_dst = pRowData(i);
+ for(size_t j=0; j<Cols(); j++) {
+ *p_dst++ += *p_src++ * alpha;
+ }
+ }
+ */
+
+ //let's use BLAS
+ for(size_t i=0; i<Rows(); i++) {
+ (*this)[i].BlasAxpy(alpha, rMatrix[i]);
+ }
+#endif
+ return *this;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ ApplyLog()
+ {
+
+#if 0
+ //this can be slow
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ (*this)(i,j) = _LOG((*this)(i,j));
+ }
+ }
+#else
+ //this will be faster (raw row pointers, no bounds checking)
+ for(size_t i=0; i<Rows(); i++) {
+ _ElemT* p_data = pRowData(i);
+ for(size_t j=0; j<Cols(); j++) {
+ *p_data = _LOG(*p_data);
+ p_data++;
+ }
+ }
+#endif
+ return *this;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Zero()
+ {
+ for(size_t row=0;row<mMRows;row++)
+ memset(mpData + row*mStride, 0, sizeof(_ElemT)*mMCols);
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Unit()
+ {
+ for(size_t row=0;row<std::min(mMRows,mMCols);row++){
+ memset(mpData + row*mStride, 0, sizeof(_ElemT)*mMCols);
+ (*this)(row,row) = 1.0;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void
+ Matrix<_ElemT>::
+ PrintOut(char* file)
+ {
+ FILE* f = fopen(file, "w");
+ unsigned i,j;
+ fprintf(f, "%dx%d\n", (int)this->mMRows, (int)this->mMCols);
+
+ for(i=0; i<this->mMRows; i++)
+ {
+ _ElemT* row = (*this)[i];
+
+ for(j=0; j<this->mStride; j++){
+ fprintf(f, "%20.17f ",row[j]);
+ }
+ fprintf(f, "\n");
+ }
+
+ fclose(f);
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void
+ Matrix<_ElemT>::
+ ReadIn(char* file)
+ {
+ FILE* f = fopen(file, "r");
+ int i = 0;
+ int j = 0;
+ fscanf(f, "%dx%d\n", &i,&j);
+ fprintf(stderr, "%dx%d\n", i,j);
+
+ for(i=0; i<this->mMRows; i++)
+ {
+ _ElemT* row = (*this)[i];
+
+ for(j=0; j<this->mStride; j++){
+ fscanf(f, "%f ",&row[j]);
+ }
+ //fprintf(f, "\n");
+ }
+
+ fclose(f);
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Save (std::ostream &rOut, const Matrix<_ElemT> &rM)
+ {
+ for (size_t i = 0; i < rM.Rows(); i++) {
+ for (size_t j = 0; j < rM.Cols(); j++) {
+ rOut << rM(i,j) << ' ';
+ }
+ rOut << '\n';
+ }
+ if(rOut.fail())
+ throw std::runtime_error("Failed to write matrix to stream");
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ std::ostream &
+ operator << (std::ostream & rOut, const Matrix<_ElemT> & rM)
+ {
+ rOut << "m " << rM.Rows() << ' ' << rM.Cols() << '\n';
+ Save(rOut, rM);
+ return rOut;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Load (std::istream & rIn, Matrix<_ElemT> & rM)
+ {
+ if(MatrixVectorIostreamControl::Flags(rIn, ACCUMULATE_INPUT)) {
+ for (size_t i = 0; i < rM.Rows(); i++) {
+ std::streamoff pos = rIn.tellg();
+ for (size_t j = 0; j < rM.Cols(); j++) {
+ _ElemT tmp;
+ rIn >> tmp;
+ rM(i,j) += tmp;
+ if(rIn.fail()){
+ throw std::runtime_error("Failed to read matrix from stream. File position is "+to_string(pos));
+ }
+ }
+ }
+ } else {
+ for (size_t i = 0; i < rM.Rows(); i++) {
+ std::streamoff pos = rIn.tellg();
+ for (size_t j = 0; j < rM.Cols(); j++) {
+ rIn >> rM(i,j);
+ if(rIn.fail()){
+ throw std::runtime_error("Failed to read matrix from stream. File position is "+to_string(pos));
+ }
+
+ }
+ }
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ std::istream &
+ operator >> (std::istream & rIn, Matrix<_ElemT> & rM)
+ {
+ while(isascii(rIn.peek()) && isspace(rIn.peek())) rIn.get(); // eat up space.
+ if(rIn.peek() == 'm'){ // "new" format: m <nrows> <ncols> \n 1.0 0.2 4.3 ...
+ rIn.get();// eat up the 'm'.
+ long long int nrows=-1; rIn>>nrows;
+ long long int ncols=-1; rIn>>ncols;
+ if(rIn.fail()||nrows<0||ncols<0){ throw std::runtime_error("Failed to read matrix from stream: no size\n"); }
+
+ size_t nrows2 = size_t(nrows), ncols2 = size_t(ncols);
+ assert((long long int)nrows2 == nrows && (long long int)ncols2 == ncols);
+
+ if(rM.Rows()!=nrows2 || rM.Cols()!=ncols2) rM.Init(nrows2,ncols2);
+ }
+ Load(rIn,rM);
+ return rIn;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // Constructor
+ template<typename _ElemT>
+ SubMatrix<_ElemT>::
+ SubMatrix(const Matrix<_ElemT>& rT, // The matrix is passed by const reference, but a SubMatrix can still modify its contents; a proper fix would need a ConstSubMatrix type.
+ const size_t ro,
+ const size_t r,
+ const size_t co,
+ const size_t c)
+ {
+ assert(ro >= 0 && ro <= rT.Rows());
+ assert(co >= 0 && co <= rT.Cols());
+ assert(r > 0 && r <= rT.Rows() - ro);
+ assert(c > 0 && c <= rT.Cols() - co);
+ // point to the beginning of the window
+ Matrix<_ElemT>::mMRows = r;
+ Matrix<_ElemT>::mMCols = c;
+ Matrix<_ElemT>::mStride = rT.Stride();
+ Matrix<_ElemT>::mpData = rT.pData_workaround() + co + ro * rT.Stride();
+ }
+
+
+
+#ifdef HAVE_ATLAS
+
+ template<>
+ Matrix<float> &
+ Matrix<float>::
+ BlasGer(const float alpha, const Vector<float>& rA, const Vector<float>& rB);
+
+
+ template<>
+ Matrix<double> &
+ Matrix<double>::
+ BlasGer(const double alpha, const Vector<double>& rA, const Vector<double>& rB);
+
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ BlasGemm(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA,
+ const Matrix<float>& rB, MatrixTrasposeType transB,
+ const float beta);
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ BlasGemm(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA,
+ const Matrix<double>& rB, MatrixTrasposeType transB,
+ const double beta);
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ Axpy(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA);
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ Axpy(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ double TraceOfProduct(const Matrix<double> &A, const Matrix<double> &B);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ double TraceOfProductT(const Matrix<double> &A, const Matrix<double> &B);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ float TraceOfProduct(const Matrix<float> &A, const Matrix<float> &B);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ float TraceOfProductT(const Matrix<float> &A, const Matrix<float> &B);
+
+
+
+#else // HAVE_ATLAS
+ #error Routines in this section are not implemented yet without BLAS
+#endif // HAVE_ATLAS
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsSymmetric(_ElemT cutoff) const {
+ size_t R=Rows(), C=Cols();
+ if(R!=C) return false;
+ _ElemT bad_sum=0.0, good_sum=0.0;
+ for(size_t i=0;i<R;i++){
+ for(size_t j=0;j<i;j++){
+ _ElemT a=(*this)(i,j),b=(*this)(j,i), avg=0.5*(a+b), diff=0.5*(a-b);
+ good_sum += fabs(avg); bad_sum += fabs(diff);
+ }
+ good_sum += fabs((*this)(i,i));
+ }
+ if(bad_sum > cutoff*good_sum) return false;
+ return true;
+ }
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsDiagonal(_ElemT cutoff) const{
+ size_t R=Rows(), C=Cols();
+ _ElemT bad_sum=0.0, good_sum=0.0;
+ for(size_t i=0;i<R;i++){
+ for(size_t j=0;j<C;j++){
+ if(i==j) good_sum += fabs((*this)(i,j));
+ else bad_sum += fabs((*this)(i,j));
+ }
+ }
+ return (!(bad_sum > good_sum * cutoff));
+ }
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsUnit(_ElemT cutoff) const {
+ size_t R=Rows(), C=Cols();
+ if(R!=C) return false;
+ _ElemT bad_sum=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++)
+ bad_sum += fabs( (*this)(i,j) - (i==j?1.0:0.0));
+ return (bad_sum <= cutoff);
+ }
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsZero(_ElemT cutoff)const {
+ size_t R=Rows(), C=Cols();
+ _ElemT bad_sum=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++)
+ bad_sum += fabs( (*this)(i,j) );
+ return (bad_sum <= cutoff);
+ }
+
+ template<class _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ FrobeniusNorm() const{
+ size_t R=Rows(), C=Cols();
+ _ElemT sum=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++){
+ _ElemT tmp = (*this)(i,j);
+ sum += tmp*tmp;
+ }
+ return sqrt(sum);
+ }
+
+ template<class _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ LargestAbsElem() const{
+ size_t R=Rows(), C=Cols();
+ _ElemT largest=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++)
+ largest = std::max(largest, (_ElemT)fabs((*this)(i,j)));
+ return largest;
+ }
+
+
+
+ // Uses SVD to compute the eigenvalue decomposition of a symmetric positive semidefinite
+ // matrix:
+ // (*this) = rU * diag(rS) * rU^T, with rU an orthogonal matrix so rU^{-1} = rU^T.
+ // Does this by computing svd (*this) = U diag(rS) V^T ... answer is just U diag(rS) U^T.
+ // Throws exception if this failed to within supplied precision (typically because *this was not
+ // symmetric positive definite).
+
+
+
+ template<class _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ LogAbsDeterminant(_ElemT *DetSign){
+ _ElemT LogDet;
+ Matrix<_ElemT> tmp(*this);
+ tmp.Invert(&LogDet, DetSign, false); // false== output not needed (saves some computation).
+ return LogDet;
+ }
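+
+ // A minimal usage sketch, assuming a square, non-singular Matrix<double> m:
+ //   double sign;
+ //   double log_abs_det = m.LogAbsDeterminant(&sign);
+ //   // determinant = sign * exp(log_abs_det); m itself is left untouched,
+ //   // since the computation runs on a temporary copy.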
+
+}// namespace TNet
+
+// #define TNet_Matrix_tcc
+#endif
diff --git a/src/KaldiLib/.svn/text-base/MlfStream.cc.svn-base b/src/KaldiLib/.svn/text-base/MlfStream.cc.svn-base
new file mode 100644
index 0000000..a2f6478
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/MlfStream.cc.svn-base
@@ -0,0 +1,268 @@
+#include "MlfStream.h"
+#include "Common.h"
+#include "Error.h"
+
+
+namespace TNet
+{
+ //******************************************************************************
+ LabelContainer::
+ ~LabelContainer()
+ {
+ while (!this->mLabelList.empty())
+ {
+ delete this->mLabelList.back();
+ this->mLabelList.pop_back();
+ }
+ }
+
+ //******************************************************************************
+ size_t
+ LabelContainer::
+ DirDepth(const std::string & rPath)
+ {
+ size_t depth = 0;
+ size_t length = rPath.length();
+ const char * s = rPath.c_str();
+
+ for (size_t i = 0; i < length; i++)
+ {
+ if (*s == '/' || *s == '\\')
+ {
+ depth++;
+ }
+ s++;
+ }
+ return depth;
+ }
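+
+ // Worked example: DirDepth("*/dir/sub/file.lab") counts three '/' separators
+ // and returns 3, while DirDepth("file.lab") returns 0.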
+
+
+ //******************************************************************************
+ void
+ LabelContainer::
+ Insert(const std::string & rLabel,
+ std::streampos Pos)
+ {
+ LabelRecord ls;
+ size_t depth;
+ LabelRecord tmp_ls;
+
+ // we need to compute the depth of the label path if
+ // wildcard is used
+ // do we have a wildcard???
+ if (rLabel[0] == '*')
+ {
+ depth = this->DirDepth(rLabel);
+ }
+ else
+ {
+ depth = MAX_LABEL_DEPTH;
+ }
+
+ // remember the depth of this path so that wildcard lookups know which
+ // depths to try
+ this->mDepths.insert(depth);
+
+ // store the values
+ ls.mStreamPos = Pos;
+ ls.miLabelListLimit = mLabelList.end();
+
+
+ if (mLabelList.begin() != mLabelList.end()) {
+ ls.miLabelListLimit--;
+ }
+
+ // if no wildcard chars, then we try to store in hash, otherwise store in
+ // list
+ if (rLabel.find_first_of("*?%",1) == rLabel.npos)
+ {
+ if (!Find(rLabel, tmp_ls))
+ {
+ // not found yet, so store it in the hash
+ this->mLabelMap[rLabel] = ls;
+ }
+ else {
+ ;
+ //Warning("More general definition found when inserting " + rLabel + " ... label: " + MatchedPattern());
+ }
+ }
+ else
+ {
+ this->mLabelList.push_back(new std::pair<std::string,LabelRecord>(rLabel, ls));
+ }
+ }
+
+
+ //******************************************************************************
+ bool
+ LabelContainer::
+ FindInHash(const std::string & rLabel, LabelRecord & rLS)
+ {
+ bool found = false;
+
+ std::string str;
+
+ // current depth within the str
+ DepthType current_depth = MAX_LABEL_DEPTH;
+
+ // current search position within the str
+ size_t prev = rLabel.size() + 1;
+
+ // we will walk through the set of depths backwards, so we begin at the end and move
+ // to the front...
+ std::set<DepthType>::reverse_iterator ri (this->mDepths.end());
+ std::set<DepthType>::reverse_iterator rlast (this->mDepths.begin());
+ LabelHashType::iterator lab;
+
+ // we perform the search until we run to the end of the set or we find something
+ while ((!found) && (ri != rlast))
+ {
+ // we don't need to do anything with the string if the depth is set to
+ // max label depth since it contains no *
+ if (*ri == MAX_LABEL_DEPTH)
+ {
+ found = ((lab=this->mLabelMap.find(rLabel)) != this->mLabelMap.end());
+ if (found) str = rLabel;
+ }
+ // we will crop the string, put '*' at the beginning and try to search
+ else
+ {
+ // we know that we walk backwards in the depths, so we first need to find
+ // the ri-th '/' from the back
+ if (current_depth == MAX_LABEL_DEPTH)
+ {
+ if (*ri > 0)
+ {
+ // we find the ri-th / from back
+ for (DepthType i=1; (i <= *ri) && (prev != rLabel.npos); i++)
+ {
+ prev = rLabel.find_last_of("/\\", prev-1);
+ }
+ }
+ else
+ {
+ prev = 0;
+ }
+
+ // check if finding succeeded (prev == str.npos => failure, see STL)
+ if (prev != rLabel.npos)
+ {
+ // construct the new string being sought
+ str.assign(rLabel, prev, rLabel.size());
+ str = '*' + str;
+
+ // now we try to find
+ found = ((lab=this->mLabelMap.find(str)) != this->mLabelMap.end());
+
+ // say, that current depth is *ri
+ current_depth = *ri;
+ }
+ else
+ {
+ prev = rLabel.size() + 1;
+ }
+ } // if (current_depth == MAX_LABEL_DEPTH)
+ else
+ {
+ // now we know at which / we are from the back, so we search forward now
+ // and we need to reach the ri-th /
+ while (current_depth > *ri)
+ {
+ // we try to find next /
+ if ((prev = rLabel.find_first_of("/\\", prev+1)) != rLabel.npos)
+ current_depth--;
+ else
+ return false;
+ }
+
+ // construct the new string being sought
+ str.assign(rLabel, prev, rLabel.size());
+ str = '*' + str;
+
+ // now we try to find
+ found = ((lab=this->mLabelMap.find(str)) != this->mLabelMap.end());
+ }
+ }
+
+ // move one element further (jump to next observed depth)
+ ri++;
+ } // while (run)
+
+ // store the match results
+ if (found)
+ {
+ rLS = lab->second;
+ this->mMatchedPattern = str;
+ }
+
+ return found;
+ }
+
+
+ //******************************************************************************
+ bool
+ LabelContainer::
+ FindInList(const std::string & rLabel, LabelRecord & rLS, bool limitSearch)
+ {
+
+ bool found = false;
+ std::string str;
+ LabelListType::iterator lab = mLabelList.begin();
+ LabelListType::iterator limit;
+
+ if (limitSearch && (rLS.miLabelListLimit != mLabelList.end()))
+ {
+ limit = rLS.miLabelListLimit;
+ limit++;
+ }
+ else
+ {
+ limit = this->mLabelList.end();
+ }
+
+ // we perform sequential search until we run to the end of the list or we find
+ // something
+ while ((!found) && (lab != limit))
+ {
+ if (ProcessMask(rLabel, (*lab)->first, str))
+ {
+ found = true;
+ }
+ else
+ {
+ lab++;
+ }
+ } // while (run)
+
+ // store the match results
+ if (found)
+ {
+ rLS = (*lab)->second;
+ this->mMatchedPattern = (*lab)->first;
+ this->mMatchedPatternMask = str;
+ }
+ return found;
+ }
+
+
+ //******************************************************************************
+ bool
+ LabelContainer::
+ Find(const std::string & rLabel, LabelRecord & rLS)
+ {
+ // try to find the label in the Hash
+ if (FindInHash(rLabel, rLS))
+ {
+ // we look in the list, but we limit the search.
+ FindInList(rLabel, rLS, true);
+ return true;
+ } //if (this->mLabelContainer.FindInHash(rLabel, label_stream))
+ else
+ {
+ // we didn't find it in the hash so we look in the list
+ return FindInList(rLabel, rLS);
+ }
+ }
+
+} // namespace TNet
+
diff --git a/src/KaldiLib/.svn/text-base/MlfStream.h.svn-base b/src/KaldiLib/.svn/text-base/MlfStream.h.svn-base
new file mode 100644
index 0000000..81f2d6e
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/MlfStream.h.svn-base
@@ -0,0 +1,639 @@
+/** @file MlfStream.h
+ * This is a TNet C++ Library header.
+ *
+ * The naming convention in this file copies the std::* naming as well as STK
+ */
+
+
+#ifndef STK_MlfStream_h
+#define STK_MlfStream_h
+
+#include <iostream>
+#include <vector>
+#include <map>
+#include <list>
+#include <set>
+
+
+namespace TNet
+{
+ class LabelRecord;
+ class LabelContainer;
+
+
+ /// this container stores the labels in linear order as they came
+ /// i.e. they cannot be hashed
+ typedef std::list< std::pair<std::string,LabelRecord> *> LabelListType;
+
+ /// type of the container used to store the labels
+ typedef std::map<std::string, LabelRecord> LabelHashType;
+
+
+
+ /**
+ * @brief Describes type of MLF definition
+ *
+ * See HTK book for MLF structure. Terms used in TNet are
+ * compatible with those in HTK book.
+ */
+ enum MlfDefType
+ {
+ MLF_DEF_UNKNOWN = 0, ///< unknown definition
+ MLF_DEF_IMMEDIATE_TRANSCRIPTION, ///< immediate transcription
+ MLF_DEF_SUB_DIR_DEF ///< subdirectory definition
+ };
+
+
+
+ /** **************************************************************************
+ * @brief Holds association between label and stream
+ */
+ class LabelRecord
+ {
+
+ public:
+ LabelRecord() : miLabelListLimit()
+ { }
+
+ ~LabelRecord()
+ { }
+
+ /// definition type
+ MlfDefType mDefType;
+
+ /// position of the label in the stream
+ std::streampos mStreamPos;
+
+ /**
+ * @brief points to the current end of the LabelList
+ *
+ * The reason for storing this value is to know when we inserted
+ * a label into the hash. It is possible that the hash label came
+ * after the list label, in which case the list label is preferred
+ */
+ LabelListType::iterator miLabelListLimit;
+
+ };
+
+
+
+
+ /**
+ * @brief Provides an interface to label hierarchy and searching
+ *
+ * This class stores label files in a map structure. When a wildcard
+ * convention is used, the class stores the labels in separate maps according
+ * to the level of wildcard abstraction. By level we mean the directory structure
+ * depth.
+ */
+ class LabelContainer
+ {
+ public:
+ /// The constructor
+ LabelContainer() : mUseHashedSearch(true) {}
+
+ /// The destructor
+ ~LabelContainer();
+
+ /**
+ * @brief Inserts new label to the hash structure
+ */
+ void
+ Insert(
+ const std::string & rLabel,
+ std::streampos Pos);
+
+
+ /**
+ * @brief Looks for a record in the hash
+ */
+ bool
+ FindInHash(
+ const std::string& rLabel,
+ LabelRecord& rLS);
+
+ /**
+ * @brief Looks for a record in the list
+ * @param rLabel Label to look for
+ * @param rLS Structure to fill with found data
+ * @param limitSearch If true, @p rLS's @c miLabelListLimit gives the limiting position in the list
+ */
+ bool
+ FindInList(
+ const std::string& rLabel,
+ LabelRecord& rLS,
+ bool limitSearch = false);
+
+ /**
+ * @brief Looks for a record
+ */
+ bool
+ Find(
+ const std::string & rLabel,
+ LabelRecord & rLS);
+
+ /**
+ * @brief Returns the matched pattern
+ */
+ const std::string &
+ MatchedPattern() const
+ {
+ return mMatchedPattern;
+ }
+
+ /**
+ * @brief Returns the matched pattern mask (%%%)
+ */
+ const std::string &
+ MatchedPatternMask() const
+ {
+ return mMatchedPatternMask;
+ }
+
+ /**
+ * @brief Writes contents to stream (text)
+ * @param rOStream stream to write to
+ */
+ void
+ Write(std::ostream& rOStream);
+
+ private:
+ /// type used for directory depth notation
+ typedef size_t DepthType;
+
+
+ /// this set stores depths of * labels observed at insertion
+ std::set<DepthType> mDepths;
+
+ /// stores the labels
+ LabelHashType mLabelMap;
+ LabelListType mLabelList;
+
+ /// true if labels are to be sought by hashing function (fast) or by
+ /// sequential search (slow)
+ bool mUseHashedSearch;
+
+ /// if Find matches the label, this var stores the pattern that matched the
+ /// query
+ std::string mMatchedPattern;
+
+ /// if Find matches the label, this var stores the masked characters.
+ /// The mask is given by '%' symbols
+ std::string mMatchedPatternMask;
+
+ /**
+ * @brief Returns the directory depth of path
+ */
+ size_t
+ DirDepth(const std::string & path);
+
+
+ };
+
+
+ /**
+ * @brief MLF output buffer definition
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>,
+ typename _CharTA = std::allocator<_CharT>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicOMlfStreamBuf
+ : public std::basic_streambuf<_CharT, _Traits>
+ {
+ public:
+ // necessary typedefs ....................................................
+ typedef BasicOMlfStreamBuf<_CharT,_Traits,_CharTA,ByteT,ByteAT>
+ this_type;
+ typedef std::basic_ostream<_CharT, _Traits>&
+ OStreamReference;
+ typedef std::basic_streambuf<_CharT, _Traits>
+ StreamBufType;
+ typedef _CharTA char_allocator_type;
+ typedef _CharT char_type;
+ typedef typename _Traits::int_type int_type;
+ typedef typename _Traits::pos_type pos_type;
+ typedef ByteT byte_type;
+ typedef ByteAT byte_allocator_type;
+ typedef byte_type* byte_buffer_type;
+ typedef std::vector<byte_type, byte_allocator_type > byte_vector_type;
+ typedef std::vector<char_type, char_allocator_type > char_vector_type;
+
+
+ BasicOMlfStreamBuf(OStreamReference rOStream, size_t bufferSize);
+
+ ~BasicOMlfStreamBuf();
+
+ // virtual functions inherited from basic_streambuf.......................
+ int
+ sync();
+
+ /**
+ * @brief Write character in the case of overflow
+ * @param c Character to be written.
+ * @return A value different than EOF (or traits::eof() for other traits)
+ * signals success. If the function fails, either EOF
+ * (or traits::eof() for other traits) is returned or an
+ * exception is thrown.
+ */
+ int_type
+ overflow(int_type c = _Traits::eof());
+
+
+ // MLF specific functions ................................................
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ this_type*
+ Open(const std::string& rFileName);
+
+ /**
+ * @brief Closes MLF block
+ */
+ void
+ Close();
+
+ /**
+ * @brief Returns true if the MLF is now in open state
+ */
+ bool
+ IsOpen() const
+ { return mIsOpen; }
+
+ LabelContainer&
+ rLabels()
+ { return mLabels; }
+
+ private:
+ bool mIsOpen;
+ char_type mLastChar;
+ OStreamReference mOStream;
+ LabelContainer mLabels;
+ }; // class BasicOMlfStreamBuf
+
+
+
+ /**
+ * @brief MLF input buffer definition
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>,
+ typename _CharTA = std::allocator<_CharT>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicIMlfStreamBuf
+ : public std::basic_streambuf<_CharT, _Traits>
+ {
+ private:
+ // internal automaton states
+ static const int IN_HEADER_STATE = 0;
+ static const int OUT_OF_BODY_STATE = 1;
+ static const int IN_TITLE_STATE = 2;
+ static const int IN_BODY_STATE = 3;
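+
+ // The states above track the parser position inside an HTK-style MLF; a
+ // sketch of the expected layout (assuming the usual HTK conventions):
+ //   #!MLF!#       <- header line(s) (IN_HEADER_STATE)
+ //   "*/utt1.lab"  <- label title    (IN_TITLE_STATE)
+ //   0 1200000 sil <- body lines     (IN_BODY_STATE)
+ //   .             <- terminator     (OUT_OF_BODY_STATE)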
+
+
+ public: // necessary typedefs ..............................................
+ typedef BasicIMlfStreamBuf<_CharT,_Traits,_CharTA,ByteT,ByteAT>
+ this_type;
+ typedef std::basic_istream<_CharT, _Traits>& IStreamReference;
+ typedef std::basic_streambuf<_CharT, _Traits>
+ StreamBufType;
+ typedef _CharTA char_allocator_type;
+ typedef _CharT char_type;
+ typedef typename _Traits::int_type int_type;
+ typedef typename _Traits::pos_type pos_type;
+ typedef ByteT byte_type;
+ typedef ByteAT byte_allocator_type;
+ typedef byte_type* byte_buffer_type;
+ typedef std::vector<byte_type, byte_allocator_type > byte_vector_type;
+ typedef std::vector<char_type, char_allocator_type > char_vector_type;
+
+
+ public:
+ // constructors and destructors ..........................................
+ BasicIMlfStreamBuf(IStreamReference rIStream, size_t bufferSize = 1024);
+
+ ~BasicIMlfStreamBuf();
+
+ // virtual functions inherited from basic_streambuf.......................
+ /**
+ * @brief Get character in the case of underflow
+ *
+ * @return The new character available at the get pointer position, if
+ * any. Otherwise, traits::eof() is returned.
+ */
+ int_type
+ underflow();
+
+
+ // MLF specific functions ................................................
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ this_type*
+ Open(const std::string& rFileName);
+
+ /**
+ * @brief Closes MLF block
+ */
+ this_type*
+ Close();
+
+ /**
+ * @brief Returns true if the MLF is now in open state
+ */
+ bool
+ IsOpen() const
+ { return mIsOpen; }
+
+ /**
+ * @brief Parses the stream (if possible) and stores positions to the
+ * label titles
+ */
+ void
+ Index();
+
+ bool
+ IsHashed() const
+ { return mIsHashed; }
+
+ /**
+ * @brief Jumps to next label definition
+ * @param rName std::string to be filled with the label name
+ * @return true on success
+ *
+ * The procedure automatically tries to hash the labels.
+ */
+ bool
+ JumpToNextDefinition(std::string& rName);
+
+ /**
+ * @brief Returns reference to the base stream
+ * @return reference to the stream
+ *
+ */
+ IStreamReference
+ GetBaseStream()
+ {
+ return mIStream;
+ }
+
+ private: // auxiliary functions ............................................
+ /**
+ * @brief Fills the line buffer with next line and updates the internal
+ * state of the finite automaton
+ */
+ void
+ FillLineBuffer();
+
+
+ private: // attributes ......................................................
+ // some flags
+ bool mIsOpen;
+ bool mIsHashed;
+ bool mIsEof;
+
+ /// internal state of the finite automaton
+ int mState;
+
+ IStreamReference mIStream;
+ LabelContainer mLabels;
+
+ std::vector<char_type> mLineBuffer;
+ }; // class BasicIMlfStreamBuf
+
+
+
+
+ /**
+ * @brief Base class with type-independent members for the Mlf Output
+ * Stream class
+ *
+ * This is a derivative of the basic_ios class. We derive it as we need
+ * to override some member functions
+ */
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicOMlfStreamBase
+ : virtual public std::basic_ios<Elem,Tr>
+ {
+ public:
+ typedef std::basic_ostream<Elem, Tr>& OStreamReference;
+ typedef BasicOMlfStreamBuf <
+ Elem,Tr,ElemA,ByteT,ByteAT> OMlfStreamBufType;
+
+ /**
+ * @brief constructor
+ *
+ * @param rOStream user defined output stream
+ */
+ BasicOMlfStreamBase(OStreamReference rOStream,
+ size_t bufferSize)
+ : mBuf(rOStream, bufferSize)
+ { this->init(&mBuf); };
+
+ /**
+ * @brief Returns a pointer to the buffer object for this stream
+ */
+ OMlfStreamBufType*
+ rdbuf()
+ { return &mBuf; };
+
+ private:
+ OMlfStreamBufType mBuf;
+ };
+
+
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicIMlfStreamBase
+ : virtual public std::basic_ios<Elem,Tr>
+ {
+ public:
+ typedef std::basic_istream<Elem, Tr>& IStreamReference;
+ typedef BasicIMlfStreamBuf <
+ Elem,Tr,ElemA,ByteT,ByteAT> IMlfStreamBufType;
+
+ BasicIMlfStreamBase( IStreamReference rIStream,
+ size_t bufferSize)
+ : mBuf(rIStream, bufferSize)
+ { this->init(&mBuf); };
+
+ IMlfStreamBufType*
+ rdbuf()
+ { return &mBuf; };
+
+ IStreamReference
+ GetBaseStream()
+ { return mBuf.GetBaseStream(); }
+
+ private:
+ IMlfStreamBufType mBuf;
+ };
+
+
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicOMlfStream
+ : public BasicOMlfStreamBase<Elem,Tr,ElemA,ByteT,ByteAT>,
+ public std::basic_ostream<Elem,Tr>
+ {
+ public:
+ typedef BasicOMlfStreamBase< Elem,Tr,ElemA,ByteT,ByteAT>
+ BasicOMlfStreamBaseType;
+ typedef std::basic_ostream<Elem,Tr> OStreamType;
+ typedef OStreamType& OStreamReference;
+
+ BasicOMlfStream(OStreamReference rOStream, size_t bufferSize = 32)
+ : BasicOMlfStreamBaseType(rOStream, bufferSize),
+ OStreamType(BasicOMlfStreamBaseType::rdbuf())
+ { }
+
+ /**
+ * @brief Destructor closes the stream
+ */
+ ~BasicOMlfStream()
+ { }
+
+
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ void
+ Open(const std::string& rFileName)
+ { BasicOMlfStreamBaseType::rdbuf()->Open(rFileName); }
+
+ /**
+ * @brief Closes MLF block
+ */
+ void
+ Close()
+ { BasicOMlfStreamBaseType::rdbuf()->Close(); }
+
+ /**
+ * @brief Returns true if the MLF is now in open state
+ */
+ bool
+ IsOpen() const
+ { return BasicOMlfStreamBaseType::rdbuf()->IsOpen(); }
+
+ /**
+ * @brief Accessor to the label container
+ * @return Reference to the label container
+ */
+ LabelContainer&
+ rLabels()
+ { return BasicOMlfStreamBaseType::rdbuf()->rLabels(); }
+ };
+
+
+
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicIMlfStream
+ : public BasicIMlfStreamBase<Elem,Tr,ElemA,ByteT,ByteAT>,
+ public std::basic_istream<Elem,Tr>
+ {
+ public:
+ typedef BasicIMlfStreamBase <Elem,Tr,ElemA,ByteT,ByteAT>
+ BasicIMlfStreamBaseType;
+ typedef std::basic_istream<Elem,Tr> IStreamType;
+ typedef IStreamType& IStreamReference;
+ typedef unsigned char byte_type;
+
+ BasicIMlfStream(IStreamReference rIStream, size_t bufferSize = 32)
+ : BasicIMlfStreamBaseType(rIStream, bufferSize),
+ IStreamType(BasicIMlfStreamBaseType::rdbuf())
+ {};
+
+
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ void
+ Open(const std::string& rFileName)
+ {
+ std::basic_streambuf<Elem, Tr>* p_buf;
+
+ p_buf = BasicIMlfStreamBaseType::rdbuf()->Open(rFileName);
+
+ if (NULL == p_buf) {
+ IStreamType::clear(IStreamType::rdstate() | std::ios::failbit);
+ }
+ else {
+ IStreamType::clear();
+ }
+ }
+
+ /**
+ * @brief Closes the MLF block.
+ * Sets failbit if the underlying buffer reports a failure
+ */
+ void
+ Close()
+ {
+ if (NULL == BasicIMlfStreamBaseType::rdbuf()->Close()) {
+ IStreamType::clear(IStreamType::rdstate() | std::ios::failbit);
+ }
+ }
+
+ void
+ Index()
+ { BasicIMlfStreamBaseType::rdbuf()->Index(); }
+
+ bool
+ IsHashed() const
+ { return BasicIMlfStreamBaseType::rdbuf()->IsHashed(); }
+
+ };
+
+
+
+ // MAIN TYPEDEFS..............................................................
+ typedef BasicOMlfStream<char> OMlfStream;
+ typedef BasicOMlfStream<wchar_t> WOMlfStream;
+ typedef BasicIMlfStream<char> IMlfStream;
+ typedef BasicIMlfStream<wchar_t> WIMlfStream;
+
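+ // A minimal sketch of reading an MLF through these typedefs, assuming an MLF
+ // file "labels.mlf" that contains a definition for "utt1.lab":
+ //   std::ifstream base("labels.mlf");
+ //   IMlfStream mlf(base);
+ //   mlf.Index();          // optional: hash label positions for fast seeks
+ //   mlf.Open("utt1.lab"); // failbit is set if the label cannot be found
+ //   std::string line;
+ //   while (std::getline(mlf, line)) { /* one transcription line at a time */ }
+ //   mlf.Close();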
+
+#ifdef PATH_MAX
+ const size_t MAX_LABEL_DEPTH = PATH_MAX;
+#else
+ const size_t MAX_LABEL_DEPTH = 1024;
+#endif
+
+
+} // namespace TNet
+
+#include "MlfStream.tcc"
+
+#endif
diff --git a/src/KaldiLib/.svn/text-base/MlfStream.tcc.svn-base b/src/KaldiLib/.svn/text-base/MlfStream.tcc.svn-base
new file mode 100644
index 0000000..8978545
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/MlfStream.tcc.svn-base
@@ -0,0 +1,517 @@
+#ifndef STK_MlfStream_tcc
+#define STK_MlfStream_tcc
+
+#include <algorithm>
+
+#include "Common.h"
+#include "StkMatch.h"
+
+namespace TNet
+{
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ BasicOMlfStreamBuf(OStreamReference rOStream, size_t bufferSize)
+ : mIsOpen(false), mOStream(rOStream)
+ { }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ ~BasicOMlfStreamBuf()
+ {
+ mOStream.flush();
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ int
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ sync()
+ {
+ mOStream.flush();
+ return 0;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ typename _Traits::int_type
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ overflow(typename _Traits::int_type c)
+ {
+ // we don't use buffer here...
+ if (mIsOpen) {
+ if (_Traits::eof() == c) {
+ return _Traits::not_eof(c);
+ }
+ // only pass the character to the stream
+ mOStream.rdbuf()->sputc(c);
+
+ // remember last char (in case we want to close)
+ mLastChar = c;
+
+ return c;
+ }
+ else {
+ return _Traits::eof();
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ void
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Close()
+ {
+ // if last character was not EOL, we need to insert it
+ if (mLastChar != '\n') {
+ mOStream.put('\n');
+ }
+ mOStream << ".\n";
+
+ // flush the stream and declare the stream closed
+ mOStream.flush();
+ mIsOpen = false;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT> *
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Open(const std::string& rFileName)
+ {
+ // retreive position
+ std::streampos pos = mOStream.tellp();
+
+ // write the initial "filename" in double quotes
+ mOStream << '"' << rFileName << '"' << std::endl;
+ mLastChar = '\n';
+
+ // return NULL if we cannot open
+ if (!mOStream.good()) {
+ return NULL;
+ }
+
+ // if ok, store the name position
+ if (-1 != pos) {
+ pos = mOStream.tellp();
+ mLabels.Insert(rFileName, pos);
+ }
+
+ // set open flag and return this
+ mIsOpen = true;
+ return this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // BasicIMlfStreamBuf section
+ //
+ //****************************************************************************
+ //****************************************************************************
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ BasicIMlfStreamBuf(IStreamReference rIStream, size_t bufferSize)
+ : mIsOpen(false), mIsHashed(false), mIsEof(true), mState(IN_HEADER_STATE),
+ mIStream(rIStream), mLineBuffer()
+ {
+ // we reserve some place for the buffer...
+ mLineBuffer.reserve(bufferSize);
+
+ //StreamBufType::setg(mpBuffer, mpBuffer + bufferSize, mpBuffer + bufferSize);
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.back()), &(mLineBuffer.back()));
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ ~BasicIMlfStreamBuf()
+ {
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ void
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Index()
+ {
+ // retrieve position
+ std::streampos orig_pos = mIStream.tellg();
+ int orig_state = mState;
+
+ // for streams like stdin, pos will by definition be -1, so we can only
+ // rely on sequential access and cannot hash it.
+ if (-1 != orig_pos) {
+ std::string aux_name;
+ // we will constantly jump to next definition. the function automatically
+ // hashes the stream if possible
+ while (JumpToNextDefinition(aux_name))
+ { }
+
+ // move to the original position
+ mIStream.clear();
+ mIStream.seekg(orig_pos);
+ mState = orig_state;
+
+ // set as hashed
+ mIsHashed=true;
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ bool
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ JumpToNextDefinition(std::string& rName)
+ {
+ if (!mIStream.good()) {
+ return false;
+ }
+
+ // if we can, we will try to index the label
+ std::streampos pos = mIStream.tellg();
+
+ // we might be at a definition already, so first move one line further
+ FillLineBuffer();
+
+ // read lines till we get to definition again
+ while (mIStream.good() && mState != IN_TITLE_STATE) {
+ FillLineBuffer();
+ }
+
+ // decide what happened
+ if (IN_TITLE_STATE == mState) {
+ // if we can, we will try to index the label
+ pos = mIStream.tellg();
+
+ if (pos != static_cast<const std::streampos>(-1)) {
+ // if (pos !=std::string::npos) { // This line does not work under MinGW
+ std::string line_buffer(mLineBuffer.begin(), mLineBuffer.end());
+ TNet::ParseHTKString(line_buffer, rName);
+ mLabels.Insert(rName, pos);
+ }
+
+ return true;
+ }
+ else {
+ // we have been hashing all the way through, so we know that if this
+ // is the EOF, we are done hashing this stream
+ if (pos != static_cast<const std::streampos>(-1)) {
+ mIsHashed = true;
+ }
+
+ // we are not in body state, so we just return false
+ return false;
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>*
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Close()
+ {
+ if (!mIsOpen) {
+ mIsEof = true;
+ return NULL;
+ }
+ else {
+ // if we try to close while in the body, we need to reach the end
+ if (mState == IN_BODY_STATE) {
+ while (mState == IN_BODY_STATE) {
+ FillLineBuffer();
+ }
+ }
+
+ // disable buffer mechanism
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+
+ mIsEof = true;
+ mIsOpen = false;
+
+ return this;
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>*
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Open(const std::string& rFileName)
+ {
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>* ret_val = NULL;
+
+ // this behavior is compatible with ifstream
+ if (mIsOpen) {
+ Close();
+ return NULL;
+ }
+
+ // retrieve position
+ std::streampos pos = mIStream.tellg();
+ LabelRecord label_record;
+
+ // for streams like stdin, pos will by definition be -1, so we can only
+ // rely on sequential access. At this place, we decide what to do
+ if ((-1 != pos) && (mLabels.Find(rFileName, label_record))) {
+ mIStream.seekg(label_record.mStreamPos);
+ mState = IN_TITLE_STATE;
+
+ // we don't want the underlying stream to stay in a bad state, so we
+ // clear it and report the failure on this stream instead
+ if (!mIStream.good()) {
+ mIStream.clear();
+ mIsOpen = false;
+ ret_val = NULL;
+ }
+ else {
+ mIsOpen = true;
+ mIsEof = false;
+ ret_val = this;
+ }
+ }
+
+ // the stream is seekable and we didn't find the label, but the labels are
+ // fully hashed, so we can be sure that the lookup failed
+ else if ((-1 != pos) && mIsHashed) {
+ mIsOpen = false;
+ ret_val = NULL;
+ }
+
+ // we either have a sequential-only stream or simply didn't find the label
+ // yet, so we try to scan forward through the definitions and look for it
+ else {
+ bool found = false;
+ std::string aux_name;
+ std::string aux_name2;
+
+ while ((!found) && JumpToNextDefinition(aux_name)) {
+ if (TNet::ProcessMask(rFileName, aux_name, aux_name2)) {
+ mIsOpen = true;
+ mIsEof = false;
+ found = true;
+ ret_val = this;
+ }
+ }
+
+ if (!found) {
+ mIsOpen = false;
+ ret_val = NULL;
+ }
+ }
+
+ return ret_val;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ typename _Traits::int_type
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ underflow()
+ {
+ // we don't do anything if EOF
+ if (mIsEof) {
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+ return _Traits::eof();
+ }
+
+ // read from buffer if we can
+ if (StreamBufType::gptr() && (StreamBufType::gptr() < StreamBufType::egptr())) {
+ return _Traits::not_eof(*StreamBufType::gptr());
+ }
+
+ // might happen that stream is in !good state
+ if (!mIStream.good()) {
+ mIsEof = true;
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+ return _Traits::eof();
+ }
+
+ // fill the line buffer and update my state
+ FillLineBuffer();
+
+ // if the whole line is just period or it's eof, declare EOF
+ if (mState == OUT_OF_BODY_STATE) {
+ mIsEof = true;
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+ return _Traits::eof();
+ }
+
+ // restore the buffer mechanism
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.back()) + 1);
+
+ return *StreamBufType::gptr();
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ void
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ FillLineBuffer()
+ {
+ // reset line buffer
+ size_t capacity = mLineBuffer.capacity();
+ mLineBuffer.clear();
+ mLineBuffer.reserve(capacity);
+
+ // read one line into buffer
+ int c;
+ while ((c = mIStream.get()) != '\n' && c != _Traits::eof()) {
+ mLineBuffer.push_back(char(c));
+ }
+
+ // we also want to pass the trailing end-of-line symbol through
+ if (c == '\n') {
+ mLineBuffer.push_back(char(c));
+ }
+
+ // now decide which state we are in
+ switch (mState) {
+ case IN_HEADER_STATE:
+
+ case OUT_OF_BODY_STATE:
+ if (mLineBuffer[0] != '#') {
+ mState = IN_TITLE_STATE;
+ }
+ break;
+
+ case IN_TITLE_STATE:
+ if (mLineBuffer[0] == '.' && (mLineBuffer.back() == '\n' || mIStream.eof())) {
+ mState = OUT_OF_BODY_STATE;
+ }
+ else {
+ mState = IN_BODY_STATE;
+ }
+ break;
+
+ case IN_BODY_STATE:
+ // period or EOF will end the file
+ if (mLineBuffer[0] == '.' && (mLineBuffer.back() == '\n' || mIStream.eof())) {
+ mState = OUT_OF_BODY_STATE;
+ }
+ if (mLineBuffer.size() == 0) {
+ mState = OUT_OF_BODY_STATE;
+ }
+ break;
+ }
+ }
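+
+ // Note (illustrative, not part of the original sources; the label file
+ // name below is made up): the state machine above assumes an HTK-style
+ // MLF layout, e.g.
+ //
+ //   #!MLF!#                 <- header lines start with '#'
+ //   "*/utt1.lab"            <- title line            (IN_TITLE_STATE)
+ //   0 1200000 sil           <- body lines            (IN_BODY_STATE)
+ //   1200000 2500000 ah
+ //   .                       <- a lone '.' ends the body (OUT_OF_BODY_STATE)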
+} // namespace TNet
+
+
+#endif // STK_MlfStream_tcc
diff --git a/src/KaldiLib/.svn/text-base/StkMatch.cc.svn-base b/src/KaldiLib/.svn/text-base/StkMatch.cc.svn-base
new file mode 100644
index 0000000..4ff4b18
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/StkMatch.cc.svn-base
@@ -0,0 +1,582 @@
+/*
+ EPSHeader
+
+ File: filmatch.c
+ Author: J. Kercheval
+ Created: Thu, 03/14/1991 22:22:01
+*/
+
+/*
+ EPSRevision History
+ O. Glembek Thu, 03/11/2005 01:58:00 Added Mask extraction support (char % does this)
+ J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain
+ J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them)
+ J. Kercheval Sun, 03/10/1991 19:31:29 add error return to matche()
+ J. Kercheval Sun, 03/10/1991 20:11:11 add is_valid_pattern code
+ J. Kercheval Sun, 03/10/1991 20:37:11 beef up main()
+ J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain
+ J. Kercheval Thu, 03/14/1991 22:22:25 remove '\' for DOS file parsing
+ J. Kercheval Thu, 03/28/1991 20:58:27 include filmatch.h
+*/
+
+/*
+ Wildcard Pattern Matching
+*/
+
+
+#include "StkMatch.h"
+#include "Common.h"
+
+namespace TNet
+{
+ //#define TEST
+ static int matche_after_star (register const char *pattern, register const char *text, register char *s);
+ // following function is not defined or used.
+ // static int fast_match_after_star (register const char *pattern, register const char *text);
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return true if PATTERN has any special wildcard characters
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool is_pattern (const char *p)
+ {
+ while ( *p ) {
+ switch ( *p++ ) {
+ case '?':
+ case '*':
+ case '%':
+ case '[':
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return true if PATTERN is a well formed regular expression according
+ * to the above syntax
+ *
+ * error_type is a return code based on the type of pattern error. Zero is
+ * returned in error_type if the pattern is a valid one. error_type return
+ * values are as follows:
+ *
+ * PATTERN_VALID - pattern is well formed
+ * PATTERN_RANGE - [..] construct has no end of range in a '-' pair (i.e. [a-])
+ * PATTERN_CLOSE - [..] construct has no end bracket (i.e. [abc-g )
+ * PATTERN_EMPTY - [..] construct is empty (i.e. [])
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool is_valid_pattern (const char *p, int *error_type)
+ {
+
+ /* init error_type */
+ *error_type = PATTERN_VALID;
+
+ /* loop through pattern to EOS */
+ while ( *p )
+ {
+ /* determine pattern type */
+ switch ( *p )
+ {
+ /* the [..] construct must be well formed */
+ case '[':
+ {
+ p++;
+
+ /* if the next character is ']' then bad pattern */
+ if ( *p == ']' ) {
+ *error_type = PATTERN_EMPTY;
+ return false;
+ }
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p )
+ {
+ *error_type = PATTERN_CLOSE;
+ return false;
+ }
+
+ /* loop to end of [..] construct */
+ while ( *p != ']' )
+ {
+ /* check for literal escape */
+ if ( *p == '\\' )
+ {
+ p++;
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p++ ) {
+ *error_type = PATTERN_ESC;
+ return false;
+ }
+ }
+ else
+ p++;
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p )
+ {
+ *error_type = PATTERN_CLOSE;
+ return false;
+ }
+
+ /* if this is a range */
+ if ( *p == '-' )
+ {
+ /* we must have an end of range */
+ if ( !*++p || *p == ']' )
+ {
+ *error_type = PATTERN_RANGE;
+ return false;
+ }
+ else
+ {
+
+ /* check for literal escape */
+ if ( *p == '\\' )
+ p++;
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p++ )
+ {
+ *error_type = PATTERN_ESC;
+ return false;
+ }
+ }
+ }
+ }
+ break;
+ } //case '[':
+
+
+ /* all other characters are valid pattern elements */
+ case '*':
+ case '?':
+ case '%':
+ default:
+ p++; /* "normal" character */
+ break;
+ } // switch ( *p )
+ } // while ( *p )
+
+ return true;
+ } //bool is_valid_pattern (const char *p, int *error_type)
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Match the pattern PATTERN against the string TEXT;
+ *
+ * returns MATCH_VALID if pattern matches, or an errorcode as follows
+ * otherwise:
+ *
+ * MATCH_PATTERN - bad pattern
+ * MATCH_RANGE - match failure on [..] construct
+ * MATCH_ABORT - premature end of text string
+ * MATCH_END - premature end of pattern string
+ * MATCH_VALID - valid match
+ *
+ *
+ * A match means the entire string TEXT is used up in matching.
+ *
+ * In the pattern string:
+ * `*' matches any sequence of characters (zero or more)
+ * `?' matches any character
+ * `%' matches any character and stores it in the s string
+ * [SET] matches any character in the specified set,
+ * [!SET] or [^SET] matches any character not in the specified set.
+ * \ is allowed within a set to escape a character like ']' or '-'
+ *
+ * A set is composed of characters or ranges; a range looks like
+ * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
+ * minimal set of characters allowed in the [..] pattern construct.
+ * Other characters are allowed (ie. 8 bit characters) if your system
+ * will support them.
+ *
+ * To suppress the special syntactic significance of any of `[]*?%!^-\',
+ * within a [..] construct and match the character exactly, precede it
+ * with a `\'.
+ *
+ ----------------------------------------------------------------------------*/
+
+ int matche ( register const char *p, register const char *t, register char *s )
+ {
+ register char range_start, range_end; /* start and end in range */
+
+ bool invert; /* is this [..] or [!..] */
+ bool member_match; /* have I matched the [..] construct? */
+ bool loop; /* should I terminate? */
+
+ for ( ; *p; p++, t++ ) {
+
+ /* if this is the end of the text then this is the end of the match */
+ if (!*t) {
+ return ( *p == '*' && *++p == '\0' ) ? MATCH_VALID : MATCH_ABORT;
+ }
+
+ /* determine and react to pattern type */
+ switch ( *p ) {
+
+ /* single any character match */
+ case '?':
+ break;
+
+ /* single any character match, with extraction*/
+ case '%': {
+ *s++ = *t;
+ *s = '\0';
+ break;
+ }
+
+ /* multiple any character match */
+ case '*':
+ return matche_after_star (p, t, s);
+
+ /* [..] construct, single member/exclusion character match */
+ case '[': {
+ /* move to beginning of range */
+ p++;
+
+ /* check if this is a member match or exclusion match */
+ invert = false;
+ if ( *p == '!' || *p == '^') {
+ invert = true;
+ p++;
+ }
+
+ /* if closing bracket here or at range start then we have a
+ malformed pattern */
+ if ( *p == ']' ) {
+ return MATCH_PATTERN;
+ }
+
+ member_match = false;
+ loop = true;
+
+ while ( loop ) {
+
+ /* if end of construct then loop is done */
+ if (*p == ']') {
+ loop = false;
+ continue;
+ }
+
+ /* matching a '!', '^', '-', '\' or a ']' */
+ if ( *p == '\\' ) {
+ range_start = range_end = *++p;
+ }
+ else {
+ range_start = range_end = *p;
+ }
+
+ /* if end of pattern then bad pattern (Missing ']') */
+ if (!*p)
+ return MATCH_PATTERN;
+
+ /* check for range bar */
+ if (*++p == '-') {
+
+ /* get the range end */
+ range_end = *++p;
+
+ /* if end of pattern or construct then bad pattern */
+ if (range_end == '\0' || range_end == ']')
+ return MATCH_PATTERN;
+
+ /* special character range end */
+ if (range_end == '\\') {
+ range_end = *++p;
+
+ /* if end of text then we have a bad pattern */
+ if (!range_end)
+ return MATCH_PATTERN;
+ }
+
+ /* move just beyond this range */
+ p++;
+ }
+
+ /* if the text character is in range then match found.
+ make sure the range letters have the proper
+ relationship to one another before comparison */
+ if ( range_start < range_end ) {
+ if (*t >= range_start && *t <= range_end) {
+ member_match = true;
+ loop = false;
+ }
+ }
+ else {
+ if (*t >= range_end && *t <= range_start) {
+ member_match = true;
+ loop = false;
+ }
+ }
+ }
+
+ /* if there was a match in an exclusion set then no match */
+ /* if there was no match in a member set then no match */
+ if ((invert && member_match) ||
+ !(invert || member_match))
+ return MATCH_RANGE;
+
+ /* if this is not an exclusion then skip the rest of the [...]
+ construct that already matched. */
+ if (member_match) {
+ while (*p != ']') {
+
+ /* bad pattern (Missing ']') */
+ if (!*p)
+ return MATCH_PATTERN;
+
+ /* skip exact match */
+ if (*p == '\\') {
+ p++;
+
+ /* if end of text then we have a bad pattern */
+ if (!*p)
+ return MATCH_PATTERN;
+ }
+
+ /* move to next pattern char */
+ p++;
+ }
+ }
+
+ break;
+ } // case ']'
+
+ /* must match this character exactly */
+ default:
+ if (*p != *t)
+ return MATCH_LITERAL;
+ }
+ }
+
+ //*s = '\0';
+ /* if end of text not reached then the pattern fails */
+ if ( *t )
+ return MATCH_END;
+ else
+ return MATCH_VALID;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * recursively call matche() with final segment of PATTERN and of TEXT.
+ *
+ ----------------------------------------------------------------------------*/
+
+ static int matche_after_star (register const char *p, register const char *t, register char *s)
+ {
+ register int match = 0;
+ register char nextp;
+
+ /* pass over existing ? and * in pattern */
+ while ( *p == '?' || *p == '%' || *p == '*' ) {
+
+ /* consume one text char for each '?' and '%' */
+ if ( *p == '?') {
+
+ /* if end of text then no match */
+ if ( !*t++ ) {
+ return MATCH_ABORT;
+ }
+ }
+
+ if ( *p == '%') {
+ *s++ = *t;
+ *s = '\0';
+ /* if end of text then no match */
+ if ( !*t++ ) {
+ return MATCH_ABORT;
+ }
+ }
+
+ /* move to next char in pattern */
+ p++;
+ }
+
+ /* if end of pattern we have matched regardless of text left */
+ if ( !*p ) {
+ return MATCH_VALID;
+ }
+
+ /* get the next character to match which must be a literal or '[' */
+ nextp = *p;
+
+ /* Continue until we run out of text or definite result seen */
+ do {
+
+ /* a precondition for matching is that the next character
+ in the pattern match the next character in the text or that
+ the next pattern char is the beginning of a range. Increment
+ text pointer as we go here */
+ if ( nextp == *t || nextp == '[' ) {
+ match = matche(p, t, s);
+ }
+
+ /* if the end of text is reached then no match */
+ if ( !*t++ ) match = MATCH_ABORT;
+
+ } while ( match != MATCH_VALID &&
+ match != MATCH_ABORT &&
+ match != MATCH_PATTERN);
+
+ /* return result */
+ return match;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * match() is a shell to matche() to return only bool values.
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool match(const char *p, const char *t, char *s)
+ {
+ int error_type;
+ error_type = matche(p,t,s);
+ return (error_type != MATCH_VALID ) ? false : true;
+ }
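+
+ // Usage sketch (illustrative only; the file name is made up):
+ //
+ //   char captured[2] = "";                          // room for one '%' capture
+ //   bool ok = match("file_?.la%", "file_3.lab", captured);
+ //   // ok == true and captured == "b": '?' matched '3', '%' matched and stored 'b'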
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ ProcessMask(const std::string & rString,
+ const std::string & rWildcard,
+ std::string & rSubstr)
+ {
+ char * substr;
+ int percent_count = 0;
+ int ret ;
+ size_t pos = 0;
+
+ // count the '%' characters so we can allocate enough space for the returned substring
+ while ((pos = rWildcard.find('%', pos)) != rWildcard.npos)
+ {
+ percent_count++;
+ pos++;
+ }
+
+ // allocate space for the substring
+ substr = new char[percent_count + 1];
+ substr[percent_count] = 0;
+ substr[0] = '\0';
+
+ // optionally prepend '*/' to wildcard
+ std::string wildcard(rWildcard);
+ if(wildcard[0] != '*') {
+ wildcard = "*/" + wildcard;
+ }
+
+ //optionally prepend '/' to string
+ std::string string1(rString);
+ if(string1[0] != '/') {
+ string1 = "/" + string1;
+ }
+
+ // parse the string
+ if (0 != (ret = match(wildcard.c_str(), string1.c_str(), substr)))
+ {
+ rSubstr = substr;
+ }
+ delete[] substr;
+ return ret;
+ } // ProcessMask
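+
+ // Usage sketch (illustrative only; the file name is made up):
+ //
+ //   std::string sub;
+ //   if (ProcessMask("speaker1_utt3.fea", "%%%%%%%%_*", sub)) {
+ //     // the eight '%' wildcards captured the prefix, so sub == "speaker1"
+ //   }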
+}
+
+
+#ifdef TEST
+
+/*
+* This test main expects the pattern as its first argument and the match
+* string as its second. Output is yea or nay on match. If nay on
+* match then the error code is parsed and written.
+*/
+
+#include <stdio.h>
+
+int main(int argc, char *argv[])
+{
+ int error;
+ int is_valid_error;
+
+ char * tmp = argv[0];
+ int i = 0;
+ for (; *tmp; tmp++)
+ if (*tmp=='%') i++;
+
+ char s[i+1];
+
+
+ if (argc != 3) {
+ printf("Usage: MATCH Pattern Text\n");
+ }
+ else {
+ printf("Pattern: %s\n", argv[1]);
+ printf("Text : %s\n", argv[2]);
+
+ if (!is_pattern(argv[1])) {
+ printf(" First Argument Is Not A Pattern\n");
+ }
+ else {
+ match(argv[1],argv[2], s) ? printf("true") : printf("false");
+ error = matche(argv[1],argv[2], s);
+ is_valid_pattern(argv[1],&is_valid_error);
+
+ switch ( error ) {
+ case MATCH_VALID:
+ printf(" Match Successful");
+ if (is_valid_error != PATTERN_VALID)
+ printf(" -- is_valid_pattern() is complaining\n");
+ else
+ printf("\n");
+ printf("%s\n", s);
+
+ break;
+ case MATCH_RANGE:
+ printf(" Match Failed on [..]\n");
+ break;
+ case MATCH_ABORT:
+ printf(" Match Failed on Early Text Termination\n");
+ break;
+ case MATCH_END:
+ printf(" Match Failed on Early Pattern Termination\n");
+ break;
+ case MATCH_PATTERN:
+ switch ( is_valid_error ) {
+ case PATTERN_VALID:
+ printf(" Internal Disagreement On Pattern\n");
+ break;
+ case PATTERN_RANGE:
+ printf(" No End of Range in [..] Construct\n");
+ break;
+ case PATTERN_CLOSE:
+ printf(" [..] Construct is Open\n");
+ break;
+ case PATTERN_EMPTY:
+ printf(" [..] Construct is Empty\n");
+ break;
+ default:
+ printf(" Internal Error in is_valid_pattern()\n");
+ }
+ break;
+ default:
+ printf(" Internal Error in matche()\n");
+ break;
+ }
+ }
+
+ }
+ return(0);
+}
+
+#endif
diff --git a/src/KaldiLib/.svn/text-base/StkMatch.h.svn-base b/src/KaldiLib/.svn/text-base/StkMatch.h.svn-base
new file mode 100644
index 0000000..42c6b97
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/StkMatch.h.svn-base
@@ -0,0 +1,123 @@
+#ifndef TNet_StkMatch_h
+#define TNet_StkMatch_h
+
+#include <string>
+namespace TNet
+{
+ /*
+ EPSHeader
+
+ File: filmatch.h
+ Author: J. Kercheval
+ Created: Thu, 03/14/1991 22:24:34
+ */
+
+ /*
+ EPSRevision History
+ O. Glembek Thu, 03/11/2005 01:58:00 Added Mask extraction support (char % does this)
+ J. Kercheval Wed, 02/20/1991 22:28:37 Released to Public Domain
+ J. Kercheval Sun, 03/10/1991 18:02:56 add is_valid_pattern
+ J. Kercheval Sun, 03/10/1991 18:25:48 add error_type in is_valid_pattern
+ J. Kercheval Sun, 03/10/1991 18:47:47 error return from matche()
+ J. Kercheval Tue, 03/12/1991 22:24:49 Released as V1.1 to Public Domain
+ J. Kercheval Thu, 03/14/1991 22:25:00 remove '\' for DOS file matching
+ J. Kercheval Thu, 03/28/1991 21:03:59 add in PATTERN_ESC & MATCH_LITERAL
+ */
+
+ /*
+ Wildcard Pattern Matching
+ */
+
+
+ /* match defines */
+#define MATCH_PATTERN 6 /* bad pattern */
+#define MATCH_LITERAL 5 /* match failure on literal match */
+#define MATCH_RANGE 4 /* match failure on [..] construct */
+#define MATCH_ABORT 3 /* premature end of text string */
+#define MATCH_END 2 /* premature end of pattern string */
+#define MATCH_VALID 1 /* valid match */
+
+ /* pattern defines */
+#define PATTERN_VALID 0 /* valid pattern */
+#define PATTERN_ESC -1 /* literal escape at end of pattern */
+#define PATTERN_RANGE -2 /* malformed range in [..] construct */
+#define PATTERN_CLOSE -3 /* no end bracket in [..] construct */
+#define PATTERN_EMPTY -4 /* [..] construct is empty */
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Match the pattern PATTERN against the string TEXT;
+ *
+ * match() returns TRUE if pattern matches, FALSE otherwise.
+ * matche() returns MATCH_VALID if pattern matches, or an errorcode
+ * as follows otherwise:
+ *
+ * MATCH_PATTERN - bad pattern
+ * MATCH_RANGE - match failure on [..] construct
+ * MATCH_ABORT - premature end of text string
+ * MATCH_END - premature end of pattern string
+ * MATCH_VALID - valid match
+ *
+ *
+ * A match means the entire string TEXT is used up in matching.
+ *
+ * In the pattern string:
+ * `*' matches any sequence of characters (zero or more)
+ * `?' matches any character
+ * [SET] matches any character in the specified set,
+ * [!SET] or [^SET] matches any character not in the specified set.
+ *
+ * A set is composed of characters or ranges; a range looks like
+ * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
+ * minimal set of characters allowed in the [..] pattern construct.
+ * Other characters are allowed (ie. 8 bit characters) if your system
+ * will support them.
+ *
+ * To suppress the special syntactic significance of any of `[]*?!^-\',
+ * in a [..] construct and match the character exactly, precede it
+ * with a `\'.
+ *
+ ----------------------------------------------------------------------------*/
+ bool
+ match (const char *pattern, const char *text, char *s);
+
+ int
+ matche(register const char *pattern, register const char *text, register char *s);
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return TRUE if PATTERN has any special wildcard characters
+ *
+ ----------------------------------------------------------------------------*/
+ bool
+ is_pattern (const char *pattern);
+
+
+ /** --------------------------------------------------------------------------
+ *
+ * Return TRUE if PATTERN is a well formed regular expression according
+ * to the above syntax
+ *
+ * error_type is a return code based on the type of pattern error. Zero is
+ * returned in error_type if the pattern is a valid one. error_type return
+ * values are as follows:
+ *
+ * PATTERN_VALID - pattern is well formed
+ * PATTERN_RANGE - [..] construct has no end of range in a '-' pair (i.e. [a-])
+ * PATTERN_CLOSE - [..] construct has no end bracket (i.e. [abc-g )
+ * PATTERN_EMPTY - [..] construct is empty (i.e. [])
+ * --------------------------------------------------------------------------
+ **/
+ bool
+ is_valid_pattern (const char *pattern, int *error_type);
+
+
+ //****************************************************************************
+ //****************************************************************************
+ bool
+ ProcessMask(const std::string & rString, const std::string & rWildcard,
+ std::string & rSubstr);
+}
+#endif
diff --git a/src/KaldiLib/.svn/text-base/StkStream.h.svn-base b/src/KaldiLib/.svn/text-base/StkStream.h.svn-base
new file mode 100644
index 0000000..9188205
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/StkStream.h.svn-base
@@ -0,0 +1,526 @@
+
+
+/** @file stkstream.h
+ * This is a TNet C++ Library header.
+ */
+
+
+#ifndef TNet_StkStream_h
+#define TNet_StkStream_h
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include <list>
+#include <stdexcept>
+
+#pragma GCC system_header
+
+
+//extern const char * gpFilterWldcrd;
+
+namespace TNet
+{
+
+ /**
+ * @brief Expands a filter command into a runnable form
+ *
+ * This function replaces all occurrences of *filter_wldcard in *command by
+ * *filename
+ */
+ //char * ExpandFilterCommand(const char *command, const char *filename);
+
+ /**
+ * @brief Provides a layer of compatibility for C/POSIX.
+ *
+ * This GNU extension provides facilities for working with standard C
+ * FILE*'s and POSIX file descriptors. It must be instantiated by the
+ * user with the type of character used in the file stream, e.g.,
+ * basic_stkbuf<char>.
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class basic_stkbuf : public std::basic_filebuf<_CharT, _Traits>
+ {
+ public:
+
+ typedef basic_stkbuf<_CharT, _Traits> this_type;
+
+ // Types:
+ typedef _CharT char_type;
+ typedef _Traits traits_type;
+
+ typedef typename traits_type::int_type int_type;
+ typedef typename traits_type::pos_type pos_type;
+ typedef typename traits_type::off_type off_type;
+ typedef std::size_t size_t;
+
+ public:
+
+ /// @{
+ /// Type of streambuffer
+ static const unsigned int t_undef = 0; ///< undefined
+ static const unsigned int t_file = 1; ///< file stream
+ static const unsigned int t_pipe = 2; ///< pipe
+ static const unsigned int t_filter = 4; ///< filter
+ static const unsigned int t_stdio = 8; ///< standard input/output
+ /// @}
+
+ public:
+
+ /**
+ * deferred initialization
+ */
+ basic_stkbuf() : std::basic_filebuf<_CharT, _Traits>(),
+ mFilename(""), mpFilePtr(0), mStreamType(t_undef){}
+
+ /**
+ * @brief Opens a stream.
+ * @param fName The name of the file.
+ * @param m The open mode flags.
+ * @param pFilter The pFilter command to use
+ * @return @c this on success, NULL on failure
+ *
+ * If a file is already open, this function immediately fails.
+ * Otherwise it tries to open the file named @a s using the flags
+ * given in @a mode.
+ *
+ * [Table 92 gives the relation between openmode combinations and the
+ * equivalent fopen() flags, but the table has not been copied yet.]
+ */
+ basic_stkbuf(const char* fName, std::ios_base::openmode m, const char* pFilter="");
+
+
+ /**
+ * @return The underlying FILE*.
+ *
+ * This function can be used to access the underlying "C" file pointer.
+ * Note that there is no way for the library to track what you do
+ * with the file, so be careful.
+ */
+ std::__c_file*
+ file() { return this->_M_file.file(); }
+
+
+ /**
+ * @return The underlying FILE*.
+ *
+ * This function can be used to access the underlying "C" file pointer.
+ * Note that there is no way for the library to track what you do
+ * with the file, so be careful.
+ */
+ std::__c_file*
+ fp() { return this->_M_file.file(); }
+
+
+ /**
+ * @brief Opens an external file.
+ * @param fName The name of the file.
+ * @param m The open mode flags.
+ * @param pFilter The pFilter command to use
+ * @return @c this on success, NULL on failure
+ *
+ * If a file is already open, this function immediately fails.
+ * Otherwise it tries to open the file named @a s using the flags
+ * given in @a mode.
+ *
+ * [Table 92 gives the relation between openmode combinations and the
+ * equivalent fopen() flags, but the table has not been copied yet.]
+ */
+ this_type*
+ open(const char* pFName, std::ios_base::openmode m, const char* pFilter="");
+
+ /**
+ * @brief Closes the currently associated file.
+ * @return @c this on success, NULL on failure
+ *
+ * If no file is currently open, this function immediately fails.
+ *
+ * If a "put buffer area" exists, @c overflow(eof) is called to flush
+ * all the characters. The file is then closed.
+ *
+ * If any operations fail, this function also fails.
+ */
+ this_type*
+ close();
+
+ /**
+ * Closes the external data stream if the file descriptor constructor
+ * was used.
+ */
+ virtual
+ ~basic_stkbuf()
+ {close();};
+
+ /// Returns the file name
+ const std::string
+ name() const
+ {return mFilename;}
+
+
+ private:
+ /// converts the ios::xxx mode to stdio style
+ static void open_mode(std::ios_base::openmode __mode, int&, int&, char* __c_mode);
+
+ /**
+ * @param __f An open @c FILE*.
+ * @param __mode Same meaning as in a standard filebuf.
+ * @param __size Optimal or preferred size of internal buffer, in chars.
+ * Defaults to system's @c BUFSIZ.
+ *
+ * This method associates a file stream buffer with an open
+ * C @c FILE*. The @c FILE* will not be automatically closed when the
+ * basic_stkbuf is closed/destroyed. It is equivalent to one of the constructors
+ * of the stdio_filebuf class defined in GNU ISO C++ ext/stdio_filebuf.h
+ */
+ void superopen(std::__c_file* __f, std::ios_base::openmode __mode,
+ size_t __size = static_cast<size_t>(BUFSIZ));
+
+
+ private:
+ /// Holds the full file name
+ std::string mFilename;
+
+ std::ios_base::openmode mMode;
+
+ /// Holds a pointer to the main FILE structure
+ FILE * mpFilePtr;
+
+ /// tells what kind of stream we use (stdio, file, pipe)
+ unsigned int mStreamType;
+
+ };
+
+
+
+ /**
+ * @brief This extension wraps stkbuf stream buffer into the standard ios class.
+ *
+ * This class is inherited by (i/o)stkstream classes which make explicit use of
+ * the custom stream buffer
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class BasicStkIos
+ : virtual public std::basic_ios<_CharT, _Traits>
+ {
+ public:
+ typedef basic_stkbuf <_CharT,_Traits> StkBufType;
+
+ BasicStkIos()
+ : mBuf()
+ { init(&mBuf) ;};
+
+ BasicStkIos(const char* fName, std::ios::openmode m, const char* pFilter)
+ : mBuf(fName, m, pFilter)
+ { init(&mBuf) ; }
+
+ StkBufType*
+ rdbuf()
+ { return &mBuf; }
+
+ protected:
+ StkBufType mBuf;
+ };
+
+
+ /**
+ * @brief Controlling input for files.
+ *
+ * This class supports reading from named files, using the inherited
+ * functions from std::istream. To control the associated
+ * sequence, an instance of std::stkbuf is used.
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class BasicIStkStream
+ : public BasicStkIos<_CharT, _Traits>,
+ public std::basic_istream<_CharT, _Traits>
+ {
+ public:
+ typedef BasicStkIos<_CharT, _Traits> BasicStkIosType;
+ typedef std::basic_istream<_CharT,_Traits> IStreamType;
+
+
+ // Constructors:
+ /**
+ * @brief Default constructor.
+ *
+ * Initializes @c mBuf using its default constructor, and passes
+ * @c &sb to the base class initializer. Does not open any files
+ * (you haven't given it a filename to open).
+ */
+ BasicIStkStream()
+ : BasicStkIosType(),
+ IStreamType(BasicStkIosType::rdbuf())
+ {};
+
+ /**
+ * @brief Create an input file stream.
+ * @param fName String specifying the filename.
+ * @param m Open file in specified mode (see std::ios_base).
+ * @param pFilter String specifying pFilter command to use on fName
+ *
+ * @c ios_base::in is automatically included in
+ * @a m.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ BasicIStkStream(const char* pFName, std::ios::openmode m=std::ios::in, const char* pFilter="")
+ : BasicStkIosType(),
+ IStreamType(BasicStkIosType::rdbuf())
+ {this->open(pFName, std::ios::in, pFilter);}
+
+ ~BasicIStkStream()
+ {
+ this->close();
+ }
+
+ /**
+ * @brief Opens an external file.
+ * @param s The name of the file.
+ * @param mode The open mode flags.
+ * @param pFilter The pFilter command to use
+ *
+ * Calls @c std::basic_filebuf::open(s,mode|in). If that function
+ * fails, @c failbit is set in the stream's error state.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ void open(const char* pFName, std::ios::openmode m=std::ios::in, const char* pFilter = "")
+ {
+ if (!BasicStkIosType::mBuf.open(pFName, m | std::ios_base::in, pFilter)) {
+ this->setstate(std::ios_base::failbit);
+ }
+ else {
+ // Closing an fstream should clear error state
+ BasicStkIosType::clear();
+ }
+ }
+
+ /**
+ * @brief Returns true if the external file is open.
+ */
+ bool is_open() const {return BasicStkIosType::mBuf.is_open();}
+
+
+ /**
+ * @brief Closes the stream
+ */
+ void close() {BasicStkIosType::mBuf.close();}
+
+ /**
+ * @brief Returns the filename
+ */
+ const std::string name() const {return BasicStkIosType::mBuf.name();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ file() {return BasicStkIosType::mBuf.file();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ fp() {return BasicStkIosType::mBuf.fp();}
+
+ // /**
+ // * @brief Reads a single line
+ // *
+ // * This is a specialized function as std::getline does not provide a way to
+ // * read multiple end-of-line symbols (we need both '\n' and EOF to delimit
+ // * the line)
+ // */
+ // void
+ // GetLine(string& rLine);
+
+ }; // class BasicIStkStream
+
+
+ /**
+ * @brief Controlling output for files.
+ *
+ * This class supports writing to named files, using the inherited
+ * functions from std::ostream. To control the associated
+ * sequence, an instance of TNet::stkbuf is used.
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class BasicOStkStream
+ : public BasicStkIos<_CharT, _Traits>,
+ public std::basic_ostream<_CharT, _Traits>
+ {
+ public:
+ typedef BasicStkIos<_CharT, _Traits> BasicStkIosType;
+ typedef std::basic_ostream<_CharT,_Traits> OStreamType;
+
+ // Constructors:
+ /**
+ * @brief Default constructor.
+ *
+ * Initializes @c sb using its default constructor, and passes
+ * @c &sb to the base class initializer. Does not open any files
+ * (you haven't given it a filename to open).
+ */
+ BasicOStkStream()
+ : BasicStkIosType(),
+ OStreamType(BasicStkIosType::rdbuf())
+ {};
+
+ /**
+ * @brief Create an output file stream.
+ * @param fName String specifying the filename.
+ * @param m Open file in specified mode (see std::ios_base).
+ * @param pFilter String specifying pFilter command to use on fName
+ *
+ * @c ios_base::out|ios_base::trunc is automatically included in
+ * @a mode.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ BasicOStkStream(const char* pFName, std::ios::openmode m=std::ios::out, const char* pFilter="")
+ : BasicStkIosType(),
+ OStreamType(BasicStkIosType::rdbuf())
+ {this->open(pFName, std::ios::out, pFilter);}
+
+ /**
+ * @brief Opens an external file.
+ * @param fName The name of the file.
+ * @param m The open mode flags.
+ * @param pFilter String specifying pFilter command to use on fName
+ *
+ * Calls @c std::basic_filebuf::open(s,mode|out). If that function
+ * fails, @c failbit is set in the stream's error state.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ void open(const char* pFName, std::ios::openmode m=std::ios::out, const char* pFilter="")
+ {
+ if (!BasicStkIosType::mBuf.open(pFName, m | std::ios_base::out, pFilter))
+ this->setstate(std::ios_base::failbit);
+ else
+ // Closing an fstream should clear error state
+ this->clear();
+ }
+
+ /**
+ * @brief Returns true if the external file is open.
+ */
+ bool is_open() const
+ { return BasicStkIosType::mBuf.is_open();}
+
+ /**
+ * @brief Closes the stream
+ */
+ void close()
+ { BasicStkIosType::mBuf.close();}
+
+ /**
+ * @brief Returns the filename
+ */
+ const std::string name() const
+ { return BasicStkIosType::mBuf.name();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ file()
+ { return BasicStkIosType::mBuf.file();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ fp()
+ { return BasicStkIosType::mBuf.fp();}
+
+ }; // class BasicOStkStream
+
+
+ /**
+ * We define some convenience stream typedefs
+ */
+ ///@{
+#ifndef _GLIBCPP_USE_WCHAR_T
+ typedef BasicOStkStream<char> OStkStream;
+ typedef BasicOStkStream<wchar_t> WOStkStream;
+ typedef BasicIStkStream<char> IStkStream;
+ typedef BasicIStkStream<wchar_t> WIStkStream;
+#else
+ typedef BasicOStkStream<char> OStkStream;
+ typedef BasicOStkStream<wchar_t> WOStkStream;
+ typedef BasicIStkStream<char> IStkStream;
+ typedef BasicIStkStream<wchar_t> WIStkStream;
+#endif
+ /// @}
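+
+ // Usage sketch (illustrative only; the file names are made up). The special
+ // name forms are handled in basic_stkbuf::open() in StkStream.tcc: "-" maps
+ // to stdin/stdout and a leading '|' runs the rest of the name as a pipe
+ // command.
+ //
+ //   IStkStream in;
+ //   in.open("features.scp");                    // plain file
+ //   // in.open("-");                            // read from stdin
+ //   // in.open("|gunzip -c features.scp.gz");   // read through a pipe
+ //   if (!in.good()) { /* handle the error */ }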
+
+ /*
+ template<class T,class char_type> inline
+ BasicOStkStream<char_type>& operator << (BasicOStkStream<char_type> &ostream, const std::vector<T> &vec){
+ ostream << vec.size() << std::endl;
+ for(size_t i=0;i<vec.size();i++) ostream << vec[i];
+ return ostream;
+ }
+
+ template<class T,class char_type> inline BasicIStkStream<char_type> &operator >> (BasicIStkStream<char_type> &istream, std::vector<T> &vec){
+ size_t sz;
+ istream >> sz; if(!istream.good()){ throw std::runtime_error(std::string("Error reading to vector of [something]: stream bad\n")); }
+ int ch = istream.get(); if(ch!='\n' || !istream.good()){ throw std::runtime_error(std::string("Expecting newline after vector size, got " + (std::string)(char)ch));} // TODO: This code may not be right for wchar.
+ vec.resize(sz);
+ for(size_t i=0;i<vec.size();i++) istream >> vec[i];
+ return istream;
+ }*/
+
+ template<class T> inline
+ std::ostream & operator << (std::ostream &ostream, const std::vector<T> &vec){
+ ostream << vec.size() << std::endl;
+ for(size_t i=0;i<vec.size();i++) ostream << vec[i] << "\n"; // '\n' is necessary in case item is atomic e.g. a number.
+ return ostream;
+ }
+
+ template<class T> inline std::istream& operator >> (std::istream &istream, std::vector<T> &vec){
+ size_t sz;
+ istream >> sz; if(!istream.good()){ throw std::runtime_error(std::string("Error reading to vector of [something]: stream bad\n")); }
+ // int ch = istream.get(); if(ch!='\n' || !istream.good()){ throw std::runtime_error(std::string("Expecting newline after vector size\n")); // TODO: This code may not be right for wchar.
+ vec.resize(sz);
+ for(size_t i=0;i<vec.size();i++) istream >> vec[i];
+ return istream;
+ }
+
+ template<class T> inline
+ std::ostream & operator << (std::ostream &ostream, const std::list<T> &lst){
+ ostream << lst.size() << std::endl;
+ typename std::list<T>::const_iterator it;
+ for(it = lst.begin(); it != lst.end(); it++)
+ ostream << *it << "\n"; // '\n' is necessary in case item is atomic e.g. a number.
+ return ostream;
+ }
+
+ template<class T> inline std::istream& operator >> (std::istream &istream, std::list<T> &lst){
+ size_t sz;
+ istream >> sz; if(!istream.good()){ throw std::runtime_error(std::string("Error reading to list of [something]: stream bad\n")); }
+ lst.resize(sz);
+ typename std::list<T>::iterator it;
+ for(it = lst.begin(); it != lst.end(); it++)
+ istream >> *it;
+ return istream;
+ }
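+
+ // Illustrative on-stream format used by the operators above: a
+ // std::vector<int> holding {10, 20, 30} is written as
+ //
+ //   3
+ //   10
+ //   20
+ //   30
+ //
+ // i.e. the element count followed by one element per line; the list
+ // operators use the same layout.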
+
+}; // namespace TNet
+
+
+using TNet::operator >>;
+using TNet::operator <<;
+
+
+# include "StkStream.tcc"
+
+// TNet_StkStream_h
+#endif
diff --git a/src/KaldiLib/.svn/text-base/StkStream.tcc.svn-base b/src/KaldiLib/.svn/text-base/StkStream.tcc.svn-base
new file mode 100644
index 0000000..e3de1ae
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/StkStream.tcc.svn-base
@@ -0,0 +1,228 @@
+#ifndef TNet_StkStream_tcc
+#define TNet_StkStream_tcc
+
+#include <cstring>
+#include <iostream>
+
+#include "Common.h"
+
+#pragma GCC system_header
+
+namespace TNet
+{
+
+ //******************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ basic_stkbuf<_CharT, _Traits> *
+ basic_stkbuf<_CharT, _Traits>::
+ close(void)
+ {
+ // we only want to close an opened file
+ if (this->is_open())
+ {
+ // we want to call the parent close() procedure
+ std::basic_filebuf<_CharT, _Traits>::close();
+
+ // and for different stream type we perform different closing
+ if (mStreamType == basic_stkbuf::t_file)
+ {
+ fclose(mpFilePtr);
+ }
+ else if (mStreamType == basic_stkbuf::t_pipe)
+ {
+ pclose(mpFilePtr);
+ }
+ else if (mStreamType == basic_stkbuf::t_stdio)
+ {
+
+ }
+
+ mpFilePtr = NULL;
+ mFilename = "";
+ mMode = std::ios_base::openmode(0);
+ mStreamType = basic_stkbuf::t_undef;
+ return this;
+ }
+ else
+ return 0;
+ }
+
+
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ void
+ basic_stkbuf<_CharT, _Traits>::
+ open_mode(std::ios_base::openmode __mode, int&, int&, char* __c_mode)
+ {
+ bool __testb = __mode & std::ios_base::binary;
+ bool __testi = __mode & std::ios_base::in;
+ bool __testo = __mode & std::ios_base::out;
+ bool __testt = __mode & std::ios_base::trunc;
+ bool __testa = __mode & std::ios_base::app;
+
+ if (!__testi && __testo && !__testt && !__testa)
+ strcpy(__c_mode, "w");
+ if (!__testi && __testo && !__testt && __testa)
+ strcpy(__c_mode, "a");
+ if (!__testi && __testo && __testt && !__testa)
+ strcpy(__c_mode, "w");
+ if (__testi && !__testo && !__testt && !__testa)
+ strcpy(__c_mode, "r");
+ if (__testi && __testo && !__testt && !__testa)
+ strcpy(__c_mode, "r+");
+ if (__testi && __testo && __testt && !__testa)
+ strcpy(__c_mode, "w+");
+ if (__testb)
+ strcat(__c_mode, "b");
+ }
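+
+ // Summary of the mapping implemented above:
+ //   out          -> "w"      out|app       -> "a"
+ //   out|trunc    -> "w"      in            -> "r"
+ //   in|out       -> "r+"     in|out|trunc  -> "w+"
+ // and "b" is appended when ios_base::binary is set.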
+
+
+ //******************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ basic_stkbuf<_CharT, _Traits> *
+ basic_stkbuf<_CharT, _Traits>::
+ open(const char* pFName, std::ios::openmode m, const char* pFilter)
+ {
+ basic_stkbuf<_CharT, _Traits>* p_ret = NULL;
+
+ if (NULL == pFName)
+ return NULL;
+
+ // we need to ensure that the stream is not open
+ if (!this->is_open())
+ {
+ char mstr[4] = {'\0', '\0', '\0', '\0'};
+ int __p_mode = 0;
+ int __rw_mode = 0;
+
+ // now we decide what kind of file we open
+ if (!strcmp(pFName,"-"))
+ {
+ if ((m & std::ios::in) && !(m & std::ios::out))
+ {
+ mpFilePtr = stdin;
+ mMode = std::ios::in;
+ mFilename = pFName;
+ mStreamType = t_stdio;
+ p_ret = this;
+ }
+ else if ((m & std::ios::out) && !(m & std::ios::in))
+ {
+ mpFilePtr = stdout;
+ mMode = std::ios::out;
+ mFilename = pFName;
+ mStreamType = t_stdio;
+ p_ret = this;
+ }
+ else
+ p_ret = NULL;
+ }
+ else if ( pFName[0] == '|' )
+ {
+ const char* command = pFName + 1;
+
+ if ((m & std::ios::in) && !(m & std::ios::out)) m = std::ios::in;
+ else if ((m & std::ios::out) && !(m & std::ios::in)) m = std::ios::out;
+ else return NULL;
+
+ // we need to make some conversion
+ // iostream -> stdio open mode string
+ this->open_mode(m, __p_mode, __rw_mode, mstr);
+
+ if ((mpFilePtr = popen(command, mstr)))
+ {
+ mFilename = command;
+ mMode = m;
+ mStreamType = t_pipe;
+ p_ret = this;
+ }
+ else
+ p_ret = 0;
+ }
+ else
+ {
+ // maybe we have a filter specified
+ if ( pFilter
+ && ('\0' != pFilter[0]))
+ {
+ char* command = ExpandHtkFilterCmd(pFilter, pFName, "$");
+
+ if ((m & std::ios::in) && !(m & std::ios::out)) m = std::ios::in;
+ else if ((m & std::ios::out) && !(m & std::ios::in)) m = std::ios::out;
+ else return NULL;
+
+ // we need to make some conversion
+ // iostream -> stdio open mode string
+ this->open_mode(m, __p_mode, __rw_mode, mstr);
+
+ if ((mpFilePtr = popen(command, mstr)))
+ {
+ mFilename = pFName;
+ mMode = m;
+ mStreamType = t_pipe;
+ p_ret = this;
+ }
+ else
+ p_ret = 0;
+ }
+ else // if (!filter.empty())
+ {
+ // we need to make some conversion
+ // iostream -> stdio open mode string
+ this->open_mode(m, __p_mode, __rw_mode, mstr);
+
+ if ((mpFilePtr = fopen(pFName, mstr)))
+ {
+ mFilename = pFName;
+ mMode = m;
+ mStreamType = t_file;
+ p_ret = this;
+ }
+ else {
+ p_ret = NULL;
+ }
+ }
+ }
+
+ // here we perform what the stdio_filebuf would do
+ if (p_ret) {
+ superopen(mpFilePtr, m);
+ }
+ } //if (!isopen)
+
+ return p_ret;
+ }
+
+ //******************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ void
+ basic_stkbuf<_CharT, _Traits>::
+ superopen(std::__c_file* __f, std::ios_base::openmode __mode,
+ size_t __size)
+ {
+ this->_M_file.sys_open(__f, __mode);
+ if (this->is_open())
+ {
+ this->_M_mode = __mode;
+ this->_M_buf_size = __size;
+ this->_M_allocate_internal_buffer();
+ this->_M_reading = false;
+ this->_M_writing = false;
+ this->_M_set_buffer(-1);
+ }
+ }
+}
+
+// TNet_StkStream_tcc
+#endif
diff --git a/src/KaldiLib/.svn/text-base/Timer.cc.svn-base b/src/KaldiLib/.svn/text-base/Timer.cc.svn-base
new file mode 100644
index 0000000..692969b
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Timer.cc.svn-base
@@ -0,0 +1,5 @@
+#include "Timer.h"
+
+/*
+TNet::Timer gTimer;
+*/
diff --git a/src/KaldiLib/.svn/text-base/Timer.h.svn-base b/src/KaldiLib/.svn/text-base/Timer.h.svn-base
new file mode 100644
index 0000000..b220b93
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Timer.h.svn-base
@@ -0,0 +1,103 @@
+#ifndef Timer_h
+#define Timer_h
+
+#include "Error.h"
+#include <sstream>
+
+
+
+#if defined(_WIN32) || defined(MINGW)
+
+# include <windows.h>
+
+namespace TNet
+{
+ class Timer {
+ public:
+ void
+ Start(void)
+ {
+ static int first = 1;
+
+ if(first) {
+ QueryPerformanceFrequency(&mFreq);
+ first = 0;
+ }
+ QueryPerformanceCounter(&mTStart);
+ }
+
+ void
+ End(void)
+ { QueryPerformanceCounter(&mTEnd); }
+
+ double
+ Val()
+ {
+ return ((double)mTEnd.QuadPart - (double)mTStart.QuadPart) /
+ ((double)mFreq.QuadPart);
+ }
+
+ private:
+ LARGE_INTEGER mTStart;
+ LARGE_INTEGER mTEnd;
+ LARGE_INTEGER mFreq;
+ };
+}
+
+#else
+
+# include <sys/time.h>
+# include <unistd.h>
+
+namespace TNet
+{
+ class Timer
+ {
+ public:
+ void
+ Start()
+ { gettimeofday(&this->mTStart, &mTz); }
+
+ void
+ End()
+ { gettimeofday(&mTEnd,&mTz); }
+
+ double
+ Val()
+ {
+ double t1, t2;
+
+ t1 = (double)mTStart.tv_sec + (double)mTStart.tv_usec/(1000*1000);
+ t2 = (double)mTEnd.tv_sec + (double)mTEnd.tv_usec/(1000*1000);
+ return t2-t1;
+ }
+
+ private:
+ struct timeval mTStart;
+ struct timeval mTEnd;
+ struct timezone mTz;
+ };
+}
+
+#endif
+
+
+
+
+
+
+
+///////////////////////////////////////////////////////////////
+// Macros for adding the time intervals to time accumulator
+#if PROFILING==1
+# define TIMER_START(timer) timer.Start()
+# define TIMER_END(timer,sum) timer.End(); sum += timer.Val()
+#else
+# define TIMER_START(timer)
+# define TIMER_END(timer,sum)
+#endif
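+
+// Usage sketch (illustrative only; DoWork() is a made-up function):
+//
+//   TNet::Timer timer;
+//   double total = 0.0;
+//   TIMER_START(timer);
+//   DoWork();
+//   TIMER_END(timer, total);  // with PROFILING==1 this adds the elapsed seconds to 'total'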
+
+#endif
+
+
+
diff --git a/src/KaldiLib/.svn/text-base/Tokenizer.cc.svn-base b/src/KaldiLib/.svn/text-base/Tokenizer.cc.svn-base
new file mode 100644
index 0000000..0c49050
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Tokenizer.cc.svn-base
@@ -0,0 +1,53 @@
+#include "Tokenizer.h"
+#include "string.h"
+
+namespace TNet
+{
+ //****************************************************************************
+ //****************************************************************************
+ void
+ Tokenizer::
+ AddString(const char* pString)
+ {
+ // copy into string struct, which is more convenient
+ std::string aux_string(pString);
+ std::string aux_record;
+ std::string::size_type cur_pos = 0;
+ std::string::size_type old_pos = 0;
+ std::string::size_type search_start = 0;
+
+ // make sure we have enough space
+ aux_record.reserve(aux_string.length());
+
+ // find all separators and make a list of tokens
+ while(old_pos < std::string::npos) {
+ // find the next separator
+ cur_pos = aux_string.find_first_of(mSeparator, search_start);
+
+ // if backslash is in front of separator, ignore this separator
+ if (cur_pos != 0 && cur_pos != std::string::npos &&
+ pString[cur_pos - 1] == '\\') {
+ search_start = cur_pos + 1;
+ continue;
+ }
+
+ // we don't want to have empty records
+ if (!(cur_pos == old_pos && mSkipEmpty)) {
+ // extract token
+ aux_record.insert(0, pString+old_pos, cur_pos==std::string::npos ? strlen(pString+old_pos) : cur_pos - old_pos);
+ // insert to list
+ this->push_back(aux_record);
+
+ // we don't need the contents of the token
+ aux_record.erase();
+ }
+
+ // update old position so that it points behind the separator
+ old_pos = cur_pos < std::string::npos ? cur_pos + 1 : cur_pos;
+ search_start = old_pos;
+ }
+ }
+
+
+} // namespace TNet
+
diff --git a/src/KaldiLib/.svn/text-base/Tokenizer.h.svn-base b/src/KaldiLib/.svn/text-base/Tokenizer.h.svn-base
new file mode 100644
index 0000000..1be717b
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Tokenizer.h.svn-base
@@ -0,0 +1,45 @@
+#include <list>
+#include <string>
+
+namespace TNet {
+ /**
+ * @brief General string tokenizer
+ */
+ class Tokenizer
+ : public std::list<std::string>
+ {
+ public:
+ // Constructors and Destructors ............................................
+ Tokenizer(const char* pSeparator, bool skipEmpty = false)
+ : std::list<std::string>(), mSeparator(pSeparator), mSkipEmpty(skipEmpty)
+ {}
+
+ Tokenizer(const char* pString, const char* pSeparator, bool skipEmpty = false)
+ : std::list<std::string>(), mSeparator(pSeparator), mSkipEmpty(skipEmpty)
+ { AddString(pString); }
+
+ ~Tokenizer()
+ {}
+
+ /**
+ * @brief Parses a string and appends the tokens to the list
+ * @param pString string to parse
+ */
+ void
+ AddString(const char* pString);
+
+ /**
+ * @brief Constant accessor to the separators string
+ * @return Const reference
+ */
+ const std::string&
+ Separator() const
+ {return mSeparator;}
+
+ private:
+ std::string mSeparator; ///< holds the list of separators
+ bool mSkipEmpty; ///< if true, multiple separators will be regarded as one
+ }; // class Tokenizer
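+
+ // Usage sketch (illustrative only; the input string is made up):
+ //
+ //   Tokenizer tok("foo:bar::baz", ":", true);   // skipEmpty merges the "::"
+ //   // tok is a std::list<std::string> holding {"foo", "bar", "baz"}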
+} // namespace TNet
+
+
diff --git a/src/KaldiLib/.svn/text-base/Types.h.svn-base b/src/KaldiLib/.svn/text-base/Types.h.svn-base
new file mode 100644
index 0000000..6a5bfac
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Types.h.svn-base
@@ -0,0 +1,78 @@
+#ifndef TNet_Types_h
+#define TNet_Types_h
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+ #include <clapack.h>
+}
+#endif
+
+
+namespace TNet
+{
+ // TYPEDEFS ..................................................................
+#if DOUBLEPRECISION
+ typedef double BaseFloat;
+#else
+ typedef float BaseFloat;
+#endif
+
+#ifndef UINT_16
+ typedef unsigned short UINT_16 ;
+ typedef unsigned UINT_32 ;
+ typedef short INT_16 ;
+ typedef int INT_32 ;
+ typedef float FLOAT_32 ;
+ typedef double DOUBLE_64 ;
+#endif
+
+
+
+ // ...........................................................................
+ // The following declaration assumes that SSE instructions are enabled
+ // and that we are using GNU C/C++ compiler, which defines the __attribute__
+ // notation.
+ //
+ // ENABLE_SSE is defined in <config.h>. Its value depends on options given
+ // in the configure phase of building the library
+#if defined(__GNUC__ )
+ // vector of four single floats
+ typedef float v4sf __attribute__((vector_size(16)));
+ // vector of two single doubles
+ typedef double v2sd __attribute__((vector_size(16)));
+
+ typedef BaseFloat BaseFloat16Aligned __attribute__((aligned(16))) ;
+
+ typedef union
+ {
+ v4sf v;
+ float f[4];
+ } f4vector;
+
+ typedef union
+ {
+ v2sd v;
+ double f[2];
+ } d2vector;
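+
+ // Usage sketch (illustrative only):
+ //
+ //   f4vector a, b, c;
+ //   // ... fill a.f[0..3] and b.f[0..3] ...
+ //   c.v = a.v + b.v;   // one packed addition of four floats (GCC vector extension)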
+#endif // defined(__GNUC__)
+
+
+
+ typedef enum
+ {
+#ifdef HAVE_ATLAS
+ TRANS = CblasTrans,
+ NO_TRANS = CblasNoTrans
+#else
+ TRANS = 'T',
+ NO_TRANS = 'N'
+#endif
+ } MatrixTrasposeType;
+
+
+
+} // namespace TNet
+
+#endif // #ifndef TNet_Types_h
+
diff --git a/src/KaldiLib/.svn/text-base/UserInterface.cc.svn-base b/src/KaldiLib/.svn/text-base/UserInterface.cc.svn-base
new file mode 100644
index 0000000..b59a6c5
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/UserInterface.cc.svn-base
@@ -0,0 +1,669 @@
+#include <stdexcept>
+#include <sstream>
+#include <stdarg.h>
+
+#include "UserInterface.h"
+#include "StkStream.h"
+#include "Features.h"
+
+namespace TNet
+{
+ //***************************************************************************
+ //***************************************************************************
+ int
+ npercents(const char *str)
+ {
+ int ret = 0;
+ while (*str) if (*str++ == '%') ret++;
+ return ret;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ ReadConfig(const char *file_name)
+ {
+ std::string line_buf;
+ std::string::iterator chptr;
+ std::string key;
+ std::string value;
+ std::ostringstream ss;
+ int line_no = 0;
+ IStkStream i_stream;
+
+
+ i_stream.open(file_name, std::ios::binary);
+ if (!i_stream.good()) {
+ throw std::runtime_error(std::string("Cannot open input config file ")
+ + file_name);
+ }
+ i_stream >> std::ws;
+
+ while (!i_stream.eof()) {
+ size_t i_pos;
+
+ // read line
+ std::getline(i_stream, line_buf);
+ i_stream >> std::ws;
+
+ if (i_stream.fail()) {
+ throw std::runtime_error(std::string("Error reading (")
+ + file_name + ":" + (ss << line_no,ss).str() + ")");
+ }
+
+ // increase line counter
+ line_no++;
+
+ // cut comments
+ if (std::string::npos != (i_pos = line_buf.find('#'))) {
+ line_buf.erase(i_pos);
+ }
+
+ // cut leading and trailing spaces
+ Trim(line_buf);
+
+ // if empty line, then skip it
+ if (0 == line_buf.length()) {
+ continue;
+ }
+
+ // line = line_buf.c_str();
+ // chptr = parptr;
+
+ chptr = line_buf.begin();
+
+ for (;;) {
+ // Replace spaces by '_', which is removed in InsertConfigParam
+ while (isalnum(*chptr) || *chptr == '_' || *chptr == '-') {
+ chptr++;
+ }
+
+ while (std::isspace(*chptr)) {
+ *chptr = '_';
+ chptr++;
+ }
+
+ if (*chptr != ':') {
+ break;
+ }
+
+ chptr++;
+
+ while (std::isspace(*chptr)) {
+ *chptr = '_';
+ chptr++;
+ }
+ }
+
+ if (*chptr != '=') {
+ throw std::runtime_error(std::string("Character '=' expected (")
+ + file_name + ":" + (ss.str(""),ss<<line_no,ss).str() + ")");
+ }
+
+ key.assign(line_buf.begin(), chptr);
+
+ chptr++;
+
+ value.assign(chptr, line_buf.end());
+
+ ParseHTKString(value, value);
+ InsertConfigParam(key.c_str(), value.c_str(), 'C');
+ }
+
+ i_stream.close();
+ }
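+
+ // Illustrative config file accepted by ReadConfig() above (the tool name
+ // and values are made-up examples; TARGETKIND and STARTFRMEXT are parameter
+ // names used by GetFeatureParams() below):
+ //
+ //   # lines starting with '#' are comments
+ //   TARGETKIND           = MFCC_0_D_A
+ //   SOMETOOL:STARTFRMEXT = 15
+ //
+ // Keys are upper-cased and '-'/'_' are stripped in InsertConfigParam(),
+ // so "target-kind" and "TARGETKIND" refer to the same parameter.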
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ InsertConfigParam(const char *pParamName, const char *value, int optionChar)
+ {
+ std::string key(pParamName);
+ std::string::iterator i_key = key.begin();
+
+ while (i_key != key.end()) {
+ if (*i_key == '-' || *i_key == '_') {
+ i_key = key.erase(i_key);
+ }
+ else {
+ *i_key = toupper(*i_key);
+ i_key ++;
+ }
+ }
+
+ mMap[key].mValue = value;
+ mMap[key].mRead = false;
+ mMap[key].mOption = optionChar;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ UserInterface::
+ ParseOptions(
+ int argc,
+ char* argv[],
+ const char* pOptionMapping,
+ const char* pToolName)
+ {
+ int i;
+ int opt = '?';
+ int optind;
+ bool option_must_follow = false;
+ char param[1024];
+ char* value;
+ const char* optfmt;
+ const char* optarg;
+ char* chptr;
+ char* bptr;
+ char tstr[4] = " -?";
+ unsigned long long option_mask = 0;
+ std::ostringstream ss;
+
+ #define MARK_OPTION(ch) {if (isalpha(ch)) option_mask |= 1ULL << ((ch) - 'A');}
+ #define OPTION_MARK(ch) (isalpha(ch) && ((1ULL << ((ch) - 'A')) & option_mask))
+ #define IS_OPTION(str) ((str)[0] == '-' && (isalpha((str)[1]) || (str)[1] == '-'))
+
+ //search for the -A param
+ for (optind = 1; optind < argc; optind++) {
+ // we found "--", no -A
+ if (!strcmp(argv[optind], "--")) {
+ break;
+ }
+
+ //repeat till we find -A
+ if (argv[optind][0] != '-' || argv[optind][1] != 'A') {
+ continue;
+ }
+
+ // only the bare "-A" form is allowed
+ if (argv[optind][2] != '\0') {
+ throw std::runtime_error(std::string("Unexpected argument '")
+ + (argv[optind] + 2) + "' after option '-A'");
+ }
+
+ for (i=0; i < argc; i++) {
+ // display all params
+ if(strchr(argv[i], ' ') || strchr(argv[i], '*'))
+ std::cout << '\'' << argv[i] << '\'' << " ";
+ else std::cout << argv[i] << " ";
+ }
+
+ std::cout << std::endl;
+
+ break;
+ }
+
+ for (optind = 1; optind < argc; optind++) {
+ // find the '-C' parameter (there may be more than one config)
+ if (!strcmp(argv[optind], "--")) break;
+ if (argv[optind][0] != '-' || argv[optind][1] != 'C') continue;
+ if (argv[optind][2] != '\0') {
+ ReadConfig(argv[optind] + 2);
+ } else if (optind+1 < argc && !IS_OPTION(argv[optind+1])) {
+ ReadConfig(argv[++optind]);
+ } else {
+ throw std::runtime_error("Config file name expected after option '-C'");
+ }
+ }
+
+ for (optind = 1; optind < argc; optind++) {
+ if (!strcmp(argv[optind], "--")) break;
+ if (argv[optind][0] != '-' || argv[optind][1] != '-') continue;
+
+ bptr = new char[strlen(pToolName) + strlen(argv[optind]+2) + 2];
+ strcat(strcat(strcpy(bptr, pToolName), ":"), argv[optind]+2);
+ value = strchr(bptr, '=');
+ if (!value) {
+ throw std::runtime_error(std::string("Character '=' expected after option '")
+ + argv[optind] + "'");
+ }
+
+ *value++ = '\0';
+
+ InsertConfigParam(bptr, value /*? value : "TRUE"*/, '-');
+ delete [] bptr;
+ }
+
+ for (optind = 1; optind < argc && IS_OPTION(argv[optind]); optind++) {
+ option_must_follow = false;
+ tstr[2] = opt = argv[optind][1];
+ optarg = argv[optind][2] != '\0' ? argv[optind] + 2 : NULL;
+
+ if (opt == '-' && !optarg) { // '--' terminates the option list
+ return optind+1;
+ }
+ if (opt == 'C' || opt == '-') { // C, A and long options have already been processed
+ if (!optarg) optind++;
+ continue;
+ }
+ if (opt == 'A') continue;
+
+ chptr = strstr((char*)pOptionMapping, tstr);
+ if (chptr == NULL) {
+ throw std::runtime_error(std::string("Invalid command line option '-")
+ + static_cast<char>(opt) + "'");
+ }
+
+ chptr += 3;
+ while (std::isspace(*chptr)) {
+ chptr++;
+ }
+
+ if (!chptr || chptr[0] == '-') {// Option without format string will be ignored
+ optfmt = " ";
+ } else {
+ optfmt = chptr;
+ while (*chptr && !std::isspace(*chptr)) {
+ chptr++;
+ }
+ if (!*chptr) {
+ throw std::runtime_error("Fatal: Unexpected end of optionMap string");
+ }
+ }
+ for (i = 0; !std::isspace(*optfmt); optfmt++) {
+ while (std::isspace(*chptr)) chptr++;
+ value = chptr;
+ while (*chptr && !std::isspace(*chptr)) chptr++;
+ assert(static_cast<unsigned int>(chptr-value+1) < sizeof(param));
+ strncat(strcat(strcpy(param, pToolName), ":"), value, chptr-value);
+ param[chptr-value+strlen(pToolName)+1] = '\0';
+ switch (*optfmt) {
+ case 'n':
+ value = strchr(param, '=');
+ if (value) *value = '\0';
+ InsertConfigParam(param,
+ value ? value + 1: "TRUE", opt);
+ break;
+
+ case 'l':
+ case 'o':
+ case 'r':
+ i++;
+ if (!optarg && (optind+1==argc || IS_OPTION(argv[optind+1]))) {
+ if (*optfmt == 'r' || *optfmt == 'l') {
+ throw std::runtime_error(std::string("Argument ")
+ + (ss<<i,ss).str() + " of option '-"
+ + static_cast<char>(opt) + "' expected");
+ }
+ optfmt = " "; // Stop reading option arguments
+ break;
+ }
+ if (!optarg) optarg = argv[++optind];
+ if (*optfmt == 'o') {
+ option_must_follow = (bool) 1;
+ }
+ bptr = NULL;
+
+ // For repeated use of option with 'l' (list) format, append
+ // ',' and argument string to existing config parameter value.
+ if (*optfmt == 'l' && OPTION_MARK(opt)) {
+ bptr = strdup(GetStr(param, ""));
+ if (bptr == NULL) throw std::runtime_error("Insufficient memory");
+ bptr = (char*) realloc(bptr, strlen(bptr) + strlen(optarg) + 2);
+ if (bptr == NULL) throw std::runtime_error("Insufficient memory");
+ strcat(strcat(bptr, ","), optarg);
+ optarg = bptr;
+ }
+ MARK_OPTION(opt);
+ InsertConfigParam(param, optarg, opt);
+ free(bptr);
+ optarg = NULL;
+ break;
+
+ default :
+ throw std::runtime_error(std::string("Fatal: Invalid character '")
+ + *optfmt + "' in optionMap after " + tstr);
+ }
+ }
+ if (optarg) {
+ throw std::runtime_error(std::string("Unexpected argument '")
+ + optarg + "' after option '-"
+ + static_cast<char>(opt) + "'");
+ }
+ }
+
+ for (i = optind; i < argc && !IS_OPTION(argv[i]); i++)
+ {}
+
+ if (i < argc) {
+ throw std::runtime_error(std::string("No option expected after first non-option argument '")
+ + argv[optind] + "'");
+ }
+
+ if (option_must_follow) {
+ throw std::runtime_error(std::string("Option '-")
+ + static_cast<char>(opt)
+ + "' with optional argument must not be the last option");
+ }
+
+ return optind;
+ }
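+
+ // Note on pOptionMapping (illustrative; the map entry below is made up):
+ // the map is scanned for " -X" followed by a format string and the config
+ // parameter name(s) the option maps to. Format characters, as handled in
+ // the switch above:
+ //   'n' ... no argument; the named parameter is set to TRUE, or to the
+ //           value given after '=' in the map entry
+ //   'r' ... required argument
+ //   'o' ... optional argument; such an option must not be the last one
+ //           on the command line
+ //   'l' ... list argument; repeated use appends with ',' to the value
+ // A hypothetical entry " -d r SOURCEDIR" would store the argument of -d
+ // under <ToolName>:SOURCEDIR.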
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ UserInterface::
+ GetFeatureParams(
+ int * derivOrder,
+ int ** derivWinLens,
+ int * startFrmExt,
+ int * endFrmExt,
+ char ** CMNPath,
+ char ** CMNFile,
+ const char ** CMNMask,
+ char ** CVNPath,
+ char ** CVNFile,
+ const char ** CVNMask,
+ const char ** CVGFile,
+ const char * pToolName,
+ int pseudoModule)
+ {
+ const char * str;
+ int targetKind;
+ char * chrptr;
+ char paramName[32];
+ const char * CMNDir;
+ const char * CVNDir;
+
+ strcpy(paramName, pToolName);
+ strcat(paramName, pseudoModule == 1 ? "SPARM1:" :
+ pseudoModule == 2 ? "SPARM2:" : "");
+
+ chrptr = paramName + strlen(paramName);
+
+ strcpy(chrptr, "STARTFRMEXT");
+ *startFrmExt = GetInt(paramName, 0);
+ strcpy(chrptr, "ENDFRMEXT");
+ *endFrmExt = GetInt(paramName, 0);
+
+ *CMNPath = *CVNPath = NULL;
+ strcpy(chrptr, "CMEANDIR");
+ CMNDir = GetStr(paramName, NULL);
+ strcpy(chrptr, "CMEANMASK");
+ *CMNMask = GetStr(paramName, NULL);
+
+ if (*CMNMask != NULL) {
+ *CMNPath = (char*) malloc((CMNDir ? strlen(CMNDir) : 0) + npercents(*CMNMask) + 2);
+ if (*CMNPath == NULL) throw std::runtime_error("Insufficient memory");
+ if (CMNDir != NULL) strcat(strcpy(*CMNPath, CMNDir), "/");
+ *CMNFile = *CMNPath + strlen(*CMNPath);
+ }
+ strcpy(chrptr, "VARSCALEDIR");
+ CVNDir = GetStr(paramName, NULL);
+ strcpy(chrptr, "VARSCALEMASK");
+ *CVNMask = GetStr(paramName, NULL);
+
+
+ if (*CVNMask != NULL) {
+ *CVNPath = (char*) malloc((CVNDir ? strlen(CVNDir) : 0) + npercents(*CVNMask) + 2);
+ if (*CVNPath == NULL) throw std::runtime_error("Insufficient memory");
+ if (CVNDir != NULL) strcat(strcpy(*CVNPath, CVNDir), "/");
+ *CVNFile = *CVNPath + strlen(*CVNPath);
+ }
+ strcpy(chrptr, "VARSCALEFN");
+ *CVGFile = GetStr(paramName, NULL);
+ strcpy(chrptr, "TARGETKIND");
+ str = GetStr(paramName, "ANON");
+
+ targetKind = FeatureRepository::ReadParmKind(str, false);
+
+ if (targetKind == -1) {
+ throw std::runtime_error(std::string("Invalid TARGETKIND = '")
+ + str + "'");
+ }
+
+ strcpy(chrptr, "DERIVWINDOWS");
+ if ((str = GetStr(paramName, NULL)) != NULL) {
+ long lval;
+ *derivOrder = 0;
+ *derivWinLens = NULL;
+
+ if (NULL != str)
+ {
+ while ((str = strtok((char *) str, " \t_")) != NULL)
+ {
+ lval = strtol(str, &chrptr, 0);
+ if (!*str || *chrptr) {
+ throw std::runtime_error("Integers separated by '_' expected for parameter DERIVWINDOWS");
+ }
+ *derivWinLens = (int *)realloc(*derivWinLens, ++*derivOrder*sizeof(int));
+ if (*derivWinLens == NULL) throw std::runtime_error("Insufficient memory");
+ (*derivWinLens)[*derivOrder-1] = lval;
+ str = NULL;
+ }
+ }
+
+ return targetKind;
+ }
+ *derivOrder = targetKind & PARAMKIND_T ? 3 :
+ targetKind & PARAMKIND_A ? 2 :
+ targetKind & PARAMKIND_D ? 1 : 0;
+
+ if (*derivOrder || targetKind != PARAMKIND_ANON) {
+ *derivWinLens = (int *) malloc(3 * sizeof(int));
+ if (*derivWinLens == NULL) throw std::runtime_error("Insufficient memory");
+
+ strcpy(chrptr, "DELTAWINDOW");
+ (*derivWinLens)[0] = GetInt(paramName, 2);
+ strcpy(chrptr, "ACCWINDOW");
+ (*derivWinLens)[1] = GetInt(paramName, 2);
+ strcpy(chrptr, "THIRDWINDOW");
+ (*derivWinLens)[2] = GetInt(paramName, 2);
+ return targetKind;
+ }
+ *derivWinLens = NULL;
+ *derivOrder = -1;
+ return targetKind;
+ }
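+
+ // Example (the TARGETKIND string is illustrative and assumes the usual
+ // HTK-style qualifier parsing in FeatureRepository::ReadParmKind):
+ // "MFCC_D_A" sets the _D and _A qualifier bits, so *derivOrder becomes 2
+ // and the delta/acceleration window lengths are read from the DELTAWINDOW
+ // and ACCWINDOW parameters (default 2 each).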
+
+
+ //***************************************************************************
+ //***************************************************************************
+ UserInterface::ValueRecord*
+ UserInterface::
+ GetParam(const char* pParamName)
+ {
+ MapType::iterator it;
+
+ // this is done only for convenience. in the loop we will increase the
+ // pointer again
+ pParamName--;
+
+ // we iteratively try to find the param name in the map. if an attempt
+ // fails, we strip off all characters until the first ':' and we search
+ // again
+ do {
+ pParamName++;
+ it = mMap.find(pParamName);
+ } while ((it == mMap.end()) && (NULL != (pParamName = strchr(pParamName, ':'))));
+
+ if (it == mMap.end()) {
+ return NULL;
+ }
+ else {
+ it->second.mRead = true;
+ return &(it->second);
+ }
+ }
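+
+ // Lookup sketch (the parameter name is illustrative only): a query for
+ // "MYTOOL:SPARM1:TARGETKIND" is tried against the keys in mMap in this order:
+ //   "MYTOOL:SPARM1:TARGETKIND" -> "SPARM1:TARGETKIND" -> "TARGETKIND"
+ // i.e. tool- and module-qualified names fall back to the unqualified name.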
+
+
+ //***************************************************************************
+ //***************************************************************************
+ const char *
+ UserInterface::
+ GetStr(
+ const char * pParamName,
+ const char * default_value)
+ {
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+ else {
+ return p_val->mValue.c_str();
+ }
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ long
+ UserInterface::
+ GetInt(
+ const char *pParamName,
+ long default_value)
+ {
+ char *chrptr;
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char *val = p_val->mValue.c_str();
+ default_value = strtol(val, &chrptr, 0);
+ if (!*val || *chrptr) {
+ throw std::runtime_error(std::string("Integer number expected for ")
+ + pParamName + " but found '" + val + "'");
+ }
+ return default_value;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ float
+ UserInterface::
+ GetFlt(
+ const char * pParamName,
+ float default_value)
+ {
+ char *chrptr;
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char *val = p_val->mValue.c_str();
+ default_value = strtod(val, &chrptr);
+ if (!*val || *chrptr) {
+ throw std::runtime_error(std::string("Decimal number expected for ")
+ + pParamName + " but found '" + val + "'");
+ }
+ return default_value;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ UserInterface::
+ GetBool(
+ const char * pParamName,
+ bool default_value)
+ {
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char* val = p_val->mValue.c_str();
+
+ if (!strcasecmp(val, "TRUE") || !strcmp(val, "T")) return true;
+ if (strcasecmp(val, "FALSE") && strcmp(val, "F")) {
+ throw std::runtime_error(std::string("TRUE or FALSE expected for ")
+ + pParamName + " but found '" + val + "'");
+ }
+ return false;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ // '...' are pairs: a string and its corresponding integer value, terminated by NULL
+ int
+ UserInterface::
+ GetEnum(
+ const char * pParamName,
+ int default_value,
+ ...)
+ {
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char* val = p_val->mValue.c_str();
+ char* s;
+ int i = 0, cnt = 0, l = 0;
+ va_list ap;
+
+ va_start(ap, default_value);
+ while ((s = va_arg(ap, char *)) != NULL) {
+ l += strlen(s) + 2;
+ ++cnt;
+ i = va_arg(ap, int);
+ if (!strcmp(val, s)) break;
+ }
+ va_end(ap);
+
+ if (s) {
+ return i;
+ }
+
+ //To report error, create string listing all possible values
+ s = (char*) malloc(l + 1);
+ if (s == NULL) throw std::runtime_error("Insufficient memory");
+ s[0] = '\0';
+ va_start(ap, default_value);
+ for (i = 0; i < cnt; i++) {
+ strcat(s, va_arg(ap, char *));
+ va_arg(ap, int);
+ if (i < cnt - 2) strcat(s, ", ");
+ else if (i == cnt - 2) strcat(s, " or ");
+ }
+
+ va_end(ap);
+
+ throw std::runtime_error(std::string(s) + " expected for "
+ + pParamName + " but found '" + val + "'");
+
+ return 0;
+ }
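+
+ // Usage sketch (the parameter name, the value set and the `ui` instance are
+ // illustrative assumptions only):
+ //
+ //   UserInterface ui;
+ //   int accType = ui.GetEnum("MYTOOL:ACCUMTYPE", 0,
+ //                            "NONE", 0, "MMI", 1, "MPE", 2, NULL);
+ //
+ // The variadic tail is a NULL-terminated list of (string, int) pairs: the
+ // integer paired with the matching string is returned, the default is
+ // returned when the parameter is absent, and otherwise an exception listing
+ // the allowed strings is thrown.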
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ PrintConfig(std::ostream& rStream)
+ {
+ rStream << "Configuration Parameters[" << mMap.size() << "]\n";
+ for (MapType::iterator it = mMap.begin(); it != mMap.end(); ++it) {
+ rStream << (it->second.mRead ? " " : "# ")
+ << std::setw(35) << std::left << it->first << " = "
+ << std::setw(30) << std::left << it->second.mValue
+ << " # -" << it->second.mOption << std::endl;
+ }
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ CheckCommandLineParamUse()
+ {
+ for (MapType::iterator it = mMap.begin(); it != mMap.end(); ++it) {
+ if (!it->second.mRead && it->second.mOption != 'C') {
+ Error("Unexpected command line parameter " + it->first);
+ }
+ }
+ }
+
+}
diff --git a/src/KaldiLib/.svn/text-base/UserInterface.h.svn-base b/src/KaldiLib/.svn/text-base/UserInterface.h.svn-base
new file mode 100644
index 0000000..fa189e7
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/UserInterface.h.svn-base
@@ -0,0 +1,166 @@
+#ifndef TNet_UserInterface_h
+#define TNet_UserInterface_h
+
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <map>
+
+namespace TNet
+{
+ /** **************************************************************************
+ ** **************************************************************************
+ */
+ class UserInterface
+ {
+ public:
+ struct ValueRecord {
+ std::string mValue;
+ char mOption;
+ bool mRead;
+ };
+
+
+ void InsertConfigParam(
+ const char *param_name,
+ const char *value,
+ int optionChar);
+
+
+ void
+ ReadConfig(const char *pFileName);
+
+
+ void
+ CheckCommandLineParamUse();
+
+
+ /**
+ * @brief Retrieves the content of a parameter
+ * @param pParamName Name of the parameter to look for
+ * @return Returns a pointer to the ValueRecord structure on success,
+ * otherwise NULL
+ *
+ * We iteratively try to find the param name in the map. If an attempt
+ * fails, we strip off all characters until the first occurrence of ':'
+ * and we search again.
+ */
+ ValueRecord*
+ GetParam(const char* pParamName);
+
+
+ /**
+ * @brief Returns the parameter's value as string
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Pointer to the beginning of the string on success, default_value
+ * otherwise
+ */
+ const char*
+ GetStr( const char *param_name, const char *default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as int
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the integer value on success, default_value
+ * otherwise
+ */
+ long
+ GetInt( const char *param_name, long default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as float
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the float value on success, default_value
+ * otherwise
+ */
+ float
+ GetFlt( const char *param_name, float default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as bool
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the bool value on success, default_value otherwise
+ *
+ * Note that true is returned if the value is 'TRUE' or 'T', and false is
+ * returned if the value is 'FALSE' or 'F'. Otherwise an exception is thrown.
+ */
+ bool
+ GetBool(const char *param_name, bool default_value);
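+
+ /* Usage sketch for the typed getters (the parameter names and the `ui`
+  * instance are illustrative assumptions only):
+  *
+  *   const char* dir  = ui.GetStr ("MYTOOL:SOURCEDIR",   ".");
+  *   long        ext  = ui.GetInt ("MYTOOL:STARTFRMEXT", 0);
+  *   float       lr   = ui.GetFlt ("MYTOOL:LEARNRATE",   0.008f);
+  *   bool        xval = ui.GetBool("MYTOOL:CROSSVAL",    false);
+  *
+  * Each getter returns its default when the parameter is absent; GetInt,
+  * GetFlt and GetBool throw std::runtime_error when the stored string
+  * cannot be parsed as the requested type.
+  */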
+
+
+ /**
+ * @brief Returns the parameter's value as enum integer
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the matching integer value on success, default_value
+ * otherwise
+ *
+ * Variable arguments specify the possible values of this parameter as
+ * (string, int) pairs terminated by NULL. If the value does not match any
+ * of these, an exception is thrown.
+ */
+ int
+ GetEnum( const char *param_name, int default_value, ...);
+
+
+ int GetFeatureParams(
+ int *derivOrder,
+ int **derivWinLens,
+ int *startFrmExt,
+ int *endFrmExt,
+ char **CMNPath,
+ char **CMNFile,
+ const char **CMNMask,
+ char **CVNPath,
+ char **CVNFile,
+ const char **CVNMask,
+ const char **CVGFile,
+ const char *toolName,
+ int pseudoModule);
+
+
+ int ParseOptions(
+ int argc,
+ char* argv[],
+ const char* optionMapping,
+ const char* toolName);
+
+
+ /**
+ * @brief Sends the defined parameters to a stream
+ *
+ * @param rStream stream to use
+ */
+ void
+ PrintConfig(std::ostream& rStream);
+
+ public:
+ typedef std::map<std::string, ValueRecord> MapType;
+ MapType mMap;
+ };
+}
+
+#endif
+
diff --git a/src/KaldiLib/.svn/text-base/Vector.cc.svn-base b/src/KaldiLib/.svn/text-base/Vector.cc.svn-base
new file mode 100644
index 0000000..020bae2
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Vector.cc.svn-base
@@ -0,0 +1,110 @@
+#ifndef TNet_Vector_cc
+#define TNet_Vector_cc
+
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include "Common.h"
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+}
+#endif
+
+#include "Common.h"
+#include "Matrix.h"
+#include "Vector.h"
+
+namespace TNet
+{
+
+#ifdef HAVE_ATLAS
+ template<>
+ float
+ BlasDot<>(const Vector<float>& rA, const Vector<float>& rB)
+ {
+ assert(rA.mDim == rB.mDim);
+ return cblas_sdot(rA.mDim, rA.pData(), 1, rB.pData(), 1);
+ }
+
+ template<>
+ double
+ BlasDot<>(const Vector<double>& rA, const Vector<double>& rB)
+ {
+ assert(rA.mDim == rB.mDim);
+ return cblas_ddot(rA.mDim, rA.pData(), 1, rB.pData(), 1);
+ }
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasAxpy(const float alpha, const Vector<float>& rV)
+ {
+ assert(mDim == rV.mDim);
+ cblas_saxpy(mDim, alpha, rV.pData(), 1, mpData, 1);
+ return *this;
+ }
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasAxpy(const double alpha, const Vector<double>& rV)
+ {
+ assert(mDim == rV.mDim);
+ cblas_daxpy(mDim, alpha, rV.pData(), 1, mpData, 1);
+ return *this;
+ }
+
+ template<>
+ Vector<int>&
+ Vector<int>::
+ BlasAxpy(const int alpha, const Vector<int>& rV)
+ {
+ assert(mDim == rV.mDim);
+ for(int i=0; i<Dim(); i++) {
+ (*this)[i] += alpha * rV[i]; // axpy semantics: this += alpha * rV
+ }
+ return *this;
+ }
+
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasGemv(const float alpha, const Matrix<float>& rM, MatrixTrasposeType trans, const Vector<float>& rV, const float beta)
+ {
+ assert((trans == NO_TRANS && rM.Cols() == rV.mDim && rM.Rows() == mDim)
+ || (trans == TRANS && rM.Rows() == rV.mDim && rM.Cols() == mDim));
+
+ cblas_sgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), rM.Rows(), rM.Cols(), alpha, rM.pData(), rM.Stride(),
+ rV.pData(), 1, beta, mpData, 1);
+ return *this;
+ }
+
+
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasGemv(const double alpha, const Matrix<double>& rM, MatrixTrasposeType trans, const Vector<double>& rV, const double beta)
+ {
+ assert((trans == NO_TRANS && rM.Cols() == rV.mDim && rM.Rows() == mDim)
+ || (trans == TRANS && rM.Rows() == rV.mDim && rM.Cols() == mDim));
+
+ cblas_dgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), rM.Rows(), rM.Cols(), alpha, rM.pData(), rM.Stride(),
+ rV.pData(), 1, beta, mpData, 1);
+ return *this;
+ }
+
+
+#else
+ #error Routines in this section are not implemented yet without BLAS
+#endif
+
+} // namespace TNet
+
+
+#endif // TNet_Vector_cc
diff --git a/src/KaldiLib/.svn/text-base/Vector.h.svn-base b/src/KaldiLib/.svn/text-base/Vector.h.svn-base
new file mode 100644
index 0000000..384c5d2
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Vector.h.svn-base
@@ -0,0 +1,496 @@
+//
+// C++ Interface: %{MODULE}
+//
+// Description:
+//
+//
+// Author: %{AUTHOR} <%{EMAIL}>, (C) %{YEAR}
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef TNet_Vector_h
+#define TNet_Vector_h
+
+#include <cstddef>
+#include <cstdlib>
+#include <stdexcept>
+#include <iostream>
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+ #include <clapack.h>
+}
+#endif
+
+#include "Common.h"
+#include "MathAux.h"
+#include "Types.h"
+#include "Error.h"
+
+namespace TNet
+{
+ template<typename _ElemT> class Vector;
+ template<typename _ElemT> class SubVector;
+ template<typename _ElemT> class Matrix;
+ template<typename _ElemT> class SpMatrix;
+
+ // we need to declare the friend functions here
+ template<typename _ElemT>
+ std::ostream & operator << (std::ostream & rOut, const Vector<_ElemT> & rV);
+
+ template<typename _ElemT>
+ std::istream & operator >> (std::istream & rIn, Vector<_ElemT> & rV);
+
+ template<typename _ElemT>
+ _ElemT
+ BlasDot(const Vector<_ElemT>& rA, const Vector<_ElemT>& rB);
+
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Provides a vector abstraction class
+ *
+ * This class provides a way to work with vectors in TNet.
+ * It encapsulates basic operations and memory optimizations.
+ *
+ */
+ template<typename _ElemT>
+ class Vector
+ {
+ public:
+
+ /// defines a type of this
+ typedef Vector<_ElemT> ThisType;
+
+
+ Vector(): mpData(NULL)
+#ifdef STK_MEMALIGN_MANUAL
+ ,mpFreeData(NULL)
+#endif
+ , mDim(0)
+ {}
+
+ /**
+ * @brief Copy constructor
+ * @param rV
+ */
+ Vector(const Vector<_ElemT>& rV)
+ { mpData=NULL; Init(rV.Dim()); Copy(rV); }
+
+
+ /* Type conversion constructor. */
+ template<typename _ElemU>
+ explicit Vector(const Vector<_ElemU>& rV)
+ { mpData=NULL; Init(rV.Dim()); Copy(rV); }
+
+
+ Vector(const _ElemT* ppData, const size_t s)
+ { mpData=NULL; Init(s); Copy(ppData); }
+
+ explicit Vector(const size_t s, bool clear=true)
+ { mpData=NULL; Init(s,clear); }
+
+ ~Vector()
+ { Destroy(); }
+
+ Vector<_ElemT> &operator = (const Vector <_ElemT> &other)
+ { if (this != &other) { Init(other.Dim()); Copy(other); } return *this; } // Needed for inclusion in std::vector; guards against self-assignment
+
+ Vector<_ElemT>&
+ Init(size_t length, bool clear=true);
+
+ /**
+ * @brief Deallocates the vector's data and resets the dimension to 0
+ */
+ void
+ Destroy();
+
+ /**
+ * @brief Returns @c true if vector is initialized
+ */
+ bool
+ IsInitialized() const
+ { return mpData != NULL; }
+
+ /**
+ * @brief Sets all elements to 0
+ */
+ void
+ Zero();
+
+ void
+ Set(_ElemT f);
+
+ inline size_t
+ Dim() const
+ { return mDim; }
+
+ /**
+ * @brief Returns the size of the vector in memory (in bytes)
+ */
+ inline size_t
+ MSize() const
+ {
+ return (mDim + (((16 / sizeof(_ElemT)) - mDim%(16 / sizeof(_ElemT)))
+ % (16 / sizeof(_ElemT)))) * sizeof(_ElemT);
+ }
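+
+ /* Worked example (illustrative): for _ElemT = float (4 bytes) and mDim = 5,
+  * 16/sizeof(_ElemT) = 4, so the dimension is rounded up to the next multiple
+  * of 4 and MSize() = (5 + 3) * 4 = 32 bytes, matching the 16-byte alignment
+  * used by Init(). */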
+
+ /**
+ * @brief Gives access to the vector memory area
+ * @return pointer to the first field
+ */
+ inline _ElemT*
+ pData()
+ { return mpData; }
+
+ /**
+ * @brief Gives access to the vector memory area
+ * @return pointer to the first field (const version)
+ */
+ inline const _ElemT*
+ pData() const
+ { return mpData; }
+
+ /**
+ * @brief Gives access to a specified vector element (const).
+ */
+ inline _ElemT
+ operator [] (size_t i) const
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Gives access to a specified vector element (non-const).
+ */
+ inline _ElemT &
+ operator [] (size_t i)
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Gives access to a specified vector element (const).
+ */
+ inline _ElemT
+ operator () (size_t i) const
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Gives access to a specified vector element (non-const).
+ */
+ inline _ElemT &
+ operator () (size_t i)
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Returns a sub-range of the vector
+ * @param o Origin
+ * @param l Length
+ * See @c SubVector class for details
+ */
+ SubVector<_ElemT>
+ Range(const size_t o, const size_t l)
+ { return SubVector<_ElemT>(*this, o, l); }
+
+ /**
+ * @brief Returns a sub-range of the vector
+ * @param o Origin
+ * @param l Length
+ * See @c SubVector class for details
+ */
+ const SubVector<_ElemT>
+ Range(const size_t o, const size_t l) const
+ { return SubVector<_ElemT>(*this, o, l); }
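+
+ /* Usage sketch (sizes/values are illustrative): Range() returns a SubVector
+  * view that aliases this vector's memory, so operating on it in place
+  * affects the parent and no data is copied or freed:
+  *
+  *   Vector<float> v(10);
+  *   v.Range(2, 5).Set(1.0f);   // sets elements v[2]..v[6] to 1.0
+  *
+  * CopyVectorizedMatrixRows() relies on the same pattern. */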
+
+
+
+ //########################################################################
+ //########################################################################
+
+ /// Copy data from another vector
+ Vector<_ElemT>&
+ Copy(const Vector<_ElemT>& rV);
+
+ /// Copy data from another vector of a different type.
+ template<typename _ElemU> Vector<_ElemT>&
+ Copy(const Vector<_ElemU>& rV);
+
+
+ /// Load data into the vector
+ Vector<_ElemT>&
+ Copy(const _ElemT* ppData);
+
+ Vector<_ElemT>&
+ CopyVectorizedMatrixRows(const Matrix<_ElemT> &rM);
+
+ Vector<_ElemT>&
+ RemoveElement(size_t i);
+
+ Vector<_ElemT>&
+ ApplyLog();
+
+ Vector<_ElemT>&
+ ApplyLog(const Vector<_ElemT>& rV);//ApplyLog to rV and put the result in (*this)
+
+ Vector<_ElemT>&
+ ApplyExp();
+
+ Vector<_ElemT>&
+ ApplySoftMax();
+
+ Vector<_ElemT>&
+ Invert();
+
+ Vector<_ElemT>&
+ DotMul(const Vector<_ElemT>& rV); // Multiplies each element (*this)(i) by rV(i).
+
+ Vector<_ElemT>&
+ BlasAxpy(const _ElemT alpha, const Vector<_ElemT>& rV);
+
+ Vector<_ElemT>&
+ BlasGemv(const _ElemT alpha, const Matrix<_ElemT>& rM, const MatrixTrasposeType trans, const Vector<_ElemT>& rV, const _ElemT beta = 0.0);
+
+
+ //########################################################################
+ //########################################################################
+
+ Vector<_ElemT>&
+ Add(const Vector<_ElemT>& rV)
+ { return BlasAxpy(1.0, rV); }
+
+ Vector<_ElemT>&
+ Subtract(const Vector<_ElemT>& rV)
+ { return BlasAxpy(-1.0, rV); }
+
+ Vector<_ElemT>&
+ AddScaled(_ElemT alpha, const Vector<_ElemT>& rV)
+ { return BlasAxpy(alpha, rV); }
+
+ Vector<_ElemT>&
+ Add(_ElemT c);
+
+ Vector<_ElemT>&
+ MultiplyElements(const Vector<_ElemT>& rV);
+
+ /// @brief elementwise: alpha*rV.*rR + beta*this --> this
+ Vector<_ElemT>&
+ MultiplyElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR,_ElemT beta);
+
+ Vector<_ElemT>&
+ DivideElements(const Vector<_ElemT>& rV);
+
+ /// @brief elementwise: alpha*rV./rR + beta*this --> this
+ Vector<_ElemT>&
+ DivideElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR,_ElemT beta);
+
+ Vector<_ElemT>&
+ Subtract(_ElemT c);
+
+ Vector<_ElemT>&
+ Scale(_ElemT c);
+
+
+ //########################################################################
+ //########################################################################
+
+ /// Performs a row stack of the matrix rMa
+ Vector<_ElemT>&
+ MatrixRowStack(const Matrix<_ElemT>& rMa);
+
+ // Extracts a row of the matrix rMa. .. could also do this with vector.Copy(rMa[row]).
+ Vector<_ElemT>&
+ Row(const Matrix<_ElemT>& rMa, size_t row);
+
+ // Extracts a column of the matrix rMa.
+ Vector<_ElemT>&
+ Col(const Matrix<_ElemT>& rMa, size_t col);
+
+ // Takes all elements to a power.
+ Vector<_ElemT>&
+ Power(_ElemT power);
+
+ _ElemT
+ Max() const;
+
+ _ElemT
+ Min() const;
+
+ /// Returns sum of the elements
+ _ElemT
+ Sum() const;
+
+ /// Adds the sum of each row of rM to the corresponding element of this vector
+ Vector<_ElemT>&
+ AddRowSum(const Matrix<_ElemT>& rM);
+
+ /// Adds the sum of each column of rM to the corresponding element of this vector
+ Vector<_ElemT>&
+ AddColSum(const Matrix<_ElemT>& rM);
+
+ /// Returns log(sum(exp())) without exp overflow
+ _ElemT
+ LogSumExp() const;
+
+ //########################################################################
+ //########################################################################
+
+ friend std::ostream &
+ operator << <> (std::ostream& rOut, const Vector<_ElemT>& rV);
+
+ friend _ElemT
+ BlasDot<>(const Vector<_ElemT>& rA, const Vector<_ElemT>& rB);
+
+ /**
+ * Computes v1^T * M * v2.
+ * Not as efficient as it could be where v1==v2 (but no suitable blas
+ * routines available).
+ */
+ _ElemT
+ InnerProduct(const Vector<_ElemT> &v1, const Matrix<_ElemT> &M, const Vector<_ElemT> &v2) const;
+
+
+ //##########################################################################
+ //##########################################################################
+ //protected:
+ public:
+ /// data memory area
+ _ElemT* mpData;
+#ifdef STK_MEMALIGN_MANUAL
+ /// data to be freed (in case of manual memalignment use, see common.h)
+ _ElemT* mpFreeData;
+#endif
+ size_t mDim; ///< Number of elements
+ }; // class Vector
+
+
+
+
+ /**
+ * @brief Represents a non-allocating general vector which can be defined
+ * as a sub-vector of a higher-level vector
+ */
+ template<typename _ElemT>
+ class SubVector : public Vector<_ElemT>
+ {
+ protected:
+ /// Constructor
+ SubVector(const Vector<_ElemT>& rT,
+ const size_t origin,
+ const size_t length)
+ {
+ assert(origin+length <= rT.mDim);
+ Vector<_ElemT>::mpData = rT.mpData+origin;
+ Vector<_ElemT>::mDim = length;
+ }
+ //only Vector class can call this protected constructor
+ friend class Vector<_ElemT>;
+
+ public:
+ /// Constructor
+ SubVector(Vector<_ElemT>& rT,
+ const size_t origin,
+ const size_t length)
+ {
+ assert(origin+length <= rT.mDim);
+ Vector<_ElemT>::mpData = rT.mpData+origin;
+ Vector<_ElemT>::mDim = length;
+ }
+
+
+ /**
+ * @brief Constructs a vector representation out of a standard array
+ *
+ * @param pData pointer to data array to associate with this vector
+ * @param length length of this vector
+ */
+ inline
+ SubVector(_ElemT *ppData,
+ size_t length)
+ {
+ Vector<_ElemT>::mpData = ppData;
+ Vector<_ElemT>::mDim = length;
+ }
+
+
+ /**
+ * @brief Destructor
+ */
+ ~SubVector()
+ {
+ Vector<_ElemT>::mpData = NULL;
+ }
+ };
+
+
+ // Useful shortcuts
+ typedef Vector<BaseFloat> BfVector;
+ typedef SubVector<BaseFloat> BfSubVector;
+
+ //Adding two vectors of different types
+ template <typename _ElemT, typename _ElemU>
+ void Add(Vector<_ElemT>& rDst, const Vector<_ElemU>& rSrc)
+ {
+ assert(rDst.Dim() == rSrc.Dim());
+ const _ElemU* p_src = rSrc.pData();
+ _ElemT* p_dst = rDst.pData();
+
+ for(size_t i=0; i<rSrc.Dim(); i++) {
+ *p_dst++ += (_ElemT)*p_src++;
+ }
+ }
+
+
+ //Scales adding two vectors of different types
+ template <typename _ElemT, typename _ElemU>
+ void AddScaled(Vector<_ElemT>& rDst, const Vector<_ElemU>& rSrc, _ElemT scale)
+ {
+ assert(rDst.Dim() == rSrc.Dim());
+
+ Vector<_ElemT> tmp(rSrc);
+ rDst.BlasAxpy(scale, tmp);
+
+/*
+ const _ElemU* p_src = rSrc.pData();
+ _ElemT* p_dst = rDst.pData();
+
+ for(size_t i=0; i<rDst.Dim(); i++) {
+ *p_dst++ += *p_src++ * scale;
+ }
+*/
+ }
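+
+ // Usage sketch (the dimension and scale are illustrative): accumulating a
+ // float vector into a double accumulator of the same dimension:
+ //
+ //   Vector<double> acc(dim);
+ //   Vector<float>  grad(dim);
+ //   Add(acc, grad);              // acc[i] += (double)grad[i]
+ //   AddScaled(acc, grad, -0.01); // acc[i] += -0.01 * grad[i] via BlasAxpy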
+
+
+} // namespace TNet
+
+//*****************************************************************************
+//*****************************************************************************
+// we need to include the implementation
+#include "Vector.tcc"
+
+/******************************************************************************
+ ******************************************************************************
+ * The following section contains specialized template definitions
+ * whose implementation is in Vector.cc
+ */
+
+
+#endif // #ifndef TNet_Vector_h
diff --git a/src/KaldiLib/.svn/text-base/Vector.tcc.svn-base b/src/KaldiLib/.svn/text-base/Vector.tcc.svn-base
new file mode 100644
index 0000000..751ffa7
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/Vector.tcc.svn-base
@@ -0,0 +1,638 @@
+/** @file Vector.tcc
+ * This is an internal header file, included by other library headers.
+ * You should not attempt to use it directly.
+ */
+
+#ifndef TNet_Vector_tcc
+#define TNet_Vector_tcc
+
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include "Common.h"
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+}
+#endif
+
+#include "Common.h"
+#include "MathAux.h"
+#include "Matrix.h"
+
+namespace TNet
+{
+ //******************************************************************************
+ //******************************************************************************
+ template<typename _ElemT>
+ inline Vector<_ElemT>&
+ Vector<_ElemT>::
+ Init(const size_t length, bool clear)
+ {
+ if(mpData != NULL) Destroy();
+ if(length==0){
+ mpData=NULL;
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData=NULL;
+#endif
+ mDim=0;
+ return *this;
+ }
+ size_t size;
+ void* data;
+ void* free_data;
+
+ size = align<16>(length * sizeof(_ElemT));
+
+ if (NULL != (data = stk_memalign(16, size, &free_data))) {
+ mpData = static_cast<_ElemT*> (data);
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData = static_cast<_ElemT*> (free_data);
+#endif
+ mDim = length;
+ } else {
+ throw std::bad_alloc();
+ }
+ if(clear) Zero();
+ return *this;
+ }
+
+
+ //******************************************************************************
+ //******************************************************************************
+ /// Copy data from another vector
+ template<typename _ElemT>
+ inline Vector<_ElemT>&
+ Vector<_ElemT>::
+ Copy(const Vector<_ElemT>& rV) {
+ assert(Dim() == rV.Dim());
+ Copy(rV.mpData);
+ return *this;
+ }
+
+ /// Load data into the vector
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Copy(const _ElemT* ppData) {
+ std::memcpy(this->mpData, ppData, Dim() * sizeof(_ElemT));
+ return *this;
+ }
+
+ template<typename _ElemT>
+ template<typename _ElemU>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Copy(const Vector<_ElemU> &other){
+ assert(Dim()==other.Dim());
+ size_t D=Dim();
+ for(size_t d=0;d<D;d++) (*this)(d) = (_ElemT) other[d];
+ return *this;
+ }
+
+
+ //******************************************************************************
+ //******************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ CopyVectorizedMatrixRows(const Matrix<_ElemT> &rM) {
+// TraceLog("Dim = "+to_string(Dim())+", Rows = "+to_string(rM.Rows())+", Cols = "+to_string(rM.Cols()));
+ assert(Dim() == rM.Cols()*rM.Rows());
+ size_t nCols = rM.Cols();
+ for(size_t r=0; r<rM.Rows(); r++)
+ Range(r*nCols, nCols).Copy(rM[r]);
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // Removes an element from the vector. The vector is not reallocated.
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ RemoveElement(size_t i) {
+ assert(i < mDim && "Access out of vector");
+ for(size_t j = i + 1; j < mDim; j++)
+ this->mpData[j - 1] = this->mpData[j];
+ mDim--;
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ // The destructor
+ template<typename _ElemT>
+ inline void
+ Vector<_ElemT>::
+ Destroy()
+ {
+ // we need to free the data block if it was defined
+#ifndef STK_MEMALIGN_MANUAL
+ if (NULL != mpData) free(mpData);
+#else
+ if (NULL != mpData) free(mpFreeData);
+ mpFreeData = NULL;
+#endif
+
+ mpData = NULL;
+ mDim = 0;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ inline void
+ Vector<_ElemT>::
+ Zero()
+ {
+ std::memset(mpData, 0, mDim * sizeof(_ElemT));
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ inline void
+ Vector<_ElemT>::
+ Set(_ElemT f)
+ {
+ for(size_t i=0;i<mDim;i++) mpData[i] = f;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ MatrixRowStack(const Matrix<_ElemT>& rMa)
+ {
+ assert(mDim == rMa.Cols() * rMa.Rows());
+
+ _ElemT* inc_data = mpData;
+ const size_t cols = rMa.Cols();
+
+ for (size_t i = 0; i < rMa.Rows(); i++)
+ {
+ // copy the data to the proper position
+ memcpy(inc_data, rMa[i], cols * sizeof(_ElemT));
+
+ // set new copy position
+ inc_data += cols;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Row(const Matrix<_ElemT> &rMa, size_t row)
+ {
+ assert(row < rMa.Rows());
+ const _ElemT *mRow = rMa.pRowData(row);
+ // if(mDim != rMa.Cols()) Init(rMa.Cols()); // automatically resize.
+ memcpy(mpData, mRow, sizeof(_ElemT)*mDim);
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Power(_ElemT power) // takes elements to a power; throws an exception on failure
+ {
+ for(size_t i=0;i<Dim();i++){
+ _ElemT tmp = (*this)(i);
+ (*this)(i) = pow(tmp, power);
+ if((*this)(i) == HUGE_VAL)
+ throw std::runtime_error((std::string)"Error in Vector::power, could not take " +to_string(tmp)+ " to power " +to_string((*this)(i)));
+ }
+ return (*this);
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ Max() const
+ {
+ if(Dim()==0) throw std::runtime_error("Error in Vector::Max(), empty vector\n");
+ _ElemT ans = (*this)(0);
+ for(size_t i=1;i<Dim();i++) ans = std::max(ans, (*this)(i));
+ return ans;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ Min() const
+ {
+ if(Dim()==0) throw std::runtime_error("Error in Vector::Min(), empty vector\n");
+ _ElemT ans = (*this)(0);
+ for(size_t i=1;i<Dim();i++) ans = std::min(ans, (*this)(i));
+ return ans;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Col(const Matrix<_ElemT> &rMa, size_t col)
+ {
+ assert(col < rMa.Cols());
+ // if(mDim != rMa.Cols()) Init(rMa.Cols()); // automatically resize.
+ for(size_t i=0;i<mDim;i++)
+ mpData[i] = rMa(i,col); // can't do this efficiently so don't really bother.
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ Sum() const
+ {
+ //note the double accumulator
+ double sum = 0.0;
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum += mpData[i];
+ }
+ return (_ElemT)sum;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ AddColSum(const Matrix<_ElemT>& rM)
+ {
+ // note the double accumulator
+ double sum;
+
+ assert(mDim == rM.Cols());
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum = 0.0;
+ for (size_t j = 0; j < rM.Rows(); ++j) {
+ sum += rM[j][i];
+ }
+ mpData[i] += sum;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ AddRowSum(const Matrix<_ElemT>& rM)
+ {
+ // note the double accumulator
+ double sum;
+
+ assert(mDim == rM.Rows());
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum = 0.0;
+ for (size_t j = 0; j < rM.Cols(); ++j) {
+ sum += rM[i][j];
+ }
+ mpData[i] += sum;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ LogSumExp() const
+ {
+ double sum = LOG_0;
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum = LogAdd(sum, mpData[i]);
+ }
+ return sum;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Invert() {
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = static_cast<_ElemT>(1 / mpData[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplyLog() {
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = _LOG(mpData[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplyLog(const Vector<_ElemT>& rV) {
+ assert(mDim==rV.Dim());
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = log(rV[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplyExp() {
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = _EXP(mpData[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplySoftMax() {
+ _ElemT lse = LogSumExp();
+
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = exp(mpData[i] - lse);
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Add(_ElemT c)
+ {
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] += c;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Subtract(_ElemT c)
+ {
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] -= c;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Scale(_ElemT c)
+ {
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] *= c;
+ }
+ return *this;
+ }
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ MultiplyElements(const Vector<_ElemT>& rV)
+ {
+ assert(mDim == rV.Dim());
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] *= rV[i];
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ MultiplyElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR, _ElemT beta)
+ {
+ assert((mDim == rV.Dim() && mDim == rR.Dim()));
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] = alpha * rV[i] * rR[i] + beta * mpData[i];
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ DivideElements(const Vector<_ElemT>& rV)
+ {
+ assert(mDim == rV.Dim());
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] /= rV[i];
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ DivideElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR, _ElemT beta)
+ {
+ assert((mDim == rV.Dim() && mDim == rR.Dim()));
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] = alpha * rV[i]/rR[i] + beta * mpData[i] ;
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Load(std::istream& rIn, Vector<_ElemT>& rV)
+ {
+ std::streamoff pos = rIn.tellg();
+ if(MatrixVectorIostreamControl::Flags(rIn, ACCUMULATE_INPUT)) {
+ for (size_t i = 0; i < rV.Dim(); i++) {
+ _ElemT tmp;
+ rIn >> tmp;
+ rV[i] += tmp;
+ }
+ } else {
+ for (size_t i = 0; i < rV.Dim(); i++) {
+ rIn >> rV[i];
+ }
+ }
+ if(rIn.fail()) {
+ throw std::runtime_error("Failed to read vector from stream. File position is "+to_string(pos));
+ }
+ }
+
+ template<typename _ElemT>
+ std::istream &
+ operator >> (std::istream& rIn, Vector<_ElemT>& rV)
+ {
+ rIn >> std::ws;
+ if(rIn.peek() == 'v'){ // "new" format: v <dim> 1.0 0.2 4.3 ...
+ rIn.get();
+ long long int tmp=-1;
+ rIn >> tmp;
+ if(rIn.fail() || tmp<0) {
+ throw std::runtime_error("Failed to read vector from stream: no size");
+ }
+ size_t tmp2 = size_t(tmp);
+ assert((long long int)tmp2 == tmp);
+
+ if(rV.Dim() != tmp2) rV.Init(tmp2);
+ }
+ Load(rIn,rV);
+ return rIn;
+ }
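+
+ // Input format sketch (the values are illustrative): the "new" format carries
+ // an explicit size header, e.g. the text
+ //
+ //   v 3 1.0 2.0 3.0
+ //
+ // resizes the vector to dimension 3 before reading the three values, while
+ // plain whitespace-separated numbers are read into the vector's current
+ // dimension (and added to it when the ACCUMULATE_INPUT stream flag is set).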
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Save (std::ostream& rOut, const Vector<_ElemT>& rV)
+ {
+
+ for (size_t i = 0; i < rV.Dim(); i++) {
+ rOut << rV[i] << ' ';
+ }
+ if(rOut.fail()) {
+ throw std::runtime_error("Failed to write vector to stream");
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ std::ostream &
+ operator << (std::ostream& rOut, const Vector<_ElemT>& rV)
+ {
+ rOut << "v " << rV.Dim() << " ";
+ Save(rOut,rV);
+ return rOut;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+
+#ifdef HAVE_ATLAS
+ template<>
+ float
+ BlasDot<>(const Vector<float>& rA, const Vector<float>& rB);
+
+ template<>
+ double
+ BlasDot<>(const Vector<double>& rA, const Vector<double>& rB);
+
+ template<typename _ElemT>
+ inline Vector<_ElemT>&
+ Vector<_ElemT>::
+ DotMul(const Vector<_ElemT> &rV){
+ assert(mDim == rV.mDim);
+ const _ElemT *other_data = rV.pData();
+ _ElemT *my_data = mpData, *my_data_end = my_data+mDim;
+ for(;my_data<my_data_end;) *(my_data++) *= *(other_data++);
+ return *this;
+ }
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasAxpy(const float alpha, const Vector<float>& rV);
+
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasAxpy(const double alpha, const Vector<double>& rV);
+
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasGemv(const float alpha, const Matrix<float>& rM, MatrixTrasposeType trans, const Vector<float>& rV, const float beta);
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasGemv(const double alpha, const Matrix<double>& rM, MatrixTrasposeType trans, const Vector<double>& rV, const double beta);
+
+#else
+ #error Routines in this section are not implemented yet without BLAS
+#endif
+
+
+ template<class _ElemT>
+ _ElemT
+ InnerProduct(const Vector<_ElemT> &v1, const Matrix<_ElemT> &M, const Vector<_ElemT> &v2){
+ assert(v1.Dim()==M.Rows() && v2.Dim()==M.Cols());
+ Vector<_ElemT> vtmp(M.Rows());
+ vtmp.BlasGemv(1.0, M, NO_TRANS, v2, 0.0);
+ return BlasDot(v1, vtmp);
+ }
+
+
+} // namespace TNet
+
+
+#endif // TNet_Vector_tcc
diff --git a/src/KaldiLib/.svn/text-base/cblas.h.svn-base b/src/KaldiLib/.svn/text-base/cblas.h.svn-base
new file mode 100644
index 0000000..4087ffb
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/cblas.h.svn-base
@@ -0,0 +1,596 @@
+#ifndef CBLAS_H
+
+#ifndef CBLAS_ENUM_DEFINED_H
+ #define CBLAS_ENUM_DEFINED_H
+ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
+ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113,
+ AtlasConj=114};
+ enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
+ enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
+ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
+#endif
+
+#ifndef CBLAS_ENUM_ONLY
+#define CBLAS_H
+#define CBLAS_INDEX int
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS functions (complex are recast as routines)
+ * ===========================================================================
+ */
+float cblas_sdsdot(const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY);
+double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
+ const int incY);
+float cblas_sdot(const int N, const float *X, const int incX,
+ const float *Y, const int incY);
+double cblas_ddot(const int N, const double *X, const int incX,
+ const double *Y, const int incY);
+/*
+ * Functions having prefixes Z and C only
+ */
+void cblas_cdotu_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotu);
+void cblas_cdotc_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotc);
+
+void cblas_zdotu_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotu);
+void cblas_zdotc_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotc);
+
+
+/*
+ * Functions having prefixes S D SC DZ
+ */
+float cblas_snrm2(const int N, const float *X, const int incX);
+float cblas_sasum(const int N, const float *X, const int incX);
+
+double cblas_dnrm2(const int N, const double *X, const int incX);
+double cblas_dasum(const int N, const double *X, const int incX);
+
+float cblas_scnrm2(const int N, const void *X, const int incX);
+float cblas_scasum(const int N, const void *X, const int incX);
+
+double cblas_dznrm2(const int N, const void *X, const int incX);
+double cblas_dzasum(const int N, const void *X, const int incX);
+
+
+/*
+ * Functions having standard 4 prefixes (S D C Z)
+ */
+CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX);
+CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
+CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX);
+CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS routines
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (s, d, c, z)
+ */
+void cblas_sswap(const int N, float *X, const int incX,
+ float *Y, const int incY);
+void cblas_scopy(const int N, const float *X, const int incX,
+ float *Y, const int incY);
+void cblas_saxpy(const int N, const float alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void catlas_saxpby(const int N, const float alpha, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void catlas_sset
+ (const int N, const float alpha, float *X, const int incX);
+
+void cblas_dswap(const int N, double *X, const int incX,
+ double *Y, const int incY);
+void cblas_dcopy(const int N, const double *X, const int incX,
+ double *Y, const int incY);
+void cblas_daxpy(const int N, const double alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void catlas_daxpby(const int N, const double alpha, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void catlas_dset
+ (const int N, const double alpha, double *X, const int incX);
+
+void cblas_cswap(const int N, void *X, const int incX,
+ void *Y, const int incY);
+void cblas_ccopy(const int N, const void *X, const int incX,
+ void *Y, const int incY);
+void cblas_caxpy(const int N, const void *alpha, const void *X,
+ const int incX, void *Y, const int incY);
+void catlas_caxpby(const int N, const void *alpha, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void catlas_cset
+ (const int N, const void *alpha, void *X, const int incX);
+
+void cblas_zswap(const int N, void *X, const int incX,
+ void *Y, const int incY);
+void cblas_zcopy(const int N, const void *X, const int incX,
+ void *Y, const int incY);
+void cblas_zaxpy(const int N, const void *alpha, const void *X,
+ const int incX, void *Y, const int incY);
+void catlas_zaxpby(const int N, const void *alpha, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void catlas_zset
+ (const int N, const void *alpha, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefix only
+ */
+void cblas_srotg(float *a, float *b, float *c, float *s);
+void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
+void cblas_srot(const int N, float *X, const int incX,
+ float *Y, const int incY, const float c, const float s);
+void cblas_srotm(const int N, float *X, const int incX,
+ float *Y, const int incY, const float *P);
+
+void cblas_drotg(double *a, double *b, double *c, double *s);
+void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+void cblas_drot(const int N, double *X, const int incX,
+ double *Y, const int incY, const double c, const double s);
+void cblas_drotm(const int N, double *X, const int incX,
+ double *Y, const int incY, const double *P);
+
+
+/*
+ * Routines with S D C Z CS and ZD prefixes
+ */
+void cblas_sscal(const int N, const float alpha, float *X, const int incX);
+void cblas_dscal(const int N, const double alpha, double *X, const int incX);
+void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_csscal(const int N, const float alpha, void *X, const int incX);
+void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
+
+/*
+ * Extra reference routines provided by ATLAS, but not mandated by the standard
+ */
+void cblas_crotg(void *a, void *b, void *c, void *s);
+void cblas_zrotg(void *a, void *b, void *c, void *s);
+void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY,
+ const float c, const float s);
+void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY,
+ const double c, const double s);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 2 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void cblas_sgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const float alpha,
+ const float *A, const int lda, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *Ap, float *X, const int incX);
+void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *A, const int lda, float *X,
+ const int incX);
+void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *Ap, float *X, const int incX);
+
+void cblas_dgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void cblas_dgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const double alpha,
+ const double *A, const int lda, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *Ap, double *X, const int incX);
+void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *A, const int lda, double *X,
+ const int incX);
+void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *Ap, double *X, const int incX);
+
+void cblas_cgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *X, const int incX, const void *beta,
+ void *Y, const int incY);
+void cblas_cgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const void *alpha,
+ const void *A, const int lda, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda, void *X,
+ const int incX);
+void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+
+void cblas_zgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *X, const int incX, const void *beta,
+ void *Y, const int incY);
+void cblas_zgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const void *alpha,
+ const void *A, const int lda, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda, void *X,
+ const int incX);
+void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefixes only
+ */
+void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const float alpha, const float *A,
+ const int lda, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *Ap,
+ const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N,
+ const float alpha, const float *X, const int incX,
+ const float *Y, const int incY, float *A, const int lda);
+void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, float *A, const int lda);
+void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, float *Ap);
+void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A,
+ const int lda);
+void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A);
+
+void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const double alpha, const double *A,
+ const int lda, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *Ap,
+ const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
+ const double alpha, const double *X, const int incX,
+ const double *Y, const int incY, double *A, const int lda);
+void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, double *A, const int lda);
+void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, double *Ap);
+void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A,
+ const int lda);
+void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A);
+
+
+/*
+ * Routines with C and Z prefixes only
+ */
+void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *Ap,
+ const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const void *X, const int incX,
+ void *A, const int lda);
+void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const void *X,
+ const int incX, void *A);
+void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *Ap);
+
+void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *Ap,
+ const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const void *X, const int incX,
+ void *A, const int lda);
+void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const void *X,
+ const int incX, void *A);
+void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *Ap);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 3 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const float alpha, const float *A,
+ const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb, const float beta,
+ float *C, const int ldc);
+void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float beta, float *C, const int ldc);
+void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb, const float beta,
+ float *C, const int ldc);
+void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ float *B, const int ldb);
+void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ float *B, const int ldb);
+
+void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const double alpha, const double *A,
+ const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *B, const int ldb, const double beta,
+ double *C, const int ldc);
+void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double beta, double *C, const int ldc);
+void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double *B, const int ldb, const double beta,
+ double *C, const int ldc);
+void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ double *B, const int ldb);
+void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ double *B, const int ldb);
+
+void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const void *alpha, const void *A,
+ const int lda, const void *B, const int ldb,
+ const void *beta, void *C, const int ldc);
+void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *beta, void *C, const int ldc);
+void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+
+void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const void *alpha, const void *A,
+ const int lda, const void *B, const int ldb,
+ const void *beta, void *C, const int ldc);
+void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *beta, void *C, const int ldc);
+void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+
+
+/*
+ * Routines with prefixes C and Z only
+ */
+void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const void *A, const int lda,
+ const float beta, void *C, const int ldc);
+void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const float beta,
+ void *C, const int ldc);
+void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const void *A, const int lda,
+ const double beta, void *C, const int ldc);
+void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const double beta,
+ void *C, const int ldc);
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+#endif /* end #ifdef CBLAS_ENUM_ONLY */
+#endif
diff --git a/src/KaldiLib/.svn/text-base/clapack.cc.svn-base b/src/KaldiLib/.svn/text-base/clapack.cc.svn-base
new file mode 100644
index 0000000..a486bef
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/clapack.cc.svn-base
@@ -0,0 +1,61 @@
+
+extern "C" {
+
+ /**
+ * Wrapper to GotoBLAS lapack for STK and TNet (sgetrf sgetri dgetrf dgetri)
+ */
+ typedef float real;
+ typedef double doublereal;
+ typedef int integer;
+
+
+ /**
+ * The lapack interface (used in gotoblas)
+ */
+ /* Subroutine */ int sgetrf_(integer *m, integer *n, real *a, integer *lda,
+ integer *ipiv, integer *info);
+ /* Subroutine */ int sgetri_(integer *n, real *a, integer *lda, integer *ipiv,
+ real *work, integer *lwork, integer *info);
+ /* Subroutine */ int dgetrf_(integer *m, integer *n, doublereal *a, integer *
+ lda, integer *ipiv, integer *info);
+ /* Subroutine */ int dgetri_(integer *n, doublereal *a, integer *lda, integer
+ *ipiv, doublereal *work, integer *lwork, integer *info);
+
+
+
+
+
+ /**
+ * The clapack interface as used by ATLAS (used in STK and TNet)
+ */
+ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
+
+ int clapack_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv)
+ {
+ int info = 0;
+ sgetrf_((int*)&M, (int*)&N, A, (int*)&lda, (int*)ipiv, &info);
+ return info;
+ }
+
+
+ int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A,
+ const int lda, const int *ipiv)
+ {
+ int info = 0;
+ int lwork = (N > 0 ? N : 1); /* LAPACK requires lwork >= max(1,N) */
+ float *work = new float[lwork];
+ sgetri_((int*)&N, A, (int*)&lda, (int*)ipiv, work, &lwork, &info);
+ delete [] work;
+ return info;
+ }
+
+
+ int clapack_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv)
+ {
+ int info = 0;
+ dgetrf_((int*)&M, (int*)&N, A, (int*)&lda, (int*)ipiv, &info);
+ return info;
+ }
+
+
+ int clapack_dgetri(const enum CBLAS_ORDER Order, const int N, double *A,
+ const int lda, const int *ipiv)
+ {
+ int info = 0;
+ int lwork = (N > 0 ? N : 1); /* LAPACK requires lwork >= max(1,N) */
+ doublereal *work = new doublereal[lwork];
+ dgetri_((int*)&N, A, (int*)&lda, (int*)ipiv, work, &lwork, &info);
+ delete [] work;
+ return info;
+ }
+
+
+}
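For illustration only (this snippet is not part of the commit): a minimal sketch of calling the ATLAS-style clapack interface declared in the header below to invert a small matrix in place via LU factorization. It assumes clapack.h is on the include path and that a LAPACK providing sgetrf_/sgetri_ is linked in.

    #include <cstdio>
    #include "clapack.h"   // ATLAS-style prototypes, see the header below

    int main() {
      // Invert a 2x2 symmetric matrix in place: LU factorization, then inversion.
      float A[4] = { 4.0f, 1.0f,
                     1.0f, 3.0f };
      int ipiv[2];

      int info = clapack_sgetrf(CblasRowMajor, 2, 2, A, 2, ipiv);
      if (info == 0)
        info = clapack_sgetri(CblasRowMajor, 2, A, 2, ipiv);
      if (info != 0) {
        std::fprintf(stderr, "sgetrf/sgetri failed, info = %d\n", info);
        return 1;
      }
      std::printf("inv(A) = [ %g %g ; %g %g ]\n", A[0], A[1], A[2], A[3]);
      return 0;
    }

Using a symmetric matrix keeps the sketch independent of how the Order argument is treated, since a symmetric matrix has the same memory layout in row- and column-major order.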
diff --git a/src/KaldiLib/.svn/text-base/clapack.h.svn-base b/src/KaldiLib/.svn/text-base/clapack.h.svn-base
new file mode 100644
index 0000000..0c6855d
--- /dev/null
+++ b/src/KaldiLib/.svn/text-base/clapack.h.svn-base
@@ -0,0 +1,149 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.2
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef CLAPACK_H
+
+#define CLAPACK_H
+#include "cblas.h"
+
+#ifndef ATLAS_ORDER
+ #define ATLAS_ORDER CBLAS_ORDER
+#endif
+#ifndef ATLAS_UPLO
+ #define ATLAS_UPLO CBLAS_UPLO
+#endif
+#ifndef ATLAS_DIAG
+ #define ATLAS_DIAG CBLAS_DIAG
+#endif
+int clapack_sgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ float *A, const int lda, int *ipiv,
+ float *B, const int ldb);
+int clapack_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv);
+int clapack_sgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const float *A, const int lda,
+ const int *ipiv, float *B, const int ldb);
+int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A,
+ const int lda, const int *ipiv);
+int clapack_sposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, float *A, const int lda,
+ float *B, const int ldb);
+int clapack_spotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_spotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const float *A, const int lda,
+ float *B, const int ldb);
+int clapack_spotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_slauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_strtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, float *A, const int lda);
+
+int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ double *A, const int lda, int *ipiv,
+ double *B, const int ldb);
+int clapack_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv);
+int clapack_dgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const double *A, const int lda,
+ const int *ipiv, double *B, const int ldb);
+int clapack_dgetri(const enum CBLAS_ORDER Order, const int N, double *A,
+ const int lda, const int *ipiv);
+int clapack_dposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, double *A, const int lda,
+ double *B, const int ldb);
+int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const double *A, const int lda,
+ double *B, const int ldb);
+int clapack_dpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dtrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, double *A, const int lda);
+
+int clapack_cgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ void *A, const int lda, int *ipiv,
+ void *B, const int ldb);
+int clapack_cgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ void *A, const int lda, int *ipiv);
+int clapack_cgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const void *A, const int lda,
+ const int *ipiv, void *B, const int ldb);
+int clapack_cgetri(const enum CBLAS_ORDER Order, const int N, void *A,
+ const int lda, const int *ipiv);
+int clapack_cposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, void *A, const int lda,
+ void *B, const int ldb);
+int clapack_cpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_cpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const void *A, const int lda,
+ void *B, const int ldb);
+int clapack_cpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_clauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_ctrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, void *A, const int lda);
+
+int clapack_zgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ void *A, const int lda, int *ipiv,
+ void *B, const int ldb);
+int clapack_zgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ void *A, const int lda, int *ipiv);
+int clapack_zgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const void *A, const int lda,
+ const int *ipiv, void *B, const int ldb);
+int clapack_zgetri(const enum CBLAS_ORDER Order, const int N, void *A,
+ const int lda, const int *ipiv);
+int clapack_zposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, void *A, const int lda,
+ void *B, const int ldb);
+int clapack_zpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_zpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const void *A, const int lda,
+ void *B, const int ldb);
+int clapack_zpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_zlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_ztrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, void *A, const int lda);
+
+#endif
diff --git a/src/KaldiLib/Common.cc b/src/KaldiLib/Common.cc
new file mode 100644
index 0000000..40909ee
--- /dev/null
+++ b/src/KaldiLib/Common.cc
@@ -0,0 +1,277 @@
+#include <string>
+#include <stdexcept>
+#include <cmath>
+#include <cfloat>
+#include <cstdio>
+
+#include "Common.h"
+#include "MathAux.h"
+
+
+/// Defines the white chars for string trimming
+#if !defined(WHITE_CHARS)
+# define WHITE_CHARS " \t"
+#endif
+
+namespace TNet {
+
+#include <ios>
+
+ // Allocating stream variable used by stream modifier MatrixVectorIostreamControl
+ const int MATRIX_IOS_FORMAT_IWORD = std::ios_base::xalloc();
+
+ //***************************************************************************
+ //***************************************************************************
+ int getHTKstr(char *str)
+ {
+ char termChar = '\0';
+ char *chrptr = str;
+
+ while (std::isspace(*chrptr)) ++chrptr;
+
+ if (*chrptr == '\'' || *chrptr == '"') {
+ termChar = *chrptr;
+ chrptr++;
+ }
+
+ for (; *chrptr; chrptr++) {
+ if (*chrptr == '\'' || *chrptr == '"') {
+ if (termChar == *chrptr) {
+ termChar = '\0';
+ chrptr++;
+ break;
+ }
+ }
+
+ if (std::isspace(*chrptr) && !termChar) {
+ break;
+ }
+
+ if (*chrptr == '\\') {
+ ++chrptr;
+ if (*chrptr == '\0' || (*chrptr >= '0' && *chrptr <= '7' &&
+ (*++chrptr < '0' || *chrptr > '7' ||
+ *++chrptr < '0' || *chrptr > '7'))) {
+ return -1;
+ }
+
+ if (*chrptr >= '0' && *chrptr <= '7') {
+ *chrptr = (char)((*chrptr - '0') + (chrptr[-1] - '0') * 8 + (chrptr[-2] - '0') * 64);
+ }
+ }
+ *str++ = *chrptr;
+ }
+
+ if (termChar) {
+ return -2;
+ }
+
+ *str = '\0';
+
+ return 0;
+ }
+
+
+ //*****************************************************************************
+ //*****************************************************************************
+ void
+ ParseHTKString(const std::string & rIn, std::string & rOut)
+ {
+ int ret_val;
+
+ // the new string will be at most as long as the original, so we allocate
+ // space
+ char* new_str = new char[rIn.size() + 1];
+
+ char* p_htk_str = new_str;
+
+ strcpy(p_htk_str, rIn.c_str());
+ ret_val = getHTKstr(p_htk_str);
+
+ // on success, copy the parsed string to the output
+ if (!ret_val) {
+ rOut = p_htk_str;
+ }
+
+ delete [] new_str;
+
+ if (ret_val) {
+ throw std::runtime_error("Error parsing HTK string");
+ }
+ }
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ IsBigEndian()
+ {
+ int a = 1;
+ return (bool) ((char *) &a)[0] != 1;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ MakeHtkFileName(char* pOutFileName, const char* inFileName,
+ const char* out_dir, const char* out_ext)
+ {
+ const char* base_name;
+ const char* bname_end = NULL;
+ const char* chrptr;
+
+ // if (*inFileName == '*' && *++inFileName == '/') ++inFileName;
+
+ // we don't do anything if file is stdin/out
+ if (!strcmp(inFileName, "-"))
+ {
+ pOutFileName[0] = '-';
+ pOutFileName[1] = '\0';
+ return;
+ }
+
+ base_name = strrchr(inFileName, '/');
+ base_name = base_name != NULL ? base_name + 1 : inFileName;
+
+ if (out_ext) bname_end = strrchr(base_name, '.');
+ if (!bname_end) bname_end = base_name + strlen(base_name);
+
+
+ if ((chrptr = strstr(inFileName, "/./")) != NULL)
+ {
+ // whatever follows /./ in the path serves as the base name
+ base_name = chrptr + 3;
+ }
+ /* else if (*inFileName != '/')
+ {
+ // if inFileName isn't an absolute path, don't forget the directory structure
+ base_name = inFileName;
+ }*/
+
+ *pOutFileName = '\0';
+ if (out_dir)
+ {
+ if (*out_dir)
+ {
+ strcat(pOutFileName, out_dir);
+ strcat(pOutFileName, "/");
+ }
+ strncat(pOutFileName, base_name, bname_end-base_name);
+ }
+ else
+ {
+ strncat(pOutFileName, inFileName, bname_end-inFileName);
+ }
+
+ if (out_ext && *out_ext)
+ {
+ strcat(pOutFileName, ".");
+ strcat(pOutFileName, out_ext);
+ }
+ }
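For illustration only (not part of the sources): the synthesis rule above in action, combining the base name of the input with the output directory and the new extension.

    #include <cstdio>
    #include "Common.h"

    int main() {
      char out[256];
      TNet::MakeHtkFileName(out, "/data/raw/spk1/utt1.wav", "plp", "fea");
      std::printf("%s\n", out);   // prints: plp/utt1.fea
      return 0;
    }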
+
+
+ //****************************************************************************
+ //****************************************************************************
+ bool
+ CloseEnough(const float f1, const float f2, const float nRounds)
+ {
+ bool ret_val = (_ABS((f1 - f2) / (f2 == 0.0f ? 1.0f : f2))
+ < (nRounds * FLT_EPSILON));
+
+ return ret_val;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ bool
+ CloseEnough(const double f1, const double f2, const double nRounds)
+ {
+ bool ret_val = (_ABS((f1 - f2) / (f2 == 0.0 ? 1.0 : f2))
+ < (nRounds * DBL_EPSILON));
+
+ return ret_val;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ char*
+ ExpandHtkFilterCmd(const char *command, const char *filename, const char* pFilter)
+ {
+
+ char *out, *outend;
+ const char *chrptr = command;
+ int ndollars = 0;
+ int fnlen = strlen(filename);
+
+ for (; *chrptr; chrptr++) ndollars += (*chrptr == *pFilter);
+
+ out = (char*) malloc(strlen(command) - ndollars + ndollars * fnlen + 1);
+
+ outend = out;
+
+ for (chrptr = command; *chrptr; chrptr++) {
+ if (*chrptr == *pFilter) {
+ strcpy(outend, filename);
+ outend += fnlen;
+ } else {
+ *outend++ = *chrptr;
+ }
+ }
+ *outend = '\0';
+ return out;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ char *
+ StrToUpper(char *str)
+ {
+ char *chptr;
+ for (chptr = str; *chptr; chptr++) {
+ *chptr = (char)toupper(*chptr);
+ }
+ return str;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ std::string&
+ Trim(std::string& rStr)
+ {
+ // WHITE_CHARS is defined at the top of this file
+ std::string::size_type pos = rStr.find_last_not_of(WHITE_CHARS);
+ if(pos != std::string::npos)
+ {
+ rStr.erase(pos + 1);
+ pos = rStr.find_first_not_of(WHITE_CHARS);
+ if(pos != std::string::npos) rStr.erase(0, pos);
+ }
+ else
+ rStr.erase(rStr.begin(), rStr.end());
+
+ return rStr;
+ }
+
+
+} // namespace TNet
+
+//#ifdef CYGWIN
+
+void assertf(const char *c, int i, const char *msg){
+ printf("Assertion \"%s\" failed: file \"%s\", line %d\n", msg?msg:"(null)", c?c:"(null)", i);
+ abort();
+}
+
+
+void assertf_throw(const char *c, int i, const char *msg){
+ char buf[2000];
+ snprintf(buf, 1999, "Assertion \"%s\" failed, throwing exception: file \"%s\", line %d\n", msg?msg:"(null)", c?c:"(null)", i);
+ throw std::runtime_error((std::string)buf);
+}
+//#endif
diff --git a/src/KaldiLib/Common.h b/src/KaldiLib/Common.h
new file mode 100644
index 0000000..9cd9658
--- /dev/null
+++ b/src/KaldiLib/Common.h
@@ -0,0 +1,233 @@
+#ifndef TNet_Common_h
+#define TNet_Common_h
+
+#include <cstdlib>
+#include <string.h> // C string stuff like strcpy
+#include <string>
+#include <sstream>
+#include <stdexcept>
+
+/* Alignment of critical dynamic data structure
+ *
+ * Not all platforms support memalign so we provide a stk_memalign wrapper
+ * void *stk_memalign( size_t align, size_t size, void **pp_orig )
+ * *pp_orig is the pointer that has to be freed afterwards.
+ */
+#ifdef HAVE_POSIX_MEMALIGN
+# define stk_memalign(align,size,pp_orig) \
+ ( !posix_memalign( pp_orig, align, size ) ? *(pp_orig) : NULL )
+# ifdef STK_MEMALIGN_MANUAL
+# undef STK_MEMALIGN_MANUAL
+# endif
+#elif defined(HAVE_MEMALIGN)
+ /* Some systems have memalign() but no declaration for it */
+ //void * memalign( size_t align, size_t size );
+# define stk_memalign(align,size,pp_orig) \
+ ( *(pp_orig) = memalign( align, size ) )
+# ifdef STK_MEMALIGN_MANUAL
+# undef STK_MEMALIGN_MANUAL
+# endif
+#else /* We don't have any choice but to align manually */
+# define stk_memalign(align,size,pp_orig) \
+ (( *(pp_orig) = malloc( size + align - 1 )) ? \
+ (void *)( (((unsigned long)*(pp_orig)) + ((align) - 1)) & ~((unsigned long)(align) - 1) ) : NULL )
+# define STK_MEMALIGN_MANUAL
+#endif
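For illustration only (AlignedFloats is a made-up helper, not part of Common.h): the value returned by stk_memalign is the aligned pointer the caller works with, while the pointer stored through the third argument is the one that must later be passed to free().

    #include <cstdlib>
    #include "Common.h"   // provides the stk_memalign macro

    // Allocate n floats on a 64-byte boundary (the alignment must be a power of two).
    // *pp_orig receives the raw pointer that has to be freed afterwards.
    float* AlignedFloats(size_t n, void** pp_orig) {
      return static_cast<float*>(stk_memalign(64, n * sizeof(float), pp_orig));
    }

    // typical call site:
    //   void*  orig = NULL;
    //   float* buf  = AlignedFloats(1024, &orig);
    //   ...
    //   free(orig);   // free the original pointer, never the aligned one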
+
+
+#define swap8(a) { \
+ char t=((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[7]; ((char*)&a)[7]=t;\
+ t=((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[6]; ((char*)&a)[6]=t;\
+ t=((char*)&a)[2]; ((char*)&a)[2]=((char*)&a)[5]; ((char*)&a)[5]=t;\
+ t=((char*)&a)[3]; ((char*)&a)[3]=((char*)&a)[4]; ((char*)&a)[4]=t;}
+#define swap4(a) { \
+ char t=((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[3]; ((char*)&a)[3]=t;\
+ t=((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[2]; ((char*)&a)[2]=t;}
+#define swap2(a) { \
+ char t=((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[1]; ((char*)&a)[1]=t;}
+
+
+namespace TNet
+{
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Aligns a number to a specified base
+ * @param n Number of type @c _T to align
+ * @return Aligned value of type @c _T
+ */
+ template<size_t _align, typename _T>
+ inline _T
+ align(const _T n)
+ {
+ const _T x(_align - 1);
+ return (n + x) & ~(x);
+ }
+
+
+ /**
+ * @brief Returns true if architecture is big endian
+ */
+ bool
+ IsBigEndian();
+
+
+ /**
+ * @brief Returns true if two numbers are close enough to each other
+ *
+ * @param f1 First operand
+ * @param f2 Second operand
+ * @param nRounds Expected number of operations prior to this comparison
+ */
+ bool
+ CloseEnough(const float f1, const float f2, const float nRounds);
+
+
+ /**
+ * @brief Returns true if two numbers are close enough to each other
+ *
+ * @param f1 First operand
+ * @param f2 Second operand
+ * @param nRounds Expected number of operations prior to this comparison
+ */
+ bool
+ CloseEnough(const double f1, const double f2, const double nRounds);
+
+
+ /**
+ * @brief Parses a HTK-style string into a C++ std::string readable
+ *
+ * @param rIn HTK input string
+ * @param rOut output parsed string
+ */
+ void
+ ParseHTKString(const std::string & rIn, std::string & rOut);
+
+
+ /**
+ * @brief Synthesize new file name based on name, path, and extension
+ *
+ * @param pOutFileName full output file name
+ * @param pInFileName file name
+ * @param pOutDir directory
+ * @param pOutExt extension
+ */
+ void
+ MakeHtkFileName(char *pOutFileName, const char* pInFileName, const char *pOutDir,
+ const char *pOutExt);
+
+
+ /**
+ * @brief Removes the leading and trailing white chars
+ *
+ * @param rStr Reference to the string to be processed
+ * @return Reference to the original string
+ *
+ * The white characters are determined by the @c WHITE_CHARS macro defined
+ * in Common.cc.
+ */
+ std::string&
+ Trim(std::string& rStr);
+
+
+ char*
+ StrToUpper(char* pStr);
+
+ char*
+ ExpandHtkFilterCmd(const char *command, const char *filename, const char* pFilter);
+
+
+ template <class T>
+ std::string to_string(const T& val)
+ {
+ std::stringstream ss;
+ ss << val;
+ return ss.str();
+ }
+
+ inline void
+ ExpectKeyword(std::istream &i_stream, const char *kwd)
+ {
+ std::string token;
+ i_stream >> token;
+ if (token != kwd) {
+ throw std::runtime_error(std::string(kwd) + " expected");
+ }
+ }
+
+ extern const int MATRIX_IOS_FORMAT_IWORD;
+
+ enum MatrixVectorIostreamControlBits {
+ ACCUMULATE_INPUT = 1,
+// BINARY_OUTPUT = 2
+ };
+
+ class MatrixVectorIostreamControl
+ {
+ public:
+ MatrixVectorIostreamControl(enum MatrixVectorIostreamControlBits bitsToBeSet, bool valueToBeSet)
+ : mBitsToBeSet(bitsToBeSet), mValueToBeSet(valueToBeSet) {}
+
+ static long Flags(std::ios_base &rIos, enum MatrixVectorIostreamControlBits bits)
+ { return rIos.iword(MATRIX_IOS_FORMAT_IWORD); }
+
+ long mBitsToBeSet;
+ bool mValueToBeSet;
+
+ friend std::ostream & operator <<(std::ostream &rOs, const MatrixVectorIostreamControl modifier)
+ {
+ if(modifier.mValueToBeSet) {
+ rOs.iword(MATRIX_IOS_FORMAT_IWORD) |= modifier.mBitsToBeSet;
+ } else {
+ rOs.iword(MATRIX_IOS_FORMAT_IWORD) &= ~modifier.mBitsToBeSet;
+ }
+ return rOs;
+ }
+
+ friend std::istream & operator >>(std::istream &rIs, const MatrixVectorIostreamControl modifier)
+ {
+ if(modifier.mValueToBeSet) {
+ rIs.iword(MATRIX_IOS_FORMAT_IWORD) |= modifier.mBitsToBeSet;
+ } else {
+ rIs.iword(MATRIX_IOS_FORMAT_IWORD) &= ~modifier.mBitsToBeSet;
+ }
+ return rIs;
+ }
+ };
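For illustration only (DemoAccumulate is a made-up name): the modifier is pushed into a stream like any other manipulator, and the flag can later be read back from the stream's iword slot.

    #include <istream>
    #include "Common.h"

    // Switch a stream into "accumulate input" mode and report whether the
    // flag is now set in its iword slot.
    bool DemoAccumulate(std::istream& is) {
      using namespace TNet;
      is >> MatrixVectorIostreamControl(ACCUMULATE_INPUT, true);
      return (is.iword(MATRIX_IOS_FORMAT_IWORD) & ACCUMULATE_INPUT) != 0;
    }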
+
+
+
+
+} // namespace TNet
+
+#ifdef __ICC
+#pragma warning (disable: 383) // ICPC remark we don't want.
+#pragma warning (disable: 810) // ICPC remark we don't want.
+#pragma warning (disable: 981) // ICPC remark we don't want.
+#pragma warning (disable: 1418) // ICPC remark we don't want.
+#pragma warning (disable: 444) // ICPC remark we don't want.
+#pragma warning (disable: 869) // ICPC remark we don't want.
+#pragma warning (disable: 1287) // ICPC remark we don't want.
+#pragma warning (disable: 279) // ICPC remark we don't want.
+#pragma warning (disable: 981) // ICPC remark we don't want.
+#endif
+
+//#ifdef CYGWIN
+#if 1
+#undef assert
+#ifndef NDEBUG
+#define assert(e) ((e) ? (void)0 : assertf(__FILE__, __LINE__, #e))
+#else
+#define assert(e) ((void)0)
+#endif
+void assertf(const char *c, int i, const char *msg); // Makes it possible to set a breakpoint on assertion failures in gdb; works around an assert problem on Cygwin.
+#else
+#include <cassert>
+#endif
+
+#define assert_throw(e) ((e) ? (void)0 : assertf_throw(__FILE__, __LINE__, #e))
+void assertf_throw(const char *c, int i, const char *msg);
+
+#define DAN_STYLE_IO
+
+#endif // ifndef TNet_Common_h
+
diff --git a/src/KaldiLib/Error.h b/src/KaldiLib/Error.h
new file mode 100644
index 0000000..873f3db
--- /dev/null
+++ b/src/KaldiLib/Error.h
@@ -0,0 +1,155 @@
+//
+// C++ Interface: %{MODULE}
+//
+// Description:
+//
+//
+// Author: %{AUTHOR} <%{EMAIL}>, (C) %{YEAR}
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+/** @file Error.h
+ * This header defines several types and functions relating to the
+ * handling of exceptions in STK.
+ */
+
+#ifndef TNET_Error_h
+#define TNET_Error_h
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <sstream>
+
+#include <cstdlib>
+#include <execinfo.h>
+
+// These macro names clash with the macros defined in STK,
+// so we must use the very same macros here.
+//
+//#define Error(msg) _Error_(__func__, __FILE__, __LINE__, msg)
+//#define Warning(msg) _Warning_(__func__, __FILE__, __LINE__, msg)
+//#define TraceLog(msg) _TraceLog_(__func__, __FILE__, __LINE__, msg)
+//
+
+#ifndef Error
+ #define Error(...) _Error_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+#ifndef Warning
+ #define Warning(...) _Warning_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+#ifndef TraceLog
+ #define TraceLog(...) _TraceLog_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+#endif
+
+namespace TNet {
+
+
+
+ /** MyException
+ * Custom exception class that captures the stack trace at construction time
+ */
+ class MyException
+ : public std::runtime_error
+ {
+ public:
+ explicit MyException(const std::string& what_arg) throw();
+ virtual ~MyException() throw();
+
+ const char* what() const throw()
+ { return mWhat.c_str(); }
+
+ private:
+ std::string mWhat;
+ };
+
+ /**
+ * MyException:: implementation
+ */
+ inline
+ MyException::
+ MyException(const std::string& what_arg) throw()
+ : std::runtime_error(what_arg)
+ {
+ mWhat = what_arg;
+ mWhat += "\nTHE STACKTRACE INSIDE MyException OBJECT IS:\n";
+
+ void *array[10];
+ size_t size;
+ char **strings;
+ size_t i;
+
+ size = backtrace (array, 10);
+ strings = backtrace_symbols (array, size);
+
+ // the 0th string is the MyException constructor frame, so skip it and start at 1
+ for (i = 1; i < size; i++) {
+ mWhat += strings[i];
+ mWhat += "\n";
+ }
+
+ free (strings);
+ }
+
+
+ inline
+ MyException::
+ ~MyException() throw()
+ { }
+
+
+
+ /**
+ * @brief Error throwing function (with backtrace)
+ */
+ inline void
+ _Error_(const char *func, const char *file, int line, const std::string &msg)
+ {
+ std::stringstream ss;
+ ss << "ERROR (" << func << ':' << file << ':' << line << ") " << msg;
+ throw MyException(ss.str());
+ }
+
+ /**
+ * @brief Warning handling function
+ */
+ inline void
+ _Warning_(const char *func, const char *file, int line, const std::string &msg)
+ {
+ std::cout << "WARNING (" << func << ':' << file << ':' << line << ") " << msg << std::endl;
+ }
+
+ inline void
+ _TraceLog_(const char *func, const char *file, int line, const std::string &msg)
+ {
+ std::cout << "INFO (" << func << ':' << file << ':' << line << ") " << msg << std::endl;
+ }
+
+ /**
+ * New kaldi error handling:
+ *
+ * class KaldiErrorMessage is invoked from the KALDI_ERR macro.
+ * The destructor throws an exception.
+ */
+ class KaldiErrorMessage {
+ public:
+ KaldiErrorMessage(const char *func, const char *file, int line) {
+ this->stream() << "ERROR ("
+ << func << "():"
+ << file << ':' << line << ") ";
+ }
+ inline std::ostream &stream() { return ss; }
+ ~KaldiErrorMessage() { throw MyException(ss.str()); }
+ private:
+ std::ostringstream ss;
+ };
+ #define KALDI_ERR TNet::KaldiErrorMessage(__func__, __FILE__, __LINE__).stream()
+
+
+
+} // namespace TNet
+
+//#define TNET_Error_h
+#endif
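For illustration only (not part of the header): the Error/Warning macros and the backtrace-carrying exception are used like this; KALDI_ERR << "..." behaves like Error() but with a streaming interface.

    #include <iostream>
    #include "Error.h"

    using namespace TNet;

    int main() {
      try {
        Warning("non-fatal condition, execution continues");
        Error("fatal condition, aborting");   // throws MyException
      } catch (MyException& e) {
        std::cerr << e.what() << std::endl;   // message followed by the captured backtrace
        return 1;
      }
      return 0;
    }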
diff --git a/src/KaldiLib/Features.cc b/src/KaldiLib/Features.cc
new file mode 100644
index 0000000..8d173bc
--- /dev/null
+++ b/src/KaldiLib/Features.cc
@@ -0,0 +1,1798 @@
+
+//enable feature repository profiling
+#define PROFILING 1
+
+#include <sstream>
+#include <map>
+#include <list>
+#include <cstdio>
+
+#include "Features.h"
+#include "Tokenizer.h"
+#include "StkMatch.h"
+#include "Types.h"
+
+
+
+namespace TNet
+{
+ const char
+ FeatureRepository::
+ mpParmKindNames[13][16] =
+ {
+ {"WAVEFORM"},
+ {"LPC"},
+ {"LPREFC"},
+ {"LPCEPSTRA"},
+ {"LPDELCEP"},
+ {"IREFC"},
+ {"MFCC"},
+ {"FBANK"},
+ {"MELSPEC"},
+ {"USER"},
+ {"DISCRETE"},
+ {"PLP"},
+ {"ANON"}
+ };
+
+ //***************************************************************************
+ //***************************************************************************
+
+ FileListElem::
+ FileListElem(const std::string & rFileName)
+ {
+ std::string::size_type pos;
+
+ mLogical = rFileName;
+ mWeight = 1.0;
+
+ // some slash-backslash replacement hack
+ for (size_t i = 0; i < mLogical.size(); i++) {
+ if (mLogical[i] == '\\') {
+ mLogical[i] = '/';
+ }
+ }
+
+ // read sentence weight definition if any ( physical_file.fea[s,e]{weight} )
+ if ((pos = mLogical.find('{')) != std::string::npos)
+ {
+ std::string tmp_weight(mLogical.begin() + pos + 1, mLogical.end());
+ std::stringstream tmp_ss(tmp_weight);
+
+ tmp_ss >> mWeight;
+ mLogical.erase(pos);
+ }
+
+ // look for "=" symbol and if found, split it
+ if ((pos = mLogical.find('=')) != std::string::npos)
+ {
+ // copy all from mLogical[pos+1] till the end to mPhysical
+ mPhysical.assign(mLogical.begin() + pos + 1, mLogical.end());
+ // erase all from pos + 1 till the end from mLogical
+ mLogical.erase(pos);
+ // trim the leading and trailing spaces
+ Trim(mPhysical);
+ Trim(mLogical);
+ }
+ else
+ {
+ // trim the leading and trailing spaces
+ Trim(mLogical);
+
+ mPhysical = mLogical;
+ }
+ }
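For illustration only (a made-up snippet; Logical()/Physical() are the accessors used by FeatureRepository below): the constructor splits script-file entries of the form logical=physical{weight}.

    #include <iostream>
    #include "Features.h"

    int main() {
      using namespace TNet;
      // full form "logical=physical{weight}"
      FileListElem e1("spk1_utt3=/data/fea/utt3.fea{0.5}");
      std::cout << e1.Logical() << " -> " << e1.Physical() << std::endl;
      // expected: spk1_utt3 -> /data/fea/utt3.fea   (sentence weight parsed as 0.5)

      // plain file name: logical and physical coincide, weight defaults to 1.0
      FileListElem e2("/data/fea/utt4.fea");
      std::cout << e2.Logical() << " -> " << e2.Physical() << std::endl;
      return 0;
    }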
+
+
+ //###########################################################################
+ //###########################################################################
+ // FeatureRepository section
+ //###########################################################################
+ //###########################################################################
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ ReadCepsNormFile(
+ const char * pFileName,
+ char ** pLastFileName,
+ BaseFloat ** vec_buff,
+ int sampleKind,
+ CNFileType type,
+ int coefs)
+ {
+ FILE* fp;
+ int i;
+ char s1[64];
+ char s2[64];
+ const char* typeStr = (type == CNF_Mean ? "MEAN" :
+ type == CNF_Variance ? "VARIANCE" : "VARSCALE");
+
+ const char* typeStr2 = (type == CNF_Mean ? "CMN" :
+ type == CNF_Variance ? "CVN" : "VarScale");
+
+ if (*pLastFileName != NULL && !strcmp(*pLastFileName, pFileName)) {
+ return;
+ }
+ free(*pLastFileName);
+ *pLastFileName=strdup(pFileName);
+ *vec_buff = (BaseFloat*) realloc(*vec_buff, coefs * sizeof(BaseFloat));
+
+ if (*pLastFileName == NULL || *vec_buff== NULL)
+ throw std::runtime_error("Insufficient memory");
+
+ if ((fp = fopen(pFileName, "r")) == NULL) {
+ throw std::runtime_error(std::string("Cannot open ") + typeStr2
+ + " pFileName: '" + pFileName + "'");
+ }
+
+ if ((type != CNF_VarScale
+ && (fscanf(fp, " <%64[^>]> <%64[^>]>", s1, s2) != 2
+ || strcmp(StrToUpper(s1), "CEPSNORM")
+ || ReadParmKind(s2, false) != sampleKind))
+ || fscanf(fp, " <%64[^>]> %d", s1, &i) != 2
+ || strcmp(StrToUpper(s1), typeStr)
+ || i != coefs)
+ {
+ ParmKind2Str(sampleKind, s2);
+
+ //std::cout << "[[[TADY!!!!]]]" << pFileName << "\n" << std::flush;
+
+ throw std::runtime_error(std::string("")
+ + (type == CNF_VarScale ? "" : "<CEPSNORM> <")
+ + (type == CNF_VarScale ? "" : s2)
+ + (type == CNF_VarScale ? "" : ">")
+ + " <" + typeStr + " ... expected in " + typeStr2
+ + " file " + pFileName);
+ }
+
+ for (i = 0; i < coefs; i++) {
+ if (fscanf(fp, " "FLOAT_FMT, *vec_buff+i) != 1) {
+ if (fscanf(fp, "%64s", s2) == 1) {
+ throw std::runtime_error(std::string("Decimal number expected but '")
+ + s2 + "' found in " + typeStr2 + " file " + pFileName);
+ }
+ else if (feof(fp)) {
+ throw std::runtime_error(std::string("Unexpected end of ")
+ + typeStr2 + " file "+ pFileName);
+ }
+ else {
+ throw std::runtime_error(std::string("Cannot read ") + typeStr2
+ + " file " + pFileName);
+ }
+ }
+
+ if (type == CNF_Variance)
+ (*vec_buff)[i] = BaseFloat(1 / sqrt((*vec_buff)[i]));
+ else if (type == CNF_VarScale)
+ (*vec_buff)[i] = BaseFloat(sqrt((*vec_buff)[i]));
+ }
+
+ if (fscanf(fp, "%64s", s2) == 1)
+ {
+ throw std::runtime_error(std::string("End of file expected but '")
+ + s2 + "' found in " + typeStr2 + " file " + pFileName);
+ }
+
+ fclose(fp);
+ } // ReadCepsNormFile(...)
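Read back from the fscanf patterns above, a cepstral mean file for 39 coefficients is expected to look roughly like this (values shortened here for illustration):

    <CEPSNORM> <MFCC_0_D_A>
    <MEAN> 39
    -10.42 0.85 -1.27 ... 0.03

A variance file uses <VARIANCE> instead and each value is converted to 1/sqrt(v) on load; a variance-scale file omits the <CEPSNORM> line and starts directly with <VARSCALE>.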
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ HtkFilter(const char* pFilter, const char* pValue, FeatureRepository& rOut)
+ {
+ std::list<FileListElem>::iterator it;
+ std::string str;
+
+ rOut.mSwapFeatures = mSwapFeatures;
+ rOut.mStartFrameExt = mStartFrameExt;
+ rOut.mEndFrameExt = mEndFrameExt;
+ rOut.mTargetKind = mTargetKind;
+ rOut.mDerivOrder = mDerivOrder;
+ rOut.mDerivWinLengths = mDerivWinLengths;
+
+ rOut.mpCvgFile = mpCvgFile;
+ rOut.mpCmnPath = mpCmnPath;
+ rOut.mpCmnMask = mpCmnMask;
+ rOut.mpCvnPath = mpCvnPath;
+ rOut.mpCvnMask = mpCvnMask;
+
+ rOut.mInputQueue.clear();
+
+ // go through all records and check the mask
+ for (it=mInputQueue.begin(); it!= mInputQueue.end(); ++it) {
+ if (pFilter == NULL
+ || (ProcessMask(it->Logical(), pFilter, str) && (str == pValue))) {
+ rOut.mInputQueue.push_back(*it);
+ }
+ }
+
+ // set the queue position to the beginning
+ rOut.mInputQueueIterator = mInputQueue.end();
+
+ rOut.mCurrentIndexFileName = "";
+ rOut.mCurrentIndexFileDir = "";
+ rOut.mCurrentIndexFileExt = "";
+
+ mStream.close();
+ mStream.clear();
+
+ rOut.mpLastFileName = NULL;
+ rOut.mLastFileName = "";
+ rOut.mpLastCmnFile = NULL;
+ rOut.mpLastCvnFile = NULL;
+ rOut.mpLastCvgFile = NULL;
+ rOut.mpCmn = NULL;
+ rOut.mpCvn = NULL;
+ rOut.mpCvg = NULL;
+ rOut.mpA = NULL;
+ rOut.mpB = NULL;
+
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ HtkSelection(const char* pFilter, std::list< std::string >& rOut)
+ {
+ std::map< std::string, bool> aux_map;
+ std::map< std::string, bool>::iterator map_it;
+ std::list<FileListElem>::iterator it;
+ std::string str;
+
+ rOut.clear();
+
+ if(pFilter != NULL) {
+ // go through all records and check the mask
+ for (it=mInputQueue.begin(); it!= mInputQueue.end(); ++it) {
+ if (ProcessMask(it->Logical(), pFilter, str)) {
+ aux_map[str] = true;
+ }
+ }
+ } else {
+ aux_map[std::string("default speaker")] = true;
+ }
+
+ for (map_it = aux_map.begin(); map_it != aux_map.end(); ++map_it) {
+ rOut.push_back(map_it->first);
+ }
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ ParmKind2Str(unsigned parmKind, char *pOutString)
+ {
+ // :KLUDGE: Absolutely no idea what this is...
+ if ((parmKind & 0x003F) >= sizeof(mpParmKindNames)/sizeof(mpParmKindNames[0]))
+ return 0;
+
+ strcpy(pOutString, mpParmKindNames[parmKind & 0x003F]);
+
+ if (parmKind & PARAMKIND_E) strcat(pOutString, "_E");
+ if (parmKind & PARAMKIND_N) strcat(pOutString, "_N");
+ if (parmKind & PARAMKIND_D) strcat(pOutString, "_D");
+ if (parmKind & PARAMKIND_A) strcat(pOutString, "_A");
+ if (parmKind & PARAMKIND_C) strcat(pOutString, "_C");
+ if (parmKind & PARAMKIND_Z) strcat(pOutString, "_Z");
+ if (parmKind & PARAMKIND_K) strcat(pOutString, "_K");
+ if (parmKind & PARAMKIND_0) strcat(pOutString, "_0");
+ if (parmKind & PARAMKIND_V) strcat(pOutString, "_V");
+ if (parmKind & PARAMKIND_T) strcat(pOutString, "_T");
+
+ return 1;
+ }
+
+
+ // //***************************************************************************
+ // //***************************************************************************
+ // void
+ // AddFileListToFeatureRepositories(
+ // const char* pFileName,
+ // const char* pFilter,
+ // std::queue<FeatureRepository *> &featureRepositoryList)
+ // {
+ // IStkStream l_stream;
+ // std::string file_name;
+ // Tokenizer file_list(pFileName, ",");
+ // Tokenizer::iterator p_file_name;
+
+ // //:TODO: error if empty featureRepositoryList
+ //
+ // for (p_file_name = file_list.begin(); p_file_name != file_list.end(); ++p_file_name)
+ // {
+ // // get rid of initial and trailing blanks
+ // Trim(*p_file_name);
+
+ // // open file name
+ // l_stream.open(p_file_name->c_str(), std::ios::in, pFilter);
+ //
+ // if (!l_stream.good()) {
+ // //:TODO:
+ // // Warning or error ... Why warning? -Lukas
+ // throw std::runtime_error(std::string("Cannot not open list file ") +
+ // *p_file_name);
+ // }
+
+ // // read all lines and parse them
+ // for(;;)
+ // {
+ // l_stream >> file_name;
+ // //:TODO: if(l_stream.badl()) Error()
+ // // Reading after last token set the fail bit
+ // if(l_stream.fail())
+ // break;
+ // // we can push_back a std::string as new FileListElem object
+ // // is created using FileListElem(const std::string&) constructor
+ // // and logical and physical names are correctly extracted
+ // featureRepositoryList.front()->mInputQueue.push_back(file_name);
+ //
+ // //cycle in the featureRepositoryList
+ // featureRepositoryList.push(featureRepositoryList.front());
+ // featureRepositoryList.pop();
+ // }
+ // l_stream.close();
+ // }
+ // } // AddFileList(const std::string & rFileName)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ Init(
+ bool swap,
+ int extLeft,
+ int extRight,
+ int targetKind,
+ int derivOrder,
+ int* pDerivWinLen,
+ const char* pCmnPath,
+ const char* pCmnMask,
+ const char* pCvnPath,
+ const char* pCvnMask,
+ const char* pCvgFile)
+ {
+ mSwapFeatures = swap;
+ mStartFrameExt = extLeft;
+ mEndFrameExt = extRight;
+ mTargetKind = targetKind;
+ mDerivOrder = derivOrder;
+ mDerivWinLengths = pDerivWinLen;
+ mpCmnPath = pCmnPath;
+ mpCmnMask = pCmnMask;
+ mpCvnPath = pCvnPath;
+ mpCvnMask = pCvnMask;
+ mpCvgFile = pCvgFile;
+ } // Init()
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ AddFile(const std::string & rFileName)
+ {
+ mInputQueue.push_back(rFileName);
+ } // AddFile(const std::string & rFileName)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ AddFileList(const char* pFileName, const char* pFilter)
+ {
+ IStkStream l_stream;
+ std::string file_name;
+ Tokenizer file_list(pFileName, ",");
+ Tokenizer::iterator p_file_name;
+
+ for (p_file_name = file_list.begin(); p_file_name != file_list.end(); ++p_file_name)
+ {
+ // get rid of spaces
+ Trim(*p_file_name);
+
+ // open the file
+ l_stream.open(p_file_name->c_str(), std::ios::in, pFilter);
+
+ if (!l_stream.good())
+ {
+ //:TODO:
+ // Warning or error ... Why warning? -Lukas
+ throw std::runtime_error(std::string("Cannot open list file ") +
+ *p_file_name);
+ }
+ // read all lines and parse them
+ for(;;)
+ {
+ l_stream >> file_name;
+ //:TODO: if(l_stream.bad()) Error()
+ // Reading past the last token sets the failbit
+ if(l_stream.fail())
+ break;
+ // we can push_back a std::string as new FileListElem object
+ // is created using FileListElem(const std::string&) constructor
+ // and logical and physical names are correctly extracted
+ mInputQueue.push_back(file_name);
+ }
+ l_stream.close();
+ }
+ } // AddFileList(const std::string & rFileName)
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ FeatureRepository::
+ MoveNext()
+ {
+ assert (mInputQueueIterator != mInputQueue.end());
+ mInputQueueIterator++;
+ } // MoveNext()
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ FeatureRepository::
+ ReadFullMatrix(Matrix<BaseFloat>& rMatrix)
+ {
+ // clear the matrix
+ rMatrix.Destroy();
+
+ // extract index file name
+ if (!mCurrentIndexFileDir.empty())
+ {
+ char tmp_name[mCurrentIndexFileDir.length() +
+ mCurrentIndexFileExt.length() +
+ mInputQueueIterator->Physical().length()];
+
+ MakeHtkFileName(tmp_name, mInputQueueIterator->Physical().c_str(),
+ mCurrentIndexFileDir.c_str(), mCurrentIndexFileExt.c_str());
+
+ mCurrentIndexFileName = tmp_name;
+ }
+ else
+ mCurrentIndexFileName = "";
+
+ //get the 3-letter suffix
+ int pos_last_three_chars = mInputQueueIterator->Physical().size() - 3;
+ if (pos_last_three_chars < 0) pos_last_three_chars = 0;
+ //read the gzipped ascii features
+ if (mInputQueueIterator->Physical().substr(pos_last_three_chars) == ".gz") {
+ return ReadGzipAsciiFeatures(*mInputQueueIterator, rMatrix);
+ }
+
+ // read the matrix and return the result
+ return ReadHTKFeatures(*mInputQueueIterator, rMatrix);
+ } // ReadFullMatrix(Matrix<BaseFloat>& rMatrix)
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ FeatureRepository::
+ WriteFeatureMatrix(const Matrix<BaseFloat>& rMatrix, const std::string& filename, int targetKind, int samplePeriod)
+ {
+ FILE* fp = fopen(filename.c_str(),"w");
+ if(NULL == fp) { Error(std::string("Cannot create file: ") + filename); return false; }
+
+ WriteHTKFeatures(fp, samplePeriod, targetKind, mSwapFeatures, const_cast<Matrix<BaseFloat>&>(rMatrix));
+
+ fclose(fp);
+
+ return true;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ // private:
+ int
+ FeatureRepository::
+ ReadHTKHeader()
+ {
+ // TODO
+ // Change this... We should read from StkStream
+ FILE* fp = mStream.fp();
+
+ if (!fread(&mHeader.mNSamples, sizeof(INT_32), 1, fp)) return -1;
+ if (!fread(&mHeader.mSamplePeriod, sizeof(INT_32), 1, fp)) return -1;
+ if (!fread(&mHeader.mSampleSize, sizeof(INT_16), 1, fp)) return -1;
+ if (!fread(&mHeader.mSampleKind, sizeof(UINT_16), 1, fp)) return -1;
+
+ if (mSwapFeatures)
+ {
+ swap4(mHeader.mNSamples);
+ swap4(mHeader.mSamplePeriod);
+ swap2(mHeader.mSampleSize);
+ swap2(mHeader.mSampleKind);
+ }
+
+ if (mHeader.mSamplePeriod < 0
+ || mHeader.mSamplePeriod > 100000
+ || mHeader.mNSamples < 0
+ || mHeader.mSampleSize < 0)
+ {
+ return -1;
+ }
+
+ return 0;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ // private:
+ int
+ FeatureRepository::
+ ReadHTKFeature(
+ BaseFloat* pIn,
+ size_t feaLen,
+ bool decompress,
+ BaseFloat* pScale,
+ BaseFloat* pBias)
+ {
+ FILE* fp = mStream.fp();
+
+ size_t i;
+
+ if (decompress)
+ {
+ INT_16 s;
+ // Per-coefficient compression constants (see WriteHTKFeatures):
+ // pScale[i] = (2*32767) / (xmax - xmin);
+ // pBias[i] = pScale[i] * (xmax + xmin) / 2;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ if (fread(&s, sizeof(INT_16), 1, fp) != 1)
+ return -1;
+
+ if (mSwapFeatures) swap2(s);
+ pIn[i] = ((BaseFloat)s + pBias[i]) / pScale[i];
+ }
+
+ return 0;
+ }
+
+#if !DOUBLEPRECISION
+ if (fread(pIn, sizeof(FLOAT_32), feaLen, fp) != feaLen)
+ return -1;
+
+ if (mSwapFeatures)
+ for (i = 0; i < feaLen; i++)
+ swap4(pIn[i]);
+#else
+ float f;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ if (fread(&f, sizeof(FLOAT_32), 1, fp) != 1)
+ return -1;
+
+ if (mSwapFeatures)
+ swap4(f);
+
+ pIn[i] = f;
+ }
+#endif
+ return 0;
+ } // int ReadHTKFeature
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+/* bool
+ FeatureRepository::
+ ReadHTKFeatures(const std::string& rFileName, Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ std::string file_name(rFileName);
+ std::string cmn_file_name;
+ std::string cvn_file_name;
+
+ int ext_left = mStartFrameExt;
+ int ext_right = mEndFrameExt;
+ int from_frame;
+ int to_frame;
+ int tot_frames;
+ int trg_vec_size;
+ int src_vec_size;
+ int src_deriv_order;
+ int lo_src_tgz_deriv_order;
+ int i;
+ int j;
+ int k;
+ int e;
+ int coefs;
+ int trg_E;
+ int trg_0;
+ int trg_N;
+ int src_E;
+ int src_0;
+ int src_N;
+ int comp;
+ int coef_size;
+ char* chptr;
+
+
+
+ // read frame range definition if any ( physical_file.fea[s,e] )
+ if ((chptr = strrchr(file_name.c_str(), '[')) == NULL ||
+ ((i=0), sscanf(chptr, "[%d,%d]%n", &from_frame, &to_frame, &i),
+ chptr[i] != '\0'))
+ {
+ chptr = NULL;
+ }
+
+ if (chptr != NULL)
+ *chptr = '\0';
+
+ // Experimental changes...
+ // if ((strcmp(file_name.c_str(), "-"))
+ // && (mpLastFileName != NULL)
+ // && (!strcmp(mpLastFileName, file_name.c_str())))
+ // {
+ // mHeader = mLastHeader;
+ // }
+ // else
+ // {
+ // if (mpLastFileName)
+ // {
+ // //if (mpFp != stdin)
+ // // fclose(mpFp);
+ // mStream.close();
+ //
+ // free(mpLastFileName);
+ // mpLastFileName = NULL;
+ // }
+
+ if ((file_name != "-" )
+ && (!mLastFileName.empty())
+ && (mLastFileName == file_name))
+ {
+ mHeader = mLastHeader;
+ }
+ else
+ {
+ if (!mLastFileName.empty())
+ {
+ mStream.close();
+ mLastFileName = "";
+ }
+
+
+ // open the feature file
+ mStream.open(file_name.c_str(), ios::binary);
+ if (!mStream.good())
+ {
+ Error("Cannot open feature file: '%s'", file_name.c_str());
+ }
+
+
+ if (ReadHTKHeader())
+ Error("Invalid HTK header in feature file: '%s'", file_name.c_str());
+
+ if (mHeader.mSampleKind & PARAMKIND_C)
+ {
+ // File is in compressed form, scale and pBias vectors
+ // are appended after HTK header.
+
+ int coefs = mHeader.mSampleSize/sizeof(INT_16);
+ mpA = (BaseFloat*) realloc(mpA, coefs * sizeof(BaseFloat));
+ mpB = (BaseFloat*) realloc(mpB, coefs * sizeof(BaseFloat));
+ if (mpA == NULL || mpB == NULL) Error("Insufficient memory");
+
+ e = ReadHTKFeature(mpA, coefs, 0, 0, 0);
+ e |= ReadHTKFeature(mpB, coefs, 0, 0, 0);
+
+ if (e)
+ Error("Cannot read feature file: '%s'", file_name.c_str());
+
+ mHeader.mNSamples -= 2 * sizeof(FLOAT_32) / sizeof(INT_16);
+ }
+
+ // remember current settings
+ mLastFileName = file_name;
+ mLastHeader = mHeader;
+ }
+
+ if (chptr != NULL)
+ *chptr = '[';
+
+ if (chptr == NULL)
+ { // Range [s,e] was not specified
+ from_frame = 0;
+ to_frame = mHeader.mNSamples-1;
+ }
+
+ src_deriv_order = PARAMKIND_T & mHeader.mSampleKind ? 3 :
+ PARAMKIND_A & mHeader.mSampleKind ? 2 :
+ PARAMKIND_D & mHeader.mSampleKind ? 1 : 0;
+ src_E = (PARAMKIND_E & mHeader.mSampleKind) != 0;
+ src_0 = (PARAMKIND_0 & mHeader.mSampleKind) != 0;
+ src_N = ((PARAMKIND_N & mHeader.mSampleKind) != 0) * (src_E + src_0);
+ comp = PARAMKIND_C & mHeader.mSampleKind;
+
+ mHeader.mSampleKind &= ~PARAMKIND_C;
+
+ if (mTargetKind == PARAMKIND_ANON)
+ {
+ mTargetKind = mHeader.mSampleKind;
+ }
+ else if ((mTargetKind & 077) == PARAMKIND_ANON)
+ {
+ mTargetKind &= ~077;
+ mTargetKind |= mHeader.mSampleKind & 077;
+ }
+
+ trg_E = (PARAMKIND_E & mTargetKind) != 0;
+ trg_0 = (PARAMKIND_0 & mTargetKind) != 0;
+ trg_N =((PARAMKIND_N & mTargetKind) != 0) * (trg_E + trg_0);
+
+ coef_size = comp ? sizeof(INT_16) : sizeof(FLOAT_32);
+ coefs = (mHeader.mSampleSize/coef_size + src_N) /
+ (src_deriv_order+1) - src_E - src_0;
+ src_vec_size = (coefs + src_E + src_0) * (src_deriv_order+1) - src_N;
+
+ //Is coefs dividable by 1 + number of derivatives specified in header
+ if (src_vec_size * coef_size != mHeader.mSampleSize)
+ {
+ Error("Invalid HTK header in feature file: '%s'. "
+ "mSampleSize do not match with parmKind", file_name.c_str());
+ }
+
+ if (mDerivOrder < 0)
+ mDerivOrder = src_deriv_order;
+
+
+ if ((!src_E && trg_E) || (!src_0 && trg_0) || (src_N && !trg_N) ||
+ (trg_N && !trg_E && !trg_0) || (trg_N && !mDerivOrder) ||
+ (src_N && !src_deriv_order && mDerivOrder) ||
+ ((mHeader.mSampleKind & 077) != (mTargetKind & 077) &&
+ (mHeader.mSampleKind & 077) != PARAMKIND_ANON))
+ {
+ char srcParmKind[64];
+ char trgParmKind[64];
+
+ ParmKind2Str(mHeader.mSampleKind, srcParmKind);
+ ParmKind2Str(mTargetKind, trgParmKind);
+ Error("Cannot convert %s to %s", srcParmKind, trgParmKind);
+ }
+
+ lo_src_tgz_deriv_order = LOWER_OF(src_deriv_order, mDerivOrder);
+ trg_vec_size = (coefs + trg_E + trg_0) * (mDerivOrder+1) - trg_N;
+
+ i = LOWER_OF(from_frame, mStartFrameExt);
+ from_frame -= i;
+ ext_left -= i;
+
+ i = LOWER_OF(mHeader.mNSamples-to_frame-1, mEndFrameExt);
+ to_frame += i;
+ ext_right -= i;
+
+ if (from_frame > to_frame || from_frame >= mHeader.mNSamples || to_frame< 0)
+ Error("Invalid frame range for feature file: '%s'", file_name.c_str());
+
+ tot_frames = to_frame - from_frame + 1 + ext_left + ext_right;
+
+ // initialize matrix
+ rFeatureMatrix.Init(tot_frames, trg_vec_size);
+
+ // fill the matrix with features
+ for (i = 0; i <= to_frame - from_frame; i++)
+ {
+ BaseFloat* A = mpA;
+ BaseFloat* B = mpB;
+ BaseFloat* mxPtr = rFeatureMatrix[i+ext_left];
+
+ // seek to the desired position
+ fseek(mStream.fp(),
+ sizeof(HtkHeader) + (comp ? src_vec_size * 2 * sizeof(FLOAT_32) : 0)
+ + (from_frame + i) * src_vec_size * coef_size,
+ SEEK_SET);
+
+ e = ReadHTKFeature(mxPtr, coefs, comp, A, B);
+
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0 && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0 && !trg_N) mxPtr++;
+ if (src_E && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E && !trg_N) mxPtr++;
+
+ for (j = 0; j < lo_src_tgz_deriv_order; j++)
+ {
+ e |= ReadHTKFeature(mxPtr, coefs, comp, A, B);
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0) mxPtr++;
+ if (src_E) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E) mxPtr++;
+ }
+
+ if (e)
+ Error("Cannot read feature file: '%s' frame %d/%d", file_name.c_str(),
+ i, to_frame - from_frame + 1);
+ }
+
+ // From now, coefs includes also trg_0 + trg_E !
+ coefs += trg_0 + trg_E;
+
+ // If extension of the matrix to the left or to the right is required,
+ // perform it here
+ for (i = 0; i < ext_left; i++)
+ {
+ memcpy(rFeatureMatrix[i],
+ rFeatureMatrix[ext_left],
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ for (i = tot_frames - ext_right; i < tot_frames; i++)
+ {
+ memcpy(rFeatureMatrix[i],
+ rFeatureMatrix[tot_frames - ext_right - 1],
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ // Sentence cepstral mean normalization
+ if( (mpCmnPath == NULL)
+ && !(PARAMKIND_Z & mHeader.mSampleKind)
+ && (PARAMKIND_Z & mTargetKind))
+ {
+ // for each coefficient
+ for(j=0; j < coefs; j++)
+ {
+ BaseFloat norm = 0.0;
+ for(i=0; i < tot_frames; i++) // for each frame
+ {
+ norm += rFeatureMatrix[i][j - trg_N];
+ //norm += fea_mx[i*trg_vec_size - trg_N + j];
+ }
+
+ norm /= tot_frames;
+
+ for(i=0; i < tot_frames; i++) // for each frame
+ rFeatureMatrix[i][j - trg_N] -= norm;
+ //fea_mx[i*trg_vec_size - trg_N + j] -= norm;
+ }
+ }
+
+ // Compute missing derivatives
+ for (; src_deriv_order < mDerivOrder; src_deriv_order++)
+ {
+ int winLen = mDerivWinLengths[src_deriv_order];
+ BaseFloat norm = 0.0;
+
+ for (k = 1; k <= winLen; k++)
+ {
+ norm += 2 * k * k;
+ }
+
+ // for each frame
+ for (i=0; i < tot_frames; i++)
+ {
+ // for each coefficient
+ for (j=0; j < coefs; j++)
+ {
+ //BaseFloat* src = fea_mx + i*trg_vec_size + src_deriv_order*coefs - trg_N + j;
+ BaseFloat* src = &rFeatureMatrix[i][src_deriv_order*coefs - trg_N + j];
+
+ *(src + coefs) = 0.0;
+
+ if (i < winLen || i >= tot_frames-winLen)
+ { // boundaries need special treatment
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ LOWER_OF(tot_frames-1-i,k)*rFeatureMatrix.Stride()]
+ -src[-LOWER_OF(i, k)*rFeatureMatrix.Stride()]);
+ }
+ }
+ else
+ { // otherwise use more efficient code
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ k * rFeatureMatrix.Stride()]
+ -src[-k * rFeatureMatrix.Stride()]);
+ }
+ }
+ *(src + coefs) /= norm;
+ }
+ }
+ }
+
+ mHeader.mNSamples = tot_frames;
+ mHeader.mSampleSize = trg_vec_size * sizeof(FLOAT_32);
+ mHeader.mSampleKind = mTargetKind & ~(PARAMKIND_D | PARAMKIND_A | PARAMKIND_T);
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ /////////////// Cepstral mean and variance normalization ///////////////////
+ ////////////////////////////////////////////////////////////////////////////
+ //.........................................................................
+ if (mpCmnPath != NULL
+ && mpCmnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(file_name, mpCmnMask, cmn_file_name);
+ // add the path correctly
+ cmn_file_name.insert(0, "/");
+ cmn_file_name.insert(0, mpCmnPath);
+
+ // read the file
+ ReadCepsNormFile(cmn_file_name.c_str(), &mpLastCmnFile, &mpCmn,
+ mHeader.mSampleKind & ~PARAMKIND_Z, CNF_Mean, coefs);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < coefs; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] -= mpCmn[j];
+ }
+ }
+ }
+
+ mHeader.mSampleKind |= mDerivOrder==3 ? PARAMKIND_D | PARAMKIND_A | PARAMKIND_T :
+ mDerivOrder==2 ? PARAMKIND_D | PARAMKIND_A :
+ mDerivOrder==1 ? PARAMKIND_D : 0;
+
+ //.........................................................................
+ if (mpCvnPath != NULL
+ && mpCvnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(file_name, mpCvnMask, cvn_file_name);
+ // add the path correctly
+ cvn_file_name.insert(0, "/");
+ cvn_file_name.insert(0, mpCvnPath);
+
+ // read the file
+ ReadCepsNormFile(cvn_file_name.c_str(), &mpLastCvnFile, &mpCvn,
+ mHeader.mSampleKind, CNF_Variance, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvn[j];
+ }
+ }
+ }
+
+ //.........................................................................
+ // process the global covariance file
+ if (mpCvgFile != NULL)
+ {
+ ReadCepsNormFile(mpCvgFile, &mpLastCvgFile, &mpCvg,
+ -1, CNF_VarScale, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvg[j];
+ }
+ }
+ }
+
+ return true;
+ }
+*/
+
+ //***************************************************************************
+ //***************************************************************************
+
+
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ FeatureRepository::
+ ReadHTKFeatures(const FileListElem& rFileNameRecord,
+ Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ std::string file_name(rFileNameRecord.Physical());
+ std::string cmn_file_name;
+ std::string cvn_file_name;
+
+ int ext_left = mStartFrameExt;
+ int ext_right = mEndFrameExt;
+ int from_frame;
+ int to_frame;
+ int tot_frames;
+ int trg_vec_size;
+ int src_vec_size;
+ int src_deriv_order;
+ int lo_src_tgz_deriv_order;
+ int i;
+ int j;
+ int k;
+ int e;
+ int coefs;
+ int trg_E;
+ int trg_0;
+ int trg_N;
+ int src_E;
+ int src_0;
+ int src_N;
+ int comp;
+ int coef_size;
+ char* chptr;
+
+
+ TIMER_START(mTim);
+
+ // read frame range definition if any ( physical_file.fea[s,e] )
+ if ((chptr = strrchr((char*)file_name.c_str(), '[')) == NULL ||
+ ((i=0), sscanf(chptr, "[%d,%d]%n", &from_frame, &to_frame, &i),
+ chptr[i] != '\0'))
+ {
+ chptr = NULL;
+ }
+
+ if (chptr != NULL)
+ *chptr = '\0';
+
+
+ if ((file_name != "-" )
+ && (!mLastFileName.empty())
+ && (mLastFileName == file_name))
+ {
+ mHeader = mLastHeader;
+ }
+ else
+ {
+ if (!mLastFileName.empty())
+ {
+ mStream.close();
+ mLastFileName = "";
+ }
+
+
+ // open the feature file
+ mStream.open(file_name.c_str(), std::ios::binary);
+ if (!mStream.good())
+ {
+ throw std::runtime_error(std::string("Cannot open feature file: '")
+ + file_name.c_str() + "'");
+ }
+
+
+ if (ReadHTKHeader()) {
+ throw std::runtime_error(std::string("Invalid HTK header in feature file: '")
+ + file_name.c_str() + "'");
+ }
+
+ if (mHeader.mSampleKind & PARAMKIND_C)
+ {
+ // File is in compressed form, scale and pBias vectors
+ // are appended after HTK header.
+ coefs = mHeader.mSampleSize/sizeof(INT_16);
+
+ mpA = (BaseFloat*) realloc(mpA, coefs * sizeof(BaseFloat));
+ mpB = (BaseFloat*) realloc(mpB, coefs * sizeof(BaseFloat));
+
+ if (mpA == NULL || mpB == NULL) {
+ throw std::runtime_error("Insufficient memory");
+ }
+
+ e = ReadHTKFeature(mpA, coefs, 0, 0, 0);
+ e |= ReadHTKFeature(mpB, coefs, 0, 0, 0);
+
+ if (e) {
+ throw std::runtime_error(std::string("Cannot read feature file: '")
+ + file_name.c_str() + "'");
+ }
+
+ mHeader.mNSamples -= 2 * sizeof(FLOAT_32) / sizeof(INT_16);
+ }
+
+ // remember current settings
+ mLastFileName = file_name;
+ mLastHeader = mHeader;
+ }
+
+ if (chptr != NULL) {
+ *chptr = '[';
+ }
+
+ if (chptr == NULL) {
+ // Range [s,e] was not specified
+ from_frame = 0;
+ to_frame = mHeader.mNSamples-1;
+ }
+
+ src_deriv_order = PARAMKIND_T & mHeader.mSampleKind ? 3 :
+ PARAMKIND_A & mHeader.mSampleKind ? 2 :
+ PARAMKIND_D & mHeader.mSampleKind ? 1 : 0;
+ src_E = (PARAMKIND_E & mHeader.mSampleKind) != 0;
+ src_0 = (PARAMKIND_0 & mHeader.mSampleKind) != 0;
+ src_N = ((PARAMKIND_N & mHeader.mSampleKind) != 0) * (src_E + src_0);
+ comp = PARAMKIND_C & mHeader.mSampleKind;
+
+ mHeader.mSampleKind &= ~PARAMKIND_C;
+
+ if (mTargetKind == PARAMKIND_ANON)
+ {
+ mTargetKind = mHeader.mSampleKind;
+ }
+ else if ((mTargetKind & 077) == PARAMKIND_ANON)
+ {
+ mTargetKind &= ~077;
+ mTargetKind |= mHeader.mSampleKind & 077;
+ }
+
+ trg_E = (PARAMKIND_E & mTargetKind) != 0;
+ trg_0 = (PARAMKIND_0 & mTargetKind) != 0;
+ trg_N =((PARAMKIND_N & mTargetKind) != 0) * (trg_E + trg_0);
+
+ coef_size = comp ? sizeof(INT_16) : sizeof(FLOAT_32);
+ coefs = (mHeader.mSampleSize/coef_size + src_N) /
+ (src_deriv_order+1) - src_E - src_0;
+ src_vec_size = (coefs + src_E + src_0) * (src_deriv_order+1) - src_N;
+
+ //Is coefs divisible by 1 + number of derivatives specified in header
+ if (src_vec_size * coef_size != mHeader.mSampleSize)
+ {
+ throw std::runtime_error(std::string("Invalid HTK header in feature file: '")
+ + file_name + "' mSampleSize does not match parmKind");
+ }
+
+ if (mDerivOrder < 0)
+ mDerivOrder = src_deriv_order;
+
+
+ if ((!src_E && trg_E) || (!src_0 && trg_0) || (src_N && !trg_N) ||
+ (trg_N && !trg_E && !trg_0) || (trg_N && !mDerivOrder) ||
+ (src_N && !src_deriv_order && mDerivOrder) ||
+ ((mHeader.mSampleKind & 077) != (mTargetKind & 077) &&
+ (mHeader.mSampleKind & 077) != PARAMKIND_ANON))
+ {
+ char srcParmKind[64];
+ char trgParmKind[64];
+ memset(srcParmKind,0,64);
+ memset(trgParmKind,0,64);
+
+ ParmKind2Str(mHeader.mSampleKind, srcParmKind);
+ ParmKind2Str(mTargetKind, trgParmKind);
+ throw std::runtime_error(std::string("Cannot convert ") + srcParmKind
+ + " to " + trgParmKind);
+ }
+
+ lo_src_tgz_deriv_order = std::min(src_deriv_order, mDerivOrder);
+ trg_vec_size = (coefs + trg_E + trg_0) * (mDerivOrder+1) - trg_N;
+
+ i = std::min(from_frame, mStartFrameExt);
+ from_frame -= i;
+ ext_left -= i;
+
+ i = std::min(mHeader.mNSamples-to_frame-1, mEndFrameExt);
+ to_frame += i;
+ ext_right -= i;
+
+ if (from_frame > to_frame || from_frame >= mHeader.mNSamples || to_frame< 0)
+ throw std::runtime_error(std::string("Invalid frame range for feature file: '")
+ + file_name.c_str() + "'");
+
+ tot_frames = to_frame - from_frame + 1 + ext_left + ext_right;
+
+
+ TIMER_END(mTim,mTimeOpen);
+
+
+ // initialize matrix
+ rFeatureMatrix.Init(tot_frames, trg_vec_size, false);
+
+ // fill the matrix with features
+ for (i = 0; i <= to_frame - from_frame; i++)
+ {
+ BaseFloat* A = mpA;
+ BaseFloat* B = mpB;
+ BaseFloat* mxPtr = rFeatureMatrix.pRowData(i+ext_left);
+
+ TIMER_START(mTim);
+ // seek to the desired position
+ fseek(mStream.fp(),
+ sizeof(HtkHeader) + (comp ? src_vec_size * 2 * sizeof(FLOAT_32) : 0)
+ + (from_frame + i) * src_vec_size * coef_size,
+ SEEK_SET);
+ TIMER_END(mTim,mTimeSeek);
+
+ TIMER_START(mTim);
+ // read
+ e = ReadHTKFeature(mxPtr, coefs, comp, A, B);
+ TIMER_END(mTim,mTimeRead);
+
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0 && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0 && !trg_N) mxPtr++;
+ if (src_E && !src_N) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E && !trg_N) mxPtr++;
+
+ for (j = 0; j < lo_src_tgz_deriv_order; j++)
+ {
+ e |= ReadHTKFeature(mxPtr, coefs, comp, A, B);
+ mxPtr += coefs;
+ A += coefs;
+ B += coefs;
+
+ if (src_0) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_0) mxPtr++;
+ if (src_E) e |= ReadHTKFeature(mxPtr, 1, comp, A++, B++);
+ if (trg_E) mxPtr++;
+ }
+
+ if (e) {
+ std::cout << mHeader.mNSamples << "\n";
+ std::cout << 2 * sizeof(FLOAT_32) / sizeof(INT_16) << "\n";
+ std::cout << "from" << from_frame << "to" << to_frame << "i" << i << "\n";
+
+ std::ostringstream s;
+ s << i << "/" << to_frame - from_frame + 1;
+ throw std::runtime_error(std::string("Cannot read feature file: '")
+ + file_name + "' frame " + s.str());
+ }
+ }
+
+ // From now, coefs includes also trg_0 + trg_E !
+ coefs += trg_0 + trg_E;
+
+ // If extension of the matrix to the left or to the right is required,
+ // perform it here
+ for (i = 0; i < ext_left; i++)
+ {
+ memcpy(rFeatureMatrix.pRowData(i),
+ rFeatureMatrix.pRowData(ext_left),
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ for (i = tot_frames - ext_right; i < tot_frames; i++)
+ {
+ memcpy(rFeatureMatrix.pRowData(i),
+ rFeatureMatrix.pRowData(tot_frames - ext_right - 1),
+ (coefs * (1+lo_src_tgz_deriv_order) - trg_N) * sizeof(BaseFloat));
+ }
+
+ // Sentence cepstral mean normalization
+ if( (mpCmnPath == NULL)
+ && !(PARAMKIND_Z & mHeader.mSampleKind)
+ && (PARAMKIND_Z & mTargetKind))
+ {
+ // for each coefficient
+ for(j=0; j < coefs; j++)
+ {
+ BaseFloat norm = 0.0;
+ for(i=0; i < tot_frames; i++) // for each frame
+ {
+ norm += rFeatureMatrix[i][j - trg_N];
+ //norm += fea_mx[i*trg_vec_size - trg_N + j];
+ }
+
+ norm /= tot_frames;
+
+ for(i=0; i < tot_frames; i++) // for each frame
+ rFeatureMatrix[i][j - trg_N] -= norm;
+ //fea_mx[i*trg_vec_size - trg_N + j] -= norm;
+ }
+ }
+
+ // Compute missing derivatives
+ for (; src_deriv_order < mDerivOrder; src_deriv_order++)
+ {
+ int winLen = mDerivWinLengths[src_deriv_order];
+ BaseFloat norm = 0.0;
+
+ for (k = 1; k <= winLen; k++)
+ {
+ norm += 2 * k * k;
+ }
+
+ // for each frame
+ for (i=0; i < tot_frames; i++)
+ {
+ // for each coefficient
+ for (j=0; j < coefs; j++)
+ {
+ //BaseFloat* src = fea_mx + i*trg_vec_size + src_deriv_order*coefs - trg_N + j;
+ BaseFloat* src = &rFeatureMatrix[i][src_deriv_order*coefs - trg_N + j];
+
+ *(src + coefs) = 0.0;
+
+ if (i < winLen || i >= tot_frames-winLen)
+ { // boundaries need special treatment
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ std::min(tot_frames-1-i,k)*rFeatureMatrix.Stride()]
+ -src[-std::min(i, k)*rFeatureMatrix.Stride()]);
+ }
+ }
+ else
+ { // otherwise use more efficient code
+ for (k = 1; k <= winLen; k++)
+ {
+ *(src+coefs) += k*(src[ k * rFeatureMatrix.Stride()]
+ -src[-k * rFeatureMatrix.Stride()]);
+ }
+ }
+ *(src + coefs) /= norm;
+ }
+ }
+ }
+
+ mHeader.mNSamples = tot_frames;
+ mHeader.mSampleSize = trg_vec_size * sizeof(FLOAT_32);
+ mHeader.mSampleKind = mTargetKind & ~(PARAMKIND_D | PARAMKIND_A | PARAMKIND_T);
+
+
+ TIMER_START(mTim);
+ ////////////////////////////////////////////////////////////////////////////
+ /////////////// Cepstral mean and variance normalization ///////////////////
+ ////////////////////////////////////////////////////////////////////////////
+ //.........................................................................
+ if (mpCmnPath != NULL
+ && mpCmnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(rFileNameRecord.Logical(), mpCmnMask, cmn_file_name);
+ // add the path correctly
+
+ if(cmn_file_name == "") {
+ throw std::runtime_error("CMN Matching failed");
+ }
+
+ cmn_file_name.insert(0, "/");
+ cmn_file_name.insert(0, mpCmnPath);
+
+ // read the file
+ ReadCepsNormFile(cmn_file_name.c_str(), &mpLastCmnFile, &mpCmn,
+ mHeader.mSampleKind & ~PARAMKIND_Z, CNF_Mean, coefs);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < coefs; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] -= mpCmn[j];
+ }
+ }
+ }
+
+ mHeader.mSampleKind |= mDerivOrder==3 ? PARAMKIND_D | PARAMKIND_A | PARAMKIND_T :
+ mDerivOrder==2 ? PARAMKIND_D | PARAMKIND_A :
+ mDerivOrder==1 ? PARAMKIND_D : 0;
+
+ //.........................................................................
+ if (mpCvnPath != NULL
+ && mpCvnMask != NULL)
+ {
+ // retrieve file name
+ ProcessMask(rFileNameRecord.Logical(), mpCvnMask, cvn_file_name);
+ // add the path correctly
+ cvn_file_name.insert(0, "/");
+ cvn_file_name.insert(0, mpCvnPath);
+
+ // read the file
+ ReadCepsNormFile(cvn_file_name.c_str(), &mpLastCvnFile, &mpCvn,
+ mHeader.mSampleKind, CNF_Variance, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvn[j];
+ }
+ }
+ }
+
+ //.........................................................................
+ // process the global covariance file
+ if (mpCvgFile != NULL)
+ {
+ ReadCepsNormFile(mpCvgFile, &mpLastCvgFile, &mpCvg,
+ -1, CNF_VarScale, trg_vec_size);
+
+ // recompute feature values
+ for (i=0; i < tot_frames; i++)
+ {
+ for (j=trg_N; j < trg_vec_size; j++)
+ {
+ rFeatureMatrix[i][j - trg_N] *= mpCvg[j];
+ }
+ }
+ }
+
+ TIMER_END(mTim,mTimeNormalize);
+
+ return true;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ ReadParmKind(const char *str, bool checkBrackets)
+ {
+ unsigned int i;
+ int parmKind =0;
+ int slen = strlen(str);
+
+ if (checkBrackets)
+ {
+ if (str[0] != '<' || str[slen-1] != '>') return -1;
+ str++; slen -= 2;
+ }
+
+ for (; slen >= 2 && str[slen-2] == '_'; slen -= 2)
+ {
+ parmKind |= str[slen-1] == 'E' ? PARAMKIND_E :
+ str[slen-1] == 'N' ? PARAMKIND_N :
+ str[slen-1] == 'D' ? PARAMKIND_D :
+ str[slen-1] == 'A' ? PARAMKIND_A :
+ str[slen-1] == 'C' ? PARAMKIND_C :
+ str[slen-1] == 'Z' ? PARAMKIND_Z :
+ str[slen-1] == 'K' ? PARAMKIND_K :
+ str[slen-1] == '0' ? PARAMKIND_0 :
+ str[slen-1] == 'V' ? PARAMKIND_V :
+ str[slen-1] == 'T' ? PARAMKIND_T : -1;
+
+ if (parmKind == -1) return -1;
+ }
+
+ for (i = 0; i < sizeof(mpParmKindNames) / sizeof(char*); i++)
+ {
+ if (!strncmp(str, mpParmKindNames[i], slen))
+ return parmKind | i;
+ }
+ return -1;
+ }
+
+
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKHeader (FILE * pOutFp, HtkHeader header, bool swap)
+ {
+ int cc;
+
+ if (swap) {
+ swap4(header.mNSamples);
+ swap4(header.mSamplePeriod);
+ swap2(header.mSampleSize);
+ swap2(header.mSampleKind);
+ }
+
+ fseek (pOutFp, 0L, SEEK_SET);
+ cc = fwrite(&header, sizeof(HtkHeader), 1, pOutFp);
+
+ if (swap) {
+ swap4(header.mNSamples);
+ swap4(header.mSamplePeriod);
+ swap2(header.mSampleSize);
+ swap2(header.mSampleKind);
+ }
+
+ return cc == 1 ? 0 : -1;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKFeature(
+ FILE * pOutFp,
+ FLOAT * pOut,
+ size_t feaLen,
+ bool swap,
+ bool compress,
+ FLOAT* pScale,
+ FLOAT* pBias)
+ {
+ size_t i;
+ size_t cc = 0;
+
+
+ if (compress)
+ {
+ INT_16 s;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ s = pOut[i] * pScale[i] - pBias[i];
+ if (swap)
+ swap2(s);
+ cc += fwrite(&s, sizeof(INT_16), 1, pOutFp);
+ }
+
+ } else {
+ #if !DOUBLEPRECISION
+ if (swap)
+ for (i = 0; i < feaLen; i++)
+ swap4(pOut[i]);
+
+ cc = fwrite(pOut, sizeof(FLOAT_32), feaLen, pOutFp);
+
+ if (swap)
+ for (i = 0; i < feaLen; i++)
+ swap4(pOut[i]);
+ #else
+ FLOAT_32 f;
+
+ for (i = 0; i < feaLen; i++)
+ {
+ f = pOut[i];
+ if (swap)
+ swap4(f);
+ cc += fwrite(&f, sizeof(FLOAT_32), 1, pOutFp);
+ }
+ #endif
+ }
+ return cc == feaLen ? 0 : -1;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKFeatures(
+ FILE * pOutFp,
+ FLOAT * pOut,
+ int nCoeffs,
+ int nSamples,
+ int samplePeriod,
+ int targetKind,
+ bool swap)
+ {
+ HtkHeader header;
+ int i, j;
+ FLOAT *pScale = NULL;
+ FLOAT *pBias = NULL;
+
+ header.mNSamples = nSamples + ((targetKind & PARAMKIND_C) ? 2 * sizeof(FLOAT_32) / sizeof(INT_16) : 0);
+ header.mSamplePeriod = samplePeriod;
+ header.mSampleSize = nCoeffs * ((targetKind & PARAMKIND_C) ? sizeof(INT_16) : sizeof(FLOAT_32));
+ header.mSampleKind = targetKind;
+
+ WriteHTKHeader (pOutFp, header, swap);
+
+ if(targetKind & PARAMKIND_C) {
+ pScale = (FLOAT*) malloc(nCoeffs * sizeof(FLOAT));
+ pBias = (FLOAT*) malloc(nCoeffs * sizeof(FLOAT));
+ if (pScale == NULL || pBias == NULL) Error("Insufficient memory");
+
+ for(i = 0; i < nCoeffs; i++) {
+ float xmin, xmax;
+ xmin = xmax = pOut[i];
+ for(j = 1; j < nSamples; j++) {
+ if(pOut[j*nCoeffs+i] > xmax) xmax = pOut[j*nCoeffs+i];
+ if(pOut[j*nCoeffs+i] < xmin) xmin = pOut[j*nCoeffs+i];
+ }
+ pScale[i] = (2*32767) / (xmax - xmin);
+ pBias[i] = pScale[i] * (xmax + xmin) / 2;
+
+
+ }
+ if (WriteHTKFeature(pOutFp, pScale, nCoeffs, swap, false, 0, 0)
+ || WriteHTKFeature(pOutFp, pBias, nCoeffs, swap, false, 0, 0)) {
+ return -1;
+ }
+ }
+ for(j = 0; j < nSamples; j++) {
+ if (WriteHTKFeature(pOutFp, &pOut[j*nCoeffs], nCoeffs, swap, targetKind & PARAMKIND_C, pScale, pBias)) {
+ return -1;
+ }
+ }
+ return 0;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ FeatureRepository::
+ WriteHTKFeatures(
+ FILE * pOutFp,
+ int samplePeriod,
+ int targetKind,
+ bool swap,
+ Matrix<BaseFloat>& rFeatureMatrix)
+ {
+ HtkHeader header;
+ size_t i, j;
+ FLOAT *p_scale = NULL;
+ FLOAT *p_bias = NULL;
+ size_t n_samples = rFeatureMatrix.Rows();
+ size_t n_coeffs = rFeatureMatrix.Cols();
+
+ header.mNSamples = n_samples + ((targetKind & PARAMKIND_C) ? 2 * sizeof(FLOAT_32) / sizeof(INT_16) : 0);
+ header.mSamplePeriod = samplePeriod;
+ header.mSampleSize = n_coeffs * ((targetKind & PARAMKIND_C) ? sizeof(INT_16) : sizeof(FLOAT_32));
+ header.mSampleKind = targetKind;
+
+ WriteHTKHeader (pOutFp, header, swap);
+
+ if(targetKind & PARAMKIND_C) {
+ p_scale = (FLOAT*) malloc(n_coeffs * sizeof(FLOAT));
+ p_bias = (FLOAT*) malloc(n_coeffs * sizeof(FLOAT));
+ if (p_scale == NULL || p_bias == NULL) Error("Insufficient memory");
+
+ for(i = 0; i < n_coeffs; i++) {
+ float xmin, xmax;
+ xmin = xmax = rFeatureMatrix[0][i];
+
+ for(j = 1; j < n_samples; j++) {
+ if(rFeatureMatrix[j][i] > xmax) xmax = rFeatureMatrix[j][i];
+ if(rFeatureMatrix[j][i] < xmin) xmin = rFeatureMatrix[j][i];
+ }
+
+ p_scale[i] = (2*32767) / (xmax - xmin);
+ p_bias[i] = p_scale[i] * (xmax + xmin) / 2;
+ }
+
+ if (WriteHTKFeature(pOutFp, p_scale, n_coeffs, swap, false, 0, 0)
+ || WriteHTKFeature(pOutFp, p_bias, n_coeffs, swap, false, 0, 0)) {
+ return -1;
+ }
+ }
+
+ for(j = 0; j < n_samples; j++) {
+ if (WriteHTKFeature(pOutFp, rFeatureMatrix[j].pData(), n_coeffs, swap, targetKind & PARAMKIND_C, p_scale, p_bias)) {
+ return -1;
+ }
+ }
+
+ return 0;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+
+
+ bool
+ FeatureRepository::
+ ReadGzipAsciiFeatures(const FileListElem& rFileNameRecord, Matrix<BaseFloat>& rFeatureMatrix)
+ {
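+ // Expected input (a sketch of the format this parser accepts): a gzipped
+ // ASCII file with one feature vector per line of whitespace-separated
+ // numbers, all rows of the same dimension, e.g.
+ //   0.12 -1.5 3.25
+ //   0.10 -1.4 3.30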
+ //build the command
+ std::string cmd("gunzip -c "); cmd += rFileNameRecord.Physical();
+
+ //define buffer
+ const int buf_size=262144;
+ char buf[buf_size];
+ char vbuf[2*buf_size];
+
+ TIMER_START(mTim);
+ //open the pipe
+ FILE* fp = popen(cmd.c_str(),"r");
+ if(fp == NULL) {
+ //2nd try...
+ Warning(std::string("2nd try to open pipe: ")+cmd);
+ sleep(5);
+ fp = popen(cmd.c_str(),"r");
+ if(fp == NULL) {
+ KALDI_ERR << "Cannot open pipe: " << cmd;
+ }
+ }
+ setvbuf(fp,vbuf,_IOFBF,2*buf_size);
+ TIMER_END(mTim,mTimeOpen);
+
+ //string will stay allocated across calls
+ static std::string line; line.resize(0);
+
+ //define matrix storage
+ static int cols = 131072;
+ std::list<std::vector<BaseFloat> > matrix(1);
+ matrix.front().reserve(cols);
+
+ //read all the lines to a vector
+ int line_ctr=1;
+ while(1) {
+ TIMER_START(mTim);
+ if(NULL == fgets(buf,buf_size,fp)) break;
+ TIMER_END(mTim,mTimeRead);
+
+ line += buf;
+ if(*(line.rbegin()) == '\n' || feof(fp)) {
+ //parse the line of numbers
+ TIMER_START(mTim);
+ const char* ptr = line.c_str();
+ char* end;
+ while(1) {
+ //skip whitespace
+ while(isspace(*ptr)) ptr++;
+ if(*ptr == 0) break;
+ //check that a number follows
+ switch(*ptr) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ case '.': case '+': case '-':
+ break;
+ default : KALDI_ERR << "A number was expected: " << ptr
+ << " reading from " << cmd;
+ exit(1);
+ }
+ //read a number
+ BaseFloat val = strtof(ptr,&end); ptr=end;
+ matrix.back().push_back(val);
+ }
+ TIMER_END(mTim,mTimeNormalize);
+ //we have the line of numbers, insert empty row to matrix
+ if(matrix.back().size() > 0 && !feof(fp)) {
+ matrix.push_back(std::vector<BaseFloat>());
+ matrix.back().reserve(matrix.front().size());
+ }
+ //dispose the current line
+ line.resize(0);//but stay allocated...
+ line_ctr++;
+ }
+ }
+ if(matrix.back().size() == 0) matrix.pop_back();
+
+ //get matrix dimensions
+ int rows = matrix.size();
+ /*int*/ cols = matrix.front().size();
+
+ //define iterators
+ std::list<std::vector<BaseFloat> >::iterator it_r;
+ std::vector<BaseFloat>::iterator it_c;
+
+ //check that all lines have same size
+ int i;
+ for(i=0,it_r=matrix.begin(); it_r != matrix.end(); ++i,++it_r) {
+ if(it_r->size() != cols) {
+ KALDI_ERR << "All rows must have same dimension, 1st line cols: " << cols
+ << ", " << i << "th line cols: " << it_r->size();
+ }
+ }
+
+ //copy data to matrix
+ TIMER_START(mTim);
+ rFeatureMatrix.Init(rows,cols);
+ int r,c;
+ for(r=0,it_r=matrix.begin(); it_r!=matrix.end(); ++r,++it_r) {
+ for(c=0,it_c=it_r->begin(); it_c!=it_r->end(); ++c,++it_c) {
+ rFeatureMatrix(r,c) = *it_c;
+ }
+ }
+ TIMER_END(mTim,mTimeSeek);
+
+ //close the pipe
+ if(pclose(fp) == -1) {
+ KALDI_ERR << "Cannot close pipe: " << cmd;
+ }
+
+ return true;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+
+} // namespace TNet
diff --git a/src/KaldiLib/Features.h b/src/KaldiLib/Features.h
new file mode 100644
index 0000000..0980ab6
--- /dev/null
+++ b/src/KaldiLib/Features.h
@@ -0,0 +1,597 @@
+//
+// C++ Interface: %{MODULE}
+//
+// Description:
+//
+//
+// Author: %{AUTHOR} <%{EMAIL}>, (C) %{YEAR}
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef TNet_Features_h
+#define TNet_Features_h
+
+//*****************************************************************************
+//*****************************************************************************
+// Standard includes
+//
+#include <list>
+#include <queue>
+#include <string>
+
+
+//*****************************************************************************
+//*****************************************************************************
+// Specific includes
+//
+#include "Common.h"
+#include "Matrix.h"
+#include "StkStream.h"
+#include "Types.h"
+#include "Timer.h"
+
+
+
+// we need these for reading and writing
+#define UINT_16 unsigned short
+#define UINT_32 unsigned
+#define INT_16 short
+#define INT_32 int
+#define FLOAT_32 float
+#define DOUBLE_64 double
+
+
+#define PARAMKIND_WAVEFORM 0
+#define PARAMKIND_LPC 1
+#define PARAMKIND_LPREFC 2
+#define PARAMKIND_LPCEPSTRA 3
+#define PARAMKIND_LPDELCEP 4
+#define PARAMKIND_IREFC 5
+#define PARAMKIND_MFCC 6
+#define PARAMKIND_FBANK 7
+#define PARAMKIND_MELSPEC 8
+#define PARAMKIND_USER 9
+#define PARAMKIND_DISCRETE 10
+#define PARAMKIND_PLP 11
+#define PARAMKIND_ANON 12
+
+#define PARAMKIND_E 0000100 ///< has energy
+#define PARAMKIND_N 0000200 ///< absolute energy suppressed
+#define PARAMKIND_D 0000400 ///< has delta coefficients
+#define PARAMKIND_A 0001000 ///< has acceleration coefficients
+#define PARAMKIND_C 0002000 ///< is compressed
+#define PARAMKIND_Z 0004000 ///< has zero mean static coef.
+#define PARAMKIND_K 0010000 ///< has CRC checksum
+#define PARAMKIND_0 0020000 ///< has 0'th cepstral coef.
+#define PARAMKIND_V 0040000 ///< has VQ codebook index
+#define PARAMKIND_T 0100000 ///< has triple delta coefficients
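+
+// A sample kind combines a base kind with qualifier bits, e.g. an MFCC_E_D_A_Z
+// file (as parsed by FeatureRepository::ReadParmKind below) corresponds to
+//   PARAMKIND_MFCC | PARAMKIND_E | PARAMKIND_D | PARAMKIND_A | PARAMKIND_Z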
+
+
+//*****************************************************************************
+//*****************************************************************************
+// Code ...
+//
+
+namespace TNet
+{
+
+ /** **************************************************************************
+ ** **************************************************************************
+ */
+ class FileListElem
+ {
+ private:
+ std::string mLogical; ///< Logical file name representation
+ std::string mPhysical; ///< Physical file name representation
+ float mWeight;
+
+ public:
+ FileListElem(const std::string & rFileName);
+ ~FileListElem() {}
+
+ const std::string &
+ Logical() const { return mLogical; }
+
+ const std::string &
+ Physical() const { return mPhysical; }
+
+ const float&
+ Weight() const { return mWeight; }
+ };
+
+ /** *************************************************************************
+ * @brief
+ */
+ class FeatureRepository
+ {
+ public:
+ /**
+ * @brief HTK parameter file header (see HTK manual)
+ */
+ struct HtkHeader
+ {
+ int mNSamples;
+ int mSamplePeriod;
+ short mSampleSize;
+ short mSampleKind;
+
+ HtkHeader()
+ : mNSamples(0),mSamplePeriod(100000),mSampleSize(0),mSampleKind(12 /* PARAMKIND_ANON */)
+ { }
+ };
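+
+ // Note: mSamplePeriod is in HTK 100ns units (100000 = 10ms). On typical
+ // platforms sizeof(HtkHeader) is 12 bytes, matching the on-disk HTK header
+ // that ReadHTKHeader() reads field by field.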
+
+
+ /**
+ * @brief Extension of the HTK header
+ */
+ struct HtkHeaderExt
+ {
+ int mHeaderSize;
+ int mVersion;
+ int mSampSize;
+ };
+
+
+ /**
+ * @brief Normalization file type
+ */
+ enum CNFileType
+ {
+ CNF_Mean,
+ CNF_Variance,
+ CNF_VarScale
+ };
+
+
+ static int
+ ReadParmKind(const char *pStr, bool checkBrackets);
+
+ static int
+ ParmKind2Str(unsigned parmKind, char *pOutstr);
+
+ static void
+ ReadCepsNormFile(
+ const char* pFileName,
+ char** lastFile,
+ BaseFloat** vecBuff,
+ int sampleKind,
+ CNFileType type,
+ int coefs);
+
+ static const char mpParmKindNames[13][16];
+
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ // PUBLIC SECTION
+ //////////////////////////////////////////////////////////////////////////////
+ public:
+ /// Iterates through the list of feature file records
+ typedef std::list<FileListElem>::iterator ListIterator;
+
+ // some params for loading features
+ bool mSwapFeatures;
+ int mStartFrameExt;
+ int mEndFrameExt;
+ int mTargetKind;
+ int mDerivOrder;
+ int* mDerivWinLengths;
+ const char* mpCvgFile;
+ //:TODO: get rid of these
+ const char* mpCmnPath;
+ const char* mpCmnMask;
+ const char* mpCvnPath;
+ const char* mpCvnMask;
+
+ int mTrace;
+
+
+ // Constructors and destructors
+ /**
+ * @brief Default constructor that creates an empty repository
+ */
+ FeatureRepository() : mDerivWinLengths(NULL), mpCvgFile(NULL),
+ mpCmnPath(NULL), mpCmnMask(NULL), mpCvnPath(NULL), mpCvnMask(NULL),
+ mTrace(0),
+ mpLastFileName(NULL), mLastFileName(""), mpLastCmnFile (NULL),
+ mpLastCvnFile (NULL), mpLastCvgFile (NULL), mpCmn(NULL),
+ mpCvn(NULL), mpCvg(NULL), mpA(NULL), mpB(NULL),
+ mTimeOpen(0), mTimeSeek(0), mTimeRead(0), mTimeNormalize(0)
+ {
+ mInputQueueIterator = mInputQueue.end();
+ }
+
+ /**
+ * @brief Copy constructor which copies filled repository
+ */
+ FeatureRepository(const FeatureRepository& ori)
+ : mDerivWinLengths(NULL), mpCvgFile(NULL),
+ mpCmnPath(NULL), mpCmnMask(NULL), mpCvnPath(NULL), mpCvnMask(NULL),
+ mTrace(0),
+ mpLastFileName(NULL), mLastFileName(""), mpLastCmnFile (NULL),
+ mpLastCvnFile (NULL), mpLastCvgFile (NULL), mpCmn(NULL),
+ mpCvn(NULL), mpCvg(NULL), mpA(NULL), mpB(NULL),
+ mTimeOpen(0), mTimeSeek(0), mTimeRead(0), mTimeNormalize(0)
+ {
+ //copy all the data from the input queue
+ mInputQueue = ori.mInputQueue;
+
+ //initialize like the original
+ Init(
+ ori.mSwapFeatures,
+ ori.mStartFrameExt,
+ ori.mEndFrameExt,
+ ori.mTargetKind,
+ ori.mDerivOrder,
+ ori.mDerivWinLengths,
+ ori.mpCmnPath,
+ ori.mpCmnMask,
+ ori.mpCvnPath,
+ ori.mpCvnMask,
+ ori.mpCvgFile);
+
+ //set on the end
+ mInputQueueIterator = mInputQueue.end();
+ //copy default header values
+ mHeader = ori.mHeader;
+ }
+
+
+ /**
+ * @brief Destroys the repository
+ */
+ ~FeatureRepository()
+ {
+ if (NULL != mpA) {
+ free(mpA);
+ }
+
+ if (NULL != mpB) {
+ free(mpB);
+ }
+ //remove all entries
+ mInputQueue.clear();
+
+ if(mTrace&4) {
+ std::cout << "[FeatureRepository -- open:" << mTimeOpen << "s seek:" << mTimeSeek << "s read:" << mTimeRead << "s normalize:" << mTimeNormalize << "s]\n";
+ }
+
+ }
+
+
+ /**
+ * @brief Initializes the object using the given parameters
+ *
+ * @param swap Boolean value specifies whether to swap bytes
+ * when reading file or not.
+ * @param extLeft Features read from file are extended with extLeft
+ * initial frames. Normally, these frames are
+ * repetitions of the first feature frame in file
+ * (with its derivatives, if derivatives are present in
+ * the file). However, if a segment of feature frames
+ * is extracted according to range specification, the
+ * true feature frames from beyond the segment boundary
+ * are used, wherever it is possible. Note that value
+ * of extLeft can be also negative. In such case
+ * corresponding number of initial frames is discarded.
+ * @param extRight The parameter is complementary to extLeft and
+ * controls extension over the last frame; the last
+ * frame from the file is repeated only if necessary.
+ * @param targetKind The parameter is used to check whether
+ * pHeader->mSampleKind matches the required targetKind
+ * and to control suppression of 0'th cepstral or
+ * energy coefficients according to modifiers _E, _0,
+ * and _N. Modifiers _D, _A and _T are ignored;
+ * computation of derivatives is controlled by parameters
+ * derivOrder and derivWinLen. Value PARAMKIND_ANON
+ * ensures that the function does not result in a targetKind
+ * mismatch error and causes no _E or _0 suppression.
+ * @param derivOrder Final features will be augmented with their
+ * derivatives up to 'derivOrder' order. If 'derivOrder'
+ * is a negative value, no new derivatives are appended
+ * and derivatives already present in the feature file
+ * are preserved. Straight features are considered
+ * to be of zero order. If some derivatives are already
+ * present in feature file, these are not computed
+ * again, only higher order derivatives are appended
+ * if required. Note that an HTK feature file cannot
+ * contain higher order derivatives (e.g. double delta)
+ * without containing lower ones (e.g. delta).
+ * Derivatives present in the feature file that are of
+ * higher order than required are discarded.
+ * Derivatives are computed in the final stage from
+ * (extracted segment of) feature frames possibly
+ * extended by repeated frames. Derivatives are
+ * computed using the same formula that is employed
+ * also by HTK tools. Lengths of windows used for
+ * computation of derivatives are passed in parameter
+ * derivWinLen. To compute derivatives for frames close
+ * to boundaries, frames before the first and after the
+ * last frame (of the extracted segment) are considered
+ * to be (yet another) repetitions of the first and the
+ * last frame, respectively. If the segment of frames
+ * is extracted according to range specification and
+ * parameters extLeft and extRight are set to zero, the
+ * first and the last frames of the segment are
+ * considered to be repeated, even though the true feature
+ * frames from beyond the segment boundary can be
+ * available in the file. Therefore, segment extracted
+ * from features that were before augmented with
+ * derivatives will differ
+ * from the same segment augmented with derivatives by
+ * this function. Difference will be of course only on
+ * boundaries and only in derivatives. This "incorrect"
+ * behavior was chosen to fully simulate behavior of
+ * HTK tools. To obtain more correct computation of
+ * derivatives, use parameters extLeft and extRight,
+ * which correctly extend segment with the true frames
+ * (if possible) and in resulting feature matrix ignore
+ * first extLeft and last extRight frames. For this
+ * purpose, both extLeft and extRight should be set to
+ * sum of all values in the array derivWinLen.
+ * @param pDerivWinLen Array of size derivOrder specifying lengths of
+ * windows used for computation of derivatives.
+ * Individual values represent the one-side context
+ * used in the computation; each window length is
+ * therefore twice the value from the array plus one.
+ * The value at index zero specifies the window length for
+ * first order derivatives (delta); higher indices
+ * correspond to higher order derivatives.
+ * @param pCmnPath Cepstral mean normalization path
+ * @param pCmnMask Cepstral mean normalization mask
+ * @param pCvnPath Cepstral variance normalization path
+ * @param pCvnMask Cepstral variance normalization mask
+ * @param pCvgFile Global variance file to be parsed
+ *
+ * The given parameters are necessary for proper feature extraction
+ */
+ void
+ Init(
+ bool swap,
+ int extLeft,
+ int extRight,
+ int targetKind,
+ int derivOrder,
+ int* pDerivWinLen,
+ const char* pCmnPath,
+ const char* pCmnMask,
+ const char* pCvnPath,
+ const char* pCvnMask,
+ const char* pCvgFile);
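+
+ /* Illustrative usage of the public interface (a minimal sketch; the file
+ * names and derivative window lengths below are hypothetical, not part of
+ * this library):
+ *
+ * FeatureRepository features;
+ * int deriv_win_lengths[2] = { 2, 2 }; // one-side contexts for delta, delta-delta
+ * features.Init(true, 0, 0, PARAMKIND_ANON, 2, deriv_win_lengths,
+ * NULL, NULL, NULL, NULL, NULL); // no CMN/CVN/global variance
+ * features.AddFileList("train.scp");
+ * features.Rewind();
+ * Matrix<BaseFloat> feats;
+ * while (!features.EndOfList()) {
+ * features.ReadFullMatrix(feats);
+ * features.MoveNext();
+ * }
+ */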
+
+
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ /**
+ * @brief Returns a reference to the current file header
+ */
+ const HtkHeader&
+ CurrentHeader() const
+ { return mHeader; }
+
+ /**
+ * @brief Returns a reference to the current extended file header
+ */
+ const HtkHeaderExt&
+ CurrentHeaderExt() const
+ { return mHeaderExt; }
+
+ /**
+ * @brief Returns the current file details
+ *
+ * @return Reference to a class @c FileListElem
+ *
+ * Logical and physical file names are stored in @c FileListElem class
+ */
+ const std::list<FileListElem>::iterator&
+ pCurrentRecord() const
+ { return mInputQueueIterator; }
+
+
+ /**
+ * @brief Returns the following file details
+ *
+ * @return Reference to a class @c FileListElem
+ *
+ * Logical and physical file names are stored in @c FileListElem class
+ */
+ const std::list<FileListElem>::iterator&
+ pFollowingRecord() const
+ { return mInputQueueIterator; }
+
+
+ void
+ Rewind()
+ { mInputQueueIterator = mInputQueue.begin(); }
+
+
+ /**
+ * @brief Adds a single feature file to the repository
+ * @param rFileName file to read features from
+ */
+ void
+ AddFile(const std::string & rFileName);
+
+
+ /**
+ * @brief Adds a list of feature files to the repository
+ * @param rFileName feature list file to read from
+ */
+ void
+ AddFileList(const char* pFileName, const char* pFilter = "");
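+
+ /* Note: pFileName may itself be a comma-separated list of .scp files
+ * (the implementation tokenizes it on ','); pFilter is passed on to
+ * IStkStream::open. A hypothetical example:
+ *
+ * repository.AddFileList("train_part1.scp,train_part2.scp");
+ */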
+
+
+ const FileListElem&
+ Current() const
+ { return *mInputQueueIterator; }
+
+
+ /**
+ * @brief Moves to the next record
+ */
+ void
+ MoveNext();
+
+ /**
+ * @brief Reads full feature matrix from a feature file
+ * @param rMatrix matrix to be created and filled with read data
+ * @return true on success, false otherwise
+ */
+ bool
+ ReadFullMatrix(Matrix<BaseFloat>& rMatrix);
+
+ bool
+ WriteFeatureMatrix(const Matrix<BaseFloat>& rMatrix, const std::string& filename, int targetKind, int samplePeriod);
+
+ size_t
+ QueueSize() const {return mInputQueue.size(); }
+
+ /**
+ * @brief Reads feature vectors from a feature file
+ * @param rMatrix matrix to be (only!) filled with read data.
+ * @return number of successfully read feature vectors
+ *
+ * The function tries to fill @c rMatrix with feature vectors coming from
+ * the current stream. If there are fewer vectors left in the stream,
+ * they are used and the true number of successfully read vectors is returned.
+ */
+ int
+ ReadPartialMatrix(Matrix<BaseFloat>& rMatrix);
+
+ /**
+ * @brief Filters the records of this repository based on HTK logical name
+ * masking. If pFilter equals NULL, all source repository entries are
+ * copied to the rOut repository.
+ *
+ * @param pFilter HTK mask that defines the filter
+ * @param pValue Filter value
+ * @param rOut Reference to the new FeatureRepository which will be filled
+ * with the matching records
+ */
+ void
+ HtkFilter(const char* pFilter, const char* pValue, FeatureRepository& rOut);
+
+
+ /**
+ * @brief Filters the records of this repository based on HTK logical name
+ * masking and returns list of unique names. If pFilter equals to NULL,
+ * single name "default" is returned.
+ *
+ * @param pFilter HTK mask that defines the filter
+ * @param rOut Reference to the list of results (std::list< std::string >)
+ */
+ void
+ HtkSelection(const char* pFilter, std::list< std::string >& rOut);
+
+
+ /**
+ * @brief Returns true if there are no feature files left on input
+ */
+ bool
+ EndOfList() const
+ { return mInputQueueIterator == mInputQueue.end(); }
+
+ const std::string&
+ CurrentIndexFileName() const
+ { return mCurrentIndexFileName; }
+
+ friend
+ void
+ AddFileListToFeatureRepositories(
+ const char* pFileName,
+ const char* pFilter,
+ std::queue<FeatureRepository *> &featureRepositoryList);
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PRIVATE SECTION
+////////////////////////////////////////////////////////////////////////////////
+ private:
+ /// List (queue) of input feature files
+ std::list<FileListElem> mInputQueue;
+ std::list<FileListElem>::iterator mInputQueueIterator;
+
+ std::string mCurrentIndexFileName;
+ std::string mCurrentIndexFileDir;
+ std::string mCurrentIndexFileExt;
+
+ /// current stream
+ IStkStream mStream;
+
+ // stores feature file's HTK header
+ HtkHeader mHeader;
+ HtkHeaderExt mHeaderExt;
+
+
+ // this group of variables serves for working with the same physical
+ // file name more than once
+ char* mpLastFileName;
+ std::string mLastFileName;
+ char* mpLastCmnFile;
+ char* mpLastCvnFile;
+ char* mpLastCvgFile;
+ BaseFloat* mpCmn;
+ BaseFloat* mpCvn;
+ BaseFloat* mpCvg;
+ HtkHeader mLastHeader;
+ BaseFloat* mpA;
+ BaseFloat* mpB;
+
+
+
+ Timer mTim;
+ double mTimeOpen;
+ double mTimeSeek;
+ double mTimeRead;
+ double mTimeNormalize;
+
+
+ // Reads HTK feature file header
+ int
+ ReadHTKHeader();
+
+ int
+ ReadHTKFeature(BaseFloat* pIn,
+ size_t feaLen,
+ bool decompress,
+ BaseFloat* pScale,
+ BaseFloat* pBias);
+
+
+ bool
+ ReadHTKFeatures(const std::string& rFileName, Matrix<BaseFloat>& rFeatureMatrix);
+
+ bool
+ ReadHTKFeatures(const FileListElem& rFileNameRecord, Matrix<BaseFloat>& rFeatureMatrix);
+
+
+ int
+ WriteHTKHeader (FILE* fp_out, HtkHeader header, bool swap);
+
+ int
+ WriteHTKFeature (FILE* fp_out, FLOAT *out, size_t fea_len, bool swap, bool compress, FLOAT* pScale, FLOAT* pBias);
+
+ int
+ WriteHTKFeatures(FILE* pOutFp, FLOAT * pOut, int nCoeffs, int nSamples, int samplePeriod, int targetKind, bool swap);
+
+ int
+ WriteHTKFeatures(
+ FILE * pOutFp,
+ int samplePeriod,
+ int targetKind,
+ bool swap,
+ Matrix<BaseFloat>& rFeatureMatrix
+ );
+
+ bool
+ ReadGzipAsciiFeatures(const FileListElem& rFileNameRecord, Matrix<BaseFloat>& rFeatureMatrix);
+
+ }; // class FeatureRepository
+
+} //namespace TNet
+
+#endif // TNet_Features_h
diff --git a/src/KaldiLib/Labels.cc b/src/KaldiLib/Labels.cc
new file mode 100644
index 0000000..c76b72c
--- /dev/null
+++ b/src/KaldiLib/Labels.cc
@@ -0,0 +1,215 @@
+#include "Labels.h"
+#include "Timer.h"
+
+
+namespace TNet {
+
+
+ ////////////////////////////////////////////////////////////////////////
+ // Class LabelRepository::
+ void
+ LabelRepository::
+ Init(const char* pLabelMlfFile, const char* pOutputLabelMapFile, const char* pLabelDir, const char* pLabelExt)
+ {
+ assert(NULL != pLabelMlfFile);
+ assert(NULL != pOutputLabelMapFile);
+
+ // initialize the label streams
+ delete mpLabelStream; //if NULL, does nothing
+ delete _mpLabelStream;
+ _mpLabelStream = new std::ifstream(pLabelMlfFile);
+ mpLabelStream = new IMlfStream(*_mpLabelStream);
+
+ // Label stream is initialized, just test it
+ if(!mpLabelStream->good())
+ Error(std::string("Cannot open Label MLF file: ")+pLabelMlfFile);
+
+ // Index the labels (good for randomized file lists)
+ Timer tim; tim.Start();
+ mpLabelStream->Index();
+ tim.End(); mIndexTime += tim.Val();
+
+ // Read the state-label to state-id map
+ ReadOutputLabelMap(pOutputLabelMapFile);
+
+ // Store the label dir/ext
+ mpLabelDir = pLabelDir;
+ mpLabelExt = pLabelExt;
+ }
+
+
+
+ void
+ LabelRepository::
+ GenDesiredMatrix(BfMatrix& rDesired, size_t nFrames, size_t sourceRate, const char* pFeatureLogical)
+ {
+ //timer
+ Timer tim; tim.Start();
+
+ //Get the MLF stream reference...
+ IMlfStream& mLabelStream = *mpLabelStream;
+ //Build the file name of the label
+ MakeHtkFileName(mpLabelFile, pFeatureLogical, mpLabelDir, mpLabelExt);
+
+ //Find block in MLF file
+ mLabelStream.Open(mpLabelFile);
+ if(!mLabelStream.good()) {
+ Error(std::string("Cannot open label MLF record: ") + mpLabelFile);
+ }
+
+
+ //resize the matrix
+ if(nFrames < 1) {
+ KALDI_ERR << "Number of frames:" << nFrames << " is lower than 1!!!\n"
+ << pFeatureLogical;
+ }
+ rDesired.Init(nFrames, mLabelMap.size(), true); //true: Zero()
+
+ //aux variables
+ std::string line, state;
+ unsigned long long beg, end;
+ size_t state_index;
+ size_t trunc_frames = 0;
+ TagToIdMap::iterator it;
+
+ //parse the label file
+ while(!mLabelStream.eof()) {
+ std::getline(mLabelStream, line);
+ if(line == "") continue; //skip newlines/comments from MLF
+ if(line[0] == '#') continue;
+
+ std::istringstream& iss = mGenDesiredMatrixStream;
+ iss.clear();
+ iss.str(line);
+
+ //parse the line
+ //begin
+ iss >> std::ws >> beg;
+ if(iss.fail()) {
+ KALDI_ERR << "Cannot parse column 1 (begin)\n"
+ << "line: " << line << "\n"
+ << "file: " << mpLabelFile << "\n";
+ }
+ //end
+ iss >> std::ws >> end;
+ if(iss.fail()) {
+ KALDI_ERR << "Cannot parse column 2 (end)\n"
+ << "line: " << line << "\n"
+ << "file: " << mpLabelFile << "\n";
+ }
+ //state tag
+ iss >> std::ws >> state;
+ if(iss.fail()) {
+ KALDI_ERR << "Cannot parse column 3 (state_tag)\n"
+ << "line: " << line << "\n"
+ << "file: " << mpLabelFile << "\n";
+ }
+
+ //divide beg/end by sourceRate and round to the nearest frame to get the frame interval
+ beg = (beg+sourceRate/2)/sourceRate;
+ end = (end+sourceRate/2)/sourceRate;
+ //beg = (int)round(beg / (double)sourceRate);
+ //end = (int)round(end / (double)sourceRate);
+
+ //find the state id
+ it = mLabelMap.find(state);
+ if(mLabelMap.end() == it) {
+ Error(std::string("Unknown state tag: '") + state + "' file:'" + mpLabelFile);
+ }
+ state_index = it->second;
+
+ // Fill the desired matrix
+ for(unsigned long long frame=beg; frame<end; frame++) {
+ //don't write past the end of the matrix (the transcript may be longer than the feature file)
+ if(frame >= (int)rDesired.Rows()) { trunc_frames++; continue; }
+
+ //check that this frame has not been assigned yet:
+ if(0.0 != rDesired[frame].Sum()) {
+ //ERROR!!!
+ //find out what was previously filled!!!
+ BaseFloat max = rDesired[frame].Max();
+ int idx = -1;
+ for(int i=0; i<(int)rDesired[frame].Dim(); i++) {
+ if(rDesired[frame][i] == max) idx = i;
+ }
+ for(it=mLabelMap.begin(); it!=mLabelMap.end(); ++it) {
+ if((int)it->second == idx) break;
+ }
+ std::string state_prev = "error";
+ if(it != mLabelMap.end()) {
+ state_prev = it->first;
+ }
+ //print the error message
+ std::ostringstream os;
+ os << "Frame already assigned to other state, "
+ << " file: " << mpLabelFile
+ << " frame: " << frame
+ << " nframes: " << nFrames
+ << " sum: " << rDesired[frame].Sum()
+ << " previously assigned to: " << state_prev << "(" << idx << ")"
+ << " now should be assigned to: " << state << "(" << state_index << ")"
+ << "\n";
+ Error(os.str());
+ }
+
+ //fill the row
+ rDesired[(size_t)frame][state_index] = 1.0f;
+ }
+ }
+
+ mLabelStream.Close();
+
+ //check the desired matrix (rows sum up to 1.0)
+ for(size_t i=0; i<rDesired.Rows(); ++i) {
+ float desired_row_sum = rDesired[i].Sum();
+ if(desired_row_sum != 1.0) {
+ std::ostringstream os;
+ os << "Desired vector sum isn't 1.0, "
+ << " file: " << mpLabelFile
+ << " row: " << i
+ << " nframes: " << nFrames
+ << " content: " << rDesired[i]
+ << " sum: " << desired_row_sum << "\n";
+ Error(os.str());
+ }
+ }
+
+ //warning when truncating many frames
+ if(trunc_frames > 10) {
+ std::ostringstream os;
+ os << "Truncated frames: " << trunc_frames
+ << " Check sourcerate in features and validity of labels\n";
+ Warning(os.str());
+ }
+
+ //timer
+ tim.End(); mGenDesiredMatrixTime += tim.Val();
+ }
+
+
+
+ void
+ LabelRepository::
+ ReadOutputLabelMap(const char* file)
+ {
+ assert(mLabelMap.size() == 0);
+ int i = 0;
+ std::string state_tag;
+ std::ifstream in(file);
+ if(!in.good())
+ Error(std::string("Cannot open OutputLabelMapFile: ")+file);
+
+ in >> std::ws;
+ while(!in.eof()) {
+ in >> state_tag;
+ in >> std::ws;
+ assert(mLabelMap.find(state_tag) == mLabelMap.end());
+ mLabelMap[state_tag] = i++;
+ }
+
+ in.close();
+ assert(mLabelMap.size() > 0);
+ }
+
+
+}//namespace
diff --git a/src/KaldiLib/Labels.h b/src/KaldiLib/Labels.h
new file mode 100644
index 0000000..6b78d1a
--- /dev/null
+++ b/src/KaldiLib/Labels.h
@@ -0,0 +1,75 @@
+#ifndef _LABELS_H_
+#define _LABELS_H_
+
+
+#include "Matrix.h"
+#include "MlfStream.h"
+#include "Features.h"
+
+#include <map>
+#include <iostream>
+
+namespace TNet {
+
+
+ class FeaCatPool;
+
+ /**
+ * Desired matrix generation object;
+ * supports background reading and caching, but can also be
+ * used in the foreground via GenDesiredMatrix()
+ */
+ class LabelRepository
+ {
+ typedef std::map<std::string,size_t> TagToIdMap;
+
+ public:
+ LabelRepository()
+ : _mpLabelStream(NULL), mpLabelStream(NULL), mpLabelDir(NULL), mpLabelExt(NULL), mGenDesiredMatrixTime(0), mIndexTime(0), mTrace(0)
+ { }
+
+ ~LabelRepository()
+ {
+ if(mTrace&4) {
+ std::cout << "[LabelRepository -- indexing:" << mIndexTime << "s"
+ " genDesiredMatrix:" << mGenDesiredMatrixTime << "s]" << std::endl;
+ }
+ delete mpLabelStream;
+ delete _mpLabelStream;
+ }
+
+ /// Initialize the LabelRepository
+ void Init(const char* pLabelMlfFile, const char* pOutputLabelMapFile, const char* pLabelDir, const char* pLabelExt);
+
+ /// Set trace level
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ /// Get desired matrix from labels
+ void GenDesiredMatrix(BfMatrix& rDesired, size_t nFrames, size_t sourceRate, const char* pFeatureLogical);
+
+ private:
+ /// Prepare the state-label to state-id map
+ void ReadOutputLabelMap(const char* file);
+
+ private:
+ // Streams and state-map
+ std::ifstream* _mpLabelStream; ///< Helper stream for Label stream
+ IMlfStream* mpLabelStream; ///< Label stream
+ std::istringstream mGenDesiredMatrixStream; ///< Label file parsing stream
+
+ const char* mpLabelDir; ///< Label dir in MLF
+ const char* mpLabelExt; ///< Label ext in MLF
+ char mpLabelFile[4096]; ///< Buffer for filenames in MLF
+
+ TagToIdMap mLabelMap; ///< Map of state tags to net output indices
+
+ double mGenDesiredMatrixTime;
+ float mIndexTime;
+
+ int mTrace;
+ };
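+
+ // Minimal usage sketch (the file names, dimensions and NULL dir/ext below are
+ // assumptions made for illustration only):
+ //
+ //   LabelRepository labels;
+ //   labels.Init("train.mlf", "states.map", NULL, NULL);
+ //   BfMatrix desired(n_frames, n_outputs);  // one row per feature frame
+ //   labels.GenDesiredMatrix(desired, n_frames, 100000, "utt0001.fea");
+ //   // each row of 'desired' now holds a 1-of-N target vector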
+
+}//namespace
+
+#endif
diff --git a/src/KaldiLib/Makefile b/src/KaldiLib/Makefile
new file mode 100644
index 0000000..0c238f4
--- /dev/null
+++ b/src/KaldiLib/Makefile
@@ -0,0 +1,28 @@
+
+include ../tnet.mk
+
+INCLUDE = -I.
+
+all: libKaldiLib.a
+
+libKaldiLib.a: $(OBJ)
+ $(AR) ruv $@ $(OBJ)
+ $(RANLIB) $@
+
+%.o : %.cc
+ $(CXX) -o $@ -c $< $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+
+.PHONY: clean doc depend
+clean:
+ rm -f *.o *.a
+
+doc:
+ doxygen ../../doc/doxyfile_TNetLib
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) > .depend.mk
+
+-include .depend.mk
+
diff --git a/src/KaldiLib/MathAux.h b/src/KaldiLib/MathAux.h
new file mode 100644
index 0000000..c08e836
--- /dev/null
+++ b/src/KaldiLib/MathAux.h
@@ -0,0 +1,117 @@
+#ifndef TNet_MathAux_h
+#define TNet_MathAux_h
+
+#include <cmath>
+
+
+#if !defined(SQR)
+# define SQR(x) ((x) * (x))
+#endif
+
+
+#if !defined(LOG_0)
+# define LOG_0 (-1.0e10)
+#endif
+
+#if !defined(LOG_MIN)
+# define LOG_MIN (0.5 * LOG_0)
+#endif
+
+
+#ifndef DBL_EPSILON
+#define DBL_EPSILON 2.2204460492503131e-16
+#endif
+
+
+#ifndef M_PI
+# define M_PI 3.1415926535897932384626433832795
+#endif
+
+#define M_LOG_2PI 1.8378770664093454835606594728112
+
+
+#if DOUBLEPRECISION
+# define FLOAT double
+# define EPSILON DBL_EPSILON
+# define FLOAT_FMT "%lg"
+# define swapFLOAT swap8
+# define _ABS fabs
+# define _COS cos
+# define _EXP exp
+# define _LOG log
+# define _SQRT sqrt
+#else
+# define FLOAT float
+# define EPSILON FLT_EPSILON
+# define FLOAT_FMT "%g"
+# define swapFLOAT swap4
+# define _ABS fabsf
+# define _COS cosf
+# define _EXP expf
+# define _LOG logf
+# define _SQRT sqrtf
+#endif
+
+namespace TNet
+{
+ inline float frand(){ // random between 0 and 1.
+ return (float(rand()) + 1.0f) / (float(RAND_MAX)+2.0f);
+ }
+ inline float gauss_rand(){
+ return _SQRT( -2.0f * _LOG(frand()) ) * _COS(2.0f*float(M_PI)*frand());
+ }
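+ // gauss_rand() above is one draw of the Box-Muller transform: for u1, u2
+ // uniform in (0,1), sqrt(-2*log(u1)) * cos(2*pi*u2) is standard normal.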
+
+ static const double gMinLogDiff = log(DBL_EPSILON);
+
+ //***************************************************************************
+ //***************************************************************************
+ inline double
+ LogAdd(double x, double y)
+ {
+ double diff;
+
+ if (x < y) {
+ diff = x - y;
+ x = y;
+ } else {
+ diff = y - x;
+ }
+
+ double res;
+ if (x >= LOG_MIN) {
+ if (diff >= gMinLogDiff) {
+ res = x + log(1.0 + exp(diff));
+ } else {
+ res = x;
+ }
+ } else {
+ res = LOG_0;
+ }
+ return res;
+ }
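+
+ // Worked example (illustrative numbers): LogAdd(log(0.2), log(0.3))
+ // = log(0.3) + log(1 + exp(log(0.2) - log(0.3))) = log(0.5),
+ // i.e. probabilities are summed without leaving the log domain.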
+
+
+ //***************************************************************************
+ //***************************************************************************
+ inline double
+ LogSub(double x, double y) // returns log(exp(x) - exp(y)). Throws exception if y > x; returns LOG_0 if y == x.
+ {
+
+ if(y >= x){
+ if(y==x) return LOG_0;
+ else throw std::runtime_error("LogSub: cannot subtract a larger from a smaller number.");
+ }
+
+ double diff = y - x; // Will be negative.
+
+ double res = x + log(1.0 - exp(diff));
+
+ if(res != res) // test for res==NaN.. could happen if diff ~0.0, so 1.0-exp(diff) == 0.0 to machine precision.
+ res = LOG_0;
+ return res;
+ }
+
+} // namespace TNet
+
+
+#endif
diff --git a/src/KaldiLib/Matrix.cc b/src/KaldiLib/Matrix.cc
new file mode 100644
index 0000000..f9d5909
--- /dev/null
+++ b/src/KaldiLib/Matrix.cc
@@ -0,0 +1,295 @@
+/**
+ * @file Matrix.cc
+ *
+ * Implementation of specialized Matrix template methods
+ */
+
+
+#include "Matrix.h"
+
+#if defined(HAVE_CLAPACK)
+#include "CLAPACK-3.1.1.1/INCLUDE/f2c.h"
+extern "C" {
+#include "CLAPACK-3.1.1.1/INCLUDE/clapack.h"
+}
+// These are some stupid clapack things that we want to get rid of
+#ifdef min
+#undef min
+#endif
+
+#ifdef max
+#undef max
+#endif
+
+#endif
+
+
+
+
+namespace TNet
+{
+ //***************************************************************************
+ //***************************************************************************
+#ifdef HAVE_ATLAS
+ //***************************************************************************
+ //***************************************************************************
+ template<>
+ Matrix<float> &
+ Matrix<float>::
+ Invert(float *LogDet, float *DetSign, bool inverse_needed)
+ {
+ assert(Rows() == Cols());
+
+#if defined(HAVE_CLAPACK)
+ integer* pivot = new integer[mMRows];
+ integer M = Rows();
+ integer N = Cols();
+ integer LDA = mStride;
+ integer result;
+ integer l_work = std::max<integer>(1, N);
+ float* p_work = new float[l_work];
+
+ sgetrf_(&M, &N, mpData, &LDA, pivot, &result);
+ const int pivot_offset=1;
+#else
+ int* pivot = new int[mMRows];
+ int result = clapack_sgetrf(CblasColMajor, Rows(), Cols(), mpData, mStride, pivot);
+ const int pivot_offset=0;
+#endif
+ assert(result >= 0 && "Call to CLAPACK sgetrf_ or ATLAS clapack_sgetrf called with wrong arguments");
+ if(result != 0) {
+ Error("Matrix is singular");
+ }
+ if(DetSign!=NULL){ *DetSign=1.0; for(size_t i=0;i<mMRows;i++) if(pivot[i]!=(int)i+pivot_offset) *DetSign *= -1.0; }
+ if(LogDet!=NULL||DetSign!=NULL){ // Compute log determinant...
+ assert(mMRows==mMCols); // Can't take determinant of non-square matrix.
+ *LogDet = 0.0; float prod = 1.0;
+ for(size_t i=0;i<mMRows;i++){
+ prod *= (*this)(i,i);
+ if(i==mMRows-1 || fabs(prod)<1.0e-10 || fabs(prod)>1.0e+10){
+ if(LogDet!=NULL) *LogDet += log(fabs(prod));
+ if(DetSign!=NULL) *DetSign *= (prod>0?1.0:-1.0);
+ prod=1.0;
+ }
+ }
+ }
+#if defined(HAVE_CLAPACK)
+ if(inverse_needed) sgetri_(&M, mpData, &LDA, pivot, p_work, &l_work, &result);
+ delete [] p_work;
+ delete [] pivot;
+#else
+ if(inverse_needed) result = clapack_sgetri(CblasColMajor, Rows(), mpData, mStride, pivot);
+ delete [] pivot;
+#endif
+ assert(result == 0 && "Call to CLAPACK sgetri_ or ATLAS clapack_sgetri called with wrong arguments");
+ return *this;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ template<>
+ Matrix<double> &
+ Matrix<double>::
+ Invert(double *LogDet, double *DetSign, bool inverse_needed)
+ {
+ assert(Rows() == Cols());
+
+#if defined(HAVE_CLAPACK)
+ integer* pivot = new integer[mMRows];
+ integer M = Rows();
+ integer N = Cols();
+ integer LDA = mStride;
+ integer result;
+ integer l_work = std::max<integer>(1, N);
+ double* p_work = new double[l_work];
+
+ dgetrf_(&M, &N, mpData, &LDA, pivot, &result);
+ const int pivot_offset=1;
+#else
+ int* pivot = new int[mMRows];
+ int result = clapack_dgetrf(CblasColMajor, Rows(), Cols(), mpData, mStride, pivot);
+ const int pivot_offset=0;
+#endif
+ assert(result >= 0 && "Call to CLAPACK dgetrf_ or ATLAS clapack_dgetrf called with wrong arguments");
+ if(result != 0) {
+ Error("Matrix is singular");
+ }
+ if(DetSign!=NULL){ *DetSign=1.0; for(size_t i=0;i<mMRows;i++) if(pivot[i]!=(int)i+pivot_offset) *DetSign *= -1.0; }
+ if(LogDet!=NULL||DetSign!=NULL){ // Compute log determinant...
+ assert(mMRows==mMCols); // Can't take determinant of non-square matrix.
+ *LogDet = 0.0; double prod = 1.0;
+ for(size_t i=0;i<mMRows;i++){
+ prod *= (*this)(i,i);
+ if(i==mMRows-1 || fabs(prod)<1.0e-10 || fabs(prod)>1.0e+10){
+ if(LogDet!=NULL) *LogDet += log(fabs(prod));
+ if(DetSign!=NULL) *DetSign *= (prod>0?1.0:-1.0);
+ prod=1.0;
+ }
+ }
+ }
+#if defined(HAVE_CLAPACK)
+ if(inverse_needed) dgetri_(&M, mpData, &LDA, pivot, p_work, &l_work, &result);
+ delete [] p_work;
+ delete [] pivot;
+#else
+ if(inverse_needed) result = clapack_dgetri(CblasColMajor, Rows(), mpData, mStride, pivot);
+ delete [] pivot;
+#endif
+ assert(result == 0 && "Call to CLAPACK dgetri_ or ATLAS clapack_dgetri called with wrong arguments");
+ return *this;
+ }
+
+ template<>
+ Matrix<float> &
+ Matrix<float>::
+ BlasGer(const float alpha, const Vector<float>& rA, const Vector<float>& rB)
+ {
+ assert(rA.Dim() == mMRows && rB.Dim() == mMCols);
+ cblas_sger(CblasRowMajor, rA.Dim(), rB.Dim(), alpha, rA.pData(), 1, rB.pData(), 1, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<double> &
+ Matrix<double>::
+ BlasGer(const double alpha, const Vector<double>& rA, const Vector<double>& rB)
+ {
+ assert(rA.Dim() == mMRows && rB.Dim() == mMCols);
+ cblas_dger(CblasRowMajor, rA.Dim(), rB.Dim(), alpha, rA.pData(), 1, rB.pData(), 1, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ BlasGemm(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA,
+ const Matrix<float>& rB, MatrixTrasposeType transB,
+ const float beta)
+ {
+ assert((transA == NO_TRANS && transB == NO_TRANS && rA.Cols() == rB.Rows() && rA.Rows() == Rows() && rB.Cols() == Cols())
+ || (transA == TRANS && transB == NO_TRANS && rA.Rows() == rB.Rows() && rA.Cols() == Rows() && rB.Cols() == Cols())
+ || (transA == NO_TRANS && transB == TRANS && rA.Cols() == rB.Cols() && rA.Rows() == Rows() && rB.Rows() == Cols())
+ || (transA == TRANS && transB == TRANS && rA.Rows() == rB.Cols() && rA.Cols() == Rows() && rB.Rows() == Cols()));
+
+ cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), static_cast<CBLAS_TRANSPOSE>(transB),
+ Rows(), Cols(), transA == NO_TRANS ? rA.Cols() : rA.Rows(),
+ alpha, rA.mpData, rA.mStride, rB.mpData, rB.mStride,
+ beta, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ BlasGemm(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA,
+ const Matrix<double>& rB, MatrixTrasposeType transB,
+ const double beta)
+ {
+ assert((transA == NO_TRANS && transB == NO_TRANS && rA.Cols() == rB.Rows() && rA.Rows() == Rows() && rB.Cols() == Cols())
+ || (transA == TRANS && transB == NO_TRANS && rA.Rows() == rB.Rows() && rA.Cols() == Rows() && rB.Cols() == Cols())
+ || (transA == NO_TRANS && transB == TRANS && rA.Cols() == rB.Cols() && rA.Rows() == Rows() && rB.Rows() == Cols())
+ || (transA == TRANS && transB == TRANS && rA.Rows() == rB.Cols() && rA.Cols() == Rows() && rB.Rows() == Cols()));
+
+ cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), static_cast<CBLAS_TRANSPOSE>(transB),
+ Rows(), Cols(), transA == NO_TRANS ? rA.Cols() : rA.Rows(),
+ alpha, rA.mpData, rA.mStride, rB.mpData, rB.mStride,
+ beta, mpData, mStride);
+ return *this;
+ }
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ Axpy(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA){
+ int aStride = (int)rA.mStride, stride = mStride;
+ float *adata=rA.mpData, *data=mpData;
+ if(transA == NO_TRANS){
+ assert(rA.Rows()==Rows() && rA.Cols()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata+=aStride,data+=stride)
+ cblas_saxpy(mMCols, alpha, adata, 1, data, 1);
+ } else {
+ assert(rA.Cols()==Rows() && rA.Rows()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata++,data+=stride)
+ cblas_saxpy(mMCols, alpha, adata, aStride, data, 1);
+ }
+ return *this;
+ }
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ Axpy(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA){
+ int aStride = (int)rA.mStride, stride = mStride;
+ double *adata=rA.mpData, *data=mpData;
+ if(transA == NO_TRANS){
+ assert(rA.Rows()==Rows() && rA.Cols()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata+=aStride,data+=stride)
+ cblas_daxpy(mMCols, alpha, adata, 1, data, 1);
+ } else {
+ assert(rA.Cols()==Rows() && rA.Rows()==Cols());
+ for(size_t row=0;row<mMRows;row++,adata++,data+=stride)
+ cblas_daxpy(mMCols, alpha, adata, aStride, data, 1);
+ }
+ return *this;
+ }
+
+ template <> //non-member but friend!
+ double TraceOfProduct(const Matrix<double> &A, const Matrix<double> &B){ // tr(A B), equivalent to sum of each element of A times same element in B'
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Cols() && A.Cols()==B.Rows());
+ double ans = 0.0;
+ double *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata++)
+ ans += cblas_ddot(acols, adata, 1, bdata, bStride);
+ return ans;
+ }
+
+ template <> //non-member but friend!
+ double TraceOfProductT(const Matrix<double> &A, const Matrix<double> &B){ // tr(A B), equivalent to sum of each element of A times same element in B'
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Rows() && A.Cols()==B.Cols());
+ double ans = 0.0;
+ double *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata+=bStride)
+ ans += cblas_ddot(acols, adata, 1, bdata, 1);
+ return ans;
+ }
+
+
+ template <> //non-member but friend!
+ float TraceOfProduct(const Matrix<float> &A, const Matrix<float> &B){ // tr(A B), equivalent to sum of each element of A times same element in B'
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Cols() && A.Cols()==B.Rows());
+ float ans = 0.0;
+ float *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata++)
+ ans += cblas_sdot(acols, adata, 1, bdata, bStride);
+ return ans;
+ }
+
+ template <> //non-member but friend!
+ float TraceOfProductT(const Matrix<float> &A, const Matrix<float> &B){ // tr(A B), equivalent to sum of each element of A times same element in B'
+ size_t aStride = A.mStride, bStride = B.mStride;
+ assert(A.Rows()==B.Rows() && A.Cols()==B.Cols());
+ float ans = 0.0;
+ float *adata=A.mpData, *bdata=B.mpData;
+ size_t arows=A.Rows(), acols=A.Cols();
+ for(size_t row=0;row<arows;row++,adata+=aStride,bdata+=bStride)
+ ans += cblas_sdot(acols, adata, 1, bdata, 1);
+ return ans;
+ }
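+
+ // Worked example (illustrative 2x2 values): for A = [[1,2],[3,4]] and
+ // B = [[5,6],[7,8]],
+ //   TraceOfProduct(A,B)  = tr(A B)   = 19 + 50 = 69
+ //   TraceOfProductT(A,B) = tr(A B^T) = 17 + 53 = 70,
+ // the latter being the sum of the element-wise product of A and B.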
+
+
+
+
+#endif //HAVE_ATLAS
+
+
+
+} //namespace TNet
diff --git a/src/KaldiLib/Matrix.h b/src/KaldiLib/Matrix.h
new file mode 100644
index 0000000..d33cb0c
--- /dev/null
+++ b/src/KaldiLib/Matrix.h
@@ -0,0 +1,677 @@
+#ifndef TNet_Matrix_h
+#define TNet_Matrix_h
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+ #include <clapack.h>
+}
+#endif
+
+#include "Common.h"
+#include "MathAux.h"
+#include "Types.h"
+#include "Error.h"
+
+//#define TRACE_MATRIX_OPERATIONS
+#define CHECKSIZE
+
+namespace TNet
+{
+
+
+ // class matrix_error : public std::logic_error {};
+ // class matrix_sizes_error : public matrix_error {};
+
+ // declare the class so the header knows about it
+ template<typename _ElemT> class Vector;
+ template<typename _ElemT> class SubVector;
+ template<typename _ElemT> class Matrix;
+ template<typename _ElemT> class SubMatrix;
+
+ // we need to declare the friend << operator here
+ template<typename _ElemT>
+ std::ostream & operator << (std::ostream & rOut, const Matrix<_ElemT> & rM);
+
+ // we need to declare the friend << operator here
+ template<typename _ElemT>
+ std::istream & operator >> (std::istream & rIn, Matrix<_ElemT> & rM);
+
+ // we need to declare this friend function here
+ template<typename _ElemT>
+ _ElemT TraceOfProduct(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B)
+
+ // we need to declare this friend function here
+ template<typename _ElemT>
+ _ElemT TraceOfProductT(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B^T)==tr(A^T B)
+
+
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Provides a matrix class
+ *
+ * This class provides a way to work with matrices in TNet.
+ * It encapsulates basic operations and memory optimizations.
+ *
+ */
+ template<typename _ElemT>
+ class Matrix
+ {
+ public:
+ /// HTK parameter-file header
+
+ struct HtkHeader
+ {
+ INT_32 mNSamples;
+ INT_32 mSamplePeriod;
+ INT_16 mSampleSize;
+ UINT_16 mSampleKind;
+ };
+
+
+ /**
+ * @brief Extension of the HTK header
+ */
+ struct HtkHeaderExt
+ {
+ INT_32 mHeaderSize;
+ INT_32 mVersion;
+ INT_32 mSampSize;
+ };
+
+
+
+
+ /// defines a type of this
+ typedef Matrix<_ElemT> ThisType;
+
+ // Constructors
+
+ /// Empty constructor
+ Matrix<_ElemT> ():
+ mpData(NULL), mMCols(0), mMRows(0), mStride(0)
+#ifdef STK_MEMALIGN_MANUAL
+ , mpFreeData(NULL)
+#endif
+ {}
+
+ /// Copy constructor
+ Matrix<_ElemT> (const Matrix<_ElemT> & rM, MatrixTrasposeType trans=NO_TRANS):
+ mpData(NULL)
+ { if(trans==NO_TRANS){ Init(rM.mMRows, rM.mMCols); Copy(rM); } else { Init(rM.mMCols,rM.mMRows); Copy(rM,TRANS); } }
+
+ /// Copy constructor from another type.
+ template<typename _ElemU>
+ explicit Matrix<_ElemT> (const Matrix<_ElemU> & rM, MatrixTrasposeType trans=NO_TRANS):
+ mpData(NULL)
+ { if(trans==NO_TRANS){ Init(rM.Rows(), rM.Cols()); Copy(rM); } else { Init(rM.Cols(),rM.Rows()); Copy(rM,TRANS); } }
+
+ /// Basic constructor
+ Matrix(const size_t r, const size_t c, bool clear=true)
+ { mpData=NULL; Init(r, c, clear); }
+
+
+ Matrix<_ElemT> &operator = (const Matrix <_ElemT> &other) { Init(other.Rows(), other.Cols()); Copy(other); return *this; } // Needed for inclusion in std::vector
+
+ /// Destructor
+ ~Matrix()
+ { Destroy(); }
+
+
+ /// Initializes matrix (if not done by constructor)
+ ThisType &
+ Init(const size_t r,
+ const size_t c, bool clear=true);
+
+ /**
+ * @brief Deallocates the matrix from memory and resets the dimensions to (0, 0)
+ */
+ void
+ Destroy();
+
+
+ ThisType &
+ Zero();
+
+ ThisType &
+ Unit(); // set to unit.
+
+ /**
+ * @brief Copies the contents of a matrix
+ * @param rM Source data matrix
+ * @return Returns reference to this
+ */
+ template<typename _ElemU> ThisType &
+ Copy(const Matrix<_ElemU> & rM, MatrixTrasposeType Trans=NO_TRANS);
+
+
+
+ /**
+ * @brief Copies the elements of a vector row-by-row into a matrix
+ * @param rV Source vector
+ * @param nRows Number of rows of returned matrix
+ * @param nCols Number of columns of returned matrix
+ *
+ * Note that rV.Dim() must equal nRows*nCols
+ */
+ ThisType &
+ CopyVectorSplicedRows(const Vector<_ElemT> &rV, const size_t nRows, const size_t nCols);
+
+ /**
+ * @brief Returns @c true if matrix is initialized
+ */
+ bool
+ IsInitialized() const
+ { return mpData != NULL; }
+
+ /// Returns number of rows in the matrix
+ inline size_t
+ Rows() const
+ {
+ return mMRows;
+ }
+
+ /// Returns number of columns in the matrix
+ inline size_t
+ Cols() const
+ {
+ return mMCols;
+ }
+
+ /// Returns number of columns in the matrix memory
+ inline size_t
+ Stride() const
+ {
+ return mStride;
+ }
+
+
+ /**
+ * @brief Gives direct access to the matrix data block
+ * @return Pointer to the const data array
+ */
+ inline const _ElemT* __attribute__((aligned(16)))
+ pData () const
+ {
+ return mpData;
+ }
+
+
+ /**
+ * @brief Gives direct access to the matrix data block
+ * @return Pointer to the non-const data array
+ */
+ inline _ElemT* __attribute__((aligned(16)))
+ pData ()
+ {
+ return mpData;
+ }
+
+
+ /**
+ * @brief Workaround that allows SubMatrix to obtain a pointer to non-const data
+ * @return Pointer to the data block, even though the Matrix itself is const
+ */
+ protected:
+ inline _ElemT* __attribute__((aligned(16)))
+ pData_workaround () const
+ {
+ return mpData;
+ }
+ public:
+
+
+ /// Returns size of matrix in memory
+ size_t
+ MSize() const
+ {
+ return mMRows * mStride * sizeof(_ElemT);
+ }
+
+ /// Checks the content of the matrix for nan and inf values
+ void
+ CheckData(const std::string file = "") const
+ {
+ for(size_t row=0; row<Rows(); row++) {
+ for(size_t col=0; col<Cols(); col++) {
+ if(isnan((*this)(row,col)) || isinf((*this)(row,col))) {
+ std::ostringstream os;
+ os << "Invalid value: " << (*this)(row,col)
+ << " in matrix row: " << row
+ << " col: " << col
+ << " file: " << file;
+ Error(os.str());
+ }
+ }
+ }
+ }
+
+ /**
+ * **********************************************************************
+ * **********************************************************************
+ * @defgroup RESHAPE Matrix reshaping routines
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ */
+
+ /**
+ * @brief Removes one row from the matrix. The memory is not reallocated.
+ */
+ ThisType &
+ RemoveRow(size_t i);
+
+ /** @} */
+
+ /**
+ * **********************************************************************
+ * **********************************************************************
+ * @defgroup ACCESS Access functions and operators
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ */
+
+ /**
+ * @brief Gives access to a specified matrix row without range check
+ * @return Subvector object representing the row
+ */
+ inline const SubVector<_ElemT>
+ operator [] (size_t i) const
+ {
+ assert(i < mMRows);
+ return SubVector<_ElemT>(mpData + (i * mStride), Cols());
+ }
+
+ inline SubVector<_ElemT>
+ operator [] (size_t i)
+ {
+ assert(i < mMRows);
+ return SubVector<_ElemT>(mpData + (i * mStride), Cols());
+ }
+
+ /**
+ * @brief Gives access to a specified matrix row without range check
+ * @return pointer to the first field of the row
+ */
+ inline _ElemT*
+ pRowData(size_t i)
+ {
+ assert(i < mMRows);
+ return mpData + i * mStride;
+ }
+
+ /**
+ * @brief Gives access to a specified matrix row without range check
+ * @return pointer to the first field of the row (const version)
+ */
+ inline const _ElemT*
+ pRowData(size_t i) const
+ {
+ assert(i < mMRows);
+ return mpData + i * mStride;
+ }
+
+ /**
+ * @brief Gives access to matrix elements (row, col)
+ * @return reference to the desired field
+ */
+ inline _ElemT&
+ operator () (size_t r, size_t c)
+ {
+#ifdef PARANOID
+ assert(r < mMRows && c < mMCols);
+#endif
+ return *(mpData + r * mStride + c);
+ }
+
+ /**
+ * @brief Gives access to matrix elements (row, col)
+ * @return value of the desired field (const version)
+ */
+ inline const _ElemT
+ operator () (size_t r, size_t c) const
+ {
+#ifdef PARANOID
+ assert(r < mMRows && c < mMCols);
+#endif
+ return *(mpData + r * mStride + c);
+ }
+
+ /**
+ * @brief Returns a matrix sub-range
+ * @param ro Row offset
+ * @param r Rows in range
+ * @param co Column offset
+ * @param c Columns in range
+ * See @c SubMatrix class for details
+ */
+ SubMatrix<_ElemT>
+ Range(const size_t ro, const size_t r,
+ const size_t co, const size_t c)
+ { return SubMatrix<_ElemT>(*this, ro, r, co, c); }
+
+ const SubMatrix<_ElemT>
+ Range(const size_t ro, const size_t r,
+ const size_t co, const size_t c) const
+ { return SubMatrix<_ElemT>(*this, ro, r, co, c); }
+ /** @} */
+
+
+ /**
+ * **********************************************************************
+ * **********************************************************************
+ * @defgroup MATH ROUTINES
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ **/
+
+ /**
+ * @brief Returns sum of all elements
+ */
+ _ElemT
+ Sum() const;
+
+ ThisType &
+ DotMul(const ThisType& a);
+
+ ThisType &
+ Scale(_ElemT alpha);
+
+ ThisType &
+ ScaleCols(const Vector<_ElemT> &scale); // Equivalent to (*this) = (*this) * diag(scale).
+
+ ThisType &
+ ScaleRows(const Vector<_ElemT> &scale); // Equivalent to (*this) = diag(scale) * (*this);
+
+ /// Sum another matrix rMatrix with this matrix
+ ThisType&
+ Add(const Matrix<_ElemT>& rMatrix);
+
+
+ /// Sum scaled matrix rMatrix with this matrix
+ ThisType&
+ AddScaled(_ElemT alpha, const Matrix<_ElemT>& rMatrix);
+
+ /// Apply log to all items of the matrix
+ ThisType&
+ ApplyLog();
+
+ /**
+ * @brief Computes the determinant of this matrix
+ * @return Returns the determinant of a matrix
+ * @ingroup MATH
+ *
+ */
+ _ElemT LogAbsDeterminant(_ElemT *DetSign=NULL);
+
+
+ /**
+ * @brief Performs matrix inplace inversion
+ */
+ ThisType &
+ Invert(_ElemT *LogDet=NULL, _ElemT *DetSign=NULL, bool inverse_needed=true);
+
+ /**
+ * @brief Performs matrix inplace inversion in double precision, even if this object is not double precision.
+ */
+ ThisType &
+ InvertDouble(_ElemT *LogDet=NULL, _ElemT *DetSign=NULL, bool inverse_needed=true){
+ double LogDet_tmp, DetSign_tmp;
+ Matrix<double> dmat(*this); dmat.Invert(&LogDet_tmp, &DetSign_tmp, inverse_needed); if(inverse_needed) (*this).Copy(dmat);
+ if(LogDet) *LogDet = LogDet_tmp; if(DetSign) *DetSign = DetSign_tmp;
+ return *this;
+ }
+
+
+ /**
+ * @brief Inplace matrix transposition. Applicable only to square matrices
+ */
+ ThisType &
+ Transpose()
+ {
+ assert(Rows()==Cols());
+ size_t M=Rows();
+ for(size_t i=0;i<M;i++)
+ for(size_t j=0;j<i;j++){
+ _ElemT &a = (*this)(i,j), &b = (*this)(j,i);
+ std::swap(a,b);
+ }
+ return *this;
+ }
+
+
+
+
+
+ bool IsSymmetric(_ElemT cutoff = 1.0e-05) const;
+
+ bool IsDiagonal(_ElemT cutoff = 1.0e-05) const;
+
+ bool IsUnit(_ElemT cutoff = 1.0e-05) const;
+
+ bool IsZero(_ElemT cutoff = 1.0e-05) const;
+
+ _ElemT FrobeniusNorm() const; // sqrt of sum of square elements.
+
+ _ElemT LargestAbsElem() const; // largest absolute value.
+
+
+ friend _ElemT TNet::TraceOfProduct<_ElemT>(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B)
+ friend _ElemT TNet::TraceOfProductT<_ElemT>(const Matrix<_ElemT> &A, const Matrix<_ElemT> &B); // tr(A B^T)==tr(A^T B)
+ friend class SubMatrix<_ElemT>; // so it can get around const restrictions on the pointer to mpData.
+
+ /** **********************************************************************
+ * **********************************************************************
+ * @defgroup BLAS_ROUTINES BLAS ROUTINES
+ * @ingroup MATH
+ * **********************************************************************
+ * **********************************************************************
+ **/
+
+ ThisType &
+ BlasGer(const _ElemT alpha, const Vector<_ElemT>& rA, const Vector<_ElemT>& rB);
+
+ ThisType &
+ Axpy(const _ElemT alpha, const Matrix<_ElemT> &rM, MatrixTrasposeType transA=NO_TRANS);
+
+ ThisType &
+ BlasGemm(const _ElemT alpha,
+ const ThisType& rA, MatrixTrasposeType transA,
+ const ThisType& rB, MatrixTrasposeType transB,
+ const _ElemT beta = 0.0);
+
+
+ /** @} */
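+
+ // Usage sketch for the BLAS wrappers above (A, B, C are assumed to be
+ // already-initialized matrices with conforming sizes):
+ //   C.BlasGemm(1.0f, A, NO_TRANS, B, TRANS, 0.0f);  // C = A * B^T
+ //   C.Axpy(0.5f, A, NO_TRANS);                      // C += 0.5 * A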
+
+
+ /** **********************************************************************
+ * **********************************************************************
+ * @defgroup IO Input/Output ROUTINES
+ * **********************************************************************
+ * **********************************************************************
+ * @{
+ **/
+
+ friend std::ostream &
+ operator << <> (std::ostream & out, const ThisType & m);
+
+ void PrintOut(char *file);
+ void ReadIn(char *file);
+
+
+ bool
+ LoadHTK(const char* pFileName);
+
+ /** @} */
+
+
+ protected:
+// inline void swap4b(void *a);
+// inline void swap2b(void *a);
+
+
+ protected:
+ /// data memory area
+ _ElemT* mpData;
+
+ /// these attributes store the real matrix size as it is stored in memory
+ /// including memalignment
+ size_t mMCols; ///< Number of columns
+ size_t mMRows; ///< Number of rows
+ size_t mStride; ///< true number of columns for the internal matrix.
+ ///< This number may differ from mMCols as memory
+ ///< alignment might be used
+
+#ifdef STK_MEMALIGN_MANUAL
+ /// data to be freed (in case of manual memalignment use, see Common.h)
+ _ElemT* mpFreeData;
+#endif
+ }; // class Matrix
+
+ template<> Matrix<float> & Matrix<float>::Invert(float *LogDet, float *DetSign, bool inverse_needed); // state that we will implement separately for float and double.
+ template<> Matrix<double> & Matrix<double>::Invert(double *LogDet, double *DetSign, bool inverse_needed);
+
+
+
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Sub-matrix representation
+ *
+ * This class provides a way to work with matrix cutouts in TNet.
+ *
+ *
+ */
+ template<typename _ElemT>
+ class SubMatrix : public Matrix<_ElemT>
+ {
+ typedef SubMatrix<_ElemT> ThisType;
+
+ public:
+ /// Constructor
+ SubMatrix(const Matrix<_ElemT>& rT, // Taken by const reference, but SubMatrix can still change its contents through pData_workaround().
+ const size_t ro,
+ const size_t r,
+ const size_t co,
+ const size_t c);
+
+
+ /// The destructor
+ ~SubMatrix<_ElemT>()
+ {
+#ifndef STK_MEMALIGN_MANUAL
+ Matrix<_ElemT>::mpData = NULL;
+#else
+ Matrix<_ElemT>::mpFreeData = NULL;
+#endif
+ }
+
+ /// Assign operator
+ ThisType& operator=(const ThisType& rSrc)
+ {
+ //std::cout << "[PERFORMing operator= SubMatrix&^2]" << std::flush;
+ this->mpData = rSrc.mpData;
+ this->mMCols = rSrc.mMCols;
+ this->mMRows = rSrc.mMRows;
+ this->mStride = rSrc.mStride;
+ this->mpFreeData = rSrc.mpFreeData;
+ return *this;
+ }
+
+
+
+ /// Initializes matrix (if not done by constructor)
+ ThisType &
+ Init(const size_t r,
+ const size_t c, bool clear=true)
+ { Error("Submatrix cannot do Init"); return *this; }
+
+ /**
+ * @brief Deallocates the matrix from memory and resets the dimensions to (0, 0)
+ */
+ void
+ Destroy()
+ { Error("Submatrix cannot do Destroy"); }
+
+
+
+ };
+
+
+
+ //Create useful shortcuts
+ typedef Matrix<BaseFloat> BfMatrix;
+ typedef SubMatrix<BaseFloat> BfSubMatrix;
+
+ /**
+ * Function for summing matrices of different types
+ */
+ template<typename _ElemT, typename _ElemU>
+ void Add(Matrix<_ElemT>& rDst, const Matrix<_ElemU>& rSrc) {
+ assert(rDst.Cols() == rSrc.Cols());
+ assert(rDst.Rows() == rSrc.Rows());
+
+ for(size_t i=0; i<rDst.Rows(); i++) {
+ const _ElemU* p_src = rSrc.pRowData(i);
+ _ElemT* p_dst = rDst.pRowData(i);
+ for(size_t j=0; j<rDst.Cols(); j++) {
+ *p_dst++ += (_ElemT)*p_src++;
+ }
+ }
+ }
+
+ /**
+ * Function for summing matrices of different types
+ */
+ template<typename _ElemT, typename _ElemU>
+ void AddScaled(Matrix<_ElemT>& rDst, const Matrix<_ElemU>& rSrc, _ElemT scale) {
+ assert(rDst.Cols() == rSrc.Cols());
+ assert(rDst.Rows() == rSrc.Rows());
+
+ Vector<_ElemT> tmp(rDst[0]);
+
+ for(size_t i=0; i<rDst.Rows(); i++) {
+ tmp.Copy(rSrc[i]);
+ rDst[i].BlasAxpy(scale, tmp);
+
+ /*
+ const _ElemU* p_src = rSrc.pRowData(i);
+ _ElemT* p_dst = rDst.pRowData(i);
+ for(size_t j=0; j<rDst.Cols(); j++) {
+ *p_dst++ += (_ElemT)(*p_src++) * scale;
+ }
+ */
+ }
+ }
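+
+ // Sketch of mixed-precision accumulation with the helpers above (dimensions
+ // are assumed to match):
+ //   Matrix<double> acc(rows, cols);    // double-precision accumulator
+ //   Matrix<float>  update(rows, cols); // single-precision increment
+ //   Add(acc, update);                  // acc(i,j) += (double)update(i,j)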
+
+
+
+
+
+} // namespace TNet
+
+
+
+//*****************************************************************************
+//*****************************************************************************
+// we need to include the implementation
+#include "Matrix.tcc"
+//*****************************************************************************
+//*****************************************************************************
+
+
+/******************************************************************************
+ ******************************************************************************
+ * The following section contains specialized template definitions
+ * whose implementation is in Matrix.cc
+ */
+
+
+//#ifndef TNet_Matrix_h
+#endif
diff --git a/src/KaldiLib/Matrix.tcc b/src/KaldiLib/Matrix.tcc
new file mode 100644
index 0000000..110abe0
--- /dev/null
+++ b/src/KaldiLib/Matrix.tcc
@@ -0,0 +1,796 @@
+
+/** @file Matrix.tcc
+ * This is an internal header file, included by other library headers.
+ * You should not attempt to use it directly.
+ */
+
+
+#ifndef TNet_Matrix_tcc
+#define TNet_Matrix_tcc
+
+//#pragma GCC system_header
+
+#include <cstdlib>
+#include <cmath>
+#include <cfloat>
+#include <fstream>
+#include <iomanip>
+#include <typeinfo>
+#include <algorithm>
+#include <limits>
+#include <vector>
+#include "Common.h"
+
+#ifndef _XOPEN_SOURCE
+ #define _XOPEN_SOURCE 600
+#endif
+
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+}
+#endif
+
+
+#include "Common.h"
+#include "Vector.h"
+namespace TNet
+{
+
+//******************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Init(const size_t rows,
+ const size_t cols,
+ bool clear)
+ {
+ if(mpData != NULL) Destroy();
+ if(rows*cols == 0){
+ assert(rows==0 && cols==0);
+ mMRows=rows;
+ mMCols=cols;
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData=NULL;
+#endif
+ mpData=NULL;
+ return *this;
+ }
+ // initialize some helping vars
+ size_t skip;
+ size_t real_cols;
+ size_t size;
+ void* data; // aligned memory block
+ void* free_data; // memory block to be really freed
+
+ // compute the size of skip and real cols
+ skip = ((16 / sizeof(_ElemT)) - cols % (16 / sizeof(_ElemT))) % (16 / sizeof(_ElemT));
+ real_cols = cols + skip;
+ size = rows * real_cols * sizeof(_ElemT);
+
+ // allocate the memory and set the right dimensions and parameters
+
+ if (NULL != (data = stk_memalign(16, size, &free_data)))
+ {
+ mpData = static_cast<_ElemT *> (data);
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData = static_cast<_ElemT *> (free_data);
+#endif
+ mMRows = rows;
+ mMCols = cols;
+ mStride = real_cols;
+ }
+ else
+ {
+ throw std::bad_alloc();
+ }
+ if(clear) Zero();
+ return *this;
+ } //
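+
+ // Example of the padding arithmetic above (illustrative): for float elements
+ // (4 bytes) and cols == 10, 16/sizeof(float) == 4, so
+ //   skip = (4 - 10 % 4) % 4 = 2   and   mStride = real_cols = 12,
+ // which keeps every row start 16-byte aligned.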
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ template<typename _ElemU>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Copy(const Matrix<_ElemU> & rM, MatrixTrasposeType Trans)
+ {
+ if(Trans==NO_TRANS){
+ assert(mMRows == rM.Rows() && mMCols == rM.Cols());
+ for(size_t i = 0; i < mMRows; i++)
+ (*this)[i].Copy(rM[i]);
+ return *this;
+ } else {
+ assert(mMCols == rM.Rows() && mMRows == rM.Cols());
+ for(size_t i = 0; i < mMRows; i++)
+ for(size_t j = 0; j < mMCols; j++)
+ (*this)(i,j) = rM(j,i);
+ return *this;
+ }
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ CopyVectorSplicedRows(const Vector<_ElemT> &rV, const size_t nRows, const size_t nCols) {
+ assert(rV.Dim() == nRows*nCols);
+ mMRows = nRows;
+ mMCols = nCols;
+
+ for(size_t r=0; r<mMRows; r++)
+ for(size_t c=0; c<mMCols; c++)
+ (*this)(r,c) = rV(r*mMCols + c);
+
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ RemoveRow(size_t i)
+ {
+ assert(i < mMRows && "Access out of matrix");
+ for(size_t j = i + 1; j < mMRows; j++)
+ (*this)[j - 1].Copy((*this)[j]);
+ mMRows--;
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // The destructor
+ template<typename _ElemT>
+ void
+ Matrix<_ElemT>::
+ Destroy()
+ {
+ // we need to free the data block if it was defined
+#ifndef STK_MEMALIGN_MANUAL
+ if (NULL != mpData) free(mpData);
+#else
+ if (NULL != mpData) free(mpFreeData);
+ mpFreeData = NULL;
+#endif
+
+ mpData = NULL;
+ mMRows = mMCols = 0;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+// template<typename _ElemT>
+// void
+// Matrix<_ElemT>::
+// VectorizeRows(Vector<_ElemT> &rV) {
+//#ifdef PARANIOD
+// assert(rV.Dim() == mMRows*mMCols);
+//#endif
+// for(size_t r=0; r<mMRows; r++) {
+// rV.Range((r-1)*mMCols, mMCols).Copy((*this)[r]);
+// }
+// }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ bool
+ Matrix<_ElemT>::
+ LoadHTK(const char* pFileName)
+ {
+ HtkHeader htk_hdr;
+
+ FILE *fp = fopen(pFileName, "rb");
+ if(!fp)
+ {
+ return false;
+ }
+
+ read(fileno(fp), &htk_hdr, sizeof(htk_hdr));
+
+ swap4(htk_hdr.mNSamples);
+ swap4(htk_hdr.mSamplePeriod);
+ swap2(htk_hdr.mSampleSize);
+ swap2(htk_hdr.mSampleKind);
+
+ Init(htk_hdr.mNSamples, htk_hdr.mSampleSize / sizeof(float));
+
+ size_t i;
+ size_t j;
+ if (typeid(_ElemT) == typeid(float))
+ {
+ for (i=0; i< Rows(); ++i) {
+ read(fileno(fp), (*this).pRowData(i), Cols() * sizeof(float));
+
+ for(j = 0; j < Cols(); j++) {
+ swap4(((*this)(i,j)));
+ }
+ }
+ }
+ else
+ {
+ float *pmem = new (std::nothrow) float[Cols()];
+ if (!pmem)
+ {
+ fclose(fp);
+ return false;
+ }
+
+ for(i = 0; i < Rows(); i++) {
+ read(fileno(fp), pmem, Cols() * sizeof(float));
+
+ for (j = 0; j < Cols(); ++j) {
+ swap4(pmem[j]);
+ (*this)(i,j) = static_cast<_ElemT>(pmem[j]);
+ }
+ }
+ delete [] pmem;
+ }
+
+ fclose(fp);
+
+ return true;
+ }
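+
+ // Sketch of the header use above (illustrative): for a 39-dimensional float
+ // feature file, htk_hdr.mSampleSize == 39 * sizeof(float) == 156 bytes, so the
+ // matrix is initialized with Cols() == 39 and Rows() == htk_hdr.mNSamples.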
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ DotMul(const ThisType& a)
+ {
+ size_t i;
+ size_t j;
+
+ for (i = 0; i < mMRows; ++i) {
+ for (j = 0; j < mMCols; ++j) {
+ (*this)(i,j) *= a(i,j);
+ }
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ Sum() const
+ {
+ double sum = 0.0;
+
+ for (size_t i = 0; i < Rows(); ++i) {
+ for (size_t j = 0; j < Cols(); ++j) {
+ sum += (*this)(i,j);
+ }
+ }
+
+ return sum;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ Scale(_ElemT alpha)
+ {
+#if 0
+ for (size_t i = 0; i < Rows(); ++i)
+ for (size_t j = 0; j < Cols(); ++j)
+ (*this)(i,j) *= alpha;
+#else
+ for (size_t i = 0; i < Rows(); ++i) {
+ _ElemT* p_data = pRowData(i);
+ for (size_t j = 0; j < Cols(); ++j) {
+ *p_data++ *= alpha;
+ }
+ }
+#endif
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ ScaleRows(const Vector<_ElemT>& scale) // scales each row by scale[i].
+ {
+ assert(scale.Dim() == Rows());
+ size_t M = Rows(), N = Cols();
+
+ for (size_t i = 0; i < M; i++) {
+ _ElemT this_scale = scale(i);
+ for (size_t j = 0; j < N; j++) {
+ (*this)(i,j) *= this_scale;
+ }
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ ScaleCols(const Vector<_ElemT>& scale) // scales each column by scale[i].
+ {
+ assert(scale.Dim() == Cols());
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ _ElemT this_scale = scale(j);
+ (*this)(i,j) *= this_scale;
+ }
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ Add(const Matrix<_ElemT>& rMatrix)
+ {
+ assert(rMatrix.Cols() == Cols());
+ assert(rMatrix.Rows() == Rows());
+
+#if 0
+ //this can be slow
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ (*this)(i,j) += rMatrix(i,j);
+ }
+ }
+#else
+ //this will be faster (but less secure)
+ for(size_t i=0; i<Rows(); i++) {
+ const _ElemT* p_src = rMatrix.pRowData(i);
+ _ElemT* p_dst = pRowData(i);
+ for(size_t j=0; j<Cols(); j++) {
+ *p_dst++ += *p_src++;
+ }
+ }
+#endif
+ return *this;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ AddScaled(_ElemT alpha, const Matrix<_ElemT>& rMatrix)
+ {
+ assert(rMatrix.Cols() == Cols());
+ assert(rMatrix.Rows() == Rows());
+
+#if 0
+ //this can be slow
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ (*this)(i,j) += rMatrix(i,j) * alpha;
+ }
+ }
+#else
+ /*
+ //this will be faster (but less secure)
+ for(size_t i=0; i<Rows(); i++) {
+ const _ElemT* p_src = rMatrix.pRowData(i);
+ _ElemT* p_dst = pRowData(i);
+ for(size_t j=0; j<Cols(); j++) {
+ *p_dst++ += *p_src++ * alpha;
+ }
+ }
+ */
+
+ //let's use BLAS
+ for(size_t i=0; i<Rows(); i++) {
+ (*this)[i].BlasAxpy(alpha, rMatrix[i]);
+ }
+#endif
+ return *this;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ Matrix<_ElemT>::
+ ApplyLog()
+ {
+
+#if 0
+ //this can be slow
+ for (size_t i = 0; i < Rows(); i++) {
+ for (size_t j = 0; j < Cols(); j++) {
+ (*this)(i,j) = _LOG((*this)(i,j));
+ }
+ }
+#else
+ //this will be faster (but less secure)
+ for(size_t i=0; i<Rows(); i++) {
+ _ElemT* p_data = pRowData(i);
+ for(size_t j=0; j<Cols(); j++) {
+ *p_data = _LOG(*p_data);
+ p_data++;
+ }
+ }
+#endif
+ return *this;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Zero()
+ {
+ for(size_t row=0;row<mMRows;row++)
+ memset(mpData + row*mStride, 0, sizeof(_ElemT)*mMCols);
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Matrix<_ElemT> &
+ Matrix<_ElemT>::
+ Unit()
+ {
+ for(size_t row=0;row<std::min(mMRows,mMCols);row++){
+ memset(mpData + row*mStride, 0, sizeof(_ElemT)*mMCols);
+ (*this)(row,row) = 1.0;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void
+ Matrix<_ElemT>::
+ PrintOut(char* file)
+ {
+ FILE* f = fopen(file, "w");
+ unsigned i,j;
+ fprintf(f, "%dx%d\n", this->mMRows, this->mMCols);
+
+ for(i=0; i<this->mMRows; i++)
+ {
+ _ElemT* row = this->pRowData(i);
+
+ for(j=0; j<this->mStride; j++){
+ fprintf(f, "%20.17f ",row[j]);
+ }
+ fprintf(f, "\n");
+ }
+
+ fclose(f);
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void
+ Matrix<_ElemT>::
+ ReadIn(char* file)
+ {
+ FILE* f = fopen(file, "r");
+ int i = 0;
+ int j = 0;
+ fscanf(f, "%dx%d\n", &i,&j);
+ fprintf(stderr, "%dx%d\n", i,j);
+
+ for(i=0; i<this->mMRows; i++)
+ {
+ _ElemT* row = this->pRowData(i);
+
+ for(j=0; j<this->mStride; j++){
+ fscanf(f, "%f ",&row[j]);
+ }
+ //fprintf(f, "\n");
+ }
+
+ fclose(f);
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Save (std::ostream &rOut, const Matrix<_ElemT> &rM)
+ {
+ for (size_t i = 0; i < rM.Rows(); i++) {
+ for (size_t j = 0; j < rM.Cols(); j++) {
+ rOut << rM(i,j) << ' ';
+ }
+ rOut << '\n';
+ }
+ if(rOut.fail())
+ throw std::runtime_error("Failed to write matrix to stream");
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ std::ostream &
+ operator << (std::ostream & rOut, const Matrix<_ElemT> & rM)
+ {
+ rOut << "m " << rM.Rows() << ' ' << rM.Cols() << '\n';
+ Save(rOut, rM);
+ return rOut;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Load (std::istream & rIn, Matrix<_ElemT> & rM)
+ {
+ if(MatrixVectorIostreamControl::Flags(rIn, ACCUMULATE_INPUT)) {
+ for (size_t i = 0; i < rM.Rows(); i++) {
+ std::streamoff pos = rIn.tellg();
+ for (size_t j = 0; j < rM.Cols(); j++) {
+ _ElemT tmp;
+ rIn >> tmp;
+ rM(i,j) += tmp;
+ if(rIn.fail()){
+ throw std::runtime_error("Failed to read matrix from stream. File position is "+to_string(pos));
+ }
+ }
+ }
+ } else {
+ for (size_t i = 0; i < rM.Rows(); i++) {
+ std::streamoff pos = rIn.tellg();
+ for (size_t j = 0; j < rM.Cols(); j++) {
+ rIn >> rM(i,j);
+ if(rIn.fail()){
+ throw std::runtime_error("Failed to read matrix from stream. File position is "+to_string(pos));
+ }
+
+ }
+ }
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ std::istream &
+ operator >> (std::istream & rIn, Matrix<_ElemT> & rM)
+ {
+ while(isascii(rIn.peek()) && isspace(rIn.peek())) rIn.get(); // eat up space.
+ if(rIn.peek() == 'm'){ // "new" format: m <nrows> <ncols> \n 1.0 0.2 4.3 ...
+ rIn.get();// eat up the 'm'.
+ long long int nrows=-1; rIn>>nrows;
+ long long int ncols=-1; rIn>>ncols;
+ if(rIn.fail()||nrows<0||ncols<0){ throw std::runtime_error("Failed to read matrix from stream: no size\n"); }
+
+ size_t nrows2 = size_t(nrows), ncols2 = size_t(ncols);
+ assert((long long int)nrows2 == nrows && (long long int)ncols2 == ncols);
+
+ if(rM.Rows()!=nrows2 || rM.Cols()!=ncols2) rM.Init(nrows2,ncols2);
+ }
+ Load(rIn,rM);
+ return rIn;
+ }
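+
+ // Example of the accepted text format (illustrative values): the header line
+ //   m 2 3
+ // resizes the matrix to 2x3 and is followed by 6 whitespace-separated values,
+ //   1.0 0.2 4.3
+ //   0.0 1.0 0.5
+ // A stream without the leading 'm' is read straight into a pre-sized matrix.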
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // Constructor
+ template<typename _ElemT>
+ SubMatrix<_ElemT>::
+ SubMatrix(const Matrix<_ElemT>& rT, // Taken by const reference, but SubMatrix can still change its contents via pData_workaround(); a ConstSubMatrix would be needed to prevent that.
+ const size_t ro,
+ const size_t r,
+ const size_t co,
+ const size_t c)
+ {
+ assert(ro >= 0 && ro <= rT.Rows());
+ assert(co >= 0 && co <= rT.Cols());
+ assert(r > 0 && r <= rT.Rows() - ro);
+ assert(c > 0 && c <= rT.Cols() - co);
+ // point to the begining of window
+ Matrix<_ElemT>::mMRows = r;
+ Matrix<_ElemT>::mMCols = c;
+ Matrix<_ElemT>::mStride = rT.Stride();
+ Matrix<_ElemT>::mpData = rT.pData_workaround() + co + ro * rT.Stride();
+ }
+
+
+
+#ifdef HAVE_ATLAS
+
+ template<>
+ Matrix<float> &
+ Matrix<float>::
+ BlasGer(const float alpha, const Vector<float>& rA, const Vector<float>& rB);
+
+
+ template<>
+ Matrix<double> &
+ Matrix<double>::
+ BlasGer(const double alpha, const Vector<double>& rA, const Vector<double>& rB);
+
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ BlasGemm(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA,
+ const Matrix<float>& rB, MatrixTrasposeType transB,
+ const float beta);
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ BlasGemm(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA,
+ const Matrix<double>& rB, MatrixTrasposeType transB,
+ const double beta);
+
+ template<>
+ Matrix<float>&
+ Matrix<float>::
+ Axpy(const float alpha,
+ const Matrix<float>& rA, MatrixTrasposeType transA);
+
+ template<>
+ Matrix<double>&
+ Matrix<double>::
+ Axpy(const double alpha,
+ const Matrix<double>& rA, MatrixTrasposeType transA);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ double TraceOfProduct(const Matrix<double> &A, const Matrix<double> &B);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ double TraceOfProductT(const Matrix<double> &A, const Matrix<double> &B);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ float TraceOfProduct(const Matrix<float> &A, const Matrix<float> &B);
+
+ template <> // non-member so automatic namespace lookup can occur.
+ float TraceOfProductT(const Matrix<float> &A, const Matrix<float> &B);
+
+
+
+#else // HAVE_ATLAS
+ #error Routines in this section are not implemented yet without BLAS
+#endif // HAVE_ATLAS
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsSymmetric(_ElemT cutoff) const {
+ size_t R=Rows(), C=Cols();
+ if(R!=C) return false;
+ _ElemT bad_sum=0.0, good_sum=0.0;
+ for(size_t i=0;i<R;i++){
+ for(size_t j=0;j<i;j++){
+ _ElemT a=(*this)(i,j),b=(*this)(j,i), avg=0.5*(a+b), diff=0.5*(a-b);
+ good_sum += fabs(avg); bad_sum += fabs(diff);
+ }
+ good_sum += fabs((*this)(i,i));
+ }
+ if(bad_sum > cutoff*good_sum) return false;
+ return true;
+ }
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsDiagonal(_ElemT cutoff) const{
+ size_t R=Rows(), C=Cols();
+ _ElemT bad_sum=0.0, good_sum=0.0;
+ for(size_t i=0;i<R;i++){
+ for(size_t j=0;j<C;j++){
+ if(i==j) good_sum += (*this)(i,j);
+ else bad_sum += (*this)(i,j);
+ }
+ }
+ return (!(bad_sum > good_sum * cutoff));
+ }
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsUnit(_ElemT cutoff) const {
+ size_t R=Rows(), C=Cols();
+ if(R!=C) return false;
+ _ElemT bad_sum=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++)
+ bad_sum += fabs( (*this)(i,j) - (i==j?1.0:0.0));
+ return (bad_sum <= cutoff);
+ }
+
+ template<class _ElemT>
+ bool
+ Matrix<_ElemT>::
+ IsZero(_ElemT cutoff)const {
+ size_t R=Rows(), C=Cols();
+ _ElemT bad_sum=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++)
+ bad_sum += fabs( (*this)(i,j) );
+ return (bad_sum <= cutoff);
+ }
+
+ template<class _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ FrobeniusNorm() const{
+ size_t R=Rows(), C=Cols();
+ _ElemT sum=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++){
+ _ElemT tmp = (*this)(i,j);
+ sum += tmp*tmp;
+ }
+ return sqrt(sum);
+ }
+
+ template<class _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ LargestAbsElem() const{
+ size_t R=Rows(), C=Cols();
+ _ElemT largest=0.0;
+ for(size_t i=0;i<R;i++)
+ for(size_t j=0;j<C;j++)
+ largest = std::max(largest, (_ElemT)fabs((*this)(i,j)));
+ return largest;
+ }
+
+
+
+ // Uses SVD to compute the eigenvalue decomposition of a symmetric positive semidefinite
+ // matrix:
+ // (*this) = rU * diag(rS) * rU^T, with rU an orthogonal matrix so rU^{-1} = rU^T.
+ // Does this by computing svd (*this) = U diag(rS) V^T ... answer is just U diag(rS) U^T.
+ // Throws exception if this failed to within supplied precision (typically because *this was not
+ // symmetric positive definite).
+
+
+
+ template<class _ElemT>
+ _ElemT
+ Matrix<_ElemT>::
+ LogAbsDeterminant(_ElemT *DetSign){
+ _ElemT LogDet;
+ Matrix<_ElemT> tmp(*this);
+ tmp.Invert(&LogDet, DetSign, false); // false== output not needed (saves some computation).
+ return LogDet;
+ }
+
+}// namespace TNet
+
+// #define TNet_Matrix_tcc
+#endif
diff --git a/src/KaldiLib/MlfStream.cc b/src/KaldiLib/MlfStream.cc
new file mode 100644
index 0000000..a2f6478
--- /dev/null
+++ b/src/KaldiLib/MlfStream.cc
@@ -0,0 +1,268 @@
+#include "MlfStream.h"
+#include "Common.h"
+#include "Error.h"
+
+
+namespace TNet
+{
+ //******************************************************************************
+ LabelContainer::
+ ~LabelContainer()
+ {
+ while (!this->mLabelList.empty())
+ {
+ delete this->mLabelList.back();
+ this->mLabelList.pop_back();
+ }
+ }
+
+ //******************************************************************************
+ size_t
+ LabelContainer::
+ DirDepth(const std::string & rPath)
+ {
+ size_t depth = 0;
+ size_t length = rPath.length();
+ const char * s = rPath.c_str();
+
+ for (size_t i = 0; i < length; i++)
+ {
+ if (*s == '/' || *s == '\\')
+ {
+ depth++;
+ }
+ s++;
+ }
+ return depth;
+ }
+
+
+ //******************************************************************************
+ void
+ LabelContainer::
+ Insert(const std::string & rLabel,
+ std::streampos Pos)
+ {
+ LabelRecord ls;
+ size_t depth;
+ LabelRecord tmp_ls;
+
+ // we need to compute the depth of the label path if
+ // wildcard is used
+ // do we have a wildcard???
+ if (rLabel[0] == '*')
+ {
+ depth = this->DirDepth(rLabel);
+ }
+ else
+ {
+ depth = MAX_LABEL_DEPTH;
+ }
+
+ // store the depth of the label path so that wildcard lookups know which
+ // suffix depths to try
+ this->mDepths.insert(depth);
+
+ // store the values
+ ls.mStreamPos = Pos;
+ ls.miLabelListLimit = mLabelList.end();
+
+
+ if (mLabelList.begin() != mLabelList.end()) {
+ ls.miLabelListLimit--;
+ }
+
+ // if no wildcard chars, then we try to store in hash, otherwise store in
+ // list
+ if (rLabel.find_first_of("*?%",1) == rLabel.npos)
+ {
+ if (!Find(rLabel, tmp_ls))
+ {
+ // not found yet, so store the label in the hash
+ this->mLabelMap[rLabel] = ls;
+ }
+ else {
+ ;
+ //Warning("More general definition found when inserting " + rLabel + " ... label: " + MatchedPattern());
+ }
+ }
+ else
+ {
+ this->mLabelList.push_back(new std::pair<std::string,LabelRecord>(rLabel, ls));
+ }
+ }
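+
+ // Illustrative example of the depth bookkeeping above (the paths are made up):
+ // inserting the wildcard pattern "*/train/utt1.lab" records depth 2 in mDepths,
+ // so a later Find("/data/train/utt1.lab") can crop the query to
+ // "*/train/utt1.lab" and locate it in the hash.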
+
+
+ //******************************************************************************
+ bool
+ LabelContainer::
+ FindInHash(const std::string & rLabel, LabelRecord & rLS)
+ {
+ bool found = false;
+
+ std::string str;
+
+ // current depth within the str
+ DepthType current_depth = MAX_LABEL_DEPTH;
+
+ // current search position within the str
+ size_t prev = rLabel.size() + 1;
+
+ // we walk through the set of depths backwards, so we begin at the end and move
+ // towards the front...
+ std::set<DepthType>::reverse_iterator ri (this->mDepths.end());
+ std::set<DepthType>::reverse_iterator rlast (this->mDepths.begin());
+ LabelHashType::iterator lab;
+
+ // we perform the search until we run to the end of the set or we find something
+ while ((!found) && (ri != rlast))
+ {
+ // we don't need to do anything with the string if the depth is set to
+ // max label depth since it contains no *
+ if (*ri == MAX_LABEL_DEPTH)
+ {
+ found = ((lab=this->mLabelMap.find(rLabel)) != this->mLabelMap.end());
+ if (found) str = rLabel;
+ }
+ // we will crop the string and put * in the begining and try to search
+ else
+ {
+ // we know that we walk backwards in the depths, so we need to first find
+ // the last / and
+ if (current_depth == MAX_LABEL_DEPTH)
+ {
+ if (*ri > 0)
+ {
+ // we find the ri-th / from back
+ for (DepthType i=1; (i <= *ri) && (prev != rLabel.npos); i++)
+ {
+ prev = rLabel.find_last_of("/\\", prev-1);
+ }
+ }
+ else
+ {
+ prev = 0;
+ }
+
+ // check if finding succeeded (prev == str.npos => failure, see STL)
+ if (prev != rLabel.npos)
+ {
+ // construct the new string being sought
+ str.assign(rLabel, prev, rLabel.size());
+ str = '*' + str;
+
+ // now we try to find
+ found = ((lab=this->mLabelMap.find(str)) != this->mLabelMap.end());
+
+ // say, that current depth is *ri
+ current_depth = *ri;
+ }
+ else
+ {
+ prev = rLabel.size() + 1;
+ }
+ } // if (current_depth == MAX_LABEL_DEPTH)
+ else
+ {
+ // now we know at which / we are from the back, so we search forward now
+ // and we need to reach the ri-th /
+ while (current_depth > *ri)
+ {
+ // we try to find next /
+ if ((prev = rLabel.find_first_of("/\\", prev+1)) != rLabel.npos)
+ current_depth--;
+ else
+ return false;
+ }
+
+ // construct the new string being sought
+ str.assign(rLabel, prev, rLabel.size());
+ str = '*' + str;
+
+ // now we try to find
+ found = ((lab=this->mLabelMap.find(str)) != this->mLabelMap.end());
+ }
+ }
+
+ // move one element further (jump to next observed depth)
+ ri++;
+ } // while (run)
+
+ // store the result of a successful search
+ if (found)
+ {
+ rLS = lab->second;
+ this->mMatchedPattern = str;
+ }
+
+ return found;
+ }
+
+
+ //******************************************************************************
+ bool
+ LabelContainer::
+ FindInList(const std::string & rLabel, LabelRecord & rLS, bool limitSearch)
+ {
+
+ bool found = false;
+ std::string str;
+ LabelListType::iterator lab = mLabelList.begin();
+ LabelListType::iterator limit;
+
+ if (limitSearch && (rLS.miLabelListLimit != mLabelList.end()))
+ {
+ limit = rLS.miLabelListLimit;
+ limit++;
+ }
+ else
+ {
+ limit = this->mLabelList.end();
+ }
+
+ // we perform sequential search until we run to the end of the list or we find
+ // something
+ while ((!found) && (lab != limit))
+ {
+ if (ProcessMask(rLabel, (*lab)->first, str))
+ {
+ found = true;
+ }
+ else
+ {
+ lab++;
+ }
+ } // while (run)
+
+ // store the result of a successful search
+ if (found)
+ {
+ rLS = (*lab)->second;
+ this->mMatchedPattern = (*lab)->first;
+ this->mMatchedPatternMask = str;
+ }
+ return found;
+ }
+
+
+ //******************************************************************************
+ bool
+ LabelContainer::
+ Find(const std::string & rLabel, LabelRecord & rLS)
+ {
+ // try to find the label in the Hash
+ if (FindInHash(rLabel, rLS))
+ {
+ // we look in the list, but we limit the search.
+ FindInList(rLabel, rLS, true);
+ return true;
+ } //if (this->mLabelContainer.FindInHash(rLabel, label_stream))
+ else
+ {
+ // we didn't find it in the hash so we look in the list
+ return FindInList(rLabel, rLS);
+ }
+ }
+
+} // namespace TNet
+
diff --git a/src/KaldiLib/MlfStream.h b/src/KaldiLib/MlfStream.h
new file mode 100644
index 0000000..81f2d6e
--- /dev/null
+++ b/src/KaldiLib/MlfStream.h
@@ -0,0 +1,639 @@
+/** @file MlfStream.h
+ * This is a TNet C++ Library header.
+ *
+ * The naming convention in this file copies the std::* naming as well as STK
+ */
+
+
+#ifndef STK_MlfStream_h
+#define STK_MlfStream_h
+
+#include <iostream>
+#include <vector>
+#include <map>
+#include <list>
+#include <set>
+
+
+namespace TNet
+{
+ class LabelRecord;
+ class LabelContainer;
+
+
+ /// this container stores the labels in the linear order in which they came,
+ /// i.e. they cannot be hashed
+ typedef std::list< std::pair<std::string,LabelRecord> *> LabelListType;
+
+ /// type of the container used to store the labels
+ typedef std::map<std::string, LabelRecord> LabelHashType;
+
+
+
+ /**
+ * @brief Describes type of MLF definition
+ *
+ * See HTK book for MLF structure. Terms used in TNet are
+ * compatible with those in HTK book.
+ */
+ enum MlfDefType
+ {
+ MLF_DEF_UNKNOWN = 0, ///< unknown definition
+ MLF_DEF_IMMEDIATE_TRANSCRIPTION, ///< immediate transcription
+ MLF_DEF_SUB_DIR_DEF ///< subdirectory definition
+ };
+
+
+
+ /** **************************************************************************
+ * @brief Holds association between label and stream
+ */
+ class LabelRecord
+ {
+
+ public:
+ LabelRecord() : mDefType(MLF_DEF_UNKNOWN), miLabelListLimit()
+ { }
+
+ ~LabelRecord()
+ { }
+
+ /// definition type
+ MlfDefType mDefType;
+
+ /// position of the label in the stream
+ std::streampos mStreamPos;
+
+ /**
+ * @brief points to the current end of the LabelList
+ *
+ * The reason for storing this value is to know at which point the label
+ * was inserted into the hash. It is possible that the hash label came
+ * after a list label, in which case the list label is preferred
+ */
+ LabelListType::iterator miLabelListLimit;
+
+ };
+
+
+
+
+ /**
+ * @brief Provides an interface to label hierarchy and searching
+ *
+ * This class stores label files in a map structure. When the wildcard
+ * convention is used, the class additionally records the level of wildcard
+ * abstraction. By level we mean the directory structure depth.
+ */
+ class LabelContainer
+ {
+ public:
+ /// The constructor
+ LabelContainer() : mUseHashedSearch(true) {}
+
+ /// The destructor
+ ~LabelContainer();
+
+ /**
+ * @brief Inserts new label to the hash structure
+ */
+ void
+ Insert(
+ const std::string & rLabel,
+ std::streampos Pos);
+
+
+ /**
+ * @brief Looks for a record in the hash
+ */
+ bool
+ FindInHash(
+ const std::string& rLabel,
+ LabelRecord& rLS);
+
+ /**
+ * @brief Looks for a record in the list
+ * @param rLabel Label to look for
+ * @param rLS Structure to fill with found data
+ * @param limitSearch If true @p rLS's @c mLabelListLimit gives the limiting position in the list
+ */
+ bool
+ FindInList(
+ const std::string& rLabel,
+ LabelRecord& rLS,
+ bool limitSearch = false);
+
+ /**
+ * @brief Looks for a record
+ */
+ bool
+ Find(
+ const std::string & rLabel,
+ LabelRecord & rLS);
+
+ /**
+ * @brief Returns the matched pattern
+ */
+ const std::string &
+ MatchedPattern() const
+ {
+ return mMatchedPattern;
+ }
+
+ /**
+ * @brief Returns the matched pattern mask (%%%)
+ */
+ const std::string &
+ MatchedPatternMask() const
+ {
+ return mMatchedPatternMask;
+ }
+
+ /**
+ * @brief Writes contents to stream (text)
+ * @param rOStream stream to write to
+ */
+ void
+ Write(std::ostream& rOStream);
+
+ private:
+ /// type used for directory depth notation
+ typedef size_t DepthType;
+
+
+ /// this set stores depths of * labels observed at insertion
+ std::set<DepthType> mDepths;
+
+ /// stores the labels
+ LabelHashType mLabelMap;
+ LabelListType mLabelList;
+
+ /// true if labels are to be looked up by the hashing function (fast) rather
+ /// than by sequential search (slow)
+ bool mUseHashedSearch;
+
+ /// if Find matches the label, this var stores the pattern that matched the
+ /// query
+ std::string mMatchedPattern;
+
+ /// if Find matches the label, this var stores the masked characters.
+ /// The mask is given by '%' symbols
+ std::string mMatchedPatternMask;
+
+ /**
+ * @brief Returns the directory depth of path
+ */
+ size_t
+ DirDepth(const std::string & path);
+
+
+ };
+
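+  // Illustrative sketch (not part of the original TNet sources; assumes only
+  // the interface above): minimal use of LabelContainer. Labels whose only
+  // wildcard is a leading '*' are hashed together with their directory depth;
+  // labels containing other wildcards go to the sequential list.
+  //
+  //   LabelContainer labels;
+  //   labels.Insert("data/spk1/utt1.lab", std::streampos(0));   // exact -> hash
+  //   labels.Insert("*/utt2.lab",         std::streampos(42));  // leading '*' -> hash, depth 1
+  //   labels.Insert("*/spk?/utt3.lab",    std::streampos(77));  // '?' -> list
+  //
+  //   LabelRecord rec;
+  //   if (labels.Find("data/spk2/utt2.lab", rec)) {
+  //     // rec.mStreamPos == 42, labels.MatchedPattern() == "*/utt2.lab"
+  //   }
+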
+
+ /**
+ * @brief MLF output buffer definition
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>,
+ typename _CharTA = std::allocator<_CharT>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicOMlfStreamBuf
+ : public std::basic_streambuf<_CharT, _Traits>
+ {
+ public:
+ // necessary typedefs ....................................................
+ typedef BasicOMlfStreamBuf<_CharT,_Traits,_CharTA,ByteT,ByteAT>
+ this_type;
+ typedef std::basic_ostream<_CharT, _Traits>&
+ OStreamReference;
+ typedef std::basic_streambuf<_CharT, _Traits>
+ StreamBufType;
+ typedef _CharTA char_allocator_type;
+ typedef _CharT char_type;
+ typedef typename _Traits::int_type int_type;
+ typedef typename _Traits::pos_type pos_type;
+ typedef ByteT byte_type;
+ typedef ByteAT byte_allocator_type;
+ typedef byte_type* byte_buffer_type;
+ typedef std::vector<byte_type, byte_allocator_type > byte_vector_type;
+ typedef std::vector<char_type, char_allocator_type > char_vector_type;
+
+
+ BasicOMlfStreamBuf(OStreamReference rOStream, size_t bufferSize);
+
+ ~BasicOMlfStreamBuf();
+
+ // virtual functions inherited from basic_streambuf.......................
+ int
+ sync();
+
+ /**
+ * @brief Write character in the case of overflow
+ * @param c Character to be written.
+ * @return A value different than EOF (or traits::eof() for other traits)
+ * signals success. If the function fails, either EOF
+ * (or traits::eof() for other traits) is returned or an
+ * exception is thrown.
+ */
+ int_type
+ overflow(int_type c = _Traits::eof());
+
+
+ // MLF specific functions ................................................
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ this_type*
+ Open(const std::string& rFileName);
+
+ /**
+ * @brief Closes MLF block
+ */
+ void
+ Close();
+
+ /**
+ * @brief Returns true if the MLF is now in open state
+ */
+ bool
+ IsOpen() const
+ { return mIsOpen; }
+
+ LabelContainer&
+ rLabels()
+ { return mLabels; }
+
+ private:
+ bool mIsOpen;
+ char_type mLastChar;
+ OStreamReference mOStream;
+ LabelContainer mLabels;
+ }; // class BasicOMlfStreamBuf
+
+
+
+ /**
+ * @brief MLF input buffer definition
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>,
+ typename _CharTA = std::allocator<_CharT>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicIMlfStreamBuf
+ : public std::basic_streambuf<_CharT, _Traits>
+ {
+ private:
+ // internal automaton states
+ static const int IN_HEADER_STATE = 0;
+ static const int OUT_OF_BODY_STATE = 1;
+ static const int IN_TITLE_STATE = 2;
+ static const int IN_BODY_STATE = 3;
+
+
+ public: // necessary typedefs ..............................................
+ typedef BasicIMlfStreamBuf<_CharT,_Traits,_CharTA,ByteT,ByteAT>
+ this_type;
+ typedef std::basic_istream<_CharT, _Traits>& IStreamReference;
+ typedef std::basic_streambuf<_CharT, _Traits>
+ StreamBufType;
+ typedef _CharTA char_allocator_type;
+ typedef _CharT char_type;
+ typedef typename _Traits::int_type int_type;
+ typedef typename _Traits::pos_type pos_type;
+ typedef ByteT byte_type;
+ typedef ByteAT byte_allocator_type;
+ typedef byte_type* byte_buffer_type;
+ typedef std::vector<byte_type, byte_allocator_type > byte_vector_type;
+ typedef std::vector<char_type, char_allocator_type > char_vector_type;
+
+
+ public:
+ // constructors and destructors ..........................................
+ BasicIMlfStreamBuf(IStreamReference rIStream, size_t bufferSize = 1024);
+
+ ~BasicIMlfStreamBuf();
+
+ // virtual functions inherited from basic_streambuf.......................
+ /**
+ * @brief Get character in the case of underflow
+ *
+ * @return The new character available at the get pointer position, if
+ * any. Otherwise, traits::eof() is returned.
+ */
+ int_type
+ underflow();
+
+
+ // MLF specific functions ................................................
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ this_type*
+ Open(const std::string& rFileName);
+
+ /**
+ * @brief Closes MLF block
+ */
+ this_type*
+ Close();
+
+ /**
+ * @brief Returns true if the MLF is now in open state
+ */
+ bool
+ IsOpen() const
+ { return mIsOpen; }
+
+ /**
+ * @brief Parses the stream (if possible) and stores the positions of
+ * the label titles
+ */
+ void
+ Index();
+
+ bool
+ IsHashed() const
+ { return mIsHashed; }
+
+ /**
+ * @brief Jumps to next label definition
+ * @param rName std::string to be filled with the label name
+ * @return true on success
+ *
+ * The procedure automatically tries to hash the labels.
+ */
+ bool
+ JumpToNextDefinition(std::string& rName);
+
+ /**
+ * @brief Returns reference to the base stream
+ * @return reference to the stream
+ *
+ */
+ IStreamReference
+ GetBaseStream()
+ {
+ return mIStream;
+ }
+
+ private: // auxiliary functions ............................................
+ /**
+ * @brief Fills the line buffer with next line and updates the internal
+ * state of the finite automaton
+ */
+ void
+ FillLineBuffer();
+
+
+ private: // attributes ......................................................
+ // some flags
+ bool mIsOpen;
+ bool mIsHashed;
+ bool mIsEof;
+
+ /// internal state of the finite automaton
+ int mState;
+
+ IStreamReference mIStream;
+ LabelContainer mLabels;
+
+ std::vector<char_type> mLineBuffer;
+ }; // class BasicIMlfStreamBuf
+
+
+
+
+ /**
+ * @brief Base class with type-independent members for the Mlf Output
+ * Stram class
+ *
+ * This is a derivative of the basic_ios class. We derive it as we need
+ * to override some member functions
+ */
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicOMlfStreamBase
+ : virtual public std::basic_ios<Elem,Tr>
+ {
+ public:
+ typedef std::basic_ostream<Elem, Tr>& OStreamReference;
+ typedef BasicOMlfStreamBuf <
+ Elem,Tr,ElemA,ByteT,ByteAT> OMlfStreamBufType;
+
+ /**
+ * @brief constructor
+ *
+ * @param rOStream user defined output stream
+ */
+ BasicOMlfStreamBase(OStreamReference rOStream,
+ size_t bufferSize)
+ : mBuf(rOStream, bufferSize)
+ { this->init(&mBuf); };
+
+ /**
+ * @brief Returns a pointer to the buffer object for this stream
+ */
+ OMlfStreamBufType*
+ rdbuf()
+ { return &mBuf; };
+
+ private:
+ OMlfStreamBufType mBuf;
+ };
+
+
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicIMlfStreamBase
+ : virtual public std::basic_ios<Elem,Tr>
+ {
+ public:
+ typedef std::basic_istream<Elem, Tr>& IStreamReference;
+ typedef BasicIMlfStreamBuf <
+ Elem,Tr,ElemA,ByteT,ByteAT> IMlfStreamBufType;
+
+ BasicIMlfStreamBase( IStreamReference rIStream,
+ size_t bufferSize)
+ : mBuf(rIStream, bufferSize)
+ { this->init(&mBuf); };
+
+ IMlfStreamBufType*
+ rdbuf()
+ { return &mBuf; };
+
+ IStreamReference
+ GetBaseStream()
+ { return mBuf.GetBaseStream(); }
+
+ private:
+ IMlfStreamBufType mBuf;
+ };
+
+
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicOMlfStream
+ : public BasicOMlfStreamBase<Elem,Tr,ElemA,ByteT,ByteAT>,
+ public std::basic_ostream<Elem,Tr>
+ {
+ public:
+ typedef BasicOMlfStreamBase< Elem,Tr,ElemA,ByteT,ByteAT>
+ BasicOMlfStreamBaseType;
+ typedef std::basic_ostream<Elem,Tr> OStreamType;
+ typedef OStreamType& OStreamReference;
+
+ BasicOMlfStream(OStreamReference rOStream, size_t bufferSize = 32)
+ : BasicOMlfStreamBaseType(rOStream, bufferSize),
+ OStreamType(BasicOMlfStreamBaseType::rdbuf())
+ { }
+
+ /**
+ * @brief The destructor (the MLF block is not closed automatically)
+ */
+ ~BasicOMlfStream()
+ { }
+
+
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ void
+ Open(const std::string& rFileName)
+ { BasicOMlfStreamBaseType::rdbuf()->Open(rFileName); }
+
+ /**
+ * @brief Closes MLF block
+ */
+ void
+ Close()
+ { BasicOMlfStreamBaseType::rdbuf()->Close(); }
+
+ /**
+ * @brief Returns true if the MLF is now in open state
+ */
+ bool
+ IsOpen() const
+ { return BasicOMlfStreamBaseType::rdbuf()->IsOpen(); }
+
+ /**
+ * @brief Accessor to the label container
+ * @return Reference to the label container
+ */
+ LabelContainer&
+ rLabels()
+ { return BasicOMlfStreamBaseType::rdbuf()->rLabels(); }
+ };
+
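+  // Illustrative sketch (not part of the original TNet sources): writing one
+  // label block through the stream above, using the OMlfStream typedef
+  // declared near the end of this header. Open() writes the quoted label
+  // name, Close() terminates the block with a '.' line.
+  //
+  //   std::ofstream file("out.mlf");
+  //   file << "#!MLF!#\n";               // MLF header line (HTK convention)
+  //   TNet::OMlfStream mlf(file);
+  //   mlf.Open("*/utt1.lab");
+  //   mlf << "0 1000000 sil\n";
+  //   mlf.Close();
+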
+
+
+ template<
+ typename Elem,
+ typename Tr = std::char_traits<Elem>,
+ typename ElemA = std::allocator<Elem>,
+ typename ByteT = char,
+ typename ByteAT = std::allocator<ByteT>
+ >
+ class BasicIMlfStream
+ : public BasicIMlfStreamBase<Elem,Tr,ElemA,ByteT,ByteAT>,
+ public std::basic_istream<Elem,Tr>
+ {
+ public:
+ typedef BasicIMlfStreamBase <Elem,Tr,ElemA,ByteT,ByteAT>
+ BasicIMlfStreamBaseType;
+ typedef std::basic_istream<Elem,Tr> IStreamType;
+ typedef IStreamType& IStreamReference;
+ typedef unsigned char byte_type;
+
+ BasicIMlfStream(IStreamReference rIStream, size_t bufferSize = 32)
+ : BasicIMlfStreamBaseType(rIStream, bufferSize),
+ IStreamType(BasicIMlfStreamBaseType::rdbuf())
+ {};
+
+
+ /**
+ * @brief Creates a new MLF block
+ * @param rFileName filename to be opened
+ */
+ void
+ Open(const std::string& rFileName)
+ {
+ std::basic_streambuf<Elem, Tr>* p_buf;
+
+ p_buf = BasicIMlfStreamBaseType::rdbuf()->Open(rFileName);
+
+ if (NULL == p_buf) {
+ IStreamType::clear(IStreamType::rdstate() | std::ios::failbit);
+ }
+ else {
+ IStreamType::clear();
+ }
+ }
+
+ /**
+ * @brief Closes MLF block.
+ * Sets failbit on this stream if the underlying buffer fails to close
+ */
+ void
+ Close()
+ {
+ if (NULL == BasicIMlfStreamBaseType::rdbuf()->Close()) {
+ IStreamType::clear(IStreamType::rdstate() | std::ios::failbit);
+ }
+ }
+
+ void
+ Index()
+ { BasicIMlfStreamBaseType::rdbuf()->Index(); }
+
+ bool
+ IsHashed() const
+ { return BasicIMlfStreamBaseType::rdbuf()->IsHashed(); }
+
+ };
+
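+  // Illustrative sketch (not part of the original TNet sources): reading a
+  // label block back with the IMlfStream typedef below. Index() pre-hashes
+  // the label positions when the underlying stream is seekable, so Open()
+  // can jump straight to the requested block.
+  //
+  //   std::ifstream file("out.mlf");
+  //   TNet::IMlfStream mlf(file);
+  //   mlf.Index();                        // optional; needs a seekable stream
+  //   mlf.Open("data/utt1.lab");          // matched against "*/utt1.lab"
+  //   std::string line;
+  //   while (std::getline(mlf, line)) {   // stops at the terminating '.'
+  //     // one label line per iteration
+  //   }
+  //   mlf.Close();
+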
+
+
+ // MAIN TYPEDEFS..............................................................
+ typedef BasicOMlfStream<char> OMlfStream;
+ typedef BasicOMlfStream<wchar_t> WOMlfStream;
+ typedef BasicIMlfStream<char> IMlfStream;
+ typedef BasicIMlfStream<wchar_t> WIMlfStream;
+
+
+#ifdef PATH_MAX
+ const size_t MAX_LABEL_DEPTH = PATH_MAX;
+#else
+ const size_t MAX_LABEL_DEPTH = 1024;
+#endif
+
+
+} // namespace TNet
+
+#include "MlfStream.tcc"
+
+#endif
diff --git a/src/KaldiLib/MlfStream.tcc b/src/KaldiLib/MlfStream.tcc
new file mode 100644
index 0000000..8978545
--- /dev/null
+++ b/src/KaldiLib/MlfStream.tcc
@@ -0,0 +1,517 @@
+#ifndef STK_MlfStream_tcc
+#define STK_MlfStream_tcc
+
+#include <algorithm>
+
+#include "Common.h"
+#include "StkMatch.h"
+
+namespace TNet
+{
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ BasicOMlfStreamBuf(OStreamReference rOStream, size_t bufferSize)
+ : mIsOpen(false), mOStream(rOStream)
+ { }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ ~BasicOMlfStreamBuf()
+ {
+ mOStream.flush();
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ int
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ sync()
+ {
+ mOStream.flush();
+ return 0;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ typename _Traits::int_type
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ overflow(typename _Traits::int_type c)
+ {
+ // we don't use buffer here...
+ if (mIsOpen) {
+ if (_Traits::eof() == c) {
+ return _Traits::not_eof(c);
+ }
+ // only pass the character to the stream
+ mOStream.rdbuf()->sputc(c);
+
+ // remember last char (in case we want to close)
+ mLastChar = c;
+
+ return c;
+ }
+ else {
+ return _Traits::eof();
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ void
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Close()
+ {
+ // if last character was not EOL, we need to insert it
+ if (mLastChar != '\n') {
+ mOStream.put('\n');
+ }
+ mOStream << ".\n";
+
+ // flush the stream and declare the stream closed
+ mOStream.flush();
+ mIsOpen = false;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT> *
+ BasicOMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Open(const std::string& rFileName)
+ {
+ // retrieve the current position
+ std::streampos pos = mOStream.tellp();
+
+ // write the initial "filename" in double quotes
+ mOStream << '"' << rFileName << '"' << std::endl;
+ mLastChar = '\n';
+
+ // return NULL if we cannot open
+ if (!mOStream.good()) {
+ return NULL;
+ }
+
+ // if ok, store the name position
+ if (-1 != pos) {
+ pos = mOStream.tellp();
+ mLabels.Insert(rFileName, pos);
+ }
+
+ // set open flag and return this
+ mIsOpen = true;
+ return this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // BasicIMlfStreamBuf section
+ //
+ //****************************************************************************
+ //****************************************************************************
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ BasicIMlfStreamBuf(IStreamReference rIStream, size_t bufferSize)
+ : mIsOpen(false), mIsHashed(false), mIsEof(true), mState(IN_HEADER_STATE),
+ mIStream(rIStream), mLineBuffer()
+ {
+ // we reserve some place for the buffer...
+ mLineBuffer.reserve(bufferSize);
+
+ //StreamBufType::setg(mpBuffer, mpBuffer + bufferSize, mpBuffer + bufferSize);
+ // start with an empty get area: mLineBuffer has only been reserved and is
+ // still empty, so taking front()/back() of it here would be undefined
+ StreamBufType::setg(0, 0, 0);
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ ~BasicIMlfStreamBuf()
+ {
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ void
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Index()
+ {
+ // retrieve the current position
+ std::streampos orig_pos = mIStream.tellg();
+ int orig_state = mState;
+
+ // for streams like stdin, pos will by definition be -1, so we can only
+ // rely on sequential access and cannot hash it.
+ if (-1 != orig_pos) {
+ std::string aux_name;
+ // we will constantly jump to next definition. the function automatically
+ // hashes the stream if possible
+ while (JumpToNextDefinition(aux_name))
+ { }
+
+ // move to the original position
+ mIStream.clear();
+ mIStream.seekg(orig_pos);
+ mState = orig_state;
+
+ // set as hashed
+ mIsHashed=true;
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ bool
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ JumpToNextDefinition(std::string& rName)
+ {
+ if (!mIStream.good()) {
+ return false;
+ }
+
+ // if we can, we will try to index the label
+ std::streampos pos = mIStream.tellg();
+
+ // we might be at a definition already, so first move one line further
+ FillLineBuffer();
+
+ // read lines till we get to definition again
+ while (mIStream.good() && mState != IN_TITLE_STATE) {
+ FillLineBuffer();
+ }
+
+ // decide what happened
+ if (IN_TITLE_STATE == mState) {
+ // if we can, we will try to index the label
+ pos = mIStream.tellg();
+
+ if (pos != static_cast<const std::streampos>(-1)) {
+ // if (pos !=std::string::npos) { // This line does not work under MinGW
+ std::string line_buffer(mLineBuffer.begin(), mLineBuffer.end());
+ TNet::ParseHTKString(line_buffer, rName);
+ mLabels.Insert(rName, pos);
+ }
+
+ return true;
+ }
+ else {
+ // we have been hashing all the way through, so if this is
+ // the EOF, we are done hashing this stream
+ if (pos != static_cast<const std::streampos>(-1)) {
+ mIsHashed = true;
+ }
+
+ // we are not in body state, so we just return false
+ return false;
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>*
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Close()
+ {
+ if (!mIsOpen) {
+ mIsEof = true;
+ return NULL;
+ }
+ else {
+ // if we try to close while in the body, we need to reach the end
+ if (mState == IN_BODY_STATE) {
+ while (mState == IN_BODY_STATE) {
+ FillLineBuffer();
+ }
+ }
+
+ // disable buffer mechanism
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+
+ mIsEof = true;
+ mIsOpen = false;
+
+ return this;
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>*
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ Open(const std::string& rFileName)
+ {
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>* ret_val = NULL;
+
+ // this behavior is compatible with ifstream
+ if (mIsOpen) {
+ Close();
+ return NULL;
+ }
+
+ // retrieve the current position
+ std::streampos pos = mIStream.tellg();
+ LabelRecord label_record;
+
+ // for streams like stdin, pos will by definition be -1, so we can only
+ // rely on sequential access. At this place, we decide what to do
+ if ((-1 != pos) && (mLabels.Find(rFileName, label_record))) {
+ mIStream.seekg(label_record.mStreamPos);
+ mState = IN_TITLE_STATE;
+
+ // we don't want the underlying stream to be left in a bad state, so we
+ // clear its flags and signal the failure through this stream instead
+ if (!mIStream.good()) {
+ mIStream.clear();
+ mIsOpen = false;
+ ret_val = NULL;
+ }
+ else {
+ mIsOpen = true;
+ mIsEof = false;
+ ret_val = this;
+ }
+ }
+
+ // the stream is seekable and the label was not found, but the labels are
+ // fully hashed, so we can be sure that the lookup failed
+ else if ((-1 != pos) && mIsHashed) {
+ mIsOpen = false;
+ ret_val = NULL;
+ }
+
+ // we either have a sequential-only stream or simply didn't find anything
+ // in the hash yet, so we fall back to a sequential search
+ else {
+ bool found = false;
+ std::string aux_name;
+ std::string aux_name2;
+
+ while ((!found) && JumpToNextDefinition(aux_name)) {
+ if (TNet::ProcessMask(rFileName, aux_name, aux_name2)) {
+ mIsOpen = true;
+ mIsEof = false;
+ found = true;
+ ret_val = this;
+ }
+ }
+
+ if (!found) {
+ mIsOpen = false;
+ ret_val = NULL;
+ }
+ }
+
+ return ret_val;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ typename _Traits::int_type
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ underflow()
+ {
+ // we don't do anything if EOF
+ if (mIsEof) {
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+ return _Traits::eof();
+ }
+
+ // read from buffer if we can
+ if (StreamBufType::gptr() && (StreamBufType::gptr() < StreamBufType::egptr())) {
+ return _Traits::not_eof(*StreamBufType::gptr());
+ }
+
+ // might happen that stream is in !good state
+ if (!mIStream.good()) {
+ mIsEof = true;
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+ return _Traits::eof();
+ }
+
+ // fill the line buffer and update my state
+ FillLineBuffer();
+
+ // if the whole line is just a period, or we hit EOF, declare EOF
+ if (mState == OUT_OF_BODY_STATE) {
+ mIsEof = true;
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.front()));
+ return _Traits::eof();
+ }
+
+ // restore the buffer mechanism
+ StreamBufType::setg(&(mLineBuffer.front()), &(mLineBuffer.front()),
+ &(mLineBuffer.back()) + 1);
+
+ return *StreamBufType::gptr();
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits,
+ typename _CharTA,
+ typename ByteT,
+ typename ByteAT
+ >
+ void
+ BasicIMlfStreamBuf<_CharT, _Traits, _CharTA, ByteT, ByteAT>::
+ FillLineBuffer()
+ {
+ // reset line buffer
+ size_t capacity = mLineBuffer.capacity();
+ mLineBuffer.clear();
+ mLineBuffer.reserve(capacity);
+
+ // read one line into buffer
+ int c;
+ while ((c = mIStream.get()) != '\n' && c != _Traits::eof()) {
+ mLineBuffer.push_back(char(c));
+ }
+
+ // we want to be able to pass the last EOL symbol on
+ if (c == '\n') {
+ mLineBuffer.push_back(char(c));
+ }
+
+ // we will decide where we are
+ switch (mState) {
+ case IN_HEADER_STATE:
+
+ case OUT_OF_BODY_STATE:
+ if (mLineBuffer[0] != '#') {
+ mState = IN_TITLE_STATE;
+ }
+ break;
+
+ case IN_TITLE_STATE:
+ if (mLineBuffer[0] == '.' && (mLineBuffer.back() == '\n' || mIStream.eof())) {
+ mState = OUT_OF_BODY_STATE;
+ }
+ else {
+ mState = IN_BODY_STATE;
+ }
+ break;
+
+ case IN_BODY_STATE:
+ // period or EOF will end the file
+ if (mLineBuffer[0] == '.' && (mLineBuffer.back() == '\n' || mIStream.eof())) {
+ mState = OUT_OF_BODY_STATE;
+ }
+ if (mLineBuffer.size() == 0) {
+ mState = OUT_OF_BODY_STATE;
+ }
+ break;
+ }
+ }
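+
+  // Illustrative note (not part of the original sources): the automaton above
+  // expects the usual HTK-style MLF layout, for example
+  //
+  //   #!MLF!#                 <- header lines start with '#'   (IN_HEADER)
+  //   "*/utt1.lab"            <- quoted label name             (IN_TITLE)
+  //   0 1000000 sil           <- label body lines              (IN_BODY)
+  //   .                       <- a single '.' ends the block   (OUT_OF_BODY)
+  //   "*/utt2.lab"
+  //   ...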
+} // namespace TNet
+
+
+#endif // STK_MlfStream_tcc
diff --git a/src/KaldiLib/StkMatch.cc b/src/KaldiLib/StkMatch.cc
new file mode 100644
index 0000000..4ff4b18
--- /dev/null
+++ b/src/KaldiLib/StkMatch.cc
@@ -0,0 +1,582 @@
+/*
+ EPSHeader
+
+ File: filmatch.c
+ Author: J. Kercheval
+ Created: Thu, 03/14/1991 22:22:01
+*/
+
+/*
+ EPSRevision History
+ O. Glembek Thu, 03/11/2005 01:58:00 Added Mask extraction support (char % does this)
+ J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain
+ J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them)
+ J. Kercheval Sun, 03/10/1991 19:31:29 add error return to matche()
+ J. Kercheval Sun, 03/10/1991 20:11:11 add is_valid_pattern code
+ J. Kercheval Sun, 03/10/1991 20:37:11 beef up main()
+ J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain
+ J. Kercheval Thu, 03/14/1991 22:22:25 remove '\' for DOS file parsing
+ J. Kercheval Thu, 03/28/1991 20:58:27 include filmatch.h
+*/
+
+/*
+ Wildcard Pattern Matching
+*/
+
+
+#include "StkMatch.h"
+#include "Common.h"
+
+namespace TNet
+{
+ //#define TEST
+ static int matche_after_star (register const char *pattern, register const char *text, register char *s);
+ // following function is not defined or used.
+ // static int fast_match_after_star (register const char *pattern, register const char *text);
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return true if PATTERN has any special wildcard characters
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool is_pattern (const char *p)
+ {
+ while ( *p ) {
+ switch ( *p++ ) {
+ case '?':
+ case '*':
+ case '%':
+ case '[':
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return true if PATTERN is a well formed regular expression according
+ * to the above syntax
+ *
+ * error_type is a return code based on the type of pattern error. Zero is
+ * returned in error_type if the pattern is a valid one. error_type return
+ * values are as follows:
+ *
+ * PATTERN_VALID - pattern is well formed
+ * PATTERN_RANGE - [..] construct has a no end range in a '-' pair (ie [a-])
+ * PATTERN_CLOSE - [..] construct has no end bracket (ie [abc-g )
+ * PATTERN_EMPTY - [..] construct is empty (ie [])
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool is_valid_pattern (const char *p, int *error_type)
+ {
+
+ /* init error_type */
+ *error_type = PATTERN_VALID;
+
+ /* loop through pattern to EOS */
+ while ( *p )
+ {
+ /* determine pattern type */
+ switch ( *p )
+ {
+ /* the [..] construct must be well formed */
+ case '[':
+ {
+ p++;
+
+ /* if the next character is ']' then bad pattern */
+ if ( *p == ']' ) {
+ *error_type = PATTERN_EMPTY;
+ return false;
+ }
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p )
+ {
+ *error_type = PATTERN_CLOSE;
+ return false;
+ }
+
+ /* loop to end of [..] construct */
+ while ( *p != ']' )
+ {
+ /* check for literal escape */
+ if ( *p == '\\' )
+ {
+ p++;
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p++ ) {
+ *error_type = PATTERN_ESC;
+ return false;
+ }
+ }
+ else
+ p++;
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p )
+ {
+ *error_type = PATTERN_CLOSE;
+ return false;
+ }
+
+ /* if this a range */
+ if ( *p == '-' )
+ {
+ /* we must have an end of range */
+ if ( !*++p || *p == ']' )
+ {
+ *error_type = PATTERN_RANGE;
+ return false;
+ }
+ else
+ {
+
+ /* check for literal escape */
+ if ( *p == '\\' )
+ p++;
+
+ /* if end of pattern here then bad pattern */
+ if ( !*p++ )
+ {
+ *error_type = PATTERN_ESC;
+ return false;
+ }
+ }
+ }
+ }
+ break;
+ } //case '[':
+
+
+ /* all other characters are valid pattern elements */
+ case '*':
+ case '?':
+ case '%':
+ default:
+ p++; /* "normal" character */
+ break;
+ } // switch ( *p )
+ } // while ( *p )
+
+ return true;
+ } //bool is_valid_pattern (const char *p, int *error_type)
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Match the pattern PATTERN against the string TEXT;
+ *
+ * returns MATCH_VALID if pattern matches, or an errorcode as follows
+ * otherwise:
+ *
+ * MATCH_PATTERN - bad pattern
+ * MATCH_RANGE - match failure on [..] construct
+ * MATCH_ABORT - premature end of text string
+ * MATCH_END - premature end of pattern string
+ * MATCH_VALID - valid match
+ *
+ *
+ * A match means the entire string TEXT is used up in matching.
+ *
+ * In the pattern string:
+ * `*' matches any sequence of characters (zero or more)
+ * `?' matches any character
+ * `%' matches any character and stores it in the s string
+ * [SET] matches any character in the specified set,
+ * [!SET] or [^SET] matches any character not in the specified set.
+ * \ is allowed within a set to escape a character like ']' or '-'
+ *
+ * A set is composed of characters or ranges; a range looks like
+ * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
+ * minimal set of characters allowed in the [..] pattern construct.
+ * Other characters are allowed (ie. 8 bit characters) if your system
+ * will support them.
+ *
+ * To suppress the special syntactic significance of any of `[]*?%!^-\',
+ * within a [..] construct and match the character exactly, precede it
+ * with a `\'.
+ *
+ ----------------------------------------------------------------------------*/
+
+ int matche ( register const char *p, register const char *t, register char *s )
+ {
+ register char range_start, range_end; /* start and end in range */
+
+ bool invert; /* is this [..] or [!..] */
+ bool member_match; /* have I matched the [..] construct? */
+ bool loop; /* should I terminate? */
+
+ for ( ; *p; p++, t++ ) {
+
+ /* if this is the end of the text then this is the end of the match */
+ if (!*t) {
+ return ( *p == '*' && *++p == '\0' ) ? MATCH_VALID : MATCH_ABORT;
+ }
+
+ /* determine and react to pattern type */
+ switch ( *p ) {
+
+ /* single any character match */
+ case '?':
+ break;
+
+ /* single any character match, with extraction*/
+ case '%': {
+ *s++ = *t;
+ *s = '\0';
+ break;
+ }
+
+ /* multiple any character match */
+ case '*':
+ return matche_after_star (p, t, s);
+
+ /* [..] construct, single member/exclusion character match */
+ case '[': {
+ /* move to beginning of range */
+ p++;
+
+ /* check if this is a member match or exclusion match */
+ invert = false;
+ if ( *p == '!' || *p == '^') {
+ invert = true;
+ p++;
+ }
+
+ /* if closing bracket here or at range start then we have a
+ malformed pattern */
+ if ( *p == ']' ) {
+ return MATCH_PATTERN;
+ }
+
+ member_match = false;
+ loop = true;
+
+ while ( loop ) {
+
+ /* if end of construct then loop is done */
+ if (*p == ']') {
+ loop = false;
+ continue;
+ }
+
+ /* matching a '!', '^', '-', '\' or a ']' */
+ if ( *p == '\\' ) {
+ range_start = range_end = *++p;
+ }
+ else {
+ range_start = range_end = *p;
+ }
+
+ /* if end of pattern then bad pattern (Missing ']') */
+ if (!*p)
+ return MATCH_PATTERN;
+
+ /* check for range bar */
+ if (*++p == '-') {
+
+ /* get the range end */
+ range_end = *++p;
+
+ /* if end of pattern or construct then bad pattern */
+ if (range_end == '\0' || range_end == ']')
+ return MATCH_PATTERN;
+
+ /* special character range end */
+ if (range_end == '\\') {
+ range_end = *++p;
+
+ /* if end of text then we have a bad pattern */
+ if (!range_end)
+ return MATCH_PATTERN;
+ }
+
+ /* move just beyond this range */
+ p++;
+ }
+
+ /* if the text character is in range then match found.
+ make sure the range letters have the proper
+ relationship to one another before comparison */
+ if ( range_start < range_end ) {
+ if (*t >= range_start && *t <= range_end) {
+ member_match = true;
+ loop = false;
+ }
+ }
+ else {
+ if (*t >= range_end && *t <= range_start) {
+ member_match = true;
+ loop = false;
+ }
+ }
+ }
+
+ /* if there was a match in an exclusion set then no match */
+ /* if there was no match in a member set then no match */
+ if ((invert && member_match) ||
+ !(invert || member_match))
+ return MATCH_RANGE;
+
+ /* if this is not an exclusion then skip the rest of the [...]
+ construct that already matched. */
+ if (member_match) {
+ while (*p != ']') {
+
+ /* bad pattern (Missing ']') */
+ if (!*p)
+ return MATCH_PATTERN;
+
+ /* skip exact match */
+ if (*p == '\\') {
+ p++;
+
+ /* if end of text then we have a bad pattern */
+ if (!*p)
+ return MATCH_PATTERN;
+ }
+
+ /* move to next pattern char */
+ p++;
+ }
+ }
+
+ break;
+ } // case ']'
+
+ /* must match this character exactly */
+ default:
+ if (*p != *t)
+ return MATCH_LITERAL;
+ }
+ }
+
+ //*s = '\0';
+ /* if end of text not reached then the pattern fails */
+ if ( *t )
+ return MATCH_END;
+ else
+ return MATCH_VALID;
+ }
+
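+ /*----------------------------------------------------------------------------
+ *
+ * Illustrative examples (not part of the original sources) of the pattern
+ * semantics implemented by matche() above. The buffer s must have room for
+ * one character per '%' plus a terminator.
+ *
+ *   char s[4]; s[0] = '\0';
+ *   match("*.lab",        "dir/utt1.lab", s);  // true:  '*' spans "dir/utt1"
+ *   match("utt?.lab",     "utt1.lab",     s);  // true:  '?' matches '1'
+ *   match("utt[0-9].lab", "uttX.lab",     s);  // false: 'X' not in [0-9]
+ *   match("utt%.lab",     "utt7.lab",     s);  // true:  s now holds "7"
+ *
+ ----------------------------------------------------------------------------*/
+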
+
+ /*----------------------------------------------------------------------------
+ *
+ * recursively call matche() with final segment of PATTERN and of TEXT.
+ *
+ ----------------------------------------------------------------------------*/
+
+ static int matche_after_star (register const char *p, register const char *t, register char *s)
+ {
+ register int match = 0;
+ register char nextp;
+
+ /* pass over existing ? and * in pattern */
+ while ( *p == '?' || *p == '%' || *p == '*' ) {
+
+ /* take one char for each ? and % */
+ if ( *p == '?') {
+
+ /* if end of text then no match */
+ if ( !*t++ ) {
+ return MATCH_ABORT;
+ }
+ }
+
+ if ( *p == '%') {
+ *s++ = *t;
+ *s = '\0';
+ /* if end of text then no match */
+ if ( !*t++ ) {
+ return MATCH_ABORT;
+ }
+ }
+
+ /* move to next char in pattern */
+ p++;
+ }
+
+ /* if end of pattern we have matched regardless of text left */
+ if ( !*p ) {
+ return MATCH_VALID;
+ }
+
+ /* get the next character to match which must be a literal or '[' */
+ nextp = *p;
+
+ /* Continue until we run out of text or definite result seen */
+ do {
+
+ /* a precondition for matching is that the next character
+ in the pattern match the next character in the text or that
+ the next pattern char is the beginning of a range. Increment
+ text pointer as we go here */
+ if ( nextp == *t || nextp == '[' ) {
+ match = matche(p, t, s);
+ }
+
+ /* if the end of text is reached then no match */
+ if ( !*t++ ) match = MATCH_ABORT;
+
+ } while ( match != MATCH_VALID &&
+ match != MATCH_ABORT &&
+ match != MATCH_PATTERN);
+
+ /* return result */
+ return match;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * match() is a shell to matche() to return only bool values.
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool match(const char *p, const char *t, char *s)
+ {
+ int error_type;
+ error_type = matche(p,t,s);
+ return (error_type != MATCH_VALID ) ? false : true;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ ProcessMask(const std::string & rString,
+ const std::string & rWildcard,
+ std::string & rSubstr)
+ {
+ char * substr;
+ int percent_count = 0;
+ int ret ;
+ size_t pos = 0;
+
+ // count the '%' characters so we can allocate enough space for the returned substring
+ while ((pos = rWildcard.find('%', pos)) != rWildcard.npos)
+ {
+ percent_count++;
+ pos++;
+ }
+
+ // allocate space for the substring
+ substr = new char[percent_count + 1];
+ substr[percent_count] = 0;
+ substr[0] = '\0';
+
+ // optionally prepend '*/' to wildcard
+ std::string wildcard(rWildcard);
+ if(wildcard[0] != '*') {
+ wildcard = "*/" + wildcard;
+ }
+
+ // optionally prepend '/' to the string
+ std::string string1(rString);
+ if(string1[0] != '/') {
+ string1 = "/" + string1;
+ }
+
+ // parse the string
+ if (0 != (ret = match(wildcard.c_str(), string1.c_str(), substr)))
+ {
+ rSubstr = substr;
+ }
+ delete[] substr;
+ return ret;
+ } // ProcessMask
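+
+  // Illustrative example (not part of the original sources): ProcessMask()
+  // implicitly anchors the wildcard with a leading "*/" and the string with
+  // a leading '/', and collects the characters matched by '%' into rSubstr:
+  //
+  //   std::string sub;
+  //   TNet::ProcessMask("data/spk01/utt1.lab", "spk%%/*.lab", sub);
+  //   // returns true and sub == "01"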
+}
+
+
+#ifdef TEST
+
+/*
+* This test main expects as first arg the pattern and as second arg
+* the match string. Output is yea or nay on match. If nay on
+* match then the error code is parsed and written.
+*/
+
+#include <stdio.h>
+
+int main(int argc, char *argv[])
+{
+ int error;
+ int is_valid_error;
+
+ // size the extraction buffer by the number of '%' wildcards in the pattern
+ char * tmp = (argc > 1) ? argv[1] : argv[0];
+ int i = 0;
+ for (; *tmp; tmp++)
+ if (*tmp=='%') i++;
+
+ char s[i+1];
+
+
+ if (argc != 3) {
+ printf("Usage: MATCH Pattern Text\n");
+ }
+ else {
+ printf("Pattern: %s\n", argv[1]);
+ printf("Text : %s\n", argv[2]);
+
+ if (!is_pattern(argv[1])) {
+ printf(" First Argument Is Not A Pattern\n");
+ }
+ else {
+ match(argv[1],argv[2], s) ? printf("true") : printf("false");
+ error = matche(argv[1],argv[2], s);
+ is_valid_pattern(argv[1],&is_valid_error);
+
+ switch ( error ) {
+ case MATCH_VALID:
+ printf(" Match Successful");
+ if (is_valid_error != PATTERN_VALID)
+ printf(" -- is_valid_pattern() is complaining\n");
+ else
+ printf("\n");
+ printf("%s\n", s);
+
+ break;
+ case MATCH_RANGE:
+ printf(" Match Failed on [..]\n");
+ break;
+ case MATCH_ABORT:
+ printf(" Match Failed on Early Text Termination\n");
+ break;
+ case MATCH_END:
+ printf(" Match Failed on Early Pattern Termination\n");
+ break;
+ case MATCH_PATTERN:
+ switch ( is_valid_error ) {
+ case PATTERN_VALID:
+ printf(" Internal Disagreement On Pattern\n");
+ break;
+ case PATTERN_RANGE:
+ printf(" No End of Range in [..] Construct\n");
+ break;
+ case PATTERN_CLOSE:
+ printf(" [..] Construct is Open\n");
+ break;
+ case PATTERN_EMPTY:
+ printf(" [..] Construct is Empty\n");
+ break;
+ default:
+ printf(" Internal Error in is_valid_pattern()\n");
+ }
+ break;
+ default:
+ printf(" Internal Error in matche()\n");
+ break;
+ }
+ }
+
+ }
+ return(0);
+}
+
+#endif
diff --git a/src/KaldiLib/StkMatch.h b/src/KaldiLib/StkMatch.h
new file mode 100644
index 0000000..42c6b97
--- /dev/null
+++ b/src/KaldiLib/StkMatch.h
@@ -0,0 +1,123 @@
+#ifndef TNet_StkMatch_h
+#define TNet_StkMatch_h
+
+#include <string>
+namespace TNet
+{
+ /*
+ EPSHeader
+
+ File: filmatch.h
+ Author: J. Kercheval
+ Created: Thu, 03/14/1991 22:24:34
+ */
+
+ /*
+ EPSRevision History
+ O. Glembek Thu, 03/11/2005 01:58:00 Added Mask extraction support (char % does this)
+ J. Kercheval Wed, 02/20/1991 22:28:37 Released to Public Domain
+ J. Kercheval Sun, 03/10/1991 18:02:56 add is_valid_pattern
+ J. Kercheval Sun, 03/10/1991 18:25:48 add error_type in is_valid_pattern
+ J. Kercheval Sun, 03/10/1991 18:47:47 error return from matche()
+ J. Kercheval Tue, 03/12/1991 22:24:49 Released as V1.1 to Public Domain
+ J. Kercheval Thu, 03/14/1991 22:25:00 remove '\' for DOS file matching
+ J. Kercheval Thu, 03/28/1991 21:03:59 add in PATTERN_ESC & MATCH_LITERAL
+ */
+
+ /*
+ Wildcard Pattern Matching
+ */
+
+
+ /* match defines */
+#define MATCH_PATTERN 6 /* bad pattern */
+#define MATCH_LITERAL 5 /* match failure on literal match */
+#define MATCH_RANGE 4 /* match failure on [..] construct */
+#define MATCH_ABORT 3 /* premature end of text string */
+#define MATCH_END 2 /* premature end of pattern string */
+#define MATCH_VALID 1 /* valid match */
+
+ /* pattern defines */
+#define PATTERN_VALID 0 /* valid pattern */
+#define PATTERN_ESC -1 /* literal escape at end of pattern */
+#define PATTERN_RANGE -2 /* malformed range in [..] construct */
+#define PATTERN_CLOSE -3 /* no end bracket in [..] construct */
+#define PATTERN_EMPTY -4 /* [..] construct is empty */
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Match the pattern PATTERN against the string TEXT;
+ *
+ * match() returns TRUE if pattern matches, FALSE otherwise.
+ * matche() returns MATCH_VALID if pattern matches, or an errorcode
+ * as follows otherwise:
+ *
+ * MATCH_PATTERN - bad pattern
+ * MATCH_RANGE - match failure on [..] construct
+ * MATCH_ABORT - premature end of text string
+ * MATCH_END - premature end of pattern string
+ * MATCH_VALID - valid match
+ *
+ *
+ * A match means the entire string TEXT is used up in matching.
+ *
+ * In the pattern string:
+ * `*' matches any sequence of characters (zero or more)
+ * `?' matches any character
+ * [SET] matches any character in the specified set,
+ * [!SET] or [^SET] matches any character not in the specified set.
+ *
+ * A set is composed of characters or ranges; a range looks like
+ * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
+ * minimal set of characters allowed in the [..] pattern construct.
+ * Other characters are allowed (ie. 8 bit characters) if your system
+ * will support them.
+ *
+ * To suppress the special syntactic significance of any of `[]*?!^-\',
+ * in a [..] construct and match the character exactly, precede it
+ * with a `\'.
+ *
+ ----------------------------------------------------------------------------*/
+ bool
+ match (const char *pattern, const char *text, char *s);
+
+ int
+ matche(register const char *pattern, register const char *text, register char *s);
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return TRUE if PATTERN has any special wildcard characters
+ *
+ ----------------------------------------------------------------------------*/
+ bool
+ is_pattern (const char *pattern);
+
+
+ /** --------------------------------------------------------------------------
+ *
+ * Return TRUE if PATTERN is a well formed regular expression according
+ * to the above syntax
+ *
+ * error_type is a return code based on the type of pattern error. Zero is
+ * returned in error_type if the pattern is a valid one. error_type return
+ * values are as follows:
+ *
+ * PATTERN_VALID - pattern is well formed
+ * PATTERN_RANGE - [..] construct has a no end range in a '-' pair (ie [a-])
+ * PATTERN_CLOSE - [..] construct has no end bracket (ie [abc-g )
+ * PATTERN_EMPTY - [..] construct is empty (ie [])
+ * --------------------------------------------------------------------------
+ **/
+ bool
+ is_valid_pattern (const char *pattern, int *error_type);
+
+
+ //****************************************************************************
+ //****************************************************************************
+ bool
+ ProcessMask(const std::string & rString, const std::string & rWildcard,
+ std::string & rSubstr);
+}
+#endif
diff --git a/src/KaldiLib/StkStream.h b/src/KaldiLib/StkStream.h
new file mode 100644
index 0000000..9188205
--- /dev/null
+++ b/src/KaldiLib/StkStream.h
@@ -0,0 +1,526 @@
+
+
+/** @file stkstream.h
+ * This is a TNet C++ Library header.
+ */
+
+
+#ifndef TNet_StkStream_h
+#define TNet_StkStream_h
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include <list>
+#include <stdexcept>
+
+#pragma GCC system_header
+
+
+//extern const char * gpFilterWldcrd;
+
+namespace TNet
+{
+
+ /**
+ * @brief Expands a filter command into a runnable form
+ *
+ * This function replaces all occurrences of *filter_wldcard in *command by
+ * *filename
+ */
+ //char * ExpandFilterCommand(const char *command, const char *filename);
+
+ /**
+ * @brief Provides a layer of compatibility for C/POSIX.
+ *
+ * This GNU extension provides extensions for working with standard C
+ * FILE*'s and POSIX file descriptors. It must be instantiated by the
+ * user with the type of character used in the file stream, e.g.,
+ * basic_stkbuf<char>.
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class basic_stkbuf : public std::basic_filebuf<_CharT, _Traits>
+ {
+ public:
+
+ typedef basic_stkbuf<_CharT, _Traits> this_type;
+
+ // Types:
+ typedef _CharT char_type;
+ typedef _Traits traits_type;
+
+ typedef typename traits_type::int_type int_type;
+ typedef typename traits_type::pos_type pos_type;
+ typedef typename traits_type::off_type off_type;
+ typedef std::size_t size_t;
+
+ public:
+
+ /// @{
+ /// Type of streambuffer
+ static const unsigned int t_undef = 0; ///< undefined
+ static const unsigned int t_file = 1; ///< file stream
+ static const unsigned int t_pipe = 2; ///< pipe
+ static const unsigned int t_filter = 4; ///< filter
+ static const unsigned int t_stdio = 8; ///< standard input/output
+ /// @}
+
+ public:
+
+ /**
+ * deferred initialization
+ */
+ basic_stkbuf() : std::basic_filebuf<_CharT, _Traits>(),
+ mFilename(""), mpFilePtr(0), mStreamType(t_undef){}
+
+ /**
+ * @brief Opens a stream.
+ * @param fName The name of the file.
+ * @param m The open mode flags.
+ * @param pFilter The pFilter command to use
+ * @return @c this on success, NULL on failure
+ *
+ * If a file is already open, this function immediately fails.
+ * Otherwise it tries to open the file named @a s using the flags
+ * given in @a mode.
+ *
+ * [Table 92 gives the relation between openmode combinations and the
+ * equivalent fopen() flags, but the table has not been copied yet.]
+ */
+ basic_stkbuf(const char* fName, std::ios_base::openmode m, const char* pFilter="");
+
+
+ /**
+ * @return The underlying FILE*.
+ *
+ * This function can be used to access the underlying "C" file pointer.
+ * Note that there is no way for the library to track what you do
+ * with the file, so be careful.
+ */
+ std::__c_file*
+ file() { return this->_M_file.file(); }
+
+
+ /**
+ * @return The underlying FILE*.
+ *
+ * This function can be used to access the underlying "C" file pointer.
+ * Note that there is no way for the library to track what you do
+ * with the file, so be careful.
+ */
+ std::__c_file*
+ fp() { return this->_M_file.file(); }
+
+
+ /**
+ * @brief Opens an external file.
+ * @param fName The name of the file.
+ * @param m The open mode flags.
+ * @param pFilter The pFilter command to use
+ * @return @c this on success, NULL on failure
+ *
+ * If a file is already open, this function immediately fails.
+ * Otherwise it tries to open the file named @a s using the flags
+ * given in @a mode.
+ *
+ * [Table 92 gives the relation between openmode combinations and the
+ * equivalent fopen() flags, but the table has not been copied yet.]
+ */
+ this_type*
+ open(const char* pFName, std::ios_base::openmode m, const char* pFilter="");
+
+ /**
+ * @brief Closes the currently associated file.
+ * @return @c this on success, NULL on failure
+ *
+ * If no file is currently open, this function immediately fails.
+ *
+ * If a "put buffer area" exists, @c overflow(eof) is called to flush
+ * all the characters. The file is then closed.
+ *
+ * If any operations fail, this function also fails.
+ */
+ this_type*
+ close();
+
+ /**
+ * Closes the external data stream if the file descriptor constructor
+ * was used.
+ */
+ virtual
+ ~basic_stkbuf()
+ {close();};
+
+ /// Returns the file name
+ const std::string
+ name() const
+ {return mFilename;}
+
+
+ private:
+ /// converts the ios::xxx mode to stdio style
+ static void open_mode(std::ios_base::openmode __mode, int&, int&, char* __c_mode);
+
+ /**
+ * @param __f An open @c FILE*.
+ * @param __mode Same meaning as in a standard filebuf.
+ * @param __size Optimal or preferred size of internal buffer, in chars.
+ * Defaults to system's @c BUFSIZ.
+ *
+ * This method associates a file stream buffer with an open
+ * C @c FILE*. The @c FILE* will not be automatically closed when the
+ * basic_stkbuf is closed/destroyed. It is equivalent to one of the constructors
+ * of the stdio_filebuf class defined in GNU ISO C++ ext/stdio_filebuf.h
+ */
+ void superopen(std::__c_file* __f, std::ios_base::openmode __mode,
+ size_t __size = static_cast<size_t>(BUFSIZ));
+
+
+ private:
+ /// Holds the full file name
+ std::string mFilename;
+
+ std::ios_base::openmode mMode;
+
+ /// Holds a pointer to the main FILE structure
+ FILE * mpFilePtr;
+
+ /// tells what kind of stream we use (stdio, file, pipe)
+ unsigned int mStreamType;
+
+ };
+
+
+
+ /**
+ * @brief This extension wraps stkbuf stream buffer into the standard ios class.
+ *
+ * This class is inherited by (i/o)stkstream classes which make explicit use of
+ * the custom stream buffer
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class BasicStkIos
+ : virtual public std::basic_ios<_CharT, _Traits>
+ {
+ public:
+ typedef basic_stkbuf <_CharT,_Traits> StkBufType;
+
+ BasicStkIos()
+ : mBuf()
+ { this->init(&mBuf); };
+
+ BasicStkIos(const char* fName, std::ios::openmode m, const char* pFilter)
+ : mBuf(fName, m, pFilter)
+ { this->init(&mBuf); }
+
+ StkBufType*
+ rdbuf()
+ { return &mBuf; }
+
+ protected:
+ StkBufType mBuf;
+ };
+
+
+ /**
+ * @brief Controlling input for files.
+ *
+ * This class supports reading from named files, using the inherited
+ * functions from std::istream. To control the associated
+ * sequence, an instance of std::stkbuf is used.
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class BasicIStkStream
+ : public BasicStkIos<_CharT, _Traits>,
+ public std::basic_istream<_CharT, _Traits>
+ {
+ public:
+ typedef BasicStkIos<_CharT, _Traits> BasicStkIosType;
+ typedef std::basic_istream<_CharT,_Traits> IStreamType;
+
+
+ // Constructors:
+ /**
+ * @brief Default constructor.
+ *
+ * Initializes @c mBuf using its default constructor, and passes
+ * @c &mBuf to the base class initializer. Does not open any files
+ * (you haven't given it a filename to open).
+ */
+ BasicIStkStream()
+ : BasicStkIosType(),
+ IStreamType(BasicStkIosType::rdbuf())
+ {};
+
+ /**
+ * @brief Create an input file stream.
+ * @param fName String specifying the filename.
+ * @param m Open file in specified mode (see std::ios_base).
+ * @param pFilter String specifying pFilter command to use on fName
+ *
+ * @c ios_base::in is automatically included in
+ * @a m.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ BasicIStkStream(const char* pFName, std::ios::openmode m=std::ios::in, const char* pFilter="")
+ : BasicStkIosType(),
+ IStreamType(BasicStkIosType::rdbuf())
+ {this->open(pFName, m, pFilter);}
+
+ ~BasicIStkStream()
+ {
+ this->close();
+ }
+
+ /**
+ * @brief Opens an external file.
+ * @param s The name of the file.
+ * @param mode The open mode flags.
+ * @param pFilter The pFilter command to use
+ *
+ * Calls @c std::basic_filebuf::open(s,mode|in). If that function
+ * fails, @c failbit is set in the stream's error state.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ void open(const char* pFName, std::ios::openmode m=std::ios::in, const char* pFilter = "")
+ {
+ if (!BasicStkIosType::mBuf.open(pFName, m | std::ios_base::in, pFilter)) {
+ this->setstate(std::ios_base::failbit);
+ }
+ else {
+ // A successful open clears any previous error state
+ BasicStkIosType::clear();
+ }
+ }
+
+ /**
+ * @brief Returns true if the external file is open.
+ */
+ bool is_open() const {return BasicStkIosType::mBuf.is_open();}
+
+
+ /**
+ * @brief Closes the stream
+ */
+ void close() {BasicStkIosType::mBuf.close();}
+
+ /**
+ * @brief Returns the filename
+ */
+ const std::string name() const {return BasicStkIosType::mBuf.name();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ file() {return BasicStkIosType::mBuf.file();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ fp() {return BasicStkIosType::mBuf.fp();}
+
+ // /**
+ // * @brief Reads a single line
+ // *
+ // * This is a specialized function as std::getline does not provide a way to
+ // * read multiple end-of-line symbols (we need both '\n' and EOF to delimit
+ // * the line)
+ // */
+ // void
+ // GetLine(string& rLine);
+
+ }; // class BasicIStkStream
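+
+ // Minimal usage sketch (illustrative only; the file name and the command
+ // below are hypothetical, and IStkStream is the char typedef defined further
+ // below). open() accepts a plain path, "-" for standard input, or a
+ // "|command" pipe, as implemented in basic_stkbuf::open():
+ //
+ // TNet::IStkStream in;
+ // in.open("feature.list", std::ios::in); // or "|gunzip -c data.gz", or "-"
+ // if (!in.good()) { /* handle the error */ }
+ // std::string line;
+ // while (std::getline(in, line)) { /* process the line */ }
+ // in.close();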
+
+
+ /**
+ * @brief Controlling output for files.
+ *
+ * This class supports writing to named files, using the inherited
+ * functions from std::basic_ostream. To control the associated
+ * sequence, an instance of TNet::basic_stkbuf is used.
+ */
+ template<
+ typename _CharT,
+ typename _Traits = std::char_traits<_CharT>
+ >
+ class BasicOStkStream
+ : public BasicStkIos<_CharT, _Traits>,
+ public std::basic_ostream<_CharT, _Traits>
+ {
+ public:
+ typedef BasicStkIos<_CharT, _Traits> BasicStkIosType;
+ typedef std::basic_ostream<_CharT,_Traits> OStreamType;
+
+ // Constructors:
+ /**
+ * @brief Default constructor.
+ *
+ * Initializes @c mBuf using its default constructor, and passes
+ * @c &mBuf to the base class initializer. Does not open any files
+ * (you haven't given it a filename to open).
+ */
+ BasicOStkStream()
+ : BasicStkIosType(),
+ OStreamType(BasicStkIosType::rdbuf())
+ {};
+
+ /**
+ * @brief Create an output file stream.
+ * @param pFName String specifying the filename.
+ * @param m Open file in the specified mode (see std::ios_base).
+ * @param pFilter String specifying the filter command to apply to pFName.
+ *
+ * @c ios_base::out is automatically included in
+ * @a m.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this constructor.
+ */
+ BasicOStkStream(const char* pFName, std::ios::openmode m=std::ios::out, const char* pFilter="")
+ : BasicStkIosType(),
+ OStreamType(BasicStkIosType::rdbuf())
+ { this->open(pFName, m, pFilter); }
+
+ /**
+ * @brief Opens an external file.
+ * @param pFName The name of the file.
+ * @param m The open mode flags.
+ * @param pFilter String specifying the filter command to apply to pFName.
+ *
+ * Calls @c basic_stkbuf::open(pFName, m|out). If that function
+ * fails, @c failbit is set in the stream's error state.
+ *
+ * Tip: When using std::string to hold the filename, you must use
+ * .c_str() before passing it to this function.
+ */
+ void open(const char* pFName, std::ios::openmode m=std::ios::out, const char* pFilter="")
+ {
+ if (!BasicStkIosType::mBuf.open(pFName, m | std::ios_base::out, pFilter))
+ this->setstate(std::ios_base::failbit);
+ else
+ // A successful open clears any previous error state
+ this->clear();
+ }
+
+ /**
+ * @brief Returns true if the external file is open.
+ */
+ bool is_open() const
+ { return BasicStkIosType::mBuf.is_open();}
+
+ /**
+ * @brief Closes the stream
+ */
+ void close()
+ { BasicStkIosType::mBuf.close();}
+
+ /**
+ * @brief Returns the filename
+ */
+ const std::string name() const
+ { return BasicStkIosType::mBuf.name();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ file()
+ { return BasicStkIosType::mBuf.file();}
+
+ /// Returns a pointer to the main FILE structure
+ std::__c_file*
+ fp()
+ { return BasicStkIosType::mBuf.fp();}
+
+ }; // class BasicOStkStream
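+
+ // Minimal usage sketch (illustrative only; the file name and the command
+ // below are hypothetical, and OStkStream is the char typedef defined further
+ // below). As with the input stream, "|command" writes into a pipe and "-"
+ // writes to standard output:
+ //
+ // TNet::OStkStream out;
+ // out.open("scores.txt", std::ios::out); // or "|gzip -c > scores.gz", or "-"
+ // if (!out.good()) { /* handle the error */ }
+ // out << "utterance-1 0.75\n";
+ // out.close();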
+
+
+ /**
+ * Convenience typedefs for the commonly used stream classes
+ */
+ ///@{
+ typedef BasicOStkStream<char> OStkStream;
+ typedef BasicIStkStream<char> IStkStream;
+#ifdef _GLIBCPP_USE_WCHAR_T
+ typedef BasicOStkStream<wchar_t> WOStkStream;
+ typedef BasicIStkStream<wchar_t> WIStkStream;
+#endif
+ /// @}
+
+ /*
+ template<class T,class char_type> inline
+ BasicOStkStream<char_type>& operator << (BasicOStkStream<char_type> &ostream, const std::vector<T> &vec){
+ ostream << vec.size() << std::endl;
+ for(size_t i=0;i<vec.size();i++) ostream << vec[i];
+ return ostream;
+ }
+
+ template<class T,class char_type> inline BasicIStkStream<char_type> &operator >> (BasicIStkStream<char_type> &istream, std::vector<T> &vec){
+ size_t sz;
+ istream >> sz; if(!istream.good()){ throw std::runtime_error(std::string("Error reading to vector of [something]: stream bad\n")); }
+ int ch = istream.get(); if(ch!='\n' || !istream.good()){ throw std::runtime_error(std::string("Expecting newline after vector size, got " + (std::string)(char)ch));} // TODO: This code may not be right for wchar.
+ vec.resize(sz);
+ for(size_t i=0;i<vec.size();i++) istream >> vec[i];
+ return istream;
+ }*/
+
+ template<class T> inline
+ std::ostream & operator << (std::ostream &ostream, const std::vector<T> &vec){
+ ostream << vec.size() << std::endl;
+ for(size_t i=0;i<vec.size();i++) ostream << vec[i] << "\n"; // '\n' is necessary in case item is atomic e.g. a number.
+ return ostream;
+ }
+
+ template<class T> inline std::istream& operator >> (std::istream &istream, std::vector<T> &vec){
+ size_t sz;
+ istream >> sz; if(!istream.good()){ throw std::runtime_error(std::string("Error reading to vector of [something]: stream bad\n")); }
+ // int ch = istream.get(); if(ch!='\n' || !istream.good()){ throw std::runtime_error(std::string("Expecting newline after vector size\n")); // TODO: This code may not be right for wchar.
+ vec.resize(sz);
+ for(size_t i=0;i<vec.size();i++) istream >> vec[i];
+ return istream;
+ }
+
+ template<class T> inline
+ std::ostream & operator << (std::ostream &ostream, const std::list<T> &lst){
+ ostream << lst.size() << std::endl;
+ typename std::list<T>::const_iterator it;
+ for(it = lst.begin(); it != lst.end(); it++)
+ ostream << *it << "\n"; // '\n' is necessary in case item is atomic e.g. a number.
+ return ostream;
+ }
+
+ template<class T> inline std::istream& operator >> (std::istream &istream, std::list<T> &lst){
+ size_t sz;
+ istream >> sz; if(!istream.good()){ throw std::runtime_error(std::string("Error reading to list of [something]: stream bad\n")); }
+ lst.resize(sz);
+ typename std::list<T>::iterator it;
+ for(it = lst.begin(); it != lst.end(); it++)
+ istream >> *it;
+ return istream;
+ }
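+
+ // Example of the text format produced and consumed by the operators above
+ // (a sketch): writing a std::vector<int> holding 3, 1, 4 emits the element
+ // count followed by one element per line, i.e.
+ // 3
+ // 3
+ // 1
+ // 4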
+
+}; // namespace TNet
+
+
+using TNet::operator >>;
+using TNet::operator <<;
+
+
+# include "StkStream.tcc"
+
+// TNet_StkStream_h
+#endif
diff --git a/src/KaldiLib/StkStream.tcc b/src/KaldiLib/StkStream.tcc
new file mode 100644
index 0000000..e3de1ae
--- /dev/null
+++ b/src/KaldiLib/StkStream.tcc
@@ -0,0 +1,228 @@
+#ifndef TNet_StkStream_tcc
+#define TNet_StkStream_tcc
+
+#include <cstring>
+#include <iostream>
+
+#include "Common.h"
+
+#pragma GCC system_header
+
+namespace TNet
+{
+
+ //******************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ basic_stkbuf<_CharT, _Traits> *
+ basic_stkbuf<_CharT, _Traits>::
+ close(void)
+ {
+ // we only want to close an opened file
+ if (this->is_open())
+ {
+ // we want to call the parent close() procedure
+ std::basic_filebuf<_CharT, _Traits>::close();
+
+ // and for different stream type we perform different closing
+ if (mStreamType == basic_stkbuf::t_file)
+ {
+ fclose(mpFilePtr);
+ }
+ else if (mStreamType == basic_stkbuf::t_pipe)
+ {
+ pclose(mpFilePtr);
+ }
+ else if (mStreamType == basic_stkbuf::t_stdio)
+ {
+
+ }
+
+ mpFilePtr = NULL;
+ mFilename = "";
+ mMode = std::ios_base::openmode(0);
+ mStreamType = basic_stkbuf::t_undef;
+ return this;
+ }
+ else
+ return 0;
+ }
+
+
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ void
+ basic_stkbuf<_CharT, _Traits>::
+ open_mode(std::ios_base::openmode __mode, int&, int&, char* __c_mode)
+ {
+ bool __testb = __mode & std::ios_base::binary;
+ bool __testi = __mode & std::ios_base::in;
+ bool __testo = __mode & std::ios_base::out;
+ bool __testt = __mode & std::ios_base::trunc;
+ bool __testa = __mode & std::ios_base::app;
+
+ if (!__testi && __testo && !__testt && !__testa)
+ strcpy(__c_mode, "w");
+ if (!__testi && __testo && !__testt && __testa)
+ strcpy(__c_mode, "a");
+ if (!__testi && __testo && __testt && !__testa)
+ strcpy(__c_mode, "w");
+ if (__testi && !__testo && !__testt && !__testa)
+ strcpy(__c_mode, "r");
+ if (__testi && __testo && !__testt && !__testa)
+ strcpy(__c_mode, "r+");
+ if (__testi && __testo && __testt && !__testa)
+ strcpy(__c_mode, "w+");
+ if (__testb)
+ strcat(__c_mode, "b");
+ }
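+
+ // For reference, the mapping implemented above corresponds to (a sketch):
+ // out -> "w", out|app -> "a", out|trunc -> "w",
+ // in -> "r", in|out -> "r+", in|out|trunc -> "w+",
+ // and binary appends "b" to any of these.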
+
+
+ //******************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ basic_stkbuf<_CharT, _Traits> *
+ basic_stkbuf<_CharT, _Traits>::
+ open(const char* pFName, std::ios::openmode m, const char* pFilter)
+ {
+ basic_stkbuf<_CharT, _Traits>* p_ret = NULL;
+
+ if (NULL == pFName)
+ return NULL;
+
+ // we need to make sure that the stream is not open
+ if (!this->is_open())
+ {
+ char mstr[4] = {'\0', '\0', '\0', '\0'};
+ int __p_mode = 0;
+ int __rw_mode = 0;
+
+ // now we decide, what kind of file we open
+ if (!strcmp(pFName,"-"))
+ {
+ if ((m & std::ios::in) && !(m & std::ios::out))
+ {
+ mpFilePtr = stdin;
+ mMode = std::ios::in;
+ mFilename = pFName;
+ mStreamType = t_stdio;
+ p_ret = this;
+ }
+ else if ((m & std::ios::out) && !(m & std::ios::in))
+ {
+ mpFilePtr = stdout;
+ mMode = std::ios::out;
+ mFilename = pFName;
+ mStreamType = t_stdio;
+ p_ret = this;
+ }
+ else
+ p_ret = NULL;
+ }
+ else if ( pFName[0] == '|' )
+ {
+ const char* command = pFName + 1;
+
+ if ((m & std::ios::in) && !(m & std::ios::out)) m = std::ios::in;
+ else if ((m & std::ios::out) && !(m & std::ios::in)) m = std::ios::out;
+ else return NULL;
+
+ // we need to make some conversion
+ // iostream -> stdio open mode string
+ this->open_mode(m, __p_mode, __rw_mode, mstr);
+
+ if ((mpFilePtr = popen(command, mstr)))
+ {
+ mFilename = command;
+ mMode = m;
+ mStreamType = t_pipe;
+ p_ret = this;
+ }
+ else
+ p_ret = 0;
+ }
+ else
+ {
+ // maybe we have a filter specified
+ if ( pFilter
+ && ('\0' != pFilter[0]))
+ {
+ char* command = ExpandHtkFilterCmd(pFilter, pFName, "$");
+
+ if ((m & std::ios::in) && !(m & std::ios::out)) m = std::ios::in;
+ else if ((m & std::ios::out) && !(m & std::ios::in)) m = std::ios::out;
+ else return NULL;
+
+ // we need to make some conversion
+ // iostream -> stdio open mode string
+ this->open_mode(m, __p_mode, __rw_mode, mstr);
+
+ if ((mpFilePtr = popen(command, mstr)))
+ {
+ mFilename = pFName;
+ mMode = m;
+ mStreamType = t_pipe;
+ p_ret = this;
+ }
+ else
+ p_ret = 0;
+ }
+ else // if (!filter.empty())
+ {
+ // we need to make some conversion
+ // iostream -> stdio open mode string
+ this->open_mode(m, __p_mode, __rw_mode, mstr);
+
+ if ((mpFilePtr = fopen(pFName, mstr)))
+ {
+ mFilename = pFName;
+ mMode = m;
+ mStreamType = t_file;
+ p_ret = this;
+ }
+ else {
+ p_ret = NULL;
+ }
+ }
+ }
+
+ // here we perform what the stdio_filebuf would do
+ if (p_ret) {
+ superopen(mpFilePtr, m);
+ }
+ } //if (!isopen)
+
+ return p_ret;
+ }
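+
+ // Summary of the filename conventions handled above (an illustrative sketch;
+ // 'buf' stands for a basic_stkbuf<char> instance and the commands are
+ // hypothetical):
+ // buf.open("-", std::ios::in); // stdin (stdout when opened for output)
+ // buf.open("|sort -u", std::ios::out); // pipe: write into a command
+ // buf.open("data.txt", std::ios::in); // ordinary file via fopen()
+ // buf.open("data.txt", std::ios::in, "gunzip -c $"); // filter command
+ // // expanded by ExpandHtkFilterCmd()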
+
+ //******************************************************************************
+ template<
+ typename _CharT,
+ typename _Traits
+ >
+ void
+ basic_stkbuf<_CharT, _Traits>::
+ superopen(std::__c_file* __f, std::ios_base::openmode __mode,
+ size_t __size)
+ {
+ this->_M_file.sys_open(__f, __mode);
+ if (this->is_open())
+ {
+ this->_M_mode = __mode;
+ this->_M_buf_size = __size;
+ this->_M_allocate_internal_buffer();
+ this->_M_reading = false;
+ this->_M_writing = false;
+ this->_M_set_buffer(-1);
+ }
+ }
+}
+
+// TNet_StkStream_tcc
+#endif
diff --git a/src/KaldiLib/Timer.cc b/src/KaldiLib/Timer.cc
new file mode 100644
index 0000000..692969b
--- /dev/null
+++ b/src/KaldiLib/Timer.cc
@@ -0,0 +1,5 @@
+#include "Timer.h"
+
+/*
+TNet::Timer gTimer;
+*/
diff --git a/src/KaldiLib/Timer.h b/src/KaldiLib/Timer.h
new file mode 100644
index 0000000..b220b93
--- /dev/null
+++ b/src/KaldiLib/Timer.h
@@ -0,0 +1,103 @@
+#ifndef Timer_h
+#define Timer_h
+
+#include "Error.h"
+#include <sstream>
+
+
+
+#if defined(_WIN32) || defined(MINGW)
+
+# include <windows.h>
+
+namespace TNet
+{
+ class Timer {
+ public:
+ void
+ Start(void)
+ {
+ static int first = 1;
+
+ if(first) {
+ QueryPerformanceFrequency(&mFreq);
+ first = 0;
+ }
+ QueryPerformanceCounter(&mTStart);
+ }
+
+ void
+ End(void)
+ { QueryPerformanceCounter(&mTEnd); }
+
+ double
+ Val()
+ {
+ return ((double)mTEnd.QuadPart - (double)mTStart.QuadPart) /
+ ((double)mFreq.QuadPart);
+ }
+
+ private:
+ LARGE_INTEGER mTStart;
+ LARGE_INTEGER mTEnd;
+ LARGE_INTEGER mFreq;
+ };
+}
+
+#else
+
+# include <sys/time.h>
+# include <unistd.h>
+
+namespace TNet
+{
+ class Timer
+ {
+ public:
+ void
+ Start()
+ { gettimeofday(&this->mTStart, &mTz); }
+
+ void
+ End()
+ { gettimeofday(&mTEnd,&mTz); }
+
+ double
+ Val()
+ {
+ double t1, t2;
+
+ t1 = (double)mTStart.tv_sec + (double)mTStart.tv_usec/(1000*1000);
+ t2 = (double)mTEnd.tv_sec + (double)mTEnd.tv_usec/(1000*1000);
+ return t2-t1;
+ }
+
+ private:
+ struct timeval mTStart;
+ struct timeval mTEnd;
+ struct timezone mTz;
+ };
+}
+
+#endif
+
+
+
+
+
+
+
+///////////////////////////////////////////////////////////////
+// Macros for adding timed intervals to a time accumulator
+#if PROFILING==1
+# define TIMER_START(timer) timer.Start()
+# define TIMER_END(timer,sum) timer.End(); sum += timer.Val()
+#else
+# define TIMER_START(timer)
+# define TIMER_END(timer,sum)
+#endif
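+
+// Usage sketch (illustrative only; compile with -DPROFILING=1 to enable the macros):
+//
+// TNet::Timer timer;
+// double total_sec = 0.0;
+// TIMER_START(timer);
+// DoSomeWork(); // hypothetical function being profiled
+// TIMER_END(timer, total_sec); // accumulates timer.Val() into total_sec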
+
+#endif
+
+
+
diff --git a/src/KaldiLib/Tokenizer.cc b/src/KaldiLib/Tokenizer.cc
new file mode 100644
index 0000000..0c49050
--- /dev/null
+++ b/src/KaldiLib/Tokenizer.cc
@@ -0,0 +1,53 @@
+#include "Tokenizer.h"
+#include "string.h"
+
+namespace TNet
+{
+ //****************************************************************************
+ //****************************************************************************
+ void
+ Tokenizer::
+ AddString(const char* pString)
+ {
+ // copy into string struct, which is more convenient
+ std::string aux_string(pString);
+ std::string aux_record;
+ std::string::size_type cur_pos = 0;
+ std::string::size_type old_pos = 0;
+ std::string::size_type search_start = 0;
+
+ // make sure we have enough space
+ aux_record.reserve(aux_string.length());
+
+ // find all separators and make a list of tokens
+ while(old_pos < std::string::npos) {
+ // find the next separator
+ cur_pos = aux_string.find_first_of(mSeparator, search_start);
+
+ // if backslash is in front of separator, ignore this separator
+ if (cur_pos != 0 && cur_pos != std::string::npos &&
+ pString[cur_pos - 1] == '\\') {
+ search_start = cur_pos + 1;
+ continue;
+ }
+
+ // we don't want to have empty records
+ if (!(cur_pos == old_pos && mSkipEmpty)) {
+ // extract token
+ aux_record.insert(0, pString+old_pos, cur_pos==std::string::npos ? strlen(pString+old_pos) : cur_pos - old_pos);
+ // insert to list
+ this->push_back(aux_record);
+
+ // we don't need the contents of the token
+ aux_record.erase();
+ }
+
+ // update old position so that it points behind the separator
+ old_pos = cur_pos < std::string::npos ? cur_pos + 1 : cur_pos;
+ search_start = old_pos;
+ }
+ }
+
+
+} // namespace TNet
+
diff --git a/src/KaldiLib/Tokenizer.h b/src/KaldiLib/Tokenizer.h
new file mode 100644
index 0000000..1be717b
--- /dev/null
+++ b/src/KaldiLib/Tokenizer.h
@@ -0,0 +1,45 @@
+#ifndef TNet_Tokenizer_h
+#define TNet_Tokenizer_h
+
+#include <list>
+#include <string>
+
+namespace TNet {
+ /**
+ * @brief General string tokenizer
+ */
+ class Tokenizer
+ : public std::list<std::string>
+ {
+ public:
+ // Constructors and Destructors ............................................
+ Tokenizer(const char* pSeparator, bool skipEmpty = false)
+ : std::list<std::string>(), mSeparator(pSeparator), mSkipEmpty(skipEmpty)
+ {}
+
+ Tokenizer(const char* pString, const char* pSeparator, bool skipEmpty = false)
+ : std::list<std::string>(), mSeparator(pSeparator), mSkipEmpty(skipEmpty)
+ { AddString(pString); }
+
+ ~Tokenizer()
+ {}
+
+ /**
+ * @brief Parses a string and appends the tokens to the list
+ * @param pString string to parse
+ */
+ void
+ AddString(const char* pString);
+
+ /**
+ * @brief Constant accessor to the separators string
+ * @return Const reference
+ */
+ const std::string&
+ Separator() const
+ {return mSeparator;}
+
+ private:
+ std::string mSeparator; ///< holds the list of separators
+ bool mSkipEmpty; ///< if true, multiple separators will be regarded as one
+ }; // class Tokenizer
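+
+ // Usage sketch (illustrative only): Tokenizer is itself a std::list<std::string>,
+ // so the extracted tokens can be iterated directly.
+ //
+ // TNet::Tokenizer tok("one,two,,three", ",", /*skipEmpty*/ true);
+ // for (TNet::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i)
+ // std::cout << *i << std::endl; // prints: one two three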
+} // namespace TNet
+
+#endif // TNet_Tokenizer_h
diff --git a/src/KaldiLib/Types.h b/src/KaldiLib/Types.h
new file mode 100644
index 0000000..6a5bfac
--- /dev/null
+++ b/src/KaldiLib/Types.h
@@ -0,0 +1,78 @@
+#ifndef TNet_Types_h
+#define TNet_Types_h
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+ #include <clapack.h>
+}
+#endif
+
+
+namespace TNet
+{
+ // TYPEDEFS ..................................................................
+#if DOUBLEPRECISION
+ typedef double BaseFloat;
+#else
+ typedef float BaseFloat;
+#endif
+
+#ifndef UINT_16
+ typedef unsigned short UINT_16 ;
+ typedef unsigned UINT_32 ;
+ typedef short INT_16 ;
+ typedef int INT_32 ;
+ typedef float FLOAT_32 ;
+ typedef double DOUBLE_64 ;
+#endif
+
+
+
+ // ...........................................................................
+ // The following declaration assumes that SSE instructions are enabled
+ // and that we are using GNU C/C++ compiler, which defines the __attribute__
+ // notation.
+ //
+ // ENABLE_SSE is defined in <config.h>. Its value depends on options given
+ // in the configure phase of building the library
+#if defined(__GNUC__ )
+ // vector of four single floats
+ typedef float v4sf __attribute__((vector_size(16)));
+ // vector of two single doubles
+ typedef double v2sd __attribute__((vector_size(16)));
+
+ typedef BaseFloat BaseFloat16Aligned __attribute__((aligned(16))) ;
+
+ typedef union
+ {
+ v4sf v;
+ float f[4];
+ } f4vector;
+
+ typedef union
+ {
+ v2sd v;
+ double f[2];
+ } d2vector;
+#endif // defined(__GNUC__)
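+
+ // A small sketch of how the unions above can be used with the GCC vector
+ // extensions (illustrative only):
+ // f4vector a, b, c;
+ // for (int i = 0; i < 4; i++) { a.f[i] = 1.0f; b.f[i] = 2.0f; }
+ // c.v = a.v + b.v; // element-wise addition of four floats at once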
+
+
+
+ typedef enum
+ {
+#ifdef HAVE_ATLAS
+ TRANS = CblasTrans,
+ NO_TRANS = CblasNoTrans
+#else
+ TRANS = 'T',
+ NO_TRANS = 'N'
+#endif
+ } MatrixTrasposeType;
+
+
+
+} // namespace TNet
+
+#endif // #ifndef TNet_Types_h
+
diff --git a/src/KaldiLib/UserInterface.cc b/src/KaldiLib/UserInterface.cc
new file mode 100644
index 0000000..b59a6c5
--- /dev/null
+++ b/src/KaldiLib/UserInterface.cc
@@ -0,0 +1,669 @@
+#include <stdexcept>
+#include <sstream>
+#include <stdarg.h>
+
+#include "UserInterface.h"
+#include "StkStream.h"
+#include "Features.h"
+
+namespace TNet
+{
+ //***************************************************************************
+ //***************************************************************************
+ int
+ npercents(const char *str)
+ {
+ int ret = 0;
+ while (*str) if (*str++ == '%') ret++;
+ return ret;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ ReadConfig(const char *file_name)
+ {
+ std::string line_buf;
+ std::string::iterator chptr;
+ std::string key;
+ std::string value;
+ std::ostringstream ss;
+ int line_no = 0;
+ IStkStream i_stream;
+
+
+ i_stream.open(file_name, std::ios::binary);
+ if (!i_stream.good()) {
+ throw std::runtime_error(std::string("Cannot open input config file ")
+ + file_name);
+ }
+ i_stream >> std::ws;
+
+ while (!i_stream.eof()) {
+ size_t i_pos;
+
+ // read line
+ std::getline(i_stream, line_buf);
+ i_stream >> std::ws;
+
+ if (i_stream.fail()) {
+ throw std::runtime_error(std::string("Error reading (")
+ + file_name + ":" + (ss << line_no,ss).str() + ")");
+ }
+
+ // increase line counter
+ line_no++;
+
+ // cut comments
+ if (std::string::npos != (i_pos = line_buf.find('#'))) {
+ line_buf.erase(i_pos);
+ }
+
+ // cut leading and trailing spaces
+ Trim(line_buf);
+
+ // if empty line, then skip it
+ if (0 == line_buf.length()) {
+ continue;
+ }
+
+ // line = line_buf.c_str();
+ // chptr = parptr;
+
+ chptr = line_buf.begin();
+
+ for (;;) {
+ // Replace spaces by '_', which are removed in InsertConfigParam
+ while (isalnum(*chptr) || *chptr == '_' || *chptr == '-') {
+ chptr++;
+ }
+
+ while (std::isspace(*chptr)) {
+ *chptr = '_';
+ chptr++;
+ }
+
+ if (*chptr != ':') {
+ break;
+ }
+
+ chptr++;
+
+ while (std::isspace(*chptr)) {
+ *chptr = '_';
+ chptr++;
+ }
+ }
+
+ if (*chptr != '=') {
+ throw std::runtime_error(std::string("Character '=' expected (")
+ + file_name + ":" + (ss.str(""),ss<<line_no,ss).str() + ")");
+ }
+
+ key.assign(line_buf.begin(), chptr);
+
+ chptr++;
+
+ value.assign(chptr, line_buf.end());
+
+ ParseHTKString(value, value);
+ InsertConfigParam(key.c_str(), value.c_str(), 'C');
+ }
+
+ i_stream.close();
+ }
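+
+ // Example of a config file accepted by ReadConfig() (a sketch; the parameter
+ // names are HTK-style keys used elsewhere in this library and the values are
+ // purely illustrative):
+ //
+ // # comments start with '#'
+ // TARGETKIND = MFCC_0_D_A
+ // STARTFRMEXT = 15
+ // ENDFRMEXT = 15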
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ InsertConfigParam(const char *pParamName, const char *value, int optionChar)
+ {
+ std::string key(pParamName);
+ std::string::iterator i_key = key.begin();
+
+ while (i_key != key.end()) {
+ if (*i_key == '-' || *i_key == '_') {
+ i_key = key.erase(i_key);
+ }
+ else {
+ *i_key = toupper(*i_key);
+ i_key ++;
+ }
+ }
+
+ mMap[key].mValue = value;
+ mMap[key].mRead = false;
+ mMap[key].mOption = optionChar;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ UserInterface::
+ ParseOptions(
+ int argc,
+ char* argv[],
+ const char* pOptionMapping,
+ const char* pToolName)
+ {
+ int i;
+ int opt = '?';
+ int optind;
+ bool option_must_follow = false;
+ char param[1024];
+ char* value;
+ const char* optfmt;
+ const char* optarg;
+ char* chptr;
+ char* bptr;
+ char tstr[4] = " -?";
+ unsigned long long option_mask = 0;
+ std::ostringstream ss;
+
+ #define MARK_OPTION(ch) {if (isalpha(ch)) option_mask |= 1ULL << ((ch) - 'A');}
+ #define OPTION_MARK(ch) (isalpha(ch) && ((1ULL << ((ch) - 'A')) & option_mask))
+ #define IS_OPTION(str) ((str)[0] == '-' && (isalpha((str)[1]) || (str)[1] == '-'))
+
+ //search for the -A param
+ for (optind = 1; optind < argc; optind++) {
+ // we found "--", no -A
+ if (!strcmp(argv[optind], "--")) {
+ break;
+ }
+
+ //repeat till we find -A
+ if (argv[optind][0] != '-' || argv[optind][1] != 'A') {
+ continue;
+ }
+
+ // just "-A" form
+ if (argv[optind][2] != '\0') {
+ throw std::runtime_error(std::string("Unexpected argument '")
+ + (argv[optind] + 2) + "' after option '-A'");
+ }
+
+ for (i=0; i < argc; i++) {
+ // display all params
+ if(strchr(argv[i], ' ') || strchr(argv[i], '*'))
+ std::cout << '\'' << argv[i] << '\'' << " ";
+ else std::cout << argv[i] << " ";
+ }
+
+ std::cout << std::endl;
+
+ break;
+ }
+
+ for (optind = 1; optind < argc; optind++) {
+ // find the '-C' parameters (there may be more than one config file)
+ if (!strcmp(argv[optind], "--")) break;
+ if (argv[optind][0] != '-' || argv[optind][1] != 'C') continue;
+ if (argv[optind][2] != '\0') {
+ ReadConfig(argv[optind] + 2);
+ } else if (optind+1 < argc && !IS_OPTION(argv[optind+1])) {
+ ReadConfig(argv[++optind]);
+ } else {
+ throw std::runtime_error("Config file name expected after option '-C'");
+ }
+ }
+
+ for (optind = 1; optind < argc; optind++) {
+ if (!strcmp(argv[optind], "--")) break;
+ if (argv[optind][0] != '-' || argv[optind][1] != '-') continue;
+
+ bptr = new char[strlen(pToolName) + strlen(argv[optind]+2) + 2];
+ strcat(strcat(strcpy(bptr, pToolName), ":"), argv[optind]+2);
+ value = strchr(bptr, '=');
+ if (!value) {
+ throw std::runtime_error(std::string("Character '=' expected after option '")
+ + argv[optind] + "'");
+ }
+
+ *value++ = '\0';
+
+ InsertConfigParam(bptr, value /*? value : "TRUE"*/, '-');
+ delete [] bptr;
+ }
+
+ for (optind = 1; optind < argc && IS_OPTION(argv[optind]); optind++) {
+ option_must_follow = false;
+ tstr[2] = opt = argv[optind][1];
+ optarg = argv[optind][2] != '\0' ? argv[optind] + 2 : NULL;
+
+ if (opt == '-' && !optarg) { // '--' terminates the option list
+ return optind+1;
+ }
+ if (opt == 'C' || opt == '-') { // C, A and long options have already been processed
+ if (!optarg) optind++;
+ continue;
+ }
+ if (opt == 'A') continue;
+
+ chptr = strstr((char*)pOptionMapping, tstr);
+ if (chptr == NULL) {
+ throw std::runtime_error(std::string("Invalid command line option '-")
+ + static_cast<char>(opt) + "'");
+ }
+
+ chptr += 3;
+ while (std::isspace(*chptr)) {
+ chptr++;
+ }
+
+ if (!chptr || chptr[0] == '-') {// Option without format string will be ignored
+ optfmt = " ";
+ } else {
+ optfmt = chptr;
+ while (*chptr && !std::isspace(*chptr)) {
+ chptr++;
+ }
+ if (!*chptr) {
+ throw std::runtime_error("Fatal: Unexpected end of optionMap string");
+ }
+ }
+ for (i = 0; !std::isspace(*optfmt); optfmt++) {
+ while (std::isspace(*chptr)) chptr++;
+ value = chptr;
+ while (*chptr && !std::isspace(*chptr)) chptr++;
+ assert(static_cast<unsigned int>(chptr-value+1) < sizeof(param));
+ strncat(strcat(strcpy(param, pToolName), ":"), value, chptr-value);
+ param[chptr-value+strlen(pToolName)+1] = '\0';
+ switch (*optfmt) {
+ case 'n':
+ value = strchr(param, '=');
+ if (value) *value = '\0';
+ InsertConfigParam(param,
+ value ? value + 1: "TRUE", opt);
+ break;
+
+ case 'l':
+ case 'o':
+ case 'r':
+ i++;
+ if (!optarg && (optind+1==argc || IS_OPTION(argv[optind+1]))) {
+ if (*optfmt == 'r' || *optfmt == 'l') {
+ throw std::runtime_error(std::string("Argument ")
+ + (ss<<i,ss).str() + " of option '-"
+ + static_cast<char>(opt) + "' expected");
+ }
+ optfmt = " "; // Stop reading option arguments
+ break;
+ }
+ if (!optarg) optarg = argv[++optind];
+ if (*optfmt == 'o') {
+ option_must_follow = (bool) 1;
+ }
+ bptr = NULL;
+
+ // For repeated use of option with 'l' (list) format, append
+ // ',' and argument string to existing config parameter value.
+ if (*optfmt == 'l' && OPTION_MARK(opt)) {
+ bptr = strdup(GetStr(param, ""));
+ if (bptr == NULL) throw std::runtime_error("Insufficient memory");
+ bptr = (char*) realloc(bptr, strlen(bptr) + strlen(optarg) + 2);
+ if (bptr == NULL) throw std::runtime_error("Insufficient memory");
+ strcat(strcat(bptr, ","), optarg);
+ optarg = bptr;
+ }
+ MARK_OPTION(opt);
+ InsertConfigParam(param, optarg, opt);
+ free(bptr);
+ optarg = NULL;
+ break;
+
+ default :
+ throw std::runtime_error(std::string("Fatal: Invalid character '")
+ + *optfmt + "' in optionMap after " + tstr);
+ }
+ }
+ if (optarg) {
+ throw std::runtime_error(std::string("Unexpected argument '")
+ + optarg + "' after option '-"
+ + static_cast<char>(opt) + "'");
+ }
+ }
+
+ for (i = optind; i < argc && !IS_OPTION(argv[i]); i++)
+ {}
+
+ if (i < argc) {
+ throw std::runtime_error(std::string("No option expected after first non-option argument '")
+ + argv[optind] + "'");
+ }
+
+ if (option_must_follow) {
+ throw std::runtime_error(std::string("Option '-")
+ + static_cast<char>(opt)
+ + "' with optional argument must not be the last option");
+ }
+
+ return optind;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ int
+ UserInterface::
+ GetFeatureParams(
+ int * derivOrder,
+ int ** derivWinLens,
+ int * startFrmExt,
+ int * endFrmExt,
+ char ** CMNPath,
+ char ** CMNFile,
+ const char ** CMNMask,
+ char ** CVNPath,
+ char ** CVNFile,
+ const char ** CVNMask,
+ const char ** CVGFile,
+ const char * pToolName,
+ int pseudoModule)
+ {
+ const char * str;
+ int targetKind;
+ char * chrptr;
+ char paramName[32];
+ const char * CMNDir;
+ const char * CVNDir;
+
+ strcpy(paramName, pToolName);
+ strcat(paramName, pseudoModule == 1 ? "SPARM1:" :
+ pseudoModule == 2 ? "SPARM2:" : "");
+
+ chrptr = paramName + strlen(paramName);
+
+ strcpy(chrptr, "STARTFRMEXT");
+ *startFrmExt = GetInt(paramName, 0);
+ strcpy(chrptr, "ENDFRMEXT");
+ *endFrmExt = GetInt(paramName, 0);
+
+ *CMNPath = *CVNPath = NULL;
+ strcpy(chrptr, "CMEANDIR");
+ CMNDir = GetStr(paramName, NULL);
+ strcpy(chrptr, "CMEANMASK");
+ *CMNMask = GetStr(paramName, NULL);
+
+ if (*CMNMask != NULL) {
+ *CMNPath = (char*) malloc((CMNDir ? strlen(CMNDir) : 0) + npercents(*CMNMask) + 2);
+ if (*CMNPath == NULL) throw std::runtime_error("Insufficient memory");
+ if (CMNDir != NULL) strcat(strcpy(*CMNPath, CMNDir), "/");
+ *CMNFile = *CMNPath + strlen(*CMNPath);
+ }
+ strcpy(chrptr, "VARSCALEDIR");
+ CVNDir = GetStr(paramName, NULL);
+ strcpy(chrptr, "VARSCALEMASK");
+ *CVNMask = GetStr(paramName, NULL);
+
+
+ if (*CVNMask != NULL) {
+ *CVNPath = (char*) malloc((CVNDir ? strlen(CVNDir) : 0) + npercents(*CVNMask) + 2);
+ if (*CVNPath == NULL) throw std::runtime_error("Insufficient memory");
+ if (CVNDir != NULL) strcat(strcpy(*CVNPath, CVNDir), "/");
+ *CVNFile = *CVNPath + strlen(*CVNPath);
+ }
+ strcpy(chrptr, "VARSCALEFN");
+ *CVGFile = GetStr(paramName, NULL);
+ strcpy(chrptr, "TARGETKIND");
+ str = GetStr(paramName, "ANON");
+
+ targetKind = FeatureRepository::ReadParmKind(str, false);
+
+ if (targetKind == -1) {
+ throw std::runtime_error(std::string("Invalid TARGETKIND = '")
+ + str + "'");
+ }
+
+ strcpy(chrptr, "DERIVWINDOWS");
+ if ((str = GetStr(paramName, NULL)) != NULL) {
+ long lval;
+ *derivOrder = 0;
+ *derivWinLens = NULL;
+
+ if (NULL != str)
+ {
+ while ((str = strtok((char *) str, " \t_")) != NULL)
+ {
+ lval = strtol(str, &chrptr, 0);
+ if (!*str || *chrptr) {
+ throw std::runtime_error("Integers separated by '_' expected for parameter DERIVWINDOWS");
+ }
+ *derivWinLens = (int *)realloc(*derivWinLens, ++*derivOrder*sizeof(int));
+ if (*derivWinLens == NULL) throw std::runtime_error("Insufficient memory");
+ (*derivWinLens)[*derivOrder-1] = lval;
+ str = NULL;
+ }
+ }
+
+ return targetKind;
+ }
+ *derivOrder = targetKind & PARAMKIND_T ? 3 :
+ targetKind & PARAMKIND_A ? 2 :
+ targetKind & PARAMKIND_D ? 1 : 0;
+
+ if (*derivOrder || targetKind != PARAMKIND_ANON) {
+ *derivWinLens = (int *) malloc(3 * sizeof(int));
+ if (*derivWinLens == NULL) throw std::runtime_error("Insufficient memory");
+
+ strcpy(chrptr, "DELTAWINDOW");
+ (*derivWinLens)[0] = GetInt(paramName, 2);
+ strcpy(chrptr, "ACCWINDOW");
+ (*derivWinLens)[1] = GetInt(paramName, 2);
+ strcpy(chrptr, "THIRDWINDOW");
+ (*derivWinLens)[2] = GetInt(paramName, 2);
+ return targetKind;
+ }
+ *derivWinLens = NULL;
+ *derivOrder = -1;
+ return targetKind;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ UserInterface::ValueRecord*
+ UserInterface::
+ GetParam(const char* pParamName)
+ {
+ MapType::iterator it;
+
+ // this is done only for convenience. in the loop we will increase the
+ // pointer again
+ pParamName--;
+
+ // we iteratively try to find the param name in the map. if an attempt
+ // fails, we strip off all characters until the first ':' and we search
+ // again
+ do {
+ pParamName++;
+ it = mMap.find(pParamName);
+ } while ((it == mMap.end()) && (NULL != (pParamName = strchr(pParamName, ':'))));
+
+ if (it == mMap.end()) {
+ return NULL;
+ }
+ else {
+ it->second.mRead = true;
+ return &(it->second);
+ }
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ const char *
+ UserInterface::
+ GetStr(
+ const char * pParamName,
+ const char * default_value)
+ {
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+ else {
+ return p_val->mValue.c_str();
+ }
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ long
+ UserInterface::
+ GetInt(
+ const char *pParamName,
+ long default_value)
+ {
+ char *chrptr;
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char *val = p_val->mValue.c_str();
+ default_value = strtol(val, &chrptr, 0);
+ if (!*val || *chrptr) {
+ throw std::runtime_error(std::string("Integer number expected for ")
+ + pParamName + " but found '" + val + "'");
+ }
+ return default_value;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ float
+ UserInterface::
+ GetFlt(
+ const char * pParamName,
+ float default_value)
+ {
+ char *chrptr;
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char *val = p_val->mValue.c_str();
+ default_value = strtod(val, &chrptr);
+ if (!*val || *chrptr) {
+ throw std::runtime_error(std::string("Decimal number expected for ")
+ + pParamName + " but found '" + val + "'");
+ }
+ return default_value;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ UserInterface::
+ GetBool(
+ const char * pParamName,
+ bool default_value)
+ {
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char* val = p_val->mValue.c_str();
+
+ if (!strcasecmp(val, "TRUE") || !strcmp(val, "T")) return 1;
+ if (strcasecmp(val, "FALSE") && strcmp(val, "F")) {
+ throw std::runtime_error(std::string("TRUE or FALSE expected for ")
+ + pParamName + " but found '" + val + "'");
+ }
+ return false;
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ // '...' are pairs: string and corresponding integer value , terminated by NULL
+ int
+ UserInterface::
+ GetEnum(
+ const char * pParamName,
+ int default_value,
+ ...)
+ {
+ ValueRecord* p_val = GetParam(pParamName);
+
+ if (NULL == p_val) {
+ return default_value;
+ }
+
+ const char* val = p_val->mValue.c_str();
+ char* s;
+ int i = 0, cnt = 0, l = 0;
+ va_list ap;
+
+ va_start(ap, default_value);
+ while ((s = va_arg(ap, char *)) != NULL) {
+ l += strlen(s) + 2;
+ ++cnt;
+ i = va_arg(ap, int);
+ if (!strcmp(val, s)) break;
+ }
+ va_end(ap);
+
+ if (s) {
+ return i;
+ }
+
+ //To report error, create string listing all possible values
+ s = (char*) malloc(l + 1);
+ s[0] = '\0';
+ va_start(ap, default_value);
+ for (i = 0; i < cnt; i++) {
+ strcat(s, va_arg(ap, char *));
+ va_arg(ap, int);
+ if (i < cnt - 2) strcat(s, ", ");
+ else if (i == cnt - 2) strcat(s, " or ");
+ }
+
+ va_end(ap);
+
+ throw std::runtime_error(std::string(s) + " expected for "
+ + pParamName + " but found '" + val + "'");
+
+ return 0;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ PrintConfig(std::ostream& rStream)
+ {
+ rStream << "Configuration Parameters[" << mMap.size() << "]\n";
+ for (MapType::iterator it = mMap.begin(); it != mMap.end(); ++it) {
+ rStream << (it->second.mRead ? " " : "# ")
+ << std::setw(35) << std::left << it->first << " = "
+ << std::setw(30) << std::left << it->second.mValue
+ << " # -" << it->second.mOption << std::endl;
+ }
+ }
+
+ //***************************************************************************
+ //***************************************************************************
+ void
+ UserInterface::
+ CheckCommandLineParamUse()
+ {
+ for (MapType::iterator it = mMap.begin(); it != mMap.end(); ++it) {
+ if (!it->second.mRead && it->second.mOption != 'C') {
+ Error("Unexpected command line parameter " + it->first);
+ }
+ }
+ }
+
+}
diff --git a/src/KaldiLib/UserInterface.h b/src/KaldiLib/UserInterface.h
new file mode 100644
index 0000000..fa189e7
--- /dev/null
+++ b/src/KaldiLib/UserInterface.h
@@ -0,0 +1,166 @@
+#ifndef TNet_UserInterface_h
+#define TNet_UserInterface_h
+
+#include <iostream>
+#include <cstdlib>
+#include <string>
+#include <map>
+
+namespace TNet
+{
+ /** **************************************************************************
+ ** **************************************************************************
+ */
+ class UserInterface
+ {
+ public:
+ struct ValueRecord {
+ std::string mValue;
+ char mOption;
+ bool mRead;
+ };
+
+
+ void InsertConfigParam(
+ const char *param_name,
+ const char *value,
+ int optionChar);
+
+
+ void
+ ReadConfig(const char *pFileName);
+
+
+ void
+ CheckCommandLineParamUse();
+
+
+ /**
+ * @brief Retrieves the content of a parameter
+ * @param pParamName Name of the parameter to look for
+ * @return Returns a pointer to the ValueRecord structure on success,
+ * otherwise returns NULL
+ *
+ * We iteratively try to find the param name in the map. If an attempt
+ * fails, we strip off all characters up to the first occurrence of ':'
+ * and we search again
+ */
+ ValueRecord*
+ GetParam(const char* pParamName);
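+
+ // Lookup sketch (illustrative; the tool and module names are hypothetical):
+ // GetParam("TTOOL:SPARM1:TARGETKIND") first tries the full key, then
+ // "SPARM1:TARGETKIND", then "TARGETKIND", and returns the first entry found
+ // in mMap (or NULL when none matches).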
+
+
+ /**
+ * @brief Returns the parameter's value as string
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Pointer to the beginning of the string on success, default_value
+ * otherwise
+ */
+ const char*
+ GetStr( const char *param_name, const char *default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as int
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the integer value if success, default_value
+ * otherwise
+ */
+ long
+ GetInt( const char *param_name, long default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as float
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the float value if success, default_value
+ * otherwise
+ */
+ float
+ GetFlt( const char *param_name, float default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as bool
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the bool value if success, default_value
+ * otherwise
+ *
+ * Note that true is returned if the value is 'TRUE' or 'T', false is
+ * returned if the value is 'FALSE' or 'F'. Otherwise an exception is thrown
+ */
+ bool
+ GetBool(const char *param_name, bool default_value);
+
+
+ /**
+ * @brief Returns the parameter's value as enum integer
+ *
+ * @param param_name Parameter name
+ * @param default_value Value, which is returned in case the parameter
+ * was not found
+ *
+ * @return Returns the index value if success, default_value
+ * otherwise
+ *
+ * Variable arguments specify the possible values of this parameter. If the
+ * value does not match any of these, an exception is thrown.
+ */
+ int
+ GetEnum( const char *param_name, int default_value, ...);
+
+
+ int GetFeatureParams(
+ int *derivOrder,
+ int **derivWinLens,
+ int *startFrmExt,
+ int *endFrmExt,
+ char **CMNPath,
+ char **CMNFile,
+ const char **CMNMask,
+ char **CVNPath,
+ char **CVNFile,
+ const char **CVNMask,
+ const char **CVGFile,
+ const char *toolName,
+ int pseudoModule);
+
+
+ int ParseOptions(
+ int argc,
+ char* argv[],
+ const char* optionMapping,
+ const char* toolName);
+
+
+ /**
+ * @brief Send the defined parameters to a stream
+ *
+ * @param rStream stream to use
+ */
+ void
+ PrintConfig(std::ostream& rStream);
+
+ public:
+ typedef std::map<std::string, ValueRecord> MapType;
+ MapType mMap;
+ };
+}
+
+#endif
+
diff --git a/src/KaldiLib/Vector.cc b/src/KaldiLib/Vector.cc
new file mode 100644
index 0000000..020bae2
--- /dev/null
+++ b/src/KaldiLib/Vector.cc
@@ -0,0 +1,110 @@
+#ifndef TNet_Vector_cc
+#define TNet_Vector_cc
+
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include "Common.h"
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+}
+#endif
+
+#include "Common.h"
+#include "Matrix.h"
+#include "Vector.h"
+
+namespace TNet
+{
+
+#ifdef HAVE_ATLAS
+ template<>
+ float
+ BlasDot<>(const Vector<float>& rA, const Vector<float>& rB)
+ {
+ assert(rA.mDim == rB.mDim);
+ return cblas_sdot(rA.mDim, rA.pData(), 1, rB.pData(), 1);
+ }
+
+ template<>
+ double
+ BlasDot<>(const Vector<double>& rA, const Vector<double>& rB)
+ {
+ assert(rA.mDim == rB.mDim);
+ return cblas_ddot(rA.mDim, rA.pData(), 1, rB.pData(), 1);
+ }
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasAxpy(const float alpha, const Vector<float>& rV)
+ {
+ assert(mDim == rV.mDim);
+ cblas_saxpy(mDim, alpha, rV.pData(), 1, mpData, 1);
+ return *this;
+ }
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasAxpy(const double alpha, const Vector<double>& rV)
+ {
+ assert(mDim == rV.mDim);
+ cblas_daxpy(mDim, alpha, rV.pData(), 1, mpData, 1);
+ return *this;
+ }
+
+ template<>
+ Vector<int>&
+ Vector<int>::
+ BlasAxpy(const int alpha, const Vector<int>& rV)
+ {
+ assert(mDim == rV.mDim);
+ for(int i=0; i<Dim(); i++) {
+ (*this)[i] += rV[i];
+ }
+ return *this;
+ }
+
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasGemv(const float alpha, const Matrix<float>& rM, MatrixTrasposeType trans, const Vector<float>& rV, const float beta)
+ {
+ assert((trans == NO_TRANS && rM.Cols() == rV.mDim && rM.Rows() == mDim)
+ || (trans == TRANS && rM.Rows() == rV.mDim && rM.Cols() == mDim));
+
+ cblas_sgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), rM.Rows(), rM.Cols(), alpha, rM.pData(), rM.Stride(),
+ rV.pData(), 1, beta, mpData, 1);
+ return *this;
+ }
+
+
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasGemv(const double alpha, const Matrix<double>& rM, MatrixTrasposeType trans, const Vector<double>& rV, const double beta)
+ {
+ assert((trans == NO_TRANS && rM.Cols() == rV.mDim && rM.Rows() == mDim)
+ || (trans == TRANS && rM.Rows() == rV.mDim && rM.Cols() == mDim));
+
+ cblas_dgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), rM.Rows(), rM.Cols(), alpha, rM.pData(), rM.Stride(),
+ rV.pData(), 1, beta, mpData, 1);
+ return *this;
+ }
+
+
+#else
+ #error Routines in this section are not implemented yet without BLAS
+#endif
+
+} // namespace TNet
+
+
+#endif // TNet_Vector_cc
diff --git a/src/KaldiLib/Vector.h b/src/KaldiLib/Vector.h
new file mode 100644
index 0000000..384c5d2
--- /dev/null
+++ b/src/KaldiLib/Vector.h
@@ -0,0 +1,496 @@
+//
+// C++ Interface: %{MODULE}
+//
+// Description:
+//
+//
+// Author: %{AUTHOR} <%{EMAIL}>, (C) %{YEAR}
+//
+// Copyright: See COPYING file that comes with this distribution
+//
+//
+
+#ifndef TNet_Vector_h
+#define TNet_Vector_h
+
+#include <cstddef>
+#include <cstdlib>
+#include <stdexcept>
+#include <iostream>
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+ #include <clapack.h>
+}
+#endif
+
+#include "Common.h"
+#include "MathAux.h"
+#include "Types.h"
+#include "Error.h"
+
+namespace TNet
+{
+ template<typename _ElemT> class Vector;
+ template<typename _ElemT> class SubVector;
+ template<typename _ElemT> class Matrix;
+ template<typename _ElemT> class SpMatrix;
+
+ // we need to declare the friend functions here
+ template<typename _ElemT>
+ std::ostream & operator << (std::ostream & rOut, const Vector<_ElemT> & rV);
+
+ template<typename _ElemT>
+ std::istream & operator >> (std::istream & rIn, Vector<_ElemT> & rV);
+
+ template<typename _ElemT>
+ _ElemT
+ BlasDot(const Vector<_ElemT>& rA, const Vector<_ElemT>& rB);
+
+ /** **************************************************************************
+ ** **************************************************************************
+ * @brief Provides a vector abstraction class
+ *
+ * This class provides a way to work with vectors in TNet.
+ * It encapsulates basic operations and memory optimizations.
+ *
+ */
+ template<typename _ElemT>
+ class Vector
+ {
+ public:
+
+ /// defines a type of this
+ typedef Vector<_ElemT> ThisType;
+
+
+ Vector(): mpData(NULL)
+#ifdef STK_MEMALIGN_MANUAL
+ ,mpFreeData(NULL)
+#endif
+ , mDim(0)
+ {}
+
+ /**
+ * @brief Copy constructor
+ * @param rV
+ */
+ Vector(const Vector<_ElemT>& rV)
+ { mpData=NULL; Init(rV.Dim()); Copy(rV); }
+
+
+ /* Type conversion constructor. */
+ template<typename _ElemU>
+ explicit Vector(const Vector<_ElemU>& rV)
+ { mpData=NULL; Init(rV.Dim()); Copy(rV); }
+
+
+ Vector(const _ElemT* ppData, const size_t s)
+ { mpData=NULL; Init(s); Copy(ppData); }
+
+ explicit Vector(const size_t s, bool clear=true)
+ { mpData=NULL; Init(s,clear); }
+
+ ~Vector()
+ { Destroy(); }
+
+ Vector<_ElemT> &operator = (const Vector <_ElemT> &other)
+ { Init(other.Dim()); Copy(other); return *this; } // Needed for inclusion in std::vector
+
+ Vector<_ElemT>&
+ Init(size_t length, bool clear=true);
+
+ /**
+ * @brief Deallocates the vector from memory and resets the dimension to 0
+ */
+ void
+ Destroy();
+
+ /**
+ * @brief Returns @c true if vector is initialized
+ */
+ bool
+ IsInitialized() const
+ { return mpData != NULL; }
+
+ /**
+ * @brief Sets all elements to 0
+ */
+ void
+ Zero();
+
+ void
+ Set(_ElemT f);
+
+ inline size_t
+ Dim() const
+ { return mDim; }
+
+ /**
+ * @brief Returns the size of the vector in memory (in bytes)
+ */
+ inline size_t
+ MSize() const
+ {
+ return (mDim + (((16 / sizeof(_ElemT)) - mDim%(16 / sizeof(_ElemT)))
+ % (16 / sizeof(_ElemT)))) * sizeof(_ElemT);
+ }
+
+ /**
+ * @brief Gives access to the vector memory area
+ * @return pointer to the first field
+ */
+ inline _ElemT*
+ pData()
+ { return mpData; }
+
+ /**
+ * @brief Gives access to the vector memory area
+ * @return pointer to the first field (const version)
+ */
+ inline const _ElemT*
+ pData() const
+ { return mpData; }
+
+ /**
+ * @brief Gives access to a specified vector element (const).
+ */
+ inline _ElemT
+ operator [] (size_t i) const
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Gives access to a specified vector element (non-const).
+ */
+ inline _ElemT &
+ operator [] (size_t i)
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Gives access to a specified vector element (const).
+ */
+ inline _ElemT
+ operator () (size_t i) const
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Gives access to a specified vector element (non-const).
+ */
+ inline _ElemT &
+ operator () (size_t i)
+ {
+#ifdef PARANOID
+ assert(i<mDim);
+#endif
+ return *(mpData + i);
+ }
+
+ /**
+ * @brief Returns a vector sub-range
+ * @param o Origin
+ * @param l Length
+ * See @c SubVector class for details
+ */
+ SubVector<_ElemT>
+ Range(const size_t o, const size_t l)
+ { return SubVector<_ElemT>(*this, o, l); }
+
+ /**
+ * @brief Returns a vector sub-range
+ * @param o Origin
+ * @param l Length
+ * See @c SubVector class for details
+ */
+ const SubVector<_ElemT>
+ Range(const size_t o, const size_t l) const
+ { return SubVector<_ElemT>(*this, o, l); }
+
+
+
+ //########################################################################
+ //########################################################################
+
+ /// Copy data from another vector
+ Vector<_ElemT>&
+ Copy(const Vector<_ElemT>& rV);
+
+ /// Copy data from another vector of a different type.
+ template<typename _ElemU> Vector<_ElemT>&
+ Copy(const Vector<_ElemU>& rV);
+
+
+ /// Load data into the vector
+ Vector<_ElemT>&
+ Copy(const _ElemT* ppData);
+
+ Vector<_ElemT>&
+ CopyVectorizedMatrixRows(const Matrix<_ElemT> &rM);
+
+ Vector<_ElemT>&
+ RemoveElement(size_t i);
+
+ Vector<_ElemT>&
+ ApplyLog();
+
+ Vector<_ElemT>&
+ ApplyLog(const Vector<_ElemT>& rV);//ApplyLog to rV and put the result in (*this)
+
+ Vector<_ElemT>&
+ ApplyExp();
+
+ Vector<_ElemT>&
+ ApplySoftMax();
+
+ Vector<_ElemT>&
+ Invert();
+
+ Vector<_ElemT>&
+ DotMul(const Vector<_ElemT>& rV); // Multiplies each element (*this)(i) by rV(i).
+
+ Vector<_ElemT>&
+ BlasAxpy(const _ElemT alpha, const Vector<_ElemT>& rV);
+
+ Vector<_ElemT>&
+ BlasGemv(const _ElemT alpha, const Matrix<_ElemT>& rM, const MatrixTrasposeType trans, const Vector<_ElemT>& rV, const _ElemT beta = 0.0);
+
+
+ //########################################################################
+ //########################################################################
+
+ Vector<_ElemT>&
+ Add(const Vector<_ElemT>& rV)
+ { return BlasAxpy(1.0, rV); }
+
+ Vector<_ElemT>&
+ Subtract(const Vector<_ElemT>& rV)
+ { return BlasAxpy(-1.0, rV); }
+
+ Vector<_ElemT>&
+ AddScaled(_ElemT alpha, const Vector<_ElemT>& rV)
+ { return BlasAxpy(alpha, rV); }
+
+ Vector<_ElemT>&
+ Add(_ElemT c);
+
+ Vector<_ElemT>&
+ MultiplyElements(const Vector<_ElemT>& rV);
+
+ /// @brief Element-wise: alpha * rV .* rR + beta * (*this) --> (*this)
+ Vector<_ElemT>&
+ MultiplyElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR,_ElemT beta);
+
+ Vector<_ElemT>&
+ DivideElements(const Vector<_ElemT>& rV);
+
+ /// @brief Element-wise: alpha * rV ./ rR + beta * (*this) --> (*this)
+ Vector<_ElemT>&
+ DivideElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR,_ElemT beta);
+
+ Vector<_ElemT>&
+ Subtract(_ElemT c);
+
+ Vector<_ElemT>&
+ Scale(_ElemT c);
+
+
+ //########################################################################
+ //########################################################################
+
+ /// Performs a row stack of the matrix rMa
+ Vector<_ElemT>&
+ MatrixRowStack(const Matrix<_ElemT>& rMa);
+
+ // Extracts a row of the matrix rMa. .. could also do this with vector.Copy(rMa[row]).
+ Vector<_ElemT>&
+ Row(const Matrix<_ElemT>& rMa, size_t row);
+
+ // Extracts a column of the matrix rMa.
+ Vector<_ElemT>&
+ Col(const Matrix<_ElemT>& rMa, size_t col);
+
+ // Takes all elements to a power.
+ Vector<_ElemT>&
+ Power(_ElemT power);
+
+ _ElemT
+ Max() const;
+
+ _ElemT
+ Min() const;
+
+ /// Returns sum of the elements
+ _ElemT
+ Sum() const;
+
+ /// Accumulates the row sums of rM into this vector
+ Vector<_ElemT>&
+ AddRowSum(const Matrix<_ElemT>& rM);
+
+ /// Accumulates the column sums of rM into this vector
+ Vector<_ElemT>&
+ AddColSum(const Matrix<_ElemT>& rM);
+
+ /// Returns log(sum(exp())) without exp overflow
+ _ElemT
+ LogSumExp() const;
+
+ //########################################################################
+ //########################################################################
+
+ friend std::ostream &
+ operator << <> (std::ostream& rOut, const Vector<_ElemT>& rV);
+
+ friend _ElemT
+ BlasDot<>(const Vector<_ElemT>& rA, const Vector<_ElemT>& rB);
+
+ /**
+ * Computes v1^T * M * v2.
+ * Not as efficient as it could be where v1==v2 (but no suitable blas
+ * routines available).
+ */
+ _ElemT
+ InnerProduct(const Vector<_ElemT> &v1, const Matrix<_ElemT> &M, const Vector<_ElemT> &v2) const;
+
+
+ //##########################################################################
+ //##########################################################################
+ //protected:
+ public:
+ /// data memory area
+ _ElemT* mpData;
+#ifdef STK_MEMALIGN_MANUAL
+ /// data to be freed (in case of manual memalignment use, see common.h)
+ _ElemT* mpFreeData;
+#endif
+ size_t mDim; ///< Number of elements
+ }; // class Vector
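+
+ // Minimal usage sketch of the Vector interface declared above (illustrative only):
+ //
+ // TNet::Vector<float> v(10); // allocated and zero-initialized
+ // TNet::Vector<float> w(10);
+ // v.Set(1.0f);
+ // w.Set(2.0f);
+ // v.BlasAxpy(0.5f, w); // v := v + 0.5 * w
+ // float s = v.Sum(); // 20.0 in this sketch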
+
+
+
+
+ /**
+ * @brief Represents a non-allocating general vector which can be defined
+ * as a sub-vector of a higher-level vector
+ */
+ template<typename _ElemT>
+ class SubVector : public Vector<_ElemT>
+ {
+ protected:
+ /// Constructor
+ SubVector(const Vector<_ElemT>& rT,
+ const size_t origin,
+ const size_t length)
+ {
+ assert(origin+length <= rT.mDim);
+ Vector<_ElemT>::mpData = rT.mpData+origin;
+ Vector<_ElemT>::mDim = length;
+ }
+ //only Vector class can call this protected constructor
+ friend class Vector<_ElemT>;
+
+ public:
+ /// Constructor
+ SubVector(Vector<_ElemT>& rT,
+ const size_t origin,
+ const size_t length)
+ {
+ assert(origin+length <= rT.mDim);
+ Vector<_ElemT>::mpData = rT.mpData+origin;
+ Vector<_ElemT>::mDim = length;
+ }
+
+
+ /**
+ * @brief Constructs a vector representation out of a standard array
+ *
+ * @param pData pointer to data array to associate with this vector
+ * @param length length of this vector
+ */
+ inline
+ SubVector(_ElemT *ppData,
+ size_t length)
+ {
+ Vector<_ElemT>::mpData = ppData;
+ Vector<_ElemT>::mDim = length;
+ }
+
+
+ /**
+ * @brief Destructor
+ */
+ ~SubVector()
+ {
+ Vector<_ElemT>::mpData = NULL;
+ }
+ };
+
+
+ // Useful shortcuts
+ typedef Vector<BaseFloat> BfVector;
+ typedef SubVector<BaseFloat> BfSubVector;
+
+ //Adding two vectors of different types
+ template <typename _ElemT, typename _ElemU>
+ void Add(Vector<_ElemT>& rDst, const Vector<_ElemU>& rSrc)
+ {
+ assert(rDst.Dim() == rSrc.Dim());
+ const _ElemU* p_src = rSrc.pData();
+ _ElemT* p_dst = rDst.pData();
+
+ for(size_t i=0; i<rSrc.Dim(); i++) {
+ *p_dst++ += (_ElemT)*p_src++;
+ }
+ }
+
+
+ // Scaled addition of two vectors of different types
+ template <typename _ElemT, typename _ElemU>
+ void AddScaled(Vector<_ElemT>& rDst, const Vector<_ElemU>& rSrc, _ElemT scale)
+ {
+ assert(rDst.Dim() == rSrc.Dim());
+
+ Vector<_ElemT> tmp(rSrc);
+ rDst.BlasAxpy(scale, tmp);
+
+/*
+ const _ElemU* p_src = rSrc.pData();
+ _ElemT* p_dst = rDst.pData();
+
+ for(size_t i=0; i<rDst.Dim(); i++) {
+ *p_dst++ += *p_src++ * scale;
+ }
+*/
+ }
+
+
+} // namespace TNet
+
+//*****************************************************************************
+//*****************************************************************************
+// we need to include the implementation
+#include "Vector.tcc"
+
+/******************************************************************************
+ ******************************************************************************
+ * The following section contains specialized template definitions
+ * whose implementation is in Vector.cc
+ */
+
+
+#endif // #ifndef TNet_Vector_h
diff --git a/src/KaldiLib/Vector.tcc b/src/KaldiLib/Vector.tcc
new file mode 100644
index 0000000..751ffa7
--- /dev/null
+++ b/src/KaldiLib/Vector.tcc
@@ -0,0 +1,638 @@
+/** @file Vector.tcc
+ * This is an internal header file, included by other library headers.
+ * You should not attempt to use it directly.
+ */
+
+#ifndef TNet_Vector_tcc
+#define TNet_Vector_tcc
+
+#include <cstdlib>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include "Common.h"
+
+#ifdef HAVE_ATLAS
+extern "C"{
+ #include <cblas.h>
+}
+#endif
+
+#include "Common.h"
+#include "MathAux.h"
+#include "Matrix.h"
+
+namespace TNet
+{
+ //******************************************************************************
+ //******************************************************************************
+ template<typename _ElemT>
+ inline Vector<_ElemT>&
+ Vector<_ElemT>::
+ Init(const size_t length, bool clear)
+ {
+ if(mpData != NULL) Destroy();
+ if(length==0){
+ mpData=NULL;
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData=NULL;
+#endif
+ mDim=0;
+ return *this;
+ }
+ size_t size;
+ void* data;
+ void* free_data;
+
+ size = align<16>(length * sizeof(_ElemT));
+
+ if (NULL != (data = stk_memalign(16, size, &free_data))) {
+ mpData = static_cast<_ElemT*> (data);
+#ifdef STK_MEMALIGN_MANUAL
+ mpFreeData = static_cast<_ElemT*> (free_data);
+#endif
+ mDim = length;
+ } else {
+ throw std::bad_alloc();
+ }
+ if(clear) Zero();
+ return *this;
+ }
+
+
+ //******************************************************************************
+ //******************************************************************************
+ /// Copy data from another vector
+ template<typename _ElemT>
+ inline Vector<_ElemT>&
+ Vector<_ElemT>::
+ Copy(const Vector<_ElemT>& rV) {
+ assert(Dim() == rV.Dim());
+ Copy(rV.mpData);
+ return *this;
+ }
+
+ /// Load data into the vector
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Copy(const _ElemT* ppData) {
+ std::memcpy(this->mpData, ppData, Dim() * sizeof(_ElemT));
+ return *this;
+ }
+
+ template<typename _ElemT>
+ template<typename _ElemU>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Copy(const Vector<_ElemU> &other){
+ assert(Dim()==other.Dim());
+ size_t D=Dim();
+ for(size_t d=0;d<D;d++) (*this)(d) = (_ElemT) other[d];
+ return *this;
+ }
+
+
+ //******************************************************************************
+ //******************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ CopyVectorizedMatrixRows(const Matrix<_ElemT> &rM) {
+// TraceLog("Dim = "+to_string(Dim())+", Rows = "+to_string(rM.Rows())+", Cols = "+to_string(rM.Cols()));
+ assert(Dim() == rM.Cols()*rM.Rows());
+ size_t nCols = rM.Cols();
+ for(size_t r=0; r<rM.Rows(); r++)
+ Range(r*nCols, nCols).Copy(rM[r]);
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ // Removes an element from the vector. The storage is not reallocated.
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ RemoveElement(size_t i) {
+ assert(i < mDim && "Access out of vector");
+ for(size_t j = i + 1; j < mDim; j++)
+ this->mpData[j - 1] = this->mpData[j];
+ mDim--;
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ // Deallocates the data block (used by the destructor)
+ template<typename _ElemT>
+ inline void
+ Vector<_ElemT>::
+ Destroy()
+ {
+ // we need to free the data block if it was defined
+#ifndef STK_MEMALIGN_MANUAL
+ if (NULL != mpData) free(mpData);
+#else
+ if (NULL != mpData) free(mpFreeData);
+ mpFreeData = NULL;
+#endif
+
+ mpData = NULL;
+ mDim = 0;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ inline void
+ Vector<_ElemT>::
+ Zero()
+ {
+ std::memset(mpData, 0, mDim * sizeof(_ElemT));
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ inline void
+ Vector<_ElemT>::
+ Set(_ElemT f)
+ {
+ for(size_t i=0;i<mDim;i++) mpData[i] = f;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ MatrixRowStack(const Matrix<_ElemT>& rMa)
+ {
+ assert(mDim == rMa.Cols() * rMa.Rows());
+
+ _ElemT* inc_data = mpData;
+ const size_t cols = rMa.Cols();
+
+ for (size_t i = 0; i < rMa.Rows(); i++)
+ {
+ // copy the data to the proper position
+ memcpy(inc_data, rMa[i], cols * sizeof(_ElemT));
+
+ // set new copy position
+ inc_data += cols;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Row(const Matrix<_ElemT> &rMa, size_t row)
+ {
+ assert(row < rMa.Rows());
+ const _ElemT *mRow = rMa.pRowData(row);
+ // if(mDim != rMa.Cols()) Init(rMa.Cols()); // automatically resize.
+ memcpy(mpData, mRow, sizeof(_ElemT)*mDim);
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Power(_ElemT power) // raises each element to the given power; throws if the result overflows.
+ {
+ for(size_t i=0;i<Dim();i++){
+ _ElemT tmp = (*this)(i);
+ (*this)(i) = pow(tmp, power);
+ if((*this)(i) == HUGE_VAL)
+ throw std::runtime_error((std::string)"Error in Vector::Power, could not take " +to_string(tmp)+ " to power " +to_string(power));
+ }
+ return (*this);
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ Max() const
+ {
+ if(Dim()==0) throw std::runtime_error("Error in Vector::Max(), empty vector\n");
+ _ElemT ans = (*this)(0);
+ for(size_t i=1;i<Dim();i++) ans = std::max(ans, (*this)(i));
+ return ans;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ Min() const
+ {
+ if(Dim()==0) throw std::runtime_error("Error in Vector::Min(), empty vector\n");
+ _ElemT ans = (*this)(0);
+ for(size_t i=1;i<Dim();i++) ans = std::min(ans, (*this)(i));
+ return ans;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Col(const Matrix<_ElemT> &rMa, size_t col)
+ {
+ assert(col < rMa.Cols());
+ // if(mDim != rMa.Cols()) Init(rMa.Cols()); // automatically resize.
+ for(size_t i=0;i<mDim;i++)
+ mpData[i] = rMa(i,col); // can't do this efficiently so don't really bother.
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ Sum() const
+ {
+ //note the double accumulator
+ double sum = 0.0;
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum += mpData[i];
+ }
+ return (_ElemT)sum;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ AddColSum(const Matrix<_ElemT>& rM)
+ {
+ // note the double accumulator
+ double sum;
+
+ assert(mDim == rM.Cols());
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum = 0.0;
+ for (size_t j = 0; j < rM.Rows(); ++j) {
+ sum += rM[j][i];
+ }
+ mpData[i] += sum;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ AddRowSum(const Matrix<_ElemT>& rM)
+ {
+ // note the double accumulator
+ double sum;
+
+ assert(mDim == rM.Rows());
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum = 0.0;
+ for (size_t j = 0; j < rM.Cols(); ++j) {
+ sum += rM[i][j];
+ }
+ mpData[i] += sum;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ _ElemT
+ Vector<_ElemT>::
+ LogSumExp() const
+ {
+ double sum = LOG_0;
+
+ for (size_t i = 0; i < mDim; ++i) {
+ sum = LogAdd(sum, mpData[i]);
+ }
+ return sum;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Invert() {
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = static_cast<_ElemT>(1 / mpData[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplyLog() {
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = _LOG(mpData[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplyLog(const Vector<_ElemT>& rV) {
+ assert(mDim==rV.Dim());
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = log(rV[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplyExp() {
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = _EXP(mpData[i]);
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ ApplySoftMax() {
+ _ElemT lse = LogSumExp();
+
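+ // subtracting the log-sum-exp before exponentiating keeps exp() in range,
+ // i.e. a numerically stable softmax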
+ for (size_t i = 0; i < mDim; ++i) {
+ mpData[i] = exp(mpData[i] - lse);
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Add(_ElemT c)
+ {
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] += c;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Subtract(_ElemT c)
+ {
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] -= c;
+ }
+ return *this;
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ Scale(_ElemT c)
+ {
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] *= c;
+ }
+ return *this;
+ }
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ MultiplyElements(const Vector<_ElemT>& rV)
+ {
+ assert(mDim == rV.Dim());
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] *= rV[i];
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ MultiplyElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR, _ElemT beta)
+ {
+ assert((mDim == rV.Dim() && mDim == rR.Dim()));
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] = alpha * rV[i] * rR[i] + beta * mpData[i];
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ DivideElements(const Vector<_ElemT>& rV)
+ {
+ assert(mDim == rV.Dim());
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] /= rV[i];
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ Vector<_ElemT>::
+ DivideElements(_ElemT alpha, const Vector<_ElemT>& rV, const Vector<_ElemT>& rR, _ElemT beta)
+ {
+ assert((mDim == rV.Dim() && mDim == rR.Dim()));
+ for(size_t i = 0; i < mDim; i++) {
+ mpData[i] = alpha * rV[i]/rR[i] + beta * mpData[i] ;
+ }
+ return *this;
+ }
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Load(std::istream& rIn, Vector<_ElemT>& rV)
+ {
+ std::streamoff pos = rIn.tellg();
+ if(MatrixVectorIostreamControl::Flags(rIn, ACCUMULATE_INPUT)) {
+ for (size_t i = 0; i < rV.Dim(); i++) {
+ _ElemT tmp;
+ rIn >> tmp;
+ rV[i] += tmp;
+ }
+ } else {
+ for (size_t i = 0; i < rV.Dim(); i++) {
+ rIn >> rV[i];
+ }
+ }
+ if(rIn.fail()) {
+ throw std::runtime_error("Failed to read vector from stream. File position is "+to_string(pos));
+ }
+ }
+
+ template<typename _ElemT>
+ std::istream &
+ operator >> (std::istream& rIn, Vector<_ElemT>& rV)
+ {
+ rIn >> std::ws;
+ if(rIn.peek() == 'v'){ // "new" format: v <dim> 1.0 0.2 4.3 ...
+ rIn.get();
+ long long int tmp=-1;
+ rIn >> tmp;
+ if(rIn.fail() || tmp<0) {
+ throw std::runtime_error("Failed to read vector from stream: no size");
+ }
+ size_t tmp2 = size_t(tmp);
+ assert((long long int)tmp2 == tmp);
+
+ if(rV.Dim() != tmp2) rV.Init(tmp2);
+ }
+ Load(rIn,rV);
+ return rIn;
+ }
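+
+ // Example of the accepted text form handled above (hypothetical values):
+ //   std::istringstream is("v 3 1.0 0.2 4.3");
+ //   Vector<float> v;
+ //   is >> v;   // resizes v to 3 and reads the three values via Load()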
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ void Save (std::ostream& rOut, const Vector<_ElemT>& rV)
+ {
+
+ for (size_t i = 0; i < rV.Dim(); i++) {
+ rOut << rV[i] << ' ';
+ }
+ if(rOut.fail()) {
+ throw std::runtime_error("Failed to write vector to stream");
+ }
+ }
+
+
+ //****************************************************************************
+ //****************************************************************************
+ template<typename _ElemT>
+ std::ostream &
+ operator << (std::ostream& rOut, const Vector<_ElemT>& rV)
+ {
+ rOut << "v " << rV.Dim() << " ";
+ Save(rOut,rV);
+ return rOut;
+ }
+
+
+
+ //****************************************************************************
+ //****************************************************************************
+
+#ifdef HAVE_ATLAS
+ template<>
+ float
+ BlasDot<>(const Vector<float>& rA, const Vector<float>& rB);
+
+ template<>
+ double
+ BlasDot<>(const Vector<double>& rA, const Vector<double>& rB);
+
+ template<typename _ElemT>
+ inline Vector<_ElemT>&
+ Vector<_ElemT>::
+ DotMul(const Vector<_ElemT> &rV){
+ assert(mDim == rV.mDim);
+ const _ElemT *other_data = rV.pData();
+ _ElemT *my_data = mpData, *my_data_end = my_data+mDim;
+ for(;my_data<my_data_end;) *(my_data++) *= *(other_data++);
+ return *this;
+ }
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasAxpy(const float alpha, const Vector<float>& rV);
+
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasAxpy(const double alpha, const Vector<double>& rV);
+
+
+ template<>
+ Vector<float>&
+ Vector<float>::
+ BlasGemv(const float alpha, const Matrix<float>& rM, MatrixTrasposeType trans, const Vector<float>& rV, const float beta);
+
+ template<>
+ Vector<double>&
+ Vector<double>::
+ BlasGemv(const double alpha, const Matrix<double>& rM, MatrixTrasposeType trans, const Vector<double>& rV, const double beta);
+
+#else
+ #error Routines in this section are not implemented yet without BLAS
+#endif
+
+
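+ // InnerProduct(v1, M, v2) evaluates the bilinear form v1' * M * v2:
+ // vtmp = M * v2 via BlasGemv, then the result is BlasDot(v1, vtmp).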
+ template<class _ElemT>
+ _ElemT
+ InnerProduct(const Vector<_ElemT> &v1, const Matrix<_ElemT> &M, const Vector<_ElemT> &v2){
+ assert(v1.Dim()==M.Rows() && v2.Dim()==M.Cols());
+ Vector<_ElemT> vtmp(M.Rows());
+ vtmp.BlasGemv(1.0, M, NO_TRANS, v2, 0.0);
+ return BlasDot(v1, vtmp);
+ }
+
+
+} // namespace TNet
+
+
+#endif // TNet_Vector_tcc
diff --git a/src/KaldiLib/cblas.h b/src/KaldiLib/cblas.h
new file mode 100644
index 0000000..4087ffb
--- /dev/null
+++ b/src/KaldiLib/cblas.h
@@ -0,0 +1,596 @@
+#ifndef CBLAS_H
+
+#ifndef CBLAS_ENUM_DEFINED_H
+ #define CBLAS_ENUM_DEFINED_H
+ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
+ enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113,
+ AtlasConj=114};
+ enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
+ enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
+ enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
+#endif
+
+#ifndef CBLAS_ENUM_ONLY
+#define CBLAS_H
+#define CBLAS_INDEX int
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS functions (complex are recast as routines)
+ * ===========================================================================
+ */
+float cblas_sdsdot(const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY);
+double cblas_dsdot(const int N, const float *X, const int incX, const float *Y,
+ const int incY);
+float cblas_sdot(const int N, const float *X, const int incX,
+ const float *Y, const int incY);
+double cblas_ddot(const int N, const double *X, const int incX,
+ const double *Y, const int incY);
+/*
+ * Functions having prefixes Z and C only
+ */
+void cblas_cdotu_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotu);
+void cblas_cdotc_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotc);
+
+void cblas_zdotu_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotu);
+void cblas_zdotc_sub(const int N, const void *X, const int incX,
+ const void *Y, const int incY, void *dotc);
+
+
+/*
+ * Functions having prefixes S D SC DZ
+ */
+float cblas_snrm2(const int N, const float *X, const int incX);
+float cblas_sasum(const int N, const float *X, const int incX);
+
+double cblas_dnrm2(const int N, const double *X, const int incX);
+double cblas_dasum(const int N, const double *X, const int incX);
+
+float cblas_scnrm2(const int N, const void *X, const int incX);
+float cblas_scasum(const int N, const void *X, const int incX);
+
+double cblas_dznrm2(const int N, const void *X, const int incX);
+double cblas_dzasum(const int N, const void *X, const int incX);
+
+
+/*
+ * Functions having standard 4 prefixes (S D C Z)
+ */
+CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX);
+CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
+CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX);
+CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 1 BLAS routines
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (s, d, c, z)
+ */
+void cblas_sswap(const int N, float *X, const int incX,
+ float *Y, const int incY);
+void cblas_scopy(const int N, const float *X, const int incX,
+ float *Y, const int incY);
+void cblas_saxpy(const int N, const float alpha, const float *X,
+ const int incX, float *Y, const int incY);
+void catlas_saxpby(const int N, const float alpha, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void catlas_sset
+ (const int N, const float alpha, float *X, const int incX);
+
+void cblas_dswap(const int N, double *X, const int incX,
+ double *Y, const int incY);
+void cblas_dcopy(const int N, const double *X, const int incX,
+ double *Y, const int incY);
+void cblas_daxpy(const int N, const double alpha, const double *X,
+ const int incX, double *Y, const int incY);
+void catlas_daxpby(const int N, const double alpha, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void catlas_dset
+ (const int N, const double alpha, double *X, const int incX);
+
+void cblas_cswap(const int N, void *X, const int incX,
+ void *Y, const int incY);
+void cblas_ccopy(const int N, const void *X, const int incX,
+ void *Y, const int incY);
+void cblas_caxpy(const int N, const void *alpha, const void *X,
+ const int incX, void *Y, const int incY);
+void catlas_caxpby(const int N, const void *alpha, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void catlas_cset
+ (const int N, const void *alpha, void *X, const int incX);
+
+void cblas_zswap(const int N, void *X, const int incX,
+ void *Y, const int incY);
+void cblas_zcopy(const int N, const void *X, const int incX,
+ void *Y, const int incY);
+void cblas_zaxpy(const int N, const void *alpha, const void *X,
+ const int incX, void *Y, const int incY);
+void catlas_zaxpby(const int N, const void *alpha, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void catlas_zset
+ (const int N, const void *alpha, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefix only
+ */
+void cblas_srotg(float *a, float *b, float *c, float *s);
+void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
+void cblas_srot(const int N, float *X, const int incX,
+ float *Y, const int incY, const float c, const float s);
+void cblas_srotm(const int N, float *X, const int incX,
+ float *Y, const int incY, const float *P);
+
+void cblas_drotg(double *a, double *b, double *c, double *s);
+void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+void cblas_drot(const int N, double *X, const int incX,
+ double *Y, const int incY, const double c, const double s);
+void cblas_drotm(const int N, double *X, const int incX,
+ double *Y, const int incY, const double *P);
+
+
+/*
+ * Routines with S D C Z CS and ZD prefixes
+ */
+void cblas_sscal(const int N, const float alpha, float *X, const int incX);
+void cblas_dscal(const int N, const double alpha, double *X, const int incX);
+void cblas_cscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_zscal(const int N, const void *alpha, void *X, const int incX);
+void cblas_csscal(const int N, const float alpha, void *X, const int incX);
+void cblas_zdscal(const int N, const double alpha, void *X, const int incX);
+
+/*
+ * Extra reference routines provided by ATLAS, but not mandated by the standard
+ */
+void cblas_crotg(void *a, void *b, void *c, void *s);
+void cblas_zrotg(void *a, void *b, void *c, void *s);
+void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY,
+ const float c, const float s);
+void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY,
+ const double c, const double s);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 2 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *X, const int incX, const float beta,
+ float *Y, const int incY);
+void cblas_sgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const float alpha,
+ const float *A, const int lda, const float *X,
+ const int incX, const float beta, float *Y, const int incY);
+void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *Ap, float *X, const int incX);
+void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *A, const int lda, float *X,
+ const int incX);
+void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const float *A, const int lda,
+ float *X, const int incX);
+void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const float *Ap, float *X, const int incX);
+
+void cblas_dgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *X, const int incX, const double beta,
+ double *Y, const int incY);
+void cblas_dgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const double alpha,
+ const double *A, const int lda, const double *X,
+ const int incX, const double beta, double *Y, const int incY);
+void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *Ap, double *X, const int incX);
+void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *A, const int lda, double *X,
+ const int incX);
+void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const double *A, const int lda,
+ double *X, const int incX);
+void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const double *Ap, double *X, const int incX);
+
+void cblas_cgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *X, const int incX, const void *beta,
+ void *Y, const int incY);
+void cblas_cgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const void *alpha,
+ const void *A, const int lda, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda, void *X,
+ const int incX);
+void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+
+void cblas_zgemv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *X, const int incX, const void *beta,
+ void *Y, const int incY);
+void cblas_zgbmv(const enum CBLAS_ORDER Order,
+ const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+ const int KL, const int KU, const void *alpha,
+ const void *A, const int lda, const void *X,
+ const int incX, const void *beta, void *Y, const int incY);
+void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *A, const int lda, void *X,
+ const int incX);
+void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const int K, const void *A, const int lda,
+ void *X, const int incX);
+void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const int N, const void *Ap, void *X, const int incX);
+
+
+/*
+ * Routines with S and D prefixes only
+ */
+void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *A,
+ const int lda, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const float alpha, const float *A,
+ const int lda, const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *Ap,
+ const float *X, const int incX,
+ const float beta, float *Y, const int incY);
+void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N,
+ const float alpha, const float *X, const int incX,
+ const float *Y, const int incY, float *A, const int lda);
+void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, float *A, const int lda);
+void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, float *Ap);
+void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A,
+ const int lda);
+void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const float *X,
+ const int incX, const float *Y, const int incY, float *A);
+
+void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *A,
+ const int lda, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const double alpha, const double *A,
+ const int lda, const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *Ap,
+ const double *X, const int incX,
+ const double beta, double *Y, const int incY);
+void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
+ const double alpha, const double *X, const int incX,
+ const double *Y, const int incY, double *A, const int lda);
+void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, double *A, const int lda);
+void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, double *Ap);
+void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A,
+ const int lda);
+void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const double *X,
+ const int incX, const double *Y, const int incY, double *A);
+
+
+/*
+ * Routines with C and Z prefixes only
+ */
+void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *Ap,
+ const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const void *X, const int incX,
+ void *A, const int lda);
+void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const float alpha, const void *X,
+ const int incX, void *A);
+void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *Ap);
+
+void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int K, const void *alpha, const void *A,
+ const int lda, const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const void *alpha, const void *Ap,
+ const void *X, const int incX,
+ const void *beta, void *Y, const int incY);
+void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const void *X, const int incX,
+ void *A, const int lda);
+void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const double alpha, const void *X,
+ const int incX, void *A);
+void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *A, const int lda);
+void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N,
+ const void *alpha, const void *X, const int incX,
+ const void *Y, const int incY, void *Ap);
+
+/*
+ * ===========================================================================
+ * Prototypes for level 3 BLAS
+ * ===========================================================================
+ */
+
+/*
+ * Routines with standard 4 prefixes (S, D, C, Z)
+ */
+void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const float alpha, const float *A,
+ const int lda, const float *B, const int ldb,
+ const float beta, float *C, const int ldc);
+void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb, const float beta,
+ float *C, const int ldc);
+void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float beta, float *C, const int ldc);
+void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const float *A, const int lda,
+ const float *B, const int ldb, const float beta,
+ float *C, const int ldc);
+void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ float *B, const int ldb);
+void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const float alpha, const float *A, const int lda,
+ float *B, const int ldb);
+
+void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const double alpha, const double *A,
+ const int lda, const double *B, const int ldb,
+ const double beta, double *C, const int ldc);
+void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ const double *B, const int ldb, const double beta,
+ double *C, const int ldc);
+void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double beta, double *C, const int ldc);
+void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const double *A, const int lda,
+ const double *B, const int ldb, const double beta,
+ double *C, const int ldc);
+void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ double *B, const int ldb);
+void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const double alpha, const double *A, const int lda,
+ double *B, const int ldb);
+
+void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const void *alpha, const void *A,
+ const int lda, const void *B, const int ldb,
+ const void *beta, void *C, const int ldc);
+void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *beta, void *C, const int ldc);
+void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+
+void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+ const int K, const void *alpha, const void *A,
+ const int lda, const void *B, const int ldb,
+ const void *beta, void *C, const int ldc);
+void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *beta, void *C, const int ldc);
+void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ void *B, const int ldb);
+
+
+/*
+ * Routines with prefixes C and Z only
+ */
+void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const float alpha, const void *A, const int lda,
+ const float beta, void *C, const int ldc);
+void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const float beta,
+ void *C, const int ldc);
+void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
+ const enum CBLAS_UPLO Uplo, const int M, const int N,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const void *beta,
+ void *C, const int ldc);
+void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const double alpha, const void *A, const int lda,
+ const double beta, void *C, const int ldc);
+void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
+ const void *alpha, const void *A, const int lda,
+ const void *B, const int ldb, const double beta,
+ void *C, const int ldc);
+
+int cblas_errprn(int ierr, int info, char *form, ...);
+
+#endif /* end #ifdef CBLAS_ENUM_ONLY */
+#endif
diff --git a/src/KaldiLib/clapack.cc b/src/KaldiLib/clapack.cc
new file mode 100644
index 0000000..a486bef
--- /dev/null
+++ b/src/KaldiLib/clapack.cc
@@ -0,0 +1,61 @@
+
+extern "C" {
+
+ /**
+ * Wrapper to GotoBLAS lapack for STK and TNet (sgetrf sgetri dgetrf dgetri)
+ */
+ typedef float real;
+ typedef double doublereal;
+ typedef int integer;
+
+
+ /**
+ * The lapack interface (used in gotoblas)
+ */
+ /* Subroutine */ int sgetrf_(integer *m, integer *n, real *a, integer *lda,
+ integer *ipiv, integer *info);
+ /* Subroutine */ int sgetri_(integer *n, real *a, integer *lda, integer *ipiv,
+ real *work, integer *lwork, integer *info);
+ /* Subroutine */ int dgetrf_(integer *m, integer *n, doublereal *a, integer *
+ lda, integer *ipiv, integer *info);
+ /* Subroutine */ int dgetri_(integer *n, doublereal *a, integer *lda, integer
+ *ipiv, doublereal *work, integer *lwork, integer *info);
+
+
+
+
+
+ /**
+ * The clapack interface as used by ATLAS (used in STK and TNet)
+ */
+ enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 };
+
+ int clapack_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv)
+ {
+ integer info = 0; // LAPACK writes its status here; a NULL info pointer would crash
+ sgetrf_((int*)&M, (int*)&N, A, (int*)&lda, (int*)ipiv, &info);
+ return info;
+ }
+
+
+ int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A,
+ const int lda, const int *ipiv)
+ {
+ integer info = 0, lwork = (N > 1 ? N : 1); // sgetri_ requires a workspace of >= N
+ real *work = new real[lwork];
+ sgetri_((int*)&N, A, (int*)&lda, (int*)ipiv, work, &lwork, &info);
+ delete [] work;
+ return info;
+ }
+
+
+ int clapack_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv)
+ {
+ integer info = 0;
+ dgetrf_((int*)&M, (int*)&N, A, (int*)&lda, (int*)ipiv, &info);
+ return info;
+ }
+
+
+ int clapack_dgetri(const enum CBLAS_ORDER Order, const int N, double *A,
+ const int lda, const int *ipiv)
+ {
+ integer info = 0, lwork = (N > 1 ? N : 1); // dgetri_ requires a workspace of >= N
+ doublereal *work = new doublereal[lwork];
+ dgetri_((int*)&N, A, (int*)&lda, (int*)ipiv, work, &lwork, &info);
+ delete [] work;
+ return info;
+ }
+
+
+}
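+
+// Minimal usage sketch (hypothetical buffers). Note that the Order argument is
+// ignored by these thin wrappers; the data is handed to the column-major
+// Fortran routines as-is:
+//   int ipiv[16];
+//   clapack_sgetrf(CblasColMajor, 16, 16, A, 16, ipiv);  // LU factorization of A
+//   clapack_sgetri(CblasColMajor, 16, A, 16, ipiv);      // in-place inverse of A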
diff --git a/src/KaldiLib/clapack.h b/src/KaldiLib/clapack.h
new file mode 100644
index 0000000..0c6855d
--- /dev/null
+++ b/src/KaldiLib/clapack.h
@@ -0,0 +1,149 @@
+/*
+ * Automatically Tuned Linear Algebra Software v3.8.2
+ * (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions, and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the ATLAS group or the names of its contributers may
+ * not be used to endorse or promote products derived from this
+ * software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef CLAPACK_H
+
+#define CLAPACK_H
+#include "cblas.h"
+
+#ifndef ATLAS_ORDER
+ #define ATLAS_ORDER CBLAS_ORDER
+#endif
+#ifndef ATLAS_UPLO
+ #define ATLAS_UPLO CBLAS_UPLO
+#endif
+#ifndef ATLAS_DIAG
+ #define ATLAS_DIAG CBLAS_DIAG
+#endif
+int clapack_sgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ float *A, const int lda, int *ipiv,
+ float *B, const int ldb);
+int clapack_sgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ float *A, const int lda, int *ipiv);
+int clapack_sgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const float *A, const int lda,
+ const int *ipiv, float *B, const int ldb);
+int clapack_sgetri(const enum CBLAS_ORDER Order, const int N, float *A,
+ const int lda, const int *ipiv);
+int clapack_sposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, float *A, const int lda,
+ float *B, const int ldb);
+int clapack_spotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_spotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const float *A, const int lda,
+ float *B, const int ldb);
+int clapack_spotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_slauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, float *A, const int lda);
+int clapack_strtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, float *A, const int lda);
+
+int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ double *A, const int lda, int *ipiv,
+ double *B, const int ldb);
+int clapack_dgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ double *A, const int lda, int *ipiv);
+int clapack_dgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const double *A, const int lda,
+ const int *ipiv, double *B, const int ldb);
+int clapack_dgetri(const enum CBLAS_ORDER Order, const int N, double *A,
+ const int lda, const int *ipiv);
+int clapack_dposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, double *A, const int lda,
+ double *B, const int ldb);
+int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const double *A, const int lda,
+ double *B, const int ldb);
+int clapack_dpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, double *A, const int lda);
+int clapack_dtrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, double *A, const int lda);
+
+int clapack_cgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ void *A, const int lda, int *ipiv,
+ void *B, const int ldb);
+int clapack_cgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ void *A, const int lda, int *ipiv);
+int clapack_cgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const void *A, const int lda,
+ const int *ipiv, void *B, const int ldb);
+int clapack_cgetri(const enum CBLAS_ORDER Order, const int N, void *A,
+ const int lda, const int *ipiv);
+int clapack_cposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, void *A, const int lda,
+ void *B, const int ldb);
+int clapack_cpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_cpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const void *A, const int lda,
+ void *B, const int ldb);
+int clapack_cpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_clauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_ctrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, void *A, const int lda);
+
+int clapack_zgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
+ void *A, const int lda, int *ipiv,
+ void *B, const int ldb);
+int clapack_zgetrf(const enum CBLAS_ORDER Order, const int M, const int N,
+ void *A, const int lda, int *ipiv);
+int clapack_zgetrs
+ (const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE Trans,
+ const int N, const int NRHS, const void *A, const int lda,
+ const int *ipiv, void *B, const int ldb);
+int clapack_zgetri(const enum CBLAS_ORDER Order, const int N, void *A,
+ const int lda, const int *ipiv);
+int clapack_zposv(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, const int NRHS, void *A, const int lda,
+ void *B, const int ldb);
+int clapack_zpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_zpotrs(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
+ const int N, const int NRHS, const void *A, const int lda,
+ void *B, const int ldb);
+int clapack_zpotri(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_zlauum(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
+ const int N, void *A, const int lda);
+int clapack_ztrtri(const enum ATLAS_ORDER Order,const enum ATLAS_UPLO Uplo,
+ const enum ATLAS_DIAG Diag,const int N, void *A, const int lda);
+
+#endif
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..eb071fb
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,162 @@
+
+include tnet.mk
+
+##### Check that CUDA Toolkit directory was set
+ifneq ($(HAVE_CUDA), true)
+ $(warning %%% WARNING!!!)
+ $(warning %%% CUDA not found! Incorrect path in CUDA_TK_BASE: $(CUDA_TK_BASE))
+ $(warning %%% Try setting CUDA_TK_BASE in 'trunk/src/tnet.mk')
+ $(warning %%% WARNING!!!)
+else
+ #$(warning %%% INFO: Using CUDA from CUDA_TK_BASE: $(CUDA_TK_BASE))
+endif
+
+
+##### Includes
+INCLUDE := -IKaldiLib -ITNetLib -ISTKLib
+INCLUDE += -ICuBaseLib -ICuTNetLib
+INCLUDE += -I$(CUDA_TK_BASE)/include
+
+CXXFLAGS += $(INCLUDE)
+
+##### CPU implementation libs
+LDFLAGS := -LTNetLib -lTNetLib
+LDFLAGS += -LKaldiLib -lKaldiLib
+LDFLAGS += -pthread
+
+##### Link with GotoBLAS
+ifeq ($(BITS64), true)
+ LDFLAGS += -LGotoBLASLib -lgoto2_64 -lgfortran
+else
+ LDFLAGS += -LGotoBLASLib -lgoto2 -lgfortran
+endif
+LDFLAGS += -Wl,-rpath,$(PWD)/GotoBLASLib
+
+##### CUDA implementation libs
+ifeq ($(CUDA), true)
+ #TNet libs
+ LDFLAGS_CUDA := -LCuTNetLib -lCuTNet
+ LDFLAGS_CUDA += -LCuBaseLib -lCuBase
+ #CUDA toolkit libs
+ ifeq ($(BITS64), true)
+ LDFLAGS_CUDA += -L$(CUDA_TK_BASE)/lib64 -Wl,-rpath,$(CUDA_TK_BASE)/lib64
+ else
+ LDFLAGS_CUDA += -L$(CUDA_TK_BASE)/lib -Wl,-rpath,$(CUDA_TK_BASE)/lib
+ endif
+ LDFLAGS_CUDA += -lcublas -lcudart -lcuda
+endif
+
+
+##############################################################
+# Target programs
+##############################################################
+
+#CPU tools
+BINS := TNet TNorm TFeaCat TSegmenter TJoiner
+all : $(BINS)
+$(BINS): lib
+
+#GPU tools
+CUBINS := TNetCu TNormCu TFeaCatCu TRbmCu TRecurrentCu
+ifeq ($(STK), true)
+ CUBINS += TMpeCu TMmiCu
+endif
+ifeq ($(CUDA), true)
+cubins : $(CUBINS)
+##HINT: Link CUDA libs only with CUDA tools!!!##
+##(recursive target-specific variable value)##
+cubins : LDFLAGS += $(LDFLAGS_CUDA)
+##
+all : cubins
+$(CUBINS): lib culib
+endif
+
+
+##############################################################
+# program compiling implicit rule
+##############################################################
+% : %.o
+ $(CXX) -o $@ $< $(CXXFLAGS) $(INCLUDE) $(LDFLAGS)
+
+
+##############################################################
+# module compiling implicit rule
+##############################################################
+%.o : %.cc lib
+ $(CXX) -o $@ -c $< $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+##############################################################
+# STK specific rules
+##############################################################
+#TMpeCu depends on STK
+TMpeCu.o: stklib
+TMpeCu: LDFLAGS := -LSTKLib -lSTKLib $(LDFLAGS) $(LDFLAGS_CUDA)
+#TMmiCu depends on STK
+TMmiCu.o: stklib
+TMmiCu: LDFLAGS := -LSTKLib -lSTKLib $(LDFLAGS) $(LDFLAGS_CUDA)
+
+
+##############################################################
+# Source files for CPU/GPU tools
+##############################################################
+CC_BINS=$(addsuffix .cc, $(BINS))
+CC_CUBINS=$(addsuffix .cc, $(CUBINS))
+
+O_BINS=$(addsuffix .o, $(BINS))
+O_CUBINS=$(addsuffix .o, $(CUBINS))
+
+$(O_BINS) : $(CC_BINS)
+$(O_CUBINS) : $(CC_CUBINS)
+
+$(BINS) : $(O_BINS)
+$(CUBINS) : $(O_CUBINS)
+
+##############################################################
+.PHONY: lib culib stklib clean doc depend
+
+lib:
+ @cd KaldiLib && make $(FWDPARAM)
+ @cd TNetLib && make $(FWDPARAM)
+
+culib:
+ @cd CuBaseLib && make $(FWDPARAM)
+ @cd CuTNetLib && make $(FWDPARAM)
+
+stklib:
+ @cd STKLib && make $(FWDPARAM)
+
+clean:
+ rm -f *.o $(BINS) $(CUBINS)
+ @cd STKLib && make clean
+ @cd KaldiLib && make clean
+ @cd TNetLib && make clean
+ @cd CuBaseLib && make clean
+ @cd CuTNetLib && make clean
+
+doc:
+ doxygen ../doc/doxyfile_TNet
+
+depend:
+ $(CXX) -M $(CXXFLAGS) $(CC_BINS) $(INCLUDE) > .depend.mk1
+ @cd KaldiLib && make depend
+ @cd TNetLib && make depend
+ touch .depend.mk{1,2}
+ cat .depend.mk{1,2} > .depend.mk
+ rm .depend.mk{1,2}
+
+cudepend:
+ $(CXX) -M $(CXXFLAGS) $(CC_CUBINS) $(INCLUDE) > .depend.mk2
+ @cd CuBaseLib && make depend
+ @cd CuTNetLib && make depend
+ifeq ($(HAVE_CUDA), true)
+depend: cudepend
+endif
+ifeq ($(STK), true)
+cudepend: stklib
+endif
+
+
+-include .depend.mk
+
+
diff --git a/src/STKLib/.svn/entries b/src/STKLib/.svn/entries
new file mode 100644
index 0000000..1a57f8b
--- /dev/null
+++ b/src/STKLib/.svn/entries
@@ -0,0 +1,62 @@
+10
+
+dir
+117
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/STKLib
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet
+
+
+
+2011-04-29T12:18:20.752880Z
+49
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+bda6da93-004a-4ae9-8e07-715c10848801
+
+Makefile
+file
+
+
+
+
+2012-04-02T13:49:13.000000Z
+c21f3a35c7471bc1b42a90014b234702
+2011-04-29T12:18:20.752880Z
+49
+iveselyk
+has-props
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+731
+
diff --git a/src/STKLib/.svn/prop-base/Makefile.svn-base b/src/STKLib/.svn/prop-base/Makefile.svn-base
new file mode 100644
index 0000000..eff4929
--- /dev/null
+++ b/src/STKLib/.svn/prop-base/Makefile.svn-base
@@ -0,0 +1,5 @@
+K 12
+svn:keywords
+V 23
+Date Revision Author Id
+END
diff --git a/src/STKLib/.svn/text-base/Makefile.svn-base b/src/STKLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..a95533f
--- /dev/null
+++ b/src/STKLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,29 @@
+
+include ../tnet.mk
+
+ifeq ($(BITS64), true)
+ BITS=64
+ CROSS_COMPILE = x86_64-linux-
+else
+ BITS=32
+endif
+
+ifeq ($(DOUBLEPRECISION), true)
+ PRECISION := --enable-double-precision
+endif
+
+STKCONFIG := CFLAGS="-m$(BITS) -g -O2 -fPIC" CXXFLAGS="-m$(BITS) -g -O2 -fPIC" CXX="$(CROSS_COMPILE)g++" CC="$(CROSS_COMPILE)g++" AR="$(CROSS_COMPILE)ar" RANLIB="$(CROSS_COMPILE)ranlib" AS="$(CROSS_COMPILE)as" --with-atlas $(PRECISION)
+
+
+
+all : libSTKLib.a
+
+libSTKLib.a : libSTKLib.a
+ echo $(CC) $(CXX)
+ svn co http://merlin.fit.vutbr.cz/svn/STK/trunk/
+ (cd trunk/; ./reconf; ./configure $(STKCONFIG) ; make clean; make SERest; cd ..;)
+ cp trunk/src/STKLib/libSTKLib.a .
+ cp trunk/src/STKLib/*.{h,tcc} .
+
+clean :
+ rm -f libSTKLib.a *.{h,tcc}
diff --git a/src/STKLib/Makefile b/src/STKLib/Makefile
new file mode 100644
index 0000000..a95533f
--- /dev/null
+++ b/src/STKLib/Makefile
@@ -0,0 +1,29 @@
+
+include ../tnet.mk
+
+ifeq ($(BITS64), true)
+ BITS=64
+ CROSS_COMPILE = x86_64-linux-
+else
+ BITS=32
+endif
+
+ifeq ($(DOUBLEPRECISION), true)
+ PRECISION := --enable-double-precision
+endif
+
+STKCONFIG := CFLAGS="-m$(BITS) -g -O2 -fPIC" CXXFLAGS="-m$(BITS) -g -O2 -fPIC" CXX="$(CROSS_COMPILE)g++" CC="$(CROSS_COMPILE)g++" AR="$(CROSS_COMPILE)ar" RANLIB="$(CROSS_COMPILE)ranlib" AS="$(CROSS_COMPILE)as" --with-atlas $(PRECISION)
+
+
+
+all : libSTKLib.a
+
+libSTKLib.a : libSTKLib.a
+ echo $(CC) $(CXX)
+ svn co http://merlin.fit.vutbr.cz/svn/STK/trunk/
+ (cd trunk/; ./reconf; ./configure $(STKCONFIG) ; make clean; make SERest; cd ..;)
+ cp trunk/src/STKLib/libSTKLib.a .
+ cp trunk/src/STKLib/*.{h,tcc} .
+
+clean :
+ rm -f libSTKLib.a *.{h,tcc}
diff --git a/src/TFeaCat.cc b/src/TFeaCat.cc
new file mode 100644
index 0000000..6a15433
--- /dev/null
+++ b/src/TFeaCat.cc
@@ -0,0 +1,281 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2012-01-27 16:33:21 +0100 (Fri, 27 Jan 2012) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 98 $"
+#define SVN_ID "$Id: TFeaCat.cc 98 2012-01-27 15:33:21Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "UserInterface.h"
+
+#include "Nnet.h"
+
+#include <iostream>
+#include <sstream>
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TFEACAT"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+":TODO:\n\n"
+" Option Default\n\n"
+" -l dir Set target directory for features Current\n"
+" -y ext Set target feature ext fea\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"FEATURETRANSFORM GMMBYPASS LOGPOSTERIOR NATURALREADORDER PRINTCONFIG PRINTVERSION SCRIPT SOURCEMMF TARGETPARAMDIR TARGETPARAMEXT TRACE\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+
+ const char* p_option_string =
+ " -l r TARGETPARAMDIR"
+ " -y r TARGETPARAMEXT"
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE";
+
+ if(argc == 1) { usage(argv[0]); }
+
+ UserInterface ui;
+ FeatureRepository feature_repo;
+ Network transform_network;
+ Network network;
+ Timer tim;
+
+
+ const char* p_script;
+ char p_target_fea[4096];
+ const char* p_target_fea_dir;
+ const char* p_target_fea_ext;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+
+ bool gmm_bypass;
+ bool log_posterior;
+ int trace;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ int ii = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM", NULL);
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_target_fea_dir = ui.GetStr(SNAME":TARGETPARAMDIR", NULL);
+ p_target_fea_ext = ui.GetStr(SNAME":TARGETPARAMEXT", NULL);
+
+ gmm_bypass = ui.GetBool(SNAME":GMMBYPASS", false);
+ log_posterior = ui.GetBool(SNAME":LOGPOSTERIOR", false);
+
+ trace = ui.GetInt(SNAME":TRACE", 00);
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << "Version: "MODULE_VERSION"" << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; ii < argc; ii++) {
+ feature_repo.AddFile(argv[ii]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+ //read the input transform network
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform network: ")+p_input_transform);
+ transform_network.ReadNetwork(p_input_transform);
+ }
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+ //initialize the FeatureRepository
+ feature_repo.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ feature_repo.AddFileList(p_script);
+ }
+ if(feature_repo.QueueSize() <= 0) {
+ KALDI_ERR << "No input features specified,\n"
+ << " try [-S SCP] or positional argument";
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // MAIN LOOP ...............................................................
+
+ //progress
+ size_t cnt = 0;
+ size_t step = feature_repo.QueueSize() / 100;
+ if(step == 0) step = 1;
+ tim.Start();
+
+ //data carriers
+ Matrix<BaseFloat> feats_in,feats_out,nnet_out;
+ //process all the feature files
+ for(feature_repo.Rewind(); !feature_repo.EndOfList(); feature_repo.MoveNext()) {
+ //read file
+ feature_repo.ReadFullMatrix(feats_in);
+
+ //pass through transform network
+ //transform_network.Propagate(feats_in, feats_out);
+ transform_network.Feedforward(feats_in, feats_out, start_frm_ext, end_frm_ext);
+
+ //pass through network
+ //network.Propagate(feats_out,nnet_out);
+ network.Feedforward(feats_out,nnet_out,start_frm_ext,end_frm_ext);
+
+    //get the output, trim the start/end context
+ feats_out.Init(nnet_out.Rows()-start_frm_ext-end_frm_ext,nnet_out.Cols());
+ memcpy(feats_out.pData(),nnet_out.pRowData(start_frm_ext),feats_out.MSize());
+
+ //GMM bypass for HVite using posteriors as features
+ if(gmm_bypass) {
+ for(size_t i=0; i<feats_out.Rows(); i++) {
+ for(size_t j=0; j<feats_out.Cols(); j++) {
+ feats_out(i,j) = static_cast<BaseFloat>(sqrt(-2.0*log(feats_out(i,j))));
+ }
+ }
+ }
+
+    //Convert posteriors to log domain
+ if(log_posterior) {
+ for(size_t i=0; i<feats_out.Rows(); i++) {
+ for(size_t j=0; j<feats_out.Cols(); j++) {
+ feats_out(i,j) = static_cast<BaseFloat>(log(feats_out(i,j)));
+ }
+ }
+ }
+
+ //build filename
+ MakeHtkFileName(p_target_fea,
+ feature_repo.Current().Logical().c_str(),
+ p_target_fea_dir, p_target_fea_ext);
+ //save output
+ int sample_period = feature_repo.CurrentHeader().mSamplePeriod;
+ feature_repo.WriteFeatureMatrix(feats_out,p_target_fea,PARAMKIND_USER,sample_period);
+
+ //progress
+ if(trace&1) {
+ if((cnt++ % step) == 0) std::cout << 100 * cnt / feature_repo.QueueSize() << "%, " << std::flush;
+ }
+ }
+
+ //finish
+ if(trace&1) {
+ tim.End();
+ std::cout << "TFeaCat finished: " << tim.Val() << "s" <<std::endl;
+ }
+ return 0;
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
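
The GMMBYPASS and LOGPOSTERIOR branches in TFeaCat.cc above are simple element-wise mappings applied to the trimmed network output before it is written out as an HTK feature file. The standalone sketch below (hypothetical helper names, plain std::vector instead of TNet's Matrix<BaseFloat>) isolates the two mappings; reading the bypass value sqrt(-2 ln p) as the point where a zero-mean, unit-variance Gaussian recovers ln p up to a constant is an assumption, not something stated in the source.

// Standalone sketch of TFeaCat's optional output post-processing (not TNet API).
#include <cmath>
#include <cstdio>
#include <vector>

// GMMBYPASS: p -> sqrt(-2*ln(p)).  Assumption: chosen so that a zero-mean,
// unit-variance Gaussian evaluated at this value yields ln(p) plus a constant,
// letting HVite treat posteriors as ordinary features.
static float GmmBypass(float p)    { return std::sqrt(-2.0f * std::log(p)); }

// LOGPOSTERIOR: plain conversion of the posterior to the log domain.
static float LogPosterior(float p) { return std::log(p); }

int main() {
  std::vector<float> frame = {0.7f, 0.2f, 0.1f};   // toy posterior vector, one frame
  for (float p : frame)
    std::printf("p=%.2f  gmm_bypass=%.4f  log=%.4f\n",
                p, GmmBypass(p), LogPosterior(p));
  return 0;
}
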
diff --git a/src/TFeaCatCu.cc b/src/TFeaCatCu.cc
new file mode 100644
index 0000000..cf0c0be
--- /dev/null
+++ b/src/TFeaCatCu.cc
@@ -0,0 +1,283 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+
+#define SVN_DATE "$Date: 2012-01-27 16:33:21 +0100 (Fri, 27 Jan 2012) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 98 $"
+#define SVN_ID "$Id: TFeaCatCu.cc 98 2012-01-27 15:33:21Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+
+
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "UserInterface.h"
+
+#include "cuNetwork.h"
+
+#include <iostream>
+#include <sstream>
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TFEACAT"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -l dir Set target directory for features Current\n"
+" -y ext Set target feature ext fea\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"FEATURETRANSFORM GMMBYPASS LOGPOSTERIOR NATURALREADORDER PRINTCONFIG PRINTVERSION SCRIPT SOURCEMMF TARGETPARAMDIR TARGETPARAMEXT TRACE\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -l r TARGETPARAMDIR"
+ " -y r TARGETPARAMEXT"
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE";
+
+ if(argc == 1) { usage(argv[0]); }
+
+ UserInterface ui;
+ FeatureRepository feature_repo;
+ CuNetwork transform_network;
+ CuNetwork network;
+ Timer tim;
+
+
+ const char* p_script;
+ char p_target_fea[4096];
+ const char* p_target_fea_dir;
+ const char* p_target_fea_ext;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+
+ bool gmm_bypass;
+ bool log_posterior;
+ int trace;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ int ii = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM", NULL);
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_target_fea_dir = ui.GetStr(SNAME":TARGETPARAMDIR", NULL);
+ p_target_fea_ext = ui.GetStr(SNAME":TARGETPARAMEXT", "fea");
+
+ gmm_bypass = ui.GetBool(SNAME":GMMBYPASS", false);
+ log_posterior = ui.GetBool(SNAME":LOGPOSTERIOR", false);
+
+ trace = ui.GetInt(SNAME":TRACE", 00);
+ if(trace&1) { CuDevice::Instantiate().Verbose(true); }
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << "Version: "MODULE_VERSION"" << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; ii < argc; ii++) {
+ feature_repo.AddFile(argv[ii]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+ //read the input transform network
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform network: ")+p_input_transform);
+ transform_network.ReadNetwork(p_input_transform);
+ }
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+ //initialize the FeatureRepository
+ feature_repo.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ feature_repo.AddFileList(p_script);
+ }
+ if(feature_repo.QueueSize() <= 0) {
+ KALDI_ERR << "No input features specified,\n"
+ << " try [-S SCP] or positional argument";
+ }
+
+
+ //**************************************************************************
+ //**************************************************************************
+ // MAIN LOOP ...............................................................
+
+ //progress
+ size_t cnt = 0;
+ size_t step = feature_repo.QueueSize() / 100;
+ if(step == 0) step = 1;
+ tim.Start();
+
+ Matrix<BaseFloat> feats_in, feats_out;
+ CuMatrix<BaseFloat> feats_in_cu, feats_transf_cu, feats_out_cu;
+ //process all the feature files
+ for(feature_repo.Rewind(); !feature_repo.EndOfList(); feature_repo.MoveNext()) {
+ //read file
+ feature_repo.ReadFullMatrix(feats_in);
+ feats_in_cu.CopyFrom(feats_in);
+
+ //apply input transform (even empty)
+ transform_network.Propagate(feats_in_cu,feats_transf_cu);
+
+ //propagate through the network
+ network.Propagate(feats_transf_cu,feats_out_cu);
+
+ //trim the start/end context
+ int rows = feats_out_cu.Rows()-start_frm_ext-end_frm_ext;
+ CuMatrix<BaseFloat> feats_trim_cu(rows,feats_out_cu.Cols());
+ feats_trim_cu.CopyRows(rows,start_frm_ext,feats_out_cu,0);
+
+ feats_trim_cu.CopyTo(feats_out);
+
+ //GMM bypass for HVite using posteriors as features
+ if(gmm_bypass) {
+ for(size_t i=0; i<feats_out.Rows(); i++) {
+ for(size_t j=0; j<feats_out.Cols(); j++) {
+ feats_out(i,j) = static_cast<BaseFloat>(sqrt(-2.0*log(feats_out(i,j))));
+ }
+ }
+ }
+
+    //Convert posteriors to log domain
+ if(log_posterior) {
+ for(size_t i=0; i<feats_out.Rows(); i++) {
+ for(size_t j=0; j<feats_out.Cols(); j++) {
+ feats_out(i,j) = static_cast<BaseFloat>(log(feats_out(i,j)));
+ }
+ }
+ }
+
+
+ //save output
+ MakeHtkFileName(p_target_fea, feature_repo.Current().Logical().c_str(), p_target_fea_dir, p_target_fea_ext);
+ int sample_period = feature_repo.CurrentHeader().mSamplePeriod;
+ feature_repo.WriteFeatureMatrix(feats_out,p_target_fea,PARAMKIND_USER,sample_period);
+
+ if(trace&1) {
+ if((cnt++ % step) == 0) std::cout << 100 * cnt / feature_repo.QueueSize() << "%, " << std::flush;
+ }
+ }
+
+ //finish
+ if(trace&1) {
+ tim.End();
+ std::cout << "TFeaCat finished: " << tim.Val() << "s" << std::endl;
+ }
+ return 0;
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
diff --git a/src/TJoiner.cc b/src/TJoiner.cc
new file mode 100644
index 0000000..a8d335b
--- /dev/null
+++ b/src/TJoiner.cc
@@ -0,0 +1,342 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2012-03-23 14:22:49 +0100 (Fri, 23 Mar 2012) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 110 $"
+#define SVN_ID "$Id: TJoiner.cc 110 2012-03-23 13:22:49Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+/*** TNetLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "MlfStream.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+#include <limits>
+
+/*** Unix includes */
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TJOINER"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -l dir Set target directory for features !REQ!\n"
+" -y ext Set target feature ext fea_join\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"NATURALREADORDER OUTPUTSCRIPT PRINTCONFIG PRINTVERSION SCRIPT TARGETPARAMDIR TARGETPARAMEXT TARGETSIZE TRACE\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+inline std::string int2str(int i) {
+ char buf[64];
+ sprintf(buf,"%06d",i);
+ return buf;
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -l r TARGETPARAMDIR"
+ " -y r TARGETPARAMEXT"
+ " -D n PRINTCONFIG=TRUE"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ ;
+
+
+ UserInterface ui;
+ FeatureRepository features;
+ Timer timer;
+
+
+ const char* p_script;
+ const char* p_tgt_param_dir;
+ const char* p_tgt_param_ext;
+ const char* p_output_script;
+ int trace;
+ int target_size;
+ bool dir_strip;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_tgt_param_dir = ui.GetStr(SNAME":TARGETPARAMDIR", NULL);
+ p_tgt_param_ext = ui.GetStr(SNAME":TARGETPARAMEXT","fea_join");
+ p_output_script = ui.GetStr(SNAME":OUTPUTSCRIPT", NULL);
+ trace = ui.GetInt(SNAME":TRACE", 00);
+ target_size = ui.GetInt(SNAME":TARGETSIZE", 20000);
+ dir_strip = ui.GetBool(SNAME":DIRSTRIP", true);
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "Version: "MODULE_VERSION"\n";
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ features.AddFile(argv[args_parsed]);
+ }
+
+
+
+ if(NULL == p_tgt_param_dir) {
+    Error("TARGETPARAMDIR must be specified [-l]");
+ }
+
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+
+ //initialize FeatureRepository
+ features.AddFileList(p_script);
+
+ features.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+
+ //start timer
+ timer.Start();
+
+ std::cout << "[Feature joining started]" << std::endl;
+
+ //segment the features
+ size_t cnt = 0;
+ size_t step = features.QueueSize() / 100;
+ if(step == 0) step = 1;
+
+ //open output script file
+ std::ofstream out_scp;
+ if(NULL == p_output_script) Error("OUTPUTSCRIPT parameter needed");
+ out_scp.open(p_output_script);
+  if(!out_scp.good()) Error(std::string("Cannot open output script file ")+p_output_script);
+
+ //store short segments of the data
+ Matrix<BaseFloat> mat_in, mat_buffer, mat_out;
+ Vector<BaseFloat> vec_sep;
+ int pos_buf = 0;
+ int dim = -1;
+
+ int file_out_ctr = 1;
+ std::string file_out;
+ file_out = std::string(p_tgt_param_dir) + "/" + int2str(file_out_ctr) + "." + p_tgt_param_ext;
+
+ features.Rewind();
+ for( ; !features.EndOfList(); features.MoveNext(), cnt++) {
+ //read the features
+ features.ReadFullMatrix(mat_in);
+
+ //skip invalid segments
+ bool skip = false;
+ for(size_t r=0; r<mat_in.Rows(); r++) {
+ for(size_t c=0; c<mat_in.Cols(); c++) {
+ if(isnan(mat_in(r,c)) || isinf(mat_in(r,c))) {
+ skip = true;
+ }
+ }
+ }
+ if(skip) {
+ Warning(std::string("Skipping:")+features.Current().Logical()+"\nIt contains nan or inf!!!");
+ continue;
+ }
+
+ //lazy buffer init
+ if(mat_buffer.Rows() == 0) {
+ dim = mat_in.Cols();
+ //init buffer
+ mat_buffer.Init(target_size,dim);
+ //set the separator frame to nan
+ vec_sep.Init(dim);
+ vec_sep.Set(std::numeric_limits<BaseFloat>::quiet_NaN());
+ }
+
+ if(pos_buf+1+mat_in.Rows() >= (unsigned)target_size) {
+ mat_out.Init(pos_buf+mat_in.Rows(),dim);
+ //copy buffer
+ if(pos_buf > 0) {
+ memcpy(mat_out.pData(),mat_buffer.pData(),pos_buf*mat_buffer.Stride()*sizeof(BaseFloat));
+ }
+ //copy matrix
+ memcpy(mat_out.pRowData(pos_buf),mat_in.pData(),mat_in.MSize());
+ //strip directory from logical filename
+ std::string name_logical(features.Current().Logical());
+ size_t str_pos;
+ if(dir_strip && (str_pos = name_logical.rfind("/")) != std::string::npos) {
+ name_logical.erase(0,str_pos+1);
+ }
+ //add scriptfile record
+ out_scp << name_logical << "=" << file_out << "[" << pos_buf+start_frm_ext << "," << pos_buf+mat_in.Rows()-end_frm_ext-1 << "]\n";
+
+ //save the file
+ //get the targetkind and source_rate
+ if(target_kind == PARAMKIND_ANON) {
+ target_kind = features.CurrentHeader().mSampleKind;
+ }
+ int source_rate = features.CurrentHeader().mSamplePeriod;
+ //write the output feature
+ features.WriteFeatureMatrix(mat_out, file_out, target_kind, source_rate);
+ //get next filename
+ file_out_ctr++;
+ file_out = std::string(p_tgt_param_dir) + "/" + int2str(file_out_ctr) + "." + p_tgt_param_ext;
+
+ //set the buffer empty
+ pos_buf = 0;
+ continue;
+ }
+
+ //strip directory from logical filename
+ std::string name_logical(features.Current().Logical());
+ size_t str_pos;
+ if(dir_strip && (str_pos = name_logical.rfind("/")) != std::string::npos) {
+ name_logical.erase(0,str_pos+1);
+ }
+ //add scriptfile record
+ out_scp << name_logical << "=" << file_out << "[" << pos_buf+start_frm_ext << "," << pos_buf+mat_in.Rows()-end_frm_ext-1 << "]\n";
+
+ //add mat_in to cache, add separator
+ memcpy(mat_buffer.pRowData(pos_buf),mat_in.pData(),mat_in.MSize());
+ pos_buf += mat_in.Rows();
+ mat_buffer[pos_buf].Copy(vec_sep);
+ pos_buf++;
+
+ if((cnt % step) == 0) std::cout << 100 * cnt / features.QueueSize() << "%, " << std::flush;
+ }
+
+ //store the content of the buffer
+ if(pos_buf > 0) {
+ mat_out.Init(pos_buf-1,dim); //don't store separator! => -1
+ memcpy(mat_out.pData(),mat_buffer.pData(),mat_out.MSize());
+ //save the file
+ //get the targetkind and source_rate
+ if(target_kind == PARAMKIND_ANON) {
+ target_kind = features.CurrentHeader().mSampleKind;
+ }
+ int source_rate = features.CurrentHeader().mSamplePeriod;
+ //write the output feature
+ features.WriteFeatureMatrix(mat_out, file_out, target_kind, source_rate);
+;
+ }
+
+ //close output script file
+ out_scp.close();
+
+ timer.End();
+  std::cout << "\n[Feature joining finished, elapsed time: ( " << timer.Val() << "s )]" << std::endl;
+
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
+
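
TJoiner.cc above concatenates padded feature segments into large joined files, inserting a NaN separator frame between segments, and emits an HTK-style scp record whose frame range addresses only the inner (un-padded) part of each segment: [pos_buf+start_frm_ext, pos_buf+mat_in.Rows()-end_frm_ext-1]. A minimal sketch of that bookkeeping, with a hypothetical helper name and toy values:

// Hypothetical helper mirroring TJoiner's scriptfile-record expression; the
// joined file keeps every segment's start/end context rows, while the scp
// range points at the inner frames only.
#include <iostream>
#include <sstream>
#include <string>

static std::string ScpRecord(const std::string& logical, const std::string& joined_file,
                             int pos_buf, int rows, int start_frm_ext, int end_frm_ext) {
  std::ostringstream os;
  os << logical << "=" << joined_file
     << "[" << pos_buf + start_frm_ext << ","
     << pos_buf + rows - end_frm_ext - 1 << "]";
  return os.str();
}

int main() {
  // first segment: 120 rows (including 5+5 context frames) stored at offset 0
  std::cout << ScpRecord("utt1.fea", "000001.fea_join", 0, 120, 5, 5) << "\n";
  // second segment starts after 120 rows plus one NaN separator row
  std::cout << ScpRecord("utt2.fea", "000001.fea_join", 121, 80, 5, 5) << "\n";
  return 0;
}
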
diff --git a/src/TMpeCu.cc b/src/TMpeCu.cc
new file mode 100644
index 0000000..32c1b21
--- /dev/null
+++ b/src/TMpeCu.cc
@@ -0,0 +1,715 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-04-29 14:18:20 +0200 (Fri, 29 Apr 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 49 $"
+#define SVN_ID "$Id: TMpeCu.cc 49 2011-04-29 12:18:20Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+/**
+ * \file TMpeCu.cc
+ */
+
+/*** STK includes */
+#include "STKLib/trunk/config.h"
+#ifdef HAVE_MEMALIGN
+ #undef HAVE_MEMALIGN
+#endif
+#ifdef HAVE_POSIX_MEMALIGN
+ #undef HAVE_POSIX_MEMALIGN
+#endif
+
+
+/*** include commons */
+#include "STKLib/common.h"
+
+
+#include "Common.h"
+
+/*** STK includes */
+#include "STKLib/fileio.h"
+#include "STKLib/Models.h"
+#include "STKLib/Decoder.h"
+#include "STKLib/stkstream.h"
+#include "STKLib/MlfStream.h"
+#include "STKLib/labels.h"
+
+
+/*** Kaldi includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "UserInterface.h"
+
+
+/*** TNet includes */
+#include "cuObjectiveFunction.h"
+#include "cuNetwork.h"
+#include "cuCache.h"
+#include "cuda.h"
+
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TMPECU"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -n f Set learning rate to f 0.06\n"
+" -t f [i l] Set pruning to f [inc limit] Off\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -G fmt     Set source transcription format to fmt     As config\n"
+" -H mmf Load NN macro file \n"
+" -I mlf Load master label file mlf (with den_num latts) \n"
+" -L dir Set input label (or net) dir Current\n"
+//" -O fn Objective function [mpe,mmi] mpe\n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+" -X ext Set input label file ext lab\n"
+"\n"
+"ALLOWXWRDEXP ENDTIMESHIFT EXACTTIMEMERGE FEATURETRANSFORM GRADDIVFRM HMM HNETFILTER LEARNINGRATE LEARNRATEFACTORS LMSCALE MAXACTIVEMODELS MINACTIVEMODELS MINIMIZENET MLGAMMA MODELPENALTY NATURALREADORDER NFRAMEOUTPNORM OCCUPPSCALE OUTPSCALE POSTERIORSCALE PRINTCONFIG PRINTVERSION PRONUNSCALE PRUNING PRUNINGINC PRUNINGMAX REMEXPWRDNODES RESPECTPRONVARS SCRIPT SHOWGAMMA SOURCEDICT SOURCEMLF SOURCEMMF SOURCETRANSCDIR SOURCETRANSCEXT STARTTIMESHIFT TARGETMMF TIMEPRUNING TRACE TRANSPSCALE WEIGHTCOST WEIGHTPUSHING WORDPENALTY\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -n r LEARNINGRATE"
+ " -t ror PRUNING PRUNINGINC PRUNINGMAX"
+ " -D n PRINTCONFIG=TRUE"
+ " -G r SOURCETRANSCFMT"
+ " -H l SOURCEMMF"
+ " -I r SOURCEMLF"
+ " -L r SOURCETRANSCDIR"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ " -X r SOURCETRANSCEXT";
+
+ //STK global objects
+ STK::ModelSet hset;
+ STK::Decoder<STK::DecoderNetwork> decoder;
+ std::ostringstream os_warn;
+
+ //TNet global objects
+ UserInterface ui;
+ FeatureRepository feature_repo;
+ CuNetwork network;
+ CuNetwork transform_network;
+ Timer timer;
+ Timer timer_frontend;
+ double time_frontend = 0.0;
+ Timer timer_decoder;
+ double time_decoder = 0.0;
+
+ // vars for STK
+ const char* p_hmm_file;
+ const char* p_src_mlf;
+
+ MyHSearchData nonCDphHash;
+ MyHSearchData phoneHash;
+ MyHSearchData dictHash;
+
+ double outprb_scale;
+ char label_file[1024];
+ FILE* ilfp = NULL;
+
+ const char* src_lbl_dir;
+ const char* src_lbl_ext;
+
+ const char* dictionary;
+
+ double word_penalty;
+ double model_penalty;
+ double grammar_scale;
+ double posterior_scale;
+
+ bool time_pruning;
+ double pronun_scale;
+ double transp_scale;
+ double occprb_scale;
+ double state_pruning;
+ int max_active;
+ int min_active;
+
+ STK::ExpansionOptions expOptions = {0};
+ STKNetworkOutputFormat in_net_fmt = {0};
+
+ double stprn_step;
+ double stprn_limit;
+
+ STK::BasicVector<FLOAT>* p_weight_vector = NULL;
+
+ const char* net_filter;
+
+ double avg_accuracy = 0.0;
+
+ // vars for TNet
+ const char* p_script;
+
+ BaseFloat learning_rate;
+ const char* learning_rate_factors;
+ BaseFloat weightcost;
+ bool grad_div_frm;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+
+ const char* p_targetmmf;
+
+ bool show_gamma;
+ bool ml_gamma;
+
+ int trace;
+
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract STK parameters
+ p_hmm_file = ui.GetStr(SNAME":HMM", NULL);
+ p_src_mlf = ui.GetStr(SNAME":SOURCEMLF", NULL);
+
+ outprb_scale = ui.GetFlt(SNAME":OUTPSCALE", 1.0);
+
+ src_lbl_dir = ui.GetStr(SNAME":SOURCETRANSCDIR", NULL);
+ src_lbl_ext = ui.GetStr(SNAME":SOURCETRANSCEXT", NULL);
+
+ dictionary = ui.GetStr(SNAME":SOURCEDICT", NULL);
+
+ word_penalty = ui.GetFlt(SNAME":WORDPENALTY", 0.0);
+ model_penalty= ui.GetFlt(SNAME":MODELPENALTY", 0.0);
+ grammar_scale= ui.GetFlt(SNAME":LMSCALE", 1.0);
+ posterior_scale= ui.GetFlt(SNAME":POSTERIORSCALE", 1.0);
+
+
+ time_pruning = ui.GetBool(SNAME":TIMEPRUNING", false);
+ in_net_fmt.mNoTimes = !time_pruning;
+
+ pronun_scale = ui.GetFlt(SNAME":PRONUNSCALE", 1.0);
+ transp_scale = ui.GetFlt(SNAME":TRANSPSCALE", 1.0);
+ occprb_scale = ui.GetFlt(SNAME":OCCUPPSCALE", 1.0);
+ state_pruning= ui.GetFlt(SNAME":PRUNING", 0.0);
+ max_active = ui.GetInt(SNAME":MAXACTIVEMODELS", 0);
+ min_active = ui.GetInt(SNAME":MINACTIVEMODELS", 0);
+
+ expOptions.mCDPhoneExpansion =
+ ui.GetBool(SNAME":ALLOWXWRDEXP", false);
+ expOptions.mRespectPronunVar
+ = ui.GetBool(SNAME":RESPECTPRONVARS", false);
+ expOptions.mStrictTiming
+ = ui.GetBool(SNAME":EXACTTIMEMERGE", false);
+ expOptions.mNoWeightPushing
+ =!ui.GetBool(SNAME":WEIGHTPUSHING", true);
+ expOptions.mNoOptimization
+ =!ui.GetBool(SNAME":MINIMIZENET", false);
+ expOptions.mRemoveWordsNodes
+ = ui.GetBool(SNAME":REMEXPWRDNODES", false);
+
+ stprn_step = ui.GetFlt(SNAME":PRUNINGINC", 0.0);
+ stprn_limit = ui.GetFlt(SNAME":PRUNINGMAX", 0.0);
+
+ net_filter = ui.GetStr(SNAME":HNETFILTER", NULL);
+ if(NULL != net_filter) {
+ transc_filter = net_filter;
+ }
+
+ in_net_fmt.mStartTimeShift =
+ ui.GetFlt(SNAME":STARTTIMESHIFT", 0.0);
+ in_net_fmt.mEndTimeShift =
+ ui.GetFlt(SNAME":ENDTIMESHIFT", 0.0);
+
+
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM",NULL);
+
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+
+ learning_rate = ui.GetFlt(SNAME":LEARNINGRATE" , 0.06f);
+ learning_rate_factors = ui.GetStr(SNAME":LEARNRATEFACTORS", NULL);
+ weightcost = ui.GetFlt(SNAME":WEIGHTCOST" , 0.0f);
+ grad_div_frm = ui.GetBool(SNAME":GRADDIVFRM", true);
+
+ show_gamma = ui.GetBool(SNAME":SHOWGAMMA", false);
+ ml_gamma = ui.GetBool(SNAME":MLGAMMA", false);
+
+ trace = ui.GetInt(SNAME":TRACE", 0);
+ if(trace&1) { CuDevice::Instantiate().Verbose(true); }
+
+
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TNET v"MODULE_VERSION" xvesel39 =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ feature_repo.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // initialize STK
+
+ // initialize basic ModelSet
+ hset.Init(STK::MODEL_SET_WITH_ACCUM);
+ hset.mUpdateMask = 0;
+
+ if (NULL != p_hmm_file) {
+ TraceLog(std::string("Reading HMM model:")+p_hmm_file);
+ hset.ParseMmf(p_hmm_file, NULL, false);
+ } else {
+ Error("Missing HMM model, use: --HMM=FILE");
+ }
+
+ hset.ExpandPredefXforms();
+ hset.AttachPriors(&hset);
+
+ nonCDphHash = hset.MakeCIPhoneHash();
+
+ hset.mCmllrStats = false;
+ hset.AllocateAccumulatorsForXformStats();
+
+
+ hset.mUpdateType = STK::UT_EBW;
+ hset.mMinVariance = 0.0; ///< global minimum variance floor
+ hset.MMI_E = 2.0;
+ hset.MMI_h = 2.0;
+ hset.MMI_tauI = 200.0;
+ hset.JSmoothing = false;
+ hset.mISmoothingMaxOccup = -1.0;
+ hset.mMinOccupation = 0.0;
+ hset.mMapTau = 0;
+ hset.mGaussLvl2ModelReest = false;
+ hset.mMinOccurances = 3;
+ hset.mMinMixWeight = 1.0 * MIN_WEGIHT;
+ hset.mUpdateMask = 0;
+ hset.mSaveGlobOpts = true;
+ hset.mModelUpdateDoesNotNormalize = false;
+ hset.ResetAccums();
+
+ //open mlf with lattices
+ ilfp = OpenInputMLF(p_src_mlf);
+
+ //reserve space for hashes
+ if (!STK::my_hcreate_r(100, &dictHash)
+ || !STK::my_hcreate_r(100, &phoneHash))
+ {
+ Error("Insufficient memory");
+ }
+
+ //read dictionary
+ if (dictionary != NULL) {
+ ReadDictionary(dictionary, &dictHash, &phoneHash);
+ }
+ if (dictHash.mNEntries == 0)
+ expOptions.mNoWordExpansion = 1;
+
+ ////////////////////////////////////////////////////////////////////////////
+ // initialize TNet
+
+ //read the input transform network
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform network: ")+p_input_transform);
+ transform_network.ReadNetwork(p_input_transform);
+ }
+
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+
+ // initialize the feature repository
+ feature_repo.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ feature_repo.AddFileList(p_script);
+ } else {
+ Warning("WARNING: The script file is missing [-S]");
+ }
+
+ //set the learnrate
+ network.SetLearnRate(learning_rate, learning_rate_factors);
+
+ //set the L2 regularization constant
+ network.SetWeightcost(weightcost);
+
+ //set division of gradient by number of frames
+ network.SetGradDivFrm(grad_div_frm);
+
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // INITIALIZATION DONE .................................................
+ //
+ // Start training
+ timer.Start();
+ std::cout << "===== TMpeCu TRAINING STARTED =====" << std::endl;
+
+ feature_repo.Rewind();
+
+ //**********************************************************************
+ //**********************************************************************
+ // MAIN LOOP
+ //
+ int frames = 0;
+ int done = 0;
+ CuMatrix<BaseFloat> feats, posteriors, globerr;
+ for(feature_repo.Rewind(); !feature_repo.EndOfList(); feature_repo.MoveNext()) {
+
+ timer_frontend.Start();
+
+ Matrix<BaseFloat> feats_host, posteriors_host, globerr_host;
+ CuMatrix<BaseFloat> feats_original;
+ CuMatrix<BaseFloat> feats_expanded;
+
+    //read feats, perform feature transform
+ feature_repo.ReadFullMatrix(feats_host);
+ feats_original.CopyFrom(feats_host);
+ transform_network.Propagate(feats_original,feats_expanded);
+
+ //trim the start/end context
+ int rows = feats_expanded.Rows()-start_frm_ext-end_frm_ext;
+ feats.Init(rows,feats_expanded.Cols());
+ feats.CopyRows(rows,start_frm_ext,feats_expanded,0);
+
+ timer_frontend.End(); time_frontend += timer_frontend.Val();
+
+ //forward pass
+ network.Propagate(feats,posteriors);
+ posteriors.CopyTo(posteriors_host);
+ posteriors_host.ApplyLog();
+
+ /***************************************************
+ ***************************************************
+ * DECODER PART get the error derivatives
+ *
+ */
+ {
+ timer_decoder.Start();
+
+ STK::Matrix<BaseFloat> posteriors_stk, gammas_stk;
+
+ //copy the posteriors to STK matrix
+ posteriors_stk.Init(posteriors_host.Rows(), posteriors_host.Cols());
+ for(size_t r=0; r<posteriors_host.Rows(); r++) {
+ memcpy(posteriors_stk[r], posteriors_host.pRowData(r),
+ posteriors_host.Cols()*sizeof(BaseFloat));
+ }
+
+ //check dims
+ if (hset.mInputVectorSize != posteriors_stk.Cols()) {
+ std::ostringstream os;
+ os <<"Vector size ["<<posteriors_stk.Cols()<<"]"
+ <<" in '"<<feature_repo.Current().Logical()<<"'"
+ <<" is incompatible with source HMM set ["<<hset.mInputVectorSize<<"]";
+ Error(os.str());
+ }
+
+ //load lattice
+ strcpy(label_file, feature_repo.Current().Logical().c_str());
+
+ ilfp = OpenInputLabelFile(
+ label_file,
+ src_lbl_dir,
+ src_lbl_ext ? src_lbl_ext : "net",
+ ilfp,
+ p_src_mlf);
+
+ ReadSTKNetwork(
+ ilfp,
+ &dictHash,
+ &phoneHash,
+ STK::WORD_NOT_IN_DIC_WARN,
+ in_net_fmt,
+ feature_repo.CurrentHeader().mSamplePeriod,
+ label_file,
+ p_src_mlf, false, decoder.rNetwork());
+
+ decoder.rNetwork().ExpansionsAndOptimizations(
+ expOptions,
+ in_net_fmt,
+ &dictHash,
+ &nonCDphHash,
+ &phoneHash,
+ word_penalty,
+ model_penalty,
+ grammar_scale,
+ posterior_scale);
+
+ // CloseInputLabelFile(ilfp, p_src_mlf);
+
+ //initialize the decoder
+ decoder.Init(&hset, &hset);
+
+ decoder.mTimePruning = time_pruning;
+ decoder.mWPenalty = word_penalty;
+ decoder.mMPenalty = model_penalty;
+ decoder.mLmScale = grammar_scale;
+ decoder.mPronScale = pronun_scale;
+ decoder.mTranScale = transp_scale;
+ decoder.mOutpScale = outprb_scale;
+ decoder.mOcpScale = occprb_scale;
+ decoder.mPruningThresh = state_pruning > 0.0 ? state_pruning : -LOG_0;
+ decoder.mMaxActiveModels = max_active;
+ decoder.mMinActiveModels = min_active;
+ decoder.mAccumType = STK::AT_MPE;
+
+ if(ml_gamma) {
+ decoder.mAccumType = STK::AT_ML;
+ }
+
+ //decode
+ double prn_step = stprn_step;
+ double prn_limit = stprn_limit;
+
+ int n_frames = (int)posteriors_stk.Rows();
+ if (ui.GetBool(SNAME":NFRAMEOUTPNORM", false))
+ {
+ decoder.mOutpScale = outprb_scale / n_frames;
+ decoder.mPruningThresh /= n_frames;
+ prn_step /= n_frames;
+ prn_limit /= n_frames;
+ }
+
+ if(n_frames < 1) {
+ Error(std::string("No posterior frames, ")+feature_repo.Current().Logical());
+ }
+
+ FLOAT P;
+ FLOAT avgAcc;
+ for (;;)
+ {
+ //***** RUN FWBW with MPE, return gamma values *********/
+ P = decoder.GetMpeGamma(posteriors_stk,gammas_stk, avgAcc,
+ n_frames, feature_repo.Current().Weight(), p_weight_vector);
+
+ if(P > LOG_MIN)
+ break;
+
+ if (decoder.mPruningThresh <= LOG_MIN ||
+ prn_step <= 0.0 ||
+ (decoder.mPruningThresh += prn_step) > prn_limit)
+ {
+ Error(std::string("Overpruning or bad data, skipping file " +
+ feature_repo.Current().Logical()));
+ break;
+ }
+
+ os_warn.clear();
+ os_warn << "Overpruning or bad data in file " << feature_repo.Current().Logical()
+ << ", trying pruning threshold: " << decoder.mPruningThresh;
+ Warning(os_warn.str());
+ }
+ avg_accuracy += avgAcc;
+
+ //cleanup
+ posteriors_stk.Destroy();
+ decoder.Clear();
+
+ //copy gammas to TNet matrix
+ globerr_host.Init(gammas_stk.Rows(),gammas_stk.Cols());
+ for(size_t r=0; r<posteriors_host.Rows(); r++) {
+ memcpy(globerr_host.pRowData(r), gammas_stk[r],
+ gammas_stk.Cols()*sizeof(BaseFloat));
+ }
+
+ //print gamma matrix for debug
+ if(show_gamma) {
+ std::cout << globerr_host;
+ }
+
+      //scale gammas by the negative acoustic scale kappa
+      // dE/d_activation = kappa*(gamma_den - gamma_num) = -kappa*(gamma_mpe)
+ globerr_host.Scale(-outprb_scale);
+ //globerr_host.Scale(outprb_scale);
+
+ timer_decoder.End(); time_decoder += timer_decoder.Val();
+ }
+ /**DECODER PART END********************************
+ **************************************************/
+
+
+ globerr.CopyFrom(globerr_host);
+
+ //check the dimensionalities
+ if(globerr.Rows() != posteriors.Rows()) {
+ std::ostringstream os;
+ os << "Non-matching number of rows,"
+ << " netout:" << posteriors.Rows()
+ << " errfile:" << globerr.Rows();
+ Error(os.str());
+ }
+ if(globerr.Cols() != posteriors.Cols()) {
+ std::ostringstream os;
+ os << "Non-matching number of network outputs,"
+ << " netout:" << posteriors.Cols()
+ << " errfile:" << globerr.Cols();
+ Error(os.str());
+ }
+
+ if(learning_rate != 0.0) {
+ //backward pass
+ network.Backpropagate(globerr);
+ }
+
+ frames += feats.Rows();
+ if(trace&1 && (++done%100)==1) {
+ std::cout << "(" << done << "/" << feature_repo.QueueSize() << ") ";
+ }
+
+ /*
+ unsigned int free, total;
+ cuMemGetInfo(&free, &total);
+ std::cout << "freemem:" << free / (1024*1024) << "MB ";
+ */
+ }
+
+ CloseInputMLF(ilfp);
+
+ //**********************************************************************
+ //**********************************************************************
+ // TRAINING FINISHED .................................................
+ //
+ // Let's store the network, report the log
+
+
+ if(trace&1) TraceLog("Training finished");
+
+ //write the network
+ if (NULL != p_targetmmf) {
+ if(trace&1) TraceLog(std::string("Writing network: ")+p_targetmmf);
+ network.WriteNetwork(p_targetmmf);
+ } else {
+    Error("Target MMF must be specified [--TARGETMMF]");
+ }
+
+ timer.End();
+ std::cout << "===== TMpeCu FINISHED ( " << timer.Val() << "s ) "
+ << "[FPS:" << float(frames) / timer.Val()
+ << ",RT:" << 1.0f / (float(frames) / timer.Val() / 100.0f)
+ << "] =====" << std::endl;
+
+ std::cout << "-- MPE average approximate accuracy: "
+ << avg_accuracy/(float)feature_repo.QueueSize()
+ << " utterances: " << feature_repo.QueueSize()
+ << std::endl;
+ std::cout << "T-fe: " << time_frontend << std::endl;
+ std::cout << "T-decode: " << time_decoder << std::endl;
+
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
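
The comment next to globerr_host.Scale(-outprb_scale) in TMpeCu.cc above compresses the MPE error signal into a single line. Written out as an equation (a restatement of that comment, not a derivation from the source), the derivative handed to Backpropagate for frame t and output s is

  \frac{\partial E}{\partial a_{t,s}} = \kappa\left(\gamma^{\mathrm{den}}_{t,s} - \gamma^{\mathrm{num}}_{t,s}\right) = -\kappa\,\gamma^{\mathrm{MPE}}_{t,s}

where \kappa is the acoustic scale (OUTPSCALE) and the \gamma terms are the lattice occupancies produced by GetMpeGamma; the code obtains the right-hand form by scaling the gamma matrix by -outprb_scale before the backward pass.
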
diff --git a/src/TNet.cc b/src/TNet.cc
new file mode 100644
index 0000000..7d60e63
--- /dev/null
+++ b/src/TNet.cc
@@ -0,0 +1,379 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-04-04 19:14:16 +0200 (Mon, 04 Apr 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 46 $"
+#define SVN_ID "$Id: TNet.cc 46 2011-04-04 17:14:16Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+/**
+ * \file TNet.cc
+ * \brief NNet training entry program, multi-core version
+ */
+
+/*** TNetLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "MlfStream.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** TNet includes */
+#include "Nnet.h"
+#include "ObjFun.h"
+#include "Platform.h"
+
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+#define SNAME "TNET"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+":TODO:\n\n"
+" Option Default\n\n"
+" -c Enable crossvalidation off\n"
+" -m file Set label map of NN outputs \n"
+" -n f Set learning rate to f 0.06\n"
+" -o ext Set target model ext None\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -I mlf Load master label file mlf \n"
+" -L dir Set input label (or net) dir Current\n"
+" -M dir Dir to write NN macro files Current\n"
+" -O fn Objective function [mse,xent] xent\n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+" -X ext Set input label file ext lab\n"
+"\n"
+"BUNCHSIZE CACHESIZE CONFUSIONMODE[no,max,soft,dmax,dsoft] CROSSVALIDATE FEATURETRANSFORM LEARNINGRATE LEARNRATEFACTORS MLFTRANSC MOMENTUM NATURALREADORDER OBJECTIVEFUNCTION[mse,xent] OUTPUTLABELMAP PRINTCONFIG PRINTVERSION RANDOMIZE SCRIPT SEED SOURCEMLF SOURCEMMF SOURCETRANSCDIR SOURCETRANSCEXT TARGETMMF TARGETMODELDIR TARGETMODELEXT TRACE WEIGHTCOST\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[])
+{
+ const char* p_option_string =
+ " -c n CROSSVALIDATE=TRUE"
+// " -d r SOURCEMODELDIR"
+ " -m r OUTPUTLABELMAP"
+ " -n r LEARNINGRATE"
+ " -o r TARGETMODELEXT"
+ " -p r PARALLELMODE"
+// " -r n REGULARISATION=TRUE" //add later
+// " -u r UPDATE" //add later, update only certain weights...
+// " -x r SOURCEMODELEXT"
+ " -B n SAVEBINARY=TRUE"
+ " -D n PRINTCONFIG=TRUE"
+// " -G r SOURCETRANSCFMT" //add if more transcription formats
+ " -H l SOURCEMMF"
+ " -I r SOURCEMLF"
+ " -L r SOURCETRANSCDIR"
+ " -M r TARGETMODELDIR"
+ " -O r OBJECTIVEFUNCTION"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ " -X r SOURCETRANSCEXT";
+
+
+ try {
+ UserInterface ui;
+ Platform pl;
+ Timer timer;
+
+
+ const char* p_script;
+ const char* p_output_label_map;
+ BaseFloat learning_rate;
+ BaseFloat weightcost;
+ ObjectiveFunction::ObjFunType obj_fun_id;
+ CrossEntropy::ConfusionMode xent_conf_mode;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+
+ const char* p_targetmmf; //< SNet legacy --TARGETMMF
+ char p_trg_mmf_file[4096];
+ const char* p_trg_mmf_dir;
+ const char* p_trg_mmf_ext;
+
+ const char* p_source_mlf_file;
+ const char* p_src_lbl_dir;
+ const char* p_src_lbl_ext;
+
+ int bunch_size;
+ int cache_size;
+ bool randomize;
+ long int seed;
+
+
+ int trace;
+ bool crossval;
+ int num_threads;
+
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM", NULL);
+
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);//< has higher priority than "dir/file.ext" composition (SNet legacy)
+ p_trg_mmf_dir = ui.GetStr(SNAME":TARGETMODELDIR", "");//< dir for composition
+ p_trg_mmf_ext = ui.GetStr(SNAME":TARGETMODELEXT", "");//< ext for composition
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_output_label_map = ui.GetStr(SNAME":OUTPUTLABELMAP", NULL);
+ learning_rate = ui.GetFlt(SNAME":LEARNINGRATE" , 0.06f);
+ weightcost = ui.GetFlt(SNAME":WEIGHTCOST" , 0.0);
+
+ obj_fun_id = static_cast<ObjectiveFunction::ObjFunType>(
+ ui.GetEnum(SNAME":OBJECTIVEFUNCTION",
+ ObjectiveFunction::CROSS_ENTROPY, //< default
+ "ent", ObjectiveFunction::CROSS_ENTROPY,
+ "mse", ObjectiveFunction::MEAN_SQUARE_ERROR
+ ));
+
+ xent_conf_mode = static_cast<CrossEntropy::ConfusionMode>(
+ ui.GetEnum(SNAME":CONFUSIONMODE",
+ CrossEntropy::NO_CONF, //< default
+ "no", CrossEntropy::NO_CONF,
+ "max", CrossEntropy::MAX_CONF,
+ "soft", CrossEntropy::SOFT_CONF,
+ "dmax", CrossEntropy::DIAG_MAX_CONF,
+ "dsoft", CrossEntropy::DIAG_SOFT_CONF
+ ));
+
+ p_source_mlf_file = ui.GetStr(SNAME":SOURCEMLF", NULL);
+ p_src_lbl_dir = ui.GetStr(SNAME":SOURCETRANSCDIR", NULL);
+ p_src_lbl_ext = ui.GetStr(SNAME":SOURCETRANSCEXT", "lab");
+
+ bunch_size = ui.GetInt(SNAME":BUNCHSIZE", 256);
+ cache_size = ui.GetInt(SNAME":CACHESIZE", 12800);
+ randomize = ui.GetBool(SNAME":RANDOMIZE", true);
+
+ //cannot get long int
+ seed = ui.GetInt(SNAME":SEED", 0);
+
+
+ //Fill the global variables of the singleton 'Gl'
+ trace = ui.GetInt(SNAME":TRACE", 0);
+ num_threads = ui.GetInt(SNAME":THREADS", 1);
+ crossval = ui.GetBool(SNAME":CROSSVALIDATE", false);
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TNET v"MODULE_VERSION" =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ pl.feature_.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+
+ //initialize the InputProxy
+ if(NULL == p_script)
+ Warning("WARNING: The script file is missing [-S]");
+ if(NULL == p_source_mlf_file)
+ Error("Source mlf file file is missing [-I]");
+ if(NULL == p_output_label_map)
+ Error("Output label map is missing [-m]");
+ // initialize the feature repository
+ if(trace&1) TraceLog("Initializing FeatureRepository");
+ pl.feature_.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ //open the scp file
+ pl.feature_.AddFileList(p_script);
+
+ // initialize the label repository
+ if(trace&1) TraceLog("Initializing LabelRepository");
+ pl.label_.Init(p_source_mlf_file,p_output_label_map, p_src_lbl_dir, p_src_lbl_ext);
+
+ // read input transform
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform:")+p_input_transform);
+ pl.nnet_transf_.ReadNetwork(p_input_transform);
+ }
+
+ // read network
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network:")+p_source_mmf_file);
+ pl.nnet_.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+ pl.nnet_.SetLearnRate(learning_rate);
+ pl.nnet_.SetWeightcost(weightcost);
+
+ //get objective function instance
+ pl.obj_fun_ = ObjectiveFunction::Factory(obj_fun_id);
+ //setup the cross entropy
+ if(obj_fun_id == ObjectiveFunction::CROSS_ENTROPY) {
+ CrossEntropy* xent = dynamic_cast<CrossEntropy*>(pl.obj_fun_);
+ //confusion mode
+ xent->SetConfusionMode(xent_conf_mode);
+ //pass the outputlabelmap
+ xent->SetOutputLabelMap(p_output_label_map);
+ }
+
+ //initialize the cache
+ pl.bunchsize_ = bunch_size;
+ pl.cachesize_ = cache_size;
+ pl.randomize_ = randomize;
+ //
+ pl.start_frm_ext_ = start_frm_ext;
+ pl.end_frm_ext_ = end_frm_ext;
+ pl.trace_ = trace;
+ pl.crossval_ = crossval;
+
+    //TODO do something with the seed!!!
+ pl.seed_ = seed;
+ //data_proxy.InitCache(cache_size, bunch_size, network, randomize, seed);
+
+ timer.Start();
+ std::cout << "===== TNET "
+ << (crossval?"CROSSVALIDATION":"TRAINING")
+ << " STARTED =====" << std::endl;
+ std::cout << "Objective function: "
+ << pl.obj_fun_->GetName() << std::endl;
+ if(!crossval) {
+ std::cout << "Learning rate: " << learning_rate << std::endl;
+ }
+
+
+ /*
+ * PERFORM ONE ITERATION OF THE TRAINING
+ */
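+    // one pass over the data; num_threads comes from the THREADS option
+    // (default 1) and presumably sets the number of worker threads the
+    // Platform object uses for this pass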
+ pl.RunTrain(num_threads);
+ /*
+ *
+ */
+
+
+ if(trace&1) TraceLog("Training finished");
+
+ //write the network
+ if(!crossval) {
+ if(trace&1) TraceLog("Writing network");
+ if (NULL != p_targetmmf) {
+ pl.nnet_.WriteNetwork(p_targetmmf);
+ } else {
+ MakeHtkFileName(p_trg_mmf_file, p_source_mmf_file, p_trg_mmf_dir, p_trg_mmf_ext);
+ pl.nnet_.WriteNetwork(p_trg_mmf_file);
+ }
+ }
+
+ //show report
+ timer.End();
+
+ pl.cout_mutex_.Lock();
+
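+  // throughput summary: FPS = processed frames per second of wall-clock time;
+  // RT is a real-time factor under the assumption of 100 frames per second of
+  // audio (10 ms frame shift), i.e. RT = processing time / audio duration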
+ std::cout << "===== TNET FINISHED ( " << timer.Val() << "s ) "
+ << "[ FPS: " << pl.obj_fun_->GetFrames() / timer.Val()
+ << " RT: " << 1.0f / (pl.obj_fun_->GetFrames() / timer.Val() / 100.0f)
+ << " ] =====" << std::endl;
+
+ //report objective function
+ std::cout << "-- " << (crossval?"CV ":"TR ")
+ << pl.obj_fun_->Report();
+
+ pl.cout_mutex_.Unlock();
+
+ }
+ catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+ }
+ return 0;
+}
diff --git a/src/TNetCu.cc b/src/TNetCu.cc
new file mode 100644
index 0000000..83c58cc
--- /dev/null
+++ b/src/TNetCu.cc
@@ -0,0 +1,508 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2012-03-23 14:22:49 +0100 (Fri, 23 Mar 2012) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 110 $"
+#define SVN_ID "$Id: TNetCu.cc 110 2012-03-23 13:22:49Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+/**
+ * \file TNetCu.cc
+ * \brief DNN training Entry Program CUDA-version
+ */
+
+
+/*** TNetLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Labels.h"
+#include "Common.h"
+#include "MlfStream.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** TNet includes */
+#include "cuObjectiveFunction.h"
+#include "cuNetwork.h"
+#include "cuCache.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TNET"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -c Enable crossvalidation off\n"
+" -m file Set label map of NN outputs \n"
+" -n f Set learning rate to f 0.06\n"
+" -o ext Set target model ext None\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -I mlf Load master label file mlf \n"
+" -L dir Set input label (or net) dir Current\n"
+" -M dir Dir to write NN macro files Current\n"
+" -O fn Objective function [mse,xent] xent\n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+" -X ext Set input label file ext lab\n"
+"\n"
+"BUNCHSIZE CACHESIZE CROSSVALIDATE FEATURETRANSFORM GPUSELECT GRADDIVFRM L1 LEARNINGRATE LEARNRATEFACTORS MLFTRANSC MOMENTUM NATURALREADORDER OBJECTIVEFUNCTION OUTPUTLABELMAP PRINTCONFIG PRINTVERSION RANDOMIZE SCRIPT SEED SOURCEMLF SOURCEMMF SOURCETRANSCDIR SOURCETRANSCEXT TARGETMMF TARGETMODELDIR TARGETMODELEXT TRACE WEIGHTCOST\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+/**
+ * \brief Main Procedure
+ *
+ * Handles parameter extraction and all other inputs.
+ * Entry point of the CUDA-based neural network training.
+ */
+int main(int argc, char *argv[]) try
+{
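+  // Maps command-line switches to the configuration variables shown in
+  // usage(); judging from the entries, switches typed 'n' take no argument and
+  // just force their variable to TRUE, while 'r'/'l' switches expect a value
+  // (this reading of the STK type codes is an assumption, not documented here).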
+ const char* p_option_string =
+ " -c n CROSSVALIDATE=TRUE"
+ " -m r OUTPUTLABELMAP"
+ " -n r LEARNINGRATE"
+ " -o r TARGETMODELEXT"
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -I r SOURCEMLF"
+ " -L r SOURCETRANSCDIR"
+ " -M r TARGETMODELDIR"
+ " -O r OBJECTIVEFUNCTION"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ " -X r SOURCETRANSCEXT";
+
+
+ UserInterface ui;
+  /** \brief Features specified as program arguments and in the scp script */
+ FeatureRepository feature_repo;
+ /** \brief Label file*/
+ LabelRepository label_repo;
+ /** \brief DNN network*/
+ CuNetwork network;
+ /** \brief Transform network*/
+ CuNetwork transform_network;
+ /** \brief Objective Function*/
+ CuObjectiveFunction* p_obj_function = NULL;
+ Timer timer;
+ Timer timer_frontend;
+ double time_frontend = 0.0;
+
+
+ const char* p_script;
+ const char* p_output_label_map;
+ BaseFloat learning_rate;
+ const char* learning_rate_factors;
+ BaseFloat momentum;
+ BaseFloat weightcost;
+ BaseFloat l1;
+ bool grad_div_frm;
+ CuObjectiveFunction::ObjFunType obj_fun_id;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+ //const char* p_input_transform2;
+
+ const char* p_targetmmf; ///< SNet legacy --TARGETMMF
+ char p_trg_mmf_file[4096];
+ const char* p_trg_mmf_dir;
+ const char* p_trg_mmf_ext;
+
+ const char* p_source_mlf_file;
+ const char* p_src_lbl_dir;
+ const char* p_src_lbl_ext;
+ char p_lbl_file[4096];
+ bool mlf_transc;
+
+ int bunch_size;
+ int cache_size;
+ bool randomize;
+ long int seed;
+
+ bool cross_validate;
+
+ int trace;
+
+ int gpu_select;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ /// OPTION PARSING ........ use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ /// OPTION RETRIEVAL ........ extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ /// extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM", NULL);
+
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);///< has higher priority than "dir/file.ext" composition (SNet legacy)
+ p_trg_mmf_dir = ui.GetStr(SNAME":TARGETMODELDIR", "");///< dir for composition
+ p_trg_mmf_ext = ui.GetStr(SNAME":TARGETMODELEXT", "");///< ext for composition
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_output_label_map = ui.GetStr(SNAME":OUTPUTLABELMAP", NULL);
+ learning_rate = ui.GetFlt(SNAME":LEARNINGRATE" , 0.06f);
+ learning_rate_factors = ui.GetStr(SNAME":LEARNRATEFACTORS", NULL);
+ momentum = ui.GetFlt(SNAME":MOMENTUM" , 0.0);
+ weightcost = ui.GetFlt(SNAME":WEIGHTCOST" , 0.0);
+ l1 = ui.GetFlt(SNAME":L1" , 0.0);
+ grad_div_frm = ui.GetBool(SNAME":GRADDIVFRM", true);
+
+ obj_fun_id = static_cast<CuObjectiveFunction::ObjFunType>(
+ ui.GetEnum(SNAME":OBJECTIVEFUNCTION",
+ CuObjectiveFunction::CROSS_ENTROPY, //< default
+ "xent", CuObjectiveFunction::CROSS_ENTROPY,
+ "mse", CuObjectiveFunction::MEAN_SQUARE_ERROR
+ ));
+
+ p_source_mlf_file = ui.GetStr(SNAME":SOURCEMLF", NULL);
+ p_src_lbl_dir = ui.GetStr(SNAME":SOURCETRANSCDIR", NULL);
+ p_src_lbl_ext = ui.GetStr(SNAME":SOURCETRANSCEXT", "lab");
+ mlf_transc = ui.GetBool(SNAME":MLFTRANSC", true);
+
+
+
+ bunch_size = ui.GetInt(SNAME":BUNCHSIZE", 256);
+ cache_size = ui.GetInt(SNAME":CACHESIZE", 12800);
+ randomize = ui.GetBool(SNAME":RANDOMIZE", true);
+
+ //cannot get long int
+ seed = ui.GetInt(SNAME":SEED", 0);
+
+ cross_validate = ui.GetBool(SNAME":CROSSVALIDATE", false);
+
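+  // TRACE is used as a bit-mask below: 1 enables the high-level TraceLog
+  // messages, 2 prints a progress dot per bunch, 4 turns on verbose CuDevice
+  // output and the frontend-time profile printed at the end.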
+ trace = ui.GetInt(SNAME":TRACE", 0);
+ if(trace&4) { CuDevice::Instantiate().Verbose(true); }
+
+ gpu_select = ui.GetInt(SNAME":GPUSELECT", -1);
+ if(gpu_select >= 0) { CuDevice::Instantiate().SelectGPU(gpu_select); }
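+  // GPUSELECT >= 0 pins training to that GPU id; with the default of -1 the
+  // SelectGPU call is skipped and device selection is left to CuDevice
+  // (presumably its default/automatic choice).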
+
+
+
+ /// process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TNET v"MODULE_VERSION" =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ /// the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ feature_repo.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ /// OPTION PARSING DONE .....................................................
+
+
+ /// read the input transform network from file p_input_transform
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform network: ")+p_input_transform);
+ transform_network.ReadNetwork(p_input_transform);
+ }
+
+
+ /// read the neural network from file p_source_mmf_file
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+
+ /// initialize the feature repository
+ feature_repo.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ feature_repo.Trace(trace);
+ if(NULL != p_script) {
+ feature_repo.AddFileList(p_script);
+ } else {
+ Warning("WARNING: The script file is missing [-S]");
+ }
+
+
+ /// initialize the label repository
+ if(mlf_transc) {
+ if(NULL == p_source_mlf_file)
+ Error("Source mlf file file is missing [-I]");
+ if(NULL == p_output_label_map)
+ Error("Output label map is missing [-m]");
+
+ if(trace&1) TraceLog(std::string("Indexing labels: ")+p_source_mlf_file);
+ label_repo.Init(p_source_mlf_file, p_output_label_map, p_src_lbl_dir, p_src_lbl_ext);
+ label_repo.Trace(trace);
+ }
+
+
+ /// get objective function instance
+ p_obj_function = CuObjectiveFunction::Factory(obj_fun_id);
+
+ /// set the learnrate, momentum, weightcost
+ network.SetLearnRate(learning_rate, learning_rate_factors);
+ network.SetMomentum(momentum);
+ network.SetWeightcost(weightcost);
+ network.SetL1(l1);
+
+  /// set division of the gradient by the number of frames (grad_div_frm);
+  /// this keeps the size of the update independent of the bunch size
+ network.SetGradDivFrm(grad_div_frm);
+
+ /// seed the random number generator
+ if(seed == 0) {
+ struct timeval tv;
+ if (gettimeofday(&tv, 0) == -1) {
+ assert(0 && "gettimeofday does not work.");
+ exit(-1);
+ }
+ seed = (int)(tv.tv_sec) + (int)tv.tv_usec;
+ }
+ srand48(seed);
+
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ /// INITIALIZATION DONE .................................................
+ //
+ /// Start training
+ timer.Start();
+ std::cout << "===== TNET "
+ << (cross_validate?"CROSSVALIDATION":"TRAINING")
+ << " STARTED =====" << std::endl;
+ std::cout << "Objective function: "
+ << p_obj_function->GetTypeLabel() << std::endl;
+ if(!cross_validate) {
+ network.PrintLearnRate();
+ std::cout << "momentum: " << momentum
+ << " weightcost: " << weightcost << std::endl;
+ std::cout << "using seed: " << seed << std::endl;
+ }
+
+ /// make the cachesize divisible by bunchsize
+ cache_size = (cache_size/bunch_size)*bunch_size;
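+  // integer arithmetic rounds the cache down to a multiple of the bunch size,
+  // e.g. the defaults 12800 and 256 are already compatible (12800 = 50*256),
+  // while a CACHESIZE of 13000 would be truncated to 12800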
+ std::cout << "Bunchsize:" << bunch_size
+ << " Cachesize:" << cache_size << "\n";
+
+ CuCache cache;
+ cache.Init(cache_size,bunch_size);
+ cache.Trace(trace);
+ feature_repo.Rewind();
+
+ //**********************************************************************
+ //**********************************************************************
+ /// MAIN LOOP start
+ /**
+ * Main loop
+   * - Fill the cache from feature_repo
+   *  - Read the features, apply the transform, trim the context
+   *  - Read the labels (from the label repository or an HTK-matrix file)
+   *  .
+   * - Randomize the cache (apparently the only place randomization is used)
+   * - Train while the cache is not empty
+   *  - Get a bunch of training data from the cache
+   *  - Evaluate the error with the objective function
+   *  - Backpropagate (only when not cross-validating)
+ * .
+ * .
+ */
+ CuMatrix<BaseFloat> feats, output, labs, globerr;
+ while(!feature_repo.EndOfList()) {
+ timer_frontend.Start();
+ //fill cache
+ while(!cache.Full() && !feature_repo.EndOfList()) {
+ Matrix<BaseFloat> feats_host;
+ CuMatrix<BaseFloat> feats_original;
+ CuMatrix<BaseFloat> feats_expanded;
+
+      //read feats, perform the feature transform
+ feature_repo.ReadFullMatrix(feats_host);
+ feats_host.CheckData(feature_repo.Current().Logical());
+ feats_original.CopyFrom(feats_host);
+ transform_network.Propagate(feats_original,feats_expanded);
+
+ //trim the start/end context
+ int rows = feats_expanded.Rows()-start_frm_ext-end_frm_ext;
+ CuMatrix<BaseFloat> feats_trim(rows,feats_expanded.Cols());
+ feats_trim.CopyRows(rows,start_frm_ext,feats_expanded,0);
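+      // illustrative example (values are not defaults): with start_frm_ext =
+      // end_frm_ext = 5, a 110-frame expanded matrix keeps its 100 centre rows
+      // (rows 5..104), so targets align with frames that have full context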
+
+ //read labels
+ Matrix<BaseFloat> labs_host; CuMatrix<BaseFloat> labs_cu;
+ if(mlf_transc) {
+ //read from label repository
+ label_repo.GenDesiredMatrix(labs_host,feats_trim.Rows(),
+ feature_repo.CurrentHeader().mSamplePeriod,
+ feature_repo.Current().Logical().c_str());
+ } else {
+ //read targets from HTK-matrix file
+ MakeHtkFileName(p_lbl_file,feature_repo.Current().Logical().c_str(),
+ p_src_lbl_dir, p_src_lbl_ext);
+ labs_host.LoadHTK(p_lbl_file);
+ }
+ labs_cu.CopyFrom(labs_host);
+ //test number of rows
+ if(labs_cu.Rows() != feats_trim.Rows()) {
+ Error(std::string("Nonmatching number number of input/target examples")
+ + feature_repo.Current().Logical().c_str());
+ }
+
+ //add to cache
+ cache.AddData(feats_trim,labs_cu);
+
+ feature_repo.MoveNext();
+ }
+ timer_frontend.End(); time_frontend += timer_frontend.Val();
+
+ if(randomize) {
+ //randomize the cache
+ cache.Randomize();
+ }
+
+ while(!cache.Empty()) {
+ //get training data
+ cache.GetBunch(feats,labs);
+
+ //forward pass
+ network.Propagate(feats,output);
+ //accumulate error, get global err
+ p_obj_function->Evaluate(output,labs,globerr);
+
+ //backward pass
+ if(!cross_validate) {
+ network.Backpropagate(globerr);
+ }
+ if(trace&2) std::cout << "." << std::flush;
+ }
+ }
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ /// TRAINING FINISHED .................................................
+ //
+ /// Let's store the network, report the log
+
+
+ if(trace&1) TraceLog("Training finished");
+
+ //write the network
+ if(!cross_validate) {
+ if (NULL != p_targetmmf) {
+ if(trace&1) TraceLog(std::string("Writing network: ")+p_targetmmf);
+ network.WriteNetwork(p_targetmmf);
+ } else {
+ MakeHtkFileName(p_trg_mmf_file, p_source_mmf_file, p_trg_mmf_dir, p_trg_mmf_ext);
+ if(trace&1) TraceLog(std::string("Writing network: ")+p_trg_mmf_file);
+ network.WriteNetwork(p_trg_mmf_file);
+ }
+ }
+
+ timer.End();
+ std::cout << "===== TNET "
+ << (cross_validate?"CROSSVALIDATION":"TRAINING")
+ << " FINISHED ( " << timer.Val() << "s ) "
+ << "[FPS:" << p_obj_function->GetFrames() / timer.Val()
+ << ",RT:" << 1.0f / (p_obj_function->GetFrames() / timer.Val() / 100.0f)
+ << "] =====" << std::endl;
+
+ //report objective function (accuracy, frame counts...)
+ std::cout << "-- " << (cross_validate?"CV ":"TR ") << p_obj_function->Report();
+
+ if(trace &4) {
+ std::cout << "\n== PROFILE ==\nT-fe: " << time_frontend << std::endl;
+ }
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
diff --git a/src/TNetLib/.depend.mk b/src/TNetLib/.depend.mk
new file mode 100644
index 0000000..dd073f6
--- /dev/null
+++ b/src/TNetLib/.depend.mk
@@ -0,0 +1,946 @@
+Activation.o: Activation.cc Activation.h Component.h ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h
+Barrier.o: Barrier.cc /usr/include/pthread.h /usr/include/features.h \
+ /usr/include/bits/predefs.h /usr/include/sys/cdefs.h \
+ /usr/include/bits/wordsize.h /usr/include/gnu/stubs.h \
+ /usr/include/gnu/stubs-32.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/sched.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/xlocale.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/bits/setjmp.h \
+ ../KaldiLib/Error.h /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ Barrier.h
+BiasedLinearity.o: BiasedLinearity.cc BiasedLinearity.h Component.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h
+BlockArray.o: BlockArray.cc BlockArray.h Component.h ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h Nnet.h BiasedLinearity.h SharedLinearity.h \
+ Activation.h
+Cache.o: Cache.cc /usr/include/sys/time.h /usr/include/features.h \
+ /usr/include/bits/predefs.h /usr/include/sys/cdefs.h \
+ /usr/include/bits/wordsize.h /usr/include/gnu/stubs.h \
+ /usr/include/gnu/stubs-32.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/bits/time.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h Cache.h ../KaldiLib/Matrix.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/string /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/bits/sstream.tcc ../KaldiLib/MathAux.h \
+ /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h ../KaldiLib/Vector.h
+Mutex.o: Mutex.cc /usr/include/pthread.h /usr/include/features.h \
+ /usr/include/bits/predefs.h /usr/include/sys/cdefs.h \
+ /usr/include/bits/wordsize.h /usr/include/gnu/stubs.h \
+ /usr/include/gnu/stubs-32.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/sched.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/xlocale.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/bits/setjmp.h \
+ /usr/include/c++/4.6/cerrno /usr/include/errno.h \
+ /usr/include/bits/errno.h /usr/include/linux/errno.h \
+ /usr/include/asm/errno.h /usr/include/asm-generic/errno.h \
+ /usr/include/asm-generic/errno-base.h ../KaldiLib/Error.h \
+ /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/iosfwd /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/locale_classes.h /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/sys/types.h /usr/include/sys/select.h \
+ /usr/include/bits/select.h /usr/include/bits/sigset.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/alloca.h /usr/include/bits/stdlib.h /usr/include/execinfo.h \
+ Mutex.h
+Nnet.o: Nnet.cc /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/cctype \
+ /usr/include/ctype.h Nnet.h Component.h ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/string /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/postypes.h /usr/include/c++/4.6/cwchar \
+ /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h BiasedLinearity.h SharedLinearity.h Activation.h \
+ CRBEDctFeat.h BlockArray.h
+ObjFun.o: ObjFun.cc ObjFun.h /usr/include/c++/4.6/cassert \
+ /usr/include/assert.h /usr/include/features.h \
+ /usr/include/bits/predefs.h /usr/include/sys/cdefs.h \
+ /usr/include/bits/wordsize.h /usr/include/gnu/stubs.h \
+ /usr/include/gnu/stubs-32.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/include/c++/4.6/cmath /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Matrix.h /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/stdlib.h /usr/include/bits/waitflags.h \
+ /usr/include/bits/waitstatus.h /usr/include/endian.h \
+ /usr/include/bits/endian.h /usr/include/bits/byteswap.h \
+ /usr/include/xlocale.h /usr/include/sys/types.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/include/time.h /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/string.h \
+ /usr/include/bits/string3.h /usr/include/c++/4.6/sstream \
+ /usr/include/c++/4.6/bits/sstream.tcc ../KaldiLib/MathAux.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ /usr/include/c++/4.6/cstddef ../KaldiLib/Vector.tcc \
+ /usr/include/c++/4.6/cstring ../KaldiLib/Matrix.h ../KaldiLib/Vector.h \
+ ../KaldiLib/Error.h
+Semaphore.o: Semaphore.cc Semaphore.h /usr/include/pthread.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/sched.h \
+ /usr/include/bits/types.h /usr/include/bits/typesizes.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h /usr/include/time.h \
+ /usr/include/bits/sched.h /usr/include/bits/time.h \
+ /usr/include/bits/timex.h /usr/include/xlocale.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/bits/setjmp.h
+SharedLinearity.o: SharedLinearity.cc SharedLinearity.h Component.h \
+ ../KaldiLib/Vector.h /usr/include/c++/4.6/cstddef \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++config.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/os_defines.h \
+ /usr/include/features.h /usr/include/bits/predefs.h \
+ /usr/include/sys/cdefs.h /usr/include/bits/wordsize.h \
+ /usr/include/gnu/stubs.h /usr/include/gnu/stubs-32.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/cpu_defines.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stddef.h \
+ /usr/include/c++/4.6/cstdlib /usr/include/stdlib.h \
+ /usr/include/bits/waitflags.h /usr/include/bits/waitstatus.h \
+ /usr/include/endian.h /usr/include/bits/endian.h \
+ /usr/include/bits/byteswap.h /usr/include/xlocale.h \
+ /usr/include/sys/types.h /usr/include/bits/types.h \
+ /usr/include/bits/typesizes.h /usr/include/time.h \
+ /usr/include/sys/select.h /usr/include/bits/select.h \
+ /usr/include/bits/sigset.h /usr/include/bits/time.h \
+ /usr/include/bits/select2.h /usr/include/sys/sysmacros.h \
+ /usr/include/bits/pthreadtypes.h /usr/include/alloca.h \
+ /usr/include/bits/stdlib.h /usr/include/c++/4.6/stdexcept \
+ /usr/include/c++/4.6/exception /usr/include/c++/4.6/string \
+ /usr/include/c++/4.6/bits/stringfwd.h \
+ /usr/include/c++/4.6/bits/char_traits.h \
+ /usr/include/c++/4.6/bits/stl_algobase.h \
+ /usr/include/c++/4.6/bits/functexcept.h \
+ /usr/include/c++/4.6/bits/exception_defines.h \
+ /usr/include/c++/4.6/bits/cpp_type_traits.h \
+ /usr/include/c++/4.6/ext/type_traits.h \
+ /usr/include/c++/4.6/ext/numeric_traits.h \
+ /usr/include/c++/4.6/bits/stl_pair.h /usr/include/c++/4.6/bits/move.h \
+ /usr/include/c++/4.6/bits/concept_check.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_types.h \
+ /usr/include/c++/4.6/bits/stl_iterator_base_funcs.h \
+ /usr/include/c++/4.6/bits/stl_iterator.h \
+ /usr/include/c++/4.6/debug/debug.h /usr/include/c++/4.6/bits/postypes.h \
+ /usr/include/c++/4.6/cwchar /usr/include/wchar.h /usr/include/stdio.h \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/stdarg.h \
+ /usr/include/bits/wchar.h /usr/include/bits/wchar2.h \
+ /usr/include/c++/4.6/bits/allocator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++allocator.h \
+ /usr/include/c++/4.6/ext/new_allocator.h /usr/include/c++/4.6/new \
+ /usr/include/c++/4.6/bits/localefwd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++locale.h \
+ /usr/include/c++/4.6/clocale /usr/include/locale.h \
+ /usr/include/bits/locale.h /usr/include/c++/4.6/iosfwd \
+ /usr/include/c++/4.6/cctype /usr/include/ctype.h \
+ /usr/include/c++/4.6/bits/ostream_insert.h \
+ /usr/include/c++/4.6/bits/cxxabi_forced.h \
+ /usr/include/c++/4.6/bits/stl_function.h \
+ /usr/include/c++/4.6/backward/binders.h \
+ /usr/include/c++/4.6/bits/range_access.h \
+ /usr/include/c++/4.6/bits/basic_string.h \
+ /usr/include/c++/4.6/ext/atomicity.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/gthr-default.h \
+ /usr/include/pthread.h /usr/include/sched.h /usr/include/bits/sched.h \
+ /usr/include/bits/timex.h /usr/include/bits/setjmp.h \
+ /usr/include/unistd.h /usr/include/bits/posix_opt.h \
+ /usr/include/bits/environments.h /usr/include/bits/confname.h \
+ /usr/include/getopt.h /usr/include/bits/unistd.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/atomic_word.h \
+ /usr/include/c++/4.6/initializer_list \
+ /usr/include/c++/4.6/bits/basic_string.tcc /usr/include/c++/4.6/iostream \
+ /usr/include/c++/4.6/ostream /usr/include/c++/4.6/ios \
+ /usr/include/c++/4.6/bits/ios_base.h \
+ /usr/include/c++/4.6/bits/locale_classes.h \
+ /usr/include/c++/4.6/bits/locale_classes.tcc \
+ /usr/include/c++/4.6/streambuf /usr/include/c++/4.6/bits/streambuf.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.h \
+ /usr/include/c++/4.6/bits/locale_facets.h /usr/include/c++/4.6/cwctype \
+ /usr/include/wctype.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_base.h \
+ /usr/include/c++/4.6/bits/streambuf_iterator.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/ctype_inline.h \
+ /usr/include/c++/4.6/bits/locale_facets.tcc \
+ /usr/include/c++/4.6/bits/basic_ios.tcc \
+ /usr/include/c++/4.6/bits/ostream.tcc /usr/include/c++/4.6/istream \
+ /usr/include/c++/4.6/bits/istream.tcc ../KaldiLib/cblas.h \
+ ../KaldiLib/clapack.h ../KaldiLib/cblas.h ../KaldiLib/Common.h \
+ /usr/include/string.h /usr/include/bits/string3.h \
+ /usr/include/c++/4.6/sstream /usr/include/c++/4.6/bits/sstream.tcc \
+ ../KaldiLib/MathAux.h /usr/include/c++/4.6/cmath /usr/include/math.h \
+ /usr/include/bits/huge_val.h /usr/include/bits/huge_valf.h \
+ /usr/include/bits/huge_vall.h /usr/include/bits/inf.h \
+ /usr/include/bits/nan.h /usr/include/bits/mathdef.h \
+ /usr/include/bits/mathcalls.h /usr/include/bits/mathinline.h \
+ ../KaldiLib/Types.h ../KaldiLib/Error.h /usr/include/execinfo.h \
+ ../KaldiLib/Vector.tcc /usr/include/c++/4.6/cstring \
+ /usr/include/c++/4.6/fstream /usr/include/c++/4.6/bits/codecvt.h \
+ /usr/include/c++/4.6/cstdio /usr/include/libio.h \
+ /usr/include/_G_config.h /usr/include/bits/stdio_lim.h \
+ /usr/include/bits/sys_errlist.h /usr/include/bits/stdio.h \
+ /usr/include/bits/stdio2.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/basic_file.h \
+ /usr/include/c++/4.6/x86_64-linux-gnu/32/bits/c++io.h \
+ /usr/include/c++/4.6/bits/fstream.tcc /usr/include/c++/4.6/iomanip \
+ ../KaldiLib/Matrix.h ../KaldiLib/Matrix.tcc /usr/include/c++/4.6/cfloat \
+ /usr/lib/gcc/x86_64-linux-gnu/4.6/include/float.h \
+ /usr/include/c++/4.6/typeinfo /usr/include/c++/4.6/algorithm \
+ /usr/include/c++/4.6/utility /usr/include/c++/4.6/bits/stl_relops.h \
+ /usr/include/c++/4.6/bits/stl_algo.h \
+ /usr/include/c++/4.6/bits/algorithmfwd.h \
+ /usr/include/c++/4.6/bits/stl_heap.h \
+ /usr/include/c++/4.6/bits/stl_tempbuf.h \
+ /usr/include/c++/4.6/bits/stl_construct.h /usr/include/c++/4.6/limits \
+ /usr/include/c++/4.6/vector \
+ /usr/include/c++/4.6/bits/stl_uninitialized.h \
+ /usr/include/c++/4.6/bits/stl_vector.h \
+ /usr/include/c++/4.6/bits/stl_bvector.h \
+ /usr/include/c++/4.6/bits/vector.tcc ../KaldiLib/Vector.h \
+ ../KaldiLib/Matrix.h
diff --git a/src/TNetLib/.svn/entries b/src/TNetLib/.svn/entries
new file mode 100644
index 0000000..ed607ae
--- /dev/null
+++ b/src/TNetLib/.svn/entries
@@ -0,0 +1,878 @@
+10
+
+dir
+117
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet/trunk/src/TNetLib
+svn+ssh://merlin.fit.vutbr.cz/svn/TNet
+
+
+
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+bda6da93-004a-4ae9-8e07-715c10848801
+
+BlockArray.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+ec62c49112c4f3bd3e61e13d742b1397
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1521
+
+Activation.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+8a83e5928a8ee9d2669aaab03140a700
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3322
+
+Mutex.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+1a348d0aabada64e969a5ea81762eb9f
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+801
+
+Cache.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+d970ce69139e8ae23c4d9adcd863ea0c
+2011-12-22T15:49:51.623339Z
+96
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6601
+
+Activation.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+12d3b3c4b4b0224cc73d0b8e623d91fc
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2326
+
+Nnet.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+4802785011f458bcec1f1a3470a07e42
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+9013
+
+Mutex.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+254e1c11aaf2a523b7e38cc45d96d6f7
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+564
+
+Component.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+616cb6be98fa3dcdd2d8183ba890b22c
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+8904
+
+Cache.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+266d0fff2fdd76091947532e70cc75a3
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2012
+
+Nnet.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+9f5e67cda2fd05d98c21d015b4d57a1e
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+4671
+
+CRBEDctFeat.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+c2be5cbeeb5fff014e057de0f6a93369
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+10944
+
+BiasedLinearity.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+8a2cf2fc9b1b9b27f5357782b69f04ee
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3927
+
+Thread.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+8430b7172bec8551de3f94814050c23c
+2012-01-28T20:04:40.717883Z
+101
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1089
+
+BiasedLinearity.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+5780ba3194b5d3dcc7cf7dc2c73d4d31
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2345
+
+Semaphore.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+a1d5c7d673835fd44c2fb268fce1526d
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+966
+
+ObjFun.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+0f12777ac8f37a36cdeb296d0390c6c5
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+6387
+
+ObjFun.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+972e58ba9310592ad6b0e6da7925eebd
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3680
+
+Semaphore.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+319edc999ff2279c8cc225f8e1d666cb
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+379
+
+Barrier.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+83b98a4717f4ed3262c5a046892b78dd
+2011-10-03T11:00:38.812084Z
+78
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3910
+
+Platform.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+d1f1b4fb0f8061ec4174b0fd51906f80
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+9235
+
+SharedLinearity.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+60da58805959d192b64349ce192f3d2e
+2012-03-23T13:22:49.912359Z
+110
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+7833
+
+Barrier.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+49ffe12bf135c9391de4f76e11cf98b7
+2011-10-03T11:00:38.812084Z
+78
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1065
+
+Makefile
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+ef89868dc4e4a9f796f091cbc15057b2
+2011-09-26T13:54:40.854717Z
+71
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+384
+
+BlockArray.cc
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+9d9d3d057f5915898d87ad57e448e00f
+2012-02-07T17:50:53.635354Z
+103
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+3199
+
+SharedLinearity.h
+file
+
+
+
+
+2012-04-02T13:49:14.000000Z
+38bf25a1a2e051d9ac2445e25a39152f
+2011-10-11T11:00:50.704096Z
+81
+iveselyk
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+2287
+
diff --git a/src/TNetLib/.svn/text-base/Activation.cc.svn-base b/src/TNetLib/.svn/text-base/Activation.cc.svn-base
new file mode 100644
index 0000000..8e84190
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Activation.cc.svn-base
@@ -0,0 +1,138 @@
+
+#include "Activation.h"
+
+
+namespace TNet {
+
+void Sigmoid::PropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //Y = 1/(1+e^{-X})
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) = 1.0f/(1.0f+exp(-X(r,c)));
+ }
+ }
+}
+
+
+void Sigmoid::BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y) {
+ const Matrix<BaseFloat>& out = GetOutput();
+  //Y = OUT*(1-OUT)*X //derived (derivative of the sigmoid w.r.t. its input)
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) = X(r,c)*out(r,c)*(1.0f-out(r,c));
+ }
+ }
+}
+
+
+
+void Softmax::PropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //Y_j = e^X_j / sum_i(e^X_i)
+ //
+  //   e^(X_j+c) / sum_i(e^(X_i+c))
+  // = e^c.e^X_j / e^c.sum_i(e^X_i)
+  // = e^X_j / sum_i(e^X_i)
+ //
+ size_t rows = X.Rows();
+ for(size_t i=0; i<rows; i++) {
+ BfSubVector y_i(Y[i]); //<< y_i gets pointer to i'th row of matrix Y
+ y_i.Copy(X[i]);
+ BaseFloat max = y_i.Max();
+ y_i.Subtract(max);
+ y_i.ApplyExp();
+ BaseFloat sum = y_i.Sum();
+ y_i.Scale(1.0f/sum);
+ }
+}
+
+
+void Softmax::BackpropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //simply copy the error...,
+ Y.Copy(X);
+}
+
+
+void BlockSoftmax::ReadFromStream(std::istream& rIn) {
+ rIn >> mDim;
+ mDimOffset.Init(mDim.Dim()+1);
+
+ int off=0;
+ for(int i=0; i<mDim.Dim(); i++) {
+ mDimOffset[i]=off;
+ off += mDim[i];
+ }
+ mDimOffset[mDim.Dim()]=off;
+
+ if(off!=GetNOutputs()) {
+ KALDI_ERR << "Non-matching dimension of sum of softmaxes,"
+ << " the sum:" << off
+ << " GetNOutputs:" << GetNOutputs();
+ }
+}
+
+void BlockSoftmax::WriteToStream(std::ostream& rOut) {
+ rOut << mDim;
+}
+
+
+
+
+void BlockSoftmax::PropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //Y_j = e^X_j / sum_i(e^X_i)
+ //
+  //   e^(X_j+c) / sum_i(e^(X_i+c))
+  // = e^c.e^X_j / e^c.sum_i(e^X_i)
+  // = e^X_j / sum_i(e^X_i)
+ //
+ size_t rows = X.Rows();
+ for(size_t i=0; i<rows; i++) {
+ BfSubVector y_i(Y[i]); //<< y_i gets pointer to i'th row of matrix Y
+ y_i.Copy(X[i]);
+ //BaseFloat max = y_i.Max();
+ //y_i.Subtract(max);
+ //y_i.ApplyExp();
+ //normalize separately on each softmax interval...
+ for(int j=0; j<mDim.Dim(); j++) {
+ BfSubVector y_i_smx_j(y_i.Range(mDimOffset[j],mDim[j]));
+ BaseFloat max = y_i_smx_j.Max();
+ y_i_smx_j.Subtract(max);
+ y_i_smx_j.ApplyExp();
+ BaseFloat sum = y_i_smx_j.Sum();
+ y_i_smx_j.Scale(1.0f/sum);
+ }
+ }
+
+// X.CheckData("BlockSoftmax PropagateFnc X");
+// Y.CheckData("BlockSoftmax PropagateFnc Y");
+}
+
+
+void BlockSoftmax::BackpropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //set the output to zero
+ Y.Zero();
+ //copy only parts of the error
+ //from softmax intervals which sum up to 0.0, not 1.0
+ for(int i=0; i<X.Rows(); i++) {
+ for(int j=0; j<mDim.Dim(); j++) {
+ const BfSubVector x_i_smx_j(X[i].Range(mDimOffset[j],mDim[j]));
+ BaseFloat sum = x_i_smx_j.Sum();
+ if(sum > -0.1 && sum < 0.1) {
+ BfSubVector y_i_smx_j(Y[i].Range(mDimOffset[j],mDim[j]));
+ y_i_smx_j.Copy(x_i_smx_j);
+ } else if (sum > 0.9 && sum < 1.1) {
+ ; //do nothing
+ } else {
+ KALDI_ERR << "Invalid sum: " << sum;
+ }
+ }
+ }
+
+// X.CheckData("BlockSoftmax BackpropagateFnc X");
+// Y.CheckData("BlockSoftmax BackpropagateFnc Y");
+
+}
+
+
+
+} //namespace TNet
+
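Side note on the row normalization in Softmax::PropagateFnc above: subtracting the row maximum before exponentiating is the usual numerically stable softmax, and the derivation comment shows why the shift cancels out. A minimal standalone sketch of the same computation on a plain std::vector, independent of the TNet matrix classes (SoftmaxRow is an illustrative name, not part of TNet):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax over one row, mirroring Softmax::PropagateFnc:
// y_j = exp(x_j - max_i x_i) / sum_k exp(x_k - max_i x_i)
std::vector<float> SoftmaxRow(const std::vector<float>& x) {
  const float max = *std::max_element(x.begin(), x.end());
  std::vector<float> y(x.size());
  float sum = 0.0f;
  for (size_t j = 0; j < x.size(); ++j) {
    y[j] = std::exp(x[j] - max);   // shifting by max avoids overflow in exp()
    sum += y[j];
  }
  for (size_t j = 0; j < x.size(); ++j) y[j] /= sum;  // row now sums to 1
  return y;
}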
diff --git a/src/TNetLib/.svn/text-base/Activation.h.svn-base b/src/TNetLib/.svn/text-base/Activation.h.svn-base
new file mode 100644
index 0000000..90263d0
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Activation.h.svn-base
@@ -0,0 +1,104 @@
+
+#ifndef _ACT_FUN_I_
+#define _ACT_FUN_I_
+
+
+#include "Component.h"
+
+
+namespace TNet
+{
+
+ /**
+ * Sigmoid activation function
+ */
+ class Sigmoid : public Component
+ {
+ public:
+ Sigmoid(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ComponentType GetType() const
+ { return SIGMOID; }
+
+ const char* GetName() const
+ { return "<sigmoid>"; }
+
+ Component* Clone() const
+ { return new Sigmoid(GetNInputs(),GetNOutputs(),NULL); }
+
+ protected:
+ void PropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ void BackpropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ };
+
+
+ /**
+ * Softmax activation function
+ */
+ class Softmax : public Component
+ {
+ public:
+ Softmax(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ComponentType GetType() const
+ { return SOFTMAX; }
+
+ const char* GetName() const
+ { return "<softmax>"; }
+
+ Component* Clone() const
+ { return new Softmax(GetNInputs(),GetNOutputs(),NULL); }
+
+ protected:
+ void PropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ void BackpropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ };
+
+
+ /**
+ * BlockSoftmax activation function.
+   * It is several softmaxes in one component.
+   * The dimensions of the softmaxes are given by an integer vector.
+   * During backpropagation:
+   * If the derivatives sum up to 0, they are backpropagated.
+   * If the derivatives sum up to 1, they are discarded
+ * (like this we know that the softmax was 'inactive').
+ */
+ class BlockSoftmax : public Component
+ {
+ public:
+ BlockSoftmax(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ComponentType GetType() const
+ { return BLOCK_SOFTMAX; }
+
+ const char* GetName() const
+ { return "<blocksoftmax>"; }
+
+ Component* Clone() const
+ { return new BlockSoftmax(*this); }
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ void PropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ void BackpropagateFnc(const BfMatrix& X, BfMatrix& Y);
+
+ private:
+ Vector<int> mDim;
+ Vector<int> mDimOffset;
+ };
+
+
+
+} //namespace
+
+
+#endif
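The BlockSoftmax backpropagation rule documented above (keep a block's error derivatives when they sum to roughly 0, drop them when they sum to roughly 1, because the latter marks an inactive softmax) can be illustrated with a small standalone check. BlockMask below is a hypothetical helper written for illustration, not part of this header; it reuses the +/-0.1 tolerance from BlockSoftmax::BackpropagateFnc:

#include <cmath>
#include <vector>

// Decide, per softmax block, whether its error derivatives should be kept.
std::vector<bool> BlockMask(const std::vector<float>& deriv,
                            const std::vector<int>& block_dims) {
  std::vector<bool> keep;
  size_t offset = 0;
  for (size_t b = 0; b < block_dims.size(); ++b) {
    float sum = 0.0f;
    for (int j = 0; j < block_dims[b]; ++j) sum += deriv[offset + j];
    keep.push_back(std::fabs(sum) < 0.1f);  // sum ~ 0.0 -> active block, keep
    offset += block_dims[b];                // sum ~ 1.0 -> inactive, discard
  }
  return keep;
}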
diff --git a/src/TNetLib/.svn/text-base/Barrier.cc.svn-base b/src/TNetLib/.svn/text-base/Barrier.cc.svn-base
new file mode 100644
index 0000000..0170e04
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Barrier.cc.svn-base
@@ -0,0 +1,143 @@
+/*
+ * barrier.c
+ *
+ * This file implements the "barrier" synchronization construct.
+ *
+ * A barrier causes threads to wait until a set of threads has
+ * all "reached" the barrier. The number of threads required is
+ * set when the barrier is initialized, and cannot be changed
+ * except by reinitializing.
+ *
+ * The barrier_init() and barrier_destroy() functions,
+ * respectively, allow you to initialize and destroy the
+ * barrier.
+ *
+ * The barrier_wait() function allows a thread to wait for a
+ * barrier to be completed. One thread (the one that happens to
+ * arrive last) will return from barrier_wait() with the status
+ * -1 on success -- others will return with 0. The special
+ * status makes it easy for the calling code to cause one thread
+ * to do something in a serial region before entering another
+ * parallel section of code.
+ */
+#include <pthread.h>
+#include "Error.h"
+#include "Barrier.h"
+
+namespace TNet {
+
+/*
+ * Initialize a barrier for use.
+ */
+Barrier::Barrier(int count)
+ : threshold_(count), counter_(count), cycle_(0) {
+
+ if(0 != pthread_mutex_init(&mutex_, NULL))
+ KALDI_ERR << "Cannot initialize mutex";
+
+ if(0 != pthread_cond_init(&cv_, NULL)) {
+ pthread_mutex_destroy(&mutex_);
+ KALDI_ERR << "Cannot initilize condv";
+ }
+}
+
+/*
+ * Destroy a barrier when done using it.
+ */
+Barrier::~Barrier() {
+
+ if(0 != pthread_mutex_lock(&mutex_))
+ KALDI_ERR << "Cannot lock mutex";
+
+ /*
+ * Check whether any threads are known to be waiting; report
+ * "BUSY" if so.
+ */
+ if(counter_ != threshold_) {
+ pthread_mutex_unlock (&mutex_);
+ KALDI_ERR << "Cannot destroy barrier with waiting thread";
+ }
+
+ if(0 != pthread_mutex_unlock(&mutex_))
+ KALDI_ERR << "Cannot unlock barrier";
+
+ /*
+ * If unable to destroy either 1003.1c synchronization
+ * object, halt
+ */
+ if(0 != pthread_mutex_destroy(&mutex_))
+ KALDI_ERR << "Cannot destroy mutex";
+
+ if(0 != pthread_cond_destroy(&cv_))
+ KALDI_ERR << "Cannot destroy condv";
+}
+
+
+void Barrier::SetThreshold(int thr) {
+ if(counter_ != threshold_)
+ KALDI_ERR << "Cannot set threshold, while a thread is waiting";
+
+ threshold_ = thr; counter_ = thr;
+}
+
+
+
+/*
+ * Wait for all members of a barrier to reach the barrier. When
+ * the count (of remaining members) reaches 0, broadcast to wake
+ * all threads waiting.
+ */
+int Barrier::Wait() {
+ int status, cancel, tmp, cycle;
+
+ if(threshold_ == 0)
+ KALDI_ERR << "Cannot wait when Threshold value was not set";
+
+ if(0 != pthread_mutex_lock(&mutex_))
+ KALDI_ERR << "Cannot lock mutex";
+
+ cycle = cycle_; /* Remember which cycle we're on */
+
+ if(--counter_ == 0) {
+ cycle_ = !cycle_;
+ counter_ = threshold_;
+ status = pthread_cond_broadcast(&cv_);
+ /*
+ * The last thread into the barrier will return status
+ * -1 rather than 0, so that it can be used to perform
+ * some special serial code following the barrier.
+ */
+ if(status == 0) status = -1;
+ } else {
+ /*
+ * Wait with cancellation disabled, because barrier_wait
+ * should not be a cancellation point.
+ */
+ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel);
+
+ /*
+ * Wait until the barrier's cycle changes, which means
+ * that it has been broadcast, and we don't want to wait
+ * anymore.
+ */
+ while (cycle == cycle_) {
+ status = pthread_cond_wait(&cv_, &mutex_);
+ if (status != 0) break;
+ }
+
+ pthread_setcancelstate(cancel, &tmp);
+ }
+ /*
+ * Ignore an error in unlocking. It shouldn't happen, and
+ * reporting it here would be misleading -- the barrier wait
+ * completed, after all, whereas returning, for example,
+ * EINVAL would imply the wait had failed. The next attempt
+ * to use the barrier *will* return an error, or hang, due
+ * to whatever happened to the mutex.
+ */
+ pthread_mutex_unlock (&mutex_);
+ return status; /* error, -1 for waker, or 0 */
+}
+
+
+}//namespace TNet
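A minimal usage sketch of the Barrier class above: N worker threads call Wait(), and exactly one of them (the last to arrive) sees the -1 return and may run a serial section before everyone continues. The Worker function and the thread count are illustrative only, not taken from the TNet sources:

#include <pthread.h>
#include <stdio.h>
#include "Barrier.h"

static TNet::Barrier barrier(4);  // 4 threads must arrive before any proceeds

void* Worker(void* arg) {
  long id = (long)arg;
  /* ... parallel work ... */
  if (barrier.Wait() == -1) {
    // only the last thread to arrive gets -1; do the serial part here
    printf("thread %ld runs the serial section\n", id);
  }
  /* ... next parallel section ... */
  return NULL;
}

int main() {
  pthread_t t[4];
  for (long i = 0; i < 4; i++) pthread_create(&t[i], NULL, Worker, (void*)i);
  for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
  return 0;
}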
diff --git a/src/TNetLib/.svn/text-base/Barrier.h.svn-base b/src/TNetLib/.svn/text-base/Barrier.h.svn-base
new file mode 100644
index 0000000..a5849d2
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Barrier.h.svn-base
@@ -0,0 +1,41 @@
+/*
+ * barrier.h
+ *
+ * This header file describes the "barrier" synchronization
+ * construct. The type barrier_t describes the full state of the
+ * barrier including the POSIX 1003.1c synchronization objects
+ * necessary.
+ *
+ * A barrier causes threads to wait until a set of threads has
+ * all "reached" the barrier. The number of threads required is
+ * set when the barrier is initialized, and cannot be changed
+ * except by reinitializing.
+ */
+#include <pthread.h>
+
+#ifndef barrier_h
+#define barrier_h
+
+namespace TNet {
+
+/*
+ * Structure describing a barrier.
+ */
+class Barrier {
+ public:
+ Barrier(int count=0);
+ ~Barrier();
+ void SetThreshold(int thr);
+ int Wait();
+ private:
+ pthread_mutex_t mutex_; /* Control access to barrier */
+ pthread_cond_t cv_; /* wait for barrier */
+ int threshold_; /* number of threads required */
+ int counter_; /* current number of threads */
+ int cycle_; /* alternate wait cycles (0 or 1) */
+};
+
+}//namespace TNet
+
+#endif
+
diff --git a/src/TNetLib/.svn/text-base/BiasedLinearity.cc.svn-base b/src/TNetLib/.svn/text-base/BiasedLinearity.cc.svn-base
new file mode 100644
index 0000000..b52aeb0
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/BiasedLinearity.cc.svn-base
@@ -0,0 +1,180 @@
+
+
+#include "BiasedLinearity.h"
+
+
+namespace TNet {
+
+
+void
+BiasedLinearity::
+PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ //y = b + x.A
+
+ //precopy bias
+ size_t rows = X.Rows();
+ for(size_t i=0; i<rows; i++) {
+ Y[i].Copy(*mpBias);
+ }
+
+ //multiply matrix by matrix with mLinearity
+ Y.BlasGemm(1.0f, X, NO_TRANS, *mpLinearity, NO_TRANS, 1.0f);
+}
+
+
+void
+BiasedLinearity::
+BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ // e' = e.A^T
+ Y.Zero();
+ Y.BlasGemm(1.0f, X, NO_TRANS, *mpLinearity, TRANS, 0.0f);
+}
+
+
+
+void
+BiasedLinearity::
+ReadFromStream(std::istream& rIn)
+{
+ //matrix is stored transposed as SNet does
+ Matrix<BaseFloat> transpose;
+ rIn >> transpose;
+ mLinearity = Matrix<BaseFloat>(transpose, TRANS);
+ //biases stored normally
+ rIn >> mBias;
+}
+
+
+void
+BiasedLinearity::
+WriteToStream(std::ostream& rOut)
+{
+ //matrix is stored transposed as SNet does
+ Matrix<BaseFloat> transpose(mLinearity, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ rOut << mBias;
+ rOut << std::endl;
+}
+
+
+void
+BiasedLinearity::
+Gradient()
+{
+ //calculate gradient of weight matrix
+ mLinearityCorrection.Zero();
+ mLinearityCorrection.BlasGemm(1.0f, GetInput(), TRANS,
+ GetErrorInput(), NO_TRANS,
+ 0.0f);
+
+ //calculate gradient of bias
+ mBiasCorrection.Set(0.0f);
+ size_t rows = GetInput().Rows();
+ for(size_t i=0; i<rows; i++) {
+ mBiasCorrection.Add(GetErrorInput()[i]);
+ }
+
+ /*
+ //perform update
+ mLinearity.AddScaled(-mLearningRate, mLinearityCorrection);
+ mBias.AddScaled(-mLearningRate, mBiasCorrection);
+ */
+}
+
+
+void
+BiasedLinearity::
+AccuGradient(const UpdatableComponent& src, int thr, int thrN) {
+ //cast the argument
+ const BiasedLinearity& src_comp = dynamic_cast<const BiasedLinearity&>(src);
+
+ //allocate accumulators when needed
+ if(mLinearityCorrectionAccu.MSize() == 0) {
+ mLinearityCorrectionAccu.Init(mLinearity.Rows(),mLinearity.Cols());
+ }
+ if(mBiasCorrectionAccu.MSize() == 0) {
+ mBiasCorrectionAccu.Init(mBias.Dim());
+ }
+
+ //need to find out which rows to sum...
+ int div = mLinearityCorrection.Rows() / thrN;
+ int mod = mLinearityCorrection.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
+
+ //create the matrix windows
+ const SubMatrix<BaseFloat> src_mat (
+ src_comp.mLinearityCorrection,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<double> tgt_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ //sum the rows
+ Add(tgt_mat,src_mat);
+
+ //first thread will always sum the bias correction
+ if(thr == 0) {
+ Add(mBiasCorrectionAccu,src_comp.mBiasCorrection);
+ }
+
+}
+
+
+void
+BiasedLinearity::
+Update(int thr, int thrN)
+{
+ //need to find out which rows to sum...
+ int div = mLinearity.Rows() / thrN;
+ int mod = mLinearity.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
+
+ //std::cout << "[P" << thr << "," << origin << "," << rows << "]" << std::flush;
+
+ //get the matrix windows
+ SubMatrix<double> src_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<BaseFloat> tgt_mat (
+ mLinearity,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+
+
+ //update weights
+ AddScaled(tgt_mat, src_mat, -mLearningRate);
+
+ //perform L2 regularization (weight decay)
+ BaseFloat L2_decay = -mLearningRate * mWeightcost * mBunchsize;
+ if(L2_decay != 0.0) {
+ tgt_mat.AddScaled(L2_decay, tgt_mat);
+ }
+
+ //first thread always update bias
+ if(thr == 0) {
+ //std::cout << "[" << thr << "BP]" << std::flush;
+ AddScaled(mBias, mBiasCorrectionAccu, -mLearningRate);
+ }
+
+ //reset the accumulators
+ src_mat.Zero();
+ if(thr == 0) {
+ mBiasCorrectionAccu.Zero();
+ }
+
+}
+
+} //namespace
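The row partitioning used in AccuGradient() and Update() above splits the weight-matrix rows as evenly as possible across thrN threads, giving the first `mod` threads one extra row each. A small standalone check of that arithmetic (RowRange is an illustrative name, the numbers in main() are just an example):

#include <cassert>

// Same arithmetic as in BiasedLinearity::Update(): thread 'thr' of 'thrN'
// handles rows [origin, origin+rows) of a matrix with 'total' rows.
void RowRange(int total, int thr, int thrN, int* origin, int* rows) {
  int div = total / thrN;
  int mod = total % thrN;
  *origin = thr * div + ((mod > thr) ? thr : mod);
  *rows   = div + ((mod > thr) ? 1 : 0);
}

int main() {
  // e.g. 10 rows over 4 threads -> ranges [0,3), [3,6), [6,8), [8,10)
  int o, r, covered = 0;
  for (int thr = 0; thr < 4; thr++) {
    RowRange(10, thr, 4, &o, &r);
    assert(o == covered);   // ranges are contiguous
    covered += r;
  }
  assert(covered == 10);    // and cover every row exactly once
  return 0;
}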
diff --git a/src/TNetLib/.svn/text-base/BiasedLinearity.h.svn-base b/src/TNetLib/.svn/text-base/BiasedLinearity.h.svn-base
new file mode 100644
index 0000000..5018637
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/BiasedLinearity.h.svn-base
@@ -0,0 +1,92 @@
+#ifndef _BIASED_LINEARITY_H_
+#define _BIASED_LINEARITY_H_
+
+
+#include "Component.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+class BiasedLinearity : public UpdatableComponent
+{
+ public:
+
+ BiasedLinearity(size_t nInputs, size_t nOutputs, Component *pPred);
+ ~BiasedLinearity() { }
+
+ ComponentType GetType() const
+ { return BIASED_LINEARITY; }
+
+ const char* GetName() const
+ { return "<BiasedLinearity>"; }
+
+ Component* Clone() const;
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ /// calculate gradient
+ void Gradient();
+ /// accumulate gradient from other components
+ void AccuGradient(const UpdatableComponent& src, int thr, int thrN);
+ /// update weights, reset the accumulator
+ void Update(int thr, int thrN);
+
+ protected:
+ Matrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ Vector<BaseFloat> mBias; ///< Vector with biases
+
+ const Matrix<BaseFloat>* mpLinearity;
+ const Vector<BaseFloat>* mpBias;
+
+ Matrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ Vector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ Matrix<double> mLinearityCorrectionAccu; ///< Matrix for summing linearity updates
+ Vector<double> mBiasCorrectionAccu; ///< Vector for summing bias updates
+
+};
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+// INLINE FUNCTIONS
+// BiasedLinearity::
+inline
+BiasedLinearity::
+BiasedLinearity(size_t nInputs, size_t nOutputs, Component *pPred)
+ : UpdatableComponent(nInputs, nOutputs, pPred),
+    mLinearity(), mBias(), //cloned instances don't need this
+ mpLinearity(&mLinearity), mpBias(&mBias),
+ mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs),
+ mLinearityCorrectionAccu(), mBiasCorrectionAccu() //cloned instances don't need this
+{ }
+
+inline
+Component*
+BiasedLinearity::
+Clone() const
+{
+ BiasedLinearity* ptr = new BiasedLinearity(GetNInputs(), GetNOutputs(), NULL);
+ ptr->mpLinearity = mpLinearity; //copy pointer from currently active weights
+ ptr->mpBias = mpBias; //...
+
+ ptr->mLearningRate = mLearningRate;
+
+ return ptr;
+}
+
+
+
+} //namespace
+
+
+
+#endif
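For orientation, the per-weight effect of BiasedLinearity::Update() in the .cc file above is an SGD step on the accumulated gradient followed by L2 weight decay scaled by the bunch size. The scalar analogue below is a hypothetical illustration of that rule, not code from the library:

// Per-element view of BiasedLinearity::Update() (illustrative only):
// take a gradient step, then shrink the weight towards zero.
float UpdateWeight(float w, float grad_accu,
                   float learn_rate, float weightcost, int bunchsize) {
  w += -learn_rate * grad_accu;                         // gradient step
  float l2_decay = -learn_rate * weightcost * bunchsize;
  w += l2_decay * w;                                    // L2 weight decay
  return w;
}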
diff --git a/src/TNetLib/.svn/text-base/BlockArray.cc.svn-base b/src/TNetLib/.svn/text-base/BlockArray.cc.svn-base
new file mode 100644
index 0000000..18a41d2
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/BlockArray.cc.svn-base
@@ -0,0 +1,136 @@
+
+
+#include "BlockArray.h"
+#include "Nnet.h"
+
+
+namespace TNet
+{
+
+ void
+ BlockArray::
+ PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ SubMatrix<BaseFloat> colsX(X,0,1,0,1); //dummy dimensions
+ SubMatrix<BaseFloat> colsY(Y,0,1,0,1); //dummy dimensions
+
+ int X_src_ori=0, Y_tgt_ori=0;
+ for(int i=0; i<mNBlocks; i++) {
+ //get the correct submatrices
+ int colsX_cnt=mBlocks[i]->GetNInputs();
+ int colsY_cnt=mBlocks[i]->GetNOutputs();
+ colsX = X.Range(0,X.Rows(),X_src_ori,colsX_cnt);
+ colsY = Y.Range(0,Y.Rows(),Y_tgt_ori,colsY_cnt);
+
+ //propagate through the block(network)
+ mBlocks[i]->Propagate(colsX,colsY);
+
+ //shift the origin coordinates
+ X_src_ori += colsX_cnt;
+ Y_tgt_ori += colsY_cnt;
+ }
+
+ assert(X_src_ori == X.Cols());
+ assert(Y_tgt_ori == Y.Cols());
+ }
+
+
+ void
+ BlockArray::
+ BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ BlockArray::
+ Update()
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ BlockArray::
+ ReadFromStream(std::istream& rIn)
+ {
+ if(mBlocks.size() > 0) {
+ KALDI_ERR << "Cannot read block vector, "
+ << "aleady filled bt "
+ << mBlocks.size()
+ << "elements";
+ }
+
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ //read all the blocks
+ std::string tag;
+ int block_id;
+ for(int i=0; i<mNBlocks; i++) {
+ //read tag <block>
+ rIn >> std::ws >> tag;
+ //make it lowercase
+ std::transform(tag.begin(), tag.end(), tag.begin(), tolower);
+ //check
+ if(tag!="<block>") {
+ KALDI_ERR << "<block> keywotd expected";
+ }
+
+ //read block number
+ rIn >> std::ws >> block_id;
+ if(block_id != i+1) {
+ KALDI_ERR << "Expected block number:" << i+1
+ << " read block number: " << block_id;
+ }
+
+ //read the nnet
+ Network* p_nnet = new Network;
+ p_nnet->ReadNetwork(rIn);
+ if(p_nnet->Layers() == 0) {
+ KALDI_ERR << "Cannot read empty network to a block";
+ }
+
+ //add it to the vector
+ mBlocks.push_back(p_nnet);
+ }
+
+ //check the declared dimensionality
+ int sum_inputs=0, sum_outputs=0;
+ for(int i=0; i<mNBlocks; i++) {
+ sum_inputs += mBlocks[i]->GetNInputs();
+ sum_outputs += mBlocks[i]->GetNOutputs();
+ }
+    if(sum_inputs != GetNInputs()) {
+      KALDI_ERR << "Non-matching number of INPUTS! Declared: "
+                << GetNInputs()
+                << ", summed from blocks: "
+                << sum_inputs;
+    }
+    if(sum_outputs != GetNOutputs()) {
+      KALDI_ERR << "Non-matching number of OUTPUTS! Declared: "
+                << GetNOutputs()
+                << ", summed from blocks: "
+                << sum_outputs;
+    }
+ }
+
+
+ void
+ BlockArray::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << " " << mBlocks.size() << " ";
+ for(int i=0; i<mBlocks.size(); i++) {
+ rOut << "<block> " << i+1 << "\n";
+ mBlocks[i]->WriteNetwork(rOut);
+ rOut << "<endblock>\n";
+ }
+ }
+
+
+} //namespace
+
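BlockArray::PropagateFnc above carves the input columns into consecutive ranges, one per block network, and concatenates the block outputs the same way. A tiny standalone sketch of the offset computation it performs when shifting X_src_ori by each block's GetNInputs() (BlockOffsets is an illustrative name):

#include <cassert>
#include <vector>

// Given per-block input widths, compute the column offset where each
// block reads from; block i reads columns [offsets[i], offsets[i]+width).
std::vector<int> BlockOffsets(const std::vector<int>& block_widths, int total_cols) {
  std::vector<int> offsets;
  int origin = 0;
  for (size_t i = 0; i < block_widths.size(); ++i) {
    offsets.push_back(origin);
    origin += block_widths[i];
  }
  assert(origin == total_cols);  // the blocks must tile the input exactly
  return offsets;
}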
diff --git a/src/TNetLib/.svn/text-base/BlockArray.h.svn-base b/src/TNetLib/.svn/text-base/BlockArray.h.svn-base
new file mode 100644
index 0000000..e6a8657
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/BlockArray.h.svn-base
@@ -0,0 +1,85 @@
+#ifndef _BLOCK_ARRAY_H_
+#define _BLOCK_ARRAY_H_
+
+
+#include "Component.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class Network;
+
+ class BlockArray : public Component
+ {
+ public:
+
+ BlockArray(size_t nInputs, size_t nOutputs, Component *pPred);
+ ~BlockArray();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ //:TODO:
+ Component* Clone() const { KALDI_ERR << "Unimplemented"; }
+
+ protected:
+ std::vector<Network*> mBlocks; ///< vector with networks, one network is one block
+ size_t mNBlocks;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // BlockArray::
+ inline
+ BlockArray::
+ BlockArray(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs, nOutputs, pPred),
+ mNBlocks(0)
+ { }
+
+
+ inline
+ BlockArray::
+ ~BlockArray()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline Component::ComponentType
+ BlockArray::
+ GetType() const
+ {
+ return Component::BLOCK_ARRAY;
+ }
+
+ inline const char*
+ BlockArray::
+ GetName() const
+ {
+ return "<blockarray>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/TNetLib/.svn/text-base/CRBEDctFeat.h.svn-base b/src/TNetLib/.svn/text-base/CRBEDctFeat.h.svn-base
new file mode 100644
index 0000000..0984c36
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/CRBEDctFeat.h.svn-base
@@ -0,0 +1,432 @@
+#ifndef _CUCRBEDCTFEATURES_H_
+#define _CUCRBEDCTFEATURES_H_
+
+
+#include "Component.h"
+#include "Matrix.h"
+#include "Vector.h"
+#include "cblas.h"
+
+
+namespace TNet {
+
+ /**
+ * Expands the time context of the input features
+ * in N, out k*N, FrameOffset o_1,o_2,...,o_k
+ * FrameOffset example 11frames: -5 -4 -3 -2 -1 0 1 2 3 4 5
+ */
+ class Expand : public Component
+ {
+ public:
+ Expand(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~Expand()
+ { }
+
+ ComponentType GetType() const
+ { return EXPAND; }
+
+ const char* GetName() const
+ { return "<expand>"; }
+
+ Component* Clone() const
+ {
+ Expand* p = new Expand(GetNInputs(),GetNOutputs(),NULL);
+ p->mFrameOffset.Init(mFrameOffset.Dim());
+ p->mFrameOffset.Copy(mFrameOffset);
+ return p;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ { rIn >> mFrameOffset; }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << mFrameOffset; }
+
+ protected:
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(X.Cols()*mFrameOffset.Dim() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+
+ for(size_t r=0;r<X.Rows();r++) {
+ for(size_t off=0;off<mFrameOffset.Dim();off++) {
+ int r_off = r + mFrameOffset[off];
+ if(r_off < 0) r_off = 0;
+ if(r_off >= X.Rows()) r_off = X.Rows()-1;
+ memcpy(Y.pRowData(r)+off*X.Cols(),X.pRowData(r_off),sizeof(BaseFloat)*X.Cols());
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Error("__func__ Nonsense"); }
+
+ protected:
+ Vector<int> mFrameOffset;
+ };
+
+
+
+ /**
+ * Rearrange the matrix columns according to the indices in mCopyFromIndices
+ */
+ class Copy : public Component
+ {
+ public:
+ Copy(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~Copy()
+ { }
+
+ ComponentType GetType() const
+ { return COPY; }
+
+ const char* GetName() const
+ { return "<copy>"; }
+
+ Component* Clone() const
+ {
+ Copy* p = new Copy(GetNInputs(),GetNOutputs(),NULL);
+ p->mCopyFromIndices.Init(mCopyFromIndices.Dim());
+ p->mCopyFromIndices.Copy(mCopyFromIndices);
+ return p;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ Vector<int> vec; rIn >> vec; vec.Add(-1);
+ mCopyFromIndices.Init(vec.Dim()).Copy(vec);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ Vector<int> vec(mCopyFromIndices);
+ vec.Add(1); rOut << vec;
+ }
+
+ protected:
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(mCopyFromIndices.Dim() == Y.Cols());
+ for(int i=0; i<mCopyFromIndices.Dim();i++) {
+ assert(mCopyFromIndices[i] >= 0 && mCopyFromIndices[i] < X.Cols());
+ }
+
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<Y.Cols(); c++) {
+ Y(r,c) = X(r,mCopyFromIndices[c]);
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Error("__func__ Nonsense"); }
+
+ protected:
+ Vector<int> mCopyFromIndices;
+ };
+
+ class Transpose : public Component
+ {
+ public:
+ Transpose(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred), mContext(0)
+ { }
+
+ ~Transpose()
+ { }
+
+ ComponentType GetType() const
+ { return TRANSPOSE; }
+
+ const char* GetName() const
+ { return "<transpose>"; }
+
+ Component* Clone() const
+ {
+ Transpose* p = new Transpose(GetNInputs(),GetNOutputs(),NULL);
+ p->mCopyFromIndices.Init(mCopyFromIndices.Dim());
+ p->mCopyFromIndices.Copy(mCopyFromIndices);
+ p->mContext = mContext;
+ return p;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> mContext;
+
+ if(GetNInputs() != GetNOutputs()) {
+ Error("Input dim must be same as output dim");
+ }
+
+ Vector<int> vec(GetNInputs());
+ int channels = GetNInputs() / mContext;
+ for(int i=0, ch=0; ch<channels; ch++) {
+ for(int idx=ch; idx < (int)GetNInputs(); idx+=channels, i++) {
+ assert(i < (int)GetNInputs());
+ vec[i] = idx;
+ }
+ }
+
+ mCopyFromIndices.Init(vec.Dim()).Copy(vec);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << " " << mContext << "\n"; }
+
+ protected:
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(mCopyFromIndices.Dim() == Y.Cols());
+ for(int i=0; i<mCopyFromIndices.Dim();i++) {
+ assert(mCopyFromIndices[i] >= 0 && mCopyFromIndices[i] < X.Cols());
+ }
+
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<Y.Cols(); c++) {
+ Y(r,c) = X(r,mCopyFromIndices[c]);
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Error("__func__ Nonsense"); }
+
+ protected:
+ int mContext;
+ Vector<int> mCopyFromIndices;
+ };
+
+
+ /**
+ * BlockLinearity is used for the blockwise multiplication by
+ * DCT transform loaded from disk
+ */
+ class BlockLinearity : public Component
+ {
+ public:
+ BlockLinearity(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~BlockLinearity()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::BLOCK_LINEARITY; }
+
+ const char* GetName() const
+ { return "<blocklinearity>"; }
+
+ Component* Clone() const
+ {
+ BlockLinearity* p = new BlockLinearity(GetNInputs(),GetNOutputs(),NULL);
+ p->mBlockLinearity.Init(mBlockLinearity.Rows(),mBlockLinearity.Cols());
+ p->mBlockLinearity.Copy(mBlockLinearity);
+ return p;
+ }
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(X.Rows() == Y.Rows());
+ assert(X.Cols()%mBlockLinearity.Rows() == 0);
+ assert(Y.Cols()%mBlockLinearity.Cols() == 0);
+ assert(X.Cols()/mBlockLinearity.Rows() == Y.Cols()/mBlockLinearity.Cols());
+
+ int instN = X.Cols()/mBlockLinearity.Rows();
+ for(int inst=0; inst<instN; inst++) {
+#ifndef DOUBLEPRECISION
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ X.Rows(), mBlockLinearity.Cols(), mBlockLinearity.Rows(),
+ 1.0, X.pData()+inst*mBlockLinearity.Rows(), X.Stride(),
+ mBlockLinearity.pData(), mBlockLinearity.Stride(),
+ 0.0, Y.pData()+inst*mBlockLinearity.Cols(), Y.Stride());
+#else
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ X.Rows(), mBlockLinearity.Cols(), mBlockLinearity.Rows(),
+ 1.0, X.pData()+inst*mBlockLinearity.Rows(), X.Stride(),
+ mBlockLinearity.pData(), mBlockLinearity.Stride(),
+ 0.0, Y.pData()+inst*mBlockLinearity.Cols(), Y.Stride());
+#endif
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Error("__func__ Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ Matrix<BaseFloat> mat;
+ rIn >> mat;
+ Matrix<BaseFloat> trans(mat,TRANS);
+ mBlockLinearity.Init(trans.Rows(),trans.Cols()).Copy(trans);
+
+ if((GetNOutputs() % mBlockLinearity.Cols() != 0) ||
+ (GetNInputs() % mBlockLinearity.Rows() != 0) ||
+ ((GetNOutputs() / mBlockLinearity.Cols()) !=
+ (GetNInputs() / mBlockLinearity.Rows())))
+ {
+ Error("BlockLinearity matrix dimensions must divide IO dims");
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ Matrix<BaseFloat> trans(mBlockLinearity,TRANS);
+ rOut << trans;
+ }
+
+ private:
+ Matrix<BaseFloat> mBlockLinearity;
+ };
+
+
+
+ class Bias : public Component
+ {
+ public:
+ Bias(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~Bias()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::BIAS; }
+
+ const char* GetName() const
+ { return "<bias>"; }
+
+ Component* Clone() const
+ {
+ Bias* p = new Bias(GetNInputs(),GetNOutputs(),NULL);
+ p->mBias.Init(mBias.Dim());
+ p->mBias.Copy(mBias);
+ return p;
+ }
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ Y.Copy(X);
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) += mBias[c];
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Y.Copy(X); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { rIn >> mBias; }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << mBias; }
+
+ private:
+ Vector<BaseFloat> mBias;
+ };
+
+
+
+ class Window : public Component
+ {
+ public:
+ Window(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs, nOutputs, pPred)
+ { }
+
+ ~Window()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::WINDOW; }
+
+ const char* GetName() const
+ { return "<window>"; }
+
+ Component* Clone() const
+ {
+ Window* p = new Window(GetNInputs(),GetNOutputs(),NULL);
+ p->mWindow.Init(mWindow.Dim());
+ p->mWindow.Copy(mWindow);
+ return p;
+ }
+
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Y.Copy(X);
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) *= mWindow[c];
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Error("__func__ Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { rIn >> mWindow; }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << mWindow; }
+
+ private:
+ Vector<BaseFloat> mWindow;
+ };
+
+ class Log : public Component
+ {
+ public:
+ Log(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs, nOutputs, pPred)
+ { }
+
+ ~Log()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::LOG; }
+
+ const char* GetName() const
+ { return "<log>"; }
+
+ Component* Clone() const
+ { return new Log(GetNInputs(),GetNOutputs(),NULL); }
+
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Y.Copy(X); Y.ApplyLog(); }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Error("__func__ Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { }
+
+ void WriteToStream(std::ostream& rOut)
+ { }
+
+ };
+
+}
+
+
+#endif
+
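The Expand component at the top of CRBEDctFeat.h stacks neighbouring frames according to mFrameOffset, clamping at the segment boundaries so the first and last frames are repeated at the edges. A self-contained sketch of that indexing on plain nested vectors (ExpandContext is an illustrative name, not part of the header):

#include <vector>

// For output row r, the k-th slice is copied from input row r + offsets[k],
// clamped to [0, rows-1], mirroring Expand::PropagateFnc.
std::vector<std::vector<float> >
ExpandContext(const std::vector<std::vector<float> >& X,
              const std::vector<int>& offsets) {
  int rows = (int)X.size();
  std::vector<std::vector<float> > Y(rows);
  for (int r = 0; r < rows; r++) {
    for (size_t k = 0; k < offsets.size(); k++) {
      int r_off = r + offsets[k];
      if (r_off < 0) r_off = 0;             // clamp at segment start
      if (r_off >= rows) r_off = rows - 1;  // clamp at segment end
      Y[r].insert(Y[r].end(), X[r_off].begin(), X[r_off].end());
    }
  }
  return Y;
}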
diff --git a/src/TNetLib/.svn/text-base/Cache.cc.svn-base b/src/TNetLib/.svn/text-base/Cache.cc.svn-base
new file mode 100644
index 0000000..f498318
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Cache.cc.svn-base
@@ -0,0 +1,248 @@
+
+#include <sys/time.h>
+
+#include "Cache.h"
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ Cache::
+ Cache()
+ : mState(EMPTY), mIntakePos(0), mExhaustPos(0), mDiscarded(0),
+ mRandomized(false), mTrace(0)
+ { }
+
+ Cache::
+ ~Cache()
+ { }
+
+ void
+ Cache::
+ Init(size_t cachesize, size_t bunchsize, long int seed)
+ {
+ if((cachesize % bunchsize) != 0) {
+ KALDI_ERR << "Non divisible cachesize" << cachesize
+ << " by bunchsize" << bunchsize;
+ }
+
+ mCachesize = cachesize;
+ mBunchsize = bunchsize;
+
+ mState = EMPTY;
+
+ mIntakePos = 0;
+ mExhaustPos = 0;
+
+ mRandomized = false;
+
+ if(seed == 0) {
+ //generate seed
+ struct timeval tv;
+ if (gettimeofday(&tv, 0) == -1) {
+ Error("gettimeofday does not work.");
+ exit(-1);
+ }
+ seed = (int)(tv.tv_sec) + (int)tv.tv_usec + (int)(tv.tv_usec*tv.tv_usec);
+ }
+
+ srand48(seed);
+
+ }
+
+ void
+ Cache::
+ AddData(const Matrix<BaseFloat>& rFeatures, const Matrix<BaseFloat>& rDesired)
+ {
+ assert(rFeatures.Rows() == rDesired.Rows());
+
+ //lazy buffers allocation
+ if(mFeatures.Rows() != mCachesize) {
+ mFeatures.Init(mCachesize,rFeatures.Cols());
+ mDesired.Init(mCachesize,rDesired.Cols());
+ }
+
+ //warn if segment longer than half-cache
+ if(rFeatures.Rows() > mCachesize/2) {
+ std::ostringstream os;
+      os << "Segment is longer than half of the feature cache!"
+ << " cachesize: " << mCachesize
+ << " segmentsize: " << rFeatures.Rows();
+ Warning(os.str());
+ }
+
+ //change state
+ if(mState == EMPTY) {
+ if(mTrace&3) std::cout << "/" << std::flush;
+ mState = INTAKE; mIntakePos = 0;
+
+ //check for leftover from previous segment
+ int leftover = mFeaturesLeftover.Rows();
+ //check if leftover is not bigger than cachesize
+ if(leftover > mCachesize) {
+ std::ostringstream os;
+ os << "Too small feature cache: " << mCachesize
+ << ", truncating: "
+ << leftover - mCachesize << " frames from previous segment leftover";
+ //Error(os.str());
+ Warning(os.str());
+ leftover = mCachesize;
+ }
+ //prefill cache with leftover
+ if(leftover > 0) {
+ memcpy(mFeatures.pData(),mFeaturesLeftover.pData(),
+ (mFeaturesLeftover.MSize() < mFeatures.MSize()?
+ mFeaturesLeftover.MSize() : mFeatures.MSize())
+ );
+ memcpy(mDesired.pData(),mDesiredLeftover.pData(),
+ (mDesiredLeftover.MSize() < mDesired.MSize()?
+ mDesiredLeftover.MSize() : mDesired.MSize())
+ );
+ mFeaturesLeftover.Destroy();
+ mDesiredLeftover.Destroy();
+ mIntakePos += leftover;
+ }
+ }
+
+ assert(mState == INTAKE);
+ assert(rFeatures.Rows() == rDesired.Rows());
+ if(mTrace&2) std::cout << "F" << std::flush;
+
+ int cache_space = mCachesize - mIntakePos;
+ int feature_length = rFeatures.Rows();
+ int fill_rows = (cache_space<feature_length)? cache_space : feature_length;
+ int leftover = feature_length - fill_rows;
+
+ assert(cache_space > 0);
+ assert(mFeatures.Stride()==rFeatures.Stride());
+ assert(mDesired.Stride()==rDesired.Stride());
+
+ //copy the data to cache
+ memcpy(mFeatures.pData()+mIntakePos*mFeatures.Stride(),
+ rFeatures.pData(),
+ fill_rows*mFeatures.Stride()*sizeof(BaseFloat));
+
+ memcpy(mDesired.pData()+mIntakePos*mDesired.Stride(),
+ rDesired.pData(),
+ fill_rows*mDesired.Stride()*sizeof(BaseFloat));
+
+ //copy leftovers
+ if(leftover > 0) {
+ mFeaturesLeftover.Init(leftover,mFeatures.Cols());
+ mDesiredLeftover.Init(leftover,mDesired.Cols());
+
+ memcpy(mFeaturesLeftover.pData(),
+ rFeatures.pData()+fill_rows*rFeatures.Stride(),
+ mFeaturesLeftover.MSize());
+
+ memcpy(mDesiredLeftover.pData(),
+ rDesired.pData()+fill_rows*rDesired.Stride(),
+ mDesiredLeftover.MSize());
+ }
+
+ //update cursor
+ mIntakePos += fill_rows;
+
+ //change state
+ if(mIntakePos == mCachesize) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = FULL;
+ }
+ }
+
+
+
+ void
+ Cache::
+ Randomize()
+ {
+ assert(mState == FULL || mState == INTAKE);
+
+ if(mTrace&3) std::cout << "R" << std::flush;
+
+ //lazy initialization of the output buffers
+ mFeaturesRandom.Init(mCachesize,mFeatures.Cols());
+ mDesiredRandom.Init(mCachesize,mDesired.Cols());
+
+ //generate random series of integers
+ Vector<int> randmask(mIntakePos);
+ for(unsigned int i=0; i<mIntakePos; i++) {
+ randmask[i]=i;
+ }
+ int* ptr = randmask.pData();
+ std::random_shuffle(ptr, ptr+mIntakePos, GenerateRandom);
+
+ //randomize
+ for(int i=0; i<randmask.Dim(); i++) {
+ mFeaturesRandom[i].Copy(mFeatures[randmask[i]]);
+ mDesiredRandom[i].Copy(mDesired[randmask[i]]);
+ }
+
+ mRandomized = true;
+ }
+
+ void
+ Cache::
+ GetBunch(Matrix<BaseFloat>& rFeatures, Matrix<BaseFloat>& rDesired)
+ {
+ if(mState == EMPTY) {
+ Error("GetBunch on empty cache!!!");
+ }
+
+ //change state if full...
+ if(mState == FULL) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ //final cache is not completely filled
+ if(mState == INTAKE) {
+ if(mTrace&3) std::cout << "\\-LAST_CACHE\n" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ assert(mState == EXHAUST);
+
+ //init the output
+ if(rFeatures.Rows()!=mBunchsize || rFeatures.Cols()!=mFeatures.Cols()) {
+ rFeatures.Init(mBunchsize,mFeatures.Cols());
+ }
+ if(rDesired.Rows()!=mBunchsize || rDesired.Cols()!=mDesired.Cols()) {
+ rDesired.Init(mBunchsize,mDesired.Cols());
+ }
+
+ //copy the output
+ if(mRandomized) {
+ memcpy(rFeatures.pData(),
+ mFeaturesRandom.pData()+mExhaustPos*mFeatures.Stride(),
+ rFeatures.MSize());
+
+ memcpy(rDesired.pData(),
+ mDesiredRandom.pData()+mExhaustPos*mDesired.Stride(),
+ rDesired.MSize());
+ } else {
+ memcpy(rFeatures.pData(),
+ mFeatures.pData()+mExhaustPos*mFeatures.Stride(),
+ rFeatures.MSize());
+
+ memcpy(rDesired.pData(),
+ mDesired.pData()+mExhaustPos*mDesired.Stride(),
+ rDesired.MSize());
+ }
+
+
+ //update cursor
+ mExhaustPos += mBunchsize;
+
+ //change state to EMPTY
+ if(mExhaustPos > mIntakePos-mBunchsize) {
+ //we don't have more complete bunches...
+ mDiscarded += mIntakePos - mExhaustPos;
+
+ mState = EMPTY;
+ }
+ }
+
+
+}
diff --git a/src/TNetLib/.svn/text-base/Cache.h.svn-base b/src/TNetLib/.svn/text-base/Cache.h.svn-base
new file mode 100644
index 0000000..800d92c
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Cache.h.svn-base
@@ -0,0 +1,74 @@
+#ifndef _CUCACHE_H_
+#define _CUCACHE_H_
+
+#include "Matrix.h"
+
+namespace TNet {
+
+
+ /**
+ * The feature-target pair cache
+ */
+ class Cache {
+ typedef enum { EMPTY, INTAKE, FULL, EXHAUST } State;
+ public:
+ Cache();
+ ~Cache();
+
+ /// Initialize the cache
+ void Init(size_t cachesize, size_t bunchsize, long int seed = 0);
+
+    /// Add data to the cache; rows that do not fit are kept as leftover for the next call
+ void AddData(const Matrix<BaseFloat>& rFeatures, const Matrix<BaseFloat>& rDesired);
+ /// Randomizes the cache
+ void Randomize();
+ /// Get the bunch of training data
+ void GetBunch(Matrix<BaseFloat>& rFeatures, Matrix<BaseFloat>& rDesired);
+
+
+ /// Returns true if the cache was completely filled
+ bool Full()
+ { return (mState == FULL); }
+
+    /// Returns true if the cache is empty or holds less than one bunch
+ bool Empty()
+ { return (mState == EMPTY || mIntakePos < mBunchsize); }
+
+ /// Number of discarded frames
+ int Discarded()
+ { return mDiscarded; }
+
+ /// Set the trace message level
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ private:
+
+ static long int GenerateRandom(int max)
+ { return lrand48() % max; }
+
+ State mState; ///< Current state of the cache
+
+ size_t mIntakePos; ///< Number of intaken vectors by AddData
+ size_t mExhaustPos; ///< Number of exhausted vectors by GetBunch
+
+ size_t mCachesize; ///< Size of cache
+ size_t mBunchsize; ///< Size of bunch
+ int mDiscarded; ///< Number of discarded frames
+
+ Matrix<BaseFloat> mFeatures; ///< Feature cache
+ Matrix<BaseFloat> mFeaturesRandom; ///< Feature cache
+ Matrix<BaseFloat> mFeaturesLeftover; ///< Feature cache
+
+ Matrix<BaseFloat> mDesired; ///< Desired vector cache
+ Matrix<BaseFloat> mDesiredRandom; ///< Desired vector cache
+ Matrix<BaseFloat> mDesiredLeftover; ///< Desired vector cache
+
+ bool mRandomized;
+
+ int mTrace;
+ };
+
+}
+
+#endif
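
A minimal sketch of how the Cache above is typically driven (it mirrors the loop in Platform.h later in this commit); the segment-reading helpers are hypothetical, not part of TNet:

    #include "Cache.h"
    using namespace TNet;

    //hypothetical data source
    bool have_more_segments();
    void get_next_segment(Matrix<BaseFloat>& fea, Matrix<BaseFloat>& lab);

    void FillAndTrain() {
      Cache cache;
      cache.Init(16384, 256);            //cachesize must be divisible by bunchsize
      Matrix<BaseFloat> fea, lab, fea2, lab2;
      while(have_more_segments()) {
        get_next_segment(fea, lab);      //same number of rows in both matrices
        cache.AddData(fea, lab);         //leftovers are kept for the next call
        if(!cache.Full()) continue;      //keep filling
        cache.Randomize();               //shuffle the cached frames
        while(!cache.Empty()) {
          cache.GetBunch(fea2, lab2);    //one bunch (256 rows) per call
          //...train on the bunch...
        }
      }
    }
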
diff --git a/src/TNetLib/.svn/text-base/Component.h.svn-base b/src/TNetLib/.svn/text-base/Component.h.svn-base
new file mode 100644
index 0000000..762451e
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Component.h.svn-base
@@ -0,0 +1,387 @@
+#ifndef _NETWORK_COMPONENT_I_H
+#define _NETWORK_COMPONENT_I_H
+
+
+#include "Vector.h"
+#include "Matrix.h"
+
+#include <iostream>
+#include <stdexcept>
+
+
+namespace TNet {
+
+
+ /**
+   * Basic element of the network:
+   * a box with defined inputs and outputs,
+   * and functions to refresh the outputs.
+   *
+   * It is able to compute the transformation function (forward pass)
+   * and the Jacobian function (backward pass),
+   * which are to be implemented in descendants.
+ */
+ class Component
+ {
+ public:
+ /// Types of the net components
+ typedef enum {
+ UPDATABLE_COMPONENT = 0x0100,
+ BIASED_LINEARITY,
+ SHARED_LINEARITY,
+
+ ACT_FUN = 0x0200,
+ SOFTMAX,
+ SIGMOID,
+ BLOCK_SOFTMAX,
+
+ OTHER = 0x0400,
+ EXPAND,
+ COPY,
+ TRANSPOSE,
+ BLOCK_LINEARITY,
+ WINDOW,
+ BIAS,
+ LOG,
+
+ BLOCK_ARRAY,
+ } ComponentType;
+
+
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ Component(size_t nInputs, size_t nOutputs, Component *pPred);
+ virtual ~Component();
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+ /// Get Type Identification of the component
+ virtual ComponentType GetType() const = 0;
+ /// Get Type Label of the component
+ virtual const char* GetName() const = 0;
+    /// Returns true if the component has trainable parameters
+ virtual bool IsUpdatable() const
+ { return false; }
+ /// Clone the component
+ virtual Component* Clone() const = 0;
+
+ /// Get size of input vectors
+ size_t GetNInputs() const;
+ /// Get size of output vectors
+ size_t GetNOutputs() const;
+
+ /// IO Data getters
+ const Matrix<BaseFloat>& GetInput() const;
+ const Matrix<BaseFloat>& GetOutput() const;
+ const Matrix<BaseFloat>& GetErrorInput() const;
+ const Matrix<BaseFloat>& GetErrorOutput() const;
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(const Matrix<BaseFloat>& rInput);
+ /// Set error input vector (bind with the following NetworkComponent)
+ void SetErrorInput(const Matrix<BaseFloat>& rErrorInput);
+
+    /// Perform forward-pass propagation Input->Output
+    void Propagate();
+    /// Perform backward-pass propagation ErrorInput->ErrorOutput
+ void Backpropagate();
+
+ /// Reads the component parameters from stream
+ virtual void ReadFromStream(std::istream& rIn) { }
+ /// Writes the components parameters to stream
+ virtual void WriteToStream(std::ostream& rOut) { }
+
+
+ ///////////////////////////////////////////////////////////////
+ // Nonpublic member functions used to update data outputs
+ protected:
+    /// Forward pass transformation (to be implemented by descendants...)
+    virtual void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y) = 0;
+    /// Backward pass transformation (to be implemented by descendants...)
+ virtual void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y) = 0;
+
+
+ ///////////////////////////////////////////////////////////////
+ // data members
+ protected:
+
+ size_t mNInputs; ///< Size of input vectors
+ size_t mNOutputs; ///< Size of output vectors
+
+ const Matrix<BaseFloat>* mpInput; ///< inputs are NOT OWNED by component
+ const Matrix<BaseFloat>* mpErrorInput;///< inputs are NOT OWNED by component
+
+ Matrix<BaseFloat> mOutput; ///< outputs are OWNED by component
+ Matrix<BaseFloat> mErrorOutput; ///< outputs are OWNED by component
+
+ };
+
+
+ /**
+ * Class UpdatableComponent is a box which has some
+ * parameters adjustable by learning
+ *
+ * you can set the learning rate, lock the params,
+ * and learn from each data observation
+ */
+ class UpdatableComponent : public Component
+ {
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ UpdatableComponent(size_t nInputs, size_t nOutputs, Component *pPred);
+ virtual ~UpdatableComponent();
+
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+    /// Updatable components have trainable parameters
+ virtual bool IsUpdatable() const
+ { return true; }
+
+ /// calculate gradient
+ virtual void Gradient() = 0;
+ /// accumulate gradient from other components
+ virtual void AccuGradient(const UpdatableComponent& src, int thr, int thrN) = 0;
+ /// update weights, reset the accumulator
+ virtual void Update(int thr, int thrN) = 0;
+
+ /// Sets the learning rate of gradient descent
+ void LearnRate(BaseFloat rate);
+ /// Gets the learning rate of gradient descent
+ BaseFloat LearnRate() const;
+
+ void Momentum(BaseFloat mmt);
+ BaseFloat Momentum() const ;
+
+ void Weightcost(BaseFloat cost);
+ BaseFloat Weightcost() const;
+
+ void Bunchsize(size_t size);
+ size_t Bunchsize() const;
+
+ protected:
+ BaseFloat mLearningRate;
+ BaseFloat mMomentum;
+ BaseFloat mWeightcost;
+ size_t mBunchsize;
+ };
+
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // Component::
+ inline
+ Component::
+ Component(size_t nInputs, size_t nOutputs, Component *pPred)
+ : mNInputs(nInputs), mNOutputs(nOutputs),
+ mpInput(NULL), mpErrorInput(NULL),
+ mOutput(), mErrorOutput()
+ {
+ /* DOUBLE LINK the Components */
+ if (pPred != NULL) {
+ SetInput(pPred->GetOutput());
+ pPred->SetErrorInput(GetErrorOutput());
+ }
+ }
+
+
+ inline
+ Component::
+ ~Component()
+ {
+ ;
+ }
+
+ inline void
+ Component::
+ Propagate()
+ {
+ //initialize output buffer
+ if(mOutput.Rows() != GetInput().Rows() || mOutput.Cols() != GetNOutputs()) {
+ mOutput.Init(GetInput().Rows(),GetNOutputs());
+ }
+ //do the dimensionality test
+ if(GetNInputs() != GetInput().Cols()) {
+ KALDI_ERR << "Non-matching INPUT dim!!! Network dim: " << GetNInputs()
+ << " Data dim: " << GetInput().Cols();
+ }
+ //run transform
+ PropagateFnc(GetInput(),mOutput);
+
+ }
+
+
+ inline void
+ Component::
+ Backpropagate()
+ {
+ //re-initialize the output buffer
+ if(mErrorOutput.Rows() != GetErrorInput().Rows() || mErrorOutput.Cols() != GetNInputs()) {
+ mErrorOutput.Init(GetErrorInput().Rows(),GetNInputs());
+ }
+
+ //do the dimensionality test
+ assert(GetErrorInput().Cols() == mNOutputs);
+ assert(mErrorOutput.Cols() == mNInputs);
+ assert(mErrorOutput.Rows() == GetErrorInput().Rows());
+
+ //transform
+ BackpropagateFnc(GetErrorInput(),mErrorOutput);
+
+ }
+
+
+ inline void
+ Component::
+ SetInput(const Matrix<BaseFloat>& rInput)
+ {
+ mpInput = &rInput;
+ }
+
+
+ inline void
+ Component::
+ SetErrorInput(const Matrix<BaseFloat>& rErrorInput)
+ {
+ mpErrorInput = &rErrorInput;
+ }
+
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetInput() const
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ return *mpInput;
+ }
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetOutput() const
+ {
+ return mOutput;
+ }
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetErrorInput() const
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ return *mpErrorInput;
+ }
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetErrorOutput() const
+ {
+ return mErrorOutput;
+ }
+
+ inline size_t
+ Component::
+ GetNInputs() const
+ {
+ return mNInputs;
+ }
+
+ inline size_t
+ Component::
+ GetNOutputs() const
+ {
+ return mNOutputs;
+ }
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // UpdatableComponent::
+
+ inline
+ UpdatableComponent::
+ UpdatableComponent(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs, nOutputs, pPred),
+ mLearningRate(0.0), mMomentum(0.0), mWeightcost(0.0), mBunchsize(0)
+ {
+ ;
+ }
+
+
+ inline
+ UpdatableComponent::
+ ~UpdatableComponent()
+ {
+ ;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ LearnRate(BaseFloat rate)
+ {
+ mLearningRate = rate;
+ }
+
+ inline BaseFloat
+ UpdatableComponent::
+ LearnRate() const
+ {
+ return mLearningRate;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ Momentum(BaseFloat mmt)
+ {
+ mMomentum = mmt;
+ }
+
+ inline BaseFloat
+ UpdatableComponent::
+ Momentum() const
+ {
+ return mMomentum;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ Weightcost(BaseFloat cost)
+ {
+ mWeightcost = cost;
+ }
+
+ inline BaseFloat
+ UpdatableComponent::
+ Weightcost() const
+ {
+ return mWeightcost;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ Bunchsize(size_t size)
+ {
+ mBunchsize = size;
+ }
+
+ inline size_t
+ UpdatableComponent::
+ Bunchsize() const
+ {
+ return mBunchsize;
+ }
+
+
+} // namespace TNet
+
+
+#endif
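
The pPred argument of the Component constructor does the wiring: a new component binds its input to the predecessor's output and hands its own error buffer back to it. A small forward-pass sketch using the Log component shown earlier in this diff (its header name is assumed here; the input matrix is assumed to be filled by the caller):

    #include "Matrix.h"
    #include "CRBEDctFeat.h"   //assumed location of the Log component
    using namespace TNet;

    void ForwardChain(const Matrix<BaseFloat>& feats) {  //frames x 4, filled by the caller
      Log log1(4, 4, NULL);    //first component, no predecessor
      Log log2(4, 4, &log1);   //binds log2's input to log1's output
                               //and log1's error-input to log2's error-output
      log1.SetInput(feats);    //attach the data to the head of the chain
      log1.Propagate();        //fills log1's output buffer
      log2.Propagate();        //reads log1's output, fills its own
      const Matrix<BaseFloat>& out = log2.GetOutput();
      (void)out;               //use the result...
    }
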
diff --git a/src/TNetLib/.svn/text-base/Makefile.svn-base b/src/TNetLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..58ff988
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,29 @@
+
+include ../tnet.mk
+
+INCLUDE = -I. -I../KaldiLib -I../STKLib/
+
+all: libTNetLib.a
+
+libTNetLib.a: $(OBJ)
+ $(AR) ruv $@ $(OBJ)
+ $(RANLIB) $@
+
+%.o : %.cc
+ $(CXX) -o $@ -c $< $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+
+.PHONY: clean doc depend
+clean:
+ rm -f *.o *.a
+
+doc:
+ doxygen ../../doc/doxyfile_TNetLib
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) > .depend.mk
+
+-include .depend.mk
+
+
diff --git a/src/TNetLib/.svn/text-base/Mutex.cc.svn-base b/src/TNetLib/.svn/text-base/Mutex.cc.svn-base
new file mode 100644
index 0000000..4ec956a
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Mutex.cc.svn-base
@@ -0,0 +1,48 @@
+
+#include <pthread.h>
+#include <cerrno>
+
+#include "Error.h"
+#include "Mutex.h"
+
+namespace TNet {
+
+
+Mutex::Mutex() {
+ if(0 != pthread_mutex_init(&mutex_,NULL))
+ KALDI_ERR << "Cannot initialize mutex";
+}
+
+
+Mutex::~Mutex() {
+ if(0 != pthread_mutex_destroy(&mutex_))
+ KALDI_ERR << "Cannot destroy mutex";
+}
+
+
+void Mutex::Lock() {
+ if(0 != pthread_mutex_lock(&mutex_))
+ KALDI_ERR << "Error on locking mutex";
+}
+
+
+bool Mutex::TryLock() {
+  int ret = pthread_mutex_trylock(&mutex_);
+  switch (ret) {
+    case 0: return true;
+    case EBUSY: return false;
+    default: KALDI_ERR << "Error on try-locking mutex";
+  }
+  return false; //unreachable, silences compiler warning
+}
+
+
+void Mutex::Unlock() {
+ if(0 != pthread_mutex_unlock(&mutex_))
+ KALDI_ERR << "Error on unlocking mutex";
+}
+
+
+
+}//namespace TNet
+
diff --git a/src/TNetLib/.svn/text-base/Mutex.h.svn-base b/src/TNetLib/.svn/text-base/Mutex.h.svn-base
new file mode 100644
index 0000000..ae2cfff
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Mutex.h.svn-base
@@ -0,0 +1,34 @@
+
+#include <pthread.h>
+
+namespace TNet {
+
+/**
+ * This class encapsulates a pthread mutex to ensure
+ * exclusive access to a critical section
+ * that manipulates shared resources.
+ *
+ * The mutex must be unlocked from the
+ * SAME THREAD that locked it.
+ */
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+
+ void Lock();
+
+ /**
+ * Try to lock the mutex without waiting for it.
+  * Returns: true when the lock succeeded,
+  *          false when the mutex was already locked
+ */
+ bool TryLock();
+
+ void Unlock();
+
+ private:
+ pthread_mutex_t mutex_;
+};
+
+} //namespace TNet
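
A short usage sketch; the shared counter is only a placeholder for whatever resource the critical section protects:

    #include "Mutex.h"
    using namespace TNet;

    Mutex g_mutex;
    int g_counter = 0;        //hypothetical shared resource

    void Increment() {
      g_mutex.Lock();         //blocks until the mutex is acquired
      g_counter++;
      g_mutex.Unlock();       //must run in the same thread that locked
    }

    void IncrementIfFree() {
      if(g_mutex.TryLock()) { //false when somebody else holds the lock
        g_counter++;
        g_mutex.Unlock();
      }
    }
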
diff --git a/src/TNetLib/.svn/text-base/Nnet.cc.svn-base b/src/TNetLib/.svn/text-base/Nnet.cc.svn-base
new file mode 100644
index 0000000..4b364ac
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Nnet.cc.svn-base
@@ -0,0 +1,360 @@
+
+#include <algorithm>
+//#include <locale>
+#include <cctype>
+#include <fstream>   //std::ifstream, std::ofstream
+
+#include "Nnet.h"
+#include "CRBEDctFeat.h"
+#include "BlockArray.h"
+
+namespace TNet {
+
+
+
+
+void Network::Feedforward(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out,
+ size_t start_frm_ext, size_t end_frm_ext) {
+ //empty network: copy input to output
+ if(mNnet.size() == 0) {
+ if(out.Rows() != in.Rows() || out.Cols() != in.Cols()) {
+ out.Init(in.Rows(),in.Cols());
+ }
+ out.Copy(in);
+ return;
+ }
+
+ //short input: propagate in one block
+ if(in.Rows() < 5000) {
+ Propagate(in,out);
+ } else {//long input: propagate per parts
+ //initialize
+ out.Init(in.Rows(),GetNOutputs());
+ Matrix<BaseFloat> tmp_in, tmp_out;
+ int done=0, block=1024;
+ //propagate first part
+ tmp_in.Init(block+end_frm_ext,in.Cols());
+ tmp_in.Copy(in.Range(0,block+end_frm_ext,0,in.Cols()));
+ Propagate(tmp_in,tmp_out);
+ out.Range(0,block,0,tmp_out.Cols()).Copy(
+ tmp_out.Range(0,block,0,tmp_out.Cols())
+ );
+ done += block;
+ //propagate middle parts
+ while((done+2*block) < in.Rows()) {
+ tmp_in.Init(block+start_frm_ext+end_frm_ext,in.Cols());
+      tmp_in.Copy(in.Range(done-start_frm_ext, block+start_frm_ext+end_frm_ext, 0,in.Cols()));
+      Propagate(tmp_in,tmp_out);
+ out.Range(done,block,0,tmp_out.Cols()).Copy(
+ tmp_out.Range(start_frm_ext,block,0,tmp_out.Cols())
+ );
+ done += block;
+ }
+ //propagate last part
+ tmp_in.Init(in.Rows()-done+start_frm_ext,in.Cols());
+ tmp_in.Copy(in.Range(done-start_frm_ext,in.Rows()-done+start_frm_ext,0,in.Cols()));
+ Propagate(tmp_in,tmp_out);
+ out.Range(done,out.Rows()-done,0,out.Cols()).Copy(
+ tmp_out.Range(start_frm_ext,tmp_out.Rows()-start_frm_ext,0,tmp_out.Cols())
+ );
+
+ done += tmp_out.Rows()-start_frm_ext;
+ assert(done == out.Rows());
+ }
+}
+
+
+void Network::Propagate(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out) {
+ //empty network: copy input to output
+ if(mNnet.size() == 0) {
+ if(out.Rows() != in.Rows() || out.Cols() != in.Cols()) {
+ out.Init(in.Rows(),in.Cols());
+ }
+ out.Copy(in);
+ return;
+ }
+
+ //this will keep pointer to matrix 'in', for backprop
+ mNnet.front()->SetInput(in);
+
+ //propagate
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ (*it)->Propagate();
+ }
+
+ //copy the output matrix
+ const Matrix<BaseFloat>& mat = mNnet.back()->GetOutput();
+ if(out.Rows() != mat.Rows() || out.Cols() != mat.Cols()) {
+ out.Init(mat.Rows(),mat.Cols());
+ }
+ out.Copy(mat);
+
+}
+
+
+void Network::Backpropagate(const Matrix<BaseFloat>& globerr) {
+ //pass matrix to last component
+ mNnet.back()->SetErrorInput(globerr);
+
+ // back-propagation : reversed order,
+ LayeredType::reverse_iterator it;
+ for(it=mNnet.rbegin(); it!=mNnet.rend(); ++it) {
+ //first component does not backpropagate error (no predecessors)
+ if(*it != mNnet.front()) {
+ (*it)->Backpropagate();
+ }
+ //compute gradient if updatable component
+ if((*it)->IsUpdatable()) {
+ UpdatableComponent& comp = dynamic_cast<UpdatableComponent&>(**it);
+ comp.Gradient(); //compute gradient
+ }
+ }
+}
+
+
+void Network::AccuGradient(const Network& src, int thr, int thrN) {
+ LayeredType::iterator it;
+ LayeredType::const_iterator it2;
+
+ for(it=mNnet.begin(), it2=src.mNnet.begin(); it!=mNnet.end(); ++it,++it2) {
+ if((*it)->IsUpdatable()) {
+ UpdatableComponent& comp = dynamic_cast<UpdatableComponent&>(**it);
+ const UpdatableComponent& comp2 = dynamic_cast<const UpdatableComponent&>(**it2);
+ comp.AccuGradient(comp2,thr,thrN);
+ }
+ }
+}
+
+
+void Network::Update(int thr, int thrN) {
+ LayeredType::iterator it;
+
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ UpdatableComponent& comp = dynamic_cast<UpdatableComponent&>(**it);
+ comp.Update(thr,thrN);
+ }
+ }
+}
+
+
+Network* Network::Clone() {
+ Network* net = new Network;
+ LayeredType::iterator it;
+ for(it = mNnet.begin(); it != mNnet.end(); ++it) {
+ //clone
+ net->mNnet.push_back((*it)->Clone());
+ //connect network
+ if(net->mNnet.size() > 1) {
+ Component* last = *(net->mNnet.end()-1);
+ Component* prev = *(net->mNnet.end()-2);
+ last->SetInput(prev->GetOutput());
+ prev->SetErrorInput(last->GetErrorOutput());
+ }
+ }
+
+ //copy the learning rate
+ //net->SetLearnRate(GetLearnRate());
+
+ return net;
+}
+
+
+void Network::ReadNetwork(const char* pSrc) {
+ std::ifstream in(pSrc);
+ if(!in.good()) {
+ Error(std::string("Error, cannot read model: ")+pSrc);
+ }
+ ReadNetwork(in);
+ in.close();
+}
+
+
+
+void Network::ReadNetwork(std::istream& rIn) {
+ //get the network elements from a factory
+ Component *pComp;
+ while(NULL != (pComp = ComponentFactory(rIn)))
+ mNnet.push_back(pComp);
+}
+
+
+void Network::WriteNetwork(const char* pDst) {
+ std::ofstream out(pDst);
+ if(!out.good()) {
+ Error(std::string("Error, cannot write model: ")+pDst);
+ }
+ WriteNetwork(out);
+ out.close();
+}
+
+
+void Network::WriteNetwork(std::ostream& rOut) {
+  //dump all the components
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ ComponentDumper(rOut, **it);
+ }
+}
+
+
+Component*
+Network::
+ComponentFactory(std::istream& rIn)
+{
+ rIn >> std::ws;
+ if(rIn.eof()) return NULL;
+
+ Component* pRet=NULL;
+ Component* pPred=NULL;
+
+ std::string componentTag;
+ size_t nInputs, nOutputs;
+
+ rIn >> std::ws;
+ rIn >> componentTag;
+ if(componentTag == "") return NULL; //nothing left in the file
+
+ //make it lowercase
+ std::transform(componentTag.begin(), componentTag.end(),
+ componentTag.begin(), tolower);
+
+ //the 'endblock' tag terminates the network
+ if(componentTag == "<endblock>") return NULL;
+
+
+ if(componentTag[0] != '<' || componentTag[componentTag.size()-1] != '>') {
+ Error(std::string("Invalid component tag:")+componentTag);
+ }
+
+ rIn >> std::ws;
+ rIn >> nOutputs;
+ rIn >> std::ws;
+ rIn >> nInputs;
+ assert(nInputs > 0 && nOutputs > 0);
+
+ //make coupling with predecessor
+ if(mNnet.size() == 0) {
+ pPred = NULL;
+ } else {
+ pPred = mNnet.back();
+ }
+
+ //array with list of component tags
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<sharedlinearity>",
+
+ "<sigmoid>",
+ "<softmax>",
+ "<blocksoftmax>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+ };
+
+ static const int n_tags = sizeof(TAGS) / sizeof(TAGS[0]);
+ int i = 0;
+ for(i=0; i<n_tags; i++) {
+ if(componentTag == TAGS[i]) break;
+ }
+
+ //switch according to position in array TAGS
+ switch(i) {
+ case 0: pRet = new BiasedLinearity(nInputs,nOutputs,pPred); break;
+ case 1: pRet = new SharedLinearity(nInputs,nOutputs,pPred); break;
+
+ case 2: pRet = new Sigmoid(nInputs,nOutputs,pPred); break;
+ case 3: pRet = new Softmax(nInputs,nOutputs,pPred); break;
+ case 4: pRet = new BlockSoftmax(nInputs,nOutputs,pPred); break;
+
+ case 5: pRet = new Expand(nInputs,nOutputs,pPred); break;
+ case 6: pRet = new Copy(nInputs,nOutputs,pPred); break;
+ case 7: pRet = new Transpose(nInputs,nOutputs,pPred); break;
+ case 8: pRet = new BlockLinearity(nInputs,nOutputs,pPred); break;
+ case 9: pRet = new Bias(nInputs,nOutputs,pPred); break;
+ case 10: pRet = new Window(nInputs,nOutputs,pPred); break;
+ case 11: pRet = new Log(nInputs,nOutputs,pPred); break;
+
+ case 12: pRet = new BlockArray(nInputs,nOutputs,pPred); break;
+
+ default: Error(std::string("Unknown Component tag:")+componentTag);
+ }
+
+ //read params if it is updatable component
+ pRet->ReadFromStream(rIn);
+ //return
+ return pRet;
+}
+
+
+void
+Network::
+ComponentDumper(std::ostream& rOut, Component& rComp)
+{
+  //map the component identification codes to their tags
+  //array with the list of component types
+ static const Component::ComponentType TYPES[] = {
+ Component::BIASED_LINEARITY,
+ Component::SHARED_LINEARITY,
+
+ Component::SIGMOID,
+ Component::SOFTMAX,
+ Component::BLOCK_SOFTMAX,
+
+ Component::EXPAND,
+ Component::COPY,
+ Component::TRANSPOSE,
+ Component::BLOCK_LINEARITY,
+ Component::BIAS,
+ Component::WINDOW,
+ Component::LOG,
+
+ Component::BLOCK_ARRAY,
+ };
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<sharedlinearity>",
+
+ "<sigmoid>",
+ "<softmax>",
+ "<blocksoftmax>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+ };
+ static const int MAX = sizeof TYPES / sizeof TYPES[0];
+
+ int i;
+ for(i=0; i<MAX; ++i) {
+ if(TYPES[i] == rComp.GetType()) break;
+ }
+ if(i == MAX) Error("Unknown ComponentType");
+
+ //dump the component tag
+ rOut << TAGS[i] << " "
+ << rComp.GetNOutputs() << " "
+ << rComp.GetNInputs() << std::endl;
+
+ //dump the parameters (if any)
+ rComp.WriteToStream(rOut);
+}
+
+
+
+
+} //namespace
+
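
ComponentDumper writes one header line per component, "<tag> <n-outputs> <n-inputs>", followed by whatever the component's WriteToStream emits, and ComponentFactory parses the same layout back. A saved network therefore starts roughly as below; the tags are the real ones from TAGS[], but the dimensions are made up for illustration and the parameter blocks are elided:

    <biasedlinearity> 500 429
     ...parameters written by BiasedLinearity::WriteToStream...
    <sigmoid> 500 500
    <biasedlinearity> 3000 500
     ...parameters written by BiasedLinearity::WriteToStream...
    <softmax> 3000 3000
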
diff --git a/src/TNetLib/.svn/text-base/Nnet.h.svn-base b/src/TNetLib/.svn/text-base/Nnet.h.svn-base
new file mode 100644
index 0000000..12e2585
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Nnet.h.svn-base
@@ -0,0 +1,194 @@
+#ifndef _NETWORK_H_
+#define _NETWORK_H_
+
+#include "Component.h"
+#include "BiasedLinearity.h"
+#include "SharedLinearity.h"
+#include "Activation.h"
+
+#include "Vector.h"
+
+#include <vector>
+
+
+namespace TNet {
+
+class Network
+{
+//////////////////////////////////////
+// Typedefs
+typedef std::vector<Component*> LayeredType;
+
+ //////////////////////////////////////
+ // Disable copy construction and assignment
+ private:
+ Network(Network&);
+ Network& operator=(Network&);
+
+ public:
+ // allow incomplete network creation
+ Network()
+ { }
+
+ ~Network();
+
+ int Layers() const
+ { return mNnet.size(); }
+
+ Component& Layer(int i)
+ { return *mNnet[i]; }
+
+ const Component& Layer(int i) const
+ { return *mNnet[i]; }
+
+  /// Feedforward the data in blocks; this needs less memory
+  /// and allows processing of very long files.
+  /// It does not trim the *_frm_ext, but uses it
+  /// for concatenation of the segments.
+ void Feedforward(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out,
+ size_t start_frm_ext, size_t end_frm_ext);
+ /// forward the data to the output
+ void Propagate(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out);
+ /// backpropagate the error while calculating the gradient
+ void Backpropagate(const Matrix<BaseFloat>& globerr);
+
+ /// accumulate the gradient from other networks
+ void AccuGradient(const Network& src, int thr, int thrN);
+ /// update weights, reset the accumulator
+ void Update(int thr, int thrN);
+
+ Network* Clone(); ///< Clones the network
+
+ void ReadNetwork(const char* pSrc); ///< read the network from file
+ void ReadNetwork(std::istream& rIn); ///< read the network from stream
+ void WriteNetwork(const char* pDst); ///< write network to file
+ void WriteNetwork(std::ostream& rOut); ///< write network to stream
+
+ size_t GetNInputs() const; ///< Dimensionality of the input features
+ size_t GetNOutputs() const; ///< Dimensionality of the desired vectors
+
+ void SetLearnRate(BaseFloat learnRate); ///< set the learning rate value
+ BaseFloat GetLearnRate(); ///< get the learning rate value
+
+ void SetWeightcost(BaseFloat l2); ///< set the L2 regularization const
+
+  void ResetBunchsize(); ///< reset the frame counter (needed for L2 regularization)
+  void AccuBunchsize(const Network& src); ///< accumulate frame counts in bunch (needed for L2 regularization)
+
+ private:
+ /// Creates a component by reading from stream
+ Component* ComponentFactory(std::istream& In);
+ /// Dumps component into a stream
+ void ComponentDumper(std::ostream& rOut, Component& rComp);
+
+ private:
+ LayeredType mNnet; ///< container with the network layers
+
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+// INLINE FUNCTIONS
+// Network::
+inline Network::~Network() {
+ //delete all the components
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ delete *it;
+ }
+}
+
+
+inline size_t Network::GetNInputs() const {
+ assert(mNnet.size() > 0);
+ return mNnet.front()->GetNInputs();
+}
+
+
+inline size_t
+Network::
+GetNOutputs() const
+{
+ assert(mNnet.size() > 0);
+ return mNnet.back()->GetNOutputs();
+}
+
+
+
+inline void
+Network::
+SetLearnRate(BaseFloat learnRate)
+{
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<UpdatableComponent*>(*it)->LearnRate(learnRate);
+ }
+ }
+}
+
+
+inline BaseFloat
+Network::
+GetLearnRate()
+{
+ //TODO - learn rates may differ layer to layer
+ assert(mNnet.size() > 0);
+ for(size_t i=0; i<mNnet.size(); i++) {
+ if(mNnet[i]->IsUpdatable()) {
+ return dynamic_cast<UpdatableComponent*>(mNnet[i])->LearnRate();
+ }
+ }
+ Error("No updatable NetComponents");
+ return -1;
+}
+
+
+inline void
+Network::
+SetWeightcost(BaseFloat l2)
+{
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<UpdatableComponent*>(*it)->Weightcost(l2);
+ }
+ }
+}
+
+
+inline void
+Network::
+ResetBunchsize()
+{
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<UpdatableComponent*>(*it)->Bunchsize(0);
+ }
+ }
+}
+
+inline void
+Network::
+AccuBunchsize(const Network& src)
+{
+ assert(Layers() == src.Layers());
+ assert(Layers() > 0);
+
+ for(int i=0; i<Layers(); i++) {
+ if(Layer(i).IsUpdatable()) {
+ UpdatableComponent& tgt_comp = dynamic_cast<UpdatableComponent&>(Layer(i));
+ const UpdatableComponent& src_comp = dynamic_cast<const UpdatableComponent&>(src.Layer(i));
+ tgt_comp.Bunchsize(tgt_comp.Bunchsize()+src_comp.GetOutput().Rows());
+ }
+ }
+}
+
+
+
+} //namespace
+
+#endif
+
+
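
A single-threaded training step against the Network interface above, assuming the model file exists and fea/lab hold one bunch of features and targets; the AccuGradient/Update pair mirrors what Platform.h (later in this commit) does across per-thread clones, here applied to the net itself:

    #include "Nnet.h"
    #include "ObjFun.h"
    #include <iostream>
    using namespace TNet;

    int main() {
      Network net;
      net.ReadNetwork("nnet.init");          //hypothetical model file
      net.SetLearnRate(0.008);               //example value

      ObjectiveFunction* obj =
        ObjectiveFunction::Factory(ObjectiveFunction::CROSS_ENTROPY);

      Matrix<BaseFloat> fea, lab, out, err;  //fea/lab: one bunch, filled by the caller
      net.Propagate(fea, out);               //forward pass
      obj->Evaluate(out, lab, &err);         //loss statistics + global error matrix
      net.Backpropagate(err);                //backward pass, per-layer gradients
      net.AccuGradient(net, 0, 1);           //single "thread": accumulate its own gradient
      net.Update(0, 1);                      //apply the update, reset the accumulators

      std::cout << obj->Report();
      delete obj;
      return 0;
    }
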
diff --git a/src/TNetLib/.svn/text-base/ObjFun.cc.svn-base b/src/TNetLib/.svn/text-base/ObjFun.cc.svn-base
new file mode 100644
index 0000000..c899fb1
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/ObjFun.cc.svn-base
@@ -0,0 +1,231 @@
+
+#include "ObjFun.h"
+#include "Error.h"
+
+#include <limits>
+#include <sstream>   //std::stringstream
+#include <iomanip>   //std::setw
+#include <fstream>   //std::ifstream
+#include <vector>    //std::vector
+
+namespace TNet {
+
+
+ObjectiveFunction* ObjectiveFunction::Factory(ObjFunType type) {
+ ObjectiveFunction* ret = NULL;
+ switch(type) {
+ case MEAN_SQUARE_ERROR: ret = new MeanSquareError; break;
+ case CROSS_ENTROPY: ret = new CrossEntropy; break;
+ default: Error("Unknown ObjectiveFunction type");
+ }
+ return ret;
+}
+
+
+/*
+ * MeanSquareError
+ */
+void MeanSquareError::Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err) {
+
+ //check dimensions
+ assert(net_out.Rows() == target.Rows());
+ assert(net_out.Cols() == target.Cols());
+ if(err->Rows() != net_out.Rows() || err->Cols() != net_out.Cols()) {
+ err->Init(net_out.Rows(),net_out.Cols());
+ }
+
+ //compute global gradient
+ err->Copy(net_out);
+ err->AddScaled(-1,target);
+
+ //compute loss function
+ double sum = 0;
+ for(size_t r=0; r<err->Rows(); r++) {
+ for(size_t c=0; c<err->Cols(); c++) {
+ BaseFloat val = (*err)(r,c);
+ sum += val*val;
+ }
+ }
+ error_ += sum/2.0;
+ frames_ += net_out.Rows();
+}
+
+
+std::string MeanSquareError::Report() {
+ std::stringstream ss;
+ ss << "Mse:" << error_ << " frames:" << frames_
+ << " err/frm:" << error_/frames_
+ << "\n";
+ return ss.str();
+}
+
+
+/*
+ * CrossEntropy
+ */
+
+///Find maximum in float array
+inline int FindMaxId(const BaseFloat* ptr, size_t N) {
+ BaseFloat mval = -1e20f;
+ int mid = -1;
+ for(size_t i=0; i<N; i++) {
+ if(ptr[i] > mval) {
+ mid = i; mval = ptr[i];
+ }
+ }
+ return mid;
+}
+
+
+void
+CrossEntropy::Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err)
+{
+ //check dimensions
+ assert(net_out.Rows() == target.Rows());
+ assert(net_out.Cols() == target.Cols());
+ if(err->Rows() != net_out.Rows() || err->Cols() != net_out.Cols()) {
+ err->Init(net_out.Rows(),net_out.Cols());
+ }
+
+  //allocate the confusion buffers
+ if(confusion_mode_ != NO_CONF) {
+ if(confusion_.Rows() != target.Cols() || confusion_.Cols() != target.Cols()) {
+ confusion_.Init(target.Cols(),target.Cols());
+ confusion_count_.Init(target.Cols());
+ diag_confusion_.Init(target.Cols());
+ }
+ }
+
+  //compute global gradient (derivative w.r.t. the softmax input)
+ err->Copy(net_out);
+ err->AddScaled(-1,target);
+
+ //collect max values
+ std::vector<size_t> max_target_id(target.Rows());
+ std::vector<size_t> max_netout_id(target.Rows());
+ //check correct classification
+ int corr = 0;
+ for(size_t r=0; r<net_out.Rows(); r++) {
+ int id_netout = FindMaxId(net_out[r].pData(),net_out.Cols());
+ int id_target = FindMaxId(target[r].pData(),target.Cols());
+ if(id_netout == id_target) corr++;
+    max_target_id[r] = id_target; //store the index of the maximum
+ max_netout_id[r] = id_netout;
+ }
+
+ //compute loss function
+ double sumerr = 0;
+ for(size_t r=0; r<net_out.Rows(); r++) {
+ if(target(r,max_target_id[r]) == 1.0) {
+ //pick the max value..., rest is zero
+ BaseFloat val = log(net_out(r,max_target_id[r]));
+ if(val < -1e10f) val = -1e10f;
+ sumerr += val;
+ } else {
+ //process whole posterior vect.
+ for(size_t c=0; c<net_out.Cols(); c++) {
+ if(target(r,c) != 0.0) {
+ BaseFloat val = target(r,c)*log(net_out(r,c));
+ if(val < -1e10f) val = -1e10f;
+ sumerr += val;
+ }
+ }
+ }
+ }
+
+  //accumulate the confusion matrix
+ if(confusion_mode_ != NO_CONF) {
+ for(size_t r=0; r<net_out.Rows(); r++) {
+ int id_target = max_target_id[r];
+ int id_netout = max_netout_id[r];
+ switch(confusion_mode_) {
+ case MAX_CONF:
+ confusion_(id_target,id_netout) += 1;
+ break;
+ case SOFT_CONF:
+ confusion_[id_target].Add(net_out[r]);
+ break;
+ case DIAG_MAX_CONF:
+ diag_confusion_[id_target] += ((id_target==id_netout)?1:0);
+ break;
+ case DIAG_SOFT_CONF:
+ diag_confusion_[id_target] += net_out[r][id_target];
+ break;
+ default:
+          KALDI_ERR << "Unknown confusion type " << confusion_mode_;
+ }
+ confusion_count_[id_target] += 1;
+ }
+ }
+
+ error_ -= sumerr;
+ frames_ += net_out.Rows();
+ corr_ += corr;
+}
+
+
+std::string CrossEntropy::Report() {
+ std::stringstream ss;
+ ss << "Xent:" << error_ << " frames:" << frames_
+ << " err/frm:" << error_/frames_
+ << " correct[" << 100.0*corr_/frames_ << "%]"
+ << "\n";
+
+ if(confusion_mode_ != NO_CONF) {
+ //read class tags
+ std::vector<std::string> tag;
+ {
+ std::ifstream ifs(output_label_map_);
+ assert(ifs.good());
+ std::string str;
+ while(!ifs.eof()) {
+ ifs >> str;
+ tag.push_back(str);
+ }
+ }
+ assert(confusion_count_.Dim() <= tag.size());
+
+ //print confusion matrix
+ if(confusion_mode_ == MAX_CONF || confusion_mode_ == SOFT_CONF) {
+ ss << "Row:label Col:hyp\n" << confusion_ << "\n";
+ }
+
+ //***print per-target accuracies
+ for(int i=0; i<confusion_count_.Dim(); i++) {
+ //get the numerator
+ BaseFloat numerator = 0.0;
+ switch (confusion_mode_) {
+ case MAX_CONF: case SOFT_CONF:
+ numerator = confusion_[i][i];
+ break;
+ case DIAG_MAX_CONF: case DIAG_SOFT_CONF:
+ numerator = diag_confusion_[i];
+ break;
+ default:
+          KALDI_ERR << "Unsupported confusion mode: " << confusion_mode_;
+ }
+ //add line to report
+ ss << std::setw(30) << tag[i] << " "
+ << std::setw(10) << 100.0*numerator/confusion_count_[i] << "%"
+ << " [" << numerator << "/" << confusion_count_[i] << "]\n";
+ } //***print per-target accuracies
+ }// != NO_CONF
+
+ return ss.str();
+}
+
+
+void CrossEntropy::MergeStats(const ObjectiveFunction& inst) {
+ const CrossEntropy& xent = dynamic_cast<const CrossEntropy&>(inst);
+ frames_ += xent.frames_; error_ += xent.error_; corr_ += xent.corr_;
+  //sum the confusion statistics
+ if(confusion_mode_ != NO_CONF) {
+ if(confusion_.Rows() != xent.confusion_.Rows()) {
+ confusion_.Init(xent.confusion_.Rows(),xent.confusion_.Cols());
+ confusion_count_.Init(xent.confusion_count_.Dim());
+ diag_confusion_.Init(xent.diag_confusion_.Dim());
+ }
+ confusion_.Add(xent.confusion_);
+ confusion_count_.Add(xent.confusion_count_);
+ diag_confusion_.Add(xent.diag_confusion_);
+ }
+}
+
+
+} // namespace TNet
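
For reference, CrossEntropy::Evaluate above implements the usual multi-class cross entropy and its gradient with respect to the softmax input; written out as comments (a restatement of the code, not extra functionality):

    //  E   = - sum_r sum_c  target(r,c) * log(net_out(r,c))   (accumulated into error_)
    //  err =   net_out - target                               (gradient w.r.t. softmax input)
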
diff --git a/src/TNetLib/.svn/text-base/ObjFun.h.svn-base b/src/TNetLib/.svn/text-base/ObjFun.h.svn-base
new file mode 100644
index 0000000..c458340
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/ObjFun.h.svn-base
@@ -0,0 +1,160 @@
+#ifndef _TNET_OBJ_FUN_H
+#define _TNET_OBJ_FUN_H
+
+#include <cassert>
+#include <limits>
+#include <cmath>
+
+#include "Matrix.h"
+#include "Vector.h"
+
+namespace TNet {
+
+ /**
+ * General interface for objective functions
+ */
+ class ObjectiveFunction
+ {
+ public:
+ /// Enum with objective function types
+ typedef enum {
+ OBJ_FUN_I = 0x0300,
+ MEAN_SQUARE_ERROR,
+ CROSS_ENTROPY,
+ } ObjFunType;
+
+ public:
+ /// Factory for creating objective function instances
+ static ObjectiveFunction* Factory(ObjFunType type);
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification
+ protected:
+   ObjectiveFunction() { }          ///< constructor
+  public:
+   virtual ~ObjectiveFunction() { } ///< destructor
+
+ virtual ObjFunType GetType() = 0;
+ virtual const char* GetName() = 0;
+ virtual ObjectiveFunction* Clone() = 0;
+
+ ///calculate error of network output
+ virtual void Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err) = 0;
+
+ ///get the accumulated error
+ virtual double GetError() = 0;
+ ///the number of processed frames
+ virtual size_t GetFrames() = 0;
+
+    ///report the accumulated error as a string
+    virtual std::string Report() = 0;
+
+    ///merge the accumulated statistics from another instance
+ virtual void MergeStats(const ObjectiveFunction& inst) = 0;
+ };
+
+
+
+ /**
+ * Mean square error function
+ */
+ class MeanSquareError : public ObjectiveFunction
+ {
+ public:
+ MeanSquareError()
+ : ObjectiveFunction(), frames_(0), error_(0)
+ { }
+
+ ~MeanSquareError()
+ { }
+
+ ObjFunType GetType()
+ { return MEAN_SQUARE_ERROR; }
+
+ const char* GetName()
+ { return "<MeanSquareError>"; }
+
+ ObjectiveFunction* Clone()
+ { return new MeanSquareError(*this); }
+
+ void Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err);
+
+ size_t GetFrames()
+ { return frames_; }
+
+ double GetError()
+ { return error_; }
+
+ std::string Report();
+
+ void MergeStats(const ObjectiveFunction& inst) {
+ const MeanSquareError& mse = dynamic_cast<const MeanSquareError&>(inst);
+ frames_ += mse.frames_; error_ += mse.error_;
+ }
+
+ private:
+ size_t frames_;
+ double error_;
+
+ };
+
+
+ /**
+ * Cross entropy error function
+ */
+ class CrossEntropy : public ObjectiveFunction
+ {
+ public:
+ enum ConfusionMode { NO_CONF=0, MAX_CONF, SOFT_CONF, DIAG_MAX_CONF, DIAG_SOFT_CONF };
+
+ public:
+ CrossEntropy()
+ : ObjectiveFunction(), frames_(0), error_(0), corr_(0), confusion_mode_(NO_CONF), output_label_map_(NULL)
+ { }
+
+ ~CrossEntropy()
+ { }
+
+ ObjFunType GetType()
+ { return CROSS_ENTROPY; }
+
+ const char* GetName()
+ { return "<cross_entropy>"; }
+
+ ObjectiveFunction* Clone()
+ { return new CrossEntropy(*this); }
+
+ void Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err);
+
+ size_t GetFrames()
+ { return frames_; }
+
+ double GetError()
+ { return error_; }
+
+ void SetConfusionMode(enum ConfusionMode m)
+ { confusion_mode_ = m; }
+
+ void SetOutputLabelMap(const char* map)
+ { output_label_map_ = map; }
+
+ std::string Report();
+
+ void MergeStats(const ObjectiveFunction& inst);
+ private:
+ size_t frames_;
+ double error_;
+ size_t corr_;
+
+ ConfusionMode confusion_mode_;
+ Matrix<float> confusion_;
+ Vector<int> confusion_count_;
+ Vector<double> diag_confusion_;
+ const char* output_label_map_;
+ };
+
+
+} //namespace TNet
+
+
+#endif
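
A sketch of the per-thread pattern this interface is designed for: each worker evaluates into its own clone and the clones are merged into the master at the end, as Platform.h does; net_out and target are assumed to be filled by the caller:

    #include "ObjFun.h"
    #include <iostream>
    using namespace TNet;

    void EvaluateAndReport(const Matrix<BaseFloat>& net_out,
                           const Matrix<BaseFloat>& target) {
      ObjectiveFunction* master = ObjectiveFunction::Factory(ObjectiveFunction::CROSS_ENTROPY);
      ObjectiveFunction* worker = master->Clone();  //in Platform.h: one clone per thread

      Matrix<BaseFloat> err;
      worker->Evaluate(net_out, target, &err);      //accumulates frames/error/correct in the worker

      master->MergeStats(*worker);                  //sum the worker's counters into the master
      std::cout << master->Report();                //"Xent:... frames:... err/frm:... correct[..%]"
      delete worker;
      delete master;
    }
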
diff --git a/src/TNetLib/.svn/text-base/Platform.h.svn-base b/src/TNetLib/.svn/text-base/Platform.h.svn-base
new file mode 100644
index 0000000..66ebacb
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Platform.h.svn-base
@@ -0,0 +1,397 @@
+#ifndef _TNET_PLATFORM_H
+#define _TNET_PLATFORM_H
+
+#include "Thread.h"
+#include "Matrix.h"
+
+#include "Features.h"
+#include "Labels.h"
+
+#include "Cache.h"
+#include "Nnet.h"
+#include "ObjFun.h"
+
+#include "Mutex.h"
+#include "Semaphore.h"
+#include "Barrier.h"
+
+#include <vector>
+#include <list>
+#include <iterator>
+#include <iostream>  //std::cout
+#include <unistd.h>  //sleep()
+
+namespace TNet {
+
+class PlatformThread;
+
+class Platform {
+
+/*
+* Variables to be initialized directly from the main function
+*/
+public:
+ FeatureRepository feature_;
+ LabelRepository label_;
+
+ Network nnet_transf_;
+ Network nnet_;
+ ObjectiveFunction* obj_fun_;
+
+ int bunchsize_;
+ int cachesize_;
+ bool randomize_;
+
+ int start_frm_ext_;
+ int end_frm_ext_;
+
+ int trace_;
+ bool crossval_;
+
+ long int seed_;
+
+ /*
+ * Variables to be used internally during the multi-threaded training
+ */
+ private:
+ Semaphore semaphore_read_;
+
+ std::vector<std::list<Matrix<BaseFloat>*> > feature_buf_;
+ std::vector<std::list<Matrix<BaseFloat>*> > label_buf_;
+ std::vector<Mutex> mutex_buf_;
+
+ std::vector<Network*> nnet_transf2_;
+
+ std::vector<Cache> cache_;
+
+ std::vector<Network*> nnet2_;
+ std::vector<ObjectiveFunction*> obj_fun2_;
+ std::vector<bool> sync_mask_;
+
+ Barrier barrier_;
+ bool end_reading_;
+ std::vector<Timer> tim_;
+ std::vector<double> tim_accu_;
+
+ int num_thr_;
+ Semaphore semaphore_endtrain_;
+ Semaphore semaphore_endtrain2_;
+
+ public:
+ Mutex cout_mutex_;
+
+ /*
+ * Methods
+ */
+ public:
+ Platform()
+ : bunchsize_(0), cachesize_(0), randomize_(false),
+ start_frm_ext_(0), end_frm_ext_(0), trace_(0),
+ crossval_(false), seed_(0),
+ end_reading_(false), num_thr_(0)
+ { }
+
+ ~Platform()
+ {
+ for(size_t i=0; i<nnet_transf2_.size(); i++) {
+ delete nnet_transf2_[i];
+ }
+ for(size_t i=0; i<nnet2_.size(); i++) {
+ delete nnet2_[i];
+ }
+ for(size_t i=0; i<obj_fun2_.size(); i++) {
+ delete obj_fun2_[i];
+ }
+ }
+
+ /// Run the training using num_threads threads
+ void RunTrain(int num_threads);
+
+ private:
+ /// The data-reading thread
+ void ReadData();
+ /// The training thread
+ void Thread(int thr);
+
+ friend class PlatformThread;
+};
+
+
+
+/**
+ * Inherit Thread for the training threads
+ */
+class PlatformThread : public Thread {
+ public:
+ PlatformThread(Platform* pf)
+ : platform_(*pf)
+ { }
+
+ private:
+ void Execute(void* arg) {
+ long long thr_id = reinterpret_cast<long long>(arg);
+ platform_.Thread(thr_id);
+ }
+
+ private:
+ Platform& platform_;
+};
+
+
+
+
+
+void Platform::RunTrain(int num_thr) {
+ num_thr_ = num_thr;
+
+ /*
+ * Initialize parallel training
+ */
+ feature_buf_.resize(num_thr);
+ label_buf_.resize(num_thr);
+ mutex_buf_.resize(num_thr);
+ cache_.resize(num_thr);
+ sync_mask_.resize(num_thr);
+ barrier_.SetThreshold(num_thr);
+
+ tim_.resize(num_thr);
+ tim_accu_.resize(num_thr,0.0);
+
+ int bunchsize = bunchsize_/num_thr;
+ int cachesize = (cachesize_/num_thr/bunchsize)*bunchsize;
+ std::cout << "Bunchsize:" << bunchsize << "*" << num_thr << "=" << bunchsize*num_thr
+ << " Cachesize:" << cachesize << "*" << num_thr << "=" << cachesize*num_thr << "\n";
+ for(int i=0; i<num_thr; i++) {
+ //clone transforms
+ nnet_transf2_.push_back(nnet_transf_.Clone());
+ //create cache
+ cache_[i].Init(cachesize,bunchsize,seed_);
+ cache_[i].Trace(trace_);
+ //clone networks
+ nnet2_.push_back(nnet_.Clone());
+ //clone objective function objects
+ obj_fun2_.push_back(obj_fun_->Clone());
+ //enable threads to sync weights
+ sync_mask_[i] = true;
+ }
+
+ /*
+ * Run training threads
+ */
+ std::vector<PlatformThread*> threads;
+ for(intptr_t i=0; i<num_thr; i++) {
+ PlatformThread* t = new PlatformThread(this);
+ t->Start(reinterpret_cast<void*>(i));
+ threads.push_back(t);
+ }
+
+ /*
+ * Read the training data
+ */
+ ReadData();
+
+ /*
+ * Wait for training to finish
+ */
+ semaphore_endtrain2_.Wait();
+
+}
+
+
+
+void Platform::ReadData() try {
+ cout_mutex_.Lock();
+ std::cout << "queuesize " << feature_.QueueSize() << "\n";
+ cout_mutex_.Unlock();
+
+ int thr = 0;
+ for(feature_.Rewind();!feature_.EndOfList();feature_.MoveNext()) {
+ Matrix<BaseFloat>* fea = new Matrix<BaseFloat>;
+ Matrix<BaseFloat>* lab = new Matrix<BaseFloat>;
+
+ feature_.ReadFullMatrix(*fea);
+ label_.GenDesiredMatrix(*lab,
+ fea->Rows()-start_frm_ext_-end_frm_ext_,
+ feature_.CurrentHeader().mSamplePeriod,
+ feature_.Current().Logical().c_str());
+
+
+ fea->CheckData(feature_.Current().Logical());
+
+ mutex_buf_[thr].Lock();
+ feature_buf_[thr].push_back(fea);
+ label_buf_[thr].push_back(lab);
+ mutex_buf_[thr].Unlock();
+
+    //suspend reading when the shortest buffer holds more than 20 matrices
+    if(thr == 0) {
+      int minsize = 1000000;
+ for(size_t i=0; i<feature_buf_.size(); i++) {
+ int s = feature_buf_[i].size();
+ if(s < minsize) minsize = s;
+ }
+ if(minsize > 20) semaphore_read_.Wait();
+ }
+
+ thr = (thr+1) % num_thr_;
+ }
+
+ std::cout << "[Reading finished]\n" << std::flush;
+ end_reading_ = true;
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ exit(1);
+}
+
+void Platform::Thread(int thr_id) try {
+
+ const int thr = thr_id; //make id const for safety!
+
+ while(1) {
+ //fill the cache
+ while(!cache_[thr].Full() && !(end_reading_ && (feature_buf_[thr].size() == 0))) {
+
+ if(feature_buf_[thr].size() <= 5) {
+ semaphore_read_.Post();//wake the reader
+ }
+ if(feature_buf_[thr].size() == 0) {
+ cout_mutex_.Lock();
+        std::cout << "Thread" << thr << ", waiting for data\n";
+ cout_mutex_.Unlock();
+ sleep(1);
+ } else {
+ //get the matrices
+ mutex_buf_[thr].Lock();
+ Matrix<BaseFloat>* fea = feature_buf_[thr].front();
+ Matrix<BaseFloat>* lab = label_buf_[thr].front();
+ feature_buf_[thr].pop_front();
+ label_buf_[thr].pop_front();
+ mutex_buf_[thr].Unlock();
+
+ //transform the features
+ Matrix<BaseFloat> fea_transf;
+ nnet_transf2_[thr]->Propagate(*fea,fea_transf);
+
+ //trim the ext
+ SubMatrix<BaseFloat> fea_trim(
+ fea_transf,
+ start_frm_ext_,
+ fea_transf.Rows()-start_frm_ext_-end_frm_ext_,
+ 0,
+ fea_transf.Cols()
+ );
+
+ //add to cache
+ cache_[thr].AddData(fea_trim,*lab);
+
+ delete fea; delete lab;
+ }
+ }
+
+ //no more data, end training...
+ if(cache_[thr].Empty()) break;
+
+ if(randomize_) { cache_[thr].Randomize(); }
+
+
+ //std::cout << "Thread" << thr << ", Cache#" << nr_cache++ << "\n";
+
+ //train from cache
+ Matrix<BaseFloat> fea2,lab2,out,err;
+ while(!cache_[thr].Empty()) {
+ cache_[thr].GetBunch(fea2,lab2);
+ nnet2_[thr]->Propagate(fea2,out);
+ obj_fun2_[thr]->Evaluate(out,lab2,&err);
+
+ if(!crossval_) {
+ nnet2_[thr]->Backpropagate(err);
+
+ tim_[thr].Start();
+ barrier_.Wait();//*********/
+ tim_[thr].End(); tim_accu_[thr] += tim_[thr].Val();
+
+ //sum the gradient and bunchsize
+ for(int i=0; i<num_thr_; i++) {
+ if(sync_mask_[i]) {
+ nnet_.AccuGradient(*nnet2_[i],thr,num_thr_);
+ if(thr == 0) nnet_.AccuBunchsize(*nnet2_[i]);
+ }
+ }
+
+ tim_[thr].Start();
+ barrier_.Wait();//*********/
+ tim_[thr].End(); tim_accu_[thr] += tim_[thr].Val();
+
+ //update
+ nnet_.Update(thr,num_thr_);
+
+ tim_[thr].Start();
+ barrier_.Wait();//*********/
+ tim_[thr].End(); tim_accu_[thr] += tim_[thr].Val();
+
+ //reset the bunchsize counter
+ if(thr == 0) nnet_.ResetBunchsize();
+ }
+ }
+
+ }
+
+ std::cout << "Thread" << thr << " end of data\n";
+
+ //deactivate threads' update from summing
+ sync_mask_[thr] = false;
+ //increase number of finished threads
+ semaphore_endtrain_.Post();
+
+ //synchronize the updates of other threads
+ while(1) {
+ barrier_.Wait();//*********/
+ if(semaphore_endtrain_.GetValue() == num_thr_) break;
+
+ //sum the gradient and bunchsize
+ for(int i=0; i<num_thr_; i++) {
+ if(sync_mask_[i]) {
+ nnet_.AccuGradient(*nnet2_[i],thr,num_thr_);
+ if(thr == 0) nnet_.AccuBunchsize(*nnet2_[i]);
+ }
+ }
+ barrier_.Wait();//*********/
+ //update
+ nnet_.Update(thr,num_thr_);
+ barrier_.Wait();//*********/
+ //reset bunchsize counter
+ if(thr == 0) nnet_.ResetBunchsize();
+ }
+
+ //finally merge objfun stats
+ if(thr == 0) {
+ for(int i=0; i<num_thr_; i++) {
+ obj_fun_->MergeStats(*obj_fun2_[i]);
+ }
+
+ cout_mutex_.Lock();
+ std::cout << "Barrier waiting times per thread\n";
+ std::copy(tim_accu_.begin(),tim_accu_.end(),std::ostream_iterator<double>(std::cout," "));
+ std::cout << "\n";
+ cout_mutex_.Unlock();
+ }
+
+ cout_mutex_.Lock();
+ std::cout << "[Thread" << thr << " finished]\n";
+ cout_mutex_.Unlock();
+
+ if(thr == 0) {
+ semaphore_endtrain2_.Post();
+ }
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ exit(1);
+}
+
+
+
+}//namespace TNet
+
+#endif
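
The public fields of Platform are filled directly by the caller, as the comment at the top of the class says. A rough sketch of a training tool's main(), with example option values and hypothetical file names; feature_ and label_ would also have to be initialized (their API is not part of this file):

    #include "Platform.h"
    #include <iostream>
    using namespace TNet;

    int main() {
      Platform platform;
      //feature_ and label_ must also be configured here
      platform.nnet_transf_.ReadNetwork("transf.nnet");  //hypothetical file names
      platform.nnet_.ReadNetwork("nnet.init");
      platform.obj_fun_ = ObjectiveFunction::Factory(ObjectiveFunction::CROSS_ENTROPY);
      platform.bunchsize_ = 256;        //example values only
      platform.cachesize_ = 16384;
      platform.randomize_ = true;
      platform.start_frm_ext_ = platform.end_frm_ext_ = 5;

      platform.RunTrain(4);             //4 training threads, this thread reads the data
      std::cout << platform.obj_fun_->Report();
      return 0;
    }
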
diff --git a/src/TNetLib/.svn/text-base/Semaphore.cc.svn-base b/src/TNetLib/.svn/text-base/Semaphore.cc.svn-base
new file mode 100644
index 0000000..d149fb3
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Semaphore.cc.svn-base
@@ -0,0 +1,64 @@
+
+#include "Semaphore.h"
+
+namespace TNet {
+
+ Semaphore::
+ Semaphore(int initValue)
+ {
+ mSemValue = initValue;
+ pthread_mutex_init(&mMutex, NULL);
+ pthread_cond_init(&mCond, NULL);
+ }
+
+ Semaphore::
+ ~Semaphore()
+ {
+ pthread_mutex_destroy(&mMutex);
+ pthread_cond_destroy(&mCond);
+ }
+
+ int
+ Semaphore::
+ TryWait()
+ {
+ pthread_mutex_lock(&mMutex);
+ if(mSemValue > 0) {
+ mSemValue--;
+ pthread_mutex_unlock(&mMutex);
+ return 0;
+ }
+ pthread_mutex_unlock(&mMutex);
+ return -1;
+ }
+
+ void
+ Semaphore::
+ Wait()
+ {
+ pthread_mutex_lock(&mMutex);
+ while(mSemValue <= 0) {
+ pthread_cond_wait(&mCond, &mMutex);
+ }
+ mSemValue--;
+ pthread_mutex_unlock(&mMutex);
+ }
+
+ void
+ Semaphore::
+ Post()
+ {
+ pthread_mutex_lock(&mMutex);
+ mSemValue++;
+ pthread_cond_signal(&mCond);
+ pthread_mutex_unlock(&mMutex);
+ }
+
+ int
+ Semaphore::
+ GetValue()
+ { return mSemValue; }
+
+
+
+} //namespace
diff --git a/src/TNetLib/.svn/text-base/Semaphore.h.svn-base b/src/TNetLib/.svn/text-base/Semaphore.h.svn-base
new file mode 100644
index 0000000..a28ee44
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Semaphore.h.svn-base
@@ -0,0 +1,26 @@
+#ifndef _SEMAPHORE_H_
+#define _SEMAPHORE_H_
+
+#include <pthread.h>
+
+namespace TNet {
+
+ class Semaphore {
+ public:
+ Semaphore(int initValue = 0);
+ ~Semaphore();
+
+ int TryWait();
+ void Wait();
+ void Post();
+ int GetValue();
+
+ private:
+ int mSemValue;
+ pthread_mutex_t mMutex;
+ pthread_cond_t mCond;
+
+ };
+} //namespace
+
+#endif
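
A small sketch of the reader-throttling pattern used in Platform.h: the consumer posts the semaphore when its buffer runs low, the producer waits on it when the buffers are full. The buffer helpers are hypothetical:

    #include "Semaphore.h"
    using namespace TNet;

    Semaphore sem_read;        //initial value 0

    //hypothetical buffer helpers, not part of TNet
    bool more_data();
    void push_one_item();
    bool buffer_is_full();
    bool buffer_is_low();

    void Producer() {          //e.g. the data-reading thread
      while(more_data()) {
        push_one_item();       //under a Mutex in the real code
        if(buffer_is_full())
          sem_read.Wait();     //sleep until a consumer asks for more
      }
    }

    void Consumer() {
      if(buffer_is_low())
        sem_read.Post();       //wake the producer
      //...consume items...
    }
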
diff --git a/src/TNetLib/.svn/text-base/SharedLinearity.cc.svn-base b/src/TNetLib/.svn/text-base/SharedLinearity.cc.svn-base
new file mode 100644
index 0000000..108212c
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/SharedLinearity.cc.svn-base
@@ -0,0 +1,277 @@
+
+
+#include "SharedLinearity.h"
+#include "cblas.h"
+
+namespace TNet {
+
+void
+SharedLinearity::
+PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ //precopy bias
+ for(int k=0; k<mNInstances; k++) {
+ for(size_t r=0; r<X.Rows(); r++) {
+ memcpy(Y.pRowData(r)+k*mpBias->Dim(),mpBias->pData(),mpBias->Dim()*sizeof(BaseFloat));
+ }
+ }
+
+ //multiply blockwise
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mpLinearity->Rows(),mpLinearity->Rows());
+ SubMatrix<BaseFloat> yblock(Y,0,Y.Rows(),k*mpLinearity->Cols(),mpLinearity->Cols());
+ yblock.BlasGemm(1.0,xblock,NO_TRANS,*mpLinearity,NO_TRANS,1.0);
+ }
+}
+
+
+void
+SharedLinearity::
+BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mpLinearity->Cols(),mpLinearity->Cols());
+ SubMatrix<BaseFloat> yblock(Y,0,Y.Rows(),k*mpLinearity->Rows(),mpLinearity->Rows());
+ yblock.BlasGemm(1.0,xblock,NO_TRANS,*mpLinearity,TRANS,1.0);
+ }
+}
+
+#if 0
+void
+SharedLinearity::
+AccuUpdate()
+{
+ BaseFloat N = 1;
+ /*
+ //Not part of the interface!!!
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ */
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain; //compensate higher gradient estimates due to momentum
+
+ //compensate augmented dyn. range of gradient caused by multiple instances
+ N *= static_cast<BaseFloat>(mNInstances);
+
+ const Matrix<BaseFloat>& X = GetInput().Data();
+ const Matrix<BaseFloat>& E = GetErrorInput().Data();
+ //get gradient of shared linearity
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mLinearity.Rows(),mLinearity.Rows());
+ SubMatrix<BaseFloat> eblock(E,0,E.Rows(),k*mLinearity.Cols(),mLinearity.Cols());
+ mLinearityCorrection.BlasGemm(1.0,xblock,TRANS,eblock,NO_TRANS,((k==0)?mMomentum:1.0f));
+ }
+
+ //get gradient of shared bias
+ mBiasCorrection.Scale(mMomentum);
+ for(int r=0; r<E.Rows(); r++) {
+ for(int c=0; c<E.Cols(); c++) {
+ mBiasCorrection[c%mBiasCorrection.Dim()] += E(r,c);
+ }
+ }
+
+ //perform update
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection);
+
+ //regularization weight decay
+ mLinearity.AddScaled(-mLearningRate*mWeightcost,mLinearity);
+}
+#endif
+
+void
+SharedLinearity::
+ReadFromStream(std::istream& rIn)
+{
+ //number of instances of shared weights in layer
+ rIn >> std::ws >> mNInstances;
+ if(mNInstances < 1) {
+ std::ostringstream os;
+ os << "Bad number of instances:" << mNInstances;
+ Error(os.str());
+ }
+ if(GetNInputs() % mNInstances != 0 || GetNOutputs() % mNInstances != 0) {
+ std::ostringstream os;
+    os << "Number of Inputs/Outputs must be divisible by the number of instances"
+       << " Inputs:" << GetNInputs()
+       << " Outputs:" << GetNOutputs()
+       << " Instances:" << mNInstances;
+ Error(os.str());
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity = BfMatrix(transpose, TRANS);
+ //biases stored normally
+ rIn >> mBias;
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(mBias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+
+
+ if(mLinearity.Cols() != (GetNOutputs() / mNInstances) ||
+ mLinearity.Rows() != (GetNInputs() / mNInstances) ||
+ mBias.Dim() != (GetNOutputs() / mNInstances)
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << " Outputs:" << GetNOutputs()
+ << "\n"
+ << "N-Instances:" << mNInstances
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols() << "(" << mLinearity.Cols()*mNInstances << ")"
+ << " linearityRows:" << mLinearity.Rows() << "(" << mLinearity.Rows()*mNInstances << ")"
+ << " biasDims:" << mBias.Dim() << "(" << mBias.Dim()*mNInstances << ")"
+ << "\n";
+ Error(os.str());
+ }
+
+ mLinearityCorrection.Init(mLinearity.Rows(),mLinearity.Cols());
+ mBiasCorrection.Init(mBias.Dim());
+}
+
+
+void
+SharedLinearity::
+WriteToStream(std::ostream& rOut)
+{
+ rOut << mNInstances << std::endl;
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose(mLinearity, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ rOut << mBias;
+ rOut << std::endl;
+}
+
+
+void
+SharedLinearity::
+Gradient()
+{
+ const Matrix<BaseFloat>& X = GetInput();
+ const Matrix<BaseFloat>& E = GetErrorInput();
+ //get gradient of shared linearity
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mpLinearity->Rows(),mpLinearity->Rows());
+ SubMatrix<BaseFloat> eblock(E,0,E.Rows(),k*mpLinearity->Cols(),mpLinearity->Cols());
+ mLinearityCorrection.BlasGemm(1.0,xblock,TRANS,eblock,NO_TRANS,((k==0)?0.0f:1.0f));
+ }
+
+ //get gradient of shared bias
+ mBiasCorrection.Set(0.0f);
+ for(int r=0; r<E.Rows(); r++) {
+ for(int c=0; c<E.Cols(); c++) {
+ mBiasCorrection[c%mBiasCorrection.Dim()] += E(r,c);
+ }
+ }
+}
+
+
+void
+SharedLinearity::
+AccuGradient(const UpdatableComponent& src, int thr, int thrN)
+{
+ //cast the argument
+ const SharedLinearity& src_comp = dynamic_cast<const SharedLinearity&>(src);
+
+ //allocate accumulators when needed
+ if(mLinearityCorrectionAccu.MSize() == 0) {
+ mLinearityCorrectionAccu.Init(mpLinearity->Rows(),mpLinearity->Cols());
+ }
+ if(mBiasCorrectionAccu.MSize() == 0) {
+ mBiasCorrectionAccu.Init(mpBias->Dim());
+ }
+
+
+ //assert the dimensions
+ /*
+ assert(mLinearityCorrection.Rows() == src_comp.mLinearityCorrection.Rows());
+ assert(mLinearityCorrection.Cols() == src_comp.mLinearityCorrection.Cols());
+ assert(mBiasCorrection.Dim() == src_comp.mBiasCorrection.Dim());
+ */
+
+ //need to find out which rows to sum...
+ int div = mLinearityCorrection.Rows() / thrN;
+ int mod = mLinearityCorrection.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
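+  // e.g. (illustration, assumed numbers) 10 rows split over thrN=4 threads:
+  // div=2, mod=2 => threads 0,1 take 3 rows at origins 0,3
+  //               and threads 2,3 take 2 rows at origins 6,8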
+
+ //std::cout << "[S" << thr << "," << origin << "," << rows << "]" << std::flush;
+
+ //create the matrix windows
+ const SubMatrix<BaseFloat> src_mat (
+ src_comp.mLinearityCorrection,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<double> tgt_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ //sum the rows
+ Add(tgt_mat,src_mat);
+
+  //the first thread always sums the bias correction
+ if(thr == 0) {
+ //std::cout << "[BS" << thr << "]" << std::flush;
+ Add(mBiasCorrectionAccu,src_comp.mBiasCorrection);
+ }
+}
+
+
+void
+SharedLinearity::
+Update(int thr, int thrN)
+{
+ //need to find out which rows to sum...
+ int div = mLinearity.Rows() / thrN;
+ int mod = mLinearity.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
+
+ //std::cout << "[P" << thr << "," << origin << "," << rows << "]" << std::flush;
+
+ //get the matrix windows
+ SubMatrix<double> src_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<BaseFloat> tgt_mat (
+ mLinearity,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+
+ //TODO perform L2 regularization
+ //tgt_mat.AddScaled(tgt_mat, -mWeightcost * num_frames);
+
+ //update weights
+ AddScaled(tgt_mat, src_mat, -mLearningRate/static_cast<BaseFloat>(mNInstances));
+
+  //the first thread always updates the bias
+ if(thr == 0) {
+ //std::cout << "[" << thr << "BP]" << std::flush;
+ AddScaled(mBias, mBiasCorrectionAccu, -mLearningRate/static_cast<BaseFloat>(mNInstances));
+ }
+
+ //reset the accumulators
+ src_mat.Zero();
+ if(thr == 0) {
+ mBiasCorrectionAccu.Zero();
+ }
+}
+
+
+} //namespace
diff --git a/src/TNetLib/.svn/text-base/SharedLinearity.h.svn-base b/src/TNetLib/.svn/text-base/SharedLinearity.h.svn-base
new file mode 100644
index 0000000..83feeee
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/SharedLinearity.h.svn-base
@@ -0,0 +1,103 @@
+#ifndef _CUSHARED_LINEARITY_H_
+#define _CUSHARED_LINEARITY_H_
+
+
+#include "Component.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+class SharedLinearity : public UpdatableComponent
+{
+ public:
+ SharedLinearity(size_t nInputs, size_t nOutputs, Component *pPred);
+ ~SharedLinearity();
+
+ ComponentType GetType() const
+ { return SHARED_LINEARITY; }
+
+ const char* GetName() const
+ { return "<SharedLinearity>"; }
+
+ Component* Clone() const;
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ /// calculate gradient
+ void Gradient();
+ /// accumulate gradient from other components
+ void AccuGradient(const UpdatableComponent& src, int thr, int thrN);
+ /// update weights, reset the accumulator
+ void Update(int thr, int thrN);
+
+protected:
+ Matrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ Vector<BaseFloat> mBias; ///< Vector with biases
+
+ Matrix<BaseFloat>* mpLinearity;
+ Vector<BaseFloat>* mpBias;
+
+ Matrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ Vector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ Matrix<double> mLinearityCorrectionAccu; ///< Accumulator for linearity updates
+ Vector<double> mBiasCorrectionAccu; ///< Accumulator for bias updates
+
+ int mNInstances;
+};
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+// INLINE FUNCTIONS
+// SharedLinearity::
+inline
+SharedLinearity::
+SharedLinearity(size_t nInputs, size_t nOutputs, Component *pPred)
+ : UpdatableComponent(nInputs, nOutputs, pPred),
+ mpLinearity(&mLinearity), mpBias(&mBias),
+ mNInstances(0)
+{ }
+
+
+inline
+SharedLinearity::
+~SharedLinearity()
+{ }
+
+
+inline
+Component*
+SharedLinearity::
+Clone() const
+{
+ SharedLinearity* ptr = new SharedLinearity(GetNInputs(),GetNOutputs(),NULL);
+ ptr->mpLinearity = mpLinearity;
+ ptr->mpBias = mpBias;
+
+ ptr->mLinearityCorrection.Init(mpLinearity->Rows(),mpLinearity->Cols());
+ ptr->mBiasCorrection.Init(mpBias->Dim());
+
+ ptr->mNInstances = mNInstances;
+
+ ptr->mLearningRate = mLearningRate;
+
+
+ return ptr;
+}
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/TNetLib/.svn/text-base/Thread.h.svn-base b/src/TNetLib/.svn/text-base/Thread.h.svn-base
new file mode 100644
index 0000000..ba6d7ba
--- /dev/null
+++ b/src/TNetLib/.svn/text-base/Thread.h.svn-base
@@ -0,0 +1,53 @@
+#ifndef _TNET_THREAD_H
+#define _TNET_THREAD_H
+
+namespace TNet {
+
+class Thread {
+ public:
+ Thread()
+ { }
+ virtual ~Thread()
+ { }
+
+ int Start(void* arg);
+
+ protected:
+ static void* EntryPoint(void*);
+ virtual void Execute(void*) = 0; ///< Override this function
+ void* Arg() const { return arg_; }
+ void Arg(void* a) { arg_ = a; }
+
+ private:
+ pthread_t thread_id_;
+ void * arg_;
+};
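+
+/*
+ * Usage sketch (illustrative only; the Worker class and its payload are
+ * assumed, not part of the original sources):
+ *
+ *   class Worker : public Thread {
+ *     void Execute(void* arg) { ... }  // runs inside the new detached thread
+ *   };
+ *
+ *   Worker w;
+ *   w.Start(&data);                    // EntryPoint() calls w.Execute(&data)
+ */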
+
+int Thread::Start(void * arg) {
+ Arg(arg); // store user data
+
+ int ret=0;
+ //create thread as detached (don't wait for it)
+ pthread_attr_t tattr;
+ ret |= pthread_attr_init(&tattr);
+ ret |= pthread_attr_setdetachstate(&tattr,PTHREAD_CREATE_DETACHED);
+ ret |= pthread_create(&thread_id_, &tattr, &Thread::EntryPoint, this);
+ if(ret != 0) KALDI_ERR << "Failed to create thread";
+ return ret;
+}
+
+/*static */
+void* Thread::EntryPoint(void* pthis) try {
+ Thread* pt = (Thread*)pthis;
+ pt->Execute(pt->Arg());
+ return NULL;
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ exit(1);
+}
+
+
+} //namespace TNet
+
+#endif
diff --git a/src/TNetLib/Activation.cc b/src/TNetLib/Activation.cc
new file mode 100644
index 0000000..8e84190
--- /dev/null
+++ b/src/TNetLib/Activation.cc
@@ -0,0 +1,138 @@
+
+#include "Activation.h"
+
+
+namespace TNet {
+
+void Sigmoid::PropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //Y = 1/(1+e^{-X})
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) = 1.0f/(1.0f+exp(-X(r,c)));
+ }
+ }
+}
+
+
+void Sigmoid::BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y) {
+ const Matrix<BaseFloat>& out = GetOutput();
+  //Y = OUT*(1-OUT)*X   (derived from the sigmoid derivative)
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) = X(r,c)*out(r,c)*(1.0f-out(r,c));
+ }
+ }
+}
+
+
+
+void Softmax::PropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //Y_j = e^X_j / sum_i(e^X_i)
+ //
+  // e^(X_j+c) / sum_i(e^(X_i+c))
+  //  = e^c.e^X_j / (e^c.sum_i(e^X_i))
+ // = e^X_j / sum_i(e^X_i)
+ //
+ size_t rows = X.Rows();
+ for(size_t i=0; i<rows; i++) {
+ BfSubVector y_i(Y[i]); //<< y_i gets pointer to i'th row of matrix Y
+ y_i.Copy(X[i]);
+ BaseFloat max = y_i.Max();
+ y_i.Subtract(max);
+ y_i.ApplyExp();
+ BaseFloat sum = y_i.Sum();
+ y_i.Scale(1.0f/sum);
+ }
+}
+
+
+void Softmax::BackpropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //simply copy the error...,
+ Y.Copy(X);
+}
+
+
+void BlockSoftmax::ReadFromStream(std::istream& rIn) {
+ rIn >> mDim;
+ mDimOffset.Init(mDim.Dim()+1);
+
+ int off=0;
+ for(int i=0; i<mDim.Dim(); i++) {
+ mDimOffset[i]=off;
+ off += mDim[i];
+ }
+ mDimOffset[mDim.Dim()]=off;
+
+ if(off!=GetNOutputs()) {
+ KALDI_ERR << "Non-matching dimension of sum of softmaxes,"
+ << " the sum:" << off
+ << " GetNOutputs:" << GetNOutputs();
+ }
+}
+
+void BlockSoftmax::WriteToStream(std::ostream& rOut) {
+ rOut << mDim;
+}
+
+
+
+
+void BlockSoftmax::PropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //Y_j = e^X_j / sum_i(e^X_i)
+ //
+  // e^(X_j+c) / sum_i(e^(X_i+c))
+  //  = e^c.e^X_j / (e^c.sum_i(e^X_i))
+ // = e^X_j / sum_i(e^X_i)
+ //
+ size_t rows = X.Rows();
+ for(size_t i=0; i<rows; i++) {
+ BfSubVector y_i(Y[i]); //<< y_i gets pointer to i'th row of matrix Y
+ y_i.Copy(X[i]);
+ //BaseFloat max = y_i.Max();
+ //y_i.Subtract(max);
+ //y_i.ApplyExp();
+ //normalize separately on each softmax interval...
+ for(int j=0; j<mDim.Dim(); j++) {
+ BfSubVector y_i_smx_j(y_i.Range(mDimOffset[j],mDim[j]));
+ BaseFloat max = y_i_smx_j.Max();
+ y_i_smx_j.Subtract(max);
+ y_i_smx_j.ApplyExp();
+ BaseFloat sum = y_i_smx_j.Sum();
+ y_i_smx_j.Scale(1.0f/sum);
+ }
+ }
+
+// X.CheckData("BlockSoftmax PropagateFnc X");
+// Y.CheckData("BlockSoftmax PropagateFnc Y");
+}
+
+
+void BlockSoftmax::BackpropagateFnc(const BfMatrix& X, BfMatrix& Y) {
+ //set the output to zero
+ Y.Zero();
+ //copy only parts of the error
+ //from softmax intervals which sum up to 0.0, not 1.0
+ for(int i=0; i<X.Rows(); i++) {
+ for(int j=0; j<mDim.Dim(); j++) {
+ const BfSubVector x_i_smx_j(X[i].Range(mDimOffset[j],mDim[j]));
+ BaseFloat sum = x_i_smx_j.Sum();
+ if(sum > -0.1 && sum < 0.1) {
+ BfSubVector y_i_smx_j(Y[i].Range(mDimOffset[j],mDim[j]));
+ y_i_smx_j.Copy(x_i_smx_j);
+ } else if (sum > 0.9 && sum < 1.1) {
+ ; //do nothing
+ } else {
+ KALDI_ERR << "Invalid sum: " << sum;
+ }
+ }
+ }
+
+// X.CheckData("BlockSoftmax BackpropagateFnc X");
+// Y.CheckData("BlockSoftmax BackpropagateFnc Y");
+
+}
+
+
+
+} //namespace TNet
+
diff --git a/src/TNetLib/Activation.h b/src/TNetLib/Activation.h
new file mode 100644
index 0000000..90263d0
--- /dev/null
+++ b/src/TNetLib/Activation.h
@@ -0,0 +1,104 @@
+
+#ifndef _ACT_FUN_I_
+#define _ACT_FUN_I_
+
+
+#include "Component.h"
+
+
+namespace TNet
+{
+
+ /**
+ * Sigmoid activation function
+ */
+ class Sigmoid : public Component
+ {
+ public:
+ Sigmoid(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ComponentType GetType() const
+ { return SIGMOID; }
+
+ const char* GetName() const
+ { return "<sigmoid>"; }
+
+ Component* Clone() const
+ { return new Sigmoid(GetNInputs(),GetNOutputs(),NULL); }
+
+ protected:
+ void PropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ void BackpropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ };
+
+
+ /**
+ * Softmax activation function
+ */
+ class Softmax : public Component
+ {
+ public:
+ Softmax(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ComponentType GetType() const
+ { return SOFTMAX; }
+
+ const char* GetName() const
+ { return "<softmax>"; }
+
+ Component* Clone() const
+ { return new Softmax(GetNInputs(),GetNOutputs(),NULL); }
+
+ protected:
+ void PropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ void BackpropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ };
+
+
+ /**
+ * BlockSoftmax activation function.
+ * It is several softmaxes in one.
+ * The dimensions of softmaxes are given by integer vector.
+ * During backpropagation:
+ * If the derivatives sum up to 0, they are backpropagated.
+   * If the derivatives sum up to 1, they are discarded
+ * (like this we know that the softmax was 'inactive').
+ */
+ class BlockSoftmax : public Component
+ {
+ public:
+ BlockSoftmax(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ComponentType GetType() const
+ { return BLOCK_SOFTMAX; }
+
+ const char* GetName() const
+ { return "<blocksoftmax>"; }
+
+ Component* Clone() const
+ { return new BlockSoftmax(*this); }
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ protected:
+ void PropagateFnc(const BfMatrix& X, BfMatrix& Y);
+ void BackpropagateFnc(const BfMatrix& X, BfMatrix& Y);
+
+ private:
+ Vector<int> mDim;
+ Vector<int> mDimOffset;
+ };
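+
+  /*
+   * BlockSoftmax, illustrative example (assumed dimensions, not from the
+   * original sources): with mDim = {500, 300} the layer has 800 outputs;
+   * columns 0..499 and 500..799 are normalized by two independent softmaxes.
+   * In backpropagation only the block whose error derivatives sum to ~0
+   * (the active task) is passed through; a block summing to ~1 is treated
+   * as inactive and zeroed.
+   */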
+
+
+
+} //namespace
+
+
+#endif
diff --git a/src/TNetLib/Barrier.cc b/src/TNetLib/Barrier.cc
new file mode 100644
index 0000000..0170e04
--- /dev/null
+++ b/src/TNetLib/Barrier.cc
@@ -0,0 +1,143 @@
+/*
+ * barrier.c
+ *
+ * This file implements the "barrier" synchronization construct.
+ *
+ * A barrier causes threads to wait until a set of threads has
+ * all "reached" the barrier. The number of threads required is
+ * set when the barrier is initialized, and cannot be changed
+ * except by reinitializing.
+ *
+ * The barrier_init() and barrier_destroy() functions,
+ * respectively, allow you to initialize and destroy the
+ * barrier.
+ *
+ * The barrier_wait() function allows a thread to wait for a
+ * barrier to be completed. One thread (the one that happens to
+ * arrive last) will return from barrier_wait() with the status
+ * -1 on success -- others will return with 0. The special
+ * status makes it easy for the calling code to cause one thread
+ * to do something in a serial region before entering another
+ * parallel section of code.
+ */
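+/*
+ * Usage sketch (illustrative only, not from the original sources; the
+ * surrounding worker-thread code is assumed):
+ *
+ *   Barrier barrier(num_threads);
+ *   // in every worker thread, once per minibatch:
+ *   ComputeLocalGradient();
+ *   if(barrier.Wait() == -1) {   // exactly one thread (the last one) gets -1
+ *     ApplyUpdate();             // serial section
+ *   }
+ *   barrier.Wait();              // resynchronize before the next minibatch
+ */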
+#include <pthread.h>
+#include "Error.h"
+#include "Barrier.h"
+
+namespace TNet {
+
+/*
+ * Initialize a barrier for use.
+ */
+Barrier::Barrier(int count)
+ : threshold_(count), counter_(count), cycle_(0) {
+
+ if(0 != pthread_mutex_init(&mutex_, NULL))
+ KALDI_ERR << "Cannot initialize mutex";
+
+ if(0 != pthread_cond_init(&cv_, NULL)) {
+ pthread_mutex_destroy(&mutex_);
+    KALDI_ERR << "Cannot initialize condition variable";
+ }
+}
+
+/*
+ * Destroy a barrier when done using it.
+ */
+Barrier::~Barrier() {
+
+ if(0 != pthread_mutex_lock(&mutex_))
+ KALDI_ERR << "Cannot lock mutex";
+
+ /*
+ * Check whether any threads are known to be waiting; report
+ * "BUSY" if so.
+ */
+ if(counter_ != threshold_) {
+ pthread_mutex_unlock (&mutex_);
+ KALDI_ERR << "Cannot destroy barrier with waiting thread";
+ }
+
+ if(0 != pthread_mutex_unlock(&mutex_))
+    KALDI_ERR << "Cannot unlock mutex";
+
+ /*
+ * If unable to destroy either 1003.1c synchronization
+ * object, halt
+ */
+ if(0 != pthread_mutex_destroy(&mutex_))
+ KALDI_ERR << "Cannot destroy mutex";
+
+ if(0 != pthread_cond_destroy(&cv_))
+    KALDI_ERR << "Cannot destroy condition variable";
+}
+
+
+void Barrier::SetThreshold(int thr) {
+ if(counter_ != threshold_)
+    KALDI_ERR << "Cannot set threshold while a thread is waiting";
+
+ threshold_ = thr; counter_ = thr;
+}
+
+
+
+/*
+ * Wait for all members of a barrier to reach the barrier. When
+ * the count (of remaining members) reaches 0, broadcast to wake
+ * all threads waiting.
+ */
+int Barrier::Wait() {
+ int status, cancel, tmp, cycle;
+
+ if(threshold_ == 0)
+ KALDI_ERR << "Cannot wait when Threshold value was not set";
+
+ if(0 != pthread_mutex_lock(&mutex_))
+ KALDI_ERR << "Cannot lock mutex";
+
+ cycle = cycle_; /* Remember which cycle we're on */
+
+ if(--counter_ == 0) {
+ cycle_ = !cycle_;
+ counter_ = threshold_;
+ status = pthread_cond_broadcast(&cv_);
+ /*
+ * The last thread into the barrier will return status
+ * -1 rather than 0, so that it can be used to perform
+ * some special serial code following the barrier.
+ */
+ if(status == 0) status = -1;
+ } else {
+ /*
+ * Wait with cancellation disabled, because barrier_wait
+ * should not be a cancellation point.
+ */
+ pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cancel);
+
+ /*
+ * Wait until the barrier's cycle changes, which means
+ * that it has been broadcast, and we don't want to wait
+ * anymore.
+ */
+ while (cycle == cycle_) {
+ status = pthread_cond_wait(&cv_, &mutex_);
+ if (status != 0) break;
+ }
+
+ pthread_setcancelstate(cancel, &tmp);
+ }
+ /*
+ * Ignore an error in unlocking. It shouldn't happen, and
+ * reporting it here would be misleading -- the barrier wait
+ * completed, after all, whereas returning, for example,
+ * EINVAL would imply the wait had failed. The next attempt
+ * to use the barrier *will* return an error, or hang, due
+ * to whatever happened to the mutex.
+ */
+ pthread_mutex_unlock (&mutex_);
+ return status; /* error, -1 for waker, or 0 */
+}
+
+
+}//namespace TNet
diff --git a/src/TNetLib/Barrier.h b/src/TNetLib/Barrier.h
new file mode 100644
index 0000000..a5849d2
--- /dev/null
+++ b/src/TNetLib/Barrier.h
@@ -0,0 +1,41 @@
+/*
+ * barrier.h
+ *
+ * This header file describes the "barrier" synchronization
+ * construct. The type barrier_t describes the full state of the
+ * barrier including the POSIX 1003.1c synchronization objects
+ * necessary.
+ *
+ * A barrier causes threads to wait until a set of threads has
+ * all "reached" the barrier. The number of threads required is
+ * set when the barrier is initialized, and cannot be changed
+ * except by reinitializing.
+ */
+#include <pthread.h>
+
+#ifndef barrier_h
+#define barrier_h
+
+namespace TNet {
+
+/*
+ * Structure describing a barrier.
+ */
+class Barrier {
+ public:
+ Barrier(int count=0);
+ ~Barrier();
+ void SetThreshold(int thr);
+ int Wait();
+ private:
+ pthread_mutex_t mutex_; /* Control access to barrier */
+ pthread_cond_t cv_; /* wait for barrier */
+ int threshold_; /* number of threads required */
+ int counter_; /* current number of threads */
+ int cycle_; /* alternate wait cycles (0 or 1) */
+};
+
+}//namespace TNet
+
+#endif
+
diff --git a/src/TNetLib/BiasedLinearity.cc b/src/TNetLib/BiasedLinearity.cc
new file mode 100644
index 0000000..b52aeb0
--- /dev/null
+++ b/src/TNetLib/BiasedLinearity.cc
@@ -0,0 +1,180 @@
+
+
+#include "BiasedLinearity.h"
+
+
+namespace TNet {
+
+
+void
+BiasedLinearity::
+PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ //y = b + x.A
+
+ //precopy bias
+ size_t rows = X.Rows();
+ for(size_t i=0; i<rows; i++) {
+ Y[i].Copy(*mpBias);
+ }
+
+ //multiply matrix by matrix with mLinearity
+ Y.BlasGemm(1.0f, X, NO_TRANS, *mpLinearity, NO_TRANS, 1.0f);
+}
+
+
+void
+BiasedLinearity::
+BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ // e' = e.A^T
+ Y.Zero();
+ Y.BlasGemm(1.0f, X, NO_TRANS, *mpLinearity, TRANS, 0.0f);
+}
+
+
+
+void
+BiasedLinearity::
+ReadFromStream(std::istream& rIn)
+{
+ //matrix is stored transposed as SNet does
+ Matrix<BaseFloat> transpose;
+ rIn >> transpose;
+ mLinearity = Matrix<BaseFloat>(transpose, TRANS);
+ //biases stored normally
+ rIn >> mBias;
+}
+
+
+void
+BiasedLinearity::
+WriteToStream(std::ostream& rOut)
+{
+ //matrix is stored transposed as SNet does
+ Matrix<BaseFloat> transpose(mLinearity, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ rOut << mBias;
+ rOut << std::endl;
+}
+
+
+void
+BiasedLinearity::
+Gradient()
+{
+ //calculate gradient of weight matrix
+ mLinearityCorrection.Zero();
+ mLinearityCorrection.BlasGemm(1.0f, GetInput(), TRANS,
+ GetErrorInput(), NO_TRANS,
+ 0.0f);
+
+ //calculate gradient of bias
+ mBiasCorrection.Set(0.0f);
+ size_t rows = GetInput().Rows();
+ for(size_t i=0; i<rows; i++) {
+ mBiasCorrection.Add(GetErrorInput()[i]);
+ }
+
+ /*
+ //perform update
+ mLinearity.AddScaled(-mLearningRate, mLinearityCorrection);
+ mBias.AddScaled(-mLearningRate, mBiasCorrection);
+ */
+}
+
+
+void
+BiasedLinearity::
+AccuGradient(const UpdatableComponent& src, int thr, int thrN) {
+ //cast the argument
+ const BiasedLinearity& src_comp = dynamic_cast<const BiasedLinearity&>(src);
+
+ //allocate accumulators when needed
+ if(mLinearityCorrectionAccu.MSize() == 0) {
+ mLinearityCorrectionAccu.Init(mLinearity.Rows(),mLinearity.Cols());
+ }
+ if(mBiasCorrectionAccu.MSize() == 0) {
+ mBiasCorrectionAccu.Init(mBias.Dim());
+ }
+
+ //need to find out which rows to sum...
+ int div = mLinearityCorrection.Rows() / thrN;
+ int mod = mLinearityCorrection.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
+
+ //create the matrix windows
+ const SubMatrix<BaseFloat> src_mat (
+ src_comp.mLinearityCorrection,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<double> tgt_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ //sum the rows
+ Add(tgt_mat,src_mat);
+
+ //first thread will always sum the bias correction
+ if(thr == 0) {
+ Add(mBiasCorrectionAccu,src_comp.mBiasCorrection);
+ }
+
+}
+
+
+void
+BiasedLinearity::
+Update(int thr, int thrN)
+{
+ //need to find out which rows to sum...
+ int div = mLinearity.Rows() / thrN;
+ int mod = mLinearity.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
+
+ //std::cout << "[P" << thr << "," << origin << "," << rows << "]" << std::flush;
+
+ //get the matrix windows
+ SubMatrix<double> src_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<BaseFloat> tgt_mat (
+ mLinearity,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+
+
+ //update weights
+ AddScaled(tgt_mat, src_mat, -mLearningRate);
+
+ //perform L2 regularization (weight decay)
+ BaseFloat L2_decay = -mLearningRate * mWeightcost * mBunchsize;
+ if(L2_decay != 0.0) {
+ tgt_mat.AddScaled(L2_decay, tgt_mat);
+ }
+
+  //the first thread always updates the bias
+ if(thr == 0) {
+ //std::cout << "[" << thr << "BP]" << std::flush;
+ AddScaled(mBias, mBiasCorrectionAccu, -mLearningRate);
+ }
+
+ //reset the accumulators
+ src_mat.Zero();
+ if(thr == 0) {
+ mBiasCorrectionAccu.Zero();
+ }
+
+}
+
+} //namespace
diff --git a/src/TNetLib/BiasedLinearity.h b/src/TNetLib/BiasedLinearity.h
new file mode 100644
index 0000000..5018637
--- /dev/null
+++ b/src/TNetLib/BiasedLinearity.h
@@ -0,0 +1,92 @@
+#ifndef _BIASED_LINEARITY_H_
+#define _BIASED_LINEARITY_H_
+
+
+#include "Component.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+class BiasedLinearity : public UpdatableComponent
+{
+ public:
+
+ BiasedLinearity(size_t nInputs, size_t nOutputs, Component *pPred);
+ ~BiasedLinearity() { }
+
+ ComponentType GetType() const
+ { return BIASED_LINEARITY; }
+
+ const char* GetName() const
+ { return "<BiasedLinearity>"; }
+
+ Component* Clone() const;
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ /// calculate gradient
+ void Gradient();
+ /// accumulate gradient from other components
+ void AccuGradient(const UpdatableComponent& src, int thr, int thrN);
+ /// update weights, reset the accumulator
+ void Update(int thr, int thrN);
+
+ protected:
+ Matrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ Vector<BaseFloat> mBias; ///< Vector with biases
+
+ const Matrix<BaseFloat>* mpLinearity;
+ const Vector<BaseFloat>* mpBias;
+
+ Matrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ Vector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ Matrix<double> mLinearityCorrectionAccu; ///< Matrix for summing linearity updates
+ Vector<double> mBiasCorrectionAccu; ///< Vector for summing bias updates
+
+};
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+// INLINE FUNCTIONS
+// BiasedLinearity::
+inline
+BiasedLinearity::
+BiasedLinearity(size_t nInputs, size_t nOutputs, Component *pPred)
+ : UpdatableComponent(nInputs, nOutputs, pPred),
+    mLinearity(), mBias(), //cloned instances don't need this
+ mpLinearity(&mLinearity), mpBias(&mBias),
+ mLinearityCorrection(nInputs,nOutputs), mBiasCorrection(nOutputs),
+ mLinearityCorrectionAccu(), mBiasCorrectionAccu() //cloned instances don't need this
+{ }
+
+inline
+Component*
+BiasedLinearity::
+Clone() const
+{
+ BiasedLinearity* ptr = new BiasedLinearity(GetNInputs(), GetNOutputs(), NULL);
+ ptr->mpLinearity = mpLinearity; //copy pointer from currently active weights
+ ptr->mpBias = mpBias; //...
+
+ ptr->mLearningRate = mLearningRate;
+
+ return ptr;
+}
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/TNetLib/BlockArray.cc b/src/TNetLib/BlockArray.cc
new file mode 100644
index 0000000..18a41d2
--- /dev/null
+++ b/src/TNetLib/BlockArray.cc
@@ -0,0 +1,136 @@
+
+
+#include "BlockArray.h"
+#include "Nnet.h"
+
+
+namespace TNet
+{
+
+ void
+ BlockArray::
+ PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ SubMatrix<BaseFloat> colsX(X,0,1,0,1); //dummy dimensions
+ SubMatrix<BaseFloat> colsY(Y,0,1,0,1); //dummy dimensions
+
+ int X_src_ori=0, Y_tgt_ori=0;
+ for(int i=0; i<mNBlocks; i++) {
+ //get the correct submatrices
+ int colsX_cnt=mBlocks[i]->GetNInputs();
+ int colsY_cnt=mBlocks[i]->GetNOutputs();
+ colsX = X.Range(0,X.Rows(),X_src_ori,colsX_cnt);
+ colsY = Y.Range(0,Y.Rows(),Y_tgt_ori,colsY_cnt);
+
+ //propagate through the block(network)
+ mBlocks[i]->Propagate(colsX,colsY);
+
+ //shift the origin coordinates
+ X_src_ori += colsX_cnt;
+ Y_tgt_ori += colsY_cnt;
+ }
+
+ assert(X_src_ori == X.Cols());
+ assert(Y_tgt_ori == Y.Cols());
+ }
+
+
+ void
+ BlockArray::
+ BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ BlockArray::
+ Update()
+ {
+ KALDI_ERR << "Unimplemented";
+ }
+
+
+ void
+ BlockArray::
+ ReadFromStream(std::istream& rIn)
+ {
+ if(mBlocks.size() > 0) {
+ KALDI_ERR << "Cannot read block vector, "
+                << "already filled by "
+                << mBlocks.size()
+                << " elements";
+ }
+
+ rIn >> std::ws >> mNBlocks;
+ if(mNBlocks < 1) {
+ KALDI_ERR << "Bad number of blocks:" << mNBlocks;
+ }
+
+ //read all the blocks
+ std::string tag;
+ int block_id;
+ for(int i=0; i<mNBlocks; i++) {
+ //read tag <block>
+ rIn >> std::ws >> tag;
+ //make it lowercase
+ std::transform(tag.begin(), tag.end(), tag.begin(), tolower);
+ //check
+ if(tag!="<block>") {
+        KALDI_ERR << "<block> keyword expected";
+ }
+
+ //read block number
+ rIn >> std::ws >> block_id;
+ if(block_id != i+1) {
+ KALDI_ERR << "Expected block number:" << i+1
+ << " read block number: " << block_id;
+ }
+
+ //read the nnet
+ Network* p_nnet = new Network;
+ p_nnet->ReadNetwork(rIn);
+ if(p_nnet->Layers() == 0) {
+ KALDI_ERR << "Cannot read empty network to a block";
+ }
+
+ //add it to the vector
+ mBlocks.push_back(p_nnet);
+ }
+
+ //check the declared dimensionality
+ int sum_inputs=0, sum_outputs=0;
+ for(int i=0; i<mNBlocks; i++) {
+ sum_inputs += mBlocks[i]->GetNInputs();
+ sum_outputs += mBlocks[i]->GetNOutputs();
+ }
+ if(sum_inputs != GetNInputs()) {
+ KALDI_ERR << "Non-matching number of INPUTS! Declared:"
+ << GetNInputs()
+              << " summed from blocks:"
+ << sum_inputs;
+ }
+ if(sum_outputs != GetNOutputs()) {
+ KALDI_ERR << "Non-matching number of OUTPUTS! Declared:"
+ << GetNOutputs()
+              << " summed from blocks:"
+ << sum_outputs;
+ }
+ }
+
+
+ void
+ BlockArray::
+ WriteToStream(std::ostream& rOut)
+ {
+ rOut << " " << mBlocks.size() << " ";
+ for(int i=0; i<mBlocks.size(); i++) {
+ rOut << "<block> " << i+1 << "\n";
+ mBlocks[i]->WriteNetwork(rOut);
+ rOut << "<endblock>\n";
+ }
+ }
+
+
+} //namespace
+
diff --git a/src/TNetLib/BlockArray.h b/src/TNetLib/BlockArray.h
new file mode 100644
index 0000000..e6a8657
--- /dev/null
+++ b/src/TNetLib/BlockArray.h
@@ -0,0 +1,85 @@
+#ifndef _BLOCK_ARRAY_H_
+#define _BLOCK_ARRAY_H_
+
+
+#include "Component.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ class Network;
+
+ class BlockArray : public Component
+ {
+ public:
+
+ BlockArray(size_t nInputs, size_t nOutputs, Component *pPred);
+ ~BlockArray();
+
+ ComponentType GetType() const;
+ const char* GetName() const;
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+
+ void Update();
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ //:TODO:
+ Component* Clone() const { KALDI_ERR << "Unimplemented"; }
+
+ protected:
+ std::vector<Network*> mBlocks; ///< vector with networks, one network is one block
+ size_t mNBlocks;
+ };
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // BlockArray::
+ inline
+ BlockArray::
+ BlockArray(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs, nOutputs, pPred),
+ mNBlocks(0)
+ { }
+
+
+ inline
+ BlockArray::
+ ~BlockArray()
+ {
+ for(int i=0; i<mBlocks.size(); i++) {
+ delete mBlocks[i];
+ }
+ mBlocks.clear();
+ }
+
+ inline Component::ComponentType
+ BlockArray::
+ GetType() const
+ {
+ return Component::BLOCK_ARRAY;
+ }
+
+ inline const char*
+ BlockArray::
+ GetName() const
+ {
+ return "<blockarray>";
+ }
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/TNetLib/CRBEDctFeat.h b/src/TNetLib/CRBEDctFeat.h
new file mode 100644
index 0000000..0984c36
--- /dev/null
+++ b/src/TNetLib/CRBEDctFeat.h
@@ -0,0 +1,432 @@
+#ifndef _CUCRBEDCTFEATURES_H_
+#define _CUCRBEDCTFEATURES_H_
+
+
+#include "Component.h"
+#include "Matrix.h"
+#include "Vector.h"
+#include "cblas.h"
+
+
+namespace TNet {
+
+ /**
+ * Expands the time context of the input features
+ * in N, out k*N, FrameOffset o_1,o_2,...,o_k
+ * FrameOffset example 11frames: -5 -4 -3 -2 -1 0 1 2 3 4 5
+ */
+ class Expand : public Component
+ {
+ public:
+ Expand(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~Expand()
+ { }
+
+ ComponentType GetType() const
+ { return EXPAND; }
+
+ const char* GetName() const
+ { return "<expand>"; }
+
+ Component* Clone() const
+ {
+ Expand* p = new Expand(GetNInputs(),GetNOutputs(),NULL);
+ p->mFrameOffset.Init(mFrameOffset.Dim());
+ p->mFrameOffset.Copy(mFrameOffset);
+ return p;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ { rIn >> mFrameOffset; }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << mFrameOffset; }
+
+ protected:
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(X.Cols()*mFrameOffset.Dim() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+
+ for(size_t r=0;r<X.Rows();r++) {
+ for(size_t off=0;off<mFrameOffset.Dim();off++) {
+ int r_off = r + mFrameOffset[off];
+ if(r_off < 0) r_off = 0;
+ if(r_off >= X.Rows()) r_off = X.Rows()-1;
+ memcpy(Y.pRowData(r)+off*X.Cols(),X.pRowData(r_off),sizeof(BaseFloat)*X.Cols());
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ Vector<int> mFrameOffset;
+ };
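+
+  /*
+   * Expand, illustrative example (assumed sizes, not from the original
+   * sources): with 40-dimensional input frames and FrameOffset -1 0 1 the
+   * output is 120-dimensional; row r of Y is the concatenation
+   * [X(r-1), X(r), X(r+1)], and offsets pointing outside the segment are
+   * clamped to the first/last frame.
+   */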
+
+
+
+ /**
+ * Rearrange the matrix columns according to the indices in mCopyFromIndices
+ */
+ class Copy : public Component
+ {
+ public:
+ Copy(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~Copy()
+ { }
+
+ ComponentType GetType() const
+ { return COPY; }
+
+ const char* GetName() const
+ { return "<copy>"; }
+
+ Component* Clone() const
+ {
+ Copy* p = new Copy(GetNInputs(),GetNOutputs(),NULL);
+ p->mCopyFromIndices.Init(mCopyFromIndices.Dim());
+ p->mCopyFromIndices.Copy(mCopyFromIndices);
+ return p;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ Vector<int> vec; rIn >> vec; vec.Add(-1);
+ mCopyFromIndices.Init(vec.Dim()).Copy(vec);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ Vector<int> vec(mCopyFromIndices);
+ vec.Add(1); rOut << vec;
+ }
+
+ protected:
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(mCopyFromIndices.Dim() == Y.Cols());
+ for(int i=0; i<mCopyFromIndices.Dim();i++) {
+ assert(mCopyFromIndices[i] >= 0 && mCopyFromIndices[i] < X.Cols());
+ }
+
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<Y.Cols(); c++) {
+ Y(r,c) = X(r,mCopyFromIndices[c]);
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ Vector<int> mCopyFromIndices;
+ };
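+
+  /*
+   * Copy, illustrative example (assumed indices): with
+   * mCopyFromIndices = {2, 0, 1} every output row becomes
+   * [X(:,2), X(:,0), X(:,1)]. The indices are stored 1-based in the
+   * network file and shifted to 0-based when read.
+   */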
+
+ class Transpose : public Component
+ {
+ public:
+ Transpose(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred), mContext(0)
+ { }
+
+ ~Transpose()
+ { }
+
+ ComponentType GetType() const
+ { return TRANSPOSE; }
+
+ const char* GetName() const
+ { return "<transpose>"; }
+
+ Component* Clone() const
+ {
+ Transpose* p = new Transpose(GetNInputs(),GetNOutputs(),NULL);
+ p->mCopyFromIndices.Init(mCopyFromIndices.Dim());
+ p->mCopyFromIndices.Copy(mCopyFromIndices);
+ p->mContext = mContext;
+ return p;
+ }
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ rIn >> std::ws >> mContext;
+
+ if(GetNInputs() != GetNOutputs()) {
+        Error("Input dim must be the same as output dim");
+ }
+
+ Vector<int> vec(GetNInputs());
+ int channels = GetNInputs() / mContext;
+ for(int i=0, ch=0; ch<channels; ch++) {
+ for(int idx=ch; idx < (int)GetNInputs(); idx+=channels, i++) {
+ assert(i < (int)GetNInputs());
+ vec[i] = idx;
+ }
+ }
+
+ mCopyFromIndices.Init(vec.Dim()).Copy(vec);
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << " " << mContext << "\n"; }
+
+ protected:
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(mCopyFromIndices.Dim() == Y.Cols());
+ for(int i=0; i<mCopyFromIndices.Dim();i++) {
+ assert(mCopyFromIndices[i] >= 0 && mCopyFromIndices[i] < X.Cols());
+ }
+
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<Y.Cols(); c++) {
+ Y(r,c) = X(r,mCopyFromIndices[c]);
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Nonsense"); }
+
+ protected:
+ int mContext;
+ Vector<int> mCopyFromIndices;
+ };
+
+
+ /**
+ * BlockLinearity is used for the blockwise multiplication by
+ * DCT transform loaded from disk
+ */
+ class BlockLinearity : public Component
+ {
+ public:
+ BlockLinearity(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~BlockLinearity()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::BLOCK_LINEARITY; }
+
+ const char* GetName() const
+ { return "<blocklinearity>"; }
+
+ Component* Clone() const
+ {
+ BlockLinearity* p = new BlockLinearity(GetNInputs(),GetNOutputs(),NULL);
+ p->mBlockLinearity.Init(mBlockLinearity.Rows(),mBlockLinearity.Cols());
+ p->mBlockLinearity.Copy(mBlockLinearity);
+ return p;
+ }
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ assert(X.Rows() == Y.Rows());
+ assert(X.Cols()%mBlockLinearity.Rows() == 0);
+ assert(Y.Cols()%mBlockLinearity.Cols() == 0);
+ assert(X.Cols()/mBlockLinearity.Rows() == Y.Cols()/mBlockLinearity.Cols());
+
+ int instN = X.Cols()/mBlockLinearity.Rows();
+ for(int inst=0; inst<instN; inst++) {
+#ifndef DOUBLEPRECISION
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ X.Rows(), mBlockLinearity.Cols(), mBlockLinearity.Rows(),
+ 1.0, X.pData()+inst*mBlockLinearity.Rows(), X.Stride(),
+ mBlockLinearity.pData(), mBlockLinearity.Stride(),
+ 0.0, Y.pData()+inst*mBlockLinearity.Cols(), Y.Stride());
+#else
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+ X.Rows(), mBlockLinearity.Cols(), mBlockLinearity.Rows(),
+ 1.0, X.pData()+inst*mBlockLinearity.Rows(), X.Stride(),
+ mBlockLinearity.pData(), mBlockLinearity.Stride(),
+ 0.0, Y.pData()+inst*mBlockLinearity.Cols(), Y.Stride());
+#endif
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ {
+ Matrix<BaseFloat> mat;
+ rIn >> mat;
+ Matrix<BaseFloat> trans(mat,TRANS);
+ mBlockLinearity.Init(trans.Rows(),trans.Cols()).Copy(trans);
+
+ if((GetNOutputs() % mBlockLinearity.Cols() != 0) ||
+ (GetNInputs() % mBlockLinearity.Rows() != 0) ||
+ ((GetNOutputs() / mBlockLinearity.Cols()) !=
+ (GetNInputs() / mBlockLinearity.Rows())))
+ {
+ Error("BlockLinearity matrix dimensions must divide IO dims");
+ }
+ }
+
+ void WriteToStream(std::ostream& rOut)
+ {
+ Matrix<BaseFloat> trans(mBlockLinearity,TRANS);
+ rOut << trans;
+ }
+
+ private:
+ Matrix<BaseFloat> mBlockLinearity;
+ };
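+
+  /*
+   * BlockLinearity, illustrative example (assumed sizes): with a 23x16
+   * transform matrix, a 92-column input (4 blocks of 23) is multiplied
+   * block-by-block by the same matrix, giving a 64-column output
+   * (4 blocks of 16).
+   */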
+
+
+
+ class Bias : public Component
+ {
+ public:
+ Bias(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs,nOutputs,pPred)
+ { }
+
+ ~Bias()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::BIAS; }
+
+ const char* GetName() const
+ { return "<bias>"; }
+
+ Component* Clone() const
+ {
+ Bias* p = new Bias(GetNInputs(),GetNOutputs(),NULL);
+ p->mBias.Init(mBias.Dim());
+ p->mBias.Copy(mBias);
+ return p;
+ }
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ {
+ Y.Copy(X);
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) += mBias[c];
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Y.Copy(X); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { rIn >> mBias; }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << mBias; }
+
+ private:
+ Vector<BaseFloat> mBias;
+ };
+
+
+
+ class Window : public Component
+ {
+ public:
+ Window(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs, nOutputs, pPred)
+ { }
+
+ ~Window()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::WINDOW; }
+
+ const char* GetName() const
+ { return "<window>"; }
+
+ Component* Clone() const
+ {
+ Window* p = new Window(GetNInputs(),GetNOutputs(),NULL);
+ p->mWindow.Init(mWindow.Dim());
+ p->mWindow.Copy(mWindow);
+ return p;
+ }
+
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Y.Copy(X);
+ for(size_t r=0; r<X.Rows(); r++) {
+ for(size_t c=0; c<X.Cols(); c++) {
+ Y(r,c) *= mWindow[c];
+ }
+ }
+ }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { rIn >> mWindow; }
+
+ void WriteToStream(std::ostream& rOut)
+ { rOut << mWindow; }
+
+ private:
+ Vector<BaseFloat> mWindow;
+ };
+
+ class Log : public Component
+ {
+ public:
+ Log(size_t nInputs, size_t nOutputs, Component* pPred)
+ : Component(nInputs, nOutputs, pPred)
+ { }
+
+ ~Log()
+ { }
+
+
+ ComponentType GetType() const
+ { return Component::LOG; }
+
+ const char* GetName() const
+ { return "<log>"; }
+
+ Component* Clone() const
+ { return new Log(GetNInputs(),GetNOutputs(),NULL); }
+
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+ { Y.Copy(X); Y.ApplyLog(); }
+
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+
+ void ReadFromStream(std::istream& rIn)
+ { }
+
+ void WriteToStream(std::ostream& rOut)
+ { }
+
+ };
+
+}
+
+
+#endif
+
diff --git a/src/TNetLib/Cache.cc b/src/TNetLib/Cache.cc
new file mode 100644
index 0000000..f498318
--- /dev/null
+++ b/src/TNetLib/Cache.cc
@@ -0,0 +1,248 @@
+
+#include <sys/time.h>
+
+#include "Cache.h"
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+ Cache::
+ Cache()
+ : mState(EMPTY), mIntakePos(0), mExhaustPos(0), mDiscarded(0),
+ mRandomized(false), mTrace(0)
+ { }
+
+ Cache::
+ ~Cache()
+ { }
+
+ void
+ Cache::
+ Init(size_t cachesize, size_t bunchsize, long int seed)
+ {
+ if((cachesize % bunchsize) != 0) {
+      KALDI_ERR << "Non-divisible cachesize " << cachesize
+                << " by bunchsize " << bunchsize;
+ }
+
+ mCachesize = cachesize;
+ mBunchsize = bunchsize;
+
+ mState = EMPTY;
+
+ mIntakePos = 0;
+ mExhaustPos = 0;
+
+ mRandomized = false;
+
+ if(seed == 0) {
+ //generate seed
+ struct timeval tv;
+ if (gettimeofday(&tv, 0) == -1) {
+ Error("gettimeofday does not work.");
+ exit(-1);
+ }
+ seed = (int)(tv.tv_sec) + (int)tv.tv_usec + (int)(tv.tv_usec*tv.tv_usec);
+ }
+
+ srand48(seed);
+
+ }
+
+ void
+ Cache::
+ AddData(const Matrix<BaseFloat>& rFeatures, const Matrix<BaseFloat>& rDesired)
+ {
+ assert(rFeatures.Rows() == rDesired.Rows());
+
+ //lazy buffers allocation
+ if(mFeatures.Rows() != mCachesize) {
+ mFeatures.Init(mCachesize,rFeatures.Cols());
+ mDesired.Init(mCachesize,rDesired.Cols());
+ }
+
+ //warn if segment longer than half-cache
+ if(rFeatures.Rows() > mCachesize/2) {
+ std::ostringstream os;
+    os << "Too long segment for a small feature cache!"
+ << " cachesize: " << mCachesize
+ << " segmentsize: " << rFeatures.Rows();
+ Warning(os.str());
+ }
+
+ //change state
+ if(mState == EMPTY) {
+ if(mTrace&3) std::cout << "/" << std::flush;
+ mState = INTAKE; mIntakePos = 0;
+
+ //check for leftover from previous segment
+ int leftover = mFeaturesLeftover.Rows();
+ //check if leftover is not bigger than cachesize
+ if(leftover > mCachesize) {
+ std::ostringstream os;
+ os << "Too small feature cache: " << mCachesize
+ << ", truncating: "
+ << leftover - mCachesize << " frames from previous segment leftover";
+ //Error(os.str());
+ Warning(os.str());
+ leftover = mCachesize;
+ }
+ //prefill cache with leftover
+ if(leftover > 0) {
+ memcpy(mFeatures.pData(),mFeaturesLeftover.pData(),
+ (mFeaturesLeftover.MSize() < mFeatures.MSize()?
+ mFeaturesLeftover.MSize() : mFeatures.MSize())
+ );
+ memcpy(mDesired.pData(),mDesiredLeftover.pData(),
+ (mDesiredLeftover.MSize() < mDesired.MSize()?
+ mDesiredLeftover.MSize() : mDesired.MSize())
+ );
+ mFeaturesLeftover.Destroy();
+ mDesiredLeftover.Destroy();
+ mIntakePos += leftover;
+ }
+ }
+
+ assert(mState == INTAKE);
+ assert(rFeatures.Rows() == rDesired.Rows());
+ if(mTrace&2) std::cout << "F" << std::flush;
+
+ int cache_space = mCachesize - mIntakePos;
+ int feature_length = rFeatures.Rows();
+ int fill_rows = (cache_space<feature_length)? cache_space : feature_length;
+ int leftover = feature_length - fill_rows;
+
+ assert(cache_space > 0);
+ assert(mFeatures.Stride()==rFeatures.Stride());
+ assert(mDesired.Stride()==rDesired.Stride());
+
+ //copy the data to cache
+ memcpy(mFeatures.pData()+mIntakePos*mFeatures.Stride(),
+ rFeatures.pData(),
+ fill_rows*mFeatures.Stride()*sizeof(BaseFloat));
+
+ memcpy(mDesired.pData()+mIntakePos*mDesired.Stride(),
+ rDesired.pData(),
+ fill_rows*mDesired.Stride()*sizeof(BaseFloat));
+
+ //copy leftovers
+ if(leftover > 0) {
+ mFeaturesLeftover.Init(leftover,mFeatures.Cols());
+ mDesiredLeftover.Init(leftover,mDesired.Cols());
+
+ memcpy(mFeaturesLeftover.pData(),
+ rFeatures.pData()+fill_rows*rFeatures.Stride(),
+ mFeaturesLeftover.MSize());
+
+ memcpy(mDesiredLeftover.pData(),
+ rDesired.pData()+fill_rows*rDesired.Stride(),
+ mDesiredLeftover.MSize());
+ }
+
+ //update cursor
+ mIntakePos += fill_rows;
+
+ //change state
+ if(mIntakePos == mCachesize) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = FULL;
+ }
+ }
+
+
+
+ void
+ Cache::
+ Randomize()
+ {
+ assert(mState == FULL || mState == INTAKE);
+
+ if(mTrace&3) std::cout << "R" << std::flush;
+
+ //lazy initialization of the output buffers
+ mFeaturesRandom.Init(mCachesize,mFeatures.Cols());
+ mDesiredRandom.Init(mCachesize,mDesired.Cols());
+
+ //generate random series of integers
+ Vector<int> randmask(mIntakePos);
+ for(unsigned int i=0; i<mIntakePos; i++) {
+ randmask[i]=i;
+ }
+ int* ptr = randmask.pData();
+ std::random_shuffle(ptr, ptr+mIntakePos, GenerateRandom);
+
+ //randomize
+ for(int i=0; i<randmask.Dim(); i++) {
+ mFeaturesRandom[i].Copy(mFeatures[randmask[i]]);
+ mDesiredRandom[i].Copy(mDesired[randmask[i]]);
+ }
+
+ mRandomized = true;
+ }
+
+ void
+ Cache::
+ GetBunch(Matrix<BaseFloat>& rFeatures, Matrix<BaseFloat>& rDesired)
+ {
+ if(mState == EMPTY) {
+ Error("GetBunch on empty cache!!!");
+ }
+
+ //change state if full...
+ if(mState == FULL) {
+ if(mTrace&3) std::cout << "\\" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ //final cache is not completely filled
+ if(mState == INTAKE) {
+ if(mTrace&3) std::cout << "\\-LAST_CACHE\n" << std::flush;
+ mState = EXHAUST; mExhaustPos = 0;
+ }
+
+ assert(mState == EXHAUST);
+
+ //init the output
+ if(rFeatures.Rows()!=mBunchsize || rFeatures.Cols()!=mFeatures.Cols()) {
+ rFeatures.Init(mBunchsize,mFeatures.Cols());
+ }
+ if(rDesired.Rows()!=mBunchsize || rDesired.Cols()!=mDesired.Cols()) {
+ rDesired.Init(mBunchsize,mDesired.Cols());
+ }
+
+ //copy the output
+ if(mRandomized) {
+ memcpy(rFeatures.pData(),
+ mFeaturesRandom.pData()+mExhaustPos*mFeatures.Stride(),
+ rFeatures.MSize());
+
+ memcpy(rDesired.pData(),
+ mDesiredRandom.pData()+mExhaustPos*mDesired.Stride(),
+ rDesired.MSize());
+ } else {
+ memcpy(rFeatures.pData(),
+ mFeatures.pData()+mExhaustPos*mFeatures.Stride(),
+ rFeatures.MSize());
+
+ memcpy(rDesired.pData(),
+ mDesired.pData()+mExhaustPos*mDesired.Stride(),
+ rDesired.MSize());
+ }
+
+
+ //update cursor
+ mExhaustPos += mBunchsize;
+
+ //change state to EMPTY
+ if(mExhaustPos > mIntakePos-mBunchsize) {
+ //we don't have more complete bunches...
+ mDiscarded += mIntakePos - mExhaustPos;
+
+ mState = EMPTY;
+ }
+ }
+
+
+}
diff --git a/src/TNetLib/Cache.h b/src/TNetLib/Cache.h
new file mode 100644
index 0000000..800d92c
--- /dev/null
+++ b/src/TNetLib/Cache.h
@@ -0,0 +1,74 @@
+#ifndef _CUCACHE_H_
+#define _CUCACHE_H_
+
+#include "Matrix.h"
+
+namespace TNet {
+
+
+ /**
+ * The feature-target pair cache
+ */
+ class Cache {
+ typedef enum { EMPTY, INTAKE, FULL, EXHAUST } State;
+ public:
+ Cache();
+ ~Cache();
+
+ /// Initialize the cache
+ void Init(size_t cachesize, size_t bunchsize, long int seed = 0);
+
+    /// Add data to the cache
+ void AddData(const Matrix<BaseFloat>& rFeatures, const Matrix<BaseFloat>& rDesired);
+ /// Randomizes the cache
+ void Randomize();
+ /// Get the bunch of training data
+ void GetBunch(Matrix<BaseFloat>& rFeatures, Matrix<BaseFloat>& rDesired);
+
+
+ /// Returns true if the cache was completely filled
+ bool Full()
+ { return (mState == FULL); }
+
+ /// Returns true if the cache is empty
+ bool Empty()
+ { return (mState == EMPTY || mIntakePos < mBunchsize); }
+
+ /// Number of discarded frames
+ int Discarded()
+ { return mDiscarded; }
+
+ /// Set the trace message level
+ void Trace(int trace)
+ { mTrace = trace; }
+
+ private:
+
+ static long int GenerateRandom(int max)
+ { return lrand48() % max; }
+
+ State mState; ///< Current state of the cache
+
+ size_t mIntakePos; ///< Number of intaken vectors by AddData
+ size_t mExhaustPos; ///< Number of exhausted vectors by GetBunch
+
+ size_t mCachesize; ///< Size of cache
+ size_t mBunchsize; ///< Size of bunch
+ int mDiscarded; ///< Number of discarded frames
+
+ Matrix<BaseFloat> mFeatures; ///< Feature cache
+ Matrix<BaseFloat> mFeaturesRandom; ///< Feature cache
+ Matrix<BaseFloat> mFeaturesLeftover; ///< Feature cache
+
+ Matrix<BaseFloat> mDesired; ///< Desired vector cache
+ Matrix<BaseFloat> mDesiredRandom; ///< Desired vector cache
+ Matrix<BaseFloat> mDesiredLeftover; ///< Desired vector cache
+
+ bool mRandomized;
+
+ int mTrace;
+ };
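+
+  /*
+   * Cache, typical usage sketch (illustrative only; the surrounding training
+   * loop and variable names are assumed):
+   *
+   *   Cache cache;
+   *   cache.Init(16384, 256);         // cachesize, bunchsize
+   *   cache.AddData(feats, targets);  // repeat until Full() or data ends
+   *   cache.Randomize();
+   *   Matrix<BaseFloat> f, t;
+   *   while(!cache.Empty()) {
+   *     cache.GetBunch(f, t);         // one randomized minibatch
+   *     // ...train on f, t...
+   *   }
+   */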
+
+}
+
+#endif
diff --git a/src/TNetLib/Component.h b/src/TNetLib/Component.h
new file mode 100644
index 0000000..762451e
--- /dev/null
+++ b/src/TNetLib/Component.h
@@ -0,0 +1,387 @@
+#ifndef _NETWORK_COMPONENT_I_H
+#define _NETWORK_COMPONENT_I_H
+
+
+#include "Vector.h"
+#include "Matrix.h"
+
+#include <iostream>
+#include <stdexcept>
+
+
+namespace TNet {
+
+
+ /**
+ * Basic element of the network,
+ * it is a box with defined inputs and outputs,
+ * and functions to refresh outputs
+ *
+   * it is able to compute the transformation function (forward pass)
+   * and the Jacobian function (backward pass),
+   * both of which are to be implemented in descendants
+ */
+ class Component
+ {
+ public:
+ /// Types of the net components
+ typedef enum {
+ UPDATABLE_COMPONENT = 0x0100,
+ BIASED_LINEARITY,
+ SHARED_LINEARITY,
+
+ ACT_FUN = 0x0200,
+ SOFTMAX,
+ SIGMOID,
+ BLOCK_SOFTMAX,
+
+ OTHER = 0x0400,
+ EXPAND,
+ COPY,
+ TRANSPOSE,
+ BLOCK_LINEARITY,
+ WINDOW,
+ BIAS,
+ LOG,
+
+ BLOCK_ARRAY,
+ } ComponentType;
+
+
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ Component(size_t nInputs, size_t nOutputs, Component *pPred);
+ virtual ~Component();
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+ /// Get Type Identification of the component
+ virtual ComponentType GetType() const = 0;
+ /// Get Type Label of the component
+ virtual const char* GetName() const = 0;
+ ///
+ virtual bool IsUpdatable() const
+ { return false; }
+ /// Clone the component
+ virtual Component* Clone() const = 0;
+
+ /// Get size of input vectors
+ size_t GetNInputs() const;
+ /// Get size of output vectors
+ size_t GetNOutputs() const;
+
+ /// IO Data getters
+ const Matrix<BaseFloat>& GetInput() const;
+ const Matrix<BaseFloat>& GetOutput() const;
+ const Matrix<BaseFloat>& GetErrorInput() const;
+ const Matrix<BaseFloat>& GetErrorOutput() const;
+
+ /// Set input vector (bind with the preceding NetworkComponent)
+ void SetInput(const Matrix<BaseFloat>& rInput);
+ /// Set error input vector (bind with the following NetworkComponent)
+ void SetErrorInput(const Matrix<BaseFloat>& rErrorInput);
+
+    /// Perform forward pass propagation Input->Output
+ void Propagate();
+    /// Perform backward pass propagation ErrorInput->ErrorOutput
+ void Backpropagate();
+
+ /// Reads the component parameters from stream
+ virtual void ReadFromStream(std::istream& rIn) { }
+ /// Writes the components parameters to stream
+ virtual void WriteToStream(std::ostream& rOut) { }
+
+
+ ///////////////////////////////////////////////////////////////
+ // Nonpublic member functions used to update data outputs
+ protected:
+ /// Forward pass transformation (to be implemented by descendents...)
+ virtual void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y) = 0;
+ /// Backward pass transformation (to be implemented by descendents...)
+ virtual void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y) = 0;
+
+
+ ///////////////////////////////////////////////////////////////
+ // data members
+ protected:
+
+ size_t mNInputs; ///< Size of input vectors
+ size_t mNOutputs; ///< Size of output vectors
+
+ const Matrix<BaseFloat>* mpInput; ///< inputs are NOT OWNED by component
+ const Matrix<BaseFloat>* mpErrorInput;///< inputs are NOT OWNED by component
+
+ Matrix<BaseFloat> mOutput; ///< outputs are OWNED by component
+ Matrix<BaseFloat> mErrorOutput; ///< outputs are OWNED by component
+
+ };
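+
+  /*
+   * Component chaining, minimal propagation sketch (illustrative only;
+   * component types, dimensions and the 'features' matrix are assumed):
+   *
+   *   Component* l1 = new BiasedLinearity(440, 1024, NULL);
+   *   Component* l2 = new Sigmoid(1024, 1024, l1);  // ctor binds l2 input to l1 output
+   *   l1->SetInput(features);
+   *   l1->Propagate();
+   *   l2->Propagate();                              // reads l1->GetOutput()
+   *   const Matrix<BaseFloat>& out = l2->GetOutput();
+   */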
+
+
+ /**
+ * Class UpdatableComponent is a box which has some
+ * parameters adjustable by learning
+ *
+ * you can set the learning rate, lock the params,
+ * and learn from each data observation
+ */
+ class UpdatableComponent : public Component
+ {
+ //////////////////////////////////////////////////////////////
+ // Constructor & Destructor
+ public:
+ UpdatableComponent(size_t nInputs, size_t nOutputs, Component *pPred);
+ virtual ~UpdatableComponent();
+
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification (public)
+ public:
+ ///
+ virtual bool IsUpdatable() const
+ { return true; }
+
+ /// calculate gradient
+ virtual void Gradient() = 0;
+ /// accumulate gradient from other components
+ virtual void AccuGradient(const UpdatableComponent& src, int thr, int thrN) = 0;
+ /// update weights, reset the accumulator
+ virtual void Update(int thr, int thrN) = 0;
+
+ /// Sets the learning rate of gradient descent
+ void LearnRate(BaseFloat rate);
+ /// Gets the learning rate of gradient descent
+ BaseFloat LearnRate() const;
+
+ void Momentum(BaseFloat mmt);
+ BaseFloat Momentum() const ;
+
+ void Weightcost(BaseFloat cost);
+ BaseFloat Weightcost() const;
+
+ void Bunchsize(size_t size);
+ size_t Bunchsize() const;
+
+ protected:
+ BaseFloat mLearningRate;
+ BaseFloat mMomentum;
+ BaseFloat mWeightcost;
+ size_t mBunchsize;
+ };
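+
+  /*
+   * UpdatableComponent, threaded training sketch (illustrative only; the
+   * orchestration shown here is assumed, thread creation and synchronization
+   * are omitted): each worker thread computes Gradient() on its own Clone(),
+   * and the per-thread gradients are merged and applied on the shared master
+   * component, with the row range selected by (thr, thrN) so threads never
+   * write the same rows:
+   *
+   *   clone->Gradient();                        // local gradient
+   *   master->AccuGradient(*clone, thr, thrN);  // sum this thread's row range
+   *   master->Update(thr, thrN);                // SGD step, reset accumulator
+   */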
+
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // Component::
+ inline
+ Component::
+ Component(size_t nInputs, size_t nOutputs, Component *pPred)
+ : mNInputs(nInputs), mNOutputs(nOutputs),
+ mpInput(NULL), mpErrorInput(NULL),
+ mOutput(), mErrorOutput()
+ {
+ /* DOUBLE LINK the Components */
+ if (pPred != NULL) {
+ SetInput(pPred->GetOutput());
+ pPred->SetErrorInput(GetErrorOutput());
+ }
+ }
+
+
+ inline
+ Component::
+ ~Component()
+ {
+ ;
+ }
+
+ inline void
+ Component::
+ Propagate()
+ {
+ //initialize output buffer
+ if(mOutput.Rows() != GetInput().Rows() || mOutput.Cols() != GetNOutputs()) {
+ mOutput.Init(GetInput().Rows(),GetNOutputs());
+ }
+ //do the dimensionality test
+ if(GetNInputs() != GetInput().Cols()) {
+ KALDI_ERR << "Non-matching INPUT dim!!! Network dim: " << GetNInputs()
+ << " Data dim: " << GetInput().Cols();
+ }
+ //run transform
+ PropagateFnc(GetInput(),mOutput);
+
+ }
+
+
+ inline void
+ Component::
+ Backpropagate()
+ {
+ //re-initialize the output buffer
+ if(mErrorOutput.Rows() != GetErrorInput().Rows() || mErrorOutput.Cols() != GetNInputs()) {
+ mErrorOutput.Init(GetErrorInput().Rows(),GetNInputs());
+ }
+
+ //do the dimensionality test
+ assert(GetErrorInput().Cols() == mNOutputs);
+ assert(mErrorOutput.Cols() == mNInputs);
+ assert(mErrorOutput.Rows() == GetErrorInput().Rows());
+
+ //transform
+ BackpropagateFnc(GetErrorInput(),mErrorOutput);
+
+ }
+
+
+ inline void
+ Component::
+ SetInput(const Matrix<BaseFloat>& rInput)
+ {
+ mpInput = &rInput;
+ }
+
+
+ inline void
+ Component::
+ SetErrorInput(const Matrix<BaseFloat>& rErrorInput)
+ {
+ mpErrorInput = &rErrorInput;
+ }
+
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetInput() const
+ {
+ if (NULL == mpInput) Error("mpInput is NULL");
+ return *mpInput;
+ }
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetOutput() const
+ {
+ return mOutput;
+ }
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetErrorInput() const
+ {
+ if (NULL == mpErrorInput) Error("mpErrorInput is NULL");
+ return *mpErrorInput;
+ }
+
+ inline const Matrix<BaseFloat>&
+ Component::
+ GetErrorOutput() const
+ {
+ return mErrorOutput;
+ }
+
+ inline size_t
+ Component::
+ GetNInputs() const
+ {
+ return mNInputs;
+ }
+
+ inline size_t
+ Component::
+ GetNOutputs() const
+ {
+ return mNOutputs;
+ }
+
+
+
+ //////////////////////////////////////////////////////////////////////////
+ // INLINE FUNCTIONS
+ // UpdatableComponent::
+
+ inline
+ UpdatableComponent::
+ UpdatableComponent(size_t nInputs, size_t nOutputs, Component *pPred)
+ : Component(nInputs, nOutputs, pPred),
+ mLearningRate(0.0), mMomentum(0.0), mWeightcost(0.0), mBunchsize(0)
+ {
+ ;
+ }
+
+
+ inline
+ UpdatableComponent::
+ ~UpdatableComponent()
+ {
+ ;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ LearnRate(BaseFloat rate)
+ {
+ mLearningRate = rate;
+ }
+
+ inline BaseFloat
+ UpdatableComponent::
+ LearnRate() const
+ {
+ return mLearningRate;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ Momentum(BaseFloat mmt)
+ {
+ mMomentum = mmt;
+ }
+
+ inline BaseFloat
+ UpdatableComponent::
+ Momentum() const
+ {
+ return mMomentum;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ Weightcost(BaseFloat cost)
+ {
+ mWeightcost = cost;
+ }
+
+ inline BaseFloat
+ UpdatableComponent::
+ Weightcost() const
+ {
+ return mWeightcost;
+ }
+
+
+ inline void
+ UpdatableComponent::
+ Bunchsize(size_t size)
+ {
+ mBunchsize = size;
+ }
+
+ inline size_t
+ UpdatableComponent::
+ Bunchsize() const
+ {
+ return mBunchsize;
+ }
+
+
+} // namespace TNet
+
+
+#endif
diff --git a/src/TNetLib/Makefile b/src/TNetLib/Makefile
new file mode 100644
index 0000000..58ff988
--- /dev/null
+++ b/src/TNetLib/Makefile
@@ -0,0 +1,29 @@
+
+include ../tnet.mk
+
+INCLUDE = -I. -I../KaldiLib -I../STKLib/
+
+all: libTNetLib.a
+
+libTNetLib.a: $(OBJ)
+ $(AR) ruv $@ $(OBJ)
+ $(RANLIB) $@
+
+%.o : %.cc
+ $(CXX) -o $@ -c $< $(CFLAGS) $(CXXFLAGS) $(INCLUDE)
+
+
+
+.PHONY: clean doc depend
+clean:
+ rm -f *.o *.a
+
+doc:
+ doxygen ../../doc/doxyfile_TNetLib
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) > .depend.mk
+
+-include .depend.mk
+
+
diff --git a/src/TNetLib/Mutex.cc b/src/TNetLib/Mutex.cc
new file mode 100644
index 0000000..4ec956a
--- /dev/null
+++ b/src/TNetLib/Mutex.cc
@@ -0,0 +1,48 @@
+
+#include <pthread.h>
+#include <cerrno>
+
+#include "Error.h"
+#include "Mutex.h"
+
+namespace TNet {
+
+
+Mutex::Mutex() {
+ if(0 != pthread_mutex_init(&mutex_,NULL))
+ KALDI_ERR << "Cannot initialize mutex";
+}
+
+
+Mutex::~Mutex() {
+ if(0 != pthread_mutex_destroy(&mutex_))
+ KALDI_ERR << "Cannot destroy mutex";
+}
+
+
+void Mutex::Lock() {
+ if(0 != pthread_mutex_lock(&mutex_))
+ KALDI_ERR << "Error on locking mutex";
+}
+
+
+bool Mutex::TryLock() {
+  //use trylock so the call never blocks when the mutex is already held
+  int ret = pthread_mutex_trylock(&mutex_);
+  switch (ret) {
+    case 0: return true;
+    case EBUSY: return false;
+    default: KALDI_ERR << "Error on try-locking mutex";
+  }
+  return false; //not reached, silences the compiler warning
+}
+
+
+void Mutex::Unlock() {
+ if(0 != pthread_mutex_unlock(&mutex_))
+ KALDI_ERR << "Error on unlocking mutex";
+}
+
+
+
+}//namespace TNet
+
diff --git a/src/TNetLib/Mutex.h b/src/TNetLib/Mutex.h
new file mode 100644
index 0000000..ae2cfff
--- /dev/null
+++ b/src/TNetLib/Mutex.h
@@ -0,0 +1,34 @@
+#ifndef _TNET_MUTEX_H_
+#define _TNET_MUTEX_H_
+
+#include <pthread.h>
+
+namespace TNet {
+
+/**
+ * This class encapsulates a mutex to ensure
+ * exclusive access to a critical section
+ * which manipulates shared resources.
+ *
+ * The mutex must be unlocked from the
+ * SAME THREAD which locked it.
+ */
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+
+ void Lock();
+
+  /**
+   * Try to lock the mutex without waiting for it.
+   * Returns: true when the lock succeeded,
+   *          false when the mutex was already locked
+   */
+ bool TryLock();
+
+ void Unlock();
+
+ private:
+ pthread_mutex_t mutex_;
+};
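+
+// Usage sketch (illustrative only, not part of the library API):
+//
+//   Mutex m;
+//   m.Lock();
+//   // ...touch the shared state...
+//   m.Unlock();
+//
+//   if(m.TryLock()) {   //non-blocking variant
+//     // ...critical section...
+//     m.Unlock();
+//   }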
+
+} //namespace TNet
+
+#endif
diff --git a/src/TNetLib/Nnet.cc b/src/TNetLib/Nnet.cc
new file mode 100644
index 0000000..4b364ac
--- /dev/null
+++ b/src/TNetLib/Nnet.cc
@@ -0,0 +1,360 @@
+
+#include <algorithm>
+//#include <locale>
+#include <cctype>
+
+#include "Nnet.h"
+#include "CRBEDctFeat.h"
+#include "BlockArray.h"
+
+namespace TNet {
+
+
+
+
+void Network::Feedforward(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out,
+ size_t start_frm_ext, size_t end_frm_ext) {
+ //empty network: copy input to output
+ if(mNnet.size() == 0) {
+ if(out.Rows() != in.Rows() || out.Cols() != in.Cols()) {
+ out.Init(in.Rows(),in.Cols());
+ }
+ out.Copy(in);
+ return;
+ }
+
+ //short input: propagate in one block
+ if(in.Rows() < 5000) {
+ Propagate(in,out);
+ } else {//long input: propagate per parts
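+    // Each 1024-row block below is propagated together with
+    // start_frm_ext/end_frm_ext rows of temporal context borrowed from the
+    // neighbouring blocks; only the central rows are copied to 'out', so the
+    // result matches a single-pass Propagate() of the whole input.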
+ //initialize
+ out.Init(in.Rows(),GetNOutputs());
+ Matrix<BaseFloat> tmp_in, tmp_out;
+ int done=0, block=1024;
+ //propagate first part
+ tmp_in.Init(block+end_frm_ext,in.Cols());
+ tmp_in.Copy(in.Range(0,block+end_frm_ext,0,in.Cols()));
+ Propagate(tmp_in,tmp_out);
+ out.Range(0,block,0,tmp_out.Cols()).Copy(
+ tmp_out.Range(0,block,0,tmp_out.Cols())
+ );
+ done += block;
+ //propagate middle parts
+ while((done+2*block) < in.Rows()) {
+ tmp_in.Init(block+start_frm_ext+end_frm_ext,in.Cols());
+      tmp_in.Copy(in.Range(done-start_frm_ext, block+start_frm_ext+end_frm_ext, 0,in.Cols()));
+      Propagate(tmp_in,tmp_out);
+ out.Range(done,block,0,tmp_out.Cols()).Copy(
+ tmp_out.Range(start_frm_ext,block,0,tmp_out.Cols())
+ );
+ done += block;
+ }
+ //propagate last part
+ tmp_in.Init(in.Rows()-done+start_frm_ext,in.Cols());
+ tmp_in.Copy(in.Range(done-start_frm_ext,in.Rows()-done+start_frm_ext,0,in.Cols()));
+ Propagate(tmp_in,tmp_out);
+ out.Range(done,out.Rows()-done,0,out.Cols()).Copy(
+ tmp_out.Range(start_frm_ext,tmp_out.Rows()-start_frm_ext,0,tmp_out.Cols())
+ );
+
+ done += tmp_out.Rows()-start_frm_ext;
+ assert(done == out.Rows());
+ }
+}
+
+
+void Network::Propagate(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out) {
+ //empty network: copy input to output
+ if(mNnet.size() == 0) {
+ if(out.Rows() != in.Rows() || out.Cols() != in.Cols()) {
+ out.Init(in.Rows(),in.Cols());
+ }
+ out.Copy(in);
+ return;
+ }
+
+ //this will keep pointer to matrix 'in', for backprop
+ mNnet.front()->SetInput(in);
+
+ //propagate
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ (*it)->Propagate();
+ }
+
+ //copy the output matrix
+ const Matrix<BaseFloat>& mat = mNnet.back()->GetOutput();
+ if(out.Rows() != mat.Rows() || out.Cols() != mat.Cols()) {
+ out.Init(mat.Rows(),mat.Cols());
+ }
+ out.Copy(mat);
+
+}
+
+
+void Network::Backpropagate(const Matrix<BaseFloat>& globerr) {
+ //pass matrix to last component
+ mNnet.back()->SetErrorInput(globerr);
+
+ // back-propagation : reversed order,
+ LayeredType::reverse_iterator it;
+ for(it=mNnet.rbegin(); it!=mNnet.rend(); ++it) {
+ //first component does not backpropagate error (no predecessors)
+ if(*it != mNnet.front()) {
+ (*it)->Backpropagate();
+ }
+ //compute gradient if updatable component
+ if((*it)->IsUpdatable()) {
+ UpdatableComponent& comp = dynamic_cast<UpdatableComponent&>(**it);
+ comp.Gradient(); //compute gradient
+ }
+ }
+}
+
+
+void Network::AccuGradient(const Network& src, int thr, int thrN) {
+ LayeredType::iterator it;
+ LayeredType::const_iterator it2;
+
+ for(it=mNnet.begin(), it2=src.mNnet.begin(); it!=mNnet.end(); ++it,++it2) {
+ if((*it)->IsUpdatable()) {
+ UpdatableComponent& comp = dynamic_cast<UpdatableComponent&>(**it);
+ const UpdatableComponent& comp2 = dynamic_cast<const UpdatableComponent&>(**it2);
+ comp.AccuGradient(comp2,thr,thrN);
+ }
+ }
+}
+
+
+void Network::Update(int thr, int thrN) {
+ LayeredType::iterator it;
+
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ UpdatableComponent& comp = dynamic_cast<UpdatableComponent&>(**it);
+ comp.Update(thr,thrN);
+ }
+ }
+}
+
+
+Network* Network::Clone() {
+ Network* net = new Network;
+ LayeredType::iterator it;
+ for(it = mNnet.begin(); it != mNnet.end(); ++it) {
+ //clone
+ net->mNnet.push_back((*it)->Clone());
+ //connect network
+ if(net->mNnet.size() > 1) {
+ Component* last = *(net->mNnet.end()-1);
+ Component* prev = *(net->mNnet.end()-2);
+ last->SetInput(prev->GetOutput());
+ prev->SetErrorInput(last->GetErrorOutput());
+ }
+ }
+
+ //copy the learning rate
+ //net->SetLearnRate(GetLearnRate());
+
+ return net;
+}
+
+
+void Network::ReadNetwork(const char* pSrc) {
+ std::ifstream in(pSrc);
+ if(!in.good()) {
+ Error(std::string("Error, cannot read model: ")+pSrc);
+ }
+ ReadNetwork(in);
+ in.close();
+}
+
+
+
+void Network::ReadNetwork(std::istream& rIn) {
+ //get the network elements from a factory
+ Component *pComp;
+ while(NULL != (pComp = ComponentFactory(rIn)))
+ mNnet.push_back(pComp);
+}
+
+
+void Network::WriteNetwork(const char* pDst) {
+ std::ofstream out(pDst);
+ if(!out.good()) {
+ Error(std::string("Error, cannot write model: ")+pDst);
+ }
+ WriteNetwork(out);
+ out.close();
+}
+
+
+void Network::WriteNetwork(std::ostream& rOut) {
+  //dump all the components
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ ComponentDumper(rOut, **it);
+ }
+}
+
+
+Component*
+Network::
+ComponentFactory(std::istream& rIn)
+{
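+  // Each component is stored as a tag line followed by its parameters, e.g.
+  // (the values below are made-up, only the layout is real):
+  //   <biasedlinearity> 1024 440
+  //   ...weight matrix and bias vector, if the component has parameters...
+  // The first number is the output dimension, the second the input dimension
+  // (see ComponentDumper below); '<endblock>' terminates the network.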
+ rIn >> std::ws;
+ if(rIn.eof()) return NULL;
+
+ Component* pRet=NULL;
+ Component* pPred=NULL;
+
+ std::string componentTag;
+ size_t nInputs, nOutputs;
+
+ rIn >> std::ws;
+ rIn >> componentTag;
+ if(componentTag == "") return NULL; //nothing left in the file
+
+ //make it lowercase
+ std::transform(componentTag.begin(), componentTag.end(),
+ componentTag.begin(), tolower);
+
+ //the 'endblock' tag terminates the network
+ if(componentTag == "<endblock>") return NULL;
+
+
+ if(componentTag[0] != '<' || componentTag[componentTag.size()-1] != '>') {
+ Error(std::string("Invalid component tag:")+componentTag);
+ }
+
+ rIn >> std::ws;
+ rIn >> nOutputs;
+ rIn >> std::ws;
+ rIn >> nInputs;
+ assert(nInputs > 0 && nOutputs > 0);
+
+ //make coupling with predecessor
+ if(mNnet.size() == 0) {
+ pPred = NULL;
+ } else {
+ pPred = mNnet.back();
+ }
+
+ //array with list of component tags
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<sharedlinearity>",
+
+ "<sigmoid>",
+ "<softmax>",
+ "<blocksoftmax>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+ };
+
+ static const int n_tags = sizeof(TAGS) / sizeof(TAGS[0]);
+ int i = 0;
+ for(i=0; i<n_tags; i++) {
+ if(componentTag == TAGS[i]) break;
+ }
+
+ //switch according to position in array TAGS
+ switch(i) {
+ case 0: pRet = new BiasedLinearity(nInputs,nOutputs,pPred); break;
+ case 1: pRet = new SharedLinearity(nInputs,nOutputs,pPred); break;
+
+ case 2: pRet = new Sigmoid(nInputs,nOutputs,pPred); break;
+ case 3: pRet = new Softmax(nInputs,nOutputs,pPred); break;
+ case 4: pRet = new BlockSoftmax(nInputs,nOutputs,pPred); break;
+
+ case 5: pRet = new Expand(nInputs,nOutputs,pPred); break;
+ case 6: pRet = new Copy(nInputs,nOutputs,pPred); break;
+ case 7: pRet = new Transpose(nInputs,nOutputs,pPred); break;
+ case 8: pRet = new BlockLinearity(nInputs,nOutputs,pPred); break;
+ case 9: pRet = new Bias(nInputs,nOutputs,pPred); break;
+ case 10: pRet = new Window(nInputs,nOutputs,pPred); break;
+ case 11: pRet = new Log(nInputs,nOutputs,pPred); break;
+
+ case 12: pRet = new BlockArray(nInputs,nOutputs,pPred); break;
+
+ default: Error(std::string("Unknown Component tag:")+componentTag);
+ }
+
+  //read the component parameters (if any)
+ pRet->ReadFromStream(rIn);
+ //return
+ return pRet;
+}
+
+
+void
+Network::
+ComponentDumper(std::ostream& rOut, Component& rComp)
+{
+  //map component type codes to the corresponding tags
+ //array with list of component tags
+ static const Component::ComponentType TYPES[] = {
+ Component::BIASED_LINEARITY,
+ Component::SHARED_LINEARITY,
+
+ Component::SIGMOID,
+ Component::SOFTMAX,
+ Component::BLOCK_SOFTMAX,
+
+ Component::EXPAND,
+ Component::COPY,
+ Component::TRANSPOSE,
+ Component::BLOCK_LINEARITY,
+ Component::BIAS,
+ Component::WINDOW,
+ Component::LOG,
+
+ Component::BLOCK_ARRAY,
+ };
+ static const std::string TAGS[] = {
+ "<biasedlinearity>",
+ "<sharedlinearity>",
+
+ "<sigmoid>",
+ "<softmax>",
+ "<blocksoftmax>",
+
+ "<expand>",
+ "<copy>",
+ "<transpose>",
+ "<blocklinearity>",
+ "<bias>",
+ "<window>",
+ "<log>",
+
+ "<blockarray>",
+ };
+ static const int MAX = sizeof TYPES / sizeof TYPES[0];
+
+ int i;
+ for(i=0; i<MAX; ++i) {
+ if(TYPES[i] == rComp.GetType()) break;
+ }
+ if(i == MAX) Error("Unknown ComponentType");
+
+ //dump the component tag
+ rOut << TAGS[i] << " "
+ << rComp.GetNOutputs() << " "
+ << rComp.GetNInputs() << std::endl;
+
+ //dump the parameters (if any)
+ rComp.WriteToStream(rOut);
+}
+
+
+
+
+} //namespace
+
diff --git a/src/TNetLib/Nnet.h b/src/TNetLib/Nnet.h
new file mode 100644
index 0000000..12e2585
--- /dev/null
+++ b/src/TNetLib/Nnet.h
@@ -0,0 +1,194 @@
+#ifndef _NETWORK_H_
+#define _NETWORK_H_
+
+#include "Component.h"
+#include "BiasedLinearity.h"
+#include "SharedLinearity.h"
+#include "Activation.h"
+
+#include "Vector.h"
+
+#include <vector>
+
+
+namespace TNet {
+
+class Network
+{
+//////////////////////////////////////
+// Typedefs
+typedef std::vector<Component*> LayeredType;
+
+ //////////////////////////////////////
+ // Disable copy construction and assignment
+ private:
+ Network(Network&);
+ Network& operator=(Network&);
+
+ public:
+ // allow incomplete network creation
+ Network()
+ { }
+
+ ~Network();
+
+ int Layers() const
+ { return mNnet.size(); }
+
+ Component& Layer(int i)
+ { return *mNnet[i]; }
+
+ const Component& Layer(int i) const
+ { return *mNnet[i]; }
+
+  /// Feedforward the data in blocks; this needs less memory
+  /// and allows processing of very long files.
+  /// It does not trim the *_frm_ext, but uses it
+  /// for the concatenation of segments.
+ void Feedforward(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out,
+ size_t start_frm_ext, size_t end_frm_ext);
+ /// forward the data to the output
+ void Propagate(const Matrix<BaseFloat>& in, Matrix<BaseFloat>& out);
+ /// backpropagate the error while calculating the gradient
+ void Backpropagate(const Matrix<BaseFloat>& globerr);
+
+ /// accumulate the gradient from other networks
+ void AccuGradient(const Network& src, int thr, int thrN);
+ /// update weights, reset the accumulator
+ void Update(int thr, int thrN);
+
+ Network* Clone(); ///< Clones the network
+
+ void ReadNetwork(const char* pSrc); ///< read the network from file
+ void ReadNetwork(std::istream& rIn); ///< read the network from stream
+ void WriteNetwork(const char* pDst); ///< write network to file
+ void WriteNetwork(std::ostream& rOut); ///< write network to stream
+
+ size_t GetNInputs() const; ///< Dimensionality of the input features
+ size_t GetNOutputs() const; ///< Dimensionality of the desired vectors
+
+ void SetLearnRate(BaseFloat learnRate); ///< set the learning rate value
+ BaseFloat GetLearnRate(); ///< get the learning rate value
+
+ void SetWeightcost(BaseFloat l2); ///< set the L2 regularization const
+
+  void ResetBunchsize(); ///< reset the frame counter (needed for L2 regularization)
+  void AccuBunchsize(const Network& src); ///< accumulate frame counts in bunch (needed for L2 regularization)
+
+ private:
+ /// Creates a component by reading from stream
+ Component* ComponentFactory(std::istream& In);
+ /// Dumps component into a stream
+ void ComponentDumper(std::ostream& rOut, Component& rComp);
+
+ private:
+ LayeredType mNnet; ///< container with the network layers
+
+};
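+
+// A minimal single-threaded training step (illustrative sketch only; it
+// assumes an ObjectiveFunction 'obj_fun' from ObjFun.h and a made-up model
+// file name):
+//
+//   Network net;
+//   net.ReadNetwork("nnet.init");           //hypothetical file
+//   net.SetLearnRate(0.008f);
+//   Matrix<BaseFloat> out, err;
+//   net.Propagate(features, out);           //forward pass
+//   obj_fun->Evaluate(out, targets, &err);  //global error at the output
+//   net.Backpropagate(err);                 //backward pass, per-layer gradients
+//   net.AccuGradient(net, 0, 1);            //single thread: thr=0, thrN=1
+//   net.Update(0, 1);                       //apply the accumulated update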
+
+
+//////////////////////////////////////////////////////////////////////////
+// INLINE FUNCTIONS
+// Network::
+inline Network::~Network() {
+ //delete all the components
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ delete *it;
+ }
+}
+
+
+inline size_t Network::GetNInputs() const {
+ assert(mNnet.size() > 0);
+ return mNnet.front()->GetNInputs();
+}
+
+
+inline size_t
+Network::
+GetNOutputs() const
+{
+ assert(mNnet.size() > 0);
+ return mNnet.back()->GetNOutputs();
+}
+
+
+
+inline void
+Network::
+SetLearnRate(BaseFloat learnRate)
+{
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<UpdatableComponent*>(*it)->LearnRate(learnRate);
+ }
+ }
+}
+
+
+inline BaseFloat
+Network::
+GetLearnRate()
+{
+ //TODO - learn rates may differ layer to layer
+ assert(mNnet.size() > 0);
+ for(size_t i=0; i<mNnet.size(); i++) {
+ if(mNnet[i]->IsUpdatable()) {
+ return dynamic_cast<UpdatableComponent*>(mNnet[i])->LearnRate();
+ }
+ }
+ Error("No updatable NetComponents");
+ return -1;
+}
+
+
+inline void
+Network::
+SetWeightcost(BaseFloat l2)
+{
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<UpdatableComponent*>(*it)->Weightcost(l2);
+ }
+ }
+}
+
+
+inline void
+Network::
+ResetBunchsize()
+{
+ LayeredType::iterator it;
+ for(it=mNnet.begin(); it!=mNnet.end(); ++it) {
+ if((*it)->IsUpdatable()) {
+ dynamic_cast<UpdatableComponent*>(*it)->Bunchsize(0);
+ }
+ }
+}
+
+inline void
+Network::
+AccuBunchsize(const Network& src)
+{
+ assert(Layers() == src.Layers());
+ assert(Layers() > 0);
+
+ for(int i=0; i<Layers(); i++) {
+ if(Layer(i).IsUpdatable()) {
+ UpdatableComponent& tgt_comp = dynamic_cast<UpdatableComponent&>(Layer(i));
+ const UpdatableComponent& src_comp = dynamic_cast<const UpdatableComponent&>(src.Layer(i));
+ tgt_comp.Bunchsize(tgt_comp.Bunchsize()+src_comp.GetOutput().Rows());
+ }
+ }
+}
+
+
+
+} //namespace
+
+#endif
+
+
diff --git a/src/TNetLib/ObjFun.cc b/src/TNetLib/ObjFun.cc
new file mode 100644
index 0000000..c899fb1
--- /dev/null
+++ b/src/TNetLib/ObjFun.cc
@@ -0,0 +1,231 @@
+
+#include "ObjFun.h"
+#include "Error.h"
+
+#include <limits>
+
+namespace TNet {
+
+
+ObjectiveFunction* ObjectiveFunction::Factory(ObjFunType type) {
+ ObjectiveFunction* ret = NULL;
+ switch(type) {
+ case MEAN_SQUARE_ERROR: ret = new MeanSquareError; break;
+ case CROSS_ENTROPY: ret = new CrossEntropy; break;
+ default: Error("Unknown ObjectiveFunction type");
+ }
+ return ret;
+}
+
+
+/*
+ * MeanSquareError
+ */
+void MeanSquareError::Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err) {
+
+ //check dimensions
+ assert(net_out.Rows() == target.Rows());
+ assert(net_out.Cols() == target.Cols());
+ if(err->Rows() != net_out.Rows() || err->Cols() != net_out.Cols()) {
+ err->Init(net_out.Rows(),net_out.Cols());
+ }
+
+ //compute global gradient
+ err->Copy(net_out);
+ err->AddScaled(-1,target);
+
+ //compute loss function
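+  // per-call contribution: 1/2 * sum_{r,c} (net_out(r,c) - target(r,c))^2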
+ double sum = 0;
+ for(size_t r=0; r<err->Rows(); r++) {
+ for(size_t c=0; c<err->Cols(); c++) {
+ BaseFloat val = (*err)(r,c);
+ sum += val*val;
+ }
+ }
+ error_ += sum/2.0;
+ frames_ += net_out.Rows();
+}
+
+
+std::string MeanSquareError::Report() {
+ std::stringstream ss;
+ ss << "Mse:" << error_ << " frames:" << frames_
+ << " err/frm:" << error_/frames_
+ << "\n";
+ return ss.str();
+}
+
+
+/*
+ * CrossEntropy
+ */
+
+///Find maximum in float array
+inline int FindMaxId(const BaseFloat* ptr, size_t N) {
+ BaseFloat mval = -1e20f;
+ int mid = -1;
+ for(size_t i=0; i<N; i++) {
+ if(ptr[i] > mval) {
+ mid = i; mval = ptr[i];
+ }
+ }
+ return mid;
+}
+
+
+void
+CrossEntropy::Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err)
+{
+ //check dimensions
+ assert(net_out.Rows() == target.Rows());
+ assert(net_out.Cols() == target.Cols());
+ if(err->Rows() != net_out.Rows() || err->Cols() != net_out.Cols()) {
+ err->Init(net_out.Rows(),net_out.Cols());
+ }
+
+  //allocate confusion buffers
+ if(confusion_mode_ != NO_CONF) {
+ if(confusion_.Rows() != target.Cols() || confusion_.Cols() != target.Cols()) {
+ confusion_.Init(target.Cols(),target.Cols());
+ confusion_count_.Init(target.Cols());
+ diag_confusion_.Init(target.Cols());
+ }
+ }
+
+  //compute global gradient (assuming the net output comes from a softmax)
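+  // for softmax outputs y and targets t, the gradient w.r.t. the softmax
+  // input is simply (y - t), which is what gets stored in 'err'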
+ err->Copy(net_out);
+ err->AddScaled(-1,target);
+
+ //collect max values
+ std::vector<size_t> max_target_id(target.Rows());
+ std::vector<size_t> max_netout_id(target.Rows());
+ //check correct classification
+ int corr = 0;
+ for(size_t r=0; r<net_out.Rows(); r++) {
+ int id_netout = FindMaxId(net_out[r].pData(),net_out.Cols());
+ int id_target = FindMaxId(target[r].pData(),target.Cols());
+ if(id_netout == id_target) corr++;
+ max_target_id[r] = id_target;//store the max value
+ max_netout_id[r] = id_netout;
+ }
+
+ //compute loss function
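+  // XENT = -sum_{r,c} t(r,c)*log(y(r,c)); the first branch is a shortcut for
+  // 1-of-K targets, the second branch handles soft targets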
+ double sumerr = 0;
+ for(size_t r=0; r<net_out.Rows(); r++) {
+ if(target(r,max_target_id[r]) == 1.0) {
+ //pick the max value..., rest is zero
+ BaseFloat val = log(net_out(r,max_target_id[r]));
+ if(val < -1e10f) val = -1e10f;
+ sumerr += val;
+ } else {
+ //process whole posterior vect.
+ for(size_t c=0; c<net_out.Cols(); c++) {
+ if(target(r,c) != 0.0) {
+ BaseFloat val = target(r,c)*log(net_out(r,c));
+ if(val < -1e10f) val = -1e10f;
+ sumerr += val;
+ }
+ }
+ }
+ }
+
+  //accumulate the confusion statistics
+ if(confusion_mode_ != NO_CONF) {
+ for(size_t r=0; r<net_out.Rows(); r++) {
+ int id_target = max_target_id[r];
+ int id_netout = max_netout_id[r];
+ switch(confusion_mode_) {
+ case MAX_CONF:
+ confusion_(id_target,id_netout) += 1;
+ break;
+ case SOFT_CONF:
+ confusion_[id_target].Add(net_out[r]);
+ break;
+ case DIAG_MAX_CONF:
+ diag_confusion_[id_target] += ((id_target==id_netout)?1:0);
+ break;
+ case DIAG_SOFT_CONF:
+ diag_confusion_[id_target] += net_out[r][id_target];
+ break;
+ default:
+ KALDI_ERR << "unknown confusion type" << confusion_mode_;
+ }
+ confusion_count_[id_target] += 1;
+ }
+ }
+
+ error_ -= sumerr;
+ frames_ += net_out.Rows();
+ corr_ += corr;
+}
+
+
+std::string CrossEntropy::Report() {
+ std::stringstream ss;
+ ss << "Xent:" << error_ << " frames:" << frames_
+ << " err/frm:" << error_/frames_
+ << " correct[" << 100.0*corr_/frames_ << "%]"
+ << "\n";
+
+ if(confusion_mode_ != NO_CONF) {
+ //read class tags
+ std::vector<std::string> tag;
+ {
+ std::ifstream ifs(output_label_map_);
+ assert(ifs.good());
+ std::string str;
+      while(ifs >> str) { //stop on EOF/failure, avoids duplicating the last tag
+        tag.push_back(str);
+      }
+ }
+ assert(confusion_count_.Dim() <= tag.size());
+
+ //print confusion matrix
+ if(confusion_mode_ == MAX_CONF || confusion_mode_ == SOFT_CONF) {
+ ss << "Row:label Col:hyp\n" << confusion_ << "\n";
+ }
+
+ //***print per-target accuracies
+ for(int i=0; i<confusion_count_.Dim(); i++) {
+ //get the numerator
+ BaseFloat numerator = 0.0;
+ switch (confusion_mode_) {
+ case MAX_CONF: case SOFT_CONF:
+ numerator = confusion_[i][i];
+ break;
+ case DIAG_MAX_CONF: case DIAG_SOFT_CONF:
+ numerator = diag_confusion_[i];
+ break;
+ default:
+          KALDI_ERR << "Unsupported confusion mode:" << confusion_mode_;
+ }
+ //add line to report
+ ss << std::setw(30) << tag[i] << " "
+ << std::setw(10) << 100.0*numerator/confusion_count_[i] << "%"
+ << " [" << numerator << "/" << confusion_count_[i] << "]\n";
+ } //***print per-target accuracies
+ }// != NO_CONF
+
+ return ss.str();
+}
+
+
+void CrossEntropy::MergeStats(const ObjectiveFunction& inst) {
+ const CrossEntropy& xent = dynamic_cast<const CrossEntropy&>(inst);
+ frames_ += xent.frames_; error_ += xent.error_; corr_ += xent.corr_;
+  //sum the confusion statistics
+ if(confusion_mode_ != NO_CONF) {
+ if(confusion_.Rows() != xent.confusion_.Rows()) {
+ confusion_.Init(xent.confusion_.Rows(),xent.confusion_.Cols());
+ confusion_count_.Init(xent.confusion_count_.Dim());
+ diag_confusion_.Init(xent.diag_confusion_.Dim());
+ }
+ confusion_.Add(xent.confusion_);
+ confusion_count_.Add(xent.confusion_count_);
+ diag_confusion_.Add(xent.diag_confusion_);
+ }
+}
+
+
+} // namespace TNet
diff --git a/src/TNetLib/ObjFun.h b/src/TNetLib/ObjFun.h
new file mode 100644
index 0000000..c458340
--- /dev/null
+++ b/src/TNetLib/ObjFun.h
@@ -0,0 +1,160 @@
+#ifndef _TNET_OBJ_FUN_H
+#define _TNET_OBJ_FUN_H
+
+#include <cassert>
+#include <limits>
+#include <cmath>
+
+#include "Matrix.h"
+#include "Vector.h"
+
+namespace TNet {
+
+ /**
+ * General interface for objective functions
+ */
+ class ObjectiveFunction
+ {
+ public:
+ /// Enum with objective function types
+ typedef enum {
+ OBJ_FUN_I = 0x0300,
+ MEAN_SQUARE_ERROR,
+ CROSS_ENTROPY,
+ } ObjFunType;
+
+ public:
+ /// Factory for creating objective function instances
+ static ObjectiveFunction* Factory(ObjFunType type);
+
+ //////////////////////////////////////////////////////////////
+ // Interface specification
+ protected:
+ ObjectiveFunction() { }; /// constructor
+ public:
+ virtual ~ObjectiveFunction() { }; /// destructor
+
+ virtual ObjFunType GetType() = 0;
+ virtual const char* GetName() = 0;
+ virtual ObjectiveFunction* Clone() = 0;
+
+ ///calculate error of network output
+ virtual void Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err) = 0;
+
+ ///get the accumulated error
+ virtual double GetError() = 0;
+ ///the number of processed frames
+ virtual size_t GetFrames() = 0;
+
+ ///report the error to string
+ virtual std::string Report() = 0;
+
+ ///sum the frame counts from more instances
+ virtual void MergeStats(const ObjectiveFunction& inst) = 0;
+ };
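+
+  // Usage sketch (illustrative only):
+  //
+  //   ObjectiveFunction* obj =
+  //     ObjectiveFunction::Factory(ObjectiveFunction::CROSS_ENTROPY);
+  //   obj->Evaluate(net_out, targets, &err); //accumulates error and frame counts
+  //   std::cout << obj->Report();
+  //   delete obj;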
+
+
+
+ /**
+ * Mean square error function
+ */
+ class MeanSquareError : public ObjectiveFunction
+ {
+ public:
+ MeanSquareError()
+ : ObjectiveFunction(), frames_(0), error_(0)
+ { }
+
+ ~MeanSquareError()
+ { }
+
+ ObjFunType GetType()
+ { return MEAN_SQUARE_ERROR; }
+
+ const char* GetName()
+ { return "<MeanSquareError>"; }
+
+ ObjectiveFunction* Clone()
+ { return new MeanSquareError(*this); }
+
+ void Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err);
+
+ size_t GetFrames()
+ { return frames_; }
+
+ double GetError()
+ { return error_; }
+
+ std::string Report();
+
+ void MergeStats(const ObjectiveFunction& inst) {
+ const MeanSquareError& mse = dynamic_cast<const MeanSquareError&>(inst);
+ frames_ += mse.frames_; error_ += mse.error_;
+ }
+
+ private:
+ size_t frames_;
+ double error_;
+
+ };
+
+
+ /**
+ * Cross entropy error function
+ */
+ class CrossEntropy : public ObjectiveFunction
+ {
+ public:
+ enum ConfusionMode { NO_CONF=0, MAX_CONF, SOFT_CONF, DIAG_MAX_CONF, DIAG_SOFT_CONF };
+
+ public:
+ CrossEntropy()
+ : ObjectiveFunction(), frames_(0), error_(0), corr_(0), confusion_mode_(NO_CONF), output_label_map_(NULL)
+ { }
+
+ ~CrossEntropy()
+ { }
+
+ ObjFunType GetType()
+ { return CROSS_ENTROPY; }
+
+ const char* GetName()
+ { return "<cross_entropy>"; }
+
+ ObjectiveFunction* Clone()
+ { return new CrossEntropy(*this); }
+
+ void Evaluate(const Matrix<BaseFloat>& net_out, const Matrix<BaseFloat>& target, Matrix<BaseFloat>* err);
+
+ size_t GetFrames()
+ { return frames_; }
+
+ double GetError()
+ { return error_; }
+
+ void SetConfusionMode(enum ConfusionMode m)
+ { confusion_mode_ = m; }
+
+ void SetOutputLabelMap(const char* map)
+ { output_label_map_ = map; }
+
+ std::string Report();
+
+ void MergeStats(const ObjectiveFunction& inst);
+ private:
+ size_t frames_;
+ double error_;
+ size_t corr_;
+
+ ConfusionMode confusion_mode_;
+ Matrix<float> confusion_;
+ Vector<int> confusion_count_;
+ Vector<double> diag_confusion_;
+ const char* output_label_map_;
+ };
+
+
+} //namespace TNet
+
+
+#endif
diff --git a/src/TNetLib/Platform.h b/src/TNetLib/Platform.h
new file mode 100644
index 0000000..628b9cd
--- /dev/null
+++ b/src/TNetLib/Platform.h
@@ -0,0 +1,402 @@
+#ifndef _TNET_PLATFORM_H
+#define _TNET_PLATFORM_H
+
+/**
+ * \file Platform.h
+ * \brief DNN training class multicore version
+ */
+
+#include "Thread.h"
+#include "Matrix.h"
+
+#include "Features.h"
+#include "Labels.h"
+
+#include "Cache.h"
+#include "Nnet.h"
+#include "ObjFun.h"
+
+#include "Mutex.h"
+#include "Semaphore.h"
+#include "Barrier.h"
+#include "Thread.h"
+
+#include <vector>
+#include <list>
+#include <iterator>
+
+namespace TNet {
+
+class PlatformThread;
+
+class Platform {
+
+/*
+* Variables to be initialized directly from the main function
+*/
+public:
+ FeatureRepository feature_; ///< Features specified in the input arguments and script file
+  LabelRepository label_;   ///< Labels specified in the label map file
+
+ Network nnet_transf_; ///< NNet transform
+  Network nnet_;                 ///< The network being trained
+  ObjectiveFunction* obj_fun_;   ///< Objective function used for training
+
+ int bunchsize_;
+ int cachesize_;
+ bool randomize_;
+
+ int start_frm_ext_;
+ int end_frm_ext_;
+
+ int trace_;
+ bool crossval_;
+
+ long int seed_;
+
+ /*
+ * Variables to be used internally during the multi-threaded training
+ */
+ private:
+ Semaphore semaphore_read_;
+
+ std::vector<std::list<Matrix<BaseFloat>*> > feature_buf_;
+ std::vector<std::list<Matrix<BaseFloat>*> > label_buf_;
+ std::vector<Mutex> mutex_buf_;
+
+ std::vector<Network*> nnet_transf2_;
+
+ std::vector<Cache> cache_;
+
+ std::vector<Network*> nnet2_;
+ std::vector<ObjectiveFunction*> obj_fun2_;
+ std::vector<bool> sync_mask_;
+
+ Barrier barrier_;
+ bool end_reading_;
+ std::vector<Timer> tim_;
+ std::vector<double> tim_accu_;
+
+ int num_thr_;
+ Semaphore semaphore_endtrain_;
+ Semaphore semaphore_endtrain2_;
+
+ public:
+ Mutex cout_mutex_;
+
+ /*
+ * Methods
+ */
+ public:
+ Platform()
+ : bunchsize_(0), cachesize_(0), randomize_(false),
+ start_frm_ext_(0), end_frm_ext_(0), trace_(0),
+ crossval_(false), seed_(0),
+ end_reading_(false), num_thr_(0)
+ { }
+
+ ~Platform()
+ {
+ for(size_t i=0; i<nnet_transf2_.size(); i++) {
+ delete nnet_transf2_[i];
+ }
+ for(size_t i=0; i<nnet2_.size(); i++) {
+ delete nnet2_[i];
+ }
+ for(size_t i=0; i<obj_fun2_.size(); i++) {
+ delete obj_fun2_[i];
+ }
+ }
+
+ /// Run the training using num_threads threads
+ void RunTrain(int num_threads);
+
+ private:
+ /// The data-reading thread
+ void ReadData();
+ /// The training thread
+ void Thread(int thr);
+
+ friend class PlatformThread;
+};
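+
+// Rough wiring from a main() (illustrative sketch; the numbers are made-up
+// examples and feature_/label_ are assumed to be configured beforehand):
+//
+//   Platform pf;
+//   pf.nnet_.ReadNetwork("nnet.init");      //hypothetical file
+//   pf.obj_fun_ = ObjectiveFunction::Factory(ObjectiveFunction::CROSS_ENTROPY);
+//   pf.bunchsize_ = 256;
+//   pf.cachesize_ = 16384;
+//   pf.randomize_ = true;
+//   pf.RunTrain(4);                         //train with 4 threads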
+
+
+
+/**
+ * Inherit Thread for the training threads
+ */
+class PlatformThread : public Thread {
+ public:
+ PlatformThread(Platform* pf)
+ : platform_(*pf)
+ { }
+
+ private:
+ void Execute(void* arg) {
+ long long thr_id = reinterpret_cast<long long>(arg);
+ platform_.Thread(thr_id);
+ }
+
+ private:
+ Platform& platform_;
+};
+
+
+
+
+
+void Platform::RunTrain(int num_thr) {
+ num_thr_ = num_thr;
+
+ /*
+ * Initialize parallel training
+ */
+ feature_buf_.resize(num_thr);
+ label_buf_.resize(num_thr);
+ mutex_buf_.resize(num_thr);
+ cache_.resize(num_thr);
+ sync_mask_.resize(num_thr);
+ barrier_.SetThreshold(num_thr);
+
+ tim_.resize(num_thr);
+ tim_accu_.resize(num_thr,0.0);
+
+ int bunchsize = bunchsize_/num_thr;
+ int cachesize = (cachesize_/num_thr/bunchsize)*bunchsize;
+ std::cout << "Bunchsize:" << bunchsize << "*" << num_thr << "=" << bunchsize*num_thr
+ << " Cachesize:" << cachesize << "*" << num_thr << "=" << cachesize*num_thr << "\n";
+ for(int i=0; i<num_thr; i++) {
+ //clone transforms
+ nnet_transf2_.push_back(nnet_transf_.Clone());
+ //create cache
+ cache_[i].Init(cachesize,bunchsize,seed_);
+ cache_[i].Trace(trace_);
+ //clone networks
+ nnet2_.push_back(nnet_.Clone());
+ //clone objective function objects
+ obj_fun2_.push_back(obj_fun_->Clone());
+ //enable threads to sync weights
+ sync_mask_[i] = true;
+ }
+
+ /*
+ * Run training threads
+ */
+ std::vector<PlatformThread*> threads;
+ for(intptr_t i=0; i<num_thr; i++) {
+ PlatformThread* t = new PlatformThread(this);
+ t->Start(reinterpret_cast<void*>(i));
+ threads.push_back(t);
+ }
+
+ /*
+ * Read the training data
+ */
+ ReadData();
+
+ /*
+ * Wait for training to finish
+ */
+ semaphore_endtrain2_.Wait();
+
+}
+
+
+
+void Platform::ReadData() try {
+ cout_mutex_.Lock();
+ std::cout << "queuesize " << feature_.QueueSize() << "\n";
+ cout_mutex_.Unlock();
+
+ int thr = 0;
+ for(feature_.Rewind();!feature_.EndOfList();feature_.MoveNext()) {
+ Matrix<BaseFloat>* fea = new Matrix<BaseFloat>;
+ Matrix<BaseFloat>* lab = new Matrix<BaseFloat>;
+
+ feature_.ReadFullMatrix(*fea);
+ label_.GenDesiredMatrix(*lab,
+ fea->Rows()-start_frm_ext_-end_frm_ext_,
+ feature_.CurrentHeader().mSamplePeriod,
+ feature_.Current().Logical().c_str());
+
+
+ fea->CheckData(feature_.Current().Logical());
+
+ mutex_buf_[thr].Lock();
+ feature_buf_[thr].push_back(fea);
+ label_buf_[thr].push_back(lab);
+ mutex_buf_[thr].Unlock();
+
+    //suspend reading when the shortest buffer holds more than 20 matrices
+ if(thr == 0) {
+ int minsize=1e6;
+ for(size_t i=0; i<feature_buf_.size(); i++) {
+ int s = feature_buf_[i].size();
+ if(s < minsize) minsize = s;
+ }
+ if(minsize > 20) semaphore_read_.Wait();
+ }
+
+ thr = (thr+1) % num_thr_;
+ }
+
+ std::cout << "[Reading finished]\n" << std::flush;
+ end_reading_ = true;
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ exit(1);
+}
+
+void Platform::Thread(int thr_id) try {
+
+ const int thr = thr_id; //make id const for safety!
+
+ while(1) {
+ //fill the cache
+ while(!cache_[thr].Full() && !(end_reading_ && (feature_buf_[thr].size() == 0))) {
+
+ if(feature_buf_[thr].size() <= 5) {
+ semaphore_read_.Post();//wake the reader
+ }
+ if(feature_buf_[thr].size() == 0) {
+ cout_mutex_.Lock();
+ std::cout << "Thread" << thr << ",waiting for data\n";
+ cout_mutex_.Unlock();
+ sleep(1);
+ } else {
+ //get the matrices
+ mutex_buf_[thr].Lock();
+ Matrix<BaseFloat>* fea = feature_buf_[thr].front();
+ Matrix<BaseFloat>* lab = label_buf_[thr].front();
+ feature_buf_[thr].pop_front();
+ label_buf_[thr].pop_front();
+ mutex_buf_[thr].Unlock();
+
+ //transform the features
+ Matrix<BaseFloat> fea_transf;
+ nnet_transf2_[thr]->Propagate(*fea,fea_transf);
+
+ //trim the ext
+ SubMatrix<BaseFloat> fea_trim(
+ fea_transf,
+ start_frm_ext_,
+ fea_transf.Rows()-start_frm_ext_-end_frm_ext_,
+ 0,
+ fea_transf.Cols()
+ );
+
+ //add to cache
+ cache_[thr].AddData(fea_trim,*lab);
+
+ delete fea; delete lab;
+ }
+ }
+
+ //no more data, end training...
+ if(cache_[thr].Empty()) break;
+
+ if(randomize_) { cache_[thr].Randomize(); }
+
+
+ //std::cout << "Thread" << thr << ", Cache#" << nr_cache++ << "\n";
+
+ //train from cache
+ Matrix<BaseFloat> fea2,lab2,out,err;
+ while(!cache_[thr].Empty()) {
+ cache_[thr].GetBunch(fea2,lab2);
+ nnet2_[thr]->Propagate(fea2,out);
+ obj_fun2_[thr]->Evaluate(out,lab2,&err);
+
+ if(!crossval_) {
+ nnet2_[thr]->Backpropagate(err);
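+          // Three barriers per bunch: (1) every thread has its local
+          // gradients, (2) the gradients are summed into nnet_, (3) the
+          // update of nnet_ is finished before any thread starts the next
+          // bunch. Each thread sums/updates only its own disjoint stripe of
+          // rows (see e.g. SharedLinearity::AccuGradient/Update).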
+
+ tim_[thr].Start();
+ barrier_.Wait();//*********/
+ tim_[thr].End(); tim_accu_[thr] += tim_[thr].Val();
+
+ //sum the gradient and bunchsize
+ for(int i=0; i<num_thr_; i++) {
+ if(sync_mask_[i]) {
+ nnet_.AccuGradient(*nnet2_[i],thr,num_thr_);
+ if(thr == 0) nnet_.AccuBunchsize(*nnet2_[i]);
+ }
+ }
+
+ tim_[thr].Start();
+ barrier_.Wait();//*********/
+ tim_[thr].End(); tim_accu_[thr] += tim_[thr].Val();
+
+ //update
+ nnet_.Update(thr,num_thr_);
+
+ tim_[thr].Start();
+ barrier_.Wait();//*********/
+ tim_[thr].End(); tim_accu_[thr] += tim_[thr].Val();
+
+ //reset the bunchsize counter
+ if(thr == 0) nnet_.ResetBunchsize();
+ }
+ }
+
+ }
+
+ std::cout << "Thread" << thr << " end of data\n";
+
+  //exclude this thread's gradients from the summing done by the others
+ sync_mask_[thr] = false;
+ //increase number of finished threads
+ semaphore_endtrain_.Post();
+
+ //synchronize the updates of other threads
+ while(1) {
+ barrier_.Wait();//*********/
+ if(semaphore_endtrain_.GetValue() == num_thr_) break;
+
+ //sum the gradient and bunchsize
+ for(int i=0; i<num_thr_; i++) {
+ if(sync_mask_[i]) {
+ nnet_.AccuGradient(*nnet2_[i],thr,num_thr_);
+ if(thr == 0) nnet_.AccuBunchsize(*nnet2_[i]);
+ }
+ }
+ barrier_.Wait();//*********/
+ //update
+ nnet_.Update(thr,num_thr_);
+ barrier_.Wait();//*********/
+ //reset bunchsize counter
+ if(thr == 0) nnet_.ResetBunchsize();
+ }
+
+ //finally merge objfun stats
+ if(thr == 0) {
+ for(int i=0; i<num_thr_; i++) {
+ obj_fun_->MergeStats(*obj_fun2_[i]);
+ }
+
+ cout_mutex_.Lock();
+ std::cout << "Barrier waiting times per thread\n";
+ std::copy(tim_accu_.begin(),tim_accu_.end(),std::ostream_iterator<double>(std::cout," "));
+ std::cout << "\n";
+ cout_mutex_.Unlock();
+ }
+
+ cout_mutex_.Lock();
+ std::cout << "[Thread" << thr << " finished]\n";
+ cout_mutex_.Unlock();
+
+ if(thr == 0) {
+ semaphore_endtrain2_.Post();
+ }
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ exit(1);
+}
+
+
+
+}//namespace TNet
+
+#endif
diff --git a/src/TNetLib/Semaphore.cc b/src/TNetLib/Semaphore.cc
new file mode 100644
index 0000000..d149fb3
--- /dev/null
+++ b/src/TNetLib/Semaphore.cc
@@ -0,0 +1,64 @@
+
+#include "Semaphore.h"
+
+namespace TNet {
+
+ Semaphore::
+ Semaphore(int initValue)
+ {
+ mSemValue = initValue;
+ pthread_mutex_init(&mMutex, NULL);
+ pthread_cond_init(&mCond, NULL);
+ }
+
+ Semaphore::
+ ~Semaphore()
+ {
+ pthread_mutex_destroy(&mMutex);
+ pthread_cond_destroy(&mCond);
+ }
+
+ int
+ Semaphore::
+ TryWait()
+ {
+ pthread_mutex_lock(&mMutex);
+ if(mSemValue > 0) {
+ mSemValue--;
+ pthread_mutex_unlock(&mMutex);
+ return 0;
+ }
+ pthread_mutex_unlock(&mMutex);
+ return -1;
+ }
+
+ void
+ Semaphore::
+ Wait()
+ {
+ pthread_mutex_lock(&mMutex);
+ while(mSemValue <= 0) {
+ pthread_cond_wait(&mCond, &mMutex);
+ }
+ mSemValue--;
+ pthread_mutex_unlock(&mMutex);
+ }
+
+ void
+ Semaphore::
+ Post()
+ {
+ pthread_mutex_lock(&mMutex);
+ mSemValue++;
+ pthread_cond_signal(&mCond);
+ pthread_mutex_unlock(&mMutex);
+ }
+
+ int
+ Semaphore::
+ GetValue()
+ { return mSemValue; }
+
+
+
+} //namespace
diff --git a/src/TNetLib/Semaphore.h b/src/TNetLib/Semaphore.h
new file mode 100644
index 0000000..a28ee44
--- /dev/null
+++ b/src/TNetLib/Semaphore.h
@@ -0,0 +1,26 @@
+#ifndef _SEMAPHORE_H_
+#define _SEMAPHORE_H_
+
+#include <pthread.h>
+
+namespace TNet {
+
+ class Semaphore {
+ public:
+ Semaphore(int initValue = 0);
+ ~Semaphore();
+
+ int TryWait();
+ void Wait();
+ void Post();
+ int GetValue();
+
+ private:
+ int mSemValue;
+ pthread_mutex_t mMutex;
+ pthread_cond_t mCond;
+
+ };
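+
+  // Usage sketch (illustrative only): a counting semaphore for
+  // producer/consumer hand-off.
+  //
+  //   Semaphore sem;       //counter starts at 0
+  //   //producer thread:  sem.Post();  //signal one item
+  //   //consumer thread:  sem.Wait();  //block until an item is available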
+} //namespace
+
+#endif
diff --git a/src/TNetLib/SharedLinearity.cc b/src/TNetLib/SharedLinearity.cc
new file mode 100644
index 0000000..108212c
--- /dev/null
+++ b/src/TNetLib/SharedLinearity.cc
@@ -0,0 +1,277 @@
+
+
+#include "SharedLinearity.h"
+#include "cblas.h"
+
+namespace TNet {
+
+void
+SharedLinearity::
+PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ //precopy bias
+ for(int k=0; k<mNInstances; k++) {
+ for(size_t r=0; r<X.Rows(); r++) {
+ memcpy(Y.pRowData(r)+k*mpBias->Dim(),mpBias->pData(),mpBias->Dim()*sizeof(BaseFloat));
+ }
+ }
+
+ //multiply blockwise
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mpLinearity->Rows(),mpLinearity->Rows());
+ SubMatrix<BaseFloat> yblock(Y,0,Y.Rows(),k*mpLinearity->Cols(),mpLinearity->Cols());
+ yblock.BlasGemm(1.0,xblock,NO_TRANS,*mpLinearity,NO_TRANS,1.0);
+ }
+}
+
+
+void
+SharedLinearity::
+BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y)
+{
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mpLinearity->Cols(),mpLinearity->Cols());
+ SubMatrix<BaseFloat> yblock(Y,0,Y.Rows(),k*mpLinearity->Rows(),mpLinearity->Rows());
+ yblock.BlasGemm(1.0,xblock,NO_TRANS,*mpLinearity,TRANS,1.0);
+ }
+}
+
+#if 0
+void
+SharedLinearity::
+AccuUpdate()
+{
+ BaseFloat N = 1;
+ /*
+ //Not part of the interface!!!
+ if(mGradDivFrm) {
+ N = static_cast<BaseFloat>(GetInput().Rows());
+ }
+ */
+ BaseFloat mmt_gain = static_cast<BaseFloat>(1.0/(1.0-mMomentum));
+ N *= mmt_gain; //compensate higher gradient estimates due to momentum
+
+ //compensate augmented dyn. range of gradient caused by multiple instances
+ N *= static_cast<BaseFloat>(mNInstances);
+
+ const Matrix<BaseFloat>& X = GetInput().Data();
+ const Matrix<BaseFloat>& E = GetErrorInput().Data();
+ //get gradient of shared linearity
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mLinearity.Rows(),mLinearity.Rows());
+ SubMatrix<BaseFloat> eblock(E,0,E.Rows(),k*mLinearity.Cols(),mLinearity.Cols());
+ mLinearityCorrection.BlasGemm(1.0,xblock,TRANS,eblock,NO_TRANS,((k==0)?mMomentum:1.0f));
+ }
+
+ //get gradient of shared bias
+ mBiasCorrection.Scale(mMomentum);
+ for(int r=0; r<E.Rows(); r++) {
+ for(int c=0; c<E.Cols(); c++) {
+ mBiasCorrection[c%mBiasCorrection.Dim()] += E(r,c);
+ }
+ }
+
+ //perform update
+ mLinearity.AddScaled(-mLearningRate/N,mLinearityCorrection);
+ mBias.AddScaled(-mLearningRate/N,mBiasCorrection);
+
+ //regularization weight decay
+ mLinearity.AddScaled(-mLearningRate*mWeightcost,mLinearity);
+}
+#endif
+
+void
+SharedLinearity::
+ReadFromStream(std::istream& rIn)
+{
+ //number of instances of shared weights in layer
+ rIn >> std::ws >> mNInstances;
+ if(mNInstances < 1) {
+ std::ostringstream os;
+ os << "Bad number of instances:" << mNInstances;
+ Error(os.str());
+ }
+ if(GetNInputs() % mNInstances != 0 || GetNOutputs() % mNInstances != 0) {
+ std::ostringstream os;
+ os << "Number of Inputs/Outputs must be divisible by number of instances"
+ << " Inputs:" << GetNInputs()
+       << " Outputs:" << GetNOutputs()
+       << " Instances:" << mNInstances;
+ Error(os.str());
+ }
+
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose;
+ rIn >> transpose;
+ mLinearity = BfMatrix(transpose, TRANS);
+ //biases stored normally
+ rIn >> mBias;
+
+ if(transpose.Cols()*transpose.Rows() == 0) {
+ Error("Missing linearity matrix in network file");
+ }
+ if(mBias.Dim() == 0) {
+ Error("Missing bias vector in network file");
+ }
+
+
+ if(mLinearity.Cols() != (GetNOutputs() / mNInstances) ||
+ mLinearity.Rows() != (GetNInputs() / mNInstances) ||
+ mBias.Dim() != (GetNOutputs() / mNInstances)
+ ){
+ std::ostringstream os;
+ os << "Wrong dimensionalities of matrix/vector in network file\n"
+ << "Inputs:" << GetNInputs()
+ << " Outputs:" << GetNOutputs()
+ << "\n"
+ << "N-Instances:" << mNInstances
+ << "\n"
+ << "linearityCols:" << mLinearity.Cols() << "(" << mLinearity.Cols()*mNInstances << ")"
+ << " linearityRows:" << mLinearity.Rows() << "(" << mLinearity.Rows()*mNInstances << ")"
+ << " biasDims:" << mBias.Dim() << "(" << mBias.Dim()*mNInstances << ")"
+ << "\n";
+ Error(os.str());
+ }
+
+ mLinearityCorrection.Init(mLinearity.Rows(),mLinearity.Cols());
+ mBiasCorrection.Init(mBias.Dim());
+}
+
+
+void
+SharedLinearity::
+WriteToStream(std::ostream& rOut)
+{
+ rOut << mNInstances << std::endl;
+ //matrix is stored transposed as SNet does
+ BfMatrix transpose(mLinearity, TRANS);
+ rOut << transpose;
+ //biases stored normally
+ rOut << mBias;
+ rOut << std::endl;
+}
+
+
+void
+SharedLinearity::
+Gradient()
+{
+ const Matrix<BaseFloat>& X = GetInput();
+ const Matrix<BaseFloat>& E = GetErrorInput();
+ //get gradient of shared linearity
+ for(int k=0; k<mNInstances; k++) {
+ SubMatrix<BaseFloat> xblock(X,0,X.Rows(),k*mpLinearity->Rows(),mpLinearity->Rows());
+ SubMatrix<BaseFloat> eblock(E,0,E.Rows(),k*mpLinearity->Cols(),mpLinearity->Cols());
+ mLinearityCorrection.BlasGemm(1.0,xblock,TRANS,eblock,NO_TRANS,((k==0)?0.0f:1.0f));
+ }
+
+ //get gradient of shared bias
+ mBiasCorrection.Set(0.0f);
+ for(int r=0; r<E.Rows(); r++) {
+ for(int c=0; c<E.Cols(); c++) {
+ mBiasCorrection[c%mBiasCorrection.Dim()] += E(r,c);
+ }
+ }
+}
+
+
+void
+SharedLinearity::
+AccuGradient(const UpdatableComponent& src, int thr, int thrN)
+{
+ //cast the argument
+ const SharedLinearity& src_comp = dynamic_cast<const SharedLinearity&>(src);
+
+ //allocate accumulators when needed
+ if(mLinearityCorrectionAccu.MSize() == 0) {
+ mLinearityCorrectionAccu.Init(mpLinearity->Rows(),mpLinearity->Cols());
+ }
+ if(mBiasCorrectionAccu.MSize() == 0) {
+ mBiasCorrectionAccu.Init(mpBias->Dim());
+ }
+
+
+ //assert the dimensions
+ /*
+ assert(mLinearityCorrection.Rows() == src_comp.mLinearityCorrection.Rows());
+ assert(mLinearityCorrection.Cols() == src_comp.mLinearityCorrection.Cols());
+ assert(mBiasCorrection.Dim() == src_comp.mBiasCorrection.Dim());
+ */
+
+ //need to find out which rows to sum...
+ int div = mLinearityCorrection.Rows() / thrN;
+ int mod = mLinearityCorrection.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
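+  // each thread owns a disjoint stripe of rows; e.g. for 10 rows and 3
+  // threads the stripes are rows 0-3, 4-6 and 7-9 (sizes 4,3,3)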
+
+ //std::cout << "[S" << thr << "," << origin << "," << rows << "]" << std::flush;
+
+ //create the matrix windows
+ const SubMatrix<BaseFloat> src_mat (
+ src_comp.mLinearityCorrection,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<double> tgt_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ //sum the rows
+ Add(tgt_mat,src_mat);
+
+  //the first thread always sums the bias correction
+ if(thr == 0) {
+ //std::cout << "[BS" << thr << "]" << std::flush;
+ Add(mBiasCorrectionAccu,src_comp.mBiasCorrection);
+ }
+}
+
+
+void
+SharedLinearity::
+Update(int thr, int thrN)
+{
+ //need to find out which rows to sum...
+ int div = mLinearity.Rows() / thrN;
+ int mod = mLinearity.Rows() % thrN;
+
+ int origin = thr * div + ((mod > thr)? thr : mod);
+ int rows = div + ((mod > thr)? 1 : 0);
+
+ //std::cout << "[P" << thr << "," << origin << "," << rows << "]" << std::flush;
+
+ //get the matrix windows
+ SubMatrix<double> src_mat (
+ mLinearityCorrectionAccu,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+ SubMatrix<BaseFloat> tgt_mat (
+ mLinearity,
+ origin, rows,
+ 0, mLinearityCorrection.Cols()
+ );
+
+ //TODO perform L2 regularization
+ //tgt_mat.AddScaled(tgt_mat, -mWeightcost * num_frames);
+
+ //update weights
+ AddScaled(tgt_mat, src_mat, -mLearningRate/static_cast<BaseFloat>(mNInstances));
+
+  //the first thread always updates the bias
+ if(thr == 0) {
+ //std::cout << "[" << thr << "BP]" << std::flush;
+ AddScaled(mBias, mBiasCorrectionAccu, -mLearningRate/static_cast<BaseFloat>(mNInstances));
+ }
+
+ //reset the accumulators
+ src_mat.Zero();
+ if(thr == 0) {
+ mBiasCorrectionAccu.Zero();
+ }
+}
+
+
+} //namespace
diff --git a/src/TNetLib/SharedLinearity.h b/src/TNetLib/SharedLinearity.h
new file mode 100644
index 0000000..83feeee
--- /dev/null
+++ b/src/TNetLib/SharedLinearity.h
@@ -0,0 +1,103 @@
+#ifndef _CUSHARED_LINEARITY_H_
+#define _CUSHARED_LINEARITY_H_
+
+
+#include "Component.h"
+
+#include "Matrix.h"
+#include "Vector.h"
+
+
+namespace TNet {
+
+class SharedLinearity : public UpdatableComponent
+{
+ public:
+ SharedLinearity(size_t nInputs, size_t nOutputs, Component *pPred);
+ ~SharedLinearity();
+
+ ComponentType GetType() const
+ { return SHARED_LINEARITY; }
+
+ const char* GetName() const
+ { return "<SharedLinearity>"; }
+
+ Component* Clone() const;
+
+ void PropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+ void BackpropagateFnc(const Matrix<BaseFloat>& X, Matrix<BaseFloat>& Y);
+
+ void ReadFromStream(std::istream& rIn);
+ void WriteToStream(std::ostream& rOut);
+
+ /// calculate gradient
+ void Gradient();
+ /// accumulate gradient from other components
+ void AccuGradient(const UpdatableComponent& src, int thr, int thrN);
+ /// update weights, reset the accumulator
+ void Update(int thr, int thrN);
+
+protected:
+ Matrix<BaseFloat> mLinearity; ///< Matrix with neuron weights
+ Vector<BaseFloat> mBias; ///< Vector with biases
+
+ Matrix<BaseFloat>* mpLinearity;
+ Vector<BaseFloat>* mpBias;
+
+ Matrix<BaseFloat> mLinearityCorrection; ///< Matrix for linearity updates
+ Vector<BaseFloat> mBiasCorrection; ///< Vector for bias updates
+
+ Matrix<double> mLinearityCorrectionAccu; ///< Accumulator for linearity updates
+ Vector<double> mBiasCorrectionAccu; ///< Accumulator for bias updates
+
+ int mNInstances;
+};
+
+
+
+
+////////////////////////////////////////////////////////////////////////////
+// INLINE FUNCTIONS
+// SharedLinearity::
+inline
+SharedLinearity::
+SharedLinearity(size_t nInputs, size_t nOutputs, Component *pPred)
+ : UpdatableComponent(nInputs, nOutputs, pPred),
+ mpLinearity(&mLinearity), mpBias(&mBias),
+ mNInstances(0)
+{ }
+
+
+inline
+SharedLinearity::
+~SharedLinearity()
+{ }
+
+
+inline
+Component*
+SharedLinearity::
+Clone() const
+{
+ SharedLinearity* ptr = new SharedLinearity(GetNInputs(),GetNOutputs(),NULL);
+ ptr->mpLinearity = mpLinearity;
+ ptr->mpBias = mpBias;
+
+ ptr->mLinearityCorrection.Init(mpLinearity->Rows(),mpLinearity->Cols());
+ ptr->mBiasCorrection.Init(mpBias->Dim());
+
+ ptr->mNInstances = mNInstances;
+
+ ptr->mLearningRate = mLearningRate;
+
+
+ return ptr;
+}
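+
+// Design note: Clone() deliberately shares mpLinearity/mpBias with the
+// original instance, so every thread-local copy propagates through the same
+// weights, while each copy keeps its own gradient buffers
+// (mLinearityCorrection/mBiasCorrection).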
+
+
+
+} //namespace
+
+
+
+#endif
diff --git a/src/TNetLib/Thread.h b/src/TNetLib/Thread.h
new file mode 100644
index 0000000..ba6d7ba
--- /dev/null
+++ b/src/TNetLib/Thread.h
@@ -0,0 +1,53 @@
+#ifndef _TNET_THREAD_H
+#define _TNET_THREAD_H
+
+#include <pthread.h>
+#include <cstdlib>
+#include <exception>
+#include <iostream>
+
+#include "Error.h" //for KALDI_ERR
+
+namespace TNet {
+
+class Thread {
+ public:
+ Thread()
+ { }
+ virtual ~Thread()
+ { }
+
+ int Start(void* arg);
+
+ protected:
+ static void* EntryPoint(void*);
+ virtual void Execute(void*) = 0; ///< Override this function
+ void* Arg() const { return arg_; }
+ void Arg(void* a) { arg_ = a; }
+
+ private:
+ pthread_t thread_id_;
+ void * arg_;
+};
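+
+// Usage sketch (illustrative only): derive from Thread, override Execute(),
+// and pass the per-thread argument to Start().
+//
+//   class Worker : public Thread {
+//     void Execute(void* arg) { /* ...work... */ }
+//   };
+//   Worker w;
+//   w.Start(NULL);   //runs detached, Execute(NULL) is called in the new thread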
+
+inline int Thread::Start(void * arg) {
+ Arg(arg); // store user data
+
+ int ret=0;
+ //create thread as detached (don't wait for it)
+ pthread_attr_t tattr;
+ ret |= pthread_attr_init(&tattr);
+ ret |= pthread_attr_setdetachstate(&tattr,PTHREAD_CREATE_DETACHED);
+ ret |= pthread_create(&thread_id_, &tattr, &Thread::EntryPoint, this);
+ if(ret != 0) KALDI_ERR << "Failed to create thread";
+ return ret;
+}
+
+/*static */
+inline void* Thread::EntryPoint(void* pthis) try {
+ Thread* pt = (Thread*)pthis;
+ pt->Execute(pt->Arg());
+ return NULL;
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ exit(1);
+}
+
+
+} //namespace TNet
+
+#endif
diff --git a/src/TNorm.cc b/src/TNorm.cc
new file mode 100644
index 0000000..1402f8f
--- /dev/null
+++ b/src/TNorm.cc
@@ -0,0 +1,324 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-09-26 16:48:24 +0200 (Mon, 26 Sep 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 73 $"
+#define SVN_ID "$Id: TNorm.cc 73 2011-09-26 14:48:24Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+/*** KaldiLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** TNet includes */
+#include "Nnet.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TNORM"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"NATURALREADORDER PRINTCONFIG PRINTVERSION SCRIPT SOURCEMMF TARGETMMF TRACE\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ ;
+
+
+ UserInterface ui;
+ FeatureRepository features;
+ Network network_cpu;
+ Timer timer;
+
+
+ const char* p_script;
+ const char* p_source_mmf_file;
+ const char* p_targetmmf;
+
+ int traceFlag;
+
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);//< target for mean/variance
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+
+ traceFlag = ui.GetInt(SNAME":TRACE", 0);
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TNET v"MODULE_VERSION" xvesel39 =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ features.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(traceFlag&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network_cpu.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+
+
+
+ // initialize the feature repository
+ features.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ features.AddFileList(p_script);
+ } else {
+ Warning("WARNING: The script file is missing [-S]");
+ }
+
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // INITIALIZATION DONE .................................................
+ //
+ // Start training
+ timer.Start();
+ std::cout << "===== TNorm STARTED =====" << std::endl;
+
+ int dim = network_cpu.GetNOutputs();
+
+ Vector<double> first(dim); first.Set(0.0);
+ Vector<double> second(dim); second.Set(0.0);
+
+ unsigned long framesN = 0;
+
+ //progress
+ size_t cnt = 0;
+ size_t step = features.QueueSize() / 100;
+ if(step == 0) step = 1;
+
+ //**********************************************************************
+ //**********************************************************************
+ // MAIN LOOP
+
+ for(features.Rewind(); !features.EndOfList(); features.MoveNext()) {
+
+ Matrix<BaseFloat> feats_host,net_out;
+ Matrix<BaseFloat> feats_host_out;
+
+ //get features
+ features.ReadFullMatrix(feats_host);
+
+ //propagate
+ network_cpu.Propagate(feats_host,net_out);
+ //trim the start/end frame context (start_frm_ext/end_frm_ext)
+ feats_host_out.Init(net_out.Rows()-start_frm_ext-end_frm_ext,net_out.Cols());
+ memcpy(feats_host_out.pData(),net_out.pRowData(start_frm_ext),feats_host_out.MSize());
+
+ //accumulate first/second order statistics
+ for(size_t m=0; m<feats_host_out.Rows(); m++) {
+ for(size_t n=0; n<feats_host_out.Cols(); n++) {
+ BaseFloat val = feats_host_out(m,n);
+ first[n] += val;
+ second[n] += val*val;
+
+ if(isnan(first[n])||isnan(second[n])||
+ isinf(first[n])||isinf(second[n]))
+ {
+ std::ostringstream oss;
+ oss << "nan/inf in accumulators\n"
+ << "first:" << first << "\n"
+ << "second:" << second << "\n"
+ << "frames:" << framesN << "\n"
+ << "utterance:" << features.Current().Logical() << "\n"
+ << "feats_host: " << feats_host << "\n"
+ << "feats_host_out: " << feats_host_out << "\n";
+ Error(oss.str());
+ }
+ }
+ }
+
+ framesN += feats_host.Rows();
+
+ //progress
+ if((cnt++ % step) == 0) std::cout << 100 * cnt / features.QueueSize() << "%, " << std::flush;
+ }
+
+ //**********************************************************************
+ //**********************************************************************
+ // ACCUMULATING FINISHED .................................................
+ //
+
+
+ //get the mean/variance vectors
+ Vector<double> mean(first);
+ mean.Scale(1.0/framesN);
+ Vector<double> variance(second);
+ variance.Scale(1.0/framesN);
+ for(size_t i=0; i<mean.Dim(); i++) {
+ variance[i] -= mean[i]*mean[i];
+ }
+
+ //get the mean normalization bias vector,
+ //use negative mean vector
+ Vector<double> bias(mean);
+ bias.Scale(-1.0);
+
+ //get the variance normalization window vector,
+ //inverse of square root of variance
+ Vector<double> window(variance);
+ for(size_t i=0; i<window.Dim(); i++) {
+ window[i] = 1.0/sqrt(window[i]);
+ }
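+
+ //note: together the stored <bias> and <window> components implement
+ // y[i] = (x[i] + bias[i]) * window[i] = (x[i] - mean[i]) / sqrt(var[i]),
+ //i.e. per-dimension mean/variance normalization (assuming <bias> is applied
+ //before <window>, as suggested by the order they are written below)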
+
+ //store the normalization network
+ std::ofstream os(p_targetmmf);
+ if(!os.good()) Error(std::string("Cannot open file for writing: ")+p_targetmmf);
+
+ dim = mean.Dim();
+ os << "<bias> " << dim << " " << dim << "\n"
+ << bias << "\n\n"
+ << "<window> " << dim << " " << dim << "\n"
+ << window << "\n\n";
+
+ os.close();
+
+ timer.End();
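+ //report speed: FPS is frames processed per second; RT is a real-time factor
+ //computed under the assumption of a 100 frames/s feature rate (10 ms shift)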
+ std::cout << "\n\n===== TNorm FINISHED ( " << timer.Val() << "s ) "
+ << "[FPS:" << framesN / timer.Val()
+ << ",RT:" << 1.0f / (framesN / timer.Val() / 100.0f)
+ << "] =====" << std::endl;
+
+ std::cout << "frames: " << framesN
+ << ", max_bias: " << bias.Max()
+ << ", max_window: " << window.Max()
+ << ", min_window: " << window.Min()
+ << "\n";
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
+
diff --git a/src/TNormCu.cc b/src/TNormCu.cc
new file mode 100644
index 0000000..ccc7167
--- /dev/null
+++ b/src/TNormCu.cc
@@ -0,0 +1,350 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-12-08 11:59:03 +0100 (Thu, 08 Dec 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 94 $"
+#define SVN_ID "$Id: TNormCu.cc 94 2011-12-08 10:59:03Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+/*** KaldiLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** TNet includes */
+#include "cuNetwork.h"
+#include "Nnet.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TNORM"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"NATURALREADORDER PRINTCONFIG PRINTVERSION SCRIPT SOURCEMMF TARGETMMF TRACE\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ ;
+
+
+ UserInterface ui;
+ FeatureRepository features;
+ CuNetwork network;
+ Network network_cpu;
+ Timer timer;
+
+
+ const char* p_script;
+ const char* p_source_mmf_file;
+ const char* p_targetmmf;
+
+ int traceFlag;
+
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);//< target for mean/variance
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+
+ traceFlag = ui.GetInt(SNAME":TRACE", 0);
+ if(traceFlag&1) { CuDevice::Instantiate().Verbose(true); }
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TNET v"MODULE_VERSION" xvesel39 =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ features.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(CuDevice::Instantiate().IsPresent()) {
+ if(traceFlag&1) TraceLog(std::string("Reading GPU network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ if(traceFlag&1) TraceLog(std::string("Reading CPU network: ")+p_source_mmf_file);
+ network_cpu.ReadNetwork(p_source_mmf_file);
+ }
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+
+
+
+ // initialize the feature repository
+ features.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ features.AddFileList(p_script);
+ } else {
+ Warning("WARNING: The script file is missing [-S]");
+ }
+
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // INITIALIZATION DONE .................................................
+ //
+ // Start training
+ timer.Start();
+ std::cout << "===== TNormCu STARTED =====" << std::endl;
+
+ int dim = CuDevice::Instantiate().IsPresent() ?
+ network.GetNOutputs() :
+ network_cpu.GetNOutputs();
+
+ Vector<double> first(dim); first.Set(0.0);
+ Vector<double> second(dim); second.Set(0.0);
+
+ unsigned long framesN = 0;
+
+ //progress
+ size_t cnt = 0;
+ size_t step = features.QueueSize() / 100;
+ if(step == 0) step = 1;
+
+ //**********************************************************************
+ //**********************************************************************
+ // MAIN LOOP
+
+ for(features.Rewind(); !features.EndOfList(); features.MoveNext()) {
+
+ Matrix<BaseFloat> feats_host,net_out;
+ Matrix<BaseFloat> feats_host_out;
+ CuMatrix<BaseFloat> feats;
+ CuMatrix<BaseFloat> feats_expanded;
+
+ //get features
+ features.ReadFullMatrix(feats_host);
+
+ if(CuDevice::Instantiate().IsPresent()) {
+ //propagate
+ feats.CopyFrom(feats_host);
+ network.Propagate(feats,feats_expanded);
+
+ //trim the start/end frame context (start_frm_ext/end_frm_ext)
+ int rows = feats_expanded.Rows()-start_frm_ext-end_frm_ext;
+ CuMatrix<BaseFloat> feats_trim(rows,feats_expanded.Cols());
+ feats_trim.CopyRows(rows,start_frm_ext,feats_expanded,0);
+ feats_trim.CopyTo(feats_host_out);
+ } else {
+ //propagate
+ network_cpu.Propagate(feats_host,net_out);
+ //trim the start/end frame context (start_frm_ext/end_frm_ext)
+ feats_host_out.Init(net_out.Rows()-start_frm_ext-end_frm_ext,net_out.Cols());
+ memcpy(feats_host_out.pData(),net_out.pRowData(start_frm_ext),feats_host_out.MSize());
+ }
+
+ //accumulate first/second order statistics
+ for(size_t m=0; m<feats_host_out.Rows(); m++) {
+ for(size_t n=0; n<feats_host_out.Cols(); n++) {
+ BaseFloat val = feats_host_out(m,n);
+ first[n] += val;
+ second[n] += val*val;
+
+ if(isnan(first[n])||isnan(second[n])||
+ isinf(first[n])||isinf(second[n]))
+ {
+ std::ostringstream oss;
+ oss << "nan/inf in accumulators\n"
+ << "first:" << first << "\n"
+ << "second:" << second << "\n"
+ << "frames:" << framesN << "\n"
+ << "utterance:" << features.Current().Logical() << "\n"
+ << "feats_host: " << feats_host << "\n"
+ << "feats_host_out: " << feats_host_out << "\n";
+ Error(oss.str());
+ }
+ }
+ }
+
+
+
+ framesN += feats_host.Rows();
+
+ //progress
+ if((cnt++ % step) == 0) std::cout << 100 * cnt / features.QueueSize() << "%, " << std::flush;
+ }
+
+ //**********************************************************************
+ //**********************************************************************
+ // ACCUMULATING FINISHED .................................................
+ //
+
+
+ //get the mean/variance vectors
+ Vector<double> mean(first);
+ mean.Scale(1.0/framesN);
+ Vector<double> variance(second);
+ variance.Scale(1.0/framesN);
+ for(size_t i=0; i<mean.Dim(); i++) {
+ variance[i] -= mean[i]*mean[i];
+ }
+
+ //get the mean normalization bias vector,
+ //use negative mean vector
+ Vector<double> bias(mean);
+ bias.Scale(-1.0);
+
+ //get the variance normalization window vector,
+ //inverse of square root of variance
+ Vector<double> window(variance);
+ for(size_t i=0; i<window.Dim(); i++) {
+ window[i] = 1.0/sqrt(window[i]);
+ }
+
+ //store the normalization network
+ std::ofstream os(p_targetmmf);
+ if(!os.good()) Error(std::string("Cannot open file for writing: ")+p_targetmmf);
+
+ dim = mean.Dim();
+ os << "<bias> " << dim << " " << dim << "\n"
+ << bias << "\n\n"
+ << "<window> " << dim << " " << dim << "\n"
+ << window << "\n\n";
+
+ os.close();
+
+ timer.End();
+ std::cout << "\n\n===== TNormCu FINISHED ( " << timer.Val() << "s ) "
+ << "[FPS:" << framesN / timer.Val()
+ << ",RT:" << 1.0f / (framesN / timer.Val() / 100.0f)
+ << "] =====" << std::endl;
+
+ std::cout << "frames: " << framesN
+ << ", max_bias: " << bias.Max()
+ << ", max_window: " << window.Max()
+ << ", min_window: " << window.Min()
+ << "\n";
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
+
diff --git a/src/TRbmCu.cc b/src/TRbmCu.cc
new file mode 100644
index 0000000..b2d5ea8
--- /dev/null
+++ b/src/TRbmCu.cc
@@ -0,0 +1,396 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-12-08 11:59:03 +0100 (Thu, 08 Dec 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 94 $"
+#define SVN_ID "$Id: TRbmCu.cc 94 2011-12-08 10:59:03Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+
+
+/*** TNetLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** TNet includes */
+#include "cuNetwork.h"
+#include "cuRbm.h"
+#include "cuCache.h"
+#include "cuObjectiveFunction.h"
+#include "curand.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TRBM"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -n f Set learning rate to f 0.06\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"FEATURETRANSFORM LEARNINGRATE MOMENTUM NATURALREADORDER PRINTCONFIG PRINTVERSION SCRIPT SOURCEMMF TARGETMMF TRACE WEIGHTCOST\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -n r LEARNINGRATE"
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ ;
+
+
+ UserInterface ui;
+ FeatureRepository feature_repo;
+ CuNetwork network;
+ CuNetwork transform_network;
+ CuMeanSquareError mse;
+ Timer timer;
+ Timer timer_frontend;
+ double time_frontend = 0.0;
+
+
+ const char* p_script;
+ BaseFloat learning_rate;
+ BaseFloat momentum;
+ BaseFloat weightcost;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+
+ const char* p_targetmmf;
+
+ int bunch_size;
+ int cache_size;
+ bool randomize;
+ long int seed;
+
+ int trace;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM", NULL);
+
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ learning_rate = ui.GetFlt(SNAME":LEARNINGRATE" , 0.10f);
+ momentum = ui.GetFlt(SNAME":MOMENTUM" , 0.50f);
+ weightcost = ui.GetFlt(SNAME":WEIGHTCOST" , 0.0002f);
+
+
+ bunch_size = ui.GetInt(SNAME":BUNCHSIZE", 256);
+ cache_size = ui.GetInt(SNAME":CACHESIZE", 12800);
+ randomize = ui.GetBool(SNAME":RANDOMIZE", true);
+
+ //UserInterface has no long int getter, so read the seed as int
+ seed = ui.GetInt(SNAME":SEED", 0);
+
+ trace = ui.GetInt(SNAME":TRACE", 0);
+ if(trace&4) { CuDevice::Instantiate().Verbose(true); }
+
+
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TRbmCu v"MODULE_VERSION" xvesel39 =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ feature_repo.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+
+ //read the input transform network
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform network: ")+p_input_transform);
+ transform_network.ReadNetwork(p_input_transform);
+ }
+
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+ //extract the RBM from the network
+ if(network.Layers() != 1) {
+ Error(std::string("Number of layers must be 1")+p_source_mmf_file);
+ }
+ if(network.Layer(0).GetType() != CuComponent::RBM && network.Layer(0).GetType() != CuComponent::RBM_SPARSE) {
+ Error(std::string("Layer must be RBM")+p_source_mmf_file);
+ }
+ CuRbmBase& rbm = dynamic_cast<CuRbmBase&>(network.Layer(0));
+
+ // initialize the feature repository
+ feature_repo.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ feature_repo.AddFileList(p_script);
+ } else {
+ Warning("WARNING: The script file is missing [-S]");
+ }
+ feature_repo.Trace(trace);
+
+ //set the learnrate, momentum, weightcost
+ rbm.LearnRate(learning_rate);
+ rbm.Momentum(momentum);
+ rbm.Weightcost(weightcost);
+
+ //seed the random number generator
+ if(seed == 0) {
+ struct timeval tv;
+ if (gettimeofday(&tv, 0) == -1) {
+ assert(0 && "gettimeofday does not work.");
+ exit(-1);
+ }
+ seed = (int)(tv.tv_sec) + (int)tv.tv_usec;
+ }
+ srand48(seed);
+
+ //initialize the matrix random number generator
+ CuRand<BaseFloat> cu_rand(bunch_size,rbm.GetNOutputs());
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // INITIALIZATION DONE .................................................
+ //
+ // Start training
+ timer.Start();
+ std::cout << "===== TRbmCu TRAINING STARTED =====" << std::endl;
+ std::cout << "learning rate: " << learning_rate
+ << " momentum: " << momentum
+ << " weightcost: " << weightcost
+ << std::endl;
+ std::cout << "Using seed: " << seed << "\n";
+
+
+ CuCache cache;
+ cache.Init(cache_size,bunch_size);
+ cache.Trace(trace);
+ feature_repo.Rewind();
+
+ //**********************************************************************
+ //**********************************************************************
+ // MAIN LOOP
+ //
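+ // Each bunch below is trained with one step of contrastive divergence (CD-1):
+ // positive phase p(h|v) on the data, sample (Bernoulli) or perturb (Gaussian)
+ // the hidden units, reconstruct the visible units, recompute the hidden
+ // activations on the reconstruction, then RbmUpdate() applies the updates.
+ //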
+ CuMatrix<BaseFloat> pos_vis, pos_hid, neg_vis, neg_hid;
+ CuMatrix<BaseFloat> dummy_labs, dummy_err;
+ while(!feature_repo.EndOfList()) {
+ timer_frontend.Start();
+ //fill cache
+ while(!cache.Full() && !feature_repo.EndOfList()) {
+ Matrix<BaseFloat> feats_host;
+ CuMatrix<BaseFloat> feats_original;
+ CuMatrix<BaseFloat> feats_expanded;
+
+ //read feats, perform the feature transform
+ feature_repo.ReadFullMatrix(feats_host);
+ feats_original.CopyFrom(feats_host);
+ transform_network.Propagate(feats_original,feats_expanded);
+
+ //trim the start/end context
+ int rows = feats_expanded.Rows()-start_frm_ext-end_frm_ext;
+ CuMatrix<BaseFloat> feats_trim(rows,feats_expanded.Cols());
+ feats_trim.CopyRows(rows,start_frm_ext,feats_expanded,0);
+
+ //dummy labels (RBM pre-training is unsupervised, the cache just needs a label matrix)
+ CuMatrix<BaseFloat> labs_cu(feats_trim.Rows(),1);
+
+ //add to cache
+ cache.AddData(feats_trim,labs_cu);
+
+ feature_repo.MoveNext();
+ }
+ timer_frontend.End(); time_frontend += timer_frontend.Val();
+
+ if(randomize) {
+ //randomize the cache
+ cache.Randomize();
+ }
+
+ while(!cache.Empty()) {
+ //get training data
+ cache.GetBunch(pos_vis,dummy_labs);
+
+ //forward pass
+ rbm.Propagate(pos_vis,pos_hid);
+
+ //change the hidden values so we can generate a negative example
+ if(rbm.HidType() == CuRbmBase::BERNOULLI) {
+ cu_rand.BinarizeProbs(pos_hid,neg_hid);
+ } else {
+ neg_hid.CopyFrom(pos_hid);
+ cu_rand.AddGaussNoise(neg_hid);
+ }
+
+ //reconstruct pass
+ rbm.Reconstruct(neg_hid,neg_vis);
+
+ //forward pass
+ rbm.Propagate(neg_vis, neg_hid);
+
+ //update the weights
+ rbm.RbmUpdate(pos_vis, pos_hid, neg_vis, neg_hid);
+
+ //evaluate the mean square reconstruction error
+ mse.Evaluate(neg_vis,pos_vis,dummy_err);
+
+ if(trace&2) std::cout << "." << std::flush;
+ }
+ //check for NaN/inf
+ pos_hid.CheckData();
+ }
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // TRAINING FINISHED .................................................
+ //
+ // Let's store the network, report the log
+
+ if(trace&1) TraceLog("Training finished");
+
+ //write the network
+ if (NULL != p_targetmmf) {
+ if(trace&1) TraceLog(std::string("Writing network: ")+p_targetmmf);
+ network.WriteNetwork(p_targetmmf);
+ } else {
+ Error("missing argument --TARGETMMF");
+ }
+
+ timer.End();
+ std::cout << "===== TRbmCu FINISHED ( " << timer.Val() << "s ) "
+ << "[FPS:" << mse.GetFrames() / timer.Val()
+ << ",RT:" << 1.0f / (mse.GetFrames() / timer.Val() / 100.0f)
+ << "] =====" << std::endl;
+
+ //report objective function (accuracy, frame counts...)
+ std::cout << mse.Report();
+
+ if(trace &4) {
+ std::cout << "\n== PROFILE ==\nT-fe: " << time_frontend << std::endl;
+ }
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
diff --git a/src/TRecurrentCu.cc b/src/TRecurrentCu.cc
new file mode 100644
index 0000000..f05008d
--- /dev/null
+++ b/src/TRecurrentCu.cc
@@ -0,0 +1,420 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-10-18 12:42:04 +0200 (Tue, 18 Oct 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 86 $"
+#define SVN_ID "$Id: TRecurrentCu.cc 86 2011-10-18 10:42:04Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+
+/*** TNetLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Labels.h"
+#include "Common.h"
+#include "MlfStream.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** TNet includes */
+#include "cuObjectiveFunction.h"
+#include "cuNetwork.h"
+#include "cuRecurrent.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TNET"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+"\n:TODO:\n\n"
+" Option Default\n\n"
+" -c Enable crossvalidation off\n"
+" -m file Set label map of NN outputs \n"
+" -n f Set learning rate to f 0.06\n"
+" -o ext Set target model ext None\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -H mmf Load NN macro file \n"
+" -I mlf Load master label file mlf \n"
+" -L dir Set input label (or net) dir Current\n"
+" -M dir Dir to write NN macro files Current\n"
+" -O fn Objective function [mse,xent] xent\n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+" -X ext Set input label file ext lab\n"
+"\n"
+"BUNCHSIZE CACHESIZE CROSSVALIDATE FEATURETRANSFORM LEARNINGRATE LEARNRATEFACTORS MLFTRANSC MOMENTUM NATURALREADORDER OBJECTIVEFUNCTION OUTPUTLABELMAP PRINTCONFIG PRINTVERSION RANDOMIZE SCRIPT SEED SOURCEMLF SOURCEMMF SOURCETRANSCDIR SOURCETRANSCEXT TARGETMMF TARGETMODELDIR TARGETMODELEXT TRACE WEIGHTCOST\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -m r OUTPUTLABELMAP"
+ " -n r LEARNINGRATE"
+ " -D n PRINTCONFIG=TRUE"
+ " -H l SOURCEMMF"
+ " -I r SOURCEMLF"
+ " -L r SOURCETRANSCDIR"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ " -X r SOURCETRANSCEXT";
+
+
+ UserInterface ui;
+ FeatureRepository feature_repo;
+ LabelRepository label_repo;
+ CuNetwork network;
+ CuNetwork transform_network;
+ CuObjectiveFunction* p_obj_function = NULL;
+ Timer timer;
+ Timer timer_frontend;
+ double time_frontend = 0.0;
+
+ const char* p_source_mmf_file;
+ const char* p_input_transform;
+ const char* p_targetmmf;
+
+ const char* p_script;
+ const char* p_output_label_map;
+
+ BaseFloat learning_rate;
+ const char* learning_rate_factors;
+ BaseFloat momentum;
+ BaseFloat weightcost;
+ int bptt;
+ CuObjectiveFunction::ObjFunType obj_fun_id;
+
+ const char* p_source_mlf_file;
+ const char* p_src_lbl_dir;
+ const char* p_src_lbl_ext;
+
+ bool cross_validate;
+
+ int trace;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_source_mmf_file = ui.GetStr(SNAME":SOURCEMMF", NULL);
+ p_input_transform = ui.GetStr(SNAME":FEATURETRANSFORM", NULL);
+
+ p_targetmmf = ui.GetStr(SNAME":TARGETMMF", NULL);
+
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_output_label_map = ui.GetStr(SNAME":OUTPUTLABELMAP", NULL);
+
+ learning_rate = ui.GetFlt(SNAME":LEARNINGRATE" , 0.06f);
+ learning_rate_factors = ui.GetStr(SNAME":LEARNRATEFACTORS", NULL);
+ momentum = ui.GetFlt(SNAME":MOMENTUM" , 0.0);
+ weightcost = ui.GetFlt(SNAME":WEIGHTCOST" , 0.0);
+ bptt = ui.GetInt(SNAME":BPTT" , 4);
+
+ obj_fun_id = static_cast<CuObjectiveFunction::ObjFunType>(
+ ui.GetEnum(SNAME":OBJECTIVEFUNCTION",
+ CuObjectiveFunction::CROSS_ENTROPY, //< default
+ "xent", CuObjectiveFunction::CROSS_ENTROPY,
+ "mse", CuObjectiveFunction::MEAN_SQUARE_ERROR
+ ));
+
+
+
+ p_source_mlf_file = ui.GetStr(SNAME":SOURCEMLF", NULL);
+ p_src_lbl_dir = ui.GetStr(SNAME":SOURCETRANSCDIR", NULL);
+ p_src_lbl_ext = ui.GetStr(SNAME":SOURCETRANSCEXT", "lab");
+
+ cross_validate = ui.GetBool(SNAME":CROSSVALIDATE", false);
+
+ trace = ui.GetInt(SNAME":TRACE", 0);
+ //if(trace&1) {
+ CuDevice::Instantiate().Verbose(true);
+ //}
+
+
+ //read and discard (these options are not used by this tool)
+ ui.GetInt(SNAME":BUNCHSIZE", 256);
+ ui.GetInt(SNAME":CACHESIZE", 12800);
+ ui.GetBool(SNAME":RANDOMIZE", true);
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= "MODULE_VERSION" =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ feature_repo.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+
+ //read the input transform network
+ if(NULL != p_input_transform) {
+ if(trace&1) TraceLog(std::string("Reading input transform network: ")+p_input_transform);
+ transform_network.ReadNetwork(p_input_transform);
+ }
+
+
+ //read the neural network
+ if(NULL != p_source_mmf_file) {
+ if(trace&1) TraceLog(std::string("Reading network: ")+p_source_mmf_file);
+ network.ReadNetwork(p_source_mmf_file);
+ } else {
+ Error("Source MMF must be specified [-H]");
+ }
+
+
+ // initialize the feature repository
+ feature_repo.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+ if(NULL != p_script) {
+ feature_repo.AddFileList(p_script);
+ } else {
+ Warning("WARNING: The script file is missing [-S]");
+ }
+
+ // initialize the label repository
+ if(NULL == p_source_mlf_file)
+ Error("Source mlf file file is missing [-I]");
+ if(NULL == p_output_label_map)
+ Error("Output label map is missing [-m]");
+ label_repo.Init(p_source_mlf_file, p_output_label_map, p_src_lbl_dir, p_src_lbl_ext);
+
+ //get objective function instance
+ p_obj_function = CuObjectiveFunction::Factory(obj_fun_id);
+
+ //set the learnrate, etc
+ network.SetLearnRate(learning_rate, learning_rate_factors);
+ network.SetMomentum(momentum);
+ network.SetWeightcost(weightcost);
+
+ //set the BPTT order
+ for(int i=0; i<network.Layers(); i++) {
+ if(network.Layer(i).GetType() == CuComponent::RECURRENT) {
+ dynamic_cast<CuRecurrent&>(network.Layer(i)).BpttOrder(bptt);
+ }
+ }
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // INITIALIZATION DONE .................................................
+ //
+ // Start training
+ timer.Start();
+ if(cross_validate) {
+ std::cout << "===== TRecurrentCu CROSSVAL STARTED =====" << std::endl;
+ } else {
+ std::cout << "===== TRecurrentCu TRAINING STARTED =====" << std::endl;
+ }
+
+ feature_repo.Rewind();
+
+ //**********************************************************************
+ //**********************************************************************
+ // MAIN LOOP
+ //
+ int frames = 0;
+ Matrix<BaseFloat> targets_host;
+ CuMatrix<BaseFloat> feats, output, targets, globerr;
+ for(feature_repo.Rewind(); !feature_repo.EndOfList(); feature_repo.MoveNext()) {
+
+ timer_frontend.Start();
+
+ Matrix<BaseFloat> feats_host, globerr_host;
+ CuMatrix<BaseFloat> feats_original;
+ CuMatrix<BaseFloat> feats_expanded;
+
+ //read feats, perform the feature transform
+ feature_repo.ReadFullMatrix(feats_host);
+ feats_original.CopyFrom(feats_host);
+ transform_network.Propagate(feats_original,feats_expanded);
+
+ //trim the start/end context
+ int rows = feats_expanded.Rows()-start_frm_ext-end_frm_ext;
+ feats.Init(rows,feats_expanded.Cols());
+ feats.CopyRows(rows,start_frm_ext,feats_expanded,0);
+
+ timer_frontend.End(); time_frontend += timer_frontend.Val();
+
+ //read the targets
+ label_repo.GenDesiredMatrix(targets_host,feats.Rows(),
+ feature_repo.CurrentHeader().mSamplePeriod,
+ feature_repo.Current().Logical().c_str());
+ targets.CopyFrom(targets_host);
+
+ //reset the history context
+ for(int i=0; i<network.Layers(); i++) {
+ if(network.Layer(i).GetType() == CuComponent::RECURRENT) {
+ dynamic_cast<CuRecurrent&>(network.Layer(i)).ClearHistory();
+ }
+ }
+
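+ //the recurrent network is trained frame by frame: each frame is propagated
+ //as a 1-row matrix so the recurrent layers can carry their own history, and
+ //(unless crossvalidating) the error is backpropagated right after the frame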
+ CuMatrix<BaseFloat> input_row(1,feats.Cols());
+ CuMatrix<BaseFloat> output_row(1,network.GetNOutputs());
+ CuMatrix<BaseFloat> target_row(1,network.GetNOutputs());
+ CuMatrix<BaseFloat> error_row(1,network.GetNOutputs());
+ for(size_t frm=0; frm<feats.Rows(); frm++) {
+ //select data rows
+ input_row.CopyRows(1,frm,feats,0);
+ target_row.CopyRows(1,frm,targets,0);
+
+ //forward
+ network.Propagate(input_row,output_row);
+
+ //evaluate the objective function (xent or mse)
+ p_obj_function->Evaluate(output_row,target_row,error_row);
+
+ if(!cross_validate) {
+ //backward
+ network.Backpropagate(error_row);
+ }
+ }
+
+ frames += feats.Rows();
+ std::cout << "." << std::flush;
+ }
+
+
+
+ //**********************************************************************
+ //**********************************************************************
+ // TRAINING FINISHED .................................................
+ //
+ // Let's store the network, report the log
+
+
+ if(cross_validate) {
+ if(trace&1) TraceLog("Crossval finished");
+ } else {
+ if(trace&1) TraceLog("Training finished");
+ }
+
+ //write the network
+ if(!cross_validate) {
+ if (NULL != p_targetmmf) {
+ if(trace&1) TraceLog(std::string("Writing network: ")+p_targetmmf);
+ network.WriteNetwork(p_targetmmf);
+ } else {
+ Error("forgot to specify --TARGETMMF argument");
+ }
+ }
+
+ timer.End();
+ std::cout << std::endl;
+ std::cout << "===== TRecurrentCu FINISHED ( " << timer.Val() << "s ) "
+ << "[FPS:" << float(frames) / timer.Val()
+ << ",RT:" << 1.0f / (float(frames) / timer.Val() / 100.0f)
+ << "] =====" << std::endl;
+
+ //report objective function (accuracy, frame counts...)
+ std::cout << "-- " << (cross_validate?"CV":"TR") << p_obj_function->Report();
+ std::cout << "T-fe: " << time_frontend << std::endl;
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
+
diff --git a/src/TSegmenter.cc b/src/TSegmenter.cc
new file mode 100644
index 0000000..db75f67
--- /dev/null
+++ b/src/TSegmenter.cc
@@ -0,0 +1,265 @@
+
+/***************************************************************************
+ * copyright : (C) 2011 by Karel Vesely,UPGM,FIT,VUT,Brno *
+ * email : iveselyk@fit.vutbr.cz *
+ ***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the APACHE License as published by the *
+ * Apache Software Foundation; either version 2.0 of the License, *
+ * or (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#define SVN_DATE "$Date: 2011-03-24 14:59:03 +0100 (Thu, 24 Mar 2011) $"
+#define SVN_AUTHOR "$Author: iveselyk $"
+#define SVN_REVISION "$Revision: 42 $"
+#define SVN_ID "$Id: TSegmenter.cc 42 2011-03-24 13:59:03Z iveselyk $"
+
+#define MODULE_VERSION "1.0.0 "__TIME__" "__DATE__" "SVN_ID
+
+
+
+
+/*** TNetLib includes */
+#include "Error.h"
+#include "Timer.h"
+#include "Features.h"
+#include "Common.h"
+#include "MlfStream.h"
+#include "UserInterface.h"
+#include "Timer.h"
+
+/*** STL includes */
+#include <iostream>
+#include <sstream>
+#include <numeric>
+
+/*** Unix includes */
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+
+
+
+
+
+//////////////////////////////////////////////////////////////////////
+// DEFINES
+//
+
+#define SNAME "TSEGMNTER"
+
+using namespace TNet;
+
+void usage(const char* progname)
+{
+ const char *tchrptr;
+ if ((tchrptr = strrchr(progname, '\\')) != NULL) progname = tchrptr+1;
+ if ((tchrptr = strrchr(progname, '/')) != NULL) progname = tchrptr+1;
+ fprintf(stderr,
+"\n%s version " MODULE_VERSION "\n"
+"\nUSAGE: %s [options] DataFiles...\n\n"
+" Option Default\n\n"
+" -l dir Set target directory for features !REQ!\n"
+//" -y ext Set target feature ext fea_join\n"
+" -A Print command line arguments Off\n"
+" -C cf Set config file to cf Default\n"
+" -D Display configuration variables Off\n"
+" -S file Set script file None\n"
+" -T N Set trace flags to N 0\n"
+" -V Print version information Off\n"
+"\n"
+"NATURALREADORDER NOSUBDIRS OUTPUTSCRIPT PRINTCONFIG PRINTVERSION SCRIPT TARGETPARAMDIR "/*TARGETPARAMEXT*/" TRACE\n"
+"\n"
+"STARTFRMEXT ENDFRMEXT CMEANDIR CMEANMASK VARSCALEDIR VARSCALEMASK VARSCALEFN TARGETKIND DERIVWINDOWS DELTAWINDOW ACCWINDOW THIRDWINDOW\n"
+"\n"
+" %s is Copyright (C) 2010-2011 Karel Vesely\n"
+" licensed under the APACHE License, version 2.0\n"
+" Bug reports, feedback, etc, to: iveselyk@fit.vutbr.cz\n"
+"\n", progname, progname, progname);
+ exit(-1);
+}
+
+
+///////////////////////////////////////////////////////////////////////
+// MAIN FUNCTION
+//
+
+
+int main(int argc, char *argv[]) try
+{
+ const char* p_option_string =
+ " -l r TARGETPARAMDIR"
+// " -y r TARGETPARAMEXT"
+ " -D n PRINTCONFIG=TRUE"
+ " -S l SCRIPT"
+ " -T r TRACE"
+ " -V n PRINTVERSION=TRUE"
+ ;
+
+
+ UserInterface ui;
+ FeatureRepository features;
+ //InputDataProxy data_proxy;
+ //Network network;
+ //ObjectiveFunction* p_obj_function = NULL;
+ Timer timer;
+
+
+ const char* p_script;
+ const char* p_tgt_param_dir;
+// const char* p_tgt_param_ext;
+ const char* p_output_script;
+ int trace;
+ bool create_subdirs;
+
+ // variables for feature repository
+ bool swap_features;
+ int target_kind;
+ int deriv_order;
+ int* p_deriv_win_lenghts;
+ int start_frm_ext;
+ int end_frm_ext;
+ char* cmn_path;
+ char* cmn_file;
+ const char* cmn_mask;
+ char* cvn_path;
+ char* cvn_file;
+ const char* cvn_mask;
+ const char* cvg_file;
+
+
+ // OPTION PARSING ..........................................................
+ // use the STK option parsing
+ if (argc == 1) { usage(argv[0]); return 1; }
+ int args_parsed = ui.ParseOptions(argc, argv, p_option_string, SNAME);
+
+
+ // OPTION RETRIEVAL ........................................................
+ // extract the feature parameters
+ swap_features = !ui.GetBool(SNAME":NATURALREADORDER", TNet::IsBigEndian());
+
+ target_kind = ui.GetFeatureParams(&deriv_order, &p_deriv_win_lenghts,
+ &start_frm_ext, &end_frm_ext, &cmn_path, &cmn_file, &cmn_mask,
+ &cvn_path, &cvn_file, &cvn_mask, &cvg_file, SNAME":", 0);
+
+
+ // extract other parameters
+ p_script = ui.GetStr(SNAME":SCRIPT", NULL);
+ p_tgt_param_dir = ui.GetStr(SNAME":TARGETPARAMDIR", NULL);
+// p_tgt_param_ext = ui.GetStr(SNAME":TARGETPARAMEXT", NULL);
+ p_output_script = ui.GetStr(SNAME":OUTPUTSCRIPT", NULL);
+ create_subdirs = !ui.GetBool(SNAME":NOSUBDIRS", false);
+ trace = ui.GetInt(SNAME":TRACE", 00);
+
+
+ // process the parameters
+ if(ui.GetBool(SNAME":PRINTCONFIG", false)) {
+ std::cout << std::endl;
+ ui.PrintConfig(std::cout);
+ std::cout << std::endl;
+ }
+ if(ui.GetBool(SNAME":PRINTVERSION", false)) {
+ std::cout << std::endl;
+ std::cout << "======= TNET v"MODULE_VERSION" xvesel39 =======" << std::endl;
+ std::cout << std::endl;
+ }
+ ui.CheckCommandLineParamUse();
+
+
+ // the rest of the parameters are the feature files
+ for (; args_parsed < argc; args_parsed++) {
+ features.AddFile(argv[args_parsed]);
+ }
+
+ //**************************************************************************
+ //**************************************************************************
+ // OPTION PARSING DONE .....................................................
+
+
+ //initialize FeatureRepository
+ features.AddFileList(p_script);
+
+ features.Init(
+ swap_features, start_frm_ext, end_frm_ext, target_kind,
+ deriv_order, p_deriv_win_lenghts,
+ cmn_path, cmn_mask, cvn_path, cvn_mask, cvg_file
+ );
+
+ //start timer
+ timer.Start();
+
+ std::cout << "[Segmentation started]" << std::endl;
+
+ //segment the features
+ size_t cnt = 0;
+ size_t step = features.QueueSize() / 100;
+ if(step == 0) step = 1;
+
+ //open output script file
+ std::ofstream out_scp;
+ if(NULL == p_output_script) Error("OUTPUTSCRIPT parameter needed");
+ out_scp.open(p_output_script);
+ if(!out_scp.good()) Error(std::string("Cannot open output script file: ")+p_output_script);
+
+ //store short segments of the data
+ Matrix<BaseFloat> matrix;
+ std::string file_out;
+
+ features.Rewind();
+ for( ; !features.EndOfList(); features.MoveNext(), cnt++) {
+ //read the features
+ features.ReadFullMatrix(matrix);
+
+ //build the output feature filename
+ file_out = "";
+ if(NULL != p_tgt_param_dir) {
+ (file_out += p_tgt_param_dir) += "/";
+ }
+
+ //create directory structure
+ if(create_subdirs) {
+ char subd[64];
+ sprintf(subd,"%06d/",(int)(cnt/1000));
+ file_out += subd;
+ //create dir
+ if(access(file_out.c_str(), R_OK|W_OK|X_OK)) {
+ if(mkdir(file_out.c_str(),0770)) {
+ Error(std::string("Cannot create directory:")+file_out);
+ }
+ }
+ }
+
+ //append logical filename
+ file_out += features.Current().Logical();
+
+ //get the targetkind and source_rate
+ if(target_kind == PARAMKIND_ANON) {
+ target_kind = features.CurrentHeader().mSampleKind;
+ }
+ int source_rate = features.CurrentHeader().mSamplePeriod;
+ //write the output feature
+ features.WriteFeatureMatrix(matrix, file_out, target_kind, source_rate);
+ //write the output scriptfile record
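+ //(HTK-style "file[start,end]" segment notation, so that readers of the new
+ // scp automatically drop the start/end frame extension again)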
+ out_scp << file_out << "[" << start_frm_ext << "," << matrix.Rows()-end_frm_ext-1 << "]\n";
+ out_scp << std::flush;
+
+ if((cnt % step) == 0) std::cout << 100 * cnt / features.QueueSize() << "%, " << std::flush;
+ }
+
+ //close output script file
+ out_scp.close();
+
+ timer.End();
+ std::cout << "\n[Segmentation finished, elapsed time:( " << timer.Val() <<"s )]" << std::endl;
+
+
+ return 0; ///finish OK
+
+} catch (std::exception& rExc) {
+ std::cerr << "Exception thrown" << std::endl;
+ std::cerr << rExc.what() << std::endl;
+ return 1;
+}
diff --git a/src/quick_build.sh b/src/quick_build.sh
new file mode 100755
index 0000000..2b7621c
--- /dev/null
+++ b/src/quick_build.sh
@@ -0,0 +1,2 @@
+make depend && make clean && make #build the CPU tools
+make CUDA=true #build the CUDA dependent tools
diff --git a/src/tnet.mk b/src/tnet.mk
new file mode 100644
index 0000000..d9b6ff4
--- /dev/null
+++ b/src/tnet.mk
@@ -0,0 +1,82 @@
+#
+# This makefile contains global definitions that are used during the
+# build process. It is included by all the subdirectory libraries.
+#
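+# Typical invocations (see also quick_build.sh):
+#   make depend && make        # build the CPU tools
+#   make CUDA=true             # build the CUDA tools (CUDA_TK_BASE must be set below)
+#   make BITS64=true           # 64-bit cross-compilation
+#   make DOUBLEPRECISION=true  # double-precision build
+#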
+
+
+##############################################################
+##### 64-BIT CROSS-COMPILATION #####
+CXXFLAGS=
+FWDPARAM=
+ifeq ($(BITS64), true)
+ ##### CHANGE WHEN DIFFERENT 64BIT g++ PREFIX #####
+ CROSS_COMPILE = x86_64-linux-
+ ##### CHANGE WHEN DIFFERENT 64BIT g++ PREFIX #####
+ CXXFLAGS += -m64
+ FWDPARAM += BITS64=true
+else
+ CXXFLAGS += -m32
+endif
+
+# disable the cross-compile prefix if the cross-compiler CXX does not exist
+CXX=$(CROSS_COMPILE)g++-4.6
+CXX2=$(notdir $(shell which $(CXX) 2>/dev/null))
+ifneq ("$(CXX)", "$(CXX2)")
+ CROSS_COMPILE=
+endif
+
+# compilation tools
+CC = $(CROSS_COMPILE)g++-4.6
+CXX = $(CROSS_COMPILE)g++-4.6
+AR = $(CROSS_COMPILE)ar
+RANLIB = $(CROSS_COMPILE)ranlib
+AS = $(CROSS_COMPILE)as
+
+
+
+
+##############################################################
+##### PATH TO CUDA TOOLKIT #####
+#CUDA_TK_BASE=/usr/local/share/cuda-3.2.12
+CUDA_TK_BASE=/usr/local/cuda-5.0
+##### PATH TO CUDA TOOLKIT #####
+
+
+
+
+# compilation args
+CXXFLAGS += -g -Wall -O2 -DHAVE_ATLAS -rdynamic
+CXXFLAGS += -Wshadow -Wpointer-arith -Wcast-qual -Wcast-align -Wwrite-strings -Wconversion
+
+# enable double-precision
+ifeq ($(DOUBLEPRECISION), true)
+ CXXFLAGS += -DDOUBLEPRECISION
+ FWDPARAM += DOUBLEPRECISION=true
+endif
+
+
+# compile all the source .cc files
+SRC=$(wildcard *.cc)
+OBJ=$(patsubst %.cc, %.o, $(SRC))
+
+
+
+
+#########################################################
+# CONFIGURATION CHECKS
+#
+
+#check that CUDA_TK_BASE is set correctly
+ifeq ("$(wildcard $(CUDA_TK_BASE)/bin/nvcc)", "$(CUDA_TK_BASE)/bin/nvcc")
+ HAVE_CUDA=true
+else
+ ifeq ($(CUDA), true)
+ $(error %%% CUDA not found! Incorrect path in CUDA_TK_BASE: $(CUDA_TK_BASE) in 'trunk/src/tnet.mk')
+ endif
+endif
+
+#
+#########################################################
+
+