Diffstat (limited to 'src/CuBaseLib/.svn/text-base')
-rw-r--r--  src/CuBaseLib/.svn/text-base/Makefile.svn-base          59
-rw-r--r--  src/CuBaseLib/.svn/text-base/cucommon.h.svn-base        46
-rw-r--r--  src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base      129
-rw-r--r--  src/CuBaseLib/.svn/text-base/cudevice.h.svn-base        79
-rw-r--r--  src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base     626
-rw-r--r--  src/CuBaseLib/.svn/text-base/cukernels.h.svn-base       81
-rw-r--r--  src/CuBaseLib/.svn/text-base/cumath.cc.svn-base        574
-rw-r--r--  src/CuBaseLib/.svn/text-base/cumath.h.svn-base         146
-rw-r--r--  src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base       199
-rw-r--r--  src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base     627
-rw-r--r--  src/CuBaseLib/.svn/text-base/curand.h.svn-base          40
-rw-r--r--  src/CuBaseLib/.svn/text-base/curand.tcc.svn-base       228
-rw-r--r--  src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base 135
-rw-r--r--  src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base   34
-rw-r--r--  src/CuBaseLib/.svn/text-base/cuvector.h.svn-base       104
-rw-r--r--  src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base     254
16 files changed, 3361 insertions, 0 deletions
diff --git a/src/CuBaseLib/.svn/text-base/Makefile.svn-base b/src/CuBaseLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..b574c4a
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,59 @@
+
+include ../tnet.mk
+
+INCLUDE = -I. -I../ -I../KaldiLib
+
+
+CUDA_INCLUDE= -I$(CUDA_TK_BASE)/include
+CUDA_BIN=$(CUDA_TK_BASE)/bin
+
+
+CUSRC=$(wildcard *.cu)
+CUOBJ=$(patsubst %.cu, %.o, $(CUSRC))
+
+
+
+CUDA_FLAGS = -g -Xcompiler -fPIC --verbose
+ifeq ($(BITS64), true)
+ CUDA_FLAGS += --machine 64
+ BUT_FORCE_GCC64 = ln -s `which x86_64-linux-gcc` $(PWD)/gcc
+ BUT_UNLINK_GCC64 = unlink $(PWD)/gcc
+else
+ CUDA_FLAGS += --machine 32
+endif
+
+ifeq ($(DOUBLEPRECISION), true)
+ CUDA_FLAGS += --gpu-architecture compute_13 --gpu-code sm_13
+endif
+
+
+
+
+all : libCuBase.a
+
+libCuBase.a : $(CUOBJ) $(OBJ)
+ $(AR) ruv $@ $?
+ $(RANLIB) $@
+
+
+%.o : %.cu
+ $(BUT_FORCE_GCC64)
+ export PATH=$(PWD):$(CUDA_BIN):$(PATH); $(CUDA_BIN)/nvcc -c $< -o $@ -I. $(CUDA_INCLUDE) $(CUDA_FLAGS)
+ $(BUT_UNLINK_GCC64)
+
+%.o : %.cc
+ $(CXX) -c $< -o $@ $(CXXFLAGS) $(CUDA_INCLUDE) $(INCLUDE)
+
+
+
+
+.PHONY: clean depend
+
+clean :
+ rm -f *.o *.a
+
+depend:
+ $(CXX) -M $(CXXFLAGS) *.cc $(INCLUDE) $(CUDA_INCLUDE) > .depend.mk
+
+-include .depend.mk
+
diff --git a/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base
new file mode 100644
index 0000000..6dc7e94
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base
@@ -0,0 +1,46 @@
+#ifndef _CUCOMMON_H_
+#define _CUCOMMON_H_
+
+#include <iostream>
+#include <sstream>
+
+#include <cuda_runtime_api.h>
+
+#include "Error.h"
+
+
+
+#define cuSafeCall(fun) \
+{ \
+ int ret; \
+ if((ret = (fun)) != 0) { \
+ std::ostringstream os; \
+ os << "CUDA ERROR #" << ret << " " << __FILE__ ":" << __LINE__ << " " << __func__ << "()" << " '" << #fun << "' " << cudaGetErrorString((cudaError_t)ret); \
+ throw(MyException(os.str())); \
+ } \
+ cudaThreadSynchronize(); \
+}
+
+
+
+
+namespace TNet {
+
+  /** Edge length of the square CUDA thread block **/
+ static const int CUBLOCK = 16;
+
+  /** Number of blocks needed to cover a task of size 'size' **/
+ inline int n_blocks(int size, int block_size)
+ { return size / block_size + ((size % block_size == 0)? 0 : 1); }
+
+  /** Output operator for dim3 (prints [x,y,z]) **/
+ inline std::ostream& operator<<(std::ostream& os, dim3 arr) {
+ os << "[" << arr.x << "," << arr.y << "," << arr.z << "]";
+ return os;
+ }
+
+}
+
+
+
+#endif
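
The n_blocks() and cuSafeCall() helpers above are the glue used throughout the library when launching the 2-D kernels from cukernels.cu. A minimal host-side sketch of that launch pattern, using the cudaF_set_const wrapper declared in cukernels.h (SetConstSketch is a hypothetical helper; 'mat' is assumed to be a device pointer obtained from a CuMatrix):

    #include "cucommon.h"
    #include "cukernels.h"

    using namespace TNet;

    // Fill a rows x cols device matrix (row stride 'stride', in elements) with 'value'.
    void SetConstSketch(float* mat, float value, int rows, int cols, int stride) {
      MatrixDim d = { rows, cols, stride };

      // One CUBLOCK x CUBLOCK thread block per tile, enough blocks to cover the matrix.
      dim3 dimBlock(CUBLOCK, CUBLOCK);
      dim3 dimGrid(n_blocks(cols, CUBLOCK), n_blocks(rows, CUBLOCK));

      cudaF_set_const(dimGrid, dimBlock, mat, value, d);
      cuSafeCall(cudaGetLastError());   // convert a failed launch into a MyException
    }
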
diff --git a/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base
new file mode 100644
index 0000000..90c5bf3
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cudevice.cc.svn-base
@@ -0,0 +1,129 @@
+
+#include <cudevice.h>
+#include <cublas.h>
+#include <cuda.h>
+
+///////////////////
+//DEBUG: Just make sure it compiles...
+#include "cumatrix.h"
+#include "cuvector.h"
+#include "cumath.h"
+template class TNet::CuMatrix<float>;
+template class TNet::CuVector<float>;
+template class TNet::CuMath<float>;
+///////////////////
+
+namespace TNet {
+
+
+ /**********************************************************************************
+ * CuDevice::
+ */
+ CuDevice::
+ CuDevice()
+ : mIsPresent(false), mVerbose(false)
+ {
+ //get number of devices
+ int N_GPU = 0;
+ cudaGetDeviceCount(&N_GPU);
+
+ //select device if more than one
+ if(N_GPU > 1) {
+ char name[128];
+ size_t free, total;
+ std::vector<float> free_mem_ratio;
+ //get ratios of memory use
+ std::cout << "Selecting from " << N_GPU << " GPUs\n";
+ for(int n=0; n<N_GPU; n++) {
+ std::cout << "cudaSetDevice(" << n << "): ";
+ cuSafeCall(cudaSetDevice(n));//context created by cuSafeCall(...)
+ cuDeviceGetName(name,128,n);
+ std::cout << name << "\t";
+ cuSafeCall(cuMemGetInfo(&free,&total));
+ std::cout << "free: " << free/1024/1024 << "M, "
+ << "total: "<< total/1024/1024 << "M, "
+ << "ratio: "<< free/(float)total << "\n";
+ free_mem_ratio.push_back(free/(float)total);
+ cudaThreadExit();//destroy context
+ }
+ //find GPU with max free memory
+ int max_id=0;
+ for(int n=1; n<free_mem_ratio.size(); n++) {
+ if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n;
+ }
+ std::cout << "Selected device: " << max_id << " (automatically)\n";
+ cuSafeCall(cudaSetDevice(max_id));
+ }
+
+ if(N_GPU > 0) {
+ //initialize the CUBLAS
+ cuSafeCall(cublasInit());
+ mIsPresent = true;
+ } else {
+ Warning("No CUDA enabled GPU is present!");
+ }
+ }
+
+ CuDevice::
+ ~CuDevice()
+ {
+ if(mIsPresent) {
+ cuSafeCall(cublasShutdown());
+ if(mVerbose) {
+ TraceLog("CUBLAS released");
+ PrintProfile();
+ }
+ } else {
+ Warning("No CUDA enabled GPU was present!");
+ }
+ }
+
+
+ void
+ CuDevice::
+ SelectGPU(int gpu_id)
+ {
+ //get number of devices
+ int N_GPU = 0;
+ cudaGetDeviceCount(&N_GPU);
+ if(gpu_id >= N_GPU) {
+ KALDI_ERR << "Cannot select GPU " << gpu_id
+ << ", detected " << N_GPU << " CUDA capable cards!";
+ }
+ //release old card
+ cuSafeCall(cublasShutdown());
+ cudaThreadExit();
+ //select new card
+ cuSafeCall(cudaSetDevice(gpu_id));
+ //initialize CUBLAS
+ cuSafeCall(cublasInit());
+ std::cout << "Selected device " << gpu_id << " (manually)\n";
+ }
+
+
+ std::string
+ CuDevice::
+ GetFreeMemory()
+ {
+ size_t mem_free, mem_total;
+ cuMemGetInfo(&mem_free, &mem_total);
+ std::ostringstream os;
+ os << "Free:" << mem_free/(1024*1024) << "MB "
+ << "Used:" << (mem_total-mem_free)/(1024*1024) << "MB "
+ << "Total:" << mem_total/(1024*1024) << "MB";
+ return os.str();
+ }
+
+
+ ////////////////////////////////////////////////
+ // Instance of the static singleton
+ //
+ CuDevice CuDevice::msDevice;
+ //
+ ////////////////////////////////////////////////
+
+
+
+}
+
+
diff --git a/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base
new file mode 100644
index 0000000..c5eeb7b
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base
@@ -0,0 +1,79 @@
+#ifndef _CUDEVICE_H_
+#define _CUDEVICE_H_
+
+#include <map>
+#include <string>
+#include <iostream>
+
+namespace TNet {
+
+ /**
+   * Singleton object which represents the CUDA device,
+   * responsible for CUBLAS initialisation
+   * and memory block registration.
+ */
+ class CuDevice
+ {
+ // Singleton interface...
+ private:
+ CuDevice();
+ CuDevice(CuDevice&);
+ CuDevice& operator=(CuDevice&);
+
+ public:
+ ~CuDevice();
+ static CuDevice& Instantiate()
+ { return msDevice; }
+
+ private:
+ static CuDevice msDevice;
+
+
+ /**********************************/
+ // Instance interface
+ public:
+
+ void SelectGPU(int gpu_id);
+
+ /// Check if the CUDA device is in the system
+ bool IsPresent()
+ { return mIsPresent; }
+
+ void Verbose(bool verbose)
+ { mVerbose = verbose; }
+
+    /// Accumulate time spent under the given profiling key
+ void AccuProfile(const std::string& key,double time)
+ {
+ if(mProfileMap.find(key) == mProfileMap.end()) {
+ mProfileMap[key] = 0.0;
+ }
+ mProfileMap[key] += time;
+ }
+
+ void PrintProfile()
+ {
+ std::cout << "[cudevice profile]\n";
+ std::map<std::string, double>::iterator it;
+ for(it = mProfileMap.begin(); it != mProfileMap.end(); ++it) {
+ std::cout << it->first << "\t" << it->second << "s\n";
+ }
+ }
+
+ void ResetProfile()
+ { mProfileMap.clear(); }
+
+ std::string GetFreeMemory();
+
+
+ private:
+ std::map<std::string, double> mProfileMap;
+ bool mIsPresent;
+ bool mVerbose;
+ }; //class CuDevice
+
+
+}
+
+
+#endif
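
CuDevice is reached only through the Instantiate() singleton; the object is constructed during static initialisation, before main() runs. A short usage sketch (DeviceSketch, the "my_kernel" key and the 0.42 s value are made up for illustration):

    #include <iostream>
    #include "cudevice.h"

    using namespace TNet;

    void DeviceSketch() {
      CuDevice& dev = CuDevice::Instantiate();
      if (!dev.IsPresent()) return;          // no CUDA capable GPU was detected

      dev.Verbose(true);                     // also print the profile in the destructor
      std::cout << dev.GetFreeMemory() << "\n";

      dev.AccuProfile("my_kernel", 0.42);    // accumulate 0.42 s under the key "my_kernel"
      dev.PrintProfile();                    // prints the accumulated per-key times
    }
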
diff --git a/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base
new file mode 100644
index 0000000..d6f866d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base
@@ -0,0 +1,626 @@
+
+#include <cfloat>
+#include "cukernels.h"
+
+
+
+/*****************
+ * CUDA kernels
+ */
+//CuMatrix
+template<typename T>
+__global__
+static void _set_const(T* mat, T value, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = value;
+}
+
+
+
+template<typename T>
+__global__
+static void _apply_log(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = log(mat[index]);
+}
+
+
+template<typename T>
+__global__
+static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*dmat.stride;
+ int index2 = i + j*dmask.stride;
+ if ( i < dmat.cols && j < dmat.rows )
+ if(mask[index2] == 0) mat[index] = 0;
+}
+
+
+template<typename T>
+__global__
+static void _apply_l1(T* mat, T l1, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ T value = mat[index];
+ T tgt;
+ if(abs(value) < l1) {
+ tgt = 0;
+ } else {
+ tgt = (value > 0?value-l1:value+l1);
+ }
+ mat[index] = tgt;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _scale_cols(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[i];
+}
+
+
+template<typename T>
+__global__
+static void _scale_rows(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[j];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*A[index] + beta*dst[index];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+
+#if 0
+ //this does not accelerate :(
+ __shared__ T aux[16];
+ if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i];
+ __syncthreads();
+
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*aux[threadIdx.x] + beta*dst[index];
+#else
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*row[i] + beta*dst[index];
+#endif
+}
+
+
+template<typename T>
+__global__
+static void _mul_elem(T* mat, const T* A, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = mat[index] * A[index];
+}
+
+
+template<typename T>
+__global__
+static void _log_elem(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ if(mat[index] < FLT_MIN) mat[index] = FLT_MIN;
+ mat[index] = log(mat[index]);
+ }
+}
+
+
+
+
+//CuVector
+template<typename T>
+__global__
+static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ //This should be called 1-D
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j > 0) return;
+
+ if(i < d.cols) {
+ double sum = 0.0;
+ for(int k = 0; k < d.rows; k++) {
+ sum += mat[i+k*d.stride];
+ }
+ vec[i] = alpha*sum + beta*vec[i];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ //flipped x,y for reducing... x..row, y..col
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ //copy vector to shared mem
+ __shared__ T aux[512];
+ aux[threadIdx.x] = mat[i+j*d.stride];
+ __syncthreads();
+
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+ //copy out the result
+ vec[i] = alpha*sum + beta*vec[i];
+}
+
+
+
+//CuMath
+template<typename T>
+__global__
+static void _sigmoid(T*y, const T*x, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ T res = 1.0 / (1.0 + exp(-x[index]));
+ /*
+ if(res < 0.001) res = 0.001;
+ if(res > 0.999) res = 0.999;
+ */
+ y[index] = res;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows )
+ eout[index] = y[index]*(1.0-y[index]) * e[index];
+}
+
+
+template<typename T>
+__global__
+static void _softmax(T*y, const T*x, MatrixDim d) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ if(j >= d.rows) return;
+
+ //copy to output and find max...
+ double max = -1e20;
+ double sum = 0.0;
+ for(int i=0; i<d.cols; i++) {
+ if(max < x[i+j*d.stride]) max = x[i+j*d.stride];
+ y[i+j*d.stride] = x[i+j*d.stride];
+ }
+ //subtract max, apply exp, sum up...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] = exp(y[i+j*d.stride] - max);
+ sum += y[i+j*d.stride];
+ }
+ //normalize by sum...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] /= sum;
+ }
+}
+
+
+
+
+template<typename T>
+__device__
+static T _max_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
+
+
+
+
+template<typename T>
+__device__
+static T _sum_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = 0.0;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ buffer[threadIdx.x] += temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
+
+
+
+template<typename T>
+__global__
+static void _softmax_reduce(T*y, const T*x, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y > 1) return;
+
+ __shared__ T row_data[256];
+ __shared__ T aux[256];
+
+ //copy the input to row_data
+ row_data[i] = x[i+j*d.stride];
+ __syncthreads();
+
+ //copy input to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the maximum value
+ T max = _max_reduce(aux);
+ __syncthreads();
+
+ //calculate exp(data-max)
+ row_data[i] = exp(row_data[i]-max);
+
+ //copy the values to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the sum
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+
+ //divide the values
+ row_data[i] /= sum;
+ //copy out
+ y[i+j*d.stride] = row_data[i];
+
+}
+
+
+
+template<typename T>
+__global__
+static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = i % d_in.cols;
+ int src_row = j + off[i / d_in.cols];
+ if(src_row < 0) src_row = 0;
+ if(src_row >= d_in.rows) src_row = d_in.rows-1;
+ y[index] = x[src_col + src_row*d_in.stride];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = copy_from[i];
+ if(src_col >= 0 && src_col < d_in.cols) {
+ y[index] = x[src_col + j*d_in.stride];
+ } else {
+ y[index] = 1.0/0.0;
+ }
+ }
+}
+
+
+template<typename T>
+__global__
+static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_row = copy_from[j];
+ y[index] = x[i + src_row*d_in.stride];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _check_class(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j>0) return;
+
+ if(i<d.rows) {
+ int out_id = -1, des_id = -2;
+ T out_max = -1e20, des_max = -1e20;
+
+ for(int k=0; k<d.cols; k++) {
+ T val = out[k + i*d.stride];
+ if(val > out_max) { out_max = val; out_id = k; }
+ }
+ for(int k=0; k<d.cols; k++) {
+ T val = des[k + i*d.stride];
+ if(val > des_max) { des_max = val; des_id = k; }
+ }
+
+ match[i] = ((out_id == des_id)?1:0);
+ }
+}
+
+
+template<typename T>
+__device__
+static int _max_id_reduce(T val[],int idx[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = val[idx[threadIdx.x + halfPoint]];
+ }
+ if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint];
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return idx[0];
+}
+
+
+
+
+
+
+template<typename T>
+__global__
+static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ __shared__ T value[256];
+ __shared__ int index[256];
+
+ value[threadIdx.x] = out[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int out_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ value[threadIdx.x] = des[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int des_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ if(threadIdx.x == 0) {
+ match[j] = ((out_max == des_max)?1:0);
+ }
+}
+
+
+
+
+/**************
+ * C wrappers around CUDA kernels
+ */
+//:FLOAT:
+//CuMatrix
+void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask)
+{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); }
+
+void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d)
+{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); }
+
+void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+}
+
+void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d)
+{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
+//:DOUBLE:
+//CuMatrix
+void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+}
+
+void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
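
The _sum_reduce and _max_reduce device helpers above implement a per-block tree reduction: the number of active threads is halved (rounding up) on every iteration, and the guard threadIdx.x+halfPoint < nTotalThreads keeps odd sizes from reading past the live part of the buffer. A serial host-side model of the same index arithmetic, for illustration only (not part of the library):

    #include <vector>

    // Serial model of _sum_reduce: buffer[0] ends up holding the sum of all elements.
    float SumReduceModel(std::vector<float> buffer) {
      int nTotalThreads = static_cast<int>(buffer.size());
      while (nTotalThreads > 1) {
        int halfPoint = (1 + nTotalThreads) >> 1;       // round up, so odd sizes work
        for (int tid = 0; tid < halfPoint; ++tid) {     // "first half of the threads"
          float temp = 0.0f;
          if (tid + halfPoint < nTotalThreads)          // the partner may not exist
            temp = buffer[tid + halfPoint];
          buffer[tid] += temp;
        }
        nTotalThreads = (1 + nTotalThreads) >> 1;
      }
      return buffer[0];
    }
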
diff --git a/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base
new file mode 100644
index 0000000..d8320b5
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base
@@ -0,0 +1,81 @@
+#ifndef _cuda_kernels_h_
+#define _cuda_kernels_h_
+
+
+extern "C" {
+
+#pragma GCC diagnostic ignored "-Wshadow";
+#include <vector_types.h>
+#pragma GCC diagnostic warning "-Wshadow";
+
+ typedef struct MatrixDim_ {
+ int rows;
+ int cols;
+ int stride;
+ } MatrixDim;
+
+ /*************
+ * Float instances
+ */
+ //CuMatrix
+ void cudaF_set_const(dim3 Gr, dim3 Bl, float*mat, float value, MatrixDim d);
+ void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+ void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask);
+ void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d);
+ void cudaF_scale_cols(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_scale_rows(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d);
+ void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d);
+ void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d);
+ void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d);
+
+ //CuVector
+ void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+ void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+
+ //CuMath
+ void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_sigmoid (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d);
+
+ void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+
+ void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d);
+ void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d);
+
+
+
+ /*************
+ * Double instances
+ */
+ //CuMatrix
+ void cudaD_set_const(dim3 Gr, dim3 Bl, double*mat, double value, MatrixDim d);
+ void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+ void cudaD_scale_cols(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_scale_rows(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d);
+ void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d);
+ void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d);
+ void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d);
+
+ //CuVector
+ void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d);
+
+ //CuMath
+ void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_sigmoid (dim3 Gr, dim3 Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d);
+
+ void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+
+ void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d);
+
+
+}
+
+#endif
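
All of the kernels address element (row j, column i) as mat[i + j*stride], where MatrixDim::stride is the padded row length in elements (it comes from cudaMallocPitch in cumatrix.tcc and can exceed cols). A tiny host-side sketch of that addressing; the 3x5 matrix padded to a stride of 8 is a made-up example:

    #include <cstdio>

    // Mirrors MatrixDim from cukernels.h: rows, cols, stride (elements per allocated row).
    struct Dim { int rows, cols, stride; };

    int main() {
      Dim d = { 3, 5, 8 };                  // 3x5 matrix, each row padded to 8 elements
      float data[3 * 8] = { 0 };

      for (int j = 0; j < d.rows; ++j)      // j indexes rows, as in the kernels
        for (int i = 0; i < d.cols; ++i)    // i indexes columns
          data[i + j * d.stride] = 10.0f * j + i;

      std::printf("%g\n", data[2 + 1 * d.stride]);   // row 1, col 2 -> prints 12
      return 0;
    }
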
diff --git a/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
new file mode 100644
index 0000000..d718324
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
@@ -0,0 +1,574 @@
+
+
+
+#include "cumath.h"
+#include "cukernels.h"
+
+
+namespace TNet {
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+#if 0
+ //disable 'reduce' functions
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+#else
+ if(X.Cols() > 256) {
+ //use old implementation (can't use reduction due to
+ //limited size of shared memory)
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ //use implementation with reduction
+ dim3 dimBlock(X.Cols(),1);
+ dim3 dimGrid(1,X.Rows());
+
+ cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+#endif
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasSgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ if(out.Cols() > 256) {
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(out.Cols(),1);
+ dim3 dimGrid(1,out.Rows());
+
+ cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+    // keep trans..., just swap the A & B arguments: A->B, B->A
+ //
+ // WARNING
+ // NO DIMENSION CHECK!!!
+
+ //m,n,k is cublas m,n,k
+ size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols());
+ size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows());
+ size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols());
+
+ k = ((k<k1)?k:k1);
+ m = ((m<C.Cols())?m:C.Cols());
+    n = ((n<C.Rows())?n:C.Rows());
+
+#if 0
+ std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA
+ << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB
+ << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC
+ << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl;
+#endif
+
+
+ cublasSgemm(transB, transA, m, n, k,
+ alpha, B.pCUData()+offB, B.Stride(),
+ A.pCUData()+offA, A.Stride(),
+ beta, C.pCUData()+offC, C.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+/*
+
+ template<>
+ void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData(), A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ */
+
+ /**
+   * offsetY tells how many outputs of the 'Ax' multiplication are skipped at the beginning.
+ */
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ // select part of matrix for compute
+ size_t cu_offset = 0;
+ if(cu_trans == 'n') {
+ cu_offset += offsetY;
+ assert(m >= dimY+offsetY);
+ m = dimY;
+ } else {
+ cu_offset += offsetY*A.Stride();
+ assert(n >= dimY+offsetY);
+ n = dimY;
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData()+cu_offset, A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // -> switch x and y
+
+ // A = alpha * x * transpose(y) + A,
+
+ assert(dimX == A.Rows());
+ assert(dimY == A.Cols());
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Dim() % in.Dim() == 0);
+ int n_copies = out.Dim()/in.Dim();
+ CuVector<int> offsets(n_copies);
+ //offsets.SetConst(0); done implicitly!
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK));
+
+ MatrixDim dim_in = { 1, in.Dim(), in.Dim() };
+ MatrixDim dim_out = { 1, out.Dim(), out.Dim() };
+ cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(in.Dim() % out.Dim() == 0);
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Dim(),CUBLOCK);
+
+ MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() };
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim);
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasDgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}
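
OffsetGemm and OffsetGemv above lean on the usual row-major/column-major trick: a row-major buffer handed to CUBLAS is seen as its own transpose, so C = op(A)*op(B) in TNet's row-major world is obtained by asking CUBLAS for C^T = op(B)^T * op(A)^T, i.e. by swapping the A and B arguments while keeping their trans flags. A plain-CPU sketch of the underlying identity, with a small hard-coded matrix for illustration:

    #include <cassert>

    // A is 2x3 row-major: element (r,c) lives at A[r*3 + c].
    // Read column-major with leading dimension 3 (the way CUBLAS would see the same
    // buffer), it is a 3x2 matrix equal to A^T -- which is why the wrappers swap A and B.
    int main() {
      const float A[2 * 3] = { 1, 2, 3,
                               4, 5, 6 };

      auto row_major = [&](int r, int c) { return A[r * 3 + c]; };  // TNet's view
      auto col_major = [&](int r, int c) { return A[c * 3 + r]; };  // CUBLAS's view, ld = 3

      for (int r = 0; r < 2; ++r)
        for (int c = 0; c < 3; ++c)
          assert(row_major(r, c) == col_major(c, r));   // column-major view == transpose
      return 0;
    }
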
diff --git a/src/CuBaseLib/.svn/text-base/cumath.h.svn-base b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base
new file mode 100644
index 0000000..5680082
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base
@@ -0,0 +1,146 @@
+#ifndef _CUMATH_H_
+#define _CUMATH_H_
+
+#include "cumatrix.h"
+
+#include "Timer.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+
+ /**
+ * Group of Math operations for the NN training
+ */
+ template<typename _ElemT>
+ class CuMath
+ {
+ public:
+
+ /// Y = Sigmoid(X)
+ static void Sigmoid(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+ { Error("__func__ Not implemented"); }
+
+    /// Eout = Y(1-Y) .* Ein (sigmoid derivative applied to the back-propagated error)
+ static void DiffSigmoid(CuMatrix<_ElemT>& Eout, const CuMatrix<_ElemT>& Ein, const CuMatrix<_ElemT>& Y)
+ { Error("__func__ Not implemented"); }
+
+ /// Y = Softmax(X)
+ static void Softmax(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+ { Error("__func__ Not implemented"); }
+
+ /// for DCT in FeaCat
+ static void BlockLinearity(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuMatrix<_ElemT>& block_transf)
+ { Error("__func__ Not implemented"); }
+
+ static void Expand(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& frameOffsets)
+ { Error("__func__ Not implemented"); }
+
+    /// i.e. reorder columns according to copyFrom
+ static void Rearrange(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+ { Error("__func__ Not implemented"); }
+
+    /// i.e. reorder rows according to copyFrom
+ static void Randomize(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+ { Error("__func__ Not implemented"); }
+
+ /// check match in the classification for Xentropy
+ static void CheckClass(const CuMatrix<_ElemT>& out, const CuMatrix<_ElemT> &des, CuVector<int>& match)
+ { Error("__func__ Not implemented"); }
+
+ /// gemm with offset for CuSharedLinearity
+ static void OffsetGemm(char transA, char transB, _ElemT alpha, const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, _ElemT beta, CuMatrix<_ElemT>& C, int offA, int offB, int offC)
+ { Error("__func__ Not implemented"); }
+
+ /// gemv with offset for CuRecurrent
+ static void OffsetGemv(char trans, _ElemT alpha, const CuMatrix<_ElemT>& A, const _ElemT* x, size_t dimX, _ElemT beta, _ElemT* y, size_t dimY, size_t offsetY)
+ { Error("__func__ Not implemented"); }
+
+ /// ger for weight updates in CuRecurrent
+ static void BlasGer(_ElemT alpha, const _ElemT* x, size_t dimX, const _ElemT* y, size_t dimY, CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// concatenate one vector several times for CuSharedLinearity
+ static void VecExpand(const CuVector<_ElemT>&in, CuVector<_ElemT>&out)
+ { Error("__func__ Not implemented"); }
+
+ /// sum the vector as if it was matrix data for CuSharedLinearity
+ static void VecAddColSum(_ElemT alpha, const CuVector<_ElemT>&in, _ElemT beta, CuVector<_ElemT>&out)
+ { Error("__func__ Not implemented"); }
+
+ }; //class CuMath::
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X);
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y);
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X);
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf);
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets);
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match);
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC);
+
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY);
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A);
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out);
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out);
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X);
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y);
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X);
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf);
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets);
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match);
+
+}
+
+#endif
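
A short end-to-end sketch of the float specializations declared above (ForwardSketch is hypothetical; the host-side Matrix<float> from KaldiLib is assumed to be filled elsewhere):

    #include "cumath.h"

    using namespace TNet;

    void ForwardSketch(const Matrix<float>& host_in, Matrix<float>& host_out) {
      CuMatrix<float> X, Y;
      X.CopyFrom(host_in);                // host -> device, allocates X as needed
      Y.Init(X.Rows(), X.Cols());         // allocate the output on the device

      CuMath<float>::Sigmoid(Y, X);       // Y = sigmoid(X), element-wise
      CuMath<float>::Softmax(X, Y);       // X = row-wise softmax(Y), reusing X as scratch

      X.CopyTo(host_out);                 // device -> host
    }
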
diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base
new file mode 100644
index 0000000..4e767e3
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base
@@ -0,0 +1,199 @@
+#ifndef _CUMATRIX_H_
+#define _CUMATRIX_H_
+
+#include <sstream>
+
+#include "Matrix.h"
+#include "cukernels.h"
+
+
+
+namespace TNet {
+
+ template<typename _ElemT> class CuVector;
+
+ /**
+ * Matrix for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuMatrix
+ {
+ typedef CuMatrix<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuMatrix<_ElemT>()
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuMatrix<_ElemT>(size_t rows, size_t cols)
+ : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+ { Init(rows, cols); }
+
+ /// Destructor
+ ~CuMatrix()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Rows() const
+ { return mRows; }
+
+ size_t Cols() const
+ { return mCols; }
+
+ size_t Stride() const
+ { return mStride; }
+
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = {
+ static_cast<int>(mRows),
+ static_cast<int>(mCols),
+ static_cast<int>(mStride)
+ };
+ return d;
+ }
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Get raw row pointer
+ const _ElemT* pCURowData(size_t r) const
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+ _ElemT* pCURowData(size_t r)
+ { assert(r < Rows()); return mpCUData+r*mStride; }
+
+ /// Get size of matrix in bytes
+ size_t MSize() const
+ { return mRows*mStride*sizeof(_ElemT); }
+ /// Get size of matrix row in bytes
+ size_t MRowSize() const
+ { return mStride*sizeof(_ElemT); }
+
+ /// Allocate the memory
+ ThisType& Init(size_t rows, size_t cols);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Matrix<_ElemT>& rSrc);
+ Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const;
+
+    /// Copy rowCnt rows from rSrc, starting at row srcOri,
+    /// into the memory block starting at row dstOri
+ void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+    /// Copy colCnt columns from rSrc, starting at column srcOri,
+    /// into the memory block starting at row dstOri
+ void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+
+
+ // Math operations, some calling kernels
+ //
+ void SetZero();
+
+ void SetConst(_ElemT value)
+ { Error("__func__ Not implemented"); }
+
+ void ApplyLog()
+ { Error("__func__ Not implemented"); }
+
+ void ApplyMask(const CuMatrix<BaseFloat>& mask)
+ { Error("__func__ Not implemented"); }
+
+ void ApplyL1(BaseFloat l1)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th column by scale[i]
+ void ScaleCols(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+ /// scale i'th row by scale[i]
+ void ScaleRows(const CuVector<_ElemT>& scale)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * A + beta * B
+ void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+    /// B = alpha * row + beta * B
+ void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// C = alpha * A(^T)*B(^T) + beta * C
+ void Gemm(char transa, char transb,
+ _ElemT alpha,
+ const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B,
+ _ElemT beta)
+ { Error("__func__ Not implemented"); }
+
+ /// A = alpha * x*y^T + A
+ void BlasGer(_ElemT alpha,
+ const CuVector<_ElemT>& x, const CuVector<_ElemT>& y)
+ { Error("__func__ Not implemented"); }
+
+
+    /// Multiply two matrices element-wise: C = A .* C
+ void MulElem(const CuMatrix<_ElemT>& A)
+ { Error("__func__ Not implemented"); }
+
+ /// A = log(A)
+ void LogElem()
+ { Error("__func__ Not implemented"); }
+
+ void Print() const
+ {
+ Matrix<_ElemT> mat(Rows(),Cols());
+ CopyTo(mat);
+ std::cout << mat;
+ }
+
+
+
+ void CheckData()
+ {
+ Matrix<_ElemT> mat;
+ CopyTo(mat);
+ for(size_t i=0; i<Rows(); i++) {
+ for(size_t j=0; j<Cols(); j++) {
+ if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) {
+ std::ostringstream os;
+          os << "Invalid value: " << mat(i,j) << " at row " << i << " col " << j << "\n";
+ Error(os.str());
+ }
+ }
+ }
+ }
+
+
+ private:
+ size_t mRows;
+ size_t mCols;
+ size_t mStride;
+
+ _ElemT* mpCUData;
+
+ };
+
+
+ /// Prints the matrix dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat)
+ {
+ out << "[CUMATRIX R" << mat.Rows() << " C" << mat.Cols() << " S" << mat.Stride()
+ << " PTR" << mat.pCUData() << "]" << std::flush;
+ return out;
+ }
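+
+  /* Editorial usage sketch (not part of the original source), illustrating the
+   * intended host<->device round trip; 'host_in' and 'host_out' are assumed
+   * host-side Matrix<float> objects:
+   *
+   *   CuMatrix<float> gpu;       // no device memory allocated yet
+   *   gpu.CopyFrom(host_in);     // (re)allocates and copies host -> device
+   *   gpu.ApplyLog();            // in-place kernel on the device buffer
+   *   gpu.CopyTo(host_out);      // copies device -> host
+   *
+   * Apart from CopyFrom/CopyTo (and Print/CheckData, which use CopyTo), all
+   * operations stay on the GPU.
+   */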
+
+
+}
+
+
+#include "cumatrix.tcc"
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base
new file mode 100644
index 0000000..4582e8d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base
@@ -0,0 +1,627 @@
+
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+
+#include "Timer.h"
+#include "cucommon.h"
+#include "cuvector.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ Init(size_t rows, size_t cols)
+ {
+ if(mRows == rows && mCols == cols) {
+ //SetZero();
+ return *this;
+ }
+
+ Destroy();
+
+ size_t row_bytes = cols * sizeof(_ElemT);
+ size_t pitch;
+ cuSafeCall(cudaMallocPitch((void**)&mpCUData, &pitch, row_bytes, rows));
+ mRows = rows; mCols = cols;
+ mStride = pitch/sizeof(_ElemT);
+ SetZero();
+
+ return *this;
+ }
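+
+  // Editorial note (not part of the original source): cudaMallocPitch may pad
+  // each row for alignment, so mStride >= cols.  For example, 100 float columns
+  // occupy 400 bytes, but the driver may return a larger, device-dependent pitch
+  // (e.g. 512 bytes, giving mStride == 128).  Element (r,c) is then addressed as
+  // mpCUData[r*mStride + c], which is what pCURowData() and the kernels'
+  // MatrixDim::stride rely on.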
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ Destroy()
+ {
+ if(NULL != mpCUData) {
+ cuSafeCall(cudaFree(mpCUData));
+ mpCUData = NULL;
+ }
+ mRows = mCols = mStride = 0;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const CuMatrix<_ElemT>& rSrc)
+ {
+ Init(rSrc.Rows(),rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pCUData(), src_pitch, width, rSrc.Rows(), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromD2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const Matrix<_ElemT>& rSrc)
+ {
+ Init(rSrc.Rows(),rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pData(), src_pitch, width, rSrc.Rows(), cudaMemcpyHostToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromH2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyTo(Matrix<_ElemT>& rDst) const
+ {
+ if(rDst.Rows() != Rows() || rDst.Cols() != Cols()) {
+ rDst.Init(Rows(),Cols());
+ }
+
+ Timer tim; tim.Start();
+
+ size_t src_pitch = mStride*sizeof(_ElemT);
+ size_t dst_pitch = rDst.Stride()*sizeof(_ElemT);
+ size_t width = Cols()*sizeof(_ElemT);
+ cuSafeCall(cudaMemcpy2D(rDst.pData(), dst_pitch, pCUData(), src_pitch, width, Rows(), cudaMemcpyDeviceToHost));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToD2H",tim.Val());
+
+ return rDst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+ assert(rowCnt+srcOri <= rSrc.Rows());
+ assert(rowCnt+dstOri <= Rows());
+ assert(Cols() == rSrc.Cols());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = rSrc.Cols()*sizeof(_ElemT);
+
+ const _ElemT* p_src = rSrc.pCUData() + srcOri*rSrc.Stride();
+ _ElemT* p_dst = mpCUData + dstOri*mStride;
+
+ cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, rowCnt, cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",tim.Val());
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+ assert(colCnt+srcOri <= rSrc.Cols());
+ assert(colCnt+dstOri <= Cols());
+ assert(Rows() == rSrc.Rows());
+
+ Timer tim; tim.Start();
+
+ size_t dst_pitch = mStride*sizeof(_ElemT);
+ size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+ size_t width = colCnt*sizeof(_ElemT);
+
+ const _ElemT* p_src = rSrc.pCUData() + srcOri;
+ _ElemT* p_dst = mpCUData + dstOri;
+
+ cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, Rows(), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::CopyColsD2D",tim.Val());
+
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ SetZero()
+ {
+ Timer tim; tim.Start();
+ cuSafeCall(cudaMemset(mpCUData, 0, mRows*mStride*sizeof(_ElemT)));
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero",tim.Val());
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuMatrix:: template specializations (float)
+ ////
+ template<>
+ inline void CuMatrix<float>::SetConst(float value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
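+    // (Editorial comment, not in the original source.)  The same launch geometry
+    // is used by all element-wise wrappers below: 16x16 (CUBLOCK x CUBLOCK)
+    // thread blocks, grid.x covering columns and grid.y covering rows; the
+    // kernels are expected to guard the ragged edge themselves (cf. the
+    // "if(i < d.cols && j < d.rows)" test in curandkernels.cu).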
+
+ cudaF_set_const(dimGrid,dimBlock,mpCUData,value,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyLog()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_log(dimGrid,dimBlock,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyMask(const CuMatrix<BaseFloat>& mask)
+ {
+ Timer tim; tim.Start();
+
+ assert(mask.Rows() == Rows());
+ assert(mask.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_mask(dimGrid,dimBlock,mpCUData,mask.pCUData(),Dim(),mask.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ApplyL1(float l1)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_apply_l1(dimGrid,dimBlock,mpCUData,l1,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::ScaleCols(const CuVector<float>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::ScaleRows(const CuVector<float>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::AddScaled(float alpha, const CuMatrix<float>& A, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(A.Rows() == Rows());
+ assert(A.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::AddScaledRow(float alpha, const CuVector<float>& row, float beta)
+ {
+ Timer tim; tim.Start();
+
+ if(row.Dim() != Cols()) {
+ std::ostringstream os;
+ os << "Non matching dimensions: Cols:" << Cols() << " VectorDim:" << row.Dim();
+ Error(os.str());
+ }
+ assert(row.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::Gemm(char transa, char transb,
+ float alpha,
+ const CuMatrix<float>& A, const CuMatrix<float>& B,
+ float beta)
+ {
+ // CUBLAS is col major, TNet is row major
+    // keep trans..., just swap the A and B arguments: A->B B->A
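+    //
+    // (Editorial derivation, not in the original source.)  A row-major matrix
+    // reinterpreted as column-major is its transpose, so the row-major product
+    // C = op(A)*op(B) is exactly the column-major product C^T = op(B)^T*op(A)^T.
+    // Hence the call below passes the trans flags and operands in swapped order
+    // (transb/B first), with m/n/k measured on the swapped operands, so that
+    // cublasSgemm writes a correctly laid-out row-major C into mpCUData.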
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+
+ #if 0
+ //DEBUG MESSAGE
+    std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+                  alpha << " " << B << " " << B.Stride() << " " <<
+                  A << " " << A.Stride() << " " << beta << " " << *this << " " <<
+                  Stride() << "\n" << std::flush;
+ #endif
+
+ Timer tim; tim.Start();
+
+ cublasSgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::BlasGer(float alpha,
+ const CuVector<float>& x, const CuVector<float>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
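+    //
+    // (Editorial note, not in the original source.)  In row-major terms this is
+    // A += alpha * x * y^T.  Column-major CUBLAS sees the buffer as A^T, and
+    // A^T += alpha * y * x^T is the same update, hence y is passed as the first
+    // vector of cublasSger below.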
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+
+ Timer tim; tim.Start();
+
+ cublasSger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<float>::MulElem(const CuMatrix<float>& A)
+ {
+ Timer tim; tim.Start();
+
+ assert(mCols == A.Cols());
+ assert(mRows == A.Rows());
+ assert(mStride == A.Stride());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<float>::LogElem()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaF_log_elem(dimGrid,dimBlock,mpCUData, Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuMatrix:: template specializations (double)
+ ////
+ template<>
+ inline void CuMatrix<double>::SetConst(double value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_set_const(dimGrid,dimBlock,mpCUData,value,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::ApplyLog()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_apply_log(dimGrid,dimBlock,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::ScaleCols(const CuVector<double>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_scale_cols(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::ScaleRows(const CuVector<double>& scale)
+ {
+ Timer tim; tim.Start();
+
+ assert(scale.Dim() == Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_scale_rows(dimGrid,dimBlock,mpCUData,scale.pCUData(),Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::AddScaled(double alpha, const CuMatrix<double>& A, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(A.Rows() == Rows());
+ assert(A.Cols() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_add_scaled(dimGrid,dimBlock,alpha,A.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::AddScaledRow(double alpha, const CuVector<double>& row, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(row.Dim() == Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ inline void CuMatrix<double>::Gemm(char transa, char transb,
+ double alpha,
+ const CuMatrix<double>& A, const CuMatrix<double>& B,
+ double beta)
+ {
+ // CUBLAS is col major, TNet is row major
+    // keep trans..., just swap the A and B arguments: A->B B->A
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+
+ #if 0
+ //DEBUG MESSAGE
+    std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+                  alpha << " " << B << " " << B.Stride() << " " <<
+                  A << " " << A.Stride() << " " << beta << " " << *this << " " <<
+                  Stride() << "\n" << std::flush;
+ #endif
+
+ Timer tim; tim.Start();
+
+ cublasDgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ inline void CuMatrix<double>::BlasGer(double alpha,
+ const CuVector<double>& x, const CuVector<double>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+
+ Timer tim; tim.Start();
+
+ cublasDger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ inline void CuMatrix<double>::MulElem(const CuMatrix<double>& A)
+ {
+ Timer tim; tim.Start();
+
+ assert(mCols == A.Cols());
+ assert(mRows == A.Rows());
+ assert(mStride == A.Stride());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_mul_elem(dimGrid,dimBlock,mpCUData, A.pCUData(), Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuMatrix<double>::LogElem()
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+
+ cudaD_log_elem(dimGrid,dimBlock,mpCUData, Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+}
diff --git a/src/CuBaseLib/.svn/text-base/curand.h.svn-base b/src/CuBaseLib/.svn/text-base/curand.h.svn-base
new file mode 100644
index 0000000..8aa66d5
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curand.h.svn-base
@@ -0,0 +1,40 @@
+#ifndef _CU_RAND_H_
+#define _CU_RAND_H_
+
+
+#include "cumatrix.h"
+
+
+namespace TNet {
+
+ template<typename T>
+ class CuRand {
+ public:
+
+ CuRand(size_t rows, size_t cols)
+ { SeedGpu(rows,cols); }
+
+ ~CuRand() { }
+
+ void SeedGpu(size_t rows, size_t cols);
+ void Rand(CuMatrix<T>& tgt);
+ void GaussRand(CuMatrix<T>& tgt);
+
+ void BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states);
+ void AddGaussNoise(CuMatrix<T>& tgt, T gscale = 1.0);
+
+ private:
+ static void SeedRandom(Matrix<unsigned>& mat);
+
+ private:
+ CuMatrix<unsigned> z1, z2, z3, z4;
+ CuMatrix<T> tmp;
+ };
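+
+  /* Editorial usage sketch (not part of the original source).  The constructor
+   * dimensions fix the size of the per-element generator state z1..z4, and the
+   * fill methods resize their target to that size:
+   *
+   *   CuRand<float> rng(rows, cols);   // seeds four state matrices on the GPU
+   *   CuMatrix<float> m;
+   *   rng.Rand(m);                     // uniform (0,1) draws
+   *   rng.GaussRand(m);                // N(0,1) draws via Box-Muller
+   *   rng.AddGaussNoise(m, 0.1f);      // m += 0.1 * N(0,1) noise
+   */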
+
+}
+
+
+#include "curand.tcc"
+
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base
new file mode 100644
index 0000000..e337189
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base
@@ -0,0 +1,228 @@
+
+#include <cstdlib>
+#include "curandkernels.h"
+
+
+namespace TNet {
+
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedGpu(size_t rows, size_t cols)
+ {
+ Matrix<unsigned> mat(rows,cols);
+ SeedRandom(mat);
+ z1.CopyFrom(mat);
+ SeedRandom(mat);
+ z2.CopyFrom(mat);
+ SeedRandom(mat);
+ z3.CopyFrom(mat);
+ SeedRandom(mat);
+ z4.CopyFrom(mat);
+
+ /*
+ std::cout << "RANDININIT" << std::endl;
+ z1.Print();
+ z2.Print();
+ z3.Print();
+ z4.Print();
+ std::cout << "RANDININIT" << std::endl;
+ */
+
+ tmp.Init(rows,cols);
+ }
+
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedRandom(Matrix<unsigned>& mat) {
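+    // (Editorial comment, not in the original source.)  The rejection loop below
+    // only accepts seeds > 128: the Tausworthe components of the hybrid generator
+    // in curandkernels.cu mask away their low bits, so very small seeds would
+    // produce degenerate streams (cf. GPU Gems 3, ch. 37).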
+ for(size_t j=0; j<mat.Rows(); j++) {
+ for(size_t i=0; i<mat.Cols(); i++) {
+ unsigned value = 0;
+ while(value <= 128) { value = lrand48(); }
+ mat(j,i) = value;
+ }
+ }
+ }
+
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ AddGaussNoise(CuMatrix<T>& tgt, T gscale)
+ {
+ GaussRand(tmp);
+ tgt.AddScaled(gscale,tmp,1.0);
+ }
+
+
+
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////////
+  //// generic template wrappers: not implemented, only the float/double specializations below are valid
+ template<typename T>
+ inline void
+ CuRand<T>::
+ Rand(CuMatrix<T>& tgt)
+ { Error("Unimplemented"); }
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ GaussRand(CuMatrix<T>& tgt)
+ { Error("Unimplemented"); }
+
+ template<typename T>
+ inline void
+ CuRand<T>::
+ BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states)
+ { Error("Unimplemented"); }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ //// float specializations
+ template<>
+ inline void
+ CuRand<float>::
+ Rand(CuMatrix<float>& tgt)
+ {
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaF_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<float>::
+ GaussRand(CuMatrix<float>& tgt)
+ {
+
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaF_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<float>::
+ BinarizeProbs(const CuMatrix<float>& probs, CuMatrix<float>& states)
+ {
+ if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) {
+ Error("Non matching dims!!");
+ }
+
+ states.Init(z1.Rows(),z1.Cols());
+ Rand(tmp);
+
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK));
+
+ cudaF_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////
+ //// double specializations
+ template<>
+ inline void
+ CuRand<double>::
+ Rand(CuMatrix<double>& tgt)
+ {
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaD_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<double>::
+ GaussRand(CuMatrix<double>& tgt)
+ {
+
+ Timer tim; tim.Start();
+
+ tgt.Init(z1.Rows(), z1.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(),CUBLOCK));
+
+ cudaD_gauss_rand(dimGrid,dimBlock,tgt.pCUData(), z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),tgt.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void
+ CuRand<double>::
+ BinarizeProbs(const CuMatrix<double>& probs, CuMatrix<double>& states)
+ {
+ if(probs.Rows() != z1.Rows() || probs.Cols() != z1.Cols()) {
+ Error("Non matching dims!!");
+ }
+
+ states.Init(z1.Rows(),z1.Cols());
+ Rand(tmp);
+
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(),CUBLOCK));
+
+ cudaD_binarize_probs(dimGrid,dimBlock,states.pCUData(), probs.pCUData(), tmp.pCUData(),states.Dim());
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+}
diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base
new file mode 100644
index 0000000..7e1c8dd
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curandkernels.cu.svn-base
@@ -0,0 +1,135 @@
+
+#include "curandkernels.h"
+
+
+
+//
+//Hybrid Tausworthe/LCG random number generator
+//
+//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html
+
+
+// S1, S2, S3, and M are all constants, and z is part of the
+// private per-thread generator state.
+__device__
+static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M)
+{
+ unsigned b=(((z << S1) ^ z) >> S2);
+ return z = (((z & M) << S3) ^ b);
+}
+
+// A and C are constants
+__device__
+static unsigned LCGStep(unsigned &z, unsigned A, unsigned C)
+{
+ return z=(A*z+C);
+}
+
+template<typename T>
+__device__
+static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
+ // Combined period is lcm(p1,p2,p3,p4)~ 2^121
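+  // (Editorial note: 2.3283064365387e-10 ~= 2^-32, mapping the XOR-combined
+  //  32-bit word to (0,1); the do/while below rejects draws that fall outside
+  //  the open interval.)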
+ T randval;
+ do {
+ randval = 2.3283064365387e-10 * ( // Periods
+ TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1
+ TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1
+ TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1
+ LCGStep(z4, 1664525, 1013904223UL) // p4=2^32
+ );
+ } while (!(randval > 0.0 && randval < 1.0));
+ return randval;
+}
+
+
+
+
+template<typename T>
+__global__
+static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+/*
+float2 BoxMuller()
+{
+ float u0=HybridTaus (), u1=HybridTaus ();
+ float r=sqrt(-2 log(u0));
+ float theta=2*PI*u1;
+ return make_float2(r*sin(theta),r*cos(theta));
+}
+*/
+
+template<typename T>
+__device__
+static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+{
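+  // (Editorial note, not in the original source.)  Standard Box-Muller transform:
+  // two uniform draws give a radius and an angle; only the sine branch is
+  // returned, the paired cosine variate is discarded.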
+ const T M_2PI = 6.283185307179586476925286766558;
+
+ T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4);
+ T r = sqrt(-2.0 * log(u0));
+ T theta = M_2PI * u1;
+ return r*sin(theta);
+
+}
+
+
+template<typename T>
+__global__
+static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+}
+
+
+template<typename T>
+__global__
+static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d)
+{
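+  // (Editorial note.)  Bernoulli sampling: with 'rand' holding uniform (0,1)
+  // draws, each state becomes 1 with probability probs[index], else 0.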
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ states[index] = ((probs[index] > rand[index])? 1.0 : 0.0);
+ }
+}
+
+
+
+/************
+ * :FLOAT:
+ */
+void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
+
+/************
+ * :DOUBLE:
+ */
+void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }
+
+void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d)
+{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }
+
diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base
new file mode 100644
index 0000000..69b589f
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base
@@ -0,0 +1,34 @@
+#ifndef _cuda_rand_kernels_h_
+#define _cuda_rand_kernels_h_
+
+
+#include "cukernels.h"
+
+
+extern "C" {
+ //**************
+ //float
+ //
+ void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d);
+
+ //**************
+ //double
+ //
+ void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+
+
+ void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d);
+
+}
+
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base
new file mode 100644
index 0000000..945565a
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base
@@ -0,0 +1,104 @@
+#ifndef _CUVECTOR_H_
+#define _CUVECTOR_H_
+
+#include "Vector.h"
+
+namespace TNet {
+
+ template<typename _ElemT> class CuMatrix;
+
+ /**
+   * Vector for CUDA computing
+ */
+ template<typename _ElemT>
+ class CuVector
+ {
+ typedef CuVector<_ElemT> ThisType;
+
+ public:
+
+ /// Default Constructor
+ CuVector<_ElemT>()
+ : mDim(0), mpCUData(NULL)
+ { }
+ /// Constructor with memory initialisation
+ CuVector<_ElemT>(size_t dim)
+ : mDim(0), mpCUData(NULL)
+ { Init(dim); }
+
+ /// Destructor
+ ~CuVector()
+ { Destroy(); }
+
+ /// Dimensions
+ size_t Dim() const
+ { return mDim; }
+
+ /*
+ ::MatrixDim Dim() const
+ { ::MatrixDim d = { mDim, 1, 1 }; return d; }
+ */
+
+ /// Get raw pointer
+ const _ElemT* pCUData() const
+ { return mpCUData; }
+ _ElemT* pCUData()
+ { return mpCUData; }
+
+ /// Allocate the memory
+ ThisType& Init(size_t dim);
+
+ /// Deallocate the memory
+ void Destroy();
+
+ /// Copy functions (reallocates when needed)
+ ThisType& CopyFrom(const CuVector<_ElemT>& rSrc);
+ ThisType& CopyFrom(const Vector<_ElemT>& rSrc);
+ Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const;
+
+
+
+ // Math operations
+ //
+ void SetZero();
+
+    void SetConst(_ElemT value)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+    void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+    void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta)
+    { Error(std::string(__func__) + " Not implemented"); }
+
+ void Print() const
+ {
+ Vector<_ElemT> vec(Dim());
+ CopyTo(vec);
+ std::cout << vec << "\n";
+ }
+
+
+ private:
+ size_t mDim;
+ _ElemT* mpCUData;
+ };
+
+
+ /// Prints the matrix dimensions and pointer to stream
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec)
+ {
+ size_t d = vec.Dim();
+ out << "[CuVector D" << d
+ << " PTR" << vec.pCUData() << "]" << std::flush;
+ return out;
+ }
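+
+  /* Editorial usage sketch (not part of the original source), assuming AddColSum
+   * computes v = alpha * (column sums of mat) + beta * v, as the float
+   * specialization's kernel names suggest:
+   *
+   *   CuMatrix<float> m;              // filled elsewhere, R rows x C cols
+   *   CuVector<float> v(m.Cols());    // allocated and zeroed on the device
+   *   v.AddColSum(1.0f, m, 0.0f);     // v[j] = sum over rows i of m(i,j)
+   */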
+
+
+}
+
+
+#include "cuvector.tcc"
+
+#endif
diff --git a/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base
new file mode 100644
index 0000000..0107859
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base
@@ -0,0 +1,254 @@
+
+#include <cuda_runtime_api.h>
+
+#include "Timer.h"
+#include "cucommon.h"
+#include "cumatrix.h"
+#include "cudevice.h"
+
+namespace TNet {
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ Init(size_t dim)
+ {
+ if(mDim == dim) {
+ //SetZero();
+ return *this;
+ }
+
+ Destroy();
+
+ cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT)));
+ mDim = dim;
+ SetZero();
+
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ Destroy()
+ {
+ if(NULL != mpCUData) {
+ cuSafeCall(cudaFree(mpCUData));
+ mpCUData = NULL;
+ }
+ mDim = 0;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const CuVector<_ElemT>& rSrc)
+ {
+ Init(rSrc.Dim());
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const Vector<_ElemT>& rSrc)
+ {
+ Init(rSrc.Dim());
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyHostToDevice));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val());
+ return *this;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyTo(Vector<_ElemT>& rDst) const
+ {
+ if(rDst.Dim() != mDim) {
+ rDst.Init(mDim);
+ }
+
+ Timer tim; tim.Start();
+
+ cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), mDim*sizeof(_ElemT), cudaMemcpyDeviceToHost));
+
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val());
+
+ return rDst;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ SetZero()
+ {
+ Timer tim; tim.Start();
+ cuSafeCall(cudaMemset(mpCUData, 0, mDim*sizeof(_ElemT)));
+ tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val());
+ }
+
+
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuVector:: template specializations (float)
+ ////
+ template<>
+ inline void CuVector<float>::SetConst(float value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaF_set_const(dimGrid,dimBlock,mpCUData,value,d);
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(vec.Dim() == Dim());
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaF_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(mat.Cols() == Dim());
+
+ /**
+ * Rows()<=512 limit due to limited shared memory
+ * Cols()<=256 limit due to coalesced memory alignment:
+ * matrices with huge strides have slow access!!!
+ */
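+    // (Editorial comment, not in the original source.)  Hence the dispatch below:
+    // within those limits the reduce kernel runs one block per output element
+    // with Rows() threads; otherwise the plain cudaF_add_col_sum path falls back
+    // to a 1-D grid over the output dimension.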
+ if(mat.Rows() > 512 || mat.Cols() > 256) {
+ size_t dimBlock = CUBLOCK*2;
+ size_t dimGrid = n_blocks(Dim(),CUBLOCK*2);
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(mat.Rows(),1);
+ dim3 dimGrid(1,Dim());
+
+ cudaF_add_col_sum_reduce(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ ////////////////////////////////////////////////////////////////////////
+  //// CuVector:: template specializations (double)
+ ////
+ template<>
+ inline void CuVector<double>::SetConst(double value)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaD_set_const(dimGrid,dimBlock,mpCUData,value,d);
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(vec.Dim() == Dim());
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(Dim(), CUBLOCK));
+ ::MatrixDim d = { 1, Dim(), Dim() };
+
+ cudaD_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta)
+ {
+ Timer tim; tim.Start();
+
+ assert(mat.Cols() == Dim());
+
+ size_t dimBlock = CUBLOCK*2;
+ size_t dimGrid = n_blocks(Dim(),CUBLOCK*2);
+
+ cudaD_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+ cuSafeCall(cudaGetLastError());
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}
+
+
+