path: root/src/CuBaseLib/.svn/text-base
diff options
authorJoe Zhao <>2014-04-14 08:14:45 +0800
committerJoe Zhao <>2014-04-14 08:14:45 +0800
commitcccccbf6cca94a3eaf813b4468453160e91c332b (patch)
tree23418cb73a10ae3b0688681a7f0ba9b06424583e /src/CuBaseLib/.svn/text-base
First commit
Diffstat (limited to 'src/CuBaseLib/.svn/text-base')
16 files changed, 3361 insertions, 0 deletions
diff --git a/src/CuBaseLib/.svn/text-base/Makefile.svn-base b/src/CuBaseLib/.svn/text-base/Makefile.svn-base
new file mode 100644
index 0000000..b574c4a
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/Makefile.svn-base
@@ -0,0 +1,59 @@
+include ../
+INCLUDE = -I. -I../ -I../KaldiLib
+CUSRC=$(wildcard *.cu)
+CUOBJ=$(patsubst, %.o, $(CUSRC))
+CUDA_FLAGS = -g -Xcompiler -fPIC --verbose
+ifeq ($(BITS64), true)
+ CUDA_FLAGS += --machine 64
+ BUT_FORCE_GCC64 = ln -s `which x86_64-linux-gcc` $(PWD)/gcc
+ BUT_UNLINK_GCC64 = unlink $(PWD)/gcc
+ CUDA_FLAGS += --machine 32
+ifeq ($(DOUBLEPRECISION), true)
+ CUDA_FLAGS += --gpu-architecture compute_13 --gpu-code sm_13
+all : libCuBase.a
+libCuBase.a : $(CUOBJ) $(OBJ)
+ $(AR) ruv $@ $?
+ $(RANLIB) $@
+%.o :
+ export PATH=$(PWD):$(CUDA_BIN):$(PATH); $(CUDA_BIN)/nvcc -c $< -o $@ -I. $(CUDA_INCLUDE) $(CUDA_FLAGS)
+%.o :
+ $(CXX) -c $< -o $@ $(CXXFLAGS) $(CUDA_INCLUDE) $(INCLUDE)
+.PHONY: clean depend
+clean :
+ rm -f *.o *.a
diff --git a/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base
new file mode 100644
index 0000000..6dc7e94
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cucommon.h.svn-base
@@ -0,0 +1,46 @@
+#ifndef _CUCOMMON_H_
+#define _CUCOMMON_H_
+#include <iostream>
+#include <sstream>
+#include <cuda_runtime_api.h>
+#include "Error.h"
+#define cuSafeCall(fun) /* Evaluate 'fun' once; a non-zero result is turned into a MyException carrying file, line, function, the call text and the CUDA error string. */ \
+{ \
+ int ret; \
+ if((ret = (fun)) != 0) { \
+ std::ostringstream os; \
+ os << "CUDA ERROR #" << ret << " " << __FILE__ ":" << __LINE__ << " " << __func__ << "()" << " '" << #fun << "' " << cudaGetErrorString((cudaError_t)ret); \
+ throw(MyException(os.str())); \
+ } \
+ cudaThreadSynchronize(); /* reached only when 'fun' succeeded; its own return value is not checked */ \
+namespace TNet {
+ /** The size of edge of CUDA square block **/
+ static const int CUBLOCK = 16;
+ /** Number of blocks of 'block_size' items that a task of 'size' items is split into;
+  *  a non-zero remainder adds one more block. **/
+ inline int n_blocks(int size, int block_size)
+ {
+   const int full_blocks = size / block_size;
+   const bool has_remainder = (size % block_size != 0);
+   return has_remainder ? full_blocks + 1 : full_blocks;
+ }
+ /** Stream a dim3 in the form "[x,y,z]" **/
+ inline std::ostream& operator<<(std::ostream& os, dim3 arr) {
+   return os << "[" << arr.x << "," << arr.y << "," << arr.z << "]";
+ }
diff --git a/src/CuBaseLib/.svn/text-base/ b/src/CuBaseLib/.svn/text-base/
new file mode 100644
index 0000000..90c5bf3
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/
@@ -0,0 +1,129 @@
+#include <cudevice.h>
+#include <cublas.h>
+#include <cuda.h>
+//DEBUG: Just make sure it compiles...
+#include "cumatrix.h"
+#include "cuvector.h"
+#include "cumath.h"
+template class TNet::CuMatrix<float>;
+template class TNet::CuVector<float>;
+template class TNet::CuMath<float>;
+namespace TNet {
+ /**********************************************************************************
+ * CuDevice::
+ */
+ /**
+  * Construct the singleton: count the CUDA devices, pick one automatically when
+  * several are available (the one with the highest free/total memory ratio)
+  * and initialize CUBLAS. Without any device only a warning is issued and
+  * mIsPresent stays false.
+  */
+ CuDevice::
+ CuDevice()
+ : mIsPresent(false), mVerbose(false)
+ {
+   //get number of devices (N_GPU stays 0 when the query fails)
+   int N_GPU = 0;
+   cudaGetDeviceCount(&N_GPU);
+   //select device if more than one
+   if(N_GPU > 1) {
+     size_t free, total;
+     std::vector<float> free_mem_ratio;
+     //get ratios of memory use
+     std::cout << "Selecting from " << N_GPU << " GPUs\n";
+     for(int n=0; n<N_GPU; n++) {
+       //zero-filled: the name query is not checked, so a failed query
+       //prints an empty name instead of indeterminate bytes
+       char name[128] = {0};
+       std::cout << "cudaSetDevice(" << n << "): ";
+       cuSafeCall(cudaSetDevice(n));//context created by cuSafeCall(...)
+       cuDeviceGetName(name,sizeof(name),n);
+       std::cout << name << "\t";
+       cuSafeCall(cuMemGetInfo(&free,&total));
+       std::cout << "free: " << free/1024/1024 << "M, "
+                 << "total: "<< total/1024/1024 << "M, "
+                 << "ratio: "<< free/(float)total << "\n";
+       free_mem_ratio.push_back(free/(float)total);
+       cudaThreadExit();//destroy context
+     }
+     //find GPU with the highest free/total memory ratio
+     int max_id=0;
+     for(size_t n=1; n<free_mem_ratio.size(); n++) {
+       if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=static_cast<int>(n);
+     }
+     std::cout << "Selected device: " << max_id << " (automatically)\n";
+     cuSafeCall(cudaSetDevice(max_id));
+   }
+   if(N_GPU > 0) {
+     //initialize the CUBLAS
+     cuSafeCall(cublasInit());
+     mIsPresent = true;
+   } else {
+     Warning("No CUDA enabled GPU is present!");
+   }
+ }
+ /**
+  * Shut CUBLAS down when a device was initialized and, in verbose mode, print
+  * the accumulated profile. cuSafeCall() throws on failure; the exception is
+  * caught here because the only instance is a static object, and an exception
+  * escaping its destructor at program exit would end in std::terminate().
+  */
+ CuDevice::
+ ~CuDevice()
+ {
+   if(mIsPresent) {
+     bool released = true;
+     try {
+       cuSafeCall(cublasShutdown());
+     } catch(...) {
+       released = false;
+       Warning("CUBLAS shutdown failed!");
+     }
+     if(mVerbose) {
+       if(released) {
+         TraceLog("CUBLAS released");
+       }
+       PrintProfile();
+     }
+   } else {
+     Warning("No CUDA enabled GPU was present!");
+   }
+ }
+ /**
+  * Switch to the GPU with ordinal 'gpu_id': CUBLAS is shut down, the current
+  * context is destroyed, the new device is selected and CUBLAS is initialized
+  * again. An id outside [0, number of devices) is reported through KALDI_ERR.
+  */
+ void
+ CuDevice::
+ SelectGPU(int gpu_id)
+ {
+   //get number of devices
+   int N_GPU = 0;
+   cudaGetDeviceCount(&N_GPU);
+   //a negative id is as invalid as an id past the last device
+   if(gpu_id < 0 || gpu_id >= N_GPU) {
+     KALDI_ERR << "Cannot select GPU " << gpu_id
+               << ", detected " << N_GPU << " CUDA capable cards!";
+   }
+   //release old card
+   cuSafeCall(cublasShutdown());
+   cudaThreadExit();
+   //select new card
+   cuSafeCall(cudaSetDevice(gpu_id));
+   //initialize CUBLAS
+   cuSafeCall(cublasInit());
+   std::cout << "Selected device " << gpu_id << " (manually)\n";
+ }
+ /**
+  * Describe the device memory usage reported by cuMemGetInfo() as
+  * "Free:<n>MB Used:<n>MB Total:<n>MB".
+  */
+ std::string
+ CuDevice::
+ GetFreeMemory()
+ {
+   //zero-initialized: the result of cuMemGetInfo() is not checked, so a
+   //failed query must not leave indeterminate values to be formatted
+   size_t mem_free = 0, mem_total = 0;
+   cuMemGetInfo(&mem_free, &mem_total);
+   std::ostringstream os;
+   os << "Free:" << mem_free/(1024*1024) << "MB "
+      << "Used:" << (mem_total-mem_free)/(1024*1024) << "MB "
+      << "Total:" << mem_total/(1024*1024) << "MB";
+   return os.str();
+ }
+ ////////////////////////////////////////////////
+ // Instance of the static singleton
+ //
+ CuDevice CuDevice::msDevice;
+ //
+ ////////////////////////////////////////////////
diff --git a/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base
new file mode 100644
index 0000000..c5eeb7b
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cudevice.h.svn-base
@@ -0,0 +1,79 @@
+#ifndef _CUDEVICE_H_
+#define _CUDEVICE_H_
+#include <map>
+#include <string>
+#include <iostream>
+namespace TNet {
+ /**
+  * Singleton object which represents the CUDA device.
+  * Its constructor and destructor are declared here and defined elsewhere;
+  * the class also accumulates named profiling times.
+  */
+ class CuDevice
+ {
+   // Singleton interface: construction and copying are private,
+   // the single instance is reached through Instantiate()
+   private:
+     CuDevice();
+     CuDevice(CuDevice&);
+     CuDevice& operator=(CuDevice&);
+   public:
+     ~CuDevice();
+     static CuDevice& Instantiate()
+     {
+       return msDevice;
+     }
+   private:
+     static CuDevice msDevice;
+   /**********************************/
+   // Instance interface
+   public:
+     void SelectGPU(int gpu_id);
+     /// Whether a CUDA device is present in the system
+     bool IsPresent()
+     {
+       return mIsPresent;
+     }
+     /// Store the verbosity flag
+     void Verbose(bool verbose)
+     {
+       mVerbose = verbose;
+     }
+     /// Add 'time' to the total accumulated under 'key'
+     void AccuProfile(const std::string& key,double time)
+     {
+       // operator[] inserts a missing key with the value 0.0
+       mProfileMap[key] += time;
+     }
+     /// Print every accumulated (key, seconds) pair to std::cout
+     void PrintProfile()
+     {
+       std::cout << "[cudevice profile]\n";
+       typedef std::map<std::string, double>::iterator ProfileIter;
+       for(ProfileIter entry = mProfileMap.begin(); entry != mProfileMap.end(); ++entry) {
+         std::cout << entry->first << "\t" << entry->second << "s\n";
+       }
+     }
+     /// Forget all accumulated times
+     void ResetProfile()
+     {
+       mProfileMap.clear();
+     }
+     std::string GetFreeMemory();
+   private:
+     std::map<std::string, double> mProfileMap;
+     bool mIsPresent;
+     bool mVerbose;
+ }; //class CuDevice
diff --git a/src/CuBaseLib/.svn/text-base/ b/src/CuBaseLib/.svn/text-base/
new file mode 100644
index 0000000..d6f866d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/
@@ -0,0 +1,626 @@
+#include <cfloat>
+#include "cukernels.h"
+ * CUDA kernels
+ */
+template<typename T> // Kernel (launched by cudaF_/cudaD_set_const): writes 'value' into every element of 'mat'.
+static void _set_const(T* mat, T value, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row handled by this thread
+ int index = i + j*d.stride; // d.stride is the element offset between consecutive rows
+ if ( i < d.cols && j < d.rows ) // threads that fall outside the matrix do nothing
+ mat[index] = value;
+template<typename T> // Kernel: replaces every element of 'mat' by its logarithm, in place.
+static void _apply_log(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = log(mat[index]); // the input is not clamped here (unlike _log_elem, which floors it at FLT_MIN)
+template<typename T> // Kernel: zeroes the elements of 'mat' whose corresponding 'mask' element equals 0.
+static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*dmat.stride; // position in 'mat'
+ int index2 = i + j*dmask.stride; // same (row, column) in 'mask', addressed with the mask's own stride
+ if ( i < dmat.cols && j < dmat.rows ) // bounds are checked against 'mat' only
+ if(mask[index2] == 0) mat[index] = 0;
+template<typename T> // Kernel: shrinks every element of 'mat' toward zero by 'l1'; elements with magnitude below 'l1' become 0.
+static void _apply_l1(T* mat, T l1, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ T value = mat[index];
+ T tgt;
+ if(abs(value) < l1) {
+ tgt = 0;
+ } else {
+ tgt = (value > 0?value-l1:value+l1); // move by l1 in the direction of zero
+ }
+ mat[index] = tgt;
+ }
+template<typename T> // Kernel: multiplies every element of column i by scale[i].
+static void _scale_cols(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[i]; // 'scale' is read at indices 0..d.cols-1
+template<typename T> // Kernel: multiplies every element of row j by scale[j].
+static void _scale_rows(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[j]; // 'scale' is read at indices 0..d.rows-1
+template<typename T> // Kernel: dst = alpha*A + beta*dst, element by element.
+static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride; // the same index is used for 'A' and 'dst', i.e. both are addressed with d.stride
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*A[index] + beta*dst[index];
+template<typename T> // Kernel: adds alpha*row[i] to every row of 'dst' after scaling 'dst' by beta.
+static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride;
+#if 0 // shared-memory variant, kept for reference only (see the note on the next line)
+ //this does not accelerate :(
+ __shared__ T aux[16];
+ if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i];
+ __syncthreads();
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*aux[threadIdx.x] + beta*dst[index];
+ if ( i < d.cols && j < d.rows ) // direct variant: 'row' is read straight from the pointer passed in
+ dst[index] = alpha*row[i] + beta*dst[index];
+template<typename T> // Kernel: element-wise product, mat = mat * A.
+static void _mul_elem(T* mat, const T* A, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride; // used for both 'mat' and 'A'
+ if ( i < d.cols && j < d.rows )
+ mat[index] = mat[index] * A[index];
+template<typename T> // Kernel: in-place logarithm with the input floored at FLT_MIN.
+static void _log_elem(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ if(mat[index] < FLT_MIN) mat[index] = FLT_MIN; // FLT_MIN is used for every T, including double
+ mat[index] = log(mat[index]);
+ }
+template<typename T> // Kernel: vec[i] = alpha * (sum of column i of 'mat') + beta * vec[i]; one thread sums one column.
+static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ //This should be called 1-D
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j > 0) return; // only threads with y index 0 do any work
+ if(i < d.cols) {
+ double sum = 0.0; // accumulated in double regardless of T
+ for(int k = 0; k < d.rows; k++) {
+ sum += mat[i+k*d.stride];
+ }
+ vec[i] = alpha*sum + beta*vec[i];
+ }
+template<typename T> // Kernel: column sums via a shared-memory reduction; vec[i] = alpha*sum + beta*vec[i].
+static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+ //flipped x,y for reducing... x..row, y..col
+ int j = blockIdx.x * blockDim.x + threadIdx.x; // row
+ int i = blockIdx.y * blockDim.y + threadIdx.y; // column
+ if(blockIdx.x > 0) return; // only the first block along x works
+ if(blockDim.y != 1) return; // requires blocks that are one thread high
+ //copy vector to shared mem
+ __shared__ T aux[512]; // caps blockDim.x at 512
+ aux[threadIdx.x] = mat[i+j*d.stride]; // no check of j against d.rows or i against d.cols - presumably the launch matches the matrix exactly; verify against the caller
+ __syncthreads();
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+ //copy out the result
+ vec[i] = alpha*sum + beta*vec[i]; // executed by every thread of the block, all writing the same element
+template<typename T> // Kernel: y = 1 / (1 + exp(-x)), element by element.
+static void _sigmoid(T*y, const T*x, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride; // used for both 'x' and 'y'
+ if( i < d.cols && j < d.rows ) {
+ T res = 1.0 / (1.0 + exp(-x[index])); // 1.0 is a double literal
+ /*
+ if(res < 0.001) res = 0.001;
+ if(res > 0.999) res = 0.999;
+ */
+ y[index] = res;
+ }
+template<typename T> // Kernel: eout = y*(1-y)*e, element by element ('y' is passed in, not recomputed).
+static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ int index = i + j*d.stride; // shared by 'eout', 'e' and 'y'
+ if( i < d.cols && j < d.rows )
+ eout[index] = y[index]*(1.0-y[index]) * e[index];
+template<typename T> // Kernel: row-wise softmax; each thread processes one whole row sequentially.
+static void _softmax(T*y, const T*x, MatrixDim d) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x; // row handled by this thread
+ if(j >= d.rows) return;
+ //copy to output and find max...
+ double max = -1e20; // starting value for the row maximum
+ double sum = 0.0;
+ for(int i=0; i<d.cols; i++) {
+ if(max < x[i+j*d.stride]) max = x[i+j*d.stride];
+ y[i+j*d.stride] = x[i+j*d.stride];
+ }
+ //subtract max, apply exp, sum up...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] = exp(y[i+j*d.stride] - max);
+ sum += y[i+j*d.stride];
+ }
+ //normalize by sum...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] /= sum;
+ }
+template<typename T> // Helper called from kernels: reduces buffer[0..blockDim.x-1] to its maximum, in place, and returns buffer[0].
+static T _max_reduce(T buffer[]) {
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20; // used when this thread has no partner (odd element count)
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp;
+ }
+ __syncthreads(); // outside the 'if', so every thread of the block reaches it
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+template<typename T> // Helper called from kernels: reduces buffer[0..blockDim.x-1] to its sum, in place, and returns buffer[0].
+static T _sum_reduce(T buffer[]) {
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = 0.0; // added when this thread has no partner (odd element count)
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ buffer[threadIdx.x] += temp;
+ }
+ __syncthreads(); // outside the 'if', so every thread of the block reaches it
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+template<typename T> // Kernel: row-wise softmax where one block handles one row through shared-memory reductions.
+static void _softmax_reduce(T*y, const T*x, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column; equals threadIdx.x past the guard below
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ if(blockIdx.x > 0) return; // only the first block along x works
+ if(blockDim.y > 1) return; // requires blocks that are one thread high
+ __shared__ T row_data[256]; // caps the row length at 256 elements
+ __shared__ T aux[256];
+ //copy the input to row_data
+ row_data[i] = x[i+j*d.stride]; // 'i' is not checked against d.cols - presumably blockDim.x equals d.cols; verify against the caller
+ __syncthreads();
+ //copy input to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the maximum value
+ T max = _max_reduce(aux); // the reduction overwrites 'aux', hence the separate copy in row_data
+ __syncthreads();
+ //calculate exp(data-max)
+ row_data[i] = exp(row_data[i]-max);
+ //copy the values to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the sum
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+ //divide the values
+ row_data[i] /= sum;
+ //copy out
+ y[i+j*d.stride] = row_data[i];
+template<typename T> // Kernel: builds 'y' from d_in.cols-wide column groups, each a copy of 'x' shifted in rows by off[group].
+static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // output column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // output row
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = i % d_in.cols; // column inside the group
+ int src_row = j + off[i / d_in.cols]; // i / d_in.cols is the group number
+ if(src_row < 0) src_row = 0; // rows shifted outside the input are clamped to the first...
+ if(src_row >= d_in.rows) src_row = d_in.rows-1; // ...or the last input row
+ y[index] = x[src_col + src_row*d_in.stride];
+ }
+template<typename T> // Kernel: output column i is a copy of input column copy_from[i].
+static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // output column
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row (same in input and output)
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = copy_from[i];
+ if(src_col >= 0 && src_col < d_in.cols) {
+ y[index] = x[src_col + j*d_in.stride];
+ } else {
+ y[index] = 1.0/0.0; // an out-of-range source column is marked with the result of 1.0/0.0
+ }
+ }
+template<typename T> // Kernel: output row j is a copy of input row copy_from[j].
+static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column (same in input and output)
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // output row
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_row = copy_from[j]; // not range-checked against d_in.rows
+ y[index] = x[i + src_row*d_in.stride];
+ }
+template<typename T> // Kernel: match[i] = 1 when row i of 'out' and row i of 'des' have their maximum in the same column, else 0.
+static void _check_class(const T* out, const T* des, int* match, MatrixDim d)
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // row handled by this thread
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j>0) return; // only threads with y index 0 do any work
+ if(i<d.rows) {
+ int out_id = -1, des_id = -2; // different start values, so rows where no maximum is found never match
+ T out_max = -1e20, des_max = -1e20;
+ for(int k=0; k<d.cols; k++) {
+ T val = out[k + i*d.stride];
+ if(val > out_max) { out_max = val; out_id = k; } // strict '>' keeps the first of equal maxima
+ }
+ for(int k=0; k<d.cols; k++) {
+ T val = des[k + i*d.stride];
+ if(val > des_max) { des_max = val; des_id = k; }
+ }
+ match[i] = ((out_id == des_id)?1:0);
+ }
+template<typename T> // Helper called from kernels: reduces idx[0..blockDim.x-1] in place to the index of the largest val[] entry; returns idx[0].
+static int _max_id_reduce(T val[],int idx[]) {
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20; // used when this thread has no partner (odd element count)
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = val[idx[threadIdx.x + halfPoint]];
+ }
+ if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint]; // only idx[] is modified, val[] stays intact
+ }
+ __syncthreads(); // outside the 'if', so every thread of the block reaches it
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return idx[0];
+template<typename T> // Kernel: per-row argmax comparison of 'out' and 'des' via shared-memory reductions; match[j] = 1 when the argmax agrees.
+static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d)
+ int i = blockIdx.x * blockDim.x + threadIdx.x; // column; equals threadIdx.x past the guard below
+ int j = blockIdx.y * blockDim.y + threadIdx.y; // row
+ if(blockIdx.x > 0) return; // only the first block along x works
+ if(blockDim.y != 1) return; // requires blocks that are one thread high
+ __shared__ T value[256]; // caps blockDim.x at 256
+ __shared__ int index[256];
+ value[threadIdx.x] = out[i+j*d.stride]; // 'i' is not checked against d.cols - presumably blockDim.x equals d.cols; verify against the caller
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+ int out_max = _max_id_reduce(value,index);
+ __syncthreads();
+ value[threadIdx.x] = des[i+j*d.stride]; // the shared buffers are reused for the second matrix
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+ int des_max = _max_id_reduce(value,index);
+ __syncthreads();
+ if(threadIdx.x == 0) { // a single thread per block writes the result
+ match[j] = ((out_max == des_max)?1:0);
+ }
+ * C wrappers around CUDA kernels
+ */
+void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) // float wrappers: each one launches the same-named template kernel with grid 'Gr' and block 'Bl'; launch errors are not checked here
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask)
+{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); }
+void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d)
+{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); }
+void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d) // scalar Gr/Bl: one-dimensional launch
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d) // scalar Gr/Bl: one-dimensional launch
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d)
+{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); }
+void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d) // scalar Gr/Bl: one-dimensional launch
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); }
+void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) // double wrappers: same forwarding as the cudaF_ functions; no apply_mask, apply_l1 or *_reduce variants are provided for double
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d) // scalar Gr/Bl: one-dimensional launch
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d) {
+ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d);
+void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d) // scalar Gr/Bl: one-dimensional launch
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d) // scalar Gr/Bl: one-dimensional launch
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
diff --git a/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base
new file mode 100644
index 0000000..d8320b5
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cukernels.h.svn-base
@@ -0,0 +1,81 @@
+#ifndef _cuda_kernels_h_
+#define _cuda_kernels_h_
+extern "C" {
+#pragma GCC diagnostic ignored "-Wshadow";
+#include <vector_types.h>
+#pragma GCC diagnostic warning "-Wshadow";
+ typedef struct MatrixDim_ { // matrix geometry, passed by value to every kernel wrapper declared below
+ int rows; // number of rows
+ int cols; // number of columns
+ int stride; // element offset between consecutive rows: the kernels address (column i, row j) at i + j*stride
+ } MatrixDim;
+ /*************
+ * Float instances
+ */
+ //CuMatrix
+ void cudaF_set_const(dim3 Gr, dim3 Bl, float*mat, float value, MatrixDim d);
+ void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+ void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask);
+ void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d);
+ void cudaF_scale_cols(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_scale_rows(dim3 Gr, dim3 Bl, float*mat, const float* scale, MatrixDim d);
+ void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d);
+ void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d);
+ void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d);
+ void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d);
+ //CuVector
+ void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+ void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d);
+ //CuMath
+ void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_sigmoid (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d);
+ void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d);
+ void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d);
+ void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d);
+ /*************
+ * Double instances
+ */
+ //CuMatrix
+ void cudaD_set_const(dim3 Gr, dim3 Bl, double*mat, double value, MatrixDim d);
+ void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
+ void cudaD_scale_cols(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_scale_rows(dim3 Gr, dim3 Bl, double*mat, const double* scale, MatrixDim d);
+ void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d);
+ void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d);
+ void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d);
+ void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d);
+ //CuVector
+ void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d);
+ //CuMath
+ void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_sigmoid (dim3 Gr, dim3 Bl, double*y, const double*x, MatrixDim d);
+ void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d);
+ void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in);
+ void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d);
diff --git a/src/CuBaseLib/.svn/text-base/ b/src/CuBaseLib/.svn/text-base/
new file mode 100644
index 0000000..d718324
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/
@@ -0,0 +1,574 @@
+#include "cumath.h"
+#include "cukernels.h"
+namespace TNet {
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+   // Y = Sigmoid(X): launch cudaF_sigmoid over CUBLOCK x CUBLOCK tiles
+   // that cover X; the elapsed time is accumulated under this function's name.
+   Timer tim;
+   tim.Start();
+   const size_t tilesAcross = n_blocks(X.Cols(), CUBLOCK);
+   const size_t tilesDown = n_blocks(X.Rows(), CUBLOCK);
+   dim3 dimGrid(tilesAcross, tilesDown);
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+   // a failed launch is only visible through cudaGetLastError()
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y)
+ {
+   // Launch cudaF_diff_sigmoid over CUBLOCK x CUBLOCK tiles covering Eout;
+   // Eout's dimensions are the ones handed to the kernel.
+   Timer tim;
+   tim.Start();
+   const size_t tilesAcross = n_blocks(Eout.Cols(), CUBLOCK);
+   const size_t tilesDown = n_blocks(Eout.Rows(), CUBLOCK);
+   dim3 dimGrid(tilesAcross, tilesDown);
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+   // Y = Softmax(X).
+   // Two kernels are available: cudaF_softmax (launched with one thread per
+   // row of X) and cudaF_softmax_reduce (launched with one block per row and
+   // one thread per column). The #if/#else/#endif below selects between
+   // "always plain kernel" (disabled) and "choose by column count" (active).
+   Timer tim; tim.Start();
+#if 0
+   //disable 'reduce' functions
+   size_t dimBlock = CUBLOCK;
+   size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+   cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+   cuSafeCall(cudaGetLastError());
+#else
+   if(X.Cols() > 256) {
+     //use old implementation (can't use reduction due to
+     //limited size of shared memory)
+     size_t dimBlock = CUBLOCK;
+     size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+     cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+     cuSafeCall(cudaGetLastError());
+   } else {
+     //use implementation with reduction
+     dim3 dimBlock(X.Cols(),1);
+     dim3 dimGrid(1,X.Rows());
+     cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+     cuSafeCall(cudaGetLastError());
+   }
+#endif
+   tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf)
+ {
+   // Apply the same transform matrix to every consecutive column block of X:
+   // one cublasSgemm per block, reading X at column offset i*k and writing Y
+   // at column offset i*m.
+   Timer tim;
+   tim.Start();
+   assert(Y.Rows() == X.Rows());
+   assert((X.Cols() % block_transf.Rows()) == 0);
+   assert((Y.Cols() % block_transf.Cols()) == 0);
+   assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+   // GEMM dimensions are identical for every block, so compute them once
+   const int m = block_transf.Cols();
+   const int n = X.Rows();
+   const int k = block_transf.Rows();
+   const int blocks = X.Cols() / block_transf.Rows();
+   int i = 0;
+   while(i < blocks) {
+     const float* xBlock = X.pCUData()+i*k;
+     float* yBlock = Y.pCUData()+i*m;
+     cublasSgemm('N', 'N', m, n, k,
+                 1.0, block_transf.pCUData(), block_transf.Stride(),
+                 xBlock, X.Stride(),
+                 0.0, yBlock, Y.Stride());
+     ++i;
+   }
+   // checked once, after all GEMM calls have been issued
+   cuSafeCall(cublasGetError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets)
+ {
+   // Launch cudaF_expand over CUBLOCK x CUBLOCK tiles covering Y.
+   // Y must have the same row count as X and frameOffsets.Dim() times as
+   // many columns.
+   Timer tim;
+   tim.Start();
+   assert(Y.Rows() == X.Rows());
+   assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+   const size_t tilesAcross = n_blocks(Y.Cols(), CUBLOCK);
+   const size_t tilesDown = n_blocks(Y.Rows(), CUBLOCK);
+   dim3 dimGrid(tilesAcross, tilesDown);
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+   // Launch cudaF_rearrange over CUBLOCK x CUBLOCK tiles covering Y;
+   // copyFrom must hold one entry per column of Y.
+   Timer tim;
+   tim.Start();
+   assert(copyFrom.Dim() == Y.Cols());
+   assert(Y.Rows() == X.Rows());
+   const size_t tilesAcross = n_blocks(Y.Cols(), CUBLOCK);
+   const size_t tilesDown = n_blocks(Y.Rows(), CUBLOCK);
+   dim3 dimGrid(tilesAcross, tilesDown);
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+   // Launch cudaF_randomize for copyFrom.Dim() rows only: both MatrixDim
+   // descriptors handed to the kernel have their row count replaced by
+   // copyFrom.Dim(), which may be smaller than the matrix row count.
+   Timer tim;
+   tim.Start();
+   assert(X.Cols() == Y.Cols());
+   assert(X.Rows() == Y.Rows());
+   assert(copyFrom.Dim() <= Y.Rows());
+   MatrixDim dimY = Y.Dim();
+   MatrixDim dimX = X.Dim();
+   dimX.rows = copyFrom.Dim();
+   dimY.rows = copyFrom.Dim();
+   dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(), CUBLOCK));
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match)
+ {
+   // Check match in the classification: 'out' and 'des' must agree in rows,
+   // columns and stride, and 'match' needs one entry per row.
+   // Up to 256 columns the reduce kernel is launched (one block per row,
+   // one thread per column); wider matrices use the plain kernel.
+   Timer tim;
+   tim.Start();
+   assert(out.Cols() == des.Cols());
+   assert(out.Rows() == des.Rows());
+   assert(out.Stride() == des.Stride());
+   assert(match.Dim() == out.Rows());
+   if(out.Cols() <= 256) {
+     dim3 dimGrid(1, out.Rows());
+     dim3 dimBlock(out.Cols(), 1);
+     cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+     cuSafeCall(cudaGetLastError());
+   } else {
+     size_t dimGrid = n_blocks(out.Rows(), CUBLOCK);
+     size_t dimBlock = CUBLOCK;
+     cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+     cuSafeCall(cudaGetLastError());
+   }
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC)
+ {
+   // gemm with offset: C = alpha * op(A) * op(B) + beta * C, where each
+   // operand's data pointer is advanced by offA / offB / offC elements.
+   // The GEMM dimensions are clamped so they never exceed what the operands
+   // and C provide.
+   Timer tim; tim.Start();
+   // CUBLAS is col major, TNet is row major
+   // keep trans..., just swap A&B arguments: A->B B->A
+   //
+   //m,n,k is cublas m,n,k
+   size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols());
+   size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows());
+   size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows());
+   size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols());
+   // use the smaller inner dimension of the two operands
+   k = ((k<k1)?k:k1);
+   // clamp the output dimensions to the size of C
+   m = ((m<C.Cols())?m:C.Cols());
+   // fixed: this clamp used to assign 'm' to 'n' whenever n < C.Rows()
+   n = ((n<C.Rows())?n:C.Rows());
+#if 0
+   std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA
+             << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB
+             << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC
+             << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl;
+#endif
+   cublasSgemm(transB, transA, m, n, k,
+               alpha, B.pCUData()+offB, B.Stride(),
+               A.pCUData()+offA, A.Stride(),
+               beta, C.pCUData()+offC, C.Stride());
+   cuSafeCall(cublasGetError());
+   tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /*
+ // Disabled reference implementation: CuMath<> declares no Gemv member, so
+ // this specialization must stay inside the comment. OffsetGemv is the
+ // active gemv variant.
+ template<>
+ void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY)
+ {
+   Timer tim; tim.Start();
+   // CUBLAS is col major, TNet is row major
+   // y = alpha * op(A) * x + beta * y,
+   size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+   size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+   // switch the trans parameter!
+   char cu_trans;
+   if(trans == 't' || trans == 'T') {
+     cu_trans = 'n';
+   } else if (trans == 'n' || trans == 'N') {
+     cu_trans = 't';
+   } else {
+     Error(std::string("Unknown trans")+trans);
+   }
+   //check the dims
+   if(cu_trans == 'n') {
+     assert(dimX == n);
+     assert(dimY == m);
+   } else {
+     assert(dimX == m);
+     assert(dimY == n);
+   }
+   //run gemv
+   cublasSgemv(cu_trans,m,n,alpha,
+               A.pCUData(), A.Stride(), x, 1,
+               beta, y, 1);
+   cuSafeCall(cublasGetError());
+   tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ */
+ /**
+ * gemv with offset: y = alpha * op(A) * x + beta * y.
+ * offsetY tells how many outputs of the 'Ax' multiplication are skipped at
+ * the beginning; dimY outputs are then computed.
+ */
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY)
+ {
+   Timer tim;
+   tim.Start();
+   // CUBLAS is col major, TNet is row major, so the matrix seen by cublas
+   // has A.Cols() rows and A.Rows() columns ...
+   size_t m = A.Cols();
+   size_t n = A.Rows();
+   // ... and the transposition flag has to be inverted
+   char cu_trans;
+   switch(trans) {
+     case 't':
+     case 'T':
+       cu_trans = 'n';
+       break;
+     case 'n':
+     case 'N':
+       cu_trans = 't';
+       break;
+     default:
+       Error(std::string("Unknown trans")+trans);
+   }
+   // select the part of the matrix to compute with, then check the dims
+   size_t cu_offset = 0;
+   if(cu_trans == 'n') {
+     // skipped outputs are leading rows of the col-major matrix
+     cu_offset += offsetY;
+     assert(m >= dimY+offsetY);
+     m = dimY;
+     assert(dimX == n);
+     assert(dimY == m);
+   } else {
+     // skipped outputs are leading columns, each A.Stride() elements apart
+     cu_offset += offsetY*A.Stride();
+     assert(n >= dimY+offsetY);
+     n = dimY;
+     assert(dimX == m);
+     assert(dimY == n);
+   }
+   //run gemv
+   const float* aStart = A.pCUData()+cu_offset;
+   cublasSgemv(cu_trans, m, n, alpha,
+               aStart, A.Stride(), x, 1,
+               beta, y, 1);
+   cuSafeCall(cublasGetError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) {
+   // Rank-1 update: A = alpha * x * transpose(y) + A.
+   // CUBLAS is col major while TNet is row major, hence x and y trade
+   // places and the cublas dimensions are (A.Cols(), A.Rows()).
+   Timer tim;
+   tim.Start();
+   assert(dimX == A.Rows());
+   assert(dimY == A.Cols());
+   const size_t cuRows = A.Cols(); // rows of A as seen by cublas
+   const size_t cuCols = A.Rows(); // columns of A as seen by cublas
+   cublasSger(cuRows, cuCols, alpha, y, 1, x, 1, A.pCUData(), A.Stride());
+   cuSafeCall(cublasGetError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out)
+ {
+   // Concatenate one vector several times: both vectors are described to
+   // cudaF_expand as single-row matrices, with one offset entry per copy.
+   // out.Dim() must be a multiple of in.Dim().
+   Timer tim; tim.Start();
+   assert(out.Dim() % in.Dim() == 0);
+   int n_copies = out.Dim()/in.Dim();
+   CuVector<int> offsets(n_copies);
+   //offsets.SetConst(0); done implicitly!
+   dim3 dimBlock(CUBLOCK);
+   dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK));
+   // explicit casts (as in CuMatrix::Dim()) avoid implicit narrowing
+   // conversions inside the brace initializers
+   MatrixDim dim_in = { 1, static_cast<int>(in.Dim()), static_cast<int>(in.Dim()) };
+   MatrixDim dim_out = { 1, static_cast<int>(out.Dim()), static_cast<int>(out.Dim()) };
+   cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in);
+   cuSafeCall(cudaGetLastError());
+   tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out)
+ {
+   // Sum the vector as if it was matrix data: 'in' is described to
+   // cudaF_add_col_sum as a matrix with out.Dim() columns and
+   // in.Dim()/out.Dim() rows. in.Dim() must be a multiple of out.Dim().
+   Timer tim; tim.Start();
+   assert(in.Dim() % out.Dim() == 0);
+   size_t dimBlock = CUBLOCK;
+   size_t dimGrid = n_blocks(out.Dim(),CUBLOCK);
+   // explicit casts (as in CuMatrix::Dim()) avoid implicit narrowing
+   // conversions inside the brace initializer
+   MatrixDim dim = { static_cast<int>(in.Dim()/out.Dim()),
+                     static_cast<int>(out.Dim()),
+                     static_cast<int>(out.Dim()) };
+   cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim);
+   cuSafeCall(cudaGetLastError());
+   tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+   // Double-precision Y = Sigmoid(X): cudaD_sigmoid launched over
+   // CUBLOCK x CUBLOCK tiles that cover X.
+   Timer tim;
+   tim.Start();
+   const MatrixDim dims = X.Dim();
+   dim3 dimGrid(n_blocks(X.Cols(), CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), dims);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y)
+ {
+   // Double-precision variant: cudaD_diff_sigmoid launched over
+   // CUBLOCK x CUBLOCK tiles covering Eout, with Eout's dimensions.
+   Timer tim;
+   tim.Start();
+   const MatrixDim dims = Eout.Dim();
+   dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(), CUBLOCK));
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), dims);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+   // Double-precision Y = Softmax(X): always the plain cudaD_softmax
+   // kernel, launched with CUBLOCK threads per block and enough blocks to
+   // give every row of X one thread.
+   Timer tim;
+   tim.Start();
+   const size_t rowBlocks = n_blocks(X.Rows(), CUBLOCK);
+   const size_t threadsPerBlock = CUBLOCK;
+   cudaD_softmax(rowBlocks, threadsPerBlock, Y.pCUData(), X.pCUData(), X.Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf)
+ {
+   // Double-precision variant: one cublasDgemm per column block, reading X
+   // at column offset i*k and writing Y at column offset i*m.
+   Timer tim;
+   tim.Start();
+   assert(Y.Rows() == X.Rows());
+   assert((X.Cols() % block_transf.Rows()) == 0);
+   assert((Y.Cols() % block_transf.Cols()) == 0);
+   assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+   // GEMM dimensions are identical for every block, so compute them once
+   const int m = block_transf.Cols();
+   const int n = X.Rows();
+   const int k = block_transf.Rows();
+   const int blocks = X.Cols() / block_transf.Rows();
+   int i = 0;
+   while(i < blocks) {
+     const double* xBlock = X.pCUData()+i*k;
+     double* yBlock = Y.pCUData()+i*m;
+     cublasDgemm('N', 'N', m, n, k,
+                 1.0, block_transf.pCUData(), block_transf.Stride(),
+                 xBlock, X.Stride(),
+                 0.0, yBlock, Y.Stride());
+     ++i;
+   }
+   // checked once, after all GEMM calls have been issued
+   cuSafeCall(cublasGetError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets)
+ {
+   // Double-precision variant: cudaD_expand launched over
+   // CUBLOCK x CUBLOCK tiles covering Y. Y must have X's row count and
+   // frameOffsets.Dim() times X's column count.
+   Timer tim;
+   tim.Start();
+   assert(Y.Rows() == X.Rows());
+   assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+   const MatrixDim dimOut = Y.Dim();
+   const MatrixDim dimIn = X.Dim();
+   dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(), CUBLOCK));
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), dimOut, dimIn);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+   // Double-precision variant: cudaD_rearrange launched over
+   // CUBLOCK x CUBLOCK tiles covering Y; copyFrom holds one entry per
+   // column of Y.
+   Timer tim;
+   tim.Start();
+   assert(copyFrom.Dim() == Y.Cols());
+   assert(Y.Rows() == X.Rows());
+   const MatrixDim dimOut = Y.Dim();
+   const MatrixDim dimIn = X.Dim();
+   dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(), CUBLOCK));
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimOut, dimIn);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+   // Double-precision variant: cudaD_randomize is launched for
+   // copyFrom.Dim() rows only, so both MatrixDim descriptors get their row
+   // count replaced by copyFrom.Dim().
+   Timer tim;
+   tim.Start();
+   assert(X.Cols() == Y.Cols());
+   assert(X.Rows() == Y.Rows());
+   assert(copyFrom.Dim() <= Y.Rows());
+   MatrixDim dimY = Y.Dim();
+   MatrixDim dimX = X.Dim();
+   dimX.rows = copyFrom.Dim();
+   dimY.rows = copyFrom.Dim();
+   dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(), CUBLOCK));
+   dim3 dimBlock(CUBLOCK, CUBLOCK);
+   cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match)
+ {
+   // Double-precision variant: always the plain cudaD_check_class kernel.
+   // 'out' and 'des' must agree in rows, columns and stride; 'match' needs
+   // one entry per row.
+   Timer tim;
+   tim.Start();
+   assert(out.Cols() == des.Cols());
+   assert(out.Rows() == des.Rows());
+   assert(out.Stride() == des.Stride());
+   assert(match.Dim() == out.Rows());
+   const size_t rowBlocks = n_blocks(out.Rows(), CUBLOCK);
+   const size_t threadsPerBlock = CUBLOCK;
+   cudaD_check_class(rowBlocks, threadsPerBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__, tim.Val());
+ }
diff --git a/src/CuBaseLib/.svn/text-base/cumath.h.svn-base b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base
new file mode 100644
index 0000000..5680082
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumath.h.svn-base
@@ -0,0 +1,146 @@
+#ifndef _CUMATH_H_
+#define _CUMATH_H_
+#include "cumatrix.h"
+#include "Timer.h"
+#include "cudevice.h"
+namespace TNet {
+ /**
+ * Group of Math operations for the NN training
+ *
+ * The generic bodies below only report "<function> Not implemented" through
+ * Error(); the working code lives in the explicit float/double
+ * specializations declared after the class.
+ */
+ template<typename _ElemT>
+ class CuMath
+ {
+ public:
+   /// Y = Sigmoid(X)
+   static void Sigmoid(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// Eout = E(1-E) * Y
+   static void DiffSigmoid(CuMatrix<_ElemT>& Eout, const CuMatrix<_ElemT>& Ein, const CuMatrix<_ElemT>& Y)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// Y = Softmax(X)
+   static void Softmax(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// for DCT in FeaCat
+   static void BlockLinearity(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuMatrix<_ElemT>& block_transf)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// Y gets frameOffsets.Dim() times as many columns as X
+   static void Expand(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& frameOffsets)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// ie. switch cols according to copyFrom
+   static void Rearrange(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// ie. switch rows according to copyFrom
+   static void Randomize(CuMatrix<_ElemT>& Y, const CuMatrix<_ElemT>& X, const CuVector<int>& copyFrom)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// check match in the classification for Xentropy
+   static void CheckClass(const CuMatrix<_ElemT>& out, const CuMatrix<_ElemT> &des, CuVector<int>& match)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// gemm with offset for CuSharedLinearity
+   static void OffsetGemm(char transA, char transB, _ElemT alpha, const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B, _ElemT beta, CuMatrix<_ElemT>& C, int offA, int offB, int offC)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// gemv with offset for CuRecurrent
+   static void OffsetGemv(char trans, _ElemT alpha, const CuMatrix<_ElemT>& A, const _ElemT* x, size_t dimX, _ElemT beta, _ElemT* y, size_t dimY, size_t offsetY)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// ger for weight updates in CuRecurrent
+   static void BlasGer(_ElemT alpha, const _ElemT* x, size_t dimX, const _ElemT* y, size_t dimY, CuMatrix<_ElemT>& A)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// concatenate one vector several times for CuSharedLinearity
+   static void VecExpand(const CuVector<_ElemT>&in, CuVector<_ElemT>&out)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// sum the vector as if it was matrix data for CuSharedLinearity
+   static void VecAddColSum(_ElemT alpha, const CuVector<_ElemT>&in, _ElemT beta, CuVector<_ElemT>&out)
+   { Error(std::string(__func__) + " Not implemented"); }
+ }; //class CuMath::
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ //// Declarations of the explicit specializations: they make code that
+ //// includes this header use the float implementations instead of the
+ //// generic "Not implemented" bodies of the class template.
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X);
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y);
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X);
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf);
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets);
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom);
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match);
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC);
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY);
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A);
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out);
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out);
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ //// Only the eight operations below are declared as specializations for
+ //// double. OffsetGemm, OffsetGemv, BlasGer, VecExpand and VecAddColSum are
+ //// not declared here, so CuMath<double> keeps the generic
+ //// "Not implemented" bodies for them.
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X);
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y);
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X);
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf);
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets);
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom);
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match);
diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base
new file mode 100644
index 0000000..4e767e3
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumatrix.h.svn-base
@@ -0,0 +1,199 @@
+#ifndef _CUMATRIX_H_
+#define _CUMATRIX_H_
+#include <sstream>
+#include "Matrix.h"
+#include "cukernels.h"
+namespace TNet {
+ template<typename _ElemT> class CuVector;
+ /**
+ * Matrix for CUDA computing
+ *
+ * Holds a device buffer allocated by Init() with cudaMallocPitch and
+ * released by Destroy() (also called from the destructor). Rows are
+ * mStride elements apart, which may be more than mCols.
+ *
+ * NOTE(review): the class frees mpCUData in its destructor but declares no
+ * copy constructor or assignment operator, so a by-value copy would share
+ * the pointer and free it twice. Copy the contents with CopyFrom() instead.
+ */
+ template<typename _ElemT>
+ class CuMatrix
+ {
+   typedef CuMatrix<_ElemT> ThisType;
+ public:
+   /// Default Constructor (empty matrix, nothing allocated)
+   CuMatrix()
+     : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+   { }
+   /// Constructor with memory initialisation
+   CuMatrix(size_t rows, size_t cols)
+     : mRows(0), mCols(0), mStride(0), mpCUData(NULL)
+   { Init(rows, cols); }
+   /// Destructor
+   ~CuMatrix()
+   { Destroy(); }
+   /// Dimensions
+   size_t Rows() const
+   { return mRows; }
+   size_t Cols() const
+   { return mCols; }
+   /// Distance between two consecutive rows, in elements
+   size_t Stride() const
+   { return mStride; }
+   /// Dimensions packed as {rows, cols, stride} for the kernel wrappers
+   ::MatrixDim Dim() const
+   { ::MatrixDim d = {
+       static_cast<int>(mRows),
+       static_cast<int>(mCols),
+       static_cast<int>(mStride)
+     };
+     return d;
+   }
+   /// Get raw pointer
+   const _ElemT* pCUData() const
+   { return mpCUData; }
+   _ElemT* pCUData()
+   { return mpCUData; }
+   /// Get raw row pointer
+   const _ElemT* pCURowData(size_t r) const
+   { assert(r < Rows()); return mpCUData+r*mStride; }
+   _ElemT* pCURowData(size_t r)
+   { assert(r < Rows()); return mpCUData+r*mStride; }
+   /// Get size of matrix in bytes (row padding included)
+   size_t MSize() const
+   { return mRows*mStride*sizeof(_ElemT); }
+   /// Get size of matrix row in bytes (row padding included)
+   size_t MRowSize() const
+   { return mStride*sizeof(_ElemT); }
+   /// Allocate the memory
+   ThisType& Init(size_t rows, size_t cols);
+   /// Deallocate the memory
+   void Destroy();
+   /// Copy functions (reallocates when needed)
+   ThisType& CopyFrom(const CuMatrix<_ElemT>& rSrc);
+   ThisType& CopyFrom(const Matrix<_ElemT>& rSrc);
+   Matrix<_ElemT>& CopyTo(Matrix<_ElemT>& rDst) const;
+   /// Copy rowCnt rows from rSrc, starting by row srcOri,
+   /// copying to memory block starting by row dstOri
+   void CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+   /// Copy colCnt columns from rSrc, starting by col srcOri,
+   /// copying to memory block starting by row dstOri
+   void CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri);
+   // Math operations, some calling kernels
+   //
+   // The inline bodies below are generic fallbacks that only report
+   // "<function> Not implemented" through Error().
+   void SetZero();
+   void SetConst(_ElemT value)
+   { Error(std::string(__func__) + " Not implemented"); }
+   void ApplyLog()
+   { Error(std::string(__func__) + " Not implemented"); }
+   void ApplyMask(const CuMatrix<BaseFloat>& mask)
+   { Error(std::string(__func__) + " Not implemented"); }
+   void ApplyL1(BaseFloat l1)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// scale i'th column by scale[i]
+   void ScaleCols(const CuVector<_ElemT>& scale)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// scale i'th row by scale[i]
+   void ScaleRows(const CuVector<_ElemT>& scale)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// B = alpha * A + beta * B
+   void AddScaled(_ElemT alpha, const CuMatrix<_ElemT>& A, _ElemT beta)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// B = alpha * row + beta * B
+   void AddScaledRow(_ElemT alpha, const CuVector<_ElemT>& row, _ElemT beta)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// C = alpha * A(^T)*B(^T) + beta * C
+   void Gemm(char transa, char transb,
+             _ElemT alpha,
+             const CuMatrix<_ElemT>& A, const CuMatrix<_ElemT>& B,
+             _ElemT beta)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// A = alpha * x*y^T + A
+   void BlasGer(_ElemT alpha,
+                const CuVector<_ElemT>& x, const CuVector<_ElemT>& y)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// Multiply two matrices elementwise: C = A .* C
+   void MulElem(const CuMatrix<_ElemT>& A)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// A = log(A)
+   void LogElem()
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// Copy the matrix into a Matrix<> and print it to std::cout
+   void Print() const
+   {
+     Matrix<_ElemT> mat(Rows(),Cols());
+     CopyTo(mat);
+     std::cout << mat;
+   }
+   /// Copy the matrix into a Matrix<> and call Error() on the first NaN or
+   /// infinite element found
+   void CheckData()
+   {
+     Matrix<_ElemT> mat;
+     CopyTo(mat);
+     for(size_t i=0; i<Rows(); i++) {
+       for(size_t j=0; j<Cols(); j++) {
+         if(std::isnan(mat(i,j)) || std::isinf(mat(i,j))) {
+           std::ostringstream os;
+           os << "Invalid value:" << mat(i,j) << "at row"<<i<<" col"<<j<<"\n";
+           Error(os.str());
+         }
+       }
+     }
+   }
+ private:
+   size_t mRows;     ///< number of rows
+   size_t mCols;     ///< number of columns
+   size_t mStride;   ///< row-to-row distance in elements
+   _ElemT* mpCUData; ///< buffer from cudaMallocPitch, NULL when empty
+ };
+ /// Writes a one-line summary of the matrix (rows, columns, stride and the
+ /// data pointer) to the stream and flushes it; no element is printed.
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuMatrix<_ElemT>& mat)
+ {
+   out << "[CUMATRIX R" << mat.Rows();
+   out << " C" << mat.Cols();
+   out << " S" << mat.Stride();
+   out << " PTR" << mat.pCUData();
+   out << "]" << std::flush;
+   return out;
+ }
+#include "cumatrix.tcc"
diff --git a/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base
new file mode 100644
index 0000000..4582e8d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumatrix.tcc.svn-base
@@ -0,0 +1,627 @@
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+#include "Timer.h"
+#include "cucommon.h"
+#include "cuvector.h"
+#include "cudevice.h"
+namespace TNet {
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// (Re)allocates pitched device storage for a rows x cols matrix.
+ /// If the requested shape equals the current one, the existing buffer is
+ /// kept as it is (it is NOT cleared). Otherwise the old buffer is released,
+ /// a new one is obtained with cudaMallocPitch() and it is zero-filled.
+ /// Returns *this.
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ Init(size_t rows, size_t cols)
+ {
+   const bool same_shape = (mRows == rows) && (mCols == cols);
+   if(!same_shape) {
+     Destroy();
+     size_t pitch;
+     const size_t row_bytes = cols * sizeof(_ElemT);
+     cuSafeCall(cudaMallocPitch((void**)&mpCUData, &pitch, row_bytes, rows));
+     mRows = rows;
+     mCols = cols;
+     // the pitch is reported in bytes, the stride is kept in elements
+     mStride = pitch/sizeof(_ElemT);
+     SetZero();
+   }
+   return *this;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Releases the device buffer, if one is held, and resets rows, columns
+ /// and stride to zero.
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ Destroy()
+ {
+   if(mpCUData != NULL) {
+     cuSafeCall(cudaFree(mpCUData));
+     mpCUData = NULL;
+   }
+   mRows = 0;
+   mCols = 0;
+   mStride = 0;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Makes this matrix a copy of another device matrix.
+ /// Init() is called with rSrc's shape first, then the rows are copied with
+ /// cudaMemcpy2D (device to device). The copy time is accumulated in the
+ /// CuDevice profile under "CuMatrix::CopyFromD2D". Returns *this.
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const CuMatrix<_ElemT>& rSrc)
+ {
+   Init(rSrc.Rows(),rSrc.Cols());
+   Timer timer;
+   timer.Start();
+   // cudaMemcpy2D takes the row width and both pitches in bytes
+   const size_t width = rSrc.Cols()*sizeof(_ElemT);
+   const size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+   const size_t dst_pitch = mStride*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pCUData(), src_pitch, width, rSrc.Rows(), cudaMemcpyDeviceToDevice));
+   timer.End();
+   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromD2D",timer.Val());
+   return *this;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Uploads a host Matrix into this device matrix.
+ /// Init() is called with rSrc's shape first, then the rows are copied with
+ /// cudaMemcpy2D (host to device). The copy time is accumulated in the
+ /// CuDevice profile under "CuMatrix::CopyFromH2D". Returns *this.
+ template<typename _ElemT>
+ CuMatrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyFrom(const Matrix<_ElemT>& rSrc)
+ {
+   Init(rSrc.Rows(),rSrc.Cols());
+   Timer timer;
+   timer.Start();
+   // cudaMemcpy2D takes the row width and both pitches in bytes
+   const size_t width = rSrc.Cols()*sizeof(_ElemT);
+   const size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+   const size_t dst_pitch = mStride*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy2D(mpCUData, dst_pitch, rSrc.pData(), src_pitch, width, rSrc.Rows(), cudaMemcpyHostToDevice));
+   timer.End();
+   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyFromH2D",timer.Val());
+   return *this;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Downloads this device matrix into a host Matrix.
+ /// rDst is re-initialised to Rows() x Cols() when its shape differs, then
+ /// the rows are copied with cudaMemcpy2D (device to host). The copy time is
+ /// accumulated in the CuDevice profile under "CuMatrix::CopyToD2H".
+ /// Returns rDst.
+ template<typename _ElemT>
+ Matrix<_ElemT>&
+ CuMatrix<_ElemT>::
+ CopyTo(Matrix<_ElemT>& rDst) const
+ {
+   const bool same_shape = (rDst.Rows() == Rows()) && (rDst.Cols() == Cols());
+   if(!same_shape) {
+     rDst.Init(Rows(),Cols());
+   }
+   Timer timer;
+   timer.Start();
+   // cudaMemcpy2D takes the row width and both pitches in bytes
+   const size_t width = Cols()*sizeof(_ElemT);
+   const size_t dst_pitch = rDst.Stride()*sizeof(_ElemT);
+   const size_t src_pitch = mStride*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy2D(rDst.pData(), dst_pitch, pCUData(), src_pitch, width, Rows(), cudaMemcpyDeviceToHost));
+   timer.End();
+   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyToD2H",timer.Val());
+   return rDst;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Copies rowCnt consecutive rows of rSrc, starting at row srcOri, into
+ /// this matrix starting at row dstOri (device to device).
+ /// Both row ranges must fit and the column counts must match (asserted).
+ /// The copy time is accumulated under "CuMatrix::CopyRowsD2D".
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyRows(size_t rowCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+   assert(rowCnt+srcOri <= rSrc.Rows());
+   assert(rowCnt+dstOri <= Rows());
+   assert(Cols() == rSrc.Cols());
+   Timer timer;
+   timer.Start();
+   // first copied row in the source and in the destination buffer
+   const _ElemT* p_src = rSrc.pCUData() + srcOri*rSrc.Stride();
+   _ElemT* p_dst = mpCUData + dstOri*mStride;
+   // cudaMemcpy2D takes the row width and both pitches in bytes
+   const size_t width = rSrc.Cols()*sizeof(_ElemT);
+   const size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+   const size_t dst_pitch = mStride*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, rowCnt, cudaMemcpyDeviceToDevice));
+   timer.End();
+   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyRowsD2D",timer.Val());
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Copies colCnt consecutive columns of rSrc, starting at column srcOri,
+ /// into this matrix starting at column dstOri (device to device).
+ /// Both column ranges must fit and the row counts must match (asserted).
+ /// The copy time is accumulated under "CuMatrix::CopyColsD2D".
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ CopyCols(size_t colCnt, size_t srcOri, const CuMatrix<_ElemT>& rSrc, size_t dstOri)
+ {
+   assert(colCnt+srcOri <= rSrc.Cols());
+   assert(colCnt+dstOri <= Cols());
+   assert(Rows() == rSrc.Rows());
+   Timer timer;
+   timer.Start();
+   // first copied column within row 0 of each buffer
+   const _ElemT* p_src = rSrc.pCUData() + srcOri;
+   _ElemT* p_dst = mpCUData + dstOri;
+   // only colCnt elements of every row are transferred; sizes are in bytes
+   const size_t width = colCnt*sizeof(_ElemT);
+   const size_t src_pitch = rSrc.Stride()*sizeof(_ElemT);
+   const size_t dst_pitch = mStride*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy2D(p_dst, dst_pitch, p_src, src_pitch, width, Rows(), cudaMemcpyDeviceToDevice));
+   timer.End();
+   CuDevice::Instantiate().AccuProfile("CuMatrix::CopyColsD2D",timer.Val());
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Zero-fills the whole device buffer (mRows * mStride elements, so the
+ /// row padding is cleared as well) with cudaMemset. The elapsed time is
+ /// accumulated in the CuDevice profile under "CuMatrix::SetZero".
+ template<typename _ElemT>
+ void
+ CuMatrix<_ElemT>::
+ SetZero()
+ {
+   Timer timer;
+   timer.Start();
+   cuSafeCall(cudaMemset(mpCUData, 0, mRows*mStride*sizeof(_ElemT)));
+   timer.End();
+   CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero",timer.Val());
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ //// CuMatrix:: template specializations (float)
+ ////
+ /// float specialization: hands the matrix buffer, `value` and Dim() to the
+ /// cudaF_set_const kernel wrapper (defined elsewhere; presumably it assigns
+ /// `value` to every element). The launch is checked with cudaGetLastError()
+ /// and timed under this function's name in the CuDevice profile.
+ template<>
+ inline void CuMatrix<float>::SetConst(float value)
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_set_const(blocks, threads, mpCUData, value, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: runs the cudaF_apply_log kernel wrapper on the
+ /// matrix buffer (kernel defined elsewhere). The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::ApplyLog()
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_apply_log(blocks, threads, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: runs the cudaF_apply_mask kernel wrapper with this
+ /// matrix and `mask` (kernel defined elsewhere). `mask` must have the same
+ /// number of rows and columns as this matrix (asserted). The launch is
+ /// checked with cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::ApplyMask(const CuMatrix<BaseFloat>& mask)
+ {
+   Timer timer;
+   timer.Start();
+   assert(mask.Rows() == Rows());
+   assert(mask.Cols() == Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   // both layouts are passed along: this matrix's Dim() and the mask's
+   cudaF_apply_mask(blocks, threads, mpCUData, mask.pCUData(), Dim(), mask.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: passes the matrix buffer and the `l1` parameter to
+ /// the cudaF_apply_l1 kernel wrapper (kernel defined elsewhere). The launch
+ /// is checked with cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::ApplyL1(float l1)
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_apply_l1(blocks, threads, mpCUData, l1, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: runs the cudaF_scale_cols kernel wrapper with this
+ /// matrix and the `scale` vector (kernel defined elsewhere). `scale` must
+ /// hold one entry per column (asserted). The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::ScaleCols(const CuVector<float>& scale)
+ {
+   Timer timer;
+   timer.Start();
+   assert(scale.Dim() == Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_scale_cols(blocks, threads, mpCUData, scale.pCUData(), Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: runs the cudaF_scale_rows kernel wrapper with this
+ /// matrix and the `scale` vector (kernel defined elsewhere). `scale` must
+ /// hold one entry per row (asserted). The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::ScaleRows(const CuVector<float>& scale)
+ {
+   Timer timer;
+   timer.Start();
+   assert(scale.Dim() == Rows());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_scale_rows(blocks, threads, mpCUData, scale.pCUData(), Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization of B = alpha * A + beta * B, where B is this
+ /// matrix. A must have the same number of rows and columns (asserted).
+ /// The work is done by the cudaF_add_scaled kernel wrapper; the launch is
+ /// checked with cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::AddScaled(float alpha, const CuMatrix<float>& A, float beta)
+ {
+   Timer timer;
+   timer.Start();
+   assert(A.Rows() == Rows());
+   assert(A.Cols() == Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_add_scaled(blocks, threads, alpha, A.pCUData(), beta, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization of B = alpha * row + beta * B, where B is this
+ /// matrix. `row` must hold one entry per column: a mismatch is reported
+ /// through Error() and additionally asserted. The work is done by the
+ /// cudaF_add_scaled_row kernel wrapper; the launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::AddScaledRow(float alpha, const CuVector<float>& row, float beta)
+ {
+   Timer timer;
+   timer.Start();
+   const bool dims_match = (row.Dim() == Cols());
+   if(!dims_match) {
+     std::ostringstream os;
+     os << "Non matching dimensions: Cols:" << Cols();
+     os << " VectorDim:" << row.Dim();
+     Error(os.str());
+   }
+   assert(row.Dim() == Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_add_scaled_row(blocks, threads, alpha, row.pCUData(), beta, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization of C = alpha * A(^T)*B(^T) + beta * C, where C is
+ /// this matrix, computed with cublasSgemm.
+ /// transa / transb request transposition of A / B when they are 'T' or 't'.
+ /// The shapes must agree with this matrix (asserted below). The call is
+ /// timed under this function's name in the CuDevice profile.
+ /// NOTE(review): the cuBLAS status is routed through cuSafeCall(), which
+ /// formats it with cudaGetErrorString(); the resulting text may not describe
+ /// the cuBLAS error - confirm.
+ template<>
+ inline void CuMatrix<float>::Gemm(char transa, char transb,
+ float alpha,
+ const CuMatrix<float>& A, const CuMatrix<float>& B,
+ float beta)
+ {
+ // CUBLAS is col major, TNet is row major
+ // keep trans..., just swap A&B arguments: A->B B->A
+ // m must equal Cols() and n must equal Rows() of this matrix; k and k1 are
+ // the inner dimension as derived from B and from A and must be equal
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+ // disabled debug print; it refers to 'C', which is not declared in this
+ // function, so it would need adapting before being enabled
+ #if 0
+ std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+ alpha << " " << B << " " << B.Stride() << " " <<
+ A << " " << A.Stride() << " " << beta << " " << C << " " <<
+ C.Stride() << "\n" << std::flush;
+ #endif
+ Timer tim; tim.Start();
+ // B is passed in the first operand slot and A in the second (see above);
+ // the strides serve as leading dimensions
+ cublasSgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+ cuSafeCall(cublasGetError());
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// float specialization of A = alpha * x*y^T + A, where A is this matrix,
+ /// computed with cublasSger.
+ /// x must hold Rows() entries and y Cols() entries (asserted). The call is
+ /// timed under this function's name in the CuDevice profile.
+ /// NOTE(review): the cuBLAS status is routed through cuSafeCall(), which
+ /// formats it with cudaGetErrorString() - confirm the text is meaningful.
+ template<>
+ inline void CuMatrix<float>::BlasGer(float alpha,
+ const CuVector<float>& x, const CuVector<float>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+ Timer tim; tim.Start();
+ // dimensions and vectors are given in swapped order (Cols/y first), both
+ // vectors with increment 1; the stride serves as leading dimension
+ cublasSger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// float specialization of the elementwise product C = A .* C, where C is
+ /// this matrix. A must match in columns, rows and stride (asserted; only
+ /// this matrix's Dim() is handed to the kernel wrapper). The launch of
+ /// cudaF_mul_elem is checked with cudaGetLastError() and timed under this
+ /// function's name.
+ template<>
+ inline void CuMatrix<float>::MulElem(const CuMatrix<float>& A)
+ {
+   Timer timer;
+   timer.Start();
+   assert(mCols == A.Cols());
+   assert(mRows == A.Rows());
+   assert(mStride == A.Stride());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_mul_elem(blocks, threads, mpCUData, A.pCUData(), Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization of A = log(A), where A is this matrix. The work
+ /// is done by the cudaF_log_elem kernel wrapper; the launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<float>::LogElem()
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaF_log_elem(blocks, threads, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ ////////////////////////////////////////////////////////////////////////
+ //// CuMatrix:: template specializations (double)
+ ////
+ /// double specialization: hands the matrix buffer, `value` and Dim() to the
+ /// cudaD_set_const kernel wrapper (defined elsewhere; presumably it assigns
+ /// `value` to every element). The launch is checked with cudaGetLastError()
+ /// and timed under this function's name in the CuDevice profile.
+ template<>
+ inline void CuMatrix<double>::SetConst(double value)
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_set_const(blocks, threads, mpCUData, value, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization: runs the cudaD_apply_log kernel wrapper on the
+ /// matrix buffer (kernel defined elsewhere). The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<double>::ApplyLog()
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_apply_log(blocks, threads, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization: runs the cudaD_scale_cols kernel wrapper with
+ /// this matrix and the `scale` vector (kernel defined elsewhere). `scale`
+ /// must hold one entry per column (asserted). The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<double>::ScaleCols(const CuVector<double>& scale)
+ {
+   Timer timer;
+   timer.Start();
+   assert(scale.Dim() == Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_scale_cols(blocks, threads, mpCUData, scale.pCUData(), Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization: runs the cudaD_scale_rows kernel wrapper with
+ /// this matrix and the `scale` vector (kernel defined elsewhere). `scale`
+ /// must hold one entry per row (asserted). The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<double>::ScaleRows(const CuVector<double>& scale)
+ {
+   Timer timer;
+   timer.Start();
+   assert(scale.Dim() == Rows());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_scale_rows(blocks, threads, mpCUData, scale.pCUData(), Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization of B = alpha * A + beta * B, where B is this
+ /// matrix. A must have the same number of rows and columns (asserted).
+ /// The work is done by the cudaD_add_scaled kernel wrapper; the launch is
+ /// checked with cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<double>::AddScaled(double alpha, const CuMatrix<double>& A, double beta)
+ {
+   Timer timer;
+   timer.Start();
+   assert(A.Rows() == Rows());
+   assert(A.Cols() == Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_add_scaled(blocks, threads, alpha, A.pCUData(), beta, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization of B = alpha * row + beta * B, where B is this
+ /// matrix. `row` must hold one entry per column. A mismatch is reported
+ /// through Error() - as in the float specialization - so that it is also
+ /// caught when assert() is compiled out, and it is additionally asserted.
+ /// The work is done by the cudaD_add_scaled_row kernel wrapper; the launch
+ /// is checked with cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<double>::AddScaledRow(double alpha, const CuVector<double>& row, double beta)
+ {
+   Timer tim; tim.Start();
+   if(row.Dim() != Cols()) {
+     std::ostringstream os;
+     os << "Non matching dimensions: Cols:" << Cols() << " VectorDim:" << row.Dim();
+     Error(os.str());
+   }
+   assert(row.Dim() == Cols());
+   dim3 dimBlock(CUBLOCK,CUBLOCK);
+   dim3 dimGrid(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(),CUBLOCK));
+   cudaD_add_scaled_row(dimGrid,dimBlock,alpha,row.pCUData(),beta,mpCUData,Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// double specialization of C = alpha * A(^T)*B(^T) + beta * C, where C is
+ /// this matrix, computed with cublasDgemm.
+ /// transa / transb request transposition of A / B when they are 'T' or 't'.
+ /// The shapes must agree with this matrix (asserted below). The call is
+ /// timed under this function's name in the CuDevice profile.
+ /// NOTE(review): the cuBLAS status is routed through cuSafeCall(), which
+ /// formats it with cudaGetErrorString(); the resulting text may not describe
+ /// the cuBLAS error - confirm.
+ template<>
+ inline void CuMatrix<double>::Gemm(char transa, char transb,
+ double alpha,
+ const CuMatrix<double>& A, const CuMatrix<double>& B,
+ double beta)
+ {
+ // CUBLAS is col major, TNet is row major
+ // keep trans..., just swap A&B arguments: A->B B->A
+ // m must equal Cols() and n must equal Rows() of this matrix; k and k1 are
+ // the inner dimension as derived from B and from A and must be equal
+ size_t m = ((transb=='T' || transb=='t')? B.Rows() : B.Cols());
+ size_t n = ((transa=='T' || transa=='t')? A.Cols() : A.Rows());
+ size_t k = ((transb=='T' || transb=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transa=='T' || transa=='t')? A.Rows() : A.Cols());
+ assert(m == Cols());
+ assert(n == Rows());
+ assert(k == k1);
+ // disabled debug print; it refers to 'C', which is not declared in this
+ // function, so it would need adapting before being enabled
+ #if 0
+ std::cout << "\n" << transb << " " << transa << " " << m << " " << n << " " << k << " " <<
+ alpha << " " << B << " " << B.Stride() << " " <<
+ A << " " << A.Stride() << " " << beta << " " << C << " " <<
+ C.Stride() << "\n" << std::flush;
+ #endif
+ Timer tim; tim.Start();
+ // B is passed in the first operand slot and A in the second (see above);
+ // the strides serve as leading dimensions
+ cublasDgemm(transb, transa, m, n, k,
+ alpha, B.pCUData(), B.Stride(), A.pCUData(), A.Stride(),
+ beta, mpCUData, Stride());
+ cuSafeCall(cublasGetError());
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// double specialization of A = alpha * x*y^T + A, where A is this matrix,
+ /// computed with cublasDger.
+ /// x must hold Rows() entries and y Cols() entries (asserted). The call is
+ /// timed under this function's name in the CuDevice profile.
+ /// NOTE(review): the cuBLAS status is routed through cuSafeCall(), which
+ /// formats it with cudaGetErrorString() - confirm the text is meaningful.
+ template<>
+ inline void CuMatrix<double>::BlasGer(double alpha,
+ const CuVector<double>& x, const CuVector<double>& y)
+ {
+ // CUBLAS is col major, TNet is row major
+ // just swap x and y
+ assert(x.Dim() == Rows());
+ assert(y.Dim() == Cols());
+ Timer tim; tim.Start();
+ // dimensions and vectors are given in swapped order (Cols/y first), both
+ // vectors with increment 1; the stride serves as leading dimension
+ cublasDger(Cols(),Rows(),alpha,y.pCUData(),1,x.pCUData(),1,mpCUData,Stride());
+ cuSafeCall(cublasGetError());
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// double specialization of the elementwise product C = A .* C, where C is
+ /// this matrix. A must match in columns, rows and stride (asserted; only
+ /// this matrix's Dim() is handed to the kernel wrapper). The launch of
+ /// cudaD_mul_elem is checked with cudaGetLastError() and timed under this
+ /// function's name.
+ template<>
+ inline void CuMatrix<double>::MulElem(const CuMatrix<double>& A)
+ {
+   Timer timer;
+   timer.Start();
+   assert(mCols == A.Cols());
+   assert(mRows == A.Rows());
+   assert(mStride == A.Stride());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_mul_elem(blocks, threads, mpCUData, A.pCUData(), Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization of A = log(A), where A is this matrix. The work
+ /// is done by the cudaD_log_elem kernel wrapper; the launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void CuMatrix<double>::LogElem()
+ {
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(Cols(), CUBLOCK), n_blocks(Rows(), CUBLOCK));
+   cudaD_log_elem(blocks, threads, mpCUData, Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
diff --git a/src/CuBaseLib/.svn/text-base/curand.h.svn-base b/src/CuBaseLib/.svn/text-base/curand.h.svn-base
new file mode 100644
index 0000000..8aa66d5
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curand.h.svn-base
@@ -0,0 +1,40 @@
+#ifndef _CU_RAND_H_
+#define _CU_RAND_H_
+#include "cumatrix.h"
+namespace TNet {
+ /// Random number source that works on CuMatrix buffers.
+ /// The generator state lives in four CuMatrix<unsigned> members (z1..z4),
+ /// seeded by SeedGpu() on construction; `tmp` is a CuMatrix<T> scratch
+ /// buffer sized to the same rows x cols.
+ template<typename T>
+ class CuRand {
+ public:
+   /// Seeds the state for matrices of rows x cols elements.
+   CuRand(size_t rows, size_t cols)
+   {
+     SeedGpu(rows,cols);
+   }
+   ~CuRand()
+   { }
+   /// (Re)seeds z1..z4 from host-side random values and sizes `tmp`.
+   void SeedGpu(size_t rows, size_t cols);
+   /// Fills tgt through the *_rand kernel wrapper (float/double only).
+   void Rand(CuMatrix<T>& tgt);
+   /// Fills tgt through the *_gauss_rand kernel wrapper (float/double only).
+   void GaussRand(CuMatrix<T>& tgt);
+   /// Fills states through the *_binarize_probs kernel wrapper, using probs
+   /// and freshly drawn random numbers (float/double only).
+   void BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states);
+   /// Draws GaussRand() values into `tmp` and adds them, scaled by gscale,
+   /// to tgt.
+   void AddGaussNoise(CuMatrix<T>& tgt, T gscale = 1.0);
+ private:
+   /// Fills a host matrix with seed values.
+   static void SeedRandom(Matrix<unsigned>& mat);
+   CuMatrix<unsigned> z1, z2, z3, z4;
+   CuMatrix<T> tmp;
+ };
+#include "curand.tcc"
diff --git a/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base
new file mode 100644
index 0000000..e337189
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curand.tcc.svn-base
@@ -0,0 +1,228 @@
+#include <cstdlib>
+#include "curandkernels.h"
+namespace TNet {
+ /// Seeds the generator state for a rows x cols problem.
+ /// For each of z1, z2, z3 and z4 (in that order) a host matrix is refilled
+ /// by SeedRandom() and uploaded with CopyFrom(); finally the scratch matrix
+ /// `tmp` is initialised to rows x cols.
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedGpu(size_t rows, size_t cols)
+ {
+   Matrix<unsigned> seeds(rows,cols);
+   CuMatrix<unsigned>* const states[4] = { &z1, &z2, &z3, &z4 };
+   for(int s=0; s<4; s++) {
+     SeedRandom(seeds);           // fresh host-side values for every state matrix
+     states[s]->CopyFrom(seeds);  // upload them to the device
+   }
+   tmp.Init(rows,cols);
+ }
+ /// Fills every element of `mat` with a value obtained from lrand48().
+ /// Draws that are not greater than 128 are rejected and repeated.
+ template<typename T>
+ inline void
+ CuRand<T>::
+ SeedRandom(Matrix<unsigned>& mat) {
+   for(size_t r=0; r<mat.Rows(); r++) {
+     for(size_t c=0; c<mat.Cols(); c++) {
+       unsigned seed;
+       do {
+         seed = lrand48();
+       } while(seed <= 128);
+       mat(r,c) = seed;
+     }
+   }
+ }
+ /// Adds noise to tgt: GaussRand() fills the scratch matrix `tmp`, which is
+ /// then accumulated as tgt = gscale * tmp + 1.0 * tgt via AddScaled().
+ template<typename T>
+ inline void
+ CuRand<T>::
+ AddGaussNoise(CuMatrix<T>& tgt, T gscale)
+ {
+   GaussRand(tmp);
+   const T keep_target = 1.0;  // beta: tgt itself is kept unscaled
+   tgt.AddScaled(gscale, tmp, keep_target);
+ }
+ ////////////////////////////////////////////////////////////////////////////
+ //// invalid general wrappers over CUDA kernels
+ /// Generic fallback: there is no kernel wrapper for this element type, so
+ /// the call is rejected through Error(). float and double are specialized.
+ template<typename T>
+ inline void
+ CuRand<T>::
+ Rand(CuMatrix<T>& tgt)
+ {
+   Error("Unimplemented");
+ }
+ /// Generic fallback: there is no kernel wrapper for this element type, so
+ /// the call is rejected through Error(). float and double are specialized.
+ template<typename T>
+ inline void
+ CuRand<T>::
+ GaussRand(CuMatrix<T>& tgt)
+ {
+   Error("Unimplemented");
+ }
+ /// Generic fallback: there is no kernel wrapper for this element type, so
+ /// the call is rejected through Error(). float and double are specialized.
+ template<typename T>
+ inline void
+ CuRand<T>::
+ BinarizeProbs(const CuMatrix<T>& probs, CuMatrix<T>& states)
+ {
+   Error("Unimplemented");
+ }
+ //////////////////////////////////////////////////////////////////////////
+ //// float specializations
+ /// float specialization: initialises tgt to the shape of the state matrix
+ /// z1 and fills it through the cudaF_rand kernel wrapper, which also
+ /// receives the four state buffers z1..z4. The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void
+ CuRand<float>::
+ Rand(CuMatrix<float>& tgt)
+ {
+   Timer timer;
+   timer.Start();
+   tgt.Init(z1.Rows(), z1.Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(), CUBLOCK));
+   cudaF_rand(blocks, threads, tgt.pCUData(),
+              z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),
+              tgt.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: initialises tgt to the shape of the state matrix
+ /// z1 and fills it through the cudaF_gauss_rand kernel wrapper, which also
+ /// receives the four state buffers z1..z4. The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void
+ CuRand<float>::
+ GaussRand(CuMatrix<float>& tgt)
+ {
+   Timer timer;
+   timer.Start();
+   tgt.Init(z1.Rows(), z1.Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(), CUBLOCK));
+   cudaF_gauss_rand(blocks, threads, tgt.pCUData(),
+                    z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),
+                    tgt.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// float specialization: `probs` must have the shape of the state matrix z1
+ /// (otherwise Error() is called). `states` is initialised to that shape,
+ /// Rand() refills the scratch matrix `tmp`, and the cudaF_binarize_probs
+ /// kernel wrapper is run on states, probs and tmp. Only the kernel launch
+ /// is timed (under this function's name); Rand() accounts for itself.
+ template<>
+ inline void
+ CuRand<float>::
+ BinarizeProbs(const CuMatrix<float>& probs, CuMatrix<float>& states)
+ {
+   const bool dims_match = (probs.Rows() == z1.Rows()) && (probs.Cols() == z1.Cols());
+   if(!dims_match) {
+     Error("Non matching dims!!");
+   }
+   states.Init(z1.Rows(),z1.Cols());
+   Rand(tmp);
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(), CUBLOCK));
+   cudaF_binarize_probs(blocks, threads, states.pCUData(), probs.pCUData(), tmp.pCUData(), states.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ //////////////////////////////////////////////////////////////////////////
+ //// double specializations
+ /// double specialization: initialises tgt to the shape of the state matrix
+ /// z1 and fills it through the cudaD_rand kernel wrapper, which also
+ /// receives the four state buffers z1..z4. The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void
+ CuRand<double>::
+ Rand(CuMatrix<double>& tgt)
+ {
+   Timer timer;
+   timer.Start();
+   tgt.Init(z1.Rows(), z1.Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(), CUBLOCK));
+   cudaD_rand(blocks, threads, tgt.pCUData(),
+              z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),
+              tgt.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization: initialises tgt to the shape of the state matrix
+ /// z1 and fills it through the cudaD_gauss_rand kernel wrapper, which also
+ /// receives the four state buffers z1..z4. The launch is checked with
+ /// cudaGetLastError() and timed under this function's name.
+ template<>
+ inline void
+ CuRand<double>::
+ GaussRand(CuMatrix<double>& tgt)
+ {
+   Timer timer;
+   timer.Start();
+   tgt.Init(z1.Rows(), z1.Cols());
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(tgt.Cols(), CUBLOCK), n_blocks(tgt.Rows(), CUBLOCK));
+   cudaD_gauss_rand(blocks, threads, tgt.pCUData(),
+                    z1.pCUData(), z2.pCUData(), z3.pCUData(), z4.pCUData(),
+                    tgt.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
+ /// double specialization: `probs` must have the shape of the state matrix
+ /// z1 (otherwise Error() is called). `states` is initialised to that shape,
+ /// Rand() refills the scratch matrix `tmp`, and the cudaD_binarize_probs
+ /// kernel wrapper is run on states, probs and tmp. Only the kernel launch
+ /// is timed (under this function's name); Rand() accounts for itself.
+ template<>
+ inline void
+ CuRand<double>::
+ BinarizeProbs(const CuMatrix<double>& probs, CuMatrix<double>& states)
+ {
+   const bool dims_match = (probs.Rows() == z1.Rows()) && (probs.Cols() == z1.Cols());
+   if(!dims_match) {
+     Error("Non matching dims!!");
+   }
+   states.Init(z1.Rows(),z1.Cols());
+   Rand(tmp);
+   Timer timer;
+   timer.Start();
+   const dim3 threads(CUBLOCK, CUBLOCK);
+   const dim3 blocks(n_blocks(z1.Cols(), CUBLOCK), n_blocks(z1.Rows(), CUBLOCK));
+   cudaD_binarize_probs(blocks, threads, states.pCUData(), probs.pCUData(), tmp.pCUData(), states.Dim());
+   cuSafeCall(cudaGetLastError());
+   timer.End();
+   CuDevice::Instantiate().AccuProfile(__func__, timer.Val());
+ }
diff --git a/src/CuBaseLib/.svn/text-base/ b/src/CuBaseLib/.svn/text-base/
new file mode 100644
index 0000000..7e1c8dd
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/
@@ -0,0 +1,135 @@
+#include "curandkernels.h"
+//Hybrid Taus/LCG random number generator
+// S1, S2, S3, and M are all constants, and z is part of the
+// private per-thread generator state.
+// One step of a shift/xor component generator: z is taken by reference,
+// overwritten with the new state, and that new state is also returned.
+// NOTE(review): the braces around the body and any qualifier line preceding
+// the signature appear to be missing from this listing - confirm against
+// the original file.
+static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M)
+ unsigned b=(((z << S1) ^ z) >> S2);
+ return z = (((z & M) << S3) ^ b);
+// A and C are constants
+// One linear congruential step: z is taken by reference, replaced by
+// A*z+C (unsigned arithmetic) and the new value is returned.
+// NOTE(review): the braces around the body appear to be missing from this
+// listing - confirm against the original file.
+static unsigned LCGStep(unsigned &z, unsigned A, unsigned C)
+ return z=(A*z+C);
+// Draws one random number of type T from the four-component state.
+// The outputs of three TausStep() calls and one LCGStep() call are combined
+// with XOR and multiplied by 2.3283064365387e-10; the draw is repeated
+// until the result lies strictly between 0 and 1. All four states are
+// advanced in place (they are passed by reference).
+// NOTE(review): the braces around the body appear to be missing from this
+// listing - confirm against the original file.
+template<typename T>
+static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+ // Combined period is lcm(p1,p2,p3,p4)~ 2^121
+ T randval;
+ do {
+ randval = 2.3283064365387e-10 * ( // Periods
+ TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1
+ TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1
+ TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1
+ LCGStep(z4, 1664525, 1013904223UL) // p4=2^32
+ );
+ } while (!(randval > 0.0 && randval < 1.0));
+ return randval;
+// Kernel body: the thread at column i / row j writes one HybridTaus() value
+// to mat[i + j*d.stride]; threads outside d.cols x d.rows do nothing.
+// The same index is used for mat and for the state arrays z1..z4.
+// NOTE(review): this presumes the state matrices share mat's stride (in
+// elements) - confirm, in particular for T=double.
+// NOTE(review): the braces and the kernel qualifier appear to be missing
+// from this listing - confirm against the original file.
+template<typename T>
+static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
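+// Reference sketch of the Box-Muller transform (not compiled; the comment
+// markers around it appear to have been lost in this listing):
+//   float2 BoxMuller()
+//     float u0=HybridTaus (), u1=HybridTaus ();
+//     float r=sqrt(-2 log(u0));
+//     float theta=2*PI*u1;
+//     return make_float2(r*sin(theta),r*cos(theta));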
+// Draws two HybridTaus() values u0 and u1 and returns
+// sqrt(-2*log(u0)) * sin(2*pi*u1); only the sine branch of the transform is
+// produced. The four states are advanced in place by the two draws.
+// NOTE(review): the braces around the body appear to be missing from this
+// listing - confirm against the original file.
+template<typename T>
+static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)
+ const T M_2PI = 6.283185307179586476925286766558;
+ T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4);
+ T r = sqrt(-2.0 * log(u0));
+ T theta = M_2PI * u1;
+ return r*sin(theta);
+// Kernel body: the thread at column i / row j writes one BoxMuller() value
+// to mat[i + j*d.stride]; threads outside d.cols x d.rows do nothing.
+// The same index is used for mat and for the state arrays z1..z4.
+// NOTE(review): this presumes the state matrices share mat's stride (in
+// elements) - confirm, in particular for T=double.
+// NOTE(review): the braces and the kernel qualifier appear to be missing
+// from this listing - confirm against the original file.
+template<typename T>
+static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]);
+ }
+// Kernel body: the thread at column i / row j sets states[index] to 1.0
+// when probs[index] is greater than rand[index] and to 0.0 otherwise, with
+// index = i + j*d.stride; threads outside d.cols x d.rows do nothing.
+// All three arrays are addressed with the same index and stride.
+// NOTE(review): the braces and the kernel qualifier appear to be missing
+// from this listing - confirm against the original file.
+template<typename T>
+static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d)
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ states[index] = ((probs[index] > rand[index])? 1.0 : 0.0);
+ }
+/*
+ * :FLOAT:
+ */
+// Host-side launcher (float): starts the _rand kernel with grid Gr and
+// block Bl, forwarding all arguments unchanged.
+void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+  _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
+}
+// Host-side launcher (float): starts the _gauss_rand kernel with grid Gr
+// and block Bl, forwarding all arguments unchanged.
+void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+  _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
+}
+// Host-side launcher (float): starts the _binarize_probs kernel with grid
+// Gr and block Bl, forwarding all arguments unchanged.
+void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d)
+{
+  _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d);
+}
+/*
+ * :DOUBLE:
+ */
+// Host-side launcher (double): starts the _rand kernel with grid Gr and
+// block Bl, forwarding all arguments unchanged.
+void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+  _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
+}
+// Host-side launcher (double): starts the _gauss_rand kernel with grid Gr
+// and block Bl, forwarding all arguments unchanged.
+void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
+{
+  _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d);
+}
+// Host-side launcher (double): starts the _binarize_probs kernel with grid
+// Gr and block Bl, forwarding all arguments unchanged.
+void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d)
+{
+  _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d);
+}
diff --git a/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base
new file mode 100644
index 0000000..69b589f
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/curandkernels.h.svn-base
@@ -0,0 +1,34 @@
+#ifndef _cuda_rand_kernels_h_
+#define _cuda_rand_kernels_h_
+#include "cukernels.h"
+extern "C" {
+ //**************
+ //float
+ //
+ void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+ void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+ void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d);
+ //**************
+ //double
+ //
+ void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+ void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d);
+ void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d);
diff --git a/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base
new file mode 100644
index 0000000..945565a
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cuvector.h.svn-base
@@ -0,0 +1,104 @@
+#ifndef _CUVECTOR_H_
+#define _CUVECTOR_H_
+#include <string>
+#include "Vector.h"
+namespace TNet {
+ template<typename _ElemT> class CuMatrix;
+ /**
+  * Vector for CUDA computing
+  *
+  * Holds a linear device buffer of mDim elements that is allocated by
+  * Init() and released by Destroy() / the destructor.
+  * NOTE(review): no copy constructor or assignment operator is declared
+  * here, so copying an instance would duplicate the raw device pointer -
+  * confirm that instances are never copied.
+  */
+ template<typename _ElemT>
+ class CuVector
+ {
+   typedef CuVector<_ElemT> ThisType;
+ public:
+   /// Default Constructor
+   CuVector<_ElemT>()
+     : mDim(0), mpCUData(NULL)
+   { }
+   /// Constructor with memory initialisation
+   CuVector<_ElemT>(size_t dim)
+     : mDim(0), mpCUData(NULL)
+   { Init(dim); }
+   /// Destructor
+   ~CuVector()
+   { Destroy(); }
+   /// Dimensions
+   size_t Dim() const
+   { return mDim; }
+   /*
+   ::MatrixDim Dim() const
+   { ::MatrixDim d = { mDim, 1, 1 }; return d; }
+   */
+   /// Get raw pointer
+   const _ElemT* pCUData() const
+   { return mpCUData; }
+   _ElemT* pCUData()
+   { return mpCUData; }
+   /// Allocate the memory
+   ThisType& Init(size_t dim);
+   /// Deallocate the memory
+   void Destroy();
+   /// Copy functions (reallocates when needed)
+   ThisType& CopyFrom(const CuVector<_ElemT>& rSrc);
+   ThisType& CopyFrom(const Vector<_ElemT>& rSrc);
+   Vector<_ElemT>& CopyTo(Vector<_ElemT>& rDst) const;
+   // Math operations
+   //
+   void SetZero();
+   // The three stubs below are generic fallbacks: they report the missing
+   // implementation through Error(). __func__ is concatenated at run time
+   // because it is not expanded inside a string literal.
+   void SetConst(_ElemT value)
+   { Error(std::string(__func__) + " Not implemented"); }
+   void AddScaled(_ElemT alpha, const CuVector<_ElemT>& vec, _ElemT beta)
+   { Error(std::string(__func__) + " Not implemented"); }
+   void AddColSum(_ElemT alpha, const CuMatrix<_ElemT>& mat, _ElemT beta)
+   { Error(std::string(__func__) + " Not implemented"); }
+   /// Copies the vector to the host and prints it to std::cout
+   void Print() const
+   {
+     Vector<_ElemT> vec(Dim());
+     CopyTo(vec);
+     std::cout << vec << "\n";
+   }
+ private:
+   size_t mDim;       ///< number of elements
+   _ElemT* mpCUData;  ///< device buffer, NULL when nothing is allocated
+ };
+ /// Writes a one-line summary of the vector (its dimension and the device
+ /// pointer) to the stream and flushes it. Element values are not printed.
+ template<typename _ElemT>
+ inline std::ostream& operator << (std::ostream& out, const CuVector<_ElemT>& vec)
+ {
+   out << "[CuVector D" << vec.Dim();
+   out << " PTR" << vec.pCUData();
+   out << "]" << std::flush;
+   return out;
+ }
+#include "cuvector.tcc"
diff --git a/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base
new file mode 100644
index 0000000..0107859
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cuvector.tcc.svn-base
@@ -0,0 +1,254 @@
+#include <cuda_runtime_api.h>
+#include "Timer.h"
+#include "cucommon.h"
+#include "cumatrix.h"
+#include "cudevice.h"
+namespace TNet {
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Resize the vector to 'dim' elements.
+ /// When the size already matches, the buffer is left untouched (not zeroed);
+ /// otherwise the old buffer is released and a new zero-filled one is allocated.
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ Init(size_t dim)
+ {
+   if(dim != mDim) {
+     Destroy();
+     cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT)));
+     mDim = dim;
+     SetZero();
+   }
+   return *this;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Release the buffer (if any) with cudaFree and reset the vector to empty.
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ Destroy()
+ {
+   if(mpCUData != NULL) {
+     cuSafeCall(cudaFree(mpCUData));
+     mpCUData = NULL;
+   }
+   // The dimension is cleared even when nothing was allocated
+   mDim = 0;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Copy the contents of another CuVector (device-to-device).
+ /// The destination is resized through Init() to the source dimension first.
+ /// Copying a vector onto itself is a no-op.
+ /// Returns a reference to this vector.
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const CuVector<_ElemT>& rSrc)
+ {
+   // Avoid issuing a cudaMemcpy whose source and destination are the same buffer
+   if(this == &rSrc) {
+     return *this;
+   }
+   Init(rSrc.Dim());
+   Timer tim; tim.Start();
+   cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice));
+   tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val());
+   return *this;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Copy the contents of a host Vector into this vector (host-to-device).
+ /// The destination is resized through Init() to the source dimension first.
+ /// Returns a reference to this vector.
+ template<typename _ElemT>
+ CuVector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyFrom(const Vector<_ElemT>& rSrc)
+ {
+   Init(rSrc.Dim());
+   Timer tim;
+   tim.Start();
+   const size_t nBytes = rSrc.Dim()*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), nBytes, cudaMemcpyHostToDevice));
+   tim.End();
+   CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val());
+   return *this;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Copy this vector into a host Vector (device-to-host).
+ /// rDst is re-initialised through its Init() when its dimension differs.
+ /// Returns a reference to rDst.
+ template<typename _ElemT>
+ Vector<_ElemT>&
+ CuVector<_ElemT>::
+ CopyTo(Vector<_ElemT>& rDst) const
+ {
+   const bool sameSize = (rDst.Dim() == mDim);
+   if(!sameSize) {
+     rDst.Init(mDim);
+   }
+   Timer tim;
+   tim.Start();
+   const size_t nBytes = mDim*sizeof(_ElemT);
+   cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), nBytes, cudaMemcpyDeviceToHost));
+   tim.End();
+   CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val());
+   return rDst;
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ /// Zero all bytes of the buffer with cudaMemset and record the time taken.
+ template<typename _ElemT>
+ void
+ CuVector<_ElemT>::
+ SetZero()
+ {
+   Timer tim;
+   tim.Start();
+   const size_t nBytes = mDim*sizeof(_ElemT);
+   cuSafeCall(cudaMemset(mpCUData, 0, nBytes));
+   tim.End();
+   CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val());
+ }
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////
+ //// CuVector:: template specializations (float)
+ ////
+ /// float specialization: launches cudaF_set_const on the buffer with 'value',
+ /// checks for a launch error and records the time taken.
+ template<>
+ inline void CuVector<float>::SetConst(float value)
+ {
+   Timer tim;
+   tim.Start();
+   // The vector is described to the kernel as { 1, Dim(), Dim() };
+   // the field meaning is defined by ::MatrixDim (declared elsewhere)
+   ::MatrixDim vecDim = { 1, Dim(), Dim() };
+   dim3 blockSize(CUBLOCK);
+   dim3 gridSize(n_blocks(Dim(), CUBLOCK));
+   cudaF_set_const(gridSize,blockSize,mpCUData,value,vecDim);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// float specialization: launches cudaF_add_scaled with alpha, the data of
+ /// 'vec', beta and this buffer; 'vec' must have the same dimension as this vector.
+ template<>
+ inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta)
+ {
+   Timer tim;
+   tim.Start();
+   assert(vec.Dim() == Dim());
+   // The vector is described to the kernel as { 1, Dim(), Dim() };
+   // the field meaning is defined by ::MatrixDim (declared elsewhere)
+   ::MatrixDim vecDim = { 1, Dim(), Dim() };
+   dim3 blockSize(CUBLOCK);
+   dim3 gridSize(n_blocks(Dim(), CUBLOCK));
+   cudaF_add_scaled(gridSize,blockSize,alpha,vec.pCUData(),beta,mpCUData,vecDim);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// float specialization: launches one of two column-sum kernels on 'mat'
+ /// (cudaF_add_col_sum_reduce for small matrices, cudaF_add_col_sum otherwise);
+ /// mat.Cols() must equal the dimension of this vector.
+ template<>
+ inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta)
+ {
+   Timer tim;
+   tim.Start();
+   assert(mat.Cols() == Dim());
+   /**
+    * The reduce kernel is used only when
+    *   Rows()<=512 (limit due to limited shared memory) and
+    *   Cols()<=256 (limit due to coalesced memory alignment:
+    *   matrices with huge strides have slow access!!!)
+    */
+   const bool fitsReduce = (mat.Rows() <= 512) && (mat.Cols() <= 256);
+   if(fitsReduce) {
+     dim3 blockSize(mat.Rows(),1);
+     dim3 gridSize(1,Dim());
+     cudaF_add_col_sum_reduce(gridSize,blockSize,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+   } else {
+     size_t blockSize = CUBLOCK*2;
+     size_t gridSize = n_blocks(Dim(),CUBLOCK*2);
+     cudaF_add_col_sum(gridSize,blockSize,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+   }
+   // Both paths check for a launch error right after the kernel call
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ ////////////////////////////////////////////////////////////////////////
+ //// CuVector:: template specializations (double)
+ ////
+ /// double specialization: launches cudaD_set_const on the buffer with 'value',
+ /// checks for a launch error and records the time taken.
+ template<>
+ inline void CuVector<double>::SetConst(double value)
+ {
+   Timer tim;
+   tim.Start();
+   // The vector is described to the kernel as { 1, Dim(), Dim() };
+   // the field meaning is defined by ::MatrixDim (declared elsewhere)
+   ::MatrixDim vecDim = { 1, Dim(), Dim() };
+   dim3 blockSize(CUBLOCK);
+   dim3 gridSize(n_blocks(Dim(), CUBLOCK));
+   cudaD_set_const(gridSize,blockSize,mpCUData,value,vecDim);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// double specialization: launches cudaD_add_scaled with alpha, the data of
+ /// 'vec', beta and this buffer; 'vec' must have the same dimension as this vector.
+ template<>
+ inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta)
+ {
+   Timer tim;
+   tim.Start();
+   assert(vec.Dim() == Dim());
+   // The vector is described to the kernel as { 1, Dim(), Dim() };
+   // the field meaning is defined by ::MatrixDim (declared elsewhere)
+   ::MatrixDim vecDim = { 1, Dim(), Dim() };
+   dim3 blockSize(CUBLOCK);
+   dim3 gridSize(n_blocks(Dim(), CUBLOCK));
+   cudaD_add_scaled(gridSize,blockSize,alpha,vec.pCUData(),beta,mpCUData,vecDim);
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ /// double specialization: launches cudaD_add_col_sum on 'mat';
+ /// mat.Cols() must equal the dimension of this vector.
+ template<>
+ inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta)
+ {
+   Timer tim;
+   tim.Start();
+   assert(mat.Cols() == Dim());
+   size_t blockSize = CUBLOCK*2;
+   size_t gridSize = n_blocks(Dim(),CUBLOCK*2);
+   cudaD_add_col_sum(gridSize,blockSize,alpha,mat.pCUData(),beta,mpCUData,mat.Dim());
+   cuSafeCall(cudaGetLastError());
+   tim.End();
+   CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }