Diffstat (limited to 'src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base')
-rw-r--r--  src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base  626
1 file changed, 626 insertions, 0 deletions
diff --git a/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base
new file mode 100644
index 0000000..d6f866d
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cukernels.cu.svn-base
@@ -0,0 +1,626 @@
+
+#include <cfloat>
+#include "cukernels.h"
+
+//forward declarations of the block-reduce helpers defined further down;
+//_sum_reduce is called by _add_col_sum_reduce before its definition
+template<typename T> __device__ static T _sum_reduce(T buffer[]);
+template<typename T> __device__ static T _max_reduce(T buffer[]);
+
+
+
+/*****************
+ * CUDA kernels
+ */
+//CuMatrix
+template<typename T>
+__global__
+static void _set_const(T* mat, T value, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = value;
+}
+
+
+
+template<typename T>
+__global__
+static void _apply_log(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = log(mat[index]);
+}
+
+
+template<typename T>
+__global__
+static void _apply_mask(T* mat, const float* mask, MatrixDim dmat, MatrixDim dmask) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*dmat.stride;
+ int index2 = i + j*dmask.stride;
+ if ( i < dmat.cols && j < dmat.rows )
+ if(mask[index2] == 0) mat[index] = 0;
+}
+
+
+template<typename T>
+__global__
+static void _apply_l1(T* mat, T l1, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ T value = mat[index];
+ T tgt;
+ if(fabs(value) < l1) { //fabs, not abs: plain abs could resolve to the integer overload
+ tgt = 0;
+ } else {
+ tgt = (value > 0?value-l1:value+l1);
+ }
+ mat[index] = tgt;
+ }
+}
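+/*
+ * Note: the kernel above is the soft-thresholding (shrinkage) operator
+ * f(v) = sign(v) * max(|v| - l1, 0),
+ * i.e. the element-wise proximal step of L1 regularization.
+ */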
+
+
+template<typename T>
+__global__
+static void _scale_cols(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[i];
+}
+
+
+template<typename T>
+__global__
+static void _scale_rows(T* mat, const T* scale, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] *= scale[j];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled(T alpha, const T* A, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*A[index] + beta*dst[index];
+}
+
+
+template<typename T>
+__global__
+static void _add_scaled_row(T alpha, const T* row, T beta, T* dst, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+
+#if 0
+ //this does not accelerate :(
+ __shared__ T aux[16];
+ if(threadIdx.y == 0 && i < d.cols) aux[threadIdx.x] = row[i];
+ __syncthreads();
+
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*aux[threadIdx.x] + beta*dst[index];
+#else
+ if ( i < d.cols && j < d.rows )
+ dst[index] = alpha*row[i] + beta*dst[index];
+#endif
+}
+
+
+template<typename T>
+__global__
+static void _mul_elem(T* mat, const T* A, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows )
+ mat[index] = mat[index] * A[index];
+}
+
+
+template<typename T>
+__global__
+static void _log_elem(T* mat, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if ( i < d.cols && j < d.rows ) {
+ //floor at FLT_MIN so log() never sees a zero or negative input
+ if(mat[index] < FLT_MIN) mat[index] = FLT_MIN;
+ mat[index] = log(mat[index]);
+ }
+}
+
+
+
+
+//CuVector
+template<typename T>
+__global__
+static void _add_col_sum(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ //This should be called 1-D
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j > 0) return;
+
+ if(i < d.cols) {
+ double sum = 0.0; //accumulate in double even for T=float, for accuracy over long columns
+ for(int k = 0; k < d.rows; k++) {
+ sum += mat[i+k*d.stride];
+ }
+ vec[i] = alpha*sum + beta*vec[i];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _add_col_sum_reduce(T alpha, const T* mat, T beta, T* vec, MatrixDim d) {
+
+ //flipped x,y for reducing... x..row, y..col
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+ //expects one block per column: blockDim.x == d.rows (<= 512), blockDim.y == 1
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ //copy the column to shared mem
+ __shared__ T aux[512];
+ aux[threadIdx.x] = mat[i+j*d.stride];
+ __syncthreads();
+
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+ //copy out the result (one write per column suffices)
+ if(threadIdx.x == 0) vec[i] = alpha*sum + beta*vec[i];
+}
+
+
+
+//CuMath
+template<typename T>
+__global__
+static void _sigmoid(T*y, const T*x, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows ) {
+ T res = 1.0 / (1.0 + exp(-x[index]));
+ /*
+ if(res < 0.001) res = 0.001;
+ if(res > 0.999) res = 0.999;
+ */
+ y[index] = res;
+ }
+}
+
+
+template<typename T>
+__global__
+static void _diff_sigmoid(T*eout, const T*e, const T*y, MatrixDim d) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d.stride;
+ if( i < d.cols && j < d.rows )
+ eout[index] = y[index]*(1.0-y[index]) * e[index];
+}
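+/*
+ * Note: for y = sigmoid(x), dy/dx = y*(1-y), so the kernel above performs
+ * the usual backprop step: it scales the incoming error e by the local
+ * derivative recomputed from the stored forward output y.
+ */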
+
+
+template<typename T>
+__global__
+static void _softmax(T*y, const T*x, MatrixDim d) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ if(j >= d.rows) return;
+
+ //copy to output and find max...
+ double max = -1e20;
+ double sum = 0.0;
+ for(int i=0; i<d.cols; i++) {
+ if(max < x[i+j*d.stride]) max = x[i+j*d.stride];
+ y[i+j*d.stride] = x[i+j*d.stride];
+ }
+ //subtract max, apply exp, sum up...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] = exp(y[i+j*d.stride] - max);
+ sum += y[i+j*d.stride];
+ }
+ //normalize by sum...
+ for(int i=0; i<d.cols; i++) {
+ y[i+j*d.stride] /= sum;
+ }
+}
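+/*
+ * Note: subtracting the row maximum before exp() is the standard trick to
+ * avoid overflow; it leaves the result unchanged, since
+ * exp(x_i - m) / sum_k exp(x_k - m) == exp(x_i) / sum_k exp(x_k).
+ */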
+
+
+
+
+template<typename T>
+__device__
+static T _max_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
+
+
+
+
+template<typename T>
+__device__
+static T _sum_reduce(T buffer[]) {
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = 0.0;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = buffer[threadIdx.x + halfPoint];
+ }
+ buffer[threadIdx.x] += temp;
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return buffer[0];
+}
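+/*
+ * Note: both reductions above use halfPoint = (n+1)/2, so odd active counts
+ * work too: e.g. with n = 5, thread 0 folds in thread 3, thread 1 folds in
+ * thread 4, thread 2 has no partner and keeps its value; n then shrinks
+ * to 3, 2 and finally 1.
+ */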
+
+
+
+template<typename T>
+__global__
+static void _softmax_reduce(T*y, const T*x, MatrixDim d) {
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ //expects one block per row: blockDim.x == d.cols (<= 256), blockDim.y == 1,
+ //so below i is the same as threadIdx.x
+ if(blockIdx.x > 0) return;
+ if(blockDim.y > 1) return;
+
+ __shared__ T row_data[256];
+ __shared__ T aux[256];
+
+ //copy the input row to row_data
+ row_data[i] = x[i+j*d.stride];
+ __syncthreads();
+
+ //copy input to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the maximum value
+ T max = _max_reduce(aux);
+ __syncthreads();
+
+ //calculate exp(data-max)
+ row_data[i] = exp(row_data[i]-max);
+
+ //copy the values to aux
+ aux[i] = row_data[i];
+ __syncthreads();
+ //get the sum
+ T sum = _sum_reduce(aux);
+ __syncthreads();
+
+ //divide the values
+ row_data[i] /= sum;
+ //copy out
+ y[i+j*d.stride] = row_data[i];
+
+}
+
+
+
+template<typename T>
+__global__
+static void _expand(T* y, const T* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = i % d_in.cols;
+ int src_row = j + off[i / d_in.cols];
+ if(src_row < 0) src_row = 0;
+ if(src_row >= d_in.rows) src_row = d_in.rows-1;
+ y[index] = x[src_col + src_row*d_in.stride];
+ }
+}
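+/*
+ * Illustrative only: with a hypothetical offset table off[] = {-2,-1,0,1,2}
+ * (not part of this file), each output row j becomes the concatenation of
+ * input rows j-2..j+2, clamped at the matrix edges; this is the usual frame
+ * splicing that adds temporal context to feature rows.
+ */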
+
+
+template<typename T>
+__global__
+static void _rearrange(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_col = copy_from[i];
+ if(src_col >= 0 && src_col < d_in.cols) {
+ y[index] = x[src_col + j*d_in.stride];
+ } else {
+ y[index] = 1.0/0.0; //deliberately poison out-of-range targets with +inf
+ }
+ }
+}
+
+
+template<typename T>
+__global__
+static void _randomize(T* y, const T* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int index = i + j*d_out.stride;
+ if( i < d_out.cols && j < d_out.rows ) {
+ int src_row = copy_from[j]; //assumed to hold valid row indices into x
+ y[index] = x[i + src_row*d_in.stride];
+ }
+}
+
+
+template<typename T>
+__global__
+static void _check_class(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ if(j>0) return; //1-D kernel: one thread per row
+
+ if(i<d.rows) {
+ //distinct initial ids guarantee a mismatch when no element exceeds -1e20
+ int out_id = -1, des_id = -2;
+ T out_max = -1e20, des_max = -1e20;
+
+ for(int k=0; k<d.cols; k++) {
+ T val = out[k + i*d.stride];
+ if(val > out_max) { out_max = val; out_id = k; }
+ }
+ for(int k=0; k<d.cols; k++) {
+ T val = des[k + i*d.stride];
+ if(val > des_max) { des_max = val; des_id = k; }
+ }
+
+ match[i] = ((out_id == des_id)?1:0);
+ }
+}
+
+
+template<typename T>
+__device__
+static int _max_id_reduce(T val[], int idx[]) {
+ //argmax tree-reduce: idx[0] ends up holding the index of the largest val[]
+
+ // Total number of active threads
+ int nTotalThreads = blockDim.x;
+ __syncthreads();
+
+ while(nTotalThreads > 1) {
+ int halfPoint = ((1+nTotalThreads) >> 1); // divide by two
+ // only the first half of the threads will be active.
+ if (threadIdx.x < halfPoint) {
+ // Get the shared value stored by another thread
+ T temp = -1e20;
+ if(threadIdx.x+halfPoint < nTotalThreads) {
+ temp = val[idx[threadIdx.x + halfPoint]];
+ }
+ if (temp > val[idx[threadIdx.x]]) idx[threadIdx.x]=idx[threadIdx.x + halfPoint];
+ }
+ __syncthreads();
+ nTotalThreads = ((1+nTotalThreads) >> 1); // divide by two.
+ }
+ // the result
+ return idx[0];
+}
+
+
+
+
+
+
+template<typename T>
+__global__
+static void _check_class_reduce(const T* out, const T* des, int* match, MatrixDim d)
+{
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+ //expects one block per row: blockDim.x == d.cols (<= 256), blockDim.y == 1
+ if(blockIdx.x > 0) return;
+ if(blockDim.y != 1) return;
+
+ __shared__ T value[256];
+ __shared__ int index[256];
+
+ value[threadIdx.x] = out[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int out_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ value[threadIdx.x] = des[i+j*d.stride];
+ index[threadIdx.x] = threadIdx.x;
+ __syncthreads();
+
+ int des_max = _max_id_reduce(value,index);
+ __syncthreads();
+
+ if(threadIdx.x == 0) {
+ match[j] = ((out_max == des_max)?1:0);
+ }
+}
+
+
+
+
+/**************
+ * C wrappers around CUDA kernels
+ */
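+/*
+ * Illustrative only (not from the original file): a hypothetical host-side
+ * caller would size the 2-D launches so that x covers columns and y covers
+ * rows, e.g.
+ *
+ *   dim3 Bl(16, 16);
+ *   dim3 Gr((d.cols + Bl.x - 1) / Bl.x, (d.rows + Bl.y - 1) / Bl.y);
+ *   cudaF_set_const(Gr, Bl, mat, 0.0f, d);
+ *
+ * The *_reduce wrappers instead expect one block per row (or column), as
+ * noted at the kernel definitions above.
+ */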
+//:FLOAT:
+//CuMatrix
+void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const float* mask, MatrixDim dmat, MatrixDim dmask)
+{ _apply_mask<<<Gr,Bl>>>(mat,mask,dmat,dmask); }
+
+void cudaF_apply_l1(dim3 Gr, dim3 Bl, float* mat, float l1, MatrixDim d)
+{ _apply_l1<<<Gr,Bl>>>(mat,l1,d); }
+
+void cudaF_scale_cols(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_scale_rows(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaF_add_scaled(dim3 Gr, dim3 Bl, float alpha, const float* A, float beta, float* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaF_add_scaled_row(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaF_mul_elem(dim3 Gr, dim3 Bl, float*mat, const float*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaF_log_elem(dim3 Gr, dim3 Bl, float*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaF_add_col_sum(size_t Gr, size_t Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+void cudaF_add_col_sum_reduce(dim3 Gr, dim3 Bl, float alpha, const float* mat, float beta, float* vec, MatrixDim d)
+{ _add_col_sum_reduce<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaF_sigmoid (dim3 Gr, dim3 Bl, float *y, const float*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float*eout, const float*e, const float*y, MatrixDim d)
+{ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d); }
+
+void cudaF_softmax (size_t Gr, size_t Bl, float*y, const float*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+void cudaF_softmax_reduce (dim3 Gr, dim3 Bl, float*y, const float*x, MatrixDim d)
+{ _softmax_reduce<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaF_expand(dim3 Gr, dim3 Bl, float* y, const float* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaF_rearrange(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaF_check_class(size_t Gr, size_t Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+void cudaF_check_class_reduce(dim3 Gr, dim3 Bl, const float* out, const float* des, int* match, MatrixDim d)
+{ _check_class_reduce<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+
+//:DOUBLE:
+//CuMatrix
+void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d)
+{ _set_const<<<Gr,Bl>>>(mat,value,d); }
+
+void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d)
+{ _apply_log<<<Gr,Bl>>>(mat,d); }
+
+void cudaD_scale_cols(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_cols<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_scale_rows(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d)
+{ _scale_rows<<<Gr,Bl>>>(mat,scale,d); }
+
+void cudaD_add_scaled(dim3 Gr, dim3 Bl, double alpha, const double* A, double beta, double* dst, MatrixDim d)
+{ _add_scaled<<<Gr,Bl>>>(alpha,A,beta,dst,d); }
+
+void cudaD_add_scaled_row(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d)
+{ _add_scaled_row<<<Gr,Bl>>>(alpha,row,beta,dst,d); }
+
+void cudaD_mul_elem(dim3 Gr, dim3 Bl, double*mat, const double*A, MatrixDim d)
+{ _mul_elem<<<Gr,Bl>>>(mat,A,d); }
+
+void cudaD_log_elem(dim3 Gr, dim3 Bl, double*mat, MatrixDim d)
+{ _log_elem<<<Gr,Bl>>>(mat,d); }
+
+//CuVector
+void cudaD_add_col_sum(size_t Gr, size_t Bl, double alpha, const double* mat, double beta, double* vec, MatrixDim d)
+{ _add_col_sum<<<Gr,Bl>>>(alpha,mat,beta,vec,d); }
+
+//CuMath
+void cudaD_sigmoid (dim3 Gr, dim3 Bl, double *y, const double*x, MatrixDim d)
+{ _sigmoid<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double*eout, const double*e, const double*y, MatrixDim d)
+{ _diff_sigmoid<<<Gr,Bl>>>(eout, e, y, d); }
+
+void cudaD_softmax (size_t Gr, size_t Bl, double*y, const double*x, MatrixDim d)
+{ _softmax<<<Gr,Bl>>>(y, x, d); }
+
+
+void cudaD_expand(dim3 Gr, dim3 Bl, double* y, const double* x, const int* off, MatrixDim d_out, MatrixDim d_in)
+{ _expand<<<Gr,Bl>>>(y,x,off,d_out,d_in); }
+
+
+void cudaD_rearrange(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _rearrange<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int* copy_from, MatrixDim d_out, MatrixDim d_in)
+{ _randomize<<<Gr,Bl>>>(y,x,copy_from,d_out,d_in); }
+
+
+void cudaD_check_class(size_t Gr, size_t Bl, const double* out, const double* des, int* match, MatrixDim d)
+{ _check_class<<<Gr,Bl>>>(out,des,match,d); }
+
+
+
+