summaryrefslogtreecommitdiff
path: root/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
diff options
context:
space:
mode:
authorJoe Zhao <ztuowen@gmail.com>2014-04-14 08:14:45 +0800
committerJoe Zhao <ztuowen@gmail.com>2014-04-14 08:14:45 +0800
commitcccccbf6cca94a3eaf813b4468453160e91c332b (patch)
tree23418cb73a10ae3b0688681a7f0ba9b06424583e /src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
downloadtnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.gz
tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.bz2
tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.zip
First commit
Diffstat (limited to 'src/CuBaseLib/.svn/text-base/cumath.cc.svn-base')
-rw-r--r--src/CuBaseLib/.svn/text-base/cumath.cc.svn-base574
1 files changed, 574 insertions, 0 deletions
diff --git a/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
new file mode 100644
index 0000000..d718324
--- /dev/null
+++ b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base
@@ -0,0 +1,574 @@
+
+
+
+#include "cumath.h"
+#include "cukernels.h"
+
+
+namespace TNet {
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (float)
+ ////
+ template<>
+ void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X)
+ {
+ Timer tim; tim.Start();
+
+#if 0
+ //disable 'reduce' functions
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+#else
+ if(X.Cols() > 256) {
+ //use old implementation (can't use reduction due to
+ //limited size of shared memory)
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ //use implementation with reduction
+ dim3 dimBlock(X.Cols(),1);
+ dim3 dimGrid(1,X.Rows());
+
+ cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+#endif
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasSgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ if(out.Cols() > 256) {
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ } else {
+ dim3 dimBlock(out.Cols(),1);
+ dim3 dimGrid(1,out.Rows());
+
+ cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+ }
+
+
+
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // keep trans..., just swap A&B argumets: A->B B->A
+ //
+ // WARNING
+ // NO DIMENSION CHECK!!!
+
+ //m,n,k is cublas m,n,k
+ size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols());
+ size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows());
+ size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows());
+ size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols());
+
+ k = ((k<k1)?k:k1);
+ m = ((m<C.Cols())?m:C.Cols());
+ n = ((n<C.Rows())?m:C.Rows());
+
+#if 0
+ std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA
+ << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB
+ << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC
+ << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl;
+#endif
+
+
+ cublasSgemm(transB, transA, m, n, k,
+ alpha, B.pCUData()+offB, B.Stride(),
+ A.pCUData()+offA, A.Stride(),
+ beta, C.pCUData()+offC, C.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+/*
+
+ template<>
+ void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData(), A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+ */
+
+ /**
+ * offsetY tells how many outputs of 'Ax' mutiplication is skipped at the beginning,
+ */
+ template<>
+ void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY)
+ {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // y = alpha * op(A) * x + beta * y,
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ // switch the trans parameter!
+ char cu_trans;
+ if(trans == 't' || trans == 'T') {
+ cu_trans = 'n';
+ } else if (trans == 'n' || trans == 'N') {
+ cu_trans = 't';
+ } else {
+ Error(std::string("Unknown trans")+trans);
+ }
+
+ // select part of matrix for compute
+ size_t cu_offset = 0;
+ if(cu_trans == 'n') {
+ cu_offset += offsetY;
+ assert(m >= dimY+offsetY);
+ m = dimY;
+ } else {
+ cu_offset += offsetY*A.Stride();
+ assert(n >= dimY+offsetY);
+ n = dimY;
+ }
+
+ //check the dims
+ if(cu_trans == 'n') {
+ assert(dimX == n);
+ assert(dimY == m);
+ } else {
+ assert(dimX == m);
+ assert(dimY == n);
+ }
+
+ //run gemv
+ cublasSgemv(cu_trans,m,n,alpha,
+ A.pCUData()+cu_offset, A.Stride(), x, 1,
+ beta, y, 1);
+
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) {
+ Timer tim; tim.Start();
+ // CUBLAS is col major, TNet is row major
+ // -> switch x and y
+
+ // A = alpha * x * transpose(y) + A,
+
+ assert(dimX == A.Rows());
+ assert(dimY == A.Cols());
+
+ size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor)
+ size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor)
+
+ cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride());
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Dim() % in.Dim() == 0);
+ int n_copies = out.Dim()/in.Dim();
+ CuVector<int> offsets(n_copies);
+ //offsets.SetConst(0); done implicitly!
+
+ dim3 dimBlock(CUBLOCK);
+ dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK));
+
+ MatrixDim dim_in = { 1, in.Dim(), in.Dim() };
+ MatrixDim dim_out = { 1, out.Dim(), out.Dim() };
+ cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out)
+ {
+ Timer tim; tim.Start();
+
+ assert(in.Dim() % out.Dim() == 0);
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Dim(),CUBLOCK);
+
+ MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() };
+
+ cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim);
+
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ //////////////////////////////////////////////////////////////////////////////
+ //// CuMath<> Template specializations (double)
+ ////
+ template<>
+ void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK));
+
+ cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+ template<>
+ void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y)
+ {
+ Timer tim; tim.Start();
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK));
+
+ cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X)
+ {
+ Timer tim; tim.Start();
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(X.Rows(),CUBLOCK);
+
+ cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert((X.Cols() % block_transf.Rows()) == 0);
+ assert((Y.Cols() % block_transf.Cols()) == 0);
+ assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols()));
+
+ int blocks = X.Cols() / block_transf.Rows();
+
+ for(int i = 0; i < blocks; i++) {
+ int m = block_transf.Cols();
+ int n = X.Rows();
+ int k = block_transf.Rows();
+
+ /*
+ //DEBUG MESSAGE
+ std::cout << "N N " << m << " " << n << " " << k << " "
+ << 1.0 << " " << block_transf << " " << block_transf.Stride()
+ << " " << X+i*k << " " << X.Stride() << " "
+ << 0.0 << " " << Y+i*n << " " << Y.Stride()
+ << "\n" << std::flush;
+ */
+
+
+ cublasDgemm('N', 'N', m, n, k,
+ 1.0, block_transf.pCUData(), block_transf.Stride(),
+ X.pCUData()+i*k, X.Stride(),
+ 0.0, Y.pCUData()+i*m, Y.Stride());
+ }
+ cuSafeCall(cublasGetError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+
+ template<>
+ void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets)
+ {
+ Timer tim; tim.Start();
+
+ assert(Y.Rows() == X.Rows());
+ assert(X.Cols() * frameOffsets.Dim() == Y.Cols());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+ template<>
+ void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(copyFrom.Dim() == Y.Cols());
+ assert(Y.Rows() == X.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK));
+
+ cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom)
+ {
+ Timer tim; tim.Start();
+
+ assert(X.Cols() == Y.Cols());
+ assert(X.Rows() == Y.Rows());
+ assert(copyFrom.Dim() <= Y.Rows());
+
+ dim3 dimBlock(CUBLOCK,CUBLOCK);
+ dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK));
+
+ MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim();
+ MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim();
+
+ cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX);
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+
+
+ template<>
+ void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match)
+ {
+ Timer tim; tim.Start();
+
+ assert(out.Cols() == des.Cols());
+ assert(out.Rows() == des.Rows());
+ assert(out.Stride() == des.Stride());
+ assert(match.Dim() == out.Rows());
+
+ size_t dimBlock = CUBLOCK;
+ size_t dimGrid = n_blocks(out.Rows(),CUBLOCK);
+
+ cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim());
+ cuSafeCall(cudaGetLastError());
+
+ tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val());
+ }
+
+}