#include "cumath.h" #include "cukernels.h" namespace TNet { ////////////////////////////////////////////////////////////////////////////// //// CuMath<> Template specializations (float) //// template<> void CuMath::Sigmoid(CuMatrix& Y, const CuMatrix& X) { Timer tim; tim.Start(); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::DiffSigmoid(CuMatrix& Eout, const CuMatrix& Ein, const CuMatrix& Y) { Timer tim; tim.Start(); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Softmax(CuMatrix& Y, const CuMatrix& X) { Timer tim; tim.Start(); #if 0 //disable 'reduce' functions size_t dimBlock = CUBLOCK; size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); cuSafeCall(cudaGetLastError()); #else if(X.Cols() > 256) { //use old implementation (can't use reduction due to //limited size of shared memory) size_t dimBlock = CUBLOCK; size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); cuSafeCall(cudaGetLastError()); } else { //use implementation with reduction dim3 dimBlock(X.Cols(),1); dim3 dimGrid(1,X.Rows()); cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); cuSafeCall(cudaGetLastError()); } #endif tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::BlockLinearity(CuMatrix& Y, const CuMatrix& X, const CuMatrix& block_transf) { Timer tim; tim.Start(); assert(Y.Rows() == X.Rows()); assert((X.Cols() % block_transf.Rows()) == 0); assert((Y.Cols() % block_transf.Cols()) == 0); assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); int blocks = X.Cols() / block_transf.Rows(); for(int i = 0; i < blocks; i++) { int m = block_transf.Cols(); int n = X.Rows(); int k = block_transf.Rows(); /* //DEBUG MESSAGE std::cout << "N N " << m << " " << n << " " << k << " " << 1.0 << " " << block_transf << " " << block_transf.Stride() << " " << X+i*k << " " << X.Stride() << " " << 0.0 << " " << Y+i*n << " " << Y.Stride() << "\n" << std::flush; */ cublasSgemm('N', 'N', m, n, k, 1.0, block_transf.pCUData(), block_transf.Stride(), X.pCUData()+i*k, X.Stride(), 0.0, Y.pCUData()+i*m, Y.Stride()); } cuSafeCall(cublasGetError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Expand(CuMatrix& Y, const CuMatrix& X, const CuVector& frameOffsets) { Timer tim; tim.Start(); assert(Y.Rows() == X.Rows()); assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Rearrange(CuMatrix& Y, const CuMatrix& X, const CuVector& copyFrom) { Timer tim; tim.Start(); assert(copyFrom.Dim() == Y.Cols()); assert(Y.Rows() == X.Rows()); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Randomize(CuMatrix& Y, const CuMatrix& X, const CuVector& copyFrom) { Timer tim; tim.Start(); assert(X.Cols() == Y.Cols()); assert(X.Rows() == Y.Rows()); assert(copyFrom.Dim() <= Y.Rows()); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::CheckClass(const CuMatrix& out, const CuMatrix &des, CuVector& match) { Timer tim; tim.Start(); assert(out.Cols() == des.Cols()); assert(out.Rows() == des.Rows()); assert(out.Stride() == des.Stride()); assert(match.Dim() == out.Rows()); if(out.Cols() > 256) { size_t dimBlock = CUBLOCK; size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); cuSafeCall(cudaGetLastError()); } else { dim3 dimBlock(out.Cols(),1); dim3 dimGrid(1,out.Rows()); cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); cuSafeCall(cudaGetLastError()); } tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::OffsetGemm(char transA, char transB, float alpha, const CuMatrix& A, const CuMatrix& B, float beta, CuMatrix& C, int offA, int offB, int offC) { Timer tim; tim.Start(); // CUBLAS is col major, TNet is row major // keep trans..., just swap A&B argumets: A->B B->A // // WARNING // NO DIMENSION CHECK!!! //m,n,k is cublas m,n,k size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols()); size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows()); size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows()); size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols()); k = ((k void CuMath::Gemv(char trans, float alpha, const CuMatrix& A, const float* x, size_t dimX, float beta, float* y, size_t dimY) { Timer tim; tim.Start(); // CUBLAS is col major, TNet is row major // y = alpha * op(A) * x + beta * y, size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) // switch the trans parameter! char cu_trans; if(trans == 't' || trans == 'T') { cu_trans = 'n'; } else if (trans == 'n' || trans == 'N') { cu_trans = 't'; } else { Error(std::string("Unknown trans")+trans); } //check the dims if(cu_trans == 'n') { assert(dimX == n); assert(dimY == m); } else { assert(dimX == m); assert(dimY == n); } //run gemv cublasSgemv(cu_trans,m,n,alpha, A.pCUData(), A.Stride(), x, 1, beta, y, 1); cuSafeCall(cublasGetError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } */ /** * offsetY tells how many outputs of 'Ax' mutiplication is skipped at the beginning, */ template<> void CuMath::OffsetGemv(char trans, float alpha, const CuMatrix& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY) { Timer tim; tim.Start(); // CUBLAS is col major, TNet is row major // y = alpha * op(A) * x + beta * y, size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) // switch the trans parameter! char cu_trans; if(trans == 't' || trans == 'T') { cu_trans = 'n'; } else if (trans == 'n' || trans == 'N') { cu_trans = 't'; } else { Error(std::string("Unknown trans")+trans); } // select part of matrix for compute size_t cu_offset = 0; if(cu_trans == 'n') { cu_offset += offsetY; assert(m >= dimY+offsetY); m = dimY; } else { cu_offset += offsetY*A.Stride(); assert(n >= dimY+offsetY); n = dimY; } //check the dims if(cu_trans == 'n') { assert(dimX == n); assert(dimY == m); } else { assert(dimX == m); assert(dimY == n); } //run gemv cublasSgemv(cu_trans,m,n,alpha, A.pCUData()+cu_offset, A.Stride(), x, 1, beta, y, 1); cuSafeCall(cublasGetError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix& A) { Timer tim; tim.Start(); // CUBLAS is col major, TNet is row major // -> switch x and y // A = alpha * x * transpose(y) + A, assert(dimX == A.Rows()); assert(dimY == A.Cols()); size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride()); cuSafeCall(cublasGetError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::VecExpand(const CuVector&in, CuVector&out) { Timer tim; tim.Start(); assert(out.Dim() % in.Dim() == 0); int n_copies = out.Dim()/in.Dim(); CuVector offsets(n_copies); //offsets.SetConst(0); done implicitly! dim3 dimBlock(CUBLOCK); dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK)); MatrixDim dim_in = { 1, in.Dim(), in.Dim() }; MatrixDim dim_out = { 1, out.Dim(), out.Dim() }; cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::VecAddColSum(float alpha, const CuVector&in, float beta, CuVector&out) { Timer tim; tim.Start(); assert(in.Dim() % out.Dim() == 0); size_t dimBlock = CUBLOCK; size_t dimGrid = n_blocks(out.Dim(),CUBLOCK); MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() }; cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } ////////////////////////////////////////////////////////////////////////////// //// CuMath<> Template specializations (double) //// template<> void CuMath::Sigmoid(CuMatrix& Y, const CuMatrix& X) { Timer tim; tim.Start(); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::DiffSigmoid(CuMatrix& Eout, const CuMatrix& Ein, const CuMatrix& Y) { Timer tim; tim.Start(); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Softmax(CuMatrix& Y, const CuMatrix& X) { Timer tim; tim.Start(); size_t dimBlock = CUBLOCK; size_t dimGrid = n_blocks(X.Rows(),CUBLOCK); cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::BlockLinearity(CuMatrix& Y, const CuMatrix& X, const CuMatrix& block_transf) { Timer tim; tim.Start(); assert(Y.Rows() == X.Rows()); assert((X.Cols() % block_transf.Rows()) == 0); assert((Y.Cols() % block_transf.Cols()) == 0); assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); int blocks = X.Cols() / block_transf.Rows(); for(int i = 0; i < blocks; i++) { int m = block_transf.Cols(); int n = X.Rows(); int k = block_transf.Rows(); /* //DEBUG MESSAGE std::cout << "N N " << m << " " << n << " " << k << " " << 1.0 << " " << block_transf << " " << block_transf.Stride() << " " << X+i*k << " " << X.Stride() << " " << 0.0 << " " << Y+i*n << " " << Y.Stride() << "\n" << std::flush; */ cublasDgemm('N', 'N', m, n, k, 1.0, block_transf.pCUData(), block_transf.Stride(), X.pCUData()+i*k, X.Stride(), 0.0, Y.pCUData()+i*m, Y.Stride()); } cuSafeCall(cublasGetError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Expand(CuMatrix& Y, const CuMatrix& X, const CuVector& frameOffsets) { Timer tim; tim.Start(); assert(Y.Rows() == X.Rows()); assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Rearrange(CuMatrix& Y, const CuMatrix& X, const CuVector& copyFrom) { Timer tim; tim.Start(); assert(copyFrom.Dim() == Y.Cols()); assert(Y.Rows() == X.Rows()); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::Randomize(CuMatrix& Y, const CuMatrix& X, const CuVector& copyFrom) { Timer tim; tim.Start(); assert(X.Cols() == Y.Cols()); assert(X.Rows() == Y.Rows()); assert(copyFrom.Dim() <= Y.Rows()); dim3 dimBlock(CUBLOCK,CUBLOCK); dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } template<> void CuMath::CheckClass(const CuMatrix& out, const CuMatrix &des, CuVector& match) { Timer tim; tim.Start(); assert(out.Cols() == des.Cols()); assert(out.Rows() == des.Rows()); assert(out.Stride() == des.Stride()); assert(match.Dim() == out.Rows()); size_t dimBlock = CUBLOCK; size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); cuSafeCall(cudaGetLastError()); tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); } }