diff options
| author | Joe Zhao <ztuowen@gmail.com> | 2014-04-14 08:14:45 +0800 | 
|---|---|---|
| committer | Joe Zhao <ztuowen@gmail.com> | 2014-04-14 08:14:45 +0800 | 
| commit | cccccbf6cca94a3eaf813b4468453160e91c332b (patch) | |
| tree | 23418cb73a10ae3b0688681a7f0ba9b06424583e /src/CuBaseLib/.svn/text-base/cumath.cc.svn-base | |
| download | tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.gz tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.tar.bz2 tnet-cccccbf6cca94a3eaf813b4468453160e91c332b.zip | |
First commit
Diffstat (limited to 'src/CuBaseLib/.svn/text-base/cumath.cc.svn-base')
| -rw-r--r-- | src/CuBaseLib/.svn/text-base/cumath.cc.svn-base | 574 | 
1 files changed, 574 insertions, 0 deletions
| diff --git a/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base new file mode 100644 index 0000000..d718324 --- /dev/null +++ b/src/CuBaseLib/.svn/text-base/cumath.cc.svn-base @@ -0,0 +1,574 @@ + + + +#include "cumath.h" +#include "cukernels.h" + + +namespace TNet { + +  ////////////////////////////////////////////////////////////////////////////// +  //// CuMath<> Template specializations (float) +  //// +  template<> +  void CuMath<float>::Sigmoid(CuMatrix<float>& Y, const CuMatrix<float>& X) +  { +    Timer tim; tim.Start(); + +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); + +    cudaF_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + +  template<> +  void CuMath<float>::DiffSigmoid(CuMatrix<float>& Eout, const CuMatrix<float>& Ein, const CuMatrix<float>& Y) +  { +    Timer tim; tim.Start(); + +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); + +    cudaF_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); +    cuSafeCall(cudaGetLastError()); + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + +     +  template<> +  void CuMath<float>::Softmax(CuMatrix<float>& Y, const CuMatrix<float>& X) +  { +    Timer tim; tim.Start(); + +#if 0 +    //disable 'reduce' functions +    size_t dimBlock = CUBLOCK; +    size_t dimGrid  = n_blocks(X.Rows(),CUBLOCK); + +    cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +#else +    if(X.Cols() > 256) { +      //use old implementation (can't use reduction due to  +      //limited size of shared memory) +      size_t dimBlock = CUBLOCK; +      size_t dimGrid  = n_blocks(X.Rows(),CUBLOCK); + +      cudaF_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); +      cuSafeCall(cudaGetLastError()); +    } else { +      //use implementation with reduction +      dim3 dimBlock(X.Cols(),1); +      dim3 dimGrid(1,X.Rows()); + +      cudaF_softmax_reduce(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); +      cuSafeCall(cudaGetLastError()); +    } +#endif + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + +  template<> +  void CuMath<float>::BlockLinearity(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuMatrix<float>& block_transf) +  { +    Timer tim; tim.Start(); + +    assert(Y.Rows() == X.Rows()); +    assert((X.Cols() % block_transf.Rows()) == 0); +    assert((Y.Cols() % block_transf.Cols()) == 0); +    assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); + +    int blocks = X.Cols() / block_transf.Rows(); + +    for(int i = 0; i < blocks; i++) { +      int m = block_transf.Cols(); +      int n = X.Rows(); +      int k = block_transf.Rows(); + +      /* +      //DEBUG MESSAGE +      std::cout << "N N " << m << " " << n << " " << k << " "  +                << 1.0 << " " << block_transf << " " << block_transf.Stride()  +                << " " << X+i*k << " " << X.Stride() << " "  +                << 0.0 << " " << Y+i*n << " " << Y.Stride()  +                << "\n" << std::flush; +      */ + + +      cublasSgemm('N', 'N', m, n, k,  +                  1.0, block_transf.pCUData(), block_transf.Stride(),  +                  X.pCUData()+i*k, X.Stride(),  +                  0.0, Y.pCUData()+i*m, Y.Stride()); +    } +    cuSafeCall(cublasGetError());     +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + + +  template<> +  void CuMath<float>::Expand(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& frameOffsets) +  { +    Timer tim; tim.Start(); + +    assert(Y.Rows() == X.Rows()); +    assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); + +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + +    cudaF_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +  template<> +  void CuMath<float>::Rearrange(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom) +  { +    Timer tim; tim.Start(); + +    assert(copyFrom.Dim() == Y.Cols()); +    assert(Y.Rows() == X.Rows()); +     +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); +     +    cudaF_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + +  template<> +  void CuMath<float>::Randomize(CuMatrix<float>& Y, const CuMatrix<float>& X, const CuVector<int>& copyFrom) +  { +    Timer tim; tim.Start(); + +    assert(X.Cols() == Y.Cols()); +    assert(X.Rows() == Y.Rows()); +    assert(copyFrom.Dim() <= Y.Rows()); +     +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); +     +    MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); +    MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); + +    cudaF_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); +    cuSafeCall(cudaGetLastError()); + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + +  template<> +  void CuMath<float>::CheckClass(const CuMatrix<float>& out, const CuMatrix<float> &des, CuVector<int>& match) +  { +    Timer tim; tim.Start(); + +    assert(out.Cols() == des.Cols()); +    assert(out.Rows() == des.Rows()); +    assert(out.Stride() == des.Stride()); +    assert(match.Dim() == out.Rows()); + +    if(out.Cols() > 256) { +      size_t dimBlock = CUBLOCK; +      size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); + +      cudaF_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); +      cuSafeCall(cudaGetLastError()); +    } else { +      dim3 dimBlock(out.Cols(),1); +      dim3 dimGrid(1,out.Rows()); + +      cudaF_check_class_reduce(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); +      cuSafeCall(cudaGetLastError()); +    } + + + +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +  template<> +  void CuMath<float>::OffsetGemm(char transA, char transB, float alpha, const CuMatrix<float>& A, const CuMatrix<float>& B, float beta, CuMatrix<float>& C, int offA, int offB, int offC) +  { +    Timer tim; tim.Start(); +    // CUBLAS is col major, TNet is row major +    // keep trans..., just swap A&B argumets: A->B B->A +    // +    // WARNING +    // NO DIMENSION CHECK!!! +     +    //m,n,k is cublas m,n,k +    size_t m = ((transB=='T' || transB=='t')? B.Rows() : B.Cols());  +    size_t n = ((transA=='T' || transA=='t')? A.Cols() : A.Rows()); +    size_t k = ((transB=='T' || transB=='t')? B.Cols() : B.Rows()); +    size_t k1 = ((transA=='T' || transA=='t')? A.Rows() : A.Cols()); + +    k = ((k<k1)?k:k1); +    m = ((m<C.Cols())?m:C.Cols()); +    n = ((n<C.Rows())?m:C.Rows()); + +#if 0 +    std::cout << "A " << transA << " "<< A.Rows() << " " << A.Cols() << " " << A.Stride() << " " << offA +         << "; B " << transB << " "<< B.Rows() << " " << B.Cols() << " " << B.Stride() << " " << offB +         << "; C " << C.Rows() << " " << C.Cols() << " " << C.Stride() << " " << offC +         << "; alpha" << alpha << " beta" << beta << " REALmnk:" << m <<" "<< n <<" "<< k << std::endl; +#endif +          + +    cublasSgemm(transB, transA, m, n, k,  +                alpha, B.pCUData()+offB, B.Stride(),  +                A.pCUData()+offA, A.Stride(),  +                beta, C.pCUData()+offC, C.Stride()); +    cuSafeCall(cublasGetError());     +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +/* + +  template<> +  void CuMath<float>::Gemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY) +  { +    Timer tim; tim.Start(); +    // CUBLAS is col major, TNet is row major +    // y = alpha * op(A) * x + beta * y, +     +    size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) +    size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) +  +    // switch the trans parameter! +    char cu_trans; +    if(trans == 't' || trans == 'T') { +      cu_trans = 'n'; +    } else if (trans == 'n' || trans == 'N') { +      cu_trans = 't'; +    } else { +      Error(std::string("Unknown trans")+trans); +    } +    +    //check the dims +    if(cu_trans == 'n') { +      assert(dimX == n); +      assert(dimY == m); +    } else { +      assert(dimX == m); +      assert(dimY == n); +    } +  +    //run gemv +    cublasSgemv(cu_trans,m,n,alpha, +                A.pCUData(), A.Stride(), x, 1, +                beta, y, 1); +     +    cuSafeCall(cublasGetError());     +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } +  */ + +  /** +   * offsetY tells how many outputs of 'Ax' mutiplication is skipped at the beginning,  +   */ +  template<> +  void CuMath<float>::OffsetGemv(char trans, float alpha, const CuMatrix<float>& A, const float* x, size_t dimX, float beta, float* y, size_t dimY, size_t offsetY) +  { +    Timer tim; tim.Start(); +    // CUBLAS is col major, TNet is row major +    // y = alpha * op(A) * x + beta * y, +     +    size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) +    size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) +  +    // switch the trans parameter! +    char cu_trans; +    if(trans == 't' || trans == 'T') { +      cu_trans = 'n'; +    } else if (trans == 'n' || trans == 'N') { +      cu_trans = 't'; +    } else { +      Error(std::string("Unknown trans")+trans); +    } + +    // select part of matrix for compute +    size_t cu_offset = 0; +    if(cu_trans == 'n') { +      cu_offset += offsetY; +      assert(m >= dimY+offsetY); +      m = dimY; +    } else { +      cu_offset += offsetY*A.Stride(); +      assert(n >= dimY+offsetY); +      n = dimY; +    } +    +    //check the dims +    if(cu_trans == 'n') { +      assert(dimX == n); +      assert(dimY == m); +    } else { +      assert(dimX == m); +      assert(dimY == n); +    } +  +    //run gemv +    cublasSgemv(cu_trans,m,n,alpha, +                A.pCUData()+cu_offset, A.Stride(), x, 1, +                beta, y, 1); +     +    cuSafeCall(cublasGetError());     +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + +   +  template<> +  void CuMath<float>::BlasGer(float alpha, const float* x, size_t dimX, const float* y, size_t dimY, CuMatrix<float>& A) { +    Timer tim; tim.Start(); +    // CUBLAS is col major, TNet is row major +    // -> switch x and y + +    // A = alpha * x * transpose(y) + A, +     +    assert(dimX == A.Rows()); +    assert(dimY == A.Cols()); + +    size_t m = A.Cols(); //m..rows of A in colmajor (== cols in rowmajor) +    size_t n = A.Rows(); //n..cols of A in colmajor (== rows in rowmajor) + +    cublasSger(m,n,alpha,y,1,x,1,A.pCUData(),A.Stride());  +    cuSafeCall(cublasGetError());     + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +  template<> +  void CuMath<float>::VecExpand(const CuVector<float>&in, CuVector<float>&out) +  { +    Timer tim; tim.Start(); + +    assert(out.Dim() % in.Dim() == 0); +    int n_copies = out.Dim()/in.Dim(); +    CuVector<int> offsets(n_copies); +    //offsets.SetConst(0); done implicitly! + +    dim3 dimBlock(CUBLOCK); +    dim3 dimGrid(n_blocks(out.Dim(), CUBLOCK)); +     +    MatrixDim dim_in = { 1, in.Dim(), in.Dim() }; +    MatrixDim dim_out = { 1, out.Dim(), out.Dim() }; +    cudaF_expand(dimGrid, dimBlock, out.pCUData(), in.pCUData(), offsets.pCUData(), dim_out, dim_in); +    cuSafeCall(cudaGetLastError()); + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +  template<> +  void CuMath<float>::VecAddColSum(float alpha, const CuVector<float>&in, float beta, CuVector<float>&out) +  { +    Timer tim; tim.Start(); + +    assert(in.Dim() % out.Dim() == 0); + +    size_t dimBlock = CUBLOCK; +    size_t dimGrid = n_blocks(out.Dim(),CUBLOCK);  + +    MatrixDim dim = { in.Dim()/out.Dim(), out.Dim(), out.Dim() }; + +    cudaF_add_col_sum(dimGrid,dimBlock,alpha,in.pCUData(),beta,out.pCUData(),dim); + +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +  ////////////////////////////////////////////////////////////////////////////// +  //// CuMath<> Template specializations (double) +  //// +  template<> +  void CuMath<double>::Sigmoid(CuMatrix<double>& Y, const CuMatrix<double>& X) +  { +    Timer tim; tim.Start(); + +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(X.Cols(),CUBLOCK), n_blocks(X.Rows(), CUBLOCK)); + +    cudaD_sigmoid(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + +  template<> +  void CuMath<double>::DiffSigmoid(CuMatrix<double>& Eout, const CuMatrix<double>& Ein, const CuMatrix<double>& Y) +  { +    Timer tim; tim.Start(); + +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Eout.Cols(), CUBLOCK), n_blocks(Eout.Rows(),CUBLOCK)); + +    cudaD_diff_sigmoid(dimGrid, dimBlock, Eout.pCUData(), Ein.pCUData(), Y.pCUData(), Eout.Dim()); +    cuSafeCall(cudaGetLastError()); + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + +     +  template<> +  void CuMath<double>::Softmax(CuMatrix<double>& Y, const CuMatrix<double>& X) +  { +    Timer tim; tim.Start(); + +    size_t dimBlock = CUBLOCK; +    size_t dimGrid  = n_blocks(X.Rows(),CUBLOCK); + +    cudaD_softmax(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), X.Dim()); +    cuSafeCall(cudaGetLastError()); + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + +  template<> +  void CuMath<double>::BlockLinearity(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuMatrix<double>& block_transf) +  { +    Timer tim; tim.Start(); + +    assert(Y.Rows() == X.Rows()); +    assert((X.Cols() % block_transf.Rows()) == 0); +    assert((Y.Cols() % block_transf.Cols()) == 0); +    assert((X.Cols() / block_transf.Rows()) == (Y.Cols() / block_transf.Cols())); + +    int blocks = X.Cols() / block_transf.Rows(); + +    for(int i = 0; i < blocks; i++) { +      int m = block_transf.Cols(); +      int n = X.Rows(); +      int k = block_transf.Rows(); + +      /* +      //DEBUG MESSAGE +      std::cout << "N N " << m << " " << n << " " << k << " "  +                << 1.0 << " " << block_transf << " " << block_transf.Stride()  +                << " " << X+i*k << " " << X.Stride() << " "  +                << 0.0 << " " << Y+i*n << " " << Y.Stride()  +                << "\n" << std::flush; +      */ + + +      cublasDgemm('N', 'N', m, n, k,  +                  1.0, block_transf.pCUData(), block_transf.Stride(),  +                  X.pCUData()+i*k, X.Stride(),  +                  0.0, Y.pCUData()+i*m, Y.Stride()); +    } +    cuSafeCall(cublasGetError());     +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + + +  template<> +  void CuMath<double>::Expand(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& frameOffsets) +  { +    Timer tim; tim.Start(); + +    assert(Y.Rows() == X.Rows()); +    assert(X.Cols() * frameOffsets.Dim() == Y.Cols()); + +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); + +    cudaD_expand(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), frameOffsets.pCUData(), Y.Dim(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + +  template<> +  void CuMath<double>::Rearrange(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom) +  { +    Timer tim; tim.Start(); + +    assert(copyFrom.Dim() == Y.Cols()); +    assert(Y.Rows() == X.Rows()); +     +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(Y.Rows(),CUBLOCK)); +     +    cudaD_rearrange(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), Y.Dim(), X.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + +  template<> +  void CuMath<double>::Randomize(CuMatrix<double>& Y, const CuMatrix<double>& X, const CuVector<int>& copyFrom) +  { +    Timer tim; tim.Start(); + +    assert(X.Cols() == Y.Cols()); +    assert(X.Rows() == Y.Rows()); +    assert(copyFrom.Dim() <= Y.Rows()); +     +    dim3 dimBlock(CUBLOCK,CUBLOCK); +    dim3 dimGrid(n_blocks(Y.Cols(), CUBLOCK), n_blocks(copyFrom.Dim(),CUBLOCK)); +     +    MatrixDim dimX = X.Dim(); dimX.rows=copyFrom.Dim(); +    MatrixDim dimY = Y.Dim(); dimY.rows=copyFrom.Dim(); + +    cudaD_randomize(dimGrid, dimBlock, Y.pCUData(), X.pCUData(), copyFrom.pCUData(), dimY, dimX); +    cuSafeCall(cudaGetLastError()); + +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + + + +  template<> +  void CuMath<double>::CheckClass(const CuMatrix<double>& out, const CuMatrix<double> &des, CuVector<int>& match) +  { +    Timer tim; tim.Start(); + +    assert(out.Cols() == des.Cols()); +    assert(out.Rows() == des.Rows()); +    assert(out.Stride() == des.Stride()); +    assert(match.Dim() == out.Rows()); + +    size_t dimBlock = CUBLOCK; +    size_t dimGrid = n_blocks(out.Rows(),CUBLOCK); + +    cudaD_check_class(dimGrid, dimBlock, out.pCUData(), des.pCUData(), match.pCUData(), out.Dim()); +    cuSafeCall(cudaGetLastError()); +     +    tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); +  } + +} | 
