diff options
Diffstat (limited to 'src/CuBaseLib/cuvector.tcc')
-rw-r--r-- | src/CuBaseLib/cuvector.tcc | 254 |
1 files changed, 254 insertions, 0 deletions
diff --git a/src/CuBaseLib/cuvector.tcc b/src/CuBaseLib/cuvector.tcc new file mode 100644 index 0000000..0107859 --- /dev/null +++ b/src/CuBaseLib/cuvector.tcc @@ -0,0 +1,254 @@ + +#include <cuda_runtime_api.h> + +#include "Timer.h" +#include "cucommon.h" +#include "cumatrix.h" +#include "cudevice.h" + +namespace TNet { + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + Init(size_t dim) + { + if(mDim == dim) { + //SetZero(); + return *this; + } + + Destroy(); + + cuSafeCall(cudaMalloc((void**)&mpCUData, dim*sizeof(_ElemT))); + mDim = dim; + SetZero(); + + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + void + CuVector<_ElemT>:: + Destroy() + { + if(NULL != mpCUData) { + cuSafeCall(cudaFree(mpCUData)); + mpCUData = NULL; + } + mDim = 0; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + CopyFrom(const CuVector<_ElemT>& rSrc) + { + Init(rSrc.Dim()); + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(mpCUData, rSrc.pCUData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyDeviceToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromD2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + CuVector<_ElemT>& + CuVector<_ElemT>:: + CopyFrom(const Vector<_ElemT>& rSrc) + { + Init(rSrc.Dim()); + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(mpCUData, rSrc.pData(), rSrc.Dim()*sizeof(_ElemT), cudaMemcpyHostToDevice)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromH2D",tim.Val()); + return *this; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + template<typename _ElemT> + Vector<_ElemT>& + CuVector<_ElemT>:: + CopyTo(Vector<_ElemT>& rDst) const + { + if(rDst.Dim() != mDim) { + rDst.Init(mDim); + } + + Timer tim; tim.Start(); + + cuSafeCall(cudaMemcpy(rDst.pData(), pCUData(), mDim*sizeof(_ElemT), cudaMemcpyDeviceToHost)); + + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::CopyToD2H",tim.Val()); + + return rDst; + } + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + template<typename _ElemT> + void + CuVector<_ElemT>:: + SetZero() + { + Timer tim; tim.Start(); + cuSafeCall(cudaMemset(mpCUData, 0, mDim*sizeof(_ElemT))); + tim.End(); CuDevice::Instantiate().AccuProfile("CuVector::SetZero",tim.Val()); + } + + + //////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////// + + + + + //////////////////////////////////////////////////////////////////////// + //// CuVector:: templeate specializations (float) + //// + template<> + inline void CuVector<float>::SetConst(float value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaF_set_const(dimGrid,dimBlock,mpCUData,value,d); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<float>::AddScaled(float alpha, const CuVector<float>& vec, float beta) + { + Timer tim; tim.Start(); + + assert(vec.Dim() == Dim()); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaF_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<float>::AddColSum(float alpha, const CuMatrix<float>& mat, float beta) + { + Timer tim; tim.Start(); + + assert(mat.Cols() == Dim()); + + /** + * Rows()<=512 limit due to limited shared memory + * Cols()<=256 limit due to coalesced memory alignment: + * matrices with huge strides have slow access!!! + */ + if(mat.Rows() > 512 || mat.Cols() > 256) { + size_t dimBlock = CUBLOCK*2; + size_t dimGrid = n_blocks(Dim(),CUBLOCK*2); + + cudaF_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + } else { + dim3 dimBlock(mat.Rows(),1); + dim3 dimGrid(1,Dim()); + + cudaF_add_col_sum_reduce(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + } + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + + + //////////////////////////////////////////////////////////////////////// + //// CuVector:: templeate specializations (double) + //// + template<> + inline void CuVector<double>::SetConst(double value) + { + Timer tim; tim.Start(); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaD_set_const(dimGrid,dimBlock,mpCUData,value,d); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<double>::AddScaled(double alpha, const CuVector<double>& vec, double beta) + { + Timer tim; tim.Start(); + + assert(vec.Dim() == Dim()); + + dim3 dimBlock(CUBLOCK); + dim3 dimGrid(n_blocks(Dim(), CUBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cudaD_add_scaled(dimGrid,dimBlock,alpha,vec.pCUData(),beta,mpCUData,d); + cuSafeCall(cudaGetLastError()); + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + + + template<> + inline void CuVector<double>::AddColSum(double alpha, const CuMatrix<double>& mat, double beta) + { + Timer tim; tim.Start(); + + assert(mat.Cols() == Dim()); + + size_t dimBlock = CUBLOCK*2; + size_t dimGrid = n_blocks(Dim(),CUBLOCK*2); + + cudaD_add_col_sum(dimGrid,dimBlock,alpha,mat.pCUData(),beta,mpCUData,mat.Dim()); + cuSafeCall(cudaGetLastError()); + + + tim.End(); CuDevice::Instantiate().AccuProfile(__func__,tim.Val()); + } + +} + + + |