#include "curandkernels.h" // //Hybrid Tauss/LCG random number generator // //http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html // S1, S2, S3, and M are all constants, and z is part of the // private per-thread generator state. __device__ static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M) { unsigned b=(((z << S1) ^ z) >> S2); return z = (((z & M) << S3) ^ b); } // A and C are constants __device__ static unsigned LCGStep(unsigned &z, unsigned A, unsigned C) { return z=(A*z+C); } template __device__ static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) { // Combined period is lcm(p1,p2,p3,p4)~ 2^121 T randval; do { randval = 2.3283064365387e-10 * ( // Periods TausStep(z1, 13, 19, 12, 4294967294UL) ^ // p1=2^31-1 TausStep(z2, 2, 25, 4, 4294967288UL) ^ // p2=2^30-1 TausStep(z3, 3, 11, 17, 4294967280UL) ^ // p3=2^28-1 LCGStep(z4, 1664525, 1013904223UL) // p4=2^32 ); } while (!(randval > 0.0 && randval < 1.0)); return randval; } template __global__ static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int index = i + j*d.stride; if( i < d.cols && j < d.rows ) { mat[index] = HybridTaus(z1[index],z2[index],z3[index],z4[index]); } } /* float2 BoxMuller() { float u0=HybridTaus (), u1=HybridTaus (); float r=sqrt(-2 log(u0)); float theta=2*PI*u1; return make_float2(r*sin(theta),r*cos(theta)); } */ template __device__ static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4) { const T M_2PI = 6.283185307179586476925286766558; T u0 = HybridTaus(z1,z2,z3,z4), u1 = HybridTaus(z1,z2,z3,z4); T r = sqrt(-2.0 * log(u0)); T theta = M_2PI * u1; return r*sin(theta); } template __global__ static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int index = i + j*d.stride; if( i < d.cols && j < d.rows ) { mat[index] = BoxMuller(z1[index],z2[index],z3[index],z4[index]); } } template __global__ static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int index = i + j*d.stride; if( i < d.cols && j < d.rows ) { states[index] = ((probs[index] > rand[index])? 1.0 : 0.0); } } /************ * :FLOAT: */ void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) { _rand<<>>(mat,z1,z2,z3,z4,d); } void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) { _gauss_rand<<>>(mat,z1,z2,z3,z4,d); } void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) { _binarize_probs<<>>(states,probs,rand,d); } /************ * :DOUBLE: */ void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) { _rand<<>>(mat,z1,z2,z3,z4,d); } void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d) { _gauss_rand<<>>(mat,z1,z2,z3,z4,d); } void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) { _binarize_probs<<>>(states,probs,rand,d); }