src/CuBaseLib/curandkernels.cu


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135


#include "curandkernels.h"


//
//Hybrid Tausworthe/LCG random number generator
//
//http://http.developer.nvidia.com/GPUGems3/gpugems3_ch37.html


// S1, S2, S3, and M are all constants, and z is part of the  
// private per-thread generator state.
__device__
static unsigned TausStep(unsigned &z, int S1, int S2, int S3, unsigned M)  
{  
  unsigned b=(((z << S1) ^ z) >> S2);  
  return z = (((z & M) << S3) ^ b);  
}  

// A and C are constants  
__device__
static unsigned LCGStep(unsigned &z, unsigned A, unsigned C)  
{  
  return z=(A*z+C);  
} 

template<typename T>
__device__
static T HybridTaus(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)  
{  
  // Combined period is lcm(p1,p2,p3,p4)~ 2^121
  T randval;
  do { 
   randval = 2.3283064365387e-10 * (          // Periods  
    TausStep(z1, 13, 19, 12, 4294967294UL) ^  // p1=2^31-1  
    TausStep(z2, 2, 25, 4, 4294967288UL) ^    // p2=2^30-1  
    TausStep(z3, 3, 11, 17, 4294967280UL) ^   // p3=2^28-1  
    LCGStep(z4, 1664525, 1013904223UL)        // p4=2^32  
   );
  } while (!(randval > 0.0 && randval < 1.0));
  return randval;
}  


template<typename T>
__global__
static void _rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int index = i + j*d.stride;
  if( i < d.cols  && j < d.rows ) {
    mat[index] = HybridTaus<T>(z1[index],z2[index],z3[index],z4[index]);
  }
}

/*
float2 BoxMuller()  
{  
  float u0=HybridTaus (), u1=HybridTaus ();  
  float r=sqrt(-2 log(u0));  
  float theta=2*PI*u1;  
  return make_float2(r*sin(theta),r*cos(theta));  
} 
*/
 
template<typename T>
__device__
static T BoxMuller(unsigned& z1, unsigned& z2, unsigned& z3, unsigned& z4)  
{
  const T M_2PI = 6.283185307179586476925286766558;

  T u0 = HybridTaus<T>(z1,z2,z3,z4), u1 = HybridTaus<T>(z1,z2,z3,z4);
  T r = sqrt(-2.0 * log(u0));
  T theta = M_2PI * u1;
  return r*sin(theta);
  
}  


template<typename T>
__global__
static void _gauss_rand(T* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int index = i + j*d.stride;
  if( i < d.cols  && j < d.rows ) {
    mat[index] = BoxMuller<T>(z1[index],z2[index],z3[index],z4[index]);
  }
}


template<typename T>
__global__
static void _binarize_probs(T* states, const T* probs, const T* rand, MatrixDim d)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int index = i + j*d.stride;
  if( i < d.cols  && j < d.rows ) {
    states[index] = ((probs[index] > rand[index])? 1.0 : 0.0);
  }
}


/************
 * :FLOAT:
 */
void cudaF_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }

void cudaF_gauss_rand(dim3 Gr, dim3 Bl, float* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }

void cudaF_binarize_probs(dim3 Gr, dim3 Bl, float* states, const float* probs, float* rand, MatrixDim d) 
{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }


/************
 * :DOUBLE:
 */
void cudaD_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
{ _rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }

void cudaD_gauss_rand(dim3 Gr, dim3 Bl, double* mat, unsigned* z1, unsigned* z2, unsigned* z3, unsigned* z4, MatrixDim d)
{ _gauss_rand<<<Gr,Bl>>>(mat,z1,z2,z3,z4,d); }

void cudaD_binarize_probs(dim3 Gr, dim3 Bl, double* states, const double* probs, double* rand, MatrixDim d) 
{ _binarize_probs<<<Gr,Bl>>>(states,probs,rand,d); }