__device__ inline unsigned int lcg_rand(unsigned int& seed)
{
seed = seed * 1664525 + 1013904223UL;
return seed;
}
__global__ void GenerateRandNums(unsigned int* out, unsigned int* states)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int s = states[tid];
out[tid] = lcg_rand(s) + lcg_rand(s);
states[tid] = s;
}
//S1,S2,S3,M -
__device__ inline unsigned int taus_rand_step(unsigned int& state, int S1, int S2, int S3, int M)
{
unsigned int b = (((state << S1) ^ state) >> S2);
state = (((state & M) << S3) ^ b);
return state;
}
__device__ unsigned int taus_rand(uint4& state)
{
return
taus_rand_step(state.x, 6, 13, 18, 4294967294UL) ^
taus_rand_step(state.y, 2, 27, 2, 4294967288UL) ^
taus_rand_step(state.z, 13, 21, 7, 4294967280UL) ^
taus_rand_step(state.w, 3, 12, 13, 4294967268UL);
}
__device__ unsigned int hybrid_taus(uint4& state)
{
return
taus_rand_step(state.x, 13, 19, 12, 4294967294UL) ^
taus_rand_step(state.y, 2, 25, 4, 4294967288UL) ^
taus_rand_step(state.z, 3, 11, 17, 4294967280UL) ^
lcg_rand(state.w);
}
__device__ inline unsigned int rng_xor128(uint4& s)
{
unsigned int t;
t = sx^(sx<<11);
sx = sy;
sy = sz;
sz = sw;
sw = (sw^(sw>>19))^(t^(t>>8));
return sw;
}
#define SEED 999
#define DCMT_SEED 4172
#define MT_RNG_PERIOD 607
typedef struct{
unsigned int matrix_a;
unsigned int mask_b;
unsigned int mask_c;
unsigned int seed;
} mt_struct_stripped;
#define MT_RNG_COUNT (4096*16)
#define MT_MM 9
#define MT_NN 19
#define MT_WMASK 0xFFFFFFFFU
#define MT_UMASK 0xFFFFFFFEU
#define MT_LMASK 0x1U
#define MT_SHIFT0 12
#define MT_SHIFTB 7
#define MT_SHIFTC 15
#define MT_SHIFT1 18
__device__ struct mt_state
{
int iState;
unsigned int mti1;
unsigned int mt[MT_NN];
};
__device__ mt_struct_stripped ds_MT[MT_RNG_COUNT];
static mt_struct_stripped h_MT[MT_RNG_COUNT];
__device__ mt_state ds_mtState[MT_RNG_COUNT];
void InitGPUTwisters(const char* fname, unsigned int seed)
{
FILE *fd = fopen(fname, "rb");
if(!fd)
printf("initMTGPU(): failed to open %s\n", fname);
if( !fread(h_MT, sizeof(h_MT), 1, fd) )
printf("initMTGPU(): failed to load %s\n", fname);
fclose(fd);
mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped));
for(int i = 0; i < MT_RNG_COUNT; i++)
{
MT[i] = h_MT[i];
seed = lcg_rand(seed);
MT[i].seed = seed;
}
cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT));
free(MT);
}
__device__ unsigned int GetRandomIntegerFast(unsigned int tid, mt_state* mts, mt_struct_stripped* config)
{
int iState1, iStateM;
unsigned int mti, mtiM, x;
iState1 = mts->iState + 1;
iStateM = mts->iState + MT_MM;
if(iState1 >= MT_NN)
iState1 -= MT_NN;
if(iStateM >= MT_NN)
iStateM -= MT_NN;
mti = mts->mti1;
mts->mti1 = mts->mt[iState1];
mtiM = mts->mt[iStateM];
x = (mti & MT_UMASK) | (mts->mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? config->matrix_a : 0);
mts->mt[mts->iState] = x;
mts->iState = iState1;
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & config->mask_b;
x ^= (x << MT_SHIFTC) & config->mask_c;
x ^= (x >> MT_SHIFT1);
return x;
}
__device__ float GetRandomFloatFast(unsigned int tid, mt_state* mts, mt_struct_stripped* config)
{
return ((float)GetRandomIntegerFast(tid,mts,config) + 1.0f) / 4294967296.0f;
}
__device__ void InitTwisterFast(unsigned int tid, mt_state* mts, mt_struct_stripped* config)
{
mts->mt[0] = config->seed;
for(int i = 1; i < MT_NN; i++)
mts->mt[i] = (1812433253U * (mts->mt[i - 1] ^ (mts->mt[i - 1] >> 30)) + i) & MT_WMASK;
mts->iState = 0;
mts->mti1 = mts->mt[0];
}
__global__ void GenerateUniformNumbers(float* output)
{
const uint tid = gridDim.x*gridDim.y*threadIdx.x + blockIdx.y*gridDim.x + blockIdx.x;
mt_state mts = ds_mtState[tid];
mt_struct_stripped config = ds_MT[tid];
InitTwisterFast(tid, &mts, &config);
output[tid] = GetRandomFloatFast(tid, &mts, &config)
ds_mtState[tid] = mts;
}
Source: https://habr.com/ru/post/99876/
All Articles