You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
280 lines
9.2 KiB
280 lines
9.2 KiB
/*
|
|
A C-program for MT19937, with initialization improved 2002/1/26.
|
|
Coded by Takuji Nishimura and Makoto Matsumoto.
|
|
|
|
Before using, initialize the state by using init_genrand(seed)
|
|
or init_by_array(init_key, key_length).
|
|
|
|
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions
|
|
are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
3. The names of its contributors may not be used to endorse or promote
|
|
products derived from this software without specific prior written
|
|
permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
|
|
Any feedback is very welcome.
|
|
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
|
|
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
|
|
|
|
Modifications for use in OpenCL by Ian Ollmann, Apple Inc.
|
|
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "mt19937.h"
|
|
#include "mingw_compat.h"
|
|
#include "harness/alloc.h"
|
|
|
|
#ifdef __SSE2__
|
|
#include <emmintrin.h>
|
|
#endif
|
|
|
|
/* Period parameters */
|
|
#define N 624 /* vector code requires multiple of 4 here */
|
|
#define M 397
|
|
#define MATRIX_A (cl_uint)0x9908b0dfUL /* constant vector a */
|
|
#define UPPER_MASK (cl_uint)0x80000000UL /* most significant w-r bits */
|
|
#define LOWER_MASK (cl_uint)0x7fffffffUL /* least significant r bits */
|
|
|
|
typedef struct _MTdata
|
|
{
|
|
cl_uint mt[N];
|
|
#ifdef __SSE2__
|
|
cl_uint cache[N];
|
|
#endif
|
|
cl_int mti;
|
|
} _MTdata;
|
|
|
|
/* initializes mt[N] with a seed */
|
|
MTdata init_genrand(cl_uint s)
|
|
{
|
|
MTdata r = (MTdata)align_malloc(sizeof(_MTdata), 16);
|
|
if (NULL != r)
|
|
{
|
|
cl_uint *mt = r->mt;
|
|
int mti = 0;
|
|
mt[0] = s; // & 0xffffffffUL;
|
|
for (mti = 1; mti < N; mti++)
|
|
{
|
|
mt[mti] = (cl_uint)(
|
|
1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
|
|
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
|
|
/* In the previous versions, MSBs of the seed affect */
|
|
/* only MSBs of the array mt[]. */
|
|
/* 2002/01/09 modified by Makoto Matsumoto */
|
|
// mt[mti] &= 0xffffffffUL;
|
|
/* for >32 bit machines */
|
|
}
|
|
r->mti = mti;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
void free_mtdata(MTdata d)
|
|
{
|
|
if (d) align_free(d);
|
|
}
|
|
|
|
/* generates a random number on [0,0xffffffff]-interval */
|
|
cl_uint genrand_int32(MTdata d)
|
|
{
|
|
/* mag01[x] = x * MATRIX_A for x=0,1 */
|
|
static const cl_uint mag01[2] = { 0x0UL, MATRIX_A };
|
|
#ifdef __SSE2__
|
|
static volatile int init = 0;
|
|
static union {
|
|
__m128i v;
|
|
cl_uint s[4];
|
|
} upper_mask, lower_mask, one, matrix_a, c0, c1;
|
|
#endif
|
|
|
|
|
|
cl_uint *mt = d->mt;
|
|
cl_uint y;
|
|
|
|
if (d->mti == N)
|
|
{ /* generate N words at one time */
|
|
int kk;
|
|
|
|
#ifdef __SSE2__
|
|
if (0 == init)
|
|
{
|
|
upper_mask.s[0] = upper_mask.s[1] = upper_mask.s[2] =
|
|
upper_mask.s[3] = UPPER_MASK;
|
|
lower_mask.s[0] = lower_mask.s[1] = lower_mask.s[2] =
|
|
lower_mask.s[3] = LOWER_MASK;
|
|
one.s[0] = one.s[1] = one.s[2] = one.s[3] = 1;
|
|
matrix_a.s[0] = matrix_a.s[1] = matrix_a.s[2] = matrix_a.s[3] =
|
|
MATRIX_A;
|
|
c0.s[0] = c0.s[1] = c0.s[2] = c0.s[3] = (cl_uint)0x9d2c5680UL;
|
|
c1.s[0] = c1.s[1] = c1.s[2] = c1.s[3] = (cl_uint)0xefc60000UL;
|
|
init = 1;
|
|
}
|
|
#endif
|
|
|
|
kk = 0;
|
|
#ifdef __SSE2__
|
|
// vector loop
|
|
for (; kk + 4 <= N - M; kk += 4)
|
|
{
|
|
// ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
|
|
__m128i vy = _mm_or_si128(
|
|
_mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
|
|
upper_mask.v),
|
|
_mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
|
|
lower_mask.v));
|
|
|
|
// y & 1 ? -1 : 0
|
|
__m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
|
|
// y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
|
|
__m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
|
|
// mt[kk+M] ^ (y >> 1)
|
|
__m128i vr =
|
|
_mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M)),
|
|
(__m128i)_mm_srli_epi32(vy, 1));
|
|
// mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
|
|
vr = _mm_xor_si128(vr, vmag01);
|
|
_mm_store_si128((__m128i *)(mt + kk), vr);
|
|
}
|
|
#endif
|
|
for (; kk < N - M; kk++)
|
|
{
|
|
y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
|
|
mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
|
|
}
|
|
|
|
#ifdef __SSE2__
|
|
// advance to next aligned location
|
|
for (; kk < N - 1 && (kk & 3); kk++)
|
|
{
|
|
y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
|
|
mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
|
|
}
|
|
|
|
// vector loop
|
|
for (; kk + 4 <= N - 1; kk += 4)
|
|
{
|
|
__m128i vy = _mm_or_si128(
|
|
_mm_and_si128(_mm_load_si128((__m128i *)(mt + kk)),
|
|
upper_mask.v),
|
|
// ((mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK))
|
|
_mm_and_si128(_mm_loadu_si128((__m128i *)(mt + kk + 1)),
|
|
lower_mask.v));
|
|
|
|
// y & 1 ? -1 : 0
|
|
__m128i mask = _mm_cmpeq_epi32(_mm_and_si128(vy, one.v), one.v);
|
|
// y & 1 ? MATRIX_A, 0 = mag01[y & (cl_uint) 0x1UL]
|
|
__m128i vmag01 = _mm_and_si128(mask, matrix_a.v);
|
|
// mt[kk+M-N] ^ (y >> 1)
|
|
__m128i vr =
|
|
_mm_xor_si128(_mm_loadu_si128((__m128i *)(mt + kk + M - N)),
|
|
_mm_srli_epi32(vy, 1));
|
|
// mt[kk+M] ^ (y >> 1) ^ mag01[y & (cl_uint) 0x1UL]
|
|
vr = _mm_xor_si128(vr, vmag01);
|
|
_mm_store_si128((__m128i *)(mt + kk), vr);
|
|
}
|
|
#endif
|
|
|
|
for (; kk < N - 1; kk++)
|
|
{
|
|
y = (cl_uint)((mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK));
|
|
mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
|
|
}
|
|
y = (cl_uint)((mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK));
|
|
mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & (cl_uint)0x1UL];
|
|
|
|
#ifdef __SSE2__
|
|
// Do the tempering ahead of time in vector code
|
|
for (kk = 0; kk + 4 <= N; kk += 4)
|
|
{
|
|
// y = mt[k];
|
|
__m128i vy = _mm_load_si128((__m128i *)(mt + kk));
|
|
// y ^= (y >> 11);
|
|
vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 11));
|
|
// y ^= (y << 7) & (cl_uint) 0x9d2c5680UL;
|
|
vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 7), c0.v));
|
|
// y ^= (y << 15) & (cl_uint) 0xefc60000UL;
|
|
vy = _mm_xor_si128(vy, _mm_and_si128(_mm_slli_epi32(vy, 15), c1.v));
|
|
// y ^= (y >> 18);
|
|
vy = _mm_xor_si128(vy, _mm_srli_epi32(vy, 18));
|
|
_mm_store_si128((__m128i *)(d->cache + kk), vy);
|
|
}
|
|
#endif
|
|
|
|
d->mti = 0;
|
|
}
|
|
#ifdef __SSE2__
|
|
y = d->cache[d->mti++];
|
|
#else
|
|
y = mt[d->mti++];
|
|
|
|
/* Tempering */
|
|
y ^= (y >> 11);
|
|
y ^= (y << 7) & (cl_uint)0x9d2c5680UL;
|
|
y ^= (y << 15) & (cl_uint)0xefc60000UL;
|
|
y ^= (y >> 18);
|
|
#endif
|
|
|
|
|
|
return y;
|
|
}
|
|
|
|
cl_ulong genrand_int64(MTdata d)
|
|
{
|
|
return ((cl_ulong)genrand_int32(d) << 32) | (cl_uint)genrand_int32(d);
|
|
}
|
|
|
|
/* generates a random number on [0,1]-real-interval */
|
|
double genrand_real1(MTdata d)
|
|
{
|
|
return genrand_int32(d) * (1.0 / 4294967295.0);
|
|
/* divided by 2^32-1 */
|
|
}
|
|
|
|
/* generates a random number on [0,1)-real-interval */
|
|
double genrand_real2(MTdata d)
|
|
{
|
|
return genrand_int32(d) * (1.0 / 4294967296.0);
|
|
/* divided by 2^32 */
|
|
}
|
|
|
|
/* generates a random number on (0,1)-real-interval */
|
|
double genrand_real3(MTdata d)
|
|
{
|
|
return (((double)genrand_int32(d)) + 0.5) * (1.0 / 4294967296.0);
|
|
/* divided by 2^32 */
|
|
}
|
|
|
|
/* generates a random number on [0,1) with 53-bit resolution*/
|
|
double genrand_res53(MTdata d)
|
|
{
|
|
unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6;
|
|
return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0);
|
|
}
|