/build/rocrand-7S8maf/rocrand-7.1.1/library/include/rocrand/rocrand_philox4x32_10.h Source File

/build/rocrand-7S8maf/rocrand-7.1.1/library/include/rocrand/rocrand_philox4x32_10.h Source File#

API library: /build/rocrand-7S8maf/rocrand-7.1.1/library/include/rocrand/rocrand_philox4x32_10.h Source File
rocrand_philox4x32_10.h
1// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*
22Copyright 2010-2011, D. E. Shaw Research.
23All rights reserved.
24
25Redistribution and use in source and binary forms, with or without
26modification, are permitted provided that the following conditions are
27met:
28
29* Redistributions of source code must retain the above copyright
30 notice, this list of conditions, and the following disclaimer.
31
32* Redistributions in binary form must reproduce the above copyright
33 notice, this list of conditions, and the following disclaimer in the
34 documentation and/or other materials provided with the distribution.
35
36* Neither the name of D. E. Shaw Research nor the names of its
37 contributors may be used to endorse or promote products derived from
38 this software without specific prior written permission.
39
40THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
41"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
42LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
43A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
44OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
45SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
46LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
47DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
48THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
49(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
50OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51*/
52
53#ifndef ROCRAND_PHILOX4X32_10_H_
54#define ROCRAND_PHILOX4X32_10_H_
55
56#include "rocrand/rocrand_common.h"
57
58#include <hip/hip_runtime.h>
59
60// Constants from Random123
61// See https://www.deshawresearch.com/resources_random123.html
62#define ROCRAND_PHILOX_M4x32_0 0xD2511F53U
63#define ROCRAND_PHILOX_M4x32_1 0xCD9E8D57U
64#define ROCRAND_PHILOX_W32_0 0x9E3779B9U
65#define ROCRAND_PHILOX_W32_1 0xBB67AE85U
66
75#define ROCRAND_PHILOX4x32_DEFAULT_SEED 0xdeadbeefdeadbeefULL // Default seed for PHILOX4x32 PRNG.
77
78namespace rocrand_device
79{
80
81class philox4x32_10_engine
82{
83public:
84 struct philox4x32_10_state
85 {
86 uint4 counter;
87 uint4 result;
88 uint2 key;
89 unsigned int substate;
90
91 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
92 // The Box–Muller transform requires two inputs to convert uniformly
93 // distributed real values [0; 1] to normally distributed real values
94 // (with mean = 0, and stddev = 1). Often user wants only one
95 // normally distributed number, to save performance and random
96 // numbers the 2nd value is saved for future requests.
97 unsigned int boxmuller_float_state; // is there a float in boxmuller_float
98 unsigned int boxmuller_double_state; // is there a double in boxmuller_double
99 float boxmuller_float; // normally distributed float
100 double boxmuller_double; // normally distributed double
101 #endif
102 };
103
104 __forceinline__ __device__ __host__ philox4x32_10_engine()
105 {
106 this->seed(ROCRAND_PHILOX4x32_DEFAULT_SEED, 0, 0);
107 }
108
114 __forceinline__ __device__ __host__ philox4x32_10_engine(const unsigned long long seed,
115 const unsigned long long subsequence,
116 const unsigned long long offset)
117 {
118 this->seed(seed, subsequence, offset);
119 }
120
126 __forceinline__ __device__ __host__ void seed(unsigned long long seed_value,
127 const unsigned long long subsequence,
128 const unsigned long long offset)
129 {
130 m_state.key.x = static_cast<unsigned int>(seed_value);
131 m_state.key.y = static_cast<unsigned int>(seed_value >> 32);
132 this->restart(subsequence, offset);
133 }
134
136 __forceinline__ __device__ __host__ void discard(unsigned long long offset)
137 {
138 this->discard_impl(offset);
139 this->m_state.result = this->ten_rounds(m_state.counter, m_state.key);
140 }
141
146 __forceinline__ __device__ __host__ void discard_subsequence(unsigned long long subsequence)
147 {
148 this->discard_subsequence_impl(subsequence);
149 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
150 }
151
152 __forceinline__ __device__ __host__ void restart(const unsigned long long subsequence,
153 const unsigned long long offset)
154 {
155 m_state.counter = {0, 0, 0, 0};
156 m_state.result = {0, 0, 0, 0};
157 m_state.substate = 0;
158 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
159 m_state.boxmuller_float_state = 0;
160 m_state.boxmuller_double_state = 0;
161 #endif
162 this->discard_subsequence_impl(subsequence);
163 this->discard_impl(offset);
164 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
165 }
166
167 __forceinline__ __device__ __host__ unsigned int operator()()
168 {
169 return this->next();
170 }
171
172 __forceinline__ __device__ __host__ unsigned int next()
173 {
174 #if defined(__HIP_PLATFORM_AMD__)
175 unsigned int ret = ROCRAND_HIPVEC_ACCESS(m_state.result)[m_state.substate];
176 #else
177 unsigned int ret = (&m_state.result.x)[m_state.substate];
178 #endif
179
180 m_state.substate++;
181 if(m_state.substate == 4)
182 {
183 m_state.substate = 0;
184 this->discard_state();
185 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
186 }
187 return ret;
188 }
189
190 __forceinline__ __device__ __host__ uint4 next4()
191 {
192 uint4 ret = m_state.result;
193 this->discard_state();
194 m_state.result = this->ten_rounds(m_state.counter, m_state.key);
195 return this->interleave(ret, m_state.result);
196 }
197
198protected:
199 // Advances the internal state to skip \p offset numbers.
200 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
201 __forceinline__ __device__ __host__ void discard_impl(unsigned long long offset)
202 {
203 // Adjust offset for subset
204 m_state.substate += offset & 3;
205 unsigned long long counter_offset = offset / 4;
206 counter_offset += m_state.substate < 4 ? 0 : 1;
207 m_state.substate += m_state.substate < 4 ? 0 : -4;
208 // Discard states
209 this->discard_state(counter_offset);
210 }
211
212 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
213 __forceinline__ __device__ __host__ void
214 discard_subsequence_impl(unsigned long long subsequence)
215 {
216 unsigned int lo = static_cast<unsigned int>(subsequence);
217 unsigned int hi = static_cast<unsigned int>(subsequence >> 32);
218
219 unsigned int temp = m_state.counter.z;
220 m_state.counter.z += lo;
221 m_state.counter.w += hi + (m_state.counter.z < temp ? 1 : 0);
222 }
223
224 // Advances the internal state by offset times.
225 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
226 __forceinline__ __device__ __host__ void discard_state(unsigned long long offset)
227 {
228 unsigned int lo = static_cast<unsigned int>(offset);
229 unsigned int hi = static_cast<unsigned int>(offset >> 32);
230
231 uint4 temp = m_state.counter;
232 m_state.counter.x += lo;
233 m_state.counter.y += hi + (m_state.counter.x < temp.x ? 1 : 0);
234 m_state.counter.z += (m_state.counter.y < temp.y ? 1 : 0);
235 m_state.counter.w += (m_state.counter.z < temp.z ? 1 : 0);
236 }
237
238 // Advances the internal state to the next state
239 // DOES NOT CALCULATE NEW 4 UINTs (m_state.result)
240 __forceinline__ __device__ __host__ void discard_state()
241 {
242 m_state.counter = this->bump_counter(m_state.counter);
243 }
244
245 __forceinline__ __device__ __host__ static uint4 bump_counter(uint4 counter)
246 {
247 counter.x++;
248 unsigned int add = counter.x == 0 ? 1 : 0;
249 counter.y += add; add = counter.y == 0 ? add : 0;
250 counter.z += add; add = counter.z == 0 ? add : 0;
251 counter.w += add;
252 return counter;
253 }
254
255 __forceinline__ __device__ __host__ uint4 interleave(const uint4 prev, const uint4 next) const
256 {
257 switch(m_state.substate)
258 {
259 case 0:
260 return prev;
261 case 1:
262 return uint4{ prev.y, prev.z, prev.w, next.x };
263 case 2:
264 return uint4{ prev.z, prev.w, next.x, next.y };
265 case 3:
266 return uint4{ prev.w, next.x, next.y, next.z };
267 }
268 __builtin_unreachable();
269 }
270
271 // 10 Philox4x32 rounds
272 __forceinline__ __device__ __host__ uint4 ten_rounds(uint4 counter, uint2 key)
273 {
274 counter = this->single_round(counter, key); key = this->bumpkey(key); // 1
275 counter = this->single_round(counter, key); key = this->bumpkey(key); // 2
276 counter = this->single_round(counter, key); key = this->bumpkey(key); // 3
277 counter = this->single_round(counter, key); key = this->bumpkey(key); // 4
278 counter = this->single_round(counter, key); key = this->bumpkey(key); // 5
279 counter = this->single_round(counter, key); key = this->bumpkey(key); // 6
280 counter = this->single_round(counter, key); key = this->bumpkey(key); // 7
281 counter = this->single_round(counter, key); key = this->bumpkey(key); // 8
282 counter = this->single_round(counter, key); key = this->bumpkey(key); // 9
283 return this->single_round(counter, key); // 10
284 }
285
286private:
287 // Single Philox4x32 round
288 __forceinline__ __device__ __host__ static uint4 single_round(uint4 counter, uint2 key)
289 {
290 // Source: Random123
291 unsigned long long mul0 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_0, counter.x);
292 unsigned int hi0 = static_cast<unsigned int>(mul0 >> 32);
293 unsigned int lo0 = static_cast<unsigned int>(mul0);
294 unsigned long long mul1 = detail::mul_u64_u32(ROCRAND_PHILOX_M4x32_1, counter.z);
295 unsigned int hi1 = static_cast<unsigned int>(mul1 >> 32);
296 unsigned int lo1 = static_cast<unsigned int>(mul1);
297 return uint4{hi1 ^ counter.y ^ key.x, lo1, hi0 ^ counter.w ^ key.y, lo0};
298 }
299
300 __forceinline__ __device__ __host__ static uint2 bumpkey(uint2 key)
301 {
302 key.x += ROCRAND_PHILOX_W32_0;
303 key.y += ROCRAND_PHILOX_W32_1;
304 return key;
305 }
306
307protected:
308 // State
309 philox4x32_10_state m_state;
310
311 #ifndef ROCRAND_DETAIL_BM_NOT_IN_STATE
312 friend struct detail::engine_boxmuller_helper<philox4x32_10_engine>;
313 #endif
314
315}; // philox4x32_10_engine class
316
317} // end namespace rocrand_device
318
323
325typedef rocrand_device::philox4x32_10_engine rocrand_state_philox4x32_10;
327
339__forceinline__ __device__ __host__
340void rocrand_init(const unsigned long long seed,
341 const unsigned long long subsequence,
342 const unsigned long long offset,
343 rocrand_state_philox4x32_10* state)
344{
345 *state = rocrand_state_philox4x32_10(seed, subsequence, offset);
346}
347
360__forceinline__ __device__ __host__
361unsigned int rocrand(rocrand_state_philox4x32_10* state)
362{
363 return state->next();
364}
365
378__forceinline__ __device__ __host__
379uint4 rocrand4(rocrand_state_philox4x32_10* state)
380{
381 return state->next4();
382}
383
392__forceinline__ __device__ __host__
393void skipahead(unsigned long long offset, rocrand_state_philox4x32_10* state)
394{
395 return state->discard(offset);
396}
397
407__forceinline__ __device__ __host__
408void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10* state)
409{
410 return state->discard_subsequence(subsequence);
411}
412
422__forceinline__ __device__ __host__
423void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10* state)
424{
425 return state->discard_subsequence(sequence);
426}
427 // end of group rocranddevice
429
430#endif // ROCRAND_PHILOX4X32_10_H_
__forceinline__ __device__ __host__ void skipahead(unsigned long long offset, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by offset elements.
Definition rocrand_philox4x32_10.h:393
__forceinline__ __device__ __host__ void rocrand_init(const unsigned long long seed, const unsigned long long subsequence, const unsigned long long offset, rocrand_state_philox4x32_10 *state)
Initializes Philox state.
Definition rocrand_philox4x32_10.h:340
#define ROCRAND_PHILOX4x32_DEFAULT_SEED
Default seed for PHILOX4x32 PRNG.
Definition rocrand_philox4x32_10.h:75
__forceinline__ __device__ __host__ uint4 rocrand4(rocrand_state_philox4x32_10 *state)
Returns four uniformly distributed random unsigned int values from [0; 2^32 - 1] range.
Definition rocrand_philox4x32_10.h:379
__forceinline__ __device__ __host__ void skipahead_sequence(unsigned long long sequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by sequence sequences.
Definition rocrand_philox4x32_10.h:423
__forceinline__ __device__ __host__ unsigned int rocrand(rocrand_state_philox4x32_10 *state)
Returns uniformly distributed random unsigned int value from [0; 2^32 - 1] range.
Definition rocrand_philox4x32_10.h:361
__forceinline__ __device__ __host__ void skipahead_subsequence(unsigned long long subsequence, rocrand_state_philox4x32_10 *state)
Updates Philox state to skip ahead by subsequence subsequences.
Definition rocrand_philox4x32_10.h:408