SIMD Implementations Deep Dive
"Each SIMD implementation carefully manages state using class-based encapsulation, with robust initialization via the SplitMix64 seeding algorithm."
Every SIMD variant follows our consistent architecture principles while maximizing the unique capabilities of each instruction set. The goal: squeeze every cycle of performance from the silicon while maintaining code clarity and safety.
The scalar implementation serves as both fallback guarantee and performance baseline. Every optimization is measured against this rock-solid foundation.
```cpp
#include <cstdint>

class Xoroshiro128ppScalar : public RNGBase {
public:
    explicit Xoroshiro128ppScalar(uint64_t seed) {
        seed_state(seed);  // seed expansion provided by RNGBase (not shown in this excerpt)
    }

    uint64_t next_u64() override {
        const uint64_t s0 = state_[0];
        uint64_t s1 = state_[1];
        const uint64_t result = rotl64(s0 + s1, 17) + s0; // xoroshiro128++ scrambler

        // State transition (rotation/shift constants from the reference algorithm)
        s1 ^= s0;
        state_[0] = rotl64(s0, 49) ^ s1 ^ (s1 << 21);
        state_[1] = rotl64(s1, 28);
        return result;
    }
};
```
- 🎯 Reference Implementation - Bit-perfect algorithm execution
- 🔒 Memory Safety - `std::array` for state management
- ⚡ Compiler Friendly - Lets modern compilers auto-vectorize when possible
- 📱 Universal Compatibility - Runs on everything from embedded to supercomputers
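As a quick illustration (not part of the library itself), here is how a caller might exercise the scalar baseline. Only the `Xoroshiro128ppScalar` class above is assumed, and the seed value is arbitrary:

```cpp
#include <cstdint>
#include <iostream>

// Sketch only: drives the scalar baseline through its single-value path.
// Xoroshiro128ppScalar is the class shown above; RNGBase is not part of
// this excerpt.
int main() {
    Xoroshiro128ppScalar rng(0x123456789ABCDEFULL);  // arbitrary seed

    uint64_t checksum = 0;
    for (int i = 0; i < 1'000'000; ++i)
        checksum ^= rng.next_u64();                  // one value per call, no batching

    std::cout << std::hex << checksum << '\n';       // keeps the loop observable
    return 0;
}
```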
Target hardware for the SSE2 variant:
- Intel: Pentium 4+ (2001)
- AMD: Opteron+ (2003)
- Parallelism: 2-way (128-bit vectors)
```cpp
#include <emmintrin.h>   // SSE2 intrinsics
#include <cstddef>
#include <cstdint>

class Xoroshiro128ppSSE2 {
public:
    explicit Xoroshiro128ppSSE2(uint64_t seed) {
        auto seeds = generate_seeds<4>(seed);          // SplitMix64-derived seed words
        state0_ = _mm_set_epi64x(seeds[1], seeds[0]);  // two independent s0 lanes
        state1_ = _mm_set_epi64x(seeds[3], seeds[2]);  // two independent s1 lanes
    }

    void generate_batch(uint64_t* dest, size_t count) {
        // Processes pairs; an odd trailing element is left to the scalar path.
        for (size_t i = 0; i + 1 < count; i += 2) {
            // No direct 64-bit rotl in SSE2 - spill to scalars, rotate, reload
            alignas(16) uint64_t s0[2], s1[2];
            _mm_store_si128((__m128i*)s0, state0_);
            _mm_store_si128((__m128i*)s1, state1_);

            for (int j = 0; j < 2; j++) {
                const uint64_t sum = s0[j] + s1[j];
                const uint64_t rotated_sum = rotl64(sum, 17);
                dest[i + j] = rotated_sum + s0[j];     // xoroshiro128++ output

                // State update
                const uint64_t s1_xor_s0 = s1[j] ^ s0[j];
                s0[j] = rotl64(s0[j], 49) ^ s1_xor_s0 ^ (s1_xor_s0 << 21);
                s1[j] = rotl64(s1_xor_s0, 28);
            }

            state0_ = _mm_load_si128((__m128i*)s0);
            state1_ = _mm_load_si128((__m128i*)s1);
        }
    }

private:
    __m128i state0_;  // lane k holds s0 of stream k
    __m128i state1_;  // lane k holds s1 of stream k
};
```
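The `generate_seeds<N>()` helper used by the constructor above is not part of this excerpt. According to the note at the top of the page, seeding is based on SplitMix64, so a minimal sketch might look like the following; the name, arity, and usage are taken from the call above, and the constants are the standard SplitMix64 ones:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Sketch only: expands one 64-bit seed into N well-mixed words via SplitMix64.
// The project's real helper may differ; only the name and call shape are taken
// from the SSE2 constructor above.
template <size_t N>
std::array<uint64_t, N> generate_seeds(uint64_t seed) {
    std::array<uint64_t, N> out{};
    for (size_t i = 0; i < N; ++i) {
        uint64_t z = (seed += 0x9E3779B97F4A7C15ULL);   // golden-ratio increment
        z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ULL;
        z = (z ^ (z >> 27)) * 0x94D049BB133111EBULL;
        out[i] = z ^ (z >> 31);
    }
    return out;
}
```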
| Challenge | Problem | Solution |
|---|---|---|
| No 64-bit Rotations | SSE2 lacks a native 64-bit rotate | Manual scalar rotations via `rotl64` (sketched below) |
| Limited Instructions | Fewer vector operations available | Hybrid scalar/vector approach |
| Memory Bandwidth | 128-bit vectors less efficient | Optimized for 2-way parallelism |
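The scalar `rotl64()` helper that both listings rely on is also not defined in this excerpt. A minimal, portable sketch:

```cpp
#include <cstdint>

// Portable 64-bit rotate-left; k must be in [1, 63] for well-defined behaviour.
// Sketch only - the project's actual helper may use std::rotl (C++20) instead.
static inline uint64_t rotl64(uint64_t x, int k) {
    return (x << k) | (x >> (64 - k));
}
```

Modern compilers recognise this pattern and emit a single rotate instruction.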
- Speedup: ~8.1x in batch mode
- Memory: 64-byte aligned state
- Power: Higher power consumption
- Best Use: High-performance computing, servers
Target hardware for the NEON variant:
- ARM: Cortex-A series
- Apple: M1/M2 processors
- Parallelism: 2-way (128-bit vectors)
```cpp
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

class Xoroshiro128ppNeon {
public:
    void generate_batch(uint64_t* dest, size_t count) {
        // Processes pairs; an odd trailing element is left to the scalar path.
        for (size_t i = 0; i + 1 < count; i += 2) {
            // NEON-optimized xoroshiro128++ output
            uint64x2_t sum = vaddq_u64(state0_, state1_);
            uint64x2_t rotated_sum = rotl_neon<17>(sum);
            uint64x2_t result = vaddq_u64(rotated_sum, state0_);
            vst1q_u64(dest + i, result);

            // State update with NEON intrinsics
            uint64x2_t s1 = veorq_u64(state0_, state1_);
            state0_ = veorq_u64(
                rotl_neon<49>(state0_),
                veorq_u64(s1, vshlq_n_u64(s1, 21))
            );
            state1_ = rotl_neon<28>(s1);
        }
    }

private:
    // The _n_ shift intrinsics require compile-time immediates, so the rotate
    // amount is a template parameter rather than a runtime argument.
    template <int K>
    static inline uint64x2_t rotl_neon(uint64x2_t x) {
        return vsriq_n_u64(vshlq_n_u64(x, K), x, 64 - K);
    }

    uint64x2_t state0_;  // lane k holds s0 of stream k (seeding omitted in this excerpt)
    uint64x2_t state1_;  // lane k holds s1 of stream k
};
```

| Feature | Capability | Performance Impact |
|---|---|---|
| 64-bit Ops | `vaddq_u64`, `veorq_u64` | Native 64-bit support |
| Bit Manipulation | `vshlq_n_u64`, `vsriq_n_u64` | Efficient rotations |
| Memory Ops | `vld1q_u64`, `vst1q_u64` | Optimized load/store |
| Power Efficiency | Low power consumption | Excellent for mobile |
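All three variants share the same batching idea, and the scalar path serves as the fallback guarantee. The project's actual dispatch mechanism is not shown in this excerpt; one common compile-time pattern, using only the class names from the listings above, is:

```cpp
// Sketch only: compile-time selection of the widest variant the target
// supports, using the class names from the listings above. The project's
// real dispatch logic may differ (e.g. runtime CPU-feature detection).
#if defined(__aarch64__) || defined(__ARM_NEON)
using BatchRng = Xoroshiro128ppNeon;        // ARM / Apple silicon path
#elif defined(__SSE2__) || defined(_M_X64)
using BatchRng = Xoroshiro128ppSSE2;        // any 64-bit x86
#else
using BatchRng = Xoroshiro128ppScalar;      // universal fallback
#endif
```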
- Measure First - Always benchmark on your target hardware
- Batch When Possible - SIMD shines with bulk generation
- Consider Power - GPU implementations use significantly more power
- Profile Memory - Ensure proper alignment for SIMD loads/stores
- Test Edge Cases - Verify performance with non-aligned batch sizes (see the sketch after this list)
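Several of these points combine into one pattern: allocate the destination buffer over-aligned and let the scalar generator finish any odd tail. A minimal sketch, assuming the SSE2 and scalar classes above (`std::aligned_alloc` is C++17; MSVC would use `_aligned_malloc`):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdlib>

// Sketch only: combines aligned allocation with explicit tail handling.
// Xoroshiro128ppSSE2 / Xoroshiro128ppScalar are the classes shown above.
void fill_random(Xoroshiro128ppSSE2& simd, Xoroshiro128ppScalar& scalar,
                 uint64_t* dest, size_t count) {
    const size_t paired = count & ~size_t{1};   // largest even prefix
    simd.generate_batch(dest, paired);          // 2-way SIMD for the bulk
    for (size_t i = paired; i < count; ++i)     // scalar fallback for the odd tail
        dest[i] = scalar.next_u64();
}

int main() {
    // 64-byte alignment keeps SIMD stores on cache-line boundaries.
    constexpr size_t count = 1'000'001;                           // deliberately odd
    const size_t bytes = ((count * sizeof(uint64_t) + 63) / 64) * 64;
    auto* dest = static_cast<uint64_t*>(std::aligned_alloc(64, bytes));

    Xoroshiro128ppSSE2 simd(12345);
    Xoroshiro128ppScalar scalar(67890);
    fill_random(simd, scalar, dest, count);

    std::free(dest);
    return 0;
}
```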
Next: 🎯 Performance Analysis - Benchmark results and optimization stories
There is currently data lost off the bottom of the page - a search party needs to be sent in to rescue it!
**Please bear this in mind above all else:** in the current state of development, the C++ standard library's Mersenne Twister still outperforms these generators for single-value calls on machines without SIMD acceleration. At least AVX2 is required before this library beats the std generation methods for single-number generation tasks.
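To check this claim on a given machine, a small single-call timing harness is enough. A minimal sketch, assuming the `Xoroshiro128ppScalar` class above:

```cpp
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <random>

// Sketch only: times single-value generation of std::mt19937_64 against the
// scalar xoroshiro128++ class shown above, so the claim can be verified on
// your own hardware.
template <typename Gen>
double time_single_calls(Gen& gen, size_t n, uint64_t& sink) {
    const auto start = std::chrono::steady_clock::now();
    for (size_t i = 0; i < n; ++i)
        sink ^= gen();                       // sink prevents dead-code elimination
    const std::chrono::duration<double> dt = std::chrono::steady_clock::now() - start;
    return dt.count();
}

int main() {
    constexpr size_t n = 100'000'000;
    uint64_t sink = 0;

    std::mt19937_64 mt(42);
    Xoroshiro128ppScalar xo(42);
    auto xo_call = [&] { return xo.next_u64(); };

    std::cout << "mt19937_64:     " << time_single_calls(mt, n, sink) << " s\n";
    std::cout << "xoroshiro128++: " << time_single_calls(xo_call, n, sink) << " s\n";
    std::cout << std::hex << sink << '\n';   // keep the results observable
    return 0;
}
```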