commit 83ef5b48800a47cc30b3d4bcfdf31de9c3bd0dc8 Author: Patrick Lauer Date: Sun Jul 28 07:43:54 2024 +0000 Revert "ockl: Don't use wave32 ballot builtin" This reverts commit 066a0b2716b7ade96a2b3e79e5ddcd0c110e9f98. diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl index 18efc54203b7..a3f06c448aee 100644 --- a/ockl/src/dm.cl +++ b/ockl/src/dm.cl @@ -287,14 +287,9 @@ first(__global void * v) return __builtin_astype(w2, __global void *); } -// Read val from one active lane whose predicate is one. -// If no lanes have the predicate set, return none -// This is like first, except that first may not have its predicate set +REQUIRES_WAVE64 static uint -elect_uint(int pred, uint val, uint none) -{ - // Pretend wave32 doesn't exist. The wave64 ballot works, and the high half - // will fold out as 0. +elect_uint_wave64(int pred, uint val, uint none) { uint ret = none; ulong mask = __builtin_amdgcn_ballot_w64(pred != 0); @@ -306,14 +301,51 @@ elect_uint(int pred, uint val, uint none) return ret; } -// Count the number of nonzero arguments across the wave +REQUIRES_WAVE32 static uint -votes(bool b) +elect_uint_wave32(int pred, uint val, uint none) { + uint ret = none; + uint mask = __builtin_amdgcn_ballot_w32(pred != 0); + if (mask != 0U) { + uint l = __ockl_ctz_u32(mask); + ret = __builtin_amdgcn_ds_bpermute(l << 2, val); + } + + return ret; +} + +// Read val from one active lane whose predicate is one. +// If no lanes have the predicate set, return none +// This is like first, except that first may not have its predicate set +static uint +elect_uint(int pred, uint val, uint none) +{ + return __oclc_wavefrontsize64 ? elect_uint_wave64(pred, val, none) : elect_uint_wave32(pred, val, none); +} + +REQUIRES_WAVE64 +static uint +votes_wave64(bool b) { ulong mask = __builtin_amdgcn_ballot_w64(b); return __builtin_popcountl(mask); } +REQUIRES_WAVE32 +static uint +votes_wave32(bool b) +{ + uint mask = __builtin_amdgcn_ballot_w32(b); + return __builtin_popcount(mask); +} + +// Count the number of nonzero arguments across the wave +static uint +votes(bool b) +{ + return __oclc_wavefrontsize64 ? votes_wave64(b) : votes_wave32(b); +} + // The kind of the smallest block that can hold sz bytes static uint size_to_kind(uint sz) diff --git a/ockl/src/wfaas.cl b/ockl/src/wfaas.cl index 3861a5bb3eab..4dab97cea5f7 100644 --- a/ockl/src/wfaas.cl +++ b/ockl/src/wfaas.cl @@ -21,25 +21,60 @@ static int optimizationBarrierHack(int in_val) return out_val; } +REQUIRES_WAVE32 +static bool wfany_impl_w32(int e) { + return __builtin_amdgcn_ballot_w32(e) != 0; +} + +REQUIRES_WAVE64 +static bool wfany_impl_w64(int e) { + return __builtin_amdgcn_ballot_w64(e) != 0; +} + ATTR bool OCKL_MANGLE_I32(wfany)(int e) { e = optimizationBarrierHack(e); - return __builtin_amdgcn_ballot_w64(e) != 0; + return __oclc_wavefrontsize64 ? + wfany_impl_w64(e) : wfany_impl_w32(e); +} + +REQUIRES_WAVE32 +static bool wfall_impl_w32(int e) { + return __builtin_amdgcn_ballot_w32(e) == __builtin_amdgcn_read_exec_lo(); +} + +REQUIRES_WAVE64 +static bool wfall_impl_w64(int e) { + return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec(); } ATTR bool OCKL_MANGLE_I32(wfall)(int e) { e = optimizationBarrierHack(e); - return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec(); + return __oclc_wavefrontsize64 ? + wfall_impl_w64(e) : wfall_impl_w32(e); +} + + +REQUIRES_WAVE32 +static bool wfsame_impl_w32(int e) { + uint u = __builtin_amdgcn_ballot_w32(e); + return (u == 0) | (u == __builtin_amdgcn_read_exec_lo()); +} + +REQUIRES_WAVE64 +static bool wfsame_impl_w64(int e) { + ulong u = __builtin_amdgcn_ballot_w64(e); + return (u == 0UL) | (u == __builtin_amdgcn_read_exec()); } ATTR bool OCKL_MANGLE_I32(wfsame)(int e) { e = optimizationBarrierHack(e); - ulong u = __builtin_amdgcn_ballot_w64(e); - return (u == 0UL) | (u == __builtin_amdgcn_read_exec()); + return __oclc_wavefrontsize64 ? + wfsame_impl_w64(e) : wfsame_impl_w32(e); }