Skip to content

Commit

Permalink
Drop all work_group_reduce_X()
Browse files Browse the repository at this point in the history
Because:
- they allocate additional LDS,
- they are not supported by RustiCL,
- they are potentially slow,
- they are better replaced by sub_group_reduce.
  • Loading branch information
preda committed Dec 23, 2024
1 parent 002f686 commit 3737df5
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
7 changes: 5 additions & 2 deletions src/cl/carryutil.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
#if STATS || ROE
// Folds this work-item's roundMax into the slot bufROE[posROE] by taking an atomic
// maximum over the float's bit pattern (as_uint).
// NOTE(review): comparing the bit patterns as unsigned integers presumably relies on
// roundMax being non-negative, which the assert below checks -- confirm.
// NOTE(review): this span looks like a rendered diff without +/- markers, so the
// work_group_reduce_max() statements and their replacement both appear; presumably only
// the direct atomic_max at the end is the current code -- confirm against the repository.
void updateStats(global uint *bufROE, u32 posROE, float roundMax) {
assert(roundMax >= 0);
// Reduce the bit patterns across the work-group first.
u32 groupRound = work_group_reduce_max(as_uint(roundMax));
// work_group_reduce_max() allocates an additional 256 bytes of LDS for a 64-lane workgroup, so avoid it.
// u32 groupRound = work_group_reduce_max(as_uint(roundMax));
// if (get_local_id(0) == 0) { atomic_max(bufROE + posROE, groupRound); }

// Only the work-item with local id 0 publishes the group's value.
if (get_local_id(0) == 0) { atomic_max(bufROE + posROE, groupRound); }
// Do the reduction directly over global memory: every work-item issues its own atomic_max.
atomic_max(bufROE + posROE, as_uint(roundMax));
}
#endif

Expand Down
12 changes: 4 additions & 8 deletions src/cl/etc.cl
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,16 @@ KERNEL(32) readResidue(P(Word2) out, CP(Word2) in) {
#endif

#if SUM64
// Adds up the sizeBytes / sizeof(u64) 64-bit values read from `in` and accumulates the
// total into out[0] using 32-bit atomic adds.
// NOTE(review): this span looks like a rendered diff without +/- markers: two KERNEL(...)
// headers and two accumulation sequences appear. Presumably the KERNEL(256) header and the
// work_group_reduce_add() path are the removed side -- confirm against the repository.
KERNEL(256) sum64(global ulong* out, u32 sizeBytes, global ulong* in) {
KERNEL(64) sum64(global ulong* out, u32 sizeBytes, global ulong* in) {
// The work-item with global id 0 clears the accumulator.
// NOTE(review): nothing visible here orders this store before the atomic adds issued by
// other work-items -- confirm how the caller guarantees that.
if (get_global_id(0) == 0) { out[0] = 0; }

// Per-work-item partial sum.
ulong sum = 0;
// Each work-item starts at its global id and steps by the global size.
for (i32 p = get_global_id(0); p < sizeBytes / sizeof(u64); p += get_global_size(0)) {
sum += in[p];
}
// Work-group variant: combine the partial sums, then let local id 0 do the accumulation.
sum = work_group_reduce_add(sum);
if (get_local_id(0) == 0) {
u32 low = sum;
u32 prev = atomic_add((global u32*)out, low);
u32 high = (sum + prev) >> 32;
atomic_add(((global u32*)out) + 1, high);
}
// Per-work-item variant: 64-bit accumulation built from two 32-bit atomics.
// First add the low 32 bits of sum to the first 32-bit word of out (the low half,
// assuming little-endian layout -- confirm); prev is that word's value before the add.
u32 prev = atomic_add((global u32*)out, (u32) sum);
// (sum + prev) >> 32 is the upper 32 bits of sum plus the carry out of the low-word addition.
u32 high = (sum + prev) >> 32;
// Add that amount to the second 32-bit word of out.
atomic_add(((global u32*)out) + 1, high);
}
#endif

Expand Down

0 comments on commit 3737df5

Please sign in to comment.