Skip to content

Commit

Permalink
Merge pull request pulp-platform#70 from LucaRufer/sysinfo
Browse files Browse the repository at this point in the history
Sysinfo
  • Loading branch information
SamuelRiedel authored Nov 18, 2022
2 parents d24b7c7 + 03d1f02 commit 3093b0f
Show file tree
Hide file tree
Showing 18 changed files with 120 additions and 68 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Upgrade to LLVM 14
- Support multiple outstanding wake-up calls in Snitch
- Clean out tracing script and improve the traces' size and checks
- Replace NUM_CORES and similar macros with function calls in software

## 0.5.0 - 2022-08-03

Expand Down
9 changes: 6 additions & 3 deletions software/apps/dotp/dotp_parallel.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ void dotp_parallel(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
uint32_t nPE) {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
uint32_t step = Len / nPE;

register int32_t local_sum = 0;
Expand All @@ -23,8 +24,9 @@ void dotp_parallel(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
__atomic_fetch_add(&s[0], local_sum, __ATOMIC_RELAXED);
#ifdef LOG_BARRIERS
mempool_log_barrier(2, core_id);
(void)num_cores;
#else
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
#endif
}

Expand All @@ -33,6 +35,7 @@ void dotp_parallel_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
uint32_t Len, uint32_t nPE) {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
uint32_t step = Len / nPE;
uint32_t reminder = step % 4;
uint32_t i;
Expand Down Expand Up @@ -67,14 +70,14 @@ void dotp_parallel_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
local_sum0 += local_sum1;
local_sum2 += local_sum3;
local_sum0 += local_sum2;
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);

mempool_stop_benchmark();
mempool_start_benchmark();
__atomic_fetch_add(&s[0], local_sum0, __ATOMIC_RELAXED);
#ifdef LOG_BARRIERS
mempool_log_barrier(2, core_id);
#else
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
#endif
}
14 changes: 8 additions & 6 deletions software/apps/dotp/dotp_parallel_local.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@ void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
uint32_t const remainder = Len % 4;
uint32_t const idx_stop = Len - remainder;
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();

if (nPE == NUM_CORES) {
if (nPE == num_cores) {
register int32_t local_sum = 0;
uint32_t idx = core_id * 4;
while (idx < idx_stop) {
Expand All @@ -45,7 +46,7 @@ void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
#ifdef LOG_BARRIERS
mempool_log_barrier(2, core_id);
#else
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
#endif
} else {
register int32_t local_sum = 0;
Expand All @@ -71,7 +72,7 @@ void dotp_parallel_local(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len,
#ifdef LOG_BARRIERS
mempool_log_partial_barrier(2, core_id, nPE);
#else
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
#endif
}
}
Expand All @@ -83,12 +84,13 @@ void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
uint32_t const remainder = Len % 4;
uint32_t const idx_stop = Len - remainder;
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
register int32_t local_sum_1 = 0;
register int32_t local_sum_2 = 0;
register int32_t local_sum_3 = 0;
register int32_t local_sum_4 = 0;

if (nPE == NUM_CORES) {
if (nPE == num_cores) {
uint32_t idx = core_id * 4;
while (idx < idx_stop) {
int32_t in_a1 = in_a[idx];
Expand Down Expand Up @@ -120,7 +122,7 @@ void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
#ifdef LOG_BARRIERS
mempool_log_barrier(2, core_id);
#else
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
#endif
} else {
uint32_t idx = core_id * 4;
Expand Down Expand Up @@ -154,7 +156,7 @@ void dotp_parallel_local_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
#ifdef LOG_BARRIERS
mempool_log_partial_barrier(2, core_id, nPE);
#else
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
#endif
}
}
6 changes: 4 additions & 2 deletions software/apps/dotp/dotp_parallel_red0.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ void dotp_parallel_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
uint32_t const remainder = Len % 4;
uint32_t const idx_stop = Len - remainder;
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
int32_t local_sum = 0;

uint32_t idx = core_id * 4;
Expand All @@ -45,7 +46,7 @@ void dotp_parallel_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
mempool_stop_benchmark();

mempool_start_benchmark();
if ((NUM_CORES - 1) ==
if ((num_cores - 1) ==
__atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
__sync_synchronize(); // Full memory barrier
Expand All @@ -68,6 +69,7 @@ void dotp_parallel_unrolled4_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
uint32_t const remainder = Len % 4;
uint32_t const idx_stop = Len - remainder;
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
int32_t local_sum_1 = 0;
int32_t local_sum_2 = 0;
int32_t local_sum_3 = 0;
Expand Down Expand Up @@ -103,7 +105,7 @@ void dotp_parallel_unrolled4_red0(int32_t *in_a, int32_t *in_b, int32_t *s,
mempool_stop_benchmark();

mempool_start_benchmark();
if ((NUM_CORES - 1) ==
if ((num_cores - 1) ==
__atomic_fetch_add(&red_barrier[0], 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&red_barrier[0], 0, __ATOMIC_RELAXED);
__sync_synchronize(); // Full memory barrier
Expand Down
3 changes: 2 additions & 1 deletion software/apps/dotp/dotp_parallel_redtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ void mempool_log_reduction(int32_t *sum, uint32_t volatile step,
uint32_t idx_sum, idx = (step * (core_id / step)) * 4;
uint32_t next_step, previous_step;
register int32_t local_sum;
uint32_t num_cores = mempool_get_core_count();

previous_step = step >> 1;
if ((step - previous_step) ==
Expand All @@ -114,7 +115,7 @@ void mempool_log_reduction(int32_t *sum, uint32_t volatile step,
next_step = step << 1;
__atomic_store_n(&red_barrier[idx + previous_step - 1], 0,
__ATOMIC_RELAXED);
if (NUM_CORES == step) {
if (num_cores == step) {
sum[0] = sum[idx];
__sync_synchronize(); // Full memory barrier
wake_up_all();
Expand Down
6 changes: 4 additions & 2 deletions software/apps/dotp/dotp_single.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
if (core_id == 0) {

mempool_start_benchmark();
Expand All @@ -21,14 +22,15 @@ void dotp_single(int32_t *in_a, int32_t *in_b, int32_t *s, uint32_t Len) {
*s = local_sum;
mempool_stop_benchmark();
}
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
}

/* Single-core dot-product unrolled4 */
void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
uint32_t Len) {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
if (core_id == 0) {

mempool_start_benchmark();
Expand Down Expand Up @@ -67,6 +69,6 @@ void dotp_single_unrolled4(int32_t *in_a, int32_t *in_b, int32_t *s,
*s = local_sum_1;
mempool_stop_benchmark();
}
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
// mempool_log_barrier(2, core_id);
}
10 changes: 6 additions & 4 deletions software/apps/dotp/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ void init_vectors(int32_t *in_a, int32_t *in_b, int32_t *s, int32_t *p_result,
*p_result = 0;
*p_check = 0;
uint32_t j = 0;
uint32_t num_cores = mempool_get_core_count();
while (j < Len) {
int32_t a = (int32_t)(j % NUM_CORES);
int32_t a = (int32_t)(j % num_cores);
int32_t b = (int32_t)(j % 4 + 3);
in_a[j] = a;
in_b[j] = b;
Expand All @@ -48,6 +49,7 @@ void init_vectors(int32_t *in_a, int32_t *in_b, int32_t *s, int32_t *p_result,
int main() {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
uint32_t time_init, time_end;
// initialize synchronization variables
mempool_barrier_init(core_id);
Expand All @@ -63,7 +65,7 @@ int main() {
init_vectors(vector_a, vector_b, &sum, &result, &check, LEN);
#endif
}
mempool_barrier(NUM_CORES); // wait until all cores have finished
mempool_barrier(num_cores); // wait until all cores have finished

// Kernel execution

Expand Down Expand Up @@ -117,7 +119,7 @@ int main() {
time_end = mempool_get_timer();
}

mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);
// Check results
if (core_id == 0) {
uint32_t clock_cycles = (time_end - time_init);
Expand All @@ -131,7 +133,7 @@ int main() {
printf("Result ==> %d\n", result);
printf("Check ==> %d\n\n", check);
}
mempool_barrier(NUM_CORES);
mempool_barrier(num_cores);

return error;
}
5 changes: 2 additions & 3 deletions software/apps/malloc_test/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ int main() {
mempool_barrier_init(core_id);

// Initialization
mempool_init(core_id, num_cores);
mempool_init(core_id);

// Test
if (core_id == 0) {
Expand Down Expand Up @@ -129,8 +129,7 @@ int main() {
// ------------------------------------------------------------------------
// Sequential Memory Basic Tests
// ------------------------------------------------------------------------
for (uint32_t tile_id = 0; tile_id < num_cores / NUM_CORES_PER_TILE;
++tile_id) {
for (uint32_t tile_id = 0; tile_id < mempool_get_tile_count(); ++tile_id) {
printf("Test tile allocator %u:\n", tile_id);

// Get tile allocator
Expand Down
4 changes: 2 additions & 2 deletions software/apps/systolic/matmul/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@ void print_matrix(int32_t const *matrix, uint32_t num_rows,
int main() {
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
uint32_t tile_id = core_id / NUM_CORES_PER_TILE;
uint32_t tile_id = mempool_get_tile_id();

// Initialize synchronization variables
mempool_barrier_init(core_id);

// Initialization
mempool_init(core_id, num_cores);
mempool_init(core_id);

// Allocate systolic grid mapping
if (core_id == 0) {
Expand Down
2 changes: 1 addition & 1 deletion software/apps/systolic/queue_multi_test/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ int main() {
mempool_barrier_init(core_id);

// Initialization
mempool_init(core_id, num_cores);
mempool_init(core_id);

// Wait for all cores
mempool_barrier(num_cores);
Expand Down
2 changes: 1 addition & 1 deletion software/apps/systolic/queue_test/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ int main() {
mempool_barrier_init(core_id);

// Initialization
mempool_init(core_id, num_cores);
mempool_init(core_id);

// Setup
if (core_id == 0) {
Expand Down
28 changes: 15 additions & 13 deletions software/apps/test_tile_wu/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,17 @@ int main() {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
uint32_t num_cores_per_tile = mempool_get_core_count_per_tile();
uint32_t num_cores_per_group = mempool_get_core_count_per_group();

mempool_barrier_init(core_id);
if (core_id == 0) {
sleep = 0;
}
mempool_barrier(num_cores);

if (core_id > (5 * NUM_CORES_PER_TILE - 1) &&
core_id < (6 * NUM_CORES_PER_TILE)) {
if (core_id > (5 * num_cores_per_tile - 1) &&
core_id < (6 * num_cores_per_tile)) {
if (3 == __atomic_fetch_add(&sleep, 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&sleep, 0, __ATOMIC_RELAXED);
__sync_synchronize();
Expand All @@ -39,8 +41,8 @@ int main() {
mempool_barrier(num_cores);

for (uint32_t i = 0; i < NUM_TILES_PER_GROUP; i++) {
if (core_id < (i + 1) * NUM_CORES_PER_TILE) {
if ((i + 1) * NUM_CORES_PER_TILE - 1 ==
if (core_id < (i + 1) * num_cores_per_tile) {
if ((i + 1) * num_cores_per_tile - 1 ==
__atomic_fetch_add(&sleep, 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&sleep, 0, __ATOMIC_RELAXED);
__sync_synchronize();
Expand All @@ -56,9 +58,9 @@ int main() {
mempool_barrier(num_cores);

for (uint32_t i = 0; i < NUM_TILES_PER_GROUP; i++) {
if (core_id < NUM_CORES_PER_GROUP + (i + 1) * NUM_CORES_PER_TILE &&
core_id > NUM_CORES_PER_GROUP - 1) {
if ((i + 1) * NUM_CORES_PER_TILE - 1 ==
if (core_id < num_cores_per_group + (i + 1) * num_cores_per_tile &&
core_id > num_cores_per_group - 1) {
if ((i + 1) * num_cores_per_tile - 1 ==
__atomic_fetch_add(&sleep, 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&sleep, 0, __ATOMIC_RELAXED);
__sync_synchronize();
Expand All @@ -74,9 +76,9 @@ int main() {
mempool_barrier(num_cores);

for (uint32_t i = 0; i < NUM_TILES_PER_GROUP; i++) {
if (core_id < 2 * NUM_CORES_PER_GROUP + (i + 1) * NUM_CORES_PER_TILE &&
core_id > 2 * NUM_CORES_PER_GROUP - 1) {
if ((i + 1) * NUM_CORES_PER_TILE - 1 ==
if (core_id < 2 * num_cores_per_group + (i + 1) * num_cores_per_tile &&
core_id > 2 * num_cores_per_group - 1) {
if ((i + 1) * num_cores_per_tile - 1 ==
__atomic_fetch_add(&sleep, 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&sleep, 0, __ATOMIC_RELAXED);
__sync_synchronize();
Expand All @@ -90,9 +92,9 @@ int main() {
}

for (uint32_t i = 0; i < NUM_TILES_PER_GROUP; i++) {
if (core_id < 3 * NUM_CORES_PER_GROUP + (i + 1) * NUM_CORES_PER_TILE &&
core_id > 3 * NUM_CORES_PER_GROUP - 1) {
if ((i + 1) * NUM_CORES_PER_TILE - 1 ==
if (core_id < 3 * num_cores_per_group + (i + 1) * num_cores_per_tile &&
core_id > 3 * num_cores_per_group - 1) {
if ((i + 1) * num_cores_per_tile - 1 ==
__atomic_fetch_add(&sleep, 1, __ATOMIC_RELAXED)) {
__atomic_store_n(&sleep, 0, __ATOMIC_RELAXED);
__sync_synchronize();
Expand Down
3 changes: 2 additions & 1 deletion software/omp/critical/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
int test_omp_critical() {
int sum;
int known_sum, mysum;
int num_cores = (int)mempool_get_core_count();

sum = 0;
#pragma omp parallel
Expand All @@ -36,7 +37,7 @@ int test_omp_critical() {
// printf("Sum: %d, thread_id: %d\n",sum,omp_get_thread_num());
}
}
known_sum = 99 * 100 / 2 * NUM_CORES;
known_sum = 99 * 100 / 2 * num_cores;
return (known_sum == sum);
}

Expand Down
2 changes: 1 addition & 1 deletion software/omp/critical_benchmark/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ void parallel_critical_manual() {

islocked = __atomic_fetch_or(lock, 1, __ATOMIC_SEQ_CST);
while (islocked) {
mempool_wait(NUM_CORES);
mempool_wait(num_cores);
islocked = __atomic_fetch_or(lock, 1, __ATOMIC_SEQ_CST);
}

Expand Down
Loading

0 comments on commit 3093b0f

Please sign in to comment.