Skip to content

Commit

Permalink
examples/math/matmul: Decide team size dynamically
Browse files Browse the repository at this point in the history
Host queries the device for its dimensions.
Host creates the largest possible square team to perform the matrix multiplication
(32x32, 16x16, 8x8, or 4x4).

Signed-off-by: Ola Jeppsson <[email protected]>
  • Loading branch information
olajep committed Aug 4, 2016
1 parent abb9352 commit d568ca7
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 30 deletions.
40 changes: 25 additions & 15 deletions examples/math/matmul/matmul-dev-epiphany.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@
#include "static-buffers.h"
#include "common-buffers.h"

int matmul(unsigned nside);
void init(unsigned nside);
void bigmatmul();
void init();
void data_copy_2d(void *dst,
const void *src,
unsigned inner_count,
Expand All @@ -48,14 +49,20 @@ void data_copy_2d(void *dst,

p_mutex_t pmutex = P_MUTEX_INITIALIZER;


/* Stand-alone entry point: runs matmul() with a team side of 4 cores
 * and returns its status. Command-line arguments are not used. */
int main(int argc, char *argv[])
{
    const unsigned default_nside = 4;

    (void)argc;
    (void)argv;

    return matmul(default_nside);
}

int matmul(unsigned nside)
{
int status;

status = 0;

// Initialize data structures - mainly target pointers
init();
init(nside);

// Sync with all other cores
p_barrier(P_TEAM_DEFAULT);
Expand All @@ -71,7 +78,7 @@ int main(int argc, char *argv[])
return status;
}

void init()
void init(unsigned nside)
{
int32_t eram_base;
p_coords_t tmp_coords;
Expand All @@ -82,6 +89,9 @@ void init()
dev = p_init(P_DEV_EPIPHANY, 0);
eram_base = p_query(dev, P_PROP_MEMBASE);

me.nside = nside;
me.schip = _Score * me.nside;

me.rank = p_team_rank(P_TEAM_DEFAULT);
p_rank_to_coords(P_TEAM_DEFAULT, me.rank, &me.coords, 0);

Expand Down Expand Up @@ -137,27 +147,27 @@ void bigmatmul()
// Chip loop through operand matrix:
// Smtx is the size of operand matrices (Smtx x Smtx)
// Schip is size of a chip matrix (Schip x Schip)
for (im=0; im<_Smtx; im+=_Schip)
for (im=0; im<_Smtx; im+=me.schip)
{
for (jm=0; jm<_Smtx; jm+=_Schip)
for (jm=0; jm<_Smtx; jm+=me.schip)
{
// First clear the local result submatrix. The product result will be
// integrated into this submatrix.
matclr(me.bank_C, _Score);

for (km=0; km<_Smtx; km+=_Schip)
for (km=0; km<_Smtx; km+=me.schip)
{
// Core loop through chip:
// for every chip (mesh) iteration on the operand matrix
// calculate the matmul of the chip-sized submatrices
// in granularity of cores

// Wait for the DMA token
p_mutex_lock(&pmutex);
//p_mutex_lock(&pmutex);

// get A block from external DRAM
ic = me.coords.row * _Score;
jc = ((me.coords.col + me.coords.row) % _Nside) * _Score;
jc = ((me.coords.col + me.coords.row) % me.nside) * _Score;

src = &(Mailbox.pA[(im+ic)*_Smtx + (km+jc)]);
dst = me.bank_A[me.pingpong];
Expand All @@ -167,7 +177,7 @@ void bigmatmul()

// get B block from DRAM
jc = me.coords.col * _Score;
ic = ((me.coords.row + me.coords.col) % _Nside) * _Score;
ic = ((me.coords.row + me.coords.col) % me.nside) * _Score;

src = &(Mailbox.pB[(km+ic)*_Smtx + (jm+jc)]);
dst = me.bank_B[me.pingpong];
Expand All @@ -176,10 +186,10 @@ void bigmatmul()
data_copy_2d(dst, src, (_Score >> 1), _Score, 8, 8, ((_Smtx - _Score) * sizeof(float) + 8));

// Pass the DMA token to next core
p_mutex_unlock(&pmutex);
//p_mutex_unlock(&pmutex);

// Multiply submatrices (inner product of row x column)
for (kc=0; kc<_Nside; kc++)
for (kc=0; kc<me.nside; kc++)
{
// Core matmul:
// for every core calculate the matmul
Expand All @@ -193,13 +203,13 @@ void bigmatmul()
// Swap A banks horizontally
src = me.bank_A[me.pingpong];
dst = me.tgt_A[me.pingpong];
if (kc < (_Nside - 1))
if (kc < (me.nside - 1))
data_copy_2d(dst, src, (_Score >> 1), _Score, 8, 8, 8);

// Swap B banks vertically
src = me.bank_B[me.pingpong];
dst = me.tgt_B[me.pingpong];
if (kc < (_Nside - 1))
if (kc < (me.nside - 1))
data_copy_2d(dst, src, (_Score >> 1), _Score, 8, 8, 8);

me.pingpong = 1 - me.pingpong;
Expand All @@ -217,13 +227,13 @@ void bigmatmul()
dst = &(Mailbox.pC[(im+ic)*_Smtx + (jm+jc)]);

// Wait for the DMA token
p_mutex_lock(&pmutex);
//p_mutex_lock(&pmutex);

// Write data
data_copy_2d(dst, src, (_Score >> 1), _Score, 8, ((_Smtx - _Score) * sizeof(float) + 8), 8);

// Pass the DMA token to the next core
p_mutex_unlock(&pmutex);
//p_mutex_unlock(&pmutex);
}
}

Expand Down
41 changes: 36 additions & 5 deletions examples/math/matmul/matmul-host.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,14 @@ struct timespec timer[6];
int main(int argc, char *argv[])
{
p_mem_t shared_mem, results_mem;
uint32_t eram_base;
char results[1024] = { '\0' };
int device_cols, device_rows, nside;
p_dev_t dev;
p_prog_t prog;
p_team_t team;
p_coords_t size;
p_coords_t start = { .row = 0, .col = 0 };
p_coords_t size = { .row = 4, .col = 4 };

unsigned int msize;
float seed;
Expand All @@ -117,10 +119,38 @@ int main(int argc, char *argv[])
}

dev = p_init(P_DEV_EPIPHANY, 0);
prog = p_load(dev, ar.elfFile, 0);
team = p_open4(dev, P_TOPOLOGY_2D, &start, &size);
if (p_error(dev)) {
fprintf(stderr, "Error initializing PAL\n");
return p_error(dev);
}

device_cols = p_query(dev, P_PROP_COLS);
device_rows = p_query(dev, P_PROP_ROWS);

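// Use min size so the square team fits within both device dimensions.
// NOTE(review): the expression below selects the larger of cols/rows, not the smaller - confirm.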
nside = device_cols > device_rows ? device_cols : device_rows;

if (nside < 4) {
fprintf(stderr, "Error: Too small device, need at least 4x4\n");
return 1;
}

// Either 1024, 256, 64, or 16 cores (side must be a power of two).
nside = nside >= 32 ? 32 : nside >= 16 ? 16 : nside >= 8 ? 8 : 4;

size.row = nside;
size.col = nside;
team = p_open4(dev, P_TOPOLOGY_2D, &start, &size);
printf("Using team of size %d\n", p_team_size(team));
if (p_error(team)) {
fprintf(stderr, "Error opening team\n");
return p_error(team);
}

prog = p_load(dev, ar.elfFile, 0);

shared_mem = p_map(dev, 0x8e000000, msize);
eram_base = (unsigned) p_query(dev, P_PROP_MEMBASE);
shared_mem = p_map(dev, eram_base, msize);

// Clear mailbox contents
memset(&Mailbox, 0, sizeof(Mailbox));
Expand Down Expand Up @@ -168,7 +198,8 @@ int main(int argc, char *argv[])
printf("Loading program on Epiphany chip...\n");
}

if (p_run(prog, "main", team, 0, 16, 0, NULL, 0)) {
p_arg_t args[] = { &nside, sizeof(nside), true };
if (p_run(prog, "matmul", team, 0, p_team_size(team), 1, args, 0)) {
fprintf(stderr, "Error loading Epiphany program.\n");
exit(1);
}
Expand Down
18 changes: 8 additions & 10 deletions examples/math/matmul/matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,8 @@
#include <stdint.h>
#include <pal.h>

#define _Nchips 4 // # of chips in operand matrix side
#define _Nside 4 // # of cores in chip side
#define _Ncores (_Nside * _Nside) // Num of cores = 16
#define _Score 32 // side size of per-core sub-submatrix (max 32)
#define _Schip (_Score * _Nside) // side size of per-chip submatrix
#define _Smtx (_Schip * _Nchips) // side size of operand matrix
#define _Score 16 // side size of per-core sub-submatrix (max 32)
#define _Smtx 512 // side size of operand matrix

#define _Nbanks 4 // Num of SRAM banks on core

Expand All @@ -42,12 +38,14 @@
#define _PING 0
#define _PONG 1


typedef struct {
p_coords_t coords;
int rank;
int west_rank;
int north_rank;
int rank; // My rank in team
int west_rank; // Core west of me
int north_rank; // Core north of me

unsigned schip; // side size of per-chip submatrix
unsigned nside; // # of cores in chip side

void *bank_A[2]; // A Ping Pong Bank local space pointers
void *bank_B[2]; // B Ping Pong Bank local space pointers
Expand Down

0 comments on commit d568ca7

Please sign in to comment.