examples/math/matmul: Decide team size dynamically

Host queries device for its dimensions. Host creates largest possible team to perform the matrix multiplication. (32x32, 16x16, 8x8, or 4x4). Signed-off-by: Ola Jeppsson <[email protected]>
parallella · Aug 4, 2016 · d568ca7 · d568ca7
1 parent abb9352
commit d568ca7
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 30 deletions.
diff --git a/examples/math/matmul/matmul-dev-epiphany.c b/examples/math/matmul/matmul-dev-epiphany.c
@@ -36,8 +36,9 @@
 #include "static-buffers.h"
 #include "common-buffers.h"
 
+int matmul(unsigned nside);
+void init(unsigned nside);
 void bigmatmul();
-void init();
 void data_copy_2d(void *dst,
 				  const void *src,
 				  unsigned inner_count,
@@ -48,14 +49,20 @@ void data_copy_2d(void *dst,
 
 p_mutex_t pmutex = P_MUTEX_INITIALIZER;
 
+
 int main(int argc, char *argv[])
+{
+	return matmul(4);
+}
+
+int matmul(unsigned nside)
 {
 	int status;
 
 	status = 0;
 
 	// Initialize data structures - mainly target pointers
-	init();
+	init(nside);
 
 	// Sync with all other cores
 	p_barrier(P_TEAM_DEFAULT);
@@ -71,7 +78,7 @@ int main(int argc, char *argv[])
 	return status;
 }
 
-void init()
+void init(unsigned nside)
 {
 	int32_t eram_base;
 	p_coords_t tmp_coords;
@@ -82,6 +89,9 @@ void init()
 	dev = p_init(P_DEV_EPIPHANY, 0);
 	eram_base = p_query(dev, P_PROP_MEMBASE);
 
+	me.nside = nside;
+	me.schip = _Score * me.nside;
+
 	me.rank = p_team_rank(P_TEAM_DEFAULT);
 	p_rank_to_coords(P_TEAM_DEFAULT, me.rank, &me.coords, 0);
 
@@ -137,27 +147,27 @@ void bigmatmul()
 	// Chip loop through operand matrix:
 	// Smtx is the size of operand matrices (Smtx x Smtx)
 	// Schip is size of a chip matrix (Schip x Schip)
-	for (im=0; im<_Smtx; im+=_Schip)
+	for (im=0; im<_Smtx; im+=me.schip)
 	{
-		for (jm=0; jm<_Smtx; jm+=_Schip)
+		for (jm=0; jm<_Smtx; jm+=me.schip)
 		{
 			// First clear the local result submatrix. The product result will be
 			// integrated into this submatrix.
 			matclr(me.bank_C, _Score);
 
-			for (km=0; km<_Smtx; km+=_Schip)
+			for (km=0; km<_Smtx; km+=me.schip)
 			{
 				// Core loop through chip:
 				// for every chip (mesh) iteration on the operand matrix
 				// calculate the matmul of the chip-sized submatrices
 				// in granularity of cores
 
 				// Wait for the DMA token
-				p_mutex_lock(&pmutex);
+				//p_mutex_lock(&pmutex);
 
 				// get A block from external DRAM
 				ic = me.coords.row * _Score;
-				jc = ((me.coords.col + me.coords.row) % _Nside) * _Score;
+				jc = ((me.coords.col + me.coords.row) % me.nside) * _Score;
 
 				src = &(Mailbox.pA[(im+ic)*_Smtx + (km+jc)]);
 				dst = me.bank_A[me.pingpong];
@@ -167,7 +177,7 @@ void bigmatmul()
 
 				// get B block from DRAM
 				jc = me.coords.col * _Score;
-				ic = ((me.coords.row + me.coords.col) % _Nside) * _Score;
+				ic = ((me.coords.row + me.coords.col) % me.nside) * _Score;
 
 				src = &(Mailbox.pB[(km+ic)*_Smtx + (jm+jc)]);
 				dst = me.bank_B[me.pingpong];
@@ -176,10 +186,10 @@ void bigmatmul()
 				data_copy_2d(dst, src, (_Score >> 1), _Score, 8, 8, ((_Smtx - _Score) * sizeof(float) + 8));
 
 				// Pass the DMA token to next core
-				p_mutex_unlock(&pmutex);
+				//p_mutex_unlock(&pmutex);
 
 				// Multiply submatrices (inner product of row x column)
-				for (kc=0; kc<_Nside; kc++)
+				for (kc=0; kc<me.nside; kc++)
 				{
 					// Core matmul:
 					// for every core calculate the matmul
@@ -193,13 +203,13 @@ void bigmatmul()
 					// Swap A banks horizontally
 					src = me.bank_A[me.pingpong];
 					dst = me.tgt_A[me.pingpong];
-					if (kc < (_Nside - 1))
+					if (kc < (me.nside - 1))
 						data_copy_2d(dst, src, (_Score >> 1), _Score, 8, 8, 8);
 
 					// Swap B banks vertically
 					src = me.bank_B[me.pingpong];
 					dst = me.tgt_B[me.pingpong];
-					if (kc < (_Nside - 1))
+					if (kc < (me.nside - 1))
 						data_copy_2d(dst, src, (_Score >> 1), _Score, 8, 8, 8);
 
 					me.pingpong = 1 - me.pingpong;
@@ -217,13 +227,13 @@ void bigmatmul()
 			dst = &(Mailbox.pC[(im+ic)*_Smtx + (jm+jc)]);
 
 			// Wait for the DMA token
-			p_mutex_lock(&pmutex);
+			//p_mutex_lock(&pmutex);
 
 			// Write data
 			data_copy_2d(dst, src, (_Score >> 1), _Score, 8, ((_Smtx - _Score) * sizeof(float) + 8), 8);
 
 			// Pass the DMA token to the next core
-			p_mutex_unlock(&pmutex);
+			//p_mutex_unlock(&pmutex);
 		}
 	}
 

diff --git a/examples/math/matmul/matmul-host.c b/examples/math/matmul/matmul-host.c
@@ -88,12 +88,14 @@ struct timespec timer[6];
 int main(int argc, char *argv[])
 {
 	p_mem_t shared_mem, results_mem;
+	uint32_t eram_base;
 	char results[1024] = { '\0' };
+	int device_cols, device_rows, nside;
 	p_dev_t dev;
 	p_prog_t prog;
 	p_team_t team;
+	p_coords_t size;
 	p_coords_t start = { .row = 0, .col = 0 };
-	p_coords_t size = { .row = 4, .col = 4 };
 
 	unsigned int msize;
 	float        seed;
@@ -117,10 +119,38 @@ int main(int argc, char *argv[])
 	}
 
 	dev = p_init(P_DEV_EPIPHANY, 0);
-    prog = p_load(dev, ar.elfFile, 0);
-    team = p_open4(dev, P_TOPOLOGY_2D, &start, &size);
+	if (p_error(dev)) {
+		fprintf(stderr, "Error initializing PAL\n");
+		return p_error(dev);
+	}
+
+	device_cols = p_query(dev, P_PROP_COLS);
+	device_rows = p_query(dev, P_PROP_ROWS);
+
+	// Use min size
+	nside = device_cols > device_rows ? device_cols : device_rows;
+
+	if (nside < 4) {
+		fprintf(stderr, "Error: Too small device, need at least 4x4\n");
+		return 1;
+	}
+
+	// Either 1024, 256, 64, or 16 cores (side must be power of two),
+	nside = nside >= 32 ? 32 : nside >= 16 ? 16 : nside >= 8 ? 8 : 4;
+
+	size.row = nside;
+	size.col = nside;
+	team = p_open4(dev, P_TOPOLOGY_2D, &start, &size);
+	printf("Using team of size %d\n", p_team_size(team));
+	if (p_error(team)) {
+		fprintf(stderr, "Error opening team\n");
+		return p_error(team);
+	}
+
+	prog = p_load(dev, ar.elfFile, 0);
 
-    shared_mem = p_map(dev, 0x8e000000, msize);
+	eram_base = (unsigned) p_query(dev, P_PROP_MEMBASE);
+	shared_mem = p_map(dev, eram_base, msize);
 
 	// Clear mailbox contents
 	memset(&Mailbox, 0, sizeof(Mailbox));
@@ -168,7 +198,8 @@ int main(int argc, char *argv[])
 	  printf("Loading program on Epiphany chip...\n");
 	}
 
-    if (p_run(prog, "main", team, 0, 16, 0, NULL, 0)) {
+	p_arg_t args[] = { &nside, sizeof(nside), true };
+	if (p_run(prog, "matmul", team, 0, p_team_size(team), 1, args, 0)) {
 		fprintf(stderr, "Error loading Epiphany program.\n");
 		exit(1);
 	}

diff --git a/examples/math/matmul/matmul.h b/examples/math/matmul/matmul.h
@@ -26,12 +26,8 @@
 #include <stdint.h>
 #include <pal.h>
 
-#define _Nchips 4                  // # of chips in operand matrix side
-#define _Nside  4                  // # of cores in chip side
-#define _Ncores (_Nside * _Nside)  // Num of cores = 16
-#define _Score  32                 // side size of per-core sub-submatrix (max 32)
-#define _Schip  (_Score * _Nside)  // side size of per-chip submatrix
-#define _Smtx   (_Schip * _Nchips) // side size of operand matrix
+#define _Score  16                 // side size of per-core sub-submatrix (max 32)
+#define _Smtx   512                // side size of operand matrix
 
 #define _Nbanks 4                  // Num of SRAM banks on core
 
@@ -42,12 +38,14 @@
 #define _PING   0
 #define _PONG   1
 
-
 typedef struct {
 	p_coords_t coords;
-	int rank;
-	int west_rank;
-	int north_rank;
+	int rank;         // My rank in team
+	int west_rank;    // Core west of me
+	int north_rank;   // Core north of me
+
+	unsigned schip;   // side size of per-chip submatrix
+	unsigned nside;   // # of cores in chip side
 
 	void  *bank_A[2]; // A Ping Pong Bank local space pointers
 	void  *bank_B[2]; // B Ping Pong Bank local space pointers