finally some progress

pkavvadias · Jan 26, 2020 · 9b46114 · 9b46114
1 parent 68638b6
commit 9b46114
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 12 deletions.
diff --git a/Main.cpp b/Main.cpp
@@ -4,7 +4,7 @@
 #include "device_launch_parameters.h"
 #include "cuda_runtime.h"
 #define ROWS 300
-#define COLUMNS 300
+#define COLUMNS 200
 #define THREADS 32
 
 /*For optimized algorithm*/
@@ -45,10 +45,14 @@ int main(int argc, char* argv[])
 	/*
 	 *Grid and block for the optimized algorithm
 	 */
+	/**
 	unsigned int numBlocksX = (ROWS - 1) / BLOCK_SIZE_PER_DIM + 1;
-	unsigned int numBlocksY = (ROWS - 1) / BLOCK_SIZE_PER_DIM + 1;
+	unsigned int numBlocksY = (COLUMNS - 1) / BLOCK_SIZE_PER_DIM + 1;
 	dim3 dimGridOpt(numBlocksX, numBlocksY, 1);
 	dim3 dimBlockOpt(BLOCK_SIZE_PER_DIM, BLOCK_SIZE_PER_DIM, 1);
+	*/
+	dim3 dimGridOpt(ceil((float)COLUMNS / TILE_DIM), ceil((float)ROWS / TILE_DIM),1);
+	dim3 dimBlockOpt(TILE_DIM, TILE_DIM, 1);
 	/*
 	 * Initialize timer
 	 */
@@ -78,6 +82,7 @@ int main(int argc, char* argv[])
 			simple_algorithm << <grid, block >> > (device_A, device_C, ROWS, COLUMNS);
 			t.stop_count();
 			std::cout << "Time elapsed to multiply using our simple algorithm is " << t.time() << " ms" << std::endl<<std::endl;
+			//std::cout << cudaGetLastError();
 			break;
 		case 3:
 			t.start_count();

diff --git a/Multiplications.cuh b/Multiplications.cuh
@@ -1,5 +1,4 @@
 #pragma once
 void cublas_multiplication(const double* A, double* C, const int rows, const int columns);
 __global__ void simple_algorithm(const double* A, double* C, const int rows, const int columns);
-//__global__ void optimized_algorithm(const double* __restrict__ A, double* __restrict__ C, const int rows, const int columns);
 __global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d, int ARows, int ACols);
diff --git a/optimizedAlgorithm.cu b/optimizedAlgorithm.cu
@@ -4,29 +4,26 @@
 
 
 #define TILE_DIM 32	//Tile dimension
-#define BLOCK_SIZE_PER_DIM 16	//Block dimension
+//#define BLOCK_SIZE_PER_DIM 16	//Block dimension
 
 __global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d, int ARows, int ACols) {
 
 	double CValue = 0.0;
-	//Υπολογισμός των δεικτών της γραμμής και της στήλης
 	int Row = blockIdx.y * TILE_DIM + threadIdx.y;
 	int Col = blockIdx.x * TILE_DIM + threadIdx.x;
-	//Χρήση καταχωρητών
 	int Var0 = blockIdx.y * TILE_DIM + threadIdx.x;
 	int Var1 = threadIdx.y * ACols + Var0;
 	int Var2 = threadIdx.y * ACols + Col;
 	int Var3 = TILE_DIM * ACols;
 	int Var4 = ((blockIdx.y * blockDim.y + threadIdx.y) * ACols) + (blockIdx.x * blockDim.x) + threadIdx.x;
 	int Var5 = threadIdx.y;
-	//Χρήση κοινής μνήμης
 	__shared__ volatile double As[TILE_DIM][TILE_DIM];
 	__shared__ volatile double Bs[TILE_DIM][TILE_DIM];
 
 	int counter = (TILE_DIM + ARows - 1) / TILE_DIM;
 
 	for (int k = 0; k < counter; k++) {
-		//Μηδενισμός των στοιχείων των tiles που βρίσκονται εκτος ορίων του μητρώου
+
 		if (Var5 < ARows && Var0 < ACols)
 			As[threadIdx.x][threadIdx.y] = A_d[k * Var3 + Var1];
 		else
@@ -38,7 +35,7 @@ __global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d,
 			Bs[threadIdx.y][threadIdx.x] = 0.0;
 		Var5 = k * TILE_DIM + threadIdx.y;
 		__syncthreads();
-		//Υπολογισμός ενδιάμεσων τιμών
+
 		CValue = CValue + As[threadIdx.y][0] * Bs[0][threadIdx.x]
 			+ As[threadIdx.y][1] * Bs[1][threadIdx.x]
 			+ As[threadIdx.y][2] * Bs[2][threadIdx.x]
@@ -57,7 +54,7 @@ __global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d,
 			+ As[threadIdx.y][15] * Bs[15][threadIdx.x];
 		__syncthreads();
 	}
-	//Αποθήκευση τελικής τιμής
+
 	if (Row < ACols && Col < ACols)
 		C_d[Var4] = CValue;
 }
diff --git a/simpleAlgorithm.cu b/simpleAlgorithm.cu
@@ -4,12 +4,12 @@
 
 __global__ void simple_algorithm(const double* A, double* C, const int rows, const int columns) {
 
-	double element = 0.0;
 	const int 	row = blockIdx.y * blockDim.y + threadIdx.y,
 	            col = blockIdx.x * blockDim.x + threadIdx.x;
 	if (row < rows && col < columns)
 	{
-		for (int k = 0; k < rows; ++k)
+		double element = 0.0;
+		for (int k = 0; k < rows; k++)
 		{
 			element += A[k * columns + row] * A[k * columns + col];
 		}