Skip to content

Commit

Permalink
finally some progress
Browse files Browse the repository at this point in the history
  • Loading branch information
pkavvadias committed Jan 26, 2020
1 parent 68638b6 commit 9b46114
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 12 deletions.
9 changes: 7 additions & 2 deletions Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#define ROWS 300
#define COLUMNS 300
#define COLUMNS 200
#define THREADS 32

/*For optimized algorithm*/
Expand Down Expand Up @@ -45,10 +45,14 @@ int main(int argc, char* argv[])
/*
*Grid and block for the optimized algorithm
*/
/**
unsigned int numBlocksX = (ROWS - 1) / BLOCK_SIZE_PER_DIM + 1;
unsigned int numBlocksY = (ROWS - 1) / BLOCK_SIZE_PER_DIM + 1;
unsigned int numBlocksY = (COLUMNS - 1) / BLOCK_SIZE_PER_DIM + 1;
dim3 dimGridOpt(numBlocksX, numBlocksY, 1);
dim3 dimBlockOpt(BLOCK_SIZE_PER_DIM, BLOCK_SIZE_PER_DIM, 1);
*/
dim3 dimGridOpt(ceil((float)COLUMNS / TILE_DIM), ceil((float)ROWS / TILE_DIM),1);
dim3 dimBlockOpt(TILE_DIM, TILE_DIM, 1);
/*
* Initialize timer
*/
Expand Down Expand Up @@ -78,6 +82,7 @@ int main(int argc, char* argv[])
simple_algorithm << <grid, block >> > (device_A, device_C, ROWS, COLUMNS);
t.stop_count();
std::cout << "Time elapsed to multiply using our simple algorithm is " << t.time() << " ms" << std::endl<<std::endl;
//std::cout << cudaGetLastError();
break;
case 3:
t.start_count();
Expand Down
1 change: 0 additions & 1 deletion Multiplications.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#pragma once
void cublas_multiplication(const double* A, double* C, const int rows, const int columns);
__global__ void simple_algorithm(const double* A, double* C, const int rows, const int columns);
//__global__ void optimized_algorithm(const double* __restrict__ A, double* __restrict__ C, const int rows, const int columns);
__global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d, int ARows, int ACols);
11 changes: 4 additions & 7 deletions optimizedAlgorithm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,29 +4,26 @@


#define TILE_DIM 32 //Tile dimension
#define BLOCK_SIZE_PER_DIM 16 //Block dimension
//#define BLOCK_SIZE_PER_DIM 16 //Block dimension

__global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d, int ARows, int ACols) {

double CValue = 0.0;
// Compute the row and column indices
int Row = blockIdx.y * TILE_DIM + threadIdx.y;
int Col = blockIdx.x * TILE_DIM + threadIdx.x;
// Use of registers
int Var0 = blockIdx.y * TILE_DIM + threadIdx.x;
int Var1 = threadIdx.y * ACols + Var0;
int Var2 = threadIdx.y * ACols + Col;
int Var3 = TILE_DIM * ACols;
int Var4 = ((blockIdx.y * blockDim.y + threadIdx.y) * ACols) + (blockIdx.x * blockDim.x) + threadIdx.x;
int Var5 = threadIdx.y;
// Use of shared memory
__shared__ volatile double As[TILE_DIM][TILE_DIM];
__shared__ volatile double Bs[TILE_DIM][TILE_DIM];

int counter = (TILE_DIM + ARows - 1) / TILE_DIM;

for (int k = 0; k < counter; k++) {
// Zero the tile elements that fall outside the bounds of the matrix

if (Var5 < ARows && Var0 < ACols)
As[threadIdx.x][threadIdx.y] = A_d[k * Var3 + Var1];
else
Expand All @@ -38,7 +35,7 @@ __global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d,
Bs[threadIdx.y][threadIdx.x] = 0.0;
Var5 = k * TILE_DIM + threadIdx.y;
__syncthreads();
// Compute the intermediate values

CValue = CValue + As[threadIdx.y][0] * Bs[0][threadIdx.x]
+ As[threadIdx.y][1] * Bs[1][threadIdx.x]
+ As[threadIdx.y][2] * Bs[2][threadIdx.x]
Expand All @@ -57,7 +54,7 @@ __global__ void optimized_algorithm(const double* __restrict__ A_d, double* C_d,
+ As[threadIdx.y][15] * Bs[15][threadIdx.x];
__syncthreads();
}
// Store the final value

if (Row < ACols && Col < ACols)
C_d[Var4] = CValue;
}
4 changes: 2 additions & 2 deletions simpleAlgorithm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

__global__ void simple_algorithm(const double* A, double* C, const int rows, const int columns) {

double element = 0.0;
const int row = blockIdx.y * blockDim.y + threadIdx.y,
col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < rows && col < columns)
{
for (int k = 0; k < rows; ++k)
double element = 0.0;
for (int k = 0; k < rows; k++)
{
element += A[k * columns + row] * A[k * columns + col];
}
Expand Down

0 comments on commit 9b46114

Please sign in to comment.