-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7c8868e
commit 2688849
Showing
10 changed files
with
192 additions
and
0 deletions.
There are no files selected for viewing
File renamed without changes.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#include <cstdio> | ||
#include <limits> | ||
#include <algorithm> | ||
#include <vector> | ||
|
||
#include <omp.h> | ||
|
||
void transpose(int m, int n, | ||
const double* A, int lda, | ||
double* B, int ldb); | ||
|
||
template <typename Experiment> | ||
double benchmark(Experiment&& exp, int num_repeat=1) | ||
{ | ||
double min_time = std::numeric_limits<double>::max(); | ||
|
||
for (int i = 0;i < num_repeat;i++) | ||
{ | ||
double t0 = omp_get_wtime(); | ||
exp(); | ||
double t1 = omp_get_wtime(); | ||
min_time = std::min(min_time, t1-t0); | ||
} | ||
|
||
return min_time; | ||
} | ||
|
||
int main(int argc, char** argv) | ||
{ | ||
int nmax = 1000; | ||
|
||
std::vector<double> A(nmax*nmax), B(nmax*nmax); | ||
|
||
for (int n = 4;n <= nmax;n += 4) | ||
{ | ||
double elapsed = benchmark([&] { transpose(n, n, A.data(), n, B.data(), n); }, 100); | ||
double bytes = 2*sizeof(double)*n*n; | ||
double gbps = bytes/elapsed/1024/1024/1024; | ||
|
||
printf("%d %g\n", n, gbps); | ||
fflush(stdout); | ||
} | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
// Scale a vector in place: x[i] <- alpha*x[i] for i = 0, ..., n-1.
//
//   n     - number of elements to scale; nothing is touched when n <= 0.
//   alpha - scalar multiplier.
//   x     - contiguous (unit-stride) array holding at least n doubles.
void dscal(int n, double alpha, double* x /* assume incx = 1 */)
{
    double* elem = x;
    for (int remaining = n; remaining > 0; --remaining, ++elem)
    {
        *elem = *elem * alpha;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
// Scaled vector addition: y[i] <- y[i] + alpha*x[i] for i = 0, ..., n-1.
//
//   n     - number of elements to process; nothing is touched when n <= 0.
//   alpha - scalar applied to every element of x.
//   x     - contiguous (unit-stride) input of at least n doubles.
//   y     - contiguous (unit-stride) array of at least n doubles, updated
//           in place.
void daxpy(int n, double alpha,
           const double* x, /* assume incx = 1 */
                 double* y  /* assume incy = 1 */)
{
    const double* src = x;
    double* dst = y;

    for (int remaining = n; remaining > 0; --remaining, ++src, ++dst)
    {
        *dst += alpha*(*src);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
// Scaled vector addition, y[i] <- y[i] + alpha*x[i] for i = 0, ..., n-1,
// annotated so that the compiler is free to vectorize the loop.
//
//   n     - number of elements to process; nothing is touched when n <= 0.
//   alpha - scalar applied to every element of x.
//   x     - contiguous (unit-stride) input of at least n doubles.
//   y     - contiguous (unit-stride) array of at least n doubles, updated
//           in place.  Must not overlap x.
void daxpy(int n, double alpha,
           //
           // Use the __restrict__ keyword to promise to the compiler
           // that x and y don't overlap.
           //
           const double* __restrict__ x, /* assume incx = 1 */
                 double* __restrict__ y  /* assume incy = 1 */)
{
    // OR: use a #pragma to force vectorization.
    //
    // Exactly one hint is emitted, chosen for the compiler in use.  Stacking
    // every vendor's pragma in front of the loop produces unknown-pragma
    // warnings everywhere, and gcc rejects "#pragma GCC ivdep" unless the
    // loop follows it directly (e.g. when "#pragma omp simd" sits between).
    //
#if defined(_OPENMP) && _OPENMP >= 201307
    // simd: always vectorize (any compiler with OpenMP 4)
    #pragma omp simd
#elif defined(__INTEL_COMPILER)
    // ivdep: ignore assumed data dependencies (icpc)
    #pragma ivdep
#elif defined(__GNUC__) && !defined(__clang__)
    // ivdep: ignore assumed data dependencies (gcc)
    #pragma GCC ivdep
#endif
    for (int i = 0;i < n;i++)
    {
        y[i] += alpha*x[i];
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// Out-of-place transpose of column-major matrices: B = A^T.
//
//   m, n - A is m-by-n, B is n-by-m.
//   A    - input;  element (row, col) lives at A[row + col*lda].
//   B    - output; element (col, row) lives at B[col + row*ldb].
//
// A and B are declared __restrict__, i.e. they must not overlap.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
                     double* __restrict__ B, int ldb)
{
    int row = 0;
    while (row < m)
    {
        int col = 0;
        while (col < n)
        {
            // Strided read from A, contiguous write into B.
            const double value = A[row + col*lda];
            B[row*ldb + col] = value;
            ++col;
        }
        ++row;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// transpose the column-major m*n matrix A
// into the column-major n*m matrix B
//
//   A - input;  element (i, j) lives at A[i + j*lda].
//   B - output; element (j, i) lives at B[j + i*ldb].
//
// A and B are declared __restrict__, i.e. they must not overlap.
//
// The bulk of the matrix is processed in hand-unrolled 4x4 tiles.  m and n
// do not have to be multiples of 4: the leftover rows/columns are handled
// by plain scalar loops so that no element outside the m*n (resp. n*m)
// region is ever read or written.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
                     double* __restrict__ B, int ldb)
{
    // Largest multiples of 4 not exceeding m and n (0 for non-positive sizes).
    const int m4 = m - m%4;
    const int n4 = n - n%4;

    for (int i = 0;i < m4;i += 4)
    {
        for (int j = 0;j < n4;j += 4)
        {
            const double* __restrict__ Asub = &A[i + j*lda];
                  double* __restrict__ Bsub = &B[i*ldb + j];

            Bsub[0*ldb + 0] = Asub[0 + 0*lda];
            Bsub[0*ldb + 1] = Asub[0 + 1*lda];
            Bsub[0*ldb + 2] = Asub[0 + 2*lda];
            Bsub[0*ldb + 3] = Asub[0 + 3*lda];

            Bsub[1*ldb + 0] = Asub[1 + 0*lda];
            Bsub[1*ldb + 1] = Asub[1 + 1*lda];
            Bsub[1*ldb + 2] = Asub[1 + 2*lda];
            Bsub[1*ldb + 3] = Asub[1 + 3*lda];

            Bsub[2*ldb + 0] = Asub[2 + 0*lda];
            Bsub[2*ldb + 1] = Asub[2 + 1*lda];
            Bsub[2*ldb + 2] = Asub[2 + 2*lda];
            Bsub[2*ldb + 3] = Asub[2 + 3*lda];

            Bsub[3*ldb + 0] = Asub[3 + 0*lda];
            Bsub[3*ldb + 1] = Asub[3 + 1*lda];
            Bsub[3*ldb + 2] = Asub[3 + 2*lda];
            Bsub[3*ldb + 3] = Asub[3 + 3*lda];
        }

        // Fringe: the last n%4 columns of A for this panel of 4 rows.
        for (int j = n4;j < n;j++)
        {
            for (int ii = i;ii < i+4;ii++)
            {
                B[ii*ldb + j] = A[ii + j*lda];
            }
        }
    }

    // Fringe: the last m%4 rows of A, across all n columns.
    for (int i = m4;i < m;i++)
    {
        for (int j = 0;j < n;j++)
        {
            B[i*ldb + j] = A[i + j*lda];
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#include <immintrin.h> | ||
|
||
// Transpose a 4x4 block of doubles held in four 256-bit vectors.
//
//   A - input:  A[j] holds (A0j, A1j, A2j, A3j).
//   B - output: B[i] receives (Ai0, Ai1, Ai2, Ai3).
//
// Every input vector is read before any output vector is written.
void transpose_4x4(__m256d A[4], __m256d B[4])
{
    // Stage 1: interleave neighbouring input vectors element-wise within
    // each 128-bit half (mask 0x0 picks the even elements, 0xf the odd ones).
    const __m256d even01 = _mm256_shuffle_pd(A[0], A[1], 0x0); // (A00, A01, A20, A21)
    const __m256d odd01  = _mm256_shuffle_pd(A[0], A[1], 0xf); // (A10, A11, A30, A31)
    const __m256d even23 = _mm256_shuffle_pd(A[2], A[3], 0x0); // (A02, A03, A22, A23)
    const __m256d odd23  = _mm256_shuffle_pd(A[2], A[3], 0xf); // (A12, A13, A32, A33)

    // Stage 2: recombine 128-bit halves (0x20 pairs the two low halves,
    // 0x31 pairs the two high halves).
    B[0] = _mm256_permute2f128_pd(even01, even23, 0x20); // (A00, A01, A02, A03)
    B[1] = _mm256_permute2f128_pd(odd01,  odd23,  0x20); // (A10, A11, A12, A13)
    B[2] = _mm256_permute2f128_pd(even01, even23, 0x31); // (A20, A21, A22, A23)
    B[3] = _mm256_permute2f128_pd(odd01,  odd23,  0x31); // (A30, A31, A32, A33)
}
|
||
// transpose the column-major m*n matrix A | ||
// into the column-major n*m matrix B | ||
void transpose(int m, int n, | ||
const double* __restrict__ A, int lda, | ||
double* __restrict__ B, int ldb) | ||
{ | ||
__m256d Areg[4], Breg[4]; | ||
|
||
// assume m%4 == 0 and n%4 == 0 | ||
for (int i = 0;i < m;i += 4) | ||
{ | ||
for (int j = 0;j < n;j += 4) | ||
{ | ||
const double* __restrict__ Asub = &A[i + j*lda]; | ||
double* __restrict__ Bsub = &B[i*ldb + j]; | ||
|
||
Areg[0] = _mm256_loadu_pd(Asub + 0*lda); | ||
Areg[1] = _mm256_loadu_pd(Asub + 1*lda); | ||
Areg[2] = _mm256_loadu_pd(Asub + 2*lda); | ||
Areg[3] = _mm256_loadu_pd(Asub + 3*lda); | ||
|
||
transpose_4x4(Areg, Breg); | ||
|
||
_mm256_storeu_pd(Bsub + 0*ldb, Breg[0]); | ||
_mm256_storeu_pd(Bsub + 1*ldb, Breg[1]); | ||
_mm256_storeu_pd(Bsub + 2*ldb, Breg[2]); | ||
_mm256_storeu_pd(Bsub + 3*ldb, Breg[3]); | ||
} | ||
} | ||
} |