Skip to content

Commit

Permalink
All lectures and samples up now.
Browse files Browse the repository at this point in the history
  • Loading branch information
devinamatthews committed Jul 25, 2017
1 parent 7c8868e commit 2688849
Show file tree
Hide file tree
Showing 10 changed files with 192 additions and 0 deletions.
File renamed without changes.
Binary file added lectures/vectorization.pdf
Binary file not shown.
Binary file added lectures/vectorization.pptx
Binary file not shown.
45 changes: 45 additions & 0 deletions vectorization_samples/driver.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include <cstdio>
#include <limits>
#include <algorithm>
#include <vector>

#include <omp.h>

// Declaration of the transpose routine that main() benchmarks. Only the
// prototype appears here; the definition has to be supplied at link time by
// a separately compiled source file.
//   m, n : dimensions passed through to the implementation
//   A    : read-only source matrix, lda is its leading dimension
//   B    : destination matrix, ldb is its leading dimension
void transpose(int m, int n,
const double* A, int lda,
double* B, int ldb);

// Invokes `exp` num_repeat times, timing every invocation with
// omp_get_wtime(), and returns the smallest single-run duration observed.
// Taking the minimum rather than the mean filters out runs that were
// disturbed by other activity on the machine.
// If num_repeat is not positive, `exp` is never called and
// std::numeric_limits<double>::max() is returned.
template <typename Experiment>
double benchmark(Experiment&& exp, int num_repeat=1)
{
    double best = std::numeric_limits<double>::max();

    for (int remaining = num_repeat; remaining > 0; --remaining)
    {
        const double started = omp_get_wtime();
        exp();
        const double duration = omp_get_wtime() - started;

        // Keep the fastest run seen so far.
        if (duration < best)
        {
            best = duration;
        }
    }

    return best;
}

// Benchmark driver: for every square size n = 4, 8, ..., 1000 it times
// transpose() on an n-by-n problem (best of 100 runs) and prints one line
// "n rate", where rate is the number of bytes moved (one read plus one
// write of every element) divided by the elapsed time, scaled by 1024^3.
int main(int argc, char** argv)
{
    const int max_dim = 1000;

    // Both buffers are sized for the largest problem and reused for all sizes.
    std::vector<double> A(max_dim*max_dim), B(max_dim*max_dim);

    int n = 4;
    while (n <= max_dim)
    {
        auto one_run = [&] { transpose(n, n, A.data(), n, B.data(), n); };

        const double seconds = benchmark(one_run, 100);
        const double bytes_moved = 2*sizeof(double)*n*n;
        const double rate = bytes_moved/seconds/1024/1024/1024;

        printf("%d %g\n", n, rate);
        // Flush after every line so partial results survive an interrupted run.
        fflush(stdout);

        n += 4;
    }

    return 0;
}
7 changes: 7 additions & 0 deletions vectorization_samples/sample1.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Scales a vector in place: x[k] = x[k]*alpha for k = 0 .. n-1.
// The elements of x are assumed to be contiguous (incx = 1).
// Nothing is touched when n <= 0.
void dscal(int n, double alpha, double* x)
{
    int k = 0;
    while (k < n)
    {
        x[k] = x[k] * alpha;
        ++k;
    }
}
9 changes: 9 additions & 0 deletions vectorization_samples/sample2a.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Computes y[k] = y[k] + alpha*x[k] for k = 0 .. n-1, in increasing k.
// Both vectors are assumed to be contiguous (incx = 1, incy = 1).
// No aliasing promise is made in this version, so x and y may overlap.
void daxpy(int n, double alpha,
           const double* x,
           double* y)
{
    int k = 0;
    while (k < n)
    {
        y[k] = y[k] + alpha*x[k];
        ++k;
    }
}
24 changes: 24 additions & 0 deletions vectorization_samples/sample2b.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Computes y[i] += alpha*x[i] for i = 0 .. n-1 (unit stride assumed for both
// vectors). This variant demonstrates two ways of removing the assumed
// dependence between x and y that can keep a compiler from vectorizing the
// loop: __restrict__-qualified parameters, and per-loop pragmas.
void daxpy(int n, double alpha,
//
// Use the __restrict__ keyword to promise to the compiler
// that x and y don't overlap.
// Because of that promise, callers must not pass overlapping ranges.
//
const double* __restrict__ x, /* assume incx = 1 */
double* __restrict__ y /* assume incy = 1 */)
{
// OR: use #pragmas to force vectorization:
//
// Each pragma below is aimed at the compiler named in its trailing comment;
// they are listed together here as alternatives.
// NOTE(review): a compiler may reject or warn about pragmas it does not
// recognise, or about several loop pragmas stacked on one loop -- confirm
// with the target compiler before using this verbatim.
//
// ivdep: ignore assumed data dependencies
//
#pragma GCC ivdep // gcc
#pragma ivdep // icpc
//
// simd: always vectorize
//
#pragma simd //icpc
#pragma omp simd //any compiler with OpenMP 4
for (int i = 0;i < n;i++)
{
y[i] += alpha*x[i];
}
}
14 changes: 14 additions & 0 deletions vectorization_samples/sample3a.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Out-of-place transpose, B := A^T.
//   A : m-by-n, column-major; element (r, c) is stored at A[r + c*lda]
//   B : n-by-m, column-major; element (c, r) is stored at B[c + r*ldb]
// A and B are __restrict__-qualified, so the two matrices must not overlap.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
               double* __restrict__ B, int ldb)
{
    for (int row = 0; row < m; ++row)
    {
        for (int col = 0; col < n; ++col)
        {
            // Strided read down a row of A, contiguous write along a column of B.
            const double value = A[row + col*lda];
            B[row*ldb + col] = value;
        }
    }
}
36 changes: 36 additions & 0 deletions vectorization_samples/sample3b.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Out-of-place transpose, B := A^T, with the main loop manually unrolled
// into 4x4 tiles.
//   A : m-by-n, column-major; element (r, c) is stored at A[r + c*lda]
//   B : n-by-m, column-major; element (c, r) is stored at B[c + r*ldb]
// A and B are __restrict__-qualified, so the two matrices must not overlap.
//
// m and n do not have to be multiples of 4: the unrolled tile code covers the
// largest multiple-of-4 sub-matrix, and plain scalar loops finish the leftover
// rows and columns. When both sizes are multiples of 4 only the tile code runs.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
               double* __restrict__ B, int ldb)
{
    // Extent of the region that can be covered by complete 4x4 tiles
    // (clamped to 0 so that non-positive sizes do no work at all).
    const int m4 = (m > 0) ? m - m%4 : 0;
    const int n4 = (n > 0) ? n - n%4 : 0;

    for (int i = 0;i < m4;i += 4)
    {
        for (int j = 0;j < n4;j += 4)
        {
            const double* __restrict__ Asub = &A[i + j*lda];
                  double* __restrict__ Bsub = &B[i*ldb + j];

            Bsub[0*ldb + 0] = Asub[0 + 0*lda];
            Bsub[0*ldb + 1] = Asub[0 + 1*lda];
            Bsub[0*ldb + 2] = Asub[0 + 2*lda];
            Bsub[0*ldb + 3] = Asub[0 + 3*lda];

            Bsub[1*ldb + 0] = Asub[1 + 0*lda];
            Bsub[1*ldb + 1] = Asub[1 + 1*lda];
            Bsub[1*ldb + 2] = Asub[1 + 2*lda];
            Bsub[1*ldb + 3] = Asub[1 + 3*lda];

            Bsub[2*ldb + 0] = Asub[2 + 0*lda];
            Bsub[2*ldb + 1] = Asub[2 + 1*lda];
            Bsub[2*ldb + 2] = Asub[2 + 2*lda];
            Bsub[2*ldb + 3] = Asub[2 + 3*lda];

            Bsub[3*ldb + 0] = Asub[3 + 0*lda];
            Bsub[3*ldb + 1] = Asub[3 + 1*lda];
            Bsub[3*ldb + 2] = Asub[3 + 2*lda];
            Bsub[3*ldb + 3] = Asub[3 + 3*lda];
        }

        // Leftover columns (n%4 of them) for this band of four rows.
        for (int j = n4;j < n;j++)
        {
            for (int ii = i;ii < i+4;ii++)
            {
                B[ii*ldb + j] = A[ii + j*lda];
            }
        }
    }

    // Leftover rows (m%4 of them), all columns.
    for (int i = m4;i < m;i++)
    {
        for (int j = 0;j < n;j++)
        {
            B[i*ldb + j] = A[i + j*lda];
        }
    }
}
57 changes: 57 additions & 0 deletions vectorization_samples/sample3c.cxx
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <immintrin.h>

// Transposes a 4x4 block of doubles held in four 256-bit registers.
// With Arc denoting the element in row r, column c of the block:
//   on entry  A[c] = (A0c, A1c, A2c, A3c)   -- one column per register
//   on exit   B[r] = (Ar0, Ar1, Ar2, Ar3)   -- one row per register
// All of A is read before anything is written to B.
void transpose_4x4(__m256d A[4], __m256d B[4])
{
    // Step 1: pair up elements of neighbouring columns inside each
    // 128-bit half of the registers.
    const __m256d rows02_cols01 = _mm256_shuffle_pd(A[0], A[1], 0x0); // (A00, A01, A20, A21)
    const __m256d rows13_cols01 = _mm256_shuffle_pd(A[0], A[1], 0xf); // (A10, A11, A30, A31)
    const __m256d rows02_cols23 = _mm256_shuffle_pd(A[2], A[3], 0x0); // (A02, A03, A22, A23)
    const __m256d rows13_cols23 = _mm256_shuffle_pd(A[2], A[3], 0xf); // (A12, A13, A32, A33)

    // Step 2: combine matching 128-bit halves so that every output register
    // holds one complete row (0x20 picks the two low halves, 0x31 the two
    // high halves).
    B[0] = _mm256_permute2f128_pd(rows02_cols01, rows02_cols23, 0x20); // (A00, A01, A02, A03)
    B[1] = _mm256_permute2f128_pd(rows13_cols01, rows13_cols23, 0x20); // (A10, A11, A12, A13)
    B[2] = _mm256_permute2f128_pd(rows02_cols01, rows02_cols23, 0x31); // (A20, A21, A22, A23)
    B[3] = _mm256_permute2f128_pd(rows13_cols01, rows13_cols23, 0x31); // (A30, A31, A32, A33)
}

// Out-of-place transpose, B := A^T, using AVX registers for 4x4 tiles.
//   A : m-by-n, column-major; element (r, c) is stored at A[r + c*lda]
//   B : n-by-m, column-major; element (c, r) is stored at B[c + r*ldb]
// A and B are __restrict__-qualified, so the two matrices must not overlap.
// Unaligned loads/stores are used, so no alignment is required of A or B.
//
// m and n do not have to be multiples of 4: complete 4x4 tiles go through
// transpose_4x4(), and plain scalar loops finish the leftover rows and
// columns. When both sizes are multiples of 4 only the tile code runs.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
               double* __restrict__ B, int ldb)
{
    __m256d Areg[4], Breg[4];

    // Extent of the region that can be covered by complete 4x4 tiles
    // (clamped to 0 so that non-positive sizes do no work at all).
    const int m4 = (m > 0) ? m - m%4 : 0;
    const int n4 = (n > 0) ? n - n%4 : 0;

    for (int i = 0;i < m4;i += 4)
    {
        for (int j = 0;j < n4;j += 4)
        {
            const double* __restrict__ Asub = &A[i + j*lda];
                  double* __restrict__ Bsub = &B[i*ldb + j];

            // Each load picks up four consecutive rows of one column of A.
            Areg[0] = _mm256_loadu_pd(Asub + 0*lda);
            Areg[1] = _mm256_loadu_pd(Asub + 1*lda);
            Areg[2] = _mm256_loadu_pd(Asub + 2*lda);
            Areg[3] = _mm256_loadu_pd(Asub + 3*lda);

            transpose_4x4(Areg, Breg);

            // Each store writes four consecutive rows of one column of B.
            _mm256_storeu_pd(Bsub + 0*ldb, Breg[0]);
            _mm256_storeu_pd(Bsub + 1*ldb, Breg[1]);
            _mm256_storeu_pd(Bsub + 2*ldb, Breg[2]);
            _mm256_storeu_pd(Bsub + 3*ldb, Breg[3]);
        }

        // Leftover columns (n%4 of them) for this band of four rows.
        for (int j = n4;j < n;j++)
        {
            for (int ii = i;ii < i+4;ii++)
            {
                B[ii*ldb + j] = A[ii + j*lda];
            }
        }
    }

    // Leftover rows (m%4 of them), all columns.
    for (int i = m4;i < m;i++)
    {
        for (int j = 0;j < n;j++)
        {
            B[i*ldb + j] = A[i + j*lda];
        }
    }
}

0 comments on commit 2688849

Please sign in to comment.