All lectures and samples up now.

devinamatthews · Jul 25, 2017 · 2688849 · 2688849
1 parent 7c8868e
commit 2688849
Show file tree

Hide file tree

Showing 10 changed files with 192 additions and 0 deletions.
diff --git a/optimizing_gemm/LICENSE → LICENSE b/optimizing_gemm/LICENSE → LICENSE
diff --git a/lectures/vectorization.pdf b/lectures/vectorization.pdf
diff --git a/lectures/vectorization.pptx b/lectures/vectorization.pptx
diff --git a/vectorization_samples/driver.cxx b/vectorization_samples/driver.cxx
@@ -0,0 +1,45 @@
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <limits>
#include <vector>

#include <omp.h>
+
+void transpose(int m, int n,
+               const double* A, int lda,
+                     double* B, int ldb);
+
// Run `exp` num_repeat times and return the shortest single-run wall-clock
// time, in seconds.
//
// exp         any callable that can be invoked with no arguments
// num_repeat  number of timed invocations (default 1)
//
// Taking the minimum rather than the mean filters out runs that were
// disturbed by other activity on the machine.
//
// If num_repeat <= 0 nothing is run and the largest finite double is
// returned.
template <typename Experiment>
double benchmark(Experiment&& exp, int num_repeat=1)
{
    // steady_clock is monotonic and part of the standard library, so timing
    // does not depend on the OpenMP runtime being available.
    using timer = std::chrono::steady_clock;

    double min_time = std::numeric_limits<double>::max();

    for (int i = 0;i < num_repeat;i++)
    {
        const timer::time_point t0 = timer::now();
        exp();
        const timer::time_point t1 = timer::now();

        // duration<double> with the default period counts seconds.
        const std::chrono::duration<double> elapsed = t1 - t0;
        min_time = std::min(min_time, elapsed.count());
    }

    return min_time;
}
+
+int main(int argc, char** argv)
+{
+    int nmax = 1000;
+
+    std::vector<double> A(nmax*nmax), B(nmax*nmax);
+
+    for (int n = 4;n <= nmax;n += 4)
+    {
+        double elapsed = benchmark([&] { transpose(n, n, A.data(), n, B.data(), n); }, 100);
+        double bytes = 2*sizeof(double)*n*n;
+        double gbps = bytes/elapsed/1024/1024/1024;
+
+        printf("%d %g\n", n, gbps);
+        fflush(stdout);
+    }
+
+    return 0;
+}
diff --git a/vectorization_samples/sample1.cxx b/vectorization_samples/sample1.cxx
@@ -0,0 +1,7 @@
// Scale a vector in place: x[k] <- alpha * x[k] for k = 0 .. n-1.
// The entries of x are taken to be contiguous (unit stride).
// Nothing is touched when n <= 0.
void dscal(int n, double alpha, double* x /* assume incx = 1 */)
{
    int k = 0;
    while (k < n)
    {
        x[k] = x[k]*alpha;
        ++k;
    }
}
diff --git a/vectorization_samples/sample2a.cxx b/vectorization_samples/sample2a.cxx
@@ -0,0 +1,9 @@
// Scaled vector addition: y[k] <- y[k] + alpha*x[k] for k = 0 .. n-1.
// Both vectors are taken to be contiguous (unit stride).
// Nothing is touched when n <= 0.
void daxpy(int n, double alpha,
           const double* x, /* assume incx = 1 */
                 double* y  /* assume incy = 1 */)
{
    int k = 0;
    while (k < n)
    {
        const double scaled = alpha*x[k];
        y[k] += scaled;
        ++k;
    }
}
diff --git a/vectorization_samples/sample2b.cxx b/vectorization_samples/sample2b.cxx
@@ -0,0 +1,24 @@
// Scaled vector addition: y[i] <- y[i] + alpha*x[i] for i = 0 .. n-1,
// with both vectors contiguous (unit stride).
//
// Two independent ways of telling the compiler that the loop may be
// vectorized are demonstrated here:
//
//  1. The __restrict__ keyword promises to the compiler that x and y
//     don't overlap.
//  2. A loop #pragma placed directly in front of the for statement.
//
// Exactly one pragma is selected per compiler. A loop-annotation pragma
// such as "#pragma GCC ivdep" has to sit immediately before the loop it
// applies to, so stacking several vendor pragmas in front of one loop is
// not portable (and pragmas a compiler doesn't know produce warnings).
void daxpy(int n, double alpha,
           const double* __restrict__ x, /* assume incx = 1 */
                 double* __restrict__ y  /* assume incy = 1 */)
{
#if defined(_OPENMP) && _OPENMP >= 201307
    // simd: always vectorize (any compiler with OpenMP 4 enabled)
    #pragma omp simd
#elif defined(__INTEL_COMPILER)
    // ivdep: ignore assumed data dependencies; simd: always vectorize (icpc)
    #pragma ivdep
    #pragma simd
#elif defined(__clang__)
    // assume_safety: treat the loop as free of loop-carried dependencies (clang)
    #pragma clang loop vectorize(assume_safety)
#elif defined(__GNUC__)
    // ivdep: ignore assumed data dependencies (gcc)
    #pragma GCC ivdep
#endif
    for (int i = 0;i < n;i++)
    {
        y[i] += alpha*x[i];
    }
}
diff --git a/vectorization_samples/sample3a.cxx b/vectorization_samples/sample3a.cxx
@@ -0,0 +1,14 @@
// Out-of-place transpose with column-major storage.
//
//   A is m-by-n; element (i,j) is stored at A[i + j*lda].
//   B is n-by-m; element (j,i) is stored at B[j + i*ldb].
//
// On return B(j,i) == A(i,j) for all 0 <= i < m and 0 <= j < n.
// A and B are declared __restrict__, i.e. they must not overlap.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
                     double* __restrict__ B, int ldb)
{
    for (int row = 0; row < m; ++row)
    {
        int col = 0;
        while (col < n)
        {
            const int src = row + col*lda;
            const int dst = col + row*ldb;
            B[dst] = A[src];
            ++col;
        }
    }
}
diff --git a/vectorization_samples/sample3b.cxx b/vectorization_samples/sample3b.cxx
@@ -0,0 +1,36 @@
// transpose the column-major m*n matrix A
// into the column-major n*m matrix B
//
//   A(i,j) is stored at A[i + j*lda], B(j,i) at B[j + i*ldb].
//   A and B must not overlap (__restrict__).
//
// The bulk of the matrix is processed in manually unrolled 4x4 tiles.
// m and n do not have to be multiples of 4: rows and columns that do
// not fill a complete tile are copied by plain scalar clean-up loops.
void transpose(int m, int n,
               const double* __restrict__ A, int lda,
                     double* __restrict__ B, int ldb)
{
    // Largest multiples of 4 not exceeding m and n (for positive sizes);
    // everything below these bounds is covered by complete 4x4 tiles.
    const int m4 = m - m%4;
    const int n4 = n - n%4;

    for (int i = 0;i < m4;i += 4)
    {
        for (int j = 0;j < n4;j += 4)
        {
            const double* __restrict__ Asub = &A[i + j*lda];
                  double* __restrict__ Bsub = &B[i*ldb + j];

            Bsub[0*ldb + 0] = Asub[0 + 0*lda];
            Bsub[0*ldb + 1] = Asub[0 + 1*lda];
            Bsub[0*ldb + 2] = Asub[0 + 2*lda];
            Bsub[0*ldb + 3] = Asub[0 + 3*lda];

            Bsub[1*ldb + 0] = Asub[1 + 0*lda];
            Bsub[1*ldb + 1] = Asub[1 + 1*lda];
            Bsub[1*ldb + 2] = Asub[1 + 2*lda];
            Bsub[1*ldb + 3] = Asub[1 + 3*lda];

            Bsub[2*ldb + 0] = Asub[2 + 0*lda];
            Bsub[2*ldb + 1] = Asub[2 + 1*lda];
            Bsub[2*ldb + 2] = Asub[2 + 2*lda];
            Bsub[2*ldb + 3] = Asub[2 + 3*lda];

            Bsub[3*ldb + 0] = Asub[3 + 0*lda];
            Bsub[3*ldb + 1] = Asub[3 + 1*lda];
            Bsub[3*ldb + 2] = Asub[3 + 2*lda];
            Bsub[3*ldb + 3] = Asub[3 + 3*lda];
        }

        // Columns of A past the last complete tile, for these four rows.
        for (int j = n4;j < n;j++)
        {
            for (int ii = i;ii < i+4;ii++)
            {
                B[ii*ldb + j] = A[ii + j*lda];
            }
        }
    }

    // Rows of A past the last complete tile, over all columns.
    for (int i = m4;i < m;i++)
    {
        for (int j = 0;j < n;j++)
        {
            B[i*ldb + j] = A[i + j*lda];
        }
    }
}
diff --git a/vectorization_samples/sample3c.cxx b/vectorization_samples/sample3c.cxx
@@ -0,0 +1,57 @@
+#include <immintrin.h>
+
// Transpose a 4x4 tile of doubles held in four AVX registers.
//
// On entry each A[k] holds one column of the tile:
//     A[k] = (A0k, A1k, A2k, A3k)
// On exit each B[k] holds one row of the tile:
//     B[k] = (Ak0, Ak1, Ak2, Ak3)
//
// All four inputs are read before any output is written.
void transpose_4x4(__m256d A[4], __m256d B[4])
{
    // Step 1: interleave neighbouring columns inside each 128-bit half.
    // Mask 0x0 picks the even element of every pair, 0xf the odd one.
    const __m256d even_c01 = _mm256_shuffle_pd(A[0], A[1], 0x0); // (A00, A01, A20, A21)
    const __m256d odd_c01  = _mm256_shuffle_pd(A[0], A[1], 0xf); // (A10, A11, A30, A31)
    const __m256d even_c23 = _mm256_shuffle_pd(A[2], A[3], 0x0); // (A02, A03, A22, A23)
    const __m256d odd_c23  = _mm256_shuffle_pd(A[2], A[3], 0xf); // (A12, A13, A32, A33)

    // Step 2: recombine 128-bit halves. 0x20 joins the two low halves,
    // 0x31 joins the two high halves.
    B[0] = _mm256_permute2f128_pd(even_c01, even_c23, 0x20); // (A00, A01, A02, A03)
    B[1] = _mm256_permute2f128_pd(odd_c01,  odd_c23,  0x20); // (A10, A11, A12, A13)
    B[2] = _mm256_permute2f128_pd(even_c01, even_c23, 0x31); // (A20, A21, A22, A23)
    B[3] = _mm256_permute2f128_pd(odd_c01,  odd_c23,  0x31); // (A30, A31, A32, A33)
}
+
+// transpose the column-major m*n matrix A
+// into the column-major n*m matrix B
+void transpose(int m, int n,
+               const double* __restrict__ A, int lda,
+                     double* __restrict__ B, int ldb)
+{
+    __m256d Areg[4], Breg[4];
+
+    // assume m%4 == 0 and n%4 == 0
+    for (int i = 0;i < m;i += 4)
+    {
+        for (int j = 0;j < n;j += 4)
+        {
+            const double* __restrict__ Asub = &A[i + j*lda];
+                  double* __restrict__ Bsub = &B[i*ldb + j];
+
+             Areg[0] = _mm256_loadu_pd(Asub + 0*lda);
+             Areg[1] = _mm256_loadu_pd(Asub + 1*lda);
+             Areg[2] = _mm256_loadu_pd(Asub + 2*lda);
+             Areg[3] = _mm256_loadu_pd(Asub + 3*lda);
+
+             transpose_4x4(Areg, Breg);
+
+             _mm256_storeu_pd(Bsub + 0*ldb, Breg[0]);
+             _mm256_storeu_pd(Bsub + 1*ldb, Breg[1]);
+             _mm256_storeu_pd(Bsub + 2*ldb, Breg[2]);
+             _mm256_storeu_pd(Bsub + 3*ldb, Breg[3]);
+        }
+    }
+}