Add code: Generating Families of Practical Fast Matrix Multiplication…

… Algorithms
flame · Apr 26, 2017 · 9157c1d · 9157c1d
1 parent 86036de
commit 9157c1d
Show file tree

Hide file tree

Showing 81 changed files with 7,759 additions and 1 deletion.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,35 @@
+
+Generating Families of Practical Fast Matrix Multiplication Algorithms
+License
+---
+
+fmm-gen is licensed under the following license, typically
+known as the "new" or "modified" or "3-clause" BSD license.
+
+
+Copyright (C) 2017, The University of Texas at Austin
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+ - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ - Neither the name of The University of Texas at Austin nor the names
+   of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/README.md b/README.md
@@ -1 +1,81 @@
-# fmm-gen
+Generating Families of Practical Fast Matrix Multiplication Algorithms
+
+- Download fmm-gen.tar.gz from
+http://www.cs.utexas.edu/users/jianyu/code/fmm-gen.tar.gz
+
+- Decompress the source code:
+
+$ tar -zxvf fmm-gen.tar.gz
+$ cd fmm-gen/meta
+
+- Set up environment variables:
+Replace $core_num with the number of cores you want to run on.
+
+$ export OMP_NUM_THREADS=$core_num
+$ export KMP_AFFINITY=compact
+
+Note: if hyper-threading is enabled, the following alternative must be used:
+
+$ export KMP_AFFINITY=compact,1
+
+Code generators:
+
+-- If you want to generate the different implementations for a specific algorithm:
+
+$ python control.py ${N} \
+             $m1n1p1 $L1 \
+             $m2n2p2 $L2 ...... \
+             $m{N}n{N}p{N} $L{N} \
+             ${pack_type} ${gen_path}
+e.g.
+$ python control.py 2 222 1 323 1 abc \
+               ${HOME}/fmm-gen
+$ python control.py 1 222 2 abc ../
+
+This script will generate the code and compile it.
+
+To run the generated code, go to the generated code directory (e.g. ${HOME}/fmm-gen/222-1_333-1_abc, or ../222-2_abc).
+
+When $core_num is equal to 1,
+run 
+./test/test_xxx-x_st.x $m $n $k
+When $core_num is greater than 1,
+run
+./test/test_xxx-x_mt.x $m $n $k
+
+-- If you have access to a job submission system on a cluster, change the path_prefix variable in config.py, then:
+$ python run_sbatch_script.py
+
+This script will generate the code for all implementations, compile them, and submit the jobs to the SLURM submission queue for execution.
+
+- Hybrid partitions:
+
+$ python control.py 1 222 1 abc
+$ python control.py 1 222 2 abc
+$ python control.py 1 232 1 abc
+$ python control.py 1 232 2 abc
+$ python control.py 1 333 1 abc
+$ python control.py 1 333 2 abc
+$ python control.py 2 222 1 232 1 abc
+$ python control.py 2 222 1 333 1 abc
+
+- Model:
+
+$ python model_gen.py
+This script will generate csv files for plotting the modeled performance curves.
+
+- Evaluation and expected result
+
+The output will include the following components:
+- Input problem size.
+- Running time (in seconds).
+- Effective GFLOPS (marked with a circled 1 in Figure 5 of the paper).
+
+The user can compare the relative Effective GFLOPS for different implementations.
+The trend should match the performance curves shown in the paper.
+Since the machines may be different from ours, the absolute GFLOPS could be different.
+
+
+
+Bugs can be reported to [email protected]
+
diff --git a/common/dgemm/.gitignore b/common/dgemm/.gitignore
@@ -0,0 +1,3 @@
+algorithms/*
+my_dgemm_strassen_abc.c
+my_dgemm_strassen_naive.c
diff --git a/common/dgemm/bl_dgemm_ref.c b/common/dgemm/bl_dgemm_ref.c
@@ -0,0 +1,51 @@
+
+#include "bl_dgemm.h"
+
+#ifdef USE_BLAS
+/* 
+ * dgemm prototype
+ *
+ */ 
+//void dgemm(char*, char*, int*, int*, int*, double*, double*, 
+//        int*, double*, int*, double*, double*, int*);
+extern void dgemm_(char*, char*, int*, int*, int*, double*, double*, 
+        int*, double*, int*, double*, double*, int*);
+#endif
+
+void bl_dgemm_ref(   // Reference GEMM: accumulates the product XA * XB into XC (XC += XA * XB), column-major indexing.
+        int    m,    // extent of the i index: rows of XA and XC
+        int    n,    // extent of the j index: columns of XB and XC
+        int    k,    // extent of the p index: columns of XA, rows of XB
+        double *XA,  // element (i, p) is read from XA[ p * lda + i ]
+        int    lda,  // stride between consecutive columns of XA
+        double *XB,  // element (p, j) is read from XB[ j * ldb + p ]
+        int    ldb,  // stride between consecutive columns of XB
+        double *XC,  // element (i, j) is updated in place at XC[ j * ldc + i ]
+        int    ldc   // stride between consecutive columns of XC
+        )
+{
+    // Local variables: i, j, p are used only by the non-BLAS loop nest; alpha and beta only by the dgemm_ call.
+    int    i, j, p;
+    double alpha = 1.0, beta = 1.0;
+
+    // Nothing to compute when any dimension is zero; return before touching XC.
+    if ( m == 0 || n == 0 || k == 0 ) return;
+
+    // Reference GEMM implementation: external dgemm_ when USE_BLAS is defined, otherwise a plain triple loop.
+
+#ifdef USE_BLAS
+    dgemm_( "N", "N", &m, &n, &k, &alpha,   // presumably Fortran BLAS dgemm with no transposes -- confirm against the linked library
+            XA, &lda, XB, &ldb, &beta, XC, &ldc );   // alpha = beta = 1.0, so the existing contents of XC are kept and added to
+#else
+    #pragma omp parallel for private( i, p )
+    for ( j = 0; j < n; j ++ ) {   // j iterations are divided among OpenMP threads; i and p are private per thread
+        for ( i = 0; i < m; i ++ ) {
+            for ( p = 0; p < k; p ++ ) {
+                XC[ j * ldc + i ] += XA[ p * lda + i ] * XB[ j * ldb + p ];   // XC(i,j) += XA(i,p) * XB(p,j)
+            }
+        }
+    }
+#endif
+
+}
+