added better radix short requiring less passes

hfp · Jun 3, 2021 · 84805a4 · 84805a4
1 parent 6d84038
commit 84805a4
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 53 deletions.
diff --git a/samples/deeplearning/sparse_adagrad_fused/Makefile b/samples/deeplearning/sparse_adagrad_fused/Makefile
@@ -17,11 +17,13 @@ VERIFY_CORRECTNESS = 1
 USE_LIBXSMM_JIT = 1
 USE_PERF_COUNTERS=1
 USE_LLC_COUNTERS=0
+DEBUG_TIME=0
 
 yesnolist += VERIFY_CORRECTNESS
 yesnolist += USE_LIBXSMM_JIT
 yesnolist += USE_PERF_COUNTERS
 yesnolist += USE_LLC_COUNTERS
+yesnolist += DEBUG_TIME
 #yesnolist +=
 
 DFLAGS += $(strip $(foreach var, $(yesnolist), $(if $(filter 1, $($(var))), -D$(var))))

diff --git a/samples/deeplearning/sparse_adagrad_fused/radix_sort.h b/samples/deeplearning/sparse_adagrad_fused/radix_sort.h
@@ -2,37 +2,54 @@
 #define _PCL_RADIX_SORT_
 
 #include <utility>
-#include <omp.h>
+#include <limits>
+
+#include "utils.h"
+
+#ifndef BKT_BITS
+#define BKT_BITS 12
+#endif
 
 template<typename T>
 using Key_Value_Pair = std::pair<T, T>;
 
-  template<typename T>
+template<typename T>
 Key_Value_Pair<T>* radix_sort_parallel(Key_Value_Pair<T>* inp_buf, Key_Value_Pair<T>* tmp_buf, int64_t elements_count, int64_t max_value)
 {
+  constexpr int bkt_bits = BKT_BITS;
+  constexpr int nbkts = (1 << bkt_bits);
+  constexpr int bkt_mask = (nbkts - 1);
+
   int maxthreads = omp_get_max_threads();
-  int histogram[256*maxthreads], histogram_ps[256*maxthreads + 1];
+  int histogram[nbkts*maxthreads], histogram_ps[nbkts*maxthreads + 1];
   if(max_value == 0) return inp_buf;
-  int num_bits = sizeof(T) * 8 - __builtin_clz(max_value);
-  int num_passes = (num_bits + 7) / 8;
+  int num_bits = 64;
+  if(sizeof(T) == 8 && max_value > std::numeric_limits<int>::max()) {
+    num_bits = sizeof(T) * 8 - __builtin_clzll(max_value);
+  } else {
+    num_bits = 32 - __builtin_clz((unsigned int)max_value);
+  }
+
+  int num_passes = (num_bits + bkt_bits - 1) / bkt_bits;
 
 #pragma omp parallel
   {
     int tid = omp_get_thread_num();
     int nthreads = omp_get_num_threads();
 
-    int * local_histogram = &histogram[256*tid];
-    int * local_histogram_ps = &histogram_ps[256*tid];
+    int * local_histogram = &histogram[nbkts*tid];
+    int * local_histogram_ps = &histogram_ps[nbkts*tid];
     int elements_count_4 = elements_count/4*4;
     Key_Value_Pair<T> * input = inp_buf;
     Key_Value_Pair<T> * output = tmp_buf;
 
     for(unsigned int pass = 0; pass < num_passes; pass++)
     {
 
-      /* Step 1: compute histogram
-         Reset histogram */
-      for(int i = 0; i < 256; i++) local_histogram[i] = 0;
+      auto t1 = get_time();
+      // Step 1: compute histogram
+      // Reset histogram
+      for(int i = 0; i < nbkts; i++) local_histogram[i] = 0;
 
 #pragma omp for schedule(static)
       for(int64_t i = 0; i < elements_count_4; i+=4)
@@ -42,41 +59,43 @@ Key_Value_Pair<T>* radix_sort_parallel(Key_Value_Pair<T>* inp_buf, Key_Value_Pai
         T val_3 = input[i+2].first;
         T val_4 = input[i+3].first;
 
-        local_histogram[ (val_1>>(pass*8)) &0xFF]++;
-        local_histogram[ (val_2>>(pass*8)) &0xFF]++;
-        local_histogram[ (val_3>>(pass*8)) &0xFF]++;
-        local_histogram[ (val_4>>(pass*8)) &0xFF]++;
+        local_histogram[ (val_1>>(pass*bkt_bits)) & bkt_mask]++;
+        local_histogram[ (val_2>>(pass*bkt_bits)) & bkt_mask]++;
+        local_histogram[ (val_3>>(pass*bkt_bits)) & bkt_mask]++;
+        local_histogram[ (val_4>>(pass*bkt_bits)) & bkt_mask]++;
       }
       if(tid == (nthreads -1))
       {
         for(int64_t i = elements_count_4; i < elements_count; i++)
         {
           T val = input[i].first;
-          local_histogram[ (val>>(pass*8)) &0xFF]++;
+          local_histogram[ (val>>(pass*bkt_bits)) & bkt_mask]++;
         }
       }
 #pragma omp barrier
-      /* Step 2: prefix sum */
+      auto t11 = get_time();
+      // Step 2: prefix sum
       if(tid == 0)
       {
         int sum = 0, prev_sum = 0;
-        for(int bins = 0; bins < 256; bins++) for(int t = 0; t < nthreads; t++) { sum += histogram[t*256 + bins]; histogram_ps[t*256 + bins] = prev_sum; prev_sum = sum; }
-        histogram_ps[256*nthreads] = prev_sum; if(prev_sum != elements_count) { printf("Error1!\n"); exit(123); }
+        for(int bins = 0; bins < nbkts; bins++) for(int t = 0; t < nthreads; t++) { sum += histogram[t*nbkts + bins]; histogram_ps[t*nbkts + bins] = prev_sum; prev_sum = sum; }
+        histogram_ps[nbkts*nthreads] = prev_sum; if(prev_sum != elements_count) { printf("Error1!\n"); exit(123); }
       }
 #pragma omp barrier
+      auto t12 = get_time();
 
-      /* Step 3: scatter */
+      // Step 3: scatter
 #pragma omp for schedule(static)
       for(int64_t i = 0; i < elements_count_4; i+=4)
       {
         T val_1 = input[i].first;
         T val_2 = input[i+1].first;
         T val_3 = input[i+2].first;
         T val_4 = input[i+3].first;
-        T bin_1 = (val_1>>(pass*8)) &0xFF;
-        T bin_2 = (val_2>>(pass*8)) &0xFF;
-        T bin_3 = (val_3>>(pass*8)) &0xFF;
-        T bin_4 = (val_4>>(pass*8)) &0xFF;
+        T bin_1 = (val_1>>(pass*bkt_bits)) & bkt_mask;
+        T bin_2 = (val_2>>(pass*bkt_bits)) & bkt_mask;
+        T bin_3 = (val_3>>(pass*bkt_bits)) & bkt_mask;
+        T bin_4 = (val_4>>(pass*bkt_bits)) & bkt_mask;
         int pos;
         pos = local_histogram_ps[bin_1]++;
         output[pos] = input[i];
@@ -92,13 +111,17 @@ Key_Value_Pair<T>* radix_sort_parallel(Key_Value_Pair<T>* inp_buf, Key_Value_Pai
         for(int64_t i = elements_count_4; i < elements_count; i++)
         {
           T val = input[i].first;
-          int pos = local_histogram_ps[ (val>>(pass*8)) &0xFF]++;
+          int pos = local_histogram_ps[ (val>>(pass*bkt_bits)) & bkt_mask]++;
           output[pos] = input[i];
         }
       }
 
       Key_Value_Pair<T> * temp = input; input = output; output = temp;
 #pragma omp barrier
+      auto t2 = get_time();
+#ifdef DEBUG_TIME
+      if (tid == 0) printf("pass = %d  total time = %.3f  step1 = %.3f  step2 = %.3f %.3f\n", pass, t2-t1, t11-t1, t12-t11, t2-t12);
+#endif
     }
   }
   return (num_passes % 2 == 0 ? inp_buf : tmp_buf);

diff --git a/samples/deeplearning/sparse_adagrad_fused/sparse_adagrad.cpp b/samples/deeplearning/sparse_adagrad_fused/sparse_adagrad.cpp
@@ -25,20 +25,14 @@
 #include <unistd.h>
 #include <assert.h>
 #include <immintrin.h>
-#ifdef _OPENMP
-#include <omp.h>
-#else
-#define omp_get_num_threads() (1)
-#define omp_get_thread_num() (0)
-#define omp_get_max_threads() (1)
-#endif
 #include <libxsmm.h>
 
 #ifdef USE_PERF_COUNTERS
 #include "counters.h"
 #endif
 
 #include "radix_sort.h"
+#include "utils.h"
 
 const int alignment = 64;
 typedef long ITyp;
@@ -55,21 +49,6 @@ void set_random_seed(int seed)
   }
 }
 
-static double get_time() {
-  static bool init_done = false;
-  static struct timespec stp = {0,0};
-  struct timespec tp;
-  clock_gettime(CLOCK_REALTIME, &tp);
-  /*clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &tp);*/
-
-  if(!init_done) {
-    init_done = true;
-    stp = tp;
-  }
-  double ret = (tp.tv_sec - stp.tv_sec) * 1e3 + (tp.tv_nsec - stp.tv_nsec)*1e-6;
-  return ret;
-}
-
 template<typename T>
 void init_zero(size_t sz, T *buf)
 {
@@ -257,12 +236,16 @@ void sparse_transpose_radix(EmbeddingInOut *eio)
     }
   }
   auto t1 = get_time();
-  //printf("Keypair buffer fill Time = %.3f ms\n", t1-t0);
+#ifdef DEBUG_TIME
+  printf("Keypair buffer fill Time = %.3f ms\n", t1-t0);
+#endif
 
   t0 = get_time();
   Key_Value_Pair<int>* tmpBuf2 = radix_sort_parallel<int>(&tmpBuf[0], &tmpBuf1[0], NS, M);
   t1 = get_time();
-  //printf("Radix Sort Time = %.3f ms\n", t1-t0);
+#ifdef DEBUG_TIME
+  printf("Radix Sort Time = %.3f ms\n", t1-t0);
+#endif
 
   int max_thds = omp_get_max_threads();
   int num_uniq[max_thds];
@@ -283,7 +266,9 @@ void sparse_transpose_radix(EmbeddingInOut *eio)
     num_uniq[i] += num_uniq[i-1];
   int U = num_uniq[max_thds-1];
   t1 = get_time();
-  //printf("Num Unique Index Time = %.3f ms\n", t1-t0);
+#ifdef DEBUG_TIME
+  printf("Num Unique Index Time = %.3f ms\n", t1-t0);
+#endif
 
   t0 = get_time();
   eio->mb_offsets[0] = 0;
@@ -306,7 +291,9 @@ void sparse_transpose_radix(EmbeddingInOut *eio)
     }
   }
   t1 = get_time();
-  //printf("Offset/Index array construction Time = %.3f ms\n", t1-t0);
+#ifdef DEBUG_TIME
+  printf("Offset/Index array construction Time = %.3f ms\n", t1-t0);
+#endif
   eio->mb_offsets[U] = NS;
   eio->U = U;
   my_free(tmpBuf);
@@ -431,7 +418,9 @@ void allocate_buffers_and_generte_rnd_input(int N, int P, double alpha, Embeddin
   auto t0 = get_time();
   sparse_transpose_radix(eio);
   auto t1 = get_time();
-  //printf("Trans Time = %.3f ms\n", t1-t0);
+#ifdef DEBUG_TIME
+  printf("Trans Time = %.3f ms\n", t1-t0);
+#endif
 }
 
 void free_buffers(EmbeddingInOut *eio)
@@ -517,7 +506,9 @@ int main(int argc, char * argv[]) {
       auto t0 = get_time();
       allocate_buffers_and_generte_rnd_input(N, P, alpha, eb[i], eio[j][i]);
       auto t1 = get_time();
-      //printf("Rand init time = %.3f ms\n", t1 - t0);
+#ifdef DEBUG_TIME
+      printf("Rand init time = %.3f ms\n", t1 - t0);
+#endif
       tNS += eio[j][i]->NS;
       tU += eio[j][i]->U;
     }
@@ -530,7 +521,9 @@ int main(int argc, char * argv[]) {
       eb[s]->fused_backward_update_adagrad(eio[i][s]->U, eio[i][s]->NS, N, eio[i][s]->mb_offsets, eio[i][s]->mb_indices, eio[i][s]->wt_indices, eio[i][s]->gradout, -0.1, 1.0e-6);
     }
     double t1 = get_time();
+#ifdef DEBUG_TIME
     printf("Warmup Iter %4d: Time = %.3f ms\n", i, t1-t0);
+#endif
   }
 
 #ifdef USE_PERF_COUNTERS
@@ -574,7 +567,6 @@ int main(int argc, char * argv[]) {
   size_t bwdupdBytesMax = bwdupdBytesMaxRd + bwdupdBytesMaxWr;
 
   my_printf("Iters = %d, LS = %d, N = %d, M = %d, E = %d, avgNS = %ld, avgU = %ld, P = %d\n", iters, LS, N, M, E, tNS/(iters*LS), tU/(iters*LS), P);
-  //printf("Time: Fwd: %.3f ms Bwd: %.3f ms Upd: %.3f  Total: %.3f\n", fwdTime, bwdTime, updTime, t1-t0);
   my_printf("Per Iter  Time: %.3f ms  Total: %.3f ms\n", bwdupdTime/(iters), (t1-t0)/(iters));
   my_printf("Per Table Time: %.3f ms  Total: %.3f ms\n", bwdupdTime/(iters*LS), (t1-t0)/(iters*LS));
 

diff --git a/samples/deeplearning/sparse_adagrad_fused/utils.h b/samples/deeplearning/sparse_adagrad_fused/utils.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+* Copyright (c) Intel Corporation - All rights reserved.                      *
+* This file is part of the LIBXSMM library.                                   *
+*                                                                             *
+* For information on the license, see the LICENSE file.                       *
+* Further information: https://github.com/hfp/libxsmm/                        *
+* SPDX-License-Identifier: BSD-3-Clause                                       *
+******************************************************************************/
+/* Ishwar Bhati (Intel Corp.)
+   Dhiraj Kalamkar (Intel Corp.)
+******************************************************************************/
+
+#ifndef _UTILS_H_
+#define _UTILS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#ifdef _OPENMP
+#include <omp.h>
+#else
+#define omp_get_num_threads() (1)
+#define omp_get_thread_num() (0)
+#define omp_get_max_threads() (1)
+#endif
+
+
+static double get_time() {
+  static bool init_done = false;
+  static struct timespec stp = {0,0};
+  struct timespec tp;
+  clock_gettime(CLOCK_REALTIME, &tp);
+  /*clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &tp);*/
+
+  if(!init_done) {
+    init_done = true;
+    stp = tp;
+  }
+  double ret = (tp.tv_sec - stp.tv_sec) * 1e3 + (tp.tv_nsec - stp.tv_nsec)*1e-6;
+  return ret;
+}
+
+#endif /*_UTILS_H_*/