Skip to content

Commit

Permalink
added better radix short requiring less passes
Browse files Browse the repository at this point in the history
  • Loading branch information
ibhati committed Jun 3, 2021
1 parent 6d84038 commit 84805a4
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 53 deletions.
2 changes: 2 additions & 0 deletions samples/deeplearning/sparse_adagrad_fused/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ VERIFY_CORRECTNESS = 1
USE_LIBXSMM_JIT = 1
USE_PERF_COUNTERS=1
USE_LLC_COUNTERS=0
DEBUG_TIME=0

yesnolist += VERIFY_CORRECTNESS
yesnolist += USE_LIBXSMM_JIT
yesnolist += USE_PERF_COUNTERS
yesnolist += USE_LLC_COUNTERS
yesnolist += DEBUG_TIME
#yesnolist +=

DFLAGS += $(strip $(foreach var, $(yesnolist), $(if $(filter 1, $($(var))), -D$(var))))
Expand Down
71 changes: 47 additions & 24 deletions samples/deeplearning/sparse_adagrad_fused/radix_sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,54 @@
#define _PCL_RADIX_SORT_

#include <utility>
#include <omp.h>
#include <limits>

#include "utils.h"

#ifndef BKT_BITS
#define BKT_BITS 12
#endif

template<typename T>
using Key_Value_Pair = std::pair<T, T>;

template<typename T>
template<typename T>
Key_Value_Pair<T>* radix_sort_parallel(Key_Value_Pair<T>* inp_buf, Key_Value_Pair<T>* tmp_buf, int64_t elements_count, int64_t max_value)
{
constexpr int bkt_bits = BKT_BITS;
constexpr int nbkts = (1 << bkt_bits);
constexpr int bkt_mask = (nbkts - 1);

int maxthreads = omp_get_max_threads();
int histogram[256*maxthreads], histogram_ps[256*maxthreads + 1];
int histogram[nbkts*maxthreads], histogram_ps[nbkts*maxthreads + 1];
if(max_value == 0) return inp_buf;
int num_bits = sizeof(T) * 8 - __builtin_clz(max_value);
int num_passes = (num_bits + 7) / 8;
int num_bits = 64;
if(sizeof(T) == 8 && max_value > std::numeric_limits<int>::max()) {
num_bits = sizeof(T) * 8 - __builtin_clzll(max_value);
} else {
num_bits = 32 - __builtin_clz((unsigned int)max_value);
}

int num_passes = (num_bits + bkt_bits - 1) / bkt_bits;

#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();

int * local_histogram = &histogram[256*tid];
int * local_histogram_ps = &histogram_ps[256*tid];
int * local_histogram = &histogram[nbkts*tid];
int * local_histogram_ps = &histogram_ps[nbkts*tid];
int elements_count_4 = elements_count/4*4;
Key_Value_Pair<T> * input = inp_buf;
Key_Value_Pair<T> * output = tmp_buf;

for(unsigned int pass = 0; pass < num_passes; pass++)
{

/* Step 1: compute histogram
Reset histogram */
for(int i = 0; i < 256; i++) local_histogram[i] = 0;
auto t1 = get_time();
// Step 1: compute histogram
// Reset histogram
for(int i = 0; i < nbkts; i++) local_histogram[i] = 0;

#pragma omp for schedule(static)
for(int64_t i = 0; i < elements_count_4; i+=4)
Expand All @@ -42,41 +59,43 @@ Key_Value_Pair<T>* radix_sort_parallel(Key_Value_Pair<T>* inp_buf, Key_Value_Pai
T val_3 = input[i+2].first;
T val_4 = input[i+3].first;

local_histogram[ (val_1>>(pass*8)) &0xFF]++;
local_histogram[ (val_2>>(pass*8)) &0xFF]++;
local_histogram[ (val_3>>(pass*8)) &0xFF]++;
local_histogram[ (val_4>>(pass*8)) &0xFF]++;
local_histogram[ (val_1>>(pass*bkt_bits)) & bkt_mask]++;
local_histogram[ (val_2>>(pass*bkt_bits)) & bkt_mask]++;
local_histogram[ (val_3>>(pass*bkt_bits)) & bkt_mask]++;
local_histogram[ (val_4>>(pass*bkt_bits)) & bkt_mask]++;
}
if(tid == (nthreads -1))
{
for(int64_t i = elements_count_4; i < elements_count; i++)
{
T val = input[i].first;
local_histogram[ (val>>(pass*8)) &0xFF]++;
local_histogram[ (val>>(pass*bkt_bits)) & bkt_mask]++;
}
}
#pragma omp barrier
/* Step 2: prefix sum */
auto t11 = get_time();
// Step 2: prefix sum
if(tid == 0)
{
int sum = 0, prev_sum = 0;
for(int bins = 0; bins < 256; bins++) for(int t = 0; t < nthreads; t++) { sum += histogram[t*256 + bins]; histogram_ps[t*256 + bins] = prev_sum; prev_sum = sum; }
histogram_ps[256*nthreads] = prev_sum; if(prev_sum != elements_count) { printf("Error1!\n"); exit(123); }
for(int bins = 0; bins < nbkts; bins++) for(int t = 0; t < nthreads; t++) { sum += histogram[t*nbkts + bins]; histogram_ps[t*nbkts + bins] = prev_sum; prev_sum = sum; }
histogram_ps[nbkts*nthreads] = prev_sum; if(prev_sum != elements_count) { printf("Error1!\n"); exit(123); }
}
#pragma omp barrier
auto t12 = get_time();

/* Step 3: scatter */
// Step 3: scatter
#pragma omp for schedule(static)
for(int64_t i = 0; i < elements_count_4; i+=4)
{
T val_1 = input[i].first;
T val_2 = input[i+1].first;
T val_3 = input[i+2].first;
T val_4 = input[i+3].first;
T bin_1 = (val_1>>(pass*8)) &0xFF;
T bin_2 = (val_2>>(pass*8)) &0xFF;
T bin_3 = (val_3>>(pass*8)) &0xFF;
T bin_4 = (val_4>>(pass*8)) &0xFF;
T bin_1 = (val_1>>(pass*bkt_bits)) & bkt_mask;
T bin_2 = (val_2>>(pass*bkt_bits)) & bkt_mask;
T bin_3 = (val_3>>(pass*bkt_bits)) & bkt_mask;
T bin_4 = (val_4>>(pass*bkt_bits)) & bkt_mask;
int pos;
pos = local_histogram_ps[bin_1]++;
output[pos] = input[i];
Expand All @@ -92,13 +111,17 @@ Key_Value_Pair<T>* radix_sort_parallel(Key_Value_Pair<T>* inp_buf, Key_Value_Pai
for(int64_t i = elements_count_4; i < elements_count; i++)
{
T val = input[i].first;
int pos = local_histogram_ps[ (val>>(pass*8)) &0xFF]++;
int pos = local_histogram_ps[ (val>>(pass*bkt_bits)) & bkt_mask]++;
output[pos] = input[i];
}
}

Key_Value_Pair<T> * temp = input; input = output; output = temp;
#pragma omp barrier
auto t2 = get_time();
#ifdef DEBUG_TIME
if (tid == 0) printf("pass = %d total time = %.3f step1 = %.3f step2 = %.3f %.3f\n", pass, t2-t1, t11-t1, t12-t11, t2-t12);
#endif
}
}
return (num_passes % 2 == 0 ? inp_buf : tmp_buf);
Expand Down
50 changes: 21 additions & 29 deletions samples/deeplearning/sparse_adagrad_fused/sparse_adagrad.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,14 @@
#include <unistd.h>
#include <assert.h>
#include <immintrin.h>
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() (1)
#define omp_get_thread_num() (0)
#define omp_get_max_threads() (1)
#endif
#include <libxsmm.h>

#ifdef USE_PERF_COUNTERS
#include "counters.h"
#endif

#include "radix_sort.h"
#include "utils.h"

const int alignment = 64;
typedef long ITyp;
Expand All @@ -55,21 +49,6 @@ void set_random_seed(int seed)
}
}

static double get_time() {
static bool init_done = false;
static struct timespec stp = {0,0};
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
/*clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &tp);*/

if(!init_done) {
init_done = true;
stp = tp;
}
double ret = (tp.tv_sec - stp.tv_sec) * 1e3 + (tp.tv_nsec - stp.tv_nsec)*1e-6;
return ret;
}

template<typename T>
void init_zero(size_t sz, T *buf)
{
Expand Down Expand Up @@ -257,12 +236,16 @@ void sparse_transpose_radix(EmbeddingInOut *eio)
}
}
auto t1 = get_time();
//printf("Keypair buffer fill Time = %.3f ms\n", t1-t0);
#ifdef DEBUG_TIME
printf("Keypair buffer fill Time = %.3f ms\n", t1-t0);
#endif

t0 = get_time();
Key_Value_Pair<int>* tmpBuf2 = radix_sort_parallel<int>(&tmpBuf[0], &tmpBuf1[0], NS, M);
t1 = get_time();
//printf("Radix Sort Time = %.3f ms\n", t1-t0);
#ifdef DEBUG_TIME
printf("Radix Sort Time = %.3f ms\n", t1-t0);
#endif

int max_thds = omp_get_max_threads();
int num_uniq[max_thds];
Expand All @@ -283,7 +266,9 @@ void sparse_transpose_radix(EmbeddingInOut *eio)
num_uniq[i] += num_uniq[i-1];
int U = num_uniq[max_thds-1];
t1 = get_time();
//printf("Num Unique Index Time = %.3f ms\n", t1-t0);
#ifdef DEBUG_TIME
printf("Num Unique Index Time = %.3f ms\n", t1-t0);
#endif

t0 = get_time();
eio->mb_offsets[0] = 0;
Expand All @@ -306,7 +291,9 @@ void sparse_transpose_radix(EmbeddingInOut *eio)
}
}
t1 = get_time();
//printf("Offset/Index array construction Time = %.3f ms\n", t1-t0);
#ifdef DEBUG_TIME
printf("Offset/Index array construction Time = %.3f ms\n", t1-t0);
#endif
eio->mb_offsets[U] = NS;
eio->U = U;
my_free(tmpBuf);
Expand Down Expand Up @@ -431,7 +418,9 @@ void allocate_buffers_and_generte_rnd_input(int N, int P, double alpha, Embeddin
auto t0 = get_time();
sparse_transpose_radix(eio);
auto t1 = get_time();
//printf("Trans Time = %.3f ms\n", t1-t0);
#ifdef DEBUG_TIME
printf("Trans Time = %.3f ms\n", t1-t0);
#endif
}

void free_buffers(EmbeddingInOut *eio)
Expand Down Expand Up @@ -517,7 +506,9 @@ int main(int argc, char * argv[]) {
auto t0 = get_time();
allocate_buffers_and_generte_rnd_input(N, P, alpha, eb[i], eio[j][i]);
auto t1 = get_time();
//printf("Rand init time = %.3f ms\n", t1 - t0);
#ifdef DEBUG_TIME
printf("Rand init time = %.3f ms\n", t1 - t0);
#endif
tNS += eio[j][i]->NS;
tU += eio[j][i]->U;
}
Expand All @@ -530,7 +521,9 @@ int main(int argc, char * argv[]) {
eb[s]->fused_backward_update_adagrad(eio[i][s]->U, eio[i][s]->NS, N, eio[i][s]->mb_offsets, eio[i][s]->mb_indices, eio[i][s]->wt_indices, eio[i][s]->gradout, -0.1, 1.0e-6);
}
double t1 = get_time();
#ifdef DEBUG_TIME
printf("Warmup Iter %4d: Time = %.3f ms\n", i, t1-t0);
#endif
}

#ifdef USE_PERF_COUNTERS
Expand Down Expand Up @@ -574,7 +567,6 @@ int main(int argc, char * argv[]) {
size_t bwdupdBytesMax = bwdupdBytesMaxRd + bwdupdBytesMaxWr;

my_printf("Iters = %d, LS = %d, N = %d, M = %d, E = %d, avgNS = %ld, avgU = %ld, P = %d\n", iters, LS, N, M, E, tNS/(iters*LS), tU/(iters*LS), P);
//printf("Time: Fwd: %.3f ms Bwd: %.3f ms Upd: %.3f Total: %.3f\n", fwdTime, bwdTime, updTime, t1-t0);
my_printf("Per Iter Time: %.3f ms Total: %.3f ms\n", bwdupdTime/(iters), (t1-t0)/(iters));
my_printf("Per Table Time: %.3f ms Total: %.3f ms\n", bwdupdTime/(iters*LS), (t1-t0)/(iters*LS));

Expand Down
44 changes: 44 additions & 0 deletions samples/deeplearning/sparse_adagrad_fused/utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved. *
* This file is part of the LIBXSMM library. *
* *
* For information on the license, see the LICENSE file. *
* Further information: https://github.com/hfp/libxsmm/ *
* SPDX-License-Identifier: BSD-3-Clause *
******************************************************************************/
/* Ishwar Bhati (Intel Corp.)
Dhiraj Kalamkar (Intel Corp.)
******************************************************************************/

#ifndef _UTILS_H_
#define _UTILS_H_

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() (1)
#define omp_get_thread_num() (0)
#define omp_get_max_threads() (1)
#endif


static double get_time() {
static bool init_done = false;
static struct timespec stp = {0,0};
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
/*clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &tp);*/

if(!init_done) {
init_done = true;
stp = tp;
}
double ret = (tp.tv_sec - stp.tv_sec) * 1e3 + (tp.tv_nsec - stp.tv_nsec)*1e-6;
return ret;
}

#endif /*_UTILS_H_*/

0 comments on commit 84805a4

Please sign in to comment.