fix: simplify chunk size and window size calculation of multiexp (filecoin-project#25)

The chunk size calculation used to be based on the number of CUDA cores.
Instead, use a fixed number of threads that is then split nicely into
blocks for optimal performance.
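As a rough sketch of the idea (the device numbers below are hypothetical, not taken from this commit):

```rust
fn main() {
    // Hypothetical values: 128 threads per block and a device
    // reporting 68 compute units.
    let local_work_size = 128usize;
    let compute_units = 68usize;
    // The total thread count is now derived from the device's compute
    // units instead of a per-model CUDA core table...
    let total_threads = local_work_size * compute_units; // 8704
    // ...and splits evenly into blocks of `local_work_size` threads.
    assert_eq!(total_threads % local_work_size, 0);
    println!(
        "{} threads in {} blocks",
        total_threads,
        total_threads / local_work_size
    );
}
```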

Benchmarks have been run on a Quadro RTX 6000, a GeForce RTX 2080 Ti and
a GeForce RTX 3090 to make sure there isn't any big regression on CUDA
or OpenCL. In a few cases the performance is slightly worse, but for
large numbers of terms, which is what this library is optimized for,
things got significantly faster.

Below are the numbers of those runs. For each graphics card the
multiexp benchmark was run twice, and for each size the better (lower)
number of the two runs was used. The table compares the runtime prior
to this commit (old) to the runtime with this commit applied (new);
for each size, the faster of old and new is bold.

|   GPU     | CUDA/OpenCL | Version |    1024      |     2048     |     4096     |     8192     |     16384    |     32768    |     65536    |     131072   |     262144   |    524288    |   1048576    |   2097152    |   4194304    |    8388608   |  16777216   |  33554432   |  67108864   |  134217728  |  268435456  |
| --------- | ----------- | ------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | ----------- | ----------- | ----------- | ----------- | ----------- |
| RTX6000   | CUDA        | old     |   15.751ms   |   15.738ms   |   17.320ms   |   19.831ms   |   21.809ms   |   29.624ms   | **40.618ms** | **65.155ms** | **105.18ms** | **185.14ms** | **272.54ms** |   420.49ms   |   790.19ms   |   1.4999s    |   2.9603s   |   5.7531s   |   11.434s   |   22.874s   |   51.522s   |
|           |             | new     | **14.532ms** | **15.147ms** | **16.314ms** | **19.004ms** | **20.955ms** | **28.732ms** |   41.564ms   |   66.707ms   |   109.32ms   |   186.84ms   |   330.00ms   | **410.14ms** | **774.19ms** | **1.3458s**  | **2.4790s** | **4.7752s** | **9.7055s** | **19.031s** | **41.963s** |
|           | OpenCL      | old     |   15.600ms   |   15.877ms   |   17.590ms   |   19.921ms   |   22.696ms   |   32.388ms   |   42.934ms   |   71.716ms   |   112.10ms   |   191.42ms   |   341.43ms   |   580.50ms   |    1.0662s   |   2.0566s    |   4.1424s   |   8.4653s   |   16.047s   |   35.467s   |   72.832s   |
|           |             | new     | **14.544ms** | **15.009ms** | **16.649ms** | **18.487ms** | **20.833ms** | **27.015ms** | **38.307ms** | **62.836ms** | **102.64ms** | **176.04ms** | **309.27ms** | **526.33ms** | **984.03ms** | **1.8042s**  | **3.4835s** | **6.8203s** | **14.003s** | **28.640s** | **58.824s** |
| RTX2080Ti | CUDA        | old     |   10.994ms   |   11.244ms   |   14.179ms   |   15.410ms   |   17.584ms   |   24.994ms   | **31.996ms** |   51.912ms   |   93.754ms   | **151.65ms** | **221.34ms** |   364.90ms   |   677.16ms   |   1.3217s    |   2.5868s   |   5.2162s   |   10.402s   |   20.883s   |   41.937s   |
|           |             | new     | **10.344ms** | **9.6598ms** | **14.143ms** | **15.274ms** | **17.198ms** | **21.817ms** |   35.415ms   | **50.726ms** | **88.575ms** |   153.75ms   |   271.72ms   | **319.03ms** | **590.31ms** | **1.0330s**  | **1.9006s** | **3.8953s** | **7.9497s** | **16.002s** | **32.062s** |
|           | OpenCL      | old     |   11.447ms   | **11.552ms** | **14.123ms** |   16.393ms   |   21.599ms   |   27.510ms   |   37.208ms   |   60.860ms   |   105.99ms   |   170.86ms   |   302.54ms   |   523.51ms   |   962.37ms   |   1.9242s    |   3.8334s   |   7.7212s   |   15.376s   |   30.795s   |   61.678s   |
|           |             | new     | **11.140ms** |   11.987ms   |   14.837ms   | **13.714ms** | **16.898ms** | **24.077ms** | **32.700ms** | **50.925ms** | **87.819ms** | **153.94ms** | **267.78ms** | **467.17ms** | **856.95ms** | **1.6093s**  | **3.1487s** | **6.3742s** | **12.894s** | **25.888s** | **52.105s** |
| RTX3090   | CUDA        | old     |   28.924ms   |   28.606ms   |   29.551ms   | **20.608ms** |   33.097ms   |   36.271ms   |   36.353ms   |   43.155ms   |   67.801ms   |   86.059ms   |   150.68ms   |   340.78ms   |   534.71ms   |   985.17ms   |   1.7543s   |   3.5924s   |   7.2819s   |   14.658s   |   29.133s   |
|           |             | new     | **15.513ms** | **16.934ms** | **19.606ms** |   23.755ms   | **24.186ms** | **28.759ms** | **32.147ms** | **35.125ms** | **50.428ms** | **76.278ms** | **122.85ms** | **206.41ms** | **529.83ms** | **953.46ms** | **1.7170s** | **3.2375s** | **6.6036s** | **13.378s** | **26.999s** |
|           | OpenCL      | old     | **18.875ms** | **22.025ms** |   26.669ms   | **25.151ms** |   29.823ms   | **29.561ms** | **34.674ms** |   43.384ms   |   67.859ms   |   100.48ms   |   174.86ms   |   313.63ms   |   489.99ms   |   899.34ms   |   1.5981s   |   3.2942s   |   6.6854s   |   13.473s   |   26.754s   |
|           |             | new     |   21.406ms   |   22.300ms   | **24.353ms** |   30.037ms   | **28.156ms** |   32.799ms   |   39.520ms   | **41.796ms** | **57.424ms** | **89.439ms** | **147.61ms** | **258.56ms** | **489.23ms** | **865.07ms** | **1.5351s** | **2.8767s** | **5.8899s** | **11.910s** | **24.049s** |
vmx authored Jul 4, 2022
1 parent 765ebc4 commit c68c369
Showing 2 changed files with 83 additions and 75 deletions.
2 changes: 1 addition & 1 deletion ec-gpu-gen/Cargo.toml
@@ -19,7 +19,7 @@ num_cpus = "1.13.0"
once_cell = "1.8.0"
pairing = "0.22.0"
rayon = "1.5.1"
rust-gpu-tools = { version = "0.6.0", default-features = false, optional = true }
rust-gpu-tools = { version = "0.6.1", default-features = false, optional = true }
temp-env = "0.2.0"
thiserror = "1.0.30"
yastl = "0.1.2"
156 changes: 82 additions & 74 deletions ec-gpu-gen/src/multiexp.rs
@@ -5,9 +5,9 @@ use std::sync::{Arc, RwLock};
use ec_gpu::GpuEngine;
use ff::PrimeField;
use group::{prime::PrimeCurveAffine, Group};
use log::{error, info, warn};
use log::{error, info};
use pairing::Engine;
use rust_gpu_tools::{program_closures, Device, Program, Vendor, CUDA_CORES};
use rust_gpu_tools::{program_closures, Device, Program, Vendor};
use yastl::Scope;

use crate::{
@@ -17,21 +17,33 @@ use crate::{
Limb32, Limb64,
};

/// On the GPU, the exponents are split into windows, this is the maximum number of such windows.
const MAX_WINDOW_SIZE: usize = 10;
const LOCAL_WORK_SIZE: usize = 256;
const MEMORY_PADDING: f64 = 0.2f64; // Let 20% of GPU memory be free
const DEFAULT_CUDA_CORES: usize = 2560;

fn get_cuda_cores_count(name: &str) -> usize {
*CUDA_CORES.get(name).unwrap_or_else(|| {
warn!(
"Number of CUDA cores for your device ({}) is unknown! Best performance is only \
achieved when the number of CUDA cores is known! You can find the instructions on \
how to support custom GPUs here: https://docs.rs/rust-gpu-tools",
name
);
&DEFAULT_CUDA_CORES
})
/// In CUDA this is the number of threads per block (block size).
const LOCAL_WORK_SIZE: usize = 128;
/// Let 20% of GPU memory be free; this is an arbitrary value.
const MEMORY_PADDING: f64 = 0.2f64;
/// The Nvidia Ampere architecture is compute capability major version 8.
const AMPERE: u32 = 8;

/// Divide and ceil to the next value.
const fn div_ceil(a: usize, b: usize) -> usize {
if a % b == 0 {
a / b
} else {
(a / b) + 1
}
}
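A quick check of the rounding behaviour, with the helper repeated so the snippet is self-contained:

```rust
/// Same helper as in the diff above.
const fn div_ceil(a: usize, b: usize) -> usize {
    if a % b == 0 {
        a / b
    } else {
        (a / b) + 1
    }
}

fn main() {
    assert_eq!(div_ceil(7, 2), 4); // 7 / 2 = 3.5, rounded up to 4
    assert_eq!(div_ceil(8, 2), 4); // exact division is left unchanged
}
```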

/// The number of units the work is split into. One unit will result in one CUDA thread.
///
/// Based on empirical results, it turns out that on Nvidia devices with the Ampere architecture,
/// it's faster to use two times the number of work units.
const fn work_units(compute_units: u32, compute_capabilities: Option<(u32, u32)>) -> usize {
match compute_capabilities {
Some((AMPERE, _)) => LOCAL_WORK_SIZE * compute_units as usize * 2,
_ => LOCAL_WORK_SIZE * compute_units as usize,
}
}
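For example (the constants and function are repeated from the diff; the compute-unit counts are hypothetical, not measured values), an Ampere card gets twice as many work units as an older card with the same compute-unit count:

```rust
const LOCAL_WORK_SIZE: usize = 128;
const AMPERE: u32 = 8;

// Same function as in the diff above.
const fn work_units(compute_units: u32, compute_capabilities: Option<(u32, u32)>) -> usize {
    match compute_capabilities {
        Some((AMPERE, _)) => LOCAL_WORK_SIZE * compute_units as usize * 2,
        _ => LOCAL_WORK_SIZE * compute_units as usize,
    }
}

fn main() {
    // Hypothetical Ampere device (compute capability 8.6): doubled.
    assert_eq!(work_units(82, Some((8, 6))), 128 * 82 * 2); // 20992
    // Hypothetical Turing device (compute capability 7.5): not doubled.
    assert_eq!(work_units(72, Some((7, 5))), 128 * 72); // 9216
}
```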

/// Multiexp kernel for a single GPU.
@@ -40,8 +52,11 @@ where
E: Engine + GpuEngine,
{
program: Program,
core_count: usize,
/// The number of exponentiations the GPU can handle in a single execution of the kernel.
n: usize,
/// The number of units the work is split into. It will result in this number of threads on
/// the GPU.
work_units: usize,
/// An optional function which will be called at places where it is possible to abort the
/// multiexp calculations. If it returns true, the calculation will be aborted with an
/// [`EcError::Aborted`].
@@ -50,53 +65,32 @@ where
_phantom: std::marker::PhantomData<E::Fr>,
}

fn calc_num_groups(core_count: usize, num_windows: usize) -> usize {
// Observations show that we get the best performance when num_groups * num_windows ~= 2 * CUDA_CORES
2 * core_count / num_windows
}

fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize {
// window_size = ln(n / num_groups)
// num_windows = exp_bits / window_size
// num_groups = 2 * core_count / num_windows = 2 * core_count * window_size / exp_bits
// window_size = ln(n / num_groups) = ln(n * exp_bits / (2 * core_count * window_size))
// window_size = ln(exp_bits * n / (2 * core_count)) - ln(window_size)
//
// Thus we need to solve the following equation:
// window_size + ln(window_size) = ln(exp_bits * n / (2 * core_count))
let lower_bound = (((exp_bits * n) as f64) / ((2 * core_count) as f64)).ln();
for w in 0..MAX_WINDOW_SIZE {
if (w as f64) + (w as f64).ln() > lower_bound {
return w;
}
}

MAX_WINDOW_SIZE
}

fn calc_best_chunk_size(max_window_size: usize, core_count: usize, exp_bits: usize) -> usize {
// Best chunk-size (N) can also be calculated using the same logic as calc_window_size:
// n = e^window_size * window_size * 2 * core_count / exp_bits
(((max_window_size as f64).exp() as f64)
* (max_window_size as f64)
* 2f64
* (core_count as f64)
/ (exp_bits as f64))
.ceil() as usize
}

fn calc_chunk_size<E>(mem: u64, core_count: usize) -> usize
/// Calculates the maximum number of terms that can be put onto the GPU memory.
fn calc_chunk_size<E>(mem: u64, work_units: usize) -> usize
where
E: Engine,
{
let aff_size = std::mem::size_of::<E::G1Affine>() + std::mem::size_of::<E::G2Affine>();
let exp_size = exp_size::<E>();
let proj_size = std::mem::size_of::<E::G1>() + std::mem::size_of::<E::G2>();
((((mem as f64) * (1f64 - MEMORY_PADDING)) as usize)
- (2 * core_count * ((1 << MAX_WINDOW_SIZE) + 1) * proj_size))
/ (aff_size + exp_size)

// Leave `MEMORY_PADDING` percent of the memory free.
let max_memory = ((mem as f64) * (1f64 - MEMORY_PADDING)) as usize;
// The amount of memory (in bytes) of a single term.
let term_size = aff_size + exp_size;
// The number of buckets needed for one work unit.
let max_buckets_per_work_unit = 1 << MAX_WINDOW_SIZE;
// The amount of memory (in bytes) we need for the intermediate steps (buckets).
let buckets_size = work_units * max_buckets_per_work_unit * proj_size;
// The amount of memory (in bytes) we need for the results.
let results_size = work_units * proj_size;

(max_memory - buckets_size - results_size) / term_size
}

/// The size of the exponent in bytes.
///
/// It's the actual byte size it needs in memory, not its theoretical bit size.
fn exp_size<E: Engine>() -> usize {
std::mem::size_of::<<E::Fr as ff::PrimeField>::Repr>()
}
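As a worked example of this memory budget (all sizes and device numbers below are illustrative assumptions, not values from this commit; the real sizes depend on the curve and its in-memory representation):

```rust
fn main() {
    const MAX_WINDOW_SIZE: usize = 10;
    const MEMORY_PADDING: f64 = 0.2;

    let mem: u64 = 24 * 1024 * 1024 * 1024; // assume a 24 GiB card
    let work_units: usize = 20_992; // assumed Ampere device, see above

    // Assumed sizes (in bytes) for a BLS12-381-like engine.
    let aff_size = 96 + 192; // G1Affine + G2Affine
    let exp_size = 32; // 256-bit exponent representation
    let proj_size = 144 + 288; // G1 + G2 projective

    // Mirrors `calc_chunk_size` from the diff above.
    let max_memory = ((mem as f64) * (1f64 - MEMORY_PADDING)) as usize;
    let term_size = aff_size + exp_size;
    let buckets_size = work_units * (1 << MAX_WINDOW_SIZE) * proj_size;
    let results_size = work_units * proj_size;
    let chunk_size = (max_memory - buckets_size - results_size) / term_size;

    // The buckets alone take ~9.3 GB here; roughly 35 million terms fit.
    println!("chunk size: {} terms", chunk_size);
}
```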
@@ -113,12 +107,11 @@
device: &Device,
maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
) -> EcResult<Self> {
let exp_bits = exp_size::<E>() * 8;
let core_count = get_cuda_cores_count(&device.name());
let mem = device.memory();
let max_n = calc_chunk_size::<E>(mem, core_count);
let best_n = calc_best_chunk_size(MAX_WINDOW_SIZE, core_count, exp_bits);
let n = std::cmp::min(max_n, best_n);
let compute_units = device.compute_units();
let compute_capability = device.compute_capability();
let work_units = work_units(compute_units, compute_capability);
let chunk_size = calc_chunk_size::<E>(mem, work_units);

let source = match device.vendor() {
Vendor::Nvidia => crate::gen_source::<E, Limb32>(),
@@ -128,14 +121,18 @@

Ok(SingleMultiexpKernel {
program,
core_count,
n,
n: chunk_size,
work_units,
maybe_abort,
_phantom: std::marker::PhantomData,
})
}

/// Run the actual multiexp computation on the GPU.
///
/// The number of `bases` and `exponents` is determined by [`SingleMultiexpKernel`]`::n`; this
/// means that it is guaranteed that this amount of calculations fits on the GPU this kernel is
/// running on.
pub fn multiexp<G>(
&self,
bases: &[G],
Expand All @@ -150,11 +147,10 @@ where
return Err(EcError::Aborted);
}
}

let exp_bits = exp_size::<E>() * 8;
let window_size = calc_window_size(n as usize, exp_bits, self.core_count);
let num_windows = ((exp_bits as f64) / (window_size as f64)).ceil() as usize;
let num_groups = calc_num_groups(self.core_count, num_windows);
let window_size = self.calc_window_size(bases.len());
// window_size * num_windows needs to be >= 256 in order for the kernel to work correctly.
let num_windows = div_ceil(256, window_size);
let num_groups = self.work_units / num_windows;
let bucket_len = 1 << window_size;

// Each group will have `num_windows` threads and as there are `num_groups` groups, there will
@@ -169,18 +165,17 @@
// It is safe as the GPU will initialize that buffer
let bucket_buffer = unsafe {
program.create_buffer::<<G as PrimeCurveAffine>::Curve>(
2 * self.core_count * bucket_len,
self.work_units * bucket_len,
)?
};
// It is safe as the GPU will initialize that buffer
let result_buffer = unsafe {
program.create_buffer::<<G as PrimeCurveAffine>::Curve>(2 * self.core_count)?
program.create_buffer::<<G as PrimeCurveAffine>::Curve>(self.work_units)?
};

// The global work size follows CUDA's definition and is the number of
// `LOCAL_WORK_SIZE` sized thread groups.
let global_work_size =
(num_windows * num_groups + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;
let global_work_size = div_ceil(num_windows * num_groups, LOCAL_WORK_SIZE);

let kernel = program.create_kernel(
if TypeId::of::<G>() == TypeId::of::<E::G1Affine>() {
@@ -205,8 +200,7 @@
.arg(&(window_size as u32))
.run()?;

let mut results =
vec![<G as PrimeCurveAffine>::Curve::identity(); 2 * self.core_count];
let mut results = vec![<G as PrimeCurveAffine>::Curve::identity(); self.work_units];
program.read_into_buffer(&result_buffer, &mut results)?;

Ok(results)
@@ -219,6 +213,7 @@
// of those `NUM_GROUPS` * `NUM_WINDOWS` threads.
let mut acc = <G as PrimeCurveAffine>::Curve::identity();
let mut bits = 0;
let exp_bits = exp_size::<E>() * 8;
for i in 0..num_windows {
let w = std::cmp::min(window_size, exp_bits - bits);
for _ in 0..w {
Expand All @@ -232,6 +227,19 @@ where

Ok(acc)
}

/// Calculates the window size, based on the given number of terms.
///
/// For best performance, the window size is reduced, so that maximum parallelism is possible.
/// If, for example, only a subset of the terms was put into GPU memory, a smaller window
/// size leads to more windows, hence more units to work on, as the work is split into
/// `num_windows * num_groups`.
fn calc_window_size(&self, num_terms: usize) -> usize {
// The window size was determined by running the `gpu_multiexp_consistency` test and
// looking at the resulting numbers.
let window_size = ((div_ceil(num_terms, self.work_units) as f64).log2() as usize) + 2;
std::cmp::min(window_size, MAX_WINDOW_SIZE)
}
}
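A worked example of this heuristic, again with a hypothetical work-unit count, extended with the `num_windows`/`num_groups` derivation used in `multiexp`:

```rust
fn main() {
    const MAX_WINDOW_SIZE: usize = 10;
    // Same helper as in the diff above.
    const fn div_ceil(a: usize, b: usize) -> usize {
        if a % b == 0 {
            a / b
        } else {
            (a / b) + 1
        }
    }

    let work_units = 20_992usize; // assumed Ampere device, see above
    for num_terms in [1usize << 20, 1 << 26] {
        // Mirrors `calc_window_size` from the diff above.
        let window_size = ((div_ceil(num_terms, work_units) as f64).log2() as usize) + 2;
        let window_size = std::cmp::min(window_size, MAX_WINDOW_SIZE);
        // Derived as in `multiexp`: window_size * num_windows >= 256.
        let num_windows = div_ceil(256, window_size);
        let num_groups = work_units / num_windows;
        println!(
            "{} terms -> window size {}, {} windows, {} groups",
            num_terms, window_size, num_windows, num_groups
        );
    }
    // 2^20 terms -> log2(50) truncates to 5, +2 = 7 -> 37 windows, 567 groups.
    // 2^26 terms -> log2(3197) truncates to 11, +2 = 13, capped at 10
    //            -> 26 windows, 807 groups.
}
```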

/// A struct that contains several multiexp kernels for different devices.
