From 10d05b57a6a32ffa552d453832fa6b9df43c6525 Mon Sep 17 00:00:00 2001
From: S1M0N21 <s1m0n21@hotmail.com>
Date: Sat, 6 Mar 2021 10:40:12 +0800
Subject: [PATCH] gpu: drop per-device multiexp tuning, restore calc_window_size

---
 src/gpu/multiexp.rs | 69 +++++++++++++++++++++-----------------------
 src/gpu/utils.rs    | 70 ---------------------------------------------
 2 files changed, 32 insertions(+), 107 deletions(-)
diff --git a/src/gpu/multiexp.rs b/src/gpu/multiexp.rs
index dc45e4825..456a0cfbe 100644
--- a/src/gpu/multiexp.rs
+++ b/src/gpu/multiexp.rs
@@ -15,8 +15,8 @@ use std::time::Instant;
 
 extern crate scoped_threadpool;
 use scoped_threadpool::Pool;
-use crate::gpu::get_max_window_size;
 
+const MAX_WINDOW_SIZE: usize = 10; // Upper bound on the window size picked by calc_window_size
 const LOCAL_WORK_SIZE: usize = 256;
 const MEMORY_PADDING: f64 = 0.1f64; // Let 10% of GPU memory be free
 
@@ -52,27 +52,27 @@ where
 
 fn calc_num_groups(core_count: usize, num_windows: usize) -> usize {
     // Observations show that we get the best performance when num_groups * num_windows ~= 2 * CUDA_CORES
-    2 * core_count / num_windows // TODO: 4
+    2 * core_count / num_windows // integer division: 0 when num_windows > 2 * core_count
 }
 
-// fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize {
-//     // window_size = ln(n / num_groups)
-//     // num_windows = exp_bits / window_size
-//     // num_groups = 2 * core_count / num_windows = 2 * core_count * window_size / exp_bits
-//     // window_size = ln(n / num_groups) = ln(n * exp_bits / (2 * core_count * window_size))
-//     // window_size = ln(exp_bits * n / (2 * core_count)) - ln(window_size)
-//     //
-//     // Thus we need to solve the following equation:
-//     // window_size + ln(window_size) = ln(exp_bits * n / (2 * core_count))
-//     let lower_bound = (((exp_bits * n) as f64) / ((2 * core_count) as f64)).ln();
-//     for w in 0..MAX_WINDOW_SIZE {
-//         if (w as f64) + (w as f64).ln() > lower_bound {
-//             return w;
-//         }
-//     }
-//
-//     MAX_WINDOW_SIZE
-// }
+fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize {
+    // Returns the smallest w in 0..MAX_WINDOW_SIZE with w + ln(w) > lower_bound, else the cap.
+    // Derivation: window_size = ln(n / num_groups) and num_windows = exp_bits / window_size, so
+    // num_groups = 2 * core_count / num_windows = 2 * core_count * window_size / exp_bits
+    // => window_size = ln(n * exp_bits / (2 * core_count * window_size))
+    // => window_size = ln(exp_bits * n / (2 * core_count)) - ln(window_size)
+    //
+    // Thus we need to solve the following equation:
+    // window_size + ln(window_size) = ln(exp_bits * n / (2 * core_count))
+    let lower_bound = (((exp_bits * n) as f64) / ((2 * core_count) as f64)).ln();
+    for w in 0..MAX_WINDOW_SIZE { // w = 0 never matches: ln(0) is -inf
+        if (w as f64) + (w as f64).ln() > lower_bound {
+            return w;
+        }
+    }
+
+    MAX_WINDOW_SIZE // no candidate exceeded the bound; fall back to the cap
+}
 
 fn calc_best_chunk_size(max_window_size: usize, core_count: usize, exp_bits: usize) -> usize {
     // Best chunk-size (N) can also be calculated using the same logic as calc_window_size:
@@ -85,7 +85,7 @@ fn calc_best_chunk_size(max_window_size: usize, core_count: usize, exp_bits: usi
         .ceil() as usize
 }
 
-fn calc_chunk_size<E>(mem: u64, core_count: usize, max_window_size: usize) -> usize
+fn calc_chunk_size<E>(mem: u64, core_count: usize) -> usize
 where
     E: Engine,
 {
@@ -93,7 +93,7 @@ where
     let exp_size = exp_size::<E>();
     let proj_size = std::mem::size_of::<E::G1>() + std::mem::size_of::<E::G2>();
     ((((mem as f64) * (1f64 - MEMORY_PADDING)) as usize)
-        - (2 * core_count * ((1 << max_window_size) + 1) * proj_size))
+        - (2 * core_count * ((1 << MAX_WINDOW_SIZE) + 1) * proj_size))
         / (aff_size + exp_size)
 }
 
@@ -110,10 +110,9 @@ where
 
         let exp_bits = exp_size::<E>() * 8;
         let core_count = utils::get_core_count(&d);
-        let max_window_size = utils::get_max_window_size(&d);
         let mem = d.memory();
-        let max_n = calc_chunk_size::<E>(mem, core_count, max_window_size);
-        let best_n = calc_best_chunk_size(max_window_size, core_count, exp_bits);
+        let max_n = calc_chunk_size::<E>(mem, core_count);
+        let best_n = calc_best_chunk_size(MAX_WINDOW_SIZE, core_count, exp_bits);
         let n = std::cmp::min(max_n, best_n);
 
         Ok(SingleMultiexpKernel {
@@ -140,7 +139,7 @@ where
         }
 
         let exp_bits = exp_size::<E>() * 8;
-        let window_size = get_max_window_size(&self.device);
+        let window_size = calc_window_size(n, exp_bits, self.core_count);
         let num_windows = ((exp_bits as f64) / (window_size as f64)).ceil() as usize;
         let num_groups = calc_num_groups(self.core_count, num_windows);
         let bucket_len = 1 << window_size;
@@ -158,10 +157,10 @@ where
 
         let bucket_buffer = self
             .program
-            .create_buffer::<<G as CurveAffine>::Projective>(2 * self.core_count * bucket_len)?; // TODO: 4
+            .create_buffer::<<G as CurveAffine>::Projective>(2 * self.core_count * bucket_len)?;
         let result_buffer = self
             .program
-            .create_buffer::<<G as CurveAffine>::Projective>(2 * self.core_count)?; // TODO: 4
+            .create_buffer::<<G as CurveAffine>::Projective>(2 * self.core_count)?;
 
         // Make global work size divisible by `LOCAL_WORK_SIZE`
         let mut global_work_size = num_windows * num_groups;
@@ -310,14 +309,7 @@ where
                             .zip(self.kernels.par_iter_mut())
                             .map(|((bases, exps), kern)| -> Result<<G as CurveAffine>::Projective, GPUError> {
                                 let mut acc = <G as CurveAffine>::Projective::zero();
-                                let mut chunk = {
-                                    let mut chunk_size = utils::get_chunk_size(&kern.device);
-                                    if chunk_size == 0 {
-                                        chunk_size = kern.n
-                                    }
-
-                                    chunk_size
-                                };
+                                let mut chunk = kern.n;
                                 let size_result = std::mem::size_of::<<G as CurveAffine>::Projective>();
 
                                 if size_result > 144 {
@@ -326,7 +318,10 @@ where
                                     chunk = (chunk as f64 / 1.2f64).ceil() as usize;
                                 }
 
-                                for (bases, exps) in bases.chunks(chunk).zip(exps.chunks(chunk)) {
+                                for (bases, exps) in bases
+                                    .chunks(chunk)
+                                    .zip(exps.chunks(chunk))
+                                {
                                     let result = kern.multiexp(bases, exps, bases.len())?;
                                     acc.add_assign(&result);
                                 }
diff --git a/src/gpu/utils.rs b/src/gpu/utils.rs
index 703a12230..30f1003d6 100644
--- a/src/gpu/utils.rs
+++ b/src/gpu/utils.rs
@@ -53,47 +53,6 @@ lazy_static::lazy_static! {
 
         core_counts
     };
-
-    static ref MAX_WINDOW_SIZE: HashMap::<String, usize> = {
-        let mut max_window_size: HashMap<String, usize> = vec![
-            ("GeForce RTX 3090".to_string(), 12),
-            ("GeForce RTX 3080".to_string(), 11),
-            ("GeForce RTX 2080 Ti".to_string(), 11),
-        ].into_iter().collect();
-
-        match env::var("BELLMAN_MAX_WINDOW_SIZE").and_then(|var| {
-            for card in var.split(",") {
-                let splitted = card.split(":").collect::<Vec<_>>();
-                if splitted.len() != 2 { panic!("Invalid BELLMAN_MAX_WINDOW_SIZE!"); }
-                let name = splitted[0].trim().to_string();
-                let size : usize = splitted[1].trim().parse().expect("Invalid BELLMAN_MAX_WINDOW_SIZE!");
-                max_window_size.insert(name, size);
-             }
-            Ok(())
-        }) { Err(_) => { }, Ok(_) => { } }
-
-        max_window_size
-    };
-
-    static ref CHUNK_SIZE: HashMap::<String, usize> = {
-        let mut chunk_size: HashMap<String, usize> = vec![
-            ("GeForce RTX 3090".to_string(), 67108864),
-            ("GeForce RTX 3080".to_string(), 33554466),
-        ].into_iter().collect();
-
-        match env::var("BELLMAN_CHUNK_SIZE").and_then(|var| {
-            for card in var.split(",") {
-                let splitted = card.split(":").collect::<Vec<_>>();
-                if splitted.len() != 2 { panic!("Invalid BELLMAN_CHUNK_SIZE!"); }
-                let name = splitted[0].trim().to_string();
-                let size: usize = splitted[1].trim().parse().expect("Invalid BELLMAN_CHUNK_SIZE!");
-                chunk_size.insert(name, size);
-            }
-            Ok(())
-        }) { Err(_) => { }, Ok(_) => { } }
-
-        chunk_size
-    };
 }
 
 const DEFAULT_CORE_COUNT: usize = 2560;
@@ -114,35 +73,6 @@ pub fn get_core_count(d: &opencl::Device) -> usize {
     }
 }
 
-const DEFAULT_MAX_WINDOW_SIZE: usize = 10;
-pub fn get_max_window_size(d: &opencl::Device) -> usize {
-    let name = d.name();
-    match MAX_WINDOW_SIZE.get(&name[..]) {
-        Some(&w) => {
-            info!("max_window_size: {}", w);
-            w
-        },
-        None => {
-            warn!("use default max window size");
-            DEFAULT_MAX_WINDOW_SIZE
-        }
-    }
-}
-
-pub fn get_chunk_size(d: &opencl::Device) -> usize {
-    let name = d.name();
-    match CHUNK_SIZE.get(&name[..]) {
-        Some(&chunk_size) => {
-            info!("chunk_size: {}", chunk_size);
-            chunk_size
-        },
-        None => {
-            warn!("use default chunk size");
-            0
-        }
-    }
-}
-
 pub fn dump_device_list() {
     for d in opencl::Device::all().unwrap() {
         info!("Device: {:?}", d);