From 10d05b57a6a32ffa552d453832fa6b9df43c6525 Mon Sep 17 00:00:00 2001 From: S1M0N21 Date: Sat, 6 Mar 2021 10:40:12 +0800 Subject: [PATCH] update multiexp params --- src/gpu/multiexp.rs | 69 +++++++++++++++++++++----------------------- src/gpu/utils.rs | 70 --------------------------------------------- 2 files changed, 32 insertions(+), 107 deletions(-) diff --git a/src/gpu/multiexp.rs b/src/gpu/multiexp.rs index dc45e4825..456a0cfbe 100644 --- a/src/gpu/multiexp.rs +++ b/src/gpu/multiexp.rs @@ -15,8 +15,8 @@ use std::time::Instant; extern crate scoped_threadpool; use scoped_threadpool::Pool; -use crate::gpu::get_max_window_size; +const MAX_WINDOW_SIZE: usize = 10; const LOCAL_WORK_SIZE: usize = 256; const MEMORY_PADDING: f64 = 0.1f64; // Let 10% of GPU memory be free @@ -52,27 +52,27 @@ where fn calc_num_groups(core_count: usize, num_windows: usize) -> usize { // Observations show that we get the best performance when num_groups * num_windows ~= 2 * CUDA_CORES - 2 * core_count / num_windows // TODO: 4 + 2 * core_count / num_windows } -// fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize { -// // window_size = ln(n / num_groups) -// // num_windows = exp_bits / window_size -// // num_groups = 2 * core_count / num_windows = 2 * core_count * window_size / exp_bits -// // window_size = ln(n / num_groups) = ln(n * exp_bits / (2 * core_count * window_size)) -// // window_size = ln(exp_bits * n / (2 * core_count)) - ln(window_size) -// // -// // Thus we need to solve the following equation: -// // window_size + ln(window_size) = ln(exp_bits * n / (2 * core_count)) -// let lower_bound = (((exp_bits * n) as f64) / ((2 * core_count) as f64)).ln(); -// for w in 0..MAX_WINDOW_SIZE { -// if (w as f64) + (w as f64).ln() > lower_bound { -// return w; -// } -// } -// -// MAX_WINDOW_SIZE -// } +fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize { + // window_size = ln(n / num_groups) + // num_windows = exp_bits / window_size + // num_groups = 2 * core_count / num_windows = 2 * core_count * window_size / exp_bits + // window_size = ln(n / num_groups) = ln(n * exp_bits / (2 * core_count * window_size)) + // window_size = ln(exp_bits * n / (2 * core_count)) - ln(window_size) + // + // Thus we need to solve the following equation: + // window_size + ln(window_size) = ln(exp_bits * n / (2 * core_count)) + let lower_bound = (((exp_bits * n) as f64) / ((2 * core_count) as f64)).ln(); + for w in 0..MAX_WINDOW_SIZE { + if (w as f64) + (w as f64).ln() > lower_bound { + return w; + } + } + + MAX_WINDOW_SIZE +} fn calc_best_chunk_size(max_window_size: usize, core_count: usize, exp_bits: usize) -> usize { // Best chunk-size (N) can also be calculated using the same logic as calc_window_size: @@ -85,7 +85,7 @@ fn calc_best_chunk_size(max_window_size: usize, core_count: usize, exp_bits: usi .ceil() as usize } -fn calc_chunk_size(mem: u64, core_count: usize, max_window_size: usize) -> usize +fn calc_chunk_size(mem: u64, core_count: usize) -> usize where E: Engine, { @@ -93,7 +93,7 @@ where let exp_size = exp_size::(); let proj_size = std::mem::size_of::() + std::mem::size_of::(); ((((mem as f64) * (1f64 - MEMORY_PADDING)) as usize) - - (2 * core_count * ((1 << max_window_size) + 1) * proj_size)) + - (2 * core_count * ((1 << MAX_WINDOW_SIZE) + 1) * proj_size)) / (aff_size + exp_size) } @@ -110,10 +110,9 @@ where let exp_bits = exp_size::() * 8; let core_count = utils::get_core_count(&d); - let max_window_size = utils::get_max_window_size(&d); let mem = d.memory(); - let max_n = calc_chunk_size::(mem, core_count, max_window_size); - let best_n = calc_best_chunk_size(max_window_size, core_count, exp_bits); + let max_n = calc_chunk_size::(mem, core_count); + let best_n = calc_best_chunk_size(MAX_WINDOW_SIZE, core_count, exp_bits); let n = std::cmp::min(max_n, best_n); Ok(SingleMultiexpKernel { @@ -140,7 +139,7 @@ where } let exp_bits = exp_size::() * 8; - let window_size = get_max_window_size(&self.device); + let window_size = calc_window_size(n, exp_bits, self.core_count); let num_windows = ((exp_bits as f64) / (window_size as f64)).ceil() as usize; let num_groups = calc_num_groups(self.core_count, num_windows); let bucket_len = 1 << window_size; @@ -158,10 +157,10 @@ where let bucket_buffer = self .program - .create_buffer::<::Projective>(2 * self.core_count * bucket_len)?; // TODO: 4 + .create_buffer::<::Projective>(2 * self.core_count * bucket_len)?; let result_buffer = self .program - .create_buffer::<::Projective>(2 * self.core_count)?; // TODO: 4 + .create_buffer::<::Projective>(2 * self.core_count)?; // Make global work size divisible by `LOCAL_WORK_SIZE` let mut global_work_size = num_windows * num_groups; @@ -310,14 +309,7 @@ where .zip(self.kernels.par_iter_mut()) .map(|((bases, exps), kern)| -> Result<::Projective, GPUError> { let mut acc = ::Projective::zero(); - let mut chunk = { - let mut chunk_size = utils::get_chunk_size(&kern.device); - if chunk_size == 0 { - chunk_size = kern.n - } - - chunk_size - }; + let mut chunk = kern.n; let size_result = std::mem::size_of::<::Projective>(); if size_result > 144 { @@ -326,7 +318,10 @@ where chunk = (chunk as f64 / 1.2f64).ceil() as usize; } - for (bases, exps) in bases.chunks(chunk).zip(exps.chunks(chunk)) { + for (bases, exps) in bases + .chunks(chunk) + .zip(exps.chunks(chunk)) + { let result = kern.multiexp(bases, exps, bases.len())?; acc.add_assign(&result); } diff --git a/src/gpu/utils.rs b/src/gpu/utils.rs index 703a12230..30f1003d6 100644 --- a/src/gpu/utils.rs +++ b/src/gpu/utils.rs @@ -53,47 +53,6 @@ lazy_static::lazy_static! { core_counts }; - - static ref MAX_WINDOW_SIZE: HashMap:: = { - let mut max_window_size: HashMap = vec![ - ("GeForce RTX 3090".to_string(), 12), - ("GeForce RTX 3080".to_string(), 11), - ("GeForce RTX 2080 Ti".to_string(), 11), - ].into_iter().collect(); - - match env::var("BELLMAN_MAX_WINDOW_SIZE").and_then(|var| { - for card in var.split(",") { - let splitted = card.split(":").collect::>(); - if splitted.len() != 2 { panic!("Invalid BELLMAN_MAX_WINDOW_SIZE!"); } - let name = splitted[0].trim().to_string(); - let size : usize = splitted[1].trim().parse().expect("Invalid BELLMAN_MAX_WINDOW_SIZE!"); - max_window_size.insert(name, size); - } - Ok(()) - }) { Err(_) => { }, Ok(_) => { } } - - max_window_size - }; - - static ref CHUNK_SIZE: HashMap:: = { - let mut chunk_size: HashMap = vec![ - ("GeForce RTX 3090".to_string(), 67108864), - ("GeForce RTX 3080".to_string(), 33554466), - ].into_iter().collect(); - - match env::var("BELLMAN_CHUNK_SIZE").and_then(|var| { - for card in var.split(",") { - let splitted = card.split(":").collect::>(); - if splitted.len() != 2 { panic!("Invalid BELLMAN_CHUNK_SIZE!"); } - let name = splitted[0].trim().to_string(); - let size: usize = splitted[1].trim().parse().expect("Invalid BELLMAN_CHUNK_SIZE!"); - chunk_size.insert(name, size); - } - Ok(()) - }) { Err(_) => { }, Ok(_) => { } } - - chunk_size - }; } const DEFAULT_CORE_COUNT: usize = 2560; @@ -114,35 +73,6 @@ pub fn get_core_count(d: &opencl::Device) -> usize { } } -const DEFAULT_MAX_WINDOW_SIZE: usize = 10; -pub fn get_max_window_size(d: &opencl::Device) -> usize { - let name = d.name(); - match MAX_WINDOW_SIZE.get(&name[..]) { - Some(&w) => { - info!("max_window_size: {}", w); - w - }, - None => { - warn!("use default max window size"); - DEFAULT_MAX_WINDOW_SIZE - } - } -} - -pub fn get_chunk_size(d: &opencl::Device) -> usize { - let name = d.name(); - match CHUNK_SIZE.get(&name[..]) { - Some(&chunk_size) => { - info!("chunk_size: {}", chunk_size); - chunk_size - }, - None => { - warn!("use default chunk size"); - 0 - } - } -} - pub fn dump_device_list() { for d in opencl::Device::all().unwrap() { info!("Device: {:?}", d);