From 68d5af7acf69f4153dc81453d20a909fc0817f7a Mon Sep 17 00:00:00 2001 From: Juan Leni Date: Wed, 1 Jul 2020 00:46:28 +0200 Subject: [PATCH] AMD Support (#92) * Initial AMD support * Fix name change * additional debug logging * addressing PR review comments * fixing rustfmt issues * fixing CI issues --- README.md | 119 ++++++++++++++++++++++------------- README_AMD.md | 21 +++++++ src/domain.rs | 83 +++++++++++++----------- src/gadgets/blake2s.rs | 32 +++++----- src/gpu/fft.rs | 22 +++++-- src/gpu/fft/fft.cl | 16 ++--- src/gpu/mod.rs | 15 ++--- src/gpu/multiexp.rs | 39 +++++++++--- src/gpu/multiexp/ec.cl | 90 ++++++++++++++------------ src/gpu/multiexp/field2.cl | 10 +-- src/gpu/multiexp/multiexp.cl | 18 +++--- src/gpu/sources.rs | 23 +++---- src/gpu/utils.rs | 58 +++++++++++++++-- src/lib.rs | 6 +- src/multiexp.rs | 3 + tests/gpu_provers.rs | 2 +- 16 files changed, 361 insertions(+), 196 deletions(-) create mode 100644 README_AMD.md diff --git a/README.md b/README.md index 288dd1969..650ef056a 100644 --- a/README.md +++ b/README.md @@ -11,38 +11,34 @@ booleans and number abstractions. This fork contains GPU parallel acceleration to the FFT and Multiexponentation algorithms in the groth16 prover codebase under a conditional compilation feature `#[cfg(feature = "gpu")]` and `gpu-test` for testing. ### Requirements -- NVIDIA GPU Graphics Driver - +- NVIDIA or AMD GPU Graphics Driver - OpenCL +( For AMD devices we recommend [ROCm](https://rocm-documentation.readthedocs.io/en/latest/Installation_Guide/Installation-Guide.html) ) + ### Environment variables The gpu extension contains some env vars that may be set externally to this library. -`BELLMAN_NO_GPU` +- `BELLMAN_NO_GPU` -Will disable the GPU feature from the library and force usage of the CPU. -``` -Example -env::set_var("BELLMAN_NO_GPU", "1"); -``` + Will disable the GPU feature from the library and force usage of the CPU. -`BELLMAN_CUSTOM_GPU` + ```rust + // Example + env::set_var("BELLMAN_NO_GPU", "1"); + ``` -Will allow for adding a GPU not in the tested list. This requires researching the name of the GPU device and the number of cores in the format `["name:cores"]`. -``` -Example -env::set_var("BELLMAN_CUSTOM_GPU", "GeForce RTX 2080 Ti:4352, GeForce GTX 1060:1280"); -``` +- `BELLMAN_PLATFORM` -`BELLMAN_CPU_UTILIZATION` + Can be used to select the default OpenCL platform: -Can be set in the interval [0,1] to designate a proportion of the multiexponenation calculation to be moved to cpu in parallel to the GPU to keep all hardware occupied. + ```rust + // Example + env::set_var("BELLMAN_PLATFORM", "AMD Accelerated Parallel Processing"); + ``` -``` -Example -env::set_var("BELLMAN_CPU_UTILIZATION", "0.5"); -``` + Some possible values: `BELLMAN_VERIFIER` @@ -53,38 +49,77 @@ Example env::set_var("BELLMAN_VERIFIER", "gpu"); ``` -#### Supported / Tested Cards + - NVIDIA CUDA + - AMD Accelerated Parallel Processing + + If not set, and the code does not select any platform, "NVIDIA CUDA" will be selected. + +- `BELLMAN_CUSTOM_GPU` + + Will allow for adding a GPU not in the tested list. This requires researching the name of the GPU device and the number of cores in the format `["name:cores"]`. + + ```rust + // Example + env::set_var("BELLMAN_CUSTOM_GPU", "GeForce RTX 2080 Ti:4352, GeForce GTX 1060:1280"); + ``` + +- `BELLMAN_CPU_UTILIZATION` -Currently only Nvidia hardware is supported, see [issue](https://github.com/finalitylabs/bellman/issues/3). 
Depending on the size of the proof being passed to the gpu for work, certain cards will not be able to allocate enough memory to either the FFT or Multiexp kernel. Below are a list of devices that work for small sets. In the future we will add the cuttoff point at which a given card will not be able to allocate enough memory to utilize the GPU.
+  Can be set in the interval [0,1] to designate a proportion of the multiexponentiation calculation to be moved to the CPU, in parallel to the GPU, to keep all hardware occupied.

+  ```rust
+  // Example
+  env::set_var("BELLMAN_CPU_UTILIZATION", "0.5");
+  ```
+
+#### Supported / Tested Cards
+
+Depending on the size of the proof being passed to the GPU for work, certain cards will not be able to allocate enough memory for either the FFT or Multiexp kernel. Below is a list of devices that work for small sets. In the future we will add the cutoff point at which a given card will not be able to allocate enough memory to utilize the GPU.
+
+| Device Name            | Cores | Comments       |
+|------------------------|-------|----------------|
+| Quadro RTX 6000        | 4608  |                |
+| TITAN RTX              | 4608  |                |
+| Tesla V100             | 5120  |                |
+| Tesla P100             | 3584  |                |
+| Tesla T4               | 2560  |                |
+| Quadro M5000           | 2048  |                |
+| GeForce RTX 2080 Ti    | 4352  |                |
+| GeForce RTX 2080 SUPER | 3072  |                |
+| GeForce RTX 2080       | 2944  |                |
+| GeForce RTX 2070 SUPER | 2560  |                |
+| GeForce GTX 1080 Ti    | 3584  |                |
+| GeForce GTX 1080       | 2560  |                |
+| GeForce GTX 2060       | 1920  |                |
+| GeForce GTX 1660 Ti    | 1536  |                |
+| GeForce GTX 1060       | 1280  |                |
+| GeForce GTX 1650 SUPER | 1280  |                |
+| GeForce GTX 1650       | 896   |                |
+|                        |       |                |
+| gfx1010                | 2560  | AMD RX 5700 XT |
+
+### Running Tests
+
+To run the multiexp_consistency test, you can use:
+
+```bash
+RUST_LOG=info cargo test --features gpu -- --exact multiexp::gpu_multiexp_consistency --nocapture
 ```
-("Device_Name", Cores),
-("Quadro RTX 6000", 4608),
-("TITAN RTX", 4608),
-("Tesla V100", 5120),
-("Tesla P100", 3584),
-("Tesla T4", 2560),
-("Quadro M5000", 2048),
-("GeForce RTX 2080 Ti", 4352),
-("GeForce RTX 2080 SUPER", 3072),
-("GeForce RTX 2080", 2944),
-("GeForce RTX 2070 SUPER", 2560),
-("GeForce GTX 1080 Ti", 3584),
-("GeForce GTX 1080", 2560),
-("GeForce GTX 2060", 1920),
-("GeForce GTX 1660 Ti", 1536),
-("GeForce GTX 1060", 1280),
-("GeForce GTX 1650 SUPER", 1280),
-("GeForce GTX 1650", 896),
+
+To run on a specific platform, you can use:
+
+```bash
+export BELLMAN_PLATFORM="AMD Accelerated Parallel Processing"
+RUST_LOG=info cargo test --features gpu -- --exact multiexp::gpu_multiexp_consistency --nocapture
 ```
 
 ## License
 
 Licensed under either of
 
- * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
+- Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
   http://www.apache.org/licenses/LICENSE-2.0)
- * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
+- MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
 
 at your option.
diff --git a/README_AMD.md b/README_AMD.md
new file mode 100644
index 000000000..1728ae1b6
--- /dev/null
+++ b/README_AMD.md
@@ -0,0 +1,21 @@
+# Using AMD GPUs
+
+## Prerequisites
+
+- [Install ROCm 3.5](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#supported-operating-systems)
+
+## Running tests
+
+The environment variable `BELLMAN_PLATFORM` determines which backend will be used.
+ +To use the AMD backend, you can do something like: + +```bash +export BELLMAN_PLATFORM="AMD Accelerated Parallel Processing" +RUST_LOG=info cargo test --features gpu -- --exact multiexp::gpu_multiexp_consistency --nocapture +``` + +## Notes + +- We had trouble in Ubuntu 20.04 when running a single computer with both NVIDIA and AMD cards. +- The initial kernel compilation may take > 60sec at start up. This is not a problem afterwards. A possible mitigation would be to add kernel binary caching in the ocl-fil crate. diff --git a/src/domain.rs b/src/domain.rs index a7ef1f0ff..1d69225d3 100644 --- a/src/domain.rs +++ b/src/domain.rs @@ -576,44 +576,57 @@ where } #[cfg(feature = "gpu")] -#[test] -pub fn gpu_fft_consistency() { - use paired::bls12_381::{Bls12, Fr}; - use std::time::Instant; - let rng = &mut rand::thread_rng(); - - let worker = Worker::new(); - let log_cpus = worker.log_num_cpus(); - let mut kern = gpu::FFTKernel::create(1 << 24, false).expect("Cannot initialize kernel!"); - - for log_d in 1..25 { - let d = 1 << log_d; +#[cfg(test)] +mod tests { + use crate::domain::{gpu_fft, parallel_fft, serial_fft, EvaluationDomain, Scalar}; + use crate::gpu; + use crate::multicore::Worker; + use ff::Field; + + #[test] + pub fn gpu_fft_consistency() { + let _ = env_logger::try_init(); + gpu::dump_device_list(); + + use paired::bls12_381::{Bls12, Fr}; + use std::time::Instant; + let rng = &mut rand::thread_rng(); - let elems = (0..d) - .map(|_| Scalar::(Fr::random(rng))) - .collect::>(); - let mut v1 = EvaluationDomain::from_coeffs(elems.clone()).unwrap(); - let mut v2 = EvaluationDomain::from_coeffs(elems.clone()).unwrap(); - - println!("Testing FFT for {} elements...", d); + let worker = Worker::new(); + let log_cpus = worker.log_num_cpus(); + let mut kern = gpu::FFTKernel::create(1 << 24, false).expect("Cannot initialize kernel!"); + + for log_d in 1..25 { + let d = 1 << log_d; + + let elems = (0..d) + .map(|_| Scalar::(Fr::random(rng))) + .collect::>(); + let mut v1 = EvaluationDomain::from_coeffs(elems.clone()).unwrap(); + let mut v2 = EvaluationDomain::from_coeffs(elems.clone()).unwrap(); + + println!("Testing FFT for {} elements...", d); + + let mut now = Instant::now(); + gpu_fft(&mut kern, &mut v1.coeffs, &v1.omega, log_d).expect("GPU FFT failed!"); + let gpu_dur = + now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64; + println!("GPU took {}ms.", gpu_dur); + + now = Instant::now(); + if log_d <= log_cpus { + serial_fft(&mut v2.coeffs, &v2.omega, log_d); + } else { + parallel_fft(&mut v2.coeffs, &worker, &v2.omega, log_d, log_cpus); + } + let cpu_dur = + now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64; + println!("CPU ({} cores) took {}ms.", 1 << log_cpus, cpu_dur); - let mut now = Instant::now(); - gpu_fft(&mut kern, &mut v1.coeffs, &v1.omega, log_d).expect("GPU FFT failed!"); - let gpu_dur = now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64; - println!("GPU took {}ms.", gpu_dur); + println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32); - now = Instant::now(); - if log_d <= log_cpus { - serial_fft(&mut v2.coeffs, &v2.omega, log_d); - } else { - parallel_fft(&mut v2.coeffs, &worker, &v2.omega, log_d, log_cpus); + assert!(v1.coeffs == v2.coeffs); + println!("============================"); } - let cpu_dur = now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64; - println!("CPU ({} cores) took {}ms.", 1 << log_cpus, cpu_dur); - - println!("Speedup: x{}", cpu_dur as f32 / gpu_dur 
as f32); - - assert!(v1.coeffs == v2.coeffs); - println!("============================"); } } diff --git a/src/gadgets/blake2s.rs b/src/gadgets/blake2s.rs index 097525bc3..2ba5d339d 100644 --- a/src/gadgets/blake2s.rs +++ b/src/gadgets/blake2s.rs @@ -184,14 +184,14 @@ fn blake2s_compression>( let mut v = Vec::with_capacity(16); v.extend_from_slice(h); - v.push(UInt32::constant(0x6A09E667)); - v.push(UInt32::constant(0xBB67AE85)); - v.push(UInt32::constant(0x3C6EF372)); - v.push(UInt32::constant(0xA54FF53A)); - v.push(UInt32::constant(0x510E527F)); - v.push(UInt32::constant(0x9B05688C)); - v.push(UInt32::constant(0x1F83D9AB)); - v.push(UInt32::constant(0x5BE0CD19)); + v.push(UInt32::constant(0x6A09_E667)); + v.push(UInt32::constant(0xBB67_AE85)); + v.push(UInt32::constant(0x3C6E_F372)); + v.push(UInt32::constant(0xA54F_F53A)); + v.push(UInt32::constant(0x510E_527F)); + v.push(UInt32::constant(0x9B05_688C)); + v.push(UInt32::constant(0x1F83_D9AB)); + v.push(UInt32::constant(0x5BE0_CD19)); assert_eq!(v.len(), 16); @@ -348,19 +348,19 @@ pub fn blake2s>( assert!(input.len() % 8 == 0); let mut h = Vec::with_capacity(8); - h.push(UInt32::constant(0x6A09E667 ^ 0x01010000 ^ 32)); - h.push(UInt32::constant(0xBB67AE85)); - h.push(UInt32::constant(0x3C6EF372)); - h.push(UInt32::constant(0xA54FF53A)); - h.push(UInt32::constant(0x510E527F)); - h.push(UInt32::constant(0x9B05688C)); + h.push(UInt32::constant(0x6A09_E667 ^ 0x0101_0000 ^ 32)); + h.push(UInt32::constant(0xBB67_AE85)); + h.push(UInt32::constant(0x3C6E_F372)); + h.push(UInt32::constant(0xA54F_F53A)); + h.push(UInt32::constant(0x510E_527F)); + h.push(UInt32::constant(0x9B05_688C)); // Personalization is stored here h.push(UInt32::constant( - 0x1F83D9AB ^ LittleEndian::read_u32(&personalization[0..4]), + 0x1F83_D9AB ^ LittleEndian::read_u32(&personalization[0..4]), )); h.push(UInt32::constant( - 0x5BE0CD19 ^ LittleEndian::read_u32(&personalization[4..8]), + 0x5BE0_CD19 ^ LittleEndian::read_u32(&personalization[4..8]), )); let mut blocks: Vec> = vec![]; diff --git a/src/gpu/fft.rs b/src/gpu/fft.rs index 778ec7919..373c21215 100644 --- a/src/gpu/fft.rs +++ b/src/gpu/fft.rs @@ -1,6 +1,7 @@ +use crate::gpu::get_platform; use crate::gpu::{ error::{GPUError, GPUResult}, - locks, sources, structs, GPU_NVIDIA_DEVICES, + get_devices, locks, sources, structs, }; use ff::Field; use log::info; @@ -33,14 +34,25 @@ where { pub fn create(n: u32, priority: bool) -> GPUResult> { let lock = locks::GPULock::lock(); - let src = sources::kernel::(); - let devices = &GPU_NVIDIA_DEVICES; + + let platform = get_platform(None)?; + info!("Platform selected: {}", platform.name()?); + + let devices = get_devices(&platform).unwrap_or_default(); if devices.is_empty() { return Err(GPUError::Simple("No working GPUs found!")); } - let device = devices[0]; // Select the first device for FFT - let pq = ProQue::builder().device(device).src(src).dims(n).build()?; + + // Select the first device for FFT + let device = devices[0]; + + let pq = ProQue::builder() + .platform(platform) + .device(device) + .src(src) + .dims(n) + .build()?; let srcbuff = Buffer::builder() .queue(pq.queue().clone()) diff --git a/src/gpu/fft/fft.cl b/src/gpu/fft/fft.cl index 8ee00eb71..b340a9d42 100644 --- a/src/gpu/fft/fft.cl +++ b/src/gpu/fft/fft.cl @@ -37,7 +37,7 @@ __kernel void radix_fft(__global FIELD* x, // Source buffer uint counte = counts + count / lsize; // Compute powers of twiddle - FIELD twiddle = FIELD_pow_lookup(omegas, (n >> lgp >> deg) * k); + const FIELD twiddle = FIELD_pow_lookup(omegas, 
(n >> lgp >> deg) * k); FIELD tmp = FIELD_pow(twiddle, counts); for(uint i = counts; i < counte; i++) { u[i] = FIELD_mul(tmp, x[i*t]); @@ -45,13 +45,13 @@ __kernel void radix_fft(__global FIELD* x, // Source buffer } barrier(CLK_LOCAL_MEM_FENCE); - uint pqshift = max_deg - deg; + const uint pqshift = max_deg - deg; for(uint rnd = 0; rnd < deg; rnd++) { - uint bit = counth >> rnd; + const uint bit = counth >> rnd; for(uint i = counts >> 1; i < counte >> 1; i++) { - uint di = i & (bit - 1); - uint i0 = (i << 1) - di; - uint i1 = i0 + bit; + const uint di = i & (bit - 1); + const uint i0 = (i << 1) - di; + const uint i1 = i0 + bit; tmp = u[i0]; u[i0] = FIELD_add(u[i0], u[i1]); u[i1] = FIELD_sub(tmp, u[i1]); @@ -71,6 +71,6 @@ __kernel void radix_fft(__global FIELD* x, // Source buffer __kernel void mul_by_field(__global FIELD* elements, uint n, FIELD field) { - uint gid = get_global_id(0); + const uint gid = get_global_id(0); elements[gid] = FIELD_mul(elements[gid], field); -} \ No newline at end of file +} diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs index 9fac8620f..64228f07b 100644 --- a/src/gpu/mod.rs +++ b/src/gpu/mod.rs @@ -1,44 +1,45 @@ mod error; + pub use self::error::*; #[cfg(feature = "gpu")] mod locks; + #[cfg(feature = "gpu")] pub use self::locks::*; #[cfg(feature = "gpu")] mod sources; + #[cfg(feature = "gpu")] pub use self::sources::*; #[cfg(feature = "gpu")] mod utils; + #[cfg(feature = "gpu")] pub use self::utils::*; #[cfg(feature = "gpu")] mod structs; + #[cfg(feature = "gpu")] pub use self::structs::*; #[cfg(feature = "gpu")] mod fft; + #[cfg(feature = "gpu")] pub use self::fft::*; #[cfg(feature = "gpu")] mod multiexp; + #[cfg(feature = "gpu")] pub use self::multiexp::*; #[cfg(not(feature = "gpu"))] mod nogpu; + #[cfg(not(feature = "gpu"))] pub use self::nogpu::*; - -#[cfg(feature = "gpu")] -use ocl::Device; -#[cfg(feature = "gpu")] -lazy_static::lazy_static! { - pub static ref GPU_NVIDIA_DEVICES: Vec = get_devices(GPU_NVIDIA_PLATFORM_NAME).unwrap_or_default(); -} diff --git a/src/gpu/multiexp.rs b/src/gpu/multiexp.rs index 419bc9182..b0d668485 100644 --- a/src/gpu/multiexp.rs +++ b/src/gpu/multiexp.rs @@ -3,13 +3,14 @@ use super::locks; use super::sources; use super::structs; use super::utils; -use super::GPU_NVIDIA_DEVICES; +use crate::gpu::{get_devices, get_platform}; use crate::multicore::Worker; use crate::multiexp::{multiexp as cpu_multiexp, FullDensity}; use crossbeam::thread; use ff::{PrimeField, ScalarEngine}; use futures::Future; use groupy::{CurveAffine, CurveProjective}; +use log::debug; use log::{error, info}; use ocl::{Buffer, Device, MemFlags, ProQue}; use paired::Engine; @@ -61,7 +62,7 @@ where fn calc_num_groups(core_count: usize, num_windows: usize) -> usize { // Observations show that we get the best performance when num_groups * num_windows ~= 2 * CUDA_CORES - return 2 * core_count / num_windows; + 2 * core_count / num_windows } fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize { @@ -79,7 +80,8 @@ fn calc_window_size(n: usize, exp_bits: usize, core_count: usize) -> usize { return w; } } - return MAX_WINDOW_SIZE; + + MAX_WINDOW_SIZE } fn calc_best_chunk_size(max_window_size: usize, core_count: usize, exp_bits: usize) -> usize { @@ -111,7 +113,11 @@ where { pub fn create(d: Device, priority: bool) -> GPUResult> { let src = sources::kernel::(); - let pq = ProQue::builder().device(d).src(src).dims(1).build()?; + + let platform = match d.info(ocl::enums::DeviceInfo::Platform)? 
{ + ocl::enums::DeviceInfoResult::Platform(p) => ocl::Platform::new(p), + _ => ocl::Platform::default(), + }; let exp_bits = std::mem::size_of::() * 8; let core_count = utils::get_core_count(d)?; @@ -121,6 +127,17 @@ where let n = std::cmp::min(max_n, best_n); let max_bucket_len = 1 << MAX_WINDOW_SIZE; + let pq = ProQue::builder() + .platform(platform) + .device(d) + .src(src) + .dims(1) + .build() + .map_err(|err| { + debug!("{:?}", err); + err + })?; + // Each group will have `num_windows` threads and as there are `num_groups` groups, there will // be `num_groups` * `num_windows` threads in total. // Each thread will use `num_groups` * `num_windows` * `bucket_len` buckets. @@ -172,7 +189,7 @@ where g2_bucket_buffer: g2buckbuff, g2_result_buffer: g2resbuff, exp_buffer: expbuff, - core_count: core_count, + core_count, n, priority, }) @@ -303,12 +320,18 @@ where pub fn create(priority: bool) -> GPUResult> { let lock = locks::GPULock::lock(); - let kernels: Vec<_> = GPU_NVIDIA_DEVICES + let platform = get_platform(None)?; + let devices = &get_devices(&platform).unwrap_or_default(); + + info!("Platform selected: {}", platform.name()?); + + let kernels: Vec<_> = devices .iter() .map(|d| SingleMultiexpKernel::::create(*d, priority)) .filter(|res| res.is_ok()) .map(|res| res.unwrap()) .collect(); + if kernels.is_empty() { return Err(GPUError::Simple("No working GPUs found!")); } @@ -325,10 +348,10 @@ where k.n ); } - return Ok(MultiexpKernel:: { + Ok(MultiexpKernel:: { kernels, _lock: lock, - }); + }) } pub fn multiexp( diff --git a/src/gpu/multiexp/ec.cl b/src/gpu/multiexp/ec.cl index a6a65214f..1eb2afbed 100644 --- a/src/gpu/multiexp/ec.cl +++ b/src/gpu/multiexp/ec.cl @@ -16,18 +16,21 @@ typedef struct { // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l POINT_projective POINT_double(POINT_projective inp) { - if(FIELD_eq(inp.z, FIELD_ZERO)) return inp; - FIELD a = FIELD_sqr(inp.x); // A = X1^2 - FIELD b = FIELD_sqr(inp.y); // B = Y1^2 + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(inp.z, local_zero)) { + return inp; + } + + const FIELD a = FIELD_sqr(inp.x); // A = X1^2 + const FIELD b = FIELD_sqr(inp.y); // B = Y1^2 FIELD c = FIELD_sqr(b); // C = B^2 // D = 2*((X1+B)2-A-C) FIELD d = FIELD_add(inp.x, b); d = FIELD_sqr(d); d = FIELD_sub(FIELD_sub(d, a), c); d = FIELD_double(d); - FIELD e = FIELD_add(FIELD_double(a), a); // E = 3*A - - FIELD f = FIELD_sqr(e); + const FIELD e = FIELD_add(FIELD_double(a), a); // E = 3*A + const FIELD f = FIELD_sqr(e); inp.z = FIELD_mul(inp.y, inp.z); inp.z = FIELD_double(inp.z); // Z3 = 2*Y1*Z1 inp.x = FIELD_sub(FIELD_sub(f, d), d); // X3 = F-2*D @@ -41,65 +44,70 @@ POINT_projective POINT_double(POINT_projective inp) { // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl POINT_projective POINT_add_mixed(POINT_projective a, POINT_affine b) { - if(b.inf) return a; + if(b.inf) { + return a; + } - if(FIELD_eq(a.z, FIELD_ZERO)) { + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(a.z, local_zero)) { + const FIELD local_one = FIELD_ONE; a.x = b.x; a.y = b.y; - a.z = FIELD_ONE; + a.z = local_one; return a; } - FIELD z1z1 = FIELD_sqr(a.z); - FIELD u2 = FIELD_mul(b.x, z1z1); - FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); + const FIELD z1z1 = FIELD_sqr(a.z); + const FIELD u2 = FIELD_mul(b.x, z1z1); + const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); - if(FIELD_eq(a.x, u2) && FIELD_eq(a.y, s2)) - return POINT_double(a); - else { - FIELD h = FIELD_sub(u2, a.x); // H = U2-X1 - FIELD hh = 
FIELD_sqr(h); // HH = H^2 - FIELD i = FIELD_double(hh); i = FIELD_double(i); // I = 4*HH - FIELD j = FIELD_mul(h, i); // J = H*I - FIELD r = FIELD_sub(s2, a.y); r = FIELD_double(r); // r = 2*(S2-Y1) - FIELD v = FIELD_mul(a.x, i); + if(FIELD_eq(a.x, u2) && FIELD_eq(a.y, s2)) { + return POINT_double(a); + } - POINT_projective ret; + const FIELD h = FIELD_sub(u2, a.x); // H = U2-X1 + const FIELD hh = FIELD_sqr(h); // HH = H^2 + FIELD i = FIELD_double(hh); i = FIELD_double(i); // I = 4*HH + FIELD j = FIELD_mul(h, i); // J = H*I + FIELD r = FIELD_sub(s2, a.y); r = FIELD_double(r); // r = 2*(S2-Y1) + const FIELD v = FIELD_mul(a.x, i); - // X3 = r^2 - J - 2*V - ret.x = FIELD_sub(FIELD_sub(FIELD_sqr(r), j), FIELD_double(v)); + POINT_projective ret; - // Y3 = r*(V-X3)-2*Y1*J - j = FIELD_mul(a.y, j); j = FIELD_double(j); - ret.y = FIELD_sub(FIELD_mul(FIELD_sub(v, ret.x), r), j); + // X3 = r^2 - J - 2*V + ret.x = FIELD_sub(FIELD_sub(FIELD_sqr(r), j), FIELD_double(v)); - // Z3 = (Z1+H)^2-Z1Z1-HH - ret.z = FIELD_add(a.z, h); ret.z = FIELD_sub(FIELD_sub(FIELD_sqr(ret.z), z1z1), hh); - return ret; - } + // Y3 = r*(V-X3)-2*Y1*J + j = FIELD_mul(a.y, j); j = FIELD_double(j); + ret.y = FIELD_sub(FIELD_mul(FIELD_sub(v, ret.x), r), j); + + // Z3 = (Z1+H)^2-Z1Z1-HH + ret.z = FIELD_add(a.z, h); ret.z = FIELD_sub(FIELD_sub(FIELD_sqr(ret.z), z1z1), hh); + return ret; } // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl POINT_projective POINT_add(POINT_projective a, POINT_projective b) { - if(FIELD_eq(a.z, FIELD_ZERO)) return b; - if(FIELD_eq(b.z, FIELD_ZERO)) return a; + const FIELD local_zero = FIELD_ZERO; + if(FIELD_eq(a.z, local_zero)) return b; + if(FIELD_eq(b.z, local_zero)) return a; - FIELD z1z1 = FIELD_sqr(a.z); // Z1Z1 = Z1^2 - FIELD z2z2 = FIELD_sqr(b.z); // Z2Z2 = Z2^2 - FIELD u1 = FIELD_mul(a.x, z2z2); // U1 = X1*Z2Z2 - FIELD u2 = FIELD_mul(b.x, z1z1); // U2 = X2*Z1Z1 + const FIELD z1z1 = FIELD_sqr(a.z); // Z1Z1 = Z1^2 + const FIELD z2z2 = FIELD_sqr(b.z); // Z2Z2 = Z2^2 + const FIELD u1 = FIELD_mul(a.x, z2z2); // U1 = X1*Z2Z2 + const FIELD u2 = FIELD_mul(b.x, z1z1); // U2 = X2*Z1Z1 FIELD s1 = FIELD_mul(FIELD_mul(a.y, b.z), z2z2); // S1 = Y1*Z2*Z2Z2 - FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); // S2 = Y2*Z1*Z1Z1 + const FIELD s2 = FIELD_mul(FIELD_mul(b.y, a.z), z1z1); // S2 = Y2*Z1*Z1Z1 if(FIELD_eq(u1, u2) && FIELD_eq(s1, s2)) return POINT_double(a); else { - FIELD h = FIELD_sub(u2, u1); // H = U2-U1 + const FIELD h = FIELD_sub(u2, u1); // H = U2-U1 FIELD i = FIELD_double(h); i = FIELD_sqr(i); // I = (2*H)^2 - FIELD j = FIELD_mul(h, i); // J = H*I + const FIELD j = FIELD_mul(h, i); // J = H*I FIELD r = FIELD_sub(s2, s1); r = FIELD_double(r); // r = 2*(S2-S1) - FIELD v = FIELD_mul(u1, i); // V = U1*I + const FIELD v = FIELD_mul(u1, i); // V = U1*I a.x = FIELD_sub(FIELD_sub(FIELD_sub(FIELD_sqr(r), j), v), v); // X3 = r^2 - J - 2*V // Y3 = r*(V - X3) - 2*S1*J diff --git a/src/gpu/multiexp/field2.cl b/src/gpu/multiexp/field2.cl index ad387d825..634d785c7 100644 --- a/src/gpu/multiexp/field2.cl +++ b/src/gpu/multiexp/field2.cl @@ -34,9 +34,9 @@ FIELD2 FIELD2_double(FIELD2 a) { * c_1 = (a_0 * b_1 + a_1 * b_0) = (a_0 + a_1) * (b_0 + b_1) - a_0 * b_0 - a_1 * b_1 */ FIELD2 FIELD2_mul(FIELD2 a, FIELD2 b) { - FIELD aa = FIELD_mul(a.c0, b.c0); - FIELD bb = FIELD_mul(a.c1, b.c1); - FIELD o = FIELD_add(b.c0, b.c1); + const FIELD aa = FIELD_mul(a.c0, b.c0); + const FIELD bb = FIELD_mul(a.c1, b.c1); + const FIELD o = FIELD_add(b.c0, b.c1); a.c1 = FIELD_add(a.c1, a.c0); a.c1 = 
FIELD_mul(a.c1, o); a.c1 = FIELD_sub(a.c1, aa); @@ -52,8 +52,8 @@ FIELD2 FIELD2_mul(FIELD2 a, FIELD2 b) { * c_1 = 2 * a_0 * a_1 */ FIELD2 FIELD2_sqr(FIELD2 a) { - FIELD ab = FIELD_mul(a.c0, a.c1); - FIELD c0c1 = FIELD_add(a.c0, a.c1); + const FIELD ab = FIELD_mul(a.c0, a.c1); + const FIELD c0c1 = FIELD_add(a.c0, a.c1); a.c0 = FIELD_mul(FIELD_sub(a.c0, a.c1), c0c1); a.c1 = FIELD_double(ab); return a; diff --git a/src/gpu/multiexp/multiexp.cl b/src/gpu/multiexp/multiexp.cl index a4072c779..f950ace3c 100644 --- a/src/gpu/multiexp/multiexp.cl +++ b/src/gpu/multiexp/multiexp.cl @@ -19,24 +19,26 @@ __kernel void POINT_bellman_multiexp( uint window_size) { // We have `num_windows` * `num_groups` threads per multiexp. - uint gid = get_global_id(0); + const uint gid = get_global_id(0); if(gid >= num_windows * num_groups) return; // We have (2^window_size - 1) buckets. - uint bucket_len = ((1 << window_size) - 1); + const uint bucket_len = ((1 << window_size) - 1); // Each thread has its own set of buckets in global memory. buckets += bucket_len * gid; - for(uint i = 0; i < bucket_len; i++) buckets[i] = POINT_ZERO; - uint len = (uint)ceil(n / (float)num_groups); // Num of elements in each group + const POINT_projective local_zero = POINT_ZERO; + for(uint i = 0; i < bucket_len; i++) buckets[i] = local_zero; + + const uint len = (uint)ceil(n / (float)num_groups); // Num of elements in each group // This thread runs the multiexp algorithm on elements from `nstart` to `nened` // on the window [`bits`, `bits` + `w`) - uint nstart = len * (gid / num_windows); - uint nend = min(nstart + len, n); - uint bits = (gid % num_windows) * window_size; - ushort w = min((ushort)window_size, (ushort)(EXPONENT_BITS - bits)); + const uint nstart = len * (gid / num_windows); + const uint nend = min(nstart + len, n); + const uint bits = (gid % num_windows) * window_size; + const ushort w = min((ushort)window_size, (ushort)(EXPONENT_BITS - bits)); POINT_projective res = POINT_ZERO; for(uint i = nstart; i < nend; i++) { diff --git a/src/gpu/sources.rs b/src/gpu/sources.rs index 18b8657fb..631357cc4 100644 --- a/src/gpu/sources.rs +++ b/src/gpu/sources.rs @@ -1,4 +1,5 @@ use ff_cl_gen as ffgen; +use log::debug; use paired::Engine; // Instead of having a very large OpenCL program written for a specific curve, with a lot of @@ -11,25 +12,25 @@ static EC_SRC: &str = include_str!("multiexp/ec.cl"); static MULTIEXP_SRC: &str = include_str!("multiexp/multiexp.cl"); fn field2(field2: &str, field: &str) -> String { - return String::from(FIELD2_SRC) + String::from(FIELD2_SRC) .replace("FIELD2", field2) - .replace("FIELD", field); + .replace("FIELD", field) } fn fft(field: &str) -> String { - return String::from(FFT_SRC).replace("FIELD", field); + String::from(FFT_SRC).replace("FIELD", field) } fn ec(field: &str, point: &str) -> String { - return String::from(EC_SRC) + String::from(EC_SRC) .replace("FIELD", field) - .replace("POINT", point); + .replace("POINT", point) } fn multiexp(point: &str, exp: &str) -> String { - return String::from(MULTIEXP_SRC) + String::from(MULTIEXP_SRC) .replace("POINT", point) - .replace("EXPONENT", exp); + .replace("EXPONENT", exp) } // WARNING: This function works only with Short Weierstrass Jacobian curves with Fq2 extension field. 
@@ -37,8 +38,7 @@ pub fn kernel<E>() -> String
 where
     E: Engine,
 {
-    return String::from(format!(
-        "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}",
+    vec![
         ffgen::field::<E::Fr>("Fr"),
         fft("Fr"),
         ffgen::field::<E::Fq>("Fq"),
@@ -46,6 +46,7 @@ where
         multiexp("G1", "Fr"),
         field2("Fq2", "Fq"),
         ec("Fq2", "G2"),
-        multiexp("G2", "Fr")
-    ));
+        multiexp("G2", "Fr"),
+    ]
+    .join("\n\n")
 }
diff --git a/src/gpu/utils.rs b/src/gpu/utils.rs
index 596eeaf25..8e7e15613 100644
--- a/src/gpu/utils.rs
+++ b/src/gpu/utils.rs
@@ -6,26 +6,59 @@ use std::collections::HashMap;
 use std::env;
 
 pub const GPU_NVIDIA_PLATFORM_NAME: &str = "NVIDIA CUDA";
-// pub const CPU_INTEL_PLATFORM_NAME: &str = "Intel(R) CPU Runtime for OpenCL(TM) Applications";
+pub const GPU_AMD_PLATFORM_NAME: &str = "AMD Accelerated Parallel Processing";
+//pub const CPU_INTEL_PLATFORM_NAME: &str = "Intel(R) CPU Runtime for OpenCL(TM) Applications";
 
-pub fn get_devices(platform_name: &str) -> GPUResult<Vec<Device>> {
+fn find_platform(platform_name: &str) -> GPUResult<Platform> {
     if env::var("BELLMAN_NO_GPU").is_ok() {
         return Err(GPUError::Simple("GPU accelerator is disabled!"));
     }
+
     let platform = Platform::list()?.into_iter().find(|&p| match p.name() {
-        Ok(p) => p == platform_name,
+        Ok(p) => p == platform_name.to_string(),
         Err(_) => false,
     });
+
     match platform {
-        Some(p) => Ok(Device::list_all(p)?),
+        Some(p) => Ok(p),
         None => Err(GPUError::Simple("GPU platform not found!")),
     }
 }
 
+pub fn get_platform(platform_name: Option<&str>) -> GPUResult<Platform> {
+    if platform_name.is_none() {
+        // Retrieve platform name from environment variable
+        info!("Platform not set by source code");
+
+        let platform_environment = match env::var("BELLMAN_PLATFORM") {
+            Ok(p) => {
+                info!("Platform set by environment: {}", p);
+                p
+            }
+            Err(_) => GPU_NVIDIA_PLATFORM_NAME.to_string(),
+        };
+
+        return find_platform(&platform_environment.as_str());
+    }
+
+    info!("Platform set by source code: {}", platform_name.unwrap());
+    find_platform(&platform_name.unwrap())
+}
+
+pub fn get_devices(platform: &Platform) -> GPUResult<Vec<Device>> {
+    if env::var("BELLMAN_NO_GPU").is_ok() {
+        return Err(GPUError::Simple("GPU accelerator is disabled!"));
+    }
+    Ok(Device::list_all(platform)?)
+}
+
 lazy_static::lazy_static!
{
     static ref CORE_COUNTS: HashMap<String, usize> = {
         let mut core_counts : HashMap<String, usize> = vec![
+            // AMD
+            ("gfx1010".to_string(), 2560),
+
+            // NVIDIA
             ("Quadro RTX 6000".to_string(), 4608),
             ("TITAN RTX".to_string(), 4608),
@@ -89,3 +122,20 @@ pub fn get_memory(d: Device) -> GPUResult<u64> {
         _ => Err(GPUError::Simple("Cannot extract GPU memory!")),
     }
 }
+
+pub fn dump_device_list() {
+    for p in Platform::list().unwrap_or_default().iter() {
+        info!("Platform: {:?} - {:?}", p.name(), p.as_ptr());
+        for d in Device::list_all(p).unwrap_or_default().iter() {
+            let info_kind = ocl::enums::DeviceInfo::MaxComputeUnits;
+            let dev_info = d.info(info_kind).unwrap();
+            info!("\tDevice: {:?} {:?}", d.name(), dev_info);
+        }
+    }
+}
+
+#[cfg(feature = "gpu")]
+#[test]
+pub fn test_list_platform() {
+    dump_device_list();
+}
diff --git a/src/lib.rs b/src/lib.rs
index 4d6b7c531..b22c46b1d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -148,10 +148,6 @@ pub mod multicore;
 pub mod multiexp;
 pub mod util_cs;
 
-
-#[cfg(feature = "gpu")]
-pub use gpu::GPU_NVIDIA_DEVICES;
-
 use ff::{Field, ScalarEngine};
 
 use ahash::AHashMap as HashMap;
@@ -159,7 +155,7 @@ use std::io;
 use std::marker::PhantomData;
 use std::ops::{Add, Sub};
 
-const BELLMAN_VERSION: &'static str = env!("CARGO_PKG_VERSION");
+const BELLMAN_VERSION: &str = env!("CARGO_PKG_VERSION");
 
 /// Computations are expressed in terms of arithmetic circuits, in particular
 /// rank-1 quadratic constraint systems. The `Circuit` trait represents a
diff --git a/src/multiexp.rs b/src/multiexp.rs
index 747654556..7d76e1f1b 100644
--- a/src/multiexp.rs
+++ b/src/multiexp.rs
@@ -421,6 +421,9 @@ pub fn gpu_multiexp_consistency() {
     use paired::bls12_381::Bls12;
     use std::time::Instant;
 
+    let _ = env_logger::try_init();
+    gpu::dump_device_list();
+
     const MAX_LOG_D: usize = 20;
     const START_LOG_D: usize = 10;
     let mut kern = Some(gpu::LockedMultiexpKernel::<Bls12>::new(MAX_LOG_D, false));
diff --git a/tests/gpu_provers.rs b/tests/gpu_provers.rs
index f48851b1c..99083661b 100644
--- a/tests/gpu_provers.rs
+++ b/tests/gpu_provers.rs
@@ -55,7 +55,7 @@ pub fn test_parallel_prover() {
     use std::thread;
     use std::time::{Duration, Instant};
 
-    env_logger::init();
+    let _ = env_logger::try_init();
 
     let rng = &mut thread_rng();
     println!("Initializing circuit...");