AMD Support (filecoin-project#92)
* Initial AMD support

* Fix name change

* additional debug logging

* addressing PR review comments

* fixing rustfmt issues

* fixing CI issues
jleni authored Jun 30, 2020
1 parent 8ce2bab commit 68d5af7
Showing 16 changed files with 361 additions and 196 deletions.
119 changes: 77 additions & 42 deletions README.md
@@ -11,38 +11,34 @@ booleans and number abstractions.
This fork contains GPU parallel acceleration of the FFT and Multiexponentiation algorithms in the groth16 prover codebase under a conditional compilation feature `#[cfg(feature = "gpu")]` and `gpu-test` for testing.
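
As a quick illustration of how such a cargo feature gate behaves (a minimal, hypothetical sketch — `fft_backend` is not an API of this crate), code built with `--features gpu` compiles the GPU path, otherwise the CPU fallback is used:

```rust
// Hypothetical sketch: selecting a code path at compile time via the `gpu` feature.
#[cfg(feature = "gpu")]
fn fft_backend() -> &'static str {
    "gpu" // compiled in only when built with `--features gpu`
}

#[cfg(not(feature = "gpu"))]
fn fft_backend() -> &'static str {
    "cpu" // fallback when the feature is disabled
}

fn main() {
    println!("FFT/multiexp backend compiled in: {}", fft_backend());
}
```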

### Requirements

- NVIDIA or AMD GPU Graphics Driver
- OpenCL

(For AMD devices we recommend [ROCm](https://rocm-documentation.readthedocs.io/en/latest/Installation_Guide/Installation-Guide.html).)

### Environment variables

The gpu extension contains some env vars that may be set externally to this library.

- `BELLMAN_NO_GPU`

  Will disable the GPU feature from the library and force usage of the CPU.

  ```rust
  // Example
  env::set_var("BELLMAN_NO_GPU", "1");
  ```

- `BELLMAN_PLATFORM`

  Can be used to select the default OpenCL platform:

  ```rust
  // Example
  env::set_var("BELLMAN_PLATFORM", "AMD Accelerated Parallel Processing");
  ```

  Some possible values:

  - NVIDIA CUDA
  - AMD Accelerated Parallel Processing

  If not set, and the code does not select any platform, "NVIDIA CUDA" will be selected.

- `BELLMAN_CUSTOM_GPU`

  Allows adding a GPU that is not in the tested list. This requires looking up the name of the GPU device and its number of cores, given in the format `["name:cores"]`.

  ```rust
  // Example
  env::set_var("BELLMAN_CUSTOM_GPU", "GeForce RTX 2080 Ti:4352, GeForce GTX 1060:1280");
  ```

- `BELLMAN_CPU_UTILIZATION`

  Can be set in the interval [0,1] to designate the proportion of the multiexponentiation calculation that is moved to the CPU, in parallel with the GPU, to keep all hardware occupied (a small illustrative sketch follows this list).

  ```rust
  // Example
  env::set_var("BELLMAN_CPU_UTILIZATION", "0.5");
  ```

- `BELLMAN_VERIFIER`

  ```rust
  // Example
  env::set_var("BELLMAN_VERIFIER", "gpu");
  ```
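
To make the meaning of `BELLMAN_CPU_UTILIZATION` concrete, here is a minimal, hypothetical sketch (not the crate's actual scheduler) of splitting a multiexponentiation workload between CPU and GPU according to such a ratio:

```rust
use std::env;

// Hypothetical illustration only: divide `n` multiexp terms between CPU and GPU
// according to BELLMAN_CPU_UTILIZATION in [0, 1]. The library does this internally;
// this sketch just shows the intended interpretation of the variable.
fn split_work(n: usize) -> (usize, usize) {
    let ratio = env::var("BELLMAN_CPU_UTILIZATION")
        .ok()
        .and_then(|v| v.parse::<f64>().ok())
        .map(|r| r.max(0.0).min(1.0))
        .unwrap_or(0.0);
    let cpu_terms = (n as f64 * ratio) as usize;
    (cpu_terms, n - cpu_terms) // (terms handled on CPU, terms handled on GPU)
}

fn main() {
    env::set_var("BELLMAN_CPU_UTILIZATION", "0.5");
    let (cpu, gpu) = split_work(1 << 20);
    println!("CPU terms: {}, GPU terms: {}", cpu, gpu);
}
```
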
#### Supported / Tested Cards

Depending on the size of the proof being passed to the GPU for work, certain cards will not be able to allocate enough memory for either the FFT or the Multiexp kernel. Below is a list of devices that work for small sets. In the future we will add the cutoff point at which a given card can no longer allocate enough memory to utilize the GPU.

| Device Name            | Cores | Comments       |
|------------------------|-------|----------------|
| Quadro RTX 6000        | 4608  |                |
| TITAN RTX              | 4608  |                |
| Tesla V100             | 5120  |                |
| Tesla P100             | 3584  |                |
| Tesla T4               | 2560  |                |
| Quadro M5000           | 2048  |                |
| GeForce RTX 2080 Ti    | 4352  |                |
| GeForce RTX 2080 SUPER | 3072  |                |
| GeForce RTX 2080       | 2944  |                |
| GeForce RTX 2070 SUPER | 2560  |                |
| GeForce GTX 1080 Ti    | 3584  |                |
| GeForce GTX 1080       | 2560  |                |
| GeForce GTX 2060       | 1920  |                |
| GeForce GTX 1660 Ti    | 1536  |                |
| GeForce GTX 1060       | 1280  |                |
| GeForce GTX 1650 SUPER | 1280  |                |
| GeForce GTX 1650       | 896   |                |
| gfx1010                | 2560  | AMD RX 5700 XT |
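
The `Cores` column above is exactly the value `BELLMAN_CUSTOM_GPU` expects for an untested card. As a hypothetical sketch (not the crate's own parser), the `"name:cores"` list could be interpreted like this:

```rust
// Hypothetical sketch: parse a BELLMAN_CUSTOM_GPU-style specification such as
// "GeForce RTX 2080 Ti:4352, GeForce GTX 1060:1280" into (name, cores) pairs.
fn parse_custom_gpus(spec: &str) -> Vec<(String, usize)> {
    spec.split(',')
        .filter_map(|entry| {
            let (name, cores) = entry.trim().rsplit_once(':')?;
            Some((name.trim().to_string(), cores.trim().parse().ok()?))
        })
        .collect()
}

fn main() {
    let gpus = parse_custom_gpus("GeForce RTX 2080 Ti:4352, GeForce GTX 1060:1280");
    assert_eq!(gpus[0], ("GeForce RTX 2080 Ti".to_string(), 4352));
    println!("{:?}", gpus);
}
```
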
### Running Tests
To run the multiexp_consistency test you can use:
```bash
RUST_LOG=info cargo test --features gpu -- --exact multiexp::gpu_multiexp_consistency --nocapture
```
("Device_Name", Cores),
("Quadro RTX 6000", 4608),
("TITAN RTX", 4608),
("Tesla V100", 5120),
("Tesla P100", 3584),
("Tesla T4", 2560),
("Quadro M5000", 2048),
("GeForce RTX 2080 Ti", 4352),
("GeForce RTX 2080 SUPER", 3072),
("GeForce RTX 2080", 2944),
("GeForce RTX 2070 SUPER", 2560),
("GeForce GTX 1080 Ti", 3584),
("GeForce GTX 1080", 2560),
("GeForce GTX 2060", 1920),
("GeForce GTX 1660 Ti", 1536),
("GeForce GTX 1060", 1280),
("GeForce GTX 1650 SUPER", 1280),
("GeForce GTX 1650", 896),

To run on a specific platform you can do:

```bash
export BELLMAN_PLATFORM="AMD Accelerated Parallel Processing"
RUST_LOG=info cargo test --features gpu -- --exact multiexp::gpu_multiexp_consistency --nocapture
```

## License

Licensed under either of

- Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
  http://www.apache.org/licenses/LICENSE-2.0)
- MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.

21 changes: 21 additions & 0 deletions README_AMD.md
@@ -0,0 +1,21 @@
# Using AMD GPUs

## Prerequisites

- [Install ROCm 3.5](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html#supported-operating-systems)

## Running tests

The environment variable `BELLMAN_PLATFORM` determines which backend will be used.

To use the AMD backend, you can do something like:

```bash
export BELLMAN_PLATFORM="AMD Accelerated Parallel Processing"
RUST_LOG=info cargo test --features gpu -- --exact multiexp::gpu_multiexp_consistency --nocapture
```

## Notes

- We had trouble on Ubuntu 20.04 when running a single computer with both NVIDIA and AMD cards installed.
- The initial kernel compilation may take more than 60 seconds at startup. This is not a problem afterwards. A possible mitigation would be to add kernel binary caching to the ocl-fil crate (a rough sketch follows).
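
A rough, hypothetical sketch of such a cache (the file layout, hashing, and the `compile` callback are assumptions; the actual integration with ocl-fil would look different):

```rust
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::PathBuf;

// Hypothetical on-disk cache for compiled kernel binaries, keyed by a hash of the
// kernel source. `compile` stands in for the slow OpenCL build step.
fn cached_kernel_binary(source: &str, compile: impl Fn(&str) -> Vec<u8>) -> Vec<u8> {
    let mut hasher = DefaultHasher::new();
    source.hash(&mut hasher);
    let path = PathBuf::from(format!("/tmp/kernel-cache-{:016x}.bin", hasher.finish()));

    if let Ok(bin) = fs::read(&path) {
        return bin; // cache hit: skip recompilation entirely
    }
    let bin = compile(source);
    let _ = fs::write(&path, &bin); // best effort; ignore cache-write failures
    bin
}

fn main() {
    let bin = cached_kernel_binary("__kernel void noop() {}", |src| src.as_bytes().to_vec());
    println!("kernel binary: {} bytes", bin.len());
}
```
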
83 changes: 48 additions & 35 deletions src/domain.rs
@@ -576,44 +576,57 @@ where
}

#[cfg(feature = "gpu")]
#[test]
pub fn gpu_fft_consistency() {
use paired::bls12_381::{Bls12, Fr};
use std::time::Instant;
let rng = &mut rand::thread_rng();

let worker = Worker::new();
let log_cpus = worker.log_num_cpus();
let mut kern = gpu::FFTKernel::create(1 << 24, false).expect("Cannot initialize kernel!");

for log_d in 1..25 {
let d = 1 << log_d;
#[cfg(test)]
mod tests {
use crate::domain::{gpu_fft, parallel_fft, serial_fft, EvaluationDomain, Scalar};
use crate::gpu;
use crate::multicore::Worker;
use ff::Field;

#[test]
pub fn gpu_fft_consistency() {
let _ = env_logger::try_init();
gpu::dump_device_list();

use paired::bls12_381::{Bls12, Fr};
use std::time::Instant;
let rng = &mut rand::thread_rng();

let elems = (0..d)
.map(|_| Scalar::<Bls12>(Fr::random(rng)))
.collect::<Vec<_>>();
let mut v1 = EvaluationDomain::from_coeffs(elems.clone()).unwrap();
let mut v2 = EvaluationDomain::from_coeffs(elems.clone()).unwrap();

println!("Testing FFT for {} elements...", d);
let worker = Worker::new();
let log_cpus = worker.log_num_cpus();
let mut kern = gpu::FFTKernel::create(1 << 24, false).expect("Cannot initialize kernel!");

for log_d in 1..25 {
let d = 1 << log_d;

let elems = (0..d)
.map(|_| Scalar::<Bls12>(Fr::random(rng)))
.collect::<Vec<_>>();
let mut v1 = EvaluationDomain::from_coeffs(elems.clone()).unwrap();
let mut v2 = EvaluationDomain::from_coeffs(elems.clone()).unwrap();

println!("Testing FFT for {} elements...", d);

let mut now = Instant::now();
gpu_fft(&mut kern, &mut v1.coeffs, &v1.omega, log_d).expect("GPU FFT failed!");
let gpu_dur =
now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64;
println!("GPU took {}ms.", gpu_dur);

now = Instant::now();
if log_d <= log_cpus {
serial_fft(&mut v2.coeffs, &v2.omega, log_d);
} else {
parallel_fft(&mut v2.coeffs, &worker, &v2.omega, log_d, log_cpus);
}
let cpu_dur =
now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64;
println!("CPU ({} cores) took {}ms.", 1 << log_cpus, cpu_dur);

let mut now = Instant::now();
gpu_fft(&mut kern, &mut v1.coeffs, &v1.omega, log_d).expect("GPU FFT failed!");
let gpu_dur = now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64;
println!("GPU took {}ms.", gpu_dur);
println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);

now = Instant::now();
if log_d <= log_cpus {
serial_fft(&mut v2.coeffs, &v2.omega, log_d);
} else {
parallel_fft(&mut v2.coeffs, &worker, &v2.omega, log_d, log_cpus);
assert!(v1.coeffs == v2.coeffs);
println!("============================");
}
let cpu_dur = now.elapsed().as_secs() * 1000 as u64 + now.elapsed().subsec_millis() as u64;
println!("CPU ({} cores) took {}ms.", 1 << log_cpus, cpu_dur);

println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);

assert!(v1.coeffs == v2.coeffs);
println!("============================");
}
}
32 changes: 16 additions & 16 deletions src/gadgets/blake2s.rs
@@ -184,14 +184,14 @@ fn blake2s_compression<E: ScalarEngine, CS: ConstraintSystem<E>>(

let mut v = Vec::with_capacity(16);
v.extend_from_slice(h);
v.push(UInt32::constant(0x6A09_E667));
v.push(UInt32::constant(0xBB67_AE85));
v.push(UInt32::constant(0x3C6E_F372));
v.push(UInt32::constant(0xA54F_F53A));
v.push(UInt32::constant(0x510E_527F));
v.push(UInt32::constant(0x9B05_688C));
v.push(UInt32::constant(0x1F83_D9AB));
v.push(UInt32::constant(0x5BE0_CD19));

assert_eq!(v.len(), 16);

@@ -348,19 +348,19 @@ pub fn blake2s<E: ScalarEngine, CS: ConstraintSystem<E>>(
assert!(input.len() % 8 == 0);

let mut h = Vec::with_capacity(8);
h.push(UInt32::constant(0x6A09_E667 ^ 0x0101_0000 ^ 32));
h.push(UInt32::constant(0xBB67_AE85));
h.push(UInt32::constant(0x3C6E_F372));
h.push(UInt32::constant(0xA54F_F53A));
h.push(UInt32::constant(0x510E_527F));
h.push(UInt32::constant(0x9B05_688C));

// Personalization is stored here
h.push(UInt32::constant(
    0x1F83_D9AB ^ LittleEndian::read_u32(&personalization[0..4]),
));
h.push(UInt32::constant(
    0x5BE0_CD19 ^ LittleEndian::read_u32(&personalization[4..8]),
));

let mut blocks: Vec<Vec<UInt32>> = vec![];
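
The changes shown in this file are cosmetic: Rust ignores underscores in numeric literals, so the regrouped constants are bit-for-bit identical to the originals, for example:

```rust
fn main() {
    // Underscores in Rust numeric literals are purely visual separators.
    assert_eq!(0x6A09_E667u32, 0x6A09E667u32);
    assert_eq!(0x5BE0_CD19u32, 0x5BE0CD19u32);
    println!("grouped and ungrouped hex literals are equal");
}
```
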
22 changes: 17 additions & 5 deletions src/gpu/fft.rs
@@ -1,6 +1,7 @@
use crate::gpu::get_platform;
use crate::gpu::{
error::{GPUError, GPUResult},
    get_devices, locks, sources, structs,
};
use ff::Field;
use log::info;
@@ -33,14 +34,25 @@
{
    pub fn create(n: u32, priority: bool) -> GPUResult<FFTKernel<E>> {
        let lock = locks::GPULock::lock();

        let src = sources::kernel::<E>();

        let platform = get_platform(None)?;
        info!("Platform selected: {}", platform.name()?);

        let devices = get_devices(&platform).unwrap_or_default();
        if devices.is_empty() {
            return Err(GPUError::Simple("No working GPUs found!"));
        }

        // Select the first device for FFT
        let device = devices[0];

        let pq = ProQue::builder()
            .platform(platform)
            .device(device)
            .src(src)
            .dims(n)
            .build()?;

        let srcbuff = Buffer::builder()
            .queue(pq.queue().clone())
16 changes: 8 additions & 8 deletions src/gpu/fft/fft.cl
@@ -37,21 +37,21 @@ __kernel void radix_fft(__global FIELD* x, // Source buffer
uint counte = counts + count / lsize;

// Compute powers of twiddle
const FIELD twiddle = FIELD_pow_lookup(omegas, (n >> lgp >> deg) * k);
FIELD tmp = FIELD_pow(twiddle, counts);
for(uint i = counts; i < counte; i++) {
u[i] = FIELD_mul(tmp, x[i*t]);
tmp = FIELD_mul(tmp, twiddle);
}
barrier(CLK_LOCAL_MEM_FENCE);

const uint pqshift = max_deg - deg;
for(uint rnd = 0; rnd < deg; rnd++) {
const uint bit = counth >> rnd;
for(uint i = counts >> 1; i < counte >> 1; i++) {
const uint di = i & (bit - 1);
const uint i0 = (i << 1) - di;
const uint i1 = i0 + bit;
tmp = u[i0];
u[i0] = FIELD_add(u[i0], u[i1]);
u[i1] = FIELD_sub(tmp, u[i1]);
@@ -71,6 +71,6 @@ __kernel void mul_by_field(__global FIELD* elements,
__kernel void mul_by_field(__global FIELD* elements,
uint n,
FIELD field) {
const uint gid = get_global_id(0);
elements[gid] = FIELD_mul(elements[gid], field);
}
