fix: generate OpenCL source at compile time (filecoin-project#28)

The OpenCL kernel source could be generated at run-time, but with this change it makes it more similar to the CUDA version, where we need to compile the kernel at compile-time. This is to improve the symmetry between those features and should also simplify the code for future changes.
3for · Jul 7, 2022 · 1e24689 · 1e24689
1 parent cc48fd2
commit 1e24689
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 113 deletions.
diff --git a/ec-gpu-gen/build.rs b/ec-gpu-gen/build.rs
@@ -1,106 +1,152 @@
 /// The build script is needed to compile the CUDA kernel.
 ///
 /// It will compile the kernel at compile time if the `fft` and/or the `multiexp` features are
-/// enabled.
+/// enabled. It also generates the OpenCL source code at compile time.
+
+#[cfg(all(
+    any(feature = "fft", feature = "multiexp"),
+    not(feature = "cargo-clippy")
+))]
+#[path = "src/source.rs"]
+mod source;
+
+/// The build script is use to generate the CUDA kernel and OpenCL source at compile-time, if the
+/// if the `fft` and/or the `multiexp` features are enabled.
 #[cfg(all(
-    feature = "cuda",
     any(feature = "fft", feature = "multiexp"),
     not(feature = "cargo-clippy")
 ))]
 fn main() {
+    #[cfg(feature = "cuda")]
+    kernel::generate_cuda();
+    #[cfg(feature = "opencl")]
+    kernel::generate_opencl();
+}
+
+// This is a hack for the case when we run Clippy while we don't generate any GPU kernel. For
+// Clippy we don't need a proper source or properly compiled kernel, but just some arbitrary bytes.
+#[cfg(not(all(
+    any(feature = "fft", feature = "multiexp"),
+    not(feature = "cargo-clippy")
+)))]
+fn main() {
+    println!("cargo:rustc-env=CUDA_KERNEL_FATBIN=../build.rs");
+    println!("cargo:rustc-env=OPENCL_KERNEL_SOURCE=../build.rs");
+}
+
+// Put the code into a module, so that we need to repeat the feature flags less often.
+#[cfg(all(
+    any(feature = "fft", feature = "multiexp"),
+    not(feature = "cargo-clippy")
+))]
+mod kernel {
     use std::path::PathBuf;
-    use std::process::Command;
     use std::{env, fs};
 
     use blstrs::Bls12;
-    use sha2::{Digest, Sha256};
 
-    #[path = "src/source.rs"]
-    mod source;
-
-    // This is a hack for the case when the documentation is built on docs.rs. For the
-    // documentation  we don't need a properly compiled kernel, but just some arbitrary bytes.
-    if env::var("DOCS_RS").is_ok() {
-        println!("cargo:rustc-env=CUDA_KERNEL_FATBIN=../build.rs");
-        return;
-    }
+    #[cfg(feature = "cuda")]
+    pub(crate) fn generate_cuda() {
+        use sha2::{Digest, Sha256};
 
-    let kernel_source = source::gen_source::<Bls12, source::Limb32>();
-
-    let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
-
-    // Make it possible to override the default options. Though the source and output file is
-    // always set automatically.
-    let mut nvcc = match env::var("EC_GPU_CUDA_NVCC_ARGS") {
-        Ok(args) => execute::command(format!("nvcc {}", args)),
-        Err(_) => {
-            let mut command = Command::new("nvcc");
-            command
-                .arg("--optimize=6")
-                // Compile with as many threads as CPUs are available.
-                .arg("--threads=0")
-                .arg("--fatbin")
-                .arg("--gpu-architecture=sm_86")
-                .arg("--generate-code=arch=compute_86,code=sm_86")
-                .arg("--generate-code=arch=compute_80,code=sm_80")
-                .arg("--generate-code=arch=compute_75,code=sm_75");
-            command
+        // This is a hack for the case when the documentation is built on docs.rs. For the
+        // documentation  we don't need a properly compiled kernel, but just some arbitrary bytes.
+        if env::var("DOCS_RS").is_ok() {
+            println!("cargo:rustc-env=CUDA_KERNEL_FATBIN=../build.rs");
+            return;
         }
-    };
-
-    // Hash the source and and the compile flags. Use that as the filename, so that the kernel is
-    // only rebuilt if any of them change.
-    let mut hasher = Sha256::new();
-    hasher.update(kernel_source.as_bytes());
-    hasher.update(&format!("{:?}", &nvcc));
-    let kernel_digest = hex::encode(hasher.finalize());
-
-    let source_path: PathBuf = [&out_dir, &format!("{}.cu", &kernel_digest)]
-        .iter()
-        .collect();
-    let fatbin_path: PathBuf = [&out_dir, &format!("{}.fatbin", &kernel_digest)]
-        .iter()
-        .collect();
-
-    fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
-        panic!(
-            "Cannot write kernel source at {}.",
-            source_path.to_str().unwrap()
-        )
-    });
-
-    // Only compile if the output doesn't exist yet.
-    if !fatbin_path.as_path().exists() {
-        let status = nvcc
-            .arg("--output-file")
-            .arg(&fatbin_path)
-            .arg(&source_path)
-            .status()
-            .expect("Cannot run nvcc. Install the NVIDIA toolkit or disable the `cuda` feature.");
-
-        if !status.success() {
+
+        let kernel_source = crate::source::gen_source::<Bls12, crate::source::Limb32>();
+        let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
+
+        // Make it possible to override the default options. Though the source and output file is
+        // always set automatically.
+        let mut nvcc = match env::var("EC_GPU_CUDA_NVCC_ARGS") {
+            Ok(args) => execute::command(format!("nvcc {}", args)),
+            Err(_) => {
+                let mut command = std::process::Command::new("nvcc");
+                command
+                    .arg("--optimize=6")
+                    // Compile with as many threads as CPUs are available.
+                    .arg("--threads=0")
+                    .arg("--fatbin")
+                    .arg("--gpu-architecture=sm_86")
+                    .arg("--generate-code=arch=compute_86,code=sm_86")
+                    .arg("--generate-code=arch=compute_80,code=sm_80")
+                    .arg("--generate-code=arch=compute_75,code=sm_75");
+                command
+            }
+        };
+
+        // Hash the source and the compile flags. Use that as the filename, so that the kernel is only
+        // rebuilt if any of them change.
+        let mut hasher = Sha256::new();
+        hasher.update(kernel_source.as_bytes());
+        hasher.update(&format!("{:?}", &nvcc));
+        let kernel_digest = hex::encode(hasher.finalize());
+
+        let source_path: PathBuf = [&out_dir, &format!("{}.cu", &kernel_digest)]
+            .iter()
+            .collect();
+        let fatbin_path: PathBuf = [&out_dir, &format!("{}.fatbin", &kernel_digest)]
+            .iter()
+            .collect();
+
+        fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
             panic!(
-                "nvcc failed. See the kernel source at {}",
+                "Cannot write kernel source at {}.",
                 source_path.to_str().unwrap()
-            );
+            )
+        });
+
+        // Only compile if the output doesn't exist yet.
+        if !fatbin_path.as_path().exists() {
+            let status = nvcc
+                .arg("--output-file")
+                .arg(&fatbin_path)
+                .arg(&source_path)
+                .status()
+                .expect(
+                    "Cannot run nvcc. Install the NVIDIA toolkit or disable the `cuda` feature.",
+                );
+
+            if !status.success() {
+                panic!(
+                    "nvcc failed. See the kernel source at {}",
+                    source_path.to_str().unwrap()
+                );
+            }
         }
+
+        // The idea to put the path to the farbin into a compile-time env variable is from
+        // https://github.com/LutzCle/fast-interconnects-demo/blob/b80ea8e04825167f486ab8ac1b5d67cf7dd51d2c/rust-demo/build.rs
+        println!(
+            "cargo:rustc-env=CUDA_KERNEL_FATBIN={}",
+            fatbin_path.to_str().unwrap()
+        );
     }
 
-    // The idea to put the path to the farbin into a compile-time env variable is from
-    // https://github.com/LutzCle/fast-interconnects-demo/blob/b80ea8e04825167f486ab8ac1b5d67cf7dd51d2c/rust-demo/build.rs
-    println!(
-        "cargo:rustc-env=CUDA_KERNEL_FATBIN={}",
-        fatbin_path.to_str().unwrap()
-    );
-}
+    #[cfg(feature = "opencl")]
+    pub(crate) fn generate_opencl() {
+        let kernel_source = crate::source::gen_source::<Bls12, crate::source::Limb64>();
+        let out_dir = env::var("OUT_DIR").expect("OUT_DIR was not set.");
 
-#[cfg(not(all(
-    feature = "cuda",
-    any(feature = "fft", feature = "multiexp"),
-    not(feature = "cargo-clippy")
-)))]
-fn main() {
-    // This is a hack for the case when the `cuda` and `cargo-clippy` features are enabled. For
-    // Clippy we don't need a properly compiled kernel, but just some arbitrary bytes.
-    println!("cargo:rustc-env=CUDA_KERNEL_FATBIN=../build.rs");
+        // Generating the kernel source is cheap, hence use a fixed name and override it on every
+        // build.
+        let source_path: PathBuf = [&out_dir, "kernel.cl"].iter().collect();
+
+        fs::write(&source_path, &kernel_source).unwrap_or_else(|_| {
+            panic!(
+                "Cannot write kernel source at {}.",
+                source_path.to_str().unwrap()
+            )
+        });
+
+        // For OpenCL we only need the kernel source, it is compiled at runtime.
+        #[cfg(feature = "opencl")]
+        println!(
+            "cargo:rustc-env=OPENCL_KERNEL_SOURCE={}",
+            source_path.to_str().unwrap()
+        );
+    }
 }
diff --git a/ec-gpu-gen/src/fft.rs b/ec-gpu-gen/src/fft.rs
@@ -6,12 +6,12 @@ use ec_gpu::GpuEngine;
 use ff::Field;
 use log::{error, info};
 use pairing::Engine;
-use rust_gpu_tools::{program_closures, Device, LocalBuffer, Program, Vendor};
+use rust_gpu_tools::{program_closures, Device, LocalBuffer, Program};
 
 use crate::threadpool::THREAD_POOL;
 use crate::{
     error::{EcError, EcResult},
-    program, Limb32, Limb64,
+    program,
 };
 
 const LOG2_MAX_ELEMENTS: usize = 32; // At most 2^32 elements is supported.
@@ -40,11 +40,7 @@ impl<'a, E: Engine + GpuEngine> SingleFftKernel<'a, E> {
         device: &Device,
         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
     ) -> EcResult<Self> {
-        let source = match device.vendor() {
-            Vendor::Nvidia => crate::gen_source::<E, Limb32>(),
-            _ => crate::gen_source::<E, Limb64>(),
-        };
-        let program = program::program::<E>(device, &source)?;
+        let program = program::program(device)?;
 
         Ok(SingleFftKernel {
             program,

diff --git a/ec-gpu-gen/src/multiexp.rs b/ec-gpu-gen/src/multiexp.rs
@@ -7,14 +7,13 @@ use ff::PrimeField;
 use group::{prime::PrimeCurveAffine, Group};
 use log::{error, info};
 use pairing::Engine;
-use rust_gpu_tools::{program_closures, Device, Program, Vendor};
+use rust_gpu_tools::{program_closures, Device, Program};
 use yastl::Scope;
 
 use crate::{
     error::{EcError, EcResult},
     program,
     threadpool::Worker,
-    Limb32, Limb64,
 };
 
 /// On the GPU, the exponents are split into windows, this is the maximum number of such windows.
@@ -113,11 +112,7 @@ where
         let work_units = work_units(compute_units, compute_capability);
         let chunk_size = calc_chunk_size::<E>(mem, work_units);
 
-        let source = match device.vendor() {
-            Vendor::Nvidia => crate::gen_source::<E, Limb32>(),
-            _ => crate::gen_source::<E, Limb64>(),
-        };
-        let program = program::program::<E>(device, &source)?;
+        let program = program::program(device)?;
 
         Ok(SingleMultiexpKernel {
             program,

diff --git a/ec-gpu-gen/src/program.rs b/ec-gpu-gen/src/program.rs
@@ -1,8 +1,6 @@
 use std::env;
 
-use ec_gpu::GpuEngine;
 use log::info;
-use pairing::Engine;
 #[cfg(feature = "cuda")]
 use rust_gpu_tools::cuda;
 #[cfg(feature = "opencl")]
@@ -49,23 +47,13 @@ fn select_framework(default_framework: Framework) -> EcResult<Framework> {
 ///
 /// If the device supports CUDA, then CUDA is used, else OpenCL. You can force a selection with
 /// the environment variable `EC_GPU_FRAMEWORK`, which can be set either to `cuda` or `opencl`.
-pub fn program<E>(device: &Device, source: &str) -> EcResult<Program>
-where
-    E: Engine + GpuEngine,
-{
+pub fn program(device: &Device) -> EcResult<Program> {
     let framework = select_framework(device.framework())?;
-    program_use_framework::<E>(device, source, &framework)
+    program_use_framework(device, &framework)
 }
 
 /// Returns the program for the specified [`rust_gpu_tools::Framework`].
-pub fn program_use_framework<E>(
-    device: &Device,
-    #[allow(unused_variables)] source: &str,
-    framework: &Framework,
-) -> EcResult<Program>
-where
-    E: Engine + GpuEngine,
-{
+pub fn program_use_framework(device: &Device, framework: &Framework) -> EcResult<Program> {
     match framework {
         #[cfg(feature = "cuda")]
         Framework::Cuda => {
@@ -78,6 +66,7 @@ where
         #[cfg(feature = "opencl")]
         Framework::Opencl => {
             info!("Using kernel on OpenCL.");
+            let source = include_str!(env!("OPENCL_KERNEL_SOURCE"));
             let opencl_device = device.opencl_device().ok_or(GPUError::DeviceNotFound)?;
             let program = opencl::Program::from_opencl(opencl_device, source)?;
             Ok(Program::Opencl(program))