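Code review changes: key the chunk benchmark tables on std::pair instead of std::tuple, store chunk timings in std::unordered_map, reuse find() iterators instead of repeating map lookups, and assert m_batch_dims is non-empty before use.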

MarkBicknellONT · MarkBicknellONT · commit 542d9982440c · 2024-09-26T13:41:27.000+01:00
diff --git a/benchmark/generate_chunk_auto_batchsize_benchmarks.sh b/benchmark/generate_chunk_auto_batchsize_benchmarks.sh
@@ -63,7 +63,7 @@ echo "#include \"${gpu_name}.h\"
 
 namespace dorado::basecall {
 
-void Add${gpu_name_no_dashes}Benchmarks(std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks) {" >> ${gpu_name}.cpp
+void Add${gpu_name_no_dashes}Benchmarks(std::map<std::pair<std::string, std::string>, std::unordered_map<int, float>>& chunk_benchmarks) {" >> ${gpu_name}.cpp
 
 # Add the chunk benchmarks for every model 
 cat chunk_benchmarks__*.txt >> ${gpu_name}.cpp
@@ -77,9 +77,9 @@ echo "#pragma once
 
 #include <map>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 
 namespace dorado::basecall {
-    void Add${gpu_name_no_dashes}Benchmarks(std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks);
+    void Add${gpu_name_no_dashes}Benchmarks(std::map<std::pair<std::string, std::string>, std::unordered_map<int, float>>& chunk_benchmarks);
 } // namespace dorado::basecall
 " >> ${gpu_name}.h
diff --git a/dorado/basecall/CudaCaller.cpp b/dorado/basecall/CudaCaller.cpp
@@ -309,6 +309,7 @@ void CudaCaller::determine_batch_dims(const BasecallerCreationParams &params) {
 
     float best_time = std::numeric_limits<float>::max();
 
+    assert(m_batch_dims.size() > 0);
     int chunk_size = m_batch_dims.back().T_in;
     // We limit the maximum when doing benchmarking to avoid excessive startup time.
     // The limit for transformer models should be increased at a later time.
@@ -319,8 +320,8 @@ void CudaCaller::determine_batch_dims(const BasecallerCreationParams &params) {
     if (params.emit_batchsize_benchmarks) {
         // When we are emitting benchmarks, prefer accuracy over speed of benchmark generation, so run the benchmarks
         //  at full chunk size.  We must round down the requested chunk size to a multiple of the minimum granularity.
-        size_t chunk_granularity = params.model_config.chunk_size_granularity();
-        chunk_size = int((chunk_size / chunk_granularity) * chunk_granularity);
+        const size_t chunk_granularity = params.model_config.chunk_size_granularity();
+        chunk_size = static_cast<int>((chunk_size / chunk_granularity) * chunk_granularity);
     } else {
         // 288 * stride (much shorter than the default chunk size of 10k) is a somewhat arbitrary
         // trade-off between getting more accurate measurements and avoiding excessive startup time,
diff --git a/dorado/basecall/CudaChunkBenchmarks.cpp b/dorado/basecall/CudaChunkBenchmarks.cpp
@@ -25,8 +25,9 @@ std::optional<const CudaChunkBenchmarks::ChunkTimings> CudaChunkBenchmarks::get_
     ModelName model_name = std::filesystem::path(model_path).filename().string();
 
     // Try looking up the specified gpu name directly
-    if (m_chunk_benchmarks.find({gpu_name, model_name}) != m_chunk_benchmarks.end()) {
-        return m_chunk_benchmarks.at({gpu_name, model_name});
+    auto iter = m_chunk_benchmarks.find({gpu_name, model_name});
+    if (iter != m_chunk_benchmarks.cend()) {
+        return iter->second;
     }
 
     // If the direct lookup fails, try looking up via an alias
@@ -35,10 +36,11 @@ std::optional<const CudaChunkBenchmarks::ChunkTimings> CudaChunkBenchmarks::get_
             {"NVIDIA A800 80GB PCIe", "NVIDIA A100 80GB PCIe"},
     };
 
-    if (gpu_name_alias.find(gpu_name) != gpu_name_alias.end()) {
+    if (gpu_name_alias.find(gpu_name) != gpu_name_alias.cend()) {
         gpu_name = gpu_name_alias[gpu_name];
-        if (m_chunk_benchmarks.find({gpu_name, model_name}) != m_chunk_benchmarks.end()) {
-            return m_chunk_benchmarks.at({gpu_name, model_name});
+        iter = m_chunk_benchmarks.find({gpu_name, model_name});
+        if (iter != m_chunk_benchmarks.cend()) {
+            return iter->second;
         }
     }
 
diff --git a/dorado/basecall/CudaChunkBenchmarks.h b/dorado/basecall/CudaChunkBenchmarks.h
@@ -3,7 +3,7 @@
 #include <map>
 #include <optional>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -12,10 +12,10 @@ namespace dorado::basecall {
 class CudaChunkBenchmarks final {
 private:
     CudaChunkBenchmarks();
-    using ChunkTimings = std::map<int, float>;
+    using ChunkTimings = std::unordered_map<int, float>;
     using ModelName = std::string;
     using GPUName = std::string;
-    std::map<std::tuple<GPUName, ModelName>, ChunkTimings> m_chunk_benchmarks;
+    std::map<std::pair<GPUName, ModelName>, ChunkTimings> m_chunk_benchmarks;
 
 public:
     static CudaChunkBenchmarks& instance() {
diff --git a/dorado/basecall/benchmarks/NVIDIA_A100_80GB_PCIe.cpp b/dorado/basecall/benchmarks/NVIDIA_A100_80GB_PCIe.cpp
@@ -3,7 +3,8 @@
 namespace dorado::basecall {
 
 void AddNVIDIA_A100_80GB_PCIeBenchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks) {
+        std::map<std::pair<std::string, std::string>, std::unordered_map<int, float>>&
+                chunk_benchmarks) {
     chunk_benchmarks[{"NVIDIA A100 80GB PCIe", "dna_r10.4.1_e8.2_260bps_fast@v4.1.0"}] = {
             {64, 0.098176f},   {128, 0.054744f},  {192, 0.038688f},  {256, 0.032228f},
             {320, 0.027174f},  {384, 0.024059f},  {640, 0.022741f},  {704, 0.021700f},
diff --git a/dorado/basecall/benchmarks/NVIDIA_A100_80GB_PCIe.h b/dorado/basecall/benchmarks/NVIDIA_A100_80GB_PCIe.h
@@ -2,9 +2,9 @@
 
 #include <map>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 
 namespace dorado::basecall {
-void AddNVIDIA_A100_80GB_PCIeBenchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks);
+void AddNVIDIA_A100_80GB_PCIeBenchmarks(std::map<std::pair<std::string, std::string>,
+                                                 std::unordered_map<int, float>>& chunk_benchmarks);
 }  // namespace dorado::basecall
diff --git a/dorado/basecall/benchmarks/NVIDIA_H100_PCIe.cpp b/dorado/basecall/benchmarks/NVIDIA_H100_PCIe.cpp
@@ -2,8 +2,8 @@
 
 namespace dorado::basecall {
 
-void AddNVIDIA_H100_PCIeBenchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks) {
+void AddNVIDIA_H100_PCIeBenchmarks(std::map<std::pair<std::string, std::string>,
+                                            std::unordered_map<int, float>>& chunk_benchmarks) {
     chunk_benchmarks[{"NVIDIA H100 PCIe", "dna_r10.4.1_e8.2_260bps_fast@v4.1.0"}] = {
             {64, 0.078797f},   {128, 0.043509f},  {192, 0.030815f}, {256, 0.025591f},
             {320, 0.021523f},  {384, 0.019044f},  {448, 0.017026f}, {640, 0.016945f},
diff --git a/dorado/basecall/benchmarks/NVIDIA_H100_PCIe.h b/dorado/basecall/benchmarks/NVIDIA_H100_PCIe.h
@@ -2,9 +2,9 @@
 
 #include <map>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 
 namespace dorado::basecall {
-void AddNVIDIA_H100_PCIeBenchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks);
+void AddNVIDIA_H100_PCIeBenchmarks(std::map<std::pair<std::string, std::string>,
+                                            std::unordered_map<int, float>>& chunk_benchmarks);
 }  // namespace dorado::basecall
diff --git a/dorado/basecall/benchmarks/NVIDIA_RTX_A6000.cpp b/dorado/basecall/benchmarks/NVIDIA_RTX_A6000.cpp
@@ -2,8 +2,8 @@
 
 namespace dorado::basecall {
 
-void AddNVIDIA_RTX_A6000Benchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks) {
+void AddNVIDIA_RTX_A6000Benchmarks(std::map<std::pair<std::string, std::string>,
+                                            std::unordered_map<int, float>>& chunk_benchmarks) {
     chunk_benchmarks[{"NVIDIA RTX A6000", "dna_r10.4.1_e8.2_260bps_fast@v4.1.0"}] = {
             {64, 0.081234f},   {128, 0.047160f},  {192, 0.036165f},  {256, 0.031301f},
             {320, 0.027645f},  {576, 0.026394f},  {640, 0.025800f},  {1344, 0.025478f},
diff --git a/dorado/basecall/benchmarks/NVIDIA_RTX_A6000.h b/dorado/basecall/benchmarks/NVIDIA_RTX_A6000.h
@@ -2,9 +2,9 @@
 
 #include <map>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 
 namespace dorado::basecall {
-void AddNVIDIA_RTX_A6000Benchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks);
+void AddNVIDIA_RTX_A6000Benchmarks(std::map<std::pair<std::string, std::string>,
+                                            std::unordered_map<int, float>>& chunk_benchmarks);
 }  // namespace dorado::basecall
diff --git a/dorado/basecall/benchmarks/Quadro_GV100.cpp b/dorado/basecall/benchmarks/Quadro_GV100.cpp
@@ -2,8 +2,8 @@
 
 namespace dorado::basecall {
 
-void AddQuadro_GV100Benchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks) {
+void AddQuadro_GV100Benchmarks(std::map<std::pair<std::string, std::string>,
+                                        std::unordered_map<int, float>>& chunk_benchmarks) {
     chunk_benchmarks[{"Quadro GV100", "dna_r10.4.1_e8.2_260bps_fast@v4.1.0"}] = {
             {64, 0.133248f},  {128, 0.075928f},  {192, 0.059051f}, {256, 0.048244f},
             {320, 0.041699f}, {448, 0.040158f},  {512, 0.038198f}, {576, 0.033886f},
diff --git a/dorado/basecall/benchmarks/Quadro_GV100.h b/dorado/basecall/benchmarks/Quadro_GV100.h
@@ -2,9 +2,9 @@
 
 #include <map>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 
 namespace dorado::basecall {
-void AddQuadro_GV100Benchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks);
+void AddQuadro_GV100Benchmarks(std::map<std::pair<std::string, std::string>,
+                                        std::unordered_map<int, float>>& chunk_benchmarks);
 }  // namespace dorado::basecall
diff --git a/dorado/basecall/benchmarks/Tesla_V100-PCIE-16GB.cpp b/dorado/basecall/benchmarks/Tesla_V100-PCIE-16GB.cpp
@@ -2,8 +2,8 @@
 
 namespace dorado::basecall {
 
-void AddTesla_V100_PCIE_16GBBenchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks) {
+void AddTesla_V100_PCIE_16GBBenchmarks(std::map<std::pair<std::string, std::string>,
+                                                std::unordered_map<int, float>>& chunk_benchmarks) {
     chunk_benchmarks[{"Tesla V100-PCIE-16GB", "dna_r10.4.1_e8.2_260bps_fast@v4.1.0"}] = {
             {64, 0.107872f},  {128, 0.060736f}, {192, 0.046971f}, {256, 0.038268f},
             {320, 0.032979f}, {512, 0.032126f}, {576, 0.029948f}, {640, 0.028114f},
diff --git a/dorado/basecall/benchmarks/Tesla_V100-PCIE-16GB.h b/dorado/basecall/benchmarks/Tesla_V100-PCIE-16GB.h
@@ -2,9 +2,9 @@
 
 #include <map>
 #include <string>
-#include <tuple>
+#include <unordered_map>
 
 namespace dorado::basecall {
-void AddTesla_V100_PCIE_16GBBenchmarks(
-        std::map<std::tuple<std::string, std::string>, std::map<int, float>>& chunk_benchmarks);
+void AddTesla_V100_PCIE_16GBBenchmarks(std::map<std::pair<std::string, std::string>,
+                                                std::unordered_map<int, float>>& chunk_benchmarks);
 }  // namespace dorado::basecall