Skip to content

Commit

Permalink
Faster CPU algorithms. Easier vector library. Cache for frequent CUDA API calls.

Browse files Browse the repository at this point in the history
  • Loading branch information
ap-hynninen committed Dec 2, 2016
1 parent 2ac2096 commit acb7958
Show file tree
Hide file tree
Showing 13 changed files with 1,027 additions and 1,129 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ CUDAROOT = $(subst /bin/,,$(dir $(shell which $(CUDAC))))

CFLAGS = -I${CUDAROOT}/include -std=c++11 $(DEFS) $(OPTLEV) -march=native

CUDA_CFLAGS = -I${CUDAROOT}/include $(OPTLEV) -Xptxas -dlcm=ca -lineinfo $(GENCODE_FLAGS) --resource-usage -Xcompiler "$(CUDA_CCFLAGS)" $(DEFS) -D_FORCE_INLINES
CUDA_CFLAGS = -I${CUDAROOT}/include -std=c++11 $(OPTLEV) -Xptxas -dlcm=ca -lineinfo $(GENCODE_FLAGS) --resource-usage -Xcompiler "$(CUDA_CCFLAGS)" $(DEFS) -D_FORCE_INLINES

ifeq ($(OS),osx)
CUDA_LFLAGS = -L$(CUDAROOT)/lib
Expand Down
98 changes: 98 additions & 0 deletions src/LRUCache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/******************************************************************************
MIT License
Copyright (c) 2016 Antti-Pekka Hynninen
Copyright (c) 2016 NVIDIA
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*******************************************************************************/

#include <utility>
#include <list>
#include <unordered_map>

using namespace std;

//
// Simple LRU cache implementation.
//
// Stores up to `capacity` key/value pairs. get() returns `null_value` when a
// key is absent. Both get() and a set() on an existing key promote the key to
// most-recently-used; when the cache is full, set() evicts the least-recently
// used entry. Not thread safe.
//
// NOTE(review): this header has no include guard / #pragma once — confirm it
// is included from only one place per translation unit.
//
template <typename key_type, typename value_type>
class LRUCache {
private:

  // Hash-table payload: the cached value plus the key's position in the
  // recency list, so the entry can be promoted/erased in O(1).
  struct ValueIterator {
    value_type value;
    typename std::list<key_type>::iterator it;
  };

  // Maximum number of entries. capacity == 0 means nothing is ever stored.
  const size_t capacity;

  // Value returned by get() when the key is not found
  const value_type null_value;

  // Keys in recency order: most-recently-used at the front, oldest at the back
  std::list<key_type> keys;

  // Cache (hash table): key -> {value, iterator into `keys`}
  std::unordered_map<key_type, ValueIterator> cache;

public:

  LRUCache(const size_t capacity, const value_type null_value) : capacity(capacity), null_value(null_value) {}

  // Returns the value stored for `key`, or null_value if the key is absent.
  // A hit promotes the key to most-recently-used.
  value_type get(key_type key) {
    auto it = cache.find(key);
    if (it == cache.end()) return null_value;
    touch(it);
    return it->second.value;
  }

  // Inserts or updates `key` with `value` and promotes it to
  // most-recently-used. Evicts the least-recently-used entry when full.
  void set(key_type key, value_type value) {
    auto it = cache.find(key);
    if (it != cache.end()) {
      // Key already present: update the value and promote it
      it->second.value = value;
      touch(it);
      return;
    }
    // Fix: the original evicted unconditionally when size == capacity, which
    // calls keys.back() on an empty list (undefined behavior) for capacity 0.
    if (capacity == 0) return;
    if (cache.size() >= capacity) {
      // Evict the least-recently-used entry (back of the recency list).
      // Erase by key directly — no need for a second find().
      cache.erase(keys.back());
      keys.pop_back();
    }
    keys.push_front(key);
    cache.insert({key, ValueIterator{value, keys.begin()}});
  }

private:

  // Move the entry's key to the front of the recency list in O(1).
  // std::list::splice relinks the node in place, so the stored iterator
  // remains valid and now points to the front — no erase/re-insert needed.
  void touch(typename std::unordered_map<key_type, ValueIterator>::iterator it) {
    keys.splice(keys.begin(), keys, it->second.it);
  }
};
57 changes: 33 additions & 24 deletions src/cutt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ SOFTWARE.
#include <cuda.h>
#include <list>
#include <unordered_map>
#include <unordered_set>
#include "CudaUtils.h"
#include "cuttplan.h"
#include "cuttkernel.h"
#include "cuttTimer.h"
#include "cutt.h"
// #include <chrono>

// Hash table to store the plans
static std::unordered_map< cuttHandle, cuttPlan_t* > planStorage;
Expand All @@ -39,7 +39,22 @@ static std::unordered_map< cuttHandle, cuttPlan_t* > planStorage;
static cuttHandle curHandle = 0;

// Table of devices that have been initialized
static std::unordered_set<int> devicesReady;
static std::unordered_map<int, cudaDeviceProp> deviceProps;

// Returns the properties of the current CUDA device via `prop`, preparing the
// device on first use.
// deviceID [out]: set to the current device (cudaGetDevice)
// prop     [out]: properties of that device
// On the first call for a device the properties are queried and cached in
// deviceProps; cuttKernelSetSharedMemConfig() is called once at that point
// (presumably to set the shared-memory configuration — see its definition).
// Subsequent calls are served from the deviceProps cache.
void getDeviceProp(int& deviceID, cudaDeviceProp &prop) {
cudaCheck(cudaGetDevice(&deviceID));
auto it = deviceProps.find(deviceID);
if (it == deviceProps.end()) {
// First time this device is seen: query and cache its properties,
// and do the one-time shared-memory setup.
cudaCheck(cudaGetDeviceProperties(&prop, deviceID));
cuttKernelSetSharedMemConfig();
deviceProps.insert({deviceID, prop});
} else {
// Cache hit: copy the stored properties out.
prop = it->second;
}
}

cuttResult cuttPlanCheckInput(int rank, int* dim, int* permutation, size_t sizeofType) {
// Check sizeofType
Expand Down Expand Up @@ -84,11 +99,10 @@ cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, si
// Check that the current handle is available (it better be!)
if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR;

// Get all possible ways tensor can be transposed
// Prepare device
int deviceID;
cudaCheck(cudaGetDevice(&deviceID));
cudaDeviceProp prop;
cudaCheck(cudaGetDeviceProperties(&prop, deviceID));
getDeviceProp(deviceID, prop);

// Reduce ranks
std::vector<int> redDim;
Expand All @@ -113,8 +127,16 @@ cuttResult cuttPlan(cuttHandle* handle, int rank, int* dim, int* permutation, si
gpuRangeStart("createPlans");
#endif

if (!createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR;
// std::chrono::high_resolution_clock::time_point plan_start;
// plan_start = std::chrono::high_resolution_clock::now();

if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR;

// std::chrono::high_resolution_clock::time_point plan_end;
// plan_end = std::chrono::high_resolution_clock::now();
// double plan_duration = std::chrono::duration_cast< std::chrono::duration<double> >(plan_end - plan_start).count();
// printf("createPlans took %lf ms\n", plan_duration*1000.0);

#ifdef ENABLE_NVTOOLS
gpuRangeStop();
Expand Down Expand Up @@ -177,17 +199,10 @@ cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutat
// Check that the current handle is available (it better be!)
if (planStorage.count(*handle) != 0) return CUTT_INTERNAL_ERROR;

// Get all possible ways tensor can be transposed
// Prepare device
int deviceID;
cudaCheck(cudaGetDevice(&deviceID));
cudaDeviceProp prop;
cudaCheck(cudaGetDeviceProperties(&prop, deviceID));

// Set shared memory configuration if necessary
if (!devicesReady.count(deviceID)) {
cuttKernelSetSharedMemConfig();
devicesReady.insert(deviceID);
}
getDeviceProp(deviceID, prop);

// Reduce ranks
std::vector<int> redDim;
Expand All @@ -204,8 +219,8 @@ cuttResult cuttPlanMeasure(cuttHandle* handle, int rank, int* dim, int* permutat
// Create plans from non-reduced ranks
// if (!createPlans(rank, dim, permutation, sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR;
#else
if (!createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, prop, plans)) return CUTT_INTERNAL_ERROR;
if (!cuttPlan_t::createPlans(rank, dim, permutation, redDim.size(), redDim.data(), redPermutation.data(),
sizeofType, deviceID, prop, plans)) return CUTT_INTERNAL_ERROR;
#endif

// // Count cycles
Expand Down Expand Up @@ -290,12 +305,6 @@ cuttResult cuttExecute(cuttHandle handle, void* idata, void* odata) {
cudaCheck(cudaGetDevice(&deviceID));
if (deviceID != plan.deviceID) return CUTT_INVALID_DEVICE;

// Set shared memory configuration if necessary
if (!devicesReady.count(deviceID)) {
cuttKernelSetSharedMemConfig();
devicesReady.insert(deviceID);
}

if (!cuttKernel(plan, idata, odata)) return CUTT_INTERNAL_ERROR;
return CUTT_SUCCESS;
}
Loading

0 comments on commit acb7958

Please sign in to comment.