Back out "Only keep fbgemm namespace for torch.ops" (pytorch#1026)
Summary:
Pull Request resolved: pytorch#1026

Original commit changeset: 88ee7df864d6

Original Phabricator Diff: D35104996 (pytorch@356ca9c)

Background: https://fb.workplace.com/groups/3440841732711443/permalink/4698534793608791/

Differential Revision: D35261680

fbshipit-source-id: 6f08f7fd2710b72da23c8861098156ff069a9918
jianyuh authored and facebook-github-bot committed Apr 1, 2022
1 parent 6e6204c commit 8a0c2cc
Showing 8 changed files with 105 additions and 0 deletions.
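
The change restores duplicate operator registration: each op is registered under both the legacy "fb" namespace and the newer "fbgemm" namespace, so existing callers of torch.ops.fb.* keep working while new code migrates to torch.ops.fbgemm.*. A minimal sketch of the pattern, using a hypothetical op add_one that is not part of this PR:

#include <ATen/ATen.h>
#include <torch/library.h>

namespace {

// Trivial stand-in kernel; the real FBGEMM ops are far more involved.
at::Tensor add_one(const at::Tensor& t) {
  return t + 1;
}

} // namespace

// Legacy namespace, kept only for backward compatibility.
TORCH_LIBRARY_FRAGMENT(fb, m) {
  m.def("add_one(Tensor t) -> Tensor", TORCH_FN(add_one));
}

// Preferred namespace going forward.
TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
  m.def("add_one(Tensor t) -> Tensor", TORCH_FN(add_one));
}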
7 changes: 7 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
@@ -392,6 +392,13 @@ Tensor split_embedding_codegen_lookup_dense_function(
}
}

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
DISPATCH_TO_CUDA(
"dense_embedding_codegen_lookup_function",
split_embedding_codegen_lookup_dense_function);
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
DISPATCH_TO_CUDA(
"dense_embedding_codegen_lookup_function",
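
DISPATCH_TO_CUDA and DISPATCH_TO_CPU are FBGEMM helper macros defined elsewhere in the repository. A plausible expansion (an assumption for illustration, not the verbatim source) binds an already-declared schema to a kernel under a single dispatch key:

// Hedged sketch of what the helper macros likely expand to.
#define DISPATCH_TO_CUDA(name, function) \
  m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))

#define DISPATCH_TO_CPU(name, function) \
  m.impl(name, torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(function)))

Note that this CUDA file registers only implementations; the matching m.def(...) schema declarations appear once, in the CPU counterpart below.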
9 changes: 9 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp
@@ -175,6 +175,15 @@ Tensor split_embedding_codegen_lookup_dense_function(
feature_requires_grad)[0];
}

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def(
"dense_embedding_codegen_lookup_function(Tensor dev_weights, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad) -> Tensor");
DISPATCH_TO_CPU(
"dense_embedding_codegen_lookup_function",
split_embedding_codegen_lookup_dense_function);
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.def(
"dense_embedding_codegen_lookup_function(Tensor dev_weights, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad) -> Tensor");
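
Under PyTorch's standard schema-to-C++ type mapping (Tensor maps to at::Tensor, int to int64_t, Tensor? to c10::optional<at::Tensor>), the schema string above corresponds to a kernel signature roughly like the following sketch (illustrating the mapping, not the actual FBGEMM declaration):

#include <ATen/ATen.h>
#include <c10/util/Optional.h>

// Declaration sketch derived from the registered schema string.
at::Tensor dense_embedding_codegen_lookup_function(
    at::Tensor dev_weights,
    at::Tensor weights_offsets,
    at::Tensor D_offsets,
    int64_t total_D,
    int64_t max_D,
    at::Tensor hash_size_cumsum,
    int64_t total_hash_size_bits,
    at::Tensor indices,
    at::Tensor offsets,
    int64_t pooling_mode,
    c10::optional<at::Tensor> indice_weights,
    c10::optional<at::Tensor> feature_requires_grad);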
8 changes: 8 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp
@@ -216,6 +216,14 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function_cpu(
output_dtype)[0];
}

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def("split_embedding_codegen_lookup_{{ optimizer }}_function_cpu(Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, bool gradient_clipping, float max_gradient, bool stochastic_rounding, {{ args.split_function_schemas | join(", ") }}, int output_dtype=0) -> Tensor");
DISPATCH_TO_CPU(
"split_embedding_codegen_lookup_{{ optimizer }}_function_cpu",
split_embedding_codegen_lookup_{{ optimizer }}_function_cpu);
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.def("split_embedding_codegen_lookup_{{ optimizer }}_function_cpu(Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, bool gradient_clipping, float max_gradient, bool stochastic_rounding, {{ args.split_function_schemas | join(", ") }}, int output_dtype=0) -> Tensor");
DISPATCH_TO_CPU(
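
This file is a Jinja template: {{ optimizer }} and {{ args.split_function_schemas | join(", ") }} are substituted at codegen time, once per optimizer. As an illustration, assuming a hypothetical optimizer named sgd whose only optimizer-specific schema entry is float learning_rate (both assumptions, not taken from this PR), the fb fragment would render to something like:

TORCH_LIBRARY_FRAGMENT(fb, m) {
  m.def("split_embedding_codegen_lookup_sgd_function_cpu(Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, bool gradient_clipping, float max_gradient, bool stochastic_rounding, float learning_rate, int output_dtype=0) -> Tensor");
  DISPATCH_TO_CPU(
      "split_embedding_codegen_lookup_sgd_function_cpu",
      split_embedding_codegen_lookup_sgd_function_cpu);
}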
6 changes: 6 additions & 0 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
@@ -491,6 +491,12 @@ Tensor split_embedding_codegen_lookup_{{ optimizer }}_function(
}
}

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def("split_embedding_codegen_lookup_{{ optimizer }}_function(Tensor placeholder_autograd_tensor, Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, Tensor lxu_cache_locations, bool gradient_clipping, float max_gradient, bool stochastic_rounding, {{ args.split_function_schemas | join(", ") }}, int output_dtype=0) -> Tensor");
DISPATCH_TO_CUDA("split_embedding_codegen_lookup_{{ optimizer }}_function", split_embedding_codegen_lookup_{{ optimizer }}_function);
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.def("split_embedding_codegen_lookup_{{ optimizer }}_function(Tensor placeholder_autograd_tensor, Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int total_D, int max_D, Tensor hash_size_cumsum, int total_hash_size_bits, Tensor indices, Tensor offsets, int pooling_mode, Tensor? indice_weights, Tensor? feature_requires_grad, Tensor lxu_cache_locations, bool gradient_clipping, float max_gradient, bool stochastic_rounding, {{ args.split_function_schemas | join(", ") }}, int output_dtype=0) -> Tensor");
DISPATCH_TO_CUDA("split_embedding_codegen_lookup_{{ optimizer }}_function", split_embedding_codegen_lookup_{{ optimizer }}_function);
5 changes: 5 additions & 0 deletions fbgemm_gpu/codegen/embedding_bounds_check_host.cpp
@@ -21,6 +21,11 @@ void bounds_check_indices_cuda(
int64_t bounds_check_mode,
Tensor warning);

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
DISPATCH_TO_CUDA("bounds_check_indices", bounds_check_indices_cuda);
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
DISPATCH_TO_CUDA("bounds_check_indices", bounds_check_indices_cuda);
}
9 changes: 9 additions & 0 deletions fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp
@@ -105,6 +105,15 @@ void bounds_check_indices_cpu(
}
} // namespace

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
// The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd
// or DCE'd, etc.
m.def(
"bounds_check_indices(Tensor rows_per_table, Tensor(a!) indices, Tensor(a!) offsets, int bounds_check_mode, Tensor(a!) warning) -> ()");
DISPATCH_TO_CPU("bounds_check_indices", bounds_check_indices_cpu);
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
// The (a!) tells PyTorch this is an impure operation and so cannot be CSE'd
// or DCE'd, etc.
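
The Tensor(a!) alias annotations in the schema above declare arguments that the op mutates in place; without them, PyTorch could treat the call as pure and eliminate or reorder it. A minimal sketch of the same idea with a hypothetical op that is not part of this PR:

#include <ATen/ATen.h>
#include <torch/library.h>

namespace {

// Mutates the caller's tensor in place: the visible side effect
// that the (a!) annotation declares to the dispatcher and JIT.
void bump_count(at::Tensor counts, int64_t idx) {
  counts[idx] += 1;
}

} // namespace

TORCH_LIBRARY_FRAGMENT(fb, m) {
  m.def("bump_count(Tensor(a!) counts, int idx) -> ()", TORCH_FN(bump_count));
}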
31 changes: 31 additions & 0 deletions fbgemm_gpu/src/cumem_utils_host.cpp
@@ -15,6 +15,37 @@ using Tensor = at::Tensor;

namespace fbgemm_gpu {

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def("is_uvm_tensor(Tensor t) -> bool", TORCH_FN(is_uvm_tensor));
m.def("uvm_storage(Tensor t) -> bool", TORCH_FN(uvm_storage));
m.def(
"uvm_to_device(Tensor self, Tensor prototype) -> Tensor",
TORCH_FN(uvm_to_device));
m.def("uvm_to_cpu(Tensor t) -> Tensor");
DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu);
m.def("new_managed_tensor(Tensor self, int[] sizes) -> Tensor");
DISPATCH_TO_CUDA("new_managed_tensor", new_managed_tensor);
m.def("new_vanilla_managed_tensor(Tensor self, int[] sizes) -> Tensor");
DISPATCH_TO_CUDA("new_vanilla_managed_tensor", new_vanilla_managed_tensor);
m.def(
"cuda_mem_advise(Tensor t, int advice) -> ()",
TORCH_FN(uvm_cuda_mem_advise));
m.def(
"cuda_mem_prefetch_async(Tensor t, Tensor? device_t) -> ()",
TORCH_FN(uvm_cuda_mem_prefetch_async));
m.def(
"uvm_mem_advice_dont_fork(Tensor t) -> ()",
TORCH_FN(uvm_mem_advice_dont_fork));

m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));

#ifndef __HIP_PLATFORM_HCC__
// FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));
#endif
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.def("is_uvm_tensor(Tensor t) -> bool", TORCH_FN(is_uvm_tensor));
m.def("uvm_storage(Tensor t) -> bool", TORCH_FN(uvm_storage));
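
Two registration styles are mixed above: m.def(schema, TORCH_FN(fn)) binds a single catch-all kernel that runs on every backend (fine for device-agnostic queries such as is_uvm_tensor), while m.def(schema) followed by DISPATCH_TO_CUDA(...) declares the schema and binds a kernel only for the CUDA dispatch key. A sketch contrasting the two with hypothetical ops:

#include <ATen/ATen.h>
#include <torch/library.h>

namespace {

// Device-agnostic logic: one catch-all kernel suffices.
bool is_empty_tensor(const at::Tensor& t) {
  return t.numel() == 0;
}

// Stand-in for a kernel that would really live in a .cu file.
at::Tensor scale_by_two(const at::Tensor& t) {
  return t * 2;
}

} // namespace

TORCH_LIBRARY_FRAGMENT(fb, m) {
  // Style 1: schema plus catch-all implementation in one call.
  m.def("is_empty_tensor(Tensor t) -> bool", TORCH_FN(is_empty_tensor));

  // Style 2: schema first, then an implementation bound to one key.
  m.def("scale_by_two(Tensor t) -> Tensor");
  m.impl(
      "scale_by_two",
      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(scale_by_two)));
}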
30 changes: 30 additions & 0 deletions fbgemm_gpu/src/split_table_batched_embeddings.cpp
@@ -107,6 +107,36 @@ void lxu_cache_flush_cuda(

namespace {

// Deprecated for fb namespace! Please use fbgemm namespace instead!
TORCH_LIBRARY_FRAGMENT(fb, m) {
m.def(
"linearize_cache_indices(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets) -> Tensor");
DISPATCH_TO_CUDA("linearize_cache_indices", linearize_cache_indices_cuda);
m.def(
"lru_cache_populate(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) lru_state, bool stochastic_rounding) -> ()");
DISPATCH_TO_CUDA("lru_cache_populate", lru_cache_populate_cuda);
m.def(
"lru_cache_populate_byte(Tensor weights, Tensor hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, int time_stamp, Tensor(c!) lru_state) -> ()");
DISPATCH_TO_CUDA("lru_cache_populate_byte", lru_cache_populate_byte_cuda);
m.def(
"lfu_cache_populate(Tensor weights, Tensor cache_hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, Tensor(c!) lfu_state, bool stochastic_rounding) -> ()");
DISPATCH_TO_CUDA("lfu_cache_populate", lfu_cache_populate_cuda);
m.def(
"lfu_cache_populate_byte(Tensor weights, Tensor cache_hash_size_cumsum, int total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor(a!) lxu_cache_state, Tensor(b!) lxu_cache_weights, Tensor(c!) lfu_state) -> ()");
DISPATCH_TO_CUDA("lfu_cache_populate_byte", lfu_cache_populate_byte_cuda);
m.def(
"lxu_cache_lookup(Tensor linear_cache_indices, Tensor lxu_cache_state, int invalid_index = -1) -> Tensor");
DISPATCH_TO_CUDA("lxu_cache_lookup", lxu_cache_lookup_cuda);
m.def(
"lxu_cache_flush(Tensor(a!) uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int total_D, Tensor(b!) lxu_cache_state, Tensor(c!) lxu_cache_weights, bool stochastic_rounding) -> ()");
DISPATCH_TO_CUDA("lxu_cache_flush", lxu_cache_flush_cuda);
m.def("lxu_cache_slot(int h_in, int C) -> int");
m.impl(
"lxu_cache_slot",
torch::dispatch(
c10::DispatchKey::CatchAll, TORCH_FN(host_lxu_cache_slot)));
}

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.def(
"linearize_cache_indices(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets) -> Tensor");
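
Once a fragment like this is loaded, its ops are reachable by name through the PyTorch dispatcher. A sketch of calling the simplest schema above, lxu_cache_slot, from C++ (the lookup boilerplate is standard dispatcher usage, not code from this PR):

#include <ATen/core/dispatch/Dispatcher.h>

int64_t call_lxu_cache_slot(int64_t h_in, int64_t C) {
  // Resolve "fb::lxu_cache_slot" once and cache the typed handle.
  static auto op = c10::Dispatcher::singleton()
      .findSchemaOrThrow("fb::lxu_cache_slot", "")
      .typed<int64_t(int64_t, int64_t)>();
  return op.call(h_in, C);
}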
