Merge branch 'remove-trainable-layers-ray' into 'main'
Remove all trainable layers using legacy tensor

See merge request dl/hugectr/hugectr!1470
minseokl committed Sep 27, 2023
2 parents d16fac7 + 8d0a2b2 commit 398d8e0
Showing 51 changed files with 374 additions and 10,390 deletions.
106 changes: 3 additions & 103 deletions HugeCTR/include/layers/batch_norm_layer.hpp
@@ -31,15 +31,6 @@ class BatchNormLayer : public TrainableLayer<T, true> {
using Base = TrainableLayer<T, true>;
using WeightType = typename Base::WeightType;

/*
* stores the references to the input tensors of this layer.
*/
Tensors2<T> in_tensors_;
/*
* stores the references to the output tensors of this layer.
*/
Tensors2<T> out_tensors_;

public:
/**
* BatchNorm parameters
@@ -51,20 +42,14 @@ class BatchNormLayer : public TrainableLayer<T, true> {

/**
* Ctor of BatchNormLayer.
* @param weight_buff weight buffer for internal gamma/beta tensors
* @param wgrad_buff gradient buffer for internal gamma/beta tensors
* @param in_tensor the input tensor
* @param out_tensor the output tensor which has the same dim with in_tensor
* @param params BatchNorm parameters
* @param cudnn_handle cuDNN handle created externally
* @param device_id the id of GPU where this layer belongs
*/
BatchNormLayer(const std::shared_ptr<BufferBlock2<float>>& master_weight_buff,
const std::shared_ptr<BufferBlock2<WeightType>>& weight_buff,
const std::shared_ptr<BufferBlock2<WeightType>>& wgrad_buff,
const std::shared_ptr<GeneralBuffer2<CudaAllocator>>& blob_buff,
const Tensor2<T>& in_tensor, const Tensor2<T>& out_tensor, const Params& params,
const std::shared_ptr<GPUResource>& gpu_resource,
BatchNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
const Params& params, const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
~BatchNormLayer() override;

@@ -89,91 +74,6 @@ class BatchNormLayer : public TrainableLayer<T, true> {
*/
std::string get_no_trained_params_in_string() override;

std::vector<TensorBag2> get_tensors_for_non_trainable_params() override;

private:
/**
* A method of defining how gamma and beta are initialized.
* Gamma is initialized to 1s while Beta is 0ed.
* Override this function to change the initialization behavior.
*/
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;

const Params params_;
const cudnnBatchNormMode_t mode_;
cudnnTensorDescriptor_t in_out_desc_;
cudnnTensorDescriptor_t gamma_beta_desc_;

// these four pointers are just for convenience
// they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_
Tensor2<float> gamma_;
Tensor2<float> beta_;
Tensor2<float> gamma_grad_;
Tensor2<float> beta_grad_;

// these tensors are internal only managed by smart ptrs
Tensor2<float> result_running_mean_;
Tensor2<float> result_running_var_;
Tensor2<float> result_save_mean_;
Tensor2<float> result_save_inv_var_;

// host array to do device-to-host copy for mean and var
Tensor2<float> h_result_running_mean_;
Tensor2<float> h_result_running_var_;
};

/**
* BatchNorm layer based on cuDNN
*/
template <typename T>
class Core23TempBatchNormLayer : public Core23TempTrainableLayer<T, true> {
using Base = Core23TempTrainableLayer<T, true>;
using WeightType = typename Base::WeightType;

public:
/**
* BatchNorm parameters
*/
struct Params {
double factor; /**< moving average computation factor*/
double eps; /**< small value to avoid divide-by-zero error*/
};

/**
* Ctor of Core23TempBatchNormLayer.
* @param in_tensor the input tensor
* @param out_tensor the output tensor which has the same dim with in_tensor
* @param params BatchNorm parameters
* @param cudnn_handle cuDNN handle created externally
* @param device_id the id of GPU where this layer belongs
*/
Core23TempBatchNormLayer(
const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params,
const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
~Core23TempBatchNormLayer() override;

void initialize() override;

/**
* A method of implementing the forward pass of BatchNorm
* @param stream CUDA stream where the forward propagation is executed
*/
void fprop(bool is_train) override;

/**
* A method of implementing the forward pass of BatchNorm
* @param stream CUDA stream where the forward propagation is executed
*/
void bprop() override;

/**
* A method to get mean and variance which are needed for inference as string.
* Session is in charge of calling this method and storing the contents to a file.
* See Session::download_params_to_file() for more detailed information.
*/
std::string get_no_trained_params_in_string() override;

std::vector<core23::Tensor> get_non_trainable_params_as_tensors() override;

private:
@@ -202,7 +102,7 @@ class Core23TempBatchNormLayer : public Core23TempTrainableLayer<T, true> {
core23::Tensor result_save_mean_;
core23::Tensor result_save_inv_var_;

// host arCore23Temp to do device-to-host copy for mean and var
// host ar to do device-to-host copy for mean and var
core23::Tensor h_result_running_mean_;
core23::Tensor h_result_running_var_;
};
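
After this change, BatchNormLayer is constructed directly from core23::Tensor objects instead of BufferBlock2/GeneralBuffer2 buffers. The sketch below is a minimal, hypothetical usage of the new constructor: the tensor allocation, the GPUResource, the include path, and the factor/eps values are assumptions, and the initialize/fprop/bprop lifecycle follows the common Layer interface rather than anything specific to this commit.

#include <memory>
#include <vector>

#include "HugeCTR/include/layers/batch_norm_layer.hpp"  // path relative to the repo root; adjust for the build's include dirs

using namespace HugeCTR;

// Hypothetical helper: builds and runs one BatchNorm layer on tensors that the
// caller has already allocated on the GPU (shape [batch_size, num_features]).
void run_batch_norm(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
                    const std::shared_ptr<GPUResource>& gpu_resource) {
  // factor: moving-average factor for the running mean/var; eps: divide-by-zero guard.
  // The values here are illustrative, not taken from this commit.
  BatchNormLayer<float>::Params params{/*factor=*/0.999, /*eps=*/1e-5};

  BatchNormLayer<float> bn(in_tensor, out_tensor, params, gpu_resource);
  bn.initialize();              // assumed to set up the cuDNN descriptors
  bn.fprop(/*is_train=*/true);  // forward pass; updates running statistics
  bn.bprop();                   // backward pass; fills gamma/beta gradients

  // Running mean/var needed for inference can be serialized via
  // bn.get_no_trained_params_in_string().
}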
78 changes: 2 additions & 76 deletions HugeCTR/include/layers/fully_connected_layer.hpp
@@ -41,16 +41,7 @@ class FullyConnectedLayer<float> : public TrainableLayer<float> {
cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT};
cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT};

/*
* stores the references to the input tensors of this layer.
*/
Tensors2<float> in_tensors_;
/*
* stores the references to the output tensors of this layer.
*/
Tensors2<float> out_tensors_;

Tensors2<float>& get_in_tensors(bool is_train) { return in_tensors_; }
std::vector<core23::Tensor>& get_in_tensors(bool is_train) { return this->input_tensors_; }

public:
/**
@@ -71,16 +62,12 @@ class FullyConnectedLayer<float> : public TrainableLayer<float> {
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param weight_buff: stores the weight tensor
* @param wgrad_buff: stores the gradient values of the weight calculated in backward pass
* @param in_tensor: stores the input tensor
* @param out_tensor: stores the output tensor
* @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
FullyConnectedLayer(const std::shared_ptr<BufferBlock2<float>>& weight_buff,
const std::shared_ptr<BufferBlock2<float>>& wgrad_buff,
const Tensor2<float>& in_tensor, const Tensor2<float>& out_tensor,
FullyConnectedLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
const std::shared_ptr<GPUResource>& gpu_resource, bool use_mixed_precision,
bool enable_tf32_compute,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
@@ -97,65 +84,4 @@ class FullyConnectedLayer<float> : public TrainableLayer<float> {
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;
};

template <typename T>
class Core23TempFullyConnectedLayer;

/**
* @brief
* This class implements the fully connected layer.
*/
template <>
class Core23TempFullyConnectedLayer<float> : public Core23TempTrainableLayer<float> {
private:
const bool use_mixed_precision_{false};
const bool enable_tf32_compute_{false};
// Optimized cublasGemmEx algorithm selection
cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT};
cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT};
cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT};

std::vector<core23::Tensor>& get_in_tensors(bool is_train) { return this->input_tensors_; }

public:
/**
* forward pass
*/
void fprop(bool is_train) final;
/**
* backward pass
*/
void bprop() final;
/*
* algorithm search for cublasGemmEx
*/
void search_algorithm() final;
/**
* This is the constructor of the Core23TempFullyConnectedLayer.
* It will check whether the format combination of all tensors is supported or not.
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param in_tensor: stores the input tensor
* @param out_tensor: stores the output tensor
* @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
Core23TempFullyConnectedLayer(
const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
const std::shared_ptr<GPUResource>& gpu_resource, bool use_mixed_precision,
bool enable_tf32_compute,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer& C) = delete;
Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&);

private:
/*
* initializers for this layer.
*/
std::unique_ptr<DataSimulator> get_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_norm_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;
};

} // namespace HugeCTR
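
The FP32 FullyConnectedLayer is likewise reduced to a core23::Tensor-based constructor. A minimal sketch of the new call follows; tensor creation, the GPUResource, and the include path are assumptions about the surrounding training pipeline, and search_algorithm/fprop/bprop are used as declared by the layer interface above.

#include <memory>

#include "HugeCTR/include/layers/fully_connected_layer.hpp"  // repo-relative path; adjust for the build's include dirs

using namespace HugeCTR;

// Hypothetical helper: wires one FP32 fully connected layer between two
// pre-allocated device tensors ([batch, in_dim] -> [batch, out_dim]).
void run_fc_fp32(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor,
                 const std::shared_ptr<GPUResource>& gpu_resource) {
  FullyConnectedLayer<float> fc(in_tensor, out_tensor, gpu_resource,
                                /*use_mixed_precision=*/false,
                                /*enable_tf32_compute=*/false);
  fc.search_algorithm();        // autotunes the cublasGemmEx algorithms (falgo_/balgo_*)
  fc.fprop(/*is_train=*/true);  // out = in * W + b
  fc.bprop();                   // accumulates wgrad and propagates the input gradient
}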
87 changes: 3 additions & 84 deletions HugeCTR/include/layers/fully_connected_layer_half.hpp
@@ -38,20 +38,10 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> {
cublasGemmAlgo_t balgo_k_;
cublasGemmAlgo_t balgo_x_;

/*
* stores the references to the input tensors of this layer.
*/
Tensor2<__half> bottom_tensor_;

/*
* stores the references to the output tensors of this layer.
*/
Tensor2<__half> top_tensor_;

/*
* stores the references to the output tensors of GEMM.
*/
Tensor2<__half> identity_tensor_;
core23::Tensor identity_tensor_;

/*
* initializers for this layer.
@@ -61,7 +51,7 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> {
std::unique_ptr<DataSimulator> get_xavier_norm_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;

Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; }
core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; }

public:
/**
@@ -87,87 +77,16 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> {
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param weight_buff: stores the weight tensor
* @param wgrad_buff: stores the gradient values of the weight calculated in backward pass
* @param bottom_tensor: stores the tensor from bottom layer
* @param top_tensor: stores the tensor to top layer
* @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
FullyConnectedLayer(const std::shared_ptr<BufferBlock2<float>>& master_weights_buff,
const std::shared_ptr<BufferBlock2<__half>>& weights_buff,
const std::shared_ptr<BufferBlock2<__half>>& weights_grad_buff,
const std::shared_ptr<GeneralBuffer2<CudaAllocator>>& blobs_buff,
const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor,
FullyConnectedLayer(const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor,
const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
FullyConnectedLayer(const FullyConnectedLayer&) = delete;
FullyConnectedLayer& operator=(const FullyConnectedLayer&);
};

/**
* @brief
* This class implements the fully connected layer.
*/
template <>
class Core23TempFullyConnectedLayer<__half> : public Core23TempTrainableLayer<__half> {
// Optimized cublasGemmEx algorithm selection
cublasGemmAlgo_t falgo_b_;
cublasGemmAlgo_t falgo_k_;
cublasGemmAlgo_t balgo_b_;
cublasGemmAlgo_t balgo_k_;
cublasGemmAlgo_t balgo_x_;

/*
* stores the references to the output tensors of GEMM.
*/
core23::Tensor identity_tensor_;

/*
* initializers for this layer.
*/
std::unique_ptr<DataSimulator> get_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_uniform_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_xavier_norm_initializer(const int index) override;
std::unique_ptr<DataSimulator> get_default_initializer(const int index) override;

core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; }

public:
/**
* forward pass
*/
void fprop(bool is_train) final;
/**
* backward pass
*/
void bprop() final;
/*
* initialize for cublasGemmEx
*/
void initialize() final;
/*
* algorithm search for cublasGemmEx
*/
void search_algorithm() final;

/**
* This is the constructor of the Core23TempFullyConnectedLayer.
* It will check whether the format combination of all tensors is supported or not.
* Only two kinds of tensor formats are supported:
* (1) weight, input, output, wgrad are all in row-major.
* (2) weight, input, output, wgrad are all in column-major.
* @param bottom_tensor: stores the tensor from bottom layer
* @param top_tensor: stores the tensor to top layer
* @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH
* (col-major)
*/
Core23TempFullyConnectedLayer(
const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor,
const std::shared_ptr<GPUResource>& gpu_resource,
std::vector<Initializer_t> initializer_types = std::vector<Initializer_t>());
Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer&) = delete;
Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&);
};

} // namespace HugeCTR
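
The __half specialization follows the same pattern, with mixed precision implied by the template argument rather than a constructor flag. As before, this is a hypothetical sketch: the FP16 tensors and the GPUResource are assumed to be provided by the caller, and initialize/search_algorithm are assumed to perform the cuBLAS setup described by the members above.

#include <memory>

#include "HugeCTR/include/layers/fully_connected_layer_half.hpp"  // repo-relative path; adjust for the build's include dirs

using namespace HugeCTR;

// Hypothetical helper: one mixed-precision (__half) fully connected layer.
// bottom_tensor/top_tensor are assumed to be FP16 device tensors allocated upstream.
void run_fc_fp16(const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor,
                 const std::shared_ptr<GPUResource>& gpu_resource) {
  FullyConnectedLayer<__half> fc(bottom_tensor, top_tensor, gpu_resource);
  fc.initialize();         // assumed to prepare the identity tensor and cuBLAS state
  fc.search_algorithm();   // picks the falgo_*/balgo_* cublasGemmEx algorithms
  fc.fprop(/*is_train=*/true);
  fc.bprop();
}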