From 8d0a2b2b13de7644a31808bd265ee210c22d3013 Mon Sep 17 00:00:00 2001 From: Ray Wang Date: Wed, 13 Sep 2023 07:45:47 +0000 Subject: [PATCH] Remove all trainable layers using legacy tensor --- HugeCTR/include/layers/batch_norm_layer.hpp | 106 +- .../include/layers/fully_connected_layer.hpp | 78 +- .../layers/fully_connected_layer_half.hpp | 87 +- .../layers/fused_fully_connected_layer.hpp | 82 +- .../fused_relu_bias_fully_connected_layer.hpp | 215 +-- HugeCTR/include/layers/gru_layer.hpp | 89 +- HugeCTR/include/layers/layer_norm_layer.hpp | 80 +- HugeCTR/include/layers/mlp_layer.hpp | 95 +- HugeCTR/include/layers/multi_cross_layer.hpp | 204 +-- .../include/layers/weight_multiply_layer.hpp | 68 +- HugeCTR/include/network_helpers.hpp | 4 +- HugeCTR/include/pybind/model.hpp | 2 +- HugeCTR/include/trainable_layer.hpp | 193 +-- HugeCTR/src/layers/batch_norm_layer.cu | 217 +-- HugeCTR/src/layers/fully_connected_layer.cu | 369 +---- .../src/layers/fully_connected_layer_half.cu | 456 +----- .../src/layers/fused_fully_connected_layer.cu | 344 +--- .../fused_relu_bias_fully_connected_layer.cu | 796 +--------- HugeCTR/src/layers/gru_layer.cu | 391 +---- HugeCTR/src/layers/layer_norm_layer.cu | 156 +- HugeCTR/src/layers/mlp_layer.cu | 285 +--- HugeCTR/src/layers/multi_cross_layer.cu | 1392 ++--------------- HugeCTR/src/layers/weight_multiply_layer.cu | 144 +- .../src/pybind/add_dense_layer_helpers.cpp | 58 +- .../batch_norm_layer_test.cpp | 5 +- .../fully_connected_layer_half_test.cpp | 8 +- .../fully_connected_layer_test.cpp | 8 +- .../fused_fully_connected_layer_test.cpp | 4 +- ...d_relu_bias_fully_connected_layer_test.cpp | 2 +- .../core23_layer_test/gru_layer_test.cpp | 4 +- .../layer_norm_layer_test.cpp | 5 +- test/utest/core23_layer_test/mlp_test.cpp | 24 +- .../multi_cross_layer_test.cpp | 6 +- .../multi_head_attention_layer_test.cpp | 2 +- .../trainable_layer_test.cpp | 4 +- .../weight_multiply_layer_test.cpp | 4 +- .../batch_norm_layer_test_old.cpp | 290 ---- .../fully_connected_layer_half_test_old.cpp | 283 ---- .../fully_connected_layer_test_old.cpp | 263 ---- .../fused_fully_connected_layer_test_old.cpp | 204 --- ...lu_bias_fully_connected_layer_test_old.cpp | 217 --- .../group_dense_layer_test_old.cpp | 583 ------- .../legacy_layer_test/gru_layer_test_old.cpp | 570 ------- .../layer_norm_layer_test_old.cpp | 339 ---- test/utest/legacy_layer_test/mlp_test_old.cpp | 619 -------- .../multi_cross_layer_test_old.cpp | 839 ---------- .../trainable_layer_test_old.cpp | 135 -- .../weight_multiply_layer_test_old.cpp | 173 -- .../utest/loss/loss_with_regularizer_test.cpp | 8 +- test/utest/network/network_build_test.cpp | 2 +- .../loss_with_regularizer_test.cpp | 252 --- 51 files changed, 374 insertions(+), 10390 deletions(-) delete mode 100644 test/utest/legacy_layer_test/batch_norm_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fully_connected_layer_half_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fully_connected_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fused_fully_connected_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/fused_relu_bias_fully_connected_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/group_dense_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/gru_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/layer_norm_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/mlp_test_old.cpp delete mode 100644 
test/utest/legacy_layer_test/multi_cross_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/trainable_layer_test_old.cpp delete mode 100644 test/utest/legacy_layer_test/weight_multiply_layer_test_old.cpp delete mode 100644 test/utest/regularizers/loss_with_regularizer_test.cpp diff --git a/HugeCTR/include/layers/batch_norm_layer.hpp b/HugeCTR/include/layers/batch_norm_layer.hpp index 06c9cdfc40..66370ff76f 100644 --- a/HugeCTR/include/layers/batch_norm_layer.hpp +++ b/HugeCTR/include/layers/batch_norm_layer.hpp @@ -31,15 +31,6 @@ class BatchNormLayer : public TrainableLayer { using Base = TrainableLayer; using WeightType = typename Base::WeightType; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - public: /** * BatchNorm parameters @@ -51,20 +42,14 @@ class BatchNormLayer : public TrainableLayer { /** * Ctor of BatchNormLayer. - * @param weight_buff weight buffer for internal gamma/beta tensors - * @param wgrad_buff gradient buffer for internal gamma/beta tensors * @param in_tensor the input tensor * @param out_tensor the output tensor which has the same dim with in_tensor * @param params BatchNorm parameters * @param cudnn_handle cuDNN handle created externally * @param device_id the id of GPU where this layer belongs */ - BatchNormLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, + BatchNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, + const Params& params, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); ~BatchNormLayer() override; @@ -89,91 +74,6 @@ class BatchNormLayer : public TrainableLayer { */ std::string get_no_trained_params_in_string() override; - std::vector get_tensors_for_non_trainable_params() override; - - private: - /** - * A method of defining how gamma and beta are initialized. - * Gamma is initialized to 1s while Beta is 0ed. - * Override this function to change the initialization behavior. - */ - std::unique_ptr get_default_initializer(const int index) override; - - const Params params_; - const cudnnBatchNormMode_t mode_; - cudnnTensorDescriptor_t in_out_desc_; - cudnnTensorDescriptor_t gamma_beta_desc_; - - // these four pointers are just for convenience - // they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_ - Tensor2 gamma_; - Tensor2 beta_; - Tensor2 gamma_grad_; - Tensor2 beta_grad_; - - // these tensors are internal only managed by smart ptrs - Tensor2 result_running_mean_; - Tensor2 result_running_var_; - Tensor2 result_save_mean_; - Tensor2 result_save_inv_var_; - - // host arCore23Temp to do device-to-host copy for mean and var - Tensor2 h_result_running_mean_; - Tensor2 h_result_running_var_; -}; - -/** - * BatchNorm layer based on cuDNN - */ -template -class Core23TempBatchNormLayer : public Core23TempTrainableLayer { - using Base = Core23TempTrainableLayer; - using WeightType = typename Base::WeightType; - - public: - /** - * BatchNorm parameters - */ - struct Params { - double factor; /**< moving average computation factor*/ - double eps; /**< small value to avoid divide-by-zero error*/ - }; - - /** - * Ctor of Core23TempBatchNormLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param params BatchNorm parameters - * @param cudnn_handle cuDNN handle created externally - * @param device_id the id of GPU where this layer belongs - */ - Core23TempBatchNormLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - ~Core23TempBatchNormLayer() override; - - void initialize() override; - - /** - * A method of implementing the forward pass of BatchNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - - /** - * A method of implementing the forward pass of BatchNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - - /** - * A method to get mean and variance which are needed for inference as string. - * Session is in charge of calling this method and store the contensts to file. - * See Session::download_params_to_file() for more detailed information. - */ - std::string get_no_trained_params_in_string() override; - std::vector get_non_trainable_params_as_tensors() override; private: @@ -202,7 +102,7 @@ class Core23TempBatchNormLayer : public Core23TempTrainableLayer { core23::Tensor result_save_mean_; core23::Tensor result_save_inv_var_; - // host arCore23Temp to do device-to-host copy for mean and var + // host ar to do device-to-host copy for mean and var core23::Tensor h_result_running_mean_; core23::Tensor h_result_running_var_; }; diff --git a/HugeCTR/include/layers/fully_connected_layer.hpp b/HugeCTR/include/layers/fully_connected_layer.hpp index 07f04e7e65..f5b22b16fc 100644 --- a/HugeCTR/include/layers/fully_connected_layer.hpp +++ b/HugeCTR/include/layers/fully_connected_layer.hpp @@ -41,16 +41,7 @@ class FullyConnectedLayer : public TrainableLayer { cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT}; cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT}; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - Tensors2& get_in_tensors(bool is_train) { return in_tensors_; } + std::vector& get_in_tensors(bool is_train) { return this->input_tensors_; } public: /** @@ -71,16 +62,12 @@ class FullyConnectedLayer : public TrainableLayer { * Only two kinds of tensor formats are supported: * (1) weight, input, output, wgrad are all in row-major. * (2) weight, input, output, wgrad are all in column-major. 
- * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass * @param in_tensor: stores the input tensor * @param out_tensor: stores the output tensor * @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH * (col-major) */ - FullyConnectedLayer(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, + FullyConnectedLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const std::shared_ptr& gpu_resource, bool use_mixed_precision, bool enable_tf32_compute, std::vector initializer_types = std::vector()); @@ -97,65 +84,4 @@ class FullyConnectedLayer : public TrainableLayer { std::unique_ptr get_default_initializer(const int index) override; }; -template -class Core23TempFullyConnectedLayer; - -/** - * @brief - * This class implements the fully connected layer. - */ -template <> -class Core23TempFullyConnectedLayer : public Core23TempTrainableLayer { - private: - const bool use_mixed_precision_{false}; - const bool enable_tf32_compute_{false}; - // Optimized cublasGemmEx algorithm selection - cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_W_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_Xn_{CUBLAS_GEMM_DEFAULT}; - - std::vector& get_in_tensors(bool is_train) { return this->input_tensors_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - /** - * This is the constructor of the Core23TempFullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. - * @param in_tensor: stores the input tensor - * @param out_tensor: stores the output tensor - * @param weight_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - Core23TempFullyConnectedLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, - const std::shared_ptr& gpu_resource, bool use_mixed_precision, - bool enable_tf32_compute, - std::vector initializer_types = std::vector()); - Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer& C) = delete; - Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&); - - private: - /* - * initializers for this layer. - */ - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; -}; - } // namespace HugeCTR diff --git a/HugeCTR/include/layers/fully_connected_layer_half.hpp b/HugeCTR/include/layers/fully_connected_layer_half.hpp index de5a7b08e6..cb97ff6018 100644 --- a/HugeCTR/include/layers/fully_connected_layer_half.hpp +++ b/HugeCTR/include/layers/fully_connected_layer_half.hpp @@ -38,20 +38,10 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> { cublasGemmAlgo_t balgo_k_; cublasGemmAlgo_t balgo_x_; - /* - * stores the references to the input tensors of this layer. 
- */ - Tensor2<__half> bottom_tensor_; - - /* - * stores the references to the output tensors of this layer. - */ - Tensor2<__half> top_tensor_; - /* * stores the references to the output tensors of GEMM. */ - Tensor2<__half> identity_tensor_; + core23::Tensor identity_tensor_; /* * initializers for this layer. @@ -61,7 +51,7 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> { std::unique_ptr get_xavier_norm_initializer(const int index) override; std::unique_ptr get_default_initializer(const int index) override; - Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; } + core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; } public: /** @@ -87,87 +77,16 @@ class FullyConnectedLayer<__half> : public TrainableLayer<__half> { * Only two kinds of tensor formats are supported: * (1) weight, input, output, wgrad are all in row-major. * (2) weight, input, output, wgrad are all in column-major. - * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass * @param bottom_tensor: stores the tensor from bottom layer * @param top_tensor: stores the tensor to top layer * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH * (col-major) */ - FullyConnectedLayer(const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, + FullyConnectedLayer(const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); FullyConnectedLayer(const FullyConnectedLayer&) = delete; FullyConnectedLayer& operator=(const FullyConnectedLayer&); }; -/** - * @brief - * This class implements the fully connected layer. - */ -template <> -class Core23TempFullyConnectedLayer<__half> : public Core23TempTrainableLayer<__half> { - // Optimized cublasGemmEx algorithm selection - cublasGemmAlgo_t falgo_b_; - cublasGemmAlgo_t falgo_k_; - cublasGemmAlgo_t balgo_b_; - cublasGemmAlgo_t balgo_k_; - cublasGemmAlgo_t balgo_x_; - - /* - * stores the references to the output tensors of GEMM. - */ - core23::Tensor identity_tensor_; - - /* - * initializers for this layer. - */ - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - core23::Tensor& get_bottom_tensor(bool is_train) { return this->input_tensors_[0]; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * initialize for cublasGemmEx - */ - void initialize() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - - /** - * This is the constructor of the Core23TempFullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. 
- * @param bottom_tensor: stores the tensor from bottom layer - * @param top_tensor: stores the tensor to top layer - * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - Core23TempFullyConnectedLayer( - const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - Core23TempFullyConnectedLayer(const Core23TempFullyConnectedLayer&) = delete; - Core23TempFullyConnectedLayer& operator=(const Core23TempFullyConnectedLayer&); -}; - } // namespace HugeCTR diff --git a/HugeCTR/include/layers/fused_fully_connected_layer.hpp b/HugeCTR/include/layers/fused_fully_connected_layer.hpp index 468d363e20..46979f487f 100644 --- a/HugeCTR/include/layers/fused_fully_connected_layer.hpp +++ b/HugeCTR/include/layers/fused_fully_connected_layer.hpp @@ -24,86 +24,12 @@ #include namespace HugeCTR { -/** - * @brief - * This class implements the fully connected layer. - */ -class FusedFullyConnectedLayer : public TrainableLayer<__half> { - // Optimized cublasGemmEx algorithm selection - cublasGemmAlgo_t falgo_k_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_k_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_x_{CUBLAS_GEMM_DEFAULT}; - - /* - * stores the references to the bottom tensors of this layer. - */ - Tensor2<__half> bottom_tensor_; - - /* - * stores the references to the top tensors of this layer. - */ - Tensor2<__half> top_tensor_; - - /* - * stores the references to the intermediate top tensors of this layer. - */ - Tensor2<__half> middle_tensor_; - - /* - * stores the references to the intermediate bias grad tensors of this layer. - */ - Tensor2 bias_grad_tensor_; - - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - Tensor2<__half>& get_bottom_tensor(bool is_train) { return bottom_tensor_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. 
- * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param bottom_tensor: stores the tensor from bottom layer - * @param top_tensor: stores the tensor to top layer - * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH - * (col-major) - */ - FusedFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - FusedFullyConnectedLayer(const FusedFullyConnectedLayer&) = delete; - FusedFullyConnectedLayer& operator=(const FusedFullyConnectedLayer&); -}; /** * @brief * This class implements the fully connected layer. */ -class Core23TempFusedFullyConnectedLayer : public Core23TempTrainableLayer<__half> { +class FusedFullyConnectedLayer : public TrainableLayer<__half> { // Optimized cublasGemmEx algorithm selection cublasGemmAlgo_t falgo_k_{CUBLAS_GEMM_DEFAULT}; cublasGemmAlgo_t balgo_k_{CUBLAS_GEMM_DEFAULT}; @@ -150,12 +76,12 @@ class Core23TempFusedFullyConnectedLayer : public Core23TempTrainableLayer<__hal * @param tensor_format: specifies the format of the weight tensor, either HW (row major) or WH * (col-major) */ - Core23TempFusedFullyConnectedLayer( + FusedFullyConnectedLayer( const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); - Core23TempFusedFullyConnectedLayer(const Core23TempFusedFullyConnectedLayer&) = delete; - Core23TempFusedFullyConnectedLayer& operator=(const Core23TempFusedFullyConnectedLayer&); + FusedFullyConnectedLayer(const FusedFullyConnectedLayer&) = delete; + FusedFullyConnectedLayer& operator=(const FusedFullyConnectedLayer&); }; } // namespace HugeCTR diff --git a/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp b/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp index 8aea519957..6013cc8bf0 100644 --- a/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp +++ b/HugeCTR/include/layers/fused_relu_bias_fully_connected_layer.hpp @@ -62,208 +62,6 @@ class FusedReluBiasFullyConnectedLayer : public TrainableLayer<__half> { // std::vector> master_weights_; It is inherited from Layer, and named as // weights_; - /* - * stores the weight tensors for compute of this layer. - */ - // std::vector> weights_; - Tensors2<__half> weights_half_; - - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2<__half> weights_grad_; - - /* - * stores the references to the bottom tensors of this layer. - */ - Tensor2<__half> train_in_tensor_; - Tensor2<__half> mask_in_tensor_; - Tensor2<__half> dRelu_in_tensor_; - Tensor2<__half> db_in_tensor_; - Tensor2 mask_in_tensor_temp_; - - /* - * stores the references to the top tensors of this layer. - */ - Tensor2<__half> train_out_tensor_; - Tensor2<__half> mask_out_tensor_; - Tensor2<__half> dRelu_out_tensor_; - Tensor2<__half> db_out_tensor_; - - /* - * stores the references to the output tensors of GEMM. - */ - Tensor2<__half> identity_tensor_; - - /* - * stores the references to the intermediate bias grad tensors of this layer. 
- */ - Tensor2 bias_grad_tensor_; - - void* bprop_fusion_; - - /* - * stores the position of this layer in the network - */ - FcPosition_t pos_; - - /* - * stores the activation function of this layer - */ - Activation_t act_; - - /* - * skip the computation of dgrad or not - */ - bool skip_dgrad_; - - /* - * indicates whether overlap dgrad and wgrad - */ - bool async_mlp_wgrad_; - - /* - * determines the kind of fusion pattern. - * There are two fuse patterns available: - * (fuse_wb_ == true) DGRAD + DReLU, WGRAD + BGRAD - * (fuse_wb_ == false) DGRAD + DReLU + BGRAD, WGRAD - */ - bool fuse_wb_; - - /* - * indicates whether there is mask in tensor for Head layer - */ - bool head_mask_in_; - - bool event_overlap_created_; - - cublasHandle_t cublas_handle_wgrad_; - - /* - * record the event when starting to compute wgrad - */ - cudaEvent_t event_overlap_; - - /* - * record the event when finishing computing wgrad (host, async) - */ - // cudaEvent_t event_overlap_end_; - - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - Tensor2<__half>& get_bottom_tensor_fprop(bool is_train) { return train_in_tensor_; } - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - /** - * backward pass - */ - void bprop() final; - /* - * algorithm search for cublasGemmEx - */ - void search_algorithm() final; - void initialize() final; - void initialize_dgrad(); - void initialize_wgrad(); - - /* - * Interfaces for unit tests to debug - */ - Tensors2<__half>& get_weights_half_tensor() { return weights_half_; } - Tensors2<__half>& get_weights_grad_tensor() { return weights_grad_; } - - /* - * return the cuda event recording the finish point of wgrad - */ - // cudaEvent_t& get_event_overlap_end() { return event_overlap_end_; } - - /** - * This is the constructor of the FullyConnectedLayer. - * It will check whether the format combination of all tensors is supported or not. - * Only two kinds of tensor formats are supported: - * (1) weight, input, output, wgrad are all in row-major. - * (2) weight, input, output, wgrad are all in column-major. - * @param weight_buff: stores the weight tensor - * @param wgrad_buff: stores the gradient values of the weight calculated in backward pass - * @param train_bottom_tensor_fprop: stores the tensor from bottom layer for forward propagation - * @param train_bottom_tensor_fprop: stores the tensor from bottom layer for forward propagation - * @param top_tensor_fprop: stores the tensor to top layer when forward propagation - * @param top_tensor_bprop: stores the tensor to top layer when backward propagation - * @param pos: stores the position of this layer: HEAD, BODY, TAIL, ISOLATED. 
- */ - FusedReluBiasFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& train_in_tensor, const Tensor2<__half>& mask_in_tensor, - const Tensor2<__half>& dRelu_in_tensor, const Tensor2<__half>& db_in_tensor, - const Tensor2<__half>& train_out_tensor, const Tensor2<__half>& mask_out_tensor, - const Tensor2<__half>& dRelu_out_tensor, Tensor2<__half>& db_out_tensor, - const std::shared_ptr& gpu_resource, const FcPosition_t& pos, - const Activation_t& act, const bool& skip_dgrad, - std::vector initializer_types = std::vector(), - const bool async_mlp_wgrad = false, const bool head_mask_in = false, - const bool fuse_wb = false); - FusedReluBiasFullyConnectedLayer(const FusedReluBiasFullyConnectedLayer&) = delete; - FusedReluBiasFullyConnectedLayer& operator=(const FusedReluBiasFullyConnectedLayer&); - - ~FusedReluBiasFullyConnectedLayer() { - try { - if (event_overlap_created_) { - CudaDeviceContext context(get_device_id()); - HCTR_LIB_THROW(cudaEventDestroy(event_overlap_)); - } - } catch (const std::exception& error) { - HCTR_LOG(INFO, WORLD, "FusedReluBiasFullyConnectedLayer Dtor error:%s", error.what()); - } - }; -}; - -/** - * @brief - * This class implements the fully connected layer. - */ -class Core23TempFusedReluBiasFullyConnectedLayer : public Core23TempTrainableLayer<__half> { - // Optimized cublasGemmEx algorithm selection - cublasLtMatmulAlgo_t falgo_k_; - cublasLtMatmulAlgo_t balgo_dRelu_; - cublasLtMatmulAlgo_t balgo_wgrad_; - cublasGemmAlgo_t balgo_k_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_x_{CUBLAS_GEMM_DEFAULT}; - cublasGemmAlgo_t balgo_b_{CUBLAS_GEMM_DEFAULT}; - - cublasLtMatrixLayout_t cublas_kernel_desc_ = NULL; - cublasLtMatrixLayout_t cublas_top_desc_ = NULL; - cublasLtMatrixLayout_t cublas_bottom_desc_ = NULL; - cublasLtMatrixLayout_t cublas_dRelu_top_desc_ = NULL; - cublasLtMatrixLayout_t cublas_dRelu_bottom_desc_ = NULL; - - cublasLtMatmulDesc_t cublas_op_desc_ = NULL; - cublasLtMatmulDesc_t cublas_op_desc_bprop_ = NULL; - cublasLtMatmulDesc_t cublas_op_desc_wgrad_ = NULL; - - cublasLtMatmulPreference_t cublas_preference_ = NULL; - cublasLtMatmulPreference_t cublas_preference_dRelu_ = NULL; - cublasLtMatmulPreference_t cublas_preference_wgrad_ = NULL; - size_t cublaslt_workspace_size_ = 1024 * 1024 * 32; - void* cublaslt_workspace_; - void* cublaslt_workspace_dRelu_; - void* cublaslt_workspace_wgrad_; - - /* - * stores the weight tensors for compute of this layer. - */ - // std::vector> master_weights_; It is inherited from Layer, and named as - // weights_; - /* * stores the weight tensors for compute of this layer. */ @@ -388,7 +186,7 @@ class Core23TempFusedReluBiasFullyConnectedLayer : public Core23TempTrainableLay * @param top_tensor_bprop: stores the tensor to top layer when backward propagation * @param pos: stores the position of this layer: HEAD, BODY, TAIL, ISOLATED. 
*/ - Core23TempFusedReluBiasFullyConnectedLayer( + FusedReluBiasFullyConnectedLayer( const core23::Tensor& train_in_tensor, const core23::Tensor& mask_in_tensor, const core23::Tensor& dRelu_in_tensor, const core23::Tensor& db_in_tensor, const core23::Tensor& train_out_tensor, const core23::Tensor& mask_out_tensor, @@ -398,20 +196,17 @@ class Core23TempFusedReluBiasFullyConnectedLayer : public Core23TempTrainableLay std::vector initializer_types = std::vector(), const bool async_mlp_wgrad = false, const bool head_mask_in = false, const bool fuse_wb = false); - Core23TempFusedReluBiasFullyConnectedLayer(const Core23TempFusedReluBiasFullyConnectedLayer&) = - delete; - Core23TempFusedReluBiasFullyConnectedLayer& operator=( - const Core23TempFusedReluBiasFullyConnectedLayer&); + FusedReluBiasFullyConnectedLayer(const FusedReluBiasFullyConnectedLayer&) = delete; + FusedReluBiasFullyConnectedLayer& operator=(const FusedReluBiasFullyConnectedLayer&); - ~Core23TempFusedReluBiasFullyConnectedLayer() { + ~FusedReluBiasFullyConnectedLayer() { try { if (event_overlap_created_) { CudaDeviceContext context(get_device_id()); HCTR_LIB_THROW(cudaEventDestroy(event_overlap_)); } } catch (const std::exception& error) { - HCTR_LOG(INFO, WORLD, "Core23TempFusedReluBiasFullyConnectedLayer Dtor error:%s", - error.what()); + HCTR_LOG(INFO, WORLD, "FusedReluBiasFullyConnectedLayer Dtor error:%s", error.what()); } }; }; diff --git a/HugeCTR/include/layers/gru_layer.hpp b/HugeCTR/include/layers/gru_layer.hpp index 06037d5058..007c12311b 100644 --- a/HugeCTR/include/layers/gru_layer.hpp +++ b/HugeCTR/include/layers/gru_layer.hpp @@ -25,26 +25,14 @@ namespace HugeCTR { * GRU function (Interest Extractor Layer) as a derived class of Layer */ template -class GRULayer : public Layer { +class GRULayer : public TrainableLayer { cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT}; - /* - * stores the weight gradient tensors of this layer. - */ - Tensors2 wgrad_; - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. 
- */ - Tensors2 out_tensors_; size_t workSpaceSize; size_t reserveSpaceSize; size_t inputTensorSize, outputTensorSize, hiddenTensorSize; - Tensors2 &get_in_tensors(bool is_train) { return in_tensors_; } + std::vector &get_in_tensors(bool is_train) { return this->input_tensors_; } public: /** @@ -64,78 +52,11 @@ class GRULayer : public Layer { * @param out_tensor the output tensor which has the same dim with in_tensor * @param device_id the id of GPU where this layer belongs */ - GRULayer(const std::shared_ptr> &weight_buff, - const std::shared_ptr> &wgrad_buff, const Tensor2 &in_tensor, - const Tensor2 &out_tensor, size_t hiddenSize, size_t batch_size, size_t SeqLength, - size_t embedding_vec_size, const std::shared_ptr &gpu_resource, + GRULayer(const core23::Tensor &in_tensor, const core23::Tensor &out_tensor, int64_t hiddenSize, + int64_t batch_size, int64_t SeqLength, int64_t embedding_vec_size, + const std::shared_ptr &gpu_resource, std::vector initializer_types = std::vector()); - private: - int *seqLengthArray = NULL; - int *devSeqLengthArray = NULL; - void *weightSpace = NULL; - void *dweightSpace = NULL; - void *workSpace = NULL; - void *reserveSpace = NULL; - void *hx = NULL; - - cudnnHandle_t cudnnHandle; - cudnnRNNDescriptor_t rnnDesc; - cudnnRNNDataDescriptor_t in_Desc; - cudnnRNNDataDescriptor_t out_Desc; - cudnnTensorDescriptor_t cDesc; - cudnnTensorDescriptor_t hDesc; - cudnnDropoutDescriptor_t dropoutDesc; - cudnnDataType_t data_type; - - int dimHidden[3]; - int strideHidden[3]; - unsigned long long seed; - size_t stateSize; - void *states; - float dropout = 0; - size_t weightSpaceSize; - size_t seqLength_, miniBatch, embedding_vec_size_, m = 512; - int hiddenSize_; // = 512; //half of the seqLength - int numLinearLayers; -}; - -/** - * GRU function (Interest Extractor Layer) as a derived class of Layer - */ -template -class Core23TempGRULayer : public Core23TempTrainableLayer { - cublasGemmAlgo_t falgo_{CUBLAS_GEMM_DEFAULT}; - - size_t workSpaceSize; - size_t reserveSpaceSize; - size_t inputTensorSize, outputTensorSize, hiddenTensorSize; - - std::vector &get_in_tensors(bool is_train) { return this->input_tensors_; } - - public: - /** - * A method of implementing the forward pass of GRU - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) final; - /** - * A method of implementing the backward pass of GRU - * @param stream CUDA stream where the backward propagation is executed - */ - void bprop() final; - - /** - * Ctor of Core23TempGRULayer. - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param device_id the id of GPU where this layer belongs - */ - Core23TempGRULayer(const core23::Tensor &in_tensor, const core23::Tensor &out_tensor, - int64_t hiddenSize, int64_t batch_size, int64_t SeqLength, - int64_t embedding_vec_size, const std::shared_ptr &gpu_resource, - std::vector initializer_types = std::vector()); - private: int *seqLengthArray = nullptr; int *devSeqLengthArray = nullptr; diff --git a/HugeCTR/include/layers/layer_norm_layer.hpp b/HugeCTR/include/layers/layer_norm_layer.hpp index 20955a1765..a9ca99408b 100644 --- a/HugeCTR/include/layers/layer_norm_layer.hpp +++ b/HugeCTR/include/layers/layer_norm_layer.hpp @@ -30,15 +30,6 @@ template class LayerNormLayer : public TrainableLayer { using Base = TrainableLayer; - /* - * stores the references to the input tensors of this layer. 
- */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - public: /** * LayerNorm parameters @@ -48,21 +39,14 @@ class LayerNormLayer : public TrainableLayer { }; /** * Ctor of LayerNormLayer. - * @param master_weight_buff master_weight buffer for mixed precision training - * @param weight_buff weight buffer for internal gamma/beta tensors - * @param wgrad_buff gradient buffer for internal gamma/beta tensors * @param in_tensor the input tensor * @param out_tensor the output tensor which has the same dim with in_tensor * @param params LayerNorm parameters * @param cudnn_handle cuDNN handle created externally * @param device_id the id of GPU where this layer belongs */ - LayerNormLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, + LayerNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, + const Params& params, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); /** @@ -77,66 +61,6 @@ class LayerNormLayer : public TrainableLayer { */ void bprop() override; - private: - /** - * A method of defining how gamma and beta are initialized. - * Gamma is initialized to 1s while Beta is 0ed. - * Override this function to change the initialization behavior. - */ - std::unique_ptr get_default_initializer(const int index) override; - const Params params_; - - // these four pointers are just for convenience - // they are deleted by Layer d'tor through the other pointer aliases: weight_ and wgrad_ - Tensor2 gamma_; - Tensor2 beta_; - Tensor2 gamma_grad_; - Tensor2 beta_grad_; - - // these tensors are internal only managed by smart ptrs - Tensor2 result_save_mean_; - Tensor2 result_save_var_; -}; - -/** - * LayerNorm layer - */ -template -class Core23TempLayerNormLayer : public Core23TempTrainableLayer { - using Base = Core23TempTrainableLayer; - - public: - /** - * LayerNorm parameters - */ - struct Params { - double eps; /**< small value to avoid divide-by-zero error*/ - }; - /** - * Ctor of Core23TempLayerNormLayer. - * @param in_tensor the input tensor - * @param out_tensor the output tensor which has the same dim with in_tensor - * @param params LayerNorm parameters - * @param cudnn_handle cuDNN handle created externally - * @param device_id the id of GPU where this layer belongs - */ - Core23TempLayerNormLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - - /** - * A method of implementing the forward pass of LayerNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - - /** - * A method of implementing the forward pass of LayerNorm - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - private: /** * A method of defining how gamma and beta are initialized. 
diff --git a/HugeCTR/include/layers/mlp_layer.hpp b/HugeCTR/include/layers/mlp_layer.hpp index bce5bc20ec..a591714f32 100644 --- a/HugeCTR/include/layers/mlp_layer.hpp +++ b/HugeCTR/include/layers/mlp_layer.hpp @@ -28,15 +28,13 @@ namespace HugeCTR { template class MLPLayer : public TrainableLayer { - Tensors2 bottom_tensors_; - Tensors2 top_tensors_; + std::vector train_tensors_, mask_tensors_, dact_tensors_, db_tensors_; - Tensors2 train_tensors_, mask_tensors_, dact_tensors_, db_tensors_; + std::vector kernels_; + std::vector biases_; + std::vector kernels_grad_; - Tensors2 kernels_; - Tensors2 biases_; - Tensors2 kernels_grad_; - std::vector num_outputs_; + std::vector num_outputs_; std::vector acts_; std::vector output_mask_; @@ -59,13 +57,10 @@ class MLPLayer : public TrainableLayer { std::unique_ptr get_default_initializer(const int index) override; public: - MLPLayer(const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensors2& bottom_tensors, const Tensors2& top_tensors, - const std::vector& num_outputs, const std::shared_ptr& gpu_resource, - const std::vector& acts, const std::vector& use_bias, + MLPLayer(const std::vector& bottom_tensors, + const std::vector& top_tensors, const std::vector& num_outputs, + const std::shared_ptr& gpu_resource, const std::vector& acts, + const std::vector& use_bias, std::vector initializer_types = std::vector(), bool skip_head_dgrad = false, bool async_wgrad = false, bool fuse_wb = false, bool enable_tf32_compute = false); @@ -81,76 +76,6 @@ class MLPLayer : public TrainableLayer { void initialize() final; - /* - * Interfaces for unit tests to debug - */ - Tensor2& get_kernel(int index) { return kernels_[index]; } - Tensor2& get_bias(int index) { return biases_[index]; } - Tensor2& get_kernel_grad(int index) { return kernels_grad_[index]; } - Tensor2& get_bias_grad(int index) { return db_tensors_[index]; } - Tensors2& get_inner_tensors() { return train_tensors_; } - Tensors2& get_input_tensors() { return bottom_tensors_; } - Tensors2& get_output_tensors() { return top_tensors_; } - - ~MLPLayer() { - CudaDeviceContext context(this->get_device_id()); - if (event_overlap_created_) { - cudaEventDestroy(event_overlap_); - } - }; -}; - -template -class Core23TempMLPLayer : public Core23TempTrainableLayer { - std::vector train_tensors_, mask_tensors_, dact_tensors_, db_tensors_; - - std::vector kernels_; - std::vector biases_; - std::vector kernels_grad_; - - std::vector num_outputs_; - std::vector acts_; - - std::vector output_mask_; - std::vector use_bias_; - - bool async_wgrad_; - bool fuse_wb_; - bool enable_tf32_compute_; - bool skip_head_dgrad_; - - bool event_overlap_created_; - cudaEvent_t event_overlap_; - std::vector> layer_desc_; - std::vector> layer_algo_; - FusedFCLayerFunctors layer_functors_; - - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - public: - Core23TempMLPLayer(const std::vector& bottom_tensors, - const std::vector& top_tensors, - const std::vector& num_outputs, - const std::shared_ptr& gpu_resource, - const std::vector& acts, const std::vector& use_bias, - std::vector initializer_types = std::vector(), - bool skip_head_dgrad = false, bool async_wgrad = false, bool 
fuse_wb = false, - bool enable_tf32_compute = false); - - Core23TempMLPLayer(const Core23TempMLPLayer& C) = delete; - Core23TempMLPLayer& operator=(const Core23TempMLPLayer&); - - void fprop(bool is_train) final; - - void bprop() final; - - void search_algorithm() final; - - void initialize() final; - /* * Interfaces for unit tests to debug */ @@ -162,7 +87,7 @@ class Core23TempMLPLayer : public Core23TempTrainableLayer { auto& get_input_tensors() { return this->input_tensors_; } auto& get_output_tensors() { return this->output_tensors_; } - ~Core23TempMLPLayer() { + ~MLPLayer() { CudaDeviceContext context(this->get_device_id()); if (event_overlap_created_) { cudaEventDestroy(event_overlap_); diff --git a/HugeCTR/include/layers/multi_cross_layer.hpp b/HugeCTR/include/layers/multi_cross_layer.hpp index ca03ef814a..8902b18a2c 100644 --- a/HugeCTR/include/layers/multi_cross_layer.hpp +++ b/HugeCTR/include/layers/multi_cross_layer.hpp @@ -28,162 +28,6 @@ struct MultiCrossForwardFunctor { MultiCrossForwardFunctor(const MultiCrossForwardFunctor&) = delete; MultiCrossForwardFunctor& operator=(const MultiCrossForwardFunctor&) = delete; - void operator()(cudaStream_t stream, cublasHandle_t cublas_handle, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& bias_tensors, - Tensors2& layer_output_tensors, Tensors2& layer_hidden_tensors, - int num_layers) const; -}; -template -struct MultiCrossForwardFunctorv2 { - GemmFunctor gemm_functor_; - MultiCrossForwardFunctorv2() = default; - MultiCrossForwardFunctorv2(const MultiCrossForwardFunctorv2&) = delete; - MultiCrossForwardFunctorv2& operator=(const MultiCrossForwardFunctorv2&) = delete; - void search_algorithm(T* bottom, T* top, T* kernel, size_t batch_size, size_t input_size, - size_t output_size, const CublasFusedFCLayerDesc& cublas_layer_desc, - cublasLtHandle_t cublaslt_handle, cudaStream_t stream); - void operator()(cudaStream_t stream, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& bias_tensors, - Tensors2& XU_tensors, Tensors2& layer_output_tensors, - Tensors2& layer_hidden_tensors, int num_layers, - const std::vector>& xu_descr_, - const std::vector>& xuvb_descr_, - const std::vector>& xu_fprop_algo_, - const std::vector>& xuvb_fprop_algo_, cublasLtHandle_t = nullptr); -}; - -template -struct MultiCrossBackwardFunctorv2 { - GemmFunctor gemm_functor_; - MultiCrossBackwardFunctorv2() = default; - MultiCrossBackwardFunctorv2(const MultiCrossBackwardFunctorv2&) = default; - MultiCrossBackwardFunctorv2& operator=(const MultiCrossBackwardFunctorv2&) = delete; - - void operator()(cudaStream_t dgrad_stream, cudaStream_t wgrad_stream, bool async_wgrad, - cudaEvent_t& event_overlap, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& act_tensors, - const Tensors2& layer_hidden_tensors, Tensors2& kernel_output_tensors, - Tensors2& grad_tensors, Tensors2& bias_output_tensors, - Tensors2& XU_tensors, Tensor2 accum_dx_tensor_, Tensors2 bprop_bottoms, - int num_layers, const std::vector>& xu_descr_, - const std::vector>& xuvb_descr_, - const std::vector>& du_descrs_bprop_, - const std::vector>& dhidden_descrs_bprop_, - const std::vector>& xu_bprop_algo_, - const std::vector>& xuvb_bprop_algo_, - const std::vector>& du_bprop_algos_, - const std::vector>& dhidden_bprop_algos_, - cublasLtHandle_t cublaslt_handle = nullptr); -}; - -template -struct MultiCrossBackwardFunctor { - MultiCrossBackwardFunctor() = default; - MultiCrossBackwardFunctor(const MultiCrossBackwardFunctor&) 
= default; - MultiCrossBackwardFunctor& operator=(const MultiCrossBackwardFunctor&) = delete; - - void operator()(cudaStream_t stream, const Tensor2& input_tensor, - const Tensors2& kernel_tensors, const Tensors2& layer_output_tensors, - const Tensors2& layer_hidden_tensors, const Tensor2& grad_tensor, - Tensor2& output_tensor, Tensors2& kernel_output_tensors, - Tensors2& bias_output_tensors, Tensor2& tmp_vec_tensor, - Tensor2 tmp_mat_tensors[], int num_layers) const; -}; - -template -class MultiCrossLayer : public TrainableLayer { - private: - const int num_layers_; - const size_t projection_dim_; - Tensors2 dgrads_; /**< vector of internal blobs' tensors, intermediate dgrad of each - interaction layer: T_4 */ - Tensors2 activation_tensors_; /**< vector of internal blobs' tensors, intermediate output of - each interaction layer: T_4 */ - Tensors2 hidden_tensors_; // DCNv1: x_i * w ; DCNv2: x * x_i * w + b; T_7 - Tensors2 XU_tensors_; // DCNv2: - - Tensor2 tmp_mat_tensors_[4]; //[h,w] - - Tensor2 accum_dx_tensor_; - Tensors2 bprop_bottom_; - Tensor2 tmp_vec_tensor_; //[h,1] - - /* - * stores the references to the input tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - - std::vector> xu_descrs_fprop_; - std::vector> xuvb_descrs_fprop_; - std::vector> xu_descrs_bprop_; - std::vector> xuvb_descrs_bprop_; - std::vector> du_descrs_bprop_; - std::vector> dhidden_descrs_bprop_; - - std::vector> xu_fprop_algos_; - std::vector> xuvb_fprop_algos_; - std::vector> xu_bprop_algos_; - std::vector> xuvb_bprop_algos_; - std::vector> du_bprop_algos_; - std::vector> dhidden_bprop_algos_; - - bool enable_tf32_compute_; - bool async_wgrad_ = false; - - MultiCrossForwardFunctorv2 dcnv2_forward_functor_; - MultiCrossBackwardFunctorv2 dcnv2_backward_functor_; - - cudaStream_t wgrad_stream_; - cudaEvent_t event_fork_; - - public: - /** - * forward pass - */ - void fprop(bool is_train) final; - Tensors2& get_hidden_tensors() { return hidden_tensors_; }; - Tensors2& get_weight_tensor() { return XU_tensors_; }; - /** - * backward pass - */ - void search_algorithm() override; - void bprop() final; - void initialize() override; - MultiCrossLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, - const std::shared_ptr& gpu_resource, int num_layers, - size_t projection_dim = 0, - std::vector initializer_types = std::vector(), - bool enable_tf32_compute = false, bool async_wgrad = false); - MultiCrossLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blobs_buff, - const Tensors2& in_tensor, const Tensors2& out_tensor, - const std::shared_ptr& gpu_resource, int num_layers, - size_t projection_dim = 0, - std::vector initializer_types = std::vector(), - bool enable_tf32_compute = false, bool async_wgrad = false); - MultiCrossLayer(const MultiCrossLayer&) = delete; - MultiCrossLayer& operator=(const MultiCrossLayer&) = delete; - - private: - std::unique_ptr get_default_initializer(const int index) override; -}; - -template -struct Core23TempMultiCrossForwardFunctor { - Core23TempMultiCrossForwardFunctor() = default; - Core23TempMultiCrossForwardFunctor(const Core23TempMultiCrossForwardFunctor&) = delete; - Core23TempMultiCrossForwardFunctor& operator=(const 
Core23TempMultiCrossForwardFunctor&) = delete; - void operator()(cudaStream_t stream, cublasHandle_t cublas_handle, const core23::Tensor& input_tensor, const std::vector& kernel_tensors, @@ -192,12 +36,11 @@ struct Core23TempMultiCrossForwardFunctor { std::vector& layer_hidden_tensors, int num_layers) const; }; template -struct Core23TempMultiCrossForwardFunctorv2 { +struct MultiCrossForwardFunctorv2 { GemmFunctor gemm_functor_; - Core23TempMultiCrossForwardFunctorv2() = default; - Core23TempMultiCrossForwardFunctorv2(const Core23TempMultiCrossForwardFunctorv2&) = delete; - Core23TempMultiCrossForwardFunctorv2& operator=(const Core23TempMultiCrossForwardFunctorv2&) = - delete; + MultiCrossForwardFunctorv2() = default; + MultiCrossForwardFunctorv2(const MultiCrossForwardFunctorv2&) = delete; + MultiCrossForwardFunctorv2& operator=(const MultiCrossForwardFunctorv2&) = delete; void search_algorithm(T* bottom, T* top, T* kernel, int64_t batch_size, int64_t input_size, int64_t output_size, const CublasFusedFCLayerDesc& cublas_layer_desc, cublasLtHandle_t cublaslt_handle, cudaStream_t stream); @@ -214,13 +57,12 @@ struct Core23TempMultiCrossForwardFunctorv2 { }; template -struct Core23TempMultiCrossBackwardFunctorv2 { +struct MultiCrossBackwardFunctorv2 { GemmFunctor gemm_functor_; - Core23TempMultiCrossBackwardFunctorv2() = default; - Core23TempMultiCrossBackwardFunctorv2(const Core23TempMultiCrossBackwardFunctorv2&) = delete; - Core23TempMultiCrossBackwardFunctorv2& operator=(const Core23TempMultiCrossBackwardFunctorv2&) = - delete; + MultiCrossBackwardFunctorv2() = default; + MultiCrossBackwardFunctorv2(const MultiCrossBackwardFunctorv2&) = delete; + MultiCrossBackwardFunctorv2& operator=(const MultiCrossBackwardFunctorv2&) = delete; void operator()(cudaStream_t dgrad_stream, cudaStream_t wgrad_stream, bool async_wgrad, cudaEvent_t& event_overlap, const core23::Tensor& input_tensor, const std::vector& kernel_tensors, @@ -243,11 +85,10 @@ struct Core23TempMultiCrossBackwardFunctorv2 { }; template -struct Core23TempMultiCrossBackwardFunctor { - Core23TempMultiCrossBackwardFunctor() = default; - Core23TempMultiCrossBackwardFunctor(const Core23TempMultiCrossBackwardFunctor&) = delete; - Core23TempMultiCrossBackwardFunctor& operator=(const Core23TempMultiCrossBackwardFunctor&) = - delete; +struct MultiCrossBackwardFunctor { + MultiCrossBackwardFunctor() = default; + MultiCrossBackwardFunctor(const MultiCrossBackwardFunctor&) = delete; + MultiCrossBackwardFunctor& operator=(const MultiCrossBackwardFunctor&) = delete; void operator()(cudaStream_t stream, const core23::Tensor& input_tensor, const std::vector& kernel_tensors, @@ -260,7 +101,7 @@ struct Core23TempMultiCrossBackwardFunctor { }; template -class Core23TempMultiCrossLayer : public Core23TempTrainableLayer { +class MultiCrossLayer : public TrainableLayer { private: const int num_layers_; const int64_t projection_dim_; @@ -300,8 +141,8 @@ class Core23TempMultiCrossLayer : public Core23TempTrainableLayer { std::vector> du_bprop_algos_; std::vector> dhidden_bprop_algos_; - Core23TempMultiCrossForwardFunctorv2 dcnv2_forward_functor_; - Core23TempMultiCrossBackwardFunctorv2 dcnv2_backward_functor_; + MultiCrossForwardFunctorv2 dcnv2_forward_functor_; + MultiCrossBackwardFunctorv2 dcnv2_backward_functor_; bool enable_tf32_compute_; bool async_wgrad_ = false; cudaStream_t wgrad_stream_; @@ -320,13 +161,14 @@ class Core23TempMultiCrossLayer : public Core23TempTrainableLayer { void search_algorithm() override; void bprop() final; void initialize() 
override; - Core23TempMultiCrossLayer( - const std::vector& in_tensors, const std::vector& out_tensors, - const std::shared_ptr& gpu_resource, int num_layers, int64_t projection_dim, - std::vector initializer_types = std::vector(), - bool enable_tf32_compute = false, bool async_wgrad = false); - Core23TempMultiCrossLayer(const Core23TempMultiCrossLayer&) = delete; - Core23TempMultiCrossLayer& operator=(const Core23TempMultiCrossLayer&) = delete; + MultiCrossLayer(const std::vector& in_tensors, + const std::vector& out_tensors, + const std::shared_ptr& gpu_resource, int num_layers, + int64_t projection_dim, + std::vector initializer_types = std::vector(), + bool enable_tf32_compute = false, bool async_wgrad = false); + MultiCrossLayer(const MultiCrossLayer&) = delete; + MultiCrossLayer& operator=(const MultiCrossLayer&) = delete; private: std::unique_ptr get_default_initializer(const int index) override; diff --git a/HugeCTR/include/layers/weight_multiply_layer.hpp b/HugeCTR/include/layers/weight_multiply_layer.hpp index 5eca5c7677..9b668030c2 100644 --- a/HugeCTR/include/layers/weight_multiply_layer.hpp +++ b/HugeCTR/include/layers/weight_multiply_layer.hpp @@ -31,15 +31,6 @@ namespace HugeCTR { */ template class WeightMultiplyLayer : public TrainableLayer { - /* - * stores the weight tensors of this layer. - */ - Tensors2 in_tensors_; - /* - * stores the references to the output tensors of this layer. - */ - Tensors2 out_tensors_; - public: /** * Ctor of WeightMultiplyLayer. @@ -47,12 +38,8 @@ class WeightMultiplyLayer : public TrainableLayer { * @param out_tensor the resulting output tensor * @param device_id the id of GPU where this layer belongs */ - WeightMultiplyLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, Tensor2& out_tensor, - const std::vector& weight_dims, + WeightMultiplyLayer(const core23::Tensor& in_tensor, core23::Tensor& out_tensor, + const core23::Shape& weight_dims, const std::shared_ptr& gpu_resource, std::vector initializer_types = std::vector()); @@ -69,57 +56,6 @@ class WeightMultiplyLayer : public TrainableLayer { */ void bprop() override; - private: - // void reserve_master_weight_tensor(const std::shared_ptr>& - // master_weight_buff, - // const std::vector& weight_dims); - std::unique_ptr get_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_uniform_initializer(const int index) override; - std::unique_ptr get_xavier_norm_initializer(const int index) override; - std::unique_ptr get_default_initializer(const int index) override; - - size_t batch_size_; - size_t slot_num_; - size_t embedding_vec_size_; - Tensor2 wgrad_tmp_trans_; -}; - -/** - * Layer which does element-wise product by input tensor X and weight W. - * The input tensor X has dimension: [batch_size, slot_num], while - * the input weight W has dimension: [slot_num, embedding_vec_size]. - * The Core23TempWeightMultiplyLayer will broadcast the value of W to "batch_size" dim - * and broadcast the value of X to embedding_vec_size dim automatically - * when doing element-wise product with X. So, the output tensor has - * the dimension: [batch_size, slot_num*embedding_vec_size]. - */ -template -class Core23TempWeightMultiplyLayer : public Core23TempTrainableLayer { - public: - /** - * Ctor of Core23TempWeightMultiplyLayer. 
- * @param in_tensor the input tensor - * @param out_tensor the resulting output tensor - * @param device_id the id of GPU where this layer belongs - */ - Core23TempWeightMultiplyLayer( - const core23::Tensor& in_tensor, core23::Tensor& out_tensor, const core23::Shape& weight_dims, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()); - - ~Core23TempWeightMultiplyLayer() override{}; - - /** - * Core23TempWeightMultiplyLayer's forward propagation to do element-wise production - * @param stream CUDA stream where the forward propagation is executed - */ - void fprop(bool is_train) override; - /** - * Core23TempWeightMultiplyLayer's backward propagation - * @param stream CUDA stream where the forward propagation is executed - */ - void bprop() override; - private: // void reserve_master_weight_tensor(const std::shared_ptr>& // master_weight_buff, diff --git a/HugeCTR/include/network_helpers.hpp b/HugeCTR/include/network_helpers.hpp index fc2483758a..1d198e979d 100644 --- a/HugeCTR/include/network_helpers.hpp +++ b/HugeCTR/include/network_helpers.hpp @@ -44,9 +44,9 @@ std::vector get_trainable_tensor_vector( } }; for (auto& layer : layers) { - auto trainable_layer = dynamic_cast*>(layer.get()); + auto trainable_layer = dynamic_cast*>(layer.get()); if (!op(trainable_layer)) { - auto trainable_layer = dynamic_cast*>(layer.get()); + auto trainable_layer = dynamic_cast*>(layer.get()); op(trainable_layer); } } diff --git a/HugeCTR/include/pybind/model.hpp b/HugeCTR/include/pybind/model.hpp index 7a000d4e14..71f478ea9b 100644 --- a/HugeCTR/include/pybind/model.hpp +++ b/HugeCTR/include/pybind/model.hpp @@ -43,7 +43,7 @@ namespace HugeCTR { -class Core23TempNetwork; +class Network; namespace { diff --git a/HugeCTR/include/trainable_layer.hpp b/HugeCTR/include/trainable_layer.hpp index 19cc165eb4..2ed1b701bf 100644 --- a/HugeCTR/include/trainable_layer.hpp +++ b/HugeCTR/include/trainable_layer.hpp @@ -22,185 +22,6 @@ #include namespace HugeCTR { -/** - * @brief - * Trainable layer is the common parent of all layers with weights - * @tparams DType the data type of inputs, outputs, and weights - * @tparams use_FP32_weight if specified, the weight data type is in FP32, not DType - */ -template ::value> -class TrainableLayer : public Layer { - // FP32 input/output but lower precision weight don't make much sense. - static_assert(!(std::is_same::value && use_FP32_weight == false)); - - protected: - // Why WeightType is protected? - // it is convenient for a child trainable to access the weight type, - // especially if it wants to use FP32 weights but inputs/outputs the lower precision data. - // A typical example is when DType is __half but use_FP32_weight is true. - // Then, the child class should define the following alias to make their code cleaner: - // (1) using Base = TrainableLayer; - // (2) using WeightType = typename Base::WeightType; - // If useFP32_weight is false, the aliases are not necessary. - using WeightType = typename std::conditional::type; - - private: - Tensors2 master_weights_; - Tensors2 weights_; - Tensors2 wgrads_; - const std::shared_ptr> master_weight_buff_; - const std::shared_ptr> weight_buff_; - const std::shared_ptr> wgrad_buff_; - // Layer initializers. - // if the layer need a specific weight initialization, override each function accordingly. 
- virtual std::unique_ptr get_zero_initializer(const int index) override { - return std::make_unique(0.0f); - } - virtual std::unique_ptr get_uniform_initializer(const int index) override { - return std::move(get_default_initializer(index)); - }; - virtual std::unique_ptr get_xavier_uniform_initializer(const int index) override { - return std::move(get_default_initializer(index)); - }; - virtual std::unique_ptr get_xavier_norm_initializer(const int index) override { - return std::move(get_default_initializer(index)); - }; - virtual std::unique_ptr get_default_initializer(const int index) override { - return std::move(get_zero_initializer(index)); - }; - - protected: - // @brief a modifier to reserve a weight tensor at idx with the specified dims. - // @details - // Usage: In a child class, this->set_weight(0, dims); - void set_weight(size_t idx, const std::vector& dimensions) { - HCTR_CHECK_HINT(weights_.size() == idx, "Wrong index for setting weight tensors"); - - Tensor2 tensor; - weight_buff_->reserve(dimensions, &tensor); - weights_.push_back(tensor); - - // master weights are used only when compute weights have lower precision - if constexpr (!use_FP32_weight) { - HCTR_CHECK_HINT(master_weights_.size() == idx, - "Wrong index for setting master weight tensors"); - - Tensor2 tensor; - master_weight_buff_->reserve(dimensions, &tensor); - master_weights_.push_back(tensor); - } - } - // @brief a modifier to reserve a weight tensor at idx with the specified dims. - // @details - // Usage: In a child class, this->set_wgrad(0, dims); - void set_wgrad(size_t idx, const std::vector& dimensions) { - HCTR_CHECK_HINT(wgrads_.size() == idx, "Wrong index for setting weight gradient tensors"); - - Tensor2 tensor; - wgrad_buff_->reserve(dimensions, &tensor); - wgrads_.push_back(tensor); - } - // @brief an accessor to get a weight tensor at idx - // @details - // Usage: In a child class, auto weight2 = this->get_weight(2); - auto& get_weight(size_t idx) { - HCTR_CHECK_HINT(idx < weights_.size(), "Wrong index for getting weight tensors"); - return weights_[idx]; - } - // @brief an accessor to get a wgrad tensor at idx - // @details - // Usage: In a child class, auto wgrad2 = this->get_wgrad(2); - auto& get_wgrad(size_t idx) { - HCTR_CHECK_HINT(idx < wgrads_.size(), "Wrong index for getting weight gradient tensors"); - return wgrads_[idx]; - } - - public: - // @brief a parameter initialization function - // @details - // init_params calls the specific initializers to initialize parameters. The types of initializers - // are specified by initializer_types_. - void init_params(const curandGenerator_t& generator) override; - - /** - * Ctor of TrainableLayer. - * @param master_weight_buff the buffer to reserve master weight tensors, used only if WeightType - * is not FP32. - * @param weight_buff the buffer to reserve weight tensors - * @param wgrad_buff the buffer to reserve weight gradient tensors - * @param gpu_resource the abstraction of GPU where this dense layer resides - * @param initializer_types the list of initializer types of all weight tensors - */ - TrainableLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()) - : Layer(gpu_resource, initializer_types), - // if WeightType is float, master weights are not used at all - master_weight_buff_(std::is_same::value ? 
nullptr : master_weight_buff), - weight_buff_(weight_buff), - wgrad_buff_(wgrad_buff) {} -}; - -template -void TrainableLayer::init_params(const curandGenerator_t& generator) { - std::shared_ptr> buff = - GeneralBuffer2::create(); - std::shared_ptr> block = buff->create_block(); - - Tensors2 weights = master_weights_; - if constexpr (std::is_same::value && use_FP32_weight) { - weights = weights_; - } - - Tensors2 weight_cpu_tensors; - for (const Tensor2& weight : weights) { - Tensor2 tensor; - block->reserve(weight.get_dimensions(), &tensor); - weight_cpu_tensors.push_back(tensor); - } - - buff->allocate(); - - std::vector> simulators; - // each weight has its own initializer - for (int index = 0; index < static_cast(weights.size()); ++index) { - switch (initializer_types_[index % initializer_types_.size()]) { - case Initializer_t::Uniform: { - simulators.push_back(get_uniform_initializer(index)); - break; - } - case Initializer_t::XavierNorm: { - simulators.push_back(get_xavier_norm_initializer(index)); - break; - } - case Initializer_t::XavierUniform: { - simulators.push_back(get_xavier_uniform_initializer(index)); - break; - } - case Initializer_t::Zero: { - simulators.push_back(get_zero_initializer(index)); - break; - } - case Initializer_t::Default: { - simulators.push_back(get_default_initializer(index)); - break; - } - default: { - HCTR_OWN_THROW(Error_t::OutOfBound, "Not supported initializer."); - break; - } - } - } - - for (size_t i = 0; i < weights.size(); ++i) { - simulators[i]->fill(weight_cpu_tensors[i], generator); - HCTR_LIB_THROW(cudaMemcpyAsync(weights[i].get_ptr(), weight_cpu_tensors[i].get_ptr(), - weights[i].get_size_in_bytes(), cudaMemcpyHostToDevice, - get_gpu().get_stream())); - } -} /** * @brief @@ -209,7 +30,7 @@ void TrainableLayer::init_params(const curandGenerator_t * @tparams use_FP32_weight if specified, the weight data type is in FP32, not DType */ template ::value> -class Core23TempTrainableLayer : public Layer { +class TrainableLayer : public Layer { // FP32 input/output but lower precision weight don't make much sense. 
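[Annotation, not part of the patch] The static_assert immediately below rejects the one combination the comment above calls out: FP32 inputs/outputs with lower-precision weights. Assuming the template default elided above is use_FP32_weight = std::is_same<DType, float>::value (as in the legacy class removed earlier in this hunk), the remaining combinations read roughly as follows; this is an illustrative sketch, not code from the header:

using FloatLayer = TrainableLayer<float>;         // float I/O and weights; no master weights kept
using HalfLayer  = TrainableLayer<__half>;        // __half I/O and weights; float master weights kept
using MixedLayer = TrainableLayer<__half, true>;  // __half I/O, but weights stored in float
// TrainableLayer<float, false> would trip the static_assert below.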
static_assert(!(std::is_same::value && use_FP32_weight == false)); @@ -310,11 +131,10 @@ class Core23TempTrainableLayer : public Layer { * @param gpu_resource the abstraction of GPU where this dense layer resides * @param initializer_types the list of initializer types of all weight tensors */ - Core23TempTrainableLayer( - const std::vector& input_tensors, - const std::vector& output_tensors, - const std::shared_ptr& gpu_resource, - std::vector initializer_types = std::vector()) + TrainableLayer(const std::vector& input_tensors, + const std::vector& output_tensors, + const std::shared_ptr& gpu_resource, + std::vector initializer_types = std::vector()) : Layer(input_tensors, output_tensors, gpu_resource, initializer_types), master_weights_params_(core23::TensorParams() .alignment(sizeof(float)) @@ -339,8 +159,7 @@ class Core23TempTrainableLayer : public Layer { }; template -void Core23TempTrainableLayer::init_params( - const curandGenerator_t& generator) { +void TrainableLayer::init_params(const curandGenerator_t& generator) { std::vector weights = master_weights_; if constexpr (std::is_same::value && use_FP32_weight) { weights = weights_; diff --git a/HugeCTR/src/layers/batch_norm_layer.cu b/HugeCTR/src/layers/batch_norm_layer.cu index ceb3ef107c..9c18b77182 100644 --- a/HugeCTR/src/layers/batch_norm_layer.cu +++ b/HugeCTR/src/layers/batch_norm_layer.cu @@ -29,206 +29,10 @@ using ToStringType = typename std::conditional::value, f } template -BatchNormLayer::BatchNormLayer(const std::shared_ptr>& master_weight_buff, - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const std::shared_ptr>& blob_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, +BatchNormLayer::BatchNormLayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Base(master_weight_buff, weight_buff, wgrad_buff, gpu_resource, initializer_types), - params_(params), - mode_(CUDNN_BATCHNORM_PER_ACTIVATION) { - CudaDeviceContext context(this->get_device_id()); - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - assert(get_size_from_dims(in_tensor_dim) == get_size_from_dims(out_tensor_dim)); - assert(in_tensor_dim.size() == 2 && out_tensor_dim.size() == 2); - assert(in_tensor_dim[0] == out_tensor_dim[0]); - assert(in_tensor_dim[1] == out_tensor_dim[1]); - - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&in_out_desc_)); - - size_t num_feature = in_tensor_dim[1]; - int batch_size = in_tensor_dim[0]; - - cudnnDataType_t data_type = std::is_same::value ? 
CUDNN_DATA_HALF : CUDNN_DATA_FLOAT; - int n_stride = num_feature; - int w_stride = 1; - - HCTR_LIB_THROW(cudnnSetTensor4dDescriptorEx(in_out_desc_, data_type, batch_size, 1, 1, - num_feature, n_stride, 1, 1, w_stride)); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&gamma_beta_desc_)); - - HCTR_LIB_THROW(cudnnDeriveBNTensorDescriptor(gamma_beta_desc_, in_out_desc_, mode_)); - - std::vector gamma_dim = {num_feature, 1}; - - // gamma & beta - this->set_weight(0, gamma_dim); - this->set_weight(1, gamma_dim); - - gamma_ = this->get_weight(0); - beta_ = this->get_weight(1); - // gamma grad & beta grad - this->set_wgrad(0, gamma_dim); - this->set_wgrad(1, gamma_dim); - gamma_grad_ = this->get_wgrad(0); - beta_grad_ = this->get_wgrad(1); - - // result running mean & var - blob_buff->reserve(gamma_dim, &result_running_mean_); - blob_buff->reserve(gamma_dim, &result_running_var_); - - // save running mean & var (cache) - blob_buff->reserve(gamma_dim, &result_save_mean_); - blob_buff->reserve(gamma_dim, &result_save_inv_var_); -} - -template -BatchNormLayer::~BatchNormLayer() { - try { - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(in_out_desc_)); - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(gamma_beta_desc_)); - } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - } -} - -template -void BatchNormLayer::initialize() { - // host array to get running mean & var - - size_t num_feature = in_tensors_[0].get_dimensions()[1]; - - std::shared_ptr> internal_host_buf = - GeneralBuffer2::create(); - - internal_host_buf->reserve({num_feature}, &h_result_running_mean_); - internal_host_buf->reserve({num_feature}, &h_result_running_var_); - - internal_host_buf->allocate(); -} - -template -void BatchNormLayer::fprop(bool is_train) { - CudaDeviceContext context(this->get_device_id()); - float one = 1.0f, zero = 0.0f; - - Tensor2& in_tensor = in_tensors_[0]; - Tensor2& out_tensor = out_tensors_[0]; - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); - - float* gamma = gamma_.get_ptr(); - float* beta = beta_.get_ptr(); - - float* result_running_mean = result_running_mean_.get_ptr(); - float* result_running_var = result_running_var_.get_ptr(); - float* result_save_mean = result_save_mean_.get_ptr(); - float* result_save_inv_var = result_save_inv_var_.get_ptr(); - - if (is_train) { - HCTR_LIB_THROW(cudnnBatchNormalizationForwardTraining( - this->get_gpu().get_cudnn_handle(), mode_, &one, &zero, in_out_desc_, in, in_out_desc_, out, - gamma_beta_desc_, gamma, beta, params_.factor, result_running_mean, result_running_var, - params_.eps, result_save_mean, result_save_inv_var)); - } else { - HCTR_LIB_THROW(cudnnBatchNormalizationForwardInference( - this->get_gpu().get_cudnn_handle(), mode_, &one, &zero, in_out_desc_, in, in_out_desc_, out, - gamma_beta_desc_, gamma, beta, result_running_mean, result_running_var, params_.eps)); - } -} - -template -void BatchNormLayer::bprop() { - CudaDeviceContext context(this->get_device_id()); - - float one = 1.0f, zero = 0.0f; - - Tensor2& in_tensor = in_tensors_[0]; - Tensor2& out_tensor = out_tensors_[0]; - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); - - float* gamma = gamma_.get_ptr(); - - float* gamma_grad = gamma_grad_.get_ptr(); - float* beta_grad = beta_grad_.get_ptr(); - - float* result_save_mean = result_save_mean_.get_ptr(); - float* result_save_inv_var = result_save_inv_var_.get_ptr(); - - 
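[Annotation, not part of the patch] Both the removed constructor above and the retained core23-based one later in this file describe the 2-D activation [batch_size, num_feature] to cuDNN as a 4-D NCHW tensor with C = H = 1; that is what makes CUDNN_BATCHNORM_PER_ACTIVATION normalize each feature independently and gives gamma/beta a length of num_feature. A standalone sketch of that descriptor setup with made-up sizes (error checking omitted):

#include <cudnn.h>

void describe_batchnorm_tensors(int batch_size, int num_feature) {
  cudnnTensorDescriptor_t io_desc, scale_bias_desc;
  cudnnCreateTensorDescriptor(&io_desc);
  cudnnCreateTensorDescriptor(&scale_bias_desc);
  // Row-major [batch_size, num_feature] viewed as NCHW with N=batch_size, C=1, H=1, W=num_feature.
  cudnnSetTensor4dDescriptorEx(io_desc, CUDNN_DATA_FLOAT, batch_size, /*c=*/1, /*h=*/1,
                               /*w=*/num_feature,
                               /*nStride=*/num_feature, /*cStride=*/1, /*hStride=*/1, /*wStride=*/1);
  // Per-activation mode derives a [1, 1, 1, num_feature] descriptor for gamma/beta/mean/var.
  cudnnDeriveBNTensorDescriptor(scale_bias_desc, io_desc, CUDNN_BATCHNORM_PER_ACTIVATION);
  cudnnDestroyTensorDescriptor(scale_bias_desc);
  cudnnDestroyTensorDescriptor(io_desc);
}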
HCTR_LIB_THROW(cudnnBatchNormalizationBackward( - this->get_gpu().get_cudnn_handle(), mode_, &one, &zero, &one, &zero, in_out_desc_, in, - in_out_desc_, out, in_out_desc_, in, gamma_beta_desc_, gamma, gamma_grad, beta_grad, - params_.eps, result_save_mean, result_save_inv_var)); -} - -template -std::string BatchNormLayer::get_no_trained_params_in_string() { - float* d_result_running_mean = result_running_mean_.get_ptr(); - float* d_result_running_var = result_running_var_.get_ptr(); - size_t n_byte = result_running_mean_.get_size_in_bytes(); - size_t n_elem = n_byte / sizeof(T); - - HCTR_LIB_THROW(cudaMemcpy(h_result_running_mean_.get_ptr(), d_result_running_mean, n_byte, - cudaMemcpyDeviceToHost)); - HCTR_LIB_THROW(cudaMemcpy(h_result_running_var_.get_ptr(), d_result_running_var, n_byte, - cudaMemcpyDeviceToHost)); - - std::string result = " \"type\": \"BatchNorm\",\n"; - result += " \"mean\": ["; - for (size_t i = 0; i < n_elem; i++) { - result += std::to_string(ToStringType(h_result_running_mean_.get_ptr()[i])); - if (i != (n_elem - 1)) result += ", "; - } - result += "],\n"; - - result += " \"var\": ["; - for (size_t i = 0; i < n_elem; i++) { - result += std::to_string(ToStringType(h_result_running_var_.get_ptr()[i])); - if (i != (n_elem - 1)) result += ", "; - } - result += "]"; - - return result; -} - -template -std::vector BatchNormLayer::get_tensors_for_non_trainable_params() { - std::vector tensors; - tensors.push_back(result_running_mean_.shrink()); - tensors.push_back(result_running_var_.shrink()); - return tensors; -} - -template -std::unique_ptr BatchNormLayer::get_default_initializer(const int index) { - std::unique_ptr simu; - if (0 == index) { - simu.reset(new ConstantDataSimulator(1.0f)); - } else if (1 == index) { - simu.reset(new ConstantDataSimulator(0.0f)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - return simu; -} - -template class BatchNormLayer; -template class BatchNormLayer<__half>; - -template -Core23TempBatchNormLayer::Core23TempBatchNormLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, const Params& params, - const std::shared_ptr& gpu_resource, std::vector initializer_types) : Base({in_tensor}, {out_tensor}, gpu_resource, initializer_types), params_(params), mode_(CUDNN_BATCHNORM_PER_ACTIVATION) { @@ -303,7 +107,7 @@ Core23TempBatchNormLayer::Core23TempBatchNormLayer( } template -Core23TempBatchNormLayer::~Core23TempBatchNormLayer() { +BatchNormLayer::~BatchNormLayer() { try { HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(in_out_desc_)); HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(gamma_beta_desc_)); @@ -313,7 +117,7 @@ Core23TempBatchNormLayer::~Core23TempBatchNormLayer() { } template -void Core23TempBatchNormLayer::initialize() { +void BatchNormLayer::initialize() { // host array to get running mean & var int64_t num_feature = this->input_tensors_[0].shape().size(1); @@ -330,7 +134,7 @@ void Core23TempBatchNormLayer::initialize() { } template -void Core23TempBatchNormLayer::fprop(bool is_train) { +void BatchNormLayer::fprop(bool is_train) { CudaDeviceContext context(this->get_device_id()); float one = 1.0f, zero = 0.0f; @@ -360,7 +164,7 @@ void Core23TempBatchNormLayer::fprop(bool is_train) { } template -void Core23TempBatchNormLayer::bprop() { +void BatchNormLayer::bprop() { CudaDeviceContext context(this->get_device_id()); float one = 1.0f, zero = 0.0f; @@ -385,7 +189,7 @@ void Core23TempBatchNormLayer::bprop() { } template -std::string Core23TempBatchNormLayer::get_no_trained_params_in_string() { 
+std::string BatchNormLayer::get_no_trained_params_in_string() { float* d_result_running_mean = result_running_mean_.data(); float* d_result_running_var = result_running_var_.data(); int64_t n_byte = result_running_mean_.num_bytes(); @@ -416,13 +220,12 @@ std::string Core23TempBatchNormLayer::get_no_trained_params_in_string() { } template -std::vector Core23TempBatchNormLayer::get_non_trainable_params_as_tensors() { +std::vector BatchNormLayer::get_non_trainable_params_as_tensors() { return {result_running_mean_, result_running_var_}; } template -std::unique_ptr Core23TempBatchNormLayer::get_default_initializer( - const int index) { +std::unique_ptr BatchNormLayer::get_default_initializer(const int index) { std::unique_ptr simu; if (0 == index) { simu.reset(new ConstantDataSimulator(1.0f)); @@ -434,6 +237,6 @@ std::unique_ptr Core23TempBatchNormLayer::get_default_initiali return simu; } -template class Core23TempBatchNormLayer; -template class Core23TempBatchNormLayer<__half>; +template class BatchNormLayer; +template class BatchNormLayer<__half>; } // namespace HugeCTR diff --git a/HugeCTR/src/layers/fully_connected_layer.cu b/HugeCTR/src/layers/fully_connected_layer.cu index 62847d3779..a21ce3fe84 100644 --- a/HugeCTR/src/layers/fully_connected_layer.cu +++ b/HugeCTR/src/layers/fully_connected_layer.cu @@ -53,353 +53,12 @@ void add_bias(float* data, const float* bias, const int m, const int n, bool row } // namespace -FullyConnectedLayer::FullyConnectedLayer( - const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, const Tensor2& in_tensor, - const Tensor2& out_tensor, const std::shared_ptr& gpu_resource, - bool use_mixed_precision, bool enable_tf32_compute, - std::vector initializer_types) - : TrainableLayer(weight_buff, weight_buff, wgrad_buff, gpu_resource, initializer_types), - use_mixed_precision_(use_mixed_precision), - enable_tf32_compute_(enable_tf32_compute) { - try { - // check the in_tensor and out_tensor - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - // 1. input and output have the same dim - if (in_tensor_dim.size() != out_tensor_dim.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "input and output tensor don't have same dimensions"); - } - // 2. dim match? - size_t in_batch_size = 1; - size_t out_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - out_batch_size = out_batch_size * out_tensor_dim[idx]; - } - - if (in_batch_size != out_batch_size) { - HCTR_OWN_THROW(Error_t::WrongInput, "size of input / output tensor doesn't match"); - } - - std::vector weight_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - - this->set_weight(0, weight_dim); - this->set_weight(1, bias_dim); - this->set_wgrad(0, weight_dim); - this->set_wgrad(1, bias_dim); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - // Where should we create this cuBLAS handle? 
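[Annotation, not part of the patch] The constructor above flattens every leading dimension into one batch size (an input of, say, [batch, slots, input_size] is treated as [batch*slots, input_size]), and the fprop that follows realizes out = in * W with a single cublasGemmEx: since a row-major [rows, cols] buffer is the same memory as a column-major [cols, rows] matrix, the call asks cuBLAS for W^T(output_size x input_size) times in^T(input_size x batch) with m = output_size, n = batch, k = input_size. A self-contained sketch of that row-major trick in plain cuBLAS (sizes, buffer names, and the default algorithm are illustrative; the bias term, which the patch adds via a separate add_bias kernel, is omitted; error checks omitted):

#include <cublas_v2.h>
#include <cuda_runtime.h>

// Computes row-major out[batch x n] = in[batch x k] * w[k x n] in FP32.
void dense_forward(cublasHandle_t handle, const float* d_in, const float* d_w, float* d_out,
                   int batch, int k, int n) {
  const float alpha = 1.0f, beta = 0.0f;
  // Column-major view: out^T(n x batch) = w^T(n x k) * in^T(k x batch).
  cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
               /*m=*/n, /*n=*/batch, /*k=*/k,
               &alpha, d_w, CUDA_R_32F, /*lda=*/n,
               d_in, CUDA_R_32F, /*ldb=*/k,
               &beta, d_out, CUDA_R_32F, /*ldc=*/n,
               CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
}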
- } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -void FullyConnectedLayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - Tensor2& in_tensor = get_in_tensors(is_train)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - float* weight = this->get_weight(0).get_ptr(); - float* bias = this->get_weight(1).get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - } - - float alpha = 1.0f, beta = 0.0f; - - const cublasComputeType_t compute_type = - enable_tf32_compute_ ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, weight, CUDA_R_32F, output_size, - in, CUDA_R_32F, input_size, &beta, out, CUDA_R_32F, output_size, - compute_type, falgo_)); - add_bias(out, bias, in_batch_size, output_size, true, get_gpu().get_stream()); -} - -void FullyConnectedLayer::bprop() { - CudaDeviceContext context(get_device_id()); - - Tensor2& in_tensor = get_in_tensors(true)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - float* wgrad = this->get_wgrad(0).get_ptr(); - float* bias_grad = this->get_wgrad(1).get_ptr(); - float* weight = this->get_weight(0).get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - } - - float alpha = 1.0f, beta_w = 1.0f, beta_x = 0.0f; - - const cublasComputeType_t compute_type = - enable_tf32_compute_ ? 
CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - - // gradient respect to W - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, out, CUDA_R_32F, output_size, in, - CUDA_R_32F, input_size, &beta_w, wgrad, CUDA_R_32F, output_size, - compute_type, balgo_W_)); - // gradient respect to Xn - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, weight, CUDA_R_32F, output_size, - out, CUDA_R_32F, output_size, &beta_x, in, CUDA_R_32F, input_size, - compute_type, balgo_Xn_)); - MLCommon::LinAlg::reduce(bias_grad, out, in_batch_size, output_size, float(0), false, true, - get_gpu().get_stream(), true); -} - -void FullyConnectedLayer::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - - const int repeat_num = 100; - - // Device Tensors to be used - Tensor2& in_tensor = get_in_tensors(true)[0]; - Tensor2& out_tensor = out_tensors_[0]; - float* weight = this->get_weight(0).get_ptr(); - float* in = in_tensor.get_ptr(); - float* out = out_tensor.get_ptr(); - float* wgrad = this->get_wgrad(0).get_ptr(); - - // Tensor dim - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - size_t in_batch_size = 1; - size_t out_batch_size = 1; - size_t input_size = in_tensor_dim[in_tensor_dim.size() - 1]; - size_t output_size = out_tensor_dim[out_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < in_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * in_tensor_dim[idx]; - out_batch_size = out_batch_size * out_tensor_dim[idx]; - } - - // Record time for each algorithm - float shortestTime = 100000000.0; - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - // cublas ret status - cublasStatus_t status; - - // Start, end for search - int startAlgo, endAlgo; - if (use_mixed_precision_) { - startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - } else { - startAlgo = CUBLAS_GEMM_DEFAULT; - endAlgo = CUBLAS_GEMM_ALGO23; - } - - const cublasComputeType_t compute_type = - enable_tf32_compute_ ? 
CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; - - // Search all the algorithm for fprop - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - float alpha = 1.0f, beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (int i = 0; i < repeat_num; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, weight, CUDA_R_32F, output_size, in, - CUDA_R_32F, input_size, &beta, out, CUDA_R_32F, output_size, - compute_type, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = 100000000.0; - - // Search all the algorithm for bprop_W - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - float alpha = 1.0f, beta_w = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (int i = 0; i < repeat_num; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, out, CUDA_R_32F, output_size, in, - CUDA_R_32F, input_size, &beta_w, wgrad, CUDA_R_32F, output_size, - compute_type, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_W_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = 100000000.0; - - // Search all the algorithm for bprop_Xn - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - float alpha = 1.0f, beta_x = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (int i = 0; i < repeat_num; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, weight, CUDA_R_32F, output_size, - out, CUDA_R_32F, output_size, &beta_x, in, CUDA_R_32F, input_size, - compute_type, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record 
the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_Xn_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, "The algorithm selection for fprop, bprop_W and bprop_Xn are: %d, %d and - // %d.\n", - // (int)falgo_, (int)balgo_W_, (int)balgo_Xn_); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} - -std::unique_ptr FullyConnectedLayer::get_uniform_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FullyConnectedLayer::get_xavier_uniform_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer::get_xavier_norm_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer::get_default_initializer( - const int index) { - const Tensor2& in_tensor = get_in_tensors(true)[0]; - const Tensor2& out_tensor = out_tensors_[0]; - float bottom_dim = in_tensor.get_dimensions()[in_tensor.get_dimensions().size() - 1]; - float top_dim = out_tensor.get_dimensions()[out_tensor.get_dimensions().size() - 1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -template class FullyConnectedLayer; - -Core23TempFullyConnectedLayer::Core23TempFullyConnectedLayer( - const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, - const std::shared_ptr& gpu_resource, bool use_mixed_precision, - bool enable_tf32_compute, std::vector initializer_types) - : Core23TempTrainableLayer({in_tensor}, {out_tensor}, gpu_resource, initializer_types), +FullyConnectedLayer::FullyConnectedLayer(const core23::Tensor& in_tensor, + const core23::Tensor& out_tensor, + const std::shared_ptr& gpu_resource, + bool use_mixed_precision, bool enable_tf32_compute, + std::vector initializer_types) + : TrainableLayer({in_tensor}, {out_tensor}, gpu_resource, initializer_types), use_mixed_precision_(use_mixed_precision), enable_tf32_compute_(enable_tf32_compute) { try { @@ -439,7 +98,7 @@ Core23TempFullyConnectedLayer::Core23TempFullyConnectedLayer( } } -void Core23TempFullyConnectedLayer::fprop(bool is_train) { +void FullyConnectedLayer::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); core23::Tensor& in_tensor = get_in_tensors(is_train)[0]; @@ -473,7 +132,7 @@ void Core23TempFullyConnectedLayer::fprop(bool is_train) { add_bias(out, bias, in_batch_size, output_size, true, get_gpu().get_stream()); } -void Core23TempFullyConnectedLayer::bprop() { +void FullyConnectedLayer::bprop() { CudaDeviceContext context(get_device_id()); core23::Tensor& in_tensor = get_in_tensors(true)[0]; @@ -515,7 +174,7 @@ void Core23TempFullyConnectedLayer::bprop() { get_gpu().get_stream(), true); } -void Core23TempFullyConnectedLayer::search_algorithm() { +void FullyConnectedLayer::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); @@ -674,7 +333,7 @@ void Core23TempFullyConnectedLayer::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } -std::unique_ptr Core23TempFullyConnectedLayer::get_uniform_initializer( +std::unique_ptr FullyConnectedLayer::get_uniform_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -685,7 +344,7 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_uniform return std::make_unique(-1 * limit, limit); } -std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_uniform_initializer( +std::unique_ptr FullyConnectedLayer::get_xavier_uniform_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -697,7 +356,7 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_ 0 == index ? 
bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_norm_initializer( +std::unique_ptr FullyConnectedLayer::get_xavier_norm_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -709,7 +368,7 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_xavier_ 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer::get_default_initializer( +std::unique_ptr FullyConnectedLayer::get_default_initializer( const int index) { const core23::Tensor& in_tensor = get_in_tensors(true)[0]; const core23::Tensor& out_tensor = this->output_tensors_[0]; @@ -730,6 +389,6 @@ std::unique_ptr Core23TempFullyConnectedLayer::get_default return simu; } -template class Core23TempFullyConnectedLayer; +template class FullyConnectedLayer; } // namespace HugeCTR diff --git a/HugeCTR/src/layers/fully_connected_layer_half.cu b/HugeCTR/src/layers/fully_connected_layer_half.cu index 462fd319a3..28bfd821a2 100644 --- a/HugeCTR/src/layers/fully_connected_layer_half.cu +++ b/HugeCTR/src/layers/fully_connected_layer_half.cu @@ -20,437 +20,11 @@ namespace HugeCTR { -FullyConnectedLayer<__half>::FullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : TrainableLayer<__half>(master_weights_buff, weights_buff, weights_grad_buff, gpu_resource, - initializer_types), - falgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) { - const auto& bottom_tensor_dim = bottom_tensor.get_dimensions(); - const auto& top_tensor_dim = top_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != top_tensor_dim.size()) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor don't have same dimensions"); - } - size_t in_batch_size = 1; - size_t out_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - out_batch_size = out_batch_size * top_tensor_dim[idx]; - } - - if (in_batch_size != out_batch_size) { - HCTR_OWN_THROW(Error_t::WrongInput, "size of input / output tensor doesn't match"); - } - - std::vector kernel_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - std::vector identity_dim = {1, in_batch_size}; - - this->set_weight(0, kernel_dim); - this->set_weight(1, bias_dim); - this->set_wgrad(0, kernel_dim); - this->set_wgrad(1, bias_dim); - - blobs_buff->reserve(identity_dim, &identity_tensor_); - - bottom_tensor_ = bottom_tensor; - top_tensor_ = top_tensor; -} - -void FullyConnectedLayer<__half>::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* bias = this->get_weight(1).get_ptr(); - const __half* bottom = get_bottom_tensor(is_train).get_ptr(); - const __half* identity = identity_tensor_.get_ptr(); - __half* top = top_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = 
get_bottom_tensor(is_train).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - } - - const float alpha = 1.0f; - const float beta_b = 0.0f; - const float beta_k = 1.0f; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, 1, &alpha, bias, CUDA_R_16F, output_size, identity, - CUDA_R_16F, 1, &beta_b, top, CUDA_R_16F, output_size, CUDA_R_32F, - falgo_b_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta_k, top, CUDA_R_16F, output_size, - CUDA_R_32F, falgo_k_)); -} - -void FullyConnectedLayer<__half>::bprop() { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* top = top_tensor_.get_ptr(); - const __half* identity = identity_tensor_.get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - __half* bottom = get_bottom_tensor(true).get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - } - - const float alpha = 1.0f; - const float beta_b = 0.0f; - const float beta_k = 1.0f; - const float beta_x = 0.0f; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - 1, in_batch_size, &alpha, top, CUDA_R_16F, output_size, identity, - CUDA_R_16F, in_batch_size, &beta_b, bias_grad, CUDA_R_16F, - output_size, CUDA_R_32F, balgo_b_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, top, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta_k, kernel_grad, CUDA_R_16F, - output_size, CUDA_R_32F, balgo_k_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, - top, CUDA_R_16F, output_size, &beta_x, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, balgo_x_)); -} - -void FullyConnectedLayer<__half>::initialize() { - CudaDeviceContext context(get_device_id()); - - __half* identity = identity_tensor_.get_ptr(); - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - size_t m = 1; - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - m = m * bottom_tensor_dim[idx]; - } - // Initialize identity vector - initialize_array<<<(m - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>(identity, m, - __float2half(1.0f)); -} - -void FullyConnectedLayer<__half>::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - - const size_t repeat_num = 100; - - // Device Tensors to be used - __half* bottom = 
get_bottom_tensor(true).get_ptr(); - __half* top = top_tensor_.get_ptr(); - __half* identity = identity_tensor_.get_ptr(); - __half* kernel = this->get_weight(0).get_ptr(); - __half* bias = this->get_weight(1).get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - - // Tensor dim - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t in_batch_size = 1; - size_t input_size = bottom_tensor_dim[bottom_tensor_dim.size() - 1]; - size_t output_size = top_tensor_dim[top_tensor_dim.size() - 1]; - - for (size_t idx = 0; idx < bottom_tensor_dim.size() - 1; idx++) { - in_batch_size = in_batch_size * bottom_tensor_dim[idx]; - } - - // Record time for each algorithm - float shortestTime = std::numeric_limits::max(); - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - // Start, end for search - const cublasGemmAlgo_t startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - const cublasGemmAlgo_t endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - - // Search all the algorithm for falgo_b_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, 1, &alpha, bias, CUDA_R_16F, output_size, identity, - CUDA_R_16F, 1, &beta, top, CUDA_R_16F, output_size, CUDA_R_32F, - static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop_b, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_b_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for falgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - in_batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta, top, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The 
algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_b_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, 1, - in_batch_size, &alpha, top, CUDA_R_16F, output_size, identity, - CUDA_R_16F, in_batch_size, &beta, bias_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_b_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, in_batch_size, &alpha, top, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, kernel_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_x_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - in_batch_size, output_size, &alpha, kernel, CUDA_R_16F, 
output_size, - top, CUDA_R_16F, output_size, &beta, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, static_cast(testAlgo)); - } - - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_x_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, - // "The algorithm selection for falgo_b_, falgo_k_, balgo_b_, balgo_k_, balgo_x_ are: %d, %d, - // " - // "%d, %d and %d.\n", - // (int)falgo_b_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)falgo_k_ - - // CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)balgo_b_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)balgo_k_ - // - CUBLAS_GEMM_DEFAULT_TENSOR_OP, (int)balgo_x_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} // namespace HugeCTR - -std::unique_ptr FullyConnectedLayer<__half>::get_uniform_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FullyConnectedLayer<__half>::get_xavier_uniform_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer<__half>::get_xavier_norm_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FullyConnectedLayer<__half>::get_default_initializer( - const int index) { - size_t bottom_dim = - get_bottom_tensor(true).get_dimensions()[get_bottom_tensor(true).get_dimensions().size() - 1]; - size_t top_dim = top_tensor_.get_dimensions()[top_tensor_.get_dimensions().size() - 1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -template class FullyConnectedLayer<__half>; - -Core23TempFullyConnectedLayer<__half>::Core23TempFullyConnectedLayer( - const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Core23TempTrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, - initializer_types), +FullyConnectedLayer<__half>::FullyConnectedLayer(const core23::Tensor& bottom_tensor, + const core23::Tensor& top_tensor, + const std::shared_ptr& gpu_resource, + std::vector initializer_types) + : TrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, initializer_types), falgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), @@ -496,7 +70,7 @@ Core23TempFullyConnectedLayer<__half>::Core23TempFullyConnectedLayer( .buffer_params(blobs_buffer_params)); } -void Core23TempFullyConnectedLayer<__half>::fprop(bool is_train) { +void FullyConnectedLayer<__half>::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -532,7 +106,7 @@ void Core23TempFullyConnectedLayer<__half>::fprop(bool is_train) { CUDA_R_32F, falgo_k_)); } -void Core23TempFullyConnectedLayer<__half>::bprop() { +void FullyConnectedLayer<__half>::bprop() { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -575,7 +149,7 @@ void Core23TempFullyConnectedLayer<__half>::bprop() { CUDA_R_32F, balgo_x_)); } -void Core23TempFullyConnectedLayer<__half>::initialize() { +void FullyConnectedLayer<__half>::initialize() { CudaDeviceContext context(get_device_id()); __half* identity = identity_tensor_.data<__half>(); @@ -589,7 +163,7 @@ void Core23TempFullyConnectedLayer<__half>::initialize() { __float2half(1.0f)); } -void Core23TempFullyConnectedLayer<__half>::search_algorithm() { +void FullyConnectedLayer<__half>::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); @@ -821,7 +395,7 @@ void Core23TempFullyConnectedLayer<__half>::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } // namespace HugeCTR -std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_uniform_initializer( +std::unique_ptr FullyConnectedLayer<__half>::get_uniform_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); @@ -832,8 +406,8 @@ std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_unifor return std::make_unique(-1 * limit, limit); } -std::unique_ptr -Core23TempFullyConnectedLayer<__half>::get_xavier_uniform_initializer(const int index) { +std::unique_ptr 
FullyConnectedLayer<__half>::get_xavier_uniform_initializer( + const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); auto top_tensor = this->output_tensors_[0]; @@ -844,7 +418,7 @@ Core23TempFullyConnectedLayer<__half>::get_xavier_uniform_initializer(const int 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_xavier_norm_initializer( +std::unique_ptr FullyConnectedLayer<__half>::get_xavier_norm_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); @@ -856,7 +430,7 @@ std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_xavier 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_default_initializer( +std::unique_ptr FullyConnectedLayer<__half>::get_default_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(get_bottom_tensor(true).shape().dims() - 1); @@ -877,6 +451,6 @@ std::unique_ptr Core23TempFullyConnectedLayer<__half>::get_defaul return simu; } -template class Core23TempFullyConnectedLayer<__half>; +template class FullyConnectedLayer<__half>; } // namespace HugeCTR diff --git a/HugeCTR/src/layers/fused_fully_connected_layer.cu b/HugeCTR/src/layers/fused_fully_connected_layer.cu index 5a36750cb2..a841a2fb20 100644 --- a/HugeCTR/src/layers/fused_fully_connected_layer.cu +++ b/HugeCTR/src/layers/fused_fully_connected_layer.cu @@ -90,329 +90,11 @@ __global__ void reverse_add_bias_and_re_kernel(float* bias, __half* middle, cons } // namespace -FusedFullyConnectedLayer::FusedFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& bottom_tensor, const Tensor2<__half>& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : TrainableLayer<__half>(master_weights_buff, weights_buff, weights_grad_buff, gpu_resource, - initializer_types), - falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) { - const auto& bottom_tensor_dim = bottom_tensor.get_dimensions(); - const auto& top_tensor_dim = top_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != 2 || top_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - if (batch_size % 32 != 0 || output_size % 64 != 0) { - HCTR_OWN_THROW( - Error_t::WrongInput, - "The first dimension of bottom tensor must be a multiple of 32, the second dimension " - "of top tensor must be a multiple of 64."); - } - - std::vector kernel_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - - this->set_weight(0, kernel_dim); - this->set_weight(1, bias_dim); - this->set_wgrad(0, kernel_dim); - this->set_wgrad(1, bias_dim); - - bottom_tensor_ = bottom_tensor; - top_tensor_ = top_tensor; - blobs_buff->reserve(top_tensor_.get_dimensions(), &middle_tensor_); - blobs_buff->reserve(bias_dim, &bias_grad_tensor_); -} - -void FusedFullyConnectedLayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* bias = 
this->get_weight(1).get_ptr(); - const __half* bottom = get_bottom_tensor(is_train).get_ptr(); - __half* middle = middle_tensor_.get_ptr(); - __half* top = top_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(is_train).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta = 0.0f; - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta, middle, CUDA_R_16F, - output_size, CUDA_R_32F, falgo_k_)); - - const size_t max_threads = 1024; - const size_t blocks = batch_size; - const size_t threads = min(output_size / 2, max_threads); - - add_bias_and_re_kernel<<>>( - top, middle, bias, output_size / 2, output_size / 2); -} - -void FusedFullyConnectedLayer::bprop() { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = this->get_weight(0).get_ptr(); - const __half* top = top_tensor_.get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - __half* bottom = get_bottom_tensor(true).get_ptr(); - __half* middle = middle_tensor_.get_ptr(); - float* bias_grad_float = bias_grad_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - int batch_size = bottom_tensor_dim[0]; - int output_size = top_tensor_dim[1]; - int input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta_k = 1.0f; - const float beta_x = 0.0f; - - initialize_array<<<(output_size - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>( - bias_grad_float, output_size, 0.0f); - - dim3 blocks(output_size / 64, batch_size / 32); - reverse_add_bias_and_re_kernel<32> - <<>>(bias_grad_float, middle, top, output_size / 2); - - convert_array<<<(output_size - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>( - bias_grad, bias_grad_float, output_size); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, batch_size, &alpha, middle, CUDA_R_16F, output_size, - bottom, CUDA_R_16F, input_size, &beta_k, kernel_grad, CUDA_R_16F, - output_size, CUDA_R_32F, balgo_k_)); - - HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, - middle, CUDA_R_16F, output_size, &beta_x, bottom, CUDA_R_16F, - input_size, CUDA_R_32F, balgo_x_)); -} - -void FusedFullyConnectedLayer::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - - const size_t repeat_num = 100; - - // Device Tensors to be used - __half* bottom = get_bottom_tensor(true).get_ptr(); - __half* top = top_tensor_.get_ptr(); - __half* kernel = this->get_weight(0).get_ptr(); - __half* bias = this->get_weight(1).get_ptr(); - __half* kernel_grad = this->get_wgrad(0).get_ptr(); - __half* bias_grad = this->get_wgrad(1).get_ptr(); - - // Tensor dim - const auto& bottom_tensor_dim = get_bottom_tensor(true).get_dimensions(); - const auto& top_tensor_dim = top_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = 
bottom_tensor_dim[1]; - - // Record time for each algorithm - float shortestTime = std::numeric_limits::max(); - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - // Start, end for search - const cublasGemmAlgo_t startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - const cublasGemmAlgo_t endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - - // Search all the algorithm for falgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, - batch_size, input_size, &alpha, kernel, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, top, CUDA_R_16F, output_size, CUDA_R_32F, - static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - falgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, batch_size, &alpha, top, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, kernel_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_x_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), 
CUBLAS_OP_T, CUBLAS_OP_N, input_size, - batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, top, - CUDA_R_16F, output_size, &beta, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, static_cast(testAlgo)); - } - - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_x_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, "The algorithm selection for falgo_k_, balgo_k_, balgo_x_ are: %d, %d and - // %d.\n", - // (int)falgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_x_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} // namespace HugeCTR - -std::unique_ptr FusedFullyConnectedLayer::get_uniform_initializer(const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FusedFullyConnectedLayer::get_xavier_uniform_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedFullyConnectedLayer::get_xavier_norm_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedFullyConnectedLayer::get_default_initializer(const int index) { - size_t bottom_dim = get_bottom_tensor(true).get_dimensions()[1]; - size_t top_dim = top_tensor_.get_dimensions()[1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -Core23TempFusedFullyConnectedLayer::Core23TempFusedFullyConnectedLayer( - const core23::Tensor& bottom_tensor, const core23::Tensor& top_tensor, - const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Core23TempTrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, - initializer_types), +FusedFullyConnectedLayer::FusedFullyConnectedLayer(const core23::Tensor& bottom_tensor, + const core23::Tensor& top_tensor, + const std::shared_ptr& gpu_resource, + std::vector initializer_types) + : TrainableLayer<__half>({bottom_tensor}, {top_tensor}, gpu_resource, initializer_types), falgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) { @@ -459,7 +141,7 @@ Core23TempFusedFullyConnectedLayer::Core23TempFusedFullyConnectedLayer( .buffer_params(blobs_buffer_params)); } -void Core23TempFusedFullyConnectedLayer::fprop(bool is_train) { +void FusedFullyConnectedLayer::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -491,7 +173,7 @@ void Core23TempFusedFullyConnectedLayer::fprop(bool is_train) { top, middle, bias, output_size / 2, output_size / 2); } -void Core23TempFusedFullyConnectedLayer::bprop() { +void FusedFullyConnectedLayer::bprop() { CudaDeviceContext context(get_device_id()); const __half* kernel = this->get_weight(0).data<__half>(); @@ -534,7 +216,7 @@ void Core23TempFusedFullyConnectedLayer::bprop() { input_size, CUDA_R_32F, balgo_x_)); } -void Core23TempFusedFullyConnectedLayer::search_algorithm() { +void FusedFullyConnectedLayer::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); @@ -687,8 +369,7 @@ void Core23TempFusedFullyConnectedLayer::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } // namespace HugeCTR -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_uniform_initializer( - const int index) { +std::unique_ptr FusedFullyConnectedLayer::get_uniform_initializer(const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -696,7 +377,7 @@ std::unique_ptr Core23TempFusedFullyConnectedLayer::get_uniform_i return std::make_unique(-1 * limit, limit); } -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_uniform_initializer( +std::unique_ptr FusedFullyConnectedLayer::get_xavier_uniform_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -706,7 +387,7 @@ std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_un 0 == index ? 
bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_norm_initializer( +std::unique_ptr FusedFullyConnectedLayer::get_xavier_norm_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -716,8 +397,7 @@ std::unique_ptr Core23TempFusedFullyConnectedLayer::get_xavier_no 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFusedFullyConnectedLayer::get_default_initializer( - const int index) { +std::unique_ptr FusedFullyConnectedLayer::get_default_initializer(const int index) { int64_t bottom_dim = get_bottom_tensor(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); diff --git a/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu b/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu index 451217a259..c92c5e0cf0 100644 --- a/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu +++ b/HugeCTR/src/layers/fused_relu_bias_fully_connected_layer.cu @@ -52,774 +52,6 @@ __global__ void reverse_relu_kernel_not_aligned(__half* dRelu, __half* mask, con } // namespace FusedReluBiasFullyConnectedLayer::FusedReluBiasFullyConnectedLayer( - const std::shared_ptr>& master_weights_buff, - const std::shared_ptr>& weights_buff, - const std::shared_ptr>& weights_grad_buff, - const std::shared_ptr>& blobs_buff, - const Tensor2<__half>& train_in_tensor, const Tensor2<__half>& mask_in_tensor, - const Tensor2<__half>& dRelu_in_tensor, const Tensor2<__half>& db_in_tensor, - const Tensor2<__half>& train_out_tensor, const Tensor2<__half>& mask_out_tensor, - const Tensor2<__half>& dRelu_out_tensor, Tensor2<__half>& db_out_tensor, - const std::shared_ptr& gpu_resource, const FcPosition_t& pos, - const Activation_t& act, const bool& skip_dgrad, std::vector initializer_types, - const bool async_mlp_wgrad, const bool head_mask_in, const bool fuse_wb) - : TrainableLayer<__half>(master_weights_buff, weights_buff, weights_grad_buff, gpu_resource, - initializer_types), - balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), - pos_(pos), - act_(act), - skip_dgrad_(skip_dgrad), - async_mlp_wgrad_(async_mlp_wgrad), - head_mask_in_(head_mask_in), - fuse_wb_(fuse_wb), - event_overlap_created_(false) { - const auto& bottom_tensor_dim = train_in_tensor.get_dimensions(); - const auto& top_tensor_dim = train_out_tensor.get_dimensions(); - - if (bottom_tensor_dim.size() != 2 || top_tensor_dim.size() != 2) { - HCTR_OWN_THROW(Error_t::WrongInput, "input or output tensor doesn't has two dimensions"); - } - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - std::vector kernel_dim = {input_size, output_size}; - std::vector bias_dim = {1, output_size}; - std::vector identity_dim = {1, batch_size}; - - this->set_weight(0, kernel_dim); - weights_half_.push_back(this->get_weight(0)); - this->set_weight(1, bias_dim); - weights_half_.push_back(this->get_weight(1)); - this->set_wgrad(0, kernel_dim); - weights_grad_.push_back(this->get_wgrad(0)); - this->set_wgrad(1, bias_dim); - db_out_tensor = this->get_wgrad(1); - weights_grad_.push_back(this->get_wgrad(1)); - - blobs_buff->reserve(identity_dim, &identity_tensor_); - - train_in_tensor_ = train_in_tensor; - // if (pos_ == FcPosition_t::Head || pos_ == FcPosition_t::Isolated) { - // // mask_in_tensor_ = train_in_tensor; - // } else { - 
mask_in_tensor_ = mask_in_tensor; - dRelu_in_tensor_ = dRelu_in_tensor; - db_in_tensor_ = db_in_tensor; - // } - train_out_tensor_ = train_out_tensor; - mask_out_tensor_ = mask_out_tensor; - dRelu_out_tensor_ = dRelu_out_tensor; - db_out_tensor_ = db_out_tensor; - blobs_buff->reserve(kernel_dim, &bias_grad_tensor_); - - std::vector mask_dim = {batch_size, output_size}; - blobs_buff->reserve(mask_dim, &mask_in_tensor_temp_); - - if (async_mlp_wgrad_) - cublas_handle_wgrad_ = gpu_resource->get_cublas_handle_wgrad(); - else - cublas_handle_wgrad_ = gpu_resource->get_cublas_handle(); -} - -void FusedReluBiasFullyConnectedLayer::initialize() { - CudaDeviceContext context(get_device_id()); - HCTR_LIB_THROW(cudaEventCreate(&event_overlap_)); - event_overlap_created_ = true; - - // TODO: We need different bottom desc based on is_train or not - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - __half* identity = identity_tensor_.get_ptr(); - - int batch_size = bottom_tensor_dim[0]; - int output_size = top_tensor_dim[1]; - int input_size = bottom_tensor_dim[1]; - - initialize_array<<<(batch_size - 1) / 1024 + 1, 1024, 0, get_gpu().get_stream()>>>( - identity, batch_size, __float2half(1.0f)); - - HCTR_LIB_THROW(cublasLtMatmulDescCreate(&cublas_op_desc_, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t trans = CUBLAS_OP_N; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_TRANSA, - &trans, sizeof(trans))); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_TRANSB, - &trans, sizeof(trans))); - cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_RELU_AUX_BIAS; - if (act_ == Activation_t::None) epi = CUBLASLT_EPILOGUE_BIAS; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_EPILOGUE, - &epi, sizeof(epi))); - const __half* bias = weights_half_[1].get_ptr(); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &bias, sizeof(bias))); - if (act_ != Activation_t::None) { - __half* reluMask = mask_out_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &reluMask, sizeof(reluMask)); - long reluMaskLd = output_size; - cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &reluMaskLd, sizeof(reluMaskLd)); - } - - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_kernel_desc_, CUDA_R_16F, output_size, - input_size, output_size)); - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_bottom_desc_, CUDA_R_16F, input_size, - batch_size, input_size)); - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_top_desc_, CUDA_R_16F, output_size, batch_size, - output_size)); - - HCTR_LIB_THROW(cublasLtMatmulPreferenceCreate(&cublas_preference_)); - - cublaslt_workspace_size_ = 1024 * 1024 * 8; // Set it to 8MB for now - HCTR_LIB_THROW(cudaMalloc(&cublaslt_workspace_, cublaslt_workspace_size_)); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size_, - sizeof(cublaslt_workspace_size_))); - - uint32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_, CUBLASLT_MATMUL_DESC_POINTER_MODE, - &pointer_mode, sizeof(pointer_mode))); - -#if CUBLAS_VERSION < 120000 - pointer_mode = CUBLASLT_POINTER_MODE_MASK_HOST; - 
HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute(cublas_preference_, - CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK, - &pointer_mode, sizeof(pointer_mode))); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &epi, sizeof(epi))); -#endif - - // By default set algo to best estimated heurstic - cublasLtMatmulHeuristicResult_t heuristic_result; - int returned_res = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_, cublas_kernel_desc_, cublas_bottom_desc_, - cublas_top_desc_, cublas_top_desc_, cublas_preference_, 1, &heuristic_result, &returned_res)); - - memcpy(&falgo_k_, &heuristic_result.algo, sizeof(falgo_k_)); - - if (returned_res == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - initialize_dgrad(); - initialize_wgrad(); -} - -void FusedReluBiasFullyConnectedLayer::initialize_dgrad() { - // TODO: We need different bottom desc based on is_train or not - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - HCTR_LIB_THROW(cublasLtMatmulDescCreate(&cublas_op_desc_bprop_, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transA = CUBLAS_OP_T; - cublasOperation_t transB = CUBLAS_OP_N; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_TRANSA, - &transA, sizeof(transA))); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_TRANSB, - &transB, sizeof(transB))); - cublasLtEpilogue_t epi; - - if (pos_ == FcPosition_t::Head || pos_ == FcPosition_t::Isolated) { - epi = CUBLASLT_EPILOGUE_DEFAULT; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute( - cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(epi))); - } else if (pos_ == FcPosition_t::Body || pos_ == FcPosition_t::Tail) { - epi = fuse_wb_ ? 
CUBLASLT_EPILOGUE_DRELU : CUBLASLT_EPILOGUE_DRELU_BGRAD; - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, - sizeof(epi)); - if (!fuse_wb_) { - __half* bgrad = db_in_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &bgrad, sizeof(bgrad)); - } - __half* reluMask = mask_in_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &reluMask, sizeof(reluMask)); - long reluMaskLd = input_size; - cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &reluMaskLd, sizeof(reluMaskLd)); - } - - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_dRelu_top_desc_, CUDA_R_16F, output_size, - batch_size, output_size)); - HCTR_LIB_THROW(cublasLtMatrixLayoutCreate(&cublas_dRelu_bottom_desc_, CUDA_R_16F, input_size, - batch_size, input_size)); - - HCTR_LIB_THROW(cublasLtMatmulPreferenceCreate(&cublas_preference_dRelu_)); - - cublaslt_workspace_size_ = 1024 * 1024 * 8; // Set it to 8MB for now - HCTR_LIB_THROW(cudaMalloc(&cublaslt_workspace_dRelu_, cublaslt_workspace_size_)); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_dRelu_, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size_, - sizeof(cublaslt_workspace_size_))); - - uint32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_bprop_, - CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, - sizeof(pointer_mode))); - -#if CUBLAS_VERSION < 120000 - pointer_mode = CUBLASLT_POINTER_MODE_MASK_HOST; - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute(cublas_preference_dRelu_, - CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK, - &pointer_mode, sizeof(pointer_mode))); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_dRelu_, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &epi, sizeof(epi))); -#endif - - // By default set algo to best estimated heurstic - cublasLtMatmulHeuristicResult_t heuristic_result; - int returned_res = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, cublas_kernel_desc_, - cublas_dRelu_top_desc_, cublas_dRelu_bottom_desc_, cublas_dRelu_bottom_desc_, - cublas_preference_dRelu_, 1, &heuristic_result, &returned_res)); - - memcpy(&balgo_dRelu_, &heuristic_result.algo, sizeof(balgo_dRelu_)); - - if (returned_res == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } -} - -void FusedReluBiasFullyConnectedLayer::initialize_wgrad() { - // TODO: We need different bottom desc based on is_train or not - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - HCTR_LIB_THROW(cublasLtMatmulDescCreate(&cublas_op_desc_wgrad_, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transA = CUBLAS_OP_N; - cublasOperation_t transB = CUBLAS_OP_T; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, CUBLASLT_MATMUL_DESC_TRANSA, - &transA, sizeof(transA))); - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, CUBLASLT_MATMUL_DESC_TRANSB, - &transB, sizeof(transB))); - cublasLtEpilogue_t epi; - if (fuse_wb_ || pos_ == FcPosition_t::Tail || pos_ == FcPosition_t::Isolated) { - epi = CUBLASLT_EPILOGUE_BGRADA; - __half* bgrad = 
db_out_tensor_.get_ptr(); - cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, - sizeof(bgrad)); - } else { - epi = CUBLASLT_EPILOGUE_DEFAULT; - } - - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, - CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(epi))); - - HCTR_LIB_THROW(cublasLtMatmulPreferenceCreate(&cublas_preference_wgrad_)); - - cublaslt_workspace_size_ = 1024 * 1024 * 8; // Set it to 8MB for now - HCTR_LIB_THROW(cudaMalloc(&cublaslt_workspace_wgrad_, cublaslt_workspace_size_)); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_wgrad_, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size_, - sizeof(cublaslt_workspace_size_))); - - uint32_t pointer_mode = CUBLASLT_POINTER_MODE_HOST; - HCTR_LIB_THROW(cublasLtMatmulDescSetAttribute(cublas_op_desc_wgrad_, - CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointer_mode, - sizeof(pointer_mode))); - -#if CUBLAS_VERSION < 120000 - pointer_mode = CUBLASLT_POINTER_MODE_MASK_HOST; - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute(cublas_preference_wgrad_, - CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK, - &pointer_mode, sizeof(pointer_mode))); - HCTR_LIB_THROW(cublasLtMatmulPreferenceSetAttribute( - cublas_preference_wgrad_, CUBLASLT_MATMUL_PREF_EPILOGUE_MASK, &epi, sizeof(epi))); -#endif - - // By default set algo to best estimated heurstic - cublasLtMatmulHeuristicResult_t heuristic_result; - int returned_res = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, cublas_dRelu_top_desc_, - cublas_dRelu_bottom_desc_, cublas_kernel_desc_, cublas_kernel_desc_, cublas_preference_wgrad_, - 1, &heuristic_result, &returned_res)); - memcpy(&balgo_wgrad_, &heuristic_result.algo, sizeof(balgo_wgrad_)); - // returned_res is 0 indicates that there is no feasible algorithm. 
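Both the legacy wgrad path deleted here and the core23 path that remains after this patch (see the renamed initialize_wgrad() hunk further down) pick their GEMM the same way: describe the matmul, cap the workspace through a cublasLtMatmulPreference_t, request a single candidate from cublasLtMatmulAlgoGetHeuristic, and treat a returned count of zero as "no feasible algorithm". A minimal standalone sketch of that pattern, assuming FP16 operands with FP32 accumulation; pick_wgrad_algo and its parameters are illustrative names rather than HugeCTR API, and status checking is omitted:

#include <cstdint>
#include <cstring>
#include <cublasLt.h>

// Ask cuBLASLt for one heuristic candidate for D = A * B under a fixed workspace budget.
// A returned count of zero means no algorithm fits the constraints.
static cublasLtMatmulAlgo_t pick_wgrad_algo(cublasLtHandle_t lt, int64_t m, int64_t n, int64_t k,
                                            size_t workspace_bytes) {
  cublasLtMatmulDesc_t op;
  cublasLtMatmulDescCreate(&op, CUBLAS_COMPUTE_32F, CUDA_R_32F);

  cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
  cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_16F, m, k, m);
  cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_16F, k, n, k);
  cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16F, m, n, m);

  cublasLtMatmulPreference_t pref;
  cublasLtMatmulPreferenceCreate(&pref);
  cublasLtMatmulPreferenceSetAttribute(pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                                       &workspace_bytes, sizeof(workspace_bytes));

  cublasLtMatmulHeuristicResult_t result{};
  int returned = 0;
  cublasLtMatmulAlgoGetHeuristic(lt, op, a_desc, b_desc, d_desc, d_desc, pref,
                                 /*requestedAlgoCount=*/1, &result, &returned);

  cublasLtMatmulAlgo_t algo{};
  if (returned > 0) std::memcpy(&algo, &result.algo, sizeof(algo));
  return algo;
}

The wgrad descriptor in this file additionally folds the bias gradient into the same GEMM through the CUBLASLT_EPILOGUE_BGRADA epilogue set just above, so no separate bias-reduction kernel is needed when that epilogue is active.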
- if (returned_res == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } -} - -void FusedReluBiasFullyConnectedLayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = weights_half_[0].get_ptr(); - const __half* bias = weights_half_[1].get_ptr(); - const __half* bottom = get_bottom_tensor_fprop(is_train).get_ptr(); - __half* top_fprop = train_out_tensor_.get_ptr(); - __half* mask_out = mask_out_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(is_train).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta = 0.0f; - - HCTR_LIB_THROW(cublasLtMatmul( - get_gpu().get_cublaslt_handle(), cublas_op_desc_, &alpha, kernel, cublas_kernel_desc_, bottom, - cublas_bottom_desc_, &beta, top_fprop, cublas_top_desc_, top_fprop, cublas_top_desc_, - &falgo_k_, cublaslt_workspace_, cublaslt_workspace_size_, get_gpu().get_stream())); - - if ((pos_ == FcPosition_t::Tail || pos_ == FcPosition_t::Isolated) && - act_ != Activation_t::None) { - size_t len = train_out_tensor_.get_num_elements(); - HCTR_LIB_THROW(cudaMemcpyAsync(mask_out, top_fprop, len * sizeof(__half), - cudaMemcpyDeviceToDevice, get_gpu().get_stream())); - } -} - -void FusedReluBiasFullyConnectedLayer::bprop() { - CudaDeviceContext context(get_device_id()); - - const __half* kernel = weights_half_[0].get_ptr(); - const __half* train_out = train_out_tensor_.get_ptr(); - __half* mask_out = mask_out_tensor_.get_ptr(); - __half* kernel_grad = weights_grad_[0].get_ptr(); - __half* bias_grad = weights_grad_[1].get_ptr(); - __half* bottom = get_bottom_tensor_fprop(true).get_ptr(); - //__half* bottom_bprop = get_bottom_tensor_bprop(true).get_ptr(); - float* bias_grad_float = bias_grad_tensor_.get_ptr(); - __half* dRelu_top = dRelu_out_tensor_.get_ptr(); - const __half* identity = identity_tensor_.get_ptr(); - - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - size_t batch_size = bottom_tensor_dim[0]; - size_t output_size = top_tensor_dim[1]; - size_t input_size = bottom_tensor_dim[1]; - - const float alpha = 1.0f; - const float beta_k = 1.0f; - const float beta_x = 0.0f; - const float beta_b = 0.0f; - - // dRelu - if (pos_ == FcPosition_t::Tail || pos_ == FcPosition_t::Isolated) { - if (act_ != Activation_t::None) { - if ((batch_size * output_size) % 4 == 0) { - reverse_relu_kernel<<<(batch_size * output_size / 4 - 1) / 1024 + 1, 1024, 0, - get_gpu().get_stream()>>>(dRelu_top, mask_out, train_out, - batch_size * output_size); - } else - reverse_relu_kernel_not_aligned<<<(batch_size * output_size - 1) / 1024 + 1, 1024, 0, - get_gpu().get_stream()>>>(dRelu_top, mask_out, train_out, - batch_size * output_size); - } else - dRelu_top = train_out_tensor_.get_ptr(); - } - - // wait for dRelu - if (async_mlp_wgrad_) { - HCTR_LIB_THROW(cudaEventRecord(event_overlap_, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaStreamWaitEvent(get_gpu().get_comp_overlap_stream(), event_overlap_)); - } - - // bgrad+wgrad - HCTR_LIB_THROW(cublasLtMatmul( - get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, &alpha, dRelu_top, - cublas_dRelu_top_desc_, bottom, cublas_dRelu_bottom_desc_, &beta_k, kernel_grad, - cublas_kernel_desc_, kernel_grad, cublas_kernel_desc_, &balgo_wgrad_, - 
cublaslt_workspace_wgrad_, cublaslt_workspace_size_, - async_mlp_wgrad_ ? get_gpu().get_comp_overlap_stream() : get_gpu().get_stream())); - - // dgrad - if (!skip_dgrad_) { - __half* bottom_bprop; - if (head_mask_in_) { - bottom_bprop = mask_in_tensor_.get_ptr(); - } else { - bottom_bprop = train_in_tensor_.get_ptr(); - } - - if (pos_ == FcPosition_t::Body || pos_ == FcPosition_t::Tail) { - bottom_bprop = dRelu_in_tensor_.get_ptr(); - } - HCTR_LIB_THROW(cublasLtMatmul( - get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, &alpha, kernel, cublas_kernel_desc_, - dRelu_top, cublas_dRelu_top_desc_, &beta_x, bottom_bprop, cublas_dRelu_bottom_desc_, - bottom_bprop, cublas_dRelu_bottom_desc_, &balgo_dRelu_, cublaslt_workspace_dRelu_, - cublaslt_workspace_size_, get_gpu().get_stream())); - } - - if (async_mlp_wgrad_ && pos_ == FcPosition_t::Head) { - HCTR_LIB_THROW(cudaEventRecord(event_overlap_, this->get_gpu().get_comp_overlap_stream())); - HCTR_LIB_THROW(cudaStreamWaitEvent(this->get_gpu().get_stream(), event_overlap_)); - } -} - -void FusedReluBiasFullyConnectedLayer::search_algorithm() { - // Set to the CUDA device where this layer assigned to - CudaDeviceContext context(get_device_id()); - const size_t repeat_num = 100; - const int max_algo_count = 16; - - // Device Tensors to be used - __half* bottom = get_bottom_tensor_fprop(true).get_ptr(); - __half* top = train_out_tensor_.get_ptr(); - __half* kernel = weights_half_[0].get_ptr(); - __half* bias = weights_half_[1].get_ptr(); - __half* kernel_grad = weights_grad_[0].get_ptr(); - __half* bias_grad = weights_grad_[1].get_ptr(); - __half* identity = identity_tensor_.get_ptr(); - - // Tensor dim - const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).get_dimensions(); - const auto& top_tensor_dim = train_out_tensor_.get_dimensions(); - - int batch_size = bottom_tensor_dim[0]; - int output_size = top_tensor_dim[1]; - int input_size = bottom_tensor_dim[1]; - - // Record time for each algorithm - float shortestTime = std::numeric_limits::max(); - float time; - cudaEvent_t start, stop; - HCTR_LIB_THROW(cudaEventCreate(&start)); - HCTR_LIB_THROW(cudaEventCreate(&stop)); - - cublasLtMatmulHeuristicResult_t heuristic_result[max_algo_count] = {0}; - int algo_count = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_, cublas_kernel_desc_, cublas_bottom_desc_, - cublas_top_desc_, cublas_top_desc_, cublas_preference_, max_algo_count, heuristic_result, - &algo_count)); - - if (algo_count == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - for (int algoIdx = 0; algoIdx < algo_count; algoIdx++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = - cublasLtMatmul(get_gpu().get_cublaslt_handle(), cublas_op_desc_, &alpha, kernel, - cublas_kernel_desc_, bottom, cublas_bottom_desc_, &beta, top, - cublas_top_desc_, top, cublas_top_desc_, &heuristic_result[algoIdx].algo, - cublaslt_workspace_, cublaslt_workspace_size_, get_gpu().get_stream()); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != 
CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - - // if(get_device_id()==0) HCTR_LOG(INFO, WORLD, "Algo: %d, wavesCount: %f, time: %f\n", - // (int)heuristic_result[algoIdx].algo, - // heuristic_result[algoIdx].wavesCount, - // time); - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - memcpy(&falgo_k_, &heuristic_result[algoIdx].algo, sizeof(falgo_k_)); - // if(get_device_id()==0) HCTR_LOG(INFO, WORLD, "Picked algorithm: %d", - // heuristic_result[algoIdx].algo); - } - } - - // dRelu in backward pass - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - cublasLtMatmulHeuristicResult_t heuristic_result_dRelu[max_algo_count] = {0}; - int algo_count_dRelu = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, cublas_kernel_desc_, - cublas_dRelu_top_desc_, cublas_dRelu_bottom_desc_, cublas_dRelu_bottom_desc_, - cublas_preference_dRelu_, max_algo_count, heuristic_result_dRelu, &algo_count_dRelu)); - - if (algo_count_dRelu == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - for (int algoIdx = 0; algoIdx < algo_count_dRelu; algoIdx++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasLtMatmul(get_gpu().get_cublaslt_handle(), cublas_op_desc_bprop_, &alpha, - kernel, cublas_kernel_desc_, top, cublas_dRelu_top_desc_, &beta, - bottom, cublas_dRelu_bottom_desc_, bottom, cublas_dRelu_bottom_desc_, - &heuristic_result_dRelu[algoIdx].algo, cublaslt_workspace_dRelu_, - cublaslt_workspace_size_, get_gpu().get_stream()); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - memcpy(&balgo_dRelu_, &heuristic_result_dRelu[algoIdx].algo, sizeof(balgo_dRelu_)); - } - } - - // wgrad in backward pass - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - cublasLtMatmulHeuristicResult_t heuristic_result_wgrad[max_algo_count] = {0}; - int algo_count_wgrad = 0; - HCTR_LIB_THROW(cublasLtMatmulAlgoGetHeuristic( - get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, cublas_dRelu_top_desc_, - cublas_dRelu_bottom_desc_, cublas_kernel_desc_, cublas_kernel_desc_, cublas_preference_wgrad_, - max_algo_count, heuristic_result_wgrad, &algo_count_wgrad)); - - if (algo_count_wgrad == 0) { - HCTR_LIB_THROW(CUBLAS_STATUS_NOT_SUPPORTED); - } - - for (int algoIdx = 0; algoIdx < algo_count_wgrad; algoIdx++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasLtMatmul(get_gpu().get_cublaslt_handle(), cublas_op_desc_wgrad_, &alpha, top, - 
cublas_dRelu_top_desc_, bottom, cublas_dRelu_bottom_desc_, &beta, - kernel, cublas_kernel_desc_, kernel, cublas_kernel_desc_, - &heuristic_result_wgrad[algoIdx].algo, cublaslt_workspace_wgrad_, - cublaslt_workspace_size_, get_gpu().get_stream()); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // HCTR_LOG(INFO, WORLD, "algoIdx: %d, time: %f, shortest time: %f\n", algoIdx, time, - // shortestTime); Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for fprop, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - // HCTR_LOG(INFO, WORLD, "wgrad cublasMatmul algoIdx: %d, time: %f\n", algoIdx, shortestTime); - memcpy(&balgo_wgrad_, &heuristic_result_wgrad[algoIdx].algo, sizeof(balgo_wgrad_)); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Start, end for search - const cublasGemmAlgo_t startAlgo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; - const cublasGemmAlgo_t endAlgo = CUBLAS_GEMM_ALGO15_TENSOR_OP; - - // Search all the algorithm for balgo_k_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 1.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, output_size, - input_size, batch_size, &alpha, top, CUDA_R_16F, output_size, bottom, - CUDA_R_16F, input_size, &beta, kernel_grad, CUDA_R_16F, output_size, - CUDA_R_32F, static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - // HCTR_LOG(INFO, WORLD, "wgrad cublasGemmEx algoIdx: %d, time: %f\n", testAlgo, - // shortestTime); - balgo_k_ = static_cast(testAlgo); - } - } - - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_b_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const float alpha = 1.0f; - const float beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, output_size, 1, - batch_size, &alpha, top, CUDA_R_16F, output_size, identity, CUDA_R_16F, - batch_size, &beta, bias_grad, CUDA_R_16F, output_size, CUDA_R_32F, - static_cast(testAlgo)); - } - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - 
HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_W, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_b_ = static_cast(testAlgo); - } - } - // Reset shortestTime - shortestTime = std::numeric_limits::max(); - - // Search all the algorithm for balgo_x_ - for (int testAlgo = startAlgo; testAlgo <= endAlgo; testAlgo++) { - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - - const __half alpha = 1.0f; - const __half beta = 0.0f; - - // Record start event - HCTR_LIB_THROW(cudaEventRecord(start, get_gpu().get_stream())); - for (size_t i = 0; i < repeat_num && status == CUBLAS_STATUS_SUCCESS; ++i) { - status = cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, input_size, - batch_size, output_size, &alpha, kernel, CUDA_R_16F, output_size, top, - CUDA_R_16F, output_size, &beta, bottom, CUDA_R_16F, input_size, - CUDA_R_32F, static_cast(testAlgo)); - } - - HCTR_LIB_THROW(cudaEventRecord(stop, get_gpu().get_stream())); - HCTR_LIB_THROW(cudaEventSynchronize(stop)); - HCTR_LIB_THROW(cudaEventElapsedTime(&time, start, stop)); - // Avg Time(ms) for this algorithm for fprop GEMM - time = time / repeat_num; - // Skip if the algorithm is supported for fprop configuration - if (status != CUBLAS_STATUS_SUCCESS) { - // HCTR_LOG(INFO, WORLD, "The algorithms %d is not supported for bprop_Xn, skipped.\n", - // testAlgo); - continue; - } - // Record the optimal time and algorithm - if (time < shortestTime) { - shortestTime = time; - balgo_x_ = static_cast(testAlgo); - } - } - - // Print selection information - // HCTR_LOG(INFO, WORLD, "The algorithm selection for falgo_k_, balgo_k_, balgo_x_ are: %d, %d and - // %d.\n", - // (int)falgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_k_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP, - // (int)balgo_x_ - CUBLAS_GEMM_DEFAULT_TENSOR_OP); - - // Output msg - // HCTR_LOG(INFO, ROOT, "The fully-connected layer has finished choosing the algorithm for cublas - // Gemm.\n"); Clean-up - HCTR_LIB_THROW(cudaEventDestroy(start)); - HCTR_LIB_THROW(cudaEventDestroy(stop)); -} // namespace HugeCTR - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_uniform_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - float limit = 1.0f / ((0 == index ? bottom_dim : 0) + top_dim); - return std::make_unique(-1 * limit, limit); -} - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Uniform, - 0 == index ? bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - return std::make_unique(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, - 0 == index ? 
bottom_dim : 0, top_dim); -} - -std::unique_ptr FusedReluBiasFullyConnectedLayer::get_default_initializer( - const int index) { - size_t bottom_dim = get_bottom_tensor_fprop(true).get_dimensions()[1]; - size_t top_dim = train_out_tensor_.get_dimensions()[1]; - - std::unique_ptr simu(nullptr); - if (0 == index) { - simu.reset(new VarianceScalingSimulator(1.f, data_simu::Mode_t::Fan_avg, - data_simu::Distribution_t::Norm, bottom_dim, top_dim)); - } else if (1 == index) { - float stddev = sqrt(1.f / top_dim); - simu.reset(new GaussianDataSimulator(0, stddev, -2 * stddev, 2 * stddev)); - } else { - HCTR_OWN_THROW(Error_t::OutOfBound, "index != {0, 1}."); - } - - return simu; -} - -Core23TempFusedReluBiasFullyConnectedLayer::Core23TempFusedReluBiasFullyConnectedLayer( const core23::Tensor& train_in_tensor, const core23::Tensor& mask_in_tensor, const core23::Tensor& dRelu_in_tensor, const core23::Tensor& db_in_tensor, const core23::Tensor& train_out_tensor, const core23::Tensor& mask_out_tensor, @@ -827,8 +59,8 @@ Core23TempFusedReluBiasFullyConnectedLayer::Core23TempFusedReluBiasFullyConnecte const std::shared_ptr& gpu_resource, const FcPosition_t& pos, const Activation_t& act, const bool& skip_dgrad, std::vector initializer_types, const bool async_mlp_wgrad, const bool head_mask_in, const bool fuse_wb) - : Core23TempTrainableLayer<__half>({train_in_tensor}, {train_out_tensor}, gpu_resource, - initializer_types), + : TrainableLayer<__half>({train_in_tensor}, {train_out_tensor}, gpu_resource, + initializer_types), balgo_k_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_x_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), balgo_b_(CUBLAS_GEMM_DEFAULT_TENSOR_OP), @@ -891,7 +123,7 @@ Core23TempFusedReluBiasFullyConnectedLayer::Core23TempFusedReluBiasFullyConnecte cublas_handle_wgrad_ = gpu_resource->get_cublas_handle(); } -void Core23TempFusedReluBiasFullyConnectedLayer::initialize() { +void FusedReluBiasFullyConnectedLayer::initialize() { CudaDeviceContext context(get_device_id()); HCTR_LIB_THROW(cudaEventCreate(&event_overlap_)); event_overlap_created_ = true; @@ -976,7 +208,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::initialize() { initialize_wgrad(); } -void Core23TempFusedReluBiasFullyConnectedLayer::initialize_dgrad() { +void FusedReluBiasFullyConnectedLayer::initialize_dgrad() { // TODO: We need different bottom desc based on is_train or not const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).shape(); const auto& top_tensor_dim = this->output_tensors_[0].shape(); @@ -1058,7 +290,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::initialize_dgrad() { } } -void Core23TempFusedReluBiasFullyConnectedLayer::initialize_wgrad() { +void FusedReluBiasFullyConnectedLayer::initialize_wgrad() { // TODO: We need different bottom desc based on is_train or not const auto& bottom_tensor_dim = get_bottom_tensor_fprop(true).shape(); const auto& top_tensor_dim = this->output_tensors_[0].shape(); @@ -1123,7 +355,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::initialize_wgrad() { } } -void Core23TempFusedReluBiasFullyConnectedLayer::fprop(bool is_train) { +void FusedReluBiasFullyConnectedLayer::fprop(bool is_train) { CudaDeviceContext context(get_device_id()); const __half* kernel = weights_half_[0].data<__half>(); @@ -1155,7 +387,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::fprop(bool is_train) { } } -void Core23TempFusedReluBiasFullyConnectedLayer::bprop() { +void FusedReluBiasFullyConnectedLayer::bprop() { CudaDeviceContext context(get_device_id()); const __half* kernel = 
weights_half_[0].data<__half>(); @@ -1235,7 +467,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::bprop() { } } -void Core23TempFusedReluBiasFullyConnectedLayer::search_algorithm() { +void FusedReluBiasFullyConnectedLayer::search_algorithm() { // Set to the CUDA device where this layer assigned to CudaDeviceContext context(get_device_id()); const int64_t repeat_num = 100; @@ -1537,7 +769,7 @@ void Core23TempFusedReluBiasFullyConnectedLayer::search_algorithm() { HCTR_LIB_THROW(cudaEventDestroy(stop)); } // namespace HugeCTR -std::unique_ptr Core23TempFusedReluBiasFullyConnectedLayer::get_uniform_initializer( +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_uniform_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -1546,8 +778,8 @@ std::unique_ptr Core23TempFusedReluBiasFullyConnectedLayer::get_u return std::make_unique(-1 * limit, limit); } -std::unique_ptr -Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer(const int index) { +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer( + const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -1556,8 +788,8 @@ Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_uniform_initializer(const 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr -Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer(const int index) { +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer( + const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); @@ -1566,7 +798,7 @@ Core23TempFusedReluBiasFullyConnectedLayer::get_xavier_norm_initializer(const in 0 == index ? bottom_dim : 0, top_dim); } -std::unique_ptr Core23TempFusedReluBiasFullyConnectedLayer::get_default_initializer( +std::unique_ptr FusedReluBiasFullyConnectedLayer::get_default_initializer( const int index) { int64_t bottom_dim = get_bottom_tensor_fprop(true).shape().size(1); int64_t top_dim = this->output_tensors_[0].shape().size(1); diff --git a/HugeCTR/src/layers/gru_layer.cu b/HugeCTR/src/layers/gru_layer.cu index e06d69482a..cb7bb9eb21 100644 --- a/HugeCTR/src/layers/gru_layer.cu +++ b/HugeCTR/src/layers/gru_layer.cu @@ -31,386 +31,11 @@ namespace HugeCTR { template -GRULayer::GRULayer(const std::shared_ptr>& weight_buff, - const std::shared_ptr>& wgrad_buff, - const Tensor2& in_tensor, const Tensor2& out_tensor, size_t hiddenSize, - size_t batch_size, size_t SeqLength, size_t embedding_vec_size, - const std::shared_ptr& gpu_resource, +GRULayer::GRULayer(const core23::Tensor& in_tensor, const core23::Tensor& out_tensor, + int64_t hiddenSize, int64_t batch_size, int64_t SeqLength, + int64_t embedding_vec_size, const std::shared_ptr& gpu_resource, std::vector initializer_types) - : Layer(gpu_resource, initializer_types) { - try { - CudaDeviceContext context(this->get_device_id()); - // check the in_tensor and out_tensor - const auto& in_tensor_dim = in_tensor.get_dimensions(); - const auto& out_tensor_dim = out_tensor.get_dimensions(); - - // 2. dim match? 
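On the dimension question raised in the comments here: cuDNN is handed the GRU input as seqLength x miniBatch vectors of width embedding_vec_size and the output as the same layout with width hiddenSize, both via cudnnSetRNNDataDescriptor in the sequence-major unpacked layout (the removed calls appear a little further down this constructor). A minimal sketch of that descriptor setup, assuming every sample uses the full sequence length and FP32 data; make_rnn_data_desc and its arguments are illustrative names, not HugeCTR API:

#include <cudnn.h>
#include <vector>

// Sequence-major, unpacked RNN data descriptor. Pass embedding_vec_size as vector_size
// for the input descriptor and hidden_size for the output descriptor.
static cudnnRNNDataDescriptor_t make_rnn_data_desc(int max_seq_len, int batch_size,
                                                   int vector_size) {
  std::vector<int> seq_lens(batch_size, max_seq_len);  // all samples run the full sequence
  cudnnRNNDataDescriptor_t desc;
  cudnnCreateRNNDataDescriptor(&desc);
  cudnnSetRNNDataDescriptor(desc, CUDNN_DATA_FLOAT, CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
                            max_seq_len, batch_size, vector_size, seq_lens.data(),
                            /*paddingFill=*/nullptr);
  return desc;
}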
- // seqLength = in_tensor_dim[1]; - // m = out_tensor_dim[1]; - // miniBatch = in_tensor_dim[0]; - // HCTR_LOG(INFO, WORLD, "m %lu n %lu k %lu \n ", m, n,k); - hiddenSize_ = hiddenSize; - miniBatch = batch_size; - seqLength_ = SeqLength; - embedding_vec_size_ = embedding_vec_size; - - inputTensorSize = miniBatch * seqLength_ * embedding_vec_size_; - outputTensorSize = miniBatch * seqLength_ * hiddenSize_; - hiddenTensorSize = miniBatch * hiddenSize_; - - // weightSpaceSize = m*k + m*m + 1*m; //include W, U weight matrixs and bias vector. - - // HCTR_LIB_THROW(cudnnSetTensor4dDescriptorEx(hDesc, data_type, n, 1, 1, n, - // n, 1, 1, 1)); - - // HCTR_LIB_THROW(cudnnSetTensor4dDescriptorEx(cDesc, data_type, 1, n, m, n, - // n, 1, 1, 1)); - seqLengthArray = new int[miniBatch]; - - for (size_t i = 0; i < miniBatch; i++) { - seqLengthArray[i] = seqLength_; - } - - // cudnnHandle= get_gpu().get_cudnn_handle(); - HCTR_LIB_THROW(cudnnCreate(&cudnnHandle)); - data_type = CudnnDataType::getType(); - HCTR_LIB_THROW(cudnnCreateRNNDescriptor(&rnnDesc)); - HCTR_LIB_THROW(cudnnCreateRNNDataDescriptor(&in_Desc)); - HCTR_LIB_THROW(cudnnCreateRNNDataDescriptor(&out_Desc)); - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&cDesc)); - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&hDesc)); - HCTR_LIB_THROW(cudnnCreateDropoutDescriptor(&dropoutDesc)); - - HCTR_LIB_THROW(cudnnSetRNNDataDescriptor( - in_Desc, // cudnnRNNDataDescriptor_t RNNDataDesc, - data_type, // cudnnDataType_t dataType, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - // //cudnnRNNDataLayout_t layout, - seqLength_, // int maxSeqLength, - miniBatch, // int batchSize, - embedding_vec_size_, // int vectorSize, - seqLengthArray, // const int seqLengthArray[], - NULL // void *paddingFill - )); - - HCTR_LIB_THROW(cudnnSetRNNDataDescriptor( - out_Desc, // cudnnRNNDataDescriptor_t RNNDataDesc, - data_type, // cudnnDataType_t dataType, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - // //cudnnRNNDataLayout_t layout, - seqLength_, // int maxSeqLength, - miniBatch, // int batchSize, - hiddenSize_, // int vectorSize, - seqLengthArray, // const int seqLengthArray[], - NULL // void *paddingFill - )); - dimHidden[0] = 1 * 1; - dimHidden[1] = miniBatch; - dimHidden[2] = hiddenSize_; - strideHidden[0] = dimHidden[1] * dimHidden[2]; - strideHidden[1] = dimHidden[2]; - strideHidden[2] = 1; - HCTR_LIB_THROW(cudnnSetTensorNdDescriptor(hDesc, data_type, 3, dimHidden, strideHidden)); - HCTR_LIB_THROW(cudnnSetTensorNdDescriptor(cDesc, data_type, 3, dimHidden, strideHidden)); - - HCTR_LIB_THROW(cudnnDropoutGetStatesSize(cudnnHandle, &stateSize)); - HCTR_LIB_THROW(cudaMalloc(&states, stateSize)); - seed = 0; // 1337ull; - HCTR_LIB_THROW( - cudnnSetDropoutDescriptor(dropoutDesc, cudnnHandle, dropout, states, stateSize, seed)); - - HCTR_LIB_THROW(cudnnSetRNNDescriptor_v8( - rnnDesc, - CUDNN_RNN_ALGO_STANDARD, // cudnnRNNAlgo_t algo, - CUDNN_GRU, // cudnnRNNMode_t cellMode, - CUDNN_RNN_SINGLE_INP_BIAS, // cudnnRNNBiasMode_t biasMode, - CUDNN_UNIDIRECTIONAL, // cudnnDirectionMode_t dirMode, - CUDNN_LINEAR_INPUT, // CUDNN_SKIP_INPUT, //CUDNN_LINEAR_INPUT, //cudnnRNNInputMode_t - // inputMode, CUDNN_SKIP_INPUT :without multiplying input by the weight - // matrix - data_type, // cudnnDataType_t dataType, - data_type, // cudnnDataType_t mathPrec, - CUDNN_TENSOR_OP_MATH, // CUDNN_DEFAULT_MATH , //cudnnMathType_t mathType, - embedding_vec_size_, // int32_t embedding_vec_size, When the 
inputMode=CUDNN_SKIP_INPUT, - // the embedding_vec_size should match the hiddenSize value - hiddenSize_, // int32_t hiddenSize, - hiddenSize_, // int32_t projSize, - 1, // int32_t numLayers, BIDIRECTIONAL=2 - dropoutDesc, // cudnnDropoutDescriptor_t dropoutDesc, - CUDNN_RNN_PADDED_IO_DISABLED // uint32_t auxFlags - )); - - // const int seqLengthArray[in_tensor_dim[0]] = { [0...10] = int(in_tensor_dim[1]) }; - // const int seqLengthArray[m] ={n,n....n}; - // for(int i=0; i weight_dim = {weightSpaceSize/sizeof(T), 1}; - // std::vector dx_dim = {inputTensorSize, 1}; - // std::vector dy_dim = {outputTensorSize, 1}; - // std::vector dhx_dim = {hiddenTensorSize, 1}; - // std::vector dhy_dim = {hiddenTensorSize, 1}; - // std::vector dcx_dim = {hiddenTensorSize, 1}; - // std::vector dcy_dim = {hiddenTensorSize, 1}; - - std::vector weight_dim = {1, weightSpaceSize / sizeof(T)}; - std::vector hx_dim = {1, hiddenTensorSize}; - std::vector dx_dim = {1, inputTensorSize}; - std::vector dy_dim = {1, outputTensorSize}; - std::vector dhx_dim = {1, hiddenTensorSize}; - std::vector dhy_dim = {1, hiddenTensorSize}; - std::vector dweigths_dim = {1, weightSpaceSize / sizeof(T)}; - // HCTR_LOG(INFO, WORLD, "weighsize %zu\n", weightSpaceSize/sizeof(T)); - - { - Tensor2 tensor; - weight_buff->reserve(weight_dim, &tensor); - weights_.push_back(tensor); - } - - { - Tensor2 tensor; - weight_buff->reserve(hx_dim, &tensor); - weights_.push_back(tensor); - } - - { - Tensor2 tensor; - wgrad_buff->reserve(dx_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dy_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dhx_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dhy_dim, &tensor); - wgrad_.push_back(tensor); - } - { - Tensor2 tensor; - wgrad_buff->reserve(dweigths_dim, &tensor); - wgrad_.push_back(tensor); - } - - HCTR_LIB_THROW(cudaMalloc((void**)&devSeqLengthArray, miniBatch * sizeof(int))); - HCTR_LIB_THROW(cudaMemcpy(devSeqLengthArray, seqLengthArray, miniBatch * sizeof(int), - cudaMemcpyHostToDevice)); - HCTR_LIB_THROW(cudaMalloc((void**)&weightSpace, weightSpaceSize)); - HCTR_LIB_THROW(cudaMalloc((void**)&workSpace, workSpaceSize)); - HCTR_LIB_THROW(cudaMalloc((void**)&reserveSpace, reserveSpaceSize)); - // HCTR_LIB_THROW(cudaMalloc((void **)&dweightSpace, weightSpaceSize)); - - in_tensors_.push_back(in_tensor); - out_tensors_.push_back(out_tensor); - // Where should we create this cuBLAS handle? 
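Two notes on this setup block: the commented-out get_gpu().get_cudnn_handle() above is how other layers in this patch obtain their library handles (compare get_gpu().get_cublas_handle() in the fully connected layers), so a layer-local cudnnCreate is only needed if the layer truly wants its own handle; and the weightSpaceSize / workSpaceSize / reserveSpaceSize byte counts consumed by the cudaMalloc calls above are typically queried from cuDNN once the RNN and data descriptors exist. A hedged sketch of that query, reusing the rnnDesc and in_Desc members set up earlier in this constructor and assuming the cuDNN 8 API:

// Sketch only (cuDNN 8): query the buffer sizes that the cudaMalloc calls above consume.
size_t weight_bytes = 0, work_bytes = 0, reserve_bytes = 0;
cudnnGetRNNWeightSpaceSize(cudnnHandle, rnnDesc, &weight_bytes);
cudnnGetRNNTempSpaceSizes(cudnnHandle, rnnDesc, CUDNN_FWD_MODE_TRAINING, in_Desc,
                          &work_bytes, &reserve_bytes);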
- } catch (const std::runtime_error& rt_err) { - HCTR_LOG_S(ERROR, WORLD) << rt_err.what() << std::endl; - throw; - } -} - -//#define KERAS_CHECK -template -void GRULayer::fprop(bool is_train) { - CudaDeviceContext context(get_device_id()); - - Tensor2& in_tensor = get_in_tensors(is_train)[0]; - Tensor2& out_tensor = out_tensors_[0]; - - T* weight = weights_[0].get_ptr(); - T* hx = weights_[1].get_ptr(); - // T* Uweight = weights_[1].get_ptr(); - // T* bias = weights_[2].get_ptr(); - - T* in = in_tensor.get_ptr(); - T* out = out_tensor.get_ptr(); -// T* hx = weights_[0].get_ptr(); -// HCTR_LOG(INFO, WORLD, "datatype %lu\n", sizeof(data_type)); -// HCTR_LIB_THROW(cudaMalloc((void **)&in, inputTensorSize * sizeof(T))); - -// HCTR_LIB_THROW(cublasGemmEx(get_gpu().get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, -// &alpha, weight, CUDA_R_32F, n, in, CUDA_R_32F, k, &beta, out, -// CUDA_R_32F, n, CUDA_R_32F, falgo_)); -#ifdef KERAS_CHECK - cudnnTensorDescriptor_t wDesc; - cudnnTensorDescriptor_t bDesc; - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&wDesc)); - HCTR_LIB_THROW(cudnnCreateTensorDescriptor(&bDesc)); - - // Tensor2 linLayerMat; - // Tensor2 linLayerBias; - numLinearLayers = 6; // cellMode == CUDNN_GRU - for (int linLayerID = 0; linLayerID < numLinearLayers; linLayerID++) { - T* linLayerMat = NULL; - T* linLayerBias = NULL; - int nbDims = 0; - int dim[3] = {0, 0, 0}, stride[3]; - int layer = 0; - // HCTR_LOG(INFO, WORLD, "weightSpaceSize %zu\n", weightSpaceSize); - HCTR_LIB_THROW(cudnnGetRNNWeightParams(cudnnHandle, rnnDesc, layer, weightSpaceSize, - weights_[0].get_ptr(), // weightSpace, - linLayerID, wDesc, - (void**)&linLayerMat, //.get_ptr(), - bDesc, - (void**)&linLayerBias //.get_ptr() - )); - - if (linLayerMat) { - HCTR_LIB_THROW(cudnnGetTensorNdDescriptor(wDesc, 3, &data_type, &nbDims, dim, stride)); - size_t w = dim[0] * dim[1] * dim[2]; - T* h_weights = new T[w]; - HCTR_LIB_THROW(cudaMemcpy(h_weights, linLayerMat, sizeof(T) * w, cudaMemcpyDeviceToHost)); - - HCTR_LOG(INFO, ROOT, "W_%d %zu ", linLayerID, w); - for (unsigned int i = 0; i < w; i++) { - HCTR_PRINT(INFO, "%f ", h_weights[i]); - } - HCTR_PRINT(INFO, "\n"); - - delete[] h_weights; - } - - if (linLayerBias) { - HCTR_LIB_THROW(cudnnGetTensorNdDescriptor(bDesc, 3, &data_type, &nbDims, dim, stride)); - size_t w = dim[0] * dim[1] * dim[2]; - T* h_weights = new T[w]; - HCTR_LIB_THROW(cudaMemcpy(h_weights, linLayerBias, sizeof(T) * w, cudaMemcpyDeviceToHost)); - - HCTR_LOG(INFO, ROOT, "B_%d %zu ", linLayerID, w); - for (unsigned int i = 0; i < w; i++) { - HCTR_PRINT(INFO, "%f ", h_weights[i]); - } - HCTR_PRINT(INFO, "\n"); - - delete[] h_weights; - } - } - - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(wDesc)); - HCTR_LIB_THROW(cudnnDestroyTensorDescriptor(bDesc)); -#endif - // CUDNN GRU - // T tmp[hiddenTensorSize]; - // HCTR_LIB_THROW(cudaMemcpy(tmp, weight + weightSpaceSize/sizeof(T), sizeof(T) * - // hiddenTensorSize, cudaMemcpyDeviceToHost)); for(size_t i=0;i