Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement L1Loss #3401

Open
wants to merge 28 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8de41ef
add kernel for l1lossreducedforward5d
cognaiger9 May 13, 2024
0d00d6d
draft for utilities
cognaiger9 May 16, 2024
005fd3c
add 3 files in include/miopen/l1loss
cognaiger9 May 16, 2024
c3ba011
pull new driver code
cognaiger9 May 17, 2024
a508bbb
Merge branch 'develop-moreh' of github.com:ngoccoder/MIOpen into impl…
cognaiger9 May 17, 2024
de7c0e6
add driver code
cognaiger9 May 17, 2024
7e2014f
fix bug related to workspace
cognaiger9 May 21, 2024
463df2b
add driver for small sized tensor, need to investigate more
cognaiger9 May 22, 2024
415c191
add gtest script
cognaiger9 May 22, 2024
9b1c403
complete gtest cpu and gpu
cognaiger9 May 23, 2024
3bd6980
draft backward phase of l1loss
cognaiger9 May 24, 2024
256c2e0
add driver
cognaiger9 May 24, 2024
4524b09
complete driver for l1loss
cognaiger9 May 27, 2024
8b2f2f5
fix bug related to bfp16 data type in gtest
cognaiger9 May 27, 2024
f2c0750
add filter for forward case
cognaiger9 May 27, 2024
5bc0dfb
add only l1loss forward reduced
cognaiger9 May 30, 2024
3609cb0
remove redundant part
cognaiger9 Jun 4, 2024
8851320
merge rocm develop
cognaiger9 Jul 30, 2024
cdf3853
update benchmark method
cognaiger9 Aug 1, 2024
eeb971d
commit change
cognaiger9 Aug 5, 2024
eb48e6f
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_l1…
cognaiger9 Nov 20, 2024
5fa31e6
change reduction procedure, still get inf result
cognaiger9 Nov 22, 2024
6e05582
fix gtest and driver
cognaiger9 Nov 22, 2024
ce8be4f
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_l1…
cognaiger9 Nov 22, 2024
675fe1c
Merge branch 'develop' of https://github.com/ROCm/MIOpen into impl_l1…
cognaiger9 Nov 25, 2024
bd01023
Merge branch 'develop' into impl_l1loss_merged
long10024070 Dec 3, 2024
aa1e986
Merge branch 'develop' into impl_l1loss_merged
cognaiger9 Dec 12, 2024
e113fc8
Merge branch 'develop' into impl_l1loss_merged
long10024070 Dec 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix bug related to workspace
  • Loading branch information
cognaiger9 committed May 21, 2024
commit 7e2014f3432480ad6ae35204325ec3de8e76a500
8 changes: 4 additions & 4 deletions driver/dm_l1loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@

// Factory for the standalone MIOpenDriver tool: maps the base command-line
// argument ("l1loss", "l1lossfp16", "l1lossbfp16") to the matching
// L1LossDriver instantiation (GPU type / reference type).
// Returns nullptr when the argument does not name an L1Loss driver, so the
// framework can try the other registered makers.
// NOTE(review): the returned raw pointer is presumably owned and deleted by
// the driver framework — the Driver* interface is fixed by
// REGISTER_DRIVER_MAKER, so a smart pointer cannot be used here.
static Driver* makeDriver(const std::string& base_arg)
{
    if(base_arg == "l1loss")
        return new L1LossDriver<float, float>();
    if(base_arg == "l1lossfp16")
        return new L1LossDriver<float16, float>();
    if(base_arg == "l1lossbfp16")
        return new L1LossDriver<bfloat16, float>();
    return nullptr;
}

REGISTER_DRIVER_MAKER(makeDriver);
3 changes: 2 additions & 1 deletion driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" &&
arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
arg != "catfp16" && arg != "catbfp16" && arg != "l1loss" && arg != "l1lossfp16" && arg != "l1lossbfp16" && arg != "--version")
arg != "catfp16" && arg != "catbfp16" && arg != "l1loss" && arg != "l1lossfp16" &&
arg != "l1lossbfp16" && arg != "--version")
{
printf("FAILED: Invalid Base Input Argument\n");
Usage();
Expand Down
92 changes: 52 additions & 40 deletions driver/l1loss_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
#define MLO_L1LOSSMHOST_H_

template <typename Tgpu, typename Tcheck>
int32_t mloSmoothL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc,
int32_t mloL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDesc,
const miopenTensorDescriptor_t tDesc,
const Tgpu* input,
const Tgpu* target,
Expand All @@ -63,18 +63,20 @@ int32_t mloSmoothL1LossReducedForwardRunHost(const miopenTensorDescriptor_t iDes
int32_t divisor = (reduction == MIOPEN_L1LOSS_MEAN_REDUCTION) ? size : 1;

/* Phase 1: Calc loss for each element. */
for (size_t i = 0; i < size; i++) {
for(size_t i = 0; i < size; i++)
{
uint64_t n[5];
GET_NCDHW(n[0], n[1], n[2], n[3], n[4], i, I_tv);
uint64_t Iidx = TV5D_IDX(I_tv, n[0], n[1], n[2], n[3], n[4]);
uint64_t Tidx = TV5D_IDX(T_tv, n[0], n[1], n[2], n[3], n[4]);
uint64_t Iidx = TV5D_IDX(I_tv, n[0], n[1], n[2], n[3], n[4]);
uint64_t Tidx = TV5D_IDX(T_tv, n[0], n[1], n[2], n[3], n[4]);
workspacehost[Iidx] = abs(input[Iidx] - target[Tidx]) / divisor;
}

/* Phase 2: Reduce */
double output = 0.0;
for (size_t i = 0; i < size; i++) {
output += workspacehost[i];
for(size_t i = 0; i < size; i++)
{
output += workspacehost[i];
}
outputhost[0] = output;

Expand Down Expand Up @@ -198,28 +200,34 @@ int L1LossDriver<Tgpu, Tref>::GetandSetData()
reduction = static_cast<miopenL1LossReduction_t>(inflags.GetValueInt("Reduction"));

auto length = GetTensorLengthsFromCmdLine();
auto in_strides = GetStrides(length, 1);
auto tar_strides = GetStrides(length, inflags.GetValueInt("Contiguous"));
//auto in_strides = GetStrides(length, 1);
//auto tar_strides = GetStrides(length, inflags.GetValueInt("Contiguous"));

SetTensorNd(inputDesc, length, in_strides, data_type);
SetTensorNd(targetDesc, length, tar_strides, data_type);
//SetTensorNd(inputDesc, length, in_strides, data_type);
//SetTensorNd(targetDesc, length, tar_strides, data_type);
SetTensorNd(inputDesc, length, data_type);
SetTensorNd(targetDesc, length, data_type);

if(reduction == MIOPEN_L1LOSS_NONE_REDUCTION)
{
SetTensorNd(outputDesc, length, in_strides, data_type);
//SetTensorNd(outputDesc, length, in_strides, data_type);
SetTensorNd(outputDesc, length, data_type);
}
else
{
std::vector<int> out_lens = {1};
SetTensorNd(outputDesc, out_lens, data_type);
}

SetTensorNd(diDesc, length, in_strides, data_type);
SetTensorNd(dtDesc, length, tar_strides, data_type);
//SetTensorNd(diDesc, length, in_strides, data_type);
//SetTensorNd(dtDesc, length, tar_strides, data_type);
SetTensorNd(diDesc, length, data_type);
SetTensorNd(dtDesc, length, data_type);

if(reduction == MIOPEN_L1LOSS_NONE_REDUCTION)
{
SetTensorNd(doDesc, length, in_strides, data_type);
//SetTensorNd(doDesc, length, in_strides, data_type);
SetTensorNd(doDesc, length, data_type);
}
else
{
Expand Down Expand Up @@ -251,7 +259,7 @@ int L1LossDriver<Tgpu, Tref>::AddCmdLineArgs()
"(Default=0)",
"int");
inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
inflags.AddInputFlag(
"wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
Expand Down Expand Up @@ -301,13 +309,15 @@ int L1LossDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
size_t in_sz = GetTensorSize(inputDesc);
size_t tar_sz = GetTensorSize(targetDesc);
size_t out_sz = GetTensorSize(outputDesc);
size_t ws_sz = GetTensorSize(inputDesc);

miopenGetL1LossForwardWorkspaceSize(GetHandle(), reduction, inputDesc, targetDesc, outputDesc, &ws_sizeInBytes);
miopenGetL1LossForwardWorkspaceSize(
GetHandle(), reduction, inputDesc, targetDesc, outputDesc, &ws_sizeInBytes);

if(ws_sizeInBytes == static_cast<size_t>(-1))
return miopenStatusAllocFailed;

size_t ws_sz = ws_sizeInBytes / sizeof(Tgpu);

uint32_t ctx = 0;

in_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, in_sz, sizeof(Tgpu)));
Expand Down Expand Up @@ -368,16 +378,16 @@ int L1LossDriver<Tgpu, Tref>::RunForwardGPU()

for(int i = 0; i < inflags.GetValueInt("iter"); i++)
{
miopenSmoothL1LossForward(GetHandle(),
reduction,
workspace_dev->GetMem(),
ws_sizeInBytes,
inputDesc,
in_dev->GetMem(),
targetDesc,
tar_dev->GetMem(),
outputDesc,
out_dev->GetMem());
miopenL1LossForward(GetHandle(),
reduction,
workspace_dev->GetMem(),
ws_sizeInBytes,
inputDesc,
in_dev->GetMem(),
targetDesc,
tar_dev->GetMem(),
outputDesc,
out_dev->GetMem());

float time = 0.0;
miopenGetKernelTime(GetHandle(), &time);
Expand Down Expand Up @@ -411,22 +421,22 @@ int L1LossDriver<Tgpu, Tref>::RunForwardCPU()
{
if(reduction == MIOPEN_L1LOSS_MEAN_REDUCTION || reduction == MIOPEN_L1LOSS_SUM_REDUCTION)
{
mloSmoothL1LossReducedForwardRunHost<Tgpu, Tref>(inputDesc,
targetDesc,
in.data(),
tar.data(),
workspacehost.data(),
outhost.data(),
reduction);
mloL1LossReducedForwardRunHost<Tgpu, Tref>(inputDesc,
targetDesc,
in.data(),
tar.data(),
workspacehost.data(),
outhost.data(),
reduction);
}

return miopenStatusSuccess;
}

/*
template <typename Tgpu, typename Tref>
int L1LossDriver<Tgpu, Tref>::RunBackwardGPU()
{
/*
float kernel_total_time = 0;
float kernel_first_time = 0;

Expand Down Expand Up @@ -478,13 +488,15 @@ int L1LossDriver<Tgpu, Tref>::RunBackwardGPU()
std::cerr << "Error copying (dI_dev) from GPU, size: " << dI_dev->GetSize() << std::endl;
if(dT_dev->FromGPU(GetStream(), dT.data()) != 0)
std::cerr << "Error copying (dT_dev) from GPU, size: " << dT_dev->GetSize() << std::endl;
*/

return miopenStatusSuccess;
}

template <typename Tgpu, typename Tref>
int L1LossDriver<Tgpu, Tref>::RunBackwardCPU()
{
/*
if(!std::isnan(divisor))
{
mloSmoothL1LossReducedBackwardRunHost<Tgpu, Tref>(inputDesc,
Expand All @@ -499,10 +511,10 @@ int L1LossDriver<Tgpu, Tref>::RunBackwardCPU()
beta,
divisor);
}
*/

return miopenStatusSuccess;
}
*/

template <typename Tgpu, typename Tref>
Tref L1LossDriver<Tgpu, Tref>::GetTolerance()
Expand Down Expand Up @@ -531,17 +543,17 @@ int L1LossDriver<Tgpu, Tref>::VerifyForward()
}
else
{
std::cout << "Forward L1Loss Verifies OK on CPU reference (" << error << " < "
<< tolerance << ')' << std::endl;
std::cout << "Forward L1Loss Verifies OK on CPU reference (" << error << " < " << tolerance
<< ')' << std::endl;
}

return miopenStatusSuccess;
}

/*
template <typename Tgpu, typename Tref>
int L1LossDriver<Tgpu, Tref>::VerifyBackward()
{
/*
RunBackwardCPU();
const Tref tolerance = GetTolerance();
auto error_dI = miopen::rms_range(dIhost, dI);
Expand All @@ -559,9 +571,9 @@ int L1LossDriver<Tgpu, Tref>::VerifyBackward()
std::cout << "Backward SmoothL1Loss Verifies OK on CPU reference ({" << error_dI << ","
<< error_dT << "} < " << tolerance << ')' << std::endl;
}
*/

return miopenStatusSuccess;
}
*/

#endif // GUARD_MIOPEN_L1LOSS_DRIVER_HPP
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ set( MIOpen_Source
invoker_cache.cpp
kernel_build_params.cpp
kernel_warnings.cpp
l1loss.cpp
l1loss_api.cpp
l1loss/problem_description.cpp
layernorm_api.cpp
layernorm/problem_description.cpp
load_file.cpp
Expand Down Expand Up @@ -260,6 +263,7 @@ set( MIOpen_Source
solver/gemm_bwd.cpp
solver/gemm_wrw.cpp
solver/groupnorm/forward_groupnorm.cpp
solver/l1loss/forward_l1loss.cpp
solver/layernorm/forward_layernorm.cpp
solver/layernorm/forward_layernorm2d_ck.cpp
solver/layernorm/forward_layernorm4d_ck.cpp
Expand Down Expand Up @@ -421,6 +425,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/neuron.inc
kernels/rocm_version.inc
kernels/stride_array.hpp
kernels/tensor_view_5d.hpp
kernels/utilities.inc
kernels/workaround_issue_1431.hpp
kernels/xform_bidirect_winograd_code.inc
Expand Down Expand Up @@ -455,6 +460,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
kernels/MIOpenConvDirBatchNormActiv.cl
kernels/MIOpenConvDirGenFwd.cl
kernels/MIOpenGroupNorm.cpp
kernels/MIOpenL1Loss.cpp
kernels/MIOpenLayerNorm.cpp
kernels/MIOpenLRNBwd.cl
kernels/MIOpenLRNFwd.cl
Expand Down
2 changes: 1 addition & 1 deletion src/include/miopen/l1loss/problem_description.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ struct L1LossFwdProblemDescription : ProblemDescriptionBase
}
}

miopenL1LossReduction_t GetReduction_() const { return reduction; }
miopenL1LossReduction_t GetReduction() const { return reduction; }
const TensorDescriptor& GetIDesc() const { return iDesc; }
const TensorDescriptor& GetTDesc() const { return tDesc; }
const TensorDescriptor& GetODesc() const { return oDesc; }
Expand Down
1 change: 1 addition & 0 deletions src/include/miopen/l1loss/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ struct L1LossForward5d final : L1LossForwardSolverBase
std::size_t
GetWorkspaceSize(const ExecutionContext& context,
const miopen::l1loss::L1LossFwdProblemDescription& problem) const override;
bool MayNeedWorkspace() const override { return true; }
};

/*
Expand Down
3 changes: 2 additions & 1 deletion src/include/miopen/solver_id.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ enum class Primitive
Reduce,
Cat,
Mha,
Softmax
Softmax,
L1Loss
};

struct MIOPEN_EXPORT Id
Expand Down
2 changes: 0 additions & 2 deletions src/kernels/MIOpenL1Loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
* SOFTWARE.
*
*******************************************************************************/
#include <__clang_hip_runtime_wrapper.h>
#include <cstddef>
#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
Expand Down
18 changes: 10 additions & 8 deletions src/l1loss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
*
*******************************************************************************/

#include "miopen/l1loss/problem_description.hpp"
#include "miopen/miopen.h"
#include <miopen/datatype.hpp>
#include <miopen/find_solution.hpp>
Expand All @@ -37,23 +38,24 @@
namespace miopen {

// Computes the workspace size, in bytes, required by the L1Loss forward
// solver for the given reduction mode and tensor descriptors.
//
// @param handle    MIOpen handle used to build the execution context.
// @param reduction Reduction mode (none / sum / mean) forwarded into the
//                  problem description.
// @param iDesc     Input tensor descriptor.
// @param tDesc     Target tensor descriptor.
// @param oDesc     Output tensor descriptor.
// @return Workspace size in bytes of the first applicable solver, or
//         static_cast<size_t>(-1) when no solver applies — callers treat
//         that sentinel as an allocation failure.
size_t GetL1LossForwardWorkspaceSize(Handle& handle,
                                     miopenL1LossReduction_t reduction,
                                     const TensorDescriptor& iDesc,
                                     const TensorDescriptor& tDesc,
                                     const TensorDescriptor& oDesc)
{
    auto ctx           = ExecutionContext{&handle};
    const auto problem = l1loss::L1LossFwdProblemDescription{iDesc, tDesc, oDesc, reduction};

    const auto algo    = AlgorithmName{"L1LossForward"};
    const auto solvers = solver::SolverContainer<solver::l1loss::L1LossForward5d>{};

    // Each entry pairs a solver name with its workspace requirement; the
    // front entry belongs to the first (preferred) applicable solver.
    auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem);

    return pair_size_vector.empty() ? static_cast<size_t>(-1) : pair_size_vector.front().second;
}

miopenStatus_t SmoothL1LossForward(Handle& handle,
miopenStatus_t L1LossForward(Handle& handle,
miopenL1LossReduction_t reduction,
Data_t workspace,
size_t workspaceSizeInBytes,
Expand All @@ -64,10 +66,10 @@ miopenStatus_t SmoothL1LossForward(Handle& handle,
const TensorDescriptor& oDesc,
Data_t o)
{
const auto problem = l1loss::ReducedForwardProblemDescription{iDesc, tDesc, oDesc};
const auto problem = l1loss::L1LossFwdProblemDescription{iDesc, tDesc, oDesc, reduction};

const auto invoke_params = [&]() {
auto tmp = smoothl1loss::InvokeParams{};
auto tmp = l1loss::InvokeParams{};
tmp.type = InvokeType::Run;
tmp.iDesc = &iDesc;
tmp.tDesc = &tDesc;
Expand All @@ -80,9 +82,9 @@ miopenStatus_t SmoothL1LossForward(Handle& handle,
return tmp;
}();

const auto algo = AlgorithmName{"SmoothL1LossReducedForward"};
const auto algo = AlgorithmName{"L1LossForward"};
const auto solvers =
solver::SolverContainer<solver::smoothl1loss::SmoothL1LossReducedForward5d>{};
solver::SolverContainer<solver::l1loss::L1LossForward5d>{};

solvers.ExecutePrimitive(handle, problem, algo, invoke_params);

Expand Down
Loading