[onert] Fix deallocation bug (Samsung#6271)
* [onert] Fix deallocation bug

Fix a deallocation bug that was introduced by Samsung#5926.

As there was no way to deallocate the dynamic-tensor memory of tensors
defined by other backends, the dynamic tensor deallocation logic had to
be changed.

- Do not deallocate in `FunctionSequence`
- Introduce `ITensor::deallocBuffer`, which deallocates the buffer
- Introduce the `DeallocFunction` class for deallocation of tensors that
  are no longer used (Linear Executor only)
- `IDynamicTensorManager`'s methods `planDealloc` and `deallocInput` are
  no longer used

In addition, all tensors are now deallocated correctly, even across
backend boundaries.
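For illustration only (not part of this change), a backend that manages its own dynamic buffers could override the new hook roughly as sketched below. `MyBackendTensor` and its members are hypothetical; a real backend would derive from `onert::backend::ITensor` and override `deallocBuffer()` in the same spirit.

```cpp
#include <cstdint>
#include <cstdlib>

// Hypothetical backend-side tensor, kept self-contained for illustration.
class MyBackendTensor
{
public:
  void allocateDynamic(std::size_t bytes)
  {
    _buffer = static_cast<uint8_t *>(std::malloc(bytes));
  }

  // Mirrors the intent of ITensor::deallocBuffer(): release the dynamic buffer
  // and forget it, so a later run can allocate a buffer of a different size.
  void deallocBuffer()
  {
    if (_buffer != nullptr)
    {
      std::free(_buffer);
      _buffer = nullptr;
    }
  }

private:
  uint8_t *_buffer = nullptr;
};
```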

ONE-DCO-1.0-Signed-off-by: Hanjoung Lee <[email protected]>

* Update runtime/onert/core/src/exec/DynamicShapeInferer.cc

Co-authored-by: Hyun Sik Yoon <[email protected]>

wateret and hyunsik-yoon authored Mar 17, 2021
1 parent 47461ee commit 44ae2a8
Showing 9 changed files with 116 additions and 20 deletions.
6 changes: 6 additions & 0 deletions runtime/onert/core/include/backend/ITensor.h
@@ -88,6 +88,12 @@ class ITensor
throw std::runtime_error("This backend does not support dynamic tensor");
}

/// @brief Dealloc the buffer (only for dynamic tensors)
virtual void deallocBuffer()
{
throw std::runtime_error("This backend does not support resetting buffer");
}

/**
* @brief Set the shape of tensor to new_shape
* @note Higher dimension will be placed on front.
@@ -157,12 +157,6 @@ template <typename T_BackendContext> void planTensors(const T_BackendContext &ct
{
// plan for deallocation of static tensor node
tensor_builder->notifyLastUse(ind);

// plan for deallocation of dynamic tensor
auto dyn_tensor_manager = tensor_builder->dynamicTensorManager();
auto *tensor = ctx.tensor_registry->getITensor(ind);
assert(tensor);
dyn_tensor_manager->planDealloc(op_ind, tensor);
}
}
}
9 changes: 2 additions & 7 deletions runtime/onert/core/include/backend/cpu_common/Tensor.h
@@ -65,14 +65,9 @@ class Tensor : public IPortableTensor
}

/**
* @brief Mark this tensor does not have memory.
* Real memory deallocation should be done by caller.
* @brief Reset the buffer and deallocate the allocation if it is managed by itself
*/
void resetBuffer()
{
_allocator.reset();
_buffer = nullptr;
}
void deallocBuffer() override;

public:
uint8_t *buffer() const override { return _buffer; }
13 changes: 13 additions & 0 deletions runtime/onert/core/include/exec/DynamicShapeInferer.h
@@ -105,6 +105,19 @@ class DynamicShapeInferer : public ir::OperationVisitor
*/
void handleSimpleUnaryOp(const ir::Operation &op, const ir::OperandIndex input_idx);

// In the case of an op's output tensor, it is possible that the output
// became dynamic although it had been static before.
// Once a tensor becomes dynamic, it loses the memory that was allocated for its static
// shape. Therefore, once an output is dynamic, it must keep being treated as a dynamic
// tensor (memory is allocated at runtime). `previously` means that `dynamic` or `static`
// was set in a previous loop iteration of WHILE or in a previous call of `nnfw_run()`.
bool previously_static(backend::ITensor *op_output) { return !op_output->is_dynamic(); }

// Helper function that checks whether an op's input is static.
// Note that the input of the n-th op has been set to static or dynamic by the (n-1)-th op,
// which is why it is called `currently_static`.
bool currently_static(backend::ITensor *op_input) { return !op_input->is_dynamic(); }

private:
/**
* @brief To get operand-level info, e.g., ir::Operand::isConstant()
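A minimal, self-contained sketch of the decision rule the two helpers encode (it condenses the updated check in DynamicShapeInferer.cc further down; `FakeTensor` is a stand-in for `backend::ITensor`, only `is_dynamic()` matters):

```cpp
#include <iostream>

struct FakeTensor
{
  bool dynamic = false;
  bool is_dynamic() const { return dynamic; }
};

static bool currently_static(const FakeTensor *t) { return !t->is_dynamic(); }
static bool previously_static(const FakeTensor *t) { return !t->is_dynamic(); }

// Shape inference may be skipped only when both inputs are static *and* the output
// was not made dynamic by an earlier WHILE iteration or nnfw_run() call.
static bool can_skip_inference(const FakeTensor *lhs, const FakeTensor *rhs,
                               const FakeTensor *output)
{
  return currently_static(lhs) && currently_static(rhs) && previously_static(output);
}

int main()
{
  FakeTensor lhs, rhs, output;
  output.dynamic = true; // output became dynamic in a previous run
  std::cout << std::boolalpha << can_skip_inference(&lhs, &rhs, &output) << "\n"; // false
}
```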
@@ -61,7 +61,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
_dynamic_mem_mgr->deallocate(tensor);

auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor);
cpu_tensor->resetBuffer();
cpu_tensor->deallocBuffer();

VERBOSE(DynamicTensorManager) << "Deallocating tensor " << (void *)cpu_tensor
<< " (input of op_ind: " << op_ind << ")" << std::endl;
13 changes: 13 additions & 0 deletions runtime/onert/core/src/backend/cpu_common/Tensor.cc
@@ -84,6 +84,19 @@ bool Tensor::applyShape(const ir::Shape &new_shape)

ir::Shape Tensor::getShape() const { return _info.shape(); }

void Tensor::deallocBuffer()
{
if (_allocator)
{
_buffer = nullptr;
_allocator.reset();
if (_dynamic_mem_mgr)
{
_dynamic_mem_mgr->deallocate(this);
}
}
}

} // namespace cpu_common
} // namespace backend
} // namespace onert
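For context, a hedged sketch of the intended buffer lifecycle of a dynamic `cpu_common::Tensor`: the include path, function name, and surrounding flow are illustrative assumptions; only `applyShape()`, `buffer()`, `is_dynamic()`, and `deallocBuffer()` come from the code above.

```cpp
#include <stdexcept>
#include <backend/cpu_common/Tensor.h> // assumed include path

// Illustrative only: resize a dynamic tensor, use it, then release its buffer.
// `t` is assumed to be a dynamic tensor backed by a dynamic memory manager;
// applyShape() (re)allocates, deallocBuffer() releases.
void resize_use_release(onert::backend::cpu_common::Tensor &t, const onert::ir::Shape &new_shape)
{
  if (!t.applyShape(new_shape))
    throw std::runtime_error("applyShape failed");

  // ... run the kernel that reads/writes t.buffer() ...

  if (t.is_dynamic())  // static tensors keep their planned memory
    t.deallocBuffer(); // resets the buffer and notifies the dynamic memory manager
}
```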
78 changes: 78 additions & 0 deletions runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -67,6 +67,27 @@ class SyncFunction final : public exec::IFunction
std::shared_ptr<backend::IConfig> _config;
};

using DeallocList = std::vector<backend::ITensor *>;
// Deallocates tensors after execution of an operation; used by the Linear Executor
class DeallocFunction final : public exec::IFunction
{
public:
DeallocFunction(const DeallocList &tensors) : _dealloc_list{tensors} {}

void run() override
{
for (auto tensor : _dealloc_list)
{
if (!tensor->is_dynamic())
continue;
tensor->deallocBuffer();
}
}

private:
DeallocList _dealloc_list;
};

void initializeSubgraphIOTensors(compiler::LoweredGraph &lowered_graph,
const backend::BackendContexts &backend_contexts,
const ir::OperandIndexSequence &indices)
@@ -267,6 +288,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
const compiler::CompilerOptions &options,
const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
auto graph = lowered_graph->graph();

backend::BackendContexts backend_contexts =
createBackendContexts(*lowered_graph, options.executor == "Linear");

@@ -316,6 +339,59 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
ordered_contexts.emplace_front(pair.first, pair.second.get());
}

// Simulate the execution for deallocation of tensors
std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
{
ir::OperandIndexMap<uint32_t> uses_map;
ir::OperandIndexSequence constants;

auto model_io =
(graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;

// Prepare scanning
graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
uses_map[ind] = obj.getUses().size();

if (obj.isConstant())
constants.append(ind);
});

// A trick to consider constants as an exception
for (const auto &ind : constants)
{
uses_map[ind]++;
}

for (const auto op_ind : order)
{
const auto &op = graph.operations().at(op_ind);
auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;

for (const auto &ind : op_inputs)
{
const auto &operand = graph.operands().at(ind);
assert(uses_map.find(ind) != uses_map.end());
assert(uses_map[ind] > 0);
uses_map[ind]--;
if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
{
dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
}
}
}

// Dispose and validate
for (const auto &ind : constants)
{
--uses_map[ind];
}

assert(
std::all_of(uses_map.begin(), uses_map.end(),
[](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
}

// Generate kernels
for (auto &pair : ordered_contexts)
{
@@ -328,6 +404,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
if (options.he_profiling_mode)
fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
if (!dealloc_list_map[op_ind].empty())
fn_seq->append(std::make_unique<DeallocFunction>(dealloc_list_map[op_ind]));
builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
}
}
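As a standalone illustration of the use-count simulation above, here is a simplified sketch with plain STL types: operand and operation indices are reduced to integers, and the handling of constants, variables, and model I/O is omitted.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

// Each operation lists the operand indices it reads; uses_map counts remaining readers.
// When an input's count drops to zero at operation `op_ind`, that input can be freed
// right after `op_ind` runs -- this is what dealloc_list_map records in createLinearExecutor().
int main()
{
  // op 0 reads {0}, op 1 reads {0, 1}, op 2 reads {2}
  std::vector<std::vector<uint32_t>> op_inputs = {{0}, {0, 1}, {2}};

  std::unordered_map<uint32_t, uint32_t> uses_map;
  for (const auto &inputs : op_inputs)
    for (auto ind : inputs)
      uses_map[ind]++;

  std::unordered_map<std::size_t, std::vector<uint32_t>> dealloc_list_map;
  for (std::size_t op_ind = 0; op_ind < op_inputs.size(); ++op_ind)
  {
    for (auto ind : op_inputs[op_ind])
    {
      assert(uses_map[ind] > 0);
      if (--uses_map[ind] == 0)
        dealloc_list_map[op_ind].push_back(ind); // last use: free after this op
    }
  }

  for (const auto &it : dealloc_list_map)
    for (auto ind : it.second)
      std::cout << "free operand " << ind << " after op " << it.first << "\n";
}
```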
6 changes: 3 additions & 3 deletions runtime/onert/core/src/exec/DynamicShapeInferer.cc
@@ -48,12 +48,12 @@ void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op,
So, only when all inputs are static, we can skip dynamic shape inference.
*/
if ((!lhs->is_dynamic()) && (!rhs->is_dynamic()))
return;

auto output_idx = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_idx);

if ((currently_static(lhs) && currently_static(rhs)) && previously_static(output))
return;

ir::Shape new_shape = shape_inference::inferEltwiseShape(lhs_shape, rhs_shape);

output->applyShape(new_shape);
3 changes: 0 additions & 3 deletions runtime/onert/core/src/exec/FunctionSequence.cc
@@ -53,9 +53,6 @@ void FunctionSequence::run()
// run kernel
function->run();
}

// deallocate input tensors which are no longer used
_dynamic_tensor_ctx->dynamic_tensor_manager->deallocInput(op_ind);
}
else
{
