Skip to content

Commit

Permalink
[Diffusion] Add C++ dpm solver (PaddlePaddle#714)
Browse files Browse the repository at this point in the history
* Add BetaForAlphaBar, ConvertModelOutput, SetTimesteps, and constructor for DPMSolverMultistepScheduler

* tmp

* Add DPMSolverFirstOrderUpdate

* Add ScaleModelInput

* Add MultiStepDPMSolverSecondOrderUpdate

* add MultiStepDPMSolverThirdOrderUpdate

* Add Step

* Add FASTDEPLOY_DECL

* Add AddNoise

* Fix operator

* update

* Fix DPMSolverMultistepScheduler

* Upgrade Slice

* Fix DPMSolverFirstOrderUpdate

* remove FASTDEPLOY_DECL

* Add config for dpm solver
  • Loading branch information
joey12300 authored Nov 30, 2022
1 parent 3f8ed9b commit d95094c
Show file tree
Hide file tree
Showing 14 changed files with 675 additions and 11 deletions.
27 changes: 27 additions & 0 deletions examples/multimodal/stable_diffusion/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

PROJECT(main C CXX)
CMAKE_MINIMUM_REQUIRED(VERSION 3.10)

# FASTDEPLOY_INSTALL_DIR is a path, not a boolean, so declare it as a cached
# PATH variable. option() only declares ON/OFF switches and would silently
# default the value to OFF instead of holding a directory.
set(FASTDEPLOY_INSTALL_DIR "" CACHE PATH "Path of downloaded fastdeploy sdk.")
if(NOT FASTDEPLOY_INSTALL_DIR)
  message(FATAL_ERROR
          "Please set -DFASTDEPLOY_INSTALL_DIR to the fastdeploy sdk path.")
endif()

set(THIRD_LIBS "")
# Imports the FastDeploy variables used below (FASTDEPLOY_INCS, FASTDEPLOY_LIBS).
include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)

include_directories(${FASTDEPLOY_INCS})

# Every .cc under this example directory is part of the executable.
file(GLOB_RECURSE ALL_SRCS ${PROJECT_SOURCE_DIR}/*.cc)

add_executable(main ${ALL_SRCS})
target_link_libraries(main ${FASTDEPLOY_LIBS} ${THIRD_LIBS})

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "./scheduler.h"
#include "fastdeploy/core/fd_tensor.h"

namespace fastdeploy {

// Multistep DPM-Solver scheduler for diffusion model sampling. The API
// mirrors the Python diffusers DPMSolverMultistepScheduler; the algorithm
// itself lives in the corresponding .cc file (not shown here).
class DPMSolverMultistepScheduler : public Scheduler {
 public:
  // Defaults mirror the diffusers scheduler. NOTE(review): presumably a
  // non-empty `trained_betas` overrides the beta_start/beta_end schedule —
  // confirm against the .cc implementation.
  DPMSolverMultistepScheduler(int num_train_timesteps = 1000,
                              float beta_start = 0.0001, float beta_end = 0.02,
                              const std::string& beta_schedule = "linear",
                              const std::vector<float>& trained_betas = {},
                              int solver_order = 2, bool predict_epsilon = true,
                              bool thresholding = false,
                              float dynamic_thresholding_ratio = 0.995,
                              float sample_max_value = 1.0,
                              const std::string& algorithm_type = "dpmsolver++",
                              const std::string& solver_type = "midpoint",
                              bool lower_order_final = true);
  // Writes a beta schedule of length `num_diffusion_timesteps` into `out`,
  // with each beta clipped to `max_beta` (by the parameter names).
  void BetaForAlphaBar(FDTensor* out, int num_diffusion_timesteps,
                       float max_beta = 0.999);
  // Converts the raw model output at `timestep` into the representation the
  // configured algorithm_type expects, writing the result to `out`.
  void ConvertModelOutput(const FDTensor& model_output, int timestep,
                          const FDTensor& sample, FDTensor* out);
  // First-order DPM-Solver update from `timestep` to `prev_timestep`.
  void DPMSolverFirstOrderUpdate(const FDTensor& model_output, int timestep,
                                 int prev_timestep, const FDTensor& sample,
                                 FDTensor* out);
  // Second-order multistep update; takes the list of recent model outputs
  // with their timesteps.
  void MultiStepDPMSolverSecondOrderUpdate(
      const std::vector<FDTensor>& model_output_list,
      const std::vector<int>& timestep_list, int prev_timestep,
      const FDTensor& sample, FDTensor* out);
  // Third-order multistep update; same inputs as the second-order variant.
  void MultiStepDPMSolverThirdOrderUpdate(
      const std::vector<FDTensor>& model_output_list,
      const std::vector<int>& timestep_list, int prev_timestep,
      const FDTensor& sample, FDTensor* out);
  // Scheduler interface overrides (see scheduler.h).
  void SetTimesteps(int num_inference_steps) override;
  void Step(const FDTensor& model_output, int timestep, const FDTensor& sample,
            FDTensor* prev_sample) override;
  void ScaleModelInput(const FDTensor& sample, FDTensor* out,
                       const std::vector<FDTensor>& timesteps = {}) override;
  void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
                const FDTensor& timesteps, FDTensor* out) override;
  // Mirrors the constructor arguments (same names, trailing underscore).
  struct Config {
    int num_train_timesteps_;
    float beta_start_;
    float beta_end_;
    std::string beta_schedule_;
    int solver_order_;
    bool predict_epsilon_;
    bool thresholding_;
    float dynamic_thresholding_ratio_;
    float sample_max_value_;
    std::string algorithm_type_;
    std::string solver_type_;
    bool lower_order_final_;
  } config;

 private:
  FDTensor betas_;           // beta schedule tensor
  FDTensor alphas_;          // presumably 1 - betas_ — confirm in .cc
  FDTensor alphas_cumprod_;  // presumably cumprod of alphas_ — confirm in .cc
  FDTensor alpha_t_;
  FDTensor sigma_t_;
  FDTensor lambda_t_;
  int num_inference_steps_;
  FDTensor timesteps_;       // timesteps selected by SetTimesteps
  int lower_order_nums_;
  // Recent model outputs kept for the multistep (2nd/3rd order) updates.
  std::vector<FDTensor> model_outputs_;
};

} // namespace fastdeploy
35 changes: 35 additions & 0 deletions examples/multimodal/stable_diffusion/cpp/main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "dpm_solver_multistep_scheduler.h"
#include <iostream>

// Smoke test: constructs a DPMSolverMultistepScheduler with the stable
// diffusion style configuration used by this example, then exits.
int main() {
  // Name every hyper-parameter instead of relying on positional comments.
  const int num_train_timesteps = 1000;
  const float beta_start = 0.00085;
  const float beta_end = 0.012;
  const std::string beta_schedule = "scaled_linear";
  const std::vector<float> trained_betas = {};
  const int solver_order = 2;
  const bool predict_epsilon = true;
  const bool thresholding = false;
  const float dynamic_thresholding_ratio = 0.995;
  const float sample_max_value = 1.0;
  const std::string algorithm_type = "dpmsolver++";
  const std::string solver_type = "midpoint";
  const bool lower_order_final = true;

  fastdeploy::DPMSolverMultistepScheduler dpm(
      num_train_timesteps, beta_start, beta_end, beta_schedule, trained_betas,
      solver_order, predict_epsilon, thresholding, dynamic_thresholding_ratio,
      sample_max_value, algorithm_type, solver_type, lower_order_final);

  return 0;
}
31 changes: 31 additions & 0 deletions examples/multimodal/stable_diffusion/cpp/scheduler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "fastdeploy/core/fd_tensor.h"

namespace fastdeploy {

// Abstract interface implemented by every diffusion scheduler.
//
// Fixes vs. the original: the pure-virtual methods were private (the default
// access for `class`), making the interface uncallable through a Scheduler
// reference/pointer, and the base had no virtual destructor, so deleting a
// concrete scheduler through `Scheduler*` was undefined behavior.
class Scheduler {
 public:
  // Virtual destructor: safe deletion of derived schedulers via base pointer.
  virtual ~Scheduler() = default;
  // Sets the discrete timesteps used at inference time.
  virtual void SetTimesteps(int num_inference_steps) = 0;
  // Computes the previous sample from `model_output` at `timestep`,
  // writing it to `prev_sample`.
  virtual void Step(const FDTensor& model_output, int timestep,
                    const FDTensor& sample, FDTensor* prev_sample) = 0;
  // Scales the model input `sample` into `out` as the scheduler requires.
  virtual void ScaleModelInput(const FDTensor& sample, FDTensor* out,
                               const std::vector<FDTensor>& timesteps = {}) = 0;
  // Combines `original_samples` with `noise` according to `timesteps`,
  // writing the result to `out`.
  virtual void AddNoise(const FDTensor& original_samples, const FDTensor& noise,
                        const FDTensor& timesteps, FDTensor* out) = 0;
};

} // namespace fastdeploy
4 changes: 1 addition & 3 deletions fastdeploy/core/fd_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "fastdeploy/core/fd_tensor.h"
#include "fastdeploy/core/fd_scalar.h"
#include "fastdeploy/core/float16.h"
#include "fastdeploy/utils/utils.h"

Expand Down Expand Up @@ -81,8 +80,7 @@ const void* FDTensor::CpuData() const {

void FDTensor::SetExternalData(const std::vector<int64_t>& new_shape,
const FDDataType& data_type, void* data_buffer,
const Device& new_device,
int new_device_id) {
const Device& new_device, int new_device_id) {
dtype = data_type;
shape.assign(new_shape.begin(), new_shape.end());
external_data_ptr = data_buffer;
Expand Down
3 changes: 1 addition & 2 deletions fastdeploy/core/fd_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@
#include <vector>

#include "fastdeploy/core/allocate.h"
#include "fastdeploy/core/fd_scalar.h"
#include "fastdeploy/core/fd_type.h"

namespace fastdeploy {

struct Scalar;

struct FASTDEPLOY_DECL FDTensor {
// std::vector<int8_t> data;
void* buffer_ = nullptr;
Expand Down
7 changes: 4 additions & 3 deletions fastdeploy/function/clip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ void ClipKernel(const FDTensor& x, double min, double max, FDTensor* out) {
"max should be greater than or equal to min. But received min = %f, "
"max = %f",
static_cast<float>(min_), static_cast<float>(max_));

out->Allocate(x.Shape(), x.Dtype());
FDTensor tmp;
tmp.Allocate(x.Shape(), x.Dtype());
const T* x_data = reinterpret_cast<const T*>(x.Data());

int64_t numel = x.Numel();
T* out_data = reinterpret_cast<T*>(out->Data());
T* out_data = reinterpret_cast<T*>(tmp.Data());

std::transform(x_data, x_data + numel, out_data, ClipFunctor<T>(min_, max_));
*out = std::move(tmp);
}

void Clip(const FDTensor& x, double min, double max, FDTensor* out) {
Expand Down
21 changes: 21 additions & 0 deletions fastdeploy/function/elementwise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,25 @@ FDTensor operator/(const FDTensor& x, const FDTensor& y) {
return out;
}

// Explicitly instantiates the FDTensor-vs-scalar operator templates declared
// in elementwise.h for every supported arithmetic scalar type, so each
// combination gets a definition emitted in this translation unit.
#define INSTANTIATE_OPERATOR(operation_type) \
template FDTensor operator operation_type(const FDTensor& x, bool y); \
template FDTensor operator operation_type(const FDTensor& x, uint8_t y); \
template FDTensor operator operation_type(const FDTensor& x, int16_t y); \
template FDTensor operator operation_type(const FDTensor& x, int y); \
template FDTensor operator operation_type(const FDTensor& x, int64_t y); \
template FDTensor operator operation_type(const FDTensor& x, float y); \
template FDTensor operator operation_type(const FDTensor& x, double y); \
template FDTensor operator operation_type(bool x, const FDTensor& y); \
template FDTensor operator operation_type(uint8_t x, const FDTensor& y); \
template FDTensor operator operation_type(int16_t x, const FDTensor& y); \
template FDTensor operator operation_type(int x, const FDTensor& y); \
template FDTensor operator operation_type(int64_t x, const FDTensor& y); \
template FDTensor operator operation_type(float x, const FDTensor& y); \
template FDTensor operator operation_type(double x, const FDTensor& y)

// Instantiate the scalar overloads for the four element-wise operators.
INSTANTIATE_OPERATOR(+);
INSTANTIATE_OPERATOR(-);
INSTANTIATE_OPERATOR(*);
INSTANTIATE_OPERATOR(/);

} // namespace fastdeploy
34 changes: 34 additions & 0 deletions fastdeploy/function/elementwise.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@

#pragma once

#include "fastdeploy/core/fd_scalar.h"
#include "fastdeploy/core/fd_tensor.h"

namespace fastdeploy {

namespace function {

/** Excute the add operation for input FDTensors. *out = x + y.
Expand Down Expand Up @@ -62,10 +64,42 @@ FASTDEPLOY_DECL void Maximum(const FDTensor& x, const FDTensor& y,

// Element-wise binary operators on FDTensor. The tensor/tensor overloads are
// defined out of line; the templates below wrap an arithmetic scalar in an
// FDTensor built from Scalar and forward to the tensor/tensor overload,
// enabling expressions such as `1.0f - x` or `x * 2`.
FASTDEPLOY_DECL FDTensor operator+(const FDTensor& x, const FDTensor& y);

// tensor + scalar
template <typename T> FDTensor operator+(const FDTensor& x, T y) {
  return x + FDTensor(Scalar(y));
}

// scalar + tensor
template <typename T> FDTensor operator+(T x, const FDTensor& y) {
  return FDTensor(Scalar(x)) + y;
}

FASTDEPLOY_DECL FDTensor operator-(const FDTensor& x, const FDTensor& y);

// tensor - scalar
template <typename T> FDTensor operator-(const FDTensor& x, T y) {
  return x - FDTensor(Scalar(y));
}

// scalar - tensor
template <typename T> FDTensor operator-(T x, const FDTensor& y) {
  return FDTensor(Scalar(x)) - y;
}

FASTDEPLOY_DECL FDTensor operator*(const FDTensor& x, const FDTensor& y);

// tensor * scalar
template <typename T> FDTensor operator*(const FDTensor& x, T y) {
  return x * FDTensor(Scalar(y));
}

// scalar * tensor
template <typename T> FDTensor operator*(T x, const FDTensor& y) {
  return FDTensor(Scalar(x)) * y;
}

FASTDEPLOY_DECL FDTensor operator/(const FDTensor& x, const FDTensor& y);

// tensor / scalar
template <typename T> FDTensor operator/(const FDTensor& x, T y) {
  return x / FDTensor(Scalar(y));
}

// scalar / tensor
template <typename T> FDTensor operator/(T x, const FDTensor& y) {
  return FDTensor(Scalar(x)) / y;
}

} // namespace fastdeploy
8 changes: 5 additions & 3 deletions fastdeploy/function/elementwise_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,12 @@ void CommonElementwiseBroadcastForward(const FDTensor& x, const FDTensor& y,
GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(),
y_dims_array.data(), out_dims_array.data(), max_dim,
axis);
z->Allocate(out_dims_array, TypeToDataType<OutType>::dtype);
FDTensor tmp;
tmp.Allocate(out_dims_array, TypeToDataType<OutType>::dtype);
CommonForwardBroadcastCPU<Functor, T, OutType>(
x, y, z, x_dims_array.data(), y_dims_array.data(), out_dims_array.data(),
max_dim, func, is_xsize_larger);
x, y, &tmp, x_dims_array.data(), y_dims_array.data(),
out_dims_array.data(), max_dim, func, is_xsize_larger);
*z = std::move(tmp);
}

template <typename Functor, typename T, typename OutType = T>
Expand Down
15 changes: 15 additions & 0 deletions fastdeploy/function/slice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -163,5 +163,20 @@ void Slice(const FDTensor& x, const std::vector<int64_t>& axes,
}));
}

void Slice(const FDTensor& x, const std::vector<int64_t>& axes,
const std::vector<int64_t>& index, FDTensor* out) {
std::vector<int64_t> ends = index;
for (int i = 0; i < ends.size(); ++i) {
ends[i] += 1;
}
Slice(x, axes, index, ends, out);
for (int i = 0; i < axes.size(); ++i) {
if (out->Shape().size() <= 1) {
break;
}
out->Squeeze(axes[i]);
}
}

} // namespace function
} // namespace fastdeploy
3 changes: 3 additions & 0 deletions fastdeploy/function/slice.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,8 @@ FASTDEPLOY_DECL void Slice(const FDTensor& x, const std::vector<int64_t>& axes,
const std::vector<int64_t>& starts,
const std::vector<int64_t>& ends, FDTensor* out);

FASTDEPLOY_DECL void Slice(const FDTensor& x, const std::vector<int64_t>& axes,
const std::vector<int64_t>& index, FDTensor* out);

} // namespace function
} // namespace fastdeploy
18 changes: 18 additions & 0 deletions tests/function/test_elementwise.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,15 @@ TEST(fastdeploy, check_same_dim) {
check_shape(z.shape, {2, 3, 4});
check_data(reinterpret_cast<const float*>(z.Data()), maximum_result.data(),
maximum_result.size());

x = 1.0f - x;
sub_result = {0.157138, 0.353809, 0.862595, 0.885693, 0.340074, 0.464184,
0.257084, 0.154395, 0.787718, 0.700299, 0.137829, 0.591059,
0.873153, 0.843381, 0.571159, 0.152347, 0.754137, 0.330954,
0.121117, 0.323741, 0.333547, 0.67477, 0.586061, 0.165859};
check_shape(x.shape, {2, 3, 4});
check_data(reinterpret_cast<const float*>(x.Data()), sub_result.data(),
sub_result.size());
}

TEST(fastdeploy, check_broadcast_dim1) {
Expand Down Expand Up @@ -498,6 +507,15 @@ TEST(fastdeploy, mixed_operation) {
check_shape(output.shape, {2, 3, 4});
check_data(reinterpret_cast<const float*>(output.Data()), result.data(),
result.size());

result = {2.854443, 1.87709, 1.585621, 1.012709, 0.332781, 0.998346,
0.228024, 2.140475, 0.246941, 0.301517, 1.575438, 0.595582,
-0.410393, -0.163718, -0.405571, 0.58563, -0.177035, 0.263035,
0.075725, 0.591098, 0.156365, -0.106078, -0.475957, 0.626429};
output = a + b * c / d - e;
check_shape(output.shape, {2, 3, 4});
check_data(reinterpret_cast<const float*>(output.Data()), result.data(),
result.size());
}

} // namespace function
Expand Down

0 comments on commit d95094c

Please sign in to comment.