Video reader resize (NVIDIA#2097)

Added VideoReaderResize Signed-off-by: Albert Wolant <[email protected]>
hahaxun · Jul 10, 2020 · ada6f49 · ada6f49
1 parent 29facf8
commit ada6f49
Show file tree

Hide file tree

Showing 14 changed files with 413 additions and 83 deletions.
diff --git a/dali/operators/image/resize/resize.cc b/dali/operators/image/resize/resize.cc
@@ -16,6 +16,17 @@
 #include "dali/pipeline/data/views.h"
 
 namespace dali {
+namespace detail {
+  kernels::ResamplingParams2D GetResamplingParams(
+    const TransformMeta &meta, kernels::FilterDesc min_filter, kernels::FilterDesc mag_filter) {
+    kernels::ResamplingParams2D params;
+    params[0].output_size = meta.rsz_h;
+    params[1].output_size = meta.rsz_w;
+    params[0].min_filter = params[1].min_filter = min_filter;
+    params[0].mag_filter = params[1].mag_filter = mag_filter;
+    return params;
+  }
+}  // namespace detail
 
 DALI_SCHEMA(ResizeAttr)
   .AddOptionalArg("image_type",
@@ -83,7 +94,8 @@ template <>
 void Resize<CPUBackend>::SetupSharedSampleParams(SampleWorkspace &ws) {
   const int thread_idx = ws.thread_idx();
   per_sample_meta_[thread_idx] = GetTransfomMeta(&ws, spec_);
-  resample_params_[thread_idx] = GetResamplingParams(per_sample_meta_[thread_idx]);
+  resample_params_[thread_idx] = detail::GetResamplingParams(
+    per_sample_meta_[thread_idx], min_filter_, mag_filter_);
 }
 
 template <>

diff --git a/dali/operators/image/resize/resize.cu b/dali/operators/image/resize/resize.cu
@@ -51,7 +51,8 @@ void Resize<GPUBackend>::SetupSharedSampleParams(DeviceWorkspace &ws) {
     DALI_ENFORCE(input_shape.size() == 3, "Expects 3-dimensional image input.");
 
     per_sample_meta_[i] = GetTransformMeta(spec_, input_shape, &ws, i, ResizeInfoNeeded());
-    resample_params_[i] = GetResamplingParams(per_sample_meta_[i]);
+    resample_params_[i] = detail::GetResamplingParams(
+      per_sample_meta_[i], min_filter_, mag_filter_);
   }
 }
 

diff --git a/dali/operators/image/resize/resize.h b/dali/operators/image/resize/resize.h
@@ -30,6 +30,10 @@
 #include "dali/kernels/imgproc/resample/params.h"
 
 namespace dali {
+namespace detail {
+  kernels::ResamplingParams2D GetResamplingParams(
+    const TransformMeta &meta, kernels::FilterDesc min_filter, kernels::FilterDesc mag_filter);
+}  // namespace detail
 
 class ResizeAttr : protected ResizeCropMirrorAttr {
  public:
@@ -61,15 +65,6 @@ class Resize : public Operator<Backend>
   void RunImpl(Workspace<Backend> &ws) override;
   void SetupSharedSampleParams(Workspace<Backend> &ws) override;
 
-  kernels::ResamplingParams2D GetResamplingParams(const TransformMeta &meta) const {
-    kernels::ResamplingParams2D params;
-    params[0].output_size = meta.rsz_h;
-    params[1].output_size = meta.rsz_w;
-    params[0].min_filter = params[1].min_filter = min_filter_;
-    params[0].mag_filter = params[1].mag_filter = mag_filter_;
-    return params;
-  }
-
   USE_OPERATOR_MEMBERS();
   using Operator<Backend>::RunImpl;
   bool save_attrs_;

diff --git a/dali/operators/image/resize/resize_base.h b/dali/operators/image/resize/resize_base.h
@@ -64,7 +64,7 @@ class DLL_PUBLIC ResizeBase : public ResamplingFilterAttr {
   std::vector<kernels::ResamplingParams2D> resample_params_;
   TensorListShape<> out_shape_;
 
- private:
+ protected:
   kernels::KernelManager kmgr_;
 
   struct MiniBatch {

diff --git a/dali/operators/image/resize/resize_crop_mirror.h b/dali/operators/image/resize/resize_crop_mirror.h
@@ -35,6 +35,14 @@ enum t_idInfo : uint32_t {
   t_mirrorVert
 };
 
+struct TransformMeta {
+  int H, W, C;
+  int rsz_h, rsz_w;
+  std::pair<int, int> crop;
+  int mirror;
+};
+
+
 /**
  * @brief Stores parameters for resize+crop+mirror
  */
@@ -61,13 +69,6 @@ class ResizeCropMirrorAttr : protected CropAttr {
     }
   }
 
-  struct TransformMeta {
-    int H, W, C;
-    int rsz_h, rsz_w;
-    std::pair<int, int> crop;
-    int mirror;
-  };
-
  protected:
   inline const TransformMeta GetTransformMeta(const OpSpec &spec,
                                               const TensorShape<> &input_shape,

diff --git a/dali/operators/reader/CMakeLists.txt b/dali/operators/reader/CMakeLists.txt
@@ -26,6 +26,7 @@ list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/sequence_reader_op.c
 
 if(BUILD_NVDEC)
   list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_op.cc")
+  list(APPEND DALI_OPERATOR_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/video_reader_resize_op.cc")
 endif()
 
 if (BUILD_LMDB)

diff --git a/dali/operators/reader/nvdecoder/sequencewrapper.h b/dali/operators/reader/nvdecoder/sequencewrapper.h
@@ -92,6 +92,21 @@ struct SequenceWrapper {
     LOG_LINE << event_ << " synchronized!" << std::endl;
   }
 
+  void share_frames(TensorList<GPUBackend> &frames) {
+    void *current_sequence = sequence.raw_mutable_data();
+    auto shape = TensorListShape<>::make_uniform(count, frame_shape());
+
+    frames.ShareData(
+      current_sequence,
+      sequence.type().size() * count * height * width * channels,
+      shape,
+      sequence.type());
+  }
+
+  TensorShape<3> frame_shape() const {
+    return TensorShape<3>{height, width, channels};
+  }
+
   Tensor<GPUBackend> sequence;
   int count;
   int height;

diff --git a/dali/operators/reader/video_reader_op.cc b/dali/operators/reader/video_reader_op.cc
@@ -32,19 +32,7 @@ The video codecs can be contained in most of container file formats. FFmpeg is u
 Returns a batch of sequences of `sequence_length` frames of shape [N, F, H, W, C] (N being the batch size and F the
 number of frames). Supports only constant frame rate videos.)code")
   .NumInput(0)
-  .OutputFn([](const OpSpec &spec) {
-      std::string file_root = spec.GetArgument<std::string>("file_root");
-      std::string file_list = spec.GetArgument<std::string>("file_list");
-      bool enable_frame_num = spec.GetArgument<bool>("enable_frame_num");
-      bool enable_timestamps = spec.GetArgument<bool>("enable_timestamps");
-      int num_outputs = 1;
-      if (!file_root.empty() || !file_list.empty()) {
-        num_outputs++;
-        if (enable_frame_num) num_outputs++;
-        if (enable_timestamps) num_outputs++;
-      }
-      return num_outputs;
-    })
+  .OutputFn(detail::VideoReaderOutputFn)
   .AddOptionalArg("filenames",
       R"code(File names of the video files to load.
 This option is mutually exclusive with `file_root` and `file_list`.)code",

diff --git a/dali/operators/reader/video_reader_op.h b/dali/operators/reader/video_reader_op.h
@@ -21,7 +21,23 @@
 #include "dali/operators/reader/reader_op.h"
 #include "dali/operators/reader/loader/video_loader.h"
 
+
 namespace dali {
+namespace detail {
+inline int VideoReaderOutputFn(const OpSpec &spec) {
+    std::string file_root = spec.GetArgument<std::string>("file_root");
+    std::string file_list = spec.GetArgument<std::string>("file_list");
+    bool enable_frame_num = spec.GetArgument<bool>("enable_frame_num");
+    bool enable_timestamps = spec.GetArgument<bool>("enable_timestamps");
+    int num_outputs = 1;
+    if (!file_root.empty() || !file_list.empty()) {
+        num_outputs++;
+        if (enable_frame_num) num_outputs++;
+        if (enable_timestamps) num_outputs++;
+    }
+    return num_outputs;
+}
+}  // namespace detail
 
 class VideoReader : public DataReader<GPUBackend, SequenceWrapper> {
  public:
@@ -34,7 +50,6 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper> {
     enable_timestamps_(spec.GetArgument<bool>("enable_timestamps")),
     count_(spec.GetArgument<int>("sequence_length")),
     channels_(spec.GetArgument<int>("channels")),
-    tl_shape_(batch_size_, sequence_dim),
     dtype_(spec.GetArgument<DALIDataType>("dtype")) {
     DALIImageType image_type(spec.GetArgument<DALIImageType>("image_type"));
 
@@ -58,7 +73,6 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper> {
                   "timestamps can be enabled only when "
                   "`file_list` or `file_root` argument is passed");
 
-    // TODO(spanev): support rescale
     // TODO(spanev): Factor out the constructor body to make VideoReader compatible with lazy_init.
       try {
         loader_ = InitLoader<VideoLoader>(spec, filenames_);
@@ -83,75 +97,97 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper> {
   void SetupSharedSampleParams(DeviceWorkspace &ws) override {
   }
 
-  void RunImpl(DeviceWorkspace &ws) override {
-    auto& tl_sequence_output = ws.Output<GPUBackend>(0);
-    TensorList<GPUBackend> *label_output = NULL;
-    TensorList<GPUBackend> *frame_num_output = NULL;
-    TensorList<GPUBackend> *timestamp_output = NULL;
-
+  void SetOutputType(TensorList<GPUBackend> &output) {
     if (dtype_ == DALI_FLOAT) {
-      tl_sequence_output.set_type(TypeInfo::Create<float>());
+      output.set_type(TypeTable::GetTypeInfoFromStatic<float>());
     } else {  // dtype_ == DALI_UINT8
-      tl_sequence_output.set_type(TypeInfo::Create<uint8>());
+      output.set_type(TypeTable::GetTypeInfoFromStatic<uint8>());
     }
+  }
 
+  virtual void SetOutputShape(TensorList<GPUBackend> &output, DeviceWorkspace &ws) {
+    TensorListShape<> output_shape(batch_size_, sequence_dim);
     for (int data_idx = 0; data_idx < batch_size_; ++data_idx) {
-      auto sequence_shape = GetSample(data_idx).sequence.shape();
-      tl_shape_.set_tensor_shape(data_idx, sequence_shape);
+      output_shape.set_tensor_shape(
+        data_idx, GetSample(data_idx).sequence.shape());
     }
+    output.Resize(output_shape);
+  }
 
-    tl_sequence_output.Resize(tl_shape_);
-    tl_sequence_output.SetLayout("FHWC");
+  void PrepareVideoOutput(TensorList<GPUBackend> &output, DeviceWorkspace &ws) {
+    SetOutputType(output);
+    SetOutputShape(output, ws);
+    output.SetLayout("FHWC");
+  }
 
+  void PrepareAdditionalOutputs(DeviceWorkspace &ws) {
     if (enable_label_output_) {
       int output_index = 1;
-      label_output = &ws.Output<GPUBackend>(output_index++);
-      label_output->set_type(TypeInfo::Create<int>());
-      label_output->Resize(label_shape_);
+      label_output_ = &ws.Output<GPUBackend>(output_index++);
+      label_output_->set_type(TypeTable::GetTypeInfoFromStatic<int>());
+      label_output_->Resize(label_shape_);
       if (enable_frame_num_) {
-        frame_num_output = &ws.Output<GPUBackend>(output_index++);
-        frame_num_output->set_type(TypeInfo::Create<int>());
-        frame_num_output->Resize(frame_num_shape_);
+        frame_num_output_ = &ws.Output<GPUBackend>(output_index++);
+        frame_num_output_->set_type(TypeTable::GetTypeInfoFromStatic<int>());
+        frame_num_output_->Resize(frame_num_shape_);
       }
 
       if (enable_timestamps_) {
-        timestamp_output = &ws.Output<GPUBackend>(output_index++);
-        timestamp_output->set_type(TypeInfo::Create<double>());
-        timestamp_output->Resize(timestamp_shape_);
+        timestamp_output_ = &ws.Output<GPUBackend>(output_index++);
+        timestamp_output_->set_type(TypeTable::GetTypeInfoFromStatic<double>());
+        timestamp_output_->Resize(timestamp_shape_);
       }
     }
+  }
 
-    for (int data_idx = 0; data_idx < batch_size_; ++data_idx) {
-      auto* sequence_output = tl_sequence_output.raw_mutable_tensor(data_idx);
-
-      auto& prefetched_sequence = GetSample(data_idx);
-      tl_sequence_output.type().Copy<GPUBackend, GPUBackend>(sequence_output,
-                                  prefetched_sequence.sequence.raw_data(),
-                                  prefetched_sequence.sequence.size(),
-                                  ws.stream());
-
-        if (enable_label_output_) {
-          auto *label = label_output->mutable_tensor<int>(data_idx);
-          CUDA_CALL(cudaMemcpyAsync(label, &prefetched_sequence.label, sizeof(int),
-                                    cudaMemcpyDefault, ws.stream()));
-          if (enable_frame_num_) {
-            auto *frame_num = frame_num_output->mutable_tensor<int>(data_idx);
-            CUDA_CALL(cudaMemcpyAsync(frame_num, &prefetched_sequence.first_frame_idx,
-                                      sizeof(int), cudaMemcpyDefault, ws.stream()));
-          }
-          if (enable_timestamps_) {
-            auto *timestamp = timestamp_output->mutable_tensor<double>(data_idx);
-            timestamp_output->type().Copy<GPUBackend, CPUBackend>(timestamp,
-                                                   prefetched_sequence.timestamps.data(),
-                                                   prefetched_sequence.timestamps.size(),
-                                                   ws.stream());
-          }
-        }
+  virtual void ProcessSingleVideo(
+    int data_idx,
+    TensorList<GPUBackend> &video_output,
+    SequenceWrapper &prefetched_video,
+    DeviceWorkspace &ws) {
+    video_output.type().Copy<GPUBackend, GPUBackend>(
+      video_output.raw_mutable_tensor(data_idx),
+      prefetched_video.sequence.raw_data(),
+      prefetched_video.sequence.size(),
+      ws.stream());
+  }
+
+  void ProcessAdditionalOutputs(
+    int data_idx, SequenceWrapper &prefetched_video, cudaStream_t stream) {
+    if (enable_label_output_) {
+      auto *label = label_output_->mutable_tensor<int>(data_idx);
+      CUDA_CALL(cudaMemcpyAsync(
+        label, &prefetched_video.label, sizeof(int), cudaMemcpyDefault, stream));
+      if (enable_frame_num_) {
+        auto *frame_num = frame_num_output_->mutable_tensor<int>(data_idx);
+        CUDA_CALL(cudaMemcpyAsync(
+          frame_num, &prefetched_video.first_frame_idx, sizeof(int), cudaMemcpyDefault, stream));
+      }
+      if (enable_timestamps_) {
+        auto *timestamp = timestamp_output_->mutable_tensor<double>(data_idx);
+        timestamp_output_->type().Copy<GPUBackend, CPUBackend>(
+          timestamp,
+          prefetched_video.timestamps.data(),
+          prefetched_video.timestamps.size(),
+          stream);
+      }
     }
   }
 
+  void RunImpl(DeviceWorkspace &ws) override {
+    auto& video_output = ws.Output<GPUBackend>(0);
+
+    PrepareVideoOutput(video_output, ws);
+    PrepareAdditionalOutputs(ws);
+
+    for (int data_idx = 0; data_idx < batch_size_; ++data_idx) {
+      auto& prefetched_video = GetSample(data_idx);
+
+      ProcessSingleVideo(data_idx, video_output, prefetched_video, ws);
+      ProcessAdditionalOutputs(data_idx, prefetched_video, ws.stream());
+    }
+  }
 
- private:
   static constexpr int sequence_dim = 4;
   std::vector<std::string> filenames_;
   std::string file_root_;
@@ -161,11 +197,14 @@ class VideoReader : public DataReader<GPUBackend, SequenceWrapper> {
   int count_;
   int channels_;
 
-  TensorListShape<> tl_shape_;
   TensorListShape<> label_shape_;
   TensorListShape<> timestamp_shape_;
   TensorListShape<> frame_num_shape_;
 
+  TensorList<GPUBackend> *label_output_ = NULL;
+  TensorList<GPUBackend> *frame_num_output_ = NULL;
+  TensorList<GPUBackend> *timestamp_output_ = NULL;
+
   DALIDataType dtype_;
   bool enable_label_output_;
 

diff --git a/dali/operators/reader/video_reader_resize_op.cc b/dali/operators/reader/video_reader_resize_op.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "dali/core/common.h"
+#include "dali/core/error_handling.h"
+#include "dali/pipeline/operator/common.h"
+#include "dali/pipeline/operator/op_spec.h"
+#include "dali/pipeline/operator/operator.h"
+#include "dali/operators/reader/video_reader_op.h"
+#include "dali/operators/reader/video_reader_resize_op.h"
+
+namespace dali {
+
+DALI_REGISTER_OPERATOR(VideoReaderResize, VideoReaderResize, GPU);
+
+DALI_SCHEMA(VideoReaderResize)
+  .DocStr(R"code(
+Load and decode H264 video codec with FFmpeg and NVDECODE, NVIDIA GPU's hardware-accelerated video decoding.
+The video codecs can be contained in most of container file formats. FFmpeg is used to parse video containers.
+Returns a batch of sequences of `sequence_length` frames of shape [N, F, H, W, C] (N being the batch size and F the
+number of frames). Supports only constant frame rate videos. It resizes video based on provided params. It supports
+features of `Resize` operator.)code")
+  .NumInput(0)
+  .OutputFn(detail::VideoReaderOutputFn)
+  .AddParent("VideoReader")
+  .AddParent("ResizeAttr")
+  .AddParent("ResamplingFilterAttr");
+}  // namespace dali