Skip to content

Commit

Permalink
add audio-visual decoding functionalities
Browse files Browse the repository at this point in the history
  • Loading branch information
weiyaowang authored and Weiyao Wang committed Jul 11, 2020
1 parent e7b1c93 commit 6412041
Show file tree
Hide file tree
Showing 12 changed files with 249 additions and 48 deletions.
1 change: 1 addition & 0 deletions lib/models/audio_visual_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def create_model(
acoustic_data="logmels",
channel_multiplier=1.0,
bottleneck_multiplier=1.0,
use_dropout=False,
conv1_temporal_stride=1,
conv1_temporal_kernel=3,
use_convolutional_pred=False,
Expand Down
6 changes: 0 additions & 6 deletions lib/utils/model_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,6 @@ def AddVideoInput(model, reader, **kwargs):
**kwargs
)
else:
util_folder, _ = os.path.split(os.path.dirname(__file__))
lib_folder, _ = os.path.split(util_folder)
vmz_folder, _ =os.path.split(lib_folder)
av_lib = os.path.join(vmz_folder, "build/av_ops.so")
assert os.path.exists(av_lib), "no av_lib found, please build first"
dyndep.InitOpsLibrary(av_lib)
if input_type == 2:
if get_video_id:
if get_start_frame:
Expand Down
203 changes: 203 additions & 0 deletions ops/LogMels.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/**
* Copyright (c) 2020-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "LogMels.h"

#include <glog/logging.h>
#include <complex>

// Capacity, in samples, of the internal FFmpeg audio FIFO used to buffer
// incoming samples between Write() and Read().
const int kAudioBufferSize = 20480;
// Gain applied to every mel-filter accumulation in Apply() before the log
// compression. NOTE(review): magic constant — presumably an empirically
// chosen scale factor; confirm its derivation with the original authors.
const double kGain = 1080674186.3482928;

// Converts a frequency in Hz to the mel scale.
// Uses the natural-log form of the standard formula (1127 ~= 2595 / ln 10).
// std::log1p(x) is used instead of std::log(1 + x) for better floating-point
// precision when hz is small relative to 700.
inline float hz2Mel(float hz) {
  return 1127 * std::log1p(hz / 700);
}

// Inverse of hz2Mel: maps a mel-scale value back to a frequency in Hz.
inline float mel2Hz(float mel) {
  const float scaled = mel / 1127;
  return 700 * (std::exp(scaled) - 1);
}

// Constructs a log-mel spectrum extractor.
//
// numChannels:  number of mel filterbank channels produced per frame.
// samplingRate: input sampling rate in Hz.
// windowLength: analysis window length in MILLISECONDS (converted to samples).
// windowStep:   hop size in MILLISECONDS (converted to samples).
// startFreq:    lower edge of the filterbank in Hz.
// endFreq:      upper edge in Hz; a non-positive value means Nyquist
//               (samplingRate / 2).
// normalized:   if true, Init() scales each filter to unit area.
LogSpectrum::LogSpectrum(
    int numChannels,
    float samplingRate,
    int windowLength,
    int windowStep,
    float startFreq,
    float endFreq,
    bool normalized)
    : numChannels_(numChannels),
      samplingRate_(samplingRate),
      // Millisecond durations converted to sample counts.
      windowLength_(windowLength * samplingRate / 1000.0f),
      windowStep_(windowStep * samplingRate / 1000.0f),
      startFreq_(startFreq),
      endFreq_(endFreq > 0 ? endFreq : samplingRate * .5f),
      normalized_(normalized),
      fifo_(nullptr),
      inited_(false) {
  // FFT size: smallest power of two >= windowLength_.
  int bits = log2(windowLength_);
  if (pow(2, bits) != windowLength_) {
    bits += 1;
  }
  fftSize_ = (int)pow(2, bits);
  // Number of spectrum bins addressed by the filterbank (real FFT half + 1).
  filterLength_ = fftSize_ / 2 + 1;

  // calloc zero-initializes, so filter coefficients start at 0.
  // NOTE(review): allocation results are not checked — presumably relying on
  // small sizes; confirm whether failure handling is needed.
  filters_ = (float*)calloc(numChannels_ * filterLength_, sizeof(float));
  featFrame_ = (double*)calloc(numChannels_, sizeof(double));
  tempFrame_ = (float*)calloc(fftSize_, sizeof(float));
  window_ = (float*)calloc(windowLength_, sizeof(float));

  // FFmpeg real-to-complex forward FFT context.
  dftContext_ = av_rdft_init(bits, DFT_R2C);
  // Hann window coefficients.
  for (int i = 0; i < windowLength_; i++) {
    window_[i] = .5f * (1 - cos(2 * M_PI * i / (windowLength_ - 1)));
  }

  // Aborts if the FIFO allocation inside Init() fails.
  CHECK(Init());
}

// Releases every heap buffer and FFmpeg resource owned by the extractor and
// nulls the corresponding pointers, so the method is safe to call repeatedly.
void LogSpectrum::Cleanup() {
  // Frees a calloc'd buffer (if any) and clears the member pointer.
  auto releaseBuffer = [](auto*& ptr) {
    if (ptr) {
      free(ptr);
      ptr = nullptr;
    }
  };
  releaseBuffer(filters_);
  releaseBuffer(featFrame_);
  releaseBuffer(tempFrame_);
  releaseBuffer(window_);

  if (dftContext_) {
    av_rdft_end(dftContext_);
    dftContext_ = nullptr;
  }
  if (fifo_) {
    // Drop any buffered samples before destroying the FIFO itself.
    av_audio_fifo_reset(fifo_);
    av_audio_fifo_free(fifo_);
    fifo_ = nullptr;
  }
}

// Releases all owned buffers and FFmpeg resources via Cleanup().
LogSpectrum::~LogSpectrum() {
  Cleanup();
}

// Lazily allocates the audio FIFO and precomputes the triangular mel
// filterbank coefficients. Idempotent: returns true immediately once
// initialized. Returns false only if the FIFO cannot be allocated.
bool LogSpectrum::Init() {
  if (inited_) {
    return true;
  }
  if (!fifo_) {
    if (!(fifo_ =
              av_audio_fifo_alloc(AV_SAMPLE_FMT_FLT, 1, kAudioBufferSize))) {
      LOG(ERROR) << "Could not allocate FIFO";
      return false;
    }
  }

  float maxFreq = samplingRate_ / 2.0f;
  // BUGFIX: the original message ("End frequency is larger than start freq")
  // described neither check; it now states the actual failure condition.
  CHECK_LT(startFreq_, endFreq_)
      << "Start frequency must be less than end frequency";
  CHECK_LE(endFreq_, maxFreq) << "End frequency is larger than maxFreq";

  float startMel = hz2Mel(startFreq_);
  float endMel = hz2Mel(endFreq_);
  // numChannels_ triangular filters evenly spaced on the mel scale between
  // startMel and endMel; filter f spans mels [startMel + f*dx,
  // startMel + (f+2)*dx] with its peak at startMel + (f+1)*dx.
  float dx = (endMel - startMel) / (numChannels_ + 1);
  // Width of one spectrum bin in Hz.
  // NOTE(review): the exact bin width is samplingRate_ / fftSize_; this
  // expression equals samplingRate_ / (fftSize_ + 2) — confirm whether the
  // small discrepancy is intentional.
  float freqStep = samplingRate_ / 2 / filterLength_;
  for (int filter = 0; filter < numChannels_; ++filter) {
    // BUGFIX: anchor the filterbank at startMel. Previously startMel was
    // computed but never applied, so every filter started from 0 mel and
    // startFreq_ was silently ignored (harmless only when startFreq_ == 0).
    float start = mel2Hz(startMel + filter * dx) / freqStep;
    float mid = mel2Hz(startMel + (filter + 1) * dx) / freqStep;
    float end = mel2Hz(startMel + (filter + 2) * dx) / freqStep;
    int from = int(start);
    int to = int(end) + 1;
    // Clamp triangle slopes to at least one bin to avoid division blow-up
    // for very narrow filters.
    float leftWidth = std::max<float>(1.0, mid - start);
    float rightWidth = std::max<float>(1.0, end - mid);
    float sum = 0;
    from = std::max<int>(0, from);
    to = std::min<int>(filterLength_ - 1, to);
    // Inclusive [from, to] bin range for this filter, consumed by Apply().
    filterRanges_.emplace_back(from, to);
    for (int i = from; i <= to; i++) {
      float value =
          1.f - ((i < mid) ? (mid - i) / leftWidth : (i - mid) / rightWidth);
      if (value > 0) {
        filters_[filter * filterLength_ + i] = value;
        sum += value;
      }
    }
    // Optionally normalize each filter to unit area.
    if (normalized_ && sum > 0) {
      for (int inX = from; inX <= to; inX++) {
        filters_[filter * filterLength_ + inX] /= sum;
      }
    }
  }
  inited_ = true;
  return true;
}

// Appends numSamples float samples to the internal FIFO.
// Returns the number of samples written, or -1 on failure.
int LogSpectrum::Write(float* samples, size_t numSamples) {
  int samplesW = av_audio_fifo_write(fifo_, (void**)&samples, numSamples);
  // BUGFIX: compare in the signed domain. The original compared the int
  // result directly against the size_t numSamples, so a negative FFmpeg
  // error code was converted to a huge unsigned value, the failure branch
  // was skipped, and the raw error code leaked out as the return value.
  if (samplesW < 0 || samplesW < static_cast<int>(numSamples)) {
    LOG(ERROR) << "Could not write data to FIFO";
    return -1;
  }
  return samplesW;
}

// Produces one feature frame into `feat` (numChannels_ floats) if enough
// audio is buffered. Returns 1 when a frame was written, 0 when more input
// is needed, and -1 on a FIFO error.
int LogSpectrum::Read(float* feat) {
  // NOTE(review): with `>`, a frame is only emitted once windowLength_ + 1
  // samples are buffered; `>=` would emit as soon as one full window is
  // available — confirm whether this off-by-one is intentional.
  if (av_audio_fifo_size(fifo_) > windowLength_) {
    // Zero the whole FFT buffer so bins past windowLength_ are zero-padded.
    memset(tempFrame_, 0, fftSize_ * sizeof(float));
    // Peek (not read) the window, then drain only windowStep_ samples so
    // consecutive windows overlap by windowLength_ - windowStep_ samples.
    if (av_audio_fifo_peek(fifo_, (void**)&tempFrame_, windowLength_) <
        windowLength_) {
      LOG(ERROR) << "Could not read data from FIFO";
      return -1;
    }
    if (av_audio_fifo_drain(fifo_, windowStep_)) {
      LOG(ERROR) << "Could not drain data from FIFO";
      return -1;
    }
    Apply(feat);
    return 1;
  }
  return 0;
}

void LogSpectrum::Apply(float* feat) {
PowerSpectrum();
for (int i = 0; i < numChannels_; ++i) {
featFrame_[i] = 0;
}
for (int i = 0; i < numChannels_; ++i) {
for (int j = filterRanges_[i].first; j < filterRanges_[i].second; ++j) {
featFrame_[i] += tempFrame_[j] * filters_[i * filterLength_ + j] * kGain;
}
feat[i] =
featFrame_[i] > M_E ? std::log(featFrame_[i]) : featFrame_[i] / M_E;
}
}

// Applies the Hann window to tempFrame_ in place, runs the forward real
// FFT, and overwrites the first fftSize_/2 entries of tempFrame_ with the
// power (squared magnitude) of each bin.
void LogSpectrum::PowerSpectrum() {
  for (int i = 0; i < windowLength_; i++) {
    tempFrame_[i] *= window_[i];
  }
  av_rdft_calc(dftContext_, tempFrame_);
  // The rdft output is packed in place as interleaved re/im pairs, so comps
  // aliases tempFrame_. Writing the power into tempFrame_[i] while reading
  // comps[i] (= tempFrame_[2i], tempFrame_[2i+1]) is safe because the write
  // index never overtakes the read indices on this forward pass.
  // NOTE(review): FFmpeg's DFT_R2C packs the DC and Nyquist terms together
  // into element 0, so bin 0 mixes both components — confirm acceptable.
  FFTComplex* comps = reinterpret_cast<FFTComplex*>(tempFrame_);
  for (int i = 0; i < fftSize_ / 2; i++) {
    tempFrame_[i] = comps[i].re * comps[i].re + comps[i].im * comps[i].im;
  }
}
37 changes: 14 additions & 23 deletions ops/av_decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,16 +184,15 @@ void AVDecoder::decodeLoop(
}
ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
if (ret < 0) {
LOG(ERROR) <<
folly::sformat("Unable to open stream : {}", ffmpegErrorStr(ret));
LOG(ERROR) << "Unable to open stream : " << ffmpegErrorStr(ret);
return;
}

ret = avformat_find_stream_info(inputContext, nullptr);
if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Unable to find stream info in {:s}: {:s}",
videoName,
ffmpegErrorStr(ret)));
LOG(ERROR) << "Unable to find stream info in " << videoName << " "
<< ffmpegErrorStr(ret);
return;
}

// Decode the first video stream
Expand Down Expand Up @@ -249,8 +248,7 @@ void AVDecoder::decodeLoop(
std::string(audioCodecContext_->codec->name) : "None";


LOG(ERROR) << (folly::sformat(
"Cannot open audio codec : {}", codecName));
LOG(ERROR) << "Cannot open audio codec : " << codecName;
}

convertCtx_ = swr_alloc_set_opts(
Expand Down Expand Up @@ -285,7 +283,6 @@ void AVDecoder::decodeLoop(
double prevTimestamp = 0;
int outWidth = 0;
int outHeight = 0;
facebook::MonotonicUsecTimer frame_decode_timer(true);

if (params.getVideo_ && videoStreamIndex_ >= 0) {
videoCodecContext_ = videoStream_->codec;
Expand All @@ -299,9 +296,10 @@ void AVDecoder::decodeLoop(
}

if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Cannot open video codec : {}", videoCodecContext_->codec->name));
}
LOG(ERROR) << "Cannot open video codec : "
<< videoCodecContext_->codec->name;
return;
}

// Calculate if we need to rescale the frames
const int origWidth = videoCodecContext_->width;
Expand Down Expand Up @@ -332,8 +330,7 @@ void AVDecoder::decodeLoop(
outWidth = params.outputWidth_;
outHeight = params.outputHeight_;
} else {
LOG(ERROR) << (
folly::sformat("Unknown VideoResType: {}", params.video_res_type_));
LOG(ERROR) << "Unknown VideoResType: " << params.video_res_type_;
}

// Make sure that we have a valid format
Expand Down Expand Up @@ -514,8 +511,7 @@ void AVDecoder::decodeLoop(
av_free_packet(&packet);
continue;
} else if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Error reading packet : {}", ffmpegErrorStr(ret)));
LOG(ERROR) << "Error reading packet : " << ffmpegErrorStr(ret);
}
ipacket++;

Expand Down Expand Up @@ -556,8 +552,7 @@ void AVDecoder::decodeLoop(
ret = avcodec_decode_video2(
videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Error decoding video frame : {}", ffmpegErrorStr(ret)));
LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
}
try {
// Nothing to do without a picture
Expand Down Expand Up @@ -673,7 +668,6 @@ void AVDecoder::decodeLoop(
rgbFrame->data,
rgbFrame->linesize);

auto frame_decode_time_us = frame_decode_timer.stop();
unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
frame->width_ = outWidth;
frame->height_ = outHeight;
Expand All @@ -683,10 +677,7 @@ void AVDecoder::decodeLoop(
frame->outputFrameIndex_ = outputFrameIndex;
frame->timestamp_ = timestamp;
frame->keyFrame_ = videoStreamFrame_->key_frame;
frame->frameDecodeTimeUS_ = frame_decode_time_us;
callback.frameDecoded(std::move(frame));
frame_decode_timer.reset();
frame_decode_timer.start();

selectiveDecodedFrames++;
av_frame_free(&rgbFrame);
Expand Down Expand Up @@ -768,7 +759,7 @@ void AVDecoder::decodeFile(
string AVDecoder::ffmpegErrorStr(int result) {
std::array<char, 128> buf;
av_strerror(result, buf.data(), buf.size());
return folly::sformat("{} ({})", string(buf.data()), result);
return string(buf.data());
}

void FreeAVDecodedData(
Expand Down
10 changes: 5 additions & 5 deletions ops/av_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@
#include <string>
#include <vector>
#include "caffe2/core/logging.h"
#include "common/time/Time.h"
#include "common/base/Exception.h"
#include <folly/ScopeGuard.h>
#include <folly/Format.h>
// #include "common/time/Time.h"
// #include "common/base/Exception.h"
// #include <folly/ScopeGuard.h>
// #include <folly/Format.h>
#include "av_video_decoder_commons.h"

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/audioconvert.h>
// #include <libavutil/audioconvert.h>
#include <libavutil/log.h>
#include <libavutil/motion_vector.h>
#include <libswresample/swresample.h>
Expand Down
15 changes: 12 additions & 3 deletions ops/av_input_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ OPERATOR_SCHEMA(AVInput)
bool get_rgb = helper.GetSingleArgument<bool>("get_rgb", true);
bool do_multi_label = helper.GetSingleArgument<bool>("do_multi_label", false);
bool get_video_id = helper.GetSingleArgument<bool>("get_video_id", false);
bool get_start_frame = helper.GetSingleArgument<bool>("get_start_frame", false);
bool get_logmels = helper.GetSingleArgument<bool>("get_logmels", false);
int logmel_frames = helper.GetSingleArgument<int>("logmel_frames", 0);
int logmel_filters = helper.GetSingleArgument<int>("logmel_filters", 0);
int logmel_frames = helper.GetSingleArgument<int>("logmel_frames", 100);
int logmel_filters = helper.GetSingleArgument<int>("logmel_filters", 40);

int output_size = 1;
if (get_rgb) {
Expand All @@ -39,6 +40,9 @@ OPERATOR_SCHEMA(AVInput)
if (get_video_id) {
output_size++;
}
if (get_start_frame) {
output_size++;
}

int index = 0;
vector<TensorShape> out(output_size);
Expand All @@ -63,9 +67,14 @@ OPERATOR_SCHEMA(AVInput)
vector<int>{batch_size, num_of_class}, TensorProto::INT32);
}
if (get_video_id) {
out[index] =
out[index++] =
CreateTensorShape(vector<int64_t>{1, batch_size}, TensorProto::INT64);
}
if (get_start_frame) {
out[index] = CreateTensorShape(
vector<int>{1, batch_size}, TensorProto::INT32);
}


return out;
});
Expand Down
Loading

0 comments on commit 6412041

Please sign in to comment.