Skip to content

Commit

Permalink
add audio-visual decoding functionalities
Browse files Browse the repository at this point in the history
  • Loading branch information
weiyaowang authored and Weiyao Wang committed Jul 11, 2020
1 parent e7b1c93 commit 6412041
Show file tree
Hide file tree
Showing 12 changed files with 249 additions and 48 deletions.
1 change: 1 addition & 0 deletions lib/models/audio_visual_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def create_model(
acoustic_data="logmels",
channel_multiplier=1.0,
bottleneck_multiplier=1.0,
use_dropout=False,
conv1_temporal_stride=1,
conv1_temporal_kernel=3,
use_convolutional_pred=False,
Expand Down
6 changes: 0 additions & 6 deletions lib/utils/model_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,12 +157,6 @@ def AddVideoInput(model, reader, **kwargs):
**kwargs
)
else:
util_folder, _ = os.path.split(os.path.dirname(__file__))
lib_folder, _ = os.path.split(util_folder)
vmz_folder, _ =os.path.split(lib_folder)
av_lib = os.path.join(vmz_folder, "build/av_ops.so")
assert os.path.exists(av_lib), "no av_lib found, please build first"
dyndep.InitOpsLibrary(av_lib)
if input_type == 2:
if get_video_id:
if get_start_frame:
Expand Down
203 changes: 203 additions & 0 deletions ops/LogMels.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/**
* Copyright (c) 2020-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "LogMels.h"

#include <glog/logging.h>
#include <complex>

// Capacity, in samples, of the internal FFmpeg audio FIFO used to buffer
// incoming samples between Write() and Read().
const int kAudioBufferSize = 20480;
// Gain applied to every mel-filter accumulation in Apply() before the log
// compression. NOTE(review): magic constant — presumably an empirically
// chosen scale factor; confirm its derivation with the original authors.
const double kGain = 1080674186.3482928;

// Converts a frequency in Hz to the mel scale.
// Uses the natural-log form of the standard formula (1127 ~= 2595 / ln 10).
// std::log1p(x) is used instead of std::log(1 + x) for better floating-point
// precision when hz is small relative to 700.
inline float hz2Mel(float hz) {
  return 1127 * std::log1p(hz / 700);
}

// Inverse of hz2Mel: maps a mel-scale value back to a frequency in Hz.
inline float mel2Hz(float mel) {
  const float scaled = mel / 1127;
  return 700 * (std::exp(scaled) - 1);
}

// Constructs a log-mel spectrum extractor.
//
// numChannels:  number of mel filterbank channels produced per frame.
// samplingRate: input sampling rate in Hz.
// windowLength: analysis window length in MILLISECONDS (converted to samples).
// windowStep:   hop size in MILLISECONDS (converted to samples).
// startFreq:    lower edge of the filterbank in Hz.
// endFreq:      upper edge in Hz; a non-positive value means Nyquist
//               (samplingRate / 2).
// normalized:   if true, Init() scales each filter to unit area.
LogSpectrum::LogSpectrum(
    int numChannels,
    float samplingRate,
    int windowLength,
    int windowStep,
    float startFreq,
    float endFreq,
    bool normalized)
    : numChannels_(numChannels),
      samplingRate_(samplingRate),
      // Millisecond durations converted to sample counts.
      windowLength_(windowLength * samplingRate / 1000.0f),
      windowStep_(windowStep * samplingRate / 1000.0f),
      startFreq_(startFreq),
      endFreq_(endFreq > 0 ? endFreq : samplingRate * .5f),
      normalized_(normalized),
      fifo_(nullptr),
      inited_(false) {
  // FFT size: smallest power of two >= windowLength_.
  int bits = log2(windowLength_);
  if (pow(2, bits) != windowLength_) {
    bits += 1;
  }
  fftSize_ = (int)pow(2, bits);
  // Number of spectrum bins addressed by the filterbank (real FFT half + 1).
  filterLength_ = fftSize_ / 2 + 1;

  // calloc zero-initializes, so filter coefficients start at 0.
  // NOTE(review): allocation results are not checked — presumably relying on
  // small sizes; confirm whether failure handling is needed.
  filters_ = (float*)calloc(numChannels_ * filterLength_, sizeof(float));
  featFrame_ = (double*)calloc(numChannels_, sizeof(double));
  tempFrame_ = (float*)calloc(fftSize_, sizeof(float));
  window_ = (float*)calloc(windowLength_, sizeof(float));

  // FFmpeg real-to-complex forward FFT context.
  dftContext_ = av_rdft_init(bits, DFT_R2C);
  // Hann window coefficients.
  for (int i = 0; i < windowLength_; i++) {
    window_[i] = .5f * (1 - cos(2 * M_PI * i / (windowLength_ - 1)));
  }

  // Aborts if the FIFO allocation inside Init() fails.
  CHECK(Init());
}

// Releases every heap buffer and FFmpeg resource owned by the extractor and
// nulls the corresponding pointers, so the method is safe to call repeatedly.
void LogSpectrum::Cleanup() {
  // Frees a calloc'd buffer (if any) and clears the member pointer.
  auto releaseBuffer = [](auto*& ptr) {
    if (ptr) {
      free(ptr);
      ptr = nullptr;
    }
  };
  releaseBuffer(filters_);
  releaseBuffer(featFrame_);
  releaseBuffer(tempFrame_);
  releaseBuffer(window_);

  if (dftContext_) {
    av_rdft_end(dftContext_);
    dftContext_ = nullptr;
  }
  if (fifo_) {
    // Drop any buffered samples before destroying the FIFO itself.
    av_audio_fifo_reset(fifo_);
    av_audio_fifo_free(fifo_);
    fifo_ = nullptr;
  }
}

// Releases all owned buffers and FFmpeg resources via Cleanup().
LogSpectrum::~LogSpectrum() {
  Cleanup();
}

// Lazily allocates the audio FIFO and precomputes the triangular mel
// filterbank coefficients. Idempotent: returns true immediately once
// initialized. Returns false only if the FIFO cannot be allocated.
bool LogSpectrum::Init() {
  if (inited_) {
    return true;
  }
  if (!fifo_) {
    if (!(fifo_ =
              av_audio_fifo_alloc(AV_SAMPLE_FMT_FLT, 1, kAudioBufferSize))) {
      LOG(ERROR) << "Could not allocate FIFO";
      return false;
    }
  }

  float maxFreq = samplingRate_ / 2.0f;
  // BUGFIX: the original message ("End frequency is larger than start freq")
  // described neither check; it now states the actual failure condition.
  CHECK_LT(startFreq_, endFreq_)
      << "Start frequency must be less than end frequency";
  CHECK_LE(endFreq_, maxFreq) << "End frequency is larger than maxFreq";

  float startMel = hz2Mel(startFreq_);
  float endMel = hz2Mel(endFreq_);
  // numChannels_ triangular filters evenly spaced on the mel scale between
  // startMel and endMel; filter f spans mels [startMel + f*dx,
  // startMel + (f+2)*dx] with its peak at startMel + (f+1)*dx.
  float dx = (endMel - startMel) / (numChannels_ + 1);
  // Width of one spectrum bin in Hz.
  // NOTE(review): the exact bin width is samplingRate_ / fftSize_; this
  // expression equals samplingRate_ / (fftSize_ + 2) — confirm whether the
  // small discrepancy is intentional.
  float freqStep = samplingRate_ / 2 / filterLength_;
  for (int filter = 0; filter < numChannels_; ++filter) {
    // BUGFIX: anchor the filterbank at startMel. Previously startMel was
    // computed but never applied, so every filter started from 0 mel and
    // startFreq_ was silently ignored (harmless only when startFreq_ == 0).
    float start = mel2Hz(startMel + filter * dx) / freqStep;
    float mid = mel2Hz(startMel + (filter + 1) * dx) / freqStep;
    float end = mel2Hz(startMel + (filter + 2) * dx) / freqStep;
    int from = int(start);
    int to = int(end) + 1;
    // Clamp triangle slopes to at least one bin to avoid division blow-up
    // for very narrow filters.
    float leftWidth = std::max<float>(1.0, mid - start);
    float rightWidth = std::max<float>(1.0, end - mid);
    float sum = 0;
    from = std::max<int>(0, from);
    to = std::min<int>(filterLength_ - 1, to);
    // Inclusive [from, to] bin range for this filter, consumed by Apply().
    filterRanges_.emplace_back(from, to);
    for (int i = from; i <= to; i++) {
      float value =
          1.f - ((i < mid) ? (mid - i) / leftWidth : (i - mid) / rightWidth);
      if (value > 0) {
        filters_[filter * filterLength_ + i] = value;
        sum += value;
      }
    }
    // Optionally normalize each filter to unit area.
    if (normalized_ && sum > 0) {
      for (int inX = from; inX <= to; inX++) {
        filters_[filter * filterLength_ + inX] /= sum;
      }
    }
  }
  inited_ = true;
  return true;
}

// Appends numSamples float samples to the internal FIFO.
// Returns the number of samples written, or -1 on failure.
int LogSpectrum::Write(float* samples, size_t numSamples) {
  int samplesW = av_audio_fifo_write(fifo_, (void**)&samples, numSamples);
  // BUGFIX: compare in the signed domain. The original compared the int
  // result directly against the size_t numSamples, so a negative FFmpeg
  // error code was converted to a huge unsigned value, the failure branch
  // was skipped, and the raw error code leaked out as the return value.
  if (samplesW < 0 || samplesW < static_cast<int>(numSamples)) {
    LOG(ERROR) << "Could not write data to FIFO";
    return -1;
  }
  return samplesW;
}

// Produces one feature frame into `feat` (numChannels_ floats) if enough
// audio is buffered. Returns 1 when a frame was written, 0 when more input
// is needed, and -1 on a FIFO error.
int LogSpectrum::Read(float* feat) {
  // NOTE(review): with `>`, a frame is only emitted once windowLength_ + 1
  // samples are buffered; `>=` would emit as soon as one full window is
  // available — confirm whether this off-by-one is intentional.
  if (av_audio_fifo_size(fifo_) > windowLength_) {
    // Zero the whole FFT buffer so bins past windowLength_ are zero-padded.
    memset(tempFrame_, 0, fftSize_ * sizeof(float));
    // Peek (not read) the window, then drain only windowStep_ samples so
    // consecutive windows overlap by windowLength_ - windowStep_ samples.
    if (av_audio_fifo_peek(fifo_, (void**)&tempFrame_, windowLength_) <
        windowLength_) {
      LOG(ERROR) << "Could not read data from FIFO";
      return -1;
    }
    if (av_audio_fifo_drain(fifo_, windowStep_)) {
      LOG(ERROR) << "Could not drain data from FIFO";
      return -1;
    }
    Apply(feat);
    return 1;
  }
  return 0;
}

void LogSpectrum::Apply(float* feat) {
PowerSpectrum();
for (int i = 0; i < numChannels_; ++i) {
featFrame_[i] = 0;
}
for (int i = 0; i < numChannels_; ++i) {
for (int j = filterRanges_[i].first; j < filterRanges_[i].second; ++j) {
featFrame_[i] += tempFrame_[j] * filters_[i * filterLength_ + j] * kGain;
}
feat[i] =
featFrame_[i] > M_E ? std::log(featFrame_[i]) : featFrame_[i] / M_E;
}
}

// Applies the Hann window to tempFrame_ in place, runs the forward real
// FFT, and overwrites the first fftSize_/2 entries of tempFrame_ with the
// power (squared magnitude) of each bin.
void LogSpectrum::PowerSpectrum() {
  for (int i = 0; i < windowLength_; i++) {
    tempFrame_[i] *= window_[i];
  }
  av_rdft_calc(dftContext_, tempFrame_);
  // The rdft output is packed in place as interleaved re/im pairs, so comps
  // aliases tempFrame_. Writing the power into tempFrame_[i] while reading
  // comps[i] (= tempFrame_[2i], tempFrame_[2i+1]) is safe because the write
  // index never overtakes the read indices on this forward pass.
  // NOTE(review): FFmpeg's DFT_R2C packs the DC and Nyquist terms together
  // into element 0, so bin 0 mixes both components — confirm acceptable.
  FFTComplex* comps = reinterpret_cast<FFTComplex*>(tempFrame_);
  for (int i = 0; i < fftSize_ / 2; i++) {
    tempFrame_[i] = comps[i].re * comps[i].re + comps[i].im * comps[i].im;
  }
}
37 changes: 14 additions & 23 deletions ops/av_decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -184,16 +184,15 @@ void AVDecoder::decodeLoop(
}
ret = avformat_open_input(&inputContext, "", nullptr, nullptr);
if (ret < 0) {
LOG(ERROR) <<
folly::sformat("Unable to open stream : {}", ffmpegErrorStr(ret));
LOG(ERROR) << "Unable to open stream : " << ffmpegErrorStr(ret);
return;
}

ret = avformat_find_stream_info(inputContext, nullptr);
if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Unable to find stream info in {:s}: {:s}",
videoName,
ffmpegErrorStr(ret)));
LOG(ERROR) << "Unable to find stream info in " << videoName << " "
<< ffmpegErrorStr(ret);
return;
}

// Decode the first video stream
Expand Down Expand Up @@ -249,8 +248,7 @@ void AVDecoder::decodeLoop(
std::string(audioCodecContext_->codec->name) : "None";


LOG(ERROR) << (folly::sformat(
"Cannot open audio codec : {}", codecName));
LOG(ERROR) << "Cannot open audio codec : " << codecName;
}

convertCtx_ = swr_alloc_set_opts(
Expand Down Expand Up @@ -285,7 +283,6 @@ void AVDecoder::decodeLoop(
double prevTimestamp = 0;
int outWidth = 0;
int outHeight = 0;
facebook::MonotonicUsecTimer frame_decode_timer(true);

if (params.getVideo_ && videoStreamIndex_ >= 0) {
videoCodecContext_ = videoStream_->codec;
Expand All @@ -299,9 +296,10 @@ void AVDecoder::decodeLoop(
}

if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Cannot open video codec : {}", videoCodecContext_->codec->name));
}
LOG(ERROR) << "Cannot open video codec : "
<< videoCodecContext_->codec->name;
return;
}

// Calculate if we need to rescale the frames
const int origWidth = videoCodecContext_->width;
Expand Down Expand Up @@ -332,8 +330,7 @@ void AVDecoder::decodeLoop(
outWidth = params.outputWidth_;
outHeight = params.outputHeight_;
} else {
LOG(ERROR) << (
folly::sformat("Unknown VideoResType: {}", params.video_res_type_));
LOG(ERROR) << "Unknown VideoResType: " << params.video_res_type_;
}

// Make sure that we have a valid format
Expand Down Expand Up @@ -514,8 +511,7 @@ void AVDecoder::decodeLoop(
av_free_packet(&packet);
continue;
} else if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Error reading packet : {}", ffmpegErrorStr(ret)));
LOG(ERROR) << "Error reading packet : " << ffmpegErrorStr(ret);
}
ipacket++;

Expand Down Expand Up @@ -556,8 +552,7 @@ void AVDecoder::decodeLoop(
ret = avcodec_decode_video2(
videoCodecContext_, videoStreamFrame_, &gotPicture, &packet);
if (ret < 0) {
LOG(ERROR) << (folly::sformat(
"Error decoding video frame : {}", ffmpegErrorStr(ret)));
LOG(ERROR) << "Error decoding video frame : " << ffmpegErrorStr(ret);
}
try {
// Nothing to do without a picture
Expand Down Expand Up @@ -673,7 +668,6 @@ void AVDecoder::decodeLoop(
rgbFrame->data,
rgbFrame->linesize);

auto frame_decode_time_us = frame_decode_timer.stop();
unique_ptr<DecodedFrame> frame = make_unique<DecodedFrame>();
frame->width_ = outWidth;
frame->height_ = outHeight;
Expand All @@ -683,10 +677,7 @@ void AVDecoder::decodeLoop(
frame->outputFrameIndex_ = outputFrameIndex;
frame->timestamp_ = timestamp;
frame->keyFrame_ = videoStreamFrame_->key_frame;
frame->frameDecodeTimeUS_ = frame_decode_time_us;
callback.frameDecoded(std::move(frame));
frame_decode_timer.reset();
frame_decode_timer.start();

selectiveDecodedFrames++;
av_frame_free(&rgbFrame);
Expand Down Expand Up @@ -768,7 +759,7 @@ void AVDecoder::decodeFile(
string AVDecoder::ffmpegErrorStr(int result) {
std::array<char, 128> buf;
av_strerror(result, buf.data(), buf.size());
return folly::sformat("{} ({})", string(buf.data()), result);
return string(buf.data());
}

void FreeAVDecodedData(
Expand Down
10 changes: 5 additions & 5 deletions ops/av_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,17 @@
#include <string>
#include <vector>
#include "caffe2/core/logging.h"
#include "common/time/Time.h"
#include "common/base/Exception.h"
#include <folly/ScopeGuard.h>
#include <folly/Format.h>
// #include "common/time/Time.h"
// #include "common/base/Exception.h"
// #include <folly/ScopeGuard.h>
// #include <folly/Format.h>
#include "av_video_decoder_commons.h"

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavformat/avio.h>
#include <libavutil/audioconvert.h>
// #include <libavutil/audioconvert.h>
#include <libavutil/log.h>
#include <libavutil/motion_vector.h>
#include <libswresample/swresample.h>
Expand Down
15 changes: 12 additions & 3 deletions ops/av_input_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ OPERATOR_SCHEMA(AVInput)
bool get_rgb = helper.GetSingleArgument<bool>("get_rgb", true);
bool do_multi_label = helper.GetSingleArgument<bool>("do_multi_label", false);
bool get_video_id = helper.GetSingleArgument<bool>("get_video_id", false);
bool get_start_frame = helper.GetSingleArgument<bool>("get_start_frame", false);
bool get_logmels = helper.GetSingleArgument<bool>("get_logmels", false);
int logmel_frames = helper.GetSingleArgument<int>("logmel_frames", 0);
int logmel_filters = helper.GetSingleArgument<int>("logmel_filters", 0);
int logmel_frames = helper.GetSingleArgument<int>("logmel_frames", 100);
int logmel_filters = helper.GetSingleArgument<int>("logmel_filters", 40);

int output_size = 1;
if (get_rgb) {
Expand All @@ -39,6 +40,9 @@ OPERATOR_SCHEMA(AVInput)
if (get_video_id) {
output_size++;
}
if (get_start_frame) {
output_size++;
}

int index = 0;
vector<TensorShape> out(output_size);
Expand All @@ -63,9 +67,14 @@ OPERATOR_SCHEMA(AVInput)
vector<int>{batch_size, num_of_class}, TensorProto::INT32);
}
if (get_video_id) {
out[index] =
out[index++] =
CreateTensorShape(vector<int64_t>{1, batch_size}, TensorProto::INT64);
}
if (get_start_frame) {
out[index] = CreateTensorShape(
vector<int>{1, batch_size}, TensorProto::INT32);
}


return out;
});
Expand Down
Loading

0 comments on commit 6412041

Please sign in to comment.