Skip to content

Commit

Permalink
Revert "Support AVX2/FMA intrinsics in Audio Resampler module"
Browse files Browse the repository at this point in the history
This reverts commit 1ca8d87.

Reason for revert: breaks downstream project

Original change's description:
> Support AVX2/FMA intrinsics in Audio Resampler module
> 
> From the test result, using AVX2/FMA is 1.60x faster than SSE on atlas.
> 
> Bug: webrtc:11663
> Test: common_audio_unittests on atlas and octopus.
> Change-Id: Ibd45ea46aa97d5790a24e5116f741592b95f6416
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176382
> Reviewed-by: Per Åhgren <[email protected]>
> Reviewed-by: Henrik Andreassson <[email protected]>
> Reviewed-by: Mirko Bonadei <[email protected]>
> Reviewed-by: Sam Zackrisson <[email protected]>
> Commit-Queue: Sam Zackrisson <[email protected]>
> Cr-Commit-Position: refs/heads/master@{#31810}

[email protected],[email protected],[email protected],[email protected],[email protected],[email protected],[email protected]

Change-Id: I1dad31df446e336dacb29ff637bd66f809376458
No-Presubmit: true
No-Tree-Checks: true
No-Try: true
Bug: webrtc:11663
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/180622
Reviewed-by: Åsa Persson <[email protected]>
Commit-Queue: Åsa Persson <[email protected]>
Cr-Commit-Position: refs/heads/master@{#31813}
  • Loading branch information
Åsa Persson authored and Commit Bot committed Jul 30, 2020
1 parent 378a948 commit 0c9204c
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 168 deletions.
4 changes: 0 additions & 4 deletions BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,6 @@ config("common_inherited_config") {
defines += [ "RTC_DISABLE_CHECK_MSG" ]
}

if (rtc_enable_avx2) {
defines += [ "WEBRTC_ENABLE_AVX2" ]
}

# Some tests need to declare their own trace event handlers. If this define is
# not set, the first time TRACE_EVENT_* is called it will store the return
# value for the current handler in an static variable, so that subsequent
Expand Down
23 changes: 0 additions & 23 deletions common_audio/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ rtc_library("common_audio") {

if (current_cpu == "x86" || current_cpu == "x64") {
deps += [ ":common_audio_sse2" ]
deps += [ ":common_audio_avx2" ]
}
}

Expand Down Expand Up @@ -236,7 +235,6 @@ rtc_library("fir_filter_factory") {
]
if (current_cpu == "x86" || current_cpu == "x64") {
deps += [ ":common_audio_sse2" ]
deps += [ ":common_audio_avx2" ]
}
if (rtc_build_with_neon) {
deps += [ ":common_audio_neon" ]
Expand All @@ -263,27 +261,6 @@ if (current_cpu == "x86" || current_cpu == "x64") {
"../rtc_base/memory:aligned_malloc",
]
}

rtc_library("common_audio_avx2") {
sources = [ "resampler/sinc_resampler_avx2.cc" ]

if (is_win) {
cflags = [ "/arch:AVX2" ]
} else {
cflags = [
"-mavx2",
"-mfma",
]
}

deps = [
":fir_filter",
":sinc_resampler",
"../rtc_base:checks",
"../rtc_base:rtc_base_approved",
"../rtc_base/memory:aligned_malloc",
]
}
}

if (rtc_build_with_neon) {
Expand Down
56 changes: 33 additions & 23 deletions common_audio/resampler/sinc_resampler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,22 +122,28 @@ double SincScaleFactor(double io_ratio) {
const size_t SincResampler::kKernelSize;

// If we know the minimum architecture at compile time, avoid CPU detection.
#if defined(WEBRTC_ARCH_X86_FAMILY)
#if defined(__SSE2__)
#define CONVOLVE_FUNC Convolve_SSE
void SincResampler::InitializeCPUSpecificFeatures() {}
#else
// x86 CPU detection required. Function will be set by
// InitializeCPUSpecificFeatures().
// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
#define CONVOLVE_FUNC convolve_proc_

void SincResampler::InitializeCPUSpecificFeatures() {
#if defined(WEBRTC_HAS_NEON)
convolve_proc_ = Convolve_NEON;
#elif defined(WEBRTC_ARCH_X86_FAMILY)
// Using AVX2 instead of SSE2 when AVX2 supported.
if (WebRtc_GetCPUInfo(kAVX2))
convolve_proc_ = Convolve_AVX2;
else if (WebRtc_GetCPUInfo(kSSE2))
convolve_proc_ = Convolve_SSE;
else
convolve_proc_ = Convolve_C;
convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
}
#endif
#elif defined(WEBRTC_HAS_NEON)
#define CONVOLVE_FUNC Convolve_NEON
void SincResampler::InitializeCPUSpecificFeatures() {}
#else
// Unknown architecture.
convolve_proc_ = Convolve_C;
// Unknown architecture.
#define CONVOLVE_FUNC Convolve_C
void SincResampler::InitializeCPUSpecificFeatures() {}
#endif
}

SincResampler::SincResampler(double io_sample_rate_ratio,
size_t request_frames,
Expand All @@ -146,20 +152,24 @@ SincResampler::SincResampler(double io_sample_rate_ratio,
read_cb_(read_cb),
request_frames_(request_frames),
input_buffer_size_(request_frames_ + kKernelSize),
// Create input buffers with a 32-byte alignment for SIMD optimizations.
// Create input buffers with a 16-byte alignment for SSE optimizations.
kernel_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
kernel_pre_sinc_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
kernel_window_storage_(static_cast<float*>(
AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
input_buffer_(static_cast<float*>(
AlignedMalloc(sizeof(float) * input_buffer_size_, 32))),
AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
convolve_proc_(nullptr),
#endif
r1_(input_buffer_.get()),
r2_(input_buffer_.get() + kKernelSize / 2) {
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
InitializeCPUSpecificFeatures();
RTC_DCHECK(convolve_proc_);
#endif
RTC_DCHECK_GT(request_frames_, 0);
Flush();
RTC_DCHECK_GT(block_size_, kKernelSize);
Expand Down Expand Up @@ -292,10 +302,10 @@ void SincResampler::Resample(size_t frames, float* destination) {
const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
const float* const k2 = k1 + kKernelSize;

// Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always be
// true so long as kKernelSize is a multiple of 32.
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 32);
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 32);
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be
// true so long as kKernelSize is a multiple of 16.
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 16);
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 16);

// Initialize input pointer based on quantized |virtual_source_idx_|.
const float* const input_ptr = r1_ + source_idx;
Expand All @@ -304,7 +314,7 @@ void SincResampler::Resample(size_t frames, float* destination) {
const double kernel_interpolation_factor =
virtual_offset_idx - offset_idx;
*destination++ =
convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);

// Advance the virtual index.
virtual_source_idx_ += current_io_ratio;
Expand Down
6 changes: 2 additions & 4 deletions common_audio/resampler/sinc_resampler.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,6 @@ class SincResampler {
const float* k1,
const float* k2,
double kernel_interpolation_factor);
static float Convolve_AVX2(const float* input_ptr,
const float* k1,
const float* k2,
double kernel_interpolation_factor);
#elif defined(WEBRTC_HAS_NEON)
static float Convolve_NEON(const float* input_ptr,
const float* k1,
Expand Down Expand Up @@ -159,11 +155,13 @@ class SincResampler {
// TODO(ajm): Move to using a global static which must only be initialized
// once by the user. We're not doing this initially, because we don't have
// e.g. a LazyInstance helper in webrtc.
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
typedef float (*ConvolveProc)(const float*,
const float*,
const float*,
double);
ConvolveProc convolve_proc_;
#endif

// Pointers to the various regions inside |input_buffer_|. See the diagram at
// the top of the .cc file for more information.
Expand Down
66 changes: 0 additions & 66 deletions common_audio/resampler/sinc_resampler_avx2.cc

This file was deleted.

31 changes: 22 additions & 9 deletions common_audio/resampler/sinc_resampler_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,17 @@ TEST(SincResamplerTest, DISABLED_SetRatioBench) {
printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
}

// Define platform independent function name for Convolve* tests.
#if defined(WEBRTC_ARCH_X86_FAMILY)
#define CONVOLVE_FUNC Convolve_SSE
#elif defined(WEBRTC_ARCH_ARM_V7)
#define CONVOLVE_FUNC Convolve_NEON
#endif

// Ensure various optimized Convolve() methods return the same value. Only run
// this test if other optimized methods exist, otherwise the default Convolve()
// will be tested by the parameterized SincResampler tests below.
#if defined(CONVOLVE_FUNC)
TEST(SincResamplerTest, Convolve) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
Expand All @@ -140,7 +148,7 @@ TEST(SincResamplerTest, Convolve) {
double result = resampler.Convolve_C(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
double result2 = resampler.convolve_proc_(
double result2 = resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
EXPECT_NEAR(result2, result, kEpsilon);
Expand All @@ -149,11 +157,12 @@ TEST(SincResamplerTest, Convolve) {
result = resampler.Convolve_C(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
result2 = resampler.convolve_proc_(
result2 = resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
EXPECT_NEAR(result2, result, kEpsilon);
}
#endif

// Benchmark for the various Convolve() methods. Make sure to build with
// branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking.
Expand Down Expand Up @@ -181,6 +190,7 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000);

#if defined(CONVOLVE_FUNC)
#if defined(WEBRTC_ARCH_X86_FAMILY)
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
#elif defined(WEBRTC_ARCH_ARM_V7)
Expand All @@ -190,33 +200,36 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
// Benchmark with unaligned input pointer.
start = rtc::TimeNanos();
for (int j = 0; j < kConvolveIterations; ++j) {
resampler.convolve_proc_(
resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
}
double total_time_optimized_unaligned_us =
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
printf(STRINGIZE(convolve_proc_) "(unaligned) took %.2fms; which is %.2fx "
printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx "
"faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000,
total_time_c_us / total_time_optimized_unaligned_us);

// Benchmark with aligned input pointer.
start = rtc::TimeNanos();
for (int j = 0; j < kConvolveIterations; ++j) {
resampler.convolve_proc_(
resampler.CONVOLVE_FUNC(
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
}
double total_time_optimized_aligned_us =
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
printf(STRINGIZE(convolve_proc_) " (aligned) took %.2fms; which is %.2fx "
printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx "
"faster than Convolve_C and %.2fx faster than "
STRINGIZE(convolve_proc_) " (unaligned).\n",
STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n",
total_time_optimized_aligned_us / 1000,
total_time_c_us / total_time_optimized_aligned_us,
total_time_optimized_unaligned_us / total_time_optimized_aligned_us);
#endif
}

#undef CONVOLVE_FUNC

typedef std::tuple<int, int, double, double> SincResamplerTestData;
class SincResamplerTest
: public ::testing::TestWithParam<SincResamplerTestData> {
Expand Down Expand Up @@ -339,15 +352,15 @@ INSTANTIATE_TEST_SUITE_P(
std::make_tuple(16000, 44100, kResamplingRMSError, -62.54),
std::make_tuple(22050, 44100, kResamplingRMSError, -73.53),
std::make_tuple(32000, 44100, kResamplingRMSError, -63.32),
std::make_tuple(44100, 44100, kResamplingRMSError, -73.52),
std::make_tuple(44100, 44100, kResamplingRMSError, -73.53),
std::make_tuple(48000, 44100, -15.01, -64.04),
std::make_tuple(96000, 44100, -18.49, -25.51),
std::make_tuple(192000, 44100, -20.50, -13.31),

// To 48kHz
std::make_tuple(8000, 48000, kResamplingRMSError, -63.43),
std::make_tuple(11025, 48000, kResamplingRMSError, -62.61),
std::make_tuple(16000, 48000, kResamplingRMSError, -63.95),
std::make_tuple(16000, 48000, kResamplingRMSError, -63.96),
std::make_tuple(22050, 48000, kResamplingRMSError, -62.42),
std::make_tuple(32000, 48000, kResamplingRMSError, -64.04),
std::make_tuple(44100, 48000, kResamplingRMSError, -62.63),
Expand Down
2 changes: 1 addition & 1 deletion system_wrappers/include/cpu_features_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ extern "C" {
#endif

// List of features in x86.
typedef enum { kSSE2, kSSE3, kAVX2 } CPUFeature;
typedef enum { kSSE2, kSSE3 } CPUFeature;

// List of features in ARM.
enum {
Expand Down
Loading

0 comments on commit 0c9204c

Please sign in to comment.