Skip to content

Commit

Permalink
Merge pull request numpy#25346 from Mousius/highway-vqsort-float16
Browse files Browse the repository at this point in the history
ENH: Enable 16-bit VQSort routines on AArch64
  • Loading branch information
seiko2plus authored Dec 14, 2023
2 parents 1abec40 + 2710aa5 commit da8afcb
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 3 deletions.
7 changes: 7 additions & 0 deletions numpy/_core/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,13 @@ foreach gen_mtargets : [
SVE, ASIMD,
]
],
[
'highway_qsort_16bit.dispatch.h',
'src/npysort/highway_qsort_16bit.dispatch.cpp',
[
ASIMDHP,
]
],
]
mtargets = mod_features.multi_targets(
gen_mtargets[0], multiarray_gen_headers + gen_mtargets[1],
Expand Down
2 changes: 1 addition & 1 deletion numpy/_core/src/highway
Submodule highway updated 71 files
+60 −5 .github/workflows/build_test.yml
+4 −4 .github/workflows/codeql.yml
+47 −0 .github/workflows/multiarch.yml
+41 −1 BUILD
+18 −0 CMakeLists.txt
+7 −2 README.md
+12 −0 WORKSPACE
+3 −3 docs/conf.py
+6 −8 g3doc/op_wishlist.md
+270 −72 g3doc/quick_reference.md
+152 −3 hwy/aligned_allocator.h
+101 −0 hwy/aligned_allocator_test.cc
+1,016 −498 hwy/base.h
+18 −0 hwy/base_test.cc
+19 −9 hwy/contrib/algo/copy-inl.h
+10 −6 hwy/contrib/algo/find-inl.h
+90 −35 hwy/contrib/algo/transform-inl.h
+7 −7 hwy/contrib/dot/dot-inl.h
+7 −23 hwy/contrib/dot/dot_test.cc
+361 −0 hwy/contrib/matvec/matvec-inl.h
+218 −0 hwy/contrib/matvec/matvec_test.cc
+1 −1 hwy/contrib/sort/algo-inl.h
+1 −10 hwy/contrib/sort/result-inl.h
+3 −2 hwy/contrib/sort/shared-inl.h
+1 −1 hwy/contrib/sort/traits-inl.h
+37 −31 hwy/contrib/sort/vqsort-inl.h
+1 −3 hwy/contrib/sort/vqsort.cc
+152 −0 hwy/contrib/thread_pool/thread_pool.cc
+174 −0 hwy/contrib/thread_pool/thread_pool.h
+134 −0 hwy/contrib/thread_pool/thread_pool_test.cc
+24 −31 hwy/contrib/unroller/unroller_test.cc
+3 −1 hwy/detect_compiler_arch.h
+11 −3 hwy/detect_targets.h
+17 −0 hwy/examples/skeleton_test.cc
+11 −9 hwy/highway_test.cc
+1 −3 hwy/nanobenchmark_test.cc
+840 −593 hwy/ops/arm_neon-inl.h
+758 −214 hwy/ops/arm_sve-inl.h
+192 −108 hwy/ops/emu128-inl.h
+871 −30 hwy/ops/generic_ops-inl.h
+339 −265 hwy/ops/ppc_vsx-inl.h
+750 −204 hwy/ops/rvv-inl.h
+101 −82 hwy/ops/scalar-inl.h
+10 −2 hwy/ops/set_macros-inl.h
+161 −57 hwy/ops/shared-inl.h
+137 −125 hwy/ops/wasm_128-inl.h
+83 −31 hwy/ops/wasm_256-inl.h
+1,030 −508 hwy/ops/x86_128-inl.h
+556 −270 hwy/ops/x86_256-inl.h
+578 −243 hwy/ops/x86_512-inl.h
+17 −7 hwy/print.cc
+6 −2 hwy/targets.cc
+51 −0 hwy/tests/arithmetic_test.cc
+64 −2 hwy/tests/combine_test.cc
+76 −120 hwy/tests/convert_test.cc
+18 −13 hwy/tests/demote_test.cc
+236 −0 hwy/tests/dup128_vec_test.cc
+98 −5 hwy/tests/if_test.cc
+187 −0 hwy/tests/mask_combine_test.cc
+347 −0 hwy/tests/mask_convert_test.cc
+16 −10 hwy/tests/mask_mem_test.cc
+37 −0 hwy/tests/mask_test.cc
+166 −53 hwy/tests/masked_arithmetic_test.cc
+78 −156 hwy/tests/memory_test.cc
+99 −54 hwy/tests/reduction_test.cc
+348 −0 hwy/tests/sums_abs_diff_test.cc
+70 −64 hwy/tests/swizzle_test.cc
+71 −1 hwy/tests/test_util-inl.h
+70 −9 hwy/tests/test_util.h
+125 −0 hwy/tests/truncate_test.cc
+11 −0 hwy/timer.h
7 changes: 7 additions & 0 deletions numpy/_core/src/npysort/highway_qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ namespace np { namespace highway { namespace qsort_simd {
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, npy_intp size))
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))


// Pull in the dispatch configuration for the 16-bit Highway sort kernels,
// unless optimized dispatch has been disabled for this build.
#ifndef NPY_DISABLE_OPTIMIZATION
#include "highway_qsort_16bit.dispatch.h"
#endif
// NOTE(review): these two declarations intentionally repeat the QSort/QSelect
// declarations earlier in this namespace. Presumably NPY_CPU_DISPATCH_DECLARE
// expands against the targets of the most recently included *.dispatch.h, so
// the repetition declares the 16-bit variants -- confirm against the macro.
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, npy_intp size))
// NOTE(review): only QSort specializations appear in
// highway_qsort_16bit.dispatch.cpp; verify that QSelect is not called for the
// 16-bit targets before relying on this declaration.
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))

} } } // np::highway::qsort_simd

#endif // NUMPY_SRC_COMMON_NPYSORT_HWY_SIMD_QSORT_HPP
20 changes: 20 additions & 0 deletions numpy/_core/src/npysort/highway_qsort_16bit.dispatch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "highway_qsort.hpp"
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"

namespace np { namespace highway { namespace qsort_simd {

// Half-precision specialization: sorts `size` elements starting at `arr` in
// ascending order by handing the buffer to Highway's VQSortStatic.
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, intptr_t size)
{
    // The buffer is viewed as hwy::float16_t so Highway can operate on it.
    // NOTE(review): assumes Half and hwy::float16_t share the same 16-bit
    // storage layout -- confirm.
    auto *keys = reinterpret_cast<hwy::float16_t*>(arr);
    const auto order = hwy::SortAscending();
    hwy::HWY_NAMESPACE::VQSortStatic(keys, static_cast<size_t>(size), order);
}
// Unsigned 16-bit specialization: sorts `size` elements starting at `arr` in
// ascending order by handing the buffer to Highway's VQSortStatic.
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, intptr_t size)
{
    const auto order = hwy::SortAscending();
    // The element count is converted explicitly to the unsigned count type.
    hwy::HWY_NAMESPACE::VQSortStatic(arr, static_cast<size_t>(size), order);
}
// Signed 16-bit specialization: sorts `size` elements starting at `arr` in
// ascending order by handing the buffer to Highway's VQSortStatic.
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, intptr_t size)
{
    const auto order = hwy::SortAscending();
    // The element count is converted explicitly to the unsigned count type.
    hwy::HWY_NAMESPACE::VQSortStatic(arr, static_cast<size_t>(size), order);
}

} } } // np::highway::qsort_simd
9 changes: 7 additions & 2 deletions numpy/_core/src/npysort/quicksort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,14 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
void (*dispfunc)(TF*, intptr_t) = nullptr;
if (sizeof(T) == sizeof(uint16_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
#include "x86_simd_qsort_16bit.dispatch.h"
#if defined(NPY_CPU_AMD64) || defined(NPY_CPU_X86) // x86 32-bit and 64-bit
#include "x86_simd_qsort_16bit.dispatch.h"
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
#else
#include "highway_qsort_16bit.dispatch.h"
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::highway::qsort_simd::template QSort, <TF>);
#endif
#endif
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
}
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
#ifndef NPY_DISABLE_OPTIMIZATION
Expand Down

0 comments on commit da8afcb

Please sign in to comment.