Skip to content

Commit

Permalink
Merge pull request fast-pack#108 from seb711/simde
Browse files Browse the repository at this point in the history
 Add ARM Support for FastPFOR using SIMDe
  • Loading branch information
lemire authored Jan 31, 2024
2 parents 8e67fbd + c46afeb commit 5986b89
Show file tree
Hide file tree
Showing 15 changed files with 1,626 additions and 803 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/vs17-arm-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# CI workflow: configures the project with CMake on a Windows runner, builds the
# Debug and Release configurations, then runs CTest for both.
name: VS17-ARM-CI

# Triggered by every push and every pull request.
on: [push, pull_request]

jobs:
ci:
name: windows-vs17
runs-on: windows-latest
strategy:
# Let the other matrix entries keep running when one of them fails.
fail-fast: false
matrix:
include:
# NOTE(review): `gen` and `arch` are declared here but no step below reads
# them; the Configure step runs a plain `cmake -B build`. Presumably the build
# therefore uses the runner's default generator/platform rather than ARM64.
# Confirm, and pass them via `-G` / `-A` if an ARM64 build is intended.
- {gen: Visual Studio 17 2022, arch: ARM64}
steps:
- name: checkout
uses: actions/checkout@v2
# Generate the build tree in ./build.
- name: Configure
run: |
cmake -B build
# The generated build tree is built once per configuration.
- name: Build Debug
run: cmake --build build --config Debug --verbose
- name: Build Release
run: cmake --build build --config Release --verbose
# `-LE explicitonly` excludes tests whose label matches "explicitonly".
- name: Run Release tests
run: |
cd build
ctest -C Release -LE explicitonly --output-on-failure
- name: Run Debug tests
run: |
cd build
ctest -C Debug -LE explicitonly --output-on-failure
28 changes: 22 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@ set(PROJECT_DESCRIPTION "The FastPFOR C++ library: Fast integer compression")
set(PROJECT_VERSION "0.1.9")

# Feature-detection module, resolved by name through CMAKE_MODULE_PATH.
include(DetectCPUFeatures)

# ARM detection module. It is included by module name as well:
# CMAKE_MODULE_PATH is a list, so splicing it into a file path only works while
# it holds exactly one directory.
include(environment)

message("Building for architecture: ${CMAKE_SYSTEM_PROCESSOR}")

# Runs compiler with "-dumpversion" and parses major/minor
# version with a regex.
#
Expand Down Expand Up @@ -57,7 +61,12 @@ MESSAGE( STATUS "CXX_COMPILER_VERSION: " ${CXX_COMPILER_VERSION} )
# Report which SIMD back end is available. When SSE 4.2 is missing and
# SUPPORT_NEON is set, the SIMDe module is loaded so that the `simde` target
# exists for the targets that link against it.
if(SUPPORT_SSE42)
  message(STATUS "SSE 4.2 support detected")
else()
  message(STATUS "SSE 4.2 support not detected")
  if(SUPPORT_NEON)
    # Included by module name: CMAKE_MODULE_PATH is a list, so building a file
    # path from it breaks as soon as it contains more than one directory.
    include(simde)
    message(STATUS "USING SIMDE FOR SIMD OPERATIONS")
  else()
    message(STATUS "SIMDE and SSE 4.2 support not detected")
  endif()
endif()

if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
Expand Down Expand Up @@ -85,10 +94,10 @@ elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_CXX_COMPILER_ID} STR
if (CXX_COMPILER_VERSION VERSION_LESS 4.2.1)
message(STATUS "Clang version must be at least 4.2.1!" )
endif()
set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native")
set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native")
set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 -msse4.1 -march=native")
set (CMAKE_C_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c99 -msse4.1 -march=native")
set (CMAKE_CXX_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native")
set (CMAKE_CXX_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c++11 -DHAVE_CXX0X -march=native")
set (CMAKE_C_FLAGS_RELEASE "-Wall -Wcast-align -O3 -DNDEBUG -std=c99 -march=native")
set (CMAKE_C_FLAGS_DEBUG "-Wall -Wcast-align -ggdb -std=c99 -march=native")
elseif(WIN32)
# TODO add support for later versions?
if(NOT MSVC12)
Expand Down Expand Up @@ -126,6 +135,13 @@ add_executable(gapstats src/gapstats.cpp)
add_executable(partitionbylength src/partitionbylength.cpp)
add_executable(csv2maropu src/csv2maropu.cpp)

# When SUPPORT_NEON is set, the library and these tools all link the `simde`
# target with PUBLIC visibility.
if(SUPPORT_NEON)
  foreach(neon_consumer IN ITEMS FastPFOR gapstats partitionbylength csv2maropu)
    target_link_libraries(${neon_consumer} PUBLIC simde)
  endforeach()
endif()

add_executable(entropy src/entropy.cpp)
target_link_libraries(entropy FastPFOR)

Expand Down
10 changes: 10 additions & 0 deletions cmake_modules/environment.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Sets SUPPORT_NEON to ON when the build targets an ARM processor, judged from
# CMAKE_SYSTEM_PROCESSOR or from the generator platform (`-A`).
include(CheckCXXCompilerFlag)

# The processor name is spelled differently across platforms ("aarch64",
# "arm64", "ARM64", ...), so the comparison is done on a lower-cased copy.
# NOTE(review): any name containing "arm" matches, including 32-bit ARM targets;
# confirm those are meant to take the NEON/SIMDe path.
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" fastpfor_system_processor)
if("${fastpfor_system_processor}" MATCHES "arm|aarch64")
  set(SUPPORT_NEON ON)
endif()

# Check if the Visual Studio build is targeting ARM (platform given with -A).
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" fastpfor_generator_platform)
if("${fastpfor_generator_platform}" MATCHES "arm")
  set(SUPPORT_NEON ON)
endif()

unset(fastpfor_system_processor)
unset(fastpfor_generator_platform)
13 changes: 13 additions & 0 deletions cmake_modules/simde.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Downloads SIMDe with FetchContent and exposes it as the `simde` interface
# target (include directory plus the native-alias definition).
include(FetchContent)

# A release is pinned by default: following `master` makes builds
# non-reproducible and lets upstream changes break this project unannounced.
# Override with -DFASTPFOR_SIMDE_GIT_TAG=<tag|commit> to build another revision.
set(FASTPFOR_SIMDE_GIT_TAG "v0.8.2" CACHE STRING
    "SIMDe git tag or commit fetched for non-x86 builds")

FetchContent_Declare(
  simde
  GIT_REPOSITORY https://github.com/simd-everywhere/simde.git
  GIT_TAG ${FASTPFOR_SIMDE_GIT_TAG}
)
FetchContent_MakeAvailable(simde)

# Only create the target when it does not exist yet, so that including this
# module twice (or a SIMDe build script that defines `simde`) is not an error.
if(NOT TARGET simde)
  add_library(simde INTERFACE IMPORTED GLOBAL)
endif()
target_include_directories(simde INTERFACE "${simde_SOURCE_DIR}")

# Enables native aliases. Not ideal but makes it easier to convert old code.
target_compile_definitions(simde INTERFACE SIMDE_ENABLE_NATIVE_ALIASES)
7 changes: 6 additions & 1 deletion headers/VarIntG8IU.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*/
#if !defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))
#if (!defined(__SSSE3__) && !(defined(_MSC_VER) && defined(__AVX__))) && !(defined(__ARM_NEON) || defined(__aarch64__))
#ifndef _MSC_VER
#pragma message \
"Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3 or the equivalent on your compiler"
Expand All @@ -12,7 +12,12 @@
#else
#ifndef VARINTG8IU_H__
#define VARINTG8IU_H__
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <emmintrin.h>
#elif defined(__aarch64__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <simde/x86/sse3.h>
#endif
#include "codecs.h"
#ifdef __GNUC__
#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
Expand Down
10 changes: 10 additions & 0 deletions headers/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
// C headers (sorted)
#include <errno.h>
#include <fcntl.h>
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <immintrin.h>
#elif defined(__GNUC__) && defined(__aarch64__)
#include <simde/x86/sse4.1.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -44,7 +49,12 @@
#ifdef _MSC_VER
#include <iso646.h>
#include <stdint.h>

#if (defined(_M_X64) || defined(_M_AMD64))
#include <intrin.h>
#elif defined(_M_ARM64)
#include <simde/x86/sse4.1.h>
#endif

#define __attribute__(n)
#define __restrict__ __restrict
Expand Down
43 changes: 38 additions & 5 deletions headers/cpubenchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,34 @@ static __inline__ unsigned long long stopRDTSCP(void) {
"%rdx");
return (static_cast<unsigned long long>(cycles_high) << 32) | cycles_low;
}
#elif (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)))
// MSVC targeting x64: both timing entry points read the counter through the
// __rdtsc intrinsic.

static inline unsigned long long startRDTSC(void) { return __rdtsc(); }

static inline unsigned long long stopRDTSCP(void) { return __rdtsc(); }
#elif defined(_MSC_VER) && defined(_M_ARM64)
// MSVC targeting ARM64: modelled on the ZeroMQ implementation for MSVC
// arm/arm64, see https://github.com/zeromq/libzmq/blob/master/src/clock.cpp
inline unsigned long long rdtsc() {
  // Register id handed to _ReadStatusReg, packed from the op0/op1/CRn/CRm/op2
  // fields of the register the variable is named after (PMCCNTR_EL0).
  const int64_t pmccntr_el0 = (((3 & 1) << 14) |  // op0
                               ((3 & 7) << 11) |  // op1
                               ((9 & 15) << 7) |  // crn
                               ((13 & 15) << 3) | // crm
                               ((0 & 7) << 0));   // op2

  return _ReadStatusReg(pmccntr_el0);
}

static inline unsigned long long startRDTSC(void) { return rdtsc(); }

static inline unsigned long long stopRDTSCP(void) { return rdtsc(); }

#elif defined(__i386__) || defined(__x86_64__)

Expand All @@ -66,15 +89,25 @@ inline unsigned long long rdtsc() {
static __inline__ unsigned long long startRDTSC(void) { return rdtsc(); }

static __inline__ unsigned long long stopRDTSCP(void) { return rdtsc(); }
#elif (defined(__GNUC__) && (defined(__aarch64__)))
// GCC-compatible compiler targeting AArch64: the counter value is read from the
// cntvct_el0 system register with an `mrs` instruction.
inline uint64_t rdtsc() {
  uint64_t cycles;
  asm volatile("mrs %0, cntvct_el0"
               : "=r"(cycles)); /* output */
  return cycles;
}

static __inline__ uint64_t startRDTSC(void) { return rdtsc(); }

static __inline__ uint64_t stopRDTSCP(void) { return rdtsc(); }
#elif (defined(__arm__) || defined(__ppc__) || defined(__ppc64__))

// for PPC we should be able to use tbl, but I could not find
// an equivalent to rdtsc for ARM.
// Stub implementation: no counter is read, every function returns 0.

inline uint64_t rdtsc() { return 0; }
static __inline__ uint64_t startRDTSC(void) { return 0; }
static __inline__ uint64_t stopRDTSCP(void) { return 0; }
#else
#error Unknown architecture
#endif
Expand Down
2 changes: 1 addition & 1 deletion headers/horizontalbitpacking.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
#ifndef HORIZONTALBITPACKING_H_
#define HORIZONTALBITPACKING_H_

#if !defined(__SSE4_1__) && !(defined(_MSC_VER) && defined(__AVX__))

#if (!defined(__SSE4_1__) && !(defined(_MSC_VER) && defined(__AVX__))) && (!(defined(__ARM_NEON) || defined(__aarch64__)))
#ifndef _MSC_VER
#pragma message "No SSSE4.1 support? try adding -msse4.1 or the equivalent on your compiler"
#else
Expand Down
Loading

0 comments on commit 5986b89

Please sign in to comment.