Document VecElement{T}. Support call-by-value of SIMD types on 64-bit x86.

Make C interop work for SSE types on 64-bit x86 and add tests for it,
including a cfunction round-trip test.
Arch D. Robison committed May 24, 2016
1 parent c43b5e5 commit f88fb5c
Showing 11 changed files with 176 additions and 8 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
@@ -58,6 +58,8 @@ Command-line option changes
Compiler/Runtime improvements
-----------------------------

* Machine SIMD types can be represented in Julia as a homogeneous tuple of `VecElement` ([#15244]).

Breaking changes
----------------

1 change: 1 addition & 0 deletions doc/index.rst
@@ -86,6 +86,7 @@
stdlib/libdl
stdlib/profile
stdlib/stacktraces
stdlib/simd-types

.. _devdocs:

42 changes: 42 additions & 0 deletions doc/manual/calling-c-and-fortran-code.rst
@@ -561,6 +561,47 @@ Arrays of unknown size are not supported.

In the future, some of these restrictions may be reduced or eliminated.

SIMD Values
~~~~~~~~~~~

Note: This feature is currently implemented on 64-bit x86 platforms only.

If a C/C++ routine has an argument or return value that is a native
SIMD type, the corresponding Julia type is a homogeneous tuple
of ``VecElement`` that naturally maps to the SIMD type. Specifically:

- The tuple must be the same size as the SIMD type.
For example, a tuple representing an ``__m128`` on x86
must have a size of 16 bytes.

- The element type of the tuple must be ``VecElement{T}``, where ``T`` is a bitstype that is 1, 2, 4, or 8 bytes (see the sketch after this list).
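
Putting these rules together, a Julia counterpart of ``__m128`` can be
declared as in this minimal sketch (the alias name ``m128`` is only
illustrative)::

typealias m128 NTuple{4,VecElement{Float32}}  # 4 lanes of 4 bytes = 16 bytes

Each lane is a 4-byte ``Float32`` wrapped in a ``VecElement``, so the tuple
as a whole is 16 bytes, matching the size of ``__m128``.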

For instance, consider this C routine that uses AVX intrinsics::

#include <immintrin.h>

__m256 dist( __m256 a, __m256 b ) {
    return _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(a, a),
                                        _mm256_mul_ps(b, b)));
}

The following Julia code calls ``dist`` using ``ccall``::

typealias m256 NTuple{8,VecElement{Float32}}

a = m256(ntuple(i->VecElement(sin(Float32(i))),8))
b = m256(ntuple(i->VecElement(cos(Float32(i))),8))

function call_dist(a::m256, b::m256)
    ccall((:dist, "libdist"), m256, (m256, m256), a, b)
end

println(call_dist(a,b))

The host machine must have the requisite SIMD registers. For example,
the code above will not work on hosts without AVX support.

Memory Ownership
~~~~~~~~~~~~~~~~

@@ -1063,3 +1104,4 @@ C++

Limited support for C++ is provided by the `Cpp <https://github.com/timholy/Cpp.jl>`_,
`Clang <https://github.com/ihnorton/Clang.jl>`_, and `Cxx <https://github.com/Keno/Cxx.jl>`_ packages.

1 change: 1 addition & 0 deletions doc/stdlib/index.rst
@@ -27,3 +27,4 @@
libc
libdl
profile
simd-types
37 changes: 37 additions & 0 deletions doc/stdlib/simd-types.rst
@@ -0,0 +1,37 @@
.. _stdlib-simd-types:

****************
SIMD Support
****************

Type ``VecElement{T}`` is intended for building libraries of SIMD operations.
Practical use of it requires using ``llvmcall``. The type is defined as::

immutable VecElement{T}
    value::T
end

It has a special compilation rule: a homogeneous tuple of ``VecElement{T}``
maps to an LLVM ``vector`` type when ``T`` is a bitstype and the tuple
length is in the set {2-6,8-10,16}.

At ``-O3``, the compiler *might* automatically vectorize operations
on such tuples. For example, the following program, when compiled
with ``julia -O3``, generates two SIMD addition instructions (``addps``)
on x86 systems::

typealias m128 NTuple{4,VecElement{Float32}}

function add(a::m128, b::m128)
    (VecElement(a[1].value+b[1].value),
     VecElement(a[2].value+b[2].value),
     VecElement(a[3].value+b[3].value),
     VecElement(a[4].value+b[4].value))
end

triple(c::m128) = add(add(c,c),c)

code_native(triple,(m128,))

However, since the automatic vectorization cannot be relied upon,
future use will mostly be via libraries that use ``llvmcall``.
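
For a rough sketch of what such library code might look like (this uses the
``Base.llvmcall`` form that takes an LLVM IR string, a return type, a tuple
type of argument types, and the arguments), a four-lane vector addition can
be written directly against the LLVM ``<4 x float>`` type::

typealias m128 NTuple{4,VecElement{Float32}}  # same alias as above

function vadd(a::m128, b::m128)
    Base.llvmcall("""%3 = fadd <4 x float> %0, %1
                     ret <4 x float> %3""",
                  m128, Tuple{m128,m128}, a, b)
end

Here ``%0`` and ``%1`` name the two incoming arguments and ``%3`` holds the
result of the ``fadd``; the numbering of unnamed values follows the usual
LLVM IR rules.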
4 changes: 2 additions & 2 deletions src/abi_win64.cpp
@@ -37,19 +37,19 @@
//
//===----------------------------------------------------------------------===//

#include "abi_x86_vec.h"

struct AbiState {
};

const AbiState default_abi_state = {};


bool use_sret(AbiState *state, jl_value_t *ty)
{
    if(!jl_is_datatype(ty) || jl_is_abstracttype(ty) || jl_is_cpointer_type(ty) || jl_is_array_type(ty))
        return false;
    size_t size = jl_datatype_size(ty);
-   if (size <= 8)
+   if (size <= 8 || is_native_simd_type(ty))
        return false;
    return true;
}
10 changes: 10 additions & 0 deletions src/abi_x86_64.cpp
@@ -36,6 +36,7 @@
//
//===----------------------------------------------------------------------===//

#include "abi_x86_vec.h"

// used to track the state of the ABI generator during
// code generation
@@ -133,6 +134,10 @@ void classifyType(Classification& accum, jl_value_t *ty, uint64_t offset)
accum.addField(offset, Memory);
}
}
// struct types that map to SIMD registers
else if (is_native_simd_type(ty)) {
accum.addField(offset, Sse);
}
// Other struct types
else if (jl_datatype_size(ty) <= 16) {
size_t i;
@@ -191,13 +196,18 @@ void needPassByRef(AbiState *state, jl_value_t *ty, bool *byRef, bool *inReg)
}
}

// Called on behalf of ccall to determine preferred LLVM representation
// for an argument or return value.
Type *preferred_llvm_type(jl_value_t *ty, bool isret)
{
    (void) isret;
    // no need to rewrite these types (they are returned as pointers anyways)
    if (!jl_is_datatype(ty) || jl_is_abstracttype(ty) || jl_is_cpointer_type(ty) || jl_is_array_type(ty))
        return NULL;

    if (is_native_simd_type(ty))
        return NULL;

    int size = jl_datatype_size(ty);
    if (size > 16 || size == 0)
        return NULL;
25 changes: 25 additions & 0 deletions src/abi_x86_vec.h
@@ -0,0 +1,25 @@
// This file is a part of Julia. License is MIT: http://julialang.org/license

#ifndef ABI_X86_VEC_H
#define ABI_X86_VEC_H

// Determine if object of bitstype ty maps to a __m128, __m256, or __m512 type in C.
static bool is_native_simd_type(jl_value_t *ty) {
    size_t size = jl_datatype_size(ty);
    if (size!=16 && size!=32 && size!=64)
        // Wrong size for xmm, ymm, or zmm register.
        return false;
    uint32_t n = jl_datatype_nfields(ty);
    if (n<2)
        // Not mapped to SIMD register.
        return false;
    jl_value_t *ft0 = jl_field_type(ty, 0);
    for (uint32_t i = 1; i < n; ++i)
        if (jl_field_type(ty, i)!=ft0)
            // Not homogeneous
            return false;
    // Type is homogeneous. Check if it maps to LLVM vector.
    return jl_special_vector_alignment(n,ft0) != 0;
}

#endif
18 changes: 18 additions & 0 deletions src/ccalltest.c
@@ -388,3 +388,21 @@ JL_DLLEXPORT void set_verbose(int level) {
JL_DLLEXPORT void *test_echo_p(void *p) {
return p;
}

#if defined(_CPU_X86_64_)

#include <xmmintrin.h>

JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d ) {
    // 64-bit x86 has only level 2 SSE, which does not have a <4 x int32> multiplication,
    // so we use floating-point instead, and assume caller knows about the hack.
    return _mm_add_epi32(a,
                         _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(b),
                                                    _mm_cvtepi32_ps(_mm_sub_epi32(c,d)))));
}

JL_DLLEXPORT __m128 test_m128(__m128 a, __m128 b, __m128 c, __m128 d ) {
    return _mm_add_ps(a, _mm_mul_ps(b, _mm_sub_ps(c, d)));
}

#endif
11 changes: 5 additions & 6 deletions src/cgutils.cpp
@@ -1407,15 +1407,14 @@ static jl_value_t *static_constant_instance(Constant *constant, jl_value_t *jt)
}

    size_t nargs = 0;
-   ConstantStruct *cst = NULL;
-   ConstantVector *cvec = NULL;
-   ConstantArray *carr = NULL;
-   if ((cst = dyn_cast<ConstantStruct>(constant)) != NULL)
+   if (ConstantStruct *cst = dyn_cast<ConstantStruct>(constant))
        nargs = cst->getType()->getNumElements();
-   else if ((cvec = dyn_cast<ConstantVector>(constant)) != NULL)
+   else if (ConstantVector *cvec = dyn_cast<ConstantVector>(constant))
        nargs = cvec->getType()->getNumElements();
-   else if ((carr = dyn_cast<ConstantArray>(constant)) != NULL)
+   else if (ConstantArray *carr = dyn_cast<ConstantArray>(constant))
        nargs = carr->getType()->getNumElements();
+   else if (ConstantDataVector *cdv = dyn_cast<ConstantDataVector>(constant))
+       nargs = cdv->getType()->getNumElements();
    else if (isa<Function>(constant))
        return NULL;
    else
33 changes: 33 additions & 0 deletions test/ccall.jl
@@ -524,3 +524,36 @@ let A = [1]
finalize(A)
@test ccall((:get_c_int, libccalltest), Cint, ()) == -1
end

# SIMD Registers

typealias VecReg{N,T} NTuple{N,VecElement{T}}
typealias V4xF32 VecReg{4,Float32}
typealias V4xI32 VecReg{4,Int32}

if Sys.ARCH==:x86_64

function test_sse(a1::V4xF32,a2::V4xF32,a3::V4xF32,a4::V4xF32)
    ccall((:test_m128, libccalltest), V4xF32, (V4xF32,V4xF32,V4xF32,V4xF32), a1, a2, a3, a4)
end

function test_sse(a1::V4xI32,a2::V4xI32,a3::V4xI32,a4::V4xI32)
    ccall((:test_m128i, libccalltest), V4xI32, (V4xI32,V4xI32,V4xI32,V4xI32), a1, a2, a3, a4)
end

foo_ams(a1, a2, a3, a4) = VecReg(ntuple(i->VecElement(a1[i].value+a2[i].value*(a3[i].value-a4[i].value)),4))

rt_sse{T}(a1::T,a2::T,a3::T,a4::T) = ccall(cfunction(foo_ams,T,(T,T,T,T)), T, (T,T,T,T), a1, a2, a3, a4)

for s in [Float32,Int32]
    a1 = VecReg(ntuple(i->VecElement(s(1i)),4))
    a2 = VecReg(ntuple(i->VecElement(s(2i)),4))
    a3 = VecReg(ntuple(i->VecElement(s(3i)),4))
    a4 = VecReg(ntuple(i->VecElement(s(4i)),4))
    r = VecReg(ntuple(i->VecElement(s(1i+2i*(3i-4i))),4))
    @test test_sse(a1,a2,a3,a4) == r

    # cfunction round-trip
    @test rt_sse(a1,a2,a3,a4) == r
end
end
