Skip to content

Commit

Permalink
SSE2 transpose microkernel code generator.
Browse files Browse the repository at this point in the history
Generates x64, x32, x16, x8 sse2 microkernels.

- New microkernels
- Unit tests

PiperOrigin-RevId: 421781619
  • Loading branch information
alankelly authored and xnnpack-bot committed Jan 14, 2022
1 parent d19bde9 commit 5da6d38
Show file tree
Hide file tree
Showing 37 changed files with 7,341 additions and 2,090 deletions.
19 changes: 19 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -5171,16 +5171,35 @@ ALL_SSE2_MICROKERNEL_SRCS = [
"src/u8-maxpool/9p8x-minmax-sse2-c16.c",
"src/u8-rmax/sse2.c",
"src/u8-vclamp/sse2-x64.c",
"src/x8-transpose/gen/16x16-reuse-dec-sse2.c",
"src/x8-transpose/gen/16x16-reuse-switch-sse2.c",
"src/x8-zip/x2-sse2.c",
"src/x8-zip/x3-sse2.c",
"src/x8-zip/x4-sse2.c",
"src/x8-zip/xm-sse2.c",
"src/x16-transpose/4x8-sse2.c",
"src/x16-transpose/gen/8x8-multi-dec-sse2.c",
"src/x16-transpose/gen/8x8-multi-switch-sse2.c",
"src/x16-transpose/gen/8x8-reuse-dec-sse2.c",
"src/x16-transpose/gen/8x8-reuse-multi-sse2.c",
"src/x16-transpose/gen/8x8-reuse-switch-sse2.c",
"src/x32-transpose/gen/4x4-multi-dec-sse2.c",
"src/x32-transpose/gen/4x4-multi-multi-sse2.c",
"src/x32-transpose/gen/4x4-multi-switch-sse2.c",
"src/x32-transpose/gen/4x4-reuse-dec-sse2.c",
"src/x32-transpose/gen/4x4-reuse-multi-sse2.c",
"src/x32-transpose/gen/4x4-reuse-switch-sse2.c",
"src/x32-unpool/sse2.c",
"src/x32-zip/x2-sse2.c",
"src/x32-zip/x3-sse2.c",
"src/x32-zip/x4-sse2.c",
"src/x32-zip/xm-sse2.c",
"src/x64-transpose/gen/2x2-multi-dec-sse2.c",
"src/x64-transpose/gen/2x2-multi-multi-sse2.c",
"src/x64-transpose/gen/2x2-multi-switch-sse2.c",
"src/x64-transpose/gen/2x2-reuse-dec-sse2.c",
"src/x64-transpose/gen/2x2-reuse-multi-sse2.c",
"src/x64-transpose/gen/2x2-reuse-switch-sse2.c",
"src/xx-fill/sse2-x64.c",
"src/xx-pad/sse2.c",
]
Expand Down
42 changes: 42 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3642,15 +3642,35 @@ SET(PROD_SSE2_MICROKERNEL_SRCS
src/u8-maxpool/9p8x-minmax-sse2-c16.c
src/u8-rmax/sse2.c
src/u8-vclamp/sse2-x64.c
src/x8-transpose/gen/16x16-reuse-dec-sse2.c
src/x8-transpose/gen/16x16-reuse-switch-sse2.c
src/x8-zip/x2-sse2.c
src/x8-zip/x3-sse2.c
src/x8-zip/x4-sse2.c
src/x8-zip/xm-sse2.c
src/x16-transpose/4x8-sse2.c
src/x16-transpose/gen/8x8-multi-dec-sse2.c
src/x16-transpose/gen/8x8-multi-switch-sse2.c
src/x16-transpose/gen/8x8-reuse-dec-sse2.c
src/x16-transpose/gen/8x8-reuse-multi-sse2.c
src/x16-transpose/gen/8x8-reuse-switch-sse2.c
src/x32-transpose/gen/4x4-multi-dec-sse2.c
src/x32-transpose/gen/4x4-multi-multi-sse2.c
src/x32-transpose/gen/4x4-multi-switch-sse2.c
src/x32-transpose/gen/4x4-reuse-dec-sse2.c
src/x32-transpose/gen/4x4-reuse-multi-sse2.c
src/x32-transpose/gen/4x4-reuse-switch-sse2.c
src/x32-unpool/sse2.c
src/x32-zip/x2-sse2.c
src/x32-zip/x3-sse2.c
src/x32-zip/x4-sse2.c
src/x32-zip/xm-sse2.c
src/x64-transpose/gen/2x2-multi-dec-sse2.c
src/x64-transpose/gen/2x2-multi-multi-sse2.c
src/x64-transpose/gen/2x2-multi-switch-sse2.c
src/x64-transpose/gen/2x2-reuse-dec-sse2.c
src/x64-transpose/gen/2x2-reuse-multi-sse2.c
src/x64-transpose/gen/2x2-reuse-switch-sse2.c
src/xx-fill/sse2-x64.c
src/xx-pad/sse2.c)

Expand Down Expand Up @@ -3928,16 +3948,20 @@ SET(ALL_SSE2_MICROKERNEL_SRCS
src/u8-maxpool/9p8x-minmax-sse2-c16.c
src/u8-rmax/sse2.c
src/u8-vclamp/sse2-x64.c
src/x8-transpose/gen/16x16-sse2.c
src/x8-zip/x2-sse2.c
src/x8-zip/x3-sse2.c
src/x8-zip/x4-sse2.c
src/x8-zip/xm-sse2.c
src/x16-transpose/4x8-sse2.c
src/x16-transpose/gen/8x8-sse2.c
src/x32-transpose/gen/4x4-sse2.c
src/x32-unpool/sse2.c
src/x32-zip/x2-sse2.c
src/x32-zip/x3-sse2.c
src/x32-zip/x4-sse2.c
src/x32-zip/xm-sse2.c
src/x64-transpose/gen/2x2-sse2.c
src/xx-fill/sse2-x64.c
src/xx-pad/sse2.c)

Expand Down Expand Up @@ -8066,6 +8090,15 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(u8-vclamp-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
ADD_TEST(u8-vclamp-test u8-vclamp-test)

# Unit test for the x8 transpose microkernels. The executable is built from
# test/x8-transpose.cc plus the object files of the all_microkernels target.
ADD_EXECUTABLE(x8-transpose-test test/x8-transpose.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(x8-transpose-test PROPERTIES
CXX_STANDARD 11
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS YES)
TARGET_INCLUDE_DIRECTORIES(x8-transpose-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(x8-transpose-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
# Use the NAME/COMMAND signature so that CMake resolves the executable target
# to its built location; the legacy two-argument form does not support target
# names or generator expressions in the command line.
ADD_TEST(NAME x8-transpose-test COMMAND x8-transpose-test)

ADD_EXECUTABLE(x16-transpose-test test/x16-transpose.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(x16-transpose-test PROPERTIES
CXX_STANDARD 11
Expand Down Expand Up @@ -8120,6 +8153,15 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(x32-zip-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
ADD_TEST(x32-zip-test x32-zip-test)

# Unit test for the x64 transpose microkernels. The executable is built from
# test/x64-transpose.cc plus the object files of the all_microkernels target.
ADD_EXECUTABLE(x64-transpose-test test/x64-transpose.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(x64-transpose-test PROPERTIES
CXX_STANDARD 11
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS YES)
TARGET_INCLUDE_DIRECTORIES(x64-transpose-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(x64-transpose-test PRIVATE cpuinfo fp16 pthreadpool gtest gtest_main)
# Use the NAME/COMMAND signature so that CMake resolves the executable target
# to its built location; the legacy two-argument form does not support target
# names or generator expressions in the command line.
ADD_TEST(NAME x64-transpose-test COMMAND x64-transpose-test)

ADD_EXECUTABLE(x8-lut-test test/x8-lut.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(x8-lut-test PROPERTIES
CXX_STANDARD 11
Expand Down
2 changes: 0 additions & 2 deletions bench/x16-transpose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ static void x16_transpose(
->UseRealTime();
BENCHMARK_CAPTURE(x16_transpose, sse2_117, xnn_x16_transpose_ukernel__4x8_sse2, 117)
->UseRealTime();
BENCHMARK_CAPTURE(x16_transpose, sse2_1024, xnn_x16_transpose_ukernel__4x8_sse2, 1024)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

#ifndef XNNPACK_BENCHMARK_NO_MAIN
Expand Down
17 changes: 17 additions & 0 deletions scripts/generate-x16-transpose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/sh
# Copyright 2021 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Regenerates the x16 SSE2 transpose microkernels and the x16 transpose unit
# test. All generators run in parallel; the script exits with a non-zero
# status if any of them fails.
# NOTE(review): every path is relative to the current directory, so this is
# presumably meant to be run from the repository root - confirm.

pids=""

# Starts the given command in the background and records its PID so that its
# exit status can be collected after all jobs have been launched.
spawn() {
  "$@" &
  pids="$pids $!"
}

#################################### SSE2 ###################################
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=16 -o src/x16-transpose/gen/8x8-reuse-dec-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=16 -o src/x16-transpose/gen/8x8-reuse-switch-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MULTI SIZE=16 -o src/x16-transpose/gen/8x8-reuse-multi-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=SWITCH SIZE=16 -o src/x16-transpose/gen/8x8-multi-switch-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=DEC SIZE=16 -o src/x16-transpose/gen/8x8-multi-dec-sse2.c

################################## Unit tests #################################
spawn tools/generate-transpose-test.py --spec test/x16-transpose.yaml --output=test/x16-transpose.cc

# A bare `wait` always exits with status 0 and would hide generator failures,
# so wait on each recorded PID individually to observe its exit status.
status=0
for pid in $pids; do
  wait "$pid" || status=1
done
exit "$status"
18 changes: 18 additions & 0 deletions scripts/generate-x64-transpose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/sh
# Copyright 2021 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Regenerates the x64 SSE2 transpose microkernels and the x64 transpose unit
# test. All generators run in parallel; the script exits with a non-zero
# status if any of them fails.
# NOTE(review): every path is relative to the current directory, so this is
# presumably meant to be run from the repository root - confirm.

pids=""

# Starts the given command in the background and records its PID so that its
# exit status can be collected after all jobs have been launched.
spawn() {
  "$@" &
  pids="$pids $!"
}

#################################### SSE2 ###################################
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=64 -o src/x64-transpose/gen/2x2-reuse-dec-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=64 -o src/x64-transpose/gen/2x2-reuse-switch-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MULTI SIZE=64 -o src/x64-transpose/gen/2x2-reuse-multi-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=SWITCH SIZE=64 -o src/x64-transpose/gen/2x2-multi-switch-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MULTI SIZE=64 -o src/x64-transpose/gen/2x2-multi-multi-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=DEC SIZE=64 -o src/x64-transpose/gen/2x2-multi-dec-sse2.c

################################## Unit tests #################################
spawn tools/generate-transpose-test.py --spec test/x64-transpose.yaml --output=test/x64-transpose.cc

# A bare `wait` always exits with status 0 and would hide generator failures,
# so wait on each recorded PID individually to observe its exit status.
status=0
for pid in $pids; do
  wait "$pid" || status=1
done
exit "$status"
14 changes: 14 additions & 0 deletions scripts/generate-x8-transpose.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/sh
# Copyright 2021 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Regenerates the x8 SSE2 transpose microkernels and the x8 transpose unit
# test. All generators run in parallel; the script exits with a non-zero
# status if any of them fails.
# NOTE(review): every path is relative to the current directory, so this is
# presumably meant to be run from the repository root - confirm.

pids=""

# Starts the given command in the background and records its PID so that its
# exit status can be collected after all jobs have been launched.
spawn() {
  "$@" &
  pids="$pids $!"
}

#################################### SSE2 ###################################
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=8 -o src/x8-transpose/gen/16x16-reuse-dec-sse2.c
spawn tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=8 -o src/x8-transpose/gen/16x16-reuse-switch-sse2.c

################################## Unit tests #################################
spawn tools/generate-transpose-test.py --spec test/x8-transpose.yaml --output=test/x8-transpose.cc

# A bare `wait` always exits with status 0 and would hide generator failures,
# so wait on each recorded PID individually to observe its exit status.
status=0
for pid in $pids; do
  wait "$pid" || status=1
done
exit "$status"
8 changes: 8 additions & 0 deletions scripts/generate-xN-transpose.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ tools/xngen src/x32-transpose/scalar.c.in -D TILE_HEIGHT=2 TILE_WIDTH=2 TYPE=dou
tools/xngen src/x32-transpose/scalar.c.in -D TILE_HEIGHT=4 TILE_WIDTH=1 TYPE=double SIZE=64 -o src/x64-transpose/gen/4x1-scalar-float.c &
tools/xngen src/x32-transpose/scalar.c.in -D TILE_HEIGHT=4 TILE_WIDTH=2 TYPE=double SIZE=64 -o src/x64-transpose/gen/4x2-scalar-float.c &

#################################### SSE2 ###################################
tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=DEC SIZE=32 -o src/x32-transpose/gen/4x4-reuse-dec-sse2.c &
tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=SWITCH SIZE=32 -o src/x32-transpose/gen/4x4-reuse-switch-sse2.c &
tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=REUSE OUT_PTRS=MULTI SIZE=32 -o src/x32-transpose/gen/4x4-reuse-multi-sse2.c &
tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=SWITCH SIZE=32 -o src/x32-transpose/gen/4x4-multi-switch-sse2.c &
tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=MULTI SIZE=32 -o src/x32-transpose/gen/4x4-multi-multi-sse2.c &
tools/xngen src/x32-transpose/sse2.c.in -D IN_PTRS=MULTI OUT_PTRS=DEC SIZE=32 -o src/x32-transpose/gen/4x4-multi-dec-sse2.c &

################################## Unit tests #################################
tools/generate-transpose-test.py --spec test/x8-transpose.yaml --output=test/x8-transpose.cc &
tools/generate-transpose-test.py --spec test/x16-transpose.yaml --output=test/x16-transpose.cc &
Expand Down
Loading

0 comments on commit 5da6d38

Please sign in to comment.