Skip to content

Commit

Permalink
hash/crc32: add AMD64 optimized IEEE CRC calculation
Browse files Browse the repository at this point in the history
IEEE is the most commonly used CRC-32 polynomial, used by zip, gzip and others.

Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

benchmark                       old ns/op     new ns/op     delta
BenchmarkIEEECrc1KB-8           3193          352           -88.98%
BenchmarkIEEECrc4KB-8           5025          1307          -73.99%
BenchmarkCastagnoliCrc1KB-8     126           126           +0.00%

benchmark                       old MB/s     new MB/s     speedup
BenchmarkIEEECrc1KB-8           320.68       2901.92      9.05x
BenchmarkIEEECrc4KB-8           815.08       3131.80      3.84x
BenchmarkCastagnoliCrc1KB-8     8100.80      8109.78      1.00x

Change-Id: I99c9a48365f631827f516e44f97e86155f03cb90
Reviewed-on: https://go-review.googlesource.com/14080
Reviewed-by: Keith Randall <[email protected]>
  • Loading branch information
klauspost authored and randall77 committed Sep 16, 2015
1 parent f0ea976 commit 2027b00
Show file tree
Hide file tree
Showing 5 changed files with 251 additions and 11 deletions.
12 changes: 4 additions & 8 deletions src/hash/crc32/crc32.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,11 @@ func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {

// Update returns the result of adding the bytes in p to the crc.
func Update(crc uint32, tab *Table, p []byte) uint32 {
if tab == castagnoliTable {
switch tab {
case castagnoliTable:
return updateCastagnoli(crc, p)
}
// only use slicing-by-8 when input is larger than 4KB
if tab == IEEETable && len(p) >= 4096 {
iEEETable8Once.Do(func() {
iEEETable8 = makeTable8(IEEE)
})
return updateSlicingBy8(crc, iEEETable8, p)
case IEEETable:
return updateIEEE(crc, p)
}
return update(crc, tab, p)
}
Expand Down
54 changes: 54 additions & 0 deletions src/hash/crc32/crc32_amd64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package crc32

// This file contains the code to call the SSE 4.2 version of the Castagnoli
// and IEEE CRC.

// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
func haveSSE41() bool
func haveSSE42() bool
func haveCLMUL() bool

// castagnoliSSE42 is defined in crc_amd64.s and uses the SSE4.2 CRC32
// instruction.
func castagnoliSSE42(crc uint32, p []byte) uint32

// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
// instruction as well as SSE 4.1.
func ieeeCLMUL(crc uint32, p []byte) uint32

var sse42 = haveSSE42()
var useFastIEEE = haveCLMUL() && haveSSE41()

func updateCastagnoli(crc uint32, p []byte) uint32 {
if sse42 {
return castagnoliSSE42(crc, p)
}
return update(crc, castagnoliTable, p)
}

func updateIEEE(crc uint32, p []byte) uint32 {
if useFastIEEE && len(p) >= 64 {
left := len(p) & 15
do := len(p) - left
crc = ^ieeeCLMUL(^crc, p[:do])
if left > 0 {
crc = update(crc, IEEETable, p[do:])
}
return crc
}

// only use slicing-by-8 when input is >= 4KB
if len(p) >= 4096 {
iEEETable8Once.Do(func() {
iEEETable8 = makeTable8(IEEE)
})
return updateSlicingBy8(crc, iEEETable8, p)
}

return update(crc, IEEETable, p)
}
169 changes: 169 additions & 0 deletions src/hash/crc32/crc32_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,172 @@ TEXT ·haveSSE42(SB),NOSPLIT,$0
MOVB CX, ret+0(FP)
RET

// func haveCLMUL() bool
TEXT ·haveCLMUL(SB),NOSPLIT,$0
XORQ AX, AX
INCL AX
CPUID
SHRQ $1, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET

// func haveSSE41() bool
TEXT ·haveSSE41(SB),NOSPLIT,$0
XORQ AX, AX
INCL AX
CPUID
SHRQ $19, CX
ANDQ $1, CX
MOVB CX, ret+0(FP)
RET

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
MOVL crc+0(FP), X0 // Initial CRC value
MOVQ p+8(FP), SI // data pointer
MOVQ p_len+16(FP), CX // len(p)

MOVOU (SI), X1
MOVOU 16(SI), X2
MOVOU 32(SI), X3
MOVOU 48(SI), X4
PXOR X0, X1
ADDQ $64, SI // buf+=64
SUBQ $64, CX // len-=64
CMPQ CX, $64 // Less than 64 bytes left
JB remain64

MOVOA r2r1<>+0(SB), X0
loopback64:
MOVOA X1, X5
MOVOA X2, X6
MOVOA X3, X7
MOVOA X4, X8

PCLMULQDQ $0, X0, X1
PCLMULQDQ $0, X0, X2
PCLMULQDQ $0, X0, X3
PCLMULQDQ $0, X0, X4

/* Load next early */
MOVOU (SI), X11
MOVOU 16(SI), X12
MOVOU 32(SI), X13
MOVOU 48(SI), X14

PCLMULQDQ $0x11, X0, X5
PCLMULQDQ $0x11, X0, X6
PCLMULQDQ $0x11, X0, X7
PCLMULQDQ $0x11, X0, X8

PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
PXOR X8, X4

PXOR X11, X1
PXOR X12, X2
PXOR X13, X3
PXOR X14, X4

ADDQ $0x40, DI
ADDQ $64, SI // buf+=64
SUBQ $64, CX // len-=64
CMPQ CX, $64 // Less than 64 bytes left?
JGE loopback64

/* Fold result into a single register (X1) */
remain64:
MOVOA r4r3<>+0(SB), X0

MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X2, X1

MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X3, X1

MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X4, X1

/* If there is less than 16 bytes left we are done */
CMPQ CX, $16
JB finish

/* Encode 16 bytes */
remain16:
MOVOU (SI), X10
MOVOA X1, X5
PCLMULQDQ $0, X0, X1
PCLMULQDQ $0x11, X0, X5
PXOR X5, X1
PXOR X10, X1
SUBQ $16, CX
ADDQ $16, SI
CMPQ CX, $16
JGE remain16

finish:
/* Fold final result into 32 bits and return it */
PCMPEQB X3, X3
PCLMULQDQ $1, X1, X0
PSRLDQ $8, X1
PXOR X0, X1

MOVOA X1, X2
MOVQ r5<>+0(SB), X0

/* Creates 32 bit mask. Note that we don't care about upper half. */
PSRLQ $32, X3

PSRLDQ $4, X2
PAND X3, X1
PCLMULQDQ $0, X0, X1
PXOR X2, X1

MOVOA rupoly<>+0(SB), X0

MOVOA X1, X2
PAND X3, X1
PCLMULQDQ $0x10, X0, X1
PAND X3, X1
PCLMULQDQ $0, X0, X1
PXOR X2, X1

/* PEXTRD $1, X1, AX (SSE 4.1) */
BYTE $0x66; BYTE $0x0f; BYTE $0x3a;
BYTE $0x16; BYTE $0xc8; BYTE $0x01;
MOVL AX, ret+32(FP)

RET
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build amd64 amd64p32

package crc32

// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC.

// haveSSE42 is defined in crc_amd64.s and uses CPUID to test for SSE 4.2
// haveSSE42 is defined in crc_amd64p32.s and uses CPUID to test for SSE 4.2
// support.
func haveSSE42() bool

Expand All @@ -25,3 +23,15 @@ func updateCastagnoli(crc uint32, p []byte) uint32 {
}
return update(crc, castagnoliTable, p)
}

func updateIEEE(crc uint32, p []byte) uint32 {
// only use slicing-by-8 when input is >= 4KB
if len(p) >= 4096 {
iEEETable8Once.Do(func() {
iEEETable8 = makeTable8(IEEE)
})
return updateSlicingBy8(crc, iEEETable8, p)
}

return update(crc, IEEETable, p)
}
11 changes: 11 additions & 0 deletions src/hash/crc32/crc32_generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,14 @@ package crc32
func updateCastagnoli(crc uint32, p []byte) uint32 {
return update(crc, castagnoliTable, p)
}

func updateIEEE(crc uint32, p []byte) uint32 {
// only use slicing-by-8 when input is >= 4KB
if len(p) >= 4096 {
iEEETable8Once.Do(func() {
iEEETable8 = makeTable8(IEEE)
})
return updateSlicingBy8(crc, iEEETable8, p)
}
return update(crc, IEEETable, p)
}

0 comments on commit 2027b00

Please sign in to comment.