Skip to content

Commit

Permalink
cmd/compile: ppc64x intrinsics for math/bits
Browse files Browse the repository at this point in the history
This adds math/bits intrinsics for OnesCount, Len, TrailingZeros on
ppc64x.

benchmark                       old ns/op     new ns/op     delta
BenchmarkLeadingZeros-16        4.26          1.71          -59.86%
BenchmarkLeadingZeros16-16      3.04          1.83          -39.80%
BenchmarkLeadingZeros32-16      3.31          1.82          -45.02%
BenchmarkLeadingZeros64-16      3.69          1.71          -53.66%
BenchmarkTrailingZeros-16       2.55          1.62          -36.47%
BenchmarkTrailingZeros32-16     2.55          1.77          -30.59%
BenchmarkTrailingZeros64-16     2.78          1.62          -41.73%
BenchmarkOnesCount-16           3.19          0.93          -70.85%
BenchmarkOnesCount32-16         2.55          1.18          -53.73%
BenchmarkOnesCount64-16         3.22          0.93          -71.12%

Update golang#18616

I also changed bits_test.go because, while debugging some failures, the
test output did not report the correct argument information.

Change-Id: Ia58d31d1777cf4582a4505f85b11a1202ca07d3e
Reviewed-on: https://go-review.googlesource.com/41630
Run-TryBot: Lynn Boger <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Carlos Eduardo Seo <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
  • Loading branch information
laboger committed May 10, 2017
1 parent a486409 commit 8304d10
Show file tree
Hide file tree
Showing 7 changed files with 279 additions and 17 deletions.
34 changes: 22 additions & 12 deletions src/cmd/compile/internal/gc/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -2730,12 +2730,12 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "TrailingZeros32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "TrailingZeros16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt16to32, types.Types[TUINT32], args[0])
Expand Down Expand Up @@ -2776,7 +2776,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "Len32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
Expand All @@ -2785,7 +2785,7 @@ func init() {
x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "Len16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
Expand All @@ -2795,7 +2795,7 @@ func init() {
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
// Note: disabled on AMD64 because the Go code is faster!
addF("math/bits", "Len8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
Expand All @@ -2806,7 +2806,7 @@ func init() {
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
},
sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)

addF("math/bits", "Len",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
Expand All @@ -2815,7 +2815,7 @@ func init() {
}
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
// LeadingZeros is handled because it trivially calls Len.
addF("math/bits", "Reverse64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
Expand Down Expand Up @@ -2845,7 +2845,7 @@ func init() {
return s.newValue1(ssa.OpBitRev64, types.Types[TINT], args[0])
},
sys.ARM64)
makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
aux := s.lookupSymbol(n, &ssa.ExternSymbol{Sym: syslook("support_popcnt").Sym.Linksym()})
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
Expand Down Expand Up @@ -2881,17 +2881,27 @@ func init() {
}
}
addF("math/bits", "OnesCount64",
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount64),
sys.AMD64)
addF("math/bits", "OnesCount64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
},
sys.PPC64)
addF("math/bits", "OnesCount32",
makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64)
addF("math/bits", "OnesCount32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
},
sys.PPC64)
addF("math/bits", "OnesCount16",
makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64)
// Note: no OnesCount8, the Go implementation is faster - just a table load.
addF("math/bits", "OnesCount",
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64)

/******** sync/atomic ********/
Expand Down
2 changes: 1 addition & 1 deletion src/cmd/compile/internal/ppc64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP // Ignored; this is for the carry effect.

case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FRSP:
case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW, ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB:
r := v.Reg()
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
Expand Down
11 changes: 11 additions & 0 deletions src/cmd/compile/internal/ssa/gen/PPC64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,17 @@
// (Addr {sym} base) -> (ADDconst {sym} base)
(OffPtr [off] ptr) -> (ADD (MOVDconst <typ.Int64> [off]) ptr)

(Ctz64 x) -> (POPCNTD (ANDN <types.Int64> (ADDconst <types.Int64> [-1] x) x))
(Ctz32 x) -> (POPCNTW (MOVWZreg (ANDN <types.Int> (ADDconst <types.Int> [-1] x) x)))

(BitLen64 x) -> (SUB (MOVDconst [64]) (CNTLZD <types.Int> x))
(BitLen32 x) -> (SUB (MOVDconst [32]) (CNTLZW <types.Int> x))

(PopCount64 x) -> (POPCNTD x)
(PopCount32 x) -> (POPCNTW (MOVWZreg x))
(PopCount16 x) -> (POPCNTW (MOVHZreg x))
(PopCount8 x) -> (POPCNTB (MOVBreg x))

(And64 x y) -> (AND x y)
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
Expand Down
7 changes: 7 additions & 0 deletions src/cmd/compile/internal/ssa/gen/PPC64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,13 @@ func init() {
{name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits
{name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits

{name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros
{name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit)

{name: "POPCNTD", argLength: 1, reg: gp11, asm: "POPCNTD"}, // number of set bits in arg0
{name: "POPCNTW", argLength: 1, reg: gp11, asm: "POPCNTW"}, // number of set bits in each word of arg0 placed in corresponding word
{name: "POPCNTB", argLength: 1, reg: gp11, asm: "POPCNTB"}, // number of set bits in each byte of arg0 placed in corresponding byte

{name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1

Expand Down
72 changes: 72 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 8304d10

Please sign in to comment.