Skip to content

Commit

Permalink
cmd/compile: improve LoweredZero performance for ppc64x
Browse files Browse the repository at this point in the history
This change improves the performance of the LoweredZero rule
on ppc64x.

The improvement can be seen in the runtime ClearFat
benchmarks:

BenchmarkClearFat12-16       2.40          0.69          -71.25%
BenchmarkClearFat16-16       9.98          0.93          -90.68%
BenchmarkClearFat24-16       4.75          0.93          -80.42%
BenchmarkClearFat32-16       6.02          0.93          -84.55%
BenchmarkClearFat40-16       7.19          1.16          -83.87%
BenchmarkClearFat48-16       15.0          1.39          -90.73%
BenchmarkClearFat56-16       9.95          1.62          -83.72%
BenchmarkClearFat64-16       18.0          1.86          -89.67%
BenchmarkClearFat128-16      30.0          8.08          -73.07%
BenchmarkClearFat256-16      52.5          11.3          -78.48%
BenchmarkClearFat512-16      97.0          19.0          -80.41%
BenchmarkClearFat1024-16     244           34.2          -85.98%

Fixes: golang#19532

Change-Id: If493e28bc1d8e61bc79978498be9f5336a36cd3f
Reviewed-on: https://go-review.googlesource.com/38096
Run-TryBot: Lynn Boger <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Michael Munday <[email protected]>
  • Loading branch information
laboger committed Mar 21, 2017
1 parent d972dc2 commit 23bd919
Show file tree
Hide file tree
Showing 5 changed files with 356 additions and 235 deletions.
175 changes: 124 additions & 51 deletions src/cmd/compile/internal/ppc64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -831,62 +831,135 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
ssaGenISEL(v, ppc64.C_COND_EQ, iselRegs[1], v.Reg())

case ssa.OpPPC64LoweredZero:
// Similar to how this is done on ARM,
// except that PPC MOVDU x,off(y) is *(y+off) = x; y=y+off
// not store-and-increment.
// Therefore R3 should be dest-align
// and arg1 should be dest+size-align
// HOWEVER, the input dest address cannot be dest-align because
// that does not necessarily address valid memory and it's not
// known how that might be optimized. Therefore, correct it in
// in the expansion:

// unaligned data doesn't hurt performance
// for these instructions on power8 or later

// for sizes >= 64 generate a loop as follows:

// set up loop counter in CTR, used by BC
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
// loop:
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// ADD $32,R3
// BC 16, 0, loop
//
// ADD -8,R3,R3
// MOVDU R0, 8(R3)
// CMP R3, Rarg1
// BL -2(PC)
// arg1 is the address of the last element to zero
// auxint is alignment
var sz int64
var movu obj.As
switch {
case v.AuxInt%8 == 0:
sz = 8
movu = ppc64.AMOVDU
case v.AuxInt%4 == 0:
sz = 4
movu = ppc64.AMOVWZU // MOVWU instruction not implemented
case v.AuxInt%2 == 0:
sz = 2
movu = ppc64.AMOVHU
default:
sz = 1
movu = ppc64.AMOVBU
}
// any remainder is done as described below

p := gc.Prog(ppc64.AADD)
p.Reg = v.Args[0].Reg()
p.From.Type = obj.TYPE_CONST
p.From.Offset = -sz
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()
// for sizes < 64 bytes, first clear as many doublewords as possible,
// then handle the remainder
// MOVD R0,(R3)
// MOVD R0,8(R3)
// .... etc.
//
// the remainder bytes are cleared using one or more
// of the following instructions with the appropriate
// offsets depending which instructions are needed
//
// MOVW R0,n1(R3) 4 bytes
// MOVH R0,n2(R3) 2 bytes
// MOVB R0,n3(R3) 1 byte
//
// 7 bytes: MOVW, MOVH, MOVB
// 6 bytes: MOVW, MOVH
// 5 bytes: MOVW, MOVB
// 3 bytes: MOVH, MOVB

p = gc.Prog(movu)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = sz
// each loop iteration does 32 bytes
ctr := v.AuxInt / 32

p2 := gc.Prog(ppc64.ACMPU)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = v.Args[0].Reg()
p2.To.Reg = v.Args[1].Reg()
p2.To.Type = obj.TYPE_REG
// remainder bytes
rem := v.AuxInt % 32

p3 := gc.Prog(ppc64.ABLT)
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
// only generate a loop if there is more
// than 1 iteration.
if ctr > 1 {
// Set up CTR loop counter
p := gc.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ctr
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP

p = gc.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_CTR

// generate 4 MOVDs
// when this is a loop then the top must be saved
var top *obj.Prog
for offset := int64(0); offset < 32; offset += 8 {
// This is the top of loop
p := gc.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
// Save the top of loop
if top == nil {
top = p
}
}

// Increment address for the
// 4 doublewords just zeroed.
p = gc.Prog(ppc64.AADD)
p.Reg = v.Args[0].Reg()
p.From.Type = obj.TYPE_CONST
p.From.Offset = 32
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()

// Branch back to top of loop
// based on CTR
// BC with BO_BCTR generates bdnz
p = gc.Prog(ppc64.ABC)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ppc64.BO_BCTR
p.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
}

// when ctr == 1 the loop was not generated but
// there are at least 32 bytes to clear, so add
// that to the remainder to generate the code
// to clear those doublewords
if ctr == 1 {
rem += 32
}

// clear the remainder starting at offset zero
offset := int64(0)

// first clear as many doublewords as possible
// then clear remaining sizes as available
for rem > 0 {
op, size := ppc64.AMOVB, int64(1)
switch {
case rem >= 8:
op, size = ppc64.AMOVD, 8
case rem >= 4:
op, size = ppc64.AMOVW, 4
case rem >= 2:
op, size = ppc64.AMOVH, 2
}
p := gc.Prog(op)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
rem -= size
offset += size
}

case ssa.OpPPC64LoweredMove:
// Similar to how this is done on ARM,
Expand Down
87 changes: 50 additions & 37 deletions src/cmd/compile/internal/ssa/gen/PPC64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -485,60 +485,73 @@
(Store {t} ptr val mem) && t.(Type).Size() == 2 -> (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 1 -> (MOVBstore ptr val mem)

// Using Zero instead of LoweredZero allows the
// target address to be folded where possible.
(Zero [0] _ mem) -> mem
(Zero [1] destptr mem) -> (MOVBstorezero destptr mem)
(Zero [2] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
(MOVHstorezero destptr mem)
(Zero [2] destptr mem) ->
(MOVBstorezero [1] destptr
(MOVBstorezero [0] destptr mem))
(Zero [4] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
(MOVWstorezero destptr mem)
(Zero [4] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
(MOVHstorezero [2] destptr
(MOVHstorezero [0] destptr mem))
(Zero [4] destptr mem) ->
(MOVBstorezero [3] destptr
(MOVBstorezero [2] destptr
(MOVBstorezero [1] destptr
(MOVBstorezero [0] destptr mem))))
(Zero [8] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(MOVDstorezero [0] destptr mem)
(Zero [8] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
(MOVWstorezero [4] destptr
(MOVWstorezero [0] destptr mem))
(Zero [8] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
(MOVHstorezero [6] destptr
(MOVHstorezero [4] destptr
(MOVHstorezero [2] destptr
(MOVHstorezero [0] destptr mem))))

(MOVHstorezero destptr mem)
(Zero [3] destptr mem) ->
(MOVBstorezero [2] destptr
(MOVBstorezero [1] destptr
(MOVBstorezero [0] destptr mem)))
(MOVHstorezero destptr mem))
(Zero [4] destptr mem) ->
(MOVWstorezero destptr mem)
(Zero [5] destptr mem) ->
(MOVBstorezero [4] destptr
(MOVWstorezero destptr mem))
(Zero [6] destptr mem) ->
(MOVHstorezero [4] destptr
(MOVWstorezero destptr mem))
(Zero [7] destptr mem) ->
(MOVBstorezero [6] destptr
(MOVHstorezero [4] destptr
(MOVWstorezero destptr mem)))
(Zero [8] destptr mem) ->
(MOVDstorezero destptr mem)

// Zero small numbers of words directly.
(Zero [16] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(Zero [12] destptr mem) ->
(MOVWstorezero [8] destptr
(MOVDstorezero [0] destptr mem))
(Zero [16] destptr mem) ->
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem))
(Zero [24] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(Zero [24] destptr mem) ->
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem)))
(Zero [32] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(Zero [32] destptr mem) ->
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem))))

// Large zeroing uses a loop
(Zero [s] {t} ptr mem)
&& (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0 ->
(LoweredZero [t.(Type).Alignment()]
ptr
(ADDconst <ptr.Type> ptr [s-moveSize(t.(Type).Alignment(), config)])
mem)
(Zero [40] destptr mem) ->
(MOVDstorezero [32] destptr
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem)))))

(Zero [48] destptr mem) ->
(MOVDstorezero [40] destptr
(MOVDstorezero [32] destptr
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem))))))

(Zero [56] destptr mem) ->
(MOVDstorezero [48] destptr
(MOVDstorezero [40] destptr
(MOVDstorezero [32] destptr
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem)))))))

// Handle cases not handled above
(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)

// moves
(Move [0] _ _ mem) -> mem
Expand Down
34 changes: 26 additions & 8 deletions src/cmd/compile/internal/ssa/gen/PPC64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,19 +312,37 @@ func init() {

// large or unaligned zeroing
// arg0 = address of memory to zero (in R3, changed as side effect)
// arg1 = address of the last element to zero
// arg2 = mem
// returns mem
// ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
// MOVDU R0, 8(R3)
// CMP R3, Rarg1
// BLE -2(PC)
//
// a loop is generated when there is more than one iteration
// needed to clear 4 doublewords
//
// MOVD $len/32,R31
// MOVD R31,CTR
// loop:
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// ADD R3,32
// BC loop

// remaining doubleword clears generated as needed
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)

// one or more of these to clear remainder < 8 bytes
// MOVW R0,n1(R3)
// MOVH R0,n2(R3)
// MOVB R0,n3(R3)
{
name: "LoweredZero",
aux: "Int64",
argLength: 3,
argLength: 2,
reg: regInfo{
inputs: []regMask{buildReg("R3"), gp},
inputs: []regMask{buildReg("R3")},
clobbers: buildReg("R3"),
},
clobberFlags: true,
Expand Down
5 changes: 2 additions & 3 deletions src/cmd/compile/internal/ssa/opGen.go
Original file line number Diff line number Diff line change
Expand Up @@ -17368,13 +17368,12 @@ var opcodeTable = [...]opInfo{
{
name: "LoweredZero",
auxType: auxInt64,
argLen: 3,
argLen: 2,
clobberFlags: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 8}, // R3
{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
{0, 8}, // R3
},
clobbers: 8, // R3
},
Expand Down
Loading

0 comments on commit 23bd919

Please sign in to comment.