Skip to content

Commit

Permalink
runtime: get rid of most uses of REP for copying/zeroing.
Browse files Browse the repository at this point in the history
REP MOVSQ and REP STOSQ have a really high startup overhead.
Use a Duff's device to do the repetition instead.

benchmark                 old ns/op     new ns/op     delta
BenchmarkClearFat32       7.20          1.60          -77.78%
BenchmarkCopyFat32        6.88          2.38          -65.41%
BenchmarkClearFat64       7.15          3.20          -55.24%
BenchmarkCopyFat64        6.88          3.44          -50.00%
BenchmarkClearFat128      9.53          5.34          -43.97%
BenchmarkCopyFat128       9.27          5.56          -40.02%
BenchmarkClearFat256      13.8          9.53          -30.94%
BenchmarkCopyFat256       13.5          10.3          -23.70%
BenchmarkClearFat512      22.3          18.0          -19.28%
BenchmarkCopyFat512       22.0          19.7          -10.45%
BenchmarkCopyFat1024      36.5          38.4          +5.21%
BenchmarkClearFat1024     35.1          35.0          -0.28%

TODO: use for stack frame zeroing
TODO: REP prefixes are still used for "reverse" copying when src/dst
regions overlap.  Might be worth fixing.

LGTM=rsc
R=golang-codereviews, rsc
CC=golang-codereviews, r
https://golang.org/cl/81370046
  • Loading branch information
randall77 committed Apr 1, 2014
1 parent cfb347f commit 6c7cbf0
Show file tree
Hide file tree
Showing 13 changed files with 1,726 additions and 4 deletions.
9 changes: 8 additions & 1 deletion src/cmd/6g/cgen.c
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,7 @@ sgen(Node *n, Node *ns, int64 w)
Node nodl, nodr, nodsi, noddi, cx, oldcx, tmp;
vlong c, q, odst, osrc;
NodeList *l;
Prog *p;

if(debug['g']) {
print("\nsgen w=%lld\n", w);
Expand Down Expand Up @@ -1447,10 +1448,16 @@ sgen(Node *n, Node *ns, int64 w)
gins(ACLD, N, N);
} else {
// normal direction
if(q >= 4) {
if(q > 128) {
gconreg(movptr, q, D_CX);
gins(AREP, N, N); // repeat
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+
} else if (q >= 4) {
p = gins(ADUFFCOPY, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 14 and 128 = magic constants: see ../../pkg/runtime/asm_amd64.s
p->to.offset = 14*(128-q);
} else
while(q > 0) {
gins(AMOVSQ, N, N); // MOVQ *(SI)+,*(DI)+
Expand Down
8 changes: 7 additions & 1 deletion src/cmd/6g/ggen.c
Original file line number Diff line number Diff line change
Expand Up @@ -1088,10 +1088,16 @@ clearfat(Node *nl)
savex(D_AX, &ax, &oldax, N, types[tptr]);
gconreg(AMOVL, 0, D_AX);

if(q >= 4) {
if(q > 128) {
gconreg(movptr, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSQ, N, N); // STOQ AL,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFZERO, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
// 2 and 128 = magic constants: see ../../pkg/runtime/asm_amd64.s
p->to.offset = 2*(128-q);
} else
while(q > 0) {
gins(ASTOSQ, N, N); // STOQ AL,*(DI)+
Expand Down
2 changes: 2 additions & 0 deletions src/cmd/6g/prog.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ static ProgInfo progtable[ALAST] = {
[AMOVSL]= {OK, DI|SI, DI|SI},
[AMOVSQ]= {OK, DI|SI, DI|SI},
[AMOVSW]= {OK, DI|SI, DI|SI},
[ADUFFCOPY]= {OK, DI|SI, DI|SI|CX},

[AMOVSD]= {SizeD | LeftRead | RightWrite | Move},
[AMOVSS]= {SizeF | LeftRead | RightWrite | Move},
Expand Down Expand Up @@ -257,6 +258,7 @@ static ProgInfo progtable[ALAST] = {
[ASTOSL]= {OK, AX|DI, DI},
[ASTOSQ]= {OK, AX|DI, DI},
[ASTOSW]= {OK, AX|DI, DI},
[ADUFFZERO]= {OK, AX|DI, DI},

[ASUBB]= {SizeB | LeftRead | RightRdwr | SetCarry},
[ASUBL]= {SizeL | LeftRead | RightRdwr | SetCarry},
Expand Down
2 changes: 2 additions & 0 deletions src/cmd/6l/6.out.h
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,8 @@ enum as
ACHECKNIL,
AVARDEF,
AVARKILL,
ADUFFCOPY,
ADUFFZERO,

ALAST
};
Expand Down
9 changes: 8 additions & 1 deletion src/cmd/8g/cgen.c
Original file line number Diff line number Diff line change
Expand Up @@ -1212,6 +1212,7 @@ sgen(Node *n, Node *res, int64 w)
Node dst, src, tdst, tsrc;
int32 c, q, odst, osrc;
NodeList *l;
Prog *p;

if(debug['g']) {
print("\nsgen w=%lld\n", w);
Expand Down Expand Up @@ -1314,10 +1315,16 @@ sgen(Node *n, Node *res, int64 w)
} else {
gins(ACLD, N, N); // paranoia. TODO(rsc): remove?
// normal direction
if(q >= 4) {
if(q > 128) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFCOPY, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffcopy", runtimepkg));
// 10 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
p->to.offset = 10*(128-q);
} else
while(q > 0) {
gins(AMOVSL, N, N); // MOVL *(SI)+,*(DI)+
Expand Down
9 changes: 8 additions & 1 deletion src/cmd/8g/ggen.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ clearfat(Node *nl)
{
uint32 w, c, q;
Node n1;
Prog *p;

/* clear a fat object */
if(debug['g'])
Expand All @@ -147,10 +148,16 @@ clearfat(Node *nl)
agen(nl, &n1);
gconreg(AMOVL, 0, D_AX);

if(q >= 4) {
if(q > 128) {
gconreg(AMOVL, q, D_CX);
gins(AREP, N, N); // repeat
gins(ASTOSL, N, N); // STOL AL,*(DI)+
} else if(q >= 4) {
p = gins(ADUFFZERO, N, N);
p->to.type = D_ADDR;
p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
// 1 and 128 = magic constants: see ../../pkg/runtime/asm_386.s
p->to.offset = 1*(128-q);
} else
while(q > 0) {
gins(ASTOSL, N, N); // STOL AL,*(DI)+
Expand Down
2 changes: 2 additions & 0 deletions src/cmd/8g/prog.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ static ProgInfo progtable[ALAST] = {
[AMOVSB]= {OK, DI|SI, DI|SI},
[AMOVSL]= {OK, DI|SI, DI|SI},
[AMOVSW]= {OK, DI|SI, DI|SI},
[ADUFFCOPY]= {OK, DI|SI, DI|SI|CX},

[AMOVSD]= {SizeD | LeftRead | RightWrite | Move},
[AMOVSS]= {SizeF | LeftRead | RightWrite | Move},
Expand Down Expand Up @@ -287,6 +288,7 @@ static ProgInfo progtable[ALAST] = {
[ASTOSB]= {OK, AX|DI, DI},
[ASTOSL]= {OK, AX|DI, DI},
[ASTOSW]= {OK, AX|DI, DI},
[ADUFFZERO]= {OK, AX|DI, DI},

[ASUBB]= {SizeB | LeftRead | RightRdwr | SetCarry},
[ASUBL]= {SizeL | LeftRead | RightRdwr | SetCarry},
Expand Down
2 changes: 2 additions & 0 deletions src/cmd/8l/8.out.h
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,8 @@ enum as
ACHECKNIL,
AVARDEF,
AVARKILL,
ADUFFCOPY,
ADUFFZERO,

ALAST
};
Expand Down
9 changes: 9 additions & 0 deletions src/liblink/asm6.c
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,11 @@ static uchar ycall[] =
Ynone, Ybr, Zcall, 1,
0
};
/*
 * Operand-pattern table for the Duff's-device pseudo-instructions; the
 * ADUFFCOPY and ADUFFZERO optab entries both point at it.
 * The single pattern pairs the Ynone and Yi32 operand classes with the
 * Zcall encoding -- the same encoding the ycall table selects, although
 * ycall matches a Ybr operand instead of Yi32.
 * NOTE(review): the 1 presumably counts the opcode bytes the pattern
 * consumes and the final 0 presumably terminates the list, as in the
 * neighbouring tables -- confirm against the matcher in doasm.
 */
static uchar yduff[] =
{
Ynone, Yi32, Zcall, 1,
0
};
static uchar yjmp[] =
{
Ynone, Yml, Zo_m64, 2,
Expand Down Expand Up @@ -1519,6 +1524,9 @@ Optab optab[] =
{ APCDATA, ypcdata, Px, 0,0 },
{ ACHECKNIL },
{ AVARDEF },
{ AVARKILL },
{ ADUFFCOPY, yduff, Px, 0xe8 },
{ ADUFFZERO, yduff, Px, 0xe8 },

{ AEND },
0
Expand Down Expand Up @@ -3030,6 +3038,7 @@ doasm(Link *ctxt, Prog *p)
r = addrel(ctxt->cursym);
r->off = p->pc + ctxt->andptr - ctxt->and;
r->sym = p->to.sym;
r->add = p->to.offset;
r->type = D_PCREL;
r->siz = 4;
put4(ctxt, 0);
Expand Down
9 changes: 9 additions & 0 deletions src/liblink/asm8.c
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,11 @@ static uchar ycall[] =
Ynone, Yi32, Zcallcon, 1,
0
};
/*
 * Operand-pattern table for the Duff's-device pseudo-instructions; the
 * ADUFFCOPY and ADUFFZERO optab entries both point at it.
 * The single pattern pairs the Ynone and Yi32 operand classes with the
 * Zcall encoding.  The ycall entry visible just above also matches Yi32
 * but selects Zcallcon, so the two tables are not interchangeable.
 * NOTE(review): the 1 presumably counts the opcode bytes the pattern
 * consumes and the final 0 presumably terminates the list, as in the
 * neighbouring tables -- confirm against the matcher in doasm.
 */
static uchar yduff[] =
{
Ynone, Yi32, Zcall, 1,
0
};
static uchar yjmp[] =
{
Ynone, Yml, Zo_m, 2,
Expand Down Expand Up @@ -1147,6 +1152,9 @@ static Optab optab[] =
{ APCDATA, ypcdata, Px, 0,0 },
{ ACHECKNIL },
{ AVARDEF },
{ AVARKILL },
{ ADUFFCOPY, yduff, Px, 0xe8 },
{ ADUFFZERO, yduff, Px, 0xe8 },

0
};
Expand Down Expand Up @@ -2377,6 +2385,7 @@ doasm(Link *ctxt, Prog *p)
r->type = D_PCREL;
r->siz = 4;
r->sym = p->to.sym;
r->add = p->to.offset;
put4(ctxt, 0);
break;

Expand Down
Loading

0 comments on commit 6c7cbf0

Please sign in to comment.