diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index 345135ceecb63c..0b5d8eb9767442 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -749,6 +749,8 @@ const ( AVPCMPEQB AVPMOVMSKB AVPAND + AVPTEST + AVPBROADCASTB // from 386 AJCXZW diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 2f1374ada0e13d..f545baf9940e0c 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -690,6 +690,8 @@ var Anames = []string{ "VPCMPEQB", "VPMOVMSKB", "VPAND", + "VPTEST", + "VPBROADCASTB", "JCXZW", "FCMOVCC", "FCMOVCS", diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 6e3093819b36b6..919e00b6e4bd70 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -219,8 +219,9 @@ const ( Pf2 = 0xf2 /* xmm escape 1: f2 0f */ Pf3 = 0xf3 /* xmm escape 2: f3 0f */ Pq3 = 0x67 /* xmm escape 3: 66 48 0f */ - Pvex1 = 0xc5 /* 66 escape, vex encoding */ - Pvex2 = 0xc6 /* f3 escape, vex encoding */ + Pvex1 = 0xc5 /* 66.0f escape, vex encoding */ + Pvex2 = 0xc6 /* f3.0f escape, vex encoding */ + Pvex3 = 0xc7 /* 66.0f38 escape, vex encoding */ Pw = 0x48 /* Rex.w */ Pw8 = 0x90 // symbolic; exact value doesn't matter Py = 0x80 /* defaults to 64-bit mode */ @@ -631,6 +632,11 @@ var yxr_ml_vex = []ytab{ {Yxr, Ynone, Yml, Zr_m_xm_vex, 1}, } +var yml_xr_vex = []ytab{ + {Yml, Ynone, Yxr, Zm_r_xm_vex, 1}, + {Yxr, Ynone, Yxr, Zm_r_xm_vex, 1}, +} + var yxm_xm_xm = []ytab{ {Yxr, Yxr, Yxr, Zr_r_r_vex, 1}, {Yxm, Yxr, Yxr, Zr_r_r_vex, 1}, @@ -1510,6 +1516,8 @@ var optab = {AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}}, {AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}}, {AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}}, + {AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}}, + {AVPTEST, yml_xr_vex, Pvex3, [23]uint8{0x17, 0x17}}, {obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}}, {obj.ATYPE, nil, 0, [23]uint8{}}, {obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}}, @@ -2965,13 +2973,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr rexX := regrex[from.Index] var prefBit uint8 // This will go into VEX.PP field. - if pref == Pvex1 { + if pref == Pvex1 || pref == Pvex3 { prefBit = 1 } else if pref == Pvex2 { prefBit = 2 - } // TODO add Pvex0,Pvex3 + } // TODO add Pvex0 - if rexX == 0 && rexB == 0 { // 2-byte vex prefix + if rexX == 0 && rexB == 0 && pref != Pvex3 { // 2-byte vex prefix // In 2-byte case, first byte is always C5 ctxt.Andptr[0] = 0xc5 ctxt.Andptr = ctxt.Andptr[1:] @@ -2998,9 +3006,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pr ctxt.Andptr[0] = 0xc4 ctxt.Andptr = ctxt.Andptr[1:] - // Encode VEX.mmmmm with prefix value, for now assume 0F 38, - // which encodes as 1. - ctxt.Andptr[0] = 0x1 // TODO handle different prefix + // Encode VEX.mmmmm with prefix value, assume 0F, + // which encodes as 1, unless 0F38 was specified with pvex3. + ctxt.Andptr[0] = 0x1 // TODO handle 0F3A + if pref == Pvex3 { + ctxt.Andptr[0] = 0x2 + } + // REX.[RXB] are inverted and encoded in 3 upper bits if rexR == 0 { ctxt.Andptr[0] |= 0x80 diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 8401accbcdc02b..68b342d4dbdb85 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1940,6 +1940,9 @@ TEXT runtime·indexbytebody(SB),NOSPLIT,$0 CMPQ BX, $16 JLT small + CMPQ BX, $32 + JA avx2 +no_avx2: // round up to first 16-byte boundary TESTQ $15, SI JZ aligned @@ -2003,6 +2006,38 @@ small: MOVQ $-1, (R8) RET +avx2: + CMPB runtime·support_avx2(SB), $1 + JNE no_avx2 + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, X1 +avx2_loop: + MOVHDU (DI), X2 + VPCMPEQB X1, X2, X3 + VPTEST X3, X3 + JNZ avx2success + ADDQ $32, DI + CMPQ DI, R11 + JLT avx2_loop + MOVQ R11, DI + MOVHDU (DI), X2 + VPCMPEQB X1, X2, X3 + VPTEST X3, X3 + JNZ avx2success + VZEROUPPER + MOVQ $-1, (R8) + RET + +avx2success: + VPMOVMSKB X3, DX + BSFL DX, DX + SUBQ SI, DI + ADDQ DI, DX + MOVQ DX, (R8) + VZEROUPPER + RET + // we've found the chunk containing the byte // now just figure out which specific byte it is ssesuccess: