
Merge pull request xenia-canary#50 from chrisps/canary_experimental
Ton of cpu changes
Gliniak authored Jul 16, 2022
2 parents 5f11c5d + 6a612b4 commit 23ca372
Showing 17 changed files with 575 additions and 232 deletions.
10 changes: 6 additions & 4 deletions src/xenia/cpu/backend/x64/x64_backend.cc
@@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
EmitSaveNonvolatileRegs();

mov(rax, rcx);
  mov(rsi, rdx);  // context
  mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]);  // membase
  mov(rcx, r8);   // return address
call(rax);

vzeroupper();
EmitLoadNonvolatileRegs();

code_offsets.epilog = getSize();
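With this change the thunk establishes the JIT's whole register contract in one place: rcx/rdx/r8 arrive per the Win64 host convention, rsi becomes the pinned guest context, and rdi becomes the pinned membase, loaded from the context exactly once. A C-level sketch of that invariant, with hypothetical names (the real field is ppc::PPCContext::virtual_membase):

#include <cstdint>

struct GuestContextSketch {    // stand-in for ppc::PPCContext
  uint8_t* virtual_membase;    // goes into rdi == GetMembaseReg()
};

// rcx = target (-> rax), rdx = context (-> rsi), r8 = return addr (-> rcx).
// Once the thunk has run, JIT'd code may treat rsi/rdi as pinned.
uint8_t* load_membase_once(const GuestContextSketch* ctx) {
  return ctx->virtual_membase;  // never reloaded during guest execution
}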
@@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {

code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();

// chrispy: added this for proper vmsum impl, avx2 bitshifts
vzeroupper();
// Save off volatile registers.
EmitSaveVolatileRegs();

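The vzeroupper added on both thunk paths matters because this is the boundary where 256-bit JIT code meets separately compiled host code; leaving dirty upper YMM state across such a call can trigger AVX-SSE transition stalls on many cores. A minimal intrinsics illustration with hypothetical names, where host_helper stands in for any host routine that may use legacy SSE encodings (build with AVX enabled):

#include <immintrin.h>
#include <cstdio>

static void host_helper() { std::puts("host call"); }  // may be SSE-compiled

float call_host_from_avx(__m256 v) {
  float lane0 = _mm256_cvtss_f32(v);  // some 256-bit work
  _mm256_zeroupper();                 // what the thunk's vzeroupper does
  host_helper();                      // safe: upper YMM state is clean
  return lane0;
}

int main() { std::printf("%f\n", call_host_from_avx(_mm256_set1_ps(2.0f))); }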
101 changes: 37 additions & 64 deletions src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
#undef TEST_EMIT_FEATURE
  /*
  fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
  latest version of xbyak
  */
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
@@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
}
}
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;

    if (is_zennish) {
      feature_flags_ |= kX64FastJrcx;

      if (cpu_.displayFamily > 0x17) {
        feature_flags_ |= kX64FastLoop;
      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
        feature_flags_ |= kX64FastLoop;
      }  // todo: figure out at which model zen+ became zen2; this is just
         // the model for my cpu, which is ripper90
    }
  }
}
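A standalone sketch of the LZCNT probe above: the pinned Xbyak version never sets tLZCNT on AMD parts, so the constructor queries CPUID leaf 0x80000001 itself; the ABM/LZCNT flag is ECX bit 5 there. The cpuid wrapper below is an assumption for portability, not xenia code. (The Zen checks that follow rely on display family 0x17 spanning Zen through Zen2, with model 0x31 the first model the author could confirm as Zen2.)

#include <cstdint>
#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
static void cpuid(uint32_t leaf, uint32_t out[4]) {
  __cpuidex(reinterpret_cast<int*>(out), static_cast<int>(leaf), 0);
}
#else
#include <cpuid.h>
static void cpuid(uint32_t leaf, uint32_t out[4]) {
  if (!__get_cpuid(leaf, &out[0], &out[1], &out[2], &out[3])) {
    out[0] = out[1] = out[2] = out[3] = 0;
  }
}
#endif

int main() {
  uint32_t data[4];  // eax, ebx, ecx, edx
  cpuid(0x80000001u, data);
  bool has_lzcnt = (data[2] & (1u << 5)) != 0;  // ABM/LZCNT: ECX bit 5
  std::printf("lzcnt: %s\n", has_lzcnt ? "yes" : "no");
}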

@@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();

/*
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);

@@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
}

// Load membase.
  /*
   * chrispy: removed this, as long as we load it in HostToGuestThunk we can
   * count on no other code modifying it.
   * mov(GetMembaseReg(),
   *     qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);
   */
// Body.
auto block = builder->first_block();
while (block) {
@@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
// NOTE: If you encounter this after adding a new instruction, do a full
// rebuild!
assert_always();
        XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode));
break;
}
instr = new_tail;
@@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
L(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
/*
* chrispy: removed this, it serves no purpose
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);

*/
code_offsets.epilog = getSize();

add(rsp, (uint32_t)stack_size);
@@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

if (cvars::emit_source_annotations) {
    nop(5);
  }

assert_zero(code_offsets.prolog);
@@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }

void X64Emitter::ReloadContext() {
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
}

void X64Emitter::ReloadMembase() {
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
}
#define __NH_CONCAT(x, y) x##y
#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__)

#define mh_concat2_m(x, y) __NH_CONCAT(x, y)

#define DECLNOP(n, ...) \
static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}

DECLNOP(1, 0x90);
DECLNOP(2, 0x66, 0x90);
DECLNOP(3, 0x0F, 0x1F, 0x00);
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);

static constexpr const unsigned char* const g_noptable[] = {
&nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
&nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};

static constexpr unsigned LENGTHOF_NOPTABLE =
sizeof(g_noptable) / sizeof(g_noptable[0]);

// Len Assembly Byte Sequence
// ============================================================================
Expand All @@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE =
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
for (size_t i = 0; i < length; ++i) {
db(0x90);
}
}
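The replaced nop() body chose its chunk size with length % LENGTHOF_NOPTABLE, which evaluates to zero whenever length is a multiple of 10, so the loop emitted nothing and never made progress; emitting single-byte 0x90s sidesteps that entirely. If multi-byte NOPs were ever wanted back, a corrected chunking sketch (hypothetical, reusing the encodings from the comment table above) would clamp with min rather than mod:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

static const std::vector<std::vector<uint8_t>> kNops = {
    {},                                                        // 0: unused
    {0x90},                                                    // 1
    {0x66, 0x90},                                              // 2
    {0x0F, 0x1F, 0x00},                                        // 3
    {0x0F, 0x1F, 0x40, 0x00},                                  // 4
    {0x0F, 0x1F, 0x44, 0x00, 0x00},                            // 5
    {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},                      // 6
    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},                // 7
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},          // 8
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};   // 9

void emit_nops(std::vector<uint8_t>& out, size_t length) {
  while (length != 0) {
    // largest encoding that fits; always >= 1, so the loop terminates
    size_t chunk = std::min<size_t>(length, kNops.size() - 1);
    out.insert(out.end(), kNops[chunk].begin(), kNops[chunk].end());
    length -= chunk;
  }
}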

@@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = {
0x80, 0x80, 0x80, 0x80),
/*XMMShortsToBytes*/
v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80),
/*XMMLVSLTableBase*/
vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
/*XMMLVSRTableBase*/
vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
/* XMMSingleDenormalMask */
vec128i(0x7f800000),
/* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504)};
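Two of the new constants are easy to sanity-check: 0x7f800000 is exactly the IEEE-754 binary32 exponent field (the basis of XMMSingleDenormalMask), and 65504 = (2 - 2^-10) * 2^15 is the largest finite half-precision value, presumably the point where Xenos extended-range F16 behavior starts. A small self-check under those assumptions:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t bits;
  float max_fp16 = 65504.0f;  // (2 - 2^-10) * 2^15, largest finite half
  assert(max_fp16 == (2.0f - 1.0f / 1024.0f) * 32768.0f);
  std::memcpy(&bits, &max_fp16, sizeof(bits));
  assert((bits & 0x7f800000u) != 0);  // normal float: exponent field nonzero

  float tiny = 1e-40f;  // subnormal in binary32
  std::memcpy(&bits, &tiny, sizeof(bits));
  assert((bits & 0x7f800000u) == 0);  // the mask classifies it as denormal
}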

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// 1111...
vpcmpeqb(dest, dest);
} else {

for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i] == v) {
vmovapd(dest, GetXmmConstPtr((XmmConst)i));
10 changes: 8 additions & 2 deletions src/xenia/cpu/backend/x64/x64_emitter.h
@@ -118,7 +118,12 @@ enum XmmConst {
XMM2To32,
XMMFloatInf,
XMMIntsToBytes,
  XMMShortsToBytes,
XMMLVSLTableBase,
XMMLVSRTableBase,
XMMSingleDenormalMask,
  XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump (>= Zen1)
  kX64FastLoop = 1 << 13,  // loop/loope/loopne is as fast as any other jump (>= Zen2)
kX64EmitAVX512VBMI = 1 << 14
};
class ResolvableGuestCall {
public:
@@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
void ReloadContext();

void ReloadMembase();

void nop(size_t length = 1);
74 changes: 58 additions & 16 deletions src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I);

struct VECTOR_DENORMFLUSH
: Sequence<VECTOR_DENORMFLUSH,
I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123

e.vandps(e.xmm0, i.src1,
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
e.vandps(e.xmm1, i.src1,
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
// must keep their signs
e.vandps(e.xmm0, i.src1, e.xmm2); // P0123
e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs

// if it does not equal zero, we stay
}
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH);
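A scalar model of what this six-instruction sequence computes: any lane whose exponent field is zero (signed zeros and denormals) collapses to a signed zero, everything else passes through. Sketch under the IEEE-754 binary32 layout, not emitter code; flush_denormal is a hypothetical name.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float flush_denormal(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  if ((bits & 0x7f800000u) == 0) {  // exponent all zero: +/-0 or denormal
    bits &= 0x80000000u;            // keep only the sign -> signed zero
  }
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

int main() {
  assert(flush_denormal(1.5f) == 1.5f);            // normals pass through
  assert(flush_denormal(1e-40f) == 0.0f);          // denormals flush to zero
  assert(std::signbit(flush_denormal(-1e-40f)));   // zeros keep their signs
}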

// ============================================================================
// OPCODE_LOAD_VECTOR_SHL
// ============================================================================
@@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsl_table));
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase));
} else {
// this is probably extremely rare
e.LoadConstantXmm(i.dest, lvsl_table[sh]);
}
} else {
// chrispy: removed mask, ppc_emit_altivec already pre-ands it.
e.vmovd(e.xmm0, i.src1.reg().cvt32());
// broadcast byte
// dont use broadcastb with avx2, its slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase));
}
}
};
@@ -195,15 +220,23 @@
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsr_table));
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase));
} else {
e.LoadConstantXmm(i.dest, lvsr_table[sh]);
}
} else {

// chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed
// lookup as well, compute from LVSR base instead
e.vmovd(e.xmm0, i.src1.reg().cvt32());
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase));
// broadcast byte
// dont use broadcastb with avx2, its slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));

e.vpsubb(i.dest, e.xmm1, e.xmm0);
}
}
};
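Both paths now compute the permute vector arithmetically instead of indexing a 16-entry table through rax: broadcast the shift amount into every byte lane, then offset the appropriate base table. A scalar model, assuming sh has already been masked to 0..15 by ppc_emit_altivec as the comments state:

#include <cassert>
#include <cstdint>

// lvsl entry i is sh + i; lvsr entry i is 16 - sh + i.
static void lvsl_model(uint8_t out[16], uint8_t sh) {
  for (int i = 0; i < 16; ++i) out[i] = uint8_t(sh + i);       // {0..15} + sh
}
static void lvsr_model(uint8_t out[16], uint8_t sh) {
  for (int i = 0; i < 16; ++i) out[i] = uint8_t(16 + i - sh);  // {16..31} - sh
}

int main() {
  uint8_t a[16], b[16];
  lvsl_model(a, 3);
  lvsr_model(b, 3);
  assert(a[0] == 3 && a[15] == 18);   // matches lvsl_table[3]
  assert(b[0] == 13 && b[15] == 28);  // matches lvsr_table[3]
}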
@@ -728,7 +761,7 @@ struct VECTOR_SHL_V128
}
}

  static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic).

if (e.IsFeatureEnabled(kX64EmitAVX2)) {
@@ -1793,6 +1826,14 @@ struct PERMUTE_I32
}
}
};
// todo: use this on const src1
static vec128_t FixupConstantShuf8(vec128_t input) {
for (uint32_t i = 0; i < 16; ++i) {
input.u8[i] ^= 0x03;
input.u8[i] &= 0x1F;
}
return input;
}
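The xor/and pair appears to mirror what the non-constant permute path does to its index bytes: xenia keeps guest vectors with each 32-bit word byte-swapped, so flipping the low two bits of a byte index (i ^ 3) retargets it to the swapped position, and & 0x1F keeps it within vperm's 32-byte, two-source range. A quick check of the transform, under that interpretation:

#include <cassert>

int main() {
  // big-endian byte 0 of a word sits at little-endian offset 3, and so on
  assert((0 ^ 0x03) == 3 && (3 ^ 0x03) == 0);
  assert((1 ^ 0x03) == 2 && (2 ^ 0x03) == 1);
  assert(((0x25 ^ 0x03) & 0x1F) == 0x06);  // out-of-range indices wrap to 0..31
}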
struct PERMUTE_V128
: Sequence<PERMUTE_V128,
I<OPCODE_PERMUTE, V128Op, V128Op, V128Op, V128Op>> {
@@ -1855,7 +1896,8 @@
} else {
e.vpshufb(src3_shuf, i.src3, e.xmm2);
}
        // Build a mask with values in src2 having 0 and values in src3
        // having 1.
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15));
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
}
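For reference, a scalar model of the two-source byte permute this sequence implements: control bytes 0..15 pull from src2 and 16..31 from src3, which is why vpcmpgtb against the 15s constant produces the selector that vpblendvb consumes. Sketch only, with hypothetical names; the byte-order fixups handled elsewhere in the emitter are assumed already applied to ctrl.

#include <cassert>
#include <cstdint>

static void permute_model(uint8_t dest[16], const uint8_t ctrl[16],
                          const uint8_t src2[16], const uint8_t src3[16]) {
  for (int i = 0; i < 16; ++i) {
    uint8_t idx = ctrl[i] & 0x1F;  // 32-byte, two-source index range
    dest[i] = idx < 16 ? src2[idx] : src3[idx - 16];
  }
}

int main() {
  uint8_t src2[16], src3[16], ctrl[16], dest[16];
  for (int i = 0; i < 16; ++i) {
    src2[i] = uint8_t(i);
    src3[i] = uint8_t(100 + i);
    ctrl[i] = uint8_t(31 - i);  // reversed order across both sources
  }
  permute_model(dest, ctrl, src2, src3);
  assert(dest[0] == 115 && dest[15] == 100);  // 31 -> src3[15], 16 -> src3[0]
}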
