
Merge pull request xenia-canary#50 from chrisps/canary_experimental
Ton of cpu changes
Gliniak authored Jul 16, 2022
2 parents 5f11c5d + 6a612b4 commit 23ca372
Showing 17 changed files with 575 additions and 232 deletions.
10 changes: 6 additions & 4 deletions src/xenia/cpu/backend/x64/x64_backend.cc
@@ -446,10 +446,11 @@ HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
EmitSaveNonvolatileRegs();

mov(rax, rcx);
  mov(rsi, rdx);  // context
  mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]);  // membase
  mov(rcx, r8);   // return address
call(rax);

vzeroupper();
EmitLoadNonvolatileRegs();

code_offsets.epilog = getSize();
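With this change the thunk establishes the JIT's whole register contract in one place: rcx/rdx/r8 arrive per the Win64 host convention, rsi becomes the pinned guest context, and rdi becomes the pinned membase, loaded from the context exactly once. A C-level sketch of that invariant, with hypothetical names (the real field is ppc::PPCContext::virtual_membase):

#include <cstdint>

struct GuestContextSketch {    // stand-in for ppc::PPCContext
  uint8_t* virtual_membase;    // goes into rdi == GetMembaseReg()
};

// rcx = target (-> rax), rdx = context (-> rsi), r8 = return addr (-> rcx).
// Once the thunk has run, JIT'd code may treat rsi/rdi as pinned.
uint8_t* load_membase_once(const GuestContextSketch* ctx) {
  return ctx->virtual_membase;  // never reloaded during guest execution
}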
@@ -500,7 +501,8 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {

code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();

// chrispy: added this for proper vmsum impl, avx2 bitshifts
vzeroupper();
// Save off volatile registers.
EmitSaveVolatileRegs();

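The vzeroupper added on both thunk paths matters because this is the boundary where 256-bit JIT code meets separately compiled host code; leaving dirty upper YMM state across such a call can trigger AVX-SSE transition stalls on many cores. A minimal intrinsics illustration with hypothetical names, where host_helper stands in for any host routine that may use legacy SSE encodings (build with AVX enabled):

#include <immintrin.h>
#include <cstdio>

static void host_helper() { std::puts("host call"); }  // may be SSE-compiled

float call_host_from_avx(__m256 v) {
  float lane0 = _mm256_cvtss_f32(v);  // some 256-bit work
  _mm256_zeroupper();                 // what the thunk's vzeroupper does
  host_helper();                      // safe: upper YMM state is clean
  return lane0;
}

int main() { std::printf("%f\n", call_host_from_avx(_mm256_set1_ps(2.0f))); }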
101 changes: 37 additions & 64 deletions src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -101,13 +101,11 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
#undef TEST_EMIT_FEATURE
  /*
  fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
  latest version of xbyak
  */
unsigned int data[4];
Xbyak::util::Cpu::getCpuid(0x80000001, data);
@@ -117,21 +115,19 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
}
}
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;

    if (is_zennish) {
      feature_flags_ |= kX64FastJrcx;

      if (cpu_.displayFamily > 0x17) {
        feature_flags_ |= kX64FastLoop;
      } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) {
        feature_flags_ |= kX64FastLoop;
      }  // todo: figure out at which model zen+ became zen2; this is just
         // the model for my cpu, which is ripper90
    }
  }
}
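A standalone sketch of the LZCNT probe above: the pinned Xbyak version never sets tLZCNT on AMD parts, so the constructor queries CPUID leaf 0x80000001 itself; the ABM/LZCNT flag is ECX bit 5 there. The cpuid wrapper below is an assumption for portability, not xenia code. (The Zen checks that follow rely on display family 0x17 spanning Zen through Zen2, with model 0x31 the first model the author could confirm as Zen2.)

#include <cstdint>
#include <cstdio>
#if defined(_MSC_VER)
#include <intrin.h>
static void cpuid(uint32_t leaf, uint32_t out[4]) {
  __cpuidex(reinterpret_cast<int*>(out), static_cast<int>(leaf), 0);
}
#else
#include <cpuid.h>
static void cpuid(uint32_t leaf, uint32_t out[4]) {
  if (!__get_cpuid(leaf, &out[0], &out[1], &out[2], &out[3])) {
    out[0] = out[1] = out[2] = out[3] = 0;
  }
}
#endif

int main() {
  uint32_t data[4];  // eax, ebx, ecx, edx
  cpuid(0x80000001u, data);
  bool has_lzcnt = (data[2] & (1u << 5)) != 0;  // ABM/LZCNT: ECX bit 5
  std::printf("lzcnt: %s\n", has_lzcnt ? "yes" : "no");
}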

@@ -263,7 +259,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();

/*
* chrispy: removed this, it serves no purpose
mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg());
*/
mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx);
mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0);

@@ -296,9 +295,11 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
}

// Load membase.
  /*
   * chrispy: removed this, as long as we load it in HostToGuestThunk we can
   * count on no other code modifying it.
   * mov(GetMembaseReg(),
   *     qword[GetContextReg() + offsetof(ppc::PPCContext, virtual_membase)]);
   */
// Body.
auto block = builder->first_block();
while (block) {
@@ -318,7 +319,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
// NOTE: If you encounter this after adding a new instruction, do a full
// rebuild!
assert_always();
        XELOGE("Unable to process HIR opcode {}", GetOpcodeName(instr->opcode));
break;
}
instr = new_tail;
@@ -331,8 +332,10 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
L(epilog_label);
epilog_label_ = nullptr;
EmitTraceUserCallReturn();
/*
* chrispy: removed this, it serves no purpose
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);

*/
code_offsets.epilog = getSize();

add(rsp, (uint32_t)stack_size);
@@ -342,7 +345,6 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {

if (cvars::emit_source_annotations) {
    nop(5);
  }

assert_zero(code_offsets.prolog);
@@ -676,37 +678,9 @@ Xbyak::Reg64 X64Emitter::GetNativeParam(uint32_t param) {
Xbyak::Reg64 X64Emitter::GetContextReg() { return rsi; }
Xbyak::Reg64 X64Emitter::GetMembaseReg() { return rdi; }

void X64Emitter::ReloadContext() {
mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]);
}

void X64Emitter::ReloadMembase() {
mov(GetMembaseReg(), qword[GetContextReg() + 8]); // membase
}
#define __NH_CONCAT(x, y) x##y
#define _MH_CONCAT(cb, ...) cb (__VA_ARGS__)

#define mh_concat2_m(x, y) __NH_CONCAT(x, y)

#define DECLNOP(n, ...) \
static constexpr unsigned char mh_concat2_m(nop_, n)[] = {__VA_ARGS__}

DECLNOP(1, 0x90);
DECLNOP(2, 0x66, 0x90);
DECLNOP(3, 0x0F, 0x1F, 0x00);
DECLNOP(4, 0x0F, 0x1F, 0x40, 0x00);
DECLNOP(5, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(6, 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00);
DECLNOP(7, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00);
DECLNOP(8, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);
DECLNOP(9, 0x66, 0x0F, 0x1F, 0x84, 00, 00, 00, 00, 00);

static constexpr const unsigned char* const g_noptable[] = {
&nop_1[0], &nop_1[0], &nop_2[0], &nop_3[0], &nop_4[0],
&nop_5[0], &nop_6[0], &nop_7[0], &nop_8[0], &nop_9[0]};

static constexpr unsigned LENGTHOF_NOPTABLE =
sizeof(g_noptable) / sizeof(g_noptable[0]);

// Len Assembly Byte Sequence
// ============================================================================
Expand All @@ -720,17 +694,8 @@ static constexpr unsigned LENGTHOF_NOPTABLE =
// 8b NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H
// 9b 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H
void X64Emitter::nop(size_t length) {
for (size_t i = 0; i < length; ++i) {
db(0x90);
}
}
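The replaced nop() body chose its chunk size with length % LENGTHOF_NOPTABLE, which evaluates to zero whenever length is a multiple of 10, so the loop emitted nothing and never made progress; emitting single-byte 0x90s sidesteps that entirely. If multi-byte NOPs were ever wanted back, a corrected chunking sketch (hypothetical, reusing the encodings from the comment table above) would clamp with min rather than mod:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

static const std::vector<std::vector<uint8_t>> kNops = {
    {},                                                        // 0: unused
    {0x90},                                                    // 1
    {0x66, 0x90},                                              // 2
    {0x0F, 0x1F, 0x00},                                        // 3
    {0x0F, 0x1F, 0x40, 0x00},                                  // 4
    {0x0F, 0x1F, 0x44, 0x00, 0x00},                            // 5
    {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},                      // 6
    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},                // 7
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},          // 8
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};   // 9

void emit_nops(std::vector<uint8_t>& out, size_t length) {
  while (length != 0) {
    // largest encoding that fits; always >= 1, so the loop terminates
    size_t chunk = std::min<size_t>(length, kNops.size() - 1);
    out.insert(out.end(), kNops[chunk].begin(), kNops[chunk].end());
    length -= chunk;
  }
}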

@@ -912,8 +877,17 @@ static const vec128_t xmm_consts[] = {
0x80, 0x80, 0x80, 0x80),
/*XMMShortsToBytes*/
v128_setr_bytes(0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80,
                    0x80, 0x80, 0x80),
/*XMMLVSLTableBase*/
vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
/*XMMLVSRTableBase*/
vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
/* XMMSingleDenormalMask */
vec128i(0x7f800000),
/* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504)};
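Two of the new constants are easy to sanity-check: 0x7f800000 is exactly the IEEE-754 binary32 exponent field (the basis of XMMSingleDenormalMask), and 65504 = (2 - 2^-10) * 2^15 is the largest finite half-precision value, presumably the point where Xenos extended-range F16 behavior starts. A small self-check under those assumptions:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t bits;
  float max_fp16 = 65504.0f;  // (2 - 2^-10) * 2^15, largest finite half
  assert(max_fp16 == (2.0f - 1.0f / 1024.0f) * 32768.0f);
  std::memcpy(&bits, &max_fp16, sizeof(bits));
  assert((bits & 0x7f800000u) != 0);  // normal float: exponent field nonzero

  float tiny = 1e-40f;  // subnormal in binary32
  std::memcpy(&bits, &tiny, sizeof(bits));
  assert((bits & 0x7f800000u) == 0);  // the mask classifies it as denormal
}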

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@@ -1013,7 +987,6 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
// 1111...
vpcmpeqb(dest, dest);
} else {

for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
if (xmm_consts[i] == v) {
vmovapd(dest, GetXmmConstPtr((XmmConst)i));
10 changes: 8 additions & 2 deletions src/xenia/cpu/backend/x64/x64_emitter.h
@@ -118,7 +118,12 @@ enum XmmConst {
XMM2To32,
XMMFloatInf,
XMMIntsToBytes,
  XMMShortsToBytes,
XMMLVSLTableBase,
XMMLVSRTableBase,
XMMSingleDenormalMask,
  XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart
};

// Unfortunately due to the design of xbyak we have to pass this to the ctor.
@@ -147,6 +152,7 @@ enum X64EmitterFeatureFlags {
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump (>= Zen1)
  kX64FastLoop = 1 << 13,  // loop/loope/loopne is as fast as any other jump (>= Zen2)
kX64EmitAVX512VBMI = 1 << 14
};
class ResolvableGuestCall {
public:
@@ -225,7 +231,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

Xbyak::Reg64 GetContextReg();
Xbyak::Reg64 GetMembaseReg();
void ReloadContext();

void ReloadMembase();

void nop(size_t length = 1);
74 changes: 58 additions & 16 deletions src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -127,6 +127,26 @@ struct VECTOR_CONVERT_F2I
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I);

struct VECTOR_DENORMFLUSH
: Sequence<VECTOR_DENORMFLUSH,
I<OPCODE_VECTOR_DENORMFLUSH, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vxorps(e.xmm1, e.xmm1, e.xmm1); // 0.25 P0123

e.vandps(e.xmm0, i.src1,
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
e.vandps(e.xmm1, i.src1,
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
// must keep their signs
e.vandps(e.xmm0, i.src1, e.xmm2); // P0123
e.vorps(i.dest, e.xmm0, e.xmm1); // P0123 make sure zeros keep signs

// if it does not equal zero, we stay
}
};
EMITTER_OPCODE_TABLE(OPCODE_VECTOR_DENORMFLUSH, VECTOR_DENORMFLUSH);
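A scalar model of what this six-instruction sequence computes: any lane whose exponent field is zero (signed zeros and denormals) collapses to a signed zero, everything else passes through. Sketch under the IEEE-754 binary32 layout, not emitter code; flush_denormal is a hypothetical name.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float flush_denormal(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  if ((bits & 0x7f800000u) == 0) {  // exponent all zero: +/-0 or denormal
    bits &= 0x80000000u;            // keep only the sign -> signed zero
  }
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

int main() {
  assert(flush_denormal(1.5f) == 1.5f);            // normals pass through
  assert(flush_denormal(1e-40f) == 0.0f);          // denormals flush to zero
  assert(std::signbit(flush_denormal(-1e-40f)));   // zeros keep their signs
}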

// ============================================================================
// OPCODE_LOAD_VECTOR_SHL
// ============================================================================
@@ -154,15 +174,20 @@ struct LOAD_VECTOR_SHL_I8
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsl_table));
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSLTableBase));
} else {
// this is probably extremely rare
e.LoadConstantXmm(i.dest, lvsl_table[sh]);
}
} else {
// chrispy: removed mask, ppc_emit_altivec already pre-ands it.
e.vmovd(e.xmm0, i.src1.reg().cvt32());
// broadcast byte
// dont use broadcastb with avx2, its slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));
e.vpaddb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMLVSLTableBase));
}
}
};
@@ -195,15 +220,23 @@
if (i.src1.is_constant) {
auto sh = i.src1.constant();
assert_true(sh < xe::countof(lvsr_table));
if (sh == 0) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVSRTableBase));
} else {
e.LoadConstantXmm(i.dest, lvsr_table[sh]);
}
} else {

// chrispy: removed mask, ppc_emit_altivec already pre-ands it. removed
// lookup as well, compute from LVSR base instead
e.vmovd(e.xmm0, i.src1.reg().cvt32());
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMLVSRTableBase));
// broadcast byte
// dont use broadcastb with avx2, its slower than shuf
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero));

e.vpsubb(i.dest, e.xmm1, e.xmm0);
}
}
};
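Both paths now compute the permute vector arithmetically instead of indexing a 16-entry table through rax: broadcast the shift amount into every byte lane, then offset the appropriate base table. A scalar model, assuming sh has already been masked to 0..15 by ppc_emit_altivec as the comments state:

#include <cassert>
#include <cstdint>

// lvsl entry i is sh + i; lvsr entry i is 16 - sh + i.
static void lvsl_model(uint8_t out[16], uint8_t sh) {
  for (int i = 0; i < 16; ++i) out[i] = uint8_t(sh + i);       // {0..15} + sh
}
static void lvsr_model(uint8_t out[16], uint8_t sh) {
  for (int i = 0; i < 16; ++i) out[i] = uint8_t(16 + i - sh);  // {16..31} - sh
}

int main() {
  uint8_t a[16], b[16];
  lvsl_model(a, 3);
  lvsr_model(b, 3);
  assert(a[0] == 3 && a[15] == 18);   // matches lvsl_table[3]
  assert(b[0] == 13 && b[15] == 28);  // matches lvsr_table[3]
}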
@@ -728,7 +761,7 @@ struct VECTOR_SHL_V128
}
}

  static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
// TODO(benvanik): native version (with shift magic).

if (e.IsFeatureEnabled(kX64EmitAVX2)) {
@@ -1793,6 +1826,14 @@ struct PERMUTE_I32
}
}
};
// todo: use this on const src1
static vec128_t FixupConstantShuf8(vec128_t input) {
for (uint32_t i = 0; i < 16; ++i) {
input.u8[i] ^= 0x03;
input.u8[i] &= 0x1F;
}
return input;
}
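The xor/and pair appears to mirror what the non-constant permute path does to its index bytes: xenia keeps guest vectors with each 32-bit word byte-swapped, so flipping the low two bits of a byte index (i ^ 3) retargets it to the swapped position, and & 0x1F keeps it within vperm's 32-byte, two-source range. A quick check of the transform, under that interpretation:

#include <cassert>

int main() {
  // big-endian byte 0 of a word sits at little-endian offset 3, and so on
  assert((0 ^ 0x03) == 3 && (3 ^ 0x03) == 0);
  assert((1 ^ 0x03) == 2 && (2 ^ 0x03) == 1);
  assert(((0x25 ^ 0x03) & 0x1F) == 0x06);  // out-of-range indices wrap to 0..31
}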
struct PERMUTE_V128
: Sequence<PERMUTE_V128,
I<OPCODE_PERMUTE, V128Op, V128Op, V128Op, V128Op>> {
@@ -1855,7 +1896,8 @@
} else {
e.vpshufb(src3_shuf, i.src3, e.xmm2);
}
        // Build a mask with values in src2 having 0 and values in src3
        // having 1.
e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15));
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
}
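For reference, a scalar model of the two-source byte permute this sequence implements: control bytes 0..15 pull from src2 and 16..31 from src3, which is why vpcmpgtb against the 15s constant produces the selector that vpblendvb consumes. Sketch only, with hypothetical names; the byte-order fixups handled elsewhere in the emitter are assumed already applied to ctrl.

#include <cassert>
#include <cstdint>

static void permute_model(uint8_t dest[16], const uint8_t ctrl[16],
                          const uint8_t src2[16], const uint8_t src3[16]) {
  for (int i = 0; i < 16; ++i) {
    uint8_t idx = ctrl[i] & 0x1F;  // 32-byte, two-source index range
    dest[i] = idx < 16 ? src2[idx] : src3[idx - 16];
  }
}

int main() {
  uint8_t src2[16], src3[16], ctrl[16], dest[16];
  for (int i = 0; i < 16; ++i) {
    src2[i] = uint8_t(i);
    src3[i] = uint8_t(100 + i);
    ctrl[i] = uint8_t(31 - i);  // reversed order across both sources
  }
  permute_model(dest, ctrl, src2, src3);
  assert(dest[0] == 115 && dest[15] == 100);  // 31 -> src3[15], 16 -> src3[0]
}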
