Skip to content

Commit

Permalink
[GPU] EDRAM looped addressing (resolves xenia-project#2031)
Browse files Browse the repository at this point in the history
  • Loading branch information
Triang3l committed Jul 22, 2022
1 parent 74d83e4 commit 3c12814
Show file tree
Hide file tree
Showing 66 changed files with 41,401 additions and 40,977 deletions.
60 changes: 43 additions & 17 deletions src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3287,13 +3287,19 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {

// Apply the source 32bpp tile index.
// r1.w = destination to source EDRAM tile adjustment
a.OpIBFE(dxbc::Dest::R(1, 0b1000), dxbc::Src::LU(xenos::kEdramBaseTilesBits),
a.OpIBFE(dxbc::Dest::R(1, 0b1000),
dxbc::Src::LU(xenos::kEdramBaseTilesBits + 1),
dxbc::Src::LU(xenos::kEdramPitchTilesBits * 2),
dxbc::Src::CB(cbuffer_index_address, kTransferCBVRegisterAddress, 0,
dxbc::Src::kXXXX));
// r1.w = 32bpp tile index within the source
// r1.w = 32bpp tile index within the source, or the tile index within the
// source minus the EDRAM tile count if transferring across addressing
// wrapping (if negative)
a.OpIAdd(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::R(1, dxbc::Src::kWWWW));
// r1.w = 32bpp tile index within the source
a.OpAnd(dxbc::Dest::R(1, 0b1000), dxbc::Src::R(1, dxbc::Src::kWWWW),
dxbc::Src::LU(xenos::kEdramTileCount - 1));
// r2.x = source pitch in 32bpp tiles
a.OpUBFE(dxbc::Dest::R(2, 0b0001), dxbc::Src::LU(xenos::kEdramPitchTilesBits),
dxbc::Src::LU(xenos::kEdramPitchTilesBits),
Expand Down Expand Up @@ -3887,6 +3893,9 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
}
// Combine the tile sample index and the tile index into buffer
// address to r0.x.
// The tile index doesn't need to be wrapped, as the host depth is
// written to the beginning of the buffer, without the base
// offset.
a.OpUMAd(dxbc::Dest::R(0, 0b0001),
dxbc::Src::LU(tile_width_samples),
dxbc::Src::R(0, dxbc::Src::kYYYY),
Expand All @@ -3906,16 +3915,23 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) {
// source.
// r0.w = destination to host depth source EDRAM tile adjustment
a.OpIBFE(dxbc::Dest::R(0, 0b1000),
dxbc::Src::LU(xenos::kEdramBaseTilesBits),
dxbc::Src::LU(xenos::kEdramBaseTilesBits + 1),
dxbc::Src::LU(xenos::kEdramPitchTilesBits * 2),
dxbc::Src::CB(cbuffer_index_host_depth_address,
kTransferCBVRegisterHostDepthAddress, 0,
dxbc::Src::kXXXX));
// r0.z = tile index relative to the host depth source base
// r0.z = tile index relative to the host depth source base, or
// the tile index within the host depth source minus the
// EDRAM tile count if transferring across addressing
// wrapping (if negative)
// r0.w = free
a.OpIAdd(dxbc::Dest::R(0, 0b0100),
dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::R(0, dxbc::Src::kWWWW));
// r0.z = tile index relative to the host depth source base
a.OpAnd(dxbc::Dest::R(0, 0b0100),
dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::LU(xenos::kEdramTileCount - 1));
// Convert position and sample index from within the destination
// tile to within the host depth source tile, like for the guest
// render target, but for 32bpp -> 32bpp only.
Expand Down Expand Up @@ -5943,28 +5959,38 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
dxbc::Src::R(1, dxbc::Src::kXXXX),
dxbc::Src::R(0, dxbc::Src::kZZZZ));

// Extract the index of the first tile of the dispatch in the EDRAM to r0.z.
// Extract the index of the first tile (taking EDRAM addressing wrapping into
// account) of the dispatch in the EDRAM to r0.z.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = first EDRAM tile index in the dispatch
// r0.w = tile index relative to the dump rectangle base
a.OpUBFE(dxbc::Dest::R(0, 0b0100), dxbc::Src::LU(xenos::kEdramBaseTilesBits),
dxbc::Src::LU(0),
a.OpUBFE(dxbc::Dest::R(0, 0b0100),
dxbc::Src::LU(xenos::kEdramBaseTilesBits + 1), dxbc::Src::LU(0),
dxbc::Src::CB(kDumpCbufferOffsets, kDumpCbufferOffsets, 0,
dxbc::Src::kXXXX));
// Add the base tile in the dispatch to the dispatch-local tile index to r0.w.
// Add the base tile in the dispatch to the dispatch-local tile index to r0.w,
// without wrapping yet, so that in case of a wraparound, the address relative
// to the base in the texture (after subtracting the base) won't be negative.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = free
// r0.w = tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
a.OpIAdd(dxbc::Dest::R(0, 0b1000), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::R(0, dxbc::Src::kZZZZ));
// Wrap the address of the tile in the EDRAM to r0.z.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = wrapped tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
a.OpAnd(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kWWWW),
dxbc::Src::LU(xenos::kEdramTileCount - 1));
// Convert the tile index to samples and add the X sample index to it to r0.z.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = tile sample offset in the EDRAM plus X sample offset
// r0.w = tile index in the EDRAM
a.OpUMAd(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kWWWW),
// r0.w = non-wrapped tile index in the EDRAM
a.OpUMAd(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::LU(
draw_resolution_scale_x * draw_resolution_scale_y *
(xenos::kEdramTileWidthSamples >> uint32_t(format_is_64bpp)) *
Expand All @@ -5975,7 +6001,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM without the depth column swapping
// r0.w = tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
a.OpUMAd(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kYYYY),
dxbc::Src::LU(tile_width), dxbc::Src::R(0, dxbc::Src::kZZZZ));
if (key.is_depth) {
Expand All @@ -5984,15 +6010,15 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM without the depth column swapping
// r0.w = tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
// r1.x = 0xFFFFFFFF if in the right 40-sample half, 0 otherwise
a.OpUGE(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(0, dxbc::Src::kXXXX),
dxbc::Src::LU(tile_width_half));
// Get the offset needed to swap 40-sample halves for depth.
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM without the depth column swapping
// r0.w = tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
// r1.x = depth half-tile flipping offset
a.OpMovC(dxbc::Dest::R(1, 0b0001), dxbc::Src::R(1, dxbc::Src::kXXXX),
dxbc::Src::LI(-int32_t(tile_width_half)),
Expand All @@ -6002,7 +6028,7 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM
// r0.w = tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
// r1.x = free
a.OpIAdd(dxbc::Dest::R(0, 0b0100), dxbc::Src::R(0, dxbc::Src::kZZZZ),
dxbc::Src::R(1, dxbc::Src::kXXXX));
Expand All @@ -6012,10 +6038,10 @@ ID3D12PipelineState* D3D12RenderTargetCache::GetOrCreateDumpPipeline(
// r0.x = X sample position within the tile
// r0.y = Y sample position within the tile
// r0.z = sample offset in the EDRAM
// r0.w = tile index in the EDRAM
// r0.w = non-wrapped tile index in the EDRAM
// r1.x = source texture base tile index
a.OpUBFE(dxbc::Dest::R(1, 0b0001), dxbc::Src::LU(xenos::kEdramBaseTilesBits),
dxbc::Src::LU(xenos::kEdramBaseTilesBits),
dxbc::Src::LU(xenos::kEdramBaseTilesBits + 1),
dxbc::Src::CB(kDumpCbufferOffsets, kDumpCbufferOffsets, 0,
dxbc::Src::kXXXX));
// Get the linear tile index within the source texture to r0.w.
Expand Down
11 changes: 5 additions & 6 deletions src/xenia/gpu/d3d12/d3d12_render_target_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -469,16 +469,13 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
// All in tiles.
uint32_t dest_pitch : xenos::kEdramPitchTilesBits;
uint32_t source_pitch : xenos::kEdramPitchTilesBits;
// Safe to use 12 bits for signed difference - no ownership transfer can
// ever occur between render targets with EDRAM base >= 2048 as this would
// result in 0-length spans. 10 + 10 + 12 is exactly 32, any more bits,
// and more root 32-bit constants will be used.
// Destination base in tiles minus source base in tiles (not vice versa
// because this is a transform of the coordinate system, not addresses
// themselves).
// + 1 bit because this is a signed difference between two EDRAM bases.
// 0 for host_depth_source_is_copy (ignored in this case, as
// destination == source).
int32_t source_to_dest : xenos::kEdramBaseTilesBits;
int32_t source_to_dest : xenos::kEdramBaseTilesBits + 1;
};
TransferAddressConstant() : constant(0) {
static_assert_size(*this, sizeof(constant));
Expand Down Expand Up @@ -576,7 +573,9 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
union DumpOffsets {
uint32_t offsets;
struct {
uint32_t dispatch_first_tile : xenos::kEdramBaseTilesBits;
// May be beyond the EDRAM tile count in case of EDRAM addressing
// wrapping, thus + 1 bit.
uint32_t dispatch_first_tile : xenos::kEdramBaseTilesBits + 1;
uint32_t source_base_tiles : xenos::kEdramBaseTilesBits;
};
DumpOffsets() : offsets(0) { static_assert_size(*this, sizeof(offsets)); }
Expand Down
4 changes: 4 additions & 0 deletions src/xenia/gpu/draw_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1045,6 +1045,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
depth_edram_info.pitch_tiles = surface_pitch_tiles;
depth_edram_info.msaa_samples = rb_surface_info.msaa_samples;
depth_edram_info.is_depth = 1;
// If wrapping happens, it's fine: in this context, it doesn't matter how many
// times or where the modulo xenos::kEdramTileCount is applied.
depth_edram_info.base_tiles =
rb_depth_info.depth_base + edram_base_offset_tiles;
depth_edram_info.format = uint32_t(rb_depth_info.depth_format);
Expand All @@ -1067,6 +1069,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
color_edram_info.pitch_tiles = surface_pitch_tiles << is_64bpp;
color_edram_info.msaa_samples = rb_surface_info.msaa_samples;
color_edram_info.is_depth = 0;
// If wrapping happens, it's fine: in this context, it doesn't matter how many
// times or where the modulo xenos::kEdramTileCount is applied.
color_edram_info.base_tiles =
color_info.color_base + (edram_base_offset_tiles << is_64bpp);
color_edram_info.format = uint32_t(color_info.color_format);
Expand Down
4 changes: 3 additions & 1 deletion src/xenia/gpu/dxbc_shader_translator.h
Original file line number Diff line number Diff line change
Expand Up @@ -1143,7 +1143,9 @@ class DxbcShaderTranslator : public ShaderTranslator {
// 8:11 - Whether color buffers have been written to, if not written on the
// taken execution path, don't export according to Direct3D 9 register
// documentation (some games rely on this behavior).
// Y - Absolute resolution-scaled EDRAM offset for depth / stencil, in dwords.
// Y - Absolute resolution-scaled EDRAM offset for depth / stencil, in dwords,
// before and during depth testing. During color writing, when the depth /
// stencil address is not needed anymore, current color sample address.
// Z - Base-relative resolution-scaled EDRAM offset for 32bpp color data, in
// dwords.
// W - Base-relative resolution-scaled EDRAM offset for 64bpp color data, in
Expand Down
Loading

0 comments on commit 3c12814

Please sign in to comment.