Bug 1870085 - Generalize gfxAlphaRecovery simd implementation r=jrmuizel,gfx-reviewers

Also get rid of unused code in the process.

Differential Revision: https://phabricator.services.mozilla.com/D196459
serge-sans-paille committed Jan 4, 2024
1 parent 0dccdee commit 0e607b2
Showing 3 changed files with 53 additions and 175 deletions.
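The patch below swaps raw SSE2 intrinsics for xsimd batch operations, so the same loop body can later be retargeted to other instruction sets by changing the `arch` alias. A minimal sketch of the intrinsic-to-batch mapping applied in gfxAlphaRecoverySSE2.cpp (an illustration only, assuming xsimd is available on the include path added to moz.build; RecoverBatch is a hypothetical helper, not part of the patch):

#include <cstdint>
#include <xsimd/xsimd.hpp>

using arch = xsimd::sse2;
using batch_type = xsimd::batch<uint8_t, arch>;

// Recover alpha for one aligned batch of four BGRA pixels (16 bytes), using
// the same operation sequence as the loops in the patch:
//   _mm_load_si128   -> batch_type::load_aligned
//   _mm_subs_epu8    -> xsimd::ssub            (saturated subtract)
//   _mm_andnot_si128 -> xsimd::bitwise_andnot  (note the swapped operand order)
//   _mm_slli_si128   -> xsimd::slide_left<N>   (byte-wise slide)
//   _mm_store_si128  -> batch_type::store_aligned
void RecoverBatch(uint8_t* blackData, const uint8_t* whiteData,
                  const batch_type& greenMask, const batch_type& alphaMask) {
  auto black = batch_type::load_aligned(blackData);
  auto white = batch_type::load_aligned(whiteData);
  white = xsimd::ssub(white, black);      // per-byte (white - black), saturated
  white = xsimd::ssub(greenMask, white);  // green lane: 0xff - (whiteG - blackG)
  black = xsimd::bitwise_andnot(black, alphaMask);  // clear black's alpha byte
  white = xsimd::slide_left<2>(white);    // move the green byte into the alpha slot
  white &= alphaMask;                     // keep only the recovered alpha
  black |= white;                         // splice alpha into the black pixel
  black.store_aligned(blackData);
}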
19 changes: 0 additions & 19 deletions gfx/thebes/gfxAlphaRecovery.h
@@ -40,25 +40,6 @@ class gfxAlphaRecovery {
*/
static bool RecoverAlphaSSE2(gfxImageSurface* blackSurface,
const gfxImageSurface* whiteSurface);

/**
* A common use-case for alpha recovery is to paint into a
* temporary "white image", then paint onto a subrect of the
* surface, the "black image", into which alpha-recovered pixels
* are eventually to be written. This function returns a rect
* aligned so that recovering alpha for that rect will hit SIMD
* fast-paths, if possible. It's not always possible to align
* |aRect| so that fast-paths will be taken.
*
* The returned rect is always a superset of |aRect|.
*/
static mozilla::gfx::IntRect AlignRectForSubimageRecovery(
const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface);
#else
static mozilla::gfx::IntRect AlignRectForSubimageRecovery(
const mozilla::gfx::IntRect& aRect, gfxImageSurface*) {
return aRect;
}
#endif

/** from cairo-xlib-utils.c, modified */
208 changes: 52 additions & 156 deletions gfx/thebes/gfxAlphaRecoverySSE2.cpp
@@ -6,26 +6,21 @@
#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsDebug.h"
#include <emmintrin.h>
#include <xsimd/xsimd.hpp>

using arch = xsimd::sse2;

// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00,
0x0000ff00, 0x0000ff00};
__declspec(align(16)) static uint32_t alphaMaski[] = {0xff000000, 0xff000000,
0xff000000, 0xff000000};
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__((aligned(16))) = {
0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00};
static uint32_t alphaMaski[] __attribute__((aligned(16))) = {
0xff000000, 0xff000000, 0xff000000, 0xff000000};
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
# pragma align 16(greenMaski, alphaMaski)
static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00};
static uint32_t alphaMaski[] = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
#endif
alignas(arch::alignment()) static const uint8_t greenMaski[] = {
0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
};
alignas(arch::alignment()) static const uint8_t alphaMaski[] = {
0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
};

bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
const gfxImageSurface* whiteSurf) {
@@ -51,8 +46,12 @@ bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
return false;
}

__m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
__m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
using batch_type = xsimd::batch<uint8_t, arch>;
constexpr size_t batch_size = batch_type::size;
static_assert(batch_size == 16);

batch_type greenMask = batch_type::load_aligned(greenMaski);
batch_type alphaMask = batch_type::load_aligned(alphaMaski);

for (int32_t i = 0; i < size.height; ++i) {
int32_t j = 0;
@@ -69,48 +68,48 @@ bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
// management and makes it about 5% faster than with only the 4 pixel
// at a time loop.
for (; j < size.width - 8; j += 8) {
__m128i black1 = _mm_load_si128((__m128i*)blackData);
__m128i white1 = _mm_load_si128((__m128i*)whiteData);
__m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
__m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
auto black1 = batch_type::load_aligned(blackData);
auto white1 = batch_type::load_aligned(whiteData);
auto black2 = batch_type::load_aligned(blackData + batch_size);
auto white2 = batch_type::load_aligned(whiteData + batch_size);

// Execute the same instructions as described in RecoverPixel, only
// using an SSE2 packed saturated subtract.
white1 = _mm_subs_epu8(white1, black1);
white2 = _mm_subs_epu8(white2, black2);
white1 = _mm_subs_epu8(greenMask, white1);
white2 = _mm_subs_epu8(greenMask, white2);
// Producing the final black pixel in an XMM register and storing
// using a packed saturated subtract.
white1 = xsimd::ssub(white1, black1);
white2 = xsimd::ssub(white2, black2);
white1 = xsimd::ssub(greenMask, white1);
white2 = xsimd::ssub(greenMask, white2);
// Producing the final black pixel in a register and storing
// that is actually faster than doing a masked store since that
// does an unaligned storage. We have the black pixel in a register
// anyway.
black1 = _mm_andnot_si128(alphaMask, black1);
black2 = _mm_andnot_si128(alphaMask, black2);
white1 = _mm_slli_si128(white1, 2);
white2 = _mm_slli_si128(white2, 2);
white1 = _mm_and_si128(alphaMask, white1);
white2 = _mm_and_si128(alphaMask, white2);
black1 = _mm_or_si128(white1, black1);
black2 = _mm_or_si128(white2, black2);

_mm_store_si128((__m128i*)blackData, black1);
_mm_store_si128((__m128i*)(blackData + 16), black2);
blackData += 32;
whiteData += 32;
black1 = xsimd::bitwise_andnot(black1, alphaMask);
black2 = xsimd::bitwise_andnot(black2, alphaMask);
white1 = xsimd::slide_left<2>(white1);
white2 = xsimd::slide_left<2>(white2);
white1 &= alphaMask;
white2 &= alphaMask;
black1 |= white1;
black2 |= white2;

black1.store_aligned(blackData);
black2.store_aligned(blackData + batch_size);
blackData += 2 * batch_size;
whiteData += 2 * batch_size;
}
for (; j < size.width - 4; j += 4) {
__m128i black = _mm_load_si128((__m128i*)blackData);
__m128i white = _mm_load_si128((__m128i*)whiteData);

white = _mm_subs_epu8(white, black);
white = _mm_subs_epu8(greenMask, white);
black = _mm_andnot_si128(alphaMask, black);
white = _mm_slli_si128(white, 2);
white = _mm_and_si128(alphaMask, white);
black = _mm_or_si128(white, black);
_mm_store_si128((__m128i*)blackData, black);
blackData += 16;
whiteData += 16;
auto black = batch_type::load_aligned(blackData);
auto white = batch_type::load_aligned(whiteData);

white = xsimd::ssub(white, black);
white = xsimd::ssub(greenMask, white);
black = xsimd::bitwise_andnot(black, alphaMask);
white = xsimd::slide_left<2>(white);
white &= alphaMask;
black |= white;
black.store_aligned(blackData);
blackData += batch_size;
whiteData += batch_size;
}
// Loop single pixels until we're done.
while (j < size.width) {
Expand All @@ -129,106 +128,3 @@ bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,

return true;
}

static int32_t ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0,
int32_t aStride = 1) {
return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}

/*static*/ mozilla::gfx::IntRect gfxAlphaRecovery::AlignRectForSubimageRecovery(
const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface) {
NS_ASSERTION(
mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
"Thebes grew support for non-ARGB32 COLOR_ALPHA?");
static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
static const int32_t bpp = 4;
static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
//
// We're going to create a subimage of the surface with size
// <sw,sh> for alpha recovery, and want a SIMD fast-path. The
// rect <x,y, w,h> /needs/ to be redrawn, but it might not be
// properly aligned for SIMD. So we want to find a rect <x',y',
// w',h'> that's a superset of what needs to be redrawn but is
// properly aligned. Proper alignment is
//
// BPP * (x' + y' * sw) \cong 0 (mod ALIGN)
// BPP * w' \cong BPP * sw (mod ALIGN)
//
// (We assume the pixel at surface <0,0> is already ALIGN'd.)
// That rect (obviously) has to fit within the surface bounds, and
// we should also minimize the extra pixels redrawn only for
// alignment's sake. So we also want
//
// minimize <x',y', w',h'>
// 0 <= x' <= x
// 0 <= y' <= y
// w <= w' <= sw
// h <= h' <= sh
//
// This is a messy integer non-linear programming problem, except
// ... we can assume that ALIGN/BPP is a very small constant. So,
// brute force is viable. The algorithm below will find a
// solution if one exists, but isn't guaranteed to find the
// minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
// most 64 iterations below). In what's likely the common case,
// an already-aligned rectangle, it only needs 1 iteration.
//
// Is this alignment worth doing? Recovering alpha will take work
// proportional to w*h (assuming alpha recovery computation isn't
// memory bound). This analysis can lead to O(w+h) extra work
// (with small constants). In exchange, we expect to shave off a
// ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
// w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
// only really care about the w*h >> w+h case anyway; others
// should be fast enough even with the overhead. (Unless the cost
// of repainting the expanded rect is high, but in that case
// SIMD-ized alpha recovery won't make a difference so this code
// shouldn't be called.)
//
mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
const int32_t stride = bpp * surfaceSize.width;
if (stride != aSurface->Stride()) {
NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
return aRect;
}

const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(),
h = aRect.Height();
const int32_t r = x + w;
const int32_t sw = surfaceSize.width;
const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);

// The outer two loops below keep the rightmost (|r| above) and
// bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
// return only a superset of the original rect. These loops
// search for an aligned top-left pixel by trying to expand <x,y>
// left and up by <dx,dy> pixels, respectively.
//
// Then if a properly-aligned top-left pixel is found, the
// innermost loop tries to find an aligned stride by moving the
// rightmost pixel rightward by dr.
int32_t dx, dy, dr;
for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
if (0 != ByteAlignment(kByteAlignLog2, bpp * (x - dx), y - dy, stride)) {
continue;
}
for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
if (strideAlign == ByteAlignment(kByteAlignLog2, bpp * (w + dr + dx))) {
goto FOUND_SOLUTION;
}
}
}
}

// Didn't find a solution.
return aRect;

FOUND_SOLUTION:
mozilla::gfx::IntRect solution =
mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
MOZ_ASSERT(
mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
"'Solution' extends outside surface bounds!");
return solution;
}
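For reference, the SIMD loops above implement, lane by lane, the scalar recovery that the "RecoverPixel" comment refers to: the two saturated subtracts leave 0xff - (whiteGreen - blackGreen) in the green byte, slide_left<2> moves that byte into the alpha position, and the andnot/and/or sequence splices it into the black pixel. A scalar model of that per-pixel step (an illustration, not the actual RecoverPixel helper from gfxAlphaRecovery.h), assuming little-endian BGRA with green in bits 8-15 and alpha in bits 24-31:

#include <cstdint>

// Hypothetical scalar equivalent of one 32-bit pixel of the SIMD loop.
static uint32_t RecoverPixelScalar(uint32_t black, uint32_t white) {
  // The content was drawn once over black and once over white, so the
  // difference of the green channels encodes the lost alpha.
  uint32_t blackGreen = (black >> 8) & 0xff;
  uint32_t whiteGreen = (white >> 8) & 0xff;
  uint32_t alpha = 0xff - (whiteGreen - blackGreen);
  // Keep the black surface's color channels, overwrite its alpha byte.
  return (black & 0x00ffffff) | (alpha << 24);
}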
1 change: 1 addition & 0 deletions gfx/thebes/moz.build
@@ -266,6 +266,7 @@ LOCAL_INCLUDES += [
"/dom/media/platforms/apple",
"/dom/xml",
"/gfx/cairo/cairo/src",
"/third_party/xsimd/include",
"/widget/gtk",
]

