diff --git a/gfx/thebes/gfxAlphaRecovery.h b/gfx/thebes/gfxAlphaRecovery.h
index 53b38f2ac0bd7..d8dd06de7fc11 100644
--- a/gfx/thebes/gfxAlphaRecovery.h
+++ b/gfx/thebes/gfxAlphaRecovery.h
@@ -40,25 +40,6 @@ class gfxAlphaRecovery {
    */
   static bool RecoverAlphaSSE2(gfxImageSurface* blackSurface,
                                const gfxImageSurface* whiteSurface);
-
-  /**
-   * A common use-case for alpha recovery is to paint into a
-   * temporary "white image", then paint onto a subrect of the
-   * surface, the "black image", into which alpha-recovered pixels
-   * are eventually to be written. This function returns a rect
-   * aligned so that recovering alpha for that rect will hit SIMD
-   * fast-paths, if possible. It's not always possible to align
-   * |aRect| so that fast-paths will be taken.
-   *
-   * The returned rect is always a superset of |aRect|.
-   */
-  static mozilla::gfx::IntRect AlignRectForSubimageRecovery(
-      const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface);
-#else
-  static mozilla::gfx::IntRect AlignRectForSubimageRecovery(
-      const mozilla::gfx::IntRect& aRect, gfxImageSurface*) {
-    return aRect;
-  }
 #endif
 
   /** from cairo-xlib-utils.c, modified */
diff --git a/gfx/thebes/gfxAlphaRecoverySSE2.cpp b/gfx/thebes/gfxAlphaRecoverySSE2.cpp
index d64cb18badf5d..f962fdd4f71c8 100644
--- a/gfx/thebes/gfxAlphaRecoverySSE2.cpp
+++ b/gfx/thebes/gfxAlphaRecoverySSE2.cpp
@@ -6,26 +6,21 @@
 #include "gfxAlphaRecovery.h"
 #include "gfxImageSurface.h"
 #include "nsDebug.h"
-#include <emmintrin.h>
+#include <xsimd/xsimd.hpp>
+
+using arch = xsimd::sse2;
 
 // This file should only be compiled on x86 and x64 systems. Additionally,
 // you'll need to compile it with -msse2 if you're using GCC on x86.
 
-#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
-__declspec(align(16)) static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00,
-                                                      0x0000ff00, 0x0000ff00};
-__declspec(align(16)) static uint32_t alphaMaski[] = {0xff000000, 0xff000000,
-                                                      0xff000000, 0xff000000};
-#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-static uint32_t greenMaski[] __attribute__((aligned(16))) = {
-    0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00};
-static uint32_t alphaMaski[] __attribute__((aligned(16))) = {
-    0xff000000, 0xff000000, 0xff000000, 0xff000000};
-#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
-# pragma align 16(greenMaski, alphaMaski)
-static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00};
-static uint32_t alphaMaski[] = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
-#endif
+alignas(arch::alignment()) static const uint8_t greenMaski[] = {
+    0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
+    0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
+};
+alignas(arch::alignment()) static const uint8_t alphaMaski[] = {
+    0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+    0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
+};
 
 bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                         const gfxImageSurface* whiteSurf) {
@@ -51,8 +46,12 @@ bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
     return false;
   }
 
-  __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
-  __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
+  using batch_type = xsimd::batch<uint8_t, arch>;
+  constexpr size_t batch_size = batch_type::size;
+  static_assert(batch_size == 16);
+
+  batch_type greenMask = batch_type::load_aligned(greenMaski);
+  batch_type alphaMask = batch_type::load_aligned(alphaMaski);
 
   for (int32_t i = 0; i < size.height; ++i) {
     int32_t j = 0;
@@ -69,48 +68,48 @@ bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
     // management and makes it about 5% faster than with only the 4 pixel
     // at a time loop.
     for (; j < size.width - 8; j += 8) {
-      __m128i black1 = _mm_load_si128((__m128i*)blackData);
-      __m128i white1 = _mm_load_si128((__m128i*)whiteData);
-      __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
-      __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
+      auto black1 = batch_type::load_aligned(blackData);
+      auto white1 = batch_type::load_aligned(whiteData);
+      auto black2 = batch_type::load_aligned(blackData + batch_size);
+      auto white2 = batch_type::load_aligned(whiteData + batch_size);
 
       // Execute the same instructions as described in RecoverPixel, only
-      // using an SSE2 packed saturated subtract.
-      white1 = _mm_subs_epu8(white1, black1);
-      white2 = _mm_subs_epu8(white2, black2);
-      white1 = _mm_subs_epu8(greenMask, white1);
-      white2 = _mm_subs_epu8(greenMask, white2);
-      // Producing the final black pixel in an XMM register and storing
+      // using a packed saturated subtract.
+      white1 = xsimd::ssub(white1, black1);
+      white2 = xsimd::ssub(white2, black2);
+      white1 = xsimd::ssub(greenMask, white1);
+      white2 = xsimd::ssub(greenMask, white2);
+      // Producing the final black pixel in a register and storing
       // that is actually faster than doing a masked store since that
      // does an unaligned storage. We have the black pixel in a register
       // anyway.
-      black1 = _mm_andnot_si128(alphaMask, black1);
-      black2 = _mm_andnot_si128(alphaMask, black2);
-      white1 = _mm_slli_si128(white1, 2);
-      white2 = _mm_slli_si128(white2, 2);
-      white1 = _mm_and_si128(alphaMask, white1);
-      white2 = _mm_and_si128(alphaMask, white2);
-      black1 = _mm_or_si128(white1, black1);
-      black2 = _mm_or_si128(white2, black2);
-
-      _mm_store_si128((__m128i*)blackData, black1);
-      _mm_store_si128((__m128i*)(blackData + 16), black2);
-      blackData += 32;
-      whiteData += 32;
+      black1 = xsimd::bitwise_andnot(black1, alphaMask);
+      black2 = xsimd::bitwise_andnot(black2, alphaMask);
+      white1 = xsimd::slide_left<2>(white1);
+      white2 = xsimd::slide_left<2>(white2);
+      white1 &= alphaMask;
+      white2 &= alphaMask;
+      black1 |= white1;
+      black2 |= white2;
+
+      black1.store_aligned(blackData);
+      black2.store_aligned(blackData + batch_size);
+      blackData += 2 * batch_size;
+      whiteData += 2 * batch_size;
     }
     for (; j < size.width - 4; j += 4) {
-      __m128i black = _mm_load_si128((__m128i*)blackData);
-      __m128i white = _mm_load_si128((__m128i*)whiteData);
-
-      white = _mm_subs_epu8(white, black);
-      white = _mm_subs_epu8(greenMask, white);
-      black = _mm_andnot_si128(alphaMask, black);
-      white = _mm_slli_si128(white, 2);
-      white = _mm_and_si128(alphaMask, white);
-      black = _mm_or_si128(white, black);
-      _mm_store_si128((__m128i*)blackData, black);
-      blackData += 16;
-      whiteData += 16;
+      auto black = batch_type::load_aligned(blackData);
+      auto white = batch_type::load_aligned(whiteData);
+
+      white = xsimd::ssub(white, black);
+      white = xsimd::ssub(greenMask, white);
+      black = xsimd::bitwise_andnot(black, alphaMask);
+      white = xsimd::slide_left<2>(white);
+      white &= alphaMask;
+      black |= white;
+      black.store_aligned(blackData);
+      blackData += batch_size;
+      whiteData += batch_size;
     }
     // Loop single pixels until we're done.
     while (j < size.width) {
@@ -129,106 +128,3 @@ bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
 
   return true;
 }
-
-static int32_t ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0,
-                             int32_t aStride = 1) {
-  return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
-}
-
-/*static*/ mozilla::gfx::IntRect gfxAlphaRecovery::AlignRectForSubimageRecovery(
-    const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface) {
-  NS_ASSERTION(
-      mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
-      "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
-  static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
-  static const int32_t bpp = 4;
-  static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
-  //
-  // We're going to create a subimage of the surface with size <w',h'>
-  // for alpha recovery, and want a SIMD fast-path. The rect <x,y,w,h>
-  // /needs/ to be redrawn, but it might not be properly aligned for
-  // SIMD. So we want to find a rect <x',y',w',h'> that's a superset
-  // of what needs to be redrawn but is properly aligned. Proper
-  // alignment is
-  //
-  //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
-  //   BPP * w'             \cong BPP * sw  (mod ALIGN)
-  //
-  // (We assume the pixel at surface <0,0> is already ALIGN'd.)
-  // That rect (obviously) has to fit within the surface bounds, and
-  // we should also minimize the extra pixels redrawn only for
-  // alignment's sake. So we also want
-  //
-  //   minimize <x-x', y-y', w'-w, h'-h>
-  //   0 <= x' <= x
-  //   0 <= y' <= y
-  //   w <= w' <= sw
-  //   h <= h' <= sh
-  //
-  // This is a messy integer non-linear programming problem, except
-  // ... we can assume that ALIGN/BPP is a very small constant. So,
-  // brute force is viable. The algorithm below will find a
-  // solution if one exists, but isn't guaranteed to find the
-  // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
-  // most 64 iterations below). In what's likely the common case,
-  // an already-aligned rectangle, it only needs 1 iteration.
-  //
-  // Is this alignment worth doing? Recovering alpha will take work
-  // proportional to w*h (assuming alpha recovery computation isn't
-  // memory bound). This analysis can lead to O(w+h) extra work
-  // (with small constants). In exchange, we expect to shave off a
-  // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
-  // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
-  // only really care about the w*h >> w+h case anyway; others
-  // should be fast enough even with the overhead. (Unless the cost
-  // of repainting the expanded rect is high, but in that case
-  // SIMD-ized alpha recovery won't make a difference so this code
-  // shouldn't be called.)
-  //
-  mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
-  const int32_t stride = bpp * surfaceSize.width;
-  if (stride != aSurface->Stride()) {
-    NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
-    return aRect;
-  }
-
-  const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(),
-                h = aRect.Height();
-  const int32_t r = x + w;
-  const int32_t sw = surfaceSize.width;
-  const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
-
-  // The outer two loops below keep the rightmost (|r| above) and
-  // bottommost pixels in |aRect| fixed wrt <dx,dy>, to ensure that we
-  // return only a superset of the original rect. These loops
-  // search for an aligned top-left pixel by trying to expand <x,y>
-  // left and up by <dx,dy> pixels, respectively.
-  //
-  // Then if a properly-aligned top-left pixel is found, the
-  // innermost loop tries to find an aligned stride by moving the
-  // rightmost pixel rightward by dr.
-  int32_t dx, dy, dr;
-  for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
-    for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
-      if (0 != ByteAlignment(kByteAlignLog2, bpp * (x - dx), y - dy, stride)) {
-        continue;
-      }
-      for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
-        if (strideAlign == ByteAlignment(kByteAlignLog2, bpp * (w + dr + dx))) {
-          goto FOUND_SOLUTION;
-        }
-      }
-    }
-  }
-
-  // Didn't find a solution.
-  return aRect;
-
-FOUND_SOLUTION:
-  mozilla::gfx::IntRect solution =
-      mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
-  MOZ_ASSERT(
-      mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
-      "'Solution' extends outside surface bounds!");
-  return solution;
-}
diff --git a/gfx/thebes/moz.build b/gfx/thebes/moz.build
index 97f02d59f51a7..09e3800456cb8 100644
--- a/gfx/thebes/moz.build
+++ b/gfx/thebes/moz.build
@@ -266,6 +266,7 @@ LOCAL_INCLUDES += [
     "/dom/media/platforms/apple",
     "/dom/xml",
     "/gfx/cairo/cairo/src",
+    "/third_party/xsimd/include",
     "/widget/gtk",
 ]
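
For reference, the identity the vectorized hunks implement: compositing the same premultiplied pixel over black leaves C*a/255 in each channel, while compositing it over white leaves C*a/255 + (255 - a), so the green bytes of the two surfaces differ by exactly 255 - a. Two packed saturated subtracts therefore leave a in the green byte (offset 1 of each BGRA pixel), and a two-byte slide moves it into the alpha byte (offset 3). The following standalone sketch of that kernel is not part of the patch; RecoverBatch, kGreenMask/kAlphaMask, and the test values are illustrative, and it assumes the xsimd headers are on the include path and an SSE2-capable target.

// alpha_recovery_sketch.cpp -- illustrative only; mirrors the xsimd calls
// RecoverAlphaSSE2 makes on one 16-byte batch (4 BGRA pixels).
#include <cstdint>
#include <cstdio>
#include <xsimd/xsimd.hpp>

using arch = xsimd::sse2;
using batch_type = xsimd::batch<uint8_t, arch>;

// 0xff at byte offset 1 (green) and offset 3 (alpha) of each pixel.
alignas(arch::alignment()) static const uint8_t kGreenMask[16] = {
    0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
    0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00};
alignas(arch::alignment()) static const uint8_t kAlphaMask[16] = {
    0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
    0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff};

// Recover alpha for 4 pixels in place: whiteG - blackG == 255 - a, so
// a == 255 - (whiteG - blackG), computed with packed saturated subtracts.
static void RecoverBatch(uint8_t* blackData, const uint8_t* whiteData) {
  batch_type greenMask = batch_type::load_aligned(kGreenMask);
  batch_type alphaMask = batch_type::load_aligned(kAlphaMask);
  auto black = batch_type::load_aligned(blackData);
  auto white = batch_type::load_aligned(whiteData);
  white = xsimd::ssub(white, black);      // green byte now holds 255 - a
  white = xsimd::ssub(greenMask, white);  // green byte now holds a, rest 0
  black = xsimd::bitwise_andnot(black, alphaMask);  // clear old alpha bytes
  white = xsimd::slide_left<2>(white);    // move a from offset 1 to offset 3
  white &= alphaMask;                     // keep only the alpha bytes
  black |= white;                         // merge recovered alpha into black
  black.store_aligned(blackData);
}

int main() {
  // A solid red pixel (a = 255) and a half-transparent red pixel (a = 128),
  // premultiplied and composited over black and over white; each pair is
  // repeated to fill the 4-pixel batch. Alpha bytes start at 0 to show that
  // the kernel writes them.
  alignas(arch::alignment()) uint8_t black[16] = {0, 0, 255, 0, 0, 0, 128, 0,
                                                  0, 0, 255, 0, 0, 0, 128, 0};
  alignas(arch::alignment()) uint8_t white[16] = {
      0, 0, 255, 0, 127, 127, 255, 0, 0, 0, 255, 0, 127, 127, 255, 0};
  RecoverBatch(black, white);
  printf("alpha[0]=%d alpha[1]=%d\n", black[3], black[7]);  // 255 and 128
  return 0;
}

Built with something like g++ -msse2 -O2 alpha_recovery_sketch.cpp (hypothetical file name), this should print alpha[0]=255 alpha[1]=128, recovering the alphas the two test pixels were composited with.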