diff --git a/decode_idct.cl b/decode_idct.cl index d2a3249..5ca17f6 100644 --- a/decode_idct.cl +++ b/decode_idct.cl @@ -9,6 +9,8 @@ typedef unsigned char JSAMPLE; typedef unsigned int JDIMENSION; typedef int INT32; typedef short INT16; +typedef float FAST_FLOAT; +typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */ #define MULTIPLIER int /* type for fastest integer multiply */ typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */ struct ComponentInfo @@ -54,29 +56,270 @@ struct DecodeInfo #define RIGHT_SHIFT(x,shft) ((x) >> (shft)) #define RANGE_MASK (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */ +// void inverse_DCT(__global struct DecodeInfo * cinfo, +// __global struct ComponentInfo * compptr, +// __global JCOEF * coef_block, +// __global JSAMPLE * output_buf, +// JDIMENSION output_col) +// { +// INT32 tmp0, tmp1, tmp2, tmp3; +// INT32 tmp10, tmp11, tmp12, tmp13; +// INT32 z1, z2, z3, z4, z5; +// __global JCOEF * inptr; +// __global ISLOW_MULT_TYPE * quantptr; +// int * wsptr; +// __global JSAMPLE * outptr; +// __global JSAMPLE *range_limit = IDCT_range_limit(cinfo); +// int ctr; +// int workspace[DCTSIZE2]; /* buffers data between passes */ +// +// /* Pass 1: process columns from input, store into work array. */ +// /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ +// /* furthermore, we scale the results by 2**PASS1_BITS. */ +// +// inptr = coef_block; +// quantptr = (__global ISLOW_MULT_TYPE *) compptr->dct_table; +// wsptr = workspace; +// for (ctr = DCTSIZE; ctr > 0; ctr--) { +// /* Due to quantization, we will usually find that many of the input +// * coefficients are zero, especially the AC terms. We can exploit this +// * by short-circuiting the IDCT calculation for any column in which all +// * the AC terms are zero. In that case each output is equal to the +// * DC coefficient (with scale factor as needed). +// * With typical images and quantization tables, half or more of the +// * column DCT calculations can be simplified this way. +// */ +// +// if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && +// inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && +// inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && +// inptr[DCTSIZE*7] == 0) { +// /* AC terms all zero */ +// int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; +// +// wsptr[DCTSIZE*0] = dcval; +// wsptr[DCTSIZE*1] = dcval; +// wsptr[DCTSIZE*2] = dcval; +// wsptr[DCTSIZE*3] = dcval; +// wsptr[DCTSIZE*4] = dcval; +// wsptr[DCTSIZE*5] = dcval; +// wsptr[DCTSIZE*6] = dcval; +// wsptr[DCTSIZE*7] = dcval; +// +// inptr++; /* advance pointers to next column */ +// quantptr++; +// wsptr++; +// continue; +// } +// /* Even part: reverse the even part of the forward DCT. */ +// /* The rotator is sqrt(2)*c(-6). */ +// +// z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); +// z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); +// +// z1 = MULTIPLY(z2 + z3, FIX_0_541196100); +// tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); +// tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); +// +// z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); +// z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); +// +// tmp0 = (z2 + z3) << CONST_BITS; +// tmp1 = (z2 - z3) << CONST_BITS; +// +// tmp10 = tmp0 + tmp3; +// tmp13 = tmp0 - tmp3; +// tmp11 = tmp1 + tmp2; +// tmp12 = tmp1 - tmp2; +// +// /* Odd part per figure 8; the matrix is unitary and hence its +// * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. +// */ +// +// tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); +// tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); +// tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); +// tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); +// +// z1 = tmp0 + tmp3; +// z2 = tmp1 + tmp2; +// z3 = tmp0 + tmp2; +// z4 = tmp1 + tmp3; +// z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ +// +// tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ +// tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ +// tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ +// tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ +// z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ +// z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ +// z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ +// z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ +// +// z3 += z5; +// z4 += z5; +// +// tmp0 += z1 + z3; +// tmp1 += z2 + z4; +// tmp2 += z2 + z3; +// tmp3 += z1 + z4; +// +// /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ +// +// wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); +// wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); +// +// inptr++; /* advance pointers to next column */ +// quantptr++; +// wsptr++; +// } +// +// /* Pass 2: process rows from work array, store into output array. */ +// /* Note that we must descale the results by a factor of 8 == 2**3, */ +// /* and also undo the PASS1_BITS scaling. */ +// +// wsptr = workspace; +// for (ctr = 0; ctr < DCTSIZE; ctr++) { +// outptr = output_buf + ctr * compptr->row_buffer_size + output_col; +// /* Rows of zeroes can be exploited in the same way as we did with columns. +// * However, the column calculation has created many nonzero AC terms, so +// * the simplification applies less often (typically 5% to 10% of the time). +// * On machines with very fast multiplication, it's possible that the +// * test takes more time than it's worth. In that case this section +// * may be commented out. +// */ +// +// #ifndef NO_ZERO_ROW_TEST +// if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && +// wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { +// /* AC terms all zero */ +// JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) +// & RANGE_MASK]; +// +// outptr[0] = dcval; +// outptr[1] = dcval; +// outptr[2] = dcval; +// outptr[3] = dcval; +// outptr[4] = dcval; +// outptr[5] = dcval; +// outptr[6] = dcval; +// outptr[7] = dcval; +// +// wsptr += DCTSIZE; /* advance pointer to next row */ +// continue; +// } +// #endif +// +// /* Even part: reverse the even part of the forward DCT. */ +// /* The rotator is sqrt(2)*c(-6). */ +// +// z2 = (INT32) wsptr[2]; +// z3 = (INT32) wsptr[6]; +// +// z1 = MULTIPLY(z2 + z3, FIX_0_541196100); +// tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); +// tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); +// +// tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; +// tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; +// +// tmp10 = tmp0 + tmp3; +// tmp13 = tmp0 - tmp3; +// tmp11 = tmp1 + tmp2; +// tmp12 = tmp1 - tmp2; +// +// /* Odd part per figure 8; the matrix is unitary and hence its +// * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. +// */ +// +// tmp0 = (INT32) wsptr[7]; +// tmp1 = (INT32) wsptr[5]; +// tmp2 = (INT32) wsptr[3]; +// tmp3 = (INT32) wsptr[1]; +// +// z1 = tmp0 + tmp3; +// z2 = tmp1 + tmp2; +// z3 = tmp0 + tmp2; +// z4 = tmp1 + tmp3; +// z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ +// +// tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ +// tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ +// tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ +// tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ +// z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ +// z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ +// z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ +// z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ +// +// z3 += z5; +// z4 += z5; +// +// tmp0 += z1 + z3; +// tmp1 += z2 + z4; +// tmp2 += z2 + z3; +// tmp3 += z1 + z4; +// +// /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ +// +// outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, +// CONST_BITS+PASS1_BITS+3) +// & RANGE_MASK]; +// +// wsptr += DCTSIZE; /* advance pointer to next row */ +// } +// } + + + void inverse_DCT(__global struct DecodeInfo * cinfo, __global struct ComponentInfo * compptr, __global JCOEF * coef_block, __global JSAMPLE * output_buf, JDIMENSION output_col) { - INT32 tmp0, tmp1, tmp2, tmp3; - INT32 tmp10, tmp11, tmp12, tmp13; - INT32 z1, z2, z3, z4, z5; + FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + FAST_FLOAT tmp10, tmp11, tmp12, tmp13; + FAST_FLOAT z5, z10, z11, z12, z13; __global JCOEF * inptr; - __global ISLOW_MULT_TYPE * quantptr; - int * wsptr; + __global FLOAT_MULT_TYPE * quantptr; + FAST_FLOAT * wsptr; __global JSAMPLE * outptr; __global JSAMPLE *range_limit = IDCT_range_limit(cinfo); int ctr; - int workspace[DCTSIZE2]; /* buffers data between passes */ + FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */ /* Pass 1: process columns from input, store into work array. */ - /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ - /* furthermore, we scale the results by 2**PASS1_BITS. */ inptr = coef_block; - quantptr = (__global ISLOW_MULT_TYPE *) compptr->dct_table; + quantptr = (__global FLOAT_MULT_TYPE *) compptr->dct_table; wsptr = workspace; for (ctr = DCTSIZE; ctr > 0; ctr--) { /* Due to quantization, we will usually find that many of the input @@ -93,7 +336,7 @@ void inverse_DCT(__global struct DecodeInfo * cinfo, inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) { /* AC terms all zero */ - int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS; + FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; @@ -109,78 +352,64 @@ void inverse_DCT(__global struct DecodeInfo * cinfo, wsptr++; continue; } - /* Even part: reverse the even part of the forward DCT. */ - /* The rotator is sqrt(2)*c(-6). */ - - z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); - z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); - z1 = MULTIPLY(z2 + z3, FIX_0_541196100); - tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); - tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); - - z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); - z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + /* Even part */ - tmp0 = (z2 + z3) << CONST_BITS; - tmp1 = (z2 - z3) << CONST_BITS; - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - /* Odd part per figure 8; the matrix is unitary and hence its - * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. - */ - - tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); - tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); - tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); - tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); - - z1 = tmp0 + tmp3; - z2 = tmp1 + tmp2; - z3 = tmp0 + tmp2; - z4 = tmp1 + tmp3; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - tmp0 += z1 + z3; - tmp1 += z2 + z4; - tmp2 += z2 + z3; - tmp3 += z1 + z4; - - /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); - wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); + tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); + tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); + tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); + tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); + + tmp10 = tmp0 + tmp2; /* phase 3 */ + tmp11 = tmp0 - tmp2; + + tmp13 = tmp1 + tmp3; /* phases 5-3 */ + tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */ + + tmp0 = tmp10 + tmp13; /* phase 2 */ + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + /* Odd part */ + + tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); + tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); + tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); + tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); + + z13 = tmp6 + tmp5; /* phase 6 */ + z10 = tmp6 - tmp5; + z11 = tmp4 + tmp7; + z12 = tmp4 - tmp7; + + tmp7 = z11 + z13; /* phase 5 */ + tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */ + + z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ + tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ + tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ + + tmp6 = tmp12 - tmp7; /* phase 2 */ + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + wsptr[DCTSIZE*0] = tmp0 + tmp7; + wsptr[DCTSIZE*7] = tmp0 - tmp7; + wsptr[DCTSIZE*1] = tmp1 + tmp6; + wsptr[DCTSIZE*6] = tmp1 - tmp6; + wsptr[DCTSIZE*2] = tmp2 + tmp5; + wsptr[DCTSIZE*5] = tmp2 - tmp5; + wsptr[DCTSIZE*4] = tmp3 + tmp4; + wsptr[DCTSIZE*3] = tmp3 - tmp4; + inptr++; /* advance pointers to next column */ quantptr++; wsptr++; } /* Pass 2: process rows from work array, store into output array. */ - /* Note that we must descale the results by a factor of 8 == 2**3, */ - /* and also undo the PASS1_BITS scaling. */ + /* Note that we must descale the results by a factor of 8 == 2**3. */ wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr++) { @@ -188,107 +417,57 @@ void inverse_DCT(__global struct DecodeInfo * cinfo, /* Rows of zeroes can be exploited in the same way as we did with columns. * However, the column calculation has created many nonzero AC terms, so * the simplification applies less often (typically 5% to 10% of the time). - * On machines with very fast multiplication, it's possible that the - * test takes more time than it's worth. In that case this section - * may be commented out. + * And testing floats for zero is relatively expensive, so we don't bother. */ -#ifndef NO_ZERO_ROW_TEST - if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 && - wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) { - /* AC terms all zero */ - JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3) - & RANGE_MASK]; - - outptr[0] = dcval; - outptr[1] = dcval; - outptr[2] = dcval; - outptr[3] = dcval; - outptr[4] = dcval; - outptr[5] = dcval; - outptr[6] = dcval; - outptr[7] = dcval; - - wsptr += DCTSIZE; /* advance pointer to next row */ - continue; - } -#endif - - /* Even part: reverse the even part of the forward DCT. */ - /* The rotator is sqrt(2)*c(-6). */ - - z2 = (INT32) wsptr[2]; - z3 = (INT32) wsptr[6]; - - z1 = MULTIPLY(z2 + z3, FIX_0_541196100); - tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); - tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); - - tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS; - tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS; - - tmp10 = tmp0 + tmp3; - tmp13 = tmp0 - tmp3; - tmp11 = tmp1 + tmp2; - tmp12 = tmp1 - tmp2; - - /* Odd part per figure 8; the matrix is unitary and hence its - * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. - */ - - tmp0 = (INT32) wsptr[7]; - tmp1 = (INT32) wsptr[5]; - tmp2 = (INT32) wsptr[3]; - tmp3 = (INT32) wsptr[1]; - - z1 = tmp0 + tmp3; - z2 = tmp1 + tmp2; - z3 = tmp0 + tmp2; - z4 = tmp1 + tmp3; - z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - - tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - - z3 += z5; - z4 += z5; - - tmp0 += z1 + z3; - tmp1 += z2 + z4; - tmp2 += z2 + z3; - tmp3 += z1 + z4; - - /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ - - outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3, - CONST_BITS+PASS1_BITS+3) + /* Even part */ + + tmp10 = wsptr[0] + wsptr[4]; + tmp11 = wsptr[0] - wsptr[4]; + + tmp13 = wsptr[2] + wsptr[6]; + tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13; + + tmp0 = tmp10 + tmp13; + tmp3 = tmp10 - tmp13; + tmp1 = tmp11 + tmp12; + tmp2 = tmp11 - tmp12; + + /* Odd part */ + + z13 = wsptr[5] + wsptr[3]; + z10 = wsptr[5] - wsptr[3]; + z11 = wsptr[1] + wsptr[7]; + z12 = wsptr[1] - wsptr[7]; + + tmp7 = z11 + z13; + tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); + + z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ + tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ + tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ + + tmp6 = tmp12 - tmp7; + tmp5 = tmp11 - tmp6; + tmp4 = tmp10 + tmp5; + + /* Final output stage: scale down by a factor of 8 and range-limit */ + + outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3) & RANGE_MASK]; - outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3, - CONST_BITS+PASS1_BITS+3) + outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3) & RANGE_MASK]; - outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2, - CONST_BITS+PASS1_BITS+3) + outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3) & RANGE_MASK]; - outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2, - CONST_BITS+PASS1_BITS+3) + outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3) & RANGE_MASK]; - outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1, - CONST_BITS+PASS1_BITS+3) + outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3) & RANGE_MASK]; - outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1, - CONST_BITS+PASS1_BITS+3) + outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3) & RANGE_MASK]; - outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0, - CONST_BITS+PASS1_BITS+3) + outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3) & RANGE_MASK]; - outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0, - CONST_BITS+PASS1_BITS+3) + outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3) & RANGE_MASK]; wsptr += DCTSIZE; /* advance pointer to next row */ diff --git a/djpeg.c b/djpeg.c index 7d4ea3f..8f47435 100644 --- a/djpeg.c +++ b/djpeg.c @@ -543,6 +543,7 @@ main (int argc, char **argv) /* Adjust default decompression parameters by re-parsing the options */ file_index = parse_switches(&cinfo, argc, argv, 0, TRUE); + cinfo.dct_method = JDCT_FLOAT; /* Initialize the output module now to let it override any crucial * option settings (for instance, GIF wants to force color quantization). */ diff --git a/jdcoefct.c b/jdcoefct.c index 1ac5e6c..7e1fa41 100644 --- a/jdcoefct.c +++ b/jdcoefct.c @@ -234,6 +234,8 @@ struct DecodeInfo JSAMPLE sample_range_limit[(5 * (MAXJSAMPLE+1) + CENTERJSAMPLE)]; struct ComponentInfo component_infos[MAX_COMPONENT_INFO_COUNT]; }; + + METHODDEF(int) decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) { @@ -251,70 +253,70 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) cl_kernel my_kernel; cl_program my_program; - + for (componets_mcu_width = 0 ,ci = 0; ci < cinfo->comps_in_scan; ci++) { compptr = cinfo->cur_comp_info[ci]; componets_mcu_width += compptr->MCU_width * compptr->MCU_height; } - for( yheightoffset = 0 ; yheightoffset < cinfo->total_iMCU_rows ; ++ yheightoffset) - { - /* Loop to process as much as one whole iMCU row */ - for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col; MCU_col_num++) { - /* Determine where data should go in output_buf and do the IDCT thing. - * We skip dummy blocks at the right and bottom edges (but blkn gets - * incremented past them!). Note the inner loop relies on having - * allocated the MCU_buffer[] blocks sequentially. - */ - for (ci = 0; ci < cinfo->comps_in_scan; ci++) { - JBLOCK * sCurrentBlock; - - compptr = cinfo->cur_comp_info[ci]; - /* Don't bother to IDCT an uninteresting component. */ - if (! compptr->component_needed) { - cinfo->decoded_mcus_current += compptr->MCU_blocks; - continue; - } - inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index]; - useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width - : compptr->last_col_width; - { - int k; - for ( k = ci , cur_row = **output_buf ; k > 0 ; --k) - { - cur_row += compptr[ -k ].image_buffer_size; - } - cur_row += yheightoffset * compptr->DCT_scaled_size * compptr->row_buffer_size ; - } - output_ptr = &cur_row; - - start_col = MCU_col_num * compptr->MCU_sample_width; - for (yindex = 0; yindex < compptr->MCU_height; yindex++) { - output_col = start_col; - sCurrentBlock = cinfo->decoded_mcus_base + (( yheightoffset * cinfo->MCUs_per_row + MCU_col_num) * componets_mcu_width) ; - { - int k; - for( k = ci ; k > 0 ; --k) - { - sCurrentBlock += compptr[-k].MCU_width; - } - } - for (xindex = 0; xindex < useful_width; xindex++) { - (*inverse_DCT) (cinfo, compptr, - (JCOEFPTR) (sCurrentBlock + xindex), - output_ptr, output_col); - output_col += compptr->DCT_scaled_size; - } - sCurrentBlock += compptr->MCU_width; - cur_row += compptr->DCT_scaled_size * compptr->row_buffer_size ; - } - } - } - /* Completed an MCU row, but perhaps not an iMCU row */ - coef->MCU_ctr = 0; - - /* Completed the iMCU row, advance counters for next one */ - } + // for( yheightoffset = 0 ; yheightoffset < cinfo->total_iMCU_rows ; ++ yheightoffset) + // { + // /* Loop to process as much as one whole iMCU row */ + // for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col; MCU_col_num++) { + // /* Determine where data should go in output_buf and do the IDCT thing. + // * We skip dummy blocks at the right and bottom edges (but blkn gets + // * incremented past them!). Note the inner loop relies on having + // * allocated the MCU_buffer[] blocks sequentially. + // */ + // for (ci = 0; ci < cinfo->comps_in_scan; ci++) { + // JBLOCK * sCurrentBlock; + + // compptr = cinfo->cur_comp_info[ci]; + // /* Don't bother to IDCT an uninteresting component. */ + // if (! compptr->component_needed) { + // cinfo->decoded_mcus_current += compptr->MCU_blocks; + // continue; + // } + // inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index]; + // useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width + // : compptr->last_col_width; + // { + // int k; + // for ( k = ci , cur_row = **output_buf ; k > 0 ; --k) + // { + // cur_row += compptr[ -k ].image_buffer_size; + // } + // cur_row += yheightoffset * compptr->DCT_scaled_size * compptr->row_buffer_size ; + // } + // output_ptr = &cur_row; + + // start_col = MCU_col_num * compptr->MCU_sample_width; + // for (yindex = 0; yindex < compptr->MCU_height; yindex++) { + // output_col = start_col; + // sCurrentBlock = cinfo->decoded_mcus_base + (( yheightoffset * cinfo->MCUs_per_row + MCU_col_num) * componets_mcu_width) ; + // { + // int k; + // for( k = ci ; k > 0 ; --k) + // { + // sCurrentBlock += compptr[-k].MCU_width; + // } + // } + // for (xindex = 0; xindex < useful_width; xindex++) { + // (*inverse_DCT) (cinfo, compptr, + // (JCOEFPTR) (sCurrentBlock + xindex), + // output_ptr, output_col); + // output_col += compptr->DCT_scaled_size; + // } + // sCurrentBlock += compptr->MCU_width; + // cur_row += compptr->DCT_scaled_size * compptr->row_buffer_size ; + // } + // } + // } + // /* Completed an MCU row, but perhaps not an iMCU row */ + // coef->MCU_ctr = 0; + + // /* Completed the iMCU row, advance counters for next one */ + // } cinfo->output_iMCU_row = cinfo->total_iMCU_rows; cinfo->input_iMCU_row = cinfo->total_iMCU_rows; /* Completed the scan */ @@ -334,7 +336,7 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) cl_mem my_cl_output_buffer; size_t work_dim[3]; size_t local_work_dim[3]; - JSAMPLE * from_cl_output; + // JSAMPLE * from_cl_output; extern int read_all_bytes(const char * aFile,char ** aContent); @@ -419,13 +421,14 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) NULL, NULL, NULL); - from_cl_output = malloc(sizeof(JSAMPLE) * previous_image_size); + clFinish(cinfo->current_cl_queue); + // from_cl_output = malloc(sizeof(JSAMPLE) * previous_image_size); error_code = clEnqueueReadBuffer(cinfo->current_cl_queue, my_cl_output_buffer, CL_TRUE, 0, sizeof(JSAMPLE) * previous_image_size, - from_cl_output, + **output_buf, 0, NULL, NULL); @@ -434,11 +437,12 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf) clReleaseMemObject(my_cl_output_buffer); clReleaseKernel(dct_kernel); clReleaseProgram(my_program); - if(0 != memcmp(from_cl_output,**output_buf,sizeof(JSAMPLE) * previous_image_size)) - { - fprintf(stderr,"LIN:Failed\n"); - } - free(from_cl_output); + // if(0 != memcmp(from_cl_output,**output_buf,sizeof(JSAMPLE) * previous_image_size)) + // { + // fprintf(stderr,"LIN:Failed\n"); + // } + // memcpy(**output_buf,from_cl_output,sizeof(JSAMPLE) * previous_image_size); + // free(from_cl_output); } return JPEG_SCAN_COMPLETED; diff --git a/jidctflt.c b/jidctflt.c index 0188ce3..b7af262 100644 --- a/jidctflt.c +++ b/jidctflt.c @@ -178,7 +178,7 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr++) { - outptr = output_buf[ctr] + output_col; + outptr = (*output_buf) + ctr * compptr->row_buffer_size + output_col; /* Rows of zeroes can be exploited in the same way as we did with columns. * However, the column calculation has created many nonzero AC terms, so * the simplification applies less often (typically 5% to 10% of the time). diff --git a/jmorecfg.h b/jmorecfg.h index 54a7d1c..6a52005 100644 --- a/jmorecfg.h +++ b/jmorecfg.h @@ -260,8 +260,8 @@ typedef int boolean; /* Capability options common to encoder and decoder: */ -#define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ -#define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ +// #define DCT_ISLOW_SUPPORTED /* slow but accurate integer algorithm */ +// #define DCT_IFAST_SUPPORTED /* faster, less accurate integer method */ #define DCT_FLOAT_SUPPORTED /* floating-point: accurate, fast on fast HW */ /* Encoder capability options: */