diff --git a/decode_idct.cl b/decode_idct.cl
index d2a3249..5ca17f6 100644
--- a/decode_idct.cl
+++ b/decode_idct.cl
@@ -9,6 +9,8 @@ typedef unsigned char JSAMPLE;
 typedef unsigned int JDIMENSION;
 typedef int INT32;
 typedef short INT16;
+typedef float FAST_FLOAT;
+typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 #define MULTIPLIER  int		/* type for fastest integer multiply */
 typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
 struct ComponentInfo
@@ -54,29 +56,270 @@ struct DecodeInfo
 #define RIGHT_SHIFT(x,shft)	((x) >> (shft))
 #define RANGE_MASK  (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */
 
+// void inverse_DCT(__global struct DecodeInfo * cinfo,
+//                 __global struct ComponentInfo * compptr,
+//                 __global JCOEF * coef_block,
+//                 __global JSAMPLE * output_buf,
+//                 JDIMENSION output_col)
+// {
+//   INT32 tmp0, tmp1, tmp2, tmp3;
+//   INT32 tmp10, tmp11, tmp12, tmp13;
+//   INT32 z1, z2, z3, z4, z5;
+//   __global JCOEF * inptr;
+//   __global ISLOW_MULT_TYPE * quantptr;
+//   int * wsptr;
+//   __global JSAMPLE * outptr;
+//   __global JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+//   int ctr;
+//   int workspace[DCTSIZE2];	/* buffers data between passes */
+// 
+//   /* Pass 1: process columns from input, store into work array. */
+//   /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
+//   /* furthermore, we scale the results by 2**PASS1_BITS. */
+// 
+//   inptr = coef_block;
+//   quantptr = (__global ISLOW_MULT_TYPE *) compptr->dct_table;
+//   wsptr = workspace;
+//   for (ctr = DCTSIZE; ctr > 0; ctr--) {
+//     /* Due to quantization, we will usually find that many of the input
+//      * coefficients are zero, especially the AC terms.  We can exploit this
+//      * by short-circuiting the IDCT calculation for any column in which all
+//      * the AC terms are zero.  In that case each output is equal to the
+//      * DC coefficient (with scale factor as needed).
+//      * With typical images and quantization tables, half or more of the
+//      * column DCT calculations can be simplified this way.
+//      */
+//     
+//     if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
+// 	inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
+// 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
+// 	inptr[DCTSIZE*7] == 0) {
+//       /* AC terms all zero */
+//       int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+//       
+//       wsptr[DCTSIZE*0] = dcval;
+//       wsptr[DCTSIZE*1] = dcval;
+//       wsptr[DCTSIZE*2] = dcval;
+//       wsptr[DCTSIZE*3] = dcval;
+//       wsptr[DCTSIZE*4] = dcval;
+//       wsptr[DCTSIZE*5] = dcval;
+//       wsptr[DCTSIZE*6] = dcval;
+//       wsptr[DCTSIZE*7] = dcval;
+//       
+//       inptr++;			/* advance pointers to next column */
+//       quantptr++;
+//       wsptr++;
+//       continue;
+//     }
+//     /* Even part: reverse the even part of the forward DCT. */
+//     /* The rotator is sqrt(2)*c(-6). */
+//     
+//     z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+//     z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+//     
+//     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+//     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+//     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+//     
+//     z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+//     z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+// 
+//     tmp0 = (z2 + z3) << CONST_BITS;
+//     tmp1 = (z2 - z3) << CONST_BITS;
+//     
+//     tmp10 = tmp0 + tmp3;
+//     tmp13 = tmp0 - tmp3;
+//     tmp11 = tmp1 + tmp2;
+//     tmp12 = tmp1 - tmp2;
+//     
+//     /* Odd part per figure 8; the matrix is unitary and hence its
+//      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+//      */
+//     
+//     tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+//     tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+//     tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+//     tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+//     
+//     z1 = tmp0 + tmp3;
+//     z2 = tmp1 + tmp2;
+//     z3 = tmp0 + tmp2;
+//     z4 = tmp1 + tmp3;
+//     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+//     
+//     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+//     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+//     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+//     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+//     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+//     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+//     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+//     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+//     
+//     z3 += z5;
+//     z4 += z5;
+//     
+//     tmp0 += z1 + z3;
+//     tmp1 += z2 + z4;
+//     tmp2 += z2 + z3;
+//     tmp3 += z1 + z4;
+//     
+//     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+//     
+//     wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
+//     wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+//     
+//     inptr++;			/* advance pointers to next column */
+//     quantptr++;
+//     wsptr++;
+//   }
+//   
+//   /* Pass 2: process rows from work array, store into output array. */
+//   /* Note that we must descale the results by a factor of 8 == 2**3, */
+//   /* and also undo the PASS1_BITS scaling. */
+// 
+//   wsptr = workspace;
+//   for (ctr = 0; ctr < DCTSIZE; ctr++) {
+//     outptr = output_buf + ctr * compptr->row_buffer_size + output_col;
+//     /* Rows of zeroes can be exploited in the same way as we did with columns.
+//      * However, the column calculation has created many nonzero AC terms, so
+//      * the simplification applies less often (typically 5% to 10% of the time).
+//      * On machines with very fast multiplication, it's possible that the
+//      * test takes more time than it's worth.  In that case this section
+//      * may be commented out.
+//      */
+//     
+// #ifndef NO_ZERO_ROW_TEST
+//     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
+// 	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
+//       /* AC terms all zero */
+//       JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
+// 				  & RANGE_MASK];
+//       
+//       outptr[0] = dcval;
+//       outptr[1] = dcval;
+//       outptr[2] = dcval;
+//       outptr[3] = dcval;
+//       outptr[4] = dcval;
+//       outptr[5] = dcval;
+//       outptr[6] = dcval;
+//       outptr[7] = dcval;
+// 
+//       wsptr += DCTSIZE;		/* advance pointer to next row */
+//       continue;
+//     }
+// #endif
+//     
+//     /* Even part: reverse the even part of the forward DCT. */
+//     /* The rotator is sqrt(2)*c(-6). */
+//     
+//     z2 = (INT32) wsptr[2];
+//     z3 = (INT32) wsptr[6];
+//     
+//     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+//     tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+//     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+//     
+//     tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
+//     tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
+//     
+//     tmp10 = tmp0 + tmp3;
+//     tmp13 = tmp0 - tmp3;
+//     tmp11 = tmp1 + tmp2;
+//     tmp12 = tmp1 - tmp2;
+//     
+//     /* Odd part per figure 8; the matrix is unitary and hence its
+//      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
+//      */
+//     
+//     tmp0 = (INT32) wsptr[7];
+//     tmp1 = (INT32) wsptr[5];
+//     tmp2 = (INT32) wsptr[3];
+//     tmp3 = (INT32) wsptr[1];
+//     
+//     z1 = tmp0 + tmp3;
+//     z2 = tmp1 + tmp2;
+//     z3 = tmp0 + tmp2;
+//     z4 = tmp1 + tmp3;
+//     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+//     
+//     tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+//     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+//     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+//     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+//     z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+//     z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+//     z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+//     z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+//     
+//     z3 += z5;
+//     z4 += z5;
+//     
+//     tmp0 += z1 + z3;
+//     tmp1 += z2 + z4;
+//     tmp2 += z2 + z3;
+//     tmp3 += z1 + z4;
+//     
+//     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
+//     
+//     outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
+// 					  CONST_BITS+PASS1_BITS+3)
+// 			    & RANGE_MASK];
+//     
+//     wsptr += DCTSIZE;		/* advance pointer to next row */
+//   }
+// }
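+/* The commented-out block above is the earlier INT32 fixed-point inverse_DCT; the definition below is its FAST_FLOAT replacement. */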
+
+
 void inverse_DCT(__global struct DecodeInfo * cinfo,
                 __global struct ComponentInfo * compptr,
                 __global JCOEF * coef_block,
                 __global JSAMPLE * output_buf,
                 JDIMENSION output_col)
 {
-  INT32 tmp0, tmp1, tmp2, tmp3;
-  INT32 tmp10, tmp11, tmp12, tmp13;
-  INT32 z1, z2, z3, z4, z5;
+  FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
+  FAST_FLOAT z5, z10, z11, z12, z13;
   __global JCOEF * inptr;
-  __global ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
+  __global FLOAT_MULT_TYPE * quantptr;
+  FAST_FLOAT * wsptr;
   __global JSAMPLE * outptr;
   __global JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE2];	/* buffers data between passes */
+  FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
 
   /* Pass 1: process columns from input, store into work array. */
-  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
-  /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   inptr = coef_block;
-  quantptr = (__global ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (__global FLOAT_MULT_TYPE *) compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -93,7 +336,7 @@ void inverse_DCT(__global struct DecodeInfo * cinfo,
 	inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
 	inptr[DCTSIZE*7] == 0) {
       /* AC terms all zero */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
+      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
       
       wsptr[DCTSIZE*0] = dcval;
       wsptr[DCTSIZE*1] = dcval;
@@ -109,78 +352,64 @@ void inverse_DCT(__global struct DecodeInfo * cinfo,
       wsptr++;
       continue;
     }
-    /* Even part: reverse the even part of the forward DCT. */
-    /* The rotator is sqrt(2)*c(-6). */
-    
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
     
-    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
-    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    /* Even part */
 
-    tmp0 = (z2 + z3) << CONST_BITS;
-    tmp1 = (z2 - z3) << CONST_BITS;
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
-    tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
-    /* Odd part per figure 8; the matrix is unitary and hence its
-     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
-     */
-    
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    
-    z1 = tmp0 + tmp3;
-    z2 = tmp1 + tmp2;
-    z3 = tmp0 + tmp2;
-    z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    tmp0 += z1 + z3;
-    tmp1 += z2 + z4;
-    tmp2 += z2 + z3;
-    tmp3 += z1 + z4;
-    
-    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+
+    tmp10 = tmp0 + tmp2;	/* phase 3 */
+    tmp11 = tmp0 - tmp2;
+
+    tmp13 = tmp1 + tmp3;	/* phases 5-3 */
+    tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */
+
+    tmp0 = tmp10 + tmp13;	/* phase 2 */
+    tmp3 = tmp10 - tmp13;
+    tmp1 = tmp11 + tmp12;
+    tmp2 = tmp11 - tmp12;
     
+    /* Odd part */
+
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+
+    z13 = tmp6 + tmp5;		/* phase 6 */
+    z10 = tmp6 - tmp5;
+    z11 = tmp4 + tmp7;
+    z12 = tmp4 - tmp7;
+
+    tmp7 = z11 + z13;		/* phase 5 */
+    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
+
+    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
+    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
+    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+
+    tmp6 = tmp12 - tmp7;	/* phase 2 */
+    tmp5 = tmp11 - tmp6;
+    tmp4 = tmp10 + tmp5;
+
+    wsptr[DCTSIZE*0] = tmp0 + tmp7;
+    wsptr[DCTSIZE*7] = tmp0 - tmp7;
+    wsptr[DCTSIZE*1] = tmp1 + tmp6;
+    wsptr[DCTSIZE*6] = tmp1 - tmp6;
+    wsptr[DCTSIZE*2] = tmp2 + tmp5;
+    wsptr[DCTSIZE*5] = tmp2 - tmp5;
+    wsptr[DCTSIZE*4] = tmp3 + tmp4;
+    wsptr[DCTSIZE*3] = tmp3 - tmp4;
+
     inptr++;			/* advance pointers to next column */
     quantptr++;
     wsptr++;
   }
   
   /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3, */
-  /* and also undo the PASS1_BITS scaling. */
+  /* Note that we must descale the results by a factor of 8 == 2**3. */
 
   wsptr = workspace;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
@@ -188,107 +417,57 @@ void inverse_DCT(__global struct DecodeInfo * cinfo,
     /* Rows of zeroes can be exploited in the same way as we did with columns.
      * However, the column calculation has created many nonzero AC terms, so
      * the simplification applies less often (typically 5% to 10% of the time).
-     * On machines with very fast multiplication, it's possible that the
-     * test takes more time than it's worth.  In that case this section
-     * may be commented out.
+     * And testing floats for zero is relatively expensive, so we don't bother.
      */
     
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
-	wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-				  & RANGE_MASK];
-      
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      outptr[2] = dcval;
-      outptr[3] = dcval;
-      outptr[4] = dcval;
-      outptr[5] = dcval;
-      outptr[6] = dcval;
-      outptr[7] = dcval;
-
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-    
-    /* Even part: reverse the even part of the forward DCT. */
-    /* The rotator is sqrt(2)*c(-6). */
-    
-    z2 = (INT32) wsptr[2];
-    z3 = (INT32) wsptr[6];
-    
-    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-    
-    tmp0 = ((INT32) wsptr[0] + (INT32) wsptr[4]) << CONST_BITS;
-    tmp1 = ((INT32) wsptr[0] - (INT32) wsptr[4]) << CONST_BITS;
-    
-    tmp10 = tmp0 + tmp3;
-    tmp13 = tmp0 - tmp3;
-    tmp11 = tmp1 + tmp2;
-    tmp12 = tmp1 - tmp2;
-    
-    /* Odd part per figure 8; the matrix is unitary and hence its
-     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
-     */
-    
-    tmp0 = (INT32) wsptr[7];
-    tmp1 = (INT32) wsptr[5];
-    tmp2 = (INT32) wsptr[3];
-    tmp3 = (INT32) wsptr[1];
-    
-    z1 = tmp0 + tmp3;
-    z2 = tmp1 + tmp2;
-    z3 = tmp0 + tmp2;
-    z4 = tmp1 + tmp3;
-    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
-    
-    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
-    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
-    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
-    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
-    
-    z3 += z5;
-    z4 += z5;
-    
-    tmp0 += z1 + z3;
-    tmp1 += z2 + z4;
-    tmp2 += z2 + z3;
-    tmp3 += z1 + z4;
-    
-    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
-    
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
-					  CONST_BITS+PASS1_BITS+3)
+    /* Even part */
+
+    tmp10 = wsptr[0] + wsptr[4];
+    tmp11 = wsptr[0] - wsptr[4];
+
+    tmp13 = wsptr[2] + wsptr[6];
+    tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
+
+    tmp0 = tmp10 + tmp13;
+    tmp3 = tmp10 - tmp13;
+    tmp1 = tmp11 + tmp12;
+    tmp2 = tmp11 - tmp12;
+
+    /* Odd part */
+
+    z13 = wsptr[5] + wsptr[3];
+    z10 = wsptr[5] - wsptr[3];
+    z11 = wsptr[1] + wsptr[7];
+    z12 = wsptr[1] - wsptr[7];
+
+    tmp7 = z11 + z13;
+    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
+
+    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
+    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
+    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+
+    tmp6 = tmp12 - tmp7;
+    tmp5 = tmp11 - tmp6;
+    tmp4 = tmp10 + tmp5;
+
+    /* Final output stage: scale down by a factor of 8 and range-limit */
+
+    outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)
 			    & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)
 			    & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)
 			    & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)
 			    & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)
 			    & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)
 			    & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)
 			    & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
-					  CONST_BITS+PASS1_BITS+3)
+    outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)
 			    & RANGE_MASK];
     
     wsptr += DCTSIZE;		/* advance pointer to next row */
diff --git a/djpeg.c b/djpeg.c
index 7d4ea3f..8f47435 100644
--- a/djpeg.c
+++ b/djpeg.c
@@ -543,6 +543,7 @@ main (int argc, char **argv)
     /* Adjust default decompression parameters by re-parsing the options */
     file_index = parse_switches(&cinfo, argc, argv, 0, TRUE);
 
+    cinfo.dct_method = JDCT_FLOAT;
     /* Initialize the output module now to let it override any crucial
      * option settings (for instance, GIF wants to force color quantization).
      */
diff --git a/jdcoefct.c b/jdcoefct.c
index 1ac5e6c..7e1fa41 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -234,6 +234,8 @@ struct DecodeInfo
    JSAMPLE  sample_range_limit[(5 * (MAXJSAMPLE+1) + CENTERJSAMPLE)]; 
    struct ComponentInfo component_infos[MAX_COMPONENT_INFO_COUNT]; 
 };
+
+
     METHODDEF(int)
 decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
@@ -251,70 +253,70 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     cl_kernel my_kernel;
     cl_program my_program;
 
-
+    
     for (componets_mcu_width = 0 ,ci = 0; ci < cinfo->comps_in_scan; ci++) {
         compptr = cinfo->cur_comp_info[ci];
         componets_mcu_width += compptr->MCU_width * compptr->MCU_height;
     }
 
-    for( yheightoffset = 0 ; yheightoffset < cinfo->total_iMCU_rows ; ++ yheightoffset)
-    {
-        /* Loop to process as much as one whole iMCU row */
-        for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col; MCU_col_num++) {
-            /* Determine where data should go in output_buf and do the IDCT thing.
-             * We skip dummy blocks at the right and bottom edges (but blkn gets
-             * incremented past them!).  Note the inner loop relies on having
-             * allocated the MCU_buffer[] blocks sequentially.
-             */
-            for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-                JBLOCK * sCurrentBlock;
-
-                compptr = cinfo->cur_comp_info[ci];
-                /* Don't bother to IDCT an uninteresting component. */
-                if (! compptr->component_needed) {
-                    cinfo->decoded_mcus_current += compptr->MCU_blocks;
-                    continue;
-                }
-                inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-                useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                    : compptr->last_col_width;
-                {
-                    int k;
-                    for ( k = ci , cur_row = **output_buf ; k > 0 ; --k)
-                    {
-                        cur_row += compptr[ -k ].image_buffer_size;
-                    }
-                    cur_row +=  yheightoffset * compptr->DCT_scaled_size * compptr->row_buffer_size ;
-                }
-                output_ptr = &cur_row;
-
-                start_col = MCU_col_num * compptr->MCU_sample_width;
-                for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-                    output_col = start_col;
-                    sCurrentBlock = cinfo->decoded_mcus_base + (( yheightoffset * cinfo->MCUs_per_row  + MCU_col_num) * componets_mcu_width) ;
-                    {
-                        int k;
-                        for( k = ci ; k > 0 ; --k)
-                        {
-                            sCurrentBlock += compptr[-k].MCU_width;
-                        }
-                    }
-                    for (xindex = 0; xindex < useful_width; xindex++) {
-                        (*inverse_DCT) (cinfo, compptr,
-                                (JCOEFPTR) (sCurrentBlock +  xindex),
-                                output_ptr, output_col);
-                        output_col += compptr->DCT_scaled_size;
-                    }
-                    sCurrentBlock += compptr->MCU_width;
-                    cur_row += compptr->DCT_scaled_size * compptr->row_buffer_size ;
-                }
-            }
-        }
-        /* Completed an MCU row, but perhaps not an iMCU row */
-        coef->MCU_ctr = 0;
-
-        /* Completed the iMCU row, advance counters for next one */
-    }
+ //    for( yheightoffset = 0 ; yheightoffset < cinfo->total_iMCU_rows ; ++ yheightoffset)
+ //    {
+ //        /* Loop to process as much as one whole iMCU row */
+ //        for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col; MCU_col_num++) {
+ //            /* Determine where data should go in output_buf and do the IDCT thing.
+ //             * We skip dummy blocks at the right and bottom edges (but blkn gets
+ //             * incremented past them!).  Note the inner loop relies on having
+ //             * allocated the MCU_buffer[] blocks sequentially.
+ //             */
+ //            for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ //                JBLOCK * sCurrentBlock;
+
+ //                compptr = cinfo->cur_comp_info[ci];
+ //                /* Don't bother to IDCT an uninteresting component. */
+ //                if (! compptr->component_needed) {
+ //                    cinfo->decoded_mcus_current += compptr->MCU_blocks;
+ //                    continue;
+ //                }
+ //                inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+ //                useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+ //                    : compptr->last_col_width;
+ //                {
+ //                    int k;
+ //                    for ( k = ci , cur_row = **output_buf ; k > 0 ; --k)
+ //                    {
+ //                        cur_row += compptr[ -k ].image_buffer_size;
+ //                    }
+ //                    cur_row +=  yheightoffset * compptr->DCT_scaled_size * compptr->row_buffer_size ;
+ //                }
+ //                output_ptr = &cur_row;
+
+ //                start_col = MCU_col_num * compptr->MCU_sample_width;
+ //                for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+ //                    output_col = start_col;
+ //                    sCurrentBlock = cinfo->decoded_mcus_base + (( yheightoffset * cinfo->MCUs_per_row  + MCU_col_num) * componets_mcu_width) ;
+ //                    {
+ //                        int k;
+ //                        for( k = ci ; k > 0 ; --k)
+ //                        {
+ //                            sCurrentBlock += compptr[-k].MCU_width;
+ //                        }
+ //                    }
+ //                    for (xindex = 0; xindex < useful_width; xindex++) {
+ //                        (*inverse_DCT) (cinfo, compptr,
+ //                                (JCOEFPTR) (sCurrentBlock +  xindex),
+ //                                output_ptr, output_col);
+ //                        output_col += compptr->DCT_scaled_size;
+ //                    }
+ //                    sCurrentBlock += compptr->MCU_width;
+ //                    cur_row += compptr->DCT_scaled_size * compptr->row_buffer_size ;
+ //                }
+ //            }
+ //        }
+ //        /* Completed an MCU row, but perhaps not an iMCU row */
+ //        coef->MCU_ctr = 0;
+
+ //        /* Completed the iMCU row, advance counters for next one */
+ //    }
     cinfo->output_iMCU_row = cinfo->total_iMCU_rows;
     cinfo->input_iMCU_row = cinfo->total_iMCU_rows;
     /* Completed the scan */
@@ -334,7 +336,7 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         cl_mem my_cl_output_buffer; 
         size_t work_dim[3];
         size_t local_work_dim[3];
-        JSAMPLE * from_cl_output;
+        // JSAMPLE * from_cl_output;
 
         extern int read_all_bytes(const char * aFile,char ** aContent);
         
@@ -419,13 +421,14 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
                     NULL,
                     NULL,
                     NULL);
-        from_cl_output = malloc(sizeof(JSAMPLE) * previous_image_size);
+        clFinish(cinfo->current_cl_queue);
+        // from_cl_output = malloc(sizeof(JSAMPLE) * previous_image_size);
         error_code = clEnqueueReadBuffer(cinfo->current_cl_queue,
                             my_cl_output_buffer,
                             CL_TRUE,
                             0,
                             sizeof(JSAMPLE) * previous_image_size,
-                            from_cl_output,
+                            **output_buf,
                             0,
                             NULL,
                             NULL);
@@ -434,11 +437,12 @@ decompress_onepass2 (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         clReleaseMemObject(my_cl_output_buffer);
         clReleaseKernel(dct_kernel);
         clReleaseProgram(my_program);
-        if(0 != memcmp(from_cl_output,**output_buf,sizeof(JSAMPLE) * previous_image_size))
-        {
-            fprintf(stderr,"LIN:Failed\n");
-        }
-        free(from_cl_output);
+        // if(0 != memcmp(from_cl_output,**output_buf,sizeof(JSAMPLE) * previous_image_size))
+        // {
+        //     fprintf(stderr,"LIN:Failed\n");
+        // }
+        // memcpy(**output_buf,from_cl_output,sizeof(JSAMPLE) * previous_image_size);
+        // free(from_cl_output);
     }
 
     return JPEG_SCAN_COMPLETED;
diff --git a/jidctflt.c b/jidctflt.c
index 0188ce3..b7af262 100644
--- a/jidctflt.c
+++ b/jidctflt.c
@@ -178,7 +178,7 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 
   wsptr = workspace;
   for (ctr = 0; ctr < DCTSIZE; ctr++) {
-    outptr = output_buf[ctr] + output_col;
+    outptr = (*output_buf) + ctr * compptr->row_buffer_size + output_col;
     /* Rows of zeroes can be exploited in the same way as we did with columns.
      * However, the column calculation has created many nonzero AC terms, so
      * the simplification applies less often (typically 5% to 10% of the time).
diff --git a/jmorecfg.h b/jmorecfg.h
index 54a7d1c..6a52005 100644
--- a/jmorecfg.h
+++ b/jmorecfg.h
@@ -260,8 +260,8 @@ typedef int boolean;
 
 /* Capability options common to encoder and decoder: */
 
-#define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
+// #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
+// #define DCT_IFAST_SUPPORTED	/* faster, less accurate integer method */
 #define DCT_FLOAT_SUPPORTED	/* floating-point: accurate, fast on fast HW */
 
 /* Encoder capability options: */