Implement a Bzip2Decoder

Motivation: Bzip2Decoder makes it possible to receive data compressed in the bzip2 format. Modifications: Added classes: Bzip2Decoder, Bzip2Constants, Bzip2BlockDecompressor, Bzip2HuffmanStageDecoder, Bzip2MoveToFrontTable, Bzip2Rand, Crc32, Bzip2DecoderTest. Result: Implemented and tested a new decoder which can decompress incoming data in the bzip2 format.
datatonic · Jun 24, 2014 · f9021a6 · f9021a6
1 parent 12a3e23
commit f9021a6
Show file tree

Hide file tree

Showing 12 changed files with 1,516 additions and 9 deletions.
diff --git a/NOTICE.txt b/NOTICE.txt
@@ -6,7 +6,7 @@ Please visit the Netty web site for more information:
 
   * http://netty.io/
 
-Copyright 2011 The Netty Project
+Copyright 2014 The Netty Project
 
 The Netty Project licenses this file to you under the Apache License,
 version 2.0 (the "License"); you may not use this file except in compliance
@@ -81,6 +81,14 @@ It can be obtained at:
   * HOMEPAGE:
     * https://github.com/akka/akka/blob/wip-2.2.3-for-scala-2.11/akka-actor/src/main/java/akka/dispatch/AbstractNodeQueue.java
 
+This product contains a modified portion of 'jbzip2', a Java bzip2 compression
+and decompression library written by Matthew J. Francis. It can be obtained at:
+
+  * LICENSE:
+    * license/LICENSE.jbzip2.txt (MIT License)
+  * HOMEPAGE:
+    * https://code.google.com/p/jbzip2/
+
 This product optionally depends on 'JZlib', a re-implementation of zlib in
 pure Java, which can be obtained at:
 

diff --git a/codec/pom.xml b/codec/pom.xml
@@ -49,7 +49,7 @@
       <artifactId>jzlib</artifactId>
       <optional>true</optional>
     </dependency>
-    
+
     <!-- Test dependencies for jboss marshalling encoder/decoder -->
     <dependency>
       <groupId>org.jboss.marshalling</groupId>
@@ -61,6 +61,13 @@
       <artifactId>jboss-marshalling-river</artifactId>
       <scope>test</scope>
     </dependency>
+
+    <!-- Test dependency for Bzip2Decoder -->
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 </project>
 
diff --git a/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BlockDecompressor.java b/codec/src/main/java/io/netty/handler/codec/compression/Bzip2BlockDecompressor.java
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2014 The Netty Project
+ *
+ * The Netty Project licenses this file to you under the Apache License,
+ * version 2.0 (the "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at:
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ */
+package io.netty.handler.codec.compression;
+
+import io.netty.buffer.ByteBuf;
+
+import static io.netty.handler.codec.compression.Bzip2Constants.*;
+
+/**
+ * Decompresses a single bzip2 block. {@link #decodeHuffmanData} rebuilds the Burrows-Wheeler
+ * Transform array from the Huffman-decoded symbol stream; {@link #read()} then walks the inverse
+ * transform, undoes the final Run-Length Encoding and accumulates the block CRC, which
+ * {@link #checkCRC()} compares with the value taken from the block header.
+ */
+final class Bzip2BlockDecompressor {
+    /**
+     * Calculates the block CRC from the fully decoded bytes of the block.
+     */
+    private final Crc32 crc = new Crc32();
+
+    /**
+     * The CRC of the current block as read from the block header.
+     */
+    private final int blockCRC;
+
+    /**
+     * {@code true} if the current block is randomised, otherwise {@code false}.
+     */
+    private final boolean blockRandomised;
+
+    /* Huffman Decoding stage */
+    /**
+     * The end-of-block Huffman symbol. Decoding of the block ends when this is encountered.
+     */
+    int huffmanEndOfBlockSymbol;
+
+    /**
+     * Bitmap, of ranges of 16 bytes, present/not present.
+     */
+    int huffmanInUse16;
+
+    /**
+     * A map from Huffman symbol index to output character. Some types of data (e.g. ASCII text)
+     * may contain only a limited number of byte values; Huffman symbols are only allocated to
+     * those values that actually occur in the uncompressed data.
+     */
+    final byte[] huffmanSymbolMap = new byte[256];
+
+    /* Move To Front stage */
+    /**
+     * Counts of each byte value within the {@link Bzip2BlockDecompressor#huffmanSymbolMap} data.
+     * Collected at the Move To Front stage, consumed by the Inverse Burrows Wheeler Transform stage.
+     */
+    private final int[] bwtByteCounts = new int[256];
+
+    /**
+     * The Burrows-Wheeler Transform processed data. Read at the Move To Front stage, consumed by the
+     * Inverse Burrows Wheeler Transform stage.
+     */
+    private final byte[] bwtBlock;
+
+    /**
+     * Starting pointer into BWT for after untransform.
+     */
+    private final int bwtStartPointer;
+
+    /* Inverse Burrows-Wheeler Transform stage */
+    /**
+     * At each position contains the union of :-
+     *   An output character (8 bits)
+     *   A pointer from each position to its successor (24 bits, left shifted 8 bits)
+     * As the pointer cannot exceed the maximum block size of 900k, 24 bits is more than enough to
+     * hold it; Folding the character data into the spare bits while performing the inverse BWT,
+     * when both pieces of information are available, saves a large number of memory accesses in
+     * the final decoding stages.
+     */
+    private int[] bwtMergedPointers;
+
+    /**
+     * The current merged pointer into the Burrows-Wheeler Transform array.
+     */
+    private int bwtCurrentMergedPointer;
+
+    /**
+     * The actual length in bytes of the current block at the Inverse Burrows Wheeler Transform
+     * stage (before final Run-Length Decoding).
+     */
+    private int bwtBlockLength;
+
+    /**
+     * The number of output bytes that have been decoded up to the Inverse Burrows Wheeler Transform stage.
+     */
+    private int bwtBytesDecoded;
+
+    /* Run-Length Encoding and Random Perturbation stage */
+    /**
+     * The most recently RLE decoded byte.
+     */
+    private int rleLastDecodedByte = -1;
+
+    /**
+     * The number of previous identical output bytes decoded. After 4 identical bytes, the next byte
+     * decoded is an RLE repeat count.
+     */
+    private int rleAccumulator;
+
+    /**
+     * The RLE repeat count of the current decoded byte. When this reaches zero, a new byte is decoded.
+     */
+    private int rleRepeat;
+
+    /**
+     * If the current block is randomised, the position within the RNUMS randomisation array.
+     */
+    private int randomIndex;
+
+    /**
+     * If the current block is randomised, the remaining count at the current RNUMS position.
+     */
+    private int randomCount = Bzip2Rand.rNums(0) - 1;
+
+    /**
+     * Table for Move To Front transformations.
+     */
+    final Bzip2MoveToFrontTable symbolMTF = new Bzip2MoveToFrontTable();
+
+    /**
+     * Length of the run currently being accumulated from RUNA/RUNB symbols. Kept in a field so that
+     * {@link #decodeHuffmanData} can resume where it stopped when the input ran out.
+     */
+    int repeatCount;
+
+    /**
+     * Weight of the next RUNA/RUNB symbol; doubles with every run symbol and is reset to 1 once
+     * the run has been written out.
+     */
+    int repeatIncrement = 1;
+
+    /**
+     * Index into {@link #huffmanSymbolMap} produced by the Move To Front table for the most recently
+     * decoded non-run symbol. A pending run repeats the byte found at this index.
+     */
+    int mtfValue;
+
+    /**
+     * @param blockSize capacity in bytes of the Burrows-Wheeler Transform buffer; decoding fails
+     *                  if the block turns out to hold more bytes than this
+     * @param blockCRC the expected CRC of the block, checked by {@link #checkCRC()}
+     * @param blockRandomised {@code true} if the block needs the randomisation reversed
+     * @param bwtStartPointer the starting pointer into the Burrows-Wheeler Transform array
+     */
+    Bzip2BlockDecompressor(int blockSize, int blockCRC, boolean blockRandomised, int bwtStartPointer) {
+        bwtBlock = new byte[blockSize];
+
+        this.blockCRC = blockCRC;
+        this.blockRandomised = blockRandomised;
+        this.bwtStartPointer = bwtStartPointer;
+    }
+
+    /**
+     * Reads the Huffman encoded data from the input stream, performs Run-Length Decoding and
+     * applies the Move To Front transform to reconstruct the Burrows-Wheeler Transform array.
+     * The method may be called repeatedly: progress is stored in fields whenever the input
+     * runs short and picked up again on the next call.
+     *
+     * @param huffmanDecoder the decoder that supplies the next Huffman symbol
+     * @param in the buffer holding the compressed input
+     * @return {@code true} once the end-of-block symbol has been decoded and the inverse transform
+     *         is set up; {@code false} if more input is needed
+     * @throws DecompressionException if the block holds more bytes than the declared block size
+     *         or the start pointer is outside the decoded block
+     */
+    boolean decodeHuffmanData(final Bzip2HuffmanStageDecoder huffmanDecoder, ByteBuf in) {
+        final byte[] bwtBlock = this.bwtBlock;
+        final byte[] huffmanSymbolMap = this.huffmanSymbolMap;
+        final int streamBlockSize = bwtBlock.length;
+        final int huffmanEndOfBlockSymbol = this.huffmanEndOfBlockSymbol;
+        final int[] bwtByteCounts = this.bwtByteCounts;
+        final Bzip2MoveToFrontTable symbolMTF = this.symbolMTF;
+
+        int bwtBlockLength = this.bwtBlockLength;
+        int repeatCount = this.repeatCount;
+        int repeatIncrement = this.repeatIncrement;
+        int mtfValue = this.mtfValue;
+
+        for (;;) {
+            if (in.readableBytes() < 3) {   // 3 = (HUFFMAN_DECODE_MAX_CODE_LENGTH + 1) bits / 8
+                // Too little input for the next symbol: save the progress for the next call
+                this.bwtBlockLength = bwtBlockLength;
+                this.repeatCount = repeatCount;
+                this.repeatIncrement = repeatIncrement;
+                this.mtfValue = mtfValue;
+                return false;
+            }
+            final int nextSymbol = huffmanDecoder.nextSymbol(in);
+
+            if (nextSymbol == HUFFMAN_SYMBOL_RUNA || nextSymbol == HUFFMAN_SYMBOL_RUNB) {
+                repeatCount += nextSymbol == HUFFMAN_SYMBOL_RUNA ? repeatIncrement : repeatIncrement << 1;
+                repeatIncrement <<= 1;
+
+                // Bound the run as soon as it grows. A corrupt stream could otherwise keep doubling
+                // the increment until the int arithmetic overflows and the size check is bypassed.
+                // With bzip2 block sizes (at most 900k) the run is rejected long before that point.
+                if (repeatCount > streamBlockSize - bwtBlockLength) {
+                    throw new DecompressionException("block exceeds declared block size");
+                }
+            } else {
+                if (repeatCount > 0) {
+                    // Written as a subtraction so that the comparison itself cannot overflow
+                    if (repeatCount > streamBlockSize - bwtBlockLength) {
+                        throw new DecompressionException("block exceeds declared block size");
+                    }
+                    final byte nextByte = huffmanSymbolMap[mtfValue];
+                    bwtByteCounts[nextByte & 0xff] += repeatCount;
+                    final int runEnd = bwtBlockLength + repeatCount;
+                    while (bwtBlockLength < runEnd) {
+                        bwtBlock[bwtBlockLength++] = nextByte;
+                    }
+
+                    repeatCount = 0;
+                    repeatIncrement = 1;
+                }
+
+                if (nextSymbol == huffmanEndOfBlockSymbol) {
+                    break;
+                }
+
+                if (bwtBlockLength >= streamBlockSize) {
+                    throw new DecompressionException("block exceeds declared block size");
+                }
+
+                mtfValue = symbolMTF.indexToFront(nextSymbol - 1) & 0xff;
+
+                final byte nextByte = huffmanSymbolMap[mtfValue];
+                bwtByteCounts[nextByte & 0xff]++;
+                bwtBlock[bwtBlockLength++] = nextByte;
+            }
+        }
+        this.bwtBlockLength = bwtBlockLength;
+        initialiseInverseBWT();
+        return true;
+    }
+
+    /**
+     * Set up the Inverse Burrows-Wheeler Transform merged pointer array.
+     *
+     * @throws DecompressionException if the start pointer does not lie within the decoded block
+     */
+    private void initialiseInverseBWT() {
+        final int bwtStartPointer = this.bwtStartPointer;
+        final int bwtBlockLength = this.bwtBlockLength;
+
+        // Validate before allocating anything for a block that is going to be rejected
+        if (bwtStartPointer < 0 || bwtStartPointer >= bwtBlockLength) {
+            throw new DecompressionException("start pointer invalid");
+        }
+
+        final byte[] bwtBlock = this.bwtBlock;
+        final int[] bwtMergedPointers = new int[bwtBlockLength];
+        final int[] characterBase = new int[256];
+
+        // Cumulative character counts
+        System.arraycopy(bwtByteCounts, 0, characterBase, 1, 255);
+        for (int i = 2; i <= 255; i++) {
+            characterBase[i] += characterBase[i - 1];
+        }
+
+        // Merged-Array Inverse Burrows-Wheeler Transform
+        // Combining the output characters and forward pointers into a single array here, where we
+        // have already read both of the corresponding values, cuts down on memory accesses in the
+        // final walk through the array
+        for (int i = 0; i < bwtBlockLength; i++) {
+            int value = bwtBlock[i] & 0xff;
+            bwtMergedPointers[characterBase[value]++] = (i << 8) + value;
+        }
+
+        this.bwtMergedPointers = bwtMergedPointers;
+        bwtCurrentMergedPointer = bwtMergedPointers[bwtStartPointer];
+    }
+
+    /**
+     * Decodes a byte from the final Run-Length Encoding stage, pulling a new byte from the
+     * Burrows-Wheeler Transform stage when required.
+     * @return The decoded byte, or -1 if there are no more bytes
+     */
+    public int read() {
+        while (rleRepeat < 1) {
+            // ">=" rather than "==" so that the end of the block is always detected
+            if (bwtBytesDecoded >= bwtBlockLength) {
+                return -1;
+            }
+
+            int nextByte = decodeNextBWTByte();
+            if (nextByte != rleLastDecodedByte) {
+                // New byte, restart accumulation
+                rleLastDecodedByte = nextByte;
+                rleRepeat = 1;
+                rleAccumulator = 1;
+                crc.updateCRC(nextByte);
+            } else if (++rleAccumulator == 4 && bwtBytesDecoded < bwtBlockLength) {
+                // Accumulation complete, start repetition
+                int rleRepeat = decodeNextBWTByte() + 1;
+                this.rleRepeat = rleRepeat;
+                rleAccumulator = 0;
+                crc.updateCRC(nextByte, rleRepeat);
+            } else {
+                // Either the run is still shorter than 4 bytes, or the block ends right after the
+                // fourth identical byte with no repeat count left to read. In the latter case the
+                // byte is emitted once instead of decoding past the end of the block.
+                rleRepeat = 1;
+                crc.updateCRC(nextByte);
+            }
+        }
+        rleRepeat--;
+
+        return rleLastDecodedByte;
+    }
+
+    /**
+     * Decodes a byte from the Burrows-Wheeler Transform stage. If the block has randomisation
+     * applied, reverses the randomisation.
+     * @return The decoded byte
+     */
+    private int decodeNextBWTByte() {
+        int mergedPointer = bwtCurrentMergedPointer;
+        int nextDecodedByte = mergedPointer & 0xff;
+        // The upper 24 bits of a merged pointer hold the position of its successor
+        bwtCurrentMergedPointer = bwtMergedPointers[mergedPointer >>> 8];
+
+        if (blockRandomised) {
+            if (--randomCount == 0) {
+                nextDecodedByte ^= 1;
+                randomIndex = (randomIndex + 1) % 512;
+                randomCount = Bzip2Rand.rNums(randomIndex);
+            }
+        }
+        bwtBytesDecoded++;
+
+        return nextDecodedByte;
+    }
+
+    /**
+     * @return the length in bytes of the block at the Inverse Burrows Wheeler Transform stage,
+     *         that is before the final Run-Length Decoding
+     */
+    public int blockLength() {
+        return bwtBlockLength;
+    }
+
+    /**
+     * Verify and return the block CRC. This method may only be called
+     * after all of the block's bytes have been read.
+     * @return The block CRC
+     * @throws DecompressionException if the computed CRC differs from the one in the block header
+     */
+    int checkCRC() {
+        final int computedBlockCRC = crc.getCRC();
+        if (blockCRC != computedBlockCRC) {
+            throw new DecompressionException("block CRC error");
+        }
+        return computedBlockCRC;
+    }
+}