Skip to content

Commit

Permalink
Adding deflate stream, still some issue with the extra bits
Browse files Browse the repository at this point in the history
  • Loading branch information
ruvmello committed Oct 23, 2023
1 parent 66d2332 commit 190d238
Show file tree
Hide file tree
Showing 7 changed files with 262 additions and 49 deletions.
13 changes: 8 additions & 5 deletions src/main/kotlin/Main.kt
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,24 @@ fun main(args: Array<String>) {
println("Program arguments: ${args.joinToString()}")

val inputFilePath = args[0]
val file = File(inputFilePath)
val windowSize = 32 * 1024 // Adjust the window size as needed
val lookaheadBufferSize = 258 // Adjust the lookahead buffer size as needed
val lz77 = LZ77Compressor(windowSize = windowSize, lookaheadBufferSize = lookaheadBufferSize)
val compressedTokens = lz77.compress(inputFilePath)
val compressedTokens = lz77.compress(file)

// Print the compressed tokens
for (token in compressedTokens) {
print(token)
}

val file = File(inputFilePath)
val zipper = ZIPArchiver("test.zip")
zipper.getLocalFileHeader(file)
val zipper = ZIPArchiver("twee.zip")
val compressedStream = zipper.getDeflateStream(file)
zipper.getLocalFileHeader(file, compressedStream.size)
zipper.zip.appendBytes(compressedStream)


val offset = zipper.zip.length().toInt()
zipper.getCentralDirectoryFileHeader(file)
zipper.getCentralDirectoryFileHeader(file, compressedStream.size)
zipper.getEndOfCentralDirectoryRecord(1, zipper.zip.length().toInt() - offset, offset)
}
111 changes: 99 additions & 12 deletions src/main/kotlin/huffman/HuffmanCompressor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package huffman
import lz77.LZ77Literal
import lz77.LZ77Repeat
import lz77.LZ77Token
import utils.*

/**
* This class handles everything related to the huffman encoding of the deflate algorithm
Expand All @@ -16,29 +17,115 @@ class HuffmanCompressor {
* @return the ByteArray that is the data for in the ZIP file
*/
fun encode(tokens: List<LZ77Token>): ByteArray {
val outputBytes = byteArrayOf()
var outputBytes = byteArrayOf()

// Compute the frequency of the literals
val freq = computeFrequencies(tokens)

// Build the Huffman tree
val tree = buildTree(freq)

for (token in tokens) {
val literals = mutableListOf<LZ77Literal>()
val repeats = mutableListOf<LZ77Repeat>()
for (index in tokens.indices) {
val token = tokens[index]
when (token) {
is LZ77Literal -> {
// TODO: Encode literal
// outputBytes +=
if (repeats.isNotEmpty()) {
outputBytes += encodeRepeatStaticBlock(repeats, false)
repeats.clear()
}
literals.add(token)
}
is LZ77Repeat -> {
// TODO: Encode repeat
// outputBytes +=
if (literals.isNotEmpty()) {
outputBytes += encodeStoredBlock(literals, false)
literals.clear()
}
repeats.add(token)
}
}
}

if (repeats.isNotEmpty()) {
outputBytes += encodeRepeatStaticBlock(repeats, true)
repeats.clear()
} else if (literals.isNotEmpty()) {
outputBytes += encodeStoredBlock(literals, true)
literals.clear()
}

return outputBytes
}

/**
 * Encode a run of literals as DEFLATE stored (uncompressed, BTYPE=00) blocks.
 *
 * Each block is laid out as: one header byte, LEN (2 bytes, least significant byte first),
 * the one's complement of LEN (2 bytes), followed by the raw literal bytes.
 *
 * DEFLATE packs header bits starting at the least significant bit of a byte, so BFINAL
 * occupies bit 0 and BTYPE bits 1-2. The header byte is therefore 0x01 for a final stored
 * block and 0x00 otherwise; the remaining five bits are padding up to the byte boundary.
 *
 * LEN is a 16-bit field, so a run longer than 65535 literals is split over several stored
 * blocks. Only the very last of them carries the final-block flag.
 *
 * The output assumes that the block starts on a byte boundary of the stream.
 *
 * @param literal the literals to store; an empty list yields a single empty stored block
 * @param isLast whether this is the last block of the DEFLATE stream
 * @return the encoded block(s)
 */
fun encodeStoredBlock(literal: List<LZ77Literal>, isLast: Boolean): ByteArray {
    // Largest payload that fits in the 16-bit LEN field of a stored block
    val maxStoredBlockSize = 0xFFFF

    // Keep emitting one (empty) block for an empty run, as callers may rely on the final flag
    val chunks = literal.chunked(maxStoredBlockSize).ifEmpty { listOf(literal) }

    return chunks
        .mapIndexed { index, chunk ->
            // BFINAL lives in bit 0; BTYPE=00 and the padding bits are all zero
            val header = if (isLast && index == chunks.lastIndex) 1 else 0
            val len = chunk.size

            byteArrayOf(header.toByte()) + getByteArrayOf2Bytes(len) +
                getByteArrayOf2Bytes(len.inv()) + chunk.map { it.char }.toByteArray()
        }
        .fold(byteArrayOf()) { output, block -> output + block }
}

/**
 * Encode a run of LZ77 back-references as one DEFLATE block that uses the fixed ("static")
 * Huffman codes (BTYPE=01).
 *
 * Bit packing follows RFC 1951:
 * - bits are written into each byte starting at the least significant bit;
 * - Huffman codes are written most significant code bit first;
 * - every other field (block header, extra bits) is written least significant bit first.
 *
 * Fixed code assignment used here:
 * - length symbols 256..279 use 7-bit codes 0000000..0010111 (symbol - 256);
 * - length symbols 280..287 use 8-bit codes 11000000..11000111 (0xC0 + symbol - 280);
 * - distance symbols use plain 5-bit codes.
 *
 * The block is closed with the end-of-block symbol 256 (seven zero bits). The returned
 * array always ends on a byte boundary because blocks are concatenated as whole bytes:
 * - the final block is simply zero padded, which is allowed after the last block;
 * - a non-final block that ends mid-byte is followed by an empty, non-final stored block
 *   (header 000, padding, LEN=0x0000, NLEN=0xFFFF), the standard way to realign a stream.
 *
 * The output assumes that the block starts on a byte boundary of the stream.
 *
 * @param tokens the repeats to encode; lengths must be in 3..258 and offsets in 1..32768
 * @param isLast whether this is the last block of the DEFLATE stream
 * @return the encoded, byte-aligned block
 * @throws IllegalArgumentException if a token has a length or offset DEFLATE cannot represent
 */
fun encodeRepeatStaticBlock(tokens: List<LZ77Repeat>, isLast: Boolean): ByteArray {
    val encoded = mutableListOf<Byte>()

    // Bits that do not fill a whole byte yet; the first written bit sits in bit 0
    var pendingBits = 0
    var pendingCount = 0

    // Append the `count` low bits of `value`, least significant bit first
    fun writeBits(value: Int, count: Int) {
        for (bit in 0 until count) {
            pendingBits = pendingBits or (((value ushr bit) and 1) shl pendingCount)
            pendingCount++
            if (pendingCount == 8) {
                encoded.add(pendingBits.toByte())
                pendingBits = 0
                pendingCount = 0
            }
        }
    }

    // Append a Huffman code of `count` bits, most significant code bit first
    fun writeHuffmanCode(code: Int, count: Int) {
        for (bit in count - 1 downTo 0) {
            writeBits(code ushr bit, 1)
        }
    }

    // Fill the current byte with zero bits, if one is partially written
    fun padToByteBoundary() {
        if (pendingCount > 0) {
            writeBits(0, 8 - pendingCount)
        }
    }

    // Block header: BFINAL (1 bit) followed by BTYPE = 01 (2 bits)
    writeBits(if (isLast) 1 else 0, 1)
    writeBits(1, 2)

    for (token in tokens) {
        require(token.length in 3..258) { "Match length ${token.length} is outside 3..258" }
        require(token.offset in 1..32768) { "Match offset ${token.offset} is outside 1..32768" }

        // Length: largest base that does not exceed the length (tables are in ascending order)
        val lengthBase = lengthMapStaticHuffman.keys.last { it <= token.length }
        val (lengthCode, lengthExtraBits) = lengthMapStaticHuffman.getValue(lengthBase)
        if (lengthCode <= 279) {
            writeHuffmanCode(lengthCode - 256, 7)
        } else {
            writeHuffmanCode(0xC0 + (lengthCode - 280), 8)
        }
        writeBits(token.length - lengthBase, lengthExtraBits)

        // Distance: always a 5-bit code, followed by its extra bits
        val distanceBase = distanceMapStaticHuffman.keys.last { it <= token.offset }
        val (distanceCode, distanceExtraBits) = distanceMapStaticHuffman.getValue(distanceBase)
        writeHuffmanCode(distanceCode, 5)
        writeBits(token.offset - distanceBase, distanceExtraBits)
    }

    // End-of-block symbol 256: seven zero bits
    writeHuffmanCode(0, 7)

    if (isLast) {
        // Nothing follows the final block, so plain zero padding is fine
        padToByteBoundary()
    } else if (pendingCount > 0) {
        // Realign with an empty non-final stored block so the next block can start on a byte
        writeBits(0, 3)
        padToByteBoundary()
        encoded.addAll(byteArrayOf(0, 0, -1, -1).asList())
    }

    return encoded.toByteArray()
}

/**
* Calculate the frequency for each byte in [tokens]
*
Expand Down
9 changes: 4 additions & 5 deletions src/main/kotlin/lz77/LZ77Compressor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,18 @@ import java.io.File
* @param windowSize is the sliding window in which we search for the longest repeated occurrence
* @param lookaheadBufferSize is how far we look ahead from the current index
*/
class LZ77Compressor(private val windowSize: Int = 32 * 1024, private val lookaheadBufferSize: Int = 258) {
class LZ77Compressor(private val windowSize: Int = 32 * 1024, private val lookaheadBufferSize: Int = 258) { // 258 because match is between (3, 258), which can be stored in one byte

/**
* Transform the input data to a list of LZ77Token's
*
* @param inputFilePath the file path of the file we need to encode
* @param file the file we need to encode
* @param minlength the minimum length of a match in the longest repeated occurrence (less than three is not really compressing)
* @return a list of LZ77Token's that encodes the input data
*/
fun compress(inputFilePath: String, minlength: Int = 3): List<LZ77Token> {
val inputFile = File(inputFilePath)
fun compress(file: File, minlength: Int = 3): List<LZ77Token> {
// TODO: Also make it possible to handle files bigger than 2GB
val inputBytes: ByteArray = inputFile.readBytes()
val inputBytes: ByteArray = file.readBytes()

val compressedTokens: MutableList<LZ77Token> = mutableListOf()
var currentIndex = 0
Expand Down
109 changes: 109 additions & 0 deletions src/main/kotlin/utils/Utils.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
package utils

/**
 * Serialize the two least significant bytes of [input], least significant byte first.
 * The upper 16 bits of the 32-bit integer are dropped.
 *
 * @param input the integer for which we construct a ByteArray of size two
 * @return a two element array: bits 0-7 followed by bits 8-15 of [input]
 */
fun getByteArrayOf2Bytes(input: Int): ByteArray =
    ByteArray(2) { position -> (input shr (8 * position)).toByte() }

/**
 * Serialize all four bytes of the 32-bit [input], least significant byte first.
 *
 * @param input the integer for which we construct a ByteArray of size four
 * @return a four element array: bits 0-7, 8-15, 16-23 and 24-31 of [input], in that order
 */
fun getByteArrayOf4Bytes(input: Int): ByteArray =
    ByteArray(4) { position -> (input shr (8 * position)).toByte() }

/**
 * Cut the whole bytes off the top of an [n]-bit value stored in the low bits of [input].
 *
 * The value is consumed from its most significant end in groups of eight bits; every group
 * is passed through [reverseBits] before it is added to the result. The lowest `n % 8` bits
 * do not fill a byte and are not part of the result.
 *
 * @param input the integer holding the bits
 * @param n the number of bits of [input] that are in use
 * @return `n / 8` bytes, taken from the most significant used bits downwards, each bit-reversed
 */
fun getListOfNBytes(input: Int, n: Int): List<Byte> {
    val fullByteCount = n / 8
    return (1..fullByteCount).map { position ->
        // Bring the position-th group of eight bits (counted from the top) into the low byte
        val shift = n - 8 * position
        reverseBits((input shr shift).toByte())
    }
}

/**
 * Mirror the eight bits of [byte]: bit 0 becomes bit 7, bit 1 becomes bit 6, and so on.
 *
 * @param byte the byte whose bit order is reversed
 * @return the byte with its bits in reverse order
 */
fun reverseBits(byte: Byte): Byte {
    val source = byte.toInt()
    // Walk from the lowest to the highest bit, pushing each one in at the bottom of the result
    val mirrored = (0 until 8).fold(0) { reversed, position ->
        (reversed shl 1) or ((source ushr position) and 1)
    }
    return mirrored.toByte()
}

// Match length table: base length -> (length symbol, number of extra bits).
// Entries are kept in ascending order of base length.
// https://calmarius.net/?lang=en&page=programming%2Fzlib_deflate_quick_reference
val lengthMapStaticHuffman: Map<Int, Pair<Int, Int>> = run {
    // Smallest length covered by each symbol, for symbols 257, 258, ... 285
    val baseLengths = intArrayOf(
        3, 4, 5, 6, 7, 8, 9, 10,
        11, 13, 15, 17,
        19, 23, 27, 31,
        35, 43, 51, 59,
        67, 83, 99, 115,
        131, 163, 195, 227,
        258,
    )
    // Number of extra bits that follow each symbol
    val extraBitCounts = intArrayOf(
        0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1,
        2, 2, 2, 2,
        3, 3, 3, 3,
        4, 4, 4, 4,
        5, 5, 5, 5,
        0,
    )
    baseLengths.indices.associate { index ->
        baseLengths[index] to Pair(257 + index, extraBitCounts[index])
    }
}

// Match distance table: base distance -> (distance symbol, number of extra bits).
// Entries are kept in ascending order of base distance.
// https://calmarius.net/?lang=en&page=programming%2Fzlib_deflate_quick_reference
val distanceMapStaticHuffman = run {
    // Smallest distance covered by each symbol, for symbols 0, 1, ... 29
    val baseDistances = intArrayOf(
        1, 2, 3, 4,
        5, 7,
        9, 13,
        17, 25,
        33, 49,
        65, 97,
        129, 193,
        257, 385,
        513, 769,
        1025, 1537,
        2049, 3073,
        4097, 6145,
        8193, 12289,
        16385, 24577,
    )
    // Number of extra bits that follow each symbol
    val extraBitCounts = intArrayOf(
        0, 0, 0, 0,
        1, 1,
        2, 2,
        3, 3,
        4, 4,
        5, 5,
        6, 6,
        7, 7,
        8, 8,
        9, 9,
        10, 10,
        11, 11,
        12, 12,
        13, 13,
    )
    baseDistances.indices.associate { index ->
        baseDistances[index] to Pair(index, extraBitCounts[index])
    }
}
Loading

0 comments on commit 190d238

Please sign in to comment.