Skip to content

Commit

Permalink
liblzma: LZMA decoder improvements.
Browse files Browse the repository at this point in the history
This adds macros for bittree decoding which prepares the code
for alternative C versions and inline assembly.
  • Loading branch information
Larhzu committed Feb 14, 2024
1 parent de5c5e4 commit e0c0ee4
Show file tree
Hide file tree
Showing 3 changed files with 210 additions and 200 deletions.
264 changes: 78 additions & 186 deletions src/liblzma/lzma/lzma_decoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,8 @@

// Minimum number of input bytes to safely decode one LZMA symbol.
// The worst case is that we decode 22 bits using probabilities and 26
// direct bits. This may decode at maximum 20 bytes of input plus one
// extra byte after the final EOPM normalization.
#define LZMA_IN_REQUIRED 21
// direct bits. This may decode at maximum 20 bytes of input.
#define LZMA_IN_REQUIRED 20


// Macros for (somewhat) size-optimized code.
Expand Down Expand Up @@ -73,32 +72,22 @@ do { \
symbol = 1; \
rc_if_0(ld.choice) { \
rc_update_0(ld.choice); \
rc_bit(ld.low[pos_state][symbol], , ); \
rc_bit(ld.low[pos_state][symbol], , ); \
rc_bit(ld.low[pos_state][symbol], , ); \
target = symbol - LEN_LOW_SYMBOLS + MATCH_LEN_MIN; \
rc_bittree3(ld.low[pos_state], \
-LEN_LOW_SYMBOLS + MATCH_LEN_MIN); \
target = symbol; \
} else { \
rc_update_1(ld.choice); \
rc_if_0(ld.choice2) { \
rc_update_0(ld.choice2); \
rc_bit(ld.mid[pos_state][symbol], , ); \
rc_bit(ld.mid[pos_state][symbol], , ); \
rc_bit(ld.mid[pos_state][symbol], , ); \
target = symbol - LEN_MID_SYMBOLS \
+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS; \
rc_bittree3(ld.mid[pos_state], -LEN_MID_SYMBOLS \
+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS); \
target = symbol; \
} else { \
rc_update_1(ld.choice2); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
rc_bit(ld.high[symbol], , ); \
target = symbol - LEN_HIGH_SYMBOLS \
rc_bittree8(ld.high, -LEN_HIGH_SYMBOLS \
+ MATCH_LEN_MIN \
+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \
+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS); \
target = symbol; \
} \
} \
} while (0)
Expand Down Expand Up @@ -369,8 +358,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,

// If there is not enough room for another LZMA symbol
// go to Resumable mode.
if (rc_in_pos + LZMA_IN_REQUIRED > in_size
|| dict.pos == dict.limit)
if (unlikely(rc_in_end - rc_in_ptr < LZMA_IN_REQUIRED
|| dict.pos == dict.limit))
goto slow;

// Decode the first bit from the next LZMA symbol.
Expand All @@ -390,64 +379,14 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = literal_subcoder(coder->literal,
literal_context_bits, literal_pos_mask,
dict.pos, dict_get(&dict, 0));
symbol = 1;

if (is_literal_state(state)) {
// Decode literal without match byte.
// We need to decode 8 bits, so instead
// of looping from 1 - 8, we unroll the
// loop for a speed optimization.
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bittree8(probs, 0);
} else {
// Decode literal with match byte.
//
// We store the byte we compare against
// ("match byte") to "len" to minimize the
// number of variables we need to store
// between decoder calls.

len = (uint32_t)(dict_get(&dict, rep0)) << 1;

// The usage of "offset" allows omitting some
// branches, which should give tiny speed
// improvement on some CPUs. "offset" gets
// set to zero if match_bit didn't match.
offset = 0x100;

// Unroll the loop.
uint32_t match_bit;
uint32_t subcoder_index;

# define decode_with_match_bit \
match_bit = len & offset; \
subcoder_index = offset + match_bit + symbol; \
rc_bit(probs[subcoder_index], \
offset &= ~match_bit, \
offset &= match_bit)

decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
len <<= 1;
decode_with_match_bit;
# undef decode_match_bit
rc_matched_literal(probs,
dict_get(&dict, rep0));
}

state = next_state[state];
Expand Down Expand Up @@ -501,18 +440,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
// The next 6 bits determine how to decode the
// rest of the distance.
probs = coder->dist_slot[get_dist_state(len)];
symbol = 1;

rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );
rc_bit(probs[symbol], , );

// Get rid of the highest bit that was needed for
// indexing of the probability array.
symbol -= DIST_SLOTS;
rc_bittree6(probs, -DIST_SLOTS);
assert(symbol <= 63);

if (symbol < DIST_MODEL_START) {
Expand Down Expand Up @@ -540,6 +469,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
assert(limit <= 5);
rep0 <<= limit;
assert(rep0 <= 96);

// -1 is fine, because we start
// decoding at probs[1], not probs[0].
// NOTE: This violates the C standard,
Expand All @@ -553,106 +483,51 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
probs = coder->pos_special + rep0
- symbol - 1;
symbol = 1;
offset = 0;
offset = 1;

switch (limit) {
case 5:
assert(offset == 0);
rc_bit(probs[symbol], ,
rep0 += 1U);
++offset;
--limit;
case 4:
rc_bit(probs[symbol], ,
rep0 += 1U << offset);
++offset;
--limit;
case 3:
rc_bit(probs[symbol], ,
rep0 += 1U << offset);
++offset;
--limit;
case 2:
rc_bit(probs[symbol], ,
rep0 += 1U << offset);
++offset;
--limit;
case 1:
// We need "symbol" only for
// indexing the probability
// array, thus we can use
// rc_bit_last() here to
// omit the unneeded updating
// of "symbol".
rc_bit_last(probs[symbol], ,
rep0 += 1U << offset);
}
// Variable number (1-5) of bits
// from a reverse bittree. This
// isn't worth manual unrolling.
do {
rc_bit_add_if_1(probs,
rep0, offset);
offset <<= 1;
} while (--limit > 0);
} else {
// The distance is >= 128. Decode the
// lower bits without probabilities
// except the lowest four bits.
assert(symbol >= 14);
assert(limit >= 6);

limit -= ALIGN_BITS;
assert(limit >= 2);

// Not worth manual unrolling
do {
rc_direct(rep0);
} while (--limit > 0);
rc_direct(rep0, limit);

// Decode the lowest four bits using
// probabilities.
rep0 <<= ALIGN_BITS;
symbol = 1;

rc_bit(coder->pos_align[symbol], ,
rep0 += 1);

rc_bit(coder->pos_align[symbol], ,
rep0 += 2);

rc_bit(coder->pos_align[symbol], ,
rep0 += 4);

// Like when distance [4, 127], we
// don't need "symbol" for anything
// other than indexing the probability
// array.
rc_bit_last(
coder->pos_align[symbol], ,
rep0 += 8);

if (rep0 == UINT32_MAX) {
///////////////////////////
// End of payload marker //
///////////////////////////

// End of payload marker was
// found. It may only be
// present if
// - uncompressed size is
// unknown or
// - after known uncompressed
// size amount of bytes has
// been decompressed and
// caller has indicated
// that EOPM might be used
// (it's not allowed in
// LZMA2).
if (!eopm_is_valid) {
ret = LZMA_DATA_ERROR;
goto out;
}

// LZMA1 stream with
// end-of-payload marker.
rc_normalize();
ret = rc_is_finished(rc)
? LZMA_STREAM_END
: LZMA_DATA_ERROR;
goto out;
}
rc_bittree_rev4(coder->pos_align);
rep0 += symbol;

// If the end of payload marker (EOPM)
// is detected, jump to the safe code.
// The EOPM handling isn't speed
// critical at all.
//
// A final normalization is needed
// after the EOPM (there can be a
// dummy byte to read in some cases).
// If the normalization was done here
// in the fast code, it would need to
// be taken into account in the value
// of LZMA_IN_REQUIRED. Using the
// safe code allows keeping
// LZMA_IN_REQUIRED as 20 instead of
// 21.
if (rep0 == UINT32_MAX)
goto eopm;
}
}

Expand Down Expand Up @@ -948,31 +823,48 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
limit -= ALIGN_BITS;
assert(limit >= 2);
case SEQ_DIRECT:
do {
rc_direct_safe(rep0,
SEQ_DIRECT);
} while (--limit > 0);
rc_direct_safe(rep0, limit,
SEQ_DIRECT);

rep0 <<= ALIGN_BITS;
symbol = 1;

offset = 0;
symbol = 0;
offset = 1;
case SEQ_ALIGN:
do {
rc_bit_safe(coder->pos_align[
symbol], ,
rep0 += 1U << offset,
rc_bit_last_safe(
coder->pos_align[
offset
+ symbol],
,
symbol += offset,
SEQ_ALIGN);
} while (++offset < ALIGN_BITS);
offset <<= 1;
} while (offset < ALIGN_SIZE);

rep0 += symbol;

// End of payload marker
if (rep0 == UINT32_MAX) {
// End of payload marker was
// found. It may only be
// present if
// - uncompressed size is
// unknown or
// - after known uncompressed
// size amount of bytes has
// been decompressed and
// caller has indicated
// that EOPM might be used
// (it's not allowed in
// LZMA2).
eopm:
if (!eopm_is_valid) {
ret = LZMA_DATA_ERROR;
goto out;
}

case SEQ_EOPM:
// LZMA1 stream with
// end-of-payload marker.
rc_normalize_safe(SEQ_EOPM);
ret = rc_is_finished(rc)
? LZMA_STREAM_END
Expand Down
4 changes: 4 additions & 0 deletions src/liblzma/rangecoder/range_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@
///
/// I will be sticking to uint16_t unless some specific architectures
/// are *much* faster (20-50 %) with uint32_t.
///
/// Update in 2024: The branchless C and x86-64 assembly was written so that
/// probability is assumed to be uint16_t. (In contrast, LZMA SDK 23.01
/// assembly supports both types.)
typedef uint16_t probability;

#endif
Loading

0 comments on commit e0c0ee4

Please sign in to comment.