Skip to content

Commit

Permalink
Btrfs: heuristic: add byte set calculation
Browse files Browse the repository at this point in the history
Calculate byte set size for data sample:
- calculate how many unique bytes have been in the sample
- for all bytes count > 0, check if we're still in the low count range
  (~25%), such data are easily compressible, otherwise further analysis
  is needed

Signed-off-by: Timofey Titovets <[email protected]>
Reviewed-by: David Sterba <[email protected]>
[ update comments ]
Signed-off-by: David Sterba <[email protected]>
  • Loading branch information
nefelim4ag authored and kdave committed Nov 1, 2017
1 parent 1fe4f6f commit a288e92
Showing 1 changed file with 45 additions and 0 deletions.
45 changes: 45 additions & 0 deletions fs/btrfs/compression.c
Original file line number Diff line number Diff line change
Expand Up @@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
return 1;
}

/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc),
 * because in most text-like data the byte set is restricted to a limited
 * number of possible characters, and that restriction in most cases makes
 * the data easy to compress.
 *
 * @BYTE_SET_THRESHOLD - cutoff for the number of distinct byte values seen
 * in the sample:
 *   fewer than the threshold - data is considered compressible
 *   threshold or more        - additional analysis is needed
 */
#define BYTE_SET_THRESHOLD (64)

/*
 * Return the number of buckets in @ws with a non-zero count, i.e. how many
 * distinct byte values were seen in the sample.
 *
 * The scan stops as soon as the running total exceeds BYTE_SET_THRESHOLD, so
 * the result is exact only while it is <= BYTE_SET_THRESHOLD; a larger value
 * just means "over the threshold".
 */
static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 idx = 0;
	u32 distinct = 0;

	/*
	 * The first BYTE_SET_THRESHOLD buckets can never push the total past
	 * the threshold on their own, so no early-exit check is needed here.
	 */
	while (idx < BYTE_SET_THRESHOLD) {
		if (ws->bucket[idx].count > 0)
			distinct++;
		idx++;
	}

	/*
	 * Keep counting over the remaining buckets. Once the byte set size is
	 * bigger than the threshold it is pointless to continue: the detection
	 * technique would fail for this type of data.
	 */
	for (; idx < BUCKET_SIZE; idx++) {
		if (!(ws->bucket[idx].count > 0))
			continue;

		distinct++;
		if (distinct > BYTE_SET_THRESHOLD)
			break;
	}

	return distinct;
}

static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
const u32 half_of_sample = ws->sample_size / 2;
Expand Down Expand Up @@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
ws->bucket[byte].count++;
}

i = byte_set_size(ws);
if (i < BYTE_SET_THRESHOLD) {
ret = 2;
goto out;
}

out:
__free_workspace(0, ws_list, true);
return ret;
Expand Down

0 comments on commit a288e92

Please sign in to comment.