Skip to content

Commit

Permalink
improve diff-delta with sparse and/or repetitive data
Browse files Browse the repository at this point in the history
It is useless to preserve multiple hash entries for consecutive blocks
with the same hash.  Keeping only the first one will allow for matching
the longest string of identical bytes while subsequent blocks will only
allow for shorter matches.  The backward matching code will match the
end of it as necessary.

This improves both performance (no repeated string comparisons over long
successions of identical bytes, or even small groups of bytes) and
compression (random hash bucket entry culling is less likely to be
needed), especially with sparse files.

With well-behaved data sets this patch doesn't change much.

Signed-off-by: Nicolas Pitre <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
  • Loading branch information
Nicolas Pitre authored and Junio C Hamano committed May 3, 2006
1 parent 2d08e5d commit 06a9f92
Showing 1 changed file with 27 additions and 13 deletions.
40 changes: 27 additions & 13 deletions diff-delta.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,12 @@ struct delta_index {

struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
{
unsigned int i, hsize, hmask, entries, *hash_count;
unsigned int i, hsize, hmask, entries, prev_val, *hash_count;
const unsigned char *data, *buffer = buf;
struct delta_index *index;
struct index_entry *entry, **hash;
void *mem;
unsigned long memsize;

if (!buf || !bufsize)
return NULL;
Expand All @@ -155,9 +156,10 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
hmask = hsize - 1;

/* allocate lookup index */
mem = malloc(sizeof(*index) +
sizeof(*hash) * hsize +
sizeof(*entry) * entries);
memsize = sizeof(*index) +
sizeof(*hash) * hsize +
sizeof(*entry) * entries;
mem = malloc(memsize);
if (!mem)
return NULL;
index = mem;
Expand All @@ -179,18 +181,26 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
}

/* then populate the index */
data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
while (data >= buffer) {
prev_val = ~0;
for (data = buffer + entries * RABIN_WINDOW - RABIN_WINDOW;
data >= buffer;
data -= RABIN_WINDOW) {
unsigned int val = 0;
for (i = 1; i <= RABIN_WINDOW; i++)
val = ((val << 8) | data[i]) ^ T[val >> RABIN_SHIFT];
i = val & hmask;
entry->ptr = data + RABIN_WINDOW;
entry->val = val;
entry->next = hash[i];
hash[i] = entry++;
hash_count[i]++;
data -= RABIN_WINDOW;
if (val == prev_val) {
/* keep the lowest of consecutive identical blocks */
entry[-1].ptr = data + RABIN_WINDOW;
} else {
prev_val = val;
i = val & hmask;
entry->ptr = data + RABIN_WINDOW;
entry->val = val;
entry->next = hash[i];
hash[i] = entry++;
hash_count[i]++;
entries--;
}
}

/*
Expand Down Expand Up @@ -220,6 +230,10 @@ struct delta_index * create_delta_index(const void *buf, unsigned long bufsize)
}
free(hash_count);

/* If we didn't use all hash entries, free the unused memory. */
if (entries)
index = realloc(index, memsize - entries * sizeof(*entry));

return index;
}

Expand Down

0 comments on commit 06a9f92

Please sign in to comment.