Skip to content

Commit

Permalink
hash: Introduce an implementation of murmurhash.
Browse files Browse the repository at this point in the history
Murmurhash is generally superior to the Jenkins lookup3 hash according to
the available figures.  Perhaps we should generally replace our current
hashes by murmurhash.

For now, I'm introducing a parallel implementation to allow it to be used
in cases where an incremental one-word-at-a-time hash is desirable.  The
first user will be added in an upcoming commit.

Signed-off-by: Ben Pfaff <[email protected]>
Acked-by: Ethan Jackson <[email protected]>
  • Loading branch information
blp committed Sep 4, 2012
1 parent 0ee140f commit 9879b94
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 24 deletions.
15 changes: 15 additions & 0 deletions lib/hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,18 @@ hash_bytes(const void *p_, size_t n, uint32_t basis)

return c;
}

/* Returns the hash of the 'n' 32-bit words at 'p', starting from 'basis'.
* 'p' must be properly aligned. */
uint32_t
mhash_words(const uint32_t p[], size_t n_words, uint32_t basis)
{
uint32_t hash;
size_t i;

hash = basis;
for (i = 0; i < n_words; i++) {
hash = mhash_add(hash, p[i]);
}
return mhash_finish(hash, n_words);
}
42 changes: 42 additions & 0 deletions lib/hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,48 @@ static inline uint32_t hash_pointer(const void *p, uint32_t basis)
return hash_int((uint32_t) (uintptr_t) p, basis);
}

/* Murmurhash by Austin Appleby,
* from http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp.
*
* The upstream license there says:
*
* // MurmurHash3 was written by Austin Appleby, and is placed in the public
* // domain. The author hereby disclaims copyright to this source code.
*
* Murmurhash is faster and higher-quality than the Jenkins lookup3 hash. When
* we have a little more familiarity with it, it's probably a good idea to
* switch all of OVS to it.
*
* For now, we have this implementation here for use by code that needs a hash
* that is convenient for use one word at a time, since the Jenkins lookup3
* hash works three words at a time.
*
* See mhash_words() for sample usage. */

uint32_t mhash_words(const uint32_t data[], size_t n_words, uint32_t basis);

static inline uint32_t mhash_add(uint32_t hash, uint32_t data)
{
data *= 0xcc9e2d51;
data = hash_rot(data, 15);
data *= 0x1b873593;

hash ^= data;
hash = hash_rot(hash, 13);
return hash * 5 + 0xe6546b64;
}

static inline uint32_t mhash_finish(uint32_t hash, size_t n)
{
hash ^= n * 4;
hash ^= hash_rot(hash, 16);
hash *= 0x85ebca6b;
hash ^= hash_rot(hash, 13);
hash *= 0xc2b2ae35;
hash ^= hash_rot(hash, 16);
return hash;
}

#ifdef __cplusplus
}
#endif
Expand Down
62 changes: 38 additions & 24 deletions tests/test-hash.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2009 Nicira, Inc.
* Copyright (c) 2009, 2012 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -40,6 +40,12 @@ hash_words_cb(uint32_t input)
return hash_words(&input, 1, 0);
}

static uint32_t
mhash_words_cb(uint32_t input)
{
return mhash_words(&input, 1, 0);
}

static uint32_t
hash_int_cb(uint32_t input)
{
Expand Down Expand Up @@ -76,11 +82,37 @@ check_word_hash(uint32_t (*hash)(uint32_t), const char *name,
}
}

int
main(void)
static void
check_3word_hash(uint32_t (*hash)(const uint32_t[], size_t, uint32_t),
const char *name)
{
int i, j;

for (i = 0; i <= 96; i++) {
for (j = i + 1; j <= 96; j++) {
uint32_t in1[3], in2[3];
uint32_t out1, out2;
const int min_unique = 12;
const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1;

set_bit(in1, i);
set_bit(in2, j);
out1 = hash(in1, 3, 0);
out2 = hash(in2, 3, 0);
if ((out1 & unique_mask) == (out2 & unique_mask)) {
printf("%s has a partial collision:\n", name);
printf("hash(1 << %d) == %08"PRIx32"\n", i, out1);
printf("hash(1 << %d) == %08"PRIx32"\n", j, out2);
printf("The low-order %d bits of output are both "
"0x%"PRIx32"\n", min_unique, out1 & unique_mask);
}
}
}
}

int
main(void)
{
/* Check that all hashes computed with hash_words with one 1-bit (or no
* 1-bits) set within a single 32-bit word have different values in all
* 11-bit consecutive runs.
Expand All @@ -98,6 +130,7 @@ main(void)
* independence must be a bad assumption :-)
*/
check_word_hash(hash_words_cb, "hash_words", 11);
check_word_hash(mhash_words_cb, "mhash_words", 11);

/* Check that all hash functions of with one 1-bit (or no 1-bits) set
* within three 32-bit words have different values in their lowest 12
Expand All @@ -112,27 +145,8 @@ main(void)
*
* so we are doing pretty well to not have any collisions in 12 bits.
*/
for (i = 0; i <= 96; i++) {
for (j = i + 1; j <= 96; j++) {
uint32_t in1[3], in2[3];
uint32_t out1, out2;
const int min_unique = 12;
const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1;

set_bit(in1, i);
set_bit(in2, j);
out1 = hash_words(in1, 3, 0);
out2 = hash_words(in2, 3, 0);
if ((out1 & unique_mask) == (out2 & unique_mask)) {
printf("Partial collision:\n");
printf("hash(1 << %d) == %08"PRIx32"\n", i, out1);
printf("hash(1 << %d) == %08"PRIx32"\n", j, out2);
printf("The low-order %d bits of output are both "
"0x%"PRIx32"\n", min_unique, out1 & unique_mask);
exit(1);
}
}
}
check_3word_hash(hash_words, "hash_words");
check_3word_hash(mhash_words, "mhash_words");

/* Check that all hashes computed with hash_int with one 1-bit (or no
* 1-bits) set within a single 32-bit word have different values in all
Expand Down

0 comments on commit 9879b94

Please sign in to comment.