Merge branch 'hash' of git://ftp.sciencehorizons.net/linux

Pull string hash improvements from George Spelvin: "This series does several related things: - Makes the dcache hash (fs/namei.c) useful for general kernel use. (Thanks to Bruce for noticing the zero-length corner case) - Converts the string hashes in <linux/sunrpc/svcauth.h> to use the above. - Avoids 64-bit multiplies in hash_64() on 32-bit platforms. Two 32-bit multiplies will do well enough. - Rids the world of the bad hash multipliers in hash_32. This finishes the job started in commit 689de1d ("Minimal fix-up of bad hashing behavior of hash_64()") The vast majority of Linux architectures have hardware support for 32x32-bit multiply and so derive no benefit from "simplified" multipliers. The few processors that do not (68000, h8/300 and some models of Microblaze) have arch-specific implementations added. Those patches are last in the series. - Overhauls the dcache hash mixing. The patch in commit 0fed3ac ("namei: Improve hash mixing if CONFIG_DCACHE_WORD_ACCESS") was an off-the-cuff suggestion. Replaced with a much more careful design that's simultaneously faster and better. (My own invention, as there was noting suitable in the literature I could find. Comments welcome!) - Modify the hash_name() loop to skip the initial HASH_MIX(). This would let us salt the hash if we ever wanted to. - Sort out partial_name_hash(). The hash function is declared as using a long state, even though it's truncated to 32 bits at the end and the extra internal state contributes nothing to the result. And some callers do odd things: - fs/hfs/string.c only allocates 32 bits of state - fs/hfsplus/unicode.c uses it to hash 16-bit unicode symbols not bytes - Modify bytemask_from_count to handle inputs of 1..sizeof(long) rather than 0..sizeof(long)-1. This would simplify users other than full_name_hash" Special thanks to Bruce Fields for testing and finding bugs in v1. (I learned some humbling lessons about "obviously correct" code.) On the arch-specific front, the m68k assembly has been tested in a standalone test harness, I've been in contact with the Microblaze maintainers who mostly don't care, as the hardware multiplier is never omitted in real-world applications, and I haven't heard anything from the H8/300 world" * 'hash' of git://ftp.sciencehorizons.net/linux: h8300: Add <asm/hash.h> microblaze: Add <asm/hash.h> m68k: Add <asm/hash.h> <linux/hash.h>: Add support for architecture-specific functions fs/namei.c: Improve dcache hash function Eliminate bad hash multipliers from hash_32() and hash_64() Change hash_64() return value to 32 bits <linux/sunrpc/svcauth.h>: Define hash_str() in terms of hashlen_string() fs/namei.c: Add hashlen_string() function Pull out string hash to <linux/stringhash.h>
oohal · May 28, 2016 · 7e0fb73 · 7e0fb73
2 parents 4e8440b + 4684fe9
commit 7e0fb73
Show file tree

Hide file tree

Showing 17 changed files with 734 additions and 150 deletions.
diff --git a/arch/Kconfig b/arch/Kconfig
@@ -598,6 +598,14 @@ config HAVE_STACK_VALIDATION
 	  Architecture supports the 'objtool check' host tool command, which
 	  performs compile-time stack metadata validation.
 
+config HAVE_ARCH_HASH
+	bool
+	default n
+	help
+	  If this is set, the architecture provides an <asm/hash.h>
+	  file which provides platform-specific implementations of some
+	  functions in <linux/hash.h> or fs/namei.c.
+
 #
 # ABI hall of shame
 #

diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_HASH
 	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK

diff --git a/arch/h8300/include/asm/hash.h b/arch/h8300/include/asm/hash.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * The later H8SX models have a 32x32-bit multiply, but the H8/300H
+ * and H8S have only 16x16->32.  Since it's tolerably compact, this is
+ * basically an inlined version of the __mulsi3 code.  Since the inputs
+ * are not expected to be small, it's also simplfied by skipping the
+ * early-out checks.
+ *
+ * (Since neither CPU has any multi-bit shift instructions, a
+ * shift-and-add version is a non-starter.)
+ *
+ * TODO: come up with an arch-specific version of the hashing in fs/namei.c,
+ * since that is heavily dependent on rotates.  Which, as mentioned, suck
+ * horribly on H8.
+ */
+
+#if defined(CONFIG_CPU_H300H) || defined(CONFIG_CPU_H8S)
+
+#define HAVE_ARCH__HASH_32 1
+
+/*
+ * Multiply by k = 0x61C88647.  Fitting this into three registers requires
+ * one extra instruction, but reducing register pressure will probably
+ * make that back and then some.
+ *
+ * GCC asm note: %e1 is the high half of operand %1, while %f1 is the
+ * low half.  So if %1 is er4, then %e1 is e4 and %f1 is r4.
+ *
+ * This has been designed to modify x in place, since that's the most
+ * common usage, but preserve k, since hash_64() makes two calls in
+ * quick succession.
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+	u32 temp;
+
+	asm(   "mov.w	%e1,%f0"
+	"\n	mulxu.w	%f2,%0"		/* klow * xhigh */
+	"\n	mov.w	%f0,%e1"	/* The extra instruction */
+	"\n	mov.w	%f1,%f0"
+	"\n	mulxu.w	%e2,%0"		/* khigh * xlow */
+	"\n	add.w	%e1,%f0"
+	"\n	mulxu.w	%f2,%1"		/* klow * xlow */
+	"\n	add.w	%f0,%e1"
+	: "=&r" (temp), "=r" (x)
+	: "%r" (GOLDEN_RATIO_32), "1" (x));
+	return x;
+}
+
+#endif
+#endif /* _ASM_HASH_H */
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
@@ -41,6 +41,7 @@ config M68000
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
 	select CPU_NO_EFFICIENT_FFS
+	select HAVE_ARCH_HASH
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as

diff --git a/arch/m68k/include/asm/hash.h b/arch/m68k/include/asm/hash.h
@@ -0,0 +1,59 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * If CONFIG_M68000=y (original mc68000/010), this file is #included
+ * to work around the lack of a MULU.L instruction.
+ */
+
+#define HAVE_ARCH__HASH_32 1
+/*
+ * While it would be legal to substitute a different hash operation
+ * entirely, let's keep it simple and just use an optimized multiply
+ * by GOLDEN_RATIO_32 = 0x61C88647.
+ *
+ * The best way to do that appears to be to multiply by 0x8647 with
+ * shifts and adds, and use mulu.w to multiply the high half by 0x61C8.
+ *
+ * Because the 68000 has multi-cycle shifts, this addition chain is
+ * chosen to minimise the shift distances.
+ *
+ * Despite every attempt to spoon-feed it simple operations, GCC
+ * 6.1.1 doggedly insists on doing annoying things like converting
+ * "lsl.l #2,<reg>" (12 cycles) to two adds (8+8 cycles).
+ *
+ * It also likes to notice two shifts in a row, like "a = x << 2" and
+ * "a <<= 7", and convert that to "a = x << 9".  But shifts longer
+ * than 8 bits are extra-slow on m68k, so that's a lose.
+ *
+ * Since the 68000 is a very simple in-order processor with no
+ * instruction scheduling effects on execution time, we can safely
+ * take it out of GCC's hands and write one big asm() block.
+ *
+ * Without calling overhead, this operation is 30 bytes (14 instructions
+ * plus one immediate constant) and 166 cycles.
+ *
+ * (Because %2 is fetched twice, it can't be postincrement, and thus it
+ * can't be a fully general "g" or "m".  Register is preferred, but
+ * offsettable memory or immediate will work.)
+ */
+static inline u32 __attribute_const__ __hash_32(u32 x)
+{
+	u32 a, b;
+
+	asm(   "move.l %2,%0"	/* a = x * 0x0001 */
+	"\n	lsl.l #2,%0"	/* a = x * 0x0004 */
+	"\n	move.l %0,%1"
+	"\n	lsl.l #7,%0"	/* a = x * 0x0200 */
+	"\n	add.l %2,%0"	/* a = x * 0x0201 */
+	"\n	add.l %0,%1"	/* b = x * 0x0205 */
+	"\n	add.l %0,%0"	/* a = x * 0x0402 */
+	"\n	add.l %0,%1"	/* b = x * 0x0607 */
+	"\n	lsl.l #5,%0"	/* a = x * 0x8040 */
+	: "=&d,d" (a), "=&r,r" (b)
+	: "r,roi?" (x));	/* a+b = x*0x8647 */
+
+	return ((u16)(x*0x61c8) << 16) + a + b;
+}
+
+#endif	/* _ASM_HASH_H */
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
@@ -16,6 +16,7 @@ config MICROBLAZE
 	select GENERIC_IRQ_SHOW
 	select GENERIC_PCI_IOMAP
 	select GENERIC_SCHED_CLOCK
+	select HAVE_ARCH_HASH
 	select HAVE_ARCH_KGDB
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_API_DEBUG

diff --git a/arch/microblaze/include/asm/hash.h b/arch/microblaze/include/asm/hash.h
@@ -0,0 +1,81 @@
+#ifndef _ASM_HASH_H
+#define _ASM_HASH_H
+
+/*
+ * Fortunately, most people who want to run Linux on Microblaze enable
+ * both multiplier and barrel shifter, but omitting them is technically
+ * a supported configuration.
+ *
+ * With just a barrel shifter, we can implement an efficient constant
+ * multiply using shifts and adds.  GCC can find a 9-step solution, but
+ * this 6-step solution was found by Yevgen Voronenko's implementation
+ * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html.
+ *
+ * That software is really not designed for a single multiplier this large,
+ * but if you run it enough times with different seeds, it'll find several
+ * 6-shift, 6-add sequences for computing x * 0x61C88647.  They are all
+ *	c = (x << 19) + x;
+ *	a = (x <<  9) + c;
+ *	b = (x << 23) + a;
+ *	return (a<<11) + (b<<6) + (c<<3) - b;
+ * with variations on the order of the final add.
+ *
+ * Without even a shifter, it's hopless; any hash function will suck.
+ */
+
+#if CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL == 0
+
+#define HAVE_ARCH__HASH_32 1
+
+/* Multiply by GOLDEN_RATIO_32 = 0x61C88647 */
+static inline u32 __attribute_const__ __hash_32(u32 a)
+{
+#if CONFIG_XILINX_MICROBLAZE0_USE_BARREL
+	unsigned int b, c;
+
+	/* Phase 1: Compute three intermediate values */
+	b =  a << 23;
+	c = (a << 19) + a;
+	a = (a <<  9) + c;
+	b += a;
+
+	/* Phase 2: Compute (a << 11) + (b << 6) + (c << 3) - b */
+	a <<= 5;
+	a += b;		/* (a << 5) + b */
+	a <<= 3;
+	a += c;		/* (a << 8) + (b << 3) + c */
+	a <<= 3;
+	return a - b;	/* (a << 11) + (b << 6) + (c << 3) - b */
+#else
+	/*
+	 * "This is really going to hurt."
+	 *
+	 * Without a barrel shifter, left shifts are implemented as
+	 * repeated additions, and the best we can do is an optimal
+	 * addition-subtraction chain.  This one is not known to be
+	 * optimal, but at 37 steps, it's decent for a 31-bit multiplier.
+	 *
+	 * Question: given its size (37*4 = 148 bytes per instance),
+	 * and slowness, is this worth having inline?
+	 */
+	unsigned int b, c, d;
+
+	b = a << 4;	/* 4    */
+	c = b << 1;	/* 1  5 */
+	b += a;		/* 1  6 */
+	c += b;		/* 1  7 */
+	c <<= 3;	/* 3 10 */
+	c -= a;		/* 1 11 */
+	d = c << 7;	/* 7 18 */
+	d += b;		/* 1 19 */
+	d <<= 8;	/* 8 27 */
+	d += a;		/* 1 28 */
+	d <<= 1;	/* 1 29 */
+	d += b;		/* 1 30 */
+	d <<= 6;	/* 6 36 */
+	return d + c;	/* 1 37 total instructions*/
+#endif
+}
+
+#endif /* !CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL */
+#endif /* _ASM_HASH_H */
diff --git a/drivers/media/usb/dvb-usb-v2/af9015.c b/drivers/media/usb/dvb-usb-v2/af9015.c
@@ -398,6 +398,8 @@ static int af9015_download_firmware(struct dvb_usb_device *d,
 }
 
 #define AF9015_EEPROM_SIZE 256
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
 
 /* hash (and dump) eeprom */
 static int af9015_eeprom_hash(struct dvb_usb_device *d)

diff --git a/fs/dcache.c b/fs/dcache.c
@@ -1670,8 +1670,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
 	struct qstr q;
 
 	q.name = name;
-	q.len = strlen(name);
-	q.hash = full_name_hash(q.name, q.len);
+	q.hash_len = hashlen_string(name);
 	return d_alloc(parent, &q);
 }
 EXPORT_SYMBOL(d_alloc_name);