Skip to content

Commit

Permalink
x86: Add optimized popcnt variants
Browse files Browse the repository at this point in the history
Add support for the hardware version of the Hamming weight function,
popcnt, present in CPUs which advertise it under CPUID, Function
0x0000_0001_ECX[23]. On CPUs which don't support it, we fall back to the
default lib/hweight.c software versions.

A synthetic benchmark comparing popcnt with __sw_hweight64 showed almost
a 3x speedup on a F10h machine.

Signed-off-by: Borislav Petkov <[email protected]>
LKML-Reference: <20100318112015.GC11152@aftab>
Signed-off-by: H. Peter Anvin <[email protected]>
  • Loading branch information
Borislav Petkov authored and H. Peter Anvin committed Apr 6, 2010
1 parent 1527bc8 commit d61931d
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 18 deletions.
5 changes: 5 additions & 0 deletions arch/x86/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,11 @@ config X86_32_LAZY_GS
def_bool y
depends on X86_32 && !CC_STACKPROTECTOR

config ARCH_HWEIGHT_CFLAGS
string
default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64

config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
Expand Down
9 changes: 6 additions & 3 deletions arch/x86/include/asm/alternative.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@
#define LOCK_PREFIX ""
#endif

/* This must be included *after* the definition of LOCK_PREFIX */
#include <asm/cpufeature.h>

struct alt_instr {
u8 *instr; /* original instruction */
u8 *replacement;
Expand Down Expand Up @@ -95,6 +92,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
".previous"

/*
* This must be included *after* the definition of ALTERNATIVE due to
* <asm/arch_hweight.h>
*/
#include <asm/cpufeature.h>

/*
* Alternative instructions for different CPU types or capabilities.
*
Expand Down
59 changes: 59 additions & 0 deletions arch/x86/include/asm/arch_hweight.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H

#ifdef CONFIG_64BIT
/* popcnt %rdi, %rax */
#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
#define REG_IN "D"
#define REG_OUT "a"
#else
/* popcnt %eax, %eax */
#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0"
#define REG_IN "a"
#define REG_OUT "a"
#endif

/*
* __sw_hweightXX are called from within the alternatives below
* and callee-clobbered registers need to be taken care of. See
* ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
* compiler switches.
*/
/*
 * __arch_hweight32 - population count (number of set bits) of @w.
 *
 * Patched at runtime via the alternatives mechanism: CPUs advertising
 * X86_FEATURE_POPCNT run the hardware POPCNT instruction (the raw byte
 * sequence in the POPCNT macro above); all others take the call to the
 * lib/hweight.c software fallback __sw_hweight32.
 */
static inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res = 0;

/*
 * REG_IN/REG_OUT pin the operands to the registers hard-coded in the
 * POPCNT byte encoding (see the #ifdef block above).  No clobber list:
 * the __sw_hweight* fallbacks are compiled with the -fcall-saved-*
 * switches from ARCH_HWEIGHT_CFLAGS (arch/x86/Kconfig), so the normally
 * call-clobbered registers are preserved across the call.
 */
asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT)
: "="REG_OUT (res)
: REG_IN (w));

return res;
}

/* Population count of the low 16 bits of @w (upper bits ignored). */
static inline unsigned int __arch_hweight16(unsigned int w)
{
unsigned int low16 = w & 0xffff;

return __arch_hweight32(low16);
}

/* Population count of the low 8 bits of @w (upper bits ignored). */
static inline unsigned int __arch_hweight8(unsigned int w)
{
unsigned int low8 = w & 0xff;

return __arch_hweight32(low8);
}

/*
 * __arch_hweight64 - population count of the 64-bit value @w.
 *
 * 32-bit kernels have no 64-bit POPCNT, so the result is the sum of
 * two 32-bit counts over the halves (early return; 'res' is unused on
 * that path).  On 64-bit, same ALTERNATIVE scheme as __arch_hweight32,
 * with the 64-bit POPCNT encoding and the __sw_hweight64 fallback.
 */
static inline unsigned long __arch_hweight64(__u64 w)
{
unsigned long res = 0;

#ifdef CONFIG_X86_32
return __arch_hweight32((u32)w) +
__arch_hweight32((u32)(w >> 32));
#else
asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT)
: "="REG_OUT (res)
: REG_IN (w));
#endif /* CONFIG_X86_32 */

return res;
}

#endif
4 changes: 3 additions & 1 deletion arch/x86/include/asm/bitops.h
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,9 @@ static inline int fls(int x)

#define ARCH_HAS_FAST_MULTIPLIER 1

#include <asm-generic/bitops/hweight.h>
#include <asm/arch_hweight.h>

#include <asm-generic/bitops/const_hweight.h>

#endif /* __KERNEL__ */

Expand Down
22 changes: 18 additions & 4 deletions include/asm-generic/bitops/arch_hweight.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,23 @@

#include <asm/types.h>

extern unsigned int __arch_hweight32(unsigned int w);
extern unsigned int __arch_hweight16(unsigned int w);
extern unsigned int __arch_hweight8(unsigned int w);
extern unsigned long __arch_hweight64(__u64 w);
/*
 * Generic fallback: forward to the lib/hweight.c software count.
 * Must be 'static inline': a plain 'inline' definition in a header
 * emits no out-of-line symbol (and its linkage differs between gnu89
 * and C99 inline semantics), breaking the link whenever the compiler
 * declines to inline a call.
 */
static inline unsigned int __arch_hweight32(unsigned int w)
{
return __sw_hweight32(w);
}

/*
 * Generic 16-bit fallback; 'static inline' required for correct
 * header linkage (plain 'inline' emits no standalone definition).
 */
static inline unsigned int __arch_hweight16(unsigned int w)
{
return __sw_hweight16(w);
}

/*
 * Generic 8-bit fallback; 'static inline' required for correct
 * header linkage (plain 'inline' emits no standalone definition).
 */
static inline unsigned int __arch_hweight8(unsigned int w)
{
return __sw_hweight8(w);
}

/*
 * Generic 64-bit fallback; 'static inline' required for correct
 * header linkage (plain 'inline' emits no standalone definition).
 */
static inline unsigned long __arch_hweight64(__u64 w)
{
return __sw_hweight64(w);
}
#endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
3 changes: 3 additions & 0 deletions lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o

CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o

obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
obj-$(CONFIG_BTREE) += btree.o
obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
Expand Down
20 changes: 10 additions & 10 deletions lib/hweight.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* The Hamming Weight of a number is the total number of bits set in it.
*/

unsigned int __arch_hweight32(unsigned int w)
unsigned int __sw_hweight32(unsigned int w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x55555555;
Expand All @@ -24,30 +24,30 @@ unsigned int __arch_hweight32(unsigned int w)
return (res + (res >> 16)) & 0x000000FF;
#endif
}
EXPORT_SYMBOL(__arch_hweight32);
EXPORT_SYMBOL(__sw_hweight32);

/*
 * __sw_hweight16 - software population count of the 16-bit value @w.
 *
 * Parallel ("SWAR") bit counting: fold 2-bit pair counts, then 4-bit
 * nibble counts, then the final byte sum.  Branchless, table-free.
 * (The pre-rename __arch_hweight16 lines shown in the diff view are
 * the removed half of the hunk; this is the resolved post-commit form.)
 */
unsigned int __sw_hweight16(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x5555);	/* counts per 2-bit field */
res = (res & 0x3333) + ((res >> 2) & 0x3333);	/* counts per 4-bit field */
res = (res + (res >> 4)) & 0x0F0F;		/* counts per 8-bit field */
return (res + (res >> 8)) & 0x00FF;		/* total over both bytes */
}
EXPORT_SYMBOL(__sw_hweight16);

/*
 * __sw_hweight8 - software population count of the 8-bit value @w.
 *
 * Same SWAR folding as __sw_hweight16, one level shorter since the
 * nibble sum already covers the whole byte.
 * (The pre-rename __arch_hweight8 lines shown in the diff view are
 * the removed half of the hunk; this is the resolved post-commit form.)
 */
unsigned int __sw_hweight8(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x55);	/* counts per 2-bit field */
res = (res & 0x33) + ((res >> 2) & 0x33);	/* counts per 4-bit field */
return (res + (res >> 4)) & 0x0F;		/* total over the byte */
}
EXPORT_SYMBOL(__sw_hweight8);

unsigned long __arch_hweight64(__u64 w)
unsigned long __sw_hweight64(__u64 w)
{
#if BITS_PER_LONG == 32
return __arch_hweight32((unsigned int)(w >> 32)) +
__arch_hweight32((unsigned int)w);
return __sw_hweight32((unsigned int)(w >> 32)) +
__sw_hweight32((unsigned int)w);
#elif BITS_PER_LONG == 64
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x5555555555555555ul;
Expand All @@ -64,4 +64,4 @@ unsigned long __arch_hweight64(__u64 w)
#endif
#endif
}
EXPORT_SYMBOL(__arch_hweight64);
EXPORT_SYMBOL(__sw_hweight64);
4 changes: 4 additions & 0 deletions scripts/Makefile.lib
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,7 @@ quiet_cmd_lzo = LZO $@
cmd_lzo = (cat $(filter-out FORCE,$^) | \
lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
(rm -f $@ ; false)

# misc stuff
# ---------------------------------------------------------------------------
quote:="

0 comments on commit d61931d

Please sign in to comment.