summaryrefslogtreecommitdiff
path: root/arch/x86/include/asm/xor_32.h
diff options
context:
space:
mode:
authorJan Beulich <JBeulich@suse.com>2012-11-02 14:20:24 +0000
committerIngo Molnar <mingo@kernel.org>2013-01-25 09:23:50 +0100
commitf317820cb6ee3fb173319bf76e0e62437be78ad2 (patch)
treefc57358da4ba9f11a8d80e508d01e99c2c62c1f9 /arch/x86/include/asm/xor_32.h
parente8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (diff)
x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line
On CPUs with 64-byte last level cache lines, this yields roughly 10% better performance, independent of CPU vendor or specific model (as far as I was able to test). Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86/include/asm/xor_32.h')
-rw-r--r--arch/x86/include/asm/xor_32.h23
1 files changed, 11 insertions, 12 deletions
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87f3cc7..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
/* Also try the generic routines. */
#include <asm-generic/xor.h>
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
- xor_speed(&xor_block_8regs); \
- xor_speed(&xor_block_8regs_p); \
- xor_speed(&xor_block_32regs); \
- xor_speed(&xor_block_32regs_p); \
AVX_XOR_SPEED; \
- if (cpu_has_xmm) \
+ if (cpu_has_xmm) { \
xor_speed(&xor_block_pIII_sse); \
- if (cpu_has_mmx) { \
+ xor_speed(&xor_block_sse_pf64); \
+ } else if (cpu_has_mmx) { \
xor_speed(&xor_block_pII_mmx); \
xor_speed(&xor_block_p5_mmx); \
+ } else { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
} \
} while (0)
-/* We force the use of the SSE xor block because it can write around L2.
- We may also be able to load into the L1 only depending on how the cpu
- deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
- AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
#endif /* _ASM_X86_XOR_32_H */