author     Christophe Leroy <christophe.leroy@c-s.fr>   2015-09-22 16:34:32 +0200
committer  Scott Wood <oss@buserror.net>                2016-03-04 23:03:45 -0600
commit     f867d556dd8525fe6ff0d22a34249528e590f994
tree       32ebba9cfc1b00d1f394b480d5cfab443382864e /arch/powerpc/lib
parent     48821a34b1bdc5d89505cb814b3f7c166940f200
powerpc32: optimise csum_partial() loop
On the 8xx, load latency is 2 cycles and taking branches also takes 2 cycles. So let's unroll the loop.

This patch improves csum_partial() speed by around 10% on both:
* 8xx (single issue processor with parallel execution)
* 83xx (superscalar 6xx processor with dual instruction fetch and parallel execution)

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>
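To make the unrolling idea concrete, here is a minimal, standalone C sketch of the same structure as the patched assembly: the leftover (len % 4) words are summed one at a time first, then the remaining words are summed four per loop iteration, so the loop branch is taken a quarter as often and each iteration carries several independent loads. This is not the kernel implementation; the function names are made up for illustration, and a 64-bit accumulator stands in for the hardware carry chain that the adde instructions use in the real code.

#include <stdint.h>
#include <stdio.h>

/* Fold accumulated carries back into the low 32 bits (twice is enough). */
static uint32_t fold(uint64_t sum)
{
	sum = (sum & 0xffffffffu) + (sum >> 32);
	sum = (sum & 0xffffffffu) + (sum >> 32);
	return (uint32_t)sum;
}

/* Simple loop: one word per iteration, as before the patch. */
static uint32_t csum_words_simple(const uint32_t *p, size_t nwords, uint32_t sum)
{
	uint64_t acc = sum;

	for (size_t i = 0; i < nwords; i++)
		acc += p[i];
	return fold(acc);
}

/* Unrolled loop: leftover words first, then blocks of four words. */
static uint32_t csum_words_unrolled(const uint32_t *p, size_t nwords, uint32_t sum)
{
	uint64_t acc = sum;
	size_t leftover = nwords & 3;		/* mirrors "andi. r6,r6,3" */

	for (size_t i = 0; i < leftover; i++)
		acc += *p++;
	for (size_t i = 0; i < nwords / 4; i++) {	/* mirrors "srwi. r6,r4,4" */
		acc += p[0];
		acc += p[1];
		acc += p[2];
		acc += p[3];
		p += 4;
	}
	return fold(acc);
}

int main(void)
{
	uint32_t buf[11];

	for (size_t i = 0; i < 11; i++)
		buf[i] = 0x01010101u * (uint32_t)(i + 1);

	/* Both variants must produce the same partial sum. */
	printf("simple:   %08x\n", (unsigned)csum_words_simple(buf, 11, 0));
	printf("unrolled: %08x\n", (unsigned)csum_words_unrolled(buf, 11, 0));
	return 0;
}

Per the commit message, the win comes from paying the 2-cycle branch cost once per four words instead of once per word, while the independent loads in each block give the core work to overlap with the 2-cycle load latency.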
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--   arch/powerpc/lib/checksum_32.S   16
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9c126028ab9c..0d34f47c8a5e 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -38,10 +38,24 @@ _GLOBAL(csum_partial)
srwi. r6,r4,2 /* # words to do */
adde r5,r5,r0
beq 3f
-1: mtctr r6
+1: andi. r6,r6,3 /* Prepare to handle words 4 by 4 */
+ beq 21f
+ mtctr r6
2: lwzu r0,4(r3)
adde r5,r5,r0
bdnz 2b
+21: srwi. r6,r4,4 /* # blocks of 4 words to do */
+ beq 3f
+ mtctr r6
+22: lwz r0,4(r3)
+ lwz r6,8(r3)
+ lwz r7,12(r3)
+ lwzu r8,16(r3)
+ adde r5,r5,r0
+ adde r5,r5,r6
+ adde r5,r5,r7
+ adde r5,r5,r8
+ bdnz 22b
3: andi. r0,r4,2
beq+ 4f
lhz r0,4(r3)