patch-2.4.20 linux-2.4.20/arch/x86_64/lib/csum-copy.S


diff -urN linux-2.4.19/arch/x86_64/lib/csum-copy.S linux-2.4.20/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2002 Andi Kleen
+ *	
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+ 	#include <linux/linkage.h>
+	#include <asm/errno.h>
+
+//	#define FIX_ALIGNMENT 1
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 
+ * destination is zeroed.
+ * 
+ * Input
+ * rdi  source
+ * rsi  destination
+ * edx  len (32bit)
+ * ecx  sum (32bit) 
+ * r8   src_err_ptr (int)
+ * r9   dst_err_ptr (int)
+ *
+ * Output
+ * eax  64-bit sum. Undefined in case of an exception.
+ *
+ * Wrappers need to take care of providing a valid sum and zeroing the destination on exceptions.
+ */
+
+/* for now - should vary this based on direction */
+ #define prefetch prefetcht2
+ #define movnti   movq
+
+	.macro source
+10:
+	.section __ex_table,"a"
+	.align 8
+	.quad 10b,bad_source
+	.previous
+	.endm
+		
+	.macro dest
+20:
+	.section __ex_table,"a"
+	.align 8
+	.quad 20b,bad_dest
+	.previous
+	.endm
+			
+	.globl csum_partial_copy_generic
+	.p2align
+csum_partial_copy_generic:
+	prefetchnta (%rdi)
+	
+	pushq %rbx
+	pushq %r12
+	pushq %r14
+	pushq %r15
+	movq %r8,%r14
+	movq %r9,%r15
+	movl  %ecx,%eax
+	movl  %edx,%ecx
+
+#ifdef FIX_ALIGNMENT
+	/* align source to 8 bytes */	
+	movl %edi,%r8d
+	andl $7,%r8d
+	jnz  bad_alignment	
+after_bad_alignment:
+#endif
+
+	movl  $64,%r10d
+	xorl  %r9d,%r9d
+	movq  %rcx,%r12
+
+	shrq  $6,%r12
+	/* The loop counter is kept one less to allow cheap testing for the
+	   next-to-last iteration, which is needed to stop the prefetching. */
+	decq  %r12
+	js    handle_tail       /* < 64 */
+	jz    loop_no_prefetch  /* = 64 + X */ 
+	
+	/* main loop. clear in 64 byte blocks */
+	/* tries hard not to prefetch over the boundary */
+	/* r10:	64, r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+	/* r11:	temp3, rdx: temp4, r12 loopcnt */
+	.p2align
+loop:
+	/* Could prefetch more than one iteration ahead, but then it would be
+	   even trickier to avoid prefetching past the end of the buffer. The
+	   hardware prefetcher should take care of that anyway; the main point
+	   of this prefetch is the non-temporal hint to avoid cache pollution.
+	   Hopefully the hardware handles it properly. */
+	prefetchnta 64(%rdi) 
+	
+loop_no_prefetch:
+	source
+	movq  (%rdi),%rbx
+	source
+	movq  8(%rdi),%r8
+	source
+	movq  16(%rdi),%r11
+	source
+	movq  24(%rdi),%rdx
+
+	dest
+	movnti %rbx,(%rsi)
+	dest
+	movnti %r8,8(%rsi)
+	dest
+	movnti %r11,16(%rsi)
+	dest
+	movnti %rdx,24(%rsi)
+		
+	addq  %rbx,%rax
+	adcq  %r8,%rax
+	adcq  %r11,%rax
+	adcq  %rdx,%rax
+
+	source
+	movq  32(%rdi),%rbx
+	source
+	movq  40(%rdi),%r8
+	source
+	movq  48(%rdi),%r11
+	source
+	movq  56(%rdi),%rdx
+	
+	dest
+	movnti %rbx,32(%rsi)
+	dest
+	movnti %r8,40(%rsi)
+	dest
+	movnti %r11,48(%rsi)
+	dest
+	movnti %rdx,56(%rsi)
+
+	adcq %rbx,%rax
+	adcq %r8,%rax
+	adcq %r11,%rax
+	adcq %rdx,%rax
+	
+	adcq %r9,%rax	/* add in carry */
+	
+	addq %r10,%rdi		
+	addq %r10,%rsi
+
+	decq %r12
+	jz   loop_no_prefetch	/* previous to last iteration? */
+	jns  loop
+
+	/* handle the last up to 56 bytes */
+handle_tail:
+	/* ecx:	count */
+	movl %ecx,%r10d
+	andl $63,%ecx
+	shrl $3,%ecx
+	jz 	 fold
+	clc
+	movl $8,%edx
+loop_8:	
+	source
+	movq (%rdi),%rbx
+	adcq %rbx,%rax
+	dest
+	movnti %rbx,(%rsi)
+	leaq (%rsi,%rdx),%rsi /* preserve carry */
+	leaq (%rdi,%rdx),%rdi
+	decl %ecx
+	jnz	loop_8
+	adcq %r9,%rax	/* add in carry */
+
+fold:
+	movl %eax,%ebx
+	shrq $32,%rax
+	addq %rbx,%rax
+
+	/* handle the last up to 6 bytes */
+handle_7:
+	movl %r10d,%ecx
+	andl $7,%ecx
+	shrl $1,%ecx
+	jz   handle_1
+	movl $2,%edx
+	xorl %ebx,%ebx
+	clc  
+loop_1:	
+	source
+	movw (%rdi),%bx
+	adcq %rbx,%rax
+	dest
+	movw %bx,(%rsi)
+	addq %rdx,%rdi
+	addq %rdx,%rsi
+	decl %ecx
+	jnz loop_1
+	adcw %r9w,%ax	/* add in carry */
+	
+	/* handle last odd byte */
+handle_1:	
+	testl $1,%r10d
+	jz    ende
+	xorl  %ebx,%ebx
+	source
+	movb (%rdi),%bl
+	dest
+	movb %bl,(%rsi)
+	addw %bx,%ax
+	adcw %r9w,%ax		/* carry */
+			
+ende:		
+	sfence
+	popq %r15
+	popq %r14
+	popq %r12
+	popq %rbx
+	ret
+
+#ifdef FIX_ALIGNMENT
+	/* align source to 8 bytes. */
+	/* r8d: misalignment, ecx: len */
+bad_alignment:
+	testl $1,%edi
+	jnz   odd_source
+
+	/* compute distance to next aligned position */
+	movl $8,%r8d
+	xchgl %r8d,%ecx
+	subl %r8d,%ecx
+
+	/* handle unaligned part */
+	shrl $1,%ecx
+	xorl %ebx,%ebx	
+	movl $2,%r10d
+align_loop:
+	source
+	movw (%rdi),%bx
+	addq %rbx,%rax	/* carry cannot happen */
+	dest
+	movw %bx,(%rsi)
+	addq %r10,%rdi
+	addq %r10,%rsi
+	decl %ecx
+	jnz align_loop
+	jmp after_bad_alignment
+
+	/* Odd start address: every byte ends up in the wrong half of its
+	   16-bit checksum word, so the one's-complement sum has to be
+	   byte-swapped at the end. Handle it recursively as it should be rare. */
+odd_source:
+	/* copy odd byte */
+	xorl %ebx,%ebx
+	source
+	movb (%rdi),%bl
+	addl %ebx,%eax       /* add to old checksum */
+	adcl $0,%ecx
+	dest
+	movb %al,(%rsi)
+	
+	/* fix arguments */
+	movl %eax,%ecx
+	incq %rsi
+	incq %rdi
+	decq %rdx 
+	call csum_partial_copy_generic
+	bswap %eax        /* this should work, but check */
+	jmp ende
+#endif
+
+	/* Exception handlers. Very simple, zeroing is done in the wrappers */
+bad_source:
+	movl $-EFAULT,(%r14)
+	jmp  ende
+	
+bad_dest:
+	movl $-EFAULT,(%r15)
+	jmp ende
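
The header comment above leaves exception clean-up to the callers: on a fault the routine only stores -EFAULT through the r8/r9 error pointers and the returned sum is undefined, so a wrapper has to zero the destination and return something well formed itself. The sketch below is hypothetical; the function name, the NULL dst_err_ptr and the C prototype are inferred from the register list in the header (which follows the System V AMD64 argument order rdi, rsi, rdx, rcx, r8, r9) and it is not the kernel's actual wrapper code.

#include <string.h>

/* Prototype implied by the register list in the file header
 * (rdi = src, rsi = dst, edx = len, ecx = sum, r8/r9 = error pointers). */
extern unsigned int csum_partial_copy_generic(const char *src, char *dst,
					      unsigned int len, unsigned int sum,
					      int *src_err_ptr, int *dst_err_ptr);

/* Hypothetical wrapper sketch (name and policy are illustrative only).
 * The destination is assumed to be kernel memory that cannot fault, so
 * dst_err_ptr is NULL; on a faulting source read the routine stores
 * -EFAULT through errp and the wrapper wipes the partial copy. */
unsigned int copy_and_csum_sketch(const char *src, char *dst,
				  unsigned int len, unsigned int sum,
				  int *errp)
{
	*errp = 0;
	sum = csum_partial_copy_generic(src, dst, len, sum, errp, NULL);
	if (*errp)
		memset(dst, 0, len);	/* zero any partially copied data */
	return sum;
}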

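For readers following the adcq chains, the arithmetic itself (64-bit one's-complement accumulation with end-around carry in the main loop, 16-bit and single-byte tails, then the 64-to-32-bit fold at the fold: label) can be modelled in portable C as below. This is an illustrative little-endian sketch with made-up names, not the kernel implementation and not a copy routine; the assembly orders the tail handling and the fold differently, but it should yield the same folded 16-bit result. The final fold to 16 bits is done by the callers (csum_fold in the kernel), not by this routine.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Illustrative model of the checksum arithmetic only (little-endian assumed). */
static uint32_t model_csum_partial(const uint8_t *buf, size_t len, uint32_t sum)
{
	uint64_t acc = sum;

	while (len >= 8) {		/* like the adcq chains in "loop" */
		uint64_t w;
		memcpy(&w, buf, 8);
		acc += w;
		if (acc < w)		/* end-around carry, "adcq %r9,%rax" */
			acc++;
		buf += 8;
		len -= 8;
	}
	while (len >= 2) {		/* 16-bit tail words, like "loop_1" */
		uint16_t w;
		memcpy(&w, buf, 2);
		acc += w;
		if (acc < w)
			acc++;
		buf += 2;
		len -= 2;
	}
	if (len) {			/* trailing odd byte, like "handle_1" */
		uint8_t b = *buf;
		acc += b;
		if (acc < b)
			acc++;
	}

	/* Fold 64 -> 32 bits, like the "fold:" label. */
	acc = (acc & 0xffffffffu) + (acc >> 32);
	acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}

/* Final fold to the 16-bit checksum, done by the callers in the kernel. */
static uint16_t model_csum_fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	const uint8_t data[11] = { 0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46,
				   0x40, 0x00, 0x40, 0x06, 0xb1 };

	printf("0x%04x\n",
	       model_csum_fold(model_csum_partial(data, sizeof data, 0)));
	return 0;
}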