patch-2.4.20 linux-2.4.20/include/asm-parisc/checksum.h

diff -urN linux-2.4.19/include/asm-parisc/checksum.h linux-2.4.20/include/asm-parisc/checksum.h
@@ -61,32 +61,31 @@
 	unsigned int sum;
 
 
-	__asm__ __volatile__ ("
-	ldws,ma		4(%1), %0
-	addi		-4, %2, %2
-	comib,>=	0, %2, 2f
-	
-	ldws,ma		4(%1), %%r19
-	add		%0, %%r19, %0
-	ldws,ma		4(%1), %%r19
-	addc		%0, %%r19, %0
-	ldws,ma		4(%1), %%r19
-	addc		%0, %%r19, %0
-1:	ldws,ma		4(%1), %%r19
-	addib,<>	-1, %2, 1b
-	addc		%0, %%r19, %0
-	addc		%0, %%r0, %0
-
-	zdepi		-1, 31, 16, %%r19
-	and		%0, %%r19, %%r20
-	extru		%0, 15, 16, %%r21
-	add		%%r20, %%r21, %0
-	and		%0, %%r19, %%r20
-	extru		%0, 15, 16, %%r21
-	add		%%r20, %%r21, %0
-	subi		-1, %0, %0
-2:
-	"
+	__asm__ __volatile__ (
+"	ldws,ma		4(%1), %0\n"
+"	addi		-4, %2, %2\n"
+"	comib,>=	0, %2, 2f\n"
+"\n"
+"	ldws,ma		4(%1), %%r19\n"
+"	add		%0, %%r19, %0\n"
+"	ldws,ma		4(%1), %%r19\n"
+"	addc		%0, %%r19, %0\n"
+"	ldws,ma		4(%1), %%r19\n"
+"	addc		%0, %%r19, %0\n"
+"1:	ldws,ma		4(%1), %%r19\n"
+"	addib,<>	-1, %2, 1b\n"
+"	addc		%0, %%r19, %0\n"
+"	addc		%0, %%r0, %0\n"
+"\n"
+"	zdepi		-1, 31, 16, %%r19\n"
+"	and		%0, %%r19, %%r20\n"
+"	extru		%0, 15, 16, %%r21\n"
+"	add		%%r20, %%r21, %0\n"
+"	and		%0, %%r19, %%r20\n"
+"	extru		%0, 15, 16, %%r21\n"
+"	add		%%r20, %%r21, %0\n"
+"	subi		-1, %0, %0\n"
+"2:\n"
 	: "=r" (sum), "=r" (iph), "=r" (ihl)
 	: "1" (iph), "2" (ihl)
 	: "r19", "r20", "r21" );
@@ -99,9 +98,12 @@
  */
 static inline unsigned int csum_fold(unsigned int sum)
 {
-	sum = (sum & 0xffff) + (sum >> 16);
-	sum = (sum & 0xffff) + (sum >> 16);
-	return ~sum;
+	/* add the two 16-bit halves of sum, swapped; a possible
+	   carry from that addition propagates from the lower half
+	   into the upper half, leaving the correct folded sum in
+	   the upper half. */
+	sum += (sum << 16) + (sum >> 16);
+	return (~sum) >> 16;
 }
  
 static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,
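
The csum_fold rewrite above replaces two dependent fold steps with a single 32-bit add of the two swapped halves: the fold result, including any carry out of the lower half, lands in the upper 16 bits, which (~sum) >> 16 then extracts. A quick equivalence check against the old code (names and test values are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static uint16_t fold_classic(uint32_t sum)
    {
        sum = (sum & 0xffff) + (sum >> 16); /* may leave a carry bit */
        sum = (sum & 0xffff) + (sum >> 16); /* fold that carry back in */
        return (uint16_t)~sum;
    }

    static uint16_t fold_swapped(uint32_t sum)
    {
        /* the two halves meet in the upper 16 bits; a carry out of
         * the lower half propagates into the upper half by itself */
        sum += (sum << 16) + (sum >> 16);
        return (uint16_t)((~sum) >> 16);
    }

    int main(void)
    {
        /* spot checks, including the carry-generating cases */
        const uint32_t tests[] = { 0, 1, 0xffff, 0x10000, 0x1ffff,
                                   0xfffe0001, 0xffff0001, 0xffffffff };
        unsigned int i;

        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
            if (fold_classic(tests[i]) != fold_swapped(tests[i]))
                printf("mismatch at 0x%08x\n", tests[i]);
        printf("done\n");
        return 0;
    }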
@@ -110,11 +112,11 @@
 					       unsigned short proto,
 					       unsigned int sum) 
 {
-	__asm__("
-		add  %1, %0, %0
-		addc %2, %0, %0
-		addc %3, %0, %0
-		addc %%r0, %0, %0 "
+	__asm__(
+	"	add  %1, %0, %0\n"
+	"	addc %2, %0, %0\n"
+	"	addc %3, %0, %0\n"
+	"	addc %%r0, %0, %0\n"
 		: "=r" (sum)
 		: "r" (daddr), "r"(saddr), "r"((proto<<16)+len), "0"(sum));
     return sum;
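
The csum_tcpudp_nofold hunk above is the TCP/UDP pseudo-header partial sum: saddr + daddr + (proto << 16) + len, with each addc feeding the carry into the next add and the trailing addc %%r0 folding in the last one. A portable sketch of the same arithmetic (the _ref name and the 64-bit accumulator are illustrative):

    #include <stdint.h>

    /* Pseudo-header partial sum with carries folded end-around,
     * left unfolded for csum_fold(), as in the asm above. */
    static uint32_t csum_tcpudp_nofold_ref(uint32_t saddr, uint32_t daddr,
                                           uint16_t len, uint16_t proto,
                                           uint32_t sum)
    {
        uint64_t s = sum;

        s += saddr;
        s += daddr;
        s += ((uint32_t)proto << 16) + len;
        s = (s & 0xffffffff) + (s >> 32);   /* fold the carries back in */
        s = (s & 0xffffffff) + (s >> 32);   /* catch a residual carry */
        return (uint32_t)s;
    }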
@@ -141,6 +143,7 @@
 	 return csum_fold (csum_partial(buf, len, 0));
 }
 
+
 #define _HAVE_ARCH_IPV6_CSUM
 static __inline__ unsigned short int csum_ipv6_magic(struct in6_addr *saddr,
 						     struct in6_addr *daddr,
@@ -148,7 +151,62 @@
 						     unsigned short proto,
 						     unsigned int sum) 
 {
-	BUG();
+	__asm__ __volatile__ (
+
+#if BITS_PER_LONG > 32
+
+	/*
+	** We can execute two loads and two adds per cycle on PA 8000.
+	** But add insns get serialized waiting for the carry bit.
+	** Try to keep 4 registers with "live" values ahead of the ALU.
+	*/
+
+"	ldd,ma		8(%1), %%r19\n"	/* get 1st saddr word */
+"	ldd,ma		8(%2), %%r20\n"	/* get 1st daddr word */
+"	add		%8, %3, %3\n"/* add 16-bit proto + len */
+"	add		%%r19, %0, %0\n"
+"	ldd,ma		8(%1), %%r21\n"	/* 2cd saddr */
+"	ldd,ma		8(%2), %%r22\n"	/* 2cd daddr */
+"	add,dc		%%r20, %0, %0\n"
+"	add,dc		%%r21, %0, %0\n"
+"	add,dc		%%r22, %0, %0\n"
+"	add,dc		%3, %0, %0\n"  /* fold in proto+len | carry bit */
+"	extrd,u		%0, 31, 32, %%r19\n"	/* copy upper half down */
+"	depdi		0, 31, 32, %0\n"	/* clear upper half */
+"	add		%%r19, %0, %0\n"	/* fold into 32-bits */
+"	addc		0, %0, %0\n"		/* add carry */
+
+#else
+
+	/*
+	** For PA 1.x, the insn order doesn't matter as much.
+	** Insn stream is serialized on the carry bit here too: each
+	** addc must wait for the result from the previous operation (eg r0 + x).
+	*/
+
+"	ldw,ma		4(%1), %%r19\n"	/* get 1st saddr word */
+"	ldw,ma		4(%2), %%r20\n"	/* get 1st daddr word */
+"	add		%8, %3, %3\n"	/* add 16-bit proto + len */
+"	add		%%r19, %0, %0\n"
+"	ldw,ma		4(%1), %%r21\n"	/* 2cd saddr */
+"	addc		%%r20, %0, %0\n"
+"	ldw,ma		4(%2), %%r22\n"	/* 2cd daddr */
+"	addc		%%r21, %0, %0\n"
+"	ldw,ma		4(%1), %%r19\n"	/* 3rd saddr */
+"	addc		%%r22, %0, %0\n"
+"	ldw,ma		4(%2), %%r20\n"	/* 3rd daddr */
+"	addc		%%r19, %0, %0\n"
+"	ldw,ma		4(%1), %%r21\n"	/* 4th saddr */
+"	addc		%%r20, %0, %0\n"
+"	ldw,ma		4(%2), %%r22\n"	/* 4th daddr */
+"	addc		%%r21, %0, %0\n"
+"	addc		%%r22, %0, %0\n"
+"	addc		%3, %0, %0\n"	/* fold in proto+len, catch carry */
+
+#endif
+	: "=r" (sum), "=r" (saddr), "=r" (daddr), "=r" (len)
+	: "0" (sum), "1" (saddr), "2" (daddr), "3" (len), "r" (proto)
+	: "r19", "r20", "r21", "r22");
 	return csum_fold(sum);
 }
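
Whichever path is assembled, the value handed to csum_fold is a 32-bit end-around-carry sum over the eight 32-bit address words plus proto + len (note the asm adds proto and len directly, without shifting proto). A portable sketch with illustrative names, including a stand-in for struct in6_addr:

    #include <stdint.h>

    struct in6_ref { uint32_t s6_addr32[4]; }; /* stand-in for in6_addr */

    /* 32-bit partial checksum over both IPv6 addresses plus
     * proto + len, carries folded end-around; the caller runs
     * csum_fold() on the result.  Illustrative sketch only. */
    static uint32_t csum_ipv6_magic_ref(const struct in6_ref *saddr,
                                        const struct in6_ref *daddr,
                                        uint16_t len, uint16_t proto,
                                        uint32_t sum)
    {
        uint64_t s = sum;
        int i;

        for (i = 0; i < 4; i++) {
            s += saddr->s6_addr32[i];
            s += daddr->s6_addr32[i];
        }
        s += (uint32_t)proto + len;         /* mirrors "add %8, %3, %3" */
        s = (s & 0xffffffff) + (s >> 32);   /* fold the carries back in */
        s = (s & 0xffffffff) + (s >> 32);   /* catch a residual carry */
        return (uint32_t)s;
    }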
 
