aboutsummaryrefslogtreecommitdiff
path: root/src/arch
diff options
context:
space:
mode:
authorMichael Brown <mcb30@ipxe.org>2012-06-26 17:19:18 +0100
committerMichael Brown <mcb30@ipxe.org>2012-06-28 16:02:31 +0100
commitec22e08db1fa61f5276b2ed1efbde28a1a45101e (patch)
treef8e65353e71dee19449d2934fefd684ed101f386 /src/arch
parentbb9961fb548068f769c666970c1de6c68743ed07 (diff)
downloadipxe-ec22e08db1fa61f5276b2ed1efbde28a1a45101e.zip
ipxe-ec22e08db1fa61f5276b2ed1efbde28a1a45101e.tar.gz
ipxe-ec22e08db1fa61f5276b2ed1efbde28a1a45101e.tar.bz2
[tcpip] Add faster algorithm for calculating the TCP/IP checksum
The generic TCP/IP checksum implementation requires approximately 10 CPU clocks per byte (as measured using the TSC). Improve this to approximately 0.5 CPU clocks per byte by using "lodsl ; adcl" in an unrolled loop. Signed-off-by: Michael Brown <mcb30@ipxe.org>
Diffstat (limited to 'src/arch')
-rw-r--r--src/arch/x86/core/x86_tcpip.c169
-rw-r--r--src/arch/x86/include/bits/tcpip.h5
2 files changed, 174 insertions, 0 deletions
diff --git a/src/arch/x86/core/x86_tcpip.c b/src/arch/x86/core/x86_tcpip.c
new file mode 100644
index 0000000..b4e7c3b
--- /dev/null
+++ b/src/arch/x86/core/x86_tcpip.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2012 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER );
+
+/** @file
+ *
+ * TCP/IP checksum
+ *
+ */
+
+#include <limits.h>
+#include <ipxe/tcpip.h>
+
+extern char x86_tcpip_loop_end[];
+
+/**
+ * Calculate continued TCP/IP checkum
+ *
+ * @v partial Checksum of already-summed data, in network byte order
+ * @v data Data buffer
+ * @v len Length of data buffer
+ * @ret cksum Updated checksum, in network byte order
+ */
+uint16_t x86_tcpip_continue_chksum ( uint16_t partial,
+ const void *data, size_t len ) {
+ unsigned long sum = ( ( ~partial ) & 0xffff );
+ unsigned long initial_word_count;
+ unsigned long loop_count;
+ unsigned long loop_partial_count;
+ unsigned long final_word_count;
+ unsigned long final_byte;
+ unsigned long discard_S;
+ unsigned long discard_c;
+ unsigned long discard_a;
+ unsigned long discard_r1;
+ unsigned long discard_r2;
+
+ /* Calculate number of initial 16-bit words required to bring
+ * the main loop into alignment. (We don't care about the
+ * speed for data aligned to less than 16 bits, since this
+ * situation won't occur in practice.)
+ */
+ if ( len >= sizeof ( sum ) ) {
+ initial_word_count = ( ( -( ( intptr_t ) data ) &
+ ( sizeof ( sum ) - 1 ) ) >> 1 );
+ } else {
+ initial_word_count = 0;
+ }
+ len -= ( initial_word_count * 2 );
+
+ /* Calculate number of iterations of the main loop. This loop
+ * processes native machine words (32-bit or 64-bit), and is
+ * unrolled 16 times. We calculate an overall iteration
+ * count, and a starting point for the first iteration.
+ */
+ loop_count = ( len / ( sizeof ( sum ) * 16 ) );
+ loop_partial_count =
+ ( ( len % ( sizeof ( sum ) * 16 ) ) / sizeof ( sum ) );
+
+ /* Calculate number of 16-bit words remaining after the main
+ * loop completes.
+ */
+ final_word_count = ( ( len % sizeof ( sum ) ) / 2 );
+
+ /* Calculate whether or not a final byte remains at the end */
+ final_byte = ( len & 1 );
+
+ /* Calculate the checksum */
+ __asm__ ( /* Calculate position at which to jump into the
+ * unrolled loop.
+ */
+ "imul $( -x86_tcpip_loop_step_size ), %4\n\t"
+ "add %5, %4\n\t"
+
+ /* Clear carry flag before starting checksumming */
+ "clc\n\t"
+
+ /* Checksum initial words */
+ "jmp 2f\n\t"
+ "\n1:\n\t"
+ "lodsw\n\t"
+ "adcw %w2, %w0\n\t"
+ "\n2:\n\t"
+ "loop 1b\n\t"
+
+ /* Main "lods;adc" loop, unrolled x16 */
+ "mov %12, %3\n\t"
+ "jmp *%4\n\t"
+ "\nx86_tcpip_loop_start:\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "lods%z2\n\tadc %2, %0\n\t"
+ "\nx86_tcpip_loop_end:\n\t"
+ "loop x86_tcpip_loop_start\n\t"
+ ".equ x86_tcpip_loop_step_size, "
+ " ( ( x86_tcpip_loop_end - x86_tcpip_loop_start ) / 16 )\n\t"
+
+ /* Checksum remaining whole words */
+ "mov %13, %3\n\t"
+ "jmp 2f\n\t"
+ "\n1:\n\t"
+ "lodsw\n\t"
+ "adcw %w2, %w0\n\t"
+ "\n2:\n\t"
+ "loop 1b\n\t"
+
+ /* Checksum final byte if applicable */
+ "mov %14, %3\n\t"
+ "loop 1f\n\t"
+ "adcb (%1), %b0\n\t"
+ "adcb $0, %h0\n\t"
+ "\n1:\n\t"
+
+ /* Fold down to a uint16_t */
+ "push %0\n\t"
+ "popw %w0\n\t"
+ "popw %w2\n\t"
+ "adcw %w2, %w0\n\t"
+#if ULONG_MAX > 0xffffffffUL /* 64-bit only */
+ "popw %w2\n\t"
+ "adcw %w2, %w0\n\t"
+ "popw %w2\n\t"
+ "adcw %w2, %w0\n\t"
+#endif /* 64-bit only */
+
+ /* Consume CF */
+ "adcw $0, %w0\n\t"
+ "adcw $0, %w0\n\t"
+
+ : "=&Q" ( sum ), "=&S" ( discard_S ), "=&a" ( discard_a ),
+ "=&c" ( discard_c ), "=&r" ( discard_r1 ),
+ "=&r" ( discard_r2 )
+ : "0" ( sum ), "1" ( data ), "2" ( 0 ),
+ "3" ( initial_word_count + 1 ), "4" ( loop_partial_count ),
+ "5" ( x86_tcpip_loop_end ), "g" ( loop_count + 1 ),
+ "g" ( final_word_count + 1 ), "g" ( final_byte ) );
+
+ return ( ~sum & 0xffff );
+}
diff --git a/src/arch/x86/include/bits/tcpip.h b/src/arch/x86/include/bits/tcpip.h
index 9ae8d92..a4b335e 100644
--- a/src/arch/x86/include/bits/tcpip.h
+++ b/src/arch/x86/include/bits/tcpip.h
@@ -9,4 +9,9 @@
FILE_LICENCE ( GPL2_OR_LATER );
+extern uint16_t x86_tcpip_continue_chksum ( uint16_t partial,
+ const void *data, size_t len );
+
+#define tcpip_continue_chksum x86_tcpip_continue_chksum
+
#endif /* _BITS_TCPIP_H */