[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [lwip-users] checksum routine in assembler
From: |
address@hidden |
Subject: |
Re: [lwip-users] checksum routine in assembler |
Date: |
Tue, 17 Nov 2009 19:35:14 +0100 |
User-agent: |
Thunderbird 2.0.0.23 (Macintosh/20090812) |
Jan Wester wrote:
Hi all
I’m trying to optimize my TCP/IP communication.
Does anyone have the checksum, htons and htonl routines in
assembler for ARM?
I do have a checksum routine in (gcc) assembler. It's quite optimized,
but I haven't used it for some years and don't remember if it can be
improved... I'll attach it.
I didn't need htons/htonl though, as I was running the arm in big endian
mode. Note that another possibility for optimization is to provide an
improved alternative for memcpy. I had a gcc arm assembler version for
that, too, but I can only find the word-only routine, which doesn't help
much as a generic memcpy replacement.
Feel free to use the routine as you like, I don't think there's much brain
power in it ;-) - but don't blame me if it doesn't work!
Simon
/*
 * Prototypes for the hand-written ARM assembler checksum helpers.
 *
 * The include guard deliberately avoids a leading underscore followed by
 * an upper-case letter: such identifiers are reserved for the
 * implementation in C.
 */
#ifndef ASM_FNS_H
#define ASM_FNS_H

#ifdef __cplusplus
extern "C" {
#endif

/*
 * 16-bit ones'-complement sum over `length` bytes starting at `buf`.
 * The result is returned as-is, i.e. it is NOT inverted.
 */
unsigned short asm_ip_chksum(void *buf, unsigned int length);

/*
 * UDP checksum helper; the implementation is not part of this listing.
 * NOTE(review): presumably src_addr/dest_addr point at the IPv4 addresses
 * used for the pseudo header - confirm against the implementation.
 */
unsigned short asm_udp_chksum(void *buf, unsigned int len_udp, void *src_addr,
                              void *dest_addr);

/* Checksum over `byte_cnt` bytes at `data`; implementation not shown here. */
unsigned short cksum(void const * data, unsigned short byte_cnt);

#ifdef __cplusplus
}
#endif

#endif /* ASM_FNS_H */
.text
// r0 / r1: args / output
addr .req r0
in_len .req r1
loopc_b .req r1
// r2: carry counter
carries .req r2
// r3: sum
sum .req r3
// r4: loaded long / temp1
temp1 .req r4
// r5: current address
//addr .req r5
temp2 .req r5
// r6: loop counter (longs)
loopc .req r6
// r7: loop counter (short)
loopc_s .req r7
temp3 .req r7
// r8: loop counter (bytes)
//loopc_b .req r8
// r9: temp2
//temp2 .req r9
temp4 .req r8
temp5 .req r9
temp6 .req r10
last_used_reg .req r10
// ---------------------------------------------------------------------
// unsigned short asm_ip_chksum(void *buf, unsigned int length);
//
// 16-bit ones'-complement sum over `length` bytes starting at `buf`.
// The sum is NOT inverted (lwIP wants the plain sum).
//
// In:    r0 = buf, r1 = length in bytes
// Out:   r0 = folded 16-bit sum, zero-extended to 32 bits
//             0xFFFF if buf is NULL, 0 if length is 0
// Clobb: r12, flags.  r1-r10 are saved on the stack and restored.
//
// Byte order: a trailing odd byte is added as the HIGH byte of a
// halfword, so the routine assumes a big-endian core.
//
// Any start alignment is accepted: up to one byte and one halfword are
// consumed first so that all word loads are 4-byte aligned.  If the
// buffer starts on an odd address every byte lands in the opposite half
// of its halfword, so the folded sum is byte-swapped once at the end.
// ---------------------------------------------------------------------
        .globl  asm_ip_chksum
asm_ip_chksum:
        CMP     addr, #0
        MVNEQ   r0, #0
        MOVEQ   r0, r0, lsr #16         // NULL buffer: return 0xFFFF
        MOVEQ   pc, lr

        CMP     in_len, #0              // nothing to sum: return 0 without
        MOVEQ   r0, #0                  // touching memory
        MOVEQ   pc, lr

        STMFD   sp!, {r1-last_used_reg,r14}

        // init
        MOV     carries, #0             // carry counter = 0
        MOV     sum, #0                 // sum = 0

        // Leading byte if the buffer starts on an odd address.
        // length >= 1 is guaranteed by the check above.
        ANDS    r12, addr, #0x1         // r12 = 1 if odd start (kept to the end)
        BEQ     ip_word_sum
        LDRB    temp1, [addr], #1       // load 1 byte
        ADD     sum, sum, temp1, lsl #16 // sum is 0 here, cannot overflow
        SUB     loopc_b, loopc_b, #1    // loopc_b is in_len (r1)

ip_word_sum:
        // Leading halfword if the address is not yet word aligned.
        // Skipped when fewer than 2 bytes remain; the tail code below
        // then handles the single remaining byte (if any).
        TST     addr, #0x2
        BEQ     ip_begin_sum
        CMP     loopc_b, #2
        BLO     ip_begin_sum
        LDRH    temp1, [addr], #2       // load 2 bytes
        ADDS    sum, sum, temp1, lsl #16 // may carry out of bit 31 ...
        ADC     carries, carries, #0    // ... so count the carry
        SUB     loopc_b, loopc_b, #2

ip_begin_sum:
        // Split the remaining length into words, one halfword, one byte.
        MOV     loopc, loopc_b, lsr #2  // loopc   = remaining / 4 (unsigned)
        AND     loopc_s, loopc_b, #2    // loopc_s = 2 if a halfword is left over
        AND     loopc_b, loopc_b, #1    // loopc_b = 1 if a byte is left over
        CMP     loopc, #4
        BLT     ip_after_quad_dword_loop

ip_quad_dword_loop_begin:
        // add 4 words per iteration
        LDMIA   addr!, {temp1, temp4, temp5, temp6}
        ADDS    sum, sum, temp1         // sum += word
        ADC     carries, carries, #0    // count the carry
        ADDS    sum, sum, temp4
        ADC     carries, carries, #0
        ADDS    sum, sum, temp5
        ADC     carries, carries, #0
        ADDS    sum, sum, temp6
        ADC     carries, carries, #0
        SUB     loopc, loopc, #4        // 4 words per loop
        CMP     loopc, #4
        BGE     ip_quad_dword_loop_begin

ip_after_quad_dword_loop:
        CMP     loopc, #1
        BLT     ip_after_dword_loop

ip_dword_loop_begin:
        // remaining whole words, one per iteration
        LDR     temp1, [addr], #4       // load word, post-increment addr
        ADDS    sum, sum, temp1         // sum += word
        ADC     carries, carries, #0    // count the carry
        SUBS    loopc, loopc, #1
        BNE     ip_dword_loop_begin

ip_after_dword_loop:
        // First fold to 17 bits so the tail additions below cannot
        // overflow 32 bits.
        LDR     temp1, =0xffff          // temp1 = 0xffff (kept for later folds)
        MOV     temp2, sum, lsr #16     // temp2 = sum >> 16
        AND     sum, sum, temp1         // sum &= 0xffff
        ADD     sum, sum, temp2         // sum <= 0x1fffe

        // trailing halfword, if any
        CMP     loopc_s, #0
        BEQ     ip_no_short_left
        LDRH    temp2, [addr], #2       // load 2 bytes
        ADD     sum, sum, temp2

ip_no_short_left:
        // trailing byte, if any (high byte of a big-endian halfword)
        CMP     loopc_b, #0
        BEQ     ip_no_byte_left
        LDRB    temp2, [addr], #1       // load 1 byte
        ADD     sum, sum, temp2, lsl #8 // sum += (byte << 8)

ip_no_byte_left:
        // end-around carry: every carry out of bit 31 is worth 1
        ADD     sum, sum, carries

        // Fold twice: the first fold can itself produce a carry into
        // bit 16 (e.g. 0x1ffff -> 0x10000), the second one cannot.
        MOV     temp2, sum, lsr #16
        AND     sum, sum, temp1
        ADD     sum, sum, temp2
        MOV     temp2, sum, lsr #16
        AND     sum, sum, temp1
        ADD     sum, sum, temp2         // sum <= 0xffff

        // Odd start address: undo the byte-lane swap.
        CMP     r12, #0
        BEQ     ip_no_swap
        AND     temp2, sum, #0xff       // temp2 = low byte
        MOV     sum, sum, lsr #8        // sum   = high byte
        ORR     sum, sum, temp2, lsl #8 // sum   = (low << 8) | high

ip_no_swap:
        // NO INVERT! for lwIP.  (An inverted result would be
        // MVN r0, sum followed by masking with 0xffff.)
        MOV     r0, sum                 // result register

        // restore registers and return (pc <- saved lr)
        LDMIA   sp!,{r1-last_used_reg,r15}