[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [avr-libc-dev] Interested in 64-bit printf support?
From: |
George Spelvin |
Subject: |
Re: [avr-libc-dev] Interested in 64-bit printf support? |
Date: |
5 Dec 2016 14:26:28 -0500 |
Looking at GCC's assembly output, I was disgusted by what it was doing
(note that I'm still using avr-gcc 4.9.1) and wrote an assembly version.
That let me do some useful space-saving tricks like allocating a different
zero register so that r1 is free for the multiplier.
Again, feel free to kibitz the code; this is literally my first AVR code ever.
I'm not quite clear whether I need to support the avr1 architecture.
I'm assuming not, and make significant use of adiw/sbiw and Z+/-Z.
Is that right?
It's now the same length (0x8c bytes) as the decimal-only part of
__ultoa_invert, and 3/4 the time of my previous C version. Compared to
__ultoa_invert, it's 2/3 of the time for large outputs, and less than
1/2 the time for small.
Updating the previous table (__ultoa_invert numbers reduced to account
for deleting the tests for bases 8 or 16):
Time in clock cycles
Input          genprint   genprint asm   asm, !MUL   __ultoa_invert   __ultoa_invert, !MUL
0                     -            104         114              156                    165
0xff                316            193         227              476                    479
0xffff              584            393         479              788                    793
0xffffff           1005            705         873             1256                   1264
0xffffffff         1434           1045        1310             1568                   1578
0xffffffffff       2024           1497        1889                -                      -
48 ones            2626           1977        2511                -                      -
56 ones            3286           2513        3207                -                      -
64 ones            4103           3161        4045                -                      -
For the !__AVR_HAVE_MUL__ case, the size is 0xa2 (my code) vs. 0x90 bytes.
While the time saving is nice, I realize that space is very critical in
!__AVR_HAVE_MUL__ code, so it might not be worth it. Unless you want
64-bit printing support, in which case this can provide it for very
little additional space.
Note that changing vfprintf() to keep track of a 16-bit pointer rather
than a 32-bit value will reduce register pressure in it, saving a little
bit there, which will partially compensate for the extra %llu code.
/*
 * Register naming for genprint.
 *
 * __tmp_reg__ and __zero_reg__ receive fallback definitions (r0 and r1)
 * only when nothing earlier has defined them.
 */
#ifndef __tmp_reg__
#define __tmp_reg__	r0
#endif
#ifndef __zero_reg__
#define __zero_reg__	r1
#endif

/* Incoming arguments and the pointer registers they are copied into */
#define out	X	/* Arrives in r24:r25; copied to X right away */
#define out_lo	r26
#define out_hi	r27
#define bin	Z	/* Arrives in r22:r23; copied to Z right away */
#define bin_lo	r30
#define bin_hi	r31
#define len	r20

/* Scratch registers */
#define acc_lo	r24
#define acc_hi	r25
#define lsbit	r22
#define digit	r23
#define tlen	r21	/* Working copy of len, used as the loop counter */

#if __AVR_HAVE_MUL__
/* A separate zero register leaves r1 free for the multiplier's result */
#define zero	r19
#define k	r18	/* Constant multiplier 0x33 */
#else
#define zero	__zero_reg__
#endif
/*
 * genprint - convert a multi-byte unsigned binary number to decimal ASCII.
 *
 * Presumed C prototype (not shown in this file; confirm against the caller):
 *	char *genprint(char *out, uint8_t *bin, uint8_t len);
 *
 * In:
 *	r24:r25	out - destination for the ASCII digits
 *	r22:r23	bin - the number, least-significant byte first
 *	r20	len - size of the number in bytes
 * Out:
 *	r24:r25	out advanced past the last digit stored
 *
 * Digits are stored least-significant first and no terminator is written.
 * The buffer at bin is used as working storage and is overwritten.
 * A value of zero produces the single digit "0".  If len is 0 the
 * function returns at once, storing nothing and leaving r24:r25 untouched.
 *
 * Registers written: r0, r20-r27, r30, r31.  When built for a device
 * with a hardware multiplier, r18 and r19 are written as well, and r1 is
 * overwritten by mul and cleared again before returning.  Without a
 * multiplier, r1 (__zero_reg__) is only read and is expected to hold 0.
 *
 * Each pass of the main loop produces one digit:
 *   pass 1 (msbyte to lsbyte) halves the number in place, keeping the
 *          bit shifted out, and sums the halved bytes mod 255;
 *   the sum is reduced mod 15 and then mod 5, and 2*(n/2 mod 5) plus
 *          the saved bit gives n mod 10;
 *   pass 2 (lsbyte to msbyte) divides the halved number by 5 exactly,
 *          using the 2-adic inverse of 5.
 */
	.text
	.global	genprint
	.type	genprint, @function

1:	ret				/* Exit taken when len == 0 */

genprint:
	tst	len			/* Handle zero-length gracefully */
	breq	1b
#if __AVR_HAVE_MUL__
	clr	zero
	ldi	k,0x33
#endif
	movw	out_lo,r24

	/* bin += len, so that the pre-decrement load below reads the msbyte */
	movw	bin_lo,r22
	add	bin_lo,len
	adc	bin_hi,zero

	/* Strip trailing (most-significant) zero bytes from bin */
2:	ld	__tmp_reg__,-bin
	cpse	__tmp_reg__,zero
	rjmp	3f			/* Found a non-zero byte, stop */
	dec	len
	brne	2b
	inc	len			/* But stop at 1 byte, so we print "0" */
3:	adiw	bin_lo,1		/* bin points one past the msbyte again */

	/* The main loop, repeated while len > 0; one decimal digit per pass */
4:	clr	lsbit
	ser	digit			/* Sum of all bytes, mod 255 (0xff == 0) */
	mov	tlen,len

	/*
	 * Pass 1, msb-to-lsb: finding the input mod 10.
	 *
	 * We do two things here: divide by 2 (saving the lsbit), and sum
	 * the result mod 255.  This is then used to compute the result
	 * mod 5, which combined with the lsbit gives the decimal digit
	 * we want.
	 */
5:
	ld	__tmp_reg__,-bin
	lsr	lsbit			/* lsbit to carry bit */
	ror	__tmp_reg__
	st	bin,__tmp_reg__		/* st leaves the carry flag alone */
	rol	lsbit			/* carry bit to lsbit */
	add	digit,__tmp_reg__
	adc	digit,zero		/* End-around carry */
	dec	tlen
	brne	5b

	/* Reduce digit mod 15 (from 1 <= digit <= 255 to 1 <= digit <= 15) */
	mov	__tmp_reg__,digit
	swap	__tmp_reg__
	cbr	digit,15
	add	digit,__tmp_reg__	/* Add high halves to get carry bit */
	cbr	digit,15		/* cbr and swap do not touch carry */
	swap	digit
	adc	digit,zero		/* End-around carry */

	/* Reduce digit mod 5 */
	cpi	digit,10
	brlo	6f
	subi	digit,10
6:	cpi	digit,5
	brlo	7f
	subi	digit,5
7:
	/* Form and store ASCII digit (2*digit + lsbit) */
	add	lsbit,digit
	add	lsbit,digit
	ori	lsbit,'0'
	st	out+,lsbit

	/*
	 * Pass 2, lsb-to-msb: dividing by 5
	 *
	 * Rather than do a general divide by 5, we can subtract the digit
	 * to produce a multiple of 5, and then do an exact division by
	 * multiplying by the 2-adic inverse of 5, 0xCCC...CCD.
	 *
	 * To get this into an even simpler form, we multiply by
	 * 0x333...333 and negate.  Each byte is multiplied by 0x33 and
	 * added to an accumulator to be used for each higher byte.
	 *
	 * The accumulator has to be 16 bits wide, but after storing
	 * each output byte, we can fold the msbyte into the lsbyte.
	 *
	 * Negating the output can be "complement and add one", but
	 * we do it as "subtract one and complement", initializing the
	 * accumulator to 0xff, then complementing before storing.
	 *
	 * To subtract the digit without an additional carry propagation
	 * pass, subtract 0x33 times the digit from the accumulator
	 * to start.  (Since 0 <= digit <= 4, this is very easy.)
	 */

	/* acc = 255 - (digit * 0x33) */
#if __AVR_HAVE_MUL__
	mul	digit,k
	mov	acc_lo,r0
#else
	mov	acc_lo,digit
	swap	acc_lo			/* Digit < 16, so this is accum <<= 4 */
	add	acc_lo,digit		/* Multiply by 0x11 */
	mov	r0,acc_lo
	add	acc_lo,r0
	add	acc_lo,r0		/* Multiply by 3 */
#endif
	com	acc_lo
	clr	acc_hi

	/* Here's the actual loop */
	mov	tlen,len
8:	ld	r0,bin
	/* acc += 0x33 * r0 */
#if __AVR_HAVE_MUL__
	mul	r0,k			/* Product lands in r1:r0 */
	add	acc_lo,r0
	adc	acc_hi,r1
#else
	/* Compute 0x11*r0 into digit:lsbit */
	mov	lsbit,r0
	swap	lsbit
	mov	digit,lsbit
	andi	digit,15		/* Mask off high 4 bits */
	eor	lsbit,digit		/* Mask off low 4 bits */
	add	lsbit,r0
	adc	digit,zero
	/* Now add it to the accumulator 3 times (there's no faster way) */
	add	acc_lo,lsbit
	adc	acc_hi,digit
	add	acc_lo,lsbit
	adc	acc_hi,digit
	add	acc_lo,lsbit
	adc	acc_hi,digit
#endif
	/* Store the complemented accumulator (*bin++ = ~accum) */
	mov	r0,acc_lo
	com	r0
	st	bin+,r0
	/* Fold the accumulator: acc = acc_hi + acc_lo */
	add	acc_lo,acc_hi
	clr	acc_hi
	adc	acc_hi,zero
	dec	tlen
	brne	8b

	/*
	 * End of main loop: check if the new msbyte was zero.  If so,
	 * drop it (reduce len by 1), and test for termination.
	 * r0 still holds the last byte stored, which is the msbyte.
	 */
	cpse	r0,zero
	rjmp	4b
	sbiw	bin_lo,1
	dec	len
	/*
	 * This test must use the same macro as every other MUL conditional
	 * above.  With a multiplier, r1 holds the high byte of the last
	 * product and has to be zeroed before returning to compiled code.
	 */
#if __AVR_HAVE_MUL__
	brne	4b
	clr	__zero_reg__
#else
	/* Branch to label 4 via rjmp: the longer loop body is presumably beyond brne's range */
	breq	8f
	rjmp	4b
8:
#endif
	/* Finally, put the return value in the expected place */
	movw	r24,out_lo
	ret
	.size	genprint, .-genprint
- [avr-libc-dev] Interested in 64-bit printf support?, George Spelvin, 2016/12/04
- Re: [avr-libc-dev] Interested in 64-bit printf support?,
George Spelvin <=
- Re: [avr-libc-dev] Interested in 64-bit printf support?, Joerg Wunsch, 2016/12/05
- Re: [avr-libc-dev] Interested in 64-bit printf support?, Georg-Johann Lay, 2016/12/06
- Re: [avr-libc-dev] Interested in 64-bit printf support?, George Spelvin, 2016/12/06
- Re: [avr-libc-dev] Interested in 64-bit printf support?, George Spelvin, 2016/12/06
- [avr-libc-dev] I just noticed OPTIMIZE_SPEED, George Spelvin, 2016/12/06
- Re: [avr-libc-dev] I just noticed OPTIMIZE_SPEED, Joerg Wunsch, 2016/12/06
- Re: [avr-libc-dev] I just noticed OPTIMIZE_SPEED, George Spelvin, 2016/12/06
- Re: [avr-libc-dev] I just noticed OPTIMIZE_SPEED, Joerg Wunsch, 2016/12/07