[Top][All Lists]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [avr-libc-dev] Interested in 64-bit printf support?

From: George Spelvin
Subject: Re: [avr-libc-dev] Interested in 64-bit printf support?
Date: 5 Dec 2016 14:26:28 -0500

Looking at GCC's assembly output, I was disgusted by what it was doing
(note that I'm still using avr-gcc 4.9.1) and wrote an assembly version.

That let me do some useful space-saving tricks like allocating a different
zero register so that r1 is free for the multiplier.

Again, feel free to kibitz the code; this is literally my first AVR code ever.

I'm not quite clear whether I need to support the avr1 architecture.
I'm assuming not, and make significant use of adiw/sbiw and Z+/-Z.
Is that right?

It's now the same length (0x8c bytes) as the decimal-only part of
__ultoa_invert, and 3/4 the time of my previous C version.  Compared to
__ultoa_invert, it's 2/3 of the time for large outputs, and less than
1/2 the time for small.

Updating the previous table (__ultoa_invert numbers reduced to account
for deleting the tests for bases 8 or 16):

                Time in clock cycles
Input           genprint        genprint asm    asm, !MUL       __ultoa_invert  
0                               104             114             156             
0xff            316             193             227             476             
0xffff          584             393             479             788             
0xffffff        1005            705             873             1256            
0xffffffff      1434            1045            1310            1568            
0xffffffffff    2024            1497            1889
48 ones         2626            1977            2511
56 ones         3286            2513            3207
64 ones         4103            3161            4045

For the !__AVR_HAVE_MUL__ case, the size is 0xa2 (my code) vs. 0x90 bytes.

While the time saving is nice, I realize that space is very critical in
!__AVR_HAVE_MUL__ code, so it might not be worth it.  Unless you want
64-bit printing support, in which case this can provide it for very
little additional space.

Note that changing vfprintf() to keep track of a 16-bit pointer rather
than a 32-bit value will reduce register pressure in it, saving a little
bit there, which will partially compensate for the extra %llu code.

/* Fall back to r0/r1 when the build does not already define these names. */
#ifndef __tmp_reg__
# define __tmp_reg__ r0
#endif
#ifndef __zero_reg__
# define __zero_reg__ r1
#endif

/* Arguments */
#define out     X       /* Arrives in r24:r25, but we move it immediately */
#define out_lo  r26
#define out_hi  r27
#define bin     Z       /* Arrives in r22:r23, but we move it immediately */
#define bin_lo  r30
#define bin_hi  r31
#define len     r20

/* Local variables (these reuse the argument registers once X and Z are set) */
#define acc_hi  r25
#define acc_lo  r24
#define digit   r23
#define lsbit   r22
#define tlen    r21     /* Copy of len used for loop counter */

/*
 * "zero" is the register the routine reads as a constant 0.  With a
 * hardware multiplier, mul overwrites r1:r0, so a private register is
 * used instead; without one, the ordinary zero register serves.
 */
#if __AVR_HAVE_MUL__
#define zero    r19     /* Used instead of r1 to free up multiplier */
#define k       r18     /* Multiplier 0x33 */
#else
#define zero    __zero_reg__
#endif

        .global genprint
        .type   genprint, @function

        /*
         * genprint: convert a multi-byte binary number to decimal ASCII.
         *
         * In:   r24:r25 = out, buffer that receives the ASCII digits
         *       r22:r23 = bin, the number, least-significant byte first
         *       r20     = len, number of bytes at bin
         * Out:  r24:r25 = out advanced past the last digit stored
         *
         * Digits are stored least-significant first.  The bytes at bin
         * are overwritten: every pass of the main loop replaces the
         * number with its quotient by 10.  A len of 0 returns at once
         * with r24:r25 untouched.
         *
         * Clobbers r0, r20-r27, r30, r31, plus r18/r19 when
         * __AVR_HAVE_MUL__ is set (r1 is then overwritten by mul and
         * cleared again before returning).
         */
1:      ret
genprint:
        tst     len     /* Handle zero-length gracefully */
        breq    1b

#if __AVR_HAVE_MUL__
        clr     zero
        ldi     k,0x33
#endif
        movw    out_lo,r24      /* Must precede any use of acc_lo/acc_hi */

        /* bin += len, point to msbyte */
        movw    bin_lo,r22      /* Must precede any use of lsbit/digit */
        add     bin_lo,len
        adc     bin_hi,zero

        /* Strip trailing (most-significant) zeros from bin */
2:      ld      __tmp_reg__,-bin
        cpse    __tmp_reg__,zero
         rjmp   3f      /* Found a non-zero byte, stop */
        dec     len
        brne    2b
        inc     len     /* But stop at 1 byte, so we print "0" */

3:      adiw    bin_lo,1        /* bin = one past the msbyte */

        /* The main loop, repeated while len > 0 */
4:      clr     lsbit
        ser     digit   /* Sum of all bytes, mod 255 */
        mov     tlen,len

        /*
         * Pass 1, msb-to-lsb: Finding the input mod 10.
         * We do two things here: divide by 2 (saving the lsbit), and sum
         * the result mod 255.  This is then used to compute the result
         * mod 5, which combined with the lsbit gives the decimal digit
         * we want.
         */
5:      ld      __tmp_reg__,-bin
        lsr     lsbit           /* lsbit to carry bit */
        ror     __tmp_reg__
        st      bin,__tmp_reg__
        rol     lsbit           /* carry bit to lsbit */
        add     digit,__tmp_reg__
        adc     digit,zero      /* End-around carry */

        dec     tlen
        brne    5b

        /* Reduce digit mod 15  (from 1 <= digit <= 255 to 1 <= digit <= 15) */
        mov     __tmp_reg__,digit
        swap    __tmp_reg__
        cbr     digit,15
        add     digit,__tmp_reg__       /* Add high halves to get carry bit */
        cbr     digit,15                /* cbr leaves the carry flag alone */
        swap    digit
        adc     digit,zero              /* End-around carry */

        /* Reduce digit mod 5 */
        cpi     digit,10
        brlo    6f
        subi    digit,10
6:      cpi     digit,5
        brlo    7f
        subi    digit,5
7:
        /* Form and store ASCII digit (2*digit + lsbit) */
        add     lsbit,digit
        add     lsbit,digit
        ori     lsbit,'0'
        st      out+,lsbit

        /*
         * Pass 2, lsb-to-msb: dividing by 5
         * Rather than do a general divide by 5, we can subtract the digit
         * to produce a multiple of 5, and then do an exact division by
         * multiplying by the 2-adic inverse of 5, 0xCCC...CCD.
         * To get this into an even simpler form, we multiply by
         * 0x333...333 and negate.  Each byte is multiplied by 0x33 and
         * added to an accumulator to be used for each higher byte.
         * The accumulator has to be 16 bits wide, but after storing
         * each output byte, we can fold the msbyte into the lsbyte.
         * Negating the output can be "complement and add one", but
         * we do it as "subtract one and complement", initializing the
         * accumulator to 0xff, then complementing before storing.
         * To subtract the digit without an additional carry propagation
         * pass, subtract 0x33 times the digit from the accumulator
         * to start.  (Since 0 <= digit <= 4, this is very easy.)
         */

        /* acc = 255 - (digit * 0x33) */
#if __AVR_HAVE_MUL__
        mul     digit,k
        mov     acc_lo,r0       /* digit <= 4, so the product fits in r0 */
#else
        mov     acc_lo,digit
        swap    acc_lo          /* Digit < 16, so this is accum <<= 4 */
        add     acc_lo,digit    /* Multiply by 0x11 */
        mov     r0,acc_lo
        add     acc_lo,r0
        add     acc_lo,r0       /* Multiply by 3 */
#endif
        com     acc_lo
        clr     acc_hi

        /* Here's the actual loop */
        mov     tlen,len
8:      ld      r0,bin
        /* acc += 0x33 * r0 */
#if __AVR_HAVE_MUL__
        mul     r0,k
        add     acc_lo,r0
        adc     acc_hi,r1
#else
        /* Compute 0x11*r0 into digit:lsbit */
        mov     lsbit,r0
        swap    lsbit
        mov     digit,lsbit
        andi    digit,15        /* Mask off high 4 bits */
        eor     lsbit,digit     /* Mask off low 4 bits */
        add     lsbit,r0
        adc     digit,zero

        /* Now add it to the accumulator 3 times (there's no faster way) */
        add     acc_lo,lsbit
        adc     acc_hi,digit
        add     acc_lo,lsbit
        adc     acc_hi,digit
        add     acc_lo,lsbit
        adc     acc_hi,digit
#endif
        /* Store the complemented accumulator (*bin++ = ~accum) */
        mov     r0,acc_lo
        com     r0
        st      bin+,r0

        /* Fold the accumulator: acc = acc_hi + acc_lo */
        add     acc_lo,acc_hi
        clr     acc_hi          /* clr leaves the carry flag alone */
        adc     acc_hi,zero

        dec     tlen
        brne    8b

        /*
         * End of main loop: check if the new msbyte was zero.  If so,
         * drop it (reduce len by 1), and test for termination.
         * r0 still holds the last (most-significant) byte stored above.
         */
        cpse    r0,zero
         rjmp   4b
        sbiw    bin_lo,1
        dec     len
#if __AVR_HAVE_MUL__
        brne    4b
        clr     __zero_reg__    /* mul clobbered r1; restore it to 0 */
#else
        /*
         * NOTE(review): this variant is longer, so label 4 is presumably
         * beyond conditional-branch reach; hence the breq-over-rjmp form.
         */
        breq    9f
         rjmp   4b
9:
#endif

        /* Finally, put the return value in the expected place */
        movw    r24,out_lo
        ret
        .size   genprint, .-genprint

reply via email to

[Prev in Thread] Current Thread [Next in Thread]