commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] r7783 - gnuradio/branches/developers/ngoergen/spe_fir_fff


From: ngoergen
Subject: [Commit-gnuradio] r7783 - gnuradio/branches/developers/ngoergen/spe_fir_fff
Date: Fri, 22 Feb 2008 13:55:45 -0700 (MST)

Author: ngoergen
Date: 2008-02-22 13:55:45 -0700 (Fri, 22 Feb 2008)
New Revision: 7783

Added:
   gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.S
   gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.h
Modified:
   gnuradio/branches/developers/ngoergen/spe_fir_fff/Makefile
   gnuradio/branches/developers/ngoergen/spe_fir_fff/fir_fff_spe.cpp
   gnuradio/branches/developers/ngoergen/spe_fir_fff/multi_fir_fff_ppe.c
Log:
spe_fir_fff: Collapsed the outer loop, moved the hand-coded assembly to a GAS 
file, made an initial attempt at scheduling optimization, updated (somewhat) 
for the Linux ABI standard (more work needed here), and tested against the fir 
qa vectors. Note: some of these fail due to the limits of the single-precision 
float implementation.

Still need to verify the scheduling and clean up the ABI considerations.  
Double-precision and complex versions are coming soon.



Modified: gnuradio/branches/developers/ngoergen/spe_fir_fff/Makefile
===================================================================
--- gnuradio/branches/developers/ngoergen/spe_fir_fff/Makefile  2008-02-22 
20:27:23 UTC (rev 7782)
+++ gnuradio/branches/developers/ngoergen/spe_fir_fff/Makefile  2008-02-22 
20:55:45 UTC (rev 7783)
@@ -8,6 +8,9 @@
 SPU_CLIBS  =
 SPU_CFLAGS = -Wall
 
+SPU_AS    = spu-as
+SPU_CPP           = spu-g++
+
 all: multi_fir_fff_ppe fir_fff_spe.elf
 
 asm: fir_fff_spe.s
@@ -18,8 +21,14 @@
 multi_fir_fff_ppe: multi_fir_fff_ppe.c
        $(CC) $(CFLAGS) $(CINCS) $(CLIBS) $^ -o $@
 
-fir_fff_spe.elf: fir_fff_spe.cpp
-       $(SPU_CC) $(SPU_CFLAGS) $(SPU_CINCS) $(SPU_CLIBS) $^ -o $@
+fir_fff_spe.o: fir_fff_spe.cpp
+       $(SPU_CC) -c $(SPU_CFLAGS) $(SPU_CINCS) $(SPU_CLIBS) $^ -o $@
 
+spe_fir_fff_as.o: spe_fir_fff_as.S
+       $(SPU_AS) -o $@ $^      
+
+fir_fff_spe.elf: fir_fff_spe.o spe_fir_fff_as.o
+       $(SPU_CPP) $(SPU_CFLAGS) $(SPU_CINCS) $(SPU_CLIBS) $^ -o $@
+
 clean:
-       rm -f multi_fir_fff_ppe fir_fff_spe.elf fir_fff_spe.s
+       rm -f multi_fir_fff_ppe fir_fff_spe.elf fir_fff_spe.s *.o

Modified: gnuradio/branches/developers/ngoergen/spe_fir_fff/fir_fff_spe.cpp
===================================================================
--- gnuradio/branches/developers/ngoergen/spe_fir_fff/fir_fff_spe.cpp   
2008-02-22 20:27:23 UTC (rev 7782)
+++ gnuradio/branches/developers/ngoergen/spe_fir_fff/fir_fff_spe.cpp   
2008-02-22 20:55:45 UTC (rev 7783)
@@ -2,6 +2,7 @@
 #include <spu_intrinsics.h>
 #include "gr_spe_dma_lock.h"
 #include "spe_fir_fff_params.h"
+#include "spe_fir_fff_as.h"
 
 #define MAX_BUFSIZE (128*100)
 
@@ -9,22 +10,8 @@
 float in2_spe[MAX_BUFSIZE]  __attribute__((aligned(16)));
 float out_spe[MAX_BUFSIZE] __attribute__((aligned(16)));
 
-static const __vector unsigned int shiftmasks[4] =
-       {       { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f },
-               { 0x04050607, 0x08090a0b, 0x0c0d0e0f, 0x10111213 },
-               { 0x08090a0b, 0x0c0d0e0f, 0x10111213, 0x14151617 },
-               { 0x0c0d0e0f, 0x10111213, 0x14151617, 0x18191a1b } 
-       };
-
 spe_fir_fff_params_t spe_fir_fff_params __attribute__((aligned(16)));
 
-void test() {
-       unsigned int b =spe_fir_fff_params.ntaps;
-       __vector unsigned int bob = shiftmasks[0];
-       unsigned int bob2 = spe_fir_fff_params.offset;
-}
-
-
 int main(unsigned long long spe, unsigned long long argp, unsigned long long 
envp)
 {
     int tag = 1;
@@ -42,312 +29,15 @@
        gr_spe_dma_lock_out<__vector float> out_lock(
                spe_fir_fff_params.ea_out, vout, spe_fir_fff_params.size * 
sizeof(float), tag);
 
+       spe_fir_fff(
+               vin1, 
+               vin2, 
+               vout, 
+               0, 
+               spe_fir_fff_params.offset, 
+               spe_fir_fff_params.nsamples, 
+               spe_fir_fff_params.ntaps);
 
-               asm(
-                       "       lqr     $9,spe_fir_fff_params+16        \n"
-                       "       rotqbyi $31,$9,8        \n"             // 
number of nsamples
-                       "       lqr     $33,spe_fir_fff_params+32       \n"
-                       //"     rotqbyi $33,$9,0        \n"             // 
initial offset into samples  
-                       "       lqd             $32,32($sp)             \n"     
                // the index into out
-                       
-               );              // the current output vector
-                       
-
-               
/////////////////////////////////////////////////////////////////////////////////////////
               
-               
-               asm(    // this loop computes single dot-product for [ X _ _ _ ]
-                       ".start4: \n"
-                       "       xor $30,$30,$30 \n"                     // the 
current output vector                            
-
-                       "       xor     $5,$5,$5        \n" // even
-                       "       lqd     $2,64($sp)      \n" // odd
-                       "       ori     $40,$2,0        \n"
-                       "       xor     $6,$6,$6        \n"
-                       "       lqd     $3,48($sp)      \n"
-                       "       xor     $7,$7,$7        \n"
-                       "       lqr     $9,spe_fir_fff_params+16        \n"
-                       "       xor     $8,$8,$8        \n"
-                       "       rotqbyi $9,$9,12        \n"     // ntaps
-                       "       a       $2, $33, $2     \n"
-                               
-                       "       andi    $37,$33,0x0c    \n"                     
// find index into masks [0-4]
-                       "       shlqbii $38,$37,2       \n"                     
// mult by 16
-                       "       lqd     $39,shiftmasks($38)     \n"             
// load the right shift mask                    
-                       
-                       ".big_mama_loop:                \n"
-                       "       lqd     $10,0($2)       \n"
-                       "       lqd     $14,0($3)       \n"
-                       "       lqd     $11,16($2)      \n"
-                       "       lqd     $15,16($3)      \n"
-                       "       lqd     $12,32($2)      \n"
-                       "       lqd     $16,32($3)      \n"                     
-                       "       lqd     $13,48($2)      \n"
-                       "       lqd     $17,48($3)      \n"
-                       "       lqd     $18,64($2)      \n"
-
-                       "       shufb   $10,$10,$11,$39 \n"             // do 
the truffle shuffle
-                       "       shufb   $11,$11,$12,$39 \n"             // do 
the truffle shuffle                                       
-                       "       shufb   $12,$12,$13,$39 \n"             // do 
the truffle shuffle
-                       "       shufb   $13,$13,$18,$39 \n"             // do 
the truffle shuffle
-                               
-                       "       fma     $5, $10, $14, $5  \n" // even
-                       "       fma     $6, $11, $15, $6  \n" // even
-                       "       fma     $7, $12, $16, $7  \n" // even           
        
-                       "       fma     $8, $13, $17, $8  \n" // even
-
-                       "       ai      $2,$2,64        \n"
-                       "       ai      $3,$3,64        \n"             
-                       "       ai      $9,$9,-4        \n"
-                       "       brnz    $9,.big_mama_loop       \n"
-                       "       fa      $18,$5,$6               \n"
-                       "       fa      $19,$7,$8               \n"
-                       "       fa      $5,$18,$19              \n"
-               );
-
-               asm(    // accumulate word elements in r5 into first element in 
r5      
-                       "       ori     $6,$5,0         \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       fsmbi   $10,0xC000              \n"
-                       "       and             $11,$10,$5              \n"
-                       "       or              $30,$11,$30             \n"
-                       
-                       "       ai      $31,$31,-1      \n"
-                       "       brz     $31,.finish4    \n"
-                       "       ai      $33,$33,4       \n"                     
        
-               );
-               
-/////////////////////////////////////////////////////////////////////////////////////////
-               
-               
-               asm(    // this loop computes single dot-product for [ _ X _ _ ]
-                       
-                               "       xor     $5,$5,$5        \n" // even
-                               "       lqd     $2,64($sp)      \n" // odd
-                               "       ori     $40,$2,0        \n"
-                               "       xor     $6,$6,$6        \n"
-                               "       lqd     $3,48($sp)      \n"
-                               "       xor     $7,$7,$7        \n"
-                               "       lqr     $9,spe_fir_fff_params+16        
\n"
-                               "       xor     $8,$8,$8        \n"
-                               "       rotqbyi $9,$9,12        \n"     // ntaps
-                               "       a       $2, $33, $2     \n"
-                                       
-                               "       andi    $37,$33,0x0c    \n"             
        // find index into masks [0-4]
-                               "       shlqbii $38,$37,2       \n"             
        // mult by 16
-                               "       lqd     $39,shiftmasks($38)     \n"     
        // load the right shift mask                    
-                               
-                               ".big_mama_loop2:               \n"
-                               "       lqd     $10,0($2)       \n"
-                               "       lqd     $14,0($3)       \n"
-                               "       lqd     $11,16($2)      \n"
-                               "       lqd     $15,16($3)      \n"
-                               "       lqd     $12,32($2)      \n"
-                               "       lqd     $16,32($3)      \n"             
        
-                               "       lqd     $13,48($2)      \n"
-                               "       lqd     $17,48($3)      \n"
-                               "       lqd     $18,64($2)      \n"
-
-                               "       shufb   $10,$10,$11,$39 \n"             
// do the truffle shuffle
-                               "       shufb   $11,$11,$12,$39 \n"             
// do the truffle shuffle                                       
-                               "       shufb   $12,$12,$13,$39 \n"             
// do the truffle shuffle
-                               "       shufb   $13,$13,$18,$39 \n"             
// do the truffle shuffle
-                                       
-                               "       fma     $5, $10, $14, $5  \n" // even
-                               "       fma     $6, $11, $15, $6  \n" // even
-                               "       fma     $7, $12, $16, $7  \n" // even   
                
-                               "       fma     $8, $13, $17, $8  \n" // even
-
-                               "       ai      $2,$2,64        \n"
-                               "       ai      $3,$3,64        \n"             
-                               "       ai      $9,$9,-4        \n"
-                               "       brnz    $9,.big_mama_loop2      \n"
-                               "       fa      $18,$5,$6               \n"
-                               "       fa      $19,$7,$8               \n"
-                               "       fa      $5,$18,$19              \n"
-                       );
-
-               asm(    // accumulate word elements in r5 into first element in 
r5      
-                       "       ori     $6,$5,0         \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       fsmbi   $10,0xC000              \n"
-                       "       and             $11,$10,$5              \n"
-                       "       rotqbyi $12, $11, 12    \n"
-                       "       or              $30,$12,$30             \n"
-
-                       "       ai      $31,$31,-1      \n"
-                       "       brz     $31,.finish4    \n"
-                       "       ai      $33,$33,4       \n"
-               );
-               
-/////////////////////////////////////////////////////////////////////////////////////////
-               
-               
-               asm(    // this loop computes single dot-product for [ _ _ X _ ]
-                       
-                               "       xor     $5,$5,$5        \n" // even
-                               "       lqd     $2,64($sp)      \n" // odd
-                               "       ori     $40,$2,0        \n"
-                               "       xor     $6,$6,$6        \n"
-                               "       lqd     $3,48($sp)      \n"
-                               "       xor     $7,$7,$7        \n"
-                               "       lqr     $9,spe_fir_fff_params+16        
\n"
-                               "       xor     $8,$8,$8        \n"
-                               "       rotqbyi $9,$9,12        \n"     // ntaps
-                               "       a       $2, $33, $2     \n"
-                                       
-                               "       andi    $37,$33,0x0c    \n"             
        // find index into masks [0-4]
-                               "       shlqbii $38,$37,2       \n"             
        // mult by 16
-                               "       lqd     $39,shiftmasks($38)     \n"     
        // load the right shift mask                    
-                               
-                               ".big_mama_loop3:               \n"
-                               "       lqd     $10,0($2)       \n"
-                               "       lqd     $14,0($3)       \n"
-                               "       lqd     $11,16($2)      \n"
-                               "       lqd     $15,16($3)      \n"
-                               "       lqd     $12,32($2)      \n"
-                               "       lqd     $16,32($3)      \n"             
        
-                               "       lqd     $13,48($2)      \n"
-                               "       lqd     $17,48($3)      \n"
-                               "       lqd     $18,64($2)      \n"
-
-                               "       shufb   $10,$10,$11,$39 \n"             
// do the truffle shuffle
-                               "       shufb   $11,$11,$12,$39 \n"             
// do the truffle shuffle                                       
-                               "       shufb   $12,$12,$13,$39 \n"             
// do the truffle shuffle
-                               "       shufb   $13,$13,$18,$39 \n"             
// do the truffle shuffle
-                                       
-                               "       fma     $5, $10, $14, $5  \n" // even
-                               "       fma     $6, $11, $15, $6  \n" // even
-                               "       fma     $7, $12, $16, $7  \n" // even   
                
-                               "       fma     $8, $13, $17, $8  \n" // even
-
-                               "       ai      $2,$2,64        \n"
-                               "       ai      $3,$3,64        \n"             
-                               "       ai      $9,$9,-4        \n"
-                               "       brnz    $9,.big_mama_loop3      \n"
-                               "       fa      $18,$5,$6               \n"
-                               "       fa      $19,$7,$8               \n"
-                               "       fa      $5,$18,$19              \n"
-                       );
-
-               asm(    // accumulate word elements in r5 into first element in 
r5      
-                       "       ori     $6,$5,0         \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       fsmbi   $10,0xC000              \n"
-                       "       and             $11,$10,$5              \n"
-                       "       rotqbyi $12, $11, 8     \n"
-                       "       or              $30,$12,$30             \n"
-
-                       "       ai      $31,$31,-1      \n"
-                       "       brz     $31,.finish4    \n"
-                       "       ai      $33,$33,4       \n"                     
        
-               );
-
-/////////////////////////////////////////////////////////////////////////////////////////
-               
-               
-               asm(    // this loop computes single dot-product for [ _ _ _ X ]
-                       
-                               "       xor     $5,$5,$5        \n" // even
-                               "       lqd     $2,64($sp)      \n" // odd
-                               "       ori     $40,$2,0        \n"
-                               "       xor     $6,$6,$6        \n"
-                               "       lqd     $3,48($sp)      \n"
-                               "       xor     $7,$7,$7        \n"
-                               "       lqr     $9,spe_fir_fff_params+16        
\n"
-                               "       xor     $8,$8,$8        \n"
-                               "       rotqbyi $9,$9,12        \n"     // ntaps
-                               "       a       $2, $33, $2     \n"
-                                       
-                               "       andi    $37,$33,0x0c    \n"             
        // find index into masks [0-4]
-                               "       shlqbii $38,$37,2       \n"             
        // mult by 16
-                               "       lqd     $39,shiftmasks($38)     \n"     
        // load the right shift mask                    
-                               
-                               ".big_mama_loop4:               \n"
-                               "       lqd     $10,0($2)       \n"
-                               "       lqd     $14,0($3)       \n"
-                               "       lqd     $11,16($2)      \n"
-                               "       lqd     $15,16($3)      \n"
-                               "       lqd     $12,32($2)      \n"
-                               "       lqd     $16,32($3)      \n"             
        
-                               "       lqd     $13,48($2)      \n"
-                               "       lqd     $17,48($3)      \n"
-                               "       lqd     $18,64($2)      \n"
-
-                               "       shufb   $10,$10,$11,$39 \n"             
// do the truffle shuffle
-                               "       shufb   $11,$11,$12,$39 \n"             
// do the truffle shuffle                                       
-                               "       shufb   $12,$12,$13,$39 \n"             
// do the truffle shuffle
-                               "       shufb   $13,$13,$18,$39 \n"             
// do the truffle shuffle
-                                       
-                               "       fma     $5, $10, $14, $5  \n" // even
-                               "       fma     $6, $11, $15, $6  \n" // even
-                               "       fma     $7, $12, $16, $7  \n" // even   
                
-                               "       fma     $8, $13, $17, $8  \n" // even
-
-                               "       ai      $2,$2,64        \n"
-                               "       ai      $3,$3,64        \n"             
-                               "       ai      $9,$9,-4        \n"
-                               "       brnz    $9,.big_mama_loop4      \n"
-                               "       fa      $18,$5,$6               \n"
-                               "       fa      $19,$7,$8               \n"
-                               "       fa      $5,$18,$19              \n"
-                       );
-
-               asm(    // accumulate word elements in r5 into first element in 
r5      
-                       "       ori     $6,$5,0         \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       shlqbyi $6,$6,4         \n"
-                       "       fa      $5,$6,$5                \n"
-                       "       fsmbi   $10,0xC000              \n"
-                       "       and             $11,$10,$5              \n"
-                       "       rotqbyi $12, $11, 4     \n"
-                       "       or              $30,$12,$30             \n"
-
-                       "       ai      $31,$31,-1      \n"
-                       "       ai      $33,$33,4       \n"                     
        
-               );
-               
-/////////////////////////////////////////////////////////////////////////////////////////
              
-               
-               
-               asm(    // Stores r5 in output
-                       ".finish4: \n"
-
-                       //"     lqd     $2,64($sp)      \n" // odd
-                       //"     ai      $2,$2,0         \n"
-                       //"     lqd     $30,0($2)       \n"
-                       //"     lqd     $34,16($2)      \n"
-                       //"     lqd     $35,32($2)      \n"
-                       //"     andi    $37,$33,3       \n"                     
// find index into masks [0-4]
-                       //"     shlqbii $38,$37,4       \n"                     
// mult by 16
-                       
-                       //"     lqd     $39,shiftmasks($38)     \n"             
// load the right shift mask
-                       //"     shufb   $30,$30,$34,$39 \n"             // do 
the truffle shuffle
-                       //"     shufb   $34,$34,$35,$39 \n"                     
        
-                       
-                       
-                       "       stqd    $30,0($32)              \n"
-                       //"     stqd    $34,16($32)             \n"
-                       "       ai              $32,$32,16              \n"     
        // increment output pointer by 1 new vector.
-                       "       brnz    $31,.start4     \n"             // 
start another output vector if needed
-               );
     }
 
     return 0;

Modified: gnuradio/branches/developers/ngoergen/spe_fir_fff/multi_fir_fff_ppe.c
===================================================================
--- gnuradio/branches/developers/ngoergen/spe_fir_fff/multi_fir_fff_ppe.c       
2008-02-22 20:27:23 UTC (rev 7782)
+++ gnuradio/branches/developers/ngoergen/spe_fir_fff/multi_fir_fff_ppe.c       
2008-02-22 20:55:45 UTC (rev 7783)
@@ -5,7 +5,7 @@
 #include <spe_fir_fff_params.h>
 
 #define NUM_SPE 1
-#define SIZE    (64*10 )
+#define SIZE    (64*1 )
 
 //#define MYMATRIX 1,  2,  3,   4,  5,   6,  7,   8, \
                   9,  10, 11,  12, 13,  14, 15,  16, \
@@ -26,7 +26,7 @@
                         0,0,0,0,1,2,3,4
 
 #define MYMATRIX        1,    2,  3,    4,   5,   6,  7,   8, \
-                        9,    10,  1,    1,   1,   1,  1,   1, \
+                        9,    10,  1,    -1,   1,   1,  1,   1, \
                         1,    1,  1,    1,   1,   1,  1,   1, \
                         1,    1,  1,    1,   1,   1,  1,   1, \
                         1,    1,  1,    1,   1,   1,  1,   1, \
@@ -74,17 +74,20 @@
 
 #define MYMATRIX2100 MYMATRIX210, MYMATRIX210, MYMATRIX210, MYMATRIX210, 
MYMATRIX210, \
                                        MYMATRIX210, MYMATRIX210, MYMATRIX210, 
MYMATRIX210, MYMATRIX210
+#define TESTMATRIX 234,  -4,  23,  -56,  45,    98,  -23,  -7, 0, 0, 0, 0, 0, 
0, 0, 0 
+//#define TESTMATRIX 1,    2,  3,    4,   5,   6,  7,   8, \
+                        9,    10, 234, 234, 234, 234, 234, 234 
+#define TESTTAPS 5, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
-float dumb = 9;
-float in1[SIZE]  = { MYMATRIX10 };
-float dumb2 = 9;
+float in1[16]  = { 
+       // MYMATRIX10 
+               TESTMATRIX
+       };
+float in2[16]  __attribute__((aligned(16))) = { 
+               // MYMATRIX210 
+       TESTTAPS        
+}      ;
 
-float dumb3 = 9;
-float in2[SIZE]  __attribute__((aligned(16))) = { MYMATRIX210 
-               };
-
-float dumb4 = 9;
-
 float out[SIZE] __attribute__((aligned(16)));
 
 spe_fir_fff_params_t spe_fir_fff_params[NUM_SPE] __attribute__((aligned(16)));
@@ -130,7 +133,6 @@
     }
 
     for (i = 0; i < NUM_SPE; ++i) {
-       spe_fir_fff_params[i].ntaps = 4;
         spe[i] = spe_context_create(0, NULL);
         if (!spe[i]) {
             perror("spe_context_create");
@@ -154,7 +156,8 @@
                spe_fir_fff_params[i].ea_in1  = (unsigned long) &in1[i*size ];
                spe_fir_fff_params[i].ea_in2  = (unsigned long) &in2[i*size ];
                spe_fir_fff_params[i].ea_out = (unsigned long) &out[i*size];
-               spe_fir_fff_params[i].nsamples   = 16;
+               spe_fir_fff_params[i].ntaps = 4;
+               spe_fir_fff_params[i].nsamples   = 7;
                spe_fir_fff_params[i].offset   = 0;
                spe_fir_fff_params[i].size  = size;
        

Added: gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.S
===================================================================
--- gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.S          
                (rev 0)
+++ gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.S  
2008-02-22 20:55:45 UTC (rev 7783)
@@ -0,0 +1,201 @@
+       .file   "fir_fff_spe.S"
+#
+# spe_fir_fff: single-precision FIR kernel for the Cell SPU.
+#
+# Prototype (spe_fir_fff_as.h):
+#   void spe_fir_fff(input, taps, output, delayline, offset, nsamples, ntaps)
+#
+# Registers read on entry:
+#   $3  input     base address of the sample buffer
+#   $4  taps      base address of the tap buffer
+#   $5  output    address of the first output quadword
+#   $7  offset    byte offset of the first sample; advanced by 4 per output
+#   $8  nsamples  outputs to produce; decremented before it is tested, so 0
+#                 is not handled
+#   $9  ntaps     inner-loop count; drops by 4 per pass and is tested with
+#                 brnz, so it must be a positive multiple of 4
+#   $6 (delayline in the prototype) is never read.
+#
+# Each inner-loop pass multiplies 64 bytes of taps with input that shufb has
+# realigned, keeping four partial sums in $5-$8.  The sums are then folded
+# into one float, and up to four such results are packed into $30 and stored
+# as one quadword.
+#
+# Registers written: $5-$8, $10-$19, $30-$39.
+#
+# NOTE(review): the "p-n" tags after the instructions look like pipeline
+# (0 = even, 1 = odd) and latency annotations -- confirm.
+# NOTE(review): one pass consumes 16 taps while the counter drops by 4 --
+# confirm the unit of ntaps with the caller.
+#
+.text
+       .align  3
+       .global spe_fir_fff
+       .type   spe_fir_fff, @function
+spe_fir_fff:
+        # $5, $7 and $8 double as accumulators below, so copy them out first.
+        ori     $32,$5,0                # 0-2 output pointer
+        ori     $33,$7,0                # 0-2 byte offset into input
+        ori     $31,$8,0                # 0-2 outputs remaining
+.start4:
+        xor     $30,$30,$30             # 0-2 initialize the current output vector
+        il      $34,16                  # 0-2 rotate count for output insertion
+
+.start1:
+        andi    $37,$33,0x0c            # 0-2 byte position of the sample in its quadword (0, 4, 8, 12)
+        xor     $5,$5,$5                # 0-2 clear partial sum 0
+        xor     $6,$6,$6                # 0-2 clear partial sum 1
+        shlqbii $38,$37,2               # 1-4 mult by 4: byte offset into shiftmasks
+        xor     $7,$7,$7                # 0-2 clear partial sum 2
+        ori     $36,$9,0                # 0-2 reload the tap counter
+        lqd     $39,shiftmasks($38)     # 1-6 load the right shift mask
+        xor     $8,$8,$8                # 0-2 clear partial sum 3
+        a       $35,$33,$3              # 0-2 input address for this output
+        ori     $38,$4,0                # 0-2 tap pointer ($38 is reused)
+
+        hbra    .inner_loop_branch, .inner_loop         # inner-loop hint
+.inner_loop:
+        lqd     $10,0($35)              # 1-6 input quadword 0
+        lqd     $11,16($35)             # 1-6 input quadword 1
+        lqd     $12,32($35)             # 1-6 input quadword 2
+        lqd     $13,48($35)             # 1-6 input quadword 3
+        lqd     $18,64($35)             # 1-6 input quadword 4, only used to realign $13
+        lqd     $14,0($38)              # 1-6 tap quadword 0
+        lqd     $15,16($38)             # 1-6 tap quadword 1
+        ai      $36,$36,-4              # 0-2 count down the taps
+        lqd     $16,32($38)             # 1-6 tap quadword 2
+        ai      $35,$35,64              # 0-2 advance the input pointer
+        lqd     $17,48($38)             # 1-6 tap quadword 3
+        ai      $38,$38,64              # 0-2 advance the tap pointer
+
+        shufb   $10,$10,$11,$39         # 1-4 realign input across each quadword pair
+        fma     $5, $10, $14, $5        # 0-6
+        shufb   $11,$11,$12,$39         # 1-4
+        fma     $6, $11, $15, $6        # 0-6
+        shufb   $12,$12,$13,$39         # 1-4
+        fma     $7, $12, $16, $7        # 0-6
+        shufb   $13,$13,$18,$39         # 1-4
+        fma     $8, $13, $17, $8        # 0-6
+
+.inner_loop_branch:
+        brnz    $36,.inner_loop
+        fa      $18,$5,$6               # 0-6
+        hbra    .outter_loop_branch, .start1    # 1-
+        fa      $19,$7,$8               # 0-6
+        hbra    .finish_branch, .finish_branch_targ     # 1-
+        fa      $5,$18,$19              # 0-6
+
+# accumulate word elements in r5 into first element in r5
+        ori     $6,$5,0                 # 0-2
+        shlqbyi $6,$6,4                 # 1-4
+        fa      $5,$6,$5                # 0-6
+        shlqbyi $6,$6,4                 # 1-4
+        fa      $5,$6,$5                # 0-6
+        shlqbyi $6,$6,4                 # 1-4
+        fa      $5,$6,$5                # 0-6
+        # fsmbi expands each immediate bit into one mask byte, so 0xF000 keeps
+        # all four bytes of word 0 (0xC000 kept only its upper two bytes).
+        fsmbi   $10,0xF000              # 1-4
+        and     $11,$10,$5              # 0-2 isolate the sum in word 0
+        rotqby  $12,$11,$34             # 1-4 rotate the sum into its output slot
+        or      $30,$12,$30             # 0-2 merge into the output vector
+
+        ai      $31,$31,-1              # 0-2 one output done
+.finish_branch:
+        brz     $31,.finish4
+.finish_branch_targ:
+        ai      $33,$33,4               # 0-2 next sample offset
+        ai      $34,$34,-4              # 0-2 next output slot
+
+.outter_loop_branch:
+        brnz    $34, .start1
+        hbra    .outter_outter_loop_branch, .start4
+# Store the packed output vector ($30)
+.finish4:
+        stqd    $30,0($32)
+        ai      $32,$32,16              # increment output pointer by 1 new vector.
+
+.outter_outter_loop_branch:
+        brnz    $31,.start4             # start another output vector if needed
+
+       bi      $lr
+       .size   spe_fir_fff, .-spe_fir_fff
+#
+# shiftmasks: four 16-byte shufb control patterns.  Pattern k selects bytes
+# 4k .. 4k+15 of the 32-byte concatenation of the two shufb source registers.
+#
+.text
+        .global shiftmasks
+        .align  4
+        .type   shiftmasks, @object
+        .size   shiftmasks, 64
+shiftmasks:
+       .long 0x00010203
+       .long 0x04050607
+       .long 0x08090a0b
+       .long 0x0c0d0e0f
+
+       .long 0x04050607
+       .long 0x08090a0b
+       .long 0x0c0d0e0f
+       .long 0x10111213
+
+       .long 0x08090a0b
+       .long 0x0c0d0e0f
+       .long 0x10111213
+       .long 0x14151617
+
+       .long 0x0c0d0e0f
+       .long 0x10111213
+       .long 0x14151617
+       .long 0x18191a1b
+
+# The same table written in decimal (disabled):
+#        .long   66051
+#        .long   67438087
+#        .long   134810123
+#        .long   202182159
+#        .long   67438087
+#        .long   134810123
+#        .long   202182159
+#        .long   269554195
+#        .long   134810123
+#        .long   202182159
+#        .long   269554195
+#        .long   336926231
+#        .long   202182159
+#        .long   269554195
+#        .long   336926231
+#        .long   404298267
+
+# old code to build stack for above routine
+#      stqd    $sp,-128($sp)
+#      ai      $sp,$sp,-128
+#      lqd     $3,32($sp)
+#      hbrp    # 1
+#      cwd     $9,0($sp)
+#      shufb   $3,$4,$3,$9
+#      stqd    $3,32($sp)
+#      lqd     $3,48($sp)
+#      cwd     $4,0($sp)
+#      shufb   $3,$4,$3,$4
+#      stqd    $3,48($sp)
+#      lqd     $3,64($sp)
+#      cwd     $4,0($sp)
+#      shufb   $3,$5,$3,$4
+#      stqd    $3,64($sp)
+#      lqd     $3,80($sp)
+#      hbrp    # 2
+#      cwd     $4,0($sp)
+#      shufb   $3,$6,$3,$4
+#      stqd    $3,80($sp)
+#      lqd     $3,96($sp)
+#      cwd     $4,0($sp)
+#      shufb   $3,$7,$3,$4
+#      stqd    $3,96($sp)
+#      lqd     $3,112($sp)
+#      cwd     $4,0($sp)
+#      shufb   $3,$8,$3,$4
+#      stqd    $3,112($sp)
+
+       .ident  "Hand coded Cell SPU assembly"

Added: gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.h
===================================================================
--- gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.h          
                (rev 0)
+++ gnuradio/branches/developers/ngoergen/spe_fir_fff/spe_fir_fff_as.h  
2008-02-22 20:55:45 UTC (rev 7783)
@@ -0,0 +1,31 @@
+#ifndef SPE_FIR_FFF_AS_H_
+#define SPE_FIR_FFF_AS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * FIR routine implemented in hand-coded SPU assembly (spe_fir_fff_as.S).
+ *
+ * @param input     sample buffer
+ * @param taps      filter taps
+ * @param output    destination for the filtered samples
+ * @param delayline not read by the assembly implementation
+ * @param offset    byte offset of the first sample within input
+ * @param nsamples  number of output samples to produce
+ * @param ntaps     tap count; the assembly loop steps it down by 4
+ */
+void spe_fir_fff(const __vector float *input,
+                 const __vector float *taps,
+                 __vector float *output,
+                 const __vector float *delayline,
+                 unsigned int offset,
+                 unsigned int nsamples,
+                 unsigned int ntaps);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPE_FIR_FFF_AS_H_ */





reply via email to

[Prev in Thread] Current Thread [Next in Thread]