commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels


From: git
Subject: [Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels
Date: Wed, 15 Oct 2014 23:25:08 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

trondeau pushed a commit to branch master
in repository gnuradio.

commit bb79c5b07b6c7156a355d5d055f9efa5e10be974
Author: Abhishek Bhowmick <address@hidden>
Date:   Mon May 12 21:18:42 2014 +0530

    added new proto-kernels
    
    Conflicts:
        volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
---
 volk/kernels/volk/volk_32f_convert_64f.h           |  72 +++++++++++++
 volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h  |  94 +++++++++++++++++
 volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h     | 116 +++++++++++++++++++++
 volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h   |  81 ++++++++++++--
 volk/kernels/volk/volk_8ic_deinterleave_real_16i.h |  48 +++++++++
 volk/kernels/volk/volk_8ic_deinterleave_real_8i.h  |  58 +++++++++++
 6 files changed, 462 insertions(+), 7 deletions(-)

diff --git a/volk/kernels/volk/volk_32f_convert_64f.h 
b/volk/kernels/volk/volk_32f_convert_64f.h
index 2f03695..12c2f22 100644
--- a/volk/kernels/volk/volk_32f_convert_64f.h
+++ b/volk/kernels/volk/volk_32f_convert_64f.h
@@ -4,6 +4,42 @@
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+
+static inline void volk_32f_convert_64f_u_avx(double* outputVector, const 
float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m256d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    ret = _mm256_cvtps_pd(inputVal);
+    _mm256_storeu_pd(outputVectorPtr, ret);
+
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
   /*!
@@ -68,12 +104,48 @@ static inline void volk_32f_convert_64f_generic(double* 
outputVector, const floa
 
 
 #endif /* INCLUDED_volk_32f_convert_64f_u_H */
+
+
 #ifndef INCLUDED_volk_32f_convert_64f_a_H
 #define INCLUDED_volk_32f_convert_64f_a_H
 
 #include <inttypes.h>
 #include <stdio.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Converts the float values into double values
+    \param dVector The converted double vector values
+    \param fVector The float vector values to be converted
+    \param num_points The number of points in the two vectors to be converted
+  */
+static inline void volk_32f_convert_64f_a_avx(double* outputVector, const 
float* inputVector, unsigned int num_points){
+  unsigned int number = 0;
+
+  const unsigned int quarterPoints = num_points / 4;
+
+  const float* inputVectorPtr = (const float*)inputVector;
+  double* outputVectorPtr = outputVector;
+  __m256d ret;
+  __m128 inputVal;
+
+  for(;number < quarterPoints; number++){
+    inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+    ret = _mm256_cvtps_pd(inputVal);
+    _mm256_store_pd(outputVectorPtr, ret);
+
+    outputVectorPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(; number < num_points; number++){
+    outputVector[number] = (double)(inputVector[number]);
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h 
b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 668a047..3ae6f59 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -6,6 +6,53 @@
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results 
in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to 
be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, 
const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    unsigned int number = 0;
+    unsigned int i = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    unsigned int isodd = num_points & 3;
+    __m256 x, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    // Set up constant scalar vector
+    yl = _mm256_set1_ps(lv_creal(scalar));
+    yh = _mm256_set1_ps(lv_cimag(scalar));
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as 
ar,ai,br,bi
+
+      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, 
br*dr-bi*di, bi*dr+br*di
+
+      _mm256_storeu_ps((float*)c,z); // Store the results back into the C 
container
+
+      a += 4;
+      c += 4;
+    }
+
+    for(i = num_points-isodd; i < num_points; i++) {
+    *c++ = (*a++) * scalar;
+  }
+
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
 /*!
@@ -93,6 +140,53 @@ static inline void 
volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con
 #include <volk/volk_complex.h>
 #include <float.h>
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+  /*!
+    \brief Multiplies the two input complex vectors and stores their results 
in the third vector
+    \param cVector The vector where the results will be stored
+    \param aVector One of the vectors to be multiplied
+    \param bVector One of the vectors to be multiplied
+    \param num_points The number of complex values in aVector and bVector to 
be multiplied together and stored into cVector
+  */
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, 
const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+    unsigned int number = 0;
+    unsigned int i = 0;
+    const unsigned int quarterPoints = num_points / 4;
+    unsigned int isodd = num_points & 3;
+    __m256 x, yl, yh, z, tmp1, tmp2;
+    lv_32fc_t* c = cVector;
+    const lv_32fc_t* a = aVector;
+
+    // Set up constant scalar vector
+    yl = _mm256_set1_ps(lv_creal(scalar));
+    yh = _mm256_set1_ps(lv_cimag(scalar));
+
+    for(;number < quarterPoints; number++){
+
+      x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as 
ar,ai,br,bi
+
+      tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+      x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+      tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+      z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, 
br*dr-bi*di, bi*dr+br*di
+
+      _mm256_store_ps((float*)c,z); // Store the results back into the C 
container
+
+      a += 4;
+      c += 4;
+    }
+
+    for(i = num_points-isodd; i < num_points; i++) {
+    *c++ = (*a++) * scalar;
+  }
+
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
   /*!
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h 
b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 430b747..7e8c91a 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -316,10 +316,67 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const
 
 #endif /*LV_HAVE_SSE4_1*/
 
+#ifdef LV_HAVE_AVX
 
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const 
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int isodd = num_points & 3;
+  unsigned int i = 0;
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm256_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+
+    x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+    y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be 
ai,ar,bi,br,ei,er,fi,fr
+
+    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, 
bi*dr+br*di
+
+    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex 
multiplication results together
+
+    a += 4;
+    b += 4;
+  }
+
+  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+  _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results 
back into the dot product vector
+
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] + 
dotProductVector[2] + dotProductVector[3]);
+
+  for(i = num_points-isodd; i < num_points; i++) {
+    dotProduct += input[i] * taps[i];
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
 
 
 #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
+
 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
 
@@ -680,6 +737,7 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
 
 #endif /*LV_HAVE_SSE3*/
 
+
 #ifdef LV_HAVE_SSE4_1
 
 #include <smmintrin.h>
@@ -976,4 +1034,62 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
 #endif /*LV_HAVE_NEON*/
 
 
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const 
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+  unsigned int isodd = num_points & 3;
+  unsigned int i = 0;
+  lv_32fc_t dotProduct;
+  memset(&dotProduct, 0x0, 2*sizeof(float));
+
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+  const lv_32fc_t* a = input;
+  const lv_32fc_t* b = taps;
+
+  dotProdVal = _mm256_setzero_ps();
+
+  for(;number < quarterPoints; number++){
+
+    x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+    y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be 
ai,ar,bi,br,ei,er,fi,fr
+
+    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, 
bi*dr+br*di
+
+    dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex 
multiplication results together
+
+    a += 4;
+    b += 4;
+  }
+
+  __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+  _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results 
back into the dot product vector
+
+  dotProduct += ( dotProductVector[0] + dotProductVector[1] + 
dotProductVector[2] + dotProductVector[3]);
+
+  for(i = num_points-isodd; i < num_points; i++) {
+    dotProduct += input[i] * taps[i];
+  }
+
+  *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
 #endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h 
b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index b59d22d..37f9d74 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -13,30 +13,31 @@
   \param qBuffer The Q buffer output data
   \param num_points The number of complex data values to be deinterleaved
 */
+
 static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, 
int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
   unsigned int number = 0;
   const int8_t* complexVectorPtr = (int8_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
   int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
   __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
   __m128i complexVal, iOutputVal, qOutputVal;
 
   unsigned int eighthPoints = num_points / 8;
 
   for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr 
+= 16;
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr 
+= 16;          // aligned load
 
-    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);                      
                // shuffle 16 bytes of 128bit complexVal
     qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
 
-    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
-    iOutputVal = _mm_slli_epi16(iOutputVal, 8);
+    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);                // fills 2-byte 
sign extended versions of lower 8 bytes of input to output 
+    iOutputVal = _mm_slli_epi16(iOutputVal, 8);                // shift in 
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
 
     qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
     qOutputVal = _mm_slli_epi16(qOutputVal, 8);
 
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
     _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
 
     iBufferPtr += 8;
@@ -45,12 +46,78 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
 
   number = eighthPoints * 8;
   for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;      // load 8 bit 
Complexvector into 16 bit, shift left by 8 bits and store
     *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
   }
 }
 #endif /* LV_HAVE_SSE4_1 */
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param qBuffer The Q buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, 
int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  int16_t* qBufferPtr = qBuffer;
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+  __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+  __m256i complexVal, iOutputVal, qOutputVal;
+  __m128i complexVal1, complexVal0;
+  __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  
complexVectorPtr += 32;               // aligned load
+
+    // Extract from complexVal to iOutputVal and qOutputVal
+    complexVal1 = _mm256_extractf128_si256(complexVal, 1); 
+    complexVal0 = _mm256_extractf128_si256(complexVal, 0); 
+
+    iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);                    
                // shuffle 16 bytes of 128bit complexVal
+    iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);                    
+    qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+    qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+
+    iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1);              // fills 2-byte 
sign extended versions of lower 8 bytes of input to output 
+    iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8);              // shift in 
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+    iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);              
+    iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+
+    qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+    qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+    qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+    qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+
+    // Pack iOutputVal0,1 to iOutputVal
+    __m256i dummy = _mm256_setzero_si256();
+    iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);       
+    iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);  
+    qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);       
+    qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);  
+
+    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);      // aligned store
+    _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+    iBufferPtr += 16;
+    qBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;      // load 8 bit 
Complexvector into 16 bit, shift left by 8 bits and store
+    *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h 
b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
index 82cedb2..9adec93 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -41,6 +41,54 @@ static inline void 
volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, con
 }
 #endif /* LV_HAVE_SSE4_1 */
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer, 
const lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int16_t* iBufferPtr = iBuffer;
+  __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m256i complexVal, outputVal;
+  __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+
+  unsigned int sixteenthPoints = num_points / 16;
+
+  for(number = 0; number < sixteenthPoints; number++){
+    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  
complexVectorPtr += 32;
+
+    complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+    complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+   
+    outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask); 
+    outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask); 
+
+    outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+    outputVal1 = _mm_slli_epi16(outputVal1, 7);
+    outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+    outputVal0 = _mm_slli_epi16(outputVal0, 7);
+
+    __m256i dummy = _mm256_setzero_si256();
+    outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+    outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+    _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+    iBufferPtr += 16;
+  }
+
+  number = sixteenthPoints * 16;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 
 #ifdef LV_HAVE_GENERIC
 /*!
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h 
b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index 427c9ab..0749422 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -43,6 +43,64 @@ static inline void 
volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const
 }
 #endif /* LV_HAVE_SSSE3 */
 
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Deinterleaves the complex 8 bit vector into I vector data
+  \param complexVector The complex input vector
+  \param iBuffer The I buffer output data
+  \param num_points The number of complex data values to be deinterleaved
+*/
+
+static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const 
lv_8sc_t* complexVector, unsigned int num_points){
+  unsigned int number = 0;
+  const int8_t* complexVectorPtr = (int8_t*)complexVector;
+  int8_t* iBufferPtr = iBuffer;
+  __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+  __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+  __m256i complexVal1, complexVal2, outputVal;
+  __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1, 
outputVal2;
+
+  unsigned int thirtysecondPoints = num_points / 32;
+
+  for(number = 0; number < thirtysecondPoints; number++){
+    
+    complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+    complexVectorPtr += 32;
+    complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+    complexVectorPtr += 32;
+
+    complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+    complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+    complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+    complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+
+    complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+    complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+    outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+
+    
+    complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+    complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+    outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+
+    __m256i dummy = _mm256_setzero_si256();
+    outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+    outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+
+
+    _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+    iBufferPtr += 32;
+  }
+
+  number = thirtysecondPoints * 32;
+  for(; number < num_points; number++){
+    *iBufferPtr++ = *complexVectorPtr++;
+    complexVectorPtr++;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Deinterleaves the complex 8 bit vector into I vector data



reply via email to

[Prev in Thread] Current Thread [Next in Thread]