commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace


From: git
Subject: [Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace
Date: Wed, 15 Oct 2014 23:25:09 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

trondeau pushed a commit to branch master
in repository gnuradio.

commit 9230086cad0ea09087e38ce609ef920b24c7a606
Author: Tom Rondeau <address@hidden>
Date:   Wed Oct 15 10:50:10 2014 -0400

    volk (gsoc): whitespace
    
    Conflicts:
        volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
---
 volk/kernels/volk/volk_32f_acos_32f.h             | 234 ++++++------
 volk/kernels/volk/volk_32f_asin_32f.h             | 217 ++++++-----
 volk/kernels/volk/volk_32f_atan_32f.h             | 211 +++++------
 volk/kernels/volk/volk_32f_cos_32f.h              | 282 +++++++-------
 volk/kernels/volk/volk_32f_expfast_32f.h          | 221 +++++------
 volk/kernels/volk/volk_32f_sin_32f.h              | 274 +++++++-------
 volk/kernels/volk/volk_32f_tan_32f.h              | 293 +++++++--------
 volk/kernels/volk/volk_32f_x2_pow_32f.h           | 427 ++++++++++------------
 volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h |  10 +-
 volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h    |   1 -
 volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h  |  44 +--
 11 files changed, 1038 insertions(+), 1176 deletions(-)

diff --git a/volk/kernels/volk/volk_32f_acos_32f.h 
b/volk/kernels/volk/volk_32f_acos_32f.h
index deba615..19444df 100644
--- a/volk/kernels/volk/volk_32f_acos_32f.h
+++ b/volk/kernels/volk/volk_32f_acos_32f.h
@@ -18,83 +18,67 @@
 */
 static inline void volk_32f_acos_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       int i, j;
-
-       __m128 aVal, d, pi, pio2, x, y, z, arccosine;
-       __m128 fzeroes, fones, ftwos, ffours, condition;
-
-       pi = _mm_set1_ps(3.14159265358979323846);
-       pio2 = _mm_set1_ps(3.14159265358979323846/2);
-       fzeroes = _mm_setzero_ps();
-       fones = _mm_set1_ps(1.0);
-       ftwos = _mm_set1_ps(2.0);
-       ffours = _mm_set1_ps(4.0);
-
-       for(;number < quarterPoints; number++){    
-               aVal = _mm_load_ps(aPtr);
-               d = aVal;
-               aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, 
aVal), _mm_sub_ps(fones, aVal))), aVal);
-               z = aVal;
-               condition = _mm_cmplt_ps(z, fzeroes);
-               z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-               x = z;
-               condition = _mm_cmplt_ps(z, fones);
-               x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), 
z), condition));
-
-               for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-               x = _mm_div_ps(fones, x);
-               y = fzeroes;
-               for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-               
-               y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-               condition = _mm_cmpgt_ps(z, fones);
-               
-               y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, 
ftwos)), condition));
-               arccosine = y;
-               condition = _mm_cmplt_ps(aVal, fzeroes);
-               arccosine = _mm_sub_ps(arccosine, 
_mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
-               condition = _mm_cmplt_ps(d, fzeroes);
-               arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
-               _mm_store_ps(bPtr, arccosine);
-               aPtr += 4;
-               bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = acos(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes arccosine of input vector and stores results in output vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which arccosine is to be computed
-*/
-static inline void volk_32f_acos_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+
     unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm_set1_ps(3.14159265358979323846);
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr);
+        d = aVal;
+        aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), 
_mm_sub_ps(fones, aVal))), aVal);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        x = z;
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), 
condition));
+
+        for(i = 0; i < 2; i++){
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, 
x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--){
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), 
condition));
+        arccosine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, 
ftwos), condition));
+        condition = _mm_cmplt_ps(d, fzeroes);
+        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+        _mm_store_ps(bPtr, arccosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
 
-    for(number = 0; number < num_points; number++){
-      *bPtr++ = acos(*aPtr++);
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+       *bPtr++ = acos(*aPtr++);
     }
- 
 }
-#endif /* LV_HAVE_GENERIC */
 
-#endif /* INCLUDED_volk_32f_acos_32f_a_H */
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
+#endif /* INCLUDED_volk_32f_acos_32f_a_H */
 #ifndef INCLUDED_volk_32f_acos_32f_u_H
 #define INCLUDED_volk_32f_acos_32f_u_H
 
@@ -108,58 +92,62 @@ static inline void volk_32f_acos_32f_a_generic(float* 
bVector, const float* aVec
 */
 static inline void volk_32f_acos_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
         unsigned int quarterPoints = num_points / 4;
-       int i, j;
-
-       __m128 aVal, d, pi, pio2, x, y, z, arccosine;
-       __m128 fzeroes, fones, ftwos, ffours, condition;
-
-       pi = _mm_set1_ps(3.14159265358979323846);
-       pio2 = _mm_set1_ps(3.14159265358979323846/2);
-       fzeroes = _mm_setzero_ps();
-       fones = _mm_set1_ps(1.0);
-       ftwos = _mm_set1_ps(2.0);
-       ffours = _mm_set1_ps(4.0);
-
-       for(;number < quarterPoints; number++){    
-               aVal = _mm_loadu_ps(aPtr);
-               d = aVal;
-               aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, 
aVal), _mm_sub_ps(fones, aVal))), aVal);
-               z = aVal;
-               condition = _mm_cmplt_ps(z, fzeroes);
-               z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-               x = z;
-               condition = _mm_cmplt_ps(z, fones);
-               x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), 
z), condition));
-
-               for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-               x = _mm_div_ps(fones, x);
-               y = fzeroes;
-               for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-               
-               y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-               condition = _mm_cmpgt_ps(z, fones);
-               
-               y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, 
ftwos)), condition));
-               arccosine = y;
-               condition = _mm_cmplt_ps(aVal, fzeroes);
-               arccosine = _mm_sub_ps(arccosine, 
_mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
-               condition = _mm_cmplt_ps(d, fzeroes);
-               arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
-               _mm_storeu_ps(bPtr, arccosine);
-               aPtr += 4;
-               bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = acos(*aPtr++);
-       }
+    int i, j;
+
+    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pi = _mm_set1_ps(3.14159265358979323846);
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr);
+        d = aVal;
+        aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), 
_mm_sub_ps(fones, aVal))), aVal);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        x = z;
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), 
condition));
+
+        for(i = 0; i < 2; i++){
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, 
x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--){
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), 
condition));
+        arccosine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, 
ftwos), condition));
+        condition = _mm_cmplt_ps(d, fzeroes);
+        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+        _mm_storeu_ps(bPtr, arccosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = acos(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -171,15 +159,15 @@ static inline void volk_32f_acos_32f_u_sse4_1(float* 
bVector, const float* aVect
   \param aVector The input vector of floats
   \param num_points Number of points for which arccosine is to be computed
 */
-static inline void volk_32f_acos_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_acos_32f_generic(float* bVector, const float* 
aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(number = 0; number < num_points; number++){
-      *bPtr++ = acos(*aPtr++);
+        *bPtr++ = acos(*aPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/kernels/volk/volk_32f_asin_32f.h 
b/volk/kernels/volk/volk_32f_asin_32f.h
index 976aabc..80a834b 100644
--- a/volk/kernels/volk/volk_32f_asin_32f.h
+++ b/volk/kernels/volk/volk_32f_asin_32f.h
@@ -18,76 +18,61 @@
 */
 static inline void volk_32f_asin_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       int i, j;
-
-       __m128 aVal, pio2, x, y, z, arcsine;
-       __m128 fzeroes, fones, ftwos, ffours, condition;
-
-       pio2 = _mm_set1_ps(3.14159265358979323846/2);
-       fzeroes = _mm_setzero_ps();
-       fones = _mm_set1_ps(1.0);
-       ftwos = _mm_set1_ps(2.0);
-       ffours = _mm_set1_ps(4.0);
-
-       for(;number < quarterPoints; number++){    
-               aVal = _mm_load_ps(aPtr);
-               aVal = _mm_div_ps(aVal, 
_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
-               z = aVal;
-               condition = _mm_cmplt_ps(z, fzeroes);
-               z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-               x = z;
-               condition = _mm_cmplt_ps(z, fones);
-               x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), 
z), condition));
-
-               for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-               x = _mm_div_ps(fones, x);
-               y = fzeroes;
-               for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-               
-               y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-               condition = _mm_cmpgt_ps(z, fones);
-               
-               y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, 
ftwos)), condition));
-               arcsine = y;
-               condition = _mm_cmplt_ps(aVal, fzeroes);
-               arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, 
ftwos), condition));
-
-               _mm_store_ps(bPtr, arcsine);
-               aPtr += 4;
-               bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = asin(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes arcsine of input vector and stores results in output vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which arcsine is to be computed
-*/
-static inline void volk_32f_asin_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+
     unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arcsine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr);
+        aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, 
aVal), _mm_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        x = z;
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), 
condition));
+
+        for(i = 0; i < 2; i++){
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, 
x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--){
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), 
condition));
+        arcsine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), 
condition));
+
+        _mm_store_ps(bPtr, arcsine);
+        aPtr += 4;
+        bPtr += 4;
+    }
 
-    for(number = 0; number < num_points; number++){
-      *bPtr++ = asin(*aPtr++);
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = asin(*aPtr++);
     }
- 
 }
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
 #endif /* INCLUDED_volk_32f_asin_32f_a_H */
 
@@ -104,54 +89,58 @@ static inline void volk_32f_asin_32f_a_generic(float* 
bVector, const float* aVec
 */
 static inline void volk_32f_asin_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
         unsigned int quarterPoints = num_points / 4;
-       int i, j;
-
-       __m128 aVal, pio2, x, y, z, arcsine;
-       __m128 fzeroes, fones, ftwos, ffours, condition;
-
-       pio2 = _mm_set1_ps(3.14159265358979323846/2);
-       fzeroes = _mm_setzero_ps();
-       fones = _mm_set1_ps(1.0);
-       ftwos = _mm_set1_ps(2.0);
-       ffours = _mm_set1_ps(4.0);
-
-       for(;number < quarterPoints; number++){    
-               aVal = _mm_loadu_ps(aPtr);
-               aVal = _mm_div_ps(aVal, 
_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
-               z = aVal;
-               condition = _mm_cmplt_ps(z, fzeroes);
-               z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-               x = z;
-               condition = _mm_cmplt_ps(z, fones);
-               x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), 
z), condition));
-
-               for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-               x = _mm_div_ps(fones, x);
-               y = fzeroes;
-               for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-               
-               y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-               condition = _mm_cmpgt_ps(z, fones);
-               
-               y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, 
ftwos)), condition));
-               arcsine = y;
-               condition = _mm_cmplt_ps(aVal, fzeroes);
-               arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, 
ftwos), condition));
-
-               _mm_storeu_ps(bPtr, arcsine);
-               aPtr += 4;
-               bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = asin(*aPtr++);
-       }
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arcsine;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr);
+        aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, 
aVal), _mm_sub_ps(fones, aVal))));
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        x = z;
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), 
condition));
+
+        for(i = 0; i < 2; i++){
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, 
x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--){
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), 
condition));
+        arcsine = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), 
condition));
+
+        _mm_storeu_ps(bPtr, arcsine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+       *bPtr++ = asin(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -163,15 +152,15 @@ static inline void volk_32f_asin_32f_u_sse4_1(float* 
bVector, const float* aVect
   \param aVector The input vector of floats
   \param num_points Number of points for which arcsine is to be computed
 */
-static inline void volk_32f_asin_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_asin_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(number = 0; number < num_points; number++){
-      *bPtr++ = asin(*aPtr++);
+        *bPtr++ = asin(*aPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/kernels/volk/volk_32f_atan_32f.h 
b/volk/kernels/volk/volk_32f_atan_32f.h
index a60e2b8..eaee7f3 100644
--- a/volk/kernels/volk/volk_32f_atan_32f.h
+++ b/volk/kernels/volk/volk_32f_atan_32f.h
@@ -18,75 +18,60 @@
 */
 static inline void volk_32f_atan_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       int i, j;
-
-       __m128 aVal, pio2, x, y, z, arctangent;
-       __m128 fzeroes, fones, ftwos, ffours, condition;
-
-       pio2 = _mm_set1_ps(3.14159265358979323846/2);
-       fzeroes = _mm_setzero_ps();
-       fones = _mm_set1_ps(1.0);
-       ftwos = _mm_set1_ps(2.0);
-       ffours = _mm_set1_ps(4.0);
-
-       for(;number < quarterPoints; number++){    
-               aVal = _mm_load_ps(aPtr);
-               z = aVal;
-               condition = _mm_cmplt_ps(z, fzeroes);
-               z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-               x = z;
-               condition = _mm_cmplt_ps(z, fones);
-               x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), 
z), condition));
-
-               for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-               x = _mm_div_ps(fones, x);
-               y = fzeroes;
-               for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-               
-               y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-               condition = _mm_cmpgt_ps(z, fones);
-               
-               y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, 
ftwos)), condition));
-               arctangent = y;
-               condition = _mm_cmplt_ps(aVal, fzeroes);
-               arctangent = _mm_sub_ps(arctangent, 
_mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-               _mm_store_ps(bPtr, arctangent);
-               aPtr += 4;
-               bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = atan(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes arctangent of input vector and stores results in output 
vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which arctangent is to be computed
-*/
-static inline void volk_32f_atan_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+
     unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arctangent;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        x = z;
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), 
condition));
+
+        for(i = 0; i < 2; i++){
+            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, 
x))));
+        }
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for(j = TERMS - 1; j >=0 ; j--){
+            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), 
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+        }
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), 
condition));
+        arctangent = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, 
ftwos), condition));
+
+        _mm_store_ps(bPtr, arctangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
 
-    for(number = 0; number < num_points; number++){
-      *bPtr++ = atan(*aPtr++);
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = atan(*aPtr++);
     }
- 
 }
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
 
@@ -103,53 +88,53 @@ static inline void volk_32f_atan_32f_a_generic(float* 
bVector, const float* aVec
 */
 static inline void volk_32f_atan_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       int i, j;
-       
-       __m128 aVal, pio2, x, y, z, arctangent;
-       __m128 fzeroes, fones, ftwos, ffours, condition;
-
-       pio2 = _mm_set1_ps(3.14159265358979323846/2);
-       fzeroes = _mm_setzero_ps();
-       fones = _mm_set1_ps(1.0);
-       ftwos = _mm_set1_ps(2.0);
-       ffours = _mm_set1_ps(4.0);
-
-       for(;number < quarterPoints; number++){    
-               aVal = _mm_loadu_ps(aPtr);
-               z = aVal;
-               condition = _mm_cmplt_ps(z, fzeroes);
-               z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
-               x = z;
-               condition = _mm_cmplt_ps(z, fones);
-               x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), 
z), condition));
-
-               for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
-               x = _mm_div_ps(fones, x);
-               y = fzeroes;
-               for(j = TERMS - 1; j >= 0; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-               
-               y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
-               condition = _mm_cmpgt_ps(z, fones);
-               
-               y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, 
ftwos)), condition));
-               arctangent = y;
-               condition = _mm_cmplt_ps(aVal, fzeroes);
-               arctangent = _mm_sub_ps(arctangent, 
_mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
-               _mm_storeu_ps(bPtr, arctangent);
-               aPtr += 4;
-               bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = atan(*aPtr++);
-       }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    int i, j;
+
+    __m128 aVal, pio2, x, y, z, arctangent;
+    __m128 fzeroes, fones, ftwos, ffours, condition;
+
+    pio2 = _mm_set1_ps(3.14159265358979323846/2);
+    fzeroes = _mm_setzero_ps();
+    fones = _mm_set1_ps(1.0);
+    ftwos = _mm_set1_ps(2.0);
+    ffours = _mm_set1_ps(4.0);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr);
+        z = aVal;
+        condition = _mm_cmplt_ps(z, fzeroes);
+        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+        x = z;
+        condition = _mm_cmplt_ps(z, fones);
+        x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), 
condition));
+
+        for(i = 0; i < 2; i++)  x = _mm_add_ps(x, 
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+        x = _mm_div_ps(fones, x);
+        y = fzeroes;
+        for(j = TERMS - 1; j >= 0; j--) y = _mm_add_ps(_mm_mul_ps(y, 
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+
+        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+        condition = _mm_cmpgt_ps(z, fones);
+
+        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), 
condition));
+        arctangent = y;
+        condition = _mm_cmplt_ps(aVal, fzeroes);
+        arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, 
ftwos), condition));
+
+        _mm_storeu_ps(bPtr, arctangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = atan(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -161,15 +146,15 @@ static inline void volk_32f_atan_32f_u_sse4_1(float* 
bVector, const float* aVect
   \param aVector The input vector of floats
   \param num_points Number of points for which arctangent is to be computed
 */
-static inline void volk_32f_atan_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_atan_32f_generic(float* bVector, const float* 
aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(number = 0; number < num_points; number++){
-      *bPtr++ = atan(*aPtr++);
+        *bPtr++ = atan(*aPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/kernels/volk/volk_32f_cos_32f.h 
b/volk/kernels/volk/volk_32f_cos_32f.h
index cd72672..7aa575f 100644
--- a/volk/kernels/volk/volk_32f_cos_32f.h
+++ b/volk/kernels/volk/volk_32f_cos_32f.h
@@ -15,94 +15,75 @@
 */
 static inline void volk_32f_cos_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       unsigned int i = 0;
-
-       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
-       __m128 sine, cosine, condition1, condition2, condition3;
-       __m128i q, r, ones, twos, fours;
-
-       m4pi = _mm_set1_ps(1.273239545);
-       pio4A = _mm_set1_ps(0.78515625);
-       pio4B = _mm_set1_ps(0.241876e-3);
-       ffours = _mm_set1_ps(4.0);
-       ftwos = _mm_set1_ps(2.0);
-       fones = _mm_set1_ps(1.0);
-       fzeroes = _mm_setzero_ps();
-       ones = _mm_set1_epi32(1);
-       twos = _mm_set1_epi32(2);
-       fours = _mm_set1_epi32(4);
-
-       cp1 = _mm_set1_ps(1.0);
-       cp2 = _mm_set1_ps(0.83333333e-1);
-       cp3 = _mm_set1_ps(0.2777778e-2);
-       cp4 = _mm_set1_ps(0.49603e-4);
-       cp5 = _mm_set1_ps(0.551e-6);
-
-       for(;number < quarterPoints; number++){    
-
-       aVal = _mm_load_ps(aPtr);       
-       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
-       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
-       r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
-       s = _mm_mul_ps(s, s);
-       // Evaluate Taylor series
-       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-       
-       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-       s = _mm_div_ps(s, ftwos);
-
-       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-       cosine = _mm_sub_ps(fones, s);
-
-       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
-
-       // Need this condition only for sin
-       //condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
-       condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
-
-       cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
-       cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
-       _mm_store_ps(bPtr, cosine);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = cos(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes cosine of input vector and stores results in output vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which cosine is to be computed
-*/
-static inline void volk_32f_cos_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+
     unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+    __m128 sine, cosine, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for(;number < quarterPoints; number++){
+
+    aVal = _mm_load_ps(aPtr);
+    s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
+    q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+    r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+    s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+    s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+    s = _mm_mul_ps(s, s);
+    // Evaluate Taylor series
+    s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+    for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+    s = _mm_div_ps(s, ftwos);
+
+    sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+    cosine = _mm_sub_ps(fones, s);
+
+    condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, 
ones), twos)), fzeroes);
+
+    // Need this condition only for sin
+    //condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
+    condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, 
twos), fours)), fzeroes);
+
+    cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
+    cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
+    _mm_store_ps(bPtr, cosine);
+    aPtr += 4;
+    bPtr += 4;
+    }
 
-    for(; number < num_points; number++){
-      *bPtr++ = cos(*aPtr++);
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+       *bPtr++ = cos(*aPtr++);
     }
- 
 }
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
 #endif /* INCLUDED_volk_32f_cos_32f_a_H */
 
@@ -119,72 +100,73 @@ static inline void volk_32f_cos_32f_a_generic(float* 
bVector, const float* aVect
 */
 static inline void volk_32f_cos_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
         unsigned int quarterPoints = num_points / 4;
-       unsigned int i = 0;
-
-       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
-       __m128 sine, cosine, condition1, condition2, condition3;
-       __m128i q, r, ones, twos, fours;
-
-       m4pi = _mm_set1_ps(1.273239545);
-       pio4A = _mm_set1_ps(0.78515625);
-       pio4B = _mm_set1_ps(0.241876e-3);
-       ffours = _mm_set1_ps(4.0);
-       ftwos = _mm_set1_ps(2.0);
-       fones = _mm_set1_ps(1.0);
-       fzeroes = _mm_setzero_ps();
-       ones = _mm_set1_epi32(1);
-       twos = _mm_set1_epi32(2);
-       fours = _mm_set1_epi32(4);
-
-       cp1 = _mm_set1_ps(1.0);
-       cp2 = _mm_set1_ps(0.83333333e-1);
-       cp3 = _mm_set1_ps(0.2777778e-2);
-       cp4 = _mm_set1_ps(0.49603e-4);
-       cp5 = _mm_set1_ps(0.551e-6);
-
-       for(;number < quarterPoints; number++){    
-
-       aVal = _mm_loadu_ps(aPtr);      
-       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
-       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
-       r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
-       s = _mm_mul_ps(s, s);
-       // Evaluate Taylor series
-       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-       
-       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-       s = _mm_div_ps(s, ftwos);
-
-       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-       cosine = _mm_sub_ps(fones, s);
-
-       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
-
-       // Need this condition only for sin
-       //condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
-       condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
-
-       cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
-       cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
-       _mm_storeu_ps(bPtr, cosine);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = cos(*aPtr++);
-       }
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+    __m128 sine, cosine, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr);
+        s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+        for(i = 0; i < 3; i++){
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
+
+        // Need this condition only for sin
+        //condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
+        condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
+
+        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
+        cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
+        _mm_storeu_ps(bPtr, cosine);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = cos(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -196,15 +178,15 @@ static inline void volk_32f_cos_32f_u_sse4_1(float* 
bVector, const float* aVecto
   \param aVector The input vector of floats
   \param num_points Number of points for which cosine is to be computed
 */
-static inline void volk_32f_cos_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_cos_32f_generic(float* bVector, const float* 
aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(; number < num_points; number++){
-      *bPtr++ = cos(*aPtr++);
+        *bPtr++ = cos(*aPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/kernels/volk/volk_32f_expfast_32f.h 
b/volk/kernels/volk/volk_32f_expfast_32f.h
index 30f82d5..b8f6ea6 100644
--- a/volk/kernels/volk/volk_32f_expfast_32f.h
+++ b/volk/kernels/volk/volk_32f_expfast_32f.h
@@ -21,31 +21,31 @@
 */
 static inline void volk_32f_expfast_32f_a_avx(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        const unsigned int eighthPoints = num_points / 8;
-
-       __m256 aVal, bVal, a, b;
-       __m256i exp;
-        a = _mm256_set1_ps(A/Mln2);
-        b = _mm256_set1_ps(B-C);
-
-       for(;number < eighthPoints; number++){    
-       aVal = _mm256_load_ps(aPtr); 
-       exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
-       bVal = _mm256_castsi256_ps(exp);
-
-       _mm256_store_ps(bPtr, bVal);
-       aPtr += 8;
-       bPtr += 8;
-       }
- 
-       number = eighthPoints * 8;
-       for(;number < num_points; number++){
-          *bPtr++ = expf(*aPtr++);
-       }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, a, b;
+    __m256i exp;
+    a = _mm256_set1_ps(A/Mln2);
+    b = _mm256_set1_ps(B-C);
+
+    for(;number < eighthPoints; number++){
+        aVal = _mm256_load_ps(aPtr);
+        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+        bVal = _mm256_castsi256_ps(exp);
+
+        _mm256_store_ps(bPtr, bVal);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+        *bPtr++ = expf(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_AVX for aligned */
@@ -60,54 +60,34 @@ static inline void volk_32f_expfast_32f_a_avx(float* 
bVector, const float* aVect
 */
 static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        const unsigned int quarterPoints = num_points / 4;
-
-       __m128 aVal, bVal, a, b;
-       __m128i exp;
-        a = _mm_set1_ps(A/Mln2);
-        b = _mm_set1_ps(B-C);
-
-       for(;number < quarterPoints; number++){    
-       aVal = _mm_load_ps(aPtr); 
-       exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
-       bVal = _mm_castsi128_ps(exp);
-
-       _mm_store_ps(bPtr, bVal);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = expf(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes fast exp (max 7% error) of input vector and stores results 
in output vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which exp is to be computed
-*/
-static inline void volk_32f_expfast_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+
     unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
 
-    for(number = 0; number < num_points; number++){
-      *bPtr++ = expf(*aPtr++);
+    __m128 aVal, bVal, a, b;
+    __m128i exp;
+    a = _mm_set1_ps(A/Mln2);
+    b = _mm_set1_ps(B-C);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr);
+        exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+        bVal = _mm_castsi128_ps(exp);
+
+        _mm_store_ps(bPtr, bVal);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = expf(*aPtr++);
     }
- 
 }
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
 #endif /* INCLUDED_volk_32f_expfast_32f_a_H */
 
@@ -124,31 +104,31 @@ static inline void volk_32f_expfast_32f_a_generic(float* 
bVector, const float* a
 */
 static inline void volk_32f_expfast_32f_u_avx(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        const unsigned int eighthPoints = num_points / 8;
-
-       __m256 aVal, bVal, a, b;
-       __m256i exp;
-        a = _mm256_set1_ps(A/Mln2);
-        b = _mm256_set1_ps(B-C);
-
-       for(;number < eighthPoints; number++){    
-       aVal = _mm256_loadu_ps(aPtr); 
-       exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
-       bVal = _mm256_castsi256_ps(exp);
-
-       _mm256_storeu_ps(bPtr, bVal);
-       aPtr += 8;
-       bPtr += 8;
-       }
- 
-       number = eighthPoints * 8;
-       for(;number < num_points; number++){
-          *bPtr++ = expf(*aPtr++);
-       }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int eighthPoints = num_points / 8;
+
+    __m256 aVal, bVal, a, b;
+    __m256i exp;
+    a = _mm256_set1_ps(A/Mln2);
+    b = _mm256_set1_ps(B-C);
+
+    for(;number < eighthPoints; number++){
+        aVal = _mm256_loadu_ps(aPtr);
+        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+        bVal = _mm256_castsi256_ps(exp);
+
+        _mm256_storeu_ps(bPtr, bVal);
+        aPtr += 8;
+        bPtr += 8;
+    }
+
+    number = eighthPoints * 8;
+    for(;number < num_points; number++){
+        *bPtr++ = expf(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_AVX for aligned */
@@ -163,36 +143,35 @@ static inline void volk_32f_expfast_32f_u_avx(float* 
bVector, const float* aVect
 */
 static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        const unsigned int quarterPoints = num_points / 4;
-
-       __m128 aVal, bVal, a, b;
-       __m128i exp;
-        a = _mm_set1_ps(A/Mln2);
-        b = _mm_set1_ps(B-C);
-
-       for(;number < quarterPoints; number++){    
-       aVal = _mm_loadu_ps(aPtr); 
-       exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
-       bVal = _mm_castsi128_ps(exp);
-
-       _mm_storeu_ps(bPtr, bVal);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = expf(*aPtr++);
-       }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, a, b;
+    __m128i exp;
+    a = _mm_set1_ps(A/Mln2);
+    b = _mm_set1_ps(B-C);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr);
+        exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+        bVal = _mm_castsi128_ps(exp);
+
+        _mm_storeu_ps(bPtr, bVal);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = expf(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
 
-
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Computes fast exp (max 7% error) of input vector and stores results 
in output vector
@@ -200,15 +179,15 @@ static inline void volk_32f_expfast_32f_u_sse4_1(float* 
bVector, const float* aV
   \param aVector The input vector of floats
   \param num_points Number of points for which log is to be computed
 */
-static inline void volk_32f_expfast_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_expfast_32f_generic(float* bVector, const float* 
aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(number = 0; number < num_points; number++){
-      *bPtr++ = expf(*aPtr++);
+        *bPtr++ = expf(*aPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/kernels/volk/volk_32f_sin_32f.h 
b/volk/kernels/volk/volk_32f_sin_32f.h
index 5147c54..96e021a 100644
--- a/volk/kernels/volk/volk_32f_sin_32f.h
+++ b/volk/kernels/volk/volk_32f_sin_32f.h
@@ -15,93 +15,74 @@
 */
 static inline void volk_32f_sin_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       unsigned int i = 0;
-
-       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
-       __m128 sine, cosine, condition1, condition2, condition3;
-       __m128i q, r, ones, twos, fours;
-
-       m4pi = _mm_set1_ps(1.273239545);
-       pio4A = _mm_set1_ps(0.78515625);
-       pio4B = _mm_set1_ps(0.241876e-3);
-       ffours = _mm_set1_ps(4.0);
-       ftwos = _mm_set1_ps(2.0);
-       fones = _mm_set1_ps(1.0);
-       fzeroes = _mm_setzero_ps();
-       ones = _mm_set1_epi32(1);
-       twos = _mm_set1_epi32(2);
-       fours = _mm_set1_epi32(4);
-
-       cp1 = _mm_set1_ps(1.0);
-       cp2 = _mm_set1_ps(0.83333333e-1);
-       cp3 = _mm_set1_ps(0.2777778e-2);
-       cp4 = _mm_set1_ps(0.49603e-4);
-       cp5 = _mm_set1_ps(0.551e-6);
-
-       for(;number < quarterPoints; number++){    
-
-       aVal = _mm_load_ps(aPtr);       
-       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
-       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
-       r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
-       s = _mm_mul_ps(s, s);
-       // Evaluate Taylor series
-       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-       
-       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-       s = _mm_div_ps(s, ftwos);
-
-       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-       cosine = _mm_sub_ps(fones, s);
-
-       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
-       condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
-       // Need this condition only for cos
-       //condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
-
-       sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), 
condition1));
-       sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), 
condition2));
-       _mm_store_ps(bPtr, sine);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = sin(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes sine of input vector and stores results in output vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which sine is to be computed
-*/
-static inline void volk_32f_sin_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+    
     unsigned int number = 0;
-
-    for(; number < num_points; number++){
-      *bPtr++ = sin(*aPtr++);
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+    __m128 sine, cosine, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for(;number < quarterPoints; number++){    
+        aVal = _mm_load_ps(aPtr);   
+        s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+        
+        for(i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
+        condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
+        // Need this condition only for cos
+        //condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
+
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), 
condition1));
+        sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, 
_mm_set1_ps(2.0f)), condition2));
+        _mm_store_ps(bPtr, sine);
+        aPtr += 4;
+        bPtr += 4;
     }
  
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = sin(*aPtr++);
+    }
 }
-#endif /* LV_HAVE_GENERIC */
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
 
@@ -118,71 +99,72 @@ static inline void volk_32f_sin_32f_a_generic(float* 
bVector, const float* aVect
 */
 static inline void volk_32f_sin_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
     
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       unsigned int i = 0;
-
-       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
-       __m128 sine, cosine, condition1, condition2, condition3;
-       __m128i q, r, ones, twos, fours;
-
-       m4pi = _mm_set1_ps(1.273239545);
-       pio4A = _mm_set1_ps(0.78515625);
-       pio4B = _mm_set1_ps(0.241876e-3);
-       ffours = _mm_set1_ps(4.0);
-       ftwos = _mm_set1_ps(2.0);
-       fones = _mm_set1_ps(1.0);
-       fzeroes = _mm_setzero_ps();
-       ones = _mm_set1_epi32(1);
-       twos = _mm_set1_epi32(2);
-       fours = _mm_set1_epi32(4);
-
-       cp1 = _mm_set1_ps(1.0);
-       cp2 = _mm_set1_ps(0.83333333e-1);
-       cp3 = _mm_set1_ps(0.2777778e-2);
-       cp4 = _mm_set1_ps(0.49603e-4);
-       cp5 = _mm_set1_ps(0.551e-6);
-
-       for(;number < quarterPoints; number++){    
-
-       aVal = _mm_loadu_ps(aPtr);      
-       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
-       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
-       r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
-       s = _mm_mul_ps(s, s);
-       // Evaluate Taylor series
-       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-       
-       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-       s = _mm_div_ps(s, ftwos);
-
-       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-       cosine = _mm_sub_ps(fones, s);
-
-       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
-       condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
-       // Need this condition only for cos
-       //condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
-
-       sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), 
condition1));
-       sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), 
condition2));
-       _mm_storeu_ps(bPtr, sine);
-       aPtr += 4;
-       bPtr += 4;
-       }
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+    __m128 sine, cosine, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for(;number < quarterPoints; number++){    
+        aVal = _mm_loadu_ps(aPtr);  
+        s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+        
+        for(i = 0; i < 3; i++) {
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
+        condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
+        // Need this condition only for cos
+        //condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
+
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), 
condition1));
+        sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, 
_mm_set1_ps(2.0f)), condition2));
+        _mm_storeu_ps(bPtr, sine);
+        aPtr += 4;
+        bPtr += 4;
+    }
  
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = sin(*aPtr++);
-       }
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+       *bPtr++ = sin(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -194,13 +176,13 @@ static inline void volk_32f_sin_32f_u_sse4_1(float* 
bVector, const float* aVecto
   \param aVector The input vector of floats
   \param num_points Number of points for which sine is to be computed
 */
-static inline void volk_32f_sin_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_sin_32f_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(number = 0; number < num_points; number++){
-      *bPtr++ = sin(*aPtr++);
+        *bPtr++ = sin(*aPtr++);
     }
  
 }
diff --git a/volk/kernels/volk/volk_32f_tan_32f.h 
b/volk/kernels/volk/volk_32f_tan_32f.h
index 48611b0..70eb5e3 100644
--- a/volk/kernels/volk/volk_32f_tan_32f.h
+++ b/volk/kernels/volk/volk_32f_tan_32f.h
@@ -15,96 +15,78 @@
 */
 static inline void volk_32f_tan_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       unsigned int i = 0;
-
-       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
-       __m128 sine, cosine, tangent, condition1, condition2, condition3;
-       __m128i q, r, ones, twos, fours;
-
-       m4pi = _mm_set1_ps(1.273239545);
-       pio4A = _mm_set1_ps(0.78515625);
-       pio4B = _mm_set1_ps(0.241876e-3);
-       ffours = _mm_set1_ps(4.0);
-       ftwos = _mm_set1_ps(2.0);
-       fones = _mm_set1_ps(1.0);
-       fzeroes = _mm_setzero_ps();
-       ones = _mm_set1_epi32(1);
-       twos = _mm_set1_epi32(2);
-       fours = _mm_set1_epi32(4);
-
-       cp1 = _mm_set1_ps(1.0);
-       cp2 = _mm_set1_ps(0.83333333e-1);
-       cp3 = _mm_set1_ps(0.2777778e-2);
-       cp4 = _mm_set1_ps(0.49603e-4);
-       cp5 = _mm_set1_ps(0.551e-6);
-
-       for(;number < quarterPoints; number++){    
-
-       aVal = _mm_load_ps(aPtr);       
-       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
-       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
-       r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
-       s = _mm_mul_ps(s, s);
-       // Evaluate Taylor series
-       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-       
-       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-       s = _mm_div_ps(s, ftwos);
-
-       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-       cosine = _mm_sub_ps(fones, s);
-
-       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
-       condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
-       condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
-
-       __m128 temp = cosine;
-       cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
-       sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
-       sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), 
condition2));
-       cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
-       tangent = _mm_div_ps(sine, cosine);
-       _mm_store_ps(bPtr, tangent);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = tan(*aPtr++);
-       }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
-  \brief Computes tangent of input vector and stores results in output vector
-  \param bVector The vector where results will be stored
-  \param aVector The input vector of floats
-  \param num_points Number of points for which tangent is to be computed
-*/
-static inline void volk_32f_tan_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
     float* bPtr = bVector;
     const float* aPtr = aVector;
+
     unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+    __m128 sine, cosine, tangent, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_load_ps(aPtr);
+        s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+        for(i = 0; i < 3; i++){
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
+        condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
+        condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
+
+        __m128 temp = cosine;
+        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), 
condition1));
+        sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, 
_mm_set1_ps(2.0f)), condition2));
+        cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
+        tangent = _mm_div_ps(sine, cosine);
+        _mm_store_ps(bPtr, tangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
 
-    for(; number < num_points; number++){
-      *bPtr++ = tan(*aPtr++);
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+       *bPtr++ = tan(*aPtr++);
     }
- 
 }
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
 
 #endif /* INCLUDED_volk_32f_tan_32f_a_H */
 
@@ -121,74 +103,75 @@ static inline void volk_32f_tan_32f_a_generic(float* 
bVector, const float* aVect
 */
 static inline void volk_32f_tan_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
 
-       float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        unsigned int quarterPoints = num_points / 4;
-       unsigned int i = 0;
-
-       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
-       __m128 sine, cosine, tangent, condition1, condition2, condition3;
-       __m128i q, r, ones, twos, fours;
-
-       m4pi = _mm_set1_ps(1.273239545);
-       pio4A = _mm_set1_ps(0.78515625);
-       pio4B = _mm_set1_ps(0.241876e-3);
-       ffours = _mm_set1_ps(4.0);
-       ftwos = _mm_set1_ps(2.0);
-       fones = _mm_set1_ps(1.0);
-       fzeroes = _mm_setzero_ps();
-       ones = _mm_set1_epi32(1);
-       twos = _mm_set1_epi32(2);
-       fours = _mm_set1_epi32(4);
-
-       cp1 = _mm_set1_ps(1.0);
-       cp2 = _mm_set1_ps(0.83333333e-1);
-       cp3 = _mm_set1_ps(0.2777778e-2);
-       cp4 = _mm_set1_ps(0.49603e-4);
-       cp5 = _mm_set1_ps(0.551e-6);
-
-       for(;number < quarterPoints; number++){    
-
-       aVal = _mm_loadu_ps(aPtr);      
-       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
-       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
-       r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
-       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
-       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
-       s = _mm_mul_ps(s, s);
-       // Evaluate Taylor series
-       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-       
-       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
-       s = _mm_div_ps(s, ftwos);
-
-       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
-       cosine = _mm_sub_ps(fones, s);
-
-       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
-       condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
-       condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
-
-       __m128 temp = cosine;
-       cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
-       sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
-       sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), 
condition2));
-       cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
-       tangent = _mm_div_ps(sine, cosine);
-       _mm_storeu_ps(bPtr, tangent);
-       aPtr += 4;
-       bPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *bPtr++ = tan(*aPtr++);
-       }
+    float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    unsigned int quarterPoints = num_points / 4;
+    unsigned int i = 0;
+
+    __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+    __m128 sine, cosine, tangent, condition1, condition2, condition3;
+    __m128i q, r, ones, twos, fours;
+
+    m4pi = _mm_set1_ps(1.273239545);
+    pio4A = _mm_set1_ps(0.78515625);
+    pio4B = _mm_set1_ps(0.241876e-3);
+    ffours = _mm_set1_ps(4.0);
+    ftwos = _mm_set1_ps(2.0);
+    fones = _mm_set1_ps(1.0);
+    fzeroes = _mm_setzero_ps();
+    ones = _mm_set1_epi32(1);
+    twos = _mm_set1_epi32(2);
+    fours = _mm_set1_epi32(4);
+
+    cp1 = _mm_set1_ps(1.0);
+    cp2 = _mm_set1_ps(0.83333333e-1);
+    cp3 = _mm_set1_ps(0.2777778e-2);
+    cp4 = _mm_set1_ps(0.49603e-4);
+    cp5 = _mm_set1_ps(0.551e-6);
+
+    for(;number < quarterPoints; number++){
+        aVal = _mm_loadu_ps(aPtr);
+        s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes)));
+        q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+        r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+        s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+        s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+        s = _mm_mul_ps(s, s);
+        // Evaluate Taylor series
+        s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+        for(i = 0; i < 3; i++){
+            s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+        }
+        s = _mm_div_ps(s, ftwos);
+
+        sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+        cosine = _mm_sub_ps(fones, s);
+
+        condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes);
+        condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes));
+        condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes);
+
+        __m128 temp = cosine;
+        cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1));
+        sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), 
condition1));
+        sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, 
_mm_set1_ps(2.0f)), condition2));
+        cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3));
+        tangent = _mm_div_ps(sine, cosine);
+        _mm_storeu_ps(bPtr, tangent);
+        aPtr += 4;
+        bPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *bPtr++ = tan(*aPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -200,15 +183,15 @@ static inline void volk_32f_tan_32f_u_sse4_1(float* 
bVector, const float* aVecto
   \param aVector The input vector of floats
   \param num_points Number of points for which tangent is to be computed
 */
-static inline void volk_32f_tan_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+static inline void volk_32f_tan_32f_generic(float* bVector, const float* 
aVector, unsigned int num_points){
     float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(; number < num_points; number++){
-      *bPtr++ = tan(*aPtr++);
+        *bPtr++ = tan(*aPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h 
b/volk/kernels/volk/volk_32f_x2_pow_32f.h
index cc11daf..43de06c 100755
--- a/volk/kernels/volk/volk_32f_x2_pow_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h
@@ -15,7 +15,8 @@
 #ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
 #define INCLUDED_volk_32f_x2_pow_32f_a_H
 
-#ifdef LV_HAVE_GENERIC
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
 /*!
   \brief Computes pow(x,y) by using exp and log
   \param cVector The vector where results will be stored
@@ -23,132 +24,108 @@
   \param bVector The input vector of indices
   \param num_points Number of points for which pow is to be computed
 */
-static inline void volk_32f_x2_pow_32f_a_generic(float* cVector, const float* 
bVector, const float* aVector, unsigned int num_points){    
+static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* 
bVector, const float* aVector, unsigned int num_points){
+
     float* cPtr = cVector;
     const float* bPtr = bVector;
     const float* aPtr = aVector;
-    unsigned int number = 0;
 
-    for(number = 0; number < num_points; number++){
-      *cPtr++ = pow(*aPtr++, *bPtr++);
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m128 tmp, fx, mask, pow2n, z, y;
+    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m128i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm_set1_ps(1.0);
+    exp_hi = _mm_set1_ps(88.3762626647949);
+    exp_lo = _mm_set1_ps(-88.3762626647949);
+    ln2 = _mm_set1_ps(0.6931471805);
+    log2EF = _mm_set1_ps(1.44269504088896341);
+    half = _mm_set1_ps(0.5);
+    exp_C1 = _mm_set1_ps(0.693359375);
+    exp_C2 = _mm_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm_set1_epi32(0x7f);
+
+    exp_p0 = _mm_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+    for(;number < quarterPoints; number++){
+        // First compute the logarithm
+        aVal = _mm_load_ps(aPtr);
+        bias = _mm_set1_epi32(127);
+        leadingOne = _mm_set1_ps(1.0f);
+        exp = 
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), 
_mm_set1_epi32(0x7f800000)), 23), bias);
+        logarithm = _mm_cvtepi32_ps(exp);
+
+        frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, 
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+        #if LOG_POLY_DEGREE == 6
+          mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, 
-1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+        #elif LOG_POLY_DEGREE == 5
+          mantissa = POLY4( frac, 2.8882704548164776201f, 
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 
0.0596515482674574969533f);
+        #elif LOG_POLY_DEGREE == 4
+          mantissa = POLY3( frac, 2.61761038894603480148f, 
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+        #elif LOG_POLY_DEGREE == 3
+          mantissa = POLY2( frac, 2.28330284476918490682f, 
-1.04913055217340124191f, 0.204446009836232697516f);
+        #else
+        #error
+        #endif
+
+        logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, 
_mm_sub_ps(frac, leadingOne)));
+        logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+        // Now calculate b*lna
+        bVal = _mm_load_ps(bPtr);
+        bVal = _mm_mul_ps(bVal, logarithm);
+
+        // Now compute exp(b*lna)
+        tmp = _mm_setzero_ps();
+
+        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+        emm0 = _mm_cvttps_epi32(fx);
+        tmp = _mm_cvtepi32_ps(emm0);
+
+        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+        fx = _mm_sub_ps(tmp, mask);
+
+        tmp = _mm_mul_ps(fx, exp_C1);
+        z = _mm_mul_ps(fx, exp_C2);
+        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+        z = _mm_mul_ps(bVal, bVal);
+
+        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+        y = _mm_add_ps(y, one);
+
+        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 
23);
+
+        pow2n = _mm_castsi128_ps(emm0);
+        cVal = _mm_mul_ps(y, pow2n);
+
+        _mm_store_ps(cPtr, cVal);
+
+        aPtr += 4;
+        bPtr += 4;
+        cPtr += 4;
     }
- 
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
-  \brief Computes pow(x,y) by using exp and log
-  \param cVector The vector where results will be stored
-  \param aVector The input vector of bases
-  \param bVector The input vector of indices
-  \param num_points Number of points for which pow is to be computed
-*/
-static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float* 
bVector, const float* aVector, unsigned int num_points){
 
-       float* cPtr = cVector;
-       const float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        const unsigned int quarterPoints = num_points / 4;
-
-       __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-       __m128 tmp, fx, mask, pow2n, z, y;
-       __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-       __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-       __m128i bias, exp, emm0, pi32_0x7f;
-       
-       one = _mm_set1_ps(1.0);
-       exp_hi = _mm_set1_ps(88.3762626647949);
-       exp_lo = _mm_set1_ps(-88.3762626647949);
-       ln2 = _mm_set1_ps(0.6931471805);
-       log2EF = _mm_set1_ps(1.44269504088896341);
-       half = _mm_set1_ps(0.5);
-       exp_C1 = _mm_set1_ps(0.693359375);
-       exp_C2 = _mm_set1_ps(-2.12194440e-4);
-       pi32_0x7f = _mm_set1_epi32(0x7f);
-
-       exp_p0 = _mm_set1_ps(1.9875691500e-4);
-       exp_p1 = _mm_set1_ps(1.3981999507e-3);
-       exp_p2 = _mm_set1_ps(8.3334519073e-3);
-       exp_p3 = _mm_set1_ps(4.1665795894e-2);
-       exp_p4 = _mm_set1_ps(1.6666665459e-1);
-       exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
-       for(;number < quarterPoints; number++){    
-
-       // First compute the logarithm
-       aVal = _mm_load_ps(aPtr); 
-       bias = _mm_set1_epi32(127);
-       leadingOne = _mm_set1_ps(1.0f);
-       exp = 
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), 
_mm_set1_epi32(0x7f800000)), 23), bias);
-       logarithm = _mm_cvtepi32_ps(exp);
-
-       frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, 
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
-
-       #if LOG_POLY_DEGREE == 6
-         mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, 
-1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
-       #elif LOG_POLY_DEGREE == 5
-         mantissa = POLY4( frac, 2.8882704548164776201f, 
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 
0.0596515482674574969533f);
-       #elif LOG_POLY_DEGREE == 4
-         mantissa = POLY3( frac, 2.61761038894603480148f, 
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-       #elif LOG_POLY_DEGREE == 3
-         mantissa = POLY2( frac, 2.28330284476918490682f, 
-1.04913055217340124191f, 0.204446009836232697516f);
-       #else
-       #error
-       #endif
-
-       logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, 
leadingOne)));
-       logarithm = _mm_mul_ps(logarithm, ln2); 
-
-       
-       // Now calculate b*lna
-       bVal = _mm_load_ps(bPtr);
-       bVal = _mm_mul_ps(bVal, logarithm);
-
-       // Now compute exp(b*lna)
-       tmp = _mm_setzero_ps();
-       
-       bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
-
-       fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
-
-       emm0 = _mm_cvttps_epi32(fx);
-       tmp = _mm_cvtepi32_ps(emm0);
-
-       mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
-       fx = _mm_sub_ps(tmp, mask);
-
-       tmp = _mm_mul_ps(fx, exp_C1);
-       z = _mm_mul_ps(fx, exp_C2);
-       bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
-       z = _mm_mul_ps(bVal, bVal);
-       
-       y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
-       y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
-       y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
-       y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
-       y = _mm_add_ps(y, one);
-
-       emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 
23);
-
-       pow2n = _mm_castsi128_ps(emm0);
-       cVal = _mm_mul_ps(y, pow2n);
-
-       _mm_store_ps(cPtr, cVal);
-
-       aPtr += 4;
-       bPtr += 4;
-       cPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *cPtr++ = pow(*aPtr++, *bPtr++);
-       }
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *cPtr++ = pow(*aPtr++, *bPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for aligned */
@@ -166,16 +143,16 @@ static inline void volk_32f_x2_pow_32f_a_sse4_1(float* 
cVector, const float* bVe
   \param bVector The input vector of indices
   \param num_points Number of points for which pow is to be computed
 */
-static inline void volk_32f_x2_pow_32f_u_generic(float* cVector, const float* 
bVector, const float* aVector, unsigned int num_points){    
+static inline void volk_32f_x2_pow_32f_generic(float* cVector, const float* 
bVector, const float* aVector, unsigned int num_points){
     float* cPtr = cVector;
     const float* bPtr = bVector;
     const float* aPtr = aVector;
     unsigned int number = 0;
 
     for(number = 0; number < num_points; number++){
-      *cPtr++ = pow(*aPtr++, *bPtr++);
+        *cPtr++ = pow(*aPtr++, *bPtr++);
     }
- 
+
 }
 #endif /* LV_HAVE_GENERIC */
 
@@ -191,107 +168,107 @@ static inline void volk_32f_x2_pow_32f_u_generic(float* 
cVector, const float* bV
 */
 static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float* 
bVector, const float* aVector, unsigned int num_points){
 
-       float* cPtr = cVector;
-       const float* bPtr = bVector;
-       const float* aPtr = aVector;
-    
-       unsigned int number = 0;
-        const unsigned int quarterPoints = num_points / 4;
-
-       __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
-       __m128 tmp, fx, mask, pow2n, z, y;
-       __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
-       __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
-       __m128i bias, exp, emm0, pi32_0x7f;
-       
-       one = _mm_set1_ps(1.0);
-       exp_hi = _mm_set1_ps(88.3762626647949);
-       exp_lo = _mm_set1_ps(-88.3762626647949);
-       ln2 = _mm_set1_ps(0.6931471805);
-       log2EF = _mm_set1_ps(1.44269504088896341);
-       half = _mm_set1_ps(0.5);
-       exp_C1 = _mm_set1_ps(0.693359375);
-       exp_C2 = _mm_set1_ps(-2.12194440e-4);
-       pi32_0x7f = _mm_set1_epi32(0x7f);
-
-       exp_p0 = _mm_set1_ps(1.9875691500e-4);
-       exp_p1 = _mm_set1_ps(1.3981999507e-3);
-       exp_p2 = _mm_set1_ps(8.3334519073e-3);
-       exp_p3 = _mm_set1_ps(4.1665795894e-2);
-       exp_p4 = _mm_set1_ps(1.6666665459e-1);
-       exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
-       for(;number < quarterPoints; number++){    
-
-       // First compute the logarithm
-       aVal = _mm_loadu_ps(aPtr); 
-       bias = _mm_set1_epi32(127);
-       leadingOne = _mm_set1_ps(1.0f);
-       exp = 
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), 
_mm_set1_epi32(0x7f800000)), 23), bias);
-       logarithm = _mm_cvtepi32_ps(exp);
-
-       frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, 
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
-
-       #if LOG_POLY_DEGREE == 6
-         mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, 
-1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
-       #elif LOG_POLY_DEGREE == 5
-         mantissa = POLY4( frac, 2.8882704548164776201f, 
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 
0.0596515482674574969533f);
-       #elif LOG_POLY_DEGREE == 4
-         mantissa = POLY3( frac, 2.61761038894603480148f, 
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-       #elif LOG_POLY_DEGREE == 3
-         mantissa = POLY2( frac, 2.28330284476918490682f, 
-1.04913055217340124191f, 0.204446009836232697516f);
-       #else
-       #error
-       #endif
-
-       logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, 
leadingOne)));
-       logarithm = _mm_mul_ps(logarithm, ln2); 
-
-       
-       // Now calculate b*lna
-       bVal = _mm_loadu_ps(bPtr);
-       bVal = _mm_mul_ps(bVal, logarithm);
-
-       // Now compute exp(b*lna)
-       tmp = _mm_setzero_ps();
-       
-       bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
-
-       fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
-
-       emm0 = _mm_cvttps_epi32(fx);
-       tmp = _mm_cvtepi32_ps(emm0);
-
-       mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
-       fx = _mm_sub_ps(tmp, mask);
-
-       tmp = _mm_mul_ps(fx, exp_C1);
-       z = _mm_mul_ps(fx, exp_C2);
-       bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
-       z = _mm_mul_ps(bVal, bVal);
-       
-       y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
-       y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
-       y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
-       y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
-       y = _mm_add_ps(y, one);
-
-       emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 
23);
-
-       pow2n = _mm_castsi128_ps(emm0);
-       cVal = _mm_mul_ps(y, pow2n);
-
-       _mm_storeu_ps(cPtr, cVal);
-
-       aPtr += 4;
-       bPtr += 4;
-       cPtr += 4;
-       }
- 
-       number = quarterPoints * 4;
-       for(;number < num_points; number++){
-          *cPtr++ = pow(*aPtr++, *bPtr++);
-       }
+    float* cPtr = cVector;
+    const float* bPtr = bVector;
+    const float* aPtr = aVector;
+
+    unsigned int number = 0;
+    const unsigned int quarterPoints = num_points / 4;
+
+    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+    __m128 tmp, fx, mask, pow2n, z, y;
+    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+    __m128i bias, exp, emm0, pi32_0x7f;
+
+    one = _mm_set1_ps(1.0);
+    exp_hi = _mm_set1_ps(88.3762626647949);
+    exp_lo = _mm_set1_ps(-88.3762626647949);
+    ln2 = _mm_set1_ps(0.6931471805);
+    log2EF = _mm_set1_ps(1.44269504088896341);
+    half = _mm_set1_ps(0.5);
+    exp_C1 = _mm_set1_ps(0.693359375);
+    exp_C2 = _mm_set1_ps(-2.12194440e-4);
+    pi32_0x7f = _mm_set1_epi32(0x7f);
+
+    exp_p0 = _mm_set1_ps(1.9875691500e-4);
+    exp_p1 = _mm_set1_ps(1.3981999507e-3);
+    exp_p2 = _mm_set1_ps(8.3334519073e-3);
+    exp_p3 = _mm_set1_ps(4.1665795894e-2);
+    exp_p4 = _mm_set1_ps(1.6666665459e-1);
+    exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+    for(;number < quarterPoints; number++){
+
+    // First compute the logarithm
+    aVal = _mm_loadu_ps(aPtr);
+    bias = _mm_set1_epi32(127);
+    leadingOne = _mm_set1_ps(1.0f);
+    exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), 
_mm_set1_epi32(0x7f800000)), 23), bias);
+    logarithm = _mm_cvtepi32_ps(exp);
+
+    frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, 
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+    #if LOG_POLY_DEGREE == 6
+      mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, 
-1.2315303f,  3.1821337e-1f, -3.4436006e-2f);
+    #elif LOG_POLY_DEGREE == 5
+      mantissa = POLY4( frac, 2.8882704548164776201f, 
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 
0.0596515482674574969533f);
+    #elif LOG_POLY_DEGREE == 4
+      mantissa = POLY3( frac, 2.61761038894603480148f, 
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+    #elif LOG_POLY_DEGREE == 3
+      mantissa = POLY2( frac, 2.28330284476918490682f, 
-1.04913055217340124191f, 0.204446009836232697516f);
+    #else
+    #error
+    #endif
+
+    logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, 
leadingOne)));
+    logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+    // Now calculate b*lna
+    bVal = _mm_loadu_ps(bPtr);
+    bVal = _mm_mul_ps(bVal, logarithm);
+
+    // Now compute exp(b*lna)
+    tmp = _mm_setzero_ps();
+
+    bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+    fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+    emm0 = _mm_cvttps_epi32(fx);
+    tmp = _mm_cvtepi32_ps(emm0);
+
+    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+    fx = _mm_sub_ps(tmp, mask);
+
+    tmp = _mm_mul_ps(fx, exp_C1);
+    z = _mm_mul_ps(fx, exp_C2);
+    bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+    z = _mm_mul_ps(bVal, bVal);
+
+    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+    y = _mm_add_ps(y, one);
+
+    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+    pow2n = _mm_castsi128_ps(emm0);
+    cVal = _mm_mul_ps(y, pow2n);
+
+    _mm_storeu_ps(cPtr, cVal);
+
+    aPtr += 4;
+    bPtr += 4;
+    cPtr += 4;
+    }
+
+    number = quarterPoints * 4;
+    for(;number < num_points; number++){
+        *cPtr++ = pow(*aPtr++, *bPtr++);
+    }
 }
 
 #endif /* LV_HAVE_SSE4_1 for unaligned */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h 
b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 3ae6f59..337ef18 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -29,7 +29,6 @@ static inline void 
volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const
     yh = _mm256_set1_ps(lv_cimag(scalar));
 
     for(;number < quarterPoints; number++){
-
       x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as 
ar,ai,br,bi
 
       tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -47,8 +46,8 @@ static inline void 
volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const
     }
 
     for(i = num_points-isodd; i < num_points; i++) {
-    *c++ = (*a++) * scalar;
-  }
+        *c++ = (*a++) * scalar;
+    }
 
 }
 #endif /* LV_HAVE_AVX */
@@ -163,7 +162,6 @@ static inline void 
volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const
     yh = _mm256_set1_ps(lv_cimag(scalar));
 
     for(;number < quarterPoints; number++){
-
       x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as 
ar,ai,br,bi
 
       tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -181,8 +179,8 @@ static inline void 
volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const
     }
 
     for(i = num_points-isodd; i < num_points; i++) {
-    *c++ = (*a++) * scalar;
-  }
+        *c++ = (*a++) * scalar;
+    }
 
 }
 #endif /* LV_HAVE_AVX */
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h 
b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 7e8c91a..69eec0d 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -338,7 +338,6 @@ static inline void 
volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_
   dotProdVal = _mm256_setzero_ps();
 
   for(;number < quarterPoints; number++){
-
     x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
     y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
 
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h 
b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index 37f9d74..ee43518 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -19,25 +19,25 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
   const int8_t* complexVectorPtr = (int8_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
   int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0);  // set 16 byte values
   __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
   __m128i complexVal, iOutputVal, qOutputVal;
 
   unsigned int eighthPoints = num_points / 8;
 
   for(number = 0; number < eighthPoints; number++){
-    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr 
+= 16;          // aligned load
+    complexVal = _mm_load_si128((__m128i*)complexVectorPtr);  complexVectorPtr 
+= 16;   // aligned load
 
-    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);                      
                // shuffle 16 bytes of 128bit complexVal
+    iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);   // shuffle 16 
bytes of 128bit complexVal
     qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
 
-    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);                // fills 2-byte 
sign extended versions of lower 8 bytes of input to output 
-    iOutputVal = _mm_slli_epi16(iOutputVal, 8);                // shift in 
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+    iOutputVal = _mm_cvtepi8_epi16(iOutputVal);     // fills 2-byte sign 
extended versions of lower 8 bytes of input to output
+    iOutputVal = _mm_slli_epi16(iOutputVal, 8);     // shift in left by 8 
bits, each of the 8 16-bit integers, shift in with zeros
 
     qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
     qOutputVal = _mm_slli_epi16(qOutputVal, 8);
 
-    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+    _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);  // aligned store
     _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
 
     iBufferPtr += 8;
@@ -46,7 +46,7 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
 
   number = eighthPoints * 8;
   for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;      // load 8 bit 
Complexvector into 16 bit, shift left by 8 bits and store
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;   // load 8 bit 
Complexvector into 16 bit, shift left by 8 bits and store
     *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
   }
 }
@@ -66,7 +66,7 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
   const int8_t* complexVectorPtr = (int8_t*)complexVector;
   int16_t* iBufferPtr = iBuffer;
   int16_t* qBufferPtr = qBuffer;
-  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+  __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 14, 12, 10, 8, 6, 4, 2, 0);  // set 16 byte values
   __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
   __m256i complexVal, iOutputVal, qOutputVal;
   __m128i complexVal1, complexVal0;
@@ -75,20 +75,20 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
   unsigned int sixteenthPoints = num_points / 16;
 
   for(number = 0; number < sixteenthPoints; number++){
-    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  
complexVectorPtr += 32;               // aligned load
+    complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);  
complexVectorPtr += 32;    // aligned load
 
     // Extract from complexVal to iOutputVal and qOutputVal
-    complexVal1 = _mm256_extractf128_si256(complexVal, 1); 
-    complexVal0 = _mm256_extractf128_si256(complexVal, 0); 
+    complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+    complexVal0 = _mm256_extractf128_si256(complexVal, 0);
 
-    iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);                    
                // shuffle 16 bytes of 128bit complexVal
-    iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);                    
+    iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);     // shuffle 16 
bytes of 128bit complexVal
+    iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
     qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
     qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
 
-    iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1);              // fills 2-byte 
sign extended versions of lower 8 bytes of input to output 
-    iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8);              // shift in 
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
-    iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);              
+    iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1);   // fills 2-byte sign 
extended versions of lower 8 bytes of input to output
+    iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8);   // shift in left by 8 
bits, each of the 8 16-bit integers, shift in with zeros
+    iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
     iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
 
     qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
@@ -98,12 +98,12 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
 
     // Pack iOutputVal0,1 to iOutputVal
     __m256i dummy = _mm256_setzero_si256();
-    iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);       
-    iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);  
-    qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);       
-    qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);  
+    iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+    iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+    qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+    qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
 
-    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);      // aligned store
+    _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);   // aligned store
     _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
 
     iBufferPtr += 16;
@@ -112,7 +112,7 @@ static inline void 
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
 
   number = sixteenthPoints * 16;
   for(; number < num_points; number++){
-    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;      // load 8 bit 
Complexvector into 16 bit, shift left by 8 bits and store
+    *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;   // load 8 bit 
Complexvector into 16 bit, shift left by 8 bits and store
     *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
   }
 }



reply via email to

[Prev in Thread] Current Thread [Next in Thread]