commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel.


From: git
Subject: [Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel.
Date: Wed, 15 Oct 2014 23:25:09 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

trondeau pushed a commit to branch master
in repository gnuradio.

commit 2fdd530e75c345f6a89e2368c1ccced2fc1ef487
Author: Abhishek Bhowmick <address@hidden>
Date:   Sat Jun 14 18:31:03 2014 +0530

    volk: Added tan kernel.
---
 volk/apps/volk_profile.cc            |   1 +
 volk/kernels/volk/volk_32f_tan_32f.h | 215 +++++++++++++++++++++++++++++++++++
 volk/lib/testqa.cc                   |   1 +
 3 files changed, 217 insertions(+)

diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 1c31add..46929b3 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -167,6 +167,7 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_cos_32f, 1e-6, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_32f_tan_32f, 1e-6, 0, 204602, 1000, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results, 
benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 
204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results, 
benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_32f_tan_32f.h 
b/volk/kernels/volk/volk_32f_tan_32f.h
new file mode 100644
index 0000000..48611b0
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_tan_32f.h
@@ -0,0 +1,215 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+#ifndef INCLUDED_volk_32f_tan_32f_a_H
+#define INCLUDED_volk_32f_tan_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes tangent of input vector and stores results in output vector (aligned variant: four floats per iteration via _mm_load_ps/_mm_store_ps, leftover points via tan())
+  \param bVector The vector where results will be stored (written with _mm_store_ps, which requires 16-byte alignment)
+  \param aVector The input vector of floats (read with _mm_load_ps, which requires 16-byte alignment)
+  \param num_points Number of points for which tangent is to be computed
+*/
+static inline void volk_32f_tan_32f_a_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
+
+       float* bPtr = bVector; // write cursor into the output
+       const float* aPtr = aVector; // read cursor into the input
+    
+       unsigned int number = 0;
+        unsigned int quarterPoints = num_points / 4; // number of full 4-float groups handled by the vector loop
+       unsigned int i = 0;
+
+       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+       __m128 sine, cosine, tangent, condition1, condition2, condition3;
+       __m128i q, r, ones, twos, fours;
+
+       m4pi = _mm_set1_ps(1.273239545); // ~ 4/pi
+       pio4A = _mm_set1_ps(0.78515625); // pio4A + pio4B ~ pi/4, kept as two parts and subtracted one after the other below
+       pio4B = _mm_set1_ps(0.241876e-3);
+       ffours = _mm_set1_ps(4.0);
+       ftwos = _mm_set1_ps(2.0);
+       fones = _mm_set1_ps(1.0);
+       fzeroes = _mm_setzero_ps();
+       ones = _mm_set1_epi32(1);
+       twos = _mm_set1_epi32(2);
+       fours = _mm_set1_epi32(4);
+
+       cp1 = _mm_set1_ps(1.0); // polynomial coefficients: 1, ~1/12, ~1/360, ~1/20160, ~1/1814400
+       cp2 = _mm_set1_ps(0.83333333e-1);
+       cp3 = _mm_set1_ps(0.2777778e-2);
+       cp4 = _mm_set1_ps(0.49603e-4);
+       cp5 = _mm_set1_ps(0.551e-6);
+
+       for(;number < quarterPoints; number++){    // one pass per group of four input floats
+
+       aVal = _mm_load_ps(aPtr);       // aligned load of four inputs
+       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes))); // s = |aVal|: subtracts 2*aVal only in lanes where aVal < 0
+       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi)); // q = integer multiple of pi/4 in s. NOTE(review): _mm_cvtps_epi32 rounds per MXCSR (nearest by default), it does not truncate - confirm this is intended
+       r = _mm_add_epi32(q, _mm_and_si128(q, ones)); // r = q bumped up to the next even integer when q is odd
+
+       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); // s -= r * pi/4, done in two steps with the split constant
+       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+       s = _mm_mul_ps(s, s); // s now holds the square of the reduced argument divided by 8
+       // Evaluate Taylor series
+       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s); // Horner form: ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s; with these coefficients this is the series of 2*(1 - cos(y)) in s = y^2
+       
+       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); // s*(4 - s) maps 2*(1 - cos(y)) to 2*(1 - cos(2y)); three rounds undo the division by 8
+       s = _mm_div_ps(s, ftwos); // s ~ 1 - cos(x) for the reduced argument x
+
+       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); // sqrt((2 - s)*s) = sqrt(1 - cos^2(x)), i.e. the non-negative sine magnitude
+       cosine = _mm_sub_ps(fones, s); // 1 - s
+
+       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes); // lane mask: bit 1 of (q + 1) set -> sine and cosine are exchanged below
+       condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes)); // lane mask for negating sine, built from (q & 4) != 0 and aVal < 0. NOTE(review): both operands are all-ones/zero masks and all-ones is a NaN pattern, for which _mm_cmpneq_ps reports true - confirm the result when both masks are set
+       condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes); // lane mask: bit 2 of (q + 2) set -> cosine is negated below
+
+       __m128 temp = cosine; // keep the original cosine for the exchange
+       cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1)); // cosine += (sine - cosine) in lanes selected by condition1
+       sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); // sine += (old cosine - sine) in lanes selected by condition1
+       sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), 
condition2)); // sine -= 2*sine (sign flip) in lanes selected by condition2
+       cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3)); // cosine -= 2*cosine (sign flip) in lanes selected by condition3
+       tangent = _mm_div_ps(sine, cosine); // tan = sin / cos, lane by lane
+       _mm_store_ps(bPtr, tangent); // aligned store of four results
+       aPtr += 4;
+       bPtr += 4;
+       }
+ 
+       number = quarterPoints * 4; // first index not covered by the vector loop
+       for(;number < num_points; number++){ // remaining num_points % 4 elements
+          *bPtr++ = tan(*aPtr++); // scalar fallback through tan() from math.h
+       }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes tangent of input vector and stores results in output vector (plain C loop calling tan() from math.h once per element)
+  \param bVector The vector where results will be stored (num_points floats are written)
+  \param aVector The input vector of floats (num_points floats are read)
+  \param num_points Number of points for which tangent is to be computed
+*/
+static inline void volk_32f_tan_32f_a_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+    float* bPtr = bVector; // write cursor into the output
+    const float* aPtr = aVector; // read cursor into the input
+    unsigned int number = 0;
+
+    for(; number < num_points; number++){
+      *bPtr++ = tan(*aPtr++); // tan() takes and returns double; the result is converted back to float on the store
+    }
+ 
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_tan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_tan_32f_u_H
+#define INCLUDED_volk_32f_tan_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+  \brief Computes tangent of input vector and stores results in output vector (unaligned variant: four floats per iteration via _mm_loadu_ps/_mm_storeu_ps, leftover points via tan())
+  \param bVector The vector where results will be stored (written with _mm_storeu_ps, so no alignment is required)
+  \param aVector The input vector of floats (read with _mm_loadu_ps, so no alignment is required)
+  \param num_points Number of points for which tangent is to be computed
+*/
+static inline void volk_32f_tan_32f_u_sse4_1(float* bVector, const float* 
aVector, unsigned int num_points){
+
+       float* bPtr = bVector; // write cursor into the output
+       const float* aPtr = aVector; // read cursor into the input
+    
+       unsigned int number = 0;
+        unsigned int quarterPoints = num_points / 4; // number of full 4-float groups handled by the vector loop
+       unsigned int i = 0;
+
+       __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, 
ftwos, fones, fzeroes;
+       __m128 sine, cosine, tangent, condition1, condition2, condition3;
+       __m128i q, r, ones, twos, fours;
+
+       m4pi = _mm_set1_ps(1.273239545); // ~ 4/pi
+       pio4A = _mm_set1_ps(0.78515625); // pio4A + pio4B ~ pi/4, kept as two parts and subtracted one after the other below
+       pio4B = _mm_set1_ps(0.241876e-3);
+       ffours = _mm_set1_ps(4.0);
+       ftwos = _mm_set1_ps(2.0);
+       fones = _mm_set1_ps(1.0);
+       fzeroes = _mm_setzero_ps();
+       ones = _mm_set1_epi32(1);
+       twos = _mm_set1_epi32(2);
+       fours = _mm_set1_epi32(4);
+
+       cp1 = _mm_set1_ps(1.0); // polynomial coefficients: 1, ~1/12, ~1/360, ~1/20160, ~1/1814400
+       cp2 = _mm_set1_ps(0.83333333e-1);
+       cp3 = _mm_set1_ps(0.2777778e-2);
+       cp4 = _mm_set1_ps(0.49603e-4);
+       cp5 = _mm_set1_ps(0.551e-6);
+
+       for(;number < quarterPoints; number++){    // one pass per group of four input floats
+
+       aVal = _mm_loadu_ps(aPtr);      // unaligned load of four inputs
+       s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), 
_mm_cmplt_ps(aVal, fzeroes))); // s = |aVal|: subtracts 2*aVal only in lanes where aVal < 0
+       q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi)); // q = integer multiple of pi/4 in s. NOTE(review): _mm_cvtps_epi32 rounds per MXCSR (nearest by default), it does not truncate - confirm this is intended
+       r = _mm_add_epi32(q, _mm_and_si128(q, ones)); // r = q bumped up to the next even integer when q is odd
+
+       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); // s -= r * pi/4, done in two steps with the split constant
+       s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+       s = _mm_div_ps(s, _mm_set1_ps(8.0));    // The constant is 2^N, for 3 
times argument reduction
+       s = _mm_mul_ps(s, s); // s now holds the square of the reduced argument divided by 8
+       // Evaluate Taylor series
+       s = 
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
 cp5), cp4), s), cp3), s), cp2), s), cp1), s); // Horner form: ((((s*cp5 - cp4)*s + cp3)*s - cp2)*s + cp1)*s; with these coefficients this is the series of 2*(1 - cos(y)) in s = y^2
+       
+       for(i = 0; i < 3; i++)  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); // s*(4 - s) maps 2*(1 - cos(y)) to 2*(1 - cos(2y)); three rounds undo the division by 8
+       s = _mm_div_ps(s, ftwos); // s ~ 1 - cos(x) for the reduced argument x
+
+       sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s)); // sqrt((2 - s)*s) = sqrt(1 - cos^2(x)), i.e. the non-negative sine magnitude
+       cosine = _mm_sub_ps(fones, s); // 1 - s
+
+       condition1 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), 
fzeroes); // lane mask: bit 1 of (q + 1) set -> sine and cosine are exchanged below
+       condition2 = 
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), 
_mm_cmplt_ps(aVal, fzeroes)); // lane mask for negating sine, built from (q & 4) != 0 and aVal < 0. NOTE(review): both operands are all-ones/zero masks and all-ones is a NaN pattern, for which _mm_cmpneq_ps reports true - confirm the result when both masks are set
+       condition3 = 
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), 
fzeroes); // lane mask: bit 2 of (q + 2) set -> cosine is negated below
+
+       __m128 temp = cosine; // keep the original cosine for the exchange
+       cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), 
condition1)); // cosine += (sine - cosine) in lanes selected by condition1
+       sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1)); // sine += (old cosine - sine) in lanes selected by condition1
+       sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), 
condition2)); // sine -= 2*sine (sign flip) in lanes selected by condition2
+       cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, 
_mm_set1_ps(2.0f)), condition3)); // cosine -= 2*cosine (sign flip) in lanes selected by condition3
+       tangent = _mm_div_ps(sine, cosine); // tan = sin / cos, lane by lane
+       _mm_storeu_ps(bPtr, tangent); // unaligned store of four results
+       aPtr += 4;
+       bPtr += 4;
+       }
+ 
+       number = quarterPoints * 4; // first index not covered by the vector loop
+       for(;number < num_points; number++){ // remaining num_points % 4 elements
+          *bPtr++ = tan(*aPtr++); // scalar fallback through tan() from math.h
+       }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes tangent of input vector and stores results in output vector (plain C loop calling tan() from math.h once per element)
+  \param bVector The vector where results will be stored (num_points floats are written)
+  \param aVector The input vector of floats (num_points floats are read)
+  \param num_points Number of points for which tangent is to be computed
+*/
+static inline void volk_32f_tan_32f_u_generic(float* bVector, const float* 
aVector, unsigned int num_points){    
+    float* bPtr = bVector; // write cursor into the output
+    const float* aPtr = aVector; // read cursor into the input
+    unsigned int number = 0;
+
+    for(; number < num_points; number++){
+      *bPtr++ = tan(*aPtr++); // tan() takes and returns double; the result is converted back to float on the store
+    }
+ 
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_tan_32f_u_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index 3510f3b..971c82f 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -49,6 +49,7 @@ VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_sin_32f, 1e-6, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_cos_32f, 1e-6, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_tan_32f, 1e-6, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462, 
1);
 VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1);



reply via email to

[Prev in Thread] Current Thread [Next in Thread]