commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 08/22: volk: add neon protokernel for 16i_s32f_convert_32f.


From: git
Subject: [Commit-gnuradio] [gnuradio] 08/22: volk: add neon protokernel for 16i_s32f_convert_32f.
Date: Fri, 31 Oct 2014 19:22:30 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

jcorgan pushed a commit to branch master
in repository gnuradio.

commit a4894c4f3ba117544e9aaa8a4a7648d7de48db82
Author: Nathan West <address@hidden>
Date:   Sat Oct 18 23:32:56 2014 -0500

    volk: add neon protokernel for 16i_s32f_convert_32f.
    
    This is slower than the generic on gcc 4.8, but it's worth keeping
    around. It's not possible to duplicate what gcc is doing with intrinsics,
    but we can get close using 2 lanes.
---
 volk/kernels/volk/volk_16i_s32f_convert_32f.h | 46 +++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index 24134c8..6ea28f0 100644
--- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -195,7 +195,53 @@ static inline void volk_16i_s32f_convert_32f_generic(float* outputVector, const
 }
 #endif /* LV_HAVE_GENERIC */
 
+#ifdef LV_HAVE_NEON
+  /*!
+    \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value
+    \param inputVector The 16 bit input data buffer
+    \param outputVector The floating point output data buffer
+    \param scalar The value divided against each point in the output buffer
+    \param num_points The number of data values to be converted
+    \note Output buffer does NOT need to be properly aligned. The 8-point vector loop multiplies by 1.0/scalar while the tail loop divides by scalar, so the two paths may round differently.
+  */
+static inline void volk_16i_s32f_convert_32f_neon(float* outputVector, const 
int16_t* inputVector, const float scalar, unsigned int num_points){
+  float* outputPtr = outputVector;
+  const int16_t* inputPtr = inputVector;
+  unsigned int number = 0;
+  unsigned int eighth_points = num_points / 8; // number of full 8-point vector iterations
+
+  int16x4x2_t input16; // 8 inputs held as two 4-lane halves
+  int32x4_t input32_0, input32_1; // the two halves widened to 32-bit ints
+  float32x4_t input_float_0, input_float_1; // the two halves converted to float
+  float32x4x2_t output_float; // scaled results, stored as a pair
+  float32x4_t inv_scale; // 1.0/scalar replicated into every lane
+
+  inv_scale = vdupq_n_f32(1.0/scalar);
+
+  // Per the original author's note: the compiled generic kernel does one
+  // 128-bit load and duplicates every instruction to operate on 64 bits at a
+  // time. With intrinsics this can only be approximated with the 2-lane
+  // vld2/vst2 forms, which is faster than a plain vld1_s16 but still slower.
+  for(number = 0; number < eighth_points; number++){
+    input16 = vld2_s16(inputPtr); // de-interleaving load: val[0] = even elements, val[1] = odd elements
+    // widen 16-bit int to 32-bit int
+    input32_0 = vmovl_s16(input16.val[0]);
+    input32_1 = vmovl_s16(input16.val[1]);
+    // convert 32-bit int to float, then scale by the reciprocal of scalar
+    input_float_0 = vcvtq_f32_s32(input32_0);
+    input_float_1 = vcvtq_f32_s32(input32_1);
+    output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+    output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+    vst2q_f32(outputPtr, output_float); // interleaving store: puts even/odd halves back in input order
+    inputPtr += 8;
+    outputPtr += 8;
+  }
 
+  for(number = eighth_points*8; number < num_points; number++){ // scalar tail for the remaining num_points % 8 points
+    *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+  }
+}
+#endif /* LV_HAVE_NEON */
 
 
 #endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */



reply via email to

[Prev in Thread] Current Thread [Next in Thread]