commit-gnuradio
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binar


From: git
Subject: [Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binary_slicer_8i
Date: Fri, 31 Oct 2014 19:22:30 +0000 (UTC)

This is an automated email from the git hooks/post-receive script.

jcorgan pushed a commit to branch master
in repository gnuradio.

commit c07d1a85c244215b5b9f1f4a078df99677175523
Author: Nathan West <address@hidden>
Date:   Sun Oct 19 17:46:16 2014 -0500

    volk: add neon version for 32f_binary_slicer_8i
---
 volk/kernels/volk/volk_32f_binary_slicer_8i.h | 80 +++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/volk/kernels/volk/volk_32f_binary_slicer_8i.h b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
index 88a25b7..aa14c79 100644
--- a/volk/kernels/volk/volk_32f_binary_slicer_8i.h
+++ b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -206,4 +206,84 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const float* aVector,
 #endif /* LV_HAVE_SSE2 */
 
 
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+  \brief Returns integer 1 if float input is greater than or equal to 0, 0 otherwise
+  \param cVector The char (int8_t) output (either 0 or 1)
+  \param aVector The float input
+  \param num_points The number of values in aVector and stored into cVector
+  \note 16 values are converted per NEON iteration; a scalar loop handles the rest
+*/
+static inline void
+volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
+                                  unsigned int num_points)
+{
+  int8_t* cPtr = cVector;
+  const float* aPtr = aVector;
+  unsigned int number = 0;
+  // each iteration reads 16 floats (two vld2q_f32) and writes 16 bytes (vst2_u8)
+  unsigned int n16points = num_points / 16;
+
+  float32x4x2_t input_val0, input_val1;
+  float32x4_t zero_val;
+  uint32x4x2_t res0_u32, res1_u32;
+  uint16x4x2_t res0_u16x4, res1_u16x4;
+  uint16x8x2_t res_u16x8;
+  uint8x8x2_t res_u8;
+  uint8x8_t one;
+
+  zero_val = vdupq_n_f32(0.0);
+  one = vdup_n_u8(0x01);
+
+  // TODO: this is a good candidate for asm because the vcombines
+  // can be eliminated simply by picking dst registers that are
+  // adjacent.
+  for(number = 0; number < n16points; number++) {
+    // de-interleaving loads: val[0] holds even-indexed, val[1] odd-indexed floats
+    input_val0 = vld2q_f32(aPtr);
+    input_val1 = vld2q_f32(aPtr+8);
+
+    // test against 0; return uint32 (all ones if >= 0, all zeros otherwise)
+    res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+    res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+    res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+    res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+    // narrow uint32 -> uint16 followed by combine to 8-element vectors
+    res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+    res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+    res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+    res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+    res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+    res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+    // narrow uint16x8 -> uint8x8
+    res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+    res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+    // we *could* load twice as much data and do another vcombine here
+    // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
+    // but that turns out to be ~16% slower than this version on zc702
+    // it's possible register contention in GCC scheduler slows it down
+    // and a hand-written asm with quad-word u8 registers is much faster.
+
+    // reduce the all-ones compare mask to the value 0x01
+    res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+    res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+    // interleaving store puts the even/odd results back in input order
+    vst2_u8((unsigned char*)cPtr, res_u8);
+    cPtr += 16;
+    aPtr += 16;
+  }
+
+  // scalar tail for the (num_points % 16) values the vector loop did not cover
+  for(number = n16points * 16; number < num_points; number++) {
+    *cPtr++ = (*aPtr++ >= 0) ? 1 : 0;
+  }
+}
+#endif /* LV_HAVE_NEON */
+
+
 #endif /* INCLUDED_volk_32f_binary_slicer_8i_H */



reply via email to

[Prev in Thread] Current Thread [Next in Thread]