summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- volk/apps/volk_profile.cc 3
-rw-r--r-- volk/include/volk/volk_32f_s32f_convert_32i_a16.h 36
2 files changed, 38 insertions, 1 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index fd9507207..c091a6289 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -13,7 +13,7 @@ extern "C" {
int main(int argc, char *argv[]) {
std::vector<std::string> results;
-
+ /*
//VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4_a16, 1e-4, 2046, 10000, &results);
//VOLK_PROFILE(volk_16i_branch_4_state_8_a16, 1e-4, 2046, 10000, &results);
VOLK_PROFILE(volk_16ic_s32f_deinterleave_real_32f_a16, 1e-5, 32768.0, 204600, 10000, &results);
@@ -52,6 +52,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32fc_x2_multiply_32fc_a16, 1e-4, 0, 204600, 1000, &results);
VOLK_PROFILE(volk_32f_s32f_convert_16i_a16, 1, 32768, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_s32f_convert_16i_u, 1, 32768, 204600, 10000, &results);
+ */
VOLK_PROFILE(volk_32f_s32f_convert_32i_a16, 1, 2<<31, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_s32f_convert_32i_u, 1, 2<<31, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_convert_64f_a16, 1e-4, 0, 204600, 10000, &results);
diff --git a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
index 2927d616c..3f5044313 100644
--- a/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
+++ b/volk/include/volk/volk_32f_s32f_convert_32i_a16.h
@@ -5,6 +5,42 @@
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+   \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 32 bit integer value, rounding per the current MXCSR rounding mode (round-to-nearest-even unless changed)
+   \param outputVector The 32 bit output data buffer (need not be 32 byte aligned)
+   \param inputVector The floating point input data buffer (need not be 32 byte aligned)
+   \param scalar The value multiplied against each point in the input buffer
+   \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_32i_a16_avx(int32_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+  const float* inputVectorPtr = inputVector;
+  int32_t* outputVectorPtr = outputVector;
+  const __m256 vScalar = _mm256_set1_ps(scalar);
+  __m256 inputVal1;
+  __m256i intInputVal1;
+
+  /* Main loop: eight points per iteration. The unaligned load/store forms are used
+     because the aligned 256 bit forms fault unless the address is 32 byte aligned. */
+  for(;number < eighthPoints; number++){
+    inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+    intInputVal1 = _mm256_cvtps_epi32(_mm256_mul_ps(inputVal1, vScalar));
+    _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+    inputVectorPtr += 8;
+    outputVectorPtr += 8;
+  }
+
+  /* Tail: convert with _mm_cvtss_si32 so the leftover points are rounded exactly
+     like the vector path instead of being truncated by a C cast. */
+  for(number = eighthPoints * 8; number < num_points; number++){
+    outputVector[number] = _mm_cvtss_si32(_mm_set_ss(inputVector[number] * scalar));
+  }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!