summaryrefslogtreecommitdiff
path: root/volk
diff options
context:
space:
mode:
Diffstat (limited to 'volk')
-rw-r--r--volk/apps/volk_profile.cc1
-rw-r--r--volk/gen/machines.xml6
-rw-r--r--volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h4
-rw-r--r--volk/include/volk/volk_16u_byteswap_u.h63
-rw-r--r--volk/include/volk/volk_32f_x2_dot_prod_32f_u.h51
-rw-r--r--volk/lib/CMakeLists.txt8
-rw-r--r--volk/lib/testqa.cc5
7 files changed, 103 insertions, 35 deletions
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 6244abb35..648f4d878 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -33,6 +33,7 @@ int main(int argc, char *argv[]) {
//VOLK_PROFILE(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 10000, &results);
//VOLK_PROFILE(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 10000, &results);
VOLK_PROFILE(volk_16u_byteswap_a, 0, 0, 204600, 10000, &results);
+ VOLK_PROFILE(volk_16u_byteswap_u, 0, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_accumulator_s32f_a, 1e-4, 0, 204600, 10000, &results);
VOLK_PROFILE(volk_32f_x2_add_32f_a, 1e-4, 0, 204600, 10000, &results);
diff --git a/volk/gen/machines.xml b/volk/gen/machines.xml
index 8e3c9c8c2..d88a1a50c 100644
--- a/volk/gen/machines.xml
+++ b/volk/gen/machines.xml
@@ -10,7 +10,7 @@
</machine>
<machine name="sse">
-<archs>generic 32|64| mmx sse orc|</archs>
+<archs>generic 32|64| mmx| sse orc|</archs>
</machine>
-->
@@ -20,7 +20,7 @@
<!-- trailing | bar means generate without either for MSVC -->
<machine name="sse2">
-<archs>generic 32|64| mmx sse sse2 orc|</archs>
+<archs>generic 32|64| mmx| sse sse2 orc|</archs>
</machine>
<machine name="sse3">
@@ -45,7 +45,7 @@
<!-- trailing | bar means generate without either for MSVC -->
<machine name="avx">
-<archs>generic 32|64| mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
</machine>
<machine name="altivec">
diff --git a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
index 940aa5de7..1f6554af8 100644
--- a/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
+++ b/volk/include/volk/volk_16i_32fc_dot_prod_32fc_a.h
@@ -37,7 +37,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_generic(lv_32fc_t* result, cons
#endif /*LV_HAVE_GENERIC*/
-#ifdef LV_HAVE_SSE
+#if LV_HAVE_SSE && LV_HAVE_MMX
static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
@@ -116,7 +116,7 @@ static inline void volk_16i_32fc_dot_prod_32fc_a_sse( lv_32fc_t* result, const
*result = *(lv_32fc_t*)(&res[0]);
}
-#endif /*LV_HAVE_SSE*/
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_a_H*/
diff --git a/volk/include/volk/volk_16u_byteswap_u.h b/volk/include/volk/volk_16u_byteswap_u.h
new file mode 100644
index 000000000..8ef627a62
--- /dev/null
+++ b/volk/include/volk/volk_16u_byteswap_u.h
@@ -0,0 +1,63 @@
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+/*!
+ \brief Byteswaps, in place, a vector of uint16_t values using unaligned SSE2 loads/stores (eight values per iteration).
+ \param intsToSwap The vector of data to byte swap; it is overwritten with the swapped values
+ \param num_points The number of 16-bit data points in intsToSwap
+*/
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8; // number of full 8-value (128-bit) groups
+ for(;number < eighthPoints; number++){
+ // Load eight 16-bit values (unaligned load); inputPtr is advanced only after the store since the swap is in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Shift each 16-bit lane by 8 bits in both directions: low byte -> high half, high byte -> low half
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together to form the byte-swapped lanes
+ output = _mm_or_si128(left, right);
+ // Store the results back over the input (unaligned store)
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap the remaining (num_points % 8) points one at a time:
+ number = eighthPoints*8;
+ for(; number < num_points; number++){
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00)); // high byte -> low, low byte -> high
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Byteswaps, in place, a vector of uint16_t values using a plain scalar loop (one value per iteration).
+ \param intsToSwap The vector of data to byte swap; it is overwritten with the swapped values
+ \param num_points The number of 16-bit data points in intsToSwap
+*/
+static inline void volk_16u_byteswap_u_generic(uint16_t* intsToSwap, unsigned int num_points){
+ unsigned int point;
+ uint16_t* inputPtr = intsToSwap;
+ for(point = 0; point < num_points; point++){
+ uint16_t output = *inputPtr; // read the current value before overwriting it
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00)); // high byte -> low, low byte -> high
+ *inputPtr = output; // write the swapped value back in place
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
diff --git a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
index ab33a2587..b24e8b1f7 100644
--- a/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
+++ b/volk/include/volk/volk_32f_x2_dot_prod_32f_u.h
@@ -48,14 +48,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -87,9 +87,6 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse( float* result, const float*
number = sixteenthPoints*16;
for(;number < num_points; number++){
dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
- dotProduct += ((*aPtr++) * (*bPtr++));
}
*result = dotProduct;
@@ -121,14 +118,14 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse3(float * result, const float *
for(;number < sixteenthPoints; number++){
- a0Val = _mm_load_ps(aPtr);
- a1Val = _mm_load_ps(aPtr+4);
- a2Val = _mm_load_ps(aPtr+8);
- a3Val = _mm_load_ps(aPtr+12);
- b0Val = _mm_load_ps(bPtr);
- b1Val = _mm_load_ps(bPtr+4);
- b2Val = _mm_load_ps(bPtr+8);
- b3Val = _mm_load_ps(bPtr+12);
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr+4);
+ a2Val = _mm_loadu_ps(aPtr+8);
+ a3Val = _mm_loadu_ps(aPtr+12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr+4);
+ b2Val = _mm_loadu_ps(bPtr+8);
+ b3Val = _mm_loadu_ps(bPtr+12);
c0Val = _mm_mul_ps(a0Val, b0Val);
c1Val = _mm_mul_ps(a1Val, b1Val);
@@ -187,15 +184,15 @@ static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float * result, const float
for(;number < sixteenthPoints; number++){
- aVal1 = _mm_load_ps(aPtr); aPtr += 4;
- aVal2 = _mm_load_ps(aPtr); aPtr += 4;
- aVal3 = _mm_load_ps(aPtr); aPtr += 4;
- aVal4 = _mm_load_ps(aPtr); aPtr += 4;
+ aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;
- bVal1 = _mm_load_ps(bPtr); bPtr += 4;
- bVal2 = _mm_load_ps(bPtr); bPtr += 4;
- bVal3 = _mm_load_ps(bPtr); bPtr += 4;
- bVal4 = _mm_load_ps(bPtr); bPtr += 4;
+ bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;
cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
diff --git a/volk/lib/CMakeLists.txt b/volk/lib/CMakeLists.txt
index 8288786c9..59d78b446 100644
--- a/volk/lib/CMakeLists.txt
+++ b/volk/lib/CMakeLists.txt
@@ -135,6 +135,12 @@ if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
if (${SIZEOF_CPU} EQUAL 32)
OVERRULE_ARCH(64 "CPU width is 32 bits")
endif()
+
+ #MSVC 64 bit does not have MMX, overrule it
+ if (${SIZEOF_CPU} EQUAL 64 AND MSVC)
+ OVERRULE_ARCH(mmx "No MMX for Win64")
+ endif()
+
endif()
########################################################################
@@ -159,7 +165,7 @@ execute_process(
# When this occurs, eliminate the redundant machines
# to avoid unnecessary compilation of subset machines.
########################################################################
-foreach(arch orc 64 32)
+foreach(arch mmx orc 64 32)
foreach(machine_name ${available_machines})
string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name})
if (${machine_name} STREQUAL ${machine_name_no_arch})
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index d1eb1cacb..2e41c25da 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -20,6 +20,7 @@ VOLK_RUN_TESTS(volk_16i_convert_8i_u, 0, 0, 20460, 1);
//VOLK_RUN_TESTS(volk_16i_permute_and_scalar_add_a, 1e-4, 0, 2046, 1000);
//VOLK_RUN_TESTS(volk_16i_x4_quad_max_star_16i_a, 1e-4, 0, 2046, 1000);
VOLK_RUN_TESTS(volk_16u_byteswap_a, 0, 0, 20460, 1);
+VOLK_RUN_TESTS(volk_16u_byteswap_u, 0, 0, 20460, 1);
//VOLK_RUN_TESTS(volk_16i_32fc_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32f_accumulator_s32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_add_32f_a, 1e-4, 0, 20460, 1);
@@ -36,7 +37,7 @@ VOLK_RUN_TESTS(volk_32fc_s32f_deinterleave_real_16i_a, 0, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_deinterleave_real_64f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
-VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 2046000, 1);
+VOLK_RUN_TESTS(volk_32fc_32f_dot_prod_32fc_a, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32fc_index_max_16u_a, 3, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_magnitude_16i_a, 1, 32768, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_magnitude_32f_a, 1e-4, 0, 20460, 1);
@@ -54,7 +55,7 @@ VOLK_RUN_TESTS(volk_32fc_x2_square_dist_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a, 1e-4, 10, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_divide_32f_a, 1e-4, 0, 20460, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_a, 1e-4, 0, 204600, 1);
-//VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
+VOLK_RUN_TESTS(volk_32f_x2_dot_prod_32f_u, 1e-4, 0, 204600, 1);
VOLK_RUN_TESTS(volk_32f_x2_dot_prod_16i_a, 1e-4, 0, 204600, 1);
//VOLK_RUN_TESTS(volk_32f_s32f_32f_fm_detect_32f_a, 1e-4, 2046, 10000);
VOLK_RUN_TESTS(volk_32f_index_max_16u_a, 3, 0, 20460, 1);