diff options
Diffstat (limited to 'volk/include/volk/volk_64u_popcnt_aligned16.h')
-rw-r--r-- | volk/include/volk/volk_64u_popcnt_aligned16.h | 74 |
1 files changed, 74 insertions, 0 deletions
/*
 * volk_64u_popcnt_aligned16.h
 *
 * Population count (number of set bits) of a single 64-bit value.
 * Two kernels are provided, each selected by its LV_HAVE_* build macro:
 *   - volk_64u_popcnt_aligned16_generic : portable C
 *   - volk_64u_popcnt_aligned16_sse4_2  : POPCNT instruction when LV_64 is
 *                                         set, portable C otherwise
 */
#ifndef INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H
#define INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H

#include <stdio.h>
#include <inttypes.h>

/*
 * Count the set bits of one 32-bit word.
 *
 * Branch-free parallel reduction: adjacent 1-bit fields are summed into
 * 2-bit fields, those into 4-bit fields, and so on until the total sits in
 * the low six bits (maximum value 32). The original author notes that this
 * is faster than a lookup table.
 *
 * Defined outside the LV_HAVE_* guards so that every kernel in this header
 * can share it regardless of which ones are enabled.
 */
static inline uint32_t volk_64u_popcnt_word32(uint32_t word) {
  word = (word & 0x55555555) + ((word >> 1) & 0x55555555); /* 2-bit sums */
  word = (word & 0x33333333) + ((word >> 2) & 0x33333333); /* 4-bit sums */
  word = (word + (word >> 4)) & 0x0F0F0F0F;                /* 8-bit sums */
  word = word + (word >> 8);
  /* Only the low 6 bits are meaningful after the final fold. */
  return (word + (word >> 16)) & 0x0000003F;
}


#if LV_HAVE_GENERIC

/*
 * Portable population count of a 64-bit value.
 *
 * ret   - output; receives the number of set bits in value (0..64).
 *         Dereferenced unconditionally, so it must not be NULL.
 * value - the 64-bit word to examine.
 *
 * The two 32-bit halves are extracted with a shift and a narrowing cast
 * rather than by aliasing &value as a uint32_t pointer: reading a uint64_t
 * object through a uint32_t lvalue breaks the strict-aliasing rule and is
 * undefined behaviour.
 */
static inline void volk_64u_popcnt_aligned16_generic(uint64_t* ret, const uint64_t value) {
  const uint32_t lowWord = (uint32_t)value;
  const uint32_t highWord = (uint32_t)(value >> 32);

  *ret = (uint64_t)volk_64u_popcnt_word32(lowWord) + volk_64u_popcnt_word32(highWord);
}

#endif /*LV_HAVE_GENERIC*/

#if LV_HAVE_SSE4_2

#include <nmmintrin.h>

/*
 * SSE4.2 population count of a 64-bit value.
 *
 * ret   - output; receives the number of set bits in value (0..64).
 *         Dereferenced unconditionally, so it must not be NULL.
 * value - the 64-bit word to examine.
 *
 * When LV_64 is set the 64-bit POPCNT intrinsic is used directly. Otherwise
 * the value is split into 32-bit halves (by shifting, not pointer aliasing)
 * and each half is counted with the portable helper.
 */
static inline void volk_64u_popcnt_aligned16_sse4_2(uint64_t* ret, const uint64_t value) {
#if LV_64
  *ret = (uint64_t)_mm_popcnt_u64(value);
#else
  const uint32_t lowWord = (uint32_t)value;
  const uint32_t highWord = (uint32_t)(value >> 32);

  *ret = (uint64_t)volk_64u_popcnt_word32(lowWord) + volk_64u_popcnt_word32(highWord);
#endif
}

#endif /*LV_HAVE_SSE4_2*/

#endif /*INCLUDED_VOLK_64u_POPCNT_ALIGNED16_H*/