diff --git a/include/xsimd/arch/common/xsimd_common_logical.hpp b/include/xsimd/arch/common/xsimd_common_logical.hpp
index 83e9a9f11..3716f6282 100644
--- a/include/xsimd/arch/common/xsimd_common_logical.hpp
+++ b/include/xsimd/arch/common/xsimd_common_logical.hpp
@@ -72,8 +72,7 @@ namespace xsimd
         XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<common>) noexcept
         {
             alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
-            // This is inefficient but should never be called. It's just a
-            // temporary implementation until arm support is added.
+            // This is inefficient and should never be called.
             for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                 buffer[i] = mask & (1ull << i);
             return batch_bool<T, A>::load_aligned(buffer);
@@ -204,8 +203,7 @@ namespace xsimd
         {
             alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
             self.store_aligned(buffer);
-            // This is inefficient but should never be called. It's just a
-            // temporary implementation until arm support is added.
+            // This is inefficient and should never be called.
             uint64_t res = 0;
             for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                 if (buffer[i])
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index 6bf5b4dad..55b3b8d81 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -3278,6 +3278,85 @@ namespace xsimd
             return { batch<uint64_t, A>(vaddl_u32(vget_low_u32(x), vdup_n_u32(0))),
                      batch<uint64_t, A>(vaddl_u32(vget_high_u32(x), vdup_n_u32(0))) };
         }

+        /********
+         * mask *
+         ********/
+        namespace detail
+        {
+#ifdef XSIMD_LITTLE_ENDIAN
+            static constexpr bool do_swap = false;
+#else
+            static constexpr bool do_swap = true;
+#endif
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
+            uint8x16_t msbs = vshrq_n_u8(self, 7);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                msbs = vrev64q_u8(msbs);
+            }
+
+            uint64x2_t bits = vreinterpretq_u64_u8(msbs);
+            bits = vsraq_n_u64(bits, bits, 7);
+            bits = vsraq_n_u64(bits, bits, 14);
+            bits = vsraq_n_u64(bits, bits, 28);
+
+            uint8x16_t output = vreinterpretq_u8_u64(bits);
+            constexpr int offset = detail::do_swap ? 7 : 0;
+
+            return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 8;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
+            uint16x8_t msbs = vshrq_n_u16(self, 15);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                msbs = vrev64q_u16(msbs);
+            }
+
+            uint64x2_t bits = vreinterpretq_u64_u16(msbs);
+            bits = vsraq_n_u64(bits, bits, 15);
+            bits = vsraq_n_u64(bits, bits, 30);
+
+            uint8x16_t output = vreinterpretq_u8_u64(bits);
+            constexpr int offset = detail::do_swap ? 7 : 0;
+
+            return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 4;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
+            uint32x4_t msbs = vshrq_n_u32(self, 31);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                msbs = vrev64q_u32(msbs);
+            }
+
+            uint64x2_t bits = vreinterpretq_u64_u32(msbs);
+            bits = vsraq_n_u64(bits, bits, 31);
+
+            uint8x16_t output = vreinterpretq_u8_u64(bits);
+            constexpr int offset = detail::do_swap ? 7 : 0;
+
+            return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 2;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint64_t mask_lo = vgetq_lane_u64(self, 0);
+            uint64_t mask_hi = vgetq_lane_u64(self, 1);
+            return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3;
+        }
     }
 }
diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp
index 80af76f4b..58ce53462 100644
--- a/include/xsimd/config/xsimd_config.hpp
+++ b/include/xsimd/config/xsimd_config.hpp
@@ -16,6 +16,17 @@
 #define XSIMD_VERSION_MINOR 0
 #define XSIMD_VERSION_PATCH 0

+#if defined(__GNUC__) && defined(__BYTE_ORDER__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define XSIMD_LITTLE_ENDIAN
+#endif
+#elif defined(_WIN32)
+// We can safely assume that Windows is always little endian
+#define XSIMD_LITTLE_ENDIAN
+#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
+#define XSIMD_LITTLE_ENDIAN
+#endif
+
 /**
  * high level free functions
  *
diff --git a/include/xsimd/math/xsimd_rem_pio2.hpp b/include/xsimd/math/xsimd_rem_pio2.hpp
index eb232c568..511efd17f 100644
--- a/include/xsimd/math/xsimd_rem_pio2.hpp
+++ b/include/xsimd/math/xsimd_rem_pio2.hpp
@@ -47,17 +47,6 @@ namespace xsimd
          * ====================================================
          */

-#if defined(__GNUC__) && defined(__BYTE_ORDER__)
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define XSIMD_LITTLE_ENDIAN
-#endif
-#elif defined(_WIN32)
-        // We can safely assume that Windows is always little endian
-#define XSIMD_LITTLE_ENDIAN
-#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
-#define XSIMD_LITTLE_ENDIAN
-#endif
-
 #ifdef XSIMD_LITTLE_ENDIAN
 #define LOW_WORD_IDX 0
 #define HIGH_WORD_IDX sizeof(std::uint32_t)
@@ -708,7 +697,6 @@ namespace xsimd
         }
     }

-#undef XSIMD_LITTLE_ENDIAN
 #undef SET_LOW_WORD
 #undef SET_HIGH_WORD
 #undef GET_LOW_WORD
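
Note on the bit-packing used in the new NEON mask() kernels: each vsraq_n_u64(bits, bits, n) is a shift-right-and-accumulate, and because the intermediate bit positions never overlap, the additions behave like ORs; after the last step the low byte of each 64-bit half holds that half's mask bits. The scalar sketch below models the 8-bit kernel on the little-endian (do_swap == false) path; the helper name movemask_half and the test values are illustrative only, not part of this patch.

    #include <cstdint>
    #include <cstdio>

    // Pack the MSB of each of the 8 bytes of `half` into the low byte,
    // mirroring vshrq_n_u8(x, 7) followed by the three vsraq_n_u64 steps.
    std::uint8_t movemask_half(std::uint64_t half)
    {
        // NEON's per-byte shift cannot bleed across lanes; a scalar shift
        // can, hence the mask keeping only bit 0 of every byte.
        std::uint64_t bits = (half >> 7) & 0x0101010101010101ull;
        bits += bits >> 7;  // each byte now holds 2 mask bits
        bits += bits >> 14; // each byte now holds 4 mask bits
        bits += bits >> 28; // the low byte now holds all 8 mask bits
        return static_cast<std::uint8_t>(bits);
    }

    int main()
    {
        // Lanes 0, 2 and 7 are true (0xFF), the rest false.
        std::uint64_t half = 0xFFull | (0xFFull << 16) | (0xFFull << 56);
        std::printf("%#x\n", movemask_half(half)); // prints 0x85
    }

The 64-bit kernel skips this machinery entirely: every lane of a batch_bool is all-ones or all-zeros, so bit 0 of the high lane already equals its sign bit, which is why ((mask_lo >> 63) | (mask_hi << 1)) & 0x3 extracts the two mask bits without any extra shifts.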