Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions include/xsimd/arch/common/xsimd_common_logical.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,7 @@ namespace xsimd
XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<common>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
// This is inefficient but should never be called. It's just a
// temporary implementation until arm support is added.
// This is inefficient and should never be called.
for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
buffer[i] = mask & (1ull << i);
return batch_bool<T, A>::load_aligned(buffer);
Expand Down Expand Up @@ -204,8 +203,7 @@ namespace xsimd
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
self.store_aligned(buffer);
// This is inefficient but should never be called. It's just a
// temporary implementation until arm support is added.
// This is inefficient and should never be called.
uint64_t res = 0;
for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
if (buffer[i])
Expand Down
79 changes: 79 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3278,6 +3278,85 @@ namespace xsimd
return { batch<widen_t<T>, A>(vaddl_u32(vget_low_u32(x), vdup_n_u32(0))), batch<widen_t<T>, A>(vaddl_u32(vget_high_u32(x), vdup_n_u32(0))) };
}

/********
* mask *
********/
namespace detail
{
    // True when XSIMD_LITTLE_ENDIAN is not defined. The NEON mask()
    // overloads use it to decide whether lanes must be reversed within each
    // 64-bit half and which byte lane holds the gathered bits.
    static constexpr bool do_swap =
#ifdef XSIMD_LITTLE_ENDIAN
        false;
#else
        true;
#endif
}

// Builds a 16-bit mask from a batch_bool of 1-byte lanes: bit i of the
// result is the most significant bit of lane i.
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
{
    // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h

    // Reduce every byte lane to its top bit, i.e. 0 or 1.
    uint8x16_t lane_flags = vshrq_n_u8(self, 7);
    XSIMD_IF_CONSTEXPR(detail::do_swap)
    {
        lane_flags = vrev64q_u8(lane_flags);
    }

    // Treat each half of the register as one 64-bit integer and repeatedly
    // add a right-shifted copy of itself, so the eight per-byte flags of
    // that half end up packed next to each other in a single byte.
    uint64x2_t packed = vreinterpretq_u64_u8(lane_flags);
    packed = vsraq_n_u64(packed, packed, 7);
    packed = vsraq_n_u64(packed, packed, 14);
    packed = vsraq_n_u64(packed, packed, 28);

    // Read the packed byte of each half: byte index 0 normally, 7 when
    // do_swap is set.
    uint8x16_t const bytes = vreinterpretq_u8_u64(packed);
    constexpr int first = detail::do_swap ? 7 : 0;
    uint64_t const low_half = vgetq_lane_u8(bytes, first);
    uint64_t const high_half = vgetq_lane_u8(bytes, first + 8);

    return low_half | (high_half << 8);
}

// Builds an 8-bit mask from a batch_bool of 2-byte lanes: bit i of the
// result is the most significant bit of lane i.
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
{
    // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h

    // Reduce every 16-bit lane to its top bit, i.e. 0 or 1.
    uint16x8_t lane_flags = vshrq_n_u16(self, 15);
    XSIMD_IF_CONSTEXPR(detail::do_swap)
    {
        lane_flags = vrev64q_u16(lane_flags);
    }

    // Two shift-and-accumulate steps pack the four flags of each 64-bit
    // half into the low nibble of a single byte.
    uint64x2_t packed = vreinterpretq_u64_u16(lane_flags);
    packed = vsraq_n_u64(packed, packed, 15);
    packed = vsraq_n_u64(packed, packed, 30);

    // Read the packed byte of each half: byte index 0 normally, 7 when
    // do_swap is set.
    uint8x16_t const bytes = vreinterpretq_u8_u64(packed);
    constexpr int first = detail::do_swap ? 7 : 0;
    uint64_t const low_half = vgetq_lane_u8(bytes, first);
    uint64_t const high_half = vgetq_lane_u8(bytes, first + 8);

    return low_half | (high_half << 4);
}

// Builds a 4-bit mask from a batch_bool of 4-byte lanes: bit i of the
// result is the most significant bit of lane i.
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
{
    // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h

    // Reduce every 32-bit lane to its top bit, i.e. 0 or 1.
    uint32x4_t lane_flags = vshrq_n_u32(self, 31);
    XSIMD_IF_CONSTEXPR(detail::do_swap)
    {
        lane_flags = vrev64q_u32(lane_flags);
    }

    // One shift-and-accumulate step places the two flags of each 64-bit
    // half in adjacent bits of a single byte.
    uint64x2_t packed = vreinterpretq_u64_u32(lane_flags);
    packed = vsraq_n_u64(packed, packed, 31);

    // Read the packed byte of each half: byte index 0 normally, 7 when
    // do_swap is set.
    uint8x16_t const bytes = vreinterpretq_u8_u64(packed);
    constexpr int first = detail::do_swap ? 7 : 0;
    uint64_t const low_half = vgetq_lane_u8(bytes, first);
    uint64_t const high_half = vgetq_lane_u8(bytes, first + 8);

    return low_half | (high_half << 2);
}

// Builds a 2-bit mask from a batch_bool of 8-byte lanes: bit 0 is the most
// significant bit of lane 0 and bit 1 is the most significant bit of lane 1.
template <class A, class T, detail::enable_sized_t<T, 8> = 0>
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
{
    // Take the top bit of BOTH lanes, as the overloads for narrower lanes
    // do. The previous form took the top bit of lane 0 but the bottom bit of
    // lane 1; the two agree only when every lane is all-ones or all-zeros.
    uint64_t const lo_flag = vgetq_lane_u64(self, 0) >> 63;
    uint64_t const hi_flag = vgetq_lane_u64(self, 1) >> 63;
    return lo_flag | (hi_flag << 1);
}
}

}
Expand Down
11 changes: 11 additions & 0 deletions include/xsimd/config/xsimd_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,17 @@
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

// Endianness detection: XSIMD_LITTLE_ENDIAN is defined when the target is
// recognised as little endian and left undefined otherwise.
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
// The compiler exposes the byte order, so use it directly.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#elif defined(_WIN32) || defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
// We can safely assume that Windows is always little endian; the other
// macros name targets that are likewise treated as little endian.
#define XSIMD_LITTLE_ENDIAN
#endif

/**
* high level free functions
*
Expand Down
12 changes: 0 additions & 12 deletions include/xsimd/math/xsimd_rem_pio2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,6 @@ namespace xsimd
* ====================================================
*/

// Define XSIMD_LITTLE_ENDIAN when the target is recognised as little endian;
// leave it undefined otherwise.
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
// Byte order is reported by the compiler.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#else
// No compiler-provided byte order: fall back to platform macros.
// We can safely assume that Windows is always little endian.
#if defined(_WIN32) || defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
#define XSIMD_LITTLE_ENDIAN
#endif
#endif

#ifdef XSIMD_LITTLE_ENDIAN
#define LOW_WORD_IDX 0
#define HIGH_WORD_IDX sizeof(std::uint32_t)
Expand Down Expand Up @@ -708,7 +697,6 @@ namespace xsimd
}
}

#undef XSIMD_LITTLE_ENDIAN
#undef SET_LOW_WORD
#undef SET_HIGH_WORD
#undef GET_LOW_WORD
Expand Down
Loading