diff --git a/include/xsimd/arch/common/xsimd_common_logical.hpp b/include/xsimd/arch/common/xsimd_common_logical.hpp
index 83e9a9f11..3716f6282 100644
--- a/include/xsimd/arch/common/xsimd_common_logical.hpp
+++ b/include/xsimd/arch/common/xsimd_common_logical.hpp
@@ -72,8 +72,7 @@ namespace xsimd
         XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<common>) noexcept
         {
             alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
-            // This is inefficient but should never be called. It's just a
-            // temporary implementation until arm support is added.
+            // This is inefficient and should never be called.
             for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                 buffer[i] = mask & (1ull << i);
             return batch_bool<T, A>::load_aligned(buffer);
@@ -204,8 +203,7 @@ namespace xsimd
         {
             alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
             self.store_aligned(buffer);
-            // This is inefficient but should never be called. It's just a
-            // temporary implementation until arm support is added.
+            // This is inefficient and should never be called.
             uint64_t res = 0;
             for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                 if (buffer[i])
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index 6bf5b4dad..55b3b8d81 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -3278,6 +3278,85 @@ namespace xsimd
             return { batch<uint64_t, A>(vaddl_u32(vget_low_u32(x), vdup_n_u32(0))),
                      batch<uint64_t, A>(vaddl_u32(vget_high_u32(x), vdup_n_u32(0))) };
         }

+        /********
+         * mask *
+         ********/
+        namespace detail
+        {
+#ifdef XSIMD_LITTLE_ENDIAN
+            static constexpr bool do_swap = false;
+#else
+            static constexpr bool do_swap = true;
+#endif
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
+            uint8x16_t msbs = vshrq_n_u8(self, 7);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                msbs = vrev64q_u8(msbs);
+            }
+
+            uint64x2_t bits = vreinterpretq_u64_u8(msbs);
+            bits = vsraq_n_u64(bits, bits, 7);
+            bits = vsraq_n_u64(bits, bits, 14);
+            bits = vsraq_n_u64(bits, bits, 28);
+
+            uint8x16_t output = vreinterpretq_u8_u64(bits);
+            constexpr int offset = detail::do_swap ? 7 : 0;
+
+            return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 8;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
+            uint16x8_t msbs = vshrq_n_u16(self, 15);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                msbs = vrev64q_u16(msbs);
+            }
+
+            uint64x2_t bits = vreinterpretq_u64_u16(msbs);
+            bits = vsraq_n_u64(bits, bits, 15);
+            bits = vsraq_n_u64(bits, bits, 30);
+
+            uint8x16_t output = vreinterpretq_u8_u64(bits);
+            constexpr int offset = detail::do_swap ? 7 : 0;
+
+            return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 4;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
+            uint32x4_t msbs = vshrq_n_u32(self, 31);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                msbs = vrev64q_u32(msbs);
+            }
+
+            uint64x2_t bits = vreinterpretq_u64_u32(msbs);
+            bits = vsraq_n_u64(bits, bits, 31);
+
+            uint8x16_t output = vreinterpretq_u8_u64(bits);
+            constexpr int offset = detail::do_swap ? 7 : 0;
+
+            return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 2;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint64_t mask_lo = vgetq_lane_u64(self, 0);
+            uint64_t mask_hi = vgetq_lane_u64(self, 1);
+            return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3;
+        }
     }
 }
diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp
index 80af76f4b..58ce53462 100644
--- a/include/xsimd/config/xsimd_config.hpp
+++ b/include/xsimd/config/xsimd_config.hpp
@@ -16,6 +16,17 @@
 #define XSIMD_VERSION_MINOR 0
 #define XSIMD_VERSION_PATCH 0

+#if defined(__GNUC__) && defined(__BYTE_ORDER__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define XSIMD_LITTLE_ENDIAN
+#endif
+#elif defined(_WIN32)
+// We can safely assume that Windows is always little endian
+#define XSIMD_LITTLE_ENDIAN
+#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
+#define XSIMD_LITTLE_ENDIAN
+#endif
+
 /**
  * high level free functions
  *
diff --git a/include/xsimd/math/xsimd_rem_pio2.hpp b/include/xsimd/math/xsimd_rem_pio2.hpp
index eb232c568..511efd17f 100644
--- a/include/xsimd/math/xsimd_rem_pio2.hpp
+++ b/include/xsimd/math/xsimd_rem_pio2.hpp
@@ -47,17 +47,6 @@ namespace xsimd
          * ====================================================
          */

-#if defined(__GNUC__) && defined(__BYTE_ORDER__)
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define XSIMD_LITTLE_ENDIAN
-#endif
-#elif defined(_WIN32)
-        // We can safely assume that Windows is always little endian
-#define XSIMD_LITTLE_ENDIAN
-#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
-#define XSIMD_LITTLE_ENDIAN
-#endif
-
 #ifdef XSIMD_LITTLE_ENDIAN
 #define LOW_WORD_IDX 0
 #define HIGH_WORD_IDX sizeof(std::uint32_t)
@@ -708,7 +697,6 @@ namespace xsimd
         }
     }

-#undef XSIMD_LITTLE_ENDIAN
 #undef SET_LOW_WORD
 #undef SET_HIGH_WORD
 #undef GET_LOW_WORD
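
Note on the bit-packing used in the new NEON mask() kernels: each vsraq_n_u64(bits, bits, n) is a shift-right-and-accumulate, and because the intermediate bit positions never overlap, the additions behave like ORs; after the last step the low byte of each 64-bit half holds that half's mask bits. The scalar sketch below models the 8-bit kernel on the little-endian (do_swap == false) path; the helper name movemask_half and the test values are illustrative only, not part of this patch.

    #include <cstdint>
    #include <cstdio>

    // Pack the MSB of each of the 8 bytes of `half` into the low byte,
    // mirroring vshrq_n_u8(x, 7) followed by the three vsraq_n_u64 steps.
    std::uint8_t movemask_half(std::uint64_t half)
    {
        // NEON's per-byte shift cannot bleed across lanes; a scalar shift
        // can, hence the mask keeping only bit 0 of every byte.
        std::uint64_t bits = (half >> 7) & 0x0101010101010101ull;
        bits += bits >> 7;  // each byte now holds 2 mask bits
        bits += bits >> 14; // each byte now holds 4 mask bits
        bits += bits >> 28; // the low byte now holds all 8 mask bits
        return static_cast<std::uint8_t>(bits);
    }

    int main()
    {
        // Lanes 0, 2 and 7 are true (0xFF), the rest false.
        std::uint64_t half = 0xFFull | (0xFFull << 16) | (0xFFull << 56);
        std::printf("%#x\n", movemask_half(half)); // prints 0x85
    }

The 64-bit kernel skips this machinery entirely: every lane of a batch_bool is all-ones or all-zeros, so bit 0 of the high lane already equals its sign bit, which is why ((mask_lo >> 63) | (mask_hi << 1)) & 0x3 extracts the two mask bits without any extra shifts.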