git subrepo clone --branch=sono6good https://github.com/essej/JUCE.git deps/juce
subrepo:
  subdir:   "deps/juce"
  merged:   "b13f9084e"
upstream:
  origin:   "https://github.com/essej/JUCE.git"
  branch:   "sono6good"
  commit:   "b13f9084e"
git-subrepo:
  version:  "0.4.3"
  origin:   "https://github.com/ingydotnet/git-subrepo.git"
  commit:   "2f68596"
deps/juce/modules/juce_dsp/native/juce_avx_SIMDNativeOps.cpp (new file, vendored, +58 lines)
@@ -0,0 +1,58 @@
/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2020 - Raw Material Software Limited

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 6 End-User License
   Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).

   End User License Agreement: www.juce.com/juce-6-licence
   Privacy Policy: www.juce.com/juce-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{
DEFINE_AVX_SIMD_CONST (int32_t, float, kAllBitsSet)  = { -1, -1, -1, -1, -1, -1, -1, -1 };
DEFINE_AVX_SIMD_CONST (int32_t, float, kEvenHighBit) = { static_cast<int32_t> (0x80000000), 0, static_cast<int32_t> (0x80000000), 0, static_cast<int32_t> (0x80000000), 0, static_cast<int32_t> (0x80000000), 0 };
DEFINE_AVX_SIMD_CONST (float, float, kOne)           = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };

DEFINE_AVX_SIMD_CONST (int64_t, double, kAllBitsSet)  = { -1, -1, -1, -1 };
DEFINE_AVX_SIMD_CONST (int64_t, double, kEvenHighBit) = { static_cast<int64_t> (0x8000000000000000), 0, static_cast<int64_t> (0x8000000000000000), 0 };
DEFINE_AVX_SIMD_CONST (double, double, kOne)           = { 1.0, 1.0, 1.0, 1.0 };

DEFINE_AVX_SIMD_CONST (int8_t, int8_t, kAllBitsSet) = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };

DEFINE_AVX_SIMD_CONST (uint8_t, uint8_t, kAllBitsSet) = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
DEFINE_AVX_SIMD_CONST (uint8_t, uint8_t, kHighBit)    = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

DEFINE_AVX_SIMD_CONST (int16_t, int16_t, kAllBitsSet) = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };

DEFINE_AVX_SIMD_CONST (uint16_t, uint16_t, kAllBitsSet) = { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };
DEFINE_AVX_SIMD_CONST (uint16_t, uint16_t, kHighBit)    = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 };

DEFINE_AVX_SIMD_CONST (int32_t, int32_t, kAllBitsSet) = { -1, -1, -1, -1, -1, -1, -1, -1 };

DEFINE_AVX_SIMD_CONST (uint32_t, uint32_t, kAllBitsSet) = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };
DEFINE_AVX_SIMD_CONST (uint32_t, uint32_t, kHighBit)    = { 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 };

DEFINE_AVX_SIMD_CONST (int64_t, int64_t, kAllBitsSet) = { -1LL, -1LL, -1LL, -1LL };

DEFINE_AVX_SIMD_CONST (uint64_t, uint64_t, kAllBitsSet) = { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL };
DEFINE_AVX_SIMD_CONST (uint64_t, uint64_t, kHighBit)    = { 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL };
}
}
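The constant definitions above pair with the DECLARE_AVX_SIMD_CONST declarations in the header that follows; on the GCC/Clang branch the macro pair expands roughly as sketched below for the float kOne constant (sketch only, not literal preprocessor output), giving a 32-byte-aligned array holding exactly one __m256 worth of lanes.

// Rough expansion sketch of the kOne pair on GCC/Clang (not literal preprocessor output):
//
//   in the header:  static const float kOne[32 / sizeof (float)] __attribute__((aligned(32)));
//   in this file:   const float SIMDNativeOps<float>::kOne[32 / sizeof (float)]
//                       __attribute__((aligned(32))) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
//
// 32 bytes / sizeof (float) == 8 lanes, and the 32-byte alignment is what makes the aligned
// _mm256_load_ps used by vconst()/load() legal on these arrays.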
deps/juce/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h (new file, vendored, +661 lines)
@@ -0,0 +1,661 @@
/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2020 - Raw Material Software Limited

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 6 End-User License
   Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).

   End User License Agreement: www.juce.com/juce-6-licence
   Privacy Policy: www.juce.com/juce-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")

#ifdef _MSC_VER
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static __declspec(align(32)) const type name[32 / sizeof (type)]

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]

#else
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static const type name[32 / sizeof (type)] __attribute__((aligned(32)))

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))

#endif

template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    using vSIMDType = __m256;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return load (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return _mm256_castsi256_ps (_mm256_load_si256 (reinterpret_cast<const __m256i*> (a))); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline float JUCE_VECTOR_CALLTYPE get (__m256 v, size_t i) noexcept { return SIMDFallbackOps<float, __m256>::get (v, i); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE set (__m256 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m256>::set (v, i, s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE truncate (__m256 a) noexcept { return _mm256_cvtepi32_ps (_mm256_cvttps_epi32 (a)); }

    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept
    {
       #if __FMA__
        return _mm256_fmadd_ps (b, c, a);
       #else
        return add (a, mul (b, c));
       #endif
    }

    static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
    {
        a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
        return add (_mm256_permute2f128_ps (a, a, 1), a);
    }

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
    {
        __m256 rr_ir = mul (a, dupeven (b));
        __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
    {
        __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
        __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
        retval = _mm256_add_ps (retval, tmp);

       #if JUCE_GCC
        return retval[0];
       #else
        return _mm256_cvtss_f32 (retval);
       #endif
    }
};
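// A plain-scalar reference for what cmplxmul() above computes (sketch only, not library code):
// the eight float lanes are read as four interleaved complex numbers { re, im }, and the XOR with
// kEvenHighBit flips the sign of the im*im products that land in the even lanes, so the final add
// yields (re1*re2 - im1*im2, re1*im2 + im1*re2) per pair. Equivalent scalar code:
//
//     for (int i = 0; i < 8; i += 2)                  // needs <complex>
//     {
//         std::complex<float> x (a[i], a[i + 1]);
//         std::complex<float> y (b[i], b[i + 1]);
//         auto r = x * y;
//         out[i]     = r.real();
//         out[i + 1] = r.imag();
//     }
//
// This mirrors SIMDFallbackOps<...>::cmplxmul in juce_fallback_SIMDNativeOps.h further down this commit.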
|
||||
|
||||
//==============================================================================
|
||||
/** Double-precision floating point AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<double>
|
||||
{
|
||||
using vSIMDType = __m256d;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
|
||||
DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
|
||||
DECLARE_AVX_SIMD_CONST (double, kOne);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm256_castsi256_pd (_mm256_load_si256 (reinterpret_cast <const __m256i*> (a))); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
|
||||
static forcedinline double JUCE_VECTOR_CALLTYPE get (__m256d v, size_t i) noexcept { return SIMDFallbackOps<double, __m256d>::get (v, i); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE set (__m256d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m256d>::set (v, i, s); }
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE truncate (__m256d a) noexcept { return _mm256_cvtepi32_pd (_mm256_cvttpd_epi32 (a)); }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
|
||||
{
|
||||
__m256d rr_ir = mul (a, dupeven (b));
|
||||
__m256d ii_ri = mul (swapevenodd (a), dupodd (b));
|
||||
return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
|
||||
}
|
||||
|
||||
static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
|
||||
{
|
||||
__m256d retval = _mm256_hadd_pd (a, a);
|
||||
__m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
|
||||
retval = _mm256_add_pd (retval, tmp);
|
||||
|
||||
#if JUCE_GCC
|
||||
return retval[0];
|
||||
#else
|
||||
return _mm256_cvtsd_f64 (retval);
|
||||
#endif
|
||||
}
|
||||
};

//==============================================================================
/** Signed 8-bit integer AVX intrinsics

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    using vSIMDType = __m256i;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m256i>::get (v, i); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m256i>::set (v, i, s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

       #if JUCE_GCC
        return (int8_t) ((lo[0] & 0xff) +
                         (hi[0] & 0xff) +
                         (lo[2] & 0xff) +
                         (hi[2] & 0xff));
       #else
        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);

        return (int8_t) ((_mm256_cvtsi256_si32 (lo) & 0xff) +
                         (_mm256_cvtsi256_si32 (hi) & 0xff) +
                         (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask)) & 0xff) +
                         (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask)) & 0xff));
       #endif
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};
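// Sketch (not library code): AVX2 has no single 8-bit multiply, which is why mul() above widens to
// 16-bit lanes, multiplies the even and odd bytes separately, and stitches the low bytes back
// together. The intended per-byte result is simply the wrapped low 8 bits of the product:
//
//     for (int i = 0; i < 32; ++i)
//         out[i] = (int8_t) (a[i] * b[i]);   // wraps modulo 256, like the vector path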
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 8-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint8_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
|
||||
DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);
|
||||
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
|
||||
{
|
||||
__m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
|
||||
__m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
|
||||
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
lo = _mm256_hadd_epi16 (lo, lo);
|
||||
hi = _mm256_hadd_epi16 (hi, hi);
|
||||
}
|
||||
|
||||
#if JUCE_GCC
|
||||
return (uint8_t) ((static_cast<uint32_t> (lo[0]) & 0xffu) +
|
||||
(static_cast<uint32_t> (hi[0]) & 0xffu) +
|
||||
(static_cast<uint32_t> (lo[2]) & 0xffu) +
|
||||
(static_cast<uint32_t> (hi[2]) & 0xffu));
|
||||
#else
|
||||
constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
|
||||
|
||||
return (uint8_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (lo)) & 0xffu) +
|
||||
(static_cast<uint32_t> (_mm256_cvtsi256_si32 (hi)) & 0xffu) +
|
||||
(static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask))) & 0xffu) +
|
||||
(static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask))) & 0xffu));
|
||||
#endif
|
||||
}
|
||||
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
|
||||
{
|
||||
// unpack and multiply
|
||||
__m256i even = _mm256_mullo_epi16 (a, b);
|
||||
__m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
|
||||
|
||||
return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
|
||||
_mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
|
||||
}
|
||||
};
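// Sketch note (not library code): AVX2 only provides signed byte/word compares, so the unsigned
// specialisations XOR both operands with kHighBit via ssign() first. Flipping the top bit maps
// unsigned order onto signed order, e.g. for bytes:
//
//     (uint8_t) a < (uint8_t) b   <=>   (int8_t) (a ^ 0x80) < (int8_t) (b ^ 0x80)
//
// which is why greaterThan() above compares ssign (a) against ssign (b).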
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 16-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int16_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
|
||||
{
|
||||
__m256i tmp = _mm256_hadd_epi16 (a, a);
|
||||
tmp = _mm256_hadd_epi16 (tmp, tmp);
|
||||
tmp = _mm256_hadd_epi16 (tmp, tmp);
|
||||
|
||||
#if JUCE_GCC
|
||||
return (int16_t) ((tmp[0] & 0xffff) + (tmp[2] & 0xffff));
|
||||
#else
|
||||
constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
|
||||
|
||||
return (int16_t) ((_mm256_cvtsi256_si32 (tmp) & 0xffff) +
|
||||
(_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)) & 0xffff));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 16-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint16_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
|
||||
DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
|
||||
{
|
||||
__m256i tmp = _mm256_hadd_epi16 (a, a);
|
||||
tmp = _mm256_hadd_epi16 (tmp, tmp);
|
||||
tmp = _mm256_hadd_epi16 (tmp, tmp);
|
||||
|
||||
#if JUCE_GCC
|
||||
return (uint16_t) ((static_cast<uint32_t> (tmp[0]) & 0xffffu) +
|
||||
(static_cast<uint32_t> (tmp[2]) & 0xffffu));
|
||||
#else
|
||||
constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
|
||||
|
||||
return (uint16_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp)) & 0xffffu) +
|
||||
(static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask))) & 0xffffu));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 32-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int32_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
|
||||
{
|
||||
__m256i tmp = _mm256_hadd_epi32 (a, a);
|
||||
tmp = _mm256_hadd_epi32 (tmp, tmp);
|
||||
|
||||
#if JUCE_GCC
|
||||
return (int32_t) (tmp[0] + tmp[2]);
|
||||
#else
|
||||
constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
|
||||
|
||||
return _mm256_cvtsi256_si32 (tmp) + _mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 32-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint32_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
|
||||
DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
|
||||
{
|
||||
__m256i tmp = _mm256_hadd_epi32 (a, a);
|
||||
tmp = _mm256_hadd_epi32 (tmp, tmp);
|
||||
|
||||
#if JUCE_GCC
|
||||
return static_cast<uint32_t> (tmp[0]) + static_cast<uint32_t> (tmp[2]);
|
||||
#else
|
||||
constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
|
||||
|
||||
return static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp))
|
||||
+ static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)));
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 64-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int64_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
|
||||
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<int64_t, __m256i>::sum (a); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<int64_t, __m256i>::mul (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
};
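// Sketch note (not library code): AVX2 has no 64-bit element multiply and no 64-bit horizontal add,
// so mul() and sum() above defer to the scalar SIMDFallbackOps loops, which element-wise amount to:
//
//     for (size_t i = 0; i < 4; ++i)
//         out[i] = a[i] * b[i];   // what SIMDFallbackOps<int64_t, __m256i>::mul boils down to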
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 64-bit integer AVX intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint64_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m256i;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
|
||||
DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
|
||||
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
|
||||
static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::get (v, i); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::set (v, i, s); }
|
||||
static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::sum (a); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::mul (a, b); }
|
||||
static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
|
||||
};

#endif

JUCE_END_IGNORE_WARNINGS_GCC_LIKE

} // namespace dsp
} // namespace juce
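These specialisations are internal plumbing for juce::dsp::SIMDRegister rather than a public API, but a minimal sketch of how the float ops above behave (assuming AVX is available and the juce_dsp module is included; the function name here is purely illustrative) looks like this. Note the 32-byte alignment required by load()/store():

// Sketch only: exercising the float specialisation directly; real code would normally go through
// juce::dsp::SIMDRegister, which wraps these ops.
static float simdNativeOpsSmokeTest()
{
    alignas (32) float a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    alignas (32) float b[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
    alignas (32) float result[8];

    using Ops = juce::dsp::SIMDNativeOps<float>;

    auto va = Ops::load (a);                   // aligned _mm256_load_ps, hence alignas (32) above
    auto vb = Ops::load (b);
    Ops::store (Ops::add (va, vb), result);    // every result[i] becomes 9.0f

    return Ops::sum (va);                      // horizontal sum of a: 36.0f
}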
deps/juce/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h (new file, vendored, +265 lines)
@@ -0,0 +1,265 @@
/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2020 - Raw Material Software Limited

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 6 End-User License
   Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).

   End User License Agreement: www.juce.com/juce-6-licence
   Privacy Policy: www.juce.com/juce-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

/** A template specialisation to find corresponding mask type for primitives. */
namespace SIMDInternal
{
    template <typename Primitive> struct MaskTypeFor        { using type = Primitive; };
    template <> struct MaskTypeFor <float>                  { using type = uint32_t; };
    template <> struct MaskTypeFor <double>                 { using type = uint64_t; };
    template <> struct MaskTypeFor <char>                   { using type = uint8_t; };
    template <> struct MaskTypeFor <int8_t>                 { using type = uint8_t; };
    template <> struct MaskTypeFor <int16_t>                { using type = uint16_t; };
    template <> struct MaskTypeFor <int32_t>                { using type = uint32_t; };
    template <> struct MaskTypeFor <int64_t>                { using type = uint64_t; };
    template <> struct MaskTypeFor <std::complex<float>>    { using type = uint32_t; };
    template <> struct MaskTypeFor <std::complex<double>>   { using type = uint64_t; };

    template <typename Primitive> struct PrimitiveType                          { using type = typename std::remove_cv<Primitive>::type; };
    template <typename Primitive> struct PrimitiveType<std::complex<Primitive>> { using type = typename std::remove_cv<Primitive>::type; };

    template <int n> struct Log2Helper    { enum { value = Log2Helper<n/2>::value + 1 }; };
    template <>      struct Log2Helper<1> { enum { value = 0 }; };
}

/**
    Useful fallback routines to use if the native SIMD op is not supported. You
    should never need to use this directly. Use juce_SIMDRegister instead.

    @tags{DSP}
*/
template <typename ScalarType, typename vSIMDType>
struct SIMDFallbackOps
{
    static constexpr size_t n    = sizeof (vSIMDType) / sizeof (ScalarType);
    static constexpr size_t mask = (sizeof (vSIMDType) / sizeof (ScalarType)) - 1;
    static constexpr size_t bits = SIMDInternal::Log2Helper<(int) n>::value;

    // helper types
    using MaskType = typename SIMDInternal::MaskTypeFor<ScalarType>::type;
    union UnionType     { vSIMDType v; ScalarType s[n]; };
    union UnionMaskType { vSIMDType v; MaskType   m[n]; };


    // fallback methods
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarAdd> (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarSub> (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarMul> (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return bitapply<ScalarAnd> (a, b); }
    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return bitapply<ScalarOr > (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return bitapply<ScalarXor> (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return bitapply<ScalarNot> (a, b); }

    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarMin> (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarMax> (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarEq > (a, b); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarNeq> (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarGt > (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarGeq> (a, b); }
|
||||
|
||||
    static forcedinline ScalarType get (vSIMDType v, size_t i) noexcept
    {
        UnionType u {v};
        return u.s[i];
    }

    static forcedinline vSIMDType set (vSIMDType v, size_t i, ScalarType s) noexcept
    {
        UnionType u {v};

        u.s[i] = s;
        return u.v;
    }

    static forcedinline vSIMDType bit_not (vSIMDType av) noexcept
    {
        UnionMaskType a {av};

        for (size_t i = 0; i < n; ++i)
            a.m[i] = ~a.m[i];

        return a.v;
    }

    static forcedinline ScalarType sum (vSIMDType av) noexcept
    {
        UnionType a {av};
        auto retval = static_cast<ScalarType> (0);

        for (size_t i = 0; i < n; ++i)
            retval = static_cast<ScalarType> (retval + a.s[i]);

        return retval;
    }

    static forcedinline vSIMDType truncate (vSIMDType av) noexcept
    {
        UnionType a {av};

        for (size_t i = 0; i < n; ++i)
            a.s[i] = static_cast<ScalarType> (static_cast<int> (a.s[i]));

        return a.v;
    }

    static forcedinline vSIMDType multiplyAdd (vSIMDType av, vSIMDType bv, vSIMDType cv) noexcept
    {
        UnionType a {av}, b {bv}, c {cv};

        for (size_t i = 0; i < n; ++i)
            a.s[i] += b.s[i] * c.s[i];

        return a.v;
    }
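    // multiplyAdd() computes a + (b * c) per lane; native specialisations map it onto
    // accumulating multiply instructions where available (e.g. NEON's vmlaq_* further below).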

    //==============================================================================
    static forcedinline bool allEqual (vSIMDType av, vSIMDType bv) noexcept
    {
        UnionType a {av}, b {bv};

        for (size_t i = 0; i < n; ++i)
            if (a.s[i] != b.s[i])
                return false;

        return true;
    }

//==============================================================================
|
||||
static forcedinline vSIMDType cmplxmul (vSIMDType av, vSIMDType bv) noexcept
|
||||
{
|
||||
UnionType a {av}, b {bv}, r;
|
||||
|
||||
const int m = n >> 1;
|
||||
for (int i = 0; i < m; ++i)
|
||||
{
|
||||
std::complex<ScalarType> result
|
||||
= std::complex<ScalarType> (a.s[i<<1], a.s[(i<<1)|1])
|
||||
* std::complex<ScalarType> (b.s[i<<1], b.s[(i<<1)|1]);
|
||||
|
||||
r.s[i<<1] = result.real();
|
||||
r.s[(i<<1)|1] = result.imag();
|
||||
}
|
||||
|
||||
return r.v;
|
||||
}
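    // The register is treated as interleaved complex numbers [re0, im0, re1, im1, ...];
    // each lane pair is multiplied via std::complex, i.e. (a + bi)(c + di) = (ac - bd) + (ad + bc)i.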

    struct ScalarAdd { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return a + b; } };
    struct ScalarSub { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return a - b; } };
    struct ScalarMul { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return a * b; } };
    struct ScalarMin { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return jmin (a, b); } };
    struct ScalarMax { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return jmax (a, b); } };
    struct ScalarAnd { static forcedinline MaskType op (MaskType a, MaskType b) noexcept { return a & b; } };
    struct ScalarOr  { static forcedinline MaskType op (MaskType a, MaskType b) noexcept { return a | b; } };
    struct ScalarXor { static forcedinline MaskType op (MaskType a, MaskType b) noexcept { return a ^ b; } };
    struct ScalarNot { static forcedinline MaskType op (MaskType a, MaskType b) noexcept { return (~a) & b; } };
    struct ScalarEq  { static forcedinline bool op (ScalarType a, ScalarType b) noexcept { return (a == b); } };
    struct ScalarNeq { static forcedinline bool op (ScalarType a, ScalarType b) noexcept { return (a != b); } };
    struct ScalarGt  { static forcedinline bool op (ScalarType a, ScalarType b) noexcept { return (a >  b); } };
    struct ScalarGeq { static forcedinline bool op (ScalarType a, ScalarType b) noexcept { return (a >= b); } };

    // generic apply routines for operations above
    template <typename Op>
    static forcedinline vSIMDType apply (vSIMDType av, vSIMDType bv) noexcept
    {
        UnionType a {av}, b {bv};

        for (size_t i = 0; i < n; ++i)
            a.s[i] = Op::op (a.s[i], b.s[i]);

        return a.v;
    }

    template <typename Op>
    static forcedinline vSIMDType cmp (vSIMDType av, vSIMDType bv) noexcept
    {
        UnionType a {av}, b {bv};
        UnionMaskType r;

        for (size_t i = 0; i < n; ++i)
            r.m[i] = Op::op (a.s[i], b.s[i]) ? static_cast<MaskType> (-1) : static_cast<MaskType> (0);

        return r.v;
    }
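    // Each result lane is either all ones (static_cast<MaskType> (-1)) or all zeros,
    // mirroring the lane masks produced by the native compare intrinsics and the
    // kAllBitsSet constants defined elsewhere in this commit.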

    template <typename Op>
    static forcedinline vSIMDType bitapply (vSIMDType av, vSIMDType bv) noexcept
    {
        UnionMaskType a {av}, b {bv};

        for (size_t i = 0; i < n; ++i)
            a.m[i] = Op::op (a.m[i], b.m[i]);

        return a.v;
    }

    static forcedinline vSIMDType expand (ScalarType s) noexcept
    {
        UnionType r;

        for (size_t i = 0; i < n; ++i)
            r.s[i] = s;

        return r.v;
    }

    static forcedinline vSIMDType load (const ScalarType* a) noexcept
    {
        UnionType r;

        for (size_t i = 0; i < n; ++i)
            r.s[i] = a[i];

        return r.v;
    }

    static forcedinline void store (vSIMDType av, ScalarType* dest) noexcept
    {
        UnionType a {av};

        for (size_t i = 0; i < n; ++i)
            dest[i] = a.s[i];
    }

    template <unsigned int shuffle_idx>
    static forcedinline vSIMDType shuffle (vSIMDType av) noexcept
    {
        UnionType a {av}, r;

        // the compiler will unroll this loop and the index can
        // be computed at compile-time, so this will be super fast
        for (size_t i = 0; i < n; ++i)
            r.s[i] = a.s[(shuffle_idx >> (bits * i)) & mask];

        return r.v;
    }
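    // shuffle_idx packs one source-lane index per destination lane, each field 'bits' wide.
    // For a four-lane vector (bits == 2), (1 << 0) | (0 << 2) | (3 << 4) | (2 << 6) swaps
    // the even and odd lanes, which is how the NEON float swapevenodd() later in this commit uses it.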
};
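// Native specialisations delegate to these routines wherever no intrinsic fits, e.g. the NEON
// int64 ops below declare "using fb = SIMDFallbackOps<int64_t, vSIMDType>;" and implement
// mul() as "return fb::mul (a, b);".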

} // namespace dsp
} // namespace juce
43
deps/juce/modules/juce_dsp/native/juce_neon_SIMDNativeOps.cpp
vendored
Normal file
43
deps/juce/modules/juce_dsp/native/juce_neon_SIMDNativeOps.cpp
vendored
Normal file
@ -0,0 +1,43 @@
/*
==============================================================================

This file is part of the JUCE library.
Copyright (c) 2020 - Raw Material Software Limited

JUCE is an open source library subject to commercial or open-source
licensing.

By using JUCE, you agree to the terms of both the JUCE 6 End-User License
Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).

End User License Agreement: www.juce.com/juce-6-licence
Privacy Policy: www.juce.com/juce-privacy-policy

Or: You may also use this code under the terms of the GPL v3 (see
www.gnu.org/licenses).

JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
DISCLAIMED.

==============================================================================
*/

namespace juce
{
namespace dsp
{
DEFINE_NEON_SIMD_CONST (int32_t, float, kAllBitsSet)      = { -1, -1, -1, -1 };
DEFINE_NEON_SIMD_CONST (int32_t, float, kEvenHighBit)     = { static_cast<int32_t>(0x80000000), 0, static_cast<int32_t>(0x80000000), 0 };
DEFINE_NEON_SIMD_CONST (float, float, kOne)               = { 1.0f, 1.0f, 1.0f, 1.0f };

DEFINE_NEON_SIMD_CONST (int8_t, int8_t, kAllBitsSet)      = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
DEFINE_NEON_SIMD_CONST (uint8_t, uint8_t, kAllBitsSet)    = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
DEFINE_NEON_SIMD_CONST (int16_t, int16_t, kAllBitsSet)    = { -1, -1, -1, -1, -1, -1, -1, -1 };
DEFINE_NEON_SIMD_CONST (uint16_t, uint16_t, kAllBitsSet)  = { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };
DEFINE_NEON_SIMD_CONST (int32_t, int32_t, kAllBitsSet)    = { -1, -1, -1, -1 };
DEFINE_NEON_SIMD_CONST (uint32_t, uint32_t, kAllBitsSet)  = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };
DEFINE_NEON_SIMD_CONST (int64_t, int64_t, kAllBitsSet)    = { -1, -1 };
DEFINE_NEON_SIMD_CONST (uint64_t, uint64_t, kAllBitsSet)  = { 0xffffffffffffffff, 0xffffffffffffffff };
}
}
501
deps/juce/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
vendored
Normal file
501
deps/juce/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
vendored
Normal file
@ -0,0 +1,501 @@
|
||||
/*
|
||||
==============================================================================
|
||||
|
||||
This file is part of the JUCE library.
|
||||
Copyright (c) 2020 - Raw Material Software Limited
|
||||
|
||||
JUCE is an open source library subject to commercial or open-source
|
||||
licensing.
|
||||
|
||||
By using JUCE, you agree to the terms of both the JUCE 6 End-User License
|
||||
Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).
|
||||
|
||||
End User License Agreement: www.juce.com/juce-6-licence
|
||||
Privacy Policy: www.juce.com/juce-privacy-policy
|
||||
|
||||
Or: You may also use this code under the terms of the GPL v3 (see
|
||||
www.gnu.org/licenses).
|
||||
|
||||
JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
|
||||
DISCLAIMED.
|
||||
|
||||
==============================================================================
|
||||
*/
|
||||
|
||||
namespace juce
|
||||
{
|
||||
namespace dsp
|
||||
{
|
||||
|
||||
#ifndef DOXYGEN
|
||||
|
||||
JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define DECLARE_NEON_SIMD_CONST(type, name) \
|
||||
static __declspec(align(16)) const type name [16 / sizeof (type)]
|
||||
|
||||
#define DEFINE_NEON_SIMD_CONST(type, class_type, name) \
|
||||
__declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
|
||||
|
||||
#else
|
||||
#define DECLARE_NEON_SIMD_CONST(type, name) \
|
||||
static const type name [16 / sizeof (type)] __attribute__((aligned(16)))
|
||||
|
||||
#define DEFINE_NEON_SIMD_CONST(type, class_type, name) \
|
||||
const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
|
||||
|
||||
#endif
|
||||
|
||||
template <typename type>
|
||||
struct SIMDNativeOps;
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 32-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint32_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = uint32x4_t;
|
||||
using fb = SIMDFallbackOps<uint32_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
|
||||
static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
|
||||
static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
|
||||
static forcedinline uint32_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, uint32_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u32 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u32 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u32 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u32 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u32 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u32 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u32 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u32 (a, b); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u32 (a, b, c); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
|
||||
static forcedinline uint32_t sum (vSIMDType a) noexcept
|
||||
{
|
||||
auto rr = vadd_u32 (vget_high_u32 (a), vget_low_u32 (a));
|
||||
return vget_lane_u32 (vpadd_u32 (rr, rr), 0);
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 32-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int32_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = int32x4_t;
|
||||
using fb = SIMDFallbackOps<int32_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
|
||||
static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
|
||||
static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
|
||||
static forcedinline int32_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, int32_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s32 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s32 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s32 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s32 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s32 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s32 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s32 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s32 (a, b); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s32 (a, b, c); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
|
||||
static forcedinline int32_t sum (vSIMDType a) noexcept
|
||||
{
|
||||
auto rr = vadd_s32 (vget_high_s32 (a), vget_low_s32 (a));
|
||||
rr = vpadd_s32 (rr, rr);
|
||||
return vget_lane_s32 (rr, 0);
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 8-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int8_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = int8x16_t;
|
||||
using fb = SIMDFallbackOps<int8_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (int8_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
|
||||
static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
|
||||
static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
|
||||
static forcedinline int8_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, int8_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s8 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s8 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s8 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s8 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s8 ((int8_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s8 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s8 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s8 (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s8 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s8 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<int32_t>::sum ((SIMDNativeOps<int32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s8 (a, b, c); }
|
||||
static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 8-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint8_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = uint8x16_t;
|
||||
using fb = SIMDFallbackOps<uint8_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
|
||||
static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
|
||||
static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
|
||||
static forcedinline uint8_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, uint8_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u8 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u8 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u8 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u8 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u8 ((uint8_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u8 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u8 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u8 (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u8 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u8 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u8 (a, b, c); }
|
||||
static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 16-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int16_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = int16x8_t;
|
||||
using fb = SIMDFallbackOps<int16_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (int16_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
|
||||
static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
|
||||
static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
|
||||
static forcedinline int16_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, int16_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s16 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s16 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s16 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s16 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s16 ((int16_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s16 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s16 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s16 (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s16 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s16 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<int32_t>::sum ((SIMDNativeOps<int32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s16 (a, b, c); }
|
||||
static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
};
|
||||
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 16-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint16_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = uint16x8_t;
|
||||
using fb = SIMDFallbackOps<uint16_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
|
||||
static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
|
||||
static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
|
||||
static forcedinline uint16_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, uint16_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u16 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u16 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u16 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u16 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u16 ((uint16_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u16 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u16 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u16 (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u16 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u16 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u16 (a, b, c); }
|
||||
static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Signed 64-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<int64_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = int64x2_t;
|
||||
using fb = SIMDFallbackOps<int64_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (int64_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
|
||||
static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
|
||||
static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
|
||||
static forcedinline int64_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, int64_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s64 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s64 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s64 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s64 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s64 ((int64_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<int32_t>::sum ((SIMDNativeOps<int32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
|
||||
static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
};
|
||||
|
||||
|
||||
//==============================================================================
|
||||
/** Unsigned 64-bit integer NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<uint64_t>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = uint64x2_t;
|
||||
using fb = SIMDFallbackOps<uint64_t, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (uint64_t, kAllBitsSet);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); }
|
||||
static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); }
|
||||
static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); }
|
||||
static forcedinline uint64_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, uint64_t s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u64 (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u64 (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u64 (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u64 (b, a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u64 ((uint64_t*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
|
||||
static forcedinline uint64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return a; }
|
||||
};
|
||||
|
||||
//==============================================================================
|
||||
/** Single-precision floating point NEON intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<float>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = float32x4_t;
|
||||
using vMaskType = uint32x4_t;
|
||||
using fb = SIMDFallbackOps<float, vSIMDType>;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
|
||||
DECLARE_NEON_SIMD_CONST (int32_t, kEvenHighBit);
|
||||
DECLARE_NEON_SIMD_CONST (float, kOne);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
|
||||
static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
|
||||
static forcedinline float get (vSIMDType v, size_t i) noexcept { return v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, float s) noexcept { v[i] = s; return v; }
|
||||
static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_f32 (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_f32 (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_f32 (a, b, c); }
|
||||
static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)> (a); }
|
||||
static forcedinline vSIMDType dupodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)> (a); }
|
||||
static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
|
||||
static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return vcvtq_f32_s32 (vcvtq_s32_f32 (a)); }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
|
||||
{
|
||||
vSIMDType rr_ir = mul (a, dupeven (b));
|
||||
vSIMDType ii_ri = mul (swapevenodd (a), dupodd (b));
|
||||
return add (rr_ir, bit_xor (ii_ri, vld1q_f32 ((float*) kEvenHighBit)));
|
||||
}
|
||||
|
||||
static forcedinline float sum (vSIMDType a) noexcept
|
||||
{
|
||||
auto rr = vadd_f32 (vget_high_f32 (a), vget_low_f32 (a));
|
||||
return vget_lane_f32 (vpadd_f32 (rr, rr), 0);
|
||||
}
|
||||
};
|
||||
|
||||
//==============================================================================
/** Double-precision floating point NEON intrinsics do not exist, so we need to
    emulate them here.

    @tags{DSP}
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<double>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = struct { double v[2]; };
|
||||
using fb = SIMDFallbackOps<double, vSIMDType>;
|
||||
|
||||
static forcedinline vSIMDType expand (double s) noexcept { return {{s, s}}; }
|
||||
static forcedinline vSIMDType load (const double* a) noexcept { return {{a[0], a[1]}}; }
|
||||
static forcedinline void store (vSIMDType v, double* a) noexcept { a[0] = v.v[0]; a[1] = v.v[1]; }
|
||||
static forcedinline double get (vSIMDType v, size_t i) noexcept { return v.v[i]; }
|
||||
static forcedinline vSIMDType set (vSIMDType v, size_t i, double s) noexcept { v.v[i] = s; return v; }
|
||||
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] + b.v[0], a.v[1] + b.v[1]}}; }
|
||||
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] - b.v[0], a.v[1] - b.v[1]}}; }
|
||||
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] * b.v[0], a.v[1] * b.v[1]}}; }
|
||||
static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return fb::bit_and (a, b); }
|
||||
static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return fb::bit_or (a, b); }
|
||||
static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return fb::bit_xor (a, b); }
|
||||
static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return fb::bit_notand (a, b); }
|
||||
static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return fb::bit_not (a); }
|
||||
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
|
||||
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
|
||||
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
|
||||
static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
|
||||
static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
|
||||
static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
|
||||
static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return fb::allEqual (a, b); }
|
||||
static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
|
||||
static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept { return fb::cmplxmul (a, b); }
|
||||
static forcedinline double sum (vSIMDType a) noexcept { return fb::sum (a); }
|
||||
static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return a; }
|
||||
static forcedinline vSIMDType truncate (vSIMDType a) noexcept { return fb::truncate (a); }
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
JUCE_END_IGNORE_WARNINGS_GCC_LIKE
|
||||
|
||||
} // namespace dsp
|
||||
} // namespace juce
|
58
deps/juce/modules/juce_dsp/native/juce_sse_SIMDNativeOps.cpp
vendored
Normal file
58
deps/juce/modules/juce_dsp/native/juce_sse_SIMDNativeOps.cpp
vendored
Normal file
@ -0,0 +1,58 @@
|
||||
/*
|
||||
==============================================================================
|
||||
|
||||
This file is part of the JUCE library.
|
||||
Copyright (c) 2020 - Raw Material Software Limited
|
||||
|
||||
JUCE is an open source library subject to commercial or open-source
|
||||
licensing.
|
||||
|
||||
By using JUCE, you agree to the terms of both the JUCE 6 End-User License
|
||||
Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).
|
||||
|
||||
End User License Agreement: www.juce.com/juce-6-licence
|
||||
Privacy Policy: www.juce.com/juce-privacy-policy
|
||||
|
||||
Or: You may also use this code under the terms of the GPL v3 (see
|
||||
www.gnu.org/licenses).
|
||||
|
||||
JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
|
||||
DISCLAIMED.
|
||||
|
||||
==============================================================================
|
||||
*/
|
||||
|
||||
namespace juce
|
||||
{
|
||||
namespace dsp
|
||||
{
|
||||
DEFINE_SSE_SIMD_CONST (int32_t, float, kAllBitsSet) = { -1, -1, -1, -1 };
|
||||
DEFINE_SSE_SIMD_CONST (int32_t, float, kEvenHighBit) = { static_cast<int32_t>(0x80000000), 0, static_cast<int32_t>(0x80000000), 0 };
|
||||
DEFINE_SSE_SIMD_CONST (float, float, kOne) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (int64_t, double, kAllBitsSet) = { -1LL, -1LL };
|
||||
DEFINE_SSE_SIMD_CONST (int64_t, double, kEvenHighBit) = { static_cast<int64_t>(0x8000000000000000), 0 };
|
||||
DEFINE_SSE_SIMD_CONST (double, double, kOne) = { 1.0, 1.0 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (int8_t, int8_t, kAllBitsSet) = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (uint8_t, uint8_t, kAllBitsSet) = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
|
||||
DEFINE_SSE_SIMD_CONST (uint8_t, uint8_t, kHighBit) = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (int16_t, int16_t, kAllBitsSet) = { -1, -1, -1, -1, -1, -1, -1, -1 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (uint16_t, uint16_t, kAllBitsSet) = { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };
|
||||
DEFINE_SSE_SIMD_CONST (uint16_t, uint16_t, kHighBit) = { 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (int32_t, int32_t, kAllBitsSet) = { -1, -1, -1, -1 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (uint32_t, uint32_t, kAllBitsSet) = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };
|
||||
DEFINE_SSE_SIMD_CONST (uint32_t, uint32_t, kHighBit) = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (int64_t, int64_t, kAllBitsSet) = { -1, -1 };
|
||||
|
||||
DEFINE_SSE_SIMD_CONST (uint64_t, uint64_t, kAllBitsSet) = { 0xffffffffffffffff, 0xffffffffffffffff };
|
||||
DEFINE_SSE_SIMD_CONST (uint64_t, uint64_t, kHighBit) = { 0x8000000000000000, 0x8000000000000000 };
|
||||
}
|
||||
}
|
729
deps/juce/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
vendored
Normal file
729
deps/juce/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
vendored
Normal file
@ -0,0 +1,729 @@
|
||||
/*
|
||||
==============================================================================
|
||||
|
||||
This file is part of the JUCE library.
|
||||
Copyright (c) 2020 - Raw Material Software Limited
|
||||
|
||||
JUCE is an open source library subject to commercial or open-source
|
||||
licensing.
|
||||
|
||||
By using JUCE, you agree to the terms of both the JUCE 6 End-User License
|
||||
Agreement and JUCE Privacy Policy (both effective as of the 16th June 2020).
|
||||
|
||||
End User License Agreement: www.juce.com/juce-6-licence
|
||||
Privacy Policy: www.juce.com/juce-privacy-policy
|
||||
|
||||
Or: You may also use this code under the terms of the GPL v3 (see
|
||||
www.gnu.org/licenses).
|
||||
|
||||
JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
|
||||
DISCLAIMED.
|
||||
|
||||
==============================================================================
|
||||
*/
|
||||
|
||||
namespace juce
|
||||
{
|
||||
namespace dsp
|
||||
{
|
||||
|
||||
#ifndef DOXYGEN
|
||||
|
||||
JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define DECLARE_SSE_SIMD_CONST(type, name) \
|
||||
static __declspec(align(16)) const type name [16 / sizeof (type)]
|
||||
|
||||
#define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
|
||||
__declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
|
||||
|
||||
#else
|
||||
#define DECLARE_SSE_SIMD_CONST(type, name) \
|
||||
static const type name [16 / sizeof (type)] __attribute__((aligned(16)))
|
||||
|
||||
#define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
|
||||
const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
|
||||
|
||||
#endif
|
||||
|
||||
template <typename type>
|
||||
struct SIMDNativeOps;
|
||||
|
||||
//==============================================================================
|
||||
/** Single-precision floating point SSE intrinsics.
|
||||
|
||||
@tags{DSP}
|
||||
*/
|
||||
template <>
|
||||
struct SIMDNativeOps<float>
|
||||
{
|
||||
//==============================================================================
|
||||
using vSIMDType = __m128;
|
||||
|
||||
//==============================================================================
|
||||
DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
|
||||
DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
|
||||
DECLARE_SSE_SIMD_CONST (float, kOne);
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
|
||||
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
|
||||
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b ) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }
|
||||
static forcedinline float JUCE_VECTOR_CALLTYPE get (__m128 v, size_t i) noexcept { return SIMDFallbackOps<float, __m128>::get (v, i); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE set (__m128 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m128>::set (v, i, s); }
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE truncate (__m128 a) noexcept { return _mm_cvtepi32_ps (_mm_cvttps_epi32 (a)); }
|
||||
|
||||
//==============================================================================
|
||||
static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
|
||||
{
|
||||
__m128 rr_ir = mul (a, dupeven (b));
|
||||
__m128 ii_ri = mul (swapevenodd (a), dupodd (b));
|
||||
return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
|
||||
}
|
||||
|
||||
static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
|
||||
{
|
||||
#if defined(__SSE4__)
|
||||
__m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
|
||||
#elif defined(__SSE3__)
|
||||
__m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
|
||||
#else
|
||||
__m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
|
||||
retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
|
||||
#endif
|
||||
return _mm_cvtss_f32 (retval);
|
||||
}
|
||||
};

//==============================================================================
/** Double-precision floating point SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
//==============================================================================
using vSIMDType = __m128d;

//==============================================================================
DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
DECLARE_SSE_SIMD_CONST (double, kOne);

//==============================================================================
static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm_castsi128_pd (_mm_load_si128 (reinterpret_cast<const __m128i*> (a))); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b ) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; }
static forcedinline double JUCE_VECTOR_CALLTYPE get (__m128d v, size_t i) noexcept { return SIMDFallbackOps<double, __m128d>::get (v, i); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE set (__m128d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m128d>::set (v, i, s); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE truncate (__m128d a) noexcept { return _mm_cvtepi32_pd (_mm_cvttpd_epi32 (a)); }

//==============================================================================
static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
{
__m128d rr_ir = mul (a, dupeven (b));
__m128d ii_ri = mul (swapevenodd (a), dupodd (b));
return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
}

static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
{
#if defined(__SSE4__)
__m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
#elif defined(__SSE3__)
__m128d retval = _mm_hadd_pd (a, a);
#else
__m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
#endif
return _mm_cvtsd_f64 (retval);
}
};

//==============================================================================
/** Signed 8-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
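// SSE2 has no packed 8-bit signed min/max, so without SSE4 support they are
// emulated with a compare mask and the usual bitwise select idiom, roughly:
//
//     mask   = greaterThan (x, y);   // all-ones where x > y
//     result = bit_or (bit_and (mask, valueIfTrue), bit_andnot (mask, valueIfFalse));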
#if defined(__SSE4__)
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
#else
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
#endif
static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m128i>::set (v, i, s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

//==============================================================================
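// sum() adds up all sixteen 8-bit lanes. The SSSE3 path widens the bytes to
// 16-bit lanes (low and high halves separately), folds each half with three
// rounds of _mm_hadd_epi16, then combines the two partial sums and truncates
// the result back to int8_t; without SSSE3 it defers to the scalar fallback.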
static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
__m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
__m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

for (int i = 0; i < 3; ++i)
{
lo = _mm_hadd_epi16 (lo, lo);
hi = _mm_hadd_epi16 (hi, hi);
}

return static_cast<int8_t> ((_mm_cvtsi128_si32 (lo) & 0xff) + (_mm_cvtsi128_si32 (hi) & 0xff));
#else
return SIMDFallbackOps<int8_t, __m128i>::sum (a);
#endif
}
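
// SSE has no 8-bit multiply, so mul() forms the products in 16-bit lanes:
// the even bytes are multiplied in place, the odd bytes are shifted down and
// multiplied separately, and the low byte of each product is merged back into
// position. Per lane this is roughly: result[i] = (int8_t) (a[i] * b[i]).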

static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
{
// unpack and multiply
__m128i even = _mm_mullo_epi16 (a, b);
__m128i odd = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
_mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
}
};

//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return load (a); }
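// SSE2 only has signed byte comparisons, so ssign() XORs each lane with 0x80
// to bias unsigned values into signed range; greaterThan() below compares the
// biased values, which yields the correct unsigned ordering.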
static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::set (v, i, s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

//==============================================================================
static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
__m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
__m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

for (int i = 0; i < 3; ++i)
{
lo = _mm_hadd_epi16 (lo, lo);
hi = _mm_hadd_epi16 (hi, hi);
}

return static_cast<uint8_t> ((static_cast<uint32_t> (_mm_cvtsi128_si32 (lo)) & 0xffu)
+ (static_cast<uint32_t> (_mm_cvtsi128_si32 (hi)) & 0xffu));
#else
return SIMDFallbackOps<uint8_t, __m128i>::sum (a);
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
{
// unpack and multiply
__m128i even = _mm_mullo_epi16 (a, b);
__m128i odd = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
_mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
}
};

//==============================================================================
/** Signed 16-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m128i>::set (v, i, s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

//==============================================================================
static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
__m128i tmp = _mm_hadd_epi16 (a, a);
tmp = _mm_hadd_epi16 (tmp, tmp);
tmp = _mm_hadd_epi16 (tmp, tmp);

return static_cast<int16_t> (_mm_cvtsi128_si32 (tmp) & 0xffff);
#else
return SIMDFallbackOps<int16_t, __m128i>::sum (a);
#endif
}
};

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
#if defined(__SSE4__)
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
#else
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
#endif
static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::set (v, i, s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

//==============================================================================
static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
__m128i tmp = _mm_hadd_epi16 (a, a);
tmp = _mm_hadd_epi16 (tmp, tmp);
tmp = _mm_hadd_epi16 (tmp, tmp);

return static_cast<uint16_t> (static_cast<uint32_t> (_mm_cvtsi128_si32 (tmp)) & 0xffffu);
#else
return SIMDFallbackOps<uint16_t, __m128i>::sum (a);
#endif
}
};

//==============================================================================
/** Signed 32-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m128i>::set (v, i, s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

//==============================================================================
static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
__m128i tmp = _mm_hadd_epi32 (a, a);
return _mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp));
#else
return SIMDFallbackOps<int32_t, __m128i>::sum (a);
#endif
}
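
// Without SSE4.1's _mm_mullo_epi32, mul() assembles the 32-bit products from
// _mm_mul_epu32, which multiplies the even-indexed lanes into 64-bit results:
// the odd lanes are shifted down and multiplied the same way, and the low
// 32 bits of each product are then interleaved back into a single vector.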

static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_mullo_epi32 (a, b);
#else
__m128i even = _mm_mul_epu32 (a,b);
__m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a,4), _mm_srli_si128 (b,4));
return _mm_unpacklo_epi32 (_mm_shuffle_epi32(even, _MM_SHUFFLE (0,0,2,0)),
_mm_shuffle_epi32(odd, _MM_SHUFFLE (0,0,2,0)));
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_min_epi32 (a, b);
#else
__m128i lt = greaterThan (b, a);
return bit_or (bit_and (lt, a), bit_andnot (lt, b));
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_max_epi32 (a, b);
#else
__m128i gt = greaterThan (a, b);
return bit_or (bit_and (gt, a), bit_andnot (gt, b));
#endif
}
};

//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::set (v, i, s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

//==============================================================================
static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
__m128i tmp = _mm_hadd_epi32 (a, a);
return static_cast<uint32_t> (_mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp)));
#else
return SIMDFallbackOps<uint32_t, __m128i>::sum (a);
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_mullo_epi32 (a, b);
#else
__m128i even = _mm_mul_epu32 (a,b);
__m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a,4), _mm_srli_si128 (b,4));
return _mm_unpacklo_epi32 (_mm_shuffle_epi32(even, _MM_SHUFFLE (0,0,2,0)),
_mm_shuffle_epi32(odd, _MM_SHUFFLE (0,0,2,0)));
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_min_epi32 (a, b);
#else
__m128i lt = greaterThan (b, a);
return bit_or (bit_and (lt, a), bit_andnot (lt, b));
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_max_epi32 (a, b);
#else
__m128i gt = greaterThan (a, b);
return bit_or (bit_and (gt, a), bit_andnot (gt, b));
#endif
}
};

//==============================================================================
/** Signed 64-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm_set1_epi64x (s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m128i>::set (v, i, s); }
static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<int64_t, __m128i>::sum (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<int64_t, __m128i>::mul (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }
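
// _mm_cmpeq_epi64 needs SSE4.1, so the fallback in equal() compares the 32-bit
// halves, ANDs each pair of half-results together, and then spreads that
// combined mask across both halves so every 64-bit lane is all-ones or all-zeros.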

static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_cmpeq_epi64 (a, b);
#else
__m128i bitmask = _mm_cmpeq_epi32 (a, b);
bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
#endif
}
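
// A packed signed 64-bit greater-than compare (_mm_cmpgt_epi64) only exists
// from SSE4.2 onwards; older targets fall back to the scalar SIMDFallbackOps
// implementation.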

static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_2__)
return _mm_cmpgt_epi64 (a, b);
#else
return SIMDFallbackOps<int64_t, __m128i>::greaterThan (a, b);
#endif
}
};

//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics.

@tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
//==============================================================================
using vSIMDType = __m128i;

//==============================================================================
DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return load (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm_set1_epi64x ((int64_t) s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::get (v, i); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::set (v, i, s); }
static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::sum (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::mul (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_1__)
return _mm_cmpeq_epi64 (a, b);
#else
__m128i bitmask = _mm_cmpeq_epi32 (a, b);
bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
#endif
}

static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
{
#if defined(__SSE4_2__)
return _mm_cmpgt_epi64 (ssign (a), ssign (b));
#else
return SIMDFallbackOps<uint64_t, __m128i>::greaterThan (a, b);
#endif
}
};

#endif

JUCE_END_IGNORE_WARNINGS_GCC_LIKE

} // namespace dsp
} // namespace juce