// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2023 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
# ifndef PXFOUNDATION_PXUNIXSSE2INLINEAOS_H
# define PXFOUNDATION_PXUNIXSSE2INLINEAOS_H
# if !COMPILE_VECTOR_INTRINSICS
# error Vector intrinsics should not be included when using scalar implementation.
# endif
# ifdef __SSE4_2__
# include "smmintrin.h"
# endif
# include "../../PxVecMathSSE.h"
namespace physx
{
namespace aos
{
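// Floating-point classification bits (the same values as MSVC's _FPCLASS_* constants); OR'd together below to
// build the "bad number" bit pattern used by the isFinite* helpers.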
# define PX_FPCLASS_SNAN 0x0001 /* signaling NaN */
# define PX_FPCLASS_QNAN 0x0002 /* quiet NaN */
# define PX_FPCLASS_NINF 0x0004 /* negative infinity */
# define PX_FPCLASS_PINF 0x0200 /* positive infinity */
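// Bitwise reinterpretation helpers between __m128i and __m128; these casts only change the type and generate no
// conversion instruction.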
PX_FORCE_INLINE __m128 m128_I2F ( __m128i n )
{
return _mm_castsi128_ps ( n ) ;
}
PX_FORCE_INLINE __m128i m128_F2I ( __m128 n )
{
return _mm_castps_si128 ( n ) ;
}
//////////////////////////////////////////////////////////////////////
//Test that Vec3V and FloatV are legal
//////////////////////////////////////////////////////////////////////
# define FLOAT_COMPONENTS_EQUAL_THRESHOLD 0.01f
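// A FloatV is considered valid when all four lanes hold the same (splatted) scalar value.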
PX_FORCE_INLINE static bool isValidFloatV ( const FloatV a )
{
const PxF32 x = V4ReadX ( a ) ;
const PxF32 y = V4ReadY ( a ) ;
const PxF32 z = V4ReadZ ( a ) ;
const PxF32 w = V4ReadW ( a ) ;
return ( x == y && x == z && x == w ) ;
/*if (
( PxAbs ( x - y ) < FLOAT_COMPONENTS_EQUAL_THRESHOLD ) &&
( PxAbs ( x - z ) < FLOAT_COMPONENTS_EQUAL_THRESHOLD ) &&
( PxAbs ( x - w ) < FLOAT_COMPONENTS_EQUAL_THRESHOLD )
)
{
return true ;
}
if (
( PxAbs ( ( x - y ) / x ) < FLOAT_COMPONENTS_EQUAL_THRESHOLD ) &&
( PxAbs ( ( x - z ) / x ) < FLOAT_COMPONENTS_EQUAL_THRESHOLD ) &&
( PxAbs ( ( x - w ) / x ) < FLOAT_COMPONENTS_EQUAL_THRESHOLD )
)
{
return true ;
}
return false ; */
}
PX_FORCE_INLINE bool isValidVec3V ( const Vec3V a )
{
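// A Vec3V is considered valid when its w lane is exactly 0.0f.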
PX_ALIGN ( 16 , PxF32 f [ 4 ] ) ;
V4StoreA ( a , f ) ;
return ( f [ 3 ] == 0.0f ) ;
}
PX_FORCE_INLINE bool isFiniteLength ( const Vec3V a )
{
return ! FAllEq ( V4LengthSq ( a ) , FZero ( ) ) ;
}
PX_FORCE_INLINE bool isAligned16 ( void * a )
{
return ( 0 == ( size_t ( a ) & 0x0f ) ) ;
}
// ASSERT_ISFINITELENGTH is deactivated because a lot of code calls a SIMD normalisation function with zero length but then ignores the result.
# if PX_DEBUG
# define ASSERT_ISVALIDVEC3V(a) PX_ASSERT(isValidVec3V(a))
# define ASSERT_ISVALIDFLOATV(a) PX_ASSERT(isValidFloatV(a))
# define ASSERT_ISALIGNED16(a) PX_ASSERT(isAligned16(reinterpret_cast<void*>(a)))
# define ASSERT_ISFINITELENGTH(a) //PX_ASSERT(isFiniteLength(a))
# else
# define ASSERT_ISVALIDVEC3V(a)
# define ASSERT_ISVALIDFLOATV(a)
# define ASSERT_ISALIGNED16(a)
# define ASSERT_ISFINITELENGTH(a)
# endif
namespace internalUnitSSE2Simd
{
PX_FORCE_INLINE PxU32 BAllTrue4_R ( const BoolV a )
{
const PxI32 moveMask = _mm_movemask_ps ( a ) ;
return PxU32 ( moveMask == 0xf ) ;
}
PX_FORCE_INLINE PxU32 BAllTrue3_R ( const BoolV a )
{
const PxI32 moveMask = _mm_movemask_ps ( a ) ;
return PxU32 ( ( moveMask & 0x7 ) == 0x7 ) ;
}
PX_FORCE_INLINE PxU32 BAnyTrue4_R ( const BoolV a )
{
const PxI32 moveMask = _mm_movemask_ps ( a ) ;
return PxU32 ( moveMask != 0x0 ) ;
}
PX_FORCE_INLINE PxU32 BAnyTrue3_R ( const BoolV a )
{
const PxI32 moveMask = _mm_movemask_ps ( a ) ;
return PxU32 ( ( moveMask & 0x7 ) != 0x0 ) ;
}
PX_FORCE_INLINE PxU32 FiniteTestEq ( const Vec4V a , const Vec4V b )
{
// This is a bit of a bodge.
// _mm_comieq_ss returns 1 if either value is NaN, so we re-cast a and b with "true" encoded as a non-NaN number.
// There must be a better way of doing this in SSE.
const BoolV one = FOne ( ) ;
const BoolV zero = FZero ( ) ;
const BoolV a1 = V4Sel ( a , one , zero ) ;
const BoolV b1 = V4Sel ( b , one , zero ) ;
return (
_mm_comieq_ss ( a1 , b1 ) &&
_mm_comieq_ss ( _mm_shuffle_ps ( a1 , a1 , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , _mm_shuffle_ps ( b1 , b1 , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ) &&
_mm_comieq_ss ( _mm_shuffle_ps ( a1 , a1 , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , _mm_shuffle_ps ( b1 , b1 , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ) &&
_mm_comieq_ss ( _mm_shuffle_ps ( a1 , a1 , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) , _mm_shuffle_ps ( b1 , b1 , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) ) ) ;
}
# if !PX_EMSCRIPTEN
# if PX_CLANG
# if PX_LINUX
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wglobal-constructors"
# endif
# endif
const PX_ALIGN ( 16 , PxF32 gMaskXYZ [ 4 ] ) = { physx :: PxUnionCast < PxF32 > ( 0xffffffff ) , physx :: PxUnionCast < PxF32 > ( 0xffffffff ) ,
physx :: PxUnionCast < PxF32 > ( 0xffffffff ) , 0 } ;
# if PX_CLANG
# if PX_LINUX
# pragma clang diagnostic pop
# endif
# endif
# else
// emscripten doesn't like the PxUnionCast data structure
// the following is what windows and xbox do -- using these for emscripten
const PX_ALIGN ( 16 , PxU32 gMaskXYZ [ 4 ] ) = { 0xffffffff , 0xffffffff , 0xffffffff , 0 } ;
# endif
}
namespace vecMathTests
{
// PT: this function returns an invalid Vec3V (W!=0.0f) just for unit-testing 'isValidVec3V'
PX_FORCE_INLINE Vec3V getInvalidVec3V ( )
{
const float f = 1.0f ;
return _mm_load1_ps ( & f ) ;
}
PX_FORCE_INLINE bool allElementsEqualFloatV ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_comieq_ss ( a , b ) != 0 ;
}
PX_FORCE_INLINE bool allElementsEqualVec3V ( const Vec3V a , const Vec3V b )
{
return V3AllEq ( a , b ) != 0 ;
}
PX_FORCE_INLINE bool allElementsEqualVec4V ( const Vec4V a , const Vec4V b )
{
return V4AllEq ( a , b ) != 0 ;
}
PX_FORCE_INLINE bool allElementsEqualBoolV ( const BoolV a , const BoolV b )
{
return internalUnitSSE2Simd :: BAllTrue4_R ( VecI32V_IsEq ( m128_F2I ( a ) , m128_F2I ( b ) ) ) != 0 ;
}
PX_FORCE_INLINE bool allElementsEqualVecU32V ( const VecU32V a , const VecU32V b )
{
return internalUnitSSE2Simd :: BAllTrue4_R ( V4IsEqU32 ( a , b ) ) != 0 ;
}
PX_FORCE_INLINE bool allElementsEqualVecI32V ( const VecI32V a , const VecI32V b )
{
BoolV c = m128_I2F ( _mm_cmpeq_epi32 ( a , b ) ) ;
return internalUnitSSE2Simd :: BAllTrue4_R ( c ) != 0 ;
}
# define VECMATH_AOS_EPSILON (1e-3f)
PX_FORCE_INLINE bool allElementsNearEqualFloatV ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
const FloatV c = FSub ( a , b ) ;
const FloatV minError = FLoad ( - VECMATH_AOS_EPSILON ) ;
const FloatV maxError = FLoad ( VECMATH_AOS_EPSILON ) ;
return _mm_comigt_ss ( c , minError ) && _mm_comilt_ss ( c , maxError ) ;
}
PX_FORCE_INLINE bool allElementsNearEqualVec3V ( const Vec3V a , const Vec3V b )
{
const Vec3V c = V3Sub ( a , b ) ;
const Vec3V minError = V3Load ( - VECMATH_AOS_EPSILON ) ;
const Vec3V maxError = V3Load ( VECMATH_AOS_EPSILON ) ;
return ( _mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , maxError ) &&
_mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , maxError ) &&
_mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , maxError ) ) ;
}
PX_FORCE_INLINE bool allElementsNearEqualVec4V ( const Vec4V a , const Vec4V b )
{
const Vec4V c = V4Sub ( a , b ) ;
const Vec4V minError = V4Load ( - VECMATH_AOS_EPSILON ) ;
const Vec4V maxError = V4Load ( VECMATH_AOS_EPSILON ) ;
return ( _mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , maxError ) &&
_mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , maxError ) &&
_mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , maxError ) &&
_mm_comigt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) , minError ) &&
_mm_comilt_ss ( _mm_shuffle_ps ( c , c , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) , maxError ) ) ;
}
}
/////////////////////////////////////////////////////////////////////
////FUNCTIONS USED ONLY FOR ASSERTS IN VECTORISED IMPLEMENTATIONS
/////////////////////////////////////////////////////////////////////
PX_FORCE_INLINE bool isFiniteFloatV ( const FloatV a )
{
PxF32 badNumber =
physx :: PxUnionCast < PxF32 , PxU32 > ( PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF ) ;
const FloatV vBadNum = FLoad ( badNumber ) ;
const BoolV vMask = BAnd ( vBadNum , a ) ;
return internalUnitSSE2Simd :: FiniteTestEq ( vMask , BFFFF ( ) ) == 1 ;
}
PX_FORCE_INLINE bool isFiniteVec3V ( const Vec3V a )
{
PxF32 badNumber =
physx :: PxUnionCast < PxF32 , PxU32 > ( PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF ) ;
const Vec3V vBadNum = V3Load ( badNumber ) ;
const BoolV vMask = BAnd ( BAnd ( vBadNum , a ) , BTTTF ( ) ) ;
return internalUnitSSE2Simd :: FiniteTestEq ( vMask , BFFFF ( ) ) == 1 ;
}
PX_FORCE_INLINE bool isFiniteVec4V ( const Vec4V a )
{
/*Vec4V a;
PX_ALIGN ( 16 , PxF32 f [ 4 ] ) ;
F32Array_Aligned_From_Vec4V ( a , f ) ;
return PxIsFinite ( f [ 0 ] )
& & PxIsFinite ( f [ 1 ] )
& & PxIsFinite ( f [ 2 ] )
& & PxIsFinite ( f [ 3 ] ) ; */
PxF32 badNumber =
physx :: PxUnionCast < PxF32 , PxU32 > ( PX_FPCLASS_SNAN | PX_FPCLASS_QNAN | PX_FPCLASS_NINF | PX_FPCLASS_PINF ) ;
const Vec4V vBadNum = V4Load ( badNumber ) ;
const BoolV vMask = BAnd ( vBadNum , a ) ;
return internalUnitSSE2Simd :: FiniteTestEq ( vMask , BFFFF ( ) ) == 1 ;
}
PX_FORCE_INLINE bool hasZeroElementinFloatV ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
return _mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , FZero ( ) ) ? true : false ;
}
PX_FORCE_INLINE bool hasZeroElementInVec3V ( const Vec3V a )
{
return ( _mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , FZero ( ) ) ||
_mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , FZero ( ) ) ||
_mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , FZero ( ) ) ) ;
}
PX_FORCE_INLINE bool hasZeroElementInVec4V ( const Vec4V a )
{
return ( _mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) , FZero ( ) ) ||
_mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) , FZero ( ) ) ||
_mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) , FZero ( ) ) ||
_mm_comieq_ss ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) , FZero ( ) ) ) ;
}
/////////////////////////////////////////////////////////////////////
////VECTORISED FUNCTION IMPLEMENTATIONS
/////////////////////////////////////////////////////////////////////
PX_FORCE_INLINE FloatV FLoad ( const PxF32 f )
{
return _mm_load1_ps ( & f ) ;
}
PX_FORCE_INLINE Vec3V V3Load ( const PxF32 f )
{
return _mm_set_ps ( 0.0f , f , f , f ) ;
}
PX_FORCE_INLINE Vec4V V4Load ( const PxF32 f )
{
return _mm_load1_ps ( & f ) ;
}
PX_FORCE_INLINE BoolV BLoad ( const bool f )
{
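// Negating the bool (0 or 1) as a signed integer yields 0x00000000 or 0xffffffff, i.e. an all-clear or all-set lane mask.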
const PxU32 i = PxU32 ( - PxI32 ( f ) ) ;
return _mm_load1_ps ( reinterpret_cast < const float * > ( & i ) ) ;
}
PX_FORCE_INLINE Vec3V V3LoadA ( const PxVec3 & f )
{
ASSERT_ISALIGNED16 ( const_cast < PxVec3 * > ( & f ) ) ;
# if !PX_EMSCRIPTEN
return _mm_and_ps ( reinterpret_cast < const Vec3V & > ( f ) , V4LoadA ( internalUnitSSE2Simd :: gMaskXYZ ) ) ;
# else
return _mm_and_ps ( ( Vec3V & ) f , ( VecI32V & ) internalUnitSSE2Simd :: gMaskXYZ ) ;
# endif
}
PX_FORCE_INLINE Vec3V V3LoadU ( const PxVec3 & f )
{
return _mm_set_ps ( 0.0f , f . z , f . y , f . x ) ;
}
PX_FORCE_INLINE Vec3V V3LoadUnsafeA ( const PxVec3 & f )
{
ASSERT_ISALIGNED16 ( const_cast < PxVec3 * > ( & f ) ) ;
return _mm_set_ps ( 0.0f , f . z , f . y , f . x ) ;
}
PX_FORCE_INLINE Vec3V V3LoadA ( const PxF32 * const f )
{
ASSERT_ISALIGNED16 ( const_cast < PxF32 * > ( f ) ) ;
# if !PX_EMSCRIPTEN
return _mm_and_ps ( V4LoadA ( f ) , V4LoadA ( internalUnitSSE2Simd :: gMaskXYZ ) ) ;
# else
return _mm_and_ps ( ( Vec3V & ) * f , ( VecI32V & ) internalUnitSSE2Simd :: gMaskXYZ ) ;
# endif
}
PX_FORCE_INLINE Vec3V V3LoadU ( const PxF32 * const i )
{
return _mm_set_ps ( 0.0f , i [ 2 ] , i [ 1 ] , i [ 0 ] ) ;
}
PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V ( Vec4V v )
{
return V4ClearW ( v ) ;
}
PX_FORCE_INLINE Vec3V Vec3V_From_Vec4V_WUndefined ( const Vec4V v )
{
return v ;
}
PX_FORCE_INLINE Vec4V Vec4V_From_Vec3V ( Vec3V f )
{
ASSERT_ISVALIDVEC3V ( f ) ;
return f ; // ok if it is implemented as the same type.
}
PX_FORCE_INLINE Vec4V Vec4V_From_PxVec3_WUndefined ( const PxVec3 & f )
{
return _mm_set_ps ( 0.0f , f . z , f . y , f . x ) ;
}
PX_FORCE_INLINE Vec4V Vec4V_From_FloatV ( FloatV f )
{
return f ;
}
PX_FORCE_INLINE Vec3V Vec3V_From_FloatV ( FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
return Vec3V_From_Vec4V ( Vec4V_From_FloatV ( f ) ) ;
}
PX_FORCE_INLINE Vec3V Vec3V_From_FloatV_WUndefined ( FloatV f )
{
ASSERT_ISVALIDVEC3V ( f ) ;
return Vec3V_From_Vec4V_WUndefined ( Vec4V_From_FloatV ( f ) ) ;
}
PX_FORCE_INLINE Mat33V Mat33V_From_PxMat33 ( const PxMat33 & m )
{
return Mat33V ( V3LoadU ( m . column0 ) , V3LoadU ( m . column1 ) , V3LoadU ( m . column2 ) ) ;
}
PX_FORCE_INLINE void PxMat33_From_Mat33V ( const Mat33V & m , PxMat33 & out )
{
V3StoreU ( m . col0 , out . column0 ) ;
V3StoreU ( m . col1 , out . column1 ) ;
V3StoreU ( m . col2 , out . column2 ) ;
}
PX_FORCE_INLINE Vec4V V4LoadA ( const PxF32 * const f )
{
ASSERT_ISALIGNED16 ( const_cast < PxF32 * > ( f ) ) ;
return _mm_load_ps ( f ) ;
}
PX_FORCE_INLINE void V4StoreA ( Vec4V a , PxF32 * f )
{
ASSERT_ISALIGNED16 ( f ) ;
_mm_store_ps ( f , a ) ;
}
PX_FORCE_INLINE void V4StoreU ( const Vec4V a , PxF32 * f )
{
_mm_storeu_ps ( f , a ) ;
}
PX_FORCE_INLINE void BStoreA ( const BoolV a , PxU32 * f )
{
ASSERT_ISALIGNED16 ( f ) ;
_mm_store_ps ( reinterpret_cast < PxF32 * > ( f ) , a ) ;
}
PX_FORCE_INLINE void U4StoreA ( const VecU32V uv , PxU32 * u )
{
ASSERT_ISALIGNED16 ( u ) ;
_mm_store_ps ( reinterpret_cast < float * > ( u ) , uv ) ;
}
PX_FORCE_INLINE VecI32V I4LoadXYZW ( const PxI32 & x , const PxI32 & y , const PxI32 & z , const PxI32 & w )
{
return _mm_set_epi32 ( w , z , y , x ) ;
}
PX_FORCE_INLINE void I4StoreA ( const VecI32V iv , PxI32 * i )
{
ASSERT_ISALIGNED16 ( i ) ;
_mm_store_ps ( reinterpret_cast < float * > ( i ) , m128_I2F ( iv ) ) ;
}
PX_FORCE_INLINE Vec4V V4LoadU ( const PxF32 * const f )
{
return _mm_loadu_ps ( f ) ;
}
PX_FORCE_INLINE BoolV BLoad ( const bool * const f )
{
const PX_ALIGN ( 16 , PxI32 ) b [ 4 ] = { - PxI32 ( f [ 0 ] ) , - PxI32 ( f [ 1 ] ) , - PxI32 ( f [ 2 ] ) , - PxI32 ( f [ 3 ] ) } ;
return _mm_load_ps ( reinterpret_cast < const float * > ( & b ) ) ;
}
PX_FORCE_INLINE void FStore ( const FloatV a , PxF32 * PX_RESTRICT f )
{
ASSERT_ISVALIDFLOATV ( a ) ;
_mm_store_ss ( f , a ) ;
}
PX_FORCE_INLINE void V3StoreA ( const Vec3V a , PxVec3 & f )
{
ASSERT_ISALIGNED16 ( & f ) ;
PX_ALIGN ( 16 , PxF32 ) f2 [ 4 ] ;
_mm_store_ps ( f2 , a ) ;
f = PxVec3 ( f2 [ 0 ] , f2 [ 1 ] , f2 [ 2 ] ) ;
}
PX_FORCE_INLINE void V3StoreU ( const Vec3V a , PxVec3 & f )
{
PX_ALIGN ( 16 , PxF32 ) f2 [ 4 ] ;
_mm_store_ps ( f2 , a ) ;
f = PxVec3 ( f2 [ 0 ] , f2 [ 1 ] , f2 [ 2 ] ) ;
}
PX_FORCE_INLINE void Store_From_BoolV ( const BoolV b , PxU32 * b2 )
{
_mm_store_ss ( reinterpret_cast < PxF32 * > ( b2 ) , b ) ;
}
PX_FORCE_INLINE VecU32V U4Load ( const PxU32 i )
{
return _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & i ) ) ;
}
PX_FORCE_INLINE VecU32V U4LoadU ( const PxU32 * i )
{
return _mm_loadu_ps ( reinterpret_cast < const PxF32 * > ( i ) ) ;
}
PX_FORCE_INLINE VecU32V U4LoadA ( const PxU32 * i )
{
ASSERT_ISALIGNED16 ( const_cast < PxU32 * > ( i ) ) ;
return _mm_load_ps ( reinterpret_cast < const PxF32 * > ( i ) ) ;
}
//////////////////////////////////
// FLOATV
//////////////////////////////////
PX_FORCE_INLINE FloatV FZero ( )
{
return FLoad ( 0.0f ) ;
}
PX_FORCE_INLINE FloatV FOne ( )
{
return FLoad ( 1.0f ) ;
}
PX_FORCE_INLINE FloatV FHalf ( )
{
return FLoad ( 0.5f ) ;
}
PX_FORCE_INLINE FloatV FEps ( )
{
return FLoad ( PX_EPS_REAL ) ;
}
PX_FORCE_INLINE FloatV FEps6 ( )
{
return FLoad ( 1e-6f ) ;
}
PX_FORCE_INLINE FloatV FMax ( )
{
return FLoad ( PX_MAX_REAL ) ;
}
PX_FORCE_INLINE FloatV FNegMax ( )
{
return FLoad ( - PX_MAX_REAL ) ;
}
PX_FORCE_INLINE FloatV IZero ( )
{
const PxU32 zero = 0 ;
return _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & zero ) ) ;
}
PX_FORCE_INLINE FloatV IOne ( )
{
const PxU32 one = 1 ;
return _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & one ) ) ;
}
PX_FORCE_INLINE FloatV ITwo ( )
{
const PxU32 two = 2 ;
return _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & two ) ) ;
}
PX_FORCE_INLINE FloatV IThree ( )
{
const PxU32 three = 3 ;
return _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & three ) ) ;
}
PX_FORCE_INLINE FloatV IFour ( )
{
PxU32 four = 4 ;
return _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & four ) ) ;
}
PX_FORCE_INLINE FloatV FNeg ( const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
return _mm_sub_ps ( _mm_setzero_ps ( ) , f ) ;
}
PX_FORCE_INLINE FloatV FAdd ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
/*
if ( ! isValidFloatV ( a ) )
{
assert ( false ) ;
}
if ( ! isValidFloatV ( b ) )
{
assert ( false ) ;
}
*/
return _mm_add_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FSub ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_sub_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FMul ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_mul_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FDiv ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_div_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FDivFast ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_mul_ps ( a , _mm_rcp_ps ( b ) ) ;
}
PX_FORCE_INLINE FloatV FRecip ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
return _mm_div_ps ( FOne ( ) , a ) ;
}
PX_FORCE_INLINE FloatV FRecipFast ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
return _mm_rcp_ps ( a ) ;
}
PX_FORCE_INLINE FloatV FRsqrt ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
return _mm_div_ps ( FOne ( ) , _mm_sqrt_ps ( a ) ) ;
}
PX_FORCE_INLINE FloatV FSqrt ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
return _mm_sqrt_ps ( a ) ;
}
PX_FORCE_INLINE FloatV FRsqrtFast ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
return _mm_rsqrt_ps ( a ) ;
}
PX_FORCE_INLINE FloatV FScaleAdd ( const FloatV a , const FloatV b , const FloatV c )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
ASSERT_ISVALIDFLOATV ( c ) ;
return FAdd ( FMul ( a , b ) , c ) ;
}
PX_FORCE_INLINE FloatV FNegScaleSub ( const FloatV a , const FloatV b , const FloatV c )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
ASSERT_ISVALIDFLOATV ( c ) ;
return FSub ( c , FMul ( a , b ) ) ;
}
PX_FORCE_INLINE FloatV FAbs ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
PX_ALIGN ( 16 , const PxU32 ) absMask [ 4 ] = { 0x7fFFffFF , 0x7fFFffFF , 0x7fFFffFF , 0x7fFFffFF } ;
return _mm_and_ps ( a , _mm_load_ps ( reinterpret_cast < const PxF32 * > ( absMask ) ) ) ;
}
PX_FORCE_INLINE FloatV FSel ( const BoolV c , const FloatV a , const FloatV b )
{
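// The select mask must be uniform (all lanes true or all lanes false) so that the result remains a valid FloatV with equal lanes.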
PX_ASSERT ( vecMathTests :: allElementsEqualBoolV ( c , BTTTT ( ) ) ||
vecMathTests :: allElementsEqualBoolV ( c , BFFFF ( ) ) ) ;
ASSERT_ISVALIDFLOATV ( _mm_or_ps ( _mm_andnot_ps ( c , b ) , _mm_and_ps ( c , a ) ) ) ;
return _mm_or_ps ( _mm_andnot_ps ( c , b ) , _mm_and_ps ( c , a ) ) ;
}
PX_FORCE_INLINE BoolV FIsGrtr ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_cmpgt_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV FIsGrtrOrEq ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_cmpge_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV FIsEq ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_cmpeq_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FMax ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_max_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FMin ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_min_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV FClamp ( const FloatV a , const FloatV minV , const FloatV maxV )
{
ASSERT_ISVALIDFLOATV ( minV ) ;
ASSERT_ISVALIDFLOATV ( maxV ) ;
return _mm_max_ps ( _mm_min_ps ( a , maxV ) , minV ) ;
}
PX_FORCE_INLINE PxU32 FAllGrtr ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return PxU32 ( _mm_comigt_ss ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 FAllGrtrOrEq ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return PxU32 ( _mm_comige_ss ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 FAllEq ( const FloatV a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return PxU32 ( _mm_comieq_ss ( a , b ) ) ;
}
PX_FORCE_INLINE FloatV FRound ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
# ifdef __SSE4_2__
return _mm_round_ps ( a , _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) ;
# else
// return _mm_round_ps(a, 0x0);
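// SSE2 fallback: derive a 0.0f/1.0f negative-sign flag from the integer conversion of a, then truncate
// (a + 0.5f) - sign to get an approximate round-to-nearest.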
const FloatV half = FLoad ( 0.5f ) ;
const __m128 signBit = _mm_cvtepi32_ps ( _mm_srli_epi32 ( _mm_cvtps_epi32 ( a ) , 31 ) ) ;
const FloatV aRound = FSub ( FAdd ( a , half ) , signBit ) ;
__m128i tmp = _mm_cvttps_epi32 ( aRound ) ;
return _mm_cvtepi32_ps ( tmp ) ;
# endif
}
PX_FORCE_INLINE FloatV FSin ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
const FloatV recipTwoPi = V4LoadA ( g_PXReciprocalTwoPi . f ) ;
const FloatV twoPi = V4LoadA ( g_PXTwoPi . f ) ;
const FloatV tmp = FMul ( a , recipTwoPi ) ;
const FloatV b = FRound ( tmp ) ;
const FloatV V1 = FNegScaleSub ( twoPi , b , a ) ;
// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
// V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
const FloatV V2 = FMul ( V1 , V1 ) ;
const FloatV V3 = FMul ( V2 , V1 ) ;
const FloatV V5 = FMul ( V3 , V2 ) ;
const FloatV V7 = FMul ( V5 , V2 ) ;
const FloatV V9 = FMul ( V7 , V2 ) ;
const FloatV V11 = FMul ( V9 , V2 ) ;
const FloatV V13 = FMul ( V11 , V2 ) ;
const FloatV V15 = FMul ( V13 , V2 ) ;
const FloatV V17 = FMul ( V15 , V2 ) ;
const FloatV V19 = FMul ( V17 , V2 ) ;
const FloatV V21 = FMul ( V19 , V2 ) ;
const FloatV V23 = FMul ( V21 , V2 ) ;
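// S1..S11 below are the polynomial coefficients of the series quoted above, read from consecutive lanes of the packed coefficient tables.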
const Vec4V sinCoefficients0 = V4LoadA ( g_PXSinCoefficients0 . f ) ;
const Vec4V sinCoefficients1 = V4LoadA ( g_PXSinCoefficients1 . f ) ;
const Vec4V sinCoefficients2 = V4LoadA ( g_PXSinCoefficients2 . f ) ;
const FloatV S1 = V4GetY ( sinCoefficients0 ) ;
const FloatV S2 = V4GetZ ( sinCoefficients0 ) ;
const FloatV S3 = V4GetW ( sinCoefficients0 ) ;
const FloatV S4 = V4GetX ( sinCoefficients1 ) ;
const FloatV S5 = V4GetY ( sinCoefficients1 ) ;
const FloatV S6 = V4GetZ ( sinCoefficients1 ) ;
const FloatV S7 = V4GetW ( sinCoefficients1 ) ;
const FloatV S8 = V4GetX ( sinCoefficients2 ) ;
const FloatV S9 = V4GetY ( sinCoefficients2 ) ;
const FloatV S10 = V4GetZ ( sinCoefficients2 ) ;
const FloatV S11 = V4GetW ( sinCoefficients2 ) ;
FloatV Result ;
Result = FScaleAdd ( S1 , V3 , V1 ) ;
Result = FScaleAdd ( S2 , V5 , Result ) ;
Result = FScaleAdd ( S3 , V7 , Result ) ;
Result = FScaleAdd ( S4 , V9 , Result ) ;
Result = FScaleAdd ( S5 , V11 , Result ) ;
Result = FScaleAdd ( S6 , V13 , Result ) ;
Result = FScaleAdd ( S7 , V15 , Result ) ;
Result = FScaleAdd ( S8 , V17 , Result ) ;
Result = FScaleAdd ( S9 , V19 , Result ) ;
Result = FScaleAdd ( S10 , V21 , Result ) ;
Result = FScaleAdd ( S11 , V23 , Result ) ;
return Result ;
}
PX_FORCE_INLINE FloatV FCos ( const FloatV a )
{
ASSERT_ISVALIDFLOATV ( a ) ;
// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
const FloatV recipTwoPi = V4LoadA ( g_PXReciprocalTwoPi . f ) ;
const FloatV twoPi = V4LoadA ( g_PXTwoPi . f ) ;
const FloatV tmp = FMul ( a , recipTwoPi ) ;
const FloatV b = FRound ( tmp ) ;
const FloatV V1 = FNegScaleSub ( twoPi , b , a ) ;
// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
// V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
const FloatV V2 = FMul ( V1 , V1 ) ;
const FloatV V4 = FMul ( V2 , V2 ) ;
const FloatV V6 = FMul ( V4 , V2 ) ;
const FloatV V8 = FMul ( V4 , V4 ) ;
const FloatV V10 = FMul ( V6 , V4 ) ;
const FloatV V12 = FMul ( V6 , V6 ) ;
const FloatV V14 = FMul ( V8 , V6 ) ;
const FloatV V16 = FMul ( V8 , V8 ) ;
const FloatV V18 = FMul ( V10 , V8 ) ;
const FloatV V20 = FMul ( V10 , V10 ) ;
const FloatV V22 = FMul ( V12 , V10 ) ;
const Vec4V cosCoefficients0 = V4LoadA ( g_PXCosCoefficients0 . f ) ;
const Vec4V cosCoefficients1 = V4LoadA ( g_PXCosCoefficients1 . f ) ;
const Vec4V cosCoefficients2 = V4LoadA ( g_PXCosCoefficients2 . f ) ;
const FloatV C1 = V4GetY ( cosCoefficients0 ) ;
const FloatV C2 = V4GetZ ( cosCoefficients0 ) ;
const FloatV C3 = V4GetW ( cosCoefficients0 ) ;
const FloatV C4 = V4GetX ( cosCoefficients1 ) ;
const FloatV C5 = V4GetY ( cosCoefficients1 ) ;
const FloatV C6 = V4GetZ ( cosCoefficients1 ) ;
const FloatV C7 = V4GetW ( cosCoefficients1 ) ;
const FloatV C8 = V4GetX ( cosCoefficients2 ) ;
const FloatV C9 = V4GetY ( cosCoefficients2 ) ;
const FloatV C10 = V4GetZ ( cosCoefficients2 ) ;
const FloatV C11 = V4GetW ( cosCoefficients2 ) ;
FloatV Result ;
Result = FScaleAdd ( C1 , V2 , V4One ( ) ) ;
Result = FScaleAdd ( C2 , V4 , Result ) ;
Result = FScaleAdd ( C3 , V6 , Result ) ;
Result = FScaleAdd ( C4 , V8 , Result ) ;
Result = FScaleAdd ( C5 , V10 , Result ) ;
Result = FScaleAdd ( C6 , V12 , Result ) ;
Result = FScaleAdd ( C7 , V14 , Result ) ;
Result = FScaleAdd ( C8 , V16 , Result ) ;
Result = FScaleAdd ( C9 , V18 , Result ) ;
Result = FScaleAdd ( C10 , V20 , Result ) ;
Result = FScaleAdd ( C11 , V22 , Result ) ;
return Result ;
}
PX_FORCE_INLINE PxU32 FOutOfBounds ( const FloatV a , const FloatV min , const FloatV max )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( min ) ;
ASSERT_ISVALIDFLOATV ( max ) ;
const BoolV c = BOr ( FIsGrtr ( a , max ) , FIsGrtr ( min , a ) ) ;
return ! BAllEqFFFF ( c ) ;
}
PX_FORCE_INLINE PxU32 FInBounds ( const FloatV a , const FloatV min , const FloatV max )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( min ) ;
ASSERT_ISVALIDFLOATV ( max ) ;
const BoolV c = BAnd ( FIsGrtrOrEq ( a , min ) , FIsGrtrOrEq ( max , a ) ) ;
return BAllEqTTTT ( c ) ;
}
PX_FORCE_INLINE PxU32 FOutOfBounds ( const FloatV a , const FloatV bounds )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( bounds ) ;
return FOutOfBounds ( a , FNeg ( bounds ) , bounds ) ;
}
PX_FORCE_INLINE PxU32 FInBounds ( const FloatV a , const FloatV bounds )
{
ASSERT_ISVALIDFLOATV ( a ) ;
ASSERT_ISVALIDFLOATV ( bounds ) ;
return FInBounds ( a , FNeg ( bounds ) , bounds ) ;
}
//////////////////////////////////
// VEC3V
//////////////////////////////////
PX_FORCE_INLINE Vec3V V3Splat ( const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
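// Move 0 into lane x of f, then reverse the lanes so the zero lands in w: the result is (f, f, f, 0).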
const __m128 zero = FZero ( ) ;
const __m128 fff0 = _mm_move_ss ( f , zero ) ;
return _mm_shuffle_ps ( fff0 , fff0 , _MM_SHUFFLE ( 0 , 1 , 2 , 3 ) ) ;
}
PX_FORCE_INLINE Vec3V V3Merge ( const FloatVArg x , const FloatVArg y , const FloatVArg z )
{
ASSERT_ISVALIDFLOATV ( x ) ;
ASSERT_ISVALIDFLOATV ( y ) ;
ASSERT_ISVALIDFLOATV ( z ) ;
// static on zero causes compiler crash on x64 debug_opt
const __m128 zero = FZero ( ) ;
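// xy ends up as (y, x, x, x) and z0 as (z, 0, 0, 0) in x..w lane order; the shuffle below assembles (x, y, z, 0).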
const __m128 xy = _mm_move_ss ( x , y ) ;
const __m128 z0 = _mm_move_ss ( zero , z ) ;
return _mm_shuffle_ps ( xy , z0 , _MM_SHUFFLE ( 1 , 0 , 0 , 1 ) ) ;
}
PX_FORCE_INLINE Vec3V V3UnitX ( )
{
const PX_ALIGN ( 16 , PxF32 ) x [ 4 ] = { 1.0f , 0.0f , 0.0f , 0.0f } ;
const __m128 x128 = _mm_load_ps ( x ) ;
return x128 ;
}
PX_FORCE_INLINE Vec3V V3UnitY ( )
{
const PX_ALIGN ( 16 , PxF32 ) y [ 4 ] = { 0.0f , 1.0f , 0.0f , 0.0f } ;
const __m128 y128 = _mm_load_ps ( y ) ;
return y128 ;
}
PX_FORCE_INLINE Vec3V V3UnitZ ( )
{
const PX_ALIGN ( 16 , PxF32 ) z [ 4 ] = { 0.0f , 0.0f , 1.0f , 0.0f } ;
const __m128 z128 = _mm_load_ps ( z ) ;
return z128 ;
}
PX_FORCE_INLINE FloatV V3GetX ( const Vec3V f )
{
ASSERT_ISVALIDVEC3V ( f ) ;
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE FloatV V3GetY ( const Vec3V f )
{
ASSERT_ISVALIDVEC3V ( f ) ;
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ;
}
PX_FORCE_INLINE FloatV V3GetZ ( const Vec3V f )
{
ASSERT_ISVALIDVEC3V ( f ) ;
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ;
}
PX_FORCE_INLINE Vec3V V3SetX ( const Vec3V v , const FloatV f )
{
ASSERT_ISVALIDVEC3V ( v ) ;
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BFTTT ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec3V V3SetY ( const Vec3V v , const FloatV f )
{
ASSERT_ISVALIDVEC3V ( v ) ;
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BTFTT ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec3V V3SetZ ( const Vec3V v , const FloatV f )
{
ASSERT_ISVALIDVEC3V ( v ) ;
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BTTFT ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec3V V3ColX ( const Vec3V a , const Vec3V b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
Vec3V r = _mm_shuffle_ps ( a , c , _MM_SHUFFLE ( 3 , 0 , 3 , 0 ) ) ;
return V3SetY ( r , V3GetX ( b ) ) ;
}
PX_FORCE_INLINE Vec3V V3ColY ( const Vec3V a , const Vec3V b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
Vec3V r = _mm_shuffle_ps ( a , c , _MM_SHUFFLE ( 3 , 1 , 3 , 1 ) ) ;
return V3SetY ( r , V3GetY ( b ) ) ;
}
PX_FORCE_INLINE Vec3V V3ColZ ( const Vec3V a , const Vec3V b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
Vec3V r = _mm_shuffle_ps ( a , c , _MM_SHUFFLE ( 3 , 2 , 3 , 2 ) ) ;
return V3SetY ( r , V3GetZ ( b ) ) ;
}
PX_FORCE_INLINE Vec3V V3Zero ( )
{
return V3Load ( 0.0f ) ;
}
PX_FORCE_INLINE Vec3V V3Eps ( )
{
return V3Load ( PX_EPS_REAL ) ;
}
PX_FORCE_INLINE Vec3V V3One ( )
{
return V3Load ( 1.0f ) ;
}
PX_FORCE_INLINE Vec3V V3Neg ( const Vec3V f )
{
ASSERT_ISVALIDVEC3V ( f ) ;
return _mm_sub_ps ( _mm_setzero_ps ( ) , f ) ;
}
PX_FORCE_INLINE Vec3V V3Add ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_add_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3Sub ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_sub_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3Scale ( const Vec3V a , const FloatV b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_mul_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3Mul ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_mul_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3ScaleInv ( const Vec3V a , const FloatV b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_div_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3Div ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return V4ClearW ( _mm_div_ps ( a , b ) ) ;
}
PX_FORCE_INLINE Vec3V V3ScaleInvFast ( const Vec3V a , const FloatV b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_mul_ps ( a , _mm_rcp_ps ( b ) ) ;
}
PX_FORCE_INLINE Vec3V V3DivFast ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return V4ClearW ( _mm_mul_ps ( a , _mm_rcp_ps ( b ) ) ) ;
}
PX_FORCE_INLINE Vec3V V3Recip ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 zero = V3Zero ( ) ;
const __m128 tttf = BTTTF ( ) ;
const __m128 recipA = _mm_div_ps ( V3One ( ) , a ) ;
return V4Sel ( tttf , recipA , zero ) ;
}
PX_FORCE_INLINE Vec3V V3RecipFast ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 zero = V3Zero ( ) ;
const __m128 tttf = BTTTF ( ) ;
const __m128 recipA = _mm_rcp_ps ( a ) ;
return V4Sel ( tttf , recipA , zero ) ;
}
PX_FORCE_INLINE Vec3V V3Rsqrt ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 zero = V3Zero ( ) ;
const __m128 tttf = BTTTF ( ) ;
const __m128 recipA = _mm_div_ps ( V3One ( ) , _mm_sqrt_ps ( a ) ) ;
return V4Sel ( tttf , recipA , zero ) ;
}
PX_FORCE_INLINE Vec3V V3RsqrtFast ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 zero = V3Zero ( ) ;
const __m128 tttf = BTTTF ( ) ;
const __m128 recipA = _mm_rsqrt_ps ( a ) ;
return V4Sel ( tttf , recipA , zero ) ;
}
PX_FORCE_INLINE Vec3V V3ScaleAdd ( const Vec3V a , const FloatV b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
return V3Add ( V3Scale ( a , b ) , c ) ;
}
PX_FORCE_INLINE Vec3V V3NegScaleSub ( const Vec3V a , const FloatV b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDFLOATV ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
return V3Sub ( c , V3Scale ( a , b ) ) ;
}
PX_FORCE_INLINE Vec3V V3MulAdd ( const Vec3V a , const Vec3V b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
return V3Add ( V3Mul ( a , b ) , c ) ;
}
PX_FORCE_INLINE Vec3V V3NegMulSub ( const Vec3V a , const Vec3V b , const Vec3V c )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
ASSERT_ISVALIDVEC3V ( c ) ;
return V3Sub ( c , V3Mul ( a , b ) ) ;
}
PX_FORCE_INLINE Vec3V V3Abs ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return V3Max ( a , V3Neg ( a ) ) ;
}
PX_FORCE_INLINE FloatV V3Dot ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
# ifdef __SSE4_2__
return _mm_dp_ps ( a , b , 0x7f ) ;
# else
const __m128 t0 = _mm_mul_ps ( a , b ) ; // aw*bw | az*bz | ay*by | ax*bx
const __m128 t1 = _mm_shuffle_ps ( t0 , t0 , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ; // ay*by | ax*bx | aw*bw | az*bz
const __m128 t2 = _mm_add_ps ( t0 , t1 ) ; // ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx
const __m128 t3 = _mm_shuffle_ps ( t2 , t2 , _MM_SHUFFLE ( 2 , 3 , 0 , 1 ) ) ; // ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by
return _mm_add_ps ( t3 , t2 ) ; // ax*bx + az*bz + ay*by + aw*bw
// ay*by + aw*bw + ax*bx + az*bz
// az*bz + ax*bx + aw*bw + ay*by
// aw*bw + ay*by + az*bz + ax*bx
# endif
}
PX_FORCE_INLINE Vec3V V3Cross ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
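// Shuffle-based cross product: (a.yzx * b.zxy) - (a.zxy * b.yzx); the w lane stays 0 for valid Vec3V inputs.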
const __m128 r1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
const __m128 r2 = _mm_shuffle_ps ( b , b , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
const __m128 l1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
const __m128 l2 = _mm_shuffle_ps ( b , b , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
return _mm_sub_ps ( _mm_mul_ps ( l1 , l2 ) , _mm_mul_ps ( r1 , r2 ) ) ;
}
PX_FORCE_INLINE VecCrossV V3PrepareCross ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
VecCrossV v ;
v . mR1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
v . mL1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
return v ;
}
PX_FORCE_INLINE Vec3V V3Cross ( const VecCrossV & a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( b ) ;
const __m128 r2 = _mm_shuffle_ps ( b , b , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
const __m128 l2 = _mm_shuffle_ps ( b , b , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
return _mm_sub_ps ( _mm_mul_ps ( a . mL1 , l2 ) , _mm_mul_ps ( a . mR1 , r2 ) ) ;
}
PX_FORCE_INLINE Vec3V V3Cross ( const Vec3V a , const VecCrossV & b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 r2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
const __m128 l2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
return _mm_sub_ps ( _mm_mul_ps ( b . mR1 , r2 ) , _mm_mul_ps ( b . mL1 , l2 ) ) ;
}
PX_FORCE_INLINE Vec3V V3Cross ( const VecCrossV & a , const VecCrossV & b )
{
return _mm_sub_ps ( _mm_mul_ps ( a . mL1 , b . mR1 ) , _mm_mul_ps ( a . mR1 , b . mL1 ) ) ;
}
PX_FORCE_INLINE FloatV V3Length ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_sqrt_ps ( V3Dot ( a , a ) ) ;
}
PX_FORCE_INLINE FloatV V3LengthSq ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return V3Dot ( a , a ) ;
}
PX_FORCE_INLINE Vec3V V3Normalize ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISFINITELENGTH ( a ) ;
return V3ScaleInv ( a , _mm_sqrt_ps ( V3Dot ( a , a ) ) ) ;
}
PX_FORCE_INLINE Vec3V V3NormalizeFast ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISFINITELENGTH ( a ) ;
return V3Scale ( a , _mm_rsqrt_ps ( V3Dot ( a , a ) ) ) ;
}
PX_FORCE_INLINE Vec3V V3NormalizeSafe ( const Vec3V a , const Vec3V unsafeReturnValue )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 eps = V4Eps ( ) ;
const __m128 length = V3Length ( a ) ;
const __m128 isGreaterThanZero = FIsGrtr ( length , eps ) ;
return V3Sel ( isGreaterThanZero , V3ScaleInv ( a , length ) , unsafeReturnValue ) ;
}
PX_FORCE_INLINE Vec3V V3Sel ( const BoolV c , const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( _mm_or_ps ( _mm_andnot_ps ( c , b ) , _mm_and_ps ( c , a ) ) ) ;
return _mm_or_ps ( _mm_andnot_ps ( c , b ) , _mm_and_ps ( c , a ) ) ;
}
PX_FORCE_INLINE BoolV V3IsGrtr ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_cmpgt_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV V3IsGrtrOrEq ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_cmpge_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV V3IsEq ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_cmpeq_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3Max ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_max_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec3V V3Min ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return _mm_min_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV V3ExtractMax ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 shuf1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ;
const __m128 shuf2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ;
const __m128 shuf3 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ;
return _mm_max_ps ( _mm_max_ps ( shuf1 , shuf2 ) , shuf3 ) ;
}
PX_FORCE_INLINE FloatV V3ExtractMin ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 shuf1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ;
const __m128 shuf2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ;
const __m128 shuf3 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ;
return _mm_min_ps ( _mm_min_ps ( shuf1 , shuf2 ) , shuf3 ) ;
}
// return (a >= 0.0f) ? 1.0f : -1.0f;
PX_FORCE_INLINE Vec3V V3Sign ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
const __m128 zero = V3Zero ( ) ;
const __m128 one = V3One ( ) ;
const __m128 none = V3Neg ( one ) ;
return V3Sel ( V3IsGrtrOrEq ( a , zero ) , one , none ) ;
}
PX_FORCE_INLINE Vec3V V3Clamp ( const Vec3V a , const Vec3V minV , const Vec3V maxV )
{
ASSERT_ISVALIDVEC3V ( maxV ) ;
ASSERT_ISVALIDVEC3V ( minV ) ;
return V3Max ( V3Min ( a , maxV ) , minV ) ;
}
PX_FORCE_INLINE PxU32 V3AllGrtr ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return internalUnitSSE2Simd :: BAllTrue3_R ( V4IsGrtr ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 V3AllGrtrOrEq ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return internalUnitSSE2Simd :: BAllTrue3_R ( V4IsGrtrOrEq ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 V3AllEq ( const Vec3V a , const Vec3V b )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( b ) ;
return internalUnitSSE2Simd :: BAllTrue3_R ( V4IsEq ( a , b ) ) ;
}
PX_FORCE_INLINE Vec3V V3Round ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
# ifdef __SSE4_2__
return _mm_round_ps ( a , _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) ;
# else
// return _mm_round_ps(a, 0x0);
const Vec3V half = V3Load ( 0.5f ) ;
const __m128 signBit = _mm_cvtepi32_ps ( _mm_srli_epi32 ( _mm_cvtps_epi32 ( a ) , 31 ) ) ;
const Vec3V aRound = V3Sub ( V3Add ( a , half ) , signBit ) ;
__m128i tmp = _mm_cvttps_epi32 ( aRound ) ;
return _mm_cvtepi32_ps ( tmp ) ;
# endif
}
PX_FORCE_INLINE Vec3V V3Sin ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
const Vec4V recipTwoPi = V4LoadA ( g_PXReciprocalTwoPi . f ) ;
const Vec4V twoPi = V4LoadA ( g_PXTwoPi . f ) ;
const Vec3V tmp = V3Scale ( a , recipTwoPi ) ;
const Vec3V b = V3Round ( tmp ) ;
const Vec3V V1 = V3NegScaleSub ( b , twoPi , a ) ;
// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
// V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
const Vec3V V2 = V3Mul ( V1 , V1 ) ;
const Vec3V V3 = V3Mul ( V2 , V1 ) ;
const Vec3V V5 = V3Mul ( V3 , V2 ) ;
const Vec3V V7 = V3Mul ( V5 , V2 ) ;
const Vec3V V9 = V3Mul ( V7 , V2 ) ;
const Vec3V V11 = V3Mul ( V9 , V2 ) ;
const Vec3V V13 = V3Mul ( V11 , V2 ) ;
const Vec3V V15 = V3Mul ( V13 , V2 ) ;
const Vec3V V17 = V3Mul ( V15 , V2 ) ;
const Vec3V V19 = V3Mul ( V17 , V2 ) ;
const Vec3V V21 = V3Mul ( V19 , V2 ) ;
const Vec3V V23 = V3Mul ( V21 , V2 ) ;
const Vec4V sinCoefficients0 = V4LoadA ( g_PXSinCoefficients0 . f ) ;
const Vec4V sinCoefficients1 = V4LoadA ( g_PXSinCoefficients1 . f ) ;
const Vec4V sinCoefficients2 = V4LoadA ( g_PXSinCoefficients2 . f ) ;
const FloatV S1 = V4GetY ( sinCoefficients0 ) ;
const FloatV S2 = V4GetZ ( sinCoefficients0 ) ;
const FloatV S3 = V4GetW ( sinCoefficients0 ) ;
const FloatV S4 = V4GetX ( sinCoefficients1 ) ;
const FloatV S5 = V4GetY ( sinCoefficients1 ) ;
const FloatV S6 = V4GetZ ( sinCoefficients1 ) ;
const FloatV S7 = V4GetW ( sinCoefficients1 ) ;
const FloatV S8 = V4GetX ( sinCoefficients2 ) ;
const FloatV S9 = V4GetY ( sinCoefficients2 ) ;
const FloatV S10 = V4GetZ ( sinCoefficients2 ) ;
const FloatV S11 = V4GetW ( sinCoefficients2 ) ;
Vec3V Result ;
Result = V3ScaleAdd ( V3 , S1 , V1 ) ;
Result = V3ScaleAdd ( V5 , S2 , Result ) ;
Result = V3ScaleAdd ( V7 , S3 , Result ) ;
Result = V3ScaleAdd ( V9 , S4 , Result ) ;
Result = V3ScaleAdd ( V11 , S5 , Result ) ;
Result = V3ScaleAdd ( V13 , S6 , Result ) ;
Result = V3ScaleAdd ( V15 , S7 , Result ) ;
Result = V3ScaleAdd ( V17 , S8 , Result ) ;
Result = V3ScaleAdd ( V19 , S9 , Result ) ;
Result = V3ScaleAdd ( V21 , S10 , Result ) ;
Result = V3ScaleAdd ( V23 , S11 , Result ) ;
ASSERT_ISVALIDVEC3V ( Result ) ;
return Result ;
}
PX_FORCE_INLINE Vec3V V3Cos ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
// Modulo the range of the given angles such that -XM_2PI <= Angles < XM_2PI
const Vec4V recipTwoPi = V4LoadA ( g_PXReciprocalTwoPi . f ) ;
const Vec4V twoPi = V4LoadA ( g_PXTwoPi . f ) ;
const Vec3V tmp = V3Scale ( a , recipTwoPi ) ;
const Vec3V b = V3Round ( tmp ) ;
const Vec3V V1 = V3NegScaleSub ( b , twoPi , a ) ;
// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
// V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
const Vec3V V2 = V3Mul ( V1 , V1 ) ;
const Vec3V V4 = V3Mul ( V2 , V2 ) ;
const Vec3V V6 = V3Mul ( V4 , V2 ) ;
const Vec3V V8 = V3Mul ( V4 , V4 ) ;
const Vec3V V10 = V3Mul ( V6 , V4 ) ;
const Vec3V V12 = V3Mul ( V6 , V6 ) ;
const Vec3V V14 = V3Mul ( V8 , V6 ) ;
const Vec3V V16 = V3Mul ( V8 , V8 ) ;
const Vec3V V18 = V3Mul ( V10 , V8 ) ;
const Vec3V V20 = V3Mul ( V10 , V10 ) ;
const Vec3V V22 = V3Mul ( V12 , V10 ) ;
const Vec4V cosCoefficients0 = V4LoadA ( g_PXCosCoefficients0 . f ) ;
const Vec4V cosCoefficients1 = V4LoadA ( g_PXCosCoefficients1 . f ) ;
const Vec4V cosCoefficients2 = V4LoadA ( g_PXCosCoefficients2 . f ) ;
const FloatV C1 = V4GetY ( cosCoefficients0 ) ;
const FloatV C2 = V4GetZ ( cosCoefficients0 ) ;
const FloatV C3 = V4GetW ( cosCoefficients0 ) ;
const FloatV C4 = V4GetX ( cosCoefficients1 ) ;
const FloatV C5 = V4GetY ( cosCoefficients1 ) ;
const FloatV C6 = V4GetZ ( cosCoefficients1 ) ;
const FloatV C7 = V4GetW ( cosCoefficients1 ) ;
const FloatV C8 = V4GetX ( cosCoefficients2 ) ;
const FloatV C9 = V4GetY ( cosCoefficients2 ) ;
const FloatV C10 = V4GetZ ( cosCoefficients2 ) ;
const FloatV C11 = V4GetW ( cosCoefficients2 ) ;
Vec3V Result ;
Result = V3ScaleAdd ( V2 , C1 , V3One ( ) ) ;
Result = V3ScaleAdd ( V4 , C2 , Result ) ;
Result = V3ScaleAdd ( V6 , C3 , Result ) ;
Result = V3ScaleAdd ( V8 , C4 , Result ) ;
Result = V3ScaleAdd ( V10 , C5 , Result ) ;
Result = V3ScaleAdd ( V12 , C6 , Result ) ;
Result = V3ScaleAdd ( V14 , C7 , Result ) ;
Result = V3ScaleAdd ( V16 , C8 , Result ) ;
Result = V3ScaleAdd ( V18 , C9 , Result ) ;
Result = V3ScaleAdd ( V20 , C10 , Result ) ;
Result = V3ScaleAdd ( V22 , C11 , Result ) ;
ASSERT_ISVALIDVEC3V ( Result ) ;
return Result ;
}
PX_FORCE_INLINE Vec3V V3PermYZZ ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 2 , 2 , 1 ) ) ;
}
PX_FORCE_INLINE Vec3V V3PermXYX ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 1 , 0 ) ) ;
}
PX_FORCE_INLINE Vec3V V3PermYZX ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ;
}
PX_FORCE_INLINE Vec3V V3PermZXY ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ;
}
PX_FORCE_INLINE Vec3V V3PermZZY ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 2 , 2 ) ) ;
}
PX_FORCE_INLINE Vec3V V3PermYXX ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 0 , 1 ) ) ;
}
PX_FORCE_INLINE Vec3V V3Perm_Zero_1Z_0Y ( const Vec3V v0 , const Vec3V v1 )
{
ASSERT_ISVALIDVEC3V ( v0 ) ;
ASSERT_ISVALIDVEC3V ( v1 ) ;
return _mm_shuffle_ps ( v1 , v0 , _MM_SHUFFLE ( 3 , 1 , 2 , 3 ) ) ;
}
PX_FORCE_INLINE Vec3V V3Perm_0Z_Zero_1X ( const Vec3V v0 , const Vec3V v1 )
{
ASSERT_ISVALIDVEC3V ( v0 ) ;
ASSERT_ISVALIDVEC3V ( v1 ) ;
return _mm_shuffle_ps ( v0 , v1 , _MM_SHUFFLE ( 3 , 0 , 3 , 2 ) ) ;
}
PX_FORCE_INLINE Vec3V V3Perm_1Y_0X_Zero ( const Vec3V v0 , const Vec3V v1 )
{
ASSERT_ISVALIDVEC3V ( v0 ) ;
ASSERT_ISVALIDVEC3V ( v1 ) ;
// There must be a better way to do this.
Vec3V v2 = V3Zero ( ) ;
FloatV y1 = V3GetY ( v1 ) ;
FloatV x0 = V3GetX ( v0 ) ;
v2 = V3SetX ( v2 , y1 ) ;
return V3SetY ( v2 , x0 ) ;
}
PX_FORCE_INLINE FloatV V3SumElems ( const Vec3V a )
{
ASSERT_ISVALIDVEC3V ( a ) ;
# ifdef __SSE4_2__
Vec3V r = _mm_hadd_ps ( a , a ) ;
r = _mm_hadd_ps ( r , r ) ;
return r ;
# else
__m128 shuf1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ; // x,x,x,x
__m128 shuf2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ; // y,y,y,y
__m128 shuf3 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ; // z,z,z,z
return _mm_add_ps ( _mm_add_ps ( shuf1 , shuf2 ) , shuf3 ) ;
# endif
}
PX_FORCE_INLINE PxU32 V3OutOfBounds ( const Vec3V a , const Vec3V min , const Vec3V max )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( min ) ;
ASSERT_ISVALIDVEC3V ( max ) ;
const BoolV c = BOr ( V3IsGrtr ( a , max ) , V3IsGrtr ( min , a ) ) ;
return ! BAllEqFFFF ( c ) ;
}
PX_FORCE_INLINE PxU32 V3InBounds ( const Vec3V a , const Vec3V min , const Vec3V max )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( min ) ;
ASSERT_ISVALIDVEC3V ( max ) ;
const BoolV c = BAnd ( V3IsGrtrOrEq ( a , min ) , V3IsGrtrOrEq ( max , a ) ) ;
return BAllEqTTTT ( c ) ;
}
PX_FORCE_INLINE PxU32 V3OutOfBounds ( const Vec3V a , const Vec3V bounds )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( bounds ) ;
return V3OutOfBounds ( a , V3Neg ( bounds ) , bounds ) ;
}
PX_FORCE_INLINE PxU32 V3InBounds ( const Vec3V a , const Vec3V bounds )
{
ASSERT_ISVALIDVEC3V ( a ) ;
ASSERT_ISVALIDVEC3V ( bounds ) ;
return V3InBounds ( a , V3Neg ( bounds ) , bounds ) ;
}
PX_FORCE_INLINE void V3Transpose ( Vec3V & col0 , Vec3V & col1 , Vec3V & col2 )
{
ASSERT_ISVALIDVEC3V ( col0 ) ;
ASSERT_ISVALIDVEC3V ( col1 ) ;
ASSERT_ISVALIDVEC3V ( col2 ) ;
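// Standard SSE 4x4 transpose (unpack + movelh/movehl) with an implicit zero fourth column; only the first three transposed columns are written back.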
const Vec3V col3 = _mm_setzero_ps ( ) ;
Vec3V tmp0 = _mm_unpacklo_ps ( col0 , col1 ) ;
Vec3V tmp2 = _mm_unpacklo_ps ( col2 , col3 ) ;
Vec3V tmp1 = _mm_unpackhi_ps ( col0 , col1 ) ;
Vec3V tmp3 = _mm_unpackhi_ps ( col2 , col3 ) ;
col0 = _mm_movelh_ps ( tmp0 , tmp2 ) ;
col1 = _mm_movehl_ps ( tmp2 , tmp0 ) ;
col2 = _mm_movelh_ps ( tmp1 , tmp3 ) ;
}
//////////////////////////////////
// VEC4V
//////////////////////////////////
PX_FORCE_INLINE Vec4V V4Splat ( const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
// return _mm_shuffle_ps(f, f, _MM_SHUFFLE(0,0,0,0));
return f ;
}
PX_FORCE_INLINE Vec4V V4Merge ( const FloatV * const floatVArray )
{
ASSERT_ISVALIDFLOATV ( floatVArray [ 0 ] ) ;
ASSERT_ISVALIDFLOATV ( floatVArray [ 1 ] ) ;
ASSERT_ISVALIDFLOATV ( floatVArray [ 2 ] ) ;
ASSERT_ISVALIDFLOATV ( floatVArray [ 3 ] ) ;
const __m128 xw = _mm_move_ss ( floatVArray [ 1 ] , floatVArray [ 0 ] ) ; // y, y, y, x
const __m128 yz = _mm_move_ss ( floatVArray [ 2 ] , floatVArray [ 3 ] ) ; // z, z, z, w
return _mm_shuffle_ps ( xw , yz , _MM_SHUFFLE ( 0 , 2 , 1 , 0 ) ) ;
}
PX_FORCE_INLINE Vec4V V4Merge ( const FloatVArg x , const FloatVArg y , const FloatVArg z , const FloatVArg w )
{
ASSERT_ISVALIDFLOATV ( x ) ;
ASSERT_ISVALIDFLOATV ( y ) ;
ASSERT_ISVALIDFLOATV ( z ) ;
ASSERT_ISVALIDFLOATV ( w ) ;
const __m128 xw = _mm_move_ss ( y , x ) ; // y, y, y, x
const __m128 yz = _mm_move_ss ( z , w ) ; // z, z, z, w
return _mm_shuffle_ps ( xw , yz , _MM_SHUFFLE ( 0 , 2 , 1 , 0 ) ) ;
}
PX_FORCE_INLINE Vec4V V4MergeW ( const Vec4VArg x , const Vec4VArg y , const Vec4VArg z , const Vec4VArg w )
{
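// unpackhi(x, z) = (x.z, z.z, x.w, z.w) and unpackhi(y, w) = (y.z, w.z, y.w, w.w); a second unpackhi yields (x.w, y.w, z.w, w.w).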
const Vec4V xz = _mm_unpackhi_ps ( x , z ) ;
const Vec4V yw = _mm_unpackhi_ps ( y , w ) ;
return _mm_unpackhi_ps ( xz , yw ) ;
}
PX_FORCE_INLINE Vec4V V4MergeZ ( const Vec4VArg x , const Vec4VArg y , const Vec4VArg z , const Vec4VArg w )
{
const Vec4V xz = _mm_unpackhi_ps ( x , z ) ;
const Vec4V yw = _mm_unpackhi_ps ( y , w ) ;
return _mm_unpacklo_ps ( xz , yw ) ;
}
PX_FORCE_INLINE Vec4V V4MergeY ( const Vec4VArg x , const Vec4VArg y , const Vec4VArg z , const Vec4VArg w )
{
const Vec4V xz = _mm_unpacklo_ps ( x , z ) ;
const Vec4V yw = _mm_unpacklo_ps ( y , w ) ;
return _mm_unpackhi_ps ( xz , yw ) ;
}
PX_FORCE_INLINE Vec4V V4MergeX ( const Vec4VArg x , const Vec4VArg y , const Vec4VArg z , const Vec4VArg w )
{
const Vec4V xz = _mm_unpacklo_ps ( x , z ) ;
const Vec4V yw = _mm_unpacklo_ps ( y , w ) ;
return _mm_unpacklo_ps ( xz , yw ) ;
}
PX_FORCE_INLINE Vec4V V4UnpackXY ( const Vec4VArg a , const Vec4VArg b )
{
return _mm_unpacklo_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4UnpackZW ( const Vec4VArg a , const Vec4VArg b )
{
return _mm_unpackhi_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4UnitW ( )
{
const PX_ALIGN ( 16 , PxF32 ) w [ 4 ] = { 0.0f , 0.0f , 0.0f , 1.0f } ;
const __m128 w128 = _mm_load_ps ( w ) ;
return w128 ;
}
PX_FORCE_INLINE Vec4V V4UnitX ( )
{
const PX_ALIGN ( 16 , PxF32 ) x [ 4 ] = { 1.0f , 0.0f , 0.0f , 0.0f } ;
const __m128 x128 = _mm_load_ps ( x ) ;
return x128 ;
}
PX_FORCE_INLINE Vec4V V4UnitY ( )
{
const PX_ALIGN ( 16 , PxF32 ) y [ 4 ] = { 0.0f , 1.0f , 0.0f , 0.0f } ;
const __m128 y128 = _mm_load_ps ( y ) ;
return y128 ;
}
PX_FORCE_INLINE Vec4V V4UnitZ ( )
{
const PX_ALIGN ( 16 , PxF32 ) z [ 4 ] = { 0.0f , 0.0f , 1.0f , 0.0f } ;
const __m128 z128 = _mm_load_ps ( z ) ;
return z128 ;
}
PX_FORCE_INLINE FloatV V4GetW ( const Vec4V f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) ;
}
PX_FORCE_INLINE FloatV V4GetX ( const Vec4V f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE FloatV V4GetY ( const Vec4V f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ;
}
PX_FORCE_INLINE FloatV V4GetZ ( const Vec4V f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ;
}
PX_FORCE_INLINE Vec4V V4SetW ( const Vec4V v , const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BTTTF ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec4V V4SetX ( const Vec4V v , const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BFTTT ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec4V V4SetY ( const Vec4V v , const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BTFTT ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec4V V4SetZ ( const Vec4V v , const FloatV f )
{
ASSERT_ISVALIDFLOATV ( f ) ;
return V4Sel ( BTTFT ( ) , v , f ) ;
}
PX_FORCE_INLINE Vec4V V4ClearW ( const Vec4V v )
{
# if !PX_EMSCRIPTEN
return _mm_and_ps ( v , V4LoadA ( internalUnitSSE2Simd :: gMaskXYZ ) ) ;
# else
return _mm_and_ps ( v , ( VecI32V & ) internalUnitSSE2Simd :: gMaskXYZ ) ;
# endif
}
PX_FORCE_INLINE Vec4V V4PermYXWZ ( const Vec4V a )
{
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 3 , 0 , 1 ) ) ;
}
PX_FORCE_INLINE Vec4V V4PermXZXZ ( const Vec4V a )
{
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 0 , 2 , 0 ) ) ;
}
PX_FORCE_INLINE Vec4V V4PermYWYW ( const Vec4V a )
{
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 3 , 1 ) ) ;
}
PX_FORCE_INLINE Vec4V V4PermYZXW ( const Vec4V a )
{
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ;
}
PX_FORCE_INLINE Vec4V V4PermZWXY ( const Vec4V a )
{
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ;
}
template < PxU8 x , PxU8 y , PxU8 z , PxU8 w >
PX_FORCE_INLINE Vec4V V4Perm ( const Vec4V a )
{
return _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( w , z , y , x ) ) ;
}
PX_FORCE_INLINE Vec4V V4Zero ( )
{
return V4Load ( 0.0f ) ;
}
PX_FORCE_INLINE Vec4V V4One ( )
{
return V4Load ( 1.0f ) ;
}
PX_FORCE_INLINE Vec4V V4Eps ( )
{
return V4Load ( PX_EPS_REAL ) ;
}
PX_FORCE_INLINE Vec4V V4Neg ( const Vec4V f )
{
return _mm_sub_ps ( _mm_setzero_ps ( ) , f ) ;
}
PX_FORCE_INLINE Vec4V V4Add ( const Vec4V a , const Vec4V b )
{
return _mm_add_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4Sub ( const Vec4V a , const Vec4V b )
{
return _mm_sub_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4Scale ( const Vec4V a , const FloatV b )
{
return _mm_mul_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4Mul ( const Vec4V a , const Vec4V b )
{
return _mm_mul_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4ScaleInv ( const Vec4V a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_div_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4Div ( const Vec4V a , const Vec4V b )
{
return _mm_div_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4ScaleInvFast ( const Vec4V a , const FloatV b )
{
ASSERT_ISVALIDFLOATV ( b ) ;
return _mm_mul_ps ( a , _mm_rcp_ps ( b ) ) ;
}
PX_FORCE_INLINE Vec4V V4DivFast ( const Vec4V a , const Vec4V b )
{
return _mm_mul_ps ( a , _mm_rcp_ps ( b ) ) ;
}
PX_FORCE_INLINE Vec4V V4Recip ( const Vec4V a )
{
return _mm_div_ps ( V4One ( ) , a ) ;
}
PX_FORCE_INLINE Vec4V V4RecipFast ( const Vec4V a )
{
return _mm_rcp_ps ( a ) ;
}
PX_FORCE_INLINE Vec4V V4Rsqrt ( const Vec4V a )
{
return _mm_div_ps ( V4One ( ) , _mm_sqrt_ps ( a ) ) ;
}
PX_FORCE_INLINE Vec4V V4RsqrtFast ( const Vec4V a )
{
return _mm_rsqrt_ps ( a ) ;
}
PX_FORCE_INLINE Vec4V V4Sqrt ( const Vec4V a )
{
return _mm_sqrt_ps ( a ) ;
}
PX_FORCE_INLINE Vec4V V4ScaleAdd ( const Vec4V a , const FloatV b , const Vec4V c )
{
ASSERT_ISVALIDFLOATV ( b ) ;
return V4Add ( V4Scale ( a , b ) , c ) ;
}
PX_FORCE_INLINE Vec4V V4NegScaleSub ( const Vec4V a , const FloatV b , const Vec4V c )
{
ASSERT_ISVALIDFLOATV ( b ) ;
return V4Sub ( c , V4Scale ( a , b ) ) ;
}
PX_FORCE_INLINE Vec4V V4MulAdd ( const Vec4V a , const Vec4V b , const Vec4V c )
{
return V4Add ( V4Mul ( a , b ) , c ) ;
}
PX_FORCE_INLINE Vec4V V4NegMulSub ( const Vec4V a , const Vec4V b , const Vec4V c )
{
return V4Sub ( c , V4Mul ( a , b ) ) ;
}
PX_FORCE_INLINE Vec4V V4Abs ( const Vec4V a )
{
return V4Max ( a , V4Neg ( a ) ) ;
}
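// Horizontal sum of all four lanes. The __SSE4_2__ path uses two _mm_hadd_ps passes; the SSE2 fallback
// unpacks the vector into (x,x,y,y) and (z,z,w,w), adds them, and then combines the x+z and y+w partial sums.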
PX_FORCE_INLINE FloatV V4SumElements ( const Vec4V a )
{
# ifdef __SSE4_2__
Vec4V r = _mm_hadd_ps ( a , a ) ;
r = _mm_hadd_ps ( r , r ) ;
return r ;
# else
const Vec4V xy = V4UnpackXY ( a , a ) ; // x,x,y,y
const Vec4V zw = V4UnpackZW ( a , a ) ; // z,z,w,w
const Vec4V xz_yw = V4Add ( xy , zw ) ; // x+z,x+z,y+w,y+w
const FloatV xz = V4GetX ( xz_yw ) ; // x+z
const FloatV yw = V4GetZ ( xz_yw ) ; // y+w
return FAdd ( xz , yw ) ; // sum
# endif
}
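// 4-component dot product. With SSE4.2 available this maps to a single DPPS instruction; otherwise the
// product vector is reduced with two shuffle/add steps, leaving the scalar result broadcast to every lane.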
PX_FORCE_INLINE FloatV V4Dot ( const Vec4V a , const Vec4V b )
{
# ifdef __SSE4_2__
return _mm_dp_ps ( a , b , 0xff ) ;
# else
//const __m128 dot1 = _mm_mul_ps(a, b); // x,y,z,w
//const __m128 shuf1 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(2, 1, 0, 3)); // w,x,y,z
//const __m128 shuf2 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(1, 0, 3, 2)); // z,w,x,y
//const __m128 shuf3 = _mm_shuffle_ps(dot1, dot1, _MM_SHUFFLE(0, 3, 2, 1)); // y,z,w,x
//return _mm_add_ps(_mm_add_ps(shuf2, shuf3), _mm_add_ps(dot1, shuf1));
// aw*bw | az*bz | ay*by | ax*bx
const __m128 t0 = _mm_mul_ps ( a , b ) ;
// ay*by | ax*bx | aw*bw | az*bz
const __m128 t1 = _mm_shuffle_ps ( t0 , t0 , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ;
// ay*by + aw*bw | ax*bx + az*bz | aw*bw + ay*by | az*bz + ax*bx
const __m128 t2 = _mm_add_ps ( t0 , t1 ) ;
// ax*bx + az*bz | ay*by + aw*bw | az*bz + ax*bx | aw*bw + ay*by
const __m128 t3 = _mm_shuffle_ps ( t2 , t2 , _MM_SHUFFLE ( 2 , 3 , 0 , 1 ) ) ;
// ax*bx + az*bz + ay*by + aw*bw
return _mm_add_ps ( t3 , t2 ) ;
# endif
}
PX_FORCE_INLINE FloatV V4Dot3 ( const Vec4V a , const Vec4V b )
{
# ifdef __SSE4_2__
return _mm_dp_ps ( a , b , 0x7f ) ;
# else
const __m128 dot1 = _mm_mul_ps ( a , b ) ; // aw*bw | az*bz | ay*by | ax*bx
const __m128 shuf1 = _mm_shuffle_ps ( dot1 , dot1 , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ; // ax*bx in every lane
const __m128 shuf2 = _mm_shuffle_ps ( dot1 , dot1 , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ; // ay*by in every lane
const __m128 shuf3 = _mm_shuffle_ps ( dot1 , dot1 , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ; // az*bz in every lane
return _mm_add_ps ( _mm_add_ps ( shuf1 , shuf2 ) , shuf3 ) ;
# endif
}
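// Cross product via the usual shuffle identity: a x b = a.yzx * b.zxy - a.zxy * b.yzx; the w lanes cancel.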
PX_FORCE_INLINE Vec4V V4Cross ( const Vec4V a , const Vec4V b )
{
const __m128 r1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
const __m128 r2 = _mm_shuffle_ps ( b , b , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
const __m128 l1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 3 , 0 , 2 , 1 ) ) ; // y,z,x,w
const __m128 l2 = _mm_shuffle_ps ( b , b , _MM_SHUFFLE ( 3 , 1 , 0 , 2 ) ) ; // z,x,y,w
return _mm_sub_ps ( _mm_mul_ps ( l1 , l2 ) , _mm_mul_ps ( r1 , r2 ) ) ;
}
PX_FORCE_INLINE FloatV V4Length ( const Vec4V a )
{
return _mm_sqrt_ps ( V4Dot ( a , a ) ) ;
}
PX_FORCE_INLINE FloatV V4LengthSq ( const Vec4V a )
{
return V4Dot ( a , a ) ;
}
PX_FORCE_INLINE Vec4V V4Normalize ( const Vec4V a )
{
ASSERT_ISFINITELENGTH ( a ) ;
return V4ScaleInv ( a , _mm_sqrt_ps ( V4Dot ( a , a ) ) ) ;
}
PX_FORCE_INLINE Vec4V V4NormalizeFast ( const Vec4V a )
{
ASSERT_ISFINITELENGTH ( a ) ;
return V4ScaleInvFast ( a , _mm_sqrt_ps ( V4Dot ( a , a ) ) ) ;
}
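// Safe normalize: if the length of a is not greater than the epsilon constant, the caller-supplied
// fallback value is returned instead of dividing by a (near-)zero length.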
PX_FORCE_INLINE Vec4V V4NormalizeSafe ( const Vec4V a , const Vec3V unsafeReturnValue )
{
const __m128 eps = V3Eps ( ) ;
const __m128 length = V4Length ( a ) ;
const __m128 isGreaterThanZero = V4IsGrtr ( length , eps ) ;
return V4Sel ( isGreaterThanZero , V4ScaleInv ( a , length ) , unsafeReturnValue ) ;
}
PX_FORCE_INLINE BoolV V4IsEqU32 ( const VecU32V a , const VecU32V b )
{
return m128_I2F ( _mm_cmpeq_epi32 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
PX_FORCE_INLINE Vec4V V4Sel ( const BoolV c , const Vec4V a , const Vec4V b )
{
return _mm_or_ps ( _mm_andnot_ps ( c , b ) , _mm_and_ps ( c , a ) ) ;
}
PX_FORCE_INLINE BoolV V4IsGrtr ( const Vec4V a , const Vec4V b )
{
return _mm_cmpgt_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV V4IsGrtrOrEq ( const Vec4V a , const Vec4V b )
{
return _mm_cmpge_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV V4IsEq ( const Vec4V a , const Vec4V b )
{
return _mm_cmpeq_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4Max ( const Vec4V a , const Vec4V b )
{
return _mm_max_ps ( a , b ) ;
}
PX_FORCE_INLINE Vec4V V4Min ( const Vec4V a , const Vec4V b )
{
return _mm_min_ps ( a , b ) ;
}
PX_FORCE_INLINE FloatV V4ExtractMax ( const Vec4V a )
{
const __m128 shuf1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 1 , 0 , 3 ) ) ;
const __m128 shuf2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ;
const __m128 shuf3 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 3 , 2 , 1 ) ) ;
return _mm_max_ps ( _mm_max_ps ( a , shuf1 ) , _mm_max_ps ( shuf2 , shuf3 ) ) ;
}
PX_FORCE_INLINE FloatV V4ExtractMin ( const Vec4V a )
{
const __m128 shuf1 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 1 , 0 , 3 ) ) ;
const __m128 shuf2 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ;
const __m128 shuf3 = _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 3 , 2 , 1 ) ) ;
return _mm_min_ps ( _mm_min_ps ( a , shuf1 ) , _mm_min_ps ( shuf2 , shuf3 ) ) ;
}
PX_FORCE_INLINE Vec4V V4Clamp ( const Vec4V a , const Vec4V minV , const Vec4V maxV )
{
return V4Max ( V4Min ( a , maxV ) , minV ) ;
}
PX_FORCE_INLINE PxU32 V4AllGrtr ( const Vec4V a , const Vec4V b )
{
return internalUnitSSE2Simd::BAllTrue4_R ( V4IsGrtr ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 V4AllGrtrOrEq ( const Vec4V a , const Vec4V b )
{
return internalUnitSSE2Simd::BAllTrue4_R ( V4IsGrtrOrEq ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 V4AllGrtrOrEq3 ( const Vec4V a , const Vec4V b )
{
return internalUnitSSE2Simd::BAllTrue3_R ( V4IsGrtrOrEq ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 V4AllEq ( const Vec4V a , const Vec4V b )
{
return internalUnitSSE2Simd::BAllTrue4_R ( V4IsEq ( a , b ) ) ;
}
PX_FORCE_INLINE PxU32 V4AnyGrtr3 ( const Vec4V a , const Vec4V b )
{
return internalUnitSSE2Simd::BAnyTrue3_R ( V4IsGrtr ( a , b ) ) ;
}
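// Round to nearest integer, returned as floats. The SSE4 path uses _mm_round_ps (round-half-to-even);
// the SSE2 fallback adds 0.5 (minus 1 when the input converts to a negative integer) and truncates,
// which behaves like round-half-away-from-zero, so the two paths can differ on exact .5 ties.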
PX_FORCE_INLINE Vec4V V4Round ( const Vec4V a )
{
# ifdef __SSE4_2__
return _mm_round_ps ( a , _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) ;
# else
// return _mm_round_ps(a, 0x0);
const Vec4V half = V4Load ( 0.5f ) ;
const __m128 signBit = _mm_cvtepi32_ps ( _mm_srli_epi32 ( _mm_cvtps_epi32 ( a ) , 31 ) ) ;
const Vec4V aRound = V4Sub ( V4Add ( a , half ) , signBit ) ;
__m128i tmp = _mm_cvttps_epi32 ( aRound ) ;
return _mm_cvtepi32_ps ( tmp ) ;
# endif
}
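// Vectorized sine: the argument is range-reduced to roughly [-PI, PI) by subtracting the nearest
// multiple of 2*PI (V4Round of a * 1/(2*PI)), then an odd-power polynomial using the tabulated
// g_PXSinCoefficients is evaluated with repeated V4MulAdd.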
PX_FORCE_INLINE Vec4V V4Sin ( const Vec4V a )
{
const Vec4V recipTwoPi = V4LoadA ( g_PXReciprocalTwoPi . f ) ;
const Vec4V twoPi = V4LoadA ( g_PXTwoPi . f ) ;
const Vec4V tmp = V4Mul ( a , recipTwoPi ) ;
const Vec4V b = V4Round ( tmp ) ;
const Vec4V V1 = V4NegMulSub ( twoPi , b , a ) ;
// sin(V) ~= V - V^3 / 3! + V^5 / 5! - V^7 / 7! + V^9 / 9! - V^11 / 11! + V^13 / 13! -
// V^15 / 15! + V^17 / 17! - V^19 / 19! + V^21 / 21! - V^23 / 23! (for -PI <= V < PI)
const Vec4V V2 = V4Mul ( V1 , V1 ) ;
const Vec4V V3 = V4Mul ( V2 , V1 ) ;
const Vec4V V5 = V4Mul ( V3 , V2 ) ;
const Vec4V V7 = V4Mul ( V5 , V2 ) ;
const Vec4V V9 = V4Mul ( V7 , V2 ) ;
const Vec4V V11 = V4Mul ( V9 , V2 ) ;
const Vec4V V13 = V4Mul ( V11 , V2 ) ;
const Vec4V V15 = V4Mul ( V13 , V2 ) ;
const Vec4V V17 = V4Mul ( V15 , V2 ) ;
const Vec4V V19 = V4Mul ( V17 , V2 ) ;
const Vec4V V21 = V4Mul ( V19 , V2 ) ;
const Vec4V V23 = V4Mul ( V21 , V2 ) ;
const Vec4V sinCoefficients0 = V4LoadA ( g_PXSinCoefficients0 . f ) ;
const Vec4V sinCoefficients1 = V4LoadA ( g_PXSinCoefficients1 . f ) ;
const Vec4V sinCoefficients2 = V4LoadA ( g_PXSinCoefficients2 . f ) ;
const FloatV S1 = V4GetY ( sinCoefficients0 ) ;
const FloatV S2 = V4GetZ ( sinCoefficients0 ) ;
const FloatV S3 = V4GetW ( sinCoefficients0 ) ;
const FloatV S4 = V4GetX ( sinCoefficients1 ) ;
const FloatV S5 = V4GetY ( sinCoefficients1 ) ;
const FloatV S6 = V4GetZ ( sinCoefficients1 ) ;
const FloatV S7 = V4GetW ( sinCoefficients1 ) ;
const FloatV S8 = V4GetX ( sinCoefficients2 ) ;
const FloatV S9 = V4GetY ( sinCoefficients2 ) ;
const FloatV S10 = V4GetZ ( sinCoefficients2 ) ;
const FloatV S11 = V4GetW ( sinCoefficients2 ) ;
Vec4V Result ;
Result = V4MulAdd ( S1 , V3 , V1 ) ;
Result = V4MulAdd ( S2 , V5 , Result ) ;
Result = V4MulAdd ( S3 , V7 , Result ) ;
Result = V4MulAdd ( S4 , V9 , Result ) ;
Result = V4MulAdd ( S5 , V11 , Result ) ;
Result = V4MulAdd ( S6 , V13 , Result ) ;
Result = V4MulAdd ( S7 , V15 , Result ) ;
Result = V4MulAdd ( S8 , V17 , Result ) ;
Result = V4MulAdd ( S9 , V19 , Result ) ;
Result = V4MulAdd ( S10 , V21 , Result ) ;
Result = V4MulAdd ( S11 , V23 , Result ) ;
return Result ;
}
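// Vectorized cosine: same range reduction as V4Sin, followed by an even-power polynomial starting
// from 1 and using the tabulated g_PXCosCoefficients.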
PX_FORCE_INLINE Vec4V V4Cos ( const Vec4V a )
{
const Vec4V recipTwoPi = V4LoadA ( g_PXReciprocalTwoPi . f ) ;
const Vec4V twoPi = V4LoadA ( g_PXTwoPi . f ) ;
const Vec4V tmp = V4Mul ( a , recipTwoPi ) ;
const Vec4V b = V4Round ( tmp ) ;
const Vec4V V1 = V4NegMulSub ( twoPi , b , a ) ;
// cos(V) ~= 1 - V^2 / 2! + V^4 / 4! - V^6 / 6! + V^8 / 8! - V^10 / 10! + V^12 / 12! -
// V^14 / 14! + V^16 / 16! - V^18 / 18! + V^20 / 20! - V^22 / 22! (for -PI <= V < PI)
const Vec4V V2 = V4Mul ( V1 , V1 ) ;
const Vec4V V4 = V4Mul ( V2 , V2 ) ;
const Vec4V V6 = V4Mul ( V4 , V2 ) ;
const Vec4V V8 = V4Mul ( V4 , V4 ) ;
const Vec4V V10 = V4Mul ( V6 , V4 ) ;
const Vec4V V12 = V4Mul ( V6 , V6 ) ;
const Vec4V V14 = V4Mul ( V8 , V6 ) ;
const Vec4V V16 = V4Mul ( V8 , V8 ) ;
const Vec4V V18 = V4Mul ( V10 , V8 ) ;
const Vec4V V20 = V4Mul ( V10 , V10 ) ;
const Vec4V V22 = V4Mul ( V12 , V10 ) ;
const Vec4V cosCoefficients0 = V4LoadA ( g_PXCosCoefficients0 . f ) ;
const Vec4V cosCoefficients1 = V4LoadA ( g_PXCosCoefficients1 . f ) ;
const Vec4V cosCoefficients2 = V4LoadA ( g_PXCosCoefficients2 . f ) ;
const FloatV C1 = V4GetY ( cosCoefficients0 ) ;
const FloatV C2 = V4GetZ ( cosCoefficients0 ) ;
const FloatV C3 = V4GetW ( cosCoefficients0 ) ;
const FloatV C4 = V4GetX ( cosCoefficients1 ) ;
const FloatV C5 = V4GetY ( cosCoefficients1 ) ;
const FloatV C6 = V4GetZ ( cosCoefficients1 ) ;
const FloatV C7 = V4GetW ( cosCoefficients1 ) ;
const FloatV C8 = V4GetX ( cosCoefficients2 ) ;
const FloatV C9 = V4GetY ( cosCoefficients2 ) ;
const FloatV C10 = V4GetZ ( cosCoefficients2 ) ;
const FloatV C11 = V4GetW ( cosCoefficients2 ) ;
Vec4V Result ;
Result = V4MulAdd ( C1 , V2 , V4One ( ) ) ;
Result = V4MulAdd ( C2 , V4 , Result ) ;
Result = V4MulAdd ( C3 , V6 , Result ) ;
Result = V4MulAdd ( C4 , V8 , Result ) ;
Result = V4MulAdd ( C5 , V10 , Result ) ;
Result = V4MulAdd ( C6 , V12 , Result ) ;
Result = V4MulAdd ( C7 , V14 , Result ) ;
Result = V4MulAdd ( C8 , V16 , Result ) ;
Result = V4MulAdd ( C9 , V18 , Result ) ;
Result = V4MulAdd ( C10 , V20 , Result ) ;
Result = V4MulAdd ( C11 , V22 , Result ) ;
return Result ;
}
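// In-place 4x4 transpose of four column vectors using the standard unpack/movelh/movehl pattern
// (the same scheme as _MM_TRANSPOSE4_PS).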
PX_FORCE_INLINE void V4Transpose ( Vec4V & col0 , Vec4V & col1 , Vec4V & col2 , Vec4V & col3 )
{
Vec4V tmp0 = _mm_unpacklo_ps ( col0 , col1 ) ;
Vec4V tmp2 = _mm_unpacklo_ps ( col2 , col3 ) ;
Vec4V tmp1 = _mm_unpackhi_ps ( col0 , col1 ) ;
Vec4V tmp3 = _mm_unpackhi_ps ( col2 , col3 ) ;
col0 = _mm_movelh_ps ( tmp0 , tmp2 ) ;
col1 = _mm_movehl_ps ( tmp2 , tmp0 ) ;
col2 = _mm_movelh_ps ( tmp1 , tmp3 ) ;
col3 = _mm_movehl_ps ( tmp3 , tmp1 ) ;
}
//////////////////////////////////
// BoolV
//////////////////////////////////
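// BoolV constants: B<x><y><z><w>() returns a per-lane mask where T means all 32 bits set and F means zero.
// Note that _mm_set_epi32 takes its arguments in (w, z, y, x) order, which is why the -1/0 patterns
// below appear reversed relative to the function names.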
PX_FORCE_INLINE BoolV BFFFF ( )
{
return _mm_setzero_ps ( ) ;
}
PX_FORCE_INLINE BoolV BFFFT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF};
const __m128 ffft = _mm_load_ps ( ( float * ) & f ) ;
return ffft ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , 0 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BFFTF ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0};
const __m128 fftf = _mm_load_ps ( ( float * ) & f ) ;
return fftf ; */
return m128_I2F ( _mm_set_epi32 ( 0 , - 1 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BFFTT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0xFFFFFFFF};
const __m128 fftt = _mm_load_ps ( ( float * ) & f ) ;
return fftt ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , - 1 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BFTFF ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0};
const __m128 ftff = _mm_load_ps ( ( float * ) & f ) ;
return ftff ; */
return m128_I2F ( _mm_set_epi32 ( 0 , 0 , - 1 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BFTFT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0xFFFFFFFF};
const __m128 ftft = _mm_load_ps ( ( float * ) & f ) ;
return ftft ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , 0 , - 1 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BFTTF ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0};
const __m128 fttf = _mm_load_ps ( ( float * ) & f ) ;
return fttf ; */
return m128_I2F ( _mm_set_epi32 ( 0 , - 1 , - 1 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BFTTT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
const __m128 fttt = _mm_load_ps ( ( float * ) & f ) ;
return fttt ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , - 1 , - 1 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BTFFF ( )
{
// const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0};
// const __m128 tfff=_mm_load_ps((float*)&f);
// return tfff;
return m128_I2F ( _mm_set_epi32 ( 0 , 0 , 0 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTFFT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0xFFFFFFFF};
const __m128 tfft = _mm_load_ps ( ( float * ) & f ) ;
return tfft ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , 0 , 0 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTFTF ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0};
const __m128 tftf = _mm_load_ps ( ( float * ) & f ) ;
return tftf ; */
return m128_I2F ( _mm_set_epi32 ( 0 , - 1 , 0 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTFTT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0xFFFFFFFF,0xFFFFFFFF};
const __m128 tftt = _mm_load_ps ( ( float * ) & f ) ;
return tftt ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , - 1 , 0 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTTFF ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0};
const __m128 ttff = _mm_load_ps ( ( float * ) & f ) ;
return ttff ; */
return m128_I2F ( _mm_set_epi32 ( 0 , 0 , - 1 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTTFT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0,0xFFFFFFFF};
const __m128 ttft = _mm_load_ps ( ( float * ) & f ) ;
return ttft ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , 0 , - 1 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTTTF ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0};
const __m128 tttf = _mm_load_ps ( ( float * ) & f ) ;
return tttf ; */
return m128_I2F ( _mm_set_epi32 ( 0 , - 1 , - 1 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BTTTT ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF};
const __m128 tttt = _mm_load_ps ( ( float * ) & f ) ;
return tttt ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , - 1 , - 1 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BXMask ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0xFFFFFFFF,0,0,0};
const __m128 tfff = _mm_load_ps ( ( float * ) & f ) ;
return tfff ; */
return m128_I2F ( _mm_set_epi32 ( 0 , 0 , 0 , - 1 ) ) ;
}
PX_FORCE_INLINE BoolV BYMask ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0xFFFFFFFF,0,0};
const __m128 ftff = _mm_load_ps ( ( float * ) & f ) ;
return ftff ; */
return m128_I2F ( _mm_set_epi32 ( 0 , 0 , - 1 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BZMask ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0,0xFFFFFFFF,0};
const __m128 fftf = _mm_load_ps ( ( float * ) & f ) ;
return fftf ; */
return m128_I2F ( _mm_set_epi32 ( 0 , - 1 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BWMask ( )
{
/*const PX_ALIGN(16, PxU32 f[4])={0,0,0,0xFFFFFFFF};
const __m128 ffft = _mm_load_ps ( ( float * ) & f ) ;
return ffft ; */
return m128_I2F ( _mm_set_epi32 ( - 1 , 0 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BGetX ( const BoolV f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ;
}
PX_FORCE_INLINE BoolV BGetY ( const BoolV f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ;
}
PX_FORCE_INLINE BoolV BGetZ ( const BoolV f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ;
}
PX_FORCE_INLINE BoolV BGetW ( const BoolV f )
{
return _mm_shuffle_ps ( f , f , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) ;
}
PX_FORCE_INLINE BoolV BSetX ( const BoolV v , const BoolV f )
{
return V4Sel ( BFTTT ( ) , v , f ) ;
}
PX_FORCE_INLINE BoolV BSetY ( const BoolV v , const BoolV f )
{
return V4Sel ( BTFTT ( ) , v , f ) ;
}
PX_FORCE_INLINE BoolV BSetZ ( const BoolV v , const BoolV f )
{
return V4Sel ( BTTFT ( ) , v , f ) ;
}
PX_FORCE_INLINE BoolV BSetW ( const BoolV v , const BoolV f )
{
return V4Sel ( BTTTF ( ) , v , f ) ;
}
PX_FORCE_INLINE BoolV BAnd ( const BoolV a , const BoolV b )
{
return _mm_and_ps ( a , b ) ;
}
PX_FORCE_INLINE BoolV BNot ( const BoolV a )
{
const BoolV bAllTrue ( BTTTT ( ) ) ;
return _mm_xor_ps ( a , bAllTrue ) ;
}
PX_FORCE_INLINE BoolV BAndNot ( const BoolV a , const BoolV b )
{
return _mm_andnot_ps ( b , a ) ;
}
PX_FORCE_INLINE BoolV BOr ( const BoolV a , const BoolV b )
{
return _mm_or_ps ( a , b ) ;
}
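// Mask reductions: BAllTrue4/BAnyTrue4 combine the four lanes with shuffled AND/OR operations and
// broadcast the result; the 3-component variants splat the z lane in place of w, so w never affects
// the outcome.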
PX_FORCE_INLINE BoolV BAllTrue4 ( const BoolV a )
{
const BoolV bTmp =
_mm_and_ps ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 1 , 0 , 1 ) ) , _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 3 , 2 , 3 ) ) ) ;
return _mm_and_ps ( _mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ,
_mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ) ;
}
PX_FORCE_INLINE BoolV BAnyTrue4 ( const BoolV a )
{
const BoolV bTmp =
_mm_or_ps ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 1 , 0 , 1 ) ) , _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 3 , 2 , 3 ) ) ) ;
return _mm_or_ps ( _mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ,
_mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ) ;
}
PX_FORCE_INLINE BoolV BAllTrue3 ( const BoolV a )
{
const BoolV bTmp =
_mm_and_ps ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 1 , 0 , 1 ) ) , _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ) ;
return _mm_and_ps ( _mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ,
_mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ) ;
}
PX_FORCE_INLINE BoolV BAnyTrue3 ( const BoolV a )
{
const BoolV bTmp =
_mm_or_ps ( _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 0 , 1 , 0 , 1 ) ) , _mm_shuffle_ps ( a , a , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ) ;
return _mm_or_ps ( _mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ,
_mm_shuffle_ps ( bTmp , bTmp , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ) ;
}
PX_FORCE_INLINE PxU32 BAllEq ( const BoolV a , const BoolV b )
{
const BoolV bTest = m128_I2F ( _mm_cmpeq_epi32 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
return internalUnitSSE2Simd::BAllTrue4_R ( bTest ) ;
}
PX_FORCE_INLINE PxU32 BAllEqTTTT ( const BoolV a )
{
return PxU32 ( _mm_movemask_ps ( a ) == 15 ) ;
}
PX_FORCE_INLINE PxU32 BAllEqFFFF ( const BoolV a )
{
return PxU32 ( _mm_movemask_ps ( a ) == 0 ) ;
}
PX_FORCE_INLINE PxU32 BGetBitMask ( const BoolV a )
{
return PxU32 ( _mm_movemask_ps ( a ) ) ;
}
//////////////////////////////////
// MAT33V
//////////////////////////////////
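// Mat33V is stored as three column vectors (col0, col1, col2). A matrix-vector product is therefore a
// linear combination of the columns weighted by the components of the input vector, as in M33MulV3 below.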
PX_FORCE_INLINE Vec3V M33MulV3 ( const Mat33V & a , const Vec3V b )
{
const FloatV x = V3GetX ( b ) ;
const FloatV y = V3GetY ( b ) ;
const FloatV z = V3GetZ ( b ) ;
const Vec3V v0 = V3Scale ( a . col0 , x ) ;
const Vec3V v1 = V3Scale ( a . col1 , y ) ;
const Vec3V v2 = V3Scale ( a . col2 , z ) ;
const Vec3V v0PlusV1 = V3Add ( v0 , v1 ) ;
return V3Add ( v0PlusV1 , v2 ) ;
}
PX_FORCE_INLINE Vec3V M33TrnspsMulV3 ( const Mat33V & a , const Vec3V b )
{
const FloatV x = V3Dot ( a . col0 , b ) ;
const FloatV y = V3Dot ( a . col1 , b ) ;
const FloatV z = V3Dot ( a . col2 , b ) ;
return V3Merge ( x , y , z ) ;
}
PX_FORCE_INLINE Vec3V M33MulV3AddV3 ( const Mat33V & A , const Vec3V b , const Vec3V c )
{
const FloatV x = V3GetX ( b ) ;
const FloatV y = V3GetY ( b ) ;
const FloatV z = V3GetZ ( b ) ;
Vec3V result = V3ScaleAdd ( A . col0 , x , c ) ;
result = V3ScaleAdd ( A . col1 , y , result ) ;
return V3ScaleAdd ( A . col2 , z , result ) ;
}
PX_FORCE_INLINE Mat33V M33MulM33 ( const Mat33V & a , const Mat33V & b )
{
return Mat33V ( M33MulV3 ( a , b . col0 ) , M33MulV3 ( a , b . col1 ) , M33MulV3 ( a , b . col2 ) ) ;
}
PX_FORCE_INLINE Mat33V M33Add ( const Mat33V & a , const Mat33V & b )
{
return Mat33V ( V3Add ( a . col0 , b . col0 ) , V3Add ( a . col1 , b . col1 ) , V3Add ( a . col2 , b . col2 ) ) ;
}
PX_FORCE_INLINE Mat33V M33Scale ( const Mat33V & a , const FloatV & b )
{
return Mat33V ( V3Scale ( a . col0 , b ) , V3Scale ( a . col1 , b ) , V3Scale ( a . col2 , b ) ) ;
}
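// 3x3 inverse via the adjugate: the rows of the inverse are col1 x col2, col2 x col0 and col0 x col1,
// scaled by 1/det where det = dot(col0 x col1, col2). Note that _mm_rcp_ps is only an approximation
// (roughly 12 bits of precision), so this trades accuracy for speed.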
PX_FORCE_INLINE Mat33V M33Inverse ( const Mat33V & a )
{
const BoolV tfft = BTFFT ( ) ;
const BoolV tttf = BTTTF ( ) ;
const FloatV zero = FZero ( ) ;
const Vec3V cross01 = V3Cross ( a . col0 , a . col1 ) ;
const Vec3V cross12 = V3Cross ( a . col1 , a . col2 ) ;
const Vec3V cross20 = V3Cross ( a . col2 , a . col0 ) ;
const FloatV dot = V3Dot ( cross01 , a . col2 ) ;
const FloatV invDet = _mm_rcp_ps ( dot ) ;
const Vec3V mergeh = _mm_unpacklo_ps ( cross12 , cross01 ) ;
const Vec3V mergel = _mm_unpackhi_ps ( cross12 , cross01 ) ;
Vec3V colInv0 = _mm_unpacklo_ps ( mergeh , cross20 ) ;
colInv0 = _mm_or_ps ( _mm_andnot_ps ( tttf , zero ) , _mm_and_ps ( tttf , colInv0 ) ) ;
const Vec3V zppd = _mm_shuffle_ps ( mergeh , cross20 , _MM_SHUFFLE ( 3 , 0 , 0 , 2 ) ) ;
const Vec3V pbwp = _mm_shuffle_ps ( cross20 , mergeh , _MM_SHUFFLE ( 3 , 3 , 1 , 0 ) ) ;
const Vec3V colInv1 = _mm_or_ps ( _mm_andnot_ps ( BTFFT ( ) , pbwp ) , _mm_and_ps ( BTFFT ( ) , zppd ) ) ;
const Vec3V xppd = _mm_shuffle_ps ( mergel , cross20 , _MM_SHUFFLE ( 3 , 0 , 0 , 0 ) ) ;
const Vec3V pcyp = _mm_shuffle_ps ( cross20 , mergel , _MM_SHUFFLE ( 3 , 1 , 2 , 0 ) ) ;
const Vec3V colInv2 = _mm_or_ps ( _mm_andnot_ps ( tfft , pcyp ) , _mm_and_ps ( tfft , xppd ) ) ;
return Mat33V ( _mm_mul_ps ( colInv0 , invDet ) , _mm_mul_ps ( colInv1 , invDet ) , _mm_mul_ps ( colInv2 , invDet ) ) ;
}
PX_FORCE_INLINE Mat33V M33Trnsps ( const Mat33V & a )
{
return Mat33V ( V3Merge ( V3GetX ( a . col0 ) , V3GetX ( a . col1 ) , V3GetX ( a . col2 ) ) ,
V3Merge ( V3GetY ( a . col0 ) , V3GetY ( a . col1 ) , V3GetY ( a . col2 ) ) ,
V3Merge ( V3GetZ ( a . col0 ) , V3GetZ ( a . col1 ) , V3GetZ ( a . col2 ) ) ) ;
}
PX_FORCE_INLINE Mat33V M33Identity ( )
{
return Mat33V ( V3UnitX ( ) , V3UnitY ( ) , V3UnitZ ( ) ) ;
}
PX_FORCE_INLINE Mat33V M33Sub ( const Mat33V & a , const Mat33V & b )
{
return Mat33V ( V3Sub ( a . col0 , b . col0 ) , V3Sub ( a . col1 , b . col1 ) , V3Sub ( a . col2 , b . col2 ) ) ;
}
PX_FORCE_INLINE Mat33V M33Neg ( const Mat33V & a )
{
return Mat33V ( V3Neg ( a . col0 ) , V3Neg ( a . col1 ) , V3Neg ( a . col2 ) ) ;
}
PX_FORCE_INLINE Mat33V M33Abs ( const Mat33V & a )
{
return Mat33V ( V3Abs ( a . col0 ) , V3Abs ( a . col1 ) , V3Abs ( a . col2 ) ) ;
}
PX_FORCE_INLINE Mat33V PromoteVec3V ( const Vec3V v )
{
const BoolV bTFFF = BTFFF ( ) ;
const BoolV bFTFF = BFTFF ( ) ;
const BoolV bFFTF = BFFTF ( ) ;
const Vec3V zero = V3Zero ( ) ;
return Mat33V ( V3Sel ( bTFFF , v , zero ) , V3Sel ( bFTFF , v , zero ) , V3Sel ( bFFTF , v , zero ) ) ;
}
PX_FORCE_INLINE Mat33V M33Diagonal ( const Vec3VArg d )
{
const Vec3V x = V3Mul ( V3UnitX ( ) , d ) ;
const Vec3V y = V3Mul ( V3UnitY ( ) , d ) ;
const Vec3V z = V3Mul ( V3UnitZ ( ) , d ) ;
return Mat33V ( x , y , z ) ;
}
//////////////////////////////////
// MAT34V
//////////////////////////////////
PX_FORCE_INLINE Vec3V M34MulV3 ( const Mat34V & a , const Vec3V b )
{
const FloatV x = V3GetX ( b ) ;
const FloatV y = V3GetY ( b ) ;
const FloatV z = V3GetZ ( b ) ;
const Vec3V v0 = V3Scale ( a . col0 , x ) ;
const Vec3V v1 = V3Scale ( a . col1 , y ) ;
const Vec3V v2 = V3Scale ( a . col2 , z ) ;
const Vec3V v0PlusV1 = V3Add ( v0 , v1 ) ;
const Vec3V v0PlusV1Plusv2 = V3Add ( v0PlusV1 , v2 ) ;
return V3Add ( v0PlusV1Plusv2 , a . col3 ) ;
}
PX_FORCE_INLINE Vec3V M34Mul33V3 ( const Mat34V & a , const Vec3V b )
{
const FloatV x = V3GetX ( b ) ;
const FloatV y = V3GetY ( b ) ;
const FloatV z = V3GetZ ( b ) ;
const Vec3V v0 = V3Scale ( a . col0 , x ) ;
const Vec3V v1 = V3Scale ( a . col1 , y ) ;
const Vec3V v2 = V3Scale ( a . col2 , z ) ;
const Vec3V v0PlusV1 = V3Add ( v0 , v1 ) ;
return V3Add ( v0PlusV1 , v2 ) ;
}
PX_FORCE_INLINE Vec3V M34TrnspsMul33V3 ( const Mat34V & a , const Vec3V b )
{
const FloatV x = V3Dot ( a . col0 , b ) ;
const FloatV y = V3Dot ( a . col1 , b ) ;
const FloatV z = V3Dot ( a . col2 , b ) ;
return V3Merge ( x , y , z ) ;
}
PX_FORCE_INLINE Mat34V M34MulM34 ( const Mat34V & a , const Mat34V & b )
{
return Mat34V ( M34Mul33V3 ( a , b . col0 ) , M34Mul33V3 ( a , b . col1 ) , M34Mul33V3 ( a , b . col2 ) , M34MulV3 ( a , b . col3 ) ) ;
}
PX_FORCE_INLINE Mat33V M34MulM33 ( const Mat34V & a , const Mat33V & b )
{
return Mat33V ( M34Mul33V3 ( a , b . col0 ) , M34Mul33V3 ( a , b . col1 ) , M34Mul33V3 ( a , b . col2 ) ) ;
}
PX_FORCE_INLINE Mat33V M34Mul33MM34 ( const Mat34V & a , const Mat34V & b )
{
return Mat33V ( M34Mul33V3 ( a , b . col0 ) , M34Mul33V3 ( a , b . col1 ) , M34Mul33V3 ( a , b . col2 ) ) ;
}
PX_FORCE_INLINE Mat34V M34Add ( const Mat34V & a , const Mat34V & b )
{
return Mat34V ( V3Add ( a . col0 , b . col0 ) , V3Add ( a . col1 , b . col1 ) , V3Add ( a . col2 , b . col2 ) , V3Add ( a . col3 , b . col3 ) ) ;
}
PX_FORCE_INLINE Mat33V M34Trnsps33 ( const Mat34V & a )
{
return Mat33V ( V3Merge ( V3GetX ( a . col0 ) , V3GetX ( a . col1 ) , V3GetX ( a . col2 ) ) ,
V3Merge ( V3GetY ( a . col0 ) , V3GetY ( a . col1 ) , V3GetY ( a . col2 ) ) ,
V3Merge ( V3GetZ ( a . col0 ) , V3GetZ ( a . col1 ) , V3GetZ ( a . col2 ) ) ) ;
}
//////////////////////////////////
// MAT44V
//////////////////////////////////
PX_FORCE_INLINE Vec4V M44MulV4 ( const Mat44V & a , const Vec4V b )
{
const FloatV x = V4GetX ( b ) ;
const FloatV y = V4GetY ( b ) ;
const FloatV z = V4GetZ ( b ) ;
const FloatV w = V4GetW ( b ) ;
const Vec4V v0 = V4Scale ( a . col0 , x ) ;
const Vec4V v1 = V4Scale ( a . col1 , y ) ;
const Vec4V v2 = V4Scale ( a . col2 , z ) ;
const Vec4V v3 = V4Scale ( a . col3 , w ) ;
const Vec4V v0PlusV1 = V4Add ( v0 , v1 ) ;
const Vec4V v0PlusV1Plusv2 = V4Add ( v0PlusV1 , v2 ) ;
return V4Add ( v0PlusV1Plusv2 , v3 ) ;
}
PX_FORCE_INLINE Vec4V M44TrnspsMulV4 ( const Mat44V & a , const Vec4V b )
{
PX_ALIGN ( 16 , FloatV ) dotProdArray [ 4 ] = { V4Dot ( a . col0 , b ) , V4Dot ( a . col1 , b ) , V4Dot ( a . col2 , b ) , V4Dot ( a . col3 , b ) } ;
return V4Merge ( dotProdArray ) ;
}
PX_FORCE_INLINE Mat44V M44MulM44 ( const Mat44V & a , const Mat44V & b )
{
return Mat44V ( M44MulV4 ( a , b . col0 ) , M44MulV4 ( a , b . col1 ) , M44MulV4 ( a , b . col2 ) , M44MulV4 ( a , b . col3 ) ) ;
}
PX_FORCE_INLINE Mat44V M44Add ( const Mat44V & a , const Mat44V & b )
{
return Mat44V ( V4Add ( a . col0 , b . col0 ) , V4Add ( a . col1 , b . col1 ) , V4Add ( a . col2 , b . col2 ) , V4Add ( a . col3 , b . col3 ) ) ;
}
PX_FORCE_INLINE Mat44V M44Trnsps ( const Mat44V & a )
{
const Vec4V v0 = _mm_unpacklo_ps ( a . col0 , a . col2 ) ;
const Vec4V v1 = _mm_unpackhi_ps ( a . col0 , a . col2 ) ;
const Vec4V v2 = _mm_unpacklo_ps ( a . col1 , a . col3 ) ;
const Vec4V v3 = _mm_unpackhi_ps ( a . col1 , a . col3 ) ;
return Mat44V ( _mm_unpacklo_ps ( v0 , v2 ) , _mm_unpackhi_ps ( v0 , v2 ) , _mm_unpacklo_ps ( v1 , v3 ) , _mm_unpackhi_ps ( v1 , v3 ) ) ;
}
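// 4x4 inverse via the classic Cramer's-rule cofactor expansion: the cofactors are accumulated in
// minor0..minor3 and scaled by the reciprocal of the determinant. The disabled #if 0 branch is a
// Newton-Raphson refinement of the _mm_rcp_ss estimate.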
PX_FORCE_INLINE Mat44V M44Inverse ( const Mat44V & a )
{
__m128 minor0 , minor1 , minor2 , minor3 ;
__m128 row0 , row1 , row2 , row3 ;
__m128 det , tmp1 ;
tmp1 = V4Zero ( ) ;
row1 = V4Zero ( ) ;
row3 = V4Zero ( ) ;
row0 = a . col0 ;
row1 = _mm_shuffle_ps ( a . col1 , a . col1 , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ;
row2 = a . col2 ;
row3 = _mm_shuffle_ps ( a . col3 , a . col3 , _MM_SHUFFLE ( 1 , 0 , 3 , 2 ) ) ;
tmp1 = _mm_mul_ps ( row2 , row3 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0xB1 ) ;
minor0 = _mm_mul_ps ( row1 , tmp1 ) ;
minor1 = _mm_mul_ps ( row0 , tmp1 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0x4E ) ;
minor0 = _mm_sub_ps ( _mm_mul_ps ( row1 , tmp1 ) , minor0 ) ;
minor1 = _mm_sub_ps ( _mm_mul_ps ( row0 , tmp1 ) , minor1 ) ;
minor1 = _mm_shuffle_ps ( minor1 , minor1 , 0x4E ) ;
tmp1 = _mm_mul_ps ( row1 , row2 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0xB1 ) ;
minor0 = _mm_add_ps ( _mm_mul_ps ( row3 , tmp1 ) , minor0 ) ;
minor3 = _mm_mul_ps ( row0 , tmp1 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0x4E ) ;
minor0 = _mm_sub_ps ( minor0 , _mm_mul_ps ( row3 , tmp1 ) ) ;
minor3 = _mm_sub_ps ( _mm_mul_ps ( row0 , tmp1 ) , minor3 ) ;
minor3 = _mm_shuffle_ps ( minor3 , minor3 , 0x4E ) ;
tmp1 = _mm_mul_ps ( _mm_shuffle_ps ( row1 , row1 , 0x4E ) , row3 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0xB1 ) ;
row2 = _mm_shuffle_ps ( row2 , row2 , 0x4E ) ;
minor0 = _mm_add_ps ( _mm_mul_ps ( row2 , tmp1 ) , minor0 ) ;
minor2 = _mm_mul_ps ( row0 , tmp1 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0x4E ) ;
minor0 = _mm_sub_ps ( minor0 , _mm_mul_ps ( row2 , tmp1 ) ) ;
minor2 = _mm_sub_ps ( _mm_mul_ps ( row0 , tmp1 ) , minor2 ) ;
minor2 = _mm_shuffle_ps ( minor2 , minor2 , 0x4E ) ;
tmp1 = _mm_mul_ps ( row0 , row1 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0xB1 ) ;
minor2 = _mm_add_ps ( _mm_mul_ps ( row3 , tmp1 ) , minor2 ) ;
minor3 = _mm_sub_ps ( _mm_mul_ps ( row2 , tmp1 ) , minor3 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0x4E ) ;
minor2 = _mm_sub_ps ( _mm_mul_ps ( row3 , tmp1 ) , minor2 ) ;
minor3 = _mm_sub_ps ( minor3 , _mm_mul_ps ( row2 , tmp1 ) ) ;
tmp1 = _mm_mul_ps ( row0 , row3 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0xB1 ) ;
minor1 = _mm_sub_ps ( minor1 , _mm_mul_ps ( row2 , tmp1 ) ) ;
minor2 = _mm_add_ps ( _mm_mul_ps ( row1 , tmp1 ) , minor2 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0x4E ) ;
minor1 = _mm_add_ps ( _mm_mul_ps ( row2 , tmp1 ) , minor1 ) ;
minor2 = _mm_sub_ps ( minor2 , _mm_mul_ps ( row1 , tmp1 ) ) ;
tmp1 = _mm_mul_ps ( row0 , row2 ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0xB1 ) ;
minor1 = _mm_add_ps ( _mm_mul_ps ( row3 , tmp1 ) , minor1 ) ;
minor3 = _mm_sub_ps ( minor3 , _mm_mul_ps ( row1 , tmp1 ) ) ;
tmp1 = _mm_shuffle_ps ( tmp1 , tmp1 , 0x4E ) ;
minor1 = _mm_sub_ps ( minor1 , _mm_mul_ps ( row3 , tmp1 ) ) ;
minor3 = _mm_add_ps ( _mm_mul_ps ( row1 , tmp1 ) , minor3 ) ;
det = _mm_mul_ps ( row0 , minor0 ) ;
det = _mm_add_ps ( _mm_shuffle_ps ( det , det , 0x4E ) , det ) ;
det = _mm_add_ss ( _mm_shuffle_ps ( det , det , 0xB1 ) , det ) ;
tmp1 = _mm_rcp_ss ( det ) ;
#if 0
det = _mm_sub_ss ( _mm_add_ss ( tmp1 , tmp1 ) , _mm_mul_ss ( det , _mm_mul_ss ( tmp1 , tmp1 ) ) ) ;
det = _mm_shuffle_ps ( det , det , 0x00 ) ;
# else
det = _mm_shuffle_ps ( tmp1 , tmp1 , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ;
# endif
minor0 = _mm_mul_ps ( det , minor0 ) ;
minor1 = _mm_mul_ps ( det , minor1 ) ;
minor2 = _mm_mul_ps ( det , minor2 ) ;
minor3 = _mm_mul_ps ( det , minor3 ) ;
Mat44V invTrans ( minor0 , minor1 , minor2 , minor3 ) ;
return M44Trnsps ( invTrans ) ;
}
PX_FORCE_INLINE Vec4V V4LoadXYZW ( const PxF32 & x , const PxF32 & y , const PxF32 & z , const PxF32 & w )
{
return _mm_set_ps ( w , z , y , x ) ;
}
/*
// AP: work in progress - use proper SSE intrinsics where possible
PX_FORCE_INLINE VecU16V V4U32PK ( VecU32V a , VecU32V b )
{
VecU16V result ;
result . m128_u16 [ 0 ] = PxU16 ( PxClamp < PxU32 > ( ( a ) . m128_u32 [ 0 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 1 ] = PxU16 ( PxClamp < PxU32 > ( ( a ) . m128_u32 [ 1 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 2 ] = PxU16 ( PxClamp < PxU32 > ( ( a ) . m128_u32 [ 2 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 3 ] = PxU16 ( PxClamp < PxU32 > ( ( a ) . m128_u32 [ 3 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 4 ] = PxU16 ( PxClamp < PxU32 > ( ( b ) . m128_u32 [ 0 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 5 ] = PxU16 ( PxClamp < PxU32 > ( ( b ) . m128_u32 [ 1 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 6 ] = PxU16 ( PxClamp < PxU32 > ( ( b ) . m128_u32 [ 2 ] , 0 , 0xFFFF ) ) ;
result . m128_u16 [ 7 ] = PxU16 ( PxClamp < PxU32 > ( ( b ) . m128_u32 [ 3 ] , 0 , 0xFFFF ) ) ;
return result ;
}
*/
PX_FORCE_INLINE VecU32V V4U32Sel ( const BoolV c , const VecU32V a , const VecU32V b )
{
return m128_I2F ( _mm_or_si128 ( _mm_andnot_si128 ( m128_F2I ( c ) , m128_F2I ( b ) ) , _mm_and_si128 ( m128_F2I ( c ) , m128_F2I ( a ) ) ) ) ;
}
PX_FORCE_INLINE VecU32V V4U32or ( VecU32V a , VecU32V b )
{
return m128_I2F ( _mm_or_si128 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
PX_FORCE_INLINE VecU32V V4U32xor ( VecU32V a , VecU32V b )
{
return m128_I2F ( _mm_xor_si128 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
PX_FORCE_INLINE VecU32V V4U32and ( VecU32V a , VecU32V b )
{
return m128_I2F ( _mm_and_si128 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
PX_FORCE_INLINE VecU32V V4U32Andc ( VecU32V a , VecU32V b )
{
return m128_I2F ( _mm_andnot_si128 ( m128_F2I ( b ) , m128_F2I ( a ) ) ) ;
}
/*
PX_FORCE_INLINE VecU16V V4U16Or ( VecU16V a , VecU16V b )
{
return m128_I2F ( _mm_or_si128 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
*/
/*
PX_FORCE_INLINE VecU16V V4U16And ( VecU16V a , VecU16V b )
{
return m128_I2F ( _mm_and_si128 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
*/
/*
PX_FORCE_INLINE VecU16V V4U16Andc ( VecU16V a , VecU16V b )
{
return m128_I2F ( _mm_andnot_si128 ( m128_F2I ( b ) , m128_F2I ( a ) ) ) ;
}
*/
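// The I4Load* helpers load integer data through the float load intrinsics and reinterpret the bits
// via _mm_castps_si128; no numeric int/float conversion takes place.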
PX_FORCE_INLINE VecI32V I4Load ( const PxI32 i )
{
return m128_F2I ( _mm_load1_ps ( reinterpret_cast < const PxF32 * > ( & i ) ) ) ;
}
PX_FORCE_INLINE VecI32V I4LoadU ( const PxI32 * i )
{
return m128_F2I ( _mm_loadu_ps ( reinterpret_cast < const PxF32 * > ( i ) ) ) ;
}
PX_FORCE_INLINE VecI32V I4LoadA ( const PxI32 * i )
{
return m128_F2I ( _mm_load_ps ( reinterpret_cast < const PxF32 * > ( i ) ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Add ( const VecI32VArg a , const VecI32VArg b )
{
return _mm_add_epi32 ( a , b ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Sub ( const VecI32VArg a , const VecI32VArg b )
{
return _mm_sub_epi32 ( a , b ) ;
}
PX_FORCE_INLINE BoolV VecI32V_IsGrtr ( const VecI32VArg a , const VecI32VArg b )
{
return m128_I2F ( _mm_cmpgt_epi32 ( a , b ) ) ;
}
PX_FORCE_INLINE BoolV VecI32V_IsEq ( const VecI32VArg a , const VecI32VArg b )
{
return m128_I2F ( _mm_cmpeq_epi32 ( a , b ) ) ;
}
PX_FORCE_INLINE VecI32V V4I32Sel ( const BoolV c , const VecI32V a , const VecI32V b )
{
return _mm_or_si128 ( _mm_andnot_si128 ( m128_F2I ( c ) , b ) , _mm_and_si128 ( m128_F2I ( c ) , a ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Zero ( )
{
return _mm_setzero_si128 ( ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_One ( )
{
return I4Load ( 1 ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Two ( )
{
return I4Load ( 2 ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_MinusOne ( )
{
return I4Load ( - 1 ) ;
}
PX_FORCE_INLINE VecU32V U4Zero ( )
{
return U4Load ( 0 ) ;
}
PX_FORCE_INLINE VecU32V U4One ( )
{
return U4Load ( 1 ) ;
}
PX_FORCE_INLINE VecU32V U4Two ( )
{
return U4Load ( 2 ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Sel ( const BoolV c , const VecI32VArg a , const VecI32VArg b )
{
return _mm_or_si128 ( _mm_andnot_si128 ( m128_F2I ( c ) , b ) , _mm_and_si128 ( m128_F2I ( c ) , a ) ) ;
}
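// _mm_sll_epi32/_mm_srl_epi32 shift every lane by the count held in the low 64 bits of their second
// operand, so VecI32V_PrepareShift keeps only the x component of the requested shift and zeroes the rest.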
PX_FORCE_INLINE VecShiftV VecI32V_PrepareShift ( const VecI32VArg shift )
{
VecShiftV s ;
s . shift = VecI32V_Sel ( BTFFF ( ) , shift , VecI32V_Zero ( ) ) ;
return s ;
}
PX_FORCE_INLINE VecI32V VecI32V_LeftShift ( const VecI32VArg a , const VecShiftVArg count )
{
return _mm_sll_epi32 ( a , count . shift ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_RightShift ( const VecI32VArg a , const VecShiftVArg count )
{
return _mm_srl_epi32 ( a , count . shift ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_LeftShift ( const VecI32VArg a , const PxU32 count )
{
return _mm_slli_epi32 ( a , PxI32 ( count ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_RightShift ( const VecI32VArg a , const PxU32 count )
{
return _mm_srai_epi32 ( a , PxI32 ( count ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_And ( const VecI32VArg a , const VecI32VArg b )
{
return _mm_and_si128 ( a , b ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Or ( const VecI32VArg a , const VecI32VArg b )
{
return _mm_or_si128 ( a , b ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_GetX ( const VecI32VArg a )
{
return m128_F2I ( _mm_shuffle_ps ( m128_I2F ( a ) , m128_I2F ( a ) , _MM_SHUFFLE ( 0 , 0 , 0 , 0 ) ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_GetY ( const VecI32VArg a )
{
return m128_F2I ( _mm_shuffle_ps ( m128_I2F ( a ) , m128_I2F ( a ) , _MM_SHUFFLE ( 1 , 1 , 1 , 1 ) ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_GetZ ( const VecI32VArg a )
{
return m128_F2I ( _mm_shuffle_ps ( m128_I2F ( a ) , m128_I2F ( a ) , _MM_SHUFFLE ( 2 , 2 , 2 , 2 ) ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_GetW ( const VecI32VArg a )
{
return m128_F2I ( _mm_shuffle_ps ( m128_I2F ( a ) , m128_I2F ( a ) , _MM_SHUFFLE ( 3 , 3 , 3 , 3 ) ) ) ;
}
PX_FORCE_INLINE void PxI32_From_VecI32V ( const VecI32VArg a , PxI32 * i )
{
_mm_store_ss ( reinterpret_cast < PxF32 * > ( i ) , m128_I2F ( a ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_Merge ( const VecI32VArg x , const VecI32VArg y , const VecI32VArg z , const VecI32VArg w )
{
const __m128 xw = _mm_move_ss ( m128_I2F ( y ) , m128_I2F ( x ) ) ; // y, y, y, x
const __m128 yz = _mm_move_ss ( m128_I2F ( z ) , m128_I2F ( w ) ) ; // z, z, z, w
return m128_F2I ( _mm_shuffle_ps ( xw , yz , _MM_SHUFFLE ( 0 , 2 , 1 , 0 ) ) ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_From_BoolV ( const BoolVArg a )
{
return m128_F2I ( a ) ;
}
PX_FORCE_INLINE VecU32V VecU32V_From_BoolV ( const BoolVArg a )
{
return a ;
}
/*
template < int a > PX_FORCE_INLINE VecI32V V4ISplat ( )
{
VecI32V result ;
result . m128_i32 [ 0 ] = a ;
result . m128_i32 [ 1 ] = a ;
result . m128_i32 [ 2 ] = a ;
result . m128_i32 [ 3 ] = a ;
return result ;
}
template < PxU32 a > PX_FORCE_INLINE VecU32V V4USplat ( )
{
VecU32V result ;
result . m128_u32 [ 0 ] = a ;
result . m128_u32 [ 1 ] = a ;
result . m128_u32 [ 2 ] = a ;
result . m128_u32 [ 3 ] = a ;
return result ;
}
*/
/*
PX_FORCE_INLINE void V4U16StoreAligned ( VecU16V val , VecU16V * address )
{
* address = val ;
}
*/
PX_FORCE_INLINE void V4U32StoreAligned ( VecU32V val , VecU32V * address )
{
* address = val ;
}
PX_FORCE_INLINE Vec4V V4LoadAligned ( Vec4V * addr )
{
return * addr ;
}
PX_FORCE_INLINE Vec4V V4LoadUnaligned ( Vec4V * addr )
{
return V4LoadU ( reinterpret_cast < float * > ( addr ) ) ;
}
PX_FORCE_INLINE Vec4V V4Andc ( const Vec4V a , const VecU32V b )
{
VecU32V result32 ( a ) ;
result32 = V4U32Andc ( result32 , b ) ;
return Vec4V ( result32 ) ;
}
PX_FORCE_INLINE VecU32V V4IsGrtrV32u ( const Vec4V a , const Vec4V b )
{
return V4IsGrtr ( a , b ) ;
}
PX_FORCE_INLINE VecU16V V4U16LoadAligned ( VecU16V * addr )
{
return * addr ;
}
PX_FORCE_INLINE VecU16V V4U16LoadUnaligned ( VecU16V * addr )
{
return * addr ;
}
PX_FORCE_INLINE VecU16V V4U16CompareGt ( VecU16V a , VecU16V b )
{
// _mm_cmpgt_epi16 doesn't work for unsigned values unfortunately
// return m128_I2F(_mm_cmpgt_epi16(m128_F2I(a), m128_F2I(b)));
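// Note: the scalar fallback below stores 0 or 1 per lane rather than a saturated 0xFFFF mask.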
VecU16V result ;
result . m128_u16 [ 0 ] = ( a ) . m128_u16 [ 0 ] > ( b ) . m128_u16 [ 0 ] ;
result . m128_u16 [ 1 ] = ( a ) . m128_u16 [ 1 ] > ( b ) . m128_u16 [ 1 ] ;
result . m128_u16 [ 2 ] = ( a ) . m128_u16 [ 2 ] > ( b ) . m128_u16 [ 2 ] ;
result . m128_u16 [ 3 ] = ( a ) . m128_u16 [ 3 ] > ( b ) . m128_u16 [ 3 ] ;
result . m128_u16 [ 4 ] = ( a ) . m128_u16 [ 4 ] > ( b ) . m128_u16 [ 4 ] ;
result . m128_u16 [ 5 ] = ( a ) . m128_u16 [ 5 ] > ( b ) . m128_u16 [ 5 ] ;
result . m128_u16 [ 6 ] = ( a ) . m128_u16 [ 6 ] > ( b ) . m128_u16 [ 6 ] ;
result . m128_u16 [ 7 ] = ( a ) . m128_u16 [ 7 ] > ( b ) . m128_u16 [ 7 ] ;
return result ;
}
PX_FORCE_INLINE VecU16V V4I16CompareGt ( VecU16V a , VecU16V b )
{
return m128_I2F ( _mm_cmpgt_epi16 ( m128_F2I ( a ) , m128_F2I ( b ) ) ) ;
}
PX_FORCE_INLINE Vec4V Vec4V_From_VecU32V ( VecU32V a )
{
Vec4V result = V4LoadXYZW ( PxF32 ( a . m128_u32 [ 0 ] ) , PxF32 ( a . m128_u32 [ 1 ] ) , PxF32 ( a . m128_u32 [ 2 ] ) , PxF32 ( a . m128_u32 [ 3 ] ) ) ;
return result ;
}
PX_FORCE_INLINE Vec4V Vec4V_From_VecI32V ( VecI32V in )
{
return _mm_cvtepi32_ps ( in ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_From_Vec4V ( Vec4V a )
{
return _mm_cvttps_epi32 ( a ) ;
}
PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecU32V ( VecU32V a )
{
return Vec4V ( a ) ;
}
PX_FORCE_INLINE Vec4V Vec4V_ReinterpretFrom_VecI32V ( VecI32V a )
{
return m128_I2F ( a ) ;
}
PX_FORCE_INLINE VecU32V VecU32V_ReinterpretFrom_Vec4V ( Vec4V a )
{
return VecU32V ( a ) ;
}
PX_FORCE_INLINE VecI32V VecI32V_ReinterpretFrom_Vec4V ( Vec4V a )
{
return m128_F2I ( a ) ;
}
/*
template < int index > PX_FORCE_INLINE BoolV BSplatElement ( BoolV a )
{
BoolV result ;
result [ 0 ] = result [ 1 ] = result [ 2 ] = result [ 3 ] = a [ index ] ;
return result ;
}
*/
template < int index >
BoolV BSplatElement ( BoolV a )
{
float * data = reinterpret_cast < float * > ( & a ) ;
return V4Load ( data [ index ] ) ;
}
template < int index >
PX_FORCE_INLINE VecU32V V4U32SplatElement ( VecU32V a )
{
VecU32V result ;
result . m128_u32 [ 0 ] = result . m128_u32 [ 1 ] = result . m128_u32 [ 2 ] = result . m128_u32 [ 3 ] = a . m128_u32 [ index ] ;
return result ;
}
template < int index >
PX_FORCE_INLINE Vec4V V4SplatElement ( Vec4V a )
{
float * data = reinterpret_cast < float * > ( & a ) ;
return V4Load ( data [ index ] ) ;
}
PX_FORCE_INLINE VecU32V U4LoadXYZW ( PxU32 x , PxU32 y , PxU32 z , PxU32 w )
{
VecU32V result ;
result . m128_u32 [ 0 ] = x ;
result . m128_u32 [ 1 ] = y ;
result . m128_u32 [ 2 ] = z ;
result . m128_u32 [ 3 ] = w ;
return result ;
}
PX_FORCE_INLINE Vec4V V4Ceil ( const Vec4V in )
{
UnionM128 a ( in ) ;
return V4LoadXYZW ( PxCeil ( a . m128_f32 [ 0 ] ) , PxCeil ( a . m128_f32 [ 1 ] ) , PxCeil ( a . m128_f32 [ 2 ] ) , PxCeil ( a . m128_f32 [ 3 ] ) ) ;
}
PX_FORCE_INLINE Vec4V V4Floor ( const Vec4V in )
{
UnionM128 a ( in ) ;
return V4LoadXYZW ( PxFloor ( a . m128_f32 [ 0 ] ) , PxFloor ( a . m128_f32 [ 1 ] ) , PxFloor ( a . m128_f32 [ 2 ] ) , PxFloor ( a . m128_f32 [ 3 ] ) ) ;
}
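// Clamps each lane to [0, 0xFFFF0000] (as a float) and converts to unsigned integers lane by lane;
// only power == 0 is supported, as the assertion below enforces.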
PX_FORCE_INLINE VecU32V V4ConvertToU32VSaturate ( const Vec4V in , PxU32 power )
{
PX_ASSERT ( power == 0 && "Non-zero power not supported in convertToU32VSaturate" ) ;
PX_UNUSED ( power ) ; // prevent warning in release builds
PxF32 ffffFFFFasFloat = PxF32 ( 0xFFFF0000 ) ;
UnionM128 a ( in ) ;
VecU32V result ;
result . m128_u32 [ 0 ] = PxU32 ( PxClamp < PxF32 > ( ( a ) . m128_f32 [ 0 ] , 0.0f , ffffFFFFasFloat ) ) ;
result . m128_u32 [ 1 ] = PxU32 ( PxClamp < PxF32 > ( ( a ) . m128_f32 [ 1 ] , 0.0f , ffffFFFFasFloat ) ) ;
result . m128_u32 [ 2 ] = PxU32 ( PxClamp < PxF32 > ( ( a ) . m128_f32 [ 2 ] , 0.0f , ffffFFFFasFloat ) ) ;
result . m128_u32 [ 3 ] = PxU32 ( PxClamp < PxF32 > ( ( a ) . m128_f32 [ 3 ] , 0.0f , ffffFFFFasFloat ) ) ;
return result ;
}
} // namespace aos
} // namespace physx
# endif // PXFOUNDATION_PXUNIXSSE2INLINEAOS_H