Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: pacpussensors/trunk/Vislab/lib3dv/eigen/Eigen/src/Core/arch/SSE/PacketMath.h@ 136

Last change on this file since 136 was 136, checked in by ldecherf, 7 years ago
Doc
File size: 24.2 KB

Rev	Line
[136]	1	// This file is part of Eigen, a lightweight C++ template library
	2	// for linear algebra.
	3	//
	4	// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
	5	//
	6	// This Source Code Form is subject to the terms of the Mozilla
	7	// Public License v. 2.0. If a copy of the MPL was not distributed
	8	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	9
	10	#ifndef EIGEN_PACKET_MATH_SSE_H
	11	#define EIGEN_PACKET_MATH_SSE_H
	12
	13	namespace Eigen {
	14
	15	namespace internal {
	16
	// Default value for EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD; define the macro
	// before this point to override it.
	#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
	#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
	#endif

	// Default value for EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS: twice the size of
	// a pointer in bytes (8 with 4-byte pointers, 16 with 8-byte pointers).
	#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
	#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
	#endif
	24
	25	typedef __m128 Packet4f;
	26	typedef __m128i Packet4i;
	27	typedef __m128d Packet2d;
	28
	29	template<> struct is_arithmetic<__m128> { enum { value = true }; };
	30	template<> struct is_arithmetic<__m128i> { enum { value = true }; };
	31	template<> struct is_arithmetic<__m128d> { enum { value = true }; };
	32
	// Swizzle helpers. Each lane selector is a 2-bit index; the selectors are
	// packed into the 8-bit shuffle immediate as (s<<6 | r<<4 | q<<2 | p), so p
	// picks the source lane of result lane 0, q of lane 1, r of lane 2 and s of
	// lane 3. All selectors must be compile-time constants.

	// Permute the four floats of v.
	#define vec4f_swizzle1(v,p,q,r,s) \
	  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))

	// Permute the four 32-bit integers of v.
	#define vec4i_swizzle1(v,p,q,r,s) \
	  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))

	// Permute the two doubles of v; each double selector (0 or 1) is expanded
	// into the pair of 32-bit lanes (2*sel, 2*sel+1) that make up that double.
	#define vec2d_swizzle1(v,p,q) \
	  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))

	// Two-source float shuffle: result lanes 0 and 1 come from a (selectors p, q),
	// result lanes 2 and 3 come from b (selectors r, s).
	#define vec4f_swizzle2(a,b,p,q,r,s) \
	  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

	// Integer counterpart of vec4f_swizzle2, implemented through the float shuffle.
	#define vec4i_swizzle2(a,b,p,q,r,s) \
	  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
	47
	// Helpers declaring a const packet named p4f_<NAME> / p2d_<NAME> / p4i_<NAME>
	// whose coefficients are all equal to X. No trailing semicolon is emitted.
	#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)

	#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)

	// Same as the Packet4f variant, but X is an integer whose bit pattern is
	// reinterpreted as float in every lane.
	#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))

	#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
	59
	60
	61	template<> struct packet_traits<float> : default_packet_traits
	62	{
	63	typedef Packet4f type;
	64	enum {
	65	Vectorizable = 1,
	66	AlignedOnScalar = 1,
	67	size=4,
	68
	69	HasDiv = 1,
	70	HasSin = EIGEN_FAST_MATH,
	71	HasCos = EIGEN_FAST_MATH,
	72	HasLog = 1,
	73	HasExp = 1,
	74	HasSqrt = 1
	75	};
	76	};
	77	template<> struct packet_traits<double> : default_packet_traits
	78	{
	79	typedef Packet2d type;
	80	enum {
	81	Vectorizable = 1,
	82	AlignedOnScalar = 1,
	83	size=2,
	84
	85	HasDiv = 1,
	86	HasExp = 1,
	87	HasSqrt = 1
	88	};
	89	};
	90	template<> struct packet_traits<int> : default_packet_traits
	91	{
	92	typedef Packet4i type;
	93	enum {
	94	// FIXME check the Has*
	95	Vectorizable = 1,
	96	AlignedOnScalar = 1,
	97	size=4
	98	};
	99	};
	100
	101	template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
	102	template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
	103	template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
	104
	105	#if defined(_MSC_VER) && (_MSC_VER==1500)
	106	// Workaround MSVC 9 internal compiler error.
	107	// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
	108	// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
	109	template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
	110	template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
	111	template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
	112	#else
	113	template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); }
	114	template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
	115	template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
	116	#endif
	117
	118	template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
	119	template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
	120	template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
	121
	122	template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
	123	template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
	124	template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
	125
	126	template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
	127	template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
	128	template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
	129
	130	template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
	131	{
	132	const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
	133	return _mm_xor_ps(a,mask);
	134	}
	135	template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
	136	{
	137	const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
	138	return _mm_xor_pd(a,mask);
	139	}
	140	template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
	141	{
	142	return psub(_mm_setr_epi32(0,0,0,0), a);
	143	}
	144
	145	template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
	146	template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
	147	template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
	148
	149	template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
	150	template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
	151	template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
	152	{
	153	#ifdef EIGEN_VECTORIZE_SSE4_1
	154	return _mm_mullo_epi32(a,b);
	155	#else
	156	// this version is slightly faster than 4 scalar products
	157	return vec4i_swizzle1(
	158	vec4i_swizzle2(
	159	_mm_mul_epu32(a,b),
	160	_mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
	161	vec4i_swizzle1(b,1,0,3,2)),
	162	0,2,0,2),
	163	0,2,1,3);
	164	#endif
	165	}
	166
	167	template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
	168	template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
	169	template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /a/, const Packet4i& /b/)
	170	{ eigen_assert(false && "packet integer division are not supported by SSE");
	171	return pset1<Packet4i>(0);
	172	}
	173
	174	// for some weird raisons, it has to be overloaded for packet of integers
	175	template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
	176
	177	template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
	178	template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
	179	template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
	180	{
	181	#ifdef EIGEN_VECTORIZE_SSE4_1
	182	return _mm_min_epi32(a,b);
	183	#else
	184	// after some bench, this version is faster than a scalar implementation
	185	Packet4i mask = _mm_cmplt_epi32(a,b);
	186	return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
	187	#endif
	188	}
	189
	190	template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
	191	template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
	192	template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
	193	{
	194	#ifdef EIGEN_VECTORIZE_SSE4_1
	195	return _mm_max_epi32(a,b);
	196	#else
	197	// after some bench, this version is faster than a scalar implementation
	198	Packet4i mask = _mm_cmpgt_epi32(a,b);
	199	return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
	200	#endif
	201	}
	202
	203	template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
	204	template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
	205	template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
	206
	207	template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
	208	template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
	209	template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
	210
	211	template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
	212	template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
	213	template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
	214
	215	template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
	216	template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
	217	template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
	218
	219	template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
	220	template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
	221	template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
	222
	223	#if defined(_MSC_VER)
	224	template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
	225	EIGEN_DEBUG_UNALIGNED_LOAD
	226	#if (_MSC_VER==1600)
	227	// NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
	228	// (i.e., it does not generate an unaligned load!!
	229	// TODO On most architectures this version should also be faster than a single _mm_loadu_ps
	230	// so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
	231	__m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
	232	res = _mm_loadh_pi(res, (const __m64*)(from+2));
	233	return res;
	234	#else
	235	return _mm_loadu_ps(from);
	236	#endif
	237	}
	238	#else
	239	// NOTE: with the code below, MSVC's compiler crashes!
	240
	241	template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
	242	{
	243	EIGEN_DEBUG_UNALIGNED_LOAD
	244	return _mm_loadu_ps(from);
	245	}
	246	#endif
	247
	248	template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
	249	{
	250	EIGEN_DEBUG_UNALIGNED_LOAD
	251	return _mm_loadu_pd(from);
	252	}
	253	template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
	254	{
	255	EIGEN_DEBUG_UNALIGNED_LOAD
	256	return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
	257	}
	258
	259
	260	template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
	261	{
	262	return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
	263	}
	264	template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
	265	{ return pset1<Packet2d>(from[0]); }
	266	template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
	267	{
	268	Packet4i tmp;
	269	tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
	270	return vec4i_swizzle1(tmp, 0, 0, 1, 1);
	271	}
	272
	273	template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
	274	template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
	275	template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
	276
	277	template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
	278	EIGEN_DEBUG_UNALIGNED_STORE
	279	_mm_storel_pd((to), from);
	280	_mm_storeh_pd((to+1), from);
	281	}
	282	template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
	283	template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
	284
	285	// some compilers might be tempted to perform multiple moves instead of using a vector path.
	286	template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
	287	{
	288	Packet4f pa = _mm_set_ss(a);
	289	pstore(to, vec4f_swizzle1(pa,0,0,0,0));
	290	}
	291	// some compilers might be tempted to perform multiple moves instead of using a vector path.
	292	template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
	293	{
	294	Packet2d pa = _mm_set_sd(a);
	295	pstore(to, vec2d_swizzle1(pa,0,0));
	296	}
	297
	298	template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
	299	template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
	300	template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
	301
	302	#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
	303	// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
	304	// Direct of the struct members fixed bug #62.
	305	template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
	306	template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
	307	template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
	308	#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
	309	// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
	310	template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
	311	template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
	312	template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
	313	#else
	314	template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
	315	template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
	316	template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
	317	#endif
	318
	319	template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
	320	{ return _mm_shuffle_ps(a,a,0x1B); }
	321	template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
	322	{ return _mm_shuffle_pd(a,a,0x1); }
	323	template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
	324	{ return _mm_shuffle_epi32(a,0x1B); }
	325
	326
	327	template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
	328	{
	329	const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
	330	return _mm_and_ps(a,mask);
	331	}
	332	template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
	333	{
	334	const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
	335	return _mm_and_pd(a,mask);
	336	}
	337	template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
	338	{
	339	#ifdef EIGEN_VECTORIZE_SSSE3
	340	return _mm_abs_epi32(a);
	341	#else
	342	Packet4i aux = _mm_srai_epi32(a,31);
	343	return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
	344	#endif
	345	}
	346
	347	EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
	348	{
	349	vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
	350	vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
	351	vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
	352	vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
	353	}
	354
	355	#ifdef EIGEN_VECTORIZE_SSE3
	356	// TODO implement SSE2 versions as well as integer versions
	357	template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
	358	{
	359	return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
	360	}
	361	template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
	362	{
	363	return _mm_hadd_pd(vecs[0], vecs[1]);
	364	}
	365	// SSSE3 version:
	366	// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
	367	// {
	368	// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
	369	// }
	370
	371	template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
	372	{
	373	Packet4f tmp0 = _mm_hadd_ps(a,a);
	374	return pfirst(_mm_hadd_ps(tmp0, tmp0));
	375	}
	376
	377	template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
	378
	379	// SSSE3 version:
	380	// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
	381	// {
	382	// Packet4i tmp0 = _mm_hadd_epi32(a,a);
	383	// return pfirst(_mm_hadd_epi32(tmp0, tmp0));
	384	// }
	385	#else
	386	// SSE2 versions
	387	template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
	388	{
	389	Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
	390	return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
	391	}
	392	template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
	393	{
	394	return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
	395	}
	396
	397	template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
	398	{
	399	Packet4f tmp0, tmp1, tmp2;
	400	tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
	401	tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
	402	tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
	403	tmp0 = _mm_add_ps(tmp0, tmp1);
	404	tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
	405	tmp1 = _mm_add_ps(tmp1, tmp2);
	406	tmp2 = _mm_movehl_ps(tmp1, tmp0);
	407	tmp0 = _mm_movelh_ps(tmp0, tmp1);
	408	return _mm_add_ps(tmp0, tmp2);
	409	}
	410
	411	template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
	412	{
	413	return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
	414	}
	415	#endif // SSE3
	416
	417	template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
	418	{
	419	Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
	420	return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
	421	}
	422
	423	template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
	424	{
	425	Packet4i tmp0, tmp1, tmp2;
	426	tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
	427	tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
	428	tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
	429	tmp0 = _mm_add_epi32(tmp0, tmp1);
	430	tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
	431	tmp1 = _mm_add_epi32(tmp1, tmp2);
	432	tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
	433	tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
	434	return _mm_add_epi32(tmp0, tmp2);
	435	}
	436
	437	// Other reduction functions:
	438
	439	// mul
	440	template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
	441	{
	442	Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
	443	return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
	444	}
	445	template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
	446	{
	447	return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
	448	}
	449	template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
	450	{
	451	// after some experiments, it is seems this is the fastest way to implement it
	452	// for GCC (eg., reusing pmul is very slow !)
	453	// TODO try to call _mm_mul_epu32 directly
	454	EIGEN_ALIGN16 int aux[4];
	455	pstore(aux, a);
	456	return (aux[0] * aux[1]) * (aux[2] * aux[3]);;
	457	}
	458
	459	// min
	460	template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
	461	{
	462	Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
	463	return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
	464	}
	465	template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
	466	{
	467	return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
	468	}
	469	template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
	470	{
	471	// after some experiments, it is seems this is the fastest way to implement it
	472	// for GCC (eg., it does not like using std::min after the pstore !!)
	473	EIGEN_ALIGN16 int aux[4];
	474	pstore(aux, a);
	475	int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
	476	int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
	477	return aux0<aux2 ? aux0 : aux2;
	478	}
	479
	480	// max
	481	template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
	482	{
	483	Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
	484	return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
	485	}
	486	template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
	487	{
	488	return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
	489	}
	490	template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
	491	{
	492	// after some experiments, it is seems this is the fastest way to implement it
	493	// for GCC (eg., it does not like using std::min after the pstore !!)
	494	EIGEN_ALIGN16 int aux[4];
	495	pstore(aux, a);
	496	int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
	497	int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
	498	return aux0>aux2 ? aux0 : aux2;
	499	}
	500
	501	#if (defined __GNUC__)
	502	// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
	503	// {
	504	// Packet4f res = b;
	505	// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
	506	// return res;
	507	// }
	508	// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
	509	// {
	510	// Packet4i res = a;
	511	// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
	512	// return res;
	513	// }
	514	#endif
	515
	516	#ifdef EIGEN_VECTORIZE_SSSE3
	517	// SSSE3 versions
	518	template<int Offset>
	519	struct palign_impl<Offset,Packet4f>
	520	{
	521	static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
	522	{
	523	if (Offset!=0)
	524	first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
	525	}
	526	};
	527
	528	template<int Offset>
	529	struct palign_impl<Offset,Packet4i>
	530	{
	531	static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
	532	{
	533	if (Offset!=0)
	534	first = _mm_alignr_epi8(second,first, Offset*4);
	535	}
	536	};
	537
	538	template<int Offset>
	539	struct palign_impl<Offset,Packet2d>
	540	{
	541	static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
	542	{
	543	if (Offset==1)
	544	first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
	545	}
	546	};
	547	#else
	548	// SSE2 versions
	549	template<int Offset>
	550	struct palign_impl<Offset,Packet4f>
	551	{
	552	static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
	553	{
	554	if (Offset==1)
	555	{
	556	first = _mm_move_ss(first,second);
	557	first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
	558	}
	559	else if (Offset==2)
	560	{
	561	first = _mm_movehl_ps(first,first);
	562	first = _mm_movelh_ps(first,second);
	563	}
	564	else if (Offset==3)
	565	{
	566	first = _mm_move_ss(first,second);
	567	first = _mm_shuffle_ps(first,second,0x93);
	568	}
	569	}
	570	};
	571
	572	template<int Offset>
	573	struct palign_impl<Offset,Packet4i>
	574	{
	575	static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
	576	{
	577	if (Offset==1)
	578	{
	579	first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
	580	first = _mm_shuffle_epi32(first,0x39);
	581	}
	582	else if (Offset==2)
	583	{
	584	first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
	585	first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
	586	}
	587	else if (Offset==3)
	588	{
	589	first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
	590	first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
	591	}
	592	}
	593	};
	594
	595	template<int Offset>
	596	struct palign_impl<Offset,Packet2d>
	597	{
	598	static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
	599	{
	600	if (Offset==1)
	601	{
	602	first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
	603	first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
	604	}
	605	}
	606	};
	607	#endif
	608
	609	} // end namespace internal
	610
	611	} // end namespace Eigen
	612
	613	#endif // EIGEN_PACKET_MATH_SSE_H

Note: See TracBrowser for help on using the repository browser.

Download in other formats: