Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: pacpussensors/trunk/Vislab/lib3dv/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h@ 136

Last change on this file since 136 was 136, checked in by ldecherf, 7 years ago
Doc
File size: 18.4 KB

Rev	Line
[136]	1	// This file is part of Eigen, a lightweight C++ template library
	2	// for linear algebra.
	3	//
	4	// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
	5	//
	6	// This Source Code Form is subject to the terms of the Mozilla
	7	// Public License v. 2.0. If a copy of the MPL was not distributed
	8	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	9
	10	#ifndef EIGEN_PACKET_MATH_ALTIVEC_H
	11	#define EIGEN_PACKET_MATH_ALTIVEC_H
	12
	13	namespace Eigen {
	14
	15	namespace internal {
	16
	17	#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
	18	#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
	19	#endif
	20
	21	#ifndef EIGEN_HAS_FUSE_CJMADD
	22	#define EIGEN_HAS_FUSE_CJMADD 1
	23	#endif
	24
	25	// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
	26	#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
	27	#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
	28	#endif
	29
	30	typedef __vector float Packet4f;
	31	typedef __vector int Packet4i;
	32	typedef __vector unsigned int Packet4ui;
	33	typedef __vector __bool int Packet4bi;
	34	typedef __vector short int Packet8i;
	35	typedef __vector unsigned char Packet16uc;
	36
	37	// We don't want to write the same code all the time, but we need to reuse the constants
	38	// and it doesn't really work to declare them global, so we define macros instead
	39
	40	#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
	41	Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
	42
	43	#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
	44	Packet4i p4i_##NAME = vec_splat_s32(X)
	45
	46	#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
	47	Packet4f p4f_##NAME = pset1<Packet4f>(X)
	48
	49	#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
	50	Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
	51
	52	#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
	53	Packet4i p4i_##NAME = pset1<Packet4i>(X)
	54
	55	#define DST_CHAN 1
	56	#define DST_CTRL(size, count, stride) (((size) << 24) \| ((count) << 16) \| (stride))
	57
	58	// Define global static constants:
	59	static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
	60	static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
	61	static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
	62	static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
	63	static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
	64
	65	static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
	66	static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
	67	static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
	68	static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
	69	static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
	70	static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
	71	static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
	72
	73	template<> struct packet_traits<float> : default_packet_traits
	74	{
	75	typedef Packet4f type;
	76	enum {
	77	Vectorizable = 1,
	78	AlignedOnScalar = 1,
	79	size=4,
	80
	81	// FIXME check the Has*
	82	HasSin = 0,
	83	HasCos = 0,
	84	HasLog = 0,
	85	HasExp = 0,
	86	HasSqrt = 0
	87	};
	88	};
	89	template<> struct packet_traits<int> : default_packet_traits
	90	{
	91	typedef Packet4i type;
	92	enum {
	93	// FIXME check the Has*
	94	Vectorizable = 1,
	95	AlignedOnScalar = 1,
	96	size=4
	97	};
	98	};
	99
	100	template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; };
	101	template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; };
	102	/*
	103	inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
	104	{
	105	union {
	106	Packet4f v;
	107	float n[4];
	108	} vt;
	109	vt.v = v;
	110	s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
	111	return s;
	112	}
	113
	114	inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
	115	{
	116	union {
	117	Packet4i v;
	118	int n[4];
	119	} vt;
	120	vt.v = v;
	121	s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
	122	return s;
	123	}
	124
	125	inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
	126	{
	127	union {
	128	Packet4ui v;
	129	unsigned int n[4];
	130	} vt;
	131	vt.v = v;
	132	s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
	133	return s;
	134	}
	135
	136	inline std::ostream & operator <<(std::ostream & s, const Packet4bi & v)
	137	{
	138	union {
	139	Packet4bi v;
	140	unsigned int n[4];
	141	} vt;
	142	vt.v = v;
	143	s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
	144	return s;
	145	}
	146	*/
	147	template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
	148	// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
	149	float EIGEN_ALIGN16 af[4];
	150	af[0] = from;
	151	Packet4f vc = vec_ld(0, af);
	152	vc = vec_splat(vc, 0);
	153	return vc;
	154	}
	155
	156	template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
	157	int EIGEN_ALIGN16 ai[4];
	158	ai[0] = from;
	159	Packet4i vc = vec_ld(0, ai);
	160	vc = vec_splat(vc, 0);
	161	return vc;
	162	}
	163
	164	template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
	165	template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
	166
	167	template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
	168	template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
	169
	170	template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
	171	template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
	172
	173	template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
	174	template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
	175
	176	template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
	177	template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
	178
	179	template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
	180	/* Commented out: it's actually slower than processing it scalar
	181	*
	182	template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
	183	{
	184	// Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
	185	//Set up constants, variables
	186	Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
	187
	188	// Get the absolute values
	189	a1 = vec_abs(a);
	190	b1 = vec_abs(b);
	191
	192	// Get the signs using xor
	193	Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
	194
	195	// Do the multiplication for the absolute values.
	196	bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
	197	low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
	198	high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
	199	high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
	200	prod = vec_add( low_prod, high_prod );
	201
	202	// NOR the product and select only the negative elements according to the sign mask
	203	prod_ = vec_nor(prod, prod);
	204	prod_ = vec_sel(p4i_ZERO, prod_, sgn);
	205
	206	// Add 1 to the result to get the negative numbers
	207	v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
	208	prod_ = vec_add(prod_, v1sel);
	209
	210	// Merge the results back to the final vector.
	211	prod = vec_sel(prod, prod_, sgn);
	212
	213	return prod;
	214	}
	215	*/
	216	template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
	217	{
	218	Packet4f t, y_0, y_1, res;
	219
	220	// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
	221	y_0 = vec_re(b);
	222
	223	// Do one Newton-Raphson iteration to get the needed accuracy
	224	t = vec_nmsub(y_0, b, p4f_ONE);
	225	y_1 = vec_madd(y_0, t, y_0);
	226
	227	res = vec_madd(a, y_1, p4f_ZERO);
	228	return res;
	229	}
	230
	231	template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /a/, const Packet4i& /b/)
	232	{ eigen_assert(false && "packet integer division are not supported by AltiVec");
	233	return pset1<Packet4i>(0);
	234	}
	235
	236	// for some weird raisons, it has to be overloaded for packet of integers
	237	template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
	238	template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
	239
	240	template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
	241	template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
	242
	243	template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
	244	template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
	245
	246	// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
	247	template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
	248	template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
	249
	250	template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
	251	template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
	252
	253	template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
	254	template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
	255
	256	template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
	257	template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
	258
	259	template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
	260	template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
	261
	262	template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
	263	{
	264	EIGEN_DEBUG_ALIGNED_LOAD
	265	// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
	266	Packet16uc MSQ, LSQ;
	267	Packet16uc mask;
	268	MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
	269	LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
	270	mask = vec_lvsl(0, from); // create the permute mask
	271	return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data
	272
	273	}
	274	template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
	275	{
	276	EIGEN_DEBUG_ALIGNED_LOAD
	277	// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
	278	Packet16uc MSQ, LSQ;
	279	Packet16uc mask;
	280	MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
	281	LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
	282	mask = vec_lvsl(0, from); // create the permute mask
	283	return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data
	284	}
	285
	286	template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
	287	{
	288	Packet4f p;
	289	if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from);
	290	else p = ploadu<Packet4f>(from);
	291	return vec_perm(p, p, p16uc_DUPLICATE);
	292	}
	293	template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
	294	{
	295	Packet4i p;
	296	if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from);
	297	else p = ploadu<Packet4i>(from);
	298	return vec_perm(p, p, p16uc_DUPLICATE);
	299	}
	300
	301	template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
	302	template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
	303
	304	template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
	305	{
	306	EIGEN_DEBUG_UNALIGNED_STORE
	307	// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
	308	// Warning: not thread safe!
	309	Packet16uc MSQ, LSQ, edges;
	310	Packet16uc edgeAlign, align;
	311
	312	MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
	313	LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
	314	edgeAlign = vec_lvsl(0, to); // permute map to extract edges
	315	edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
	316	align = vec_lvsr( 0, to ); // permute map to misalign data
	317	MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
	318	LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
	319	vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
	320	vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
	321	}
	322	template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
	323	{
	324	EIGEN_DEBUG_UNALIGNED_STORE
	325	// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
	326	// Warning: not thread safe!
	327	Packet16uc MSQ, LSQ, edges;
	328	Packet16uc edgeAlign, align;
	329
	330	MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
	331	LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
	332	edgeAlign = vec_lvsl(0, to); // permute map to extract edges
	333	edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
	334	align = vec_lvsr( 0, to ); // permute map to misalign data
	335	MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ)
	336	LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ)
	337	vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
	338	vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part
	339	}
	340
	341	template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
	342	template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
	343
	344	template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
	345	template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
	346
	347	template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
	348	template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
	349
	350	template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
	351	template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
	352
	353	template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
	354	{
	355	Packet4f b, sum;
	356	b = (Packet4f) vec_sld(a, a, 8);
	357	sum = vec_add(a, b);
	358	b = (Packet4f) vec_sld(sum, sum, 4);
	359	sum = vec_add(sum, b);
	360	return pfirst(sum);
	361	}
	362
	363	template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
	364	{
	365	Packet4f v[4], sum[4];
	366
	367	// It's easier and faster to transpose then add as columns
	368	// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
	369	// Do the transpose, first set of moves
	370	v[0] = vec_mergeh(vecs[0], vecs[2]);
	371	v[1] = vec_mergel(vecs[0], vecs[2]);
	372	v[2] = vec_mergeh(vecs[1], vecs[3]);
	373	v[3] = vec_mergel(vecs[1], vecs[3]);
	374	// Get the resulting vectors
	375	sum[0] = vec_mergeh(v[0], v[2]);
	376	sum[1] = vec_mergel(v[0], v[2]);
	377	sum[2] = vec_mergeh(v[1], v[3]);
	378	sum[3] = vec_mergel(v[1], v[3]);
	379
	380	// Now do the summation:
	381	// Lines 0+1
	382	sum[0] = vec_add(sum[0], sum[1]);
	383	// Lines 2+3
	384	sum[1] = vec_add(sum[2], sum[3]);
	385	// Add the results
	386	sum[0] = vec_add(sum[0], sum[1]);
	387
	388	return sum[0];
	389	}
	390
	391	template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
	392	{
	393	Packet4i sum;
	394	sum = vec_sums(a, p4i_ZERO);
	395	sum = vec_sld(sum, p4i_ZERO, 12);
	396	return pfirst(sum);
	397	}
	398
	399	template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
	400	{
	401	Packet4i v[4], sum[4];
	402
	403	// It's easier and faster to transpose then add as columns
	404	// Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
	405	// Do the transpose, first set of moves
	406	v[0] = vec_mergeh(vecs[0], vecs[2]);
	407	v[1] = vec_mergel(vecs[0], vecs[2]);
	408	v[2] = vec_mergeh(vecs[1], vecs[3]);
	409	v[3] = vec_mergel(vecs[1], vecs[3]);
	410	// Get the resulting vectors
	411	sum[0] = vec_mergeh(v[0], v[2]);
	412	sum[1] = vec_mergel(v[0], v[2]);
	413	sum[2] = vec_mergeh(v[1], v[3]);
	414	sum[3] = vec_mergel(v[1], v[3]);
	415
	416	// Now do the summation:
	417	// Lines 0+1
	418	sum[0] = vec_add(sum[0], sum[1]);
	419	// Lines 2+3
	420	sum[1] = vec_add(sum[2], sum[3]);
	421	// Add the results
	422	sum[0] = vec_add(sum[0], sum[1]);
	423
	424	return sum[0];
	425	}
	426
	427	// Other reduction functions:
	428	// mul
	429	template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
	430	{
	431	Packet4f prod;
	432	prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
	433	return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
	434	}
	435
	436	template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
	437	{
	438	EIGEN_ALIGN16 int aux[4];
	439	pstore(aux, a);
	440	return aux[0] * aux[1] * aux[2] * aux[3];
	441	}
	442
	443	// min
	444	template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
	445	{
	446	Packet4f b, res;
	447	b = vec_min(a, vec_sld(a, a, 8));
	448	res = vec_min(b, vec_sld(b, b, 4));
	449	return pfirst(res);
	450	}
	451
	452	template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
	453	{
	454	Packet4i b, res;
	455	b = vec_min(a, vec_sld(a, a, 8));
	456	res = vec_min(b, vec_sld(b, b, 4));
	457	return pfirst(res);
	458	}
	459
	460	// max
	461	template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
	462	{
	463	Packet4f b, res;
	464	b = vec_max(a, vec_sld(a, a, 8));
	465	res = vec_max(b, vec_sld(b, b, 4));
	466	return pfirst(res);
	467	}
	468
	469	template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
	470	{
	471	Packet4i b, res;
	472	b = vec_max(a, vec_sld(a, a, 8));
	473	res = vec_max(b, vec_sld(b, b, 4));
	474	return pfirst(res);
	475	}
	476
	477	template<int Offset>
	478	struct palign_impl<Offset,Packet4f>
	479	{
	480	static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
	481	{
	482	if (Offset!=0)
	483	first = vec_sld(first, second, Offset*4);
	484	}
	485	};
	486
	487	template<int Offset>
	488	struct palign_impl<Offset,Packet4i>
	489	{
	490	static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
	491	{
	492	if (Offset!=0)
	493	first = vec_sld(first, second, Offset*4);
	494	}
	495	};
	496
	497	} // end namespace internal
	498
	499	} // end namespace Eigen
	500
	501	#endif // EIGEN_PACKET_MATH_ALTIVEC_H

Note: See TracBrowser for help on using the repository browser.

Download in other formats: