// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces the instruction
 * dependency. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic: C += alpha * A * B
 * | A   | B   |alpha| comments
 * |real |cplx |cplx | no vectorization
 * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
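/* For reference only (not part of Eigen): a minimal scalar sketch of what the
 * vectorized kernel below computes, ignoring alignment handling, SIMD packets,
 * conjugation and the tail columns:
 *
 *   template<typename Scalar, typename Index>
 *   void gemv_colmajor_ref(Index rows, Index cols,
 *                          const Scalar* lhs, Index lhsStride,
 *                          const Scalar* rhs, Index rhsIncr,
 *                          Scalar* res, Scalar alpha)
 *   {
 *     // res += alpha * lhs * rhs, with lhs stored column by column
 *     for (Index i=0; i<cols; i+=4)        // 4 columns at once
 *       for (Index j=0; j<rows; ++j)       // single sweep over the result
 *         res[j] += alpha * ( lhs[j+(i+0)*lhsStride] * rhs[(i+0)*rhsIncr]
 *                           + lhs[j+(i+1)*lhsStride] * rhs[(i+1)*rhsIncr]
 *                           + lhs[j+(i+2)*lhsStride] * rhs[(i+2)*rhsIncr]
 *                           + lhs[j+(i+3)*lhsStride] * rhs[(i+3)*rhsIncr] );
 *   }
 *
 * This sketch assumes cols is a multiple of 4; the real implementation below
 * also handles the remaining columns, mixed scalar types and alignment.
 */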
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif
  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
    pstore(&res[j], \
      padd(pload<ResPacket>(&res[j]), \
        padd( \
          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
               pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
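  // The macro above accumulates one result packet at offset j:
  //   res[j..j+ResPacketSize) += lhs0[j..]*ptmp0 + lhs1[j..]*ptmp1
  //                            + lhs2[j..]*ptmp2 + lhs3[j..]*ptmp3
  // where each ptmpX already holds alpha*rhs[column x] broadcast to a packet.
  // The A0/A13/A2 arguments pick the load flavour per column: "d" expands to
  // pload (aligned) and "du" to ploadu (unaligned), matching the alignment
  // pattern selected below.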

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
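  // The enum above classifies how packet loads from the four lhs columns can be
  // aligned once the result is aligned: AllAligned = every column aligned,
  // EvenAligned = every other column aligned (the stride shifts alignment by half
  // a packet), FirstAligned = only one of the four columns aligned, NoneAligned =
  // none of them.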
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  //const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                          || (skipColumns + columnsAtOnce >= cols)
                          || LhsPacketSize > size
                          || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps a lot in generating better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0 = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
                T0 = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
                T0 = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);
                T1 = pcj.pmadd(A11, ptmp1, T1);
                T1 = pcj.pmadd(A12, ptmp2, T1);
                T1 = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process the first unaligned result coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
  #undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which both reduces the number
 * of loads/stores of the result by a factor of 4 and reduces the instruction
 * dependency. Moreover, we know that all bands have the same alignment
 * pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
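/* For reference only (not part of Eigen): a minimal scalar sketch of what the
 * row-major kernel below computes, again ignoring alignment handling, SIMD
 * packets, conjugation and the tail rows:
 *
 *   template<typename Scalar, typename Index>
 *   void gemv_rowmajor_ref(Index rows, Index cols,
 *                          const Scalar* lhs, Index lhsStride,
 *                          const Scalar* rhs,
 *                          Scalar* res, Index resIncr, Scalar alpha)
 *   {
 *     // res += alpha * lhs * rhs, with lhs stored row by row
 *     for (Index i=0; i<rows; i+=4)          // 4 rows at once
 *     {
 *       Scalar tmp0(0), tmp1(0), tmp2(0), tmp3(0);
 *       for (Index j=0; j<cols; ++j)         // 4 simultaneous dot products
 *       {
 *         Scalar b = rhs[j];
 *         tmp0 += lhs[j+(i+0)*lhsStride]*b;  tmp1 += lhs[j+(i+1)*lhsStride]*b;
 *         tmp2 += lhs[j+(i+2)*lhsStride]*b;  tmp3 += lhs[j+(i+3)*lhsStride]*b;
 *       }
 *       res[(i+0)*resIncr] += alpha*tmp0;  res[(i+1)*resIncr] += alpha*tmp1;
 *       res[(i+2)*resIncr] += alpha*tmp2;  res[(i+3)*resIncr] += alpha*tmp3;
 *     }
 *   }
 *
 * This sketch assumes rows is a multiple of 4; the real implementation below
 * also handles the remaining rows and the alignment of lhs and rhs.
 */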
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

enum {
  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
};

typedef typename packet_traits<LhsScalar>::type _LhsPacket;
typedef typename packet_traits<RhsScalar>::type _RhsPacket;
typedef typename packet_traits<ResScalar>::type _ResPacket;

typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

EIGEN_DONT_INLINE static void run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
  #ifdef _EIGEN_ACCUMULATE_PACKETS
  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
  #endif

  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
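  // The macro above loads one rhs packet at offset j and accumulates it into the
  // four running dot products:
  //   ptmpX += lhsX[j..j+LhsPacketSize) * rhs[j..j+RhsPacketSize)
  // As in the col-major kernel, "d" selects pload (aligned) and "du" selects
  // ploadu (unaligned) for the corresponding lhs row.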

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  //const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the rhs do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                          || LhsPacketSize==1
                          || (skipRows + rowsAtOnce >= rows)
                          || LhsPacketSize > depth
                          || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are
               * handled using special shift/move operations between the two
               * aligned packets overlapping the desired unaligned packet. This
               * is *much* more efficient than basic unaligned loads.
               */
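              // Illustration (assuming LhsPacketSize==4, with lhs1 one scalar past an
              // aligned address): the unaligned packet lhs1[j..j+3] is rebuilt from the
              // two aligned loads A01 = lhs1[j-1..j+2] and A11 = lhs1[j+3..j+6] with
              // palign<1>(A01,A11), which shifts A01 left by one scalar and fills the
              // hole with the first scalar of A11; lhs2 and lhs3 use shifts of 2 and 3.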
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned rhs coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

  #undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H