// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

/* Optimized col-major matrix * vector product:
 * This algorithm processes 4 columns at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic: C += alpha * A * B
 * |  A  |  B  |alpha| comments
 * |real |cplx |cplx | no vectorization
 * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 * |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 * |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 */
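//
// Illustrative call (a sketch added for exposition, not Eigen's actual
// dispatch code; buffer names are hypothetical): computing y += alpha * A * x
// for a col-major float matrix A of size rows x cols with column stride
// lhsStride would look like
//
//   general_matrix_vector_product<DenseIndex,float,ColMajor,false,float,false>
//     ::run(rows, cols, A, lhsStride, x, /*rhsIncr=*/1,
//           y, /*resIncr=*/1, alpha);
//
// resIncr must be 1 here (it is asserted below); rhsIncr may be any stride.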
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr, RhsScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr, RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr)
  eigen_internal_assert(resIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif
#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
  pstore(&res[j], \
    padd(pload<ResPacket>(&res[j]), \
      padd( \
        padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),  ptmp0), \
             pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \
        padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),  ptmp2), \
             pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) )))
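// The A0/A13/A2 arguments are the token suffixes "d" or "du": EIGEN_CAT(ploa, d)
// expands to the aligned pload, EIGEN_CAT(ploa, du) to the unaligned ploadu,
// so each call site selects an aligned or unaligned load per band without
// duplicating the expression. lhs1 and lhs3 share the A13 suffix.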

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  if(ConjugateRhs)
    alpha = numext::conj(alpha);

  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
  const Index columnsAtOnce = 4;
  const Index peels = 2;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  const Index ResPacketAlignedMask = ResPacketSize-1;
  // const Index PeelAlignedMask = ResPacketSize*peels-1;
  const Index size = rows;

  // How many coeffs of the result do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type.
  Index alignedStart = internal::first_aligned(res,size);
  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
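  // Worked example (assuming 4-wide float packets, ResPacketSize==4): if res
  // is one scalar past a packet boundary and size==10, then alignedStart==3
  // and alignedSize==3+((10-3)&~3)==7, so the packet loops cover res[3..6]
  // and the scalar loops handle res[0..2] and res[7..9].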

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;
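  // For instance, with LhsPacketSize==4: lhsStride==7 gives alignmentStep==1,
  // so successive columns start at all four possible misalignments and only
  // one band in four can use aligned loads (FirstAligned); lhsStride==6 gives
  // alignmentStep==2==LhsPacketSize/2, i.e. every other column is aligned
  // (EvenAligned); a stride that is a multiple of 4 gives AllAligned.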

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);

  // find out how many columns we have to skip to be aligned with the result (if possible)
  Index skipColumns = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);

    while (skipColumns<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
      ++skipColumns;
    if (skipColumns==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any column
      alignmentPattern = NoneAligned;
      skipColumns = 0;
    }
    else
    {
      skipColumns = (std::min)(skipColumns,cols);
      // note that the skipped columns are processed later.
    }

    eigen_internal_assert(  (alignmentPattern==NoneAligned)
                         || (skipColumns + columnsAtOnce >= cols)
                         || LhsPacketSize > size
                         || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = size;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
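  // Note: FirstAligned here is the enum value (always nonzero), so each test
  // effectively reduces to alignmentStep==1; the apparent intent is
  // alignmentPattern==FirstAligned. The swap pairs column i+offset1 with
  // palign<1> and column i+offset3 with palign<3> in the peeled loop below,
  // matching each band's actual misalignment.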

  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
  {
    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);

    // this helps the compiler to generate better binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      // process initial unaligned coeffs
      for (Index j=0; j<alignedStart; ++j)
      {
        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if(peels>1)
            {
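              /* Two iterations are peeled here for the same reason as in the
               * row-major kernel below: the misaligned bands lhs1..lhs3 are
               * read via aligned loads combined with palign<k> shifts, which
               * is much cheaper than plain unaligned loads. */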
              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
              ResPacket T0, T1;

              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*ResPacketSize)
              {
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                A00 = pload<LhsPacket>(&lhs0[j]);
                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));

                T0  = pcj.pmadd(A01, ptmp1, T0);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                T0  = pcj.pmadd(A02, ptmp2, T0);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                T0  = pcj.pmadd(A03, ptmp3, T0);
                pstore(&res[j],T0);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
                T1  = pcj.pmadd(A11, ptmp1, T1);
                T1  = pcj.pmadd(A12, ptmp2, T1);
                T1  = pcj.pmadd(A13, ptmp3, T1);
                pstore(&res[j+ResPacketSize],T1);
              }
            }
            for (; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
      }
    } // end explicit vectorization

    /* process remaining coeffs (or all if there is no explicit vectorization) */
    for (Index j=alignedSize; j<size; ++j)
    {
      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
    }
  }

  // process remaining first and last columns (at most columnsAtOnce-1)
  Index end = cols;
  Index start = columnBound;
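  // When some columns were skipped for alignment, the do/while below runs
  // twice: once for the trailing columns [columnBound, cols), then once for
  // the skipped head [0, skipColumns).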
  do
  {
    for (Index k=start; k<end; ++k)
    {
      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
      const LhsScalar* lhs0 = lhs + k*lhsStride;

      if (Vectorizable)
      {
        /* explicit vectorization */
        // process first unaligned result's coeffs
        for (Index j=0; j<alignedStart; ++j)
          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
        // process aligned result's coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
        else
          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
      }

      // process remaining scalars (or all if no explicit vectorization)
      for (Index i=alignedSize; i<size; ++i)
        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
    }
    if (skipColumns)
    {
      start = 0;
      end = skipColumns;
      skipColumns = 0;
    }
    else
      break;
  } while(Vectorizable);
#undef _EIGEN_ACCUMULATE_PACKETS
}

/* Optimized row-major matrix * vector product:
 * This algorithm processes 4 rows at once, which allows us both to reduce
 * the number of loads/stores of the result by a factor of 4 and to reduce
 * the instruction dependency. Moreover, we know that all bands have the
 * same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
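// (These notes concern the mixed real/complex instantiations; when LhsScalar
// and RhsScalar have matching, vectorizable packet types, the Vectorizable
// flag below enables the packet path as usual.)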
template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
{
  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  enum {
    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
                && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
  };

  typedef typename packet_traits<LhsScalar>::type _LhsPacket;
  typedef typename packet_traits<RhsScalar>::type _RhsPacket;
  typedef typename packet_traits<ResScalar>::type _ResPacket;

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;

  EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsScalar* lhs, Index lhsStride,
    const RhsScalar* rhs, Index rhsIncr,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsScalar* lhs, Index lhsStride,
  const RhsScalar* rhs, Index rhsIncr,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(rhsIncr);
  eigen_internal_assert(rhsIncr==1);
#ifdef _EIGEN_ACCUMULATE_PACKETS
#error _EIGEN_ACCUMULATE_PACKETS has already been defined
#endif

#define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
  RhsPacket b = pload<RhsPacket>(&rhs[j]); \
  ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
  ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
  ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
  ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
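// Each expansion loads one rhs packet and accumulates it into four running
// dot products, one per row band; as above, A0/A13/A2 select pload ("d") or
// ploadu ("du") per band via EIGEN_CAT.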

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;

  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
  const Index rowsAtOnce = 4;
  const Index peels = 2;
  const Index RhsPacketAlignedMask = RhsPacketSize-1;
  const Index LhsPacketAlignedMask = LhsPacketSize-1;
  // const Index PeelAlignedMask = RhsPacketSize*peels-1;
  const Index depth = cols;

  // How many coeffs of the rhs do we have to skip to be aligned.
  // Here we assume data are at least aligned on the base scalar type;
  // if that's not the case then vectorization is discarded, see below.
  Index alignedStart = internal::first_aligned(rhs, depth);
  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;

  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
  Index alignmentPattern = alignmentStep==0 ? AllAligned
                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
                         : FirstAligned;

  // we cannot assume the first element is aligned because of sub-matrices
  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);

  // find out how many rows we have to skip to be aligned with rhs (if possible)
  Index skipRows = 0;
  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
  {
    alignedSize = 0;
    alignedStart = 0;
  }
  else if (LhsPacketSize>1)
  {
    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);

    while (skipRows<LhsPacketSize &&
           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
      ++skipRows;
    if (skipRows==LhsPacketSize)
    {
      // nothing can be aligned, no need to skip any row
      alignmentPattern = NoneAligned;
      skipRows = 0;
    }
    else
    {
      skipRows = (std::min)(skipRows,Index(rows));
      // note that the skipped rows are processed later.
    }
    eigen_internal_assert(  alignmentPattern==NoneAligned
                         || LhsPacketSize==1
                         || (skipRows + rowsAtOnce >= rows)
                         || LhsPacketSize > depth
                         || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
  }
  else if(Vectorizable)
  {
    alignedStart = 0;
    alignedSize = depth;
    alignmentPattern = AllAligned;
  }

  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
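  // Same offset1/offset3 swap as in the col-major kernel above (see the note
  // there): it matches each misaligned row band with the corresponding
  // palign<k> shift in the peeled loop below.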

  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
  {
    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);

    // this helps the compiler generate good binary code
    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;

    if (Vectorizable)
    {
      /* explicit vectorization */
      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));

      // process initial unaligned coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
      {
        RhsScalar b = rhs[j];
        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
      }

      if (alignedSize>alignedStart)
      {
        switch(alignmentPattern)
        {
          case AllAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
            break;
          case EvenAligned:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
            break;
          case FirstAligned:
          {
            Index j = alignedStart;
            if (peels>1)
            {
              /* Here we process 4 rows with two peeled iterations to hide
               * the overhead of unaligned loads. Moreover, unaligned loads are
               * handled using special shift/move operations between the two
               * aligned packets overlapping the desired unaligned packet.
               * This is *much* more efficient than basic unaligned loads.
               */
              LhsPacket A01, A02, A03, A11, A12, A13;
              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);

              for (; j<peeledSize; j+=peels*RhsPacketSize)
              {
                RhsPacket b = pload<RhsPacket>(&rhs[j]);
                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);

                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
                ptmp1 = pcj.pmadd(A01, b, ptmp1);
                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
                ptmp2 = pcj.pmadd(A02, b, ptmp2);
                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
                ptmp3 = pcj.pmadd(A03, b, ptmp3);
                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);

                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
                ptmp1 = pcj.pmadd(A11, b, ptmp1);
                ptmp2 = pcj.pmadd(A12, b, ptmp2);
                ptmp3 = pcj.pmadd(A13, b, ptmp3);
              }
            }
            for (; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
            break;
          }
          default:
            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
            break;
        }
        tmp0 += predux(ptmp0);
        tmp1 += predux(ptmp1);
        tmp2 += predux(ptmp2);
        tmp3 += predux(ptmp3);
      }
    } // end explicit vectorization

    // process remaining coeffs (or all if no explicit vectorization)
    // FIXME this loop gets vectorized by the compiler!
    for (Index j=alignedSize; j<depth; ++j)
    {
      RhsScalar b = rhs[j];
      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
    }
    res[i*resIncr]           += alpha*tmp0;
    res[(i+offset1)*resIncr] += alpha*tmp1;
    res[(i+2)*resIncr]       += alpha*tmp2;
    res[(i+offset3)*resIncr] += alpha*tmp3;
  }

  // process remaining first and last rows (at most rowsAtOnce-1)
  Index end = rows;
  Index start = rowBound;
  do
  {
    for (Index i=start; i<end; ++i)
    {
      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
      const LhsScalar* lhs0 = lhs + i*lhsStride;
      // process first unaligned result's coeffs
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=0; j<alignedStart; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);

      if (alignedSize>alignedStart)
      {
        // process aligned rhs coeffs
        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        else
          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
        tmp0 += predux(ptmp0);
      }

      // process remaining scalars
      // FIXME this loop gets vectorized by the compiler!
      for (Index j=alignedSize; j<depth; ++j)
        tmp0 += cj.pmul(lhs0[j], rhs[j]);
      res[i*resIncr] += alpha*tmp0;
    }
    if (skipRows)
    {
      start = 0;
      end = skipRows;
      skipRows = 0;
    }
    else
      break;
  } while(Vectorizable);

#undef _EIGEN_ACCUMULATE_PACKETS
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H