source: pacpusframework/branches/2.0-beta1/include/extlib/EStereo/processingMMX.inl@ 89

Last change on this file since 89 was 89, checked in by morasjul, 11 years ago

PACPUS 2.0 Beta deployed in new branch

Major changes:
-Add communication interface between components
-Add examples for communications interface (TestComponents)
-Move to Qt5 support

  • Property svn:executable set to *
File size: 54.1 KB
Line 
1/***************************************************************************
2*
3* Copyright 2000 by David Demirdjian. All rights reserved.
4*
5* Developed by David Demirdjian
6*
7* Permission to use, copy, or modify this software and its documentation
8* for educational and research purposes only and without fee is hereby
9* granted, provided that this copyright notice and the original authors's
10* names appear on all copies and supporting documentation. If individual
11* files are separated from this distribution directory structure, this
12* copyright notice must be included. For any other uses of this software,
13* in original or modified form, including but not limited to distribution
14* in whole or in part, specific prior permission must be obtained from
15* MIT. These programs shall not be used, rewritten, or adapted as the
16* basis of a commercial software or hardware product without first
17* obtaining appropriate licenses from David Demirdjian. The author makes
18* no representations about the suitability of this software for any purpose.
19* It is provided "as is" without express or implied warranty.
20*
21**************************************************************************/
22#include "stereoMatching.h"
23#include "processingMMX.h"
24
25// ************************************************************
26// ************************************************************
27// *** List of functions (MMX) for image processing
28// ************************************************************
29// ************************************************************
30
31// shrink images by a factor 'fact'. eg if fact = 2, out will be twice as small as src
32inline void shrinkImages(uchar* dst, const uchar* src, int width, int height, int fact)
33{
34 int width_f = width/fact;
35 int siz = width*height/(fact*fact);
36 if (fact>0) {
37 for (int i=0,j=0; i<siz; ++i,++j,++dst,src+=fact) {
38 *dst = *src;
39 if (j==width_f-1) {
40 src+=((fact-1)*width);
41 j=0;
42 }
43 }
44 }
45}
46
47// translate image of 'tx' pixels to the right
48// (or left if tx<0)
49void translateImage(int tx, uchar* data, int siz)
50{
51 if (tx==0) return;
52 if (tx>0) {
53 // dest. must be after src... to avoid overwriting data
54 data += (siz-tx);
55 uchar* dataDst = data+tx;
56 for (int i=0; i<siz-tx; ++i,--dataDst,--data) {
57 *dataDst = *data;
58 }
59 } else { // tx<0
60 data -= tx;
61 uchar* dataDst = data+tx;
62 for (int i=0; i<siz-tx; ++i,++dataDst,++data) {
63 *dataDst = *data;
64 }
65 }
66}
67
68void normalizeImages(uchar* data1, uchar* data2, uchar* data3, int siz)
69{
70 float a1 = pixelMean(data1,siz);
71 float a2 = pixelMean(data2,siz);
72 float a3 = pixelMean(data3,siz);
73
74 float minI = __min(a1, __min(a2,a3));
75
76 if (a2==minI) {
77 multiply(data1, a2/a1, siz);
78 multiply(data3, a2/a3, siz);
79 } else if (a1==minI) {
80 multiply(data2, a1/a2, siz);
81 multiply(data3, a1/a3, siz);
82 } else {
83 multiply(data2, a3/a2, siz);
84 multiply(data1, a3/a1, siz);
85 }
86}
87
88void normalizeImages(const uchar* data1, const uchar* data2, const uchar* data3,
89 uchar* out1, uchar* out2, uchar* out3, int siz)
90{
91 float a1 = pixelMean(data1,siz);
92 float a2 = pixelMean(data2,siz);
93 float a3 = pixelMean(data3,siz);
94
95 float minI = __min(a1, __min(a2,a3));
96
97 if (a2==minI) {
98 multiply(data1, out1, a2/a1, siz);
99 multiply(data3, out3, a2/a3, siz);
100 copyMMX(out2, data2, siz);
101 } else if (a1==minI) {
102 multiply(data2, out2, a1/a2, siz);
103 multiply(data3, out3, a1/a3, siz);
104 copyMMX(out1, data1, siz);
105 } else {
106 multiply(data2, out2, a3/a2, siz);
107 multiply(data1, out1, a3/a1, siz);
108 copyMMX(out3, data3, siz);
109 }
110}
111
112void normalizeImages(uchar* data1, uchar* data2, int siz)
113{
114 float a1 = pixelMean(data1,siz);
115 float a2 = pixelMean(data2,siz);
116
117 // normalize the image which average intensity is the highest
118 if (a1>a2)
119 multiply(data1, a2/a1, siz);
120 else
121 multiply(data2, a1/a2, siz);
122}
123
124void normalizeImages(const uchar* data1, const uchar* data2,
125 uchar* out1, uchar* out2, int siz)
126{
127 float a1 = pixelMean(data1,siz);
128 float a2 = pixelMean(data2,siz);
129
130 // normalize the image which average intensity is the highest
131 if (a1>a2) {
132 multiply(data1, out1, a2/a1, siz);
133 copyMMX(out2, data2, siz);
134 } else {
135 multiply(data2, out2, a1/a2, siz);
136 copyMMX(out1, data1, siz);
137 }
138}
139
140// ImgSubandAdd (3 sources): Dest = saturate(|Src1 - Src2| + |Src1 - Src3|), byte by byte.
// MMX implementation: the loop runs l >> 3 times and handles 8 bytes per pass,
// so a trailing remainder of l % 8 bytes is not processed.
// Returns 0 without touching Dest when l < 8, and 1 otherwise.
141// TODO? divide the result by 2 (shift)
142inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
143 const unsigned char *Src3, unsigned char *Dest, int l)
144{
145
146 if (l < 8) return 0; // image size must be at least 8 bytes
147
148 __asm
149 {
150 mov eax, Src1
151 mov ebx, Src2
152 mov edx, Src3
153 mov edi, Dest
154 mov ecx, l
155 shr ecx, 3 // ecx = number of 8-byte blocks
156
157align 16
158inner_loop:
159 movq mm1,[eax] // mm1=src1
160
161 movq mm2,[ebx] // mm2=src2
162
163 movq mm4,mm1 // mm4=mm1
164
165 psubusb mm4,mm2 // mm4 = src1 - src2 (unsigned saturation: 0 where src2 > src1)
166
167 movq mm3,[edx] // mm3=src3
168 psubusb mm2,mm1 // mm2 = src2 - src1 (unsigned saturation: 0 where src1 > src2)
169
170 movq mm5,mm1 // mm5=src1
171 por mm2,mm4 // mm2=|src1-src2| (one of the two saturated differences is 0)
172
173 psubusb mm5,mm3 // mm5=src1-src3
174
175 psubusb mm3,mm1 // mm3=src3-src1
176
177 por mm3,mm5 // mm3=|src1-src3|
178
179 paddusb mm2,mm3 // mm2 = |src1-src2|+|src1-src3| (saturating add)
180
181 movq [edi], mm2
182 add eax,8
183 add ebx,8
184 add edx,8
185 add edi,8
186 dec ecx
187 jnz inner_loop
188 emms
189 }
190
191 return 1;
192}
193
194// ImgSubandAdd (2 sources): Dest = |Src1 - Src2|, byte by byte.
// MMX implementation: the loop runs l >> 3 times and handles 8 bytes per pass,
// so a trailing remainder of l % 8 bytes is not processed.
// Returns 0 without touching Dest when l < 8, and 1 otherwise.
// NOTE(review): Dest is declared 'const unsigned char *' although the asm
// block stores through it (movq [edi], mm2) - confirm against the header
// before changing the qualifier, since it is part of the overload signature.
195// TODO? divide the result by 2 (shift)
196inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
197 const unsigned char *Dest, int l)
198{
199
200 if (l < 8) return 0; // image size must be at least 8 bytes
201
202 __asm
203 {
204 mov eax, Src1
205 mov ebx, Src2
206 mov edi, Dest
207 mov ecx, l
208 shr ecx, 3 // ecx = number of 8-byte blocks
209
210align 16
211inner_loop:
212 movq mm1,[eax] // mm1=src1
213 movq mm2,[ebx] // mm2=src2
214
215 movq mm4,mm1 // mm4=mm1
216 psubusb mm4,mm2 // mm4 = src1 - src2 (unsigned saturation)
217
218 psubusb mm2,mm1 // mm2 = src2 - src1 (unsigned saturation)
219 por mm2,mm4 // mm2=|src1-src2|
220
221 movq [edi], mm2
222 add eax,8
223 add ebx,8
224 add edi,8
225 dec ecx
226 jnz inner_loop
227 emms
228 }
229
230 return 1;
231}
232
233
234
235
236
237
// _ABS_DIFF_TRI(Z): one step of the unrolled loop of ImgSubandAdd2.
// On entry: mm0 = next 8 bytes of src3, mm1 = src1, mm2 = current src2 row,
//           mm3 = src3 shifted by the previous steps (top byte(s) still 0),
//           mm7 = bytes of mm0 to merge into the top of mm3.
// Effect  : completes mm3 (por mm3,mm7), advances ebx by 'width' and edi by
//           'imageSize' (both are C variables of the enclosing function),
//           stores saturate(|mm1-mm2| + |mm1-mm3|) to [edi], reloads mm2 from
//           [ebx], shifts mm3 right by one more byte and leaves mm0 << Z in
//           mm7 for the next step.
// Clobbers mm4, mm5, mm6.
238#define _ABS_DIFF_TRI(Z) __asm \
239{ \
240 __asm movq mm4,mm1 /* mm4=mm1 */ \
241 __asm add ebx, width \
242 __asm add edi, imageSize \
243 __asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \
244\
245 __asm movq mm7, mm0 \
246 __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
247\
248 __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
249 __asm psllq mm7,Z \
250\
251 __asm movq mm5,mm1 /* mm5=src1 */ \
252 __asm por mm4,mm2 /* mm4=|src1-src2| */ \
253\
254 __asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
255 __asm psubusb mm5,mm3 /* mm5=src1-src3*/ \
256\
257 __asm movq mm6,mm3 /* mm6=src3*/ \
258 __asm psubusb mm6,mm1 /* mm6=src3-src1*/ \
259\
260 __asm por mm6,mm5 /* mm6=|src1-src3|*/ \
261 __asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \
262\
263 __asm movq [edi], mm4 /* here mm1=src1*/ \
264 __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\
265}
266
267
// _ABS_DIFF_TRI_prefetch(Z, X): same instruction sequence as _ABS_DIFF_TRI(Z)
// with one extra 'prefetcht0 [ebx + X]' issued after ebx has been advanced
// by 'width'. Its only visible use in this part of the file is commented out.
268#define _ABS_DIFF_TRI_prefetch(Z, X) __asm \
269{ \
270 __asm movq mm4,mm1 /* mm4=mm1 */ \
271 __asm add ebx, width \
272 __asm add edi, imageSize \
273 __asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \
274\
275 __asm movq mm7, mm0 \
276 __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
277\
278 __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
279 __asm prefetcht0 [ebx + X] \
280 __asm psllq mm7,Z \
281\
282 __asm movq mm5,mm1 /* mm5=src1 */ \
283 __asm por mm4,mm2 /* mm4=|src1-src2| */ \
284\
285\
286 __asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
287 __asm psubusb mm5,mm3 /* mm5=src1-src3*/ \
288\
289 __asm movq mm6,mm3 /* mm6=src3*/ \
290 __asm psubusb mm6,mm1 /* mm6=src3-src1*/ \
291\
292 __asm por mm6,mm5 /* mm6=|src1-src3|*/ \
293 __asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \
294\
295 __asm movq [edi], mm4 /* here mm1=src1*/ \
296 __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\
297}
298
299// ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
300// process 8 disparities at a time
301//
302// Src1: right
303// Src2: top
304// Src3: left
305//
// For every 8-byte block starting at offset p and for k = 0..7 the code stores
//   Dest1[k*imageSize + p + i] =
//       saturate(|Src1[p+i] - Src2[p+i + k*width]| + |Src1[p+i] - Src3[p+i + k]|)
// i.e. Src2 is shifted by k rows ('width' bytes each) and Src3 by k bytes,
// and the 8 results go to 8 output planes spaced 'imageSize' bytes apart.
//
// The loop runs l >> 3 times; a remainder of l % 8 bytes is not processed.
// Returns 0 without doing anything when l < 8, and 1 otherwise.
//
// NOTE(review): every pass loads [edx+8] and Src2 rows up to 7*width bytes
// further on, so the inputs are read beyond the l bytes being processed -
// callers presumably guarantee that this memory is readable; confirm.
306// TODO? divide the result by 2 (shift)
307inline int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2,
308 const unsigned char *Src3,
309 unsigned char* Dest1, int l, int imageSize, int width)
310{
311 if (l < 8) return 0; // image size must be at least 8 bytes
312 const int back_step1 = 7*width; // undoes the 7 'add ebx, width' of one pass
313 const int back_step2 = 7*imageSize; // undoes the 7 'add edi, imageSize' of one pass
314 __asm
315 {
316 mov eax, Src1
317 mov ebx, Src2
318 mov edx, Src3
319 mov edi, Dest1
320
321 mov ecx, l
322 shr ecx, 3 // ecx = number of 8-byte blocks
323
324 movq mm0,[edx] // mm0=src3
325 movq mm0,[edx] // mm0=src3 (same load as the line above, repeated)
326align 16
327inner_loop:
328 movq mm1,[eax] // mm1=src1
329 movq mm3,mm0 // mm3=src3
330
331 movq mm2,[ebx] // mm2=src2
332 add eax,8
333
334 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
335 movq mm4,mm1 // mm4=mm1
336
337 add ebx,width
338
339 psubusb mm4,mm2 // mm4 = src1 - src2
340 //prefetcht0 [ebx + 32 + 2*320]
341
342 movq mm0,[edx+8] // mm0 = next 8 bytes of src3 (feeds the byte shifts below)
343 psubusb mm2,mm1 // mm2 = src2 - src1
344
345 movq mm5,mm1 // mm5=src1
346 por mm4,mm2 // mm4=|src1-src2|
347
348 movq mm2,[ebx] // mm2= src2 + 'width' = new src2
349 psubusb mm5,mm3 // mm5=src1-src3
350
351 movq mm6,mm3 // mm6=src3
352 psubusb mm6,mm1 // mm6=src3-src1
353
354 movq mm7, mm0
355 psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
356
357 por mm6,mm5 // mm6=|src1-src3|
358 paddusb mm4,mm6 // mm4 = |src1-src2|+|src1-src3|
359
360 movq [edi], mm4
361 psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
362 // -------------------------------------------------------------
363
364
365 // - 2 ----------------
366 _ABS_DIFF_TRI(48)
367
368 // - 3 ----------------
369 _ABS_DIFF_TRI(40)
370
371 // - 4 ----------------
372 _ABS_DIFF_TRI(32)
373// _ABS_DIFF_TRI_prefetch(32,24 + 3*320)
374
375 // - 5 ----------------
376 _ABS_DIFF_TRI(24)
377
378 // - 6 ----------------
379 _ABS_DIFF_TRI(16)
380
381 // - 7 ----------------
382 _ABS_DIFF_TRI(8)
383
384
385 // - 8 ----------------
386 movq mm4,mm1 // mm4=mm1
387 por mm3,mm7 // here mm2=new src2 mm3=new src3
388
389 psubusb mm4,mm2 // mm4 = src1 - src2
390 psubusb mm2,mm1 // mm2 = src2 - src1
391
392 movq mm5,mm1 // mm5=src1
393 por mm4,mm2 // mm4=|src1-src2|
394
395 psubusb mm5,mm3 // mm5=src1-src3
396 psubusb mm3,mm1 // mm3=src3-src1
397
398 por mm3,mm5 // mm3=|src1-src3|
399 paddusb mm4,mm3 // mm4 = |src1-src2|+|src1-src3|
400
401 add edi, imageSize
402
403 movq [edi], mm4 // here mm1=src1
404 // -------------------------------------------------------------
405 //
// rewind ebx/edi to the first row/plane and step to the next 8-byte block
406 sub ebx, back_step1
407 add ebx,8
408 add edx,8
409 sub edi, back_step2
410 add edi,8
411 dec ecx
412 jnz inner_loop
413 emms
414 }
415
416 return 1;
417}
418
419
420// macro: in: mm1 (src1), mm2 (current src2 row)
// Computes mm4 = |mm1 - mm2|, advances ebx by 'width' and edi by 'imageSize'
// (C variables of the enclosing function), reloads mm2 from the new [ebx]
// and stores mm4 at the new [edi]. Note that edi is advanced BEFORE the
// store, so the result goes to the plane following the one edi pointed to
// on entry.
421#define _ABS_DIFF_ __asm \
422{ \
423 __asm movq mm4,mm1 /* mm4=mm1 */ \
424 __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
425 __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
426 __asm por mm4,mm2 /* mm4=|src1-src2| */ \
427 __asm add ebx, width \
428 __asm add edi, imageSize \
429 __asm movq mm2,[ebx] \
430 __asm movq [edi], mm4 /* here mm1=src1 */ \
431}
432
433// ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
434// process 8 disparities at a time
435// Src1: right
436// Src2: top
437// TODO? divide the result by 2 (shift)
438inline int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2,
439 unsigned char* Dest1, int l, int imageSize, int width)
440{
441
442 if (l < 8) return 0; // image size must be at least 8 bytes
443 const int back_step1 = 7*width;
444 const int back_step2 = 7*imageSize;
445 __asm
446 {
447 mov eax, Src1
448 mov ebx, Src2
449 mov edi, Dest1
450
451 mov ecx, l
452 shr ecx, 3
453
454align 16
455inner_loop:
456
457 movq mm1,[eax] // mm1=src1
458 movq mm2,[ebx] // mm2=src2
459 add eax,8
460
461 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
462 _ABS_DIFF_
463 _ABS_DIFF_
464 _ABS_DIFF_
465 _ABS_DIFF_
466 _ABS_DIFF_
467 _ABS_DIFF_
468 _ABS_DIFF_
469
470 // - 8 ----------------
471 movq mm4,mm1 // mm4=mm1
472
473 psubusb mm4,mm2 // mm4 = src1 - src2
474 psubusb mm2,mm1 // mm2 = src2 - src1
475
476 por mm4,mm2 // mm2=|src1-src2|
477 add edi, imageSize
478
479 movq [edi], mm4 // here mm1=src1
480 // -------------------------------------------------------------
481 //
482 sub ebx, back_step1
483 add ebx,8
484 sub edi, back_step2
485 add edi,8
486 dec ecx
487 jnz inner_loop
488 emms
489 }
490
491 return 1;
492}
493
494// macro: in: mm0 (next 8 bytes of src3), mm1 (src1), mm3 (src3 shifted so far)
// Advances edi by 'imageSize' (C variable of the enclosing function), stores
// |mm1 - mm3| at the new [edi], then shifts mm3 right by one byte and fills
// its top bytes from mm0 << Z, giving the next one-byte shift of src3.
// Clobbers mm5, mm6, mm7.
495#define _ABS_DIFF_HORIZ(Z) __asm \
496{ \
497 __asm movq mm7, mm0 \
498 __asm add edi, imageSize \
499 __asm movq mm5,mm1 /* mm5=src1 */ \
500 __asm psllq mm7, Z \
501 __asm psubusb mm5,mm3 /* mm5=src1-src3 */ \
502 __asm movq mm6,mm3 /* mm6=src3 */ \
503 __asm psubusb mm6,mm1 /* mm6=src3-src1 */ \
504 __asm por mm6,mm5 /* mm6=|src1-src3| */ \
505 __asm movq [edi], mm6 /* here mm1=src1 */ \
506 __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end */ \
507 __asm por mm3,mm7 /* here mm3=new src3 */ \
508}
509
510// ImgSubandAdd_Horiz: D = |right - left|
511// process 8 disparities at a time
512//
513// rightIm: right image (src1 in the register comments)
514// leftIm: left image (src3 in the register comments)
515// There is no 'top' image in this variant.
516//
// For every 8-byte block starting at offset p and for k = 0..7:
//   Dest[k*imageSize + p + i] = |rightIm[p+i] - leftIm[p+i + k]|
// i.e. the left image is shifted by k bytes and the 8 results go to 8 output
// planes spaced 'imageSize' bytes apart.
//
// The loop runs l >> 3 times; a remainder of l % 8 bytes is not processed.
// Returns 0 without doing anything when l < 8, and 1 otherwise.
// The 'width' parameter is not used by this function.
//
// NOTE(review): every pass loads [edx+8], so leftIm is read 8 bytes beyond
// the block being processed - callers presumably guarantee this memory is
// readable; confirm.
517// TODO? divide the result by 2 (shift)
518inline int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm,
519 unsigned char* Dest, int l, int imageSize, int width)
520{
521
522 if (l < 8) return 0; // image size must be at least 8 bytes
523 const int back_step2 = 7*imageSize; // undoes the 7 'add edi, imageSize' of one pass
524 __asm
525 {
526 mov eax, rightIm
527 mov edx, leftIm
528 mov edi, Dest
529
530 mov ecx, l
531 shr ecx, 3 // ecx = number of 8-byte blocks
532
533 movq mm0,[edx] // mm0=src3
534 movq mm0,[edx] // mm0=src3 (same load as the line above, repeated)
535align 16
536inner_loop:
537
538 movq mm1,[eax] // mm1=src1
539 movq mm3,mm0 // mm3=src3
540
541 // -- 1 --------- in : mm1,mm3 out: mm6=|src1-src3| mm3=new src3 --
542 movq mm0,[edx+8] // mm0 = next 8 bytes of src3 (feeds the byte shifts below)
543 add eax,8
544
545 movq mm5,mm1 // mm5=src1
546 psubusb mm5,mm3 // mm5=src1-src3
547
548 movq mm6,mm3 // mm6=src3
549 psubusb mm6,mm1 // mm6=src3-src1
550
551 movq mm7, mm0
552 psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
553
554 por mm6,mm5 // mm6=|src1-src3|
555
556 movq [edi], mm6
557 psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
558 por mm3,mm7 // here mm3=new src3
559
560 // - 2 .. 7 ----------------
561 _ABS_DIFF_HORIZ(48)
562 _ABS_DIFF_HORIZ(40)
563 _ABS_DIFF_HORIZ(32)
564 _ABS_DIFF_HORIZ(24)
565 _ABS_DIFF_HORIZ(16)
566 _ABS_DIFF_HORIZ(8)
567
568 // - 8 ----------------
569 movq mm5,mm1 // mm5=src1
570 add edi, imageSize
571
572 psubusb mm5,mm3 // mm5=src1-src3
573 psubusb mm3,mm1 // mm3=src3-src1
574
575 por mm3,mm5 // mm3=|src1-src3|
576 movq [edi], mm3
577 // -------------------------------------------------------------
578 //
// rewind edi to the first plane and step to the next 8-byte block
579 add edx,8
580 sub edi, back_step2
581 add edi,8
582 dec ecx
583 jnz inner_loop
584 emms
585 }
586
587 return 1;
588}
589
590
591// ----------------------
592// FULL IMAGE, BEST ONLY : Keith's code
//
// For each of the 'bytecount' bytes (compared as unsigned values):
//   if CurrentCorrelation[i] <= BestCorrelation[i]:
//       BestCorrelation[i] = CurrentCorrelation[i]
//       Disparity[i]       = CurrentDisparity
// Works on 8 bytes per pass; a block in which no byte qualifies is skipped
// without any store.
// Returns 0 without doing anything unless bytecount is a positive multiple
// of 8, and 1 otherwise.
593inline int findMinimumCorrelation_mmx(
594 const unsigned char *CurrentCorrelation,
595 unsigned char CurrentDisparity,
596 unsigned char *Disparity,
597 unsigned char *BestCorrelation, int bytecount)
598{
599 if ((bytecount < 8) || ((bytecount % 8) != 0)) {
600 return 0;
601 }
602
603 __asm {
604 // load ecx with the pixelblock count = bytecount / 8
605 mov ecx, bytecount
606 shr ecx, 3
607
608 // setup mm0 with 8 copies of the disparity constant
609 mov al, CurrentDisparity
610 mov ah, al
611 mov bx, ax
612 shl eax, 16
613 mov ax, bx
614 movd mm0, eax
615 movd mm1, eax
616 punpckldq mm0, mm1
617
618 // setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
// (pcmpgtb compares signed bytes; flipping the top bit of both operands
// makes it order them as unsigned values)
619 mov eax, 0x80808080
620 movd mm1, eax
621 movd mm2, eax
622 punpckldq mm1, mm2
623
624
625 // setup the image pointers
626 mov eax, BestCorrelation
627 mov esi, CurrentCorrelation
628 mov edi, Disparity
629
630 pixel_loop:
631 movq mm2, [esi] // current correlation
632 movq mm3, [eax] // best correlation
633
634 // check for updates
635 movq mm5, mm2 // copy the current correlation
636 pxor mm5, mm1 // convert from unsigned range to signed range
637
638 movq mm6, mm3 // copy the best correlation
639 pxor mm6, mm1 // convert from unsigned range to signed range
640
641 pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
642 // 1 indicates current > best, so keep best
643 // 0 indicates current <= best, so use new value
644
645 // BYPASS
646 // this phase adds 8 additional instructions, but could skip 2 writes and 1 read
647 // abort remainder if not updating best correlation
648 pcmpeqb mm6, mm6 // mm6 = all ones
649 pxor mm6, mm5 // mm6 = mm5 xor all ones = not mm5
650 // 0 indicates current > best, so keep best
651 // 1 indicates current <= best, so use new value
652
653 packsswb mm6, mm6 // pack it into the lower dword of mm6 (signed saturation)
654 // a non-zero word always packs to a non-zero byte:
655 // any byte of the mask set => some replaced
656 // (result is non-zero)
657 // 00000000 00000000 => 00000000 no replacements
658
659 // we don't need to backup ebx because its not used in this routine
660 // movd mm7, ebx // make a backup of eax
661 movd ebx, mm6 // get the saturated mask
662 test ebx, ebx // test ebx => yields 0 iff no substitutions will occur
663 // movd ebx, mm7 // restore ebx
664 jz bypass // nothing to update in this block
665
666
667 // Update best Correlation
668 movq mm6, mm5 // mm6 := mask
669 movq mm7, mm5 // mm7 := mask
670
671 pand mm6, mm3 // best correlation values to keep
672 pandn mm7, mm2 // current correlation value to move to best correlation
673
674 por mm6, mm7 // merge values
675 movq [eax], mm6 // store values
676
677 // update disparity
678 movq mm2, [edi] // get disparity map
679 movq mm6, mm5 // mm6 := mask
680
681 pand mm5, mm2 // select disparity map values to keep
682 pandn mm6, mm0 // select current disparity values to move to disparity map
683
684 por mm5, mm6 // merge values
685 movq [edi], mm5 // store values
686
687 bypass:
688 add eax, 8
689 add esi, 8
690 add edi, 8
691
692 dec ecx
693 jnz pixel_loop
694
695
696 emms;
697 }
698
699 return 1;
700}
701
702/*int initMinimumCorrelation(
703 const unsigned char *CurrentCorrelation,
704 unsigned char disparityInit,
705 unsigned char *Disparity,
706 unsigned char *BestCorrelation,
707 unsigned char *SecondCorrelation,
708 int bytecount)
709{
710 for (int i=0; i<bytecount; ++i)
711 {
712 BestCorrelation[i]=255;
713 SecondCorrelation[i]=255;
714 Disparity[i]=0;
715 }
716 return 0;
717}*/
718
719inline int initMinimumCorrelation(
720 const unsigned char *CurrentCorrelation,
721 unsigned char disparityInit,
722 unsigned char *Disparity,
723 unsigned char *BestCorrelation,
724 unsigned char *SecondCorrelation,
725 int bytecount)
726{
727 if ((bytecount < 8) || ((bytecount % 8) != 0)) {
728 return 0;
729 }
730
731 __asm {
732
733 // setup mm0 with 8 copies of the disparity constant
734 mov al, disparityInit
735 mov ah, al
736 mov bx, ax
737 shl eax, 16
738 mov ax, bx
739 movd mm0, eax
740 movd mm1, eax
741 punpckldq mm0, mm1
742
743 // load ecx with the pixelblock count = bytecount / 8
744 mov ecx, bytecount
745 shr ecx, 3
746
747 mov eax, BestCorrelation
748 mov ebx, SecondCorrelation
749 mov esi, CurrentCorrelation
750 mov edx, Disparity
751
752 pixel_loop:
753 movq mm1, [esi]
754 movq [eax], mm1 // Best = Current
755 movq [ebx], mm1 // Second = Current
756 movq [edx], mm0 // Disparity = disparityInit
757
758 add eax, 8
759 add ebx, 8
760 add edx, 8
761 add esi, 8
762 dec ecx
763 jnz pixel_loop
764
765 jmp done
766
767
768 done:
769 emms;
770 }
771}
772
// Scalar (non-MMX) update of the best / second-best correlation maps with a
// new correlation plane. For each of the 'bytecount' bytes:
//   - if the current value is strictly lower than the best one, the old best
//     becomes the second best, the current value becomes the best and
//     Disparity[i] is set to CurrentDisparity;
//   - otherwise, if the current value is lower than the second best, it
//     replaces the second best (so that SecondCorrelation really tracks the
//     second-lowest value seen, as the MMX version's direct update does).
// Always returns 1.
inline int findMinimumCorrelation(
    const unsigned char *CurrentCorrelation,
    unsigned char CurrentDisparity,
    unsigned char *Disparity,
    unsigned char *BestCorrelation,
    unsigned char *SecondCorrelation,
    int bytecount)
{
    for (int i = 0; i < bytecount; ++i) {
        const unsigned char current = CurrentCorrelation[i];

        if (current < BestCorrelation[i]) {
            // new best: push the previous best down to second place
            Disparity[i] = CurrentDisparity;
            SecondCorrelation[i] = BestCorrelation[i];
            BestCorrelation[i] = current;
        } else if (current < SecondCorrelation[i]) {
            // not a new best, but better than the recorded runner-up
            SecondCorrelation[i] = current;
        }
    }
    return 1;
}
791
792// ----------------------
793// FULL IMAGE, BEST+SECOND .. Keith's code
//
// For each block of 8 bytes (values compared as unsigned):
//   1. where current <= second: second = current (kept in mm4);
//      if no byte of the block qualifies, the whole block is skipped (bypass1)
//   2. where current <= best: second = old best, best = current,
//      Disparity = CurrentDisparity;
//      if no byte qualifies, only the second map from step 1 is stored (bypass2)
// NOTE(review): skipping step 2 in bypass1 is only equivalent if
// SecondCorrelation >= BestCorrelation holds on entry - presumably guaranteed
// by initMinimumCorrelation; confirm.
// Returns 0 without doing anything unless bytecount is a positive multiple
// of 8, and 1 otherwise.
794inline int findMinimumCorrelation_mmx(
795 const unsigned char *CurrentCorrelation,
796 unsigned char CurrentDisparity,
797 unsigned char *Disparity,
798 unsigned char *BestCorrelation,
799 unsigned char *SecondCorrelation,
800 int bytecount)
801{
802 if ((bytecount < 8) || ((bytecount % 8) != 0)) {
803 return 0;
804 }
805
806 __asm {
807 // load ecx with the pixelblock count = bytecount / 8
808 mov ecx, bytecount
809 shr ecx, 3
810
811 // setup mm0 with 8 copies of the disparity constant
812 mov al, CurrentDisparity
813 mov ah, al
814 mov bx, ax
815 shl eax, 16
816 mov ax, bx
817 movd mm0, eax
818 movd mm1, eax
819 punpckldq mm0, mm1
820
821 // setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
822 mov eax, 0x80808080
823 movd mm1, eax
824 movd mm2, eax
825 punpckldq mm1, mm2
826
827
828 // setup the image pointers
829 mov eax, BestCorrelation
830 mov ebx, SecondCorrelation
831 mov esi, CurrentCorrelation
832 mov edi, Disparity
833
834 pixel_loop:
835 movq mm2, [esi] // current correlation
836 movq mm4, [ebx] // second correlation
837
838 // convert the current correlation from unsigned range to signed range
839 movq mm5, mm2 // copy the current correlation
840 pxor mm5, mm1 // convert from unsigned range to signed range
841 movq mm7, mm5 // copy converted to mm7
842
843
844 // check for second correlation updates
845 movq mm6, mm4 // copy second best correlation
846 pxor mm6, mm1 // convert from unsigned range to signed range
847
848 pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask
849
850 // BYPASS 1
851 // skip remainder if second correlation is not to be updated
852 // this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes
853 pcmpeqb mm6, mm6 // mm6 = all ones
854 pxor mm6, mm7 // mm6 = mm7 xor all ones = not mm7
855 // 0 indicates current > second, so keep old value
856 // 1 indicates current <= second, so use new value
857
858
859 packsswb mm6, mm6 // pack it into the lower dword of mm6 (signed saturation)
860 // a non-zero word always packs to a non-zero byte:
861 // any byte of the mask set => some replaced
862 // (result is non-zero)
863 // 00000000 00000000 => 00000000 no replacements
864
865 // don't need to backup edx because its not used in this routine
866 // movd mm3, edx // make a backup of edx
867 movd edx, mm6 // get the saturated mask
868 test edx, edx // test edx => yields 0 iff no replacements will occur
869 // movd edx, mm3 // restore edx
870 jz bypass1
871
872
873 // direct update second correlation (get values from current)
874 // mm7 already has mask
875// movq mm6, mm7 // mm6 := mask
876// pand mm6, mm4 // second correlation values to keep
877// pandn mm7, mm2 // current correlation values to move to second correlation
878// por mm6, mm7 // merge value => direct updated second correlation
879// movq mm4, mm6 // store values (*** this instruction could be eliminated!)
880
881 pand mm4, mm7 // second correlation values to keep
882 pandn mm7, mm2 // current correlation values to move to second correlation
883 por mm4, mm7 // merge value => direct updated second correlation
884
885
886 // check for best correlation updates
887 movq mm3, [eax] // best correlation
888 // mm5 has converted current correlation
889 movq mm6, mm3 // copy the best correlation
890 pxor mm6, mm1 // convert from unsigned range to signed range
891
892 pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
893 // 1 indicates current > best, so keep best
894 // 0 indicates current <= best, so use new value
895 // BYPASS 2
896 // this phase adds 8 additional instructions, but could skip 2 writes and 1 read
897 // abort remainder if not updating best correlation
898 pcmpeqb mm6, mm6 // mm6 = all ones
899 pxor mm6, mm5 // mm6 = mm5 xor all ones = not mm5
900 // 0 indicates current > best, so keep best
901 // 1 indicates current <= best, so use new value
902
903 packsswb mm6, mm6 // pack it into the lower dword of mm6 (signed saturation)
904 // a non-zero word always packs to a non-zero byte:
905 // any byte of the mask set => some replaced
906 // (result is non-zero)
907 // 00000000 00000000 => 00000000 no replacements
908
909 // don't need to backup edx because its not used in this routine
910 // movd mm7, edx // make a backup of edx
911 movd edx, mm6 // get the saturated mask
912 test edx, edx // test edx => yields 0 iff no substitutions will occur
913 // movd edx, mm7 // restore edx
914 jz bypass2 // store mm4 (second correlation) to [ebx]
915
916
917 // indirect update second correlation (pushed down from best)
918 movq mm6, mm5 // mm6 := mask
919 movq mm7, mm5 // mm7 := mask
920
921 pand mm6, mm4 // second correlation values to keep
922 pandn mm7, mm3 // best correlations to move to second correlation
923
924 por mm6, mm7 // merge values
925 movq [ebx], mm6 // store values
926
927 // direct Update best Correlation
928 movq mm6, mm5 // mm6 := mask
929 movq mm7, mm5 // mm7 := mask
930
931 pand mm6, mm3 // best correlation values to keep
932 pandn mm7, mm2 // current correlation value to move to best correlation
933
934 por mm6, mm7 // merge values
935 movq [eax], mm6 // store values
936
937 // update disparity
938 movq mm2, [edi] // get disparity map
939 movq mm6, mm5 // mm6 := mask
940
941 pand mm5, mm2 // select disparity map values to keep
942 pandn mm6, mm0 // select current disparity values to move to disparity map
943
944 por mm5, mm6 // merge values
945 movq [edi], mm5 // store values
946
947
948 bypass1:
949 next_pixel:
950 add eax, 8
951 add ebx, 8
952 add esi, 8
953 add edi, 8
954
955 dec ecx
956 jnz pixel_loop
957
958 jmp done
959
960 bypass2:
// best unchanged in this block: only the directly updated second map is written
961 movq [ebx], mm4;
962 jmp next_pixel
963
964 done:
965 emms;
966 }
967
968 return 1;
969}
970
971
972
973inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
974{
975 im += maskSize/2;
976 im_out += maskSize/2;
977 for (int i=0; i<rowSize; ++i) {
978 int s=0;
979 for (int j=-maskSize/2; j<=maskSize/2; ++j) {
980 s+=*(im+j);
981 }
982 *im_out=s/maskSize;
983 ++im;++im_out;
984 }
985}
986
987inline void sum_Row_mmx(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
988{
989 sum_Row_5_mmx(im, im_out, rowSize);
990 for (int i=0; i<(maskSize-5)/2; ++i)
991 sum_Row_5_mmx(im_out, im_out, rowSize);
992}
993
994inline void sum_Row_mmx(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize)
995{
996 sum_Row_5_mmx(im, im_out, rowSize);
997 for (int i=0; i<(maskSize-5)/2; ++i)
998 sum_Row_5_mmx(im_out, im_out, rowSize);
999}
1000
// aim_Sum_Words_In_MM1: running sums of the four 16-bit words held in mm1.
// With mm1 = [D C B A] on entry (A = lowest word) it produces
//   mm4 = [D        C+D    B+C+D  A+B+C+D]   (sums towards the high words)
//   mm3 = [A+B+C+D  A+B+C  A+B    A      ]   (sums towards the low words)
// by adding three successive 16-bit right shifts (mm2) resp. left shifts
// (mm1) of the input. mm1 and mm2 are clobbered.
1001#define aim_Sum_Words_In_MM1 __asm \
1002{ \
1003 __asm movq mm4, mm1 \
1004 __asm movq mm2, mm1 \
1005\
1006 __asm movq mm3, mm1 \
1007 __asm psllq mm1, 16 \
1008\
1009 __asm psrlq mm2, 16 \
1010 __asm paddw mm4, mm2 \
1011\
1012 __asm paddw mm3, mm1 \
1013 __asm psrlq mm2, 16 \
1014\
1015 __asm psllq mm1, 16 \
1016 __asm paddw mm4, mm2 \
1017\
1018 __asm psrlq mm2, 16 \
1019 __asm paddw mm3, mm1 \
1020\
1021 __asm psllq mm1, 16 \
1022 __asm paddw mm4, mm2 \
1023\
1024 __asm paddw mm3, mm1 \
1025}
1026
1027
1028
1029
1030
1031// apply the mask [1 1 1 1 1] to the 1-D array im (bytes)
1032// output : im_out (words)
//
// The first 8 input bytes produce a single stored quadword whose two low
// words are zero; every further pass of the loop consumes 8 input bytes and
// stores 8 output words (two quadwords). rowSize is decremented by 8 per
// 8 input bytes.
// NOTE(review): the loop only exits when the counter reaches exactly zero
// (test eax,eax / jz), so rowSize is presumably required to be 0 or a
// positive multiple of 8 - confirm against the callers.
1033inline void sum_Row_5_mmx(uchar* im, unsigned short* im_out, int rowSize)
1034{
1035 // temp: for debugging
1036 //return sum_Row_5(im,im_out,rowSize);
1037 __asm {
1038
1039 mov eax, rowSize
1040 mov ebx, im
1041 mov ecx, im_out
1042
1043 pxor mm6, mm6 // mm6 = x00000000
1044
1045 //Process the first quad word, but save only the second result
1046 test eax, eax // Is there anything to do?
1047 jz end_sum_loop // Jump out if necessary
1048
1049
1050 //Process low word
1051 movq mm1, [ebx] // Copy...
1052 punpcklbw mm1, mm6 // Expand low word bytes into words // mm1 =[D C B A]
1053
1054 aim_Sum_Words_In_MM1
1055
1056 //Store the result Only in the accumulator
1057 movq mm7, mm4 // Update accumulator mm4=[D C+D B+C+D A+B+C+D]
1058
1059 //Process high word
1060 movq mm1, [ebx] // Copy...
1061 punpckhbw mm1, mm6 // Expand high word bytes into words // mm1 =[H G F E]
1062 add ebx, 8 // Update input pointer
1063
1064 aim_Sum_Words_In_MM1
1065
1066 //Add to the previous data ...
1067 // mm3=[E+F+G+H E+F+G E+F E]
1068 // mm4=[H G+H F+G+H E+F+G+H]
1069 paddw mm7, mm3 // The current word of the accum // mm7=[D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
1070
1071 // translate everything to 2 words on the left
1072 movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
1073 psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
1074
1075 movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G]
1076
1077 psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
1078
1079 movq [ecx], mm7 // Store the final result
1080 add ecx, 8 // Update output pointer
1081
1082 movq mm7, mm4 // Update accumulator mm4=[H G+H F+G+H E+F+G+H]
1083 sub eax, 8 // Update the number of points left
1084
1085 // Start the loop
1086 row_sum_loop:
1087 test eax, eax // Is there anything to do?
1088 jz end_sum_loop // Jump out if necessary
1089
1090 movq mm1, [ebx] // Load data
1091
1092 //Process low word
1093 punpcklbw mm1, mm6 // Expand low word bytes into words
1094
1095 aim_Sum_Words_In_MM1
1096
1097 //Add to the previous data
1098 //prefetcht1 [ecx+16]
1099 paddw mm7, mm3 // The current word of the accum
1100
1101 // translate everything to 2 words on the left
1102 // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7=[0 0 H G] [ecx]=[F E D C]
1103 punpckldq mm0, mm7 // mm0 = [F E D C]
1104
1105 movq [ecx], mm0
1106 sub eax, 8 // Update the number of points left
1107
1108 movq mm0, mm4 // Update accumulator
1109 psrlq mm7, 32 // mm7 = [0 0 H G]
1110
1111 //Process high word
1112 movq mm1, [ebx] // Copy...
1113 punpckhbw mm1, mm6 // Expand high word bytes into words
1114
1115 aim_Sum_Words_In_MM1
1116
1117 //Add to the previous data
1118 paddw mm0, mm3 // The current word of the accum
1119
1120 // translate everything to 2 words on the left
1121 // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
1122 punpckldq mm7, mm0 // mm7 = [F E D C]
1123 add ebx, 8 // Update input pointer
1124
1125 movq [ecx+8], mm7
1126 psrlq mm0, 32 // mm0 = [0 0 H G]
1127
1128 movq mm7, mm4 // Update accumulator
1129 add ecx, 16 // Update output pointer
1130
1131 jmp row_sum_loop // Loop
1132
1133 //Cleanup
1134 end_sum_loop:
1135 emms
1136 }
1137}
1138
1139
1140
1141// apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words)
1142// output : im_out (words)
1143inline void sum_Row_5_mmx(ushort* im, ushort* im_out, int rowSize)
1144{
1145 // temp: for debugging
1146 //return sum_Row_5(im,im_out,rowSize);
1147 __asm {
1148
1149 mov eax, rowSize
1150 mov ebx, im
1151 mov ecx, im_out
1152
1153 //Process the first quad word, but save only the second result"
1154 test eax, eax // Is there anything to do?"
1155 jz end_sum_loop // Jump out if necessary
1156
1157 movq mm1, [ebx] // Load data (4 words)
1158 add ebx, 8 // Update input pointer
1159
1160 //Process low word
1161 aim_Sum_Words_In_MM1
1162
1163 //Store the result Only in the accumulator
1164 movq mm7, mm4 // Update accumulator
1165
1166 //Process high word
1167 movq mm1, [ebx] // Copy...
1168
1169 aim_Sum_Words_In_MM1
1170 add ebx, 8
1171
1172 //Add to the previous data
1173 paddw mm7, mm3 // The current word of the accum
1174
1175 // translate everything to 2 words on the left
1176 movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
1177 psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
1178 movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G]
1179 psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
1180
1181 movq [ecx], mm7 // Store the final result
1182 movq mm7, mm4 // Update accumulator
1183
1184 add ecx, 8 // Update output pointer
1185 sub eax, 8 // Update the number of points left
1186
1187 // Start the loop
1188 row_sum_loop:
1189 test eax, eax // Is there anything to do?
1190 jz end_sum_loop // Jump out if necessary
1191
1192 movq mm1, [ebx] // Load data
1193
1194 aim_Sum_Words_In_MM1
1195
1196 //Add to the previous data
1197 //prefetcht0 [ecx + 32]
1198 //prefetcht0 [ebx + 48]
1199 paddw mm7, mm3 // The current word of the accum
1200 psrlw mm7, 2 // divide result by ...
1201
1202 // translate everything to 2 words on the left
1203 // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C]
1204 punpckldq mm0, mm7 // mm0 = [F E D C]
1205
1206 movq [ecx], mm0
1207 sub eax, 8 // Update the number of points left
1208
1209 movq mm0, mm4 // Update accumulator
1210 psrlq mm7, 32 // mm7 =[0 0 H G]
1211
1212 //Process high word
1213 movq mm1, [ebx+8] // Copy...
1214
1215 aim_Sum_Words_In_MM1
1216
1217 //Add to the previous data
1218 paddw mm0, mm3 // The current word of the accum
1219 psrlw mm0, 2 // divide result by ...
1220
1221 // translate everything to 2 words on the left
1222 // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
1223 punpckldq mm7, mm0 // mm7 = [F E D C]
1224 add ebx, 16 // Update input pointer
1225
1226 movq [ecx+8], mm7
1227 psrlq mm0, 32 // mm0 = [0 0 H G]
1228
1229 movq mm7, mm4 // Update accumulator
1230 add ecx, 16 // Update output pointer */
1231
1232 jmp row_sum_loop // Loop
1233
1234 //Cleanup
1235 end_sum_loop:
1236 emms
1237 }
1238}
1239
1240template<class T> void sum_Row_5(T* im, ushort* im_out, int rowSize)
1241{
1242 im += 2;
1243 im_out +=2;
1244 int s = 0;
1245 for (int i=0; i<rowSize-5; ++i, ++im, ++im_out) {
1246 s = *(im-2);
1247 s += *(im-1);
1248 s += *(im);
1249 s += *(im+1);
1250 s += *(im+2);
1251 *im_out = s/5;
1252 }
1253}
1254
1255inline void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
1256{
1257 int offset = width*(sizeMask/2);
1258 im += offset;
1259 im_out += offset;
1260 for (int i=0; i<dataSize-width*sizeMask; ++i, ++im, ++im_out) {
1261 int s = 0;
1262 for (int j=-sizeMask/2; j<=sizeMask/2; ++j) s += *(im+j*width);
1263 *im_out = s/(sizeMask);
1264 }
1265}
1266
1267// apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
1268// result in 'im_out'
1269inline void avg_Col_mmx(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
1270{
1271 // temp: for debugging
1272 //return avg_Col(im,im_out,dataSize,width,sizeMask);
1273
1274 switch (sizeMask)
1275 {
1276 case 5: avg_Col_5(im,im_out,dataSize,width);
1277 break;
1278 case 7: avg_Col_7(im,im_out,dataSize,width);
1279 break;
1280 case 9: avg_Col_9(im,im_out,dataSize,width);
1281 break;
1282 case 11: avg_Col_11(im,im_out,dataSize,width);
1283 break;
1284 case 13: avg_Col_13(im,im_out,dataSize,width);
1285 break;
1286 case 15: avg_Col_15(im,im_out,dataSize,width);
1287 break;
1288 case 17: avg_Col_17(im,im_out,dataSize,width);
1289 break;
1290
1291 default: if (sizeMask<5) avg_Col_5(im,im_out,dataSize,width);
1292 else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
1293 break;
1294
1295 }
1296}
1297
1298
1299
/* macro_add: accumulate one more image row into the running column sums.
   Adds (unsigned, saturating) the 8 words at [edx] to mm3 (words 0..3) and
   mm2 (words 4..7), then advances edx by edi. Intended for use inside an
   __asm block; the callers in this file load edi with the row stride in bytes. */
#define macro_add __asm             \
{                                   \
    __asm paddusw mm2, [edx+8]      \
    __asm paddusw mm3, [edx]        \
    __asm add     edx, edi          \
}
1306
1307
1308inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width)
1309{
1310 __asm {
1311
1312 mov edi, width
1313 shl edi, 1 // edi = 2*width
1314
1315 mov eax, dataSize
1316 mov ecx, im_out
1317
1318 mov ebx, im
1319 sub ebx, edi
1320 sub ebx, edi // ebx = ebx-4*width
1321
1322 test eax, eax // Is there anything to do?"
1323 jz end_sum_loop // Jump out if necessary
1324
1325 row_sum_loop:
1326
1327 test eax, eax // Is there anything to do?
1328 jz end_sum_loop // Jump out if necessary
1329
1330 mov edx, ebx
1331 add ebx, 16
1332
1333 // 1
1334 movq mm3, [edx] // mm3 = 4 words of im
1335 movq mm2, [edx+8] // mm2 = next 4 words of im
1336 add edx, edi
1337
1338 macro_add
1339 macro_add
1340 macro_add
1341 macro_add
1342
1343 // divide results by ...
1344 psrlw mm3, 3
1345 psrlw mm2, 3
1346
1347 // convert [mm2 mm3] as 8 bytes
1348 packuswb mm3,mm2
1349 movq [ecx], mm3
1350
1351 sub eax, 8 // Update the number of points left
1352 add ecx, 8 // Update output pointer
1353
1354 jmp row_sum_loop // Loop
1355
1356 //Cleanup
1357 end_sum_loop:
1358 emms
1359 }
1360}
1361
1362inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width)
1363{
1364 __asm {
1365
1366 mov edi, width
1367 shl edi, 1 // edi = 2*width
1368
1369 mov eax, dataSize
1370 mov ecx, im_out
1371
1372 mov ebx, im
1373 sub ebx, edi
1374 sub ebx, edi
1375 sub ebx, edi // ebx = ebx-6*width
1376
1377 test eax, eax // Is there anything to do?"
1378 jz end_sum_loop // Jump out if necessary
1379
1380 row_sum_loop:
1381
1382 test eax, eax // Is there anything to do?
1383 jz end_sum_loop // Jump out if necessary
1384
1385 mov edx, ebx
1386
1387 // 1
1388 movq mm3, [edx] // mm3 = 4 words of im
1389 add ebx, 16
1390 movq mm2, [edx+8] // mm2 = next 4 words of im
1391 add edx, edi
1392
1393 macro_add
1394 macro_add
1395 macro_add
1396 macro_add
1397 macro_add
1398 macro_add
1399
1400 // divide results by ...
1401 psrlw mm3, 3
1402 psrlw mm2, 3
1403
1404 // convert [mm2 mm3] as 8 bytes
1405 packuswb mm3,mm2
1406 movq [ecx], mm3
1407
1408 sub eax, 8 // Update the number of points left
1409 add ecx, 8 // Update output pointer
1410
1411 jmp row_sum_loop // Loop
1412
1413 //Cleanup
1414 end_sum_loop:
1415 emms
1416 }
1417}
1418
1419inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width)
1420{
1421 __asm {
1422
1423 mov edi, width
1424 shl edi, 1 // edi = 2*width
1425
1426 mov eax, dataSize
1427 mov ecx, im_out
1428
1429 mov ebx, im
1430 sub ebx, edi
1431 sub ebx, edi
1432 sub ebx, edi
1433 sub ebx, edi // ebx = ebx-8*width
1434
1435 test eax, eax // Is there anything to do?"
1436 jz end_sum_loop // Jump out if necessary
1437
1438 row_sum_loop:
1439
1440 test eax, eax // Is there anything to do?
1441 jz end_sum_loop // Jump out if necessary
1442
1443 mov edx, ebx
1444 add ebx, 16
1445
1446 // 1
1447 movq mm3, [edx] // mm3 = 4 words of im
1448 movq mm2, [edx+8] // mm2 = next 4 words of im
1449 add edx, edi
1450
1451 macro_add
1452 macro_add
1453 macro_add
1454 macro_add
1455 macro_add
1456 macro_add
1457 macro_add
1458 macro_add
1459
1460 // divide results by ...
1461 psrlw mm3, 3
1462 psrlw mm2, 3
1463
1464 // convert [mm2 mm3] as 8 bytes
1465 packuswb mm3,mm2
1466 movq [ecx], mm3
1467
1468 sub eax, 8 // Update the number of points left
1469 add ecx, 8 // Update output pointer
1470
1471 jmp row_sum_loop // Loop
1472
1473 //Cleanup
1474 end_sum_loop:
1475 emms
1476 }
1477}
1478
1479inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width)
1480{
1481 __asm {
1482
1483 mov edi, width
1484 shl edi, 1 // edi = 2*width
1485
1486 mov eax, dataSize
1487 mov ecx, im_out
1488
1489 mov ebx, im
1490 sub ebx, edi
1491 sub ebx, edi
1492 sub ebx, edi
1493 sub ebx, edi
1494 sub ebx, edi // ebx = ebx-10*width
1495
1496 test eax, eax // Is there anything to do?"
1497 jz end_sum_loop // Jump out if necessary
1498
1499 row_sum_loop:
1500
1501 test eax, eax // Is there anything to do?
1502 jz end_sum_loop // Jump out if necessary
1503
1504 mov edx, ebx
1505 add ebx, 16
1506
1507 // 1
1508 movq mm3, [edx] // mm3 = 4 words of im
1509 movq mm2, [edx+8] // mm2 = next 4 words of im
1510 add edx, edi
1511
1512 macro_add
1513 macro_add
1514 macro_add
1515 macro_add
1516 macro_add
1517 macro_add
1518 macro_add
1519 macro_add
1520 macro_add
1521 macro_add
1522
1523 // divide results by ...
1524 psrlw mm3, 4
1525 psrlw mm2, 4
1526
1527 // convert [mm2 mm3] as 8 bytes
1528 packuswb mm3,mm2
1529 movq [ecx], mm3
1530
1531 sub eax, 8 // Update the number of points left
1532 add ecx, 8 // Update output pointer
1533
1534 jmp row_sum_loop // Loop
1535
1536 //Cleanup
1537 end_sum_loop:
1538 emms
1539 }
1540}
1541
1542inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width)
1543{
1544 __asm {
1545
1546 mov edi, width
1547 shl edi, 1 // edi = 2*width
1548
1549 mov eax, dataSize
1550 mov ecx, im_out
1551
1552 mov ebx, im
1553 sub ebx, edi
1554 sub ebx, edi
1555 sub ebx, edi
1556 sub ebx, edi
1557 sub ebx, edi
1558 sub ebx, edi // ebx = ebx-12*width
1559
1560 test eax, eax // Is there anything to do?"
1561 jz end_sum_loop // Jump out if necessary
1562
1563 row_sum_loop:
1564
1565 test eax, eax // Is there anything to do?
1566 jz end_sum_loop // Jump out if necessary
1567
1568 mov edx, ebx
1569 add ebx, 16
1570
1571 // 1
1572 movq mm3, [edx] // mm3 = 4 words of im
1573 movq mm2, [edx+8] // mm2 = next 4 words of im
1574 add edx, edi
1575
1576 macro_add
1577 macro_add
1578 macro_add
1579 macro_add
1580 macro_add
1581 macro_add
1582 macro_add
1583 macro_add
1584 macro_add
1585 macro_add
1586 macro_add
1587 macro_add
1588
1589 // divide results by ...
1590 psrlw mm3, 4
1591 psrlw mm2, 4
1592
1593 // convert [mm2 mm3] as 8 bytes
1594 packuswb mm3,mm2
1595 movq [ecx], mm3
1596
1597 sub eax, 8 // Update the number of points left
1598 add ecx, 8 // Update output pointer
1599
1600 jmp row_sum_loop // Loop
1601
1602 //Cleanup
1603 end_sum_loop:
1604 emms
1605 }
1606}
1607
1608inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width)
1609{
1610 __asm {
1611
1612 mov edi, width
1613 shl edi, 1 // edi = 2*width
1614
1615 mov eax, dataSize
1616 mov ecx, im_out
1617
1618 mov ebx, im
1619 sub ebx, edi
1620 sub ebx, edi
1621 sub ebx, edi
1622 sub ebx, edi
1623 sub ebx, edi
1624 sub ebx, edi
1625 sub ebx, edi // ebx = ebx-14*width
1626
1627 test eax, eax // Is there anything to do?"
1628 jz end_sum_loop // Jump out if necessary
1629
1630 row_sum_loop:
1631
1632 test eax, eax // Is there anything to do?
1633 jz end_sum_loop // Jump out if necessary
1634
1635 mov edx, ebx
1636 add ebx, 16
1637
1638 // 1
1639 movq mm3, [edx] // mm3 = 4 words of im
1640 movq mm2, [edx+8] // mm2 = next 4 words of im
1641 add edx, edi
1642
1643 macro_add
1644 macro_add
1645 macro_add
1646 macro_add
1647 macro_add
1648 macro_add
1649 macro_add
1650 macro_add
1651 macro_add
1652 macro_add
1653 macro_add
1654 macro_add
1655 macro_add
1656 macro_add
1657
1658 // divide results by ...
1659 psrlw mm3, 4
1660 psrlw mm2, 4
1661
1662 // convert [mm2 mm3] as 8 bytes
1663 packuswb mm3,mm2
1664 movq [ecx], mm3
1665
1666 sub eax, 8 // Update the number of points left
1667 add ecx, 8 // Update output pointer
1668
1669 jmp row_sum_loop // Loop
1670
1671 //Cleanup
1672 end_sum_loop:
1673 emms
1674 }
1675}
1676
1677inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width)
1678{
1679 __asm {
1680
1681 mov edi, width
1682 shl edi, 1 // edi = 2*width
1683
1684 mov eax, dataSize
1685 mov ecx, im_out
1686
1687 mov ebx, im
1688 sub ebx, edi
1689 sub ebx, edi
1690 sub ebx, edi
1691 sub ebx, edi
1692 sub ebx, edi
1693 sub ebx, edi
1694 sub ebx, edi
1695 sub ebx, edi // ebx = ebx-16*width
1696
1697 test eax, eax // Is there anything to do?"
1698 jz end_sum_loop // Jump out if necessary
1699
1700 row_sum_loop:
1701
1702 test eax, eax // Is there anything to do?
1703 jz end_sum_loop // Jump out if necessary
1704
1705 mov edx, ebx
1706 add ebx, 16
1707
1708 // 1
1709 movq mm3, [edx] // mm3 = 4 words of im
1710 movq mm2, [edx+8] // mm2 = next 4 words of im
1711 add edx, edi
1712
1713 macro_add
1714 macro_add
1715 macro_add
1716 macro_add
1717 macro_add
1718 macro_add
1719 macro_add
1720 macro_add
1721 macro_add
1722 macro_add
1723 macro_add
1724 macro_add
1725 macro_add
1726 macro_add
1727 macro_add
1728 macro_add
1729
1730 // divide results by ...
1731 psrlw mm3, 4
1732 psrlw mm2, 4
1733
1734 // convert [mm2 mm3] as 8 bytes
1735 packuswb mm3,mm2
1736 movq [ecx], mm3
1737
1738 sub eax, 8 // Update the number of points left
1739 add ecx, 8 // Update output pointer
1740
1741 jmp row_sum_loop // Loop
1742
1743 //Cleanup
1744 end_sum_loop:
1745 emms
1746 }
1747}
1748
1749
1750inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width)
1751{
1752 __asm {
1753
1754 mov edi, width
1755 shl edi, 1 // edi = 2*width
1756
1757 mov eax, dataSize
1758 mov ecx, im_out
1759
1760 mov ebx, im
1761 sub ebx, edi
1762 sub ebx, edi // ebx = ebx-4*width
1763
1764 test eax, eax // Is there anything to do?"
1765 jz end_sum_loop // Jump out if necessary
1766
1767 row_sum_loop:
1768
1769 test eax, eax // Is there anything to do?
1770 jz end_sum_loop // Jump out if necessary
1771
1772 mov edx, ebx
1773 add ebx, 16
1774
1775 // 1
1776 movq mm3, [edx] // mm3 = 4 words of im
1777 movq mm2, [edx+8] // mm2 = next 4 words of im
1778 add edx, edi
1779
1780 macro_add
1781 macro_add
1782 macro_add
1783 macro_add
1784
1785 // save [mm2 mm3] as 8 bytes
1786 packuswb mm3,mm2
1787 movq [ecx], mm3
1788
1789 sub eax, 8 // Update the number of points left
1790 add ecx, 8 // Update output pointer
1791
1792 jmp row_sum_loop // Loop
1793
1794 //Cleanup
1795 end_sum_loop:
1796 emms
1797 }
1798}
1799
1800inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width)
1801{
1802 __asm {
1803
1804 mov edi, width
1805 shl edi, 1 // edi = 2*width
1806
1807 mov eax, dataSize
1808 mov ecx, im_out
1809
1810 mov ebx, im
1811 sub ebx, edi
1812 sub ebx, edi // ebx = ebx-4*width
1813
1814 test eax, eax // Is there anything to do?"
1815 jz end_sum_loop // Jump out if necessary
1816
1817 row_sum_loop:
1818
1819 test eax, eax // Is there anything to do?
1820 jz end_sum_loop // Jump out if necessary
1821
1822 mov edx, ebx
1823 add ebx, 16
1824
1825 // 1
1826 movq mm3, [edx] // mm3 = 4 words of im
1827 movq mm2, [edx+8] // mm2 = next 4 words of im
1828 add edx, edi
1829
1830 macro_add
1831 macro_add
1832 macro_add
1833 macro_add
1834
1835 // save [mm2 mm3] as words
1836 movq [ecx], mm3
1837 movq [ecx+8], mm2
1838
1839 sub eax, 8 // Update the number of points left
1840 add ecx, 16 // Update output pointer
1841
1842 jmp row_sum_loop // Loop
1843
1844 //Cleanup
1845 end_sum_loop:
1846 emms
1847 }
1848}
1849
1850// compare bestScores and secondScores. if second<best+'thresh' the disp.
1851// is set to 'valForReplacement' (usually 0)
1852inline void compareBestAndSecond(uchar* bestScores, uchar* secondScores, char thresh,
1853 uchar undefined_val,
1854 uchar* disp, int dataSize)
1855{
1856 __asm {
1857
1858 // setup mm0 with 8 copies of 'thresh'
1859 mov al, thresh
1860 mov ah, al
1861 mov bx, ax
1862 shl eax, 16
1863 mov ax, bx
1864 movd mm0, eax
1865 movd mm1, eax
1866 punpckldq mm0, mm1
1867
1868 // setup mm7 with 8 copies of 'valForReplacement'
1869 mov al, undefined_val
1870 mov ah, al
1871 mov bx, ax
1872 shl eax, 16
1873 mov ax, bx
1874 movd mm7, eax
1875 movd mm1, eax
1876 punpckldq mm7, mm1
1877
1878 mov eax, dataSize
1879 mov ebx, bestScores
1880 mov ecx, secondScores
1881 mov edx, disp
1882
1883 test eax, eax // Is there anything to do?"
1884 jz end_loop // Jump out if necessary
1885
1886 comp_loop:
1887
1888 test eax, eax // Is there anything to do?
1889 jz end_loop // Jump out if necessary
1890
1891 movq mm2, [ecx]
1892 psubusb mm2, [ebx] // mm2 = secondScores - bestScores
1893
1894 movq mm3, [edx] // mm3 = disp
1895 pcmpgtb mm2, mm0 // mm2 = 1 if mm2>thresh
1896 // 0 otherwise
1897
1898 pand mm3, mm2
1899 pandn mm2, mm7
1900
1901 por mm3, mm2
1902 movq [edx], mm3
1903
1904 sub eax, 8 // Update the number of points left
1905 add ebx, 8 // Update output pointer
1906 add ecx, 8
1907 add edx, 8
1908
1909 jmp comp_loop // Loop
1910
1911 //Cleanup
1912 end_loop:
1913 emms
1914 }
1915}
1916
1917// windowWidth must be multiple of 8
1918inline void cropImage(const uchar* imSrc, int width, int height,
1919 uchar* imDest, int x0, int y0, int windowWidth, int windowHeight)
1920{
1921 int w8 = windowWidth/8;
1922
1923 int step = width-windowWidth;
1924 const uchar* srcNewOrigin = imSrc+x0+y0*width;
1925
1926 __asm {
1927
1928 mov ecx, windowHeight
1929
1930 mov edx, w8
1931 mov eax, srcNewOrigin
1932 mov ebx, imDest
1933
1934 pixel_loop:
1935
1936 movq mm1, [eax]
1937 movq [ebx], mm1
1938 add eax, 8
1939 add ebx, 8
1940
1941 dec edx
1942 jnz pixel_loop
1943
1944 mov edx, w8
1945 add eax, step
1946
1947 dec ecx
1948 jnz pixel_loop
1949
1950 jmp done
1951
1952 done:
1953 emms;
1954 }
1955}
1956
1957// return the average pixel value
1958inline float pixelMean(const uchar* im, int imageSize)
1959{
1960 int sum;
1961
1962 __asm {
1963
1964 mov ecx, imageSize
1965 shr ecx, 3
1966
1967 mov eax, im
1968 pxor mm7,mm7 // mm7 used as accumulator
1969 pxor mm0,mm0 // mm0 = 0
1970
1971 pixel_loop:
1972
1973 movq mm1, [eax]
1974 movq mm2,mm1
1975
1976 punpcklbw mm2, mm0
1977 punpckhbw mm1, mm0
1978
1979 paddw mm2,mm1
1980
1981 movq mm1,mm2
1982 punpcklwd mm2, mm0
1983 punpckhwd mm1, mm0
1984
1985 paddd mm2,mm1
1986 paddd mm7,mm2
1987
1988 add eax, 8
1989 dec ecx
1990 jnz pixel_loop
1991
1992 jmp done
1993
1994 done:
1995 movd ebx, mm7
1996 psrlq mm7, 32
1997 movd edx, mm7
1998 add ebx, edx
1999 mov sum, ebx
2000
2001 emms
2002 }
2003
2004 return sum / (float)imageSize;
2005}
2006
2007
2008
2009
2010// -------------------------------------------------------------
2011// apply mask:
2012// if mask[]=undefined_val im[]->im[]
2013// otherwise, im[]->mask[]
2014// ....... this one may not be exact :-(
2015inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
2016{
2017 __asm {
2018 // setup mm0 with 8 copies of 'undefined_val'
2019 mov al, undefined_val
2020 mov ah, al
2021 mov bx, ax
2022 shl eax, 16
2023 mov ax, bx
2024 movd mm0, eax
2025 movd mm1, eax
2026 punpckldq mm0, mm1
2027
2028 mov ecx, imageSize
2029 shr ecx, 3
2030
2031 mov eax, im
2032 mov ebx, mask
2033
2034 pixel_loop:
2035 movq mm1, [eax]
2036 movq mm2, [ebx]
2037
2038 movq mm3, mm2
2039 pcmpeqb mm3, mm0 // mm3[] -> xFF if mm2[]==undefined_val
2040 // -> x00 otherwise
2041 pand mm3, mm1 // mm3[] = mm1[] if mm2[]==undefined_val
2042 // = x00 otherwise
2043 por mm3, mm2
2044 movq [eax], mm3
2045
2046 add eax, 8
2047 add ebx, 8
2048 dec ecx
2049 jnz pixel_loop
2050
2051 jmp done
2052
2053 done:
2054 emms
2055 }
2056}
2057
2058inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
2059{
2060 for (int i=0; i<imageSize; ++i, ++im,++mask)
2061 {
2062 if (*mask != undefined_val) *im=*mask;
2063 }
2064
2065}
2066
2067
2068inline void divide( ushort* im, uchar* div, uchar* result, int imageSize)
2069{
2070 for (int i=0; i<imageSize; ++i,++im,++div,++result)
2071 {
2072 *result = (*div)?(uchar)(*im / *div):0;
2073 }
2074}
2075
2076// 5x5 sum filters
2077inline void sum_5x5_mmx( uchar* im, ushort* im_out, int dataSize, int width, ushort* buff)
2078{
2079 sum_Row_5_mmx(im, buff, dataSize);
2080 add_Col_5_ww(buff+2*width, im_out+2*width, dataSize-4*width , width);
2081}
2082
2083inline void sum_5x5_mmx( uchar* im, uchar* im_out, int dataSize, int width, ushort* buff)
2084{
2085 sum_Row_5_mmx(im, buff, dataSize);
2086 add_Col_5_wb(buff+2*width, im_out+2*width, dataSize-4*width , width);
2087}
2088
2089
2090inline void binarize(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
2091{
2092 for (int i=0; i<dataSize; ++i,++im,++im_out)
2093 {
2094 *im_out = (*im != undefined_val);
2095 }
2096}
2097
2098inline void set_undefined_to_zero(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
2099{
2100 for (int i=0; i<dataSize; ++i,++im,++im_out)
2101 {
2102 if (*im == undefined_val) *im_out=0;
2103 }
2104}
2105
2106inline void set_zero_to_undefined(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
2107{
2108 for (int i=0; i<dataSize; ++i,++im,++im_out)
2109 {
2110 if (*im == 0) *im_out=undefined_val;
2111 }
2112}
2113
2114
2115
2116inline void copyMMX(void* imDest, const void* imSrc, int dataSize)
2117{
2118 __asm {
2119
2120 mov ecx, dataSize
2121 shr ecx, 3
2122
2123 mov eax, imSrc
2124 mov ebx, imDest
2125 sub ebx, 8
2126
2127 pixel_loop:
2128 movq mm1, [eax]
2129 add ebx, 8
2130
2131 movq [ebx], mm1
2132 add eax, 8
2133
2134 dec ecx
2135 jnz pixel_loop
2136
2137 jmp done
2138
2139 done:
2140 emms
2141 }
2142}
2143
2144inline void copySSE(void* imDest, const void* imSrc, int dataSize)
2145{
2146 __asm {
2147
2148 mov ecx, dataSize
2149 shr ecx, 4
2150
2151 mov eax, imSrc
2152 mov ebx, imDest
2153 sub ebx, 16
2154
2155 pixel_loop:
2156 movdqa xmm1, [eax]
2157 add ebx, 16
2158
2159 movdqa [ebx], xmm1
2160 add eax, 16
2161
2162 dec ecx
2163 jnz pixel_loop
2164
2165 jmp done
2166
2167 done:
2168 emms
2169 }
2170}
2171
2172inline void setMMX(float* imDest, const float value, int dataSize)
2173{
2174 __asm {
2175 // make 4 copies of the constant 'value' in xmm0
2176 movss xmm0, value
2177 movss xmm1, xmm0
2178 unpcklps xmm0, xmm1
2179 movlhps xmm0, xmm0
2180
2181 mov ecx, dataSize
2182 shr ecx, 2
2183
2184 mov ebx, imDest
2185
2186 pixel_loop:
2187 movaps [ebx], xmm0
2188 add ebx, 16
2189
2190 dec ecx
2191 jnz pixel_loop
2192
2193 jmp done
2194
2195 done:
2196 emms
2197 }
2198}
2199
2200inline void setMMX(char* imDest, const char value, int dataSize)
2201{
2202 __asm {
2203 // setup mm0 with 8 copies of 'value'
2204 mov al, value
2205 mov ah, al
2206 mov bx, ax
2207 shl eax, 16
2208 mov ax, bx
2209 movd mm0, eax
2210 movd mm1, eax
2211 punpckldq mm0, mm1
2212
2213
2214 mov ecx, dataSize
2215 shr ecx, 3
2216
2217 mov ebx, imDest
2218
2219 pixel_loop:
2220 movq [ebx], mm0
2221 add ebx, 8
2222
2223 dec ecx
2224 jnz pixel_loop
2225
2226 jmp done
2227
2228 done:
2229 emms
2230 }
2231}
2232
2233/*
2234void copyRGBAtoRGB(const uchar* imSrc, uchar* imred,uchar* imgreen,uchar* imblue, int dataSize)
2235{
2236 __asm {
2237
2238 mov esi, dataSize
2239 shr esi, 3
2240
2241 mov eax, imSrc
2242 mov ebx, imred
2243 mov ecx, imred
2244 mov edx, imred
2245
2246 pixel_loop:
2247 movq mm1, [eax]
2248
2249
2250 movq [ebx], mm1
2251
2252 add eax, 8
2253 add ebx, 8
2254 add ecx, 8
2255 add edx, 8
2256
2257 dec esi
2258 jnz pixel_loop
2259
2260 jmp done
2261
2262 done:
2263 emms
2264 }
2265}*/
2266
2267inline void multiply(uchar* im, float fact, int imageSize)
2268{
2269 __asm {
2270
2271 mov ecx, imageSize
2272 shr ecx, 3
2273
2274 // make 4 copies of the constant 'fact' in xmm0
2275 movss xmm0, fact
2276 movss xmm1, xmm0
2277 unpcklps xmm0, xmm1
2278 movlhps xmm0, xmm0
2279
2280
2281 mov eax, im
2282 pxor mm7,mm7 // mm7 = 0
2283
2284 pixel_loop:
2285 movq mm1, [eax]
2286 movq mm2, mm1
2287
2288 punpcklbw mm2, mm0
2289 punpckhbw mm1, mm0
2290
2291 movq mm3,mm2
2292 punpckhwd mm3, mm0
2293 punpcklwd mm2, mm0
2294
2295 movq mm4,mm1
2296 punpcklwd mm4, mm0
2297 punpckhwd mm1, mm0
2298
2299 // here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
2300 // --------
2301 cvtpi2ps xmm3, mm3 // put mm3 in low part of xmm3
2302 cvtpi2ps xmm2, mm2 // put mm2 in low part of xmm2
2303
2304 movlhps xmm2, xmm3 // xmm2 = [xmm3(low part) xmm2(low part)]
2305
2306 mulps xmm2, xmm0
2307
2308 cvtps2pi mm2, xmm2 // convert low 2 floats from xmm2 to mm2
2309 movhlps xmm3,xmm2 // mov high 2 floats from xmm2 to low 2 floats in xmm3
2310 cvtps2pi mm3, xmm3 // convert low 2 floats from xmm3 to mm3
2311
2312 packssdw mm2, mm3 // mm2 = (word)[mm2 mm3]
2313
2314 // --------
2315 cvtpi2ps xmm4, mm4
2316 cvtpi2ps xmm1, mm1
2317
2318 movlhps xmm4, xmm1
2319
2320 mulps xmm4, xmm0
2321
2322 cvtps2pi mm4, xmm4 // convert low 2 floats from xmm4 to mm4
2323 movhlps xmm1,xmm4 // mov high 2 floats from xmm4 to low 2 floats in xmm1
2324 cvtps2pi mm1, xmm1 // convert low 2 floats from xmm1 to mm1
2325
2326 packssdw mm4, mm1 // mm4 = (word)[mm1 mm4]
2327
2328
2329 // ------
2330 packuswb mm2, mm4 // mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
2331 movq [eax], mm2
2332
2333 add eax, 8
2334 dec ecx
2335 jnz pixel_loop
2336
2337 jmp done
2338
2339 done:
2340 emms
2341
2342 }
2343}
2344
2345inline void multiply(const uchar* imSrc, uchar* imDest, float fact, int imageSize)
2346{
2347 __asm {
2348
2349 mov ecx, imageSize
2350 shr ecx, 3
2351
2352 // make 4 copies of the constant 'fact' in xmm0
2353 movss xmm0, fact
2354 movss xmm1, xmm0
2355 unpcklps xmm0, xmm1
2356 movlhps xmm0, xmm0
2357
2358
2359 mov eax, imSrc
2360 mov ebx, imDest
2361 pxor mm7,mm7 // mm7 = 0
2362
2363 pixel_loop:
2364 movq mm1, [eax]
2365 movq mm2, mm1
2366
2367 punpcklbw mm2, mm0
2368 punpckhbw mm1, mm0
2369
2370 movq mm3,mm2
2371 punpckhwd mm3, mm0
2372 punpcklwd mm2, mm0
2373
2374 movq mm4,mm1
2375 punpcklwd mm4, mm0
2376 punpckhwd mm1, mm0
2377
2378 // here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
2379 // --------
2380 cvtpi2ps xmm3, mm3 // put mm3 in low part of xmm3
2381 cvtpi2ps xmm2, mm2 // put mm2 in low part of xmm2
2382
2383 movlhps xmm2, xmm3 // xmm2 = [xmm3(low part) xmm2(low part)]
2384
2385 mulps xmm2, xmm0
2386
2387 cvtps2pi mm2, xmm2 // convert low 2 floats from xmm2 to mm2
2388 movhlps xmm3,xmm2 // mov high 2 floats from xmm2 to low 2 floats in xmm3
2389 cvtps2pi mm3, xmm3 // convert low 2 floats from xmm3 to mm3
2390
2391 packssdw mm2, mm3 // mm2 = (word)[mm2 mm3]
2392
2393 // --------
2394 cvtpi2ps xmm4, mm4
2395 cvtpi2ps xmm1, mm1
2396
2397 movlhps xmm4, xmm1
2398
2399 mulps xmm4, xmm0
2400
2401 cvtps2pi mm4, xmm4 // convert low 2 floats from xmm4 to mm4
2402 movhlps xmm1,xmm4 // mov high 2 floats from xmm4 to low 2 floats in xmm1
2403 cvtps2pi mm1, xmm1 // convert low 2 floats from xmm1 to mm1
2404
2405 packssdw mm4, mm1 // mm4 = (word)[mm1 mm4]
2406
2407
2408 // ------
2409 packuswb mm2, mm4 // mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
2410 movq [ebx], mm2
2411
2412 add eax, 8
2413 add ebx, 8
2414 dec ecx
2415 jnz pixel_loop
2416
2417 jmp done
2418
2419 done:
2420 emms
2421
2422 }
2423}
Note: See TracBrowser for help on using the repository browser.