Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

processingMMX.inl@ 89

Last change on this file since 89 was 89, checked in by morasjul, 11 years ago

PACPUS 2.0 Beta deployed in new branch

Major changes:
-Add communication interface between components
-Add examples for communications interface (TestComponents)
-Move to Qt5 support

Property svn:executable set to *

File size: 54.1 KB

Line
1	/***************************************************************************
2	*
3	* Copyright 2000 by David Demirdjian. All rights reserved.
4	*
5	* Developed by David Demirdjian
6	*
7	* Permission to use, copy, or modify this software and its documentation
8	* for educational and research purposes only and without fee is hereby
9	* granted, provided that this copyright notice and the original authors's
10	* names appear on all copies and supporting documentation. If individual
11	* files are separated from this distribution directory structure, this
12	* copyright notice must be included. For any other uses of this software,
13	* in original or modified form, including but not limited to distribution
14	* in whole or in part, specific prior permission must be obtained from
15	* MIT. These programs shall not be used, rewritten, or adapted as the
16	* basis of a commercial software or hardware product without first
17	* obtaining appropriate licenses from David Demirdjian. The author makes
18	* no representations about the suitability of this software for any purpose.
19	* It is provided "as is" without express or implied warranty.
20	*
21	**************************************************************************/
22	#include "stereoMatching.h"
23	#include "processingMMX.h"
24
25	// ************************************************************
26	// ************************************************************
27	// *** List of functions (MMX) for image processing
28	// ************************************************************
29	// ************************************************************
30
31	// shrink images by a factor 'fact'. eg if fact = 2, out will be twice as small as src
32	inline void shrinkImages(uchar* dst, const uchar* src, int width, int height, int fact)
33	{
34	int width_f = width/fact;
35	int siz = widthheight/(factfact);
36	if (fact>0) {
37	for (int i=0,j=0; i<siz; ++i,++j,++dst,src+=fact) {
38	dst = src;
39	if (j==width_f-1) {
40	src+=((fact-1)*width);
41	j=0;
42	}
43	}
44	}
45	}
46
47	// translate image of 'tx' pixels to the right
48	// (or left if tx<0)
49	void translateImage(int tx, uchar* data, int siz)
50	{
51	if (tx==0) return;
52	if (tx>0) {
53	// dest. must be after src... to avoid overwriting data
54	data += (siz-tx);
55	uchar* dataDst = data+tx;
56	for (int i=0; i<siz-tx; ++i,--dataDst,--data) {
57	dataDst = data;
58	}
59	} else { // tx<0
60	data -= tx;
61	uchar* dataDst = data+tx;
62	for (int i=0; i<siz-tx; ++i,++dataDst,++data) {
63	dataDst = data;
64	}
65	}
66	}
67
68	void normalizeImages(uchar* data1, uchar* data2, uchar* data3, int siz)
69	{
70	float a1 = pixelMean(data1,siz);
71	float a2 = pixelMean(data2,siz);
72	float a3 = pixelMean(data3,siz);
73
74	float minI = __min(a1, __min(a2,a3));
75
76	if (a2==minI) {
77	multiply(data1, a2/a1, siz);
78	multiply(data3, a2/a3, siz);
79	} else if (a1==minI) {
80	multiply(data2, a1/a2, siz);
81	multiply(data3, a1/a3, siz);
82	} else {
83	multiply(data2, a3/a2, siz);
84	multiply(data1, a3/a1, siz);
85	}
86	}
87
88	void normalizeImages(const uchar* data1, const uchar* data2, const uchar* data3,
89	uchar* out1, uchar* out2, uchar* out3, int siz)
90	{
91	float a1 = pixelMean(data1,siz);
92	float a2 = pixelMean(data2,siz);
93	float a3 = pixelMean(data3,siz);
94
95	float minI = __min(a1, __min(a2,a3));
96
97	if (a2==minI) {
98	multiply(data1, out1, a2/a1, siz);
99	multiply(data3, out3, a2/a3, siz);
100	copyMMX(out2, data2, siz);
101	} else if (a1==minI) {
102	multiply(data2, out2, a1/a2, siz);
103	multiply(data3, out3, a1/a3, siz);
104	copyMMX(out1, data1, siz);
105	} else {
106	multiply(data2, out2, a3/a2, siz);
107	multiply(data1, out1, a3/a1, siz);
108	copyMMX(out3, data3, siz);
109	}
110	}
111
112	void normalizeImages(uchar* data1, uchar* data2, int siz)
113	{
114	float a1 = pixelMean(data1,siz);
115	float a2 = pixelMean(data2,siz);
116
117	// normalize the image which average intensity is the highest
118	if (a1>a2)
119	multiply(data1, a2/a1, siz);
120	else
121	multiply(data2, a1/a2, siz);
122	}
123
124	void normalizeImages(const uchar* data1, const uchar* data2,
125	uchar* out1, uchar* out2, int siz)
126	{
127	float a1 = pixelMean(data1,siz);
128	float a2 = pixelMean(data2,siz);
129
130	// normalize the image which average intensity is the highest
131	if (a1>a2) {
132	multiply(data1, out1, a2/a1, siz);
133	copyMMX(out2, data2, siz);
134	} else {
135	multiply(data2, out2, a1/a2, siz);
136	copyMMX(out1, data1, siz);
137	}
138	}
139
140	// ImgSub2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
141	// TODO? divide the result by 2 (shift)
142	inline int ImgSubandAdd(const unsigned char Src1, const unsigned char Src2,
143	const unsigned char Src3, unsigned char Dest, int l)
144	{
145
146	if (l < 8) return 0; // image size must be at least 8 bytes
147
148	__asm
149	{
150	mov eax, Src1
151	mov ebx, Src2
152	mov edx, Src3
153	mov edi, Dest
154	mov ecx, l
155	shr ecx, 3
156
157	align 16
158	inner_loop:
159	movq mm1,[eax] // mm1=src1
160
161	movq mm2,[ebx] // mm2=src2
162
163	movq mm4,mm1 // mm4=mm1
164
165	psubusb mm4,mm2 // mm4 = src1 - src2
166
167	movq mm3,[edx] // mm3=src3
168	psubusb mm2,mm1 // mm2 = src2 - src1
169
170	movq mm5,mm1 // mm5=src1
171	por mm2,mm4 // mm2=\|src1-src2\|
172
173	psubusb mm5,mm3 // mm4=src1-src3
174
175	psubusb mm3,mm1 // mm3=src3-src1
176
177	por mm3,mm5 // mm3=\|src1-src3\|
178
179	paddusb mm2,mm3 // mm2 = \|src1-src2\|+\|src1-src3\|
180
181	movq [edi], mm2
182	add eax,8
183	add ebx,8
184	add edx,8
185	add edi,8
186	dec ecx
187	jnz inner_loop
188	emms
189	}
190
191	return 1;
192	}
193
194	// ImgSub2: D = saturation0(\|S1 - S2\|)
195	// TODO? divide the result by 2 (shift)
196	inline int ImgSubandAdd(const unsigned char Src1, const unsigned char Src2,
197	const unsigned char *Dest, int l)
198	{
199
200	if (l < 8) return 0; // image size must be at least 8 bytes
201
202	__asm
203	{
204	mov eax, Src1
205	mov ebx, Src2
206	mov edi, Dest
207	mov ecx, l
208	shr ecx, 3
209
210	align 16
211	inner_loop:
212	movq mm1,[eax] // mm1=src1
213	movq mm2,[ebx] // mm2=src2
214
215	movq mm4,mm1 // mm4=mm1
216	psubusb mm4,mm2 // mm4 = src1 - src2
217
218	psubusb mm2,mm1 // mm2 = src2 - src1
219	por mm2,mm4 // mm2=\|src1-src2\|
220
221	movq [edi], mm2
222	add eax,8
223	add ebx,8
224	add edi,8
225	dec ecx
226	jnz inner_loop
227	emms
228	}
229
230	return 1;
231	}
232
233
234
235
236
237
// _ABS_DIFF_TRI(Z): one unrolled step of the 3-image absolute-difference
// kernel. Meant to be expanded inside an __asm block of a function that has
// 'width' and 'imageSize' in scope (both are referenced by name).
//
// Registers read:  mm1 = src1 block, mm2 = current src2 block,
//                  mm3 | mm7 = current src3 window, mm0 = following src3 block.
// Effect:          ebx += width, edi += imageSize,
//                  [edi] = sat(|mm1 - mm2| + |mm1 - (mm3|mm7)|),
//                  mm2 = [ebx] (next src2 row),
//                  mm3 = window shifted right by one byte,
//                  mm7 = mm0 << Z (bytes to OR into mm3 on the next step).
// Clobbers:        mm4, mm5, mm6.
238	#define _ABS_DIFF_TRI(Z) __asm \
239	{ \
240	__asm movq mm4,mm1 /* mm4=src1 */ \
241	__asm add ebx, width \
242	__asm add edi, imageSize \
243	__asm por mm3,mm7 /* mm3 = complete src3 window for this step */ \
244	\
245	__asm movq mm7, mm0 \
246	__asm psubusb mm4,mm2 /* mm4 = sat(src1 - src2) */ \
247	\
248	__asm psubusb mm2,mm1 /* mm2 = sat(src2 - src1) */ \
249	__asm psllq mm7,Z \
250	\
251	__asm movq mm5,mm1 /* mm5=src1 */ \
252	__asm por mm4,mm2 /* mm4=|src1-src2| */ \
253	\
254	__asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
255	__asm psubusb mm5,mm3 /* mm5=sat(src1-src3)*/ \
256	\
257	__asm movq mm6,mm3 /* mm6=src3*/ \
258	__asm psubusb mm6,mm1 /* mm6=sat(src3-src1)*/ \
259	\
260	__asm por mm6,mm5 /* mm6=|src1-src3|*/ \
261	__asm paddusb mm4,mm6 /* mm4 = sat(|src1-src2|+|src1-src3|)*/ \
262	\
263	__asm movq [edi], mm4 /* here mm1=src1*/ \
264	__asm psrlq mm3, 8 /* mm3 = src3 window advanced by one byte, top byte zeroed */\
265	}
266
267
// _ABS_DIFF_TRI_prefetch(Z, X): same instruction sequence as _ABS_DIFF_TRI(Z)
// with one extra 'prefetcht0 [ebx + X]' issued after ebx has been advanced by
// 'width'. In the part of the file visible here it is only referenced from a
// commented-out line.
268	#define _ABS_DIFF_TRI_prefetch(Z, X) __asm \
269	{ \
270	__asm movq mm4,mm1 /* mm4=src1 */ \
271	__asm add ebx, width \
272	__asm add edi, imageSize \
273	__asm por mm3,mm7 /* mm3 = complete src3 window for this step */ \
274	\
275	__asm movq mm7, mm0 \
276	__asm psubusb mm4,mm2 /* mm4 = sat(src1 - src2) */ \
277	\
278	__asm psubusb mm2,mm1 /* mm2 = sat(src2 - src1) */ \
279	__asm prefetcht0 [ebx + X] \
280	__asm psllq mm7,Z \
281	\
282	__asm movq mm5,mm1 /* mm5=src1 */ \
283	__asm por mm4,mm2 /* mm4=|src1-src2| */ \
284	\
285	\
286	__asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
287	__asm psubusb mm5,mm3 /* mm5=sat(src1-src3)*/ \
288	\
289	__asm movq mm6,mm3 /* mm6=src3*/ \
290	__asm psubusb mm6,mm1 /* mm6=sat(src3-src1)*/ \
291	\
292	__asm por mm6,mm5 /* mm6=|src1-src3|*/ \
293	__asm paddusb mm4,mm6 /* mm4 = sat(|src1-src2|+|src1-src3|)*/ \
294	\
295	__asm movq [edi], mm4 /* here mm1=src1*/ \
296	__asm psrlq mm3, 8 /* mm3 = src3 window advanced by one byte, top byte zeroed */\
297	}
298
299	// ImgSubandAdd2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
300	// process 8 disparities at a time
301	//
302	// Src1: right
303	// Src2: top
304	// Src3: left
305	//
306	// TODO? divide the result by 2 (shift)
307	inline int ImgSubandAdd2(const unsigned char Src1, const unsigned char Src2,
308	const unsigned char *Src3,
309	unsigned char* Dest1, int l, int imageSize, int width)
310	{
311	if (l < 8) return 0; // image size must be at least 8 bytes
312	const int back_step1 = 7*width;
313	const int back_step2 = 7*imageSize;
314	__asm
315	{
316	mov eax, Src1
317	mov ebx, Src2
318	mov edx, Src3
319	mov edi, Dest1
320
321	mov ecx, l
322	shr ecx, 3
323
324	movq mm0,[edx] // mm0=src3
325	movq mm0,[edx] // mm0=src3
326	align 16
327	inner_loop:
328	movq mm1,[eax] // mm1=src1
329	movq mm3,mm0 // mm3=src3
330
331	movq mm2,[ebx] // mm2=src2
332	add eax,8
333
334	// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
335	movq mm4,mm1 // mm4=mm1
336
337	add ebx,width
338
339	psubusb mm4,mm2 // mm4 = src1 - src2
340	//prefetcht0 [ebx + 32 + 2*320]
341
342	movq mm0,[edx+8]
343	psubusb mm2,mm1 // mm2 = src2 - src1
344
345	movq mm5,mm1 // mm5=src1
346	por mm4,mm2 // mm2=\|src1-src2\|
347
348	movq mm2,[ebx] // mm2= src2 + 'width' = new src2
349	psubusb mm5,mm3 // mm5=src1-src3
350
351	movq mm6,mm3 // mm6=src3
352	psubusb mm6,mm1 // mm3=src3-src1
353
354	movq mm7, mm0
355	psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
356
357	por mm6,mm5 // mm6=\|src1-src3\|
358	paddusb mm4,mm6 // mm4 = \|src1-src2\|+\|src1-src3\|
359
360	movq [edi], mm4
361	psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
362	// -------------------------------------------------------------
363
364
365	// - 2 ----------------
366	_ABS_DIFF_TRI(48)
367
368	// - 3 ----------------
369	_ABS_DIFF_TRI(40)
370
371	// - 4 ----------------
372	_ABS_DIFF_TRI(32)
373	// _ABS_DIFF_TRI_prefetch(32,24 + 3*320)
374
375	// - 5 ----------------
376	_ABS_DIFF_TRI(24)
377
378	// - 6 ----------------
379	_ABS_DIFF_TRI(16)
380
381	// - 7 ----------------
382	_ABS_DIFF_TRI(8)
383
384
385	// - 8 ----------------
386	movq mm4,mm1 // mm4=mm1
387	por mm3,mm7 // here mm2=new src2 mm3=new src3
388
389	psubusb mm4,mm2 // mm4 = src1 - src2
390	psubusb mm2,mm1 // mm2 = src2 - src1
391
392	movq mm5,mm1 // mm5=src1
393	por mm4,mm2 // mm2=\|src1-src2\|
394
395	psubusb mm5,mm3 // mm5=src1-src3
396	psubusb mm3,mm1 // mm3=src3-src1
397
398	por mm3,mm5 // mm6=\|src1-src3\|
399	paddusb mm4,mm3 // mm4 = \|src1-src2\|+\|src1-src3\|
400
401	add edi, imageSize
402
403	movq [edi], mm4 // here mm1=src1
404	// -------------------------------------------------------------
405	//
406	sub ebx, back_step1
407	add ebx,8
408	add edx,8
409	sub edi, back_step2
410	add edi,8
411	dec ecx
412	jnz inner_loop
413	emms
414	}
415
416	return 1;
417	}
418
419
// _ABS_DIFF_: one unrolled step of the 2-image absolute-difference kernel.
// Meant to be expanded inside an __asm block of a function that has 'width'
// and 'imageSize' in scope (both are referenced by name).
// Note the ordering: edi is advanced by imageSize BEFORE the store, so the
// difference computed from the incoming mm2 is written to the next plane.
420	// macro: in: mm1 = src1 block, mm2 = current src2 block; clobbers mm4
421	#define _ABS_DIFF_ __asm \
422	{ \
423	__asm movq mm4,mm1 /* mm4=src1 */ \
424	__asm psubusb mm4,mm2 /* mm4 = sat(src1 - src2) */ \
425	__asm psubusb mm2,mm1 /* mm2 = sat(src2 - src1) */ \
426	__asm por mm4,mm2 /* mm4=|src1-src2| */ \
427	__asm add ebx, width \
428	__asm add edi, imageSize \
429	__asm movq mm2,[ebx] /* mm2 = next src2 row */ \
430	__asm movq [edi], mm4 /* here mm1=src1 */ \
431	}
432
433	// ImgSubandAdd2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
434	// process 8 disparities at a time
435	// Src1: right
436	// Src2: top
437	// TODO? divide the result by 2 (shift)
438	inline int ImgSubandAdd2_Vert(const unsigned char Src1, const unsigned char Src2,
439	unsigned char* Dest1, int l, int imageSize, int width)
440	{
441
442	if (l < 8) return 0; // image size must be at least 8 bytes
443	const int back_step1 = 7*width;
444	const int back_step2 = 7*imageSize;
445	__asm
446	{
447	mov eax, Src1
448	mov ebx, Src2
449	mov edi, Dest1
450
451	mov ecx, l
452	shr ecx, 3
453
454	align 16
455	inner_loop:
456
457	movq mm1,[eax] // mm1=src1
458	movq mm2,[ebx] // mm2=src2
459	add eax,8
460
461	// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
462	_ABS_DIFF_
463	_ABS_DIFF_
464	_ABS_DIFF_
465	_ABS_DIFF_
466	_ABS_DIFF_
467	_ABS_DIFF_
468	_ABS_DIFF_
469
470	// - 8 ----------------
471	movq mm4,mm1 // mm4=mm1
472
473	psubusb mm4,mm2 // mm4 = src1 - src2
474	psubusb mm2,mm1 // mm2 = src2 - src1
475
476	por mm4,mm2 // mm2=\|src1-src2\|
477	add edi, imageSize
478
479	movq [edi], mm4 // here mm1=src1
480	// -------------------------------------------------------------
481	//
482	sub ebx, back_step1
483	add ebx,8
484	sub edi, back_step2
485	add edi,8
486	dec ecx
487	jnz inner_loop
488	emms
489	}
490
491	return 1;
492	}
493
// _ABS_DIFF_HORIZ(Z): one unrolled step of the horizontal absolute-difference
// kernel. Meant to be expanded inside an __asm block of a function that has
// 'imageSize' in scope (referenced by name).
//
// Effect:   edi += imageSize, [edi] = |mm1 - mm3|,
//           mm3 = (mm3 >> 8) | (mm0 << Z)  (src3 window advanced by one byte).
// Clobbers: mm5, mm6, mm7.
494	// macro: in: mm1 = src1 block, mm3 = current src3 window, mm0 = following src3 block
495	#define _ABS_DIFF_HORIZ(Z) __asm \
496	{ \
497	__asm movq mm7, mm0 \
498	__asm add edi, imageSize \
499	__asm movq mm5,mm1 /* mm5=src1 */ \
500	__asm psllq mm7, Z \
501	__asm psubusb mm5,mm3 /* mm5=sat(src1-src3) */ \
502	__asm movq mm6,mm3 /* mm6=src3 */ \
503	__asm psubusb mm6,mm1 /* mm6=sat(src3-src1) */ \
504	__asm por mm6,mm5 /* mm6=|src1-src3| */ \
505	__asm movq [edi], mm6 /* here mm1=src1 */ \
506	__asm psrlq mm3, 8 /* mm3 = src3 window advanced by one byte, top byte zeroed */ \
507	__asm por mm3,mm7 /* here mm3=new src3 */ \
508	}
509
510	// ImgSubandAdd2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
511	// process 8 disparities at a time
512	//
513	// Src1: right
514	// Src2: top
515	// Src3: left
516	//
517	// TODO? divide the result by 2 (shift)
518	inline int ImgSubandAdd_Horiz(const unsigned char rightIm, const unsigned char leftIm,
519	unsigned char* Dest, int l, int imageSize, int width)
520	{
521
522	if (l < 8) return 0; // image size must be at least 8 bytes
523	const int back_step2 = 7*imageSize;
524	__asm
525	{
526	mov eax, rightIm
527	mov edx, leftIm
528	mov edi, Dest
529
530	mov ecx, l
531	shr ecx, 3
532
533	movq mm0,[edx] // mm0=src3
534	movq mm0,[edx] // mm0=src3
535	align 16
536	inner_loop:
537
538	movq mm1,[eax] // mm1=src1
539	movq mm3,mm0 // mm3=src3
540
541	// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
542	movq mm0,[edx+8]
543	add eax,8
544
545	movq mm5,mm1 // mm5=src1
546	psubusb mm5,mm3 // mm5=src1-src3
547
548	movq mm6,mm3 // mm6=src3
549	psubusb mm6,mm1 // mm3=src3-src1
550
551	movq mm7, mm0
552	psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
553
554	por mm6,mm5 // mm6=\|src1-src3\|
555
556	movq [edi], mm6
557	psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
558	por mm3,mm7 // here mm3=new src3
559
560	// - 2 ----------------
561	_ABS_DIFF_HORIZ(48)
562	_ABS_DIFF_HORIZ(40)
563	_ABS_DIFF_HORIZ(32)
564	_ABS_DIFF_HORIZ(24)
565	_ABS_DIFF_HORIZ(16)
566	_ABS_DIFF_HORIZ(8)
567
568	// - 8 ----------------
569	movq mm5,mm1 // mm5=src1
570	add edi, imageSize
571
572	psubusb mm5,mm3 // mm5=src1-src3
573	psubusb mm3,mm1 // mm3=src3-src1
574
575	por mm3,mm5 // mm6=\|src1-src3\|
576	movq [edi], mm3
577	// -------------------------------------------------------------
578	//
579	add edx,8
580	sub edi, back_step2
581	add edi,8
582	dec ecx
583	jnz inner_loop
584	emms
585	}
586
587	return 1;
588	}
589
590
591	// ----------------------
592	// FULL IMAGE, BEST ONLY : Keith's code
593	inline int findMinimumCorrelation_mmx(
594	const unsigned char *CurrentCorrelation,
595	unsigned char CurrentDisparity,
596	unsigned char *Disparity,
597	unsigned char *BestCorrelation, int bytecount)
598	{
599	if ((bytecount < 8) \|\| ((bytecount % 8) != 0)) {
600	return 0;
601	}
602
603	__asm {
604	// load ecx with the pixelblock count = bytecount / 8
605	mov ecx, bytecount
606	shr ecx, 3
607
608	// setup mm0 with 8 copies of the disparity constant
609	mov al, CurrentDisparity
610	mov ah, al
611	mov bx, ax
612	shl eax, 16
613	mov ax, bx
614	movd mm0, eax
615	movd mm1, eax
616	punpckldq mm0, mm1
617
618	// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
619	mov eax, 0x80808080
620	movd mm1, eax
621	movd mm2, eax
622	punpckldq mm1, mm2
623
624
625	// setup the image pointers
626	mov eax, BestCorrelation
627	mov esi, CurrentCorrelation
628	mov edi, Disparity
629
630	pixel_loop:
631	movq mm2, [esi] // current correlation
632	movq mm3, [eax] // best correlation
633
634	// check for updates
635	movq mm5, mm2 // copy the current correlation
636	pxor mm5, mm1 // convert from unsigned range to signed range
637
638	movq mm6, mm3 // copy the best correlation
639	pxor mm6, mm1 // convert from unsigned range to signed range
640
641	pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
642	// 1 indicates current > best, so keep best
643	// 0 indicates current <= best, so use new value
644
645	// BYPASS
646	// this phase adds 8 additional instructions, but could skip 2 writes and 1 read
647	// abort remainder if not updating best correlation
648	pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
649	pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5
650	// 0 indicates current > best, so keep best
651	// 1 indicates current <= best, so use new value
652
653	packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
654	// 11111111 11111111 => 11111111 some replaced
655	// 11111111 00000000 => 11111111 some replaced
656	// 00000000 11111111 => 11111111 some replaced
657	// 00000000 00000000 => 00000000 no replacements
658
659	// we don't need to backup ebx because its not used in this routine
660	// movd mm7, ebx // make a backup of eax
661	movd ebx, mm6 // get the saturated mask
662	test ebx, ebx // test ebx => yields 0 iff no substitutions will occur
663	// movd ebx, mm7 // restore ebx
664	jz bypass // store mm4 (second correlation) to [ebx]
665
666
667	// Update best Correlation
668	movq mm6, mm5 // mm6 := mask
669	movq mm7, mm5 // mm7 := mask
670
671	pand mm6, mm3 // best correlation values to keep
672	pandn mm7, mm2 // current correlation value to move to best correlation
673
674	por mm6, mm7 // merge values
675	movq [eax], mm6 // store values
676
677	// update disparity
678	movq mm2, [edi] // get disparity map
679	movq mm6, mm5 // mm6 := mask
680
681	pand mm5, mm2 // select disparity map values to keep
682	pandn mm6, mm0 // select current disparity values to move to disparity map
683
684	por mm5, mm6 // merge values
685	movq [edi], mm5 // store values
686
687	bypass:
688	add eax, 8
689	add esi, 8
690	add edi, 8
691
692	dec ecx
693	jnz pixel_loop
694
695
696	emms;
697	}
698
699	return 1;
700	}
701
702	/*int initMinimumCorrelation(
703	const unsigned char *CurrentCorrelation,
704	unsigned char disparityInit,
705	unsigned char *Disparity,
706	unsigned char *BestCorrelation,
707	unsigned char *SecondCorrelation,
708	int bytecount)
709	{
710	for (int i=0; i<bytecount; ++i)
711	{
712	BestCorrelation[i]=255;
713	SecondCorrelation[i]=255;
714	Disparity[i]=0;
715	}
716	return 0;
717	}*/
718
719	inline int initMinimumCorrelation(
720	const unsigned char *CurrentCorrelation,
721	unsigned char disparityInit,
722	unsigned char *Disparity,
723	unsigned char *BestCorrelation,
724	unsigned char *SecondCorrelation,
725	int bytecount)
726	{
727	if ((bytecount < 8) \|\| ((bytecount % 8) != 0)) {
728	return 0;
729	}
730
731	__asm {
732
733	// setup mm0 with 8 copies of the disparity constant
734	mov al, disparityInit
735	mov ah, al
736	mov bx, ax
737	shl eax, 16
738	mov ax, bx
739	movd mm0, eax
740	movd mm1, eax
741	punpckldq mm0, mm1
742
743	// load ecx with the pixelblock count = bytecount / 8
744	mov ecx, bytecount
745	shr ecx, 3
746
747	mov eax, BestCorrelation
748	mov ebx, SecondCorrelation
749	mov esi, CurrentCorrelation
750	mov edx, Disparity
751
752	pixel_loop:
753	movq mm1, [esi]
754	movq [eax], mm1 // Best = Current
755	movq [ebx], mm1 // Second = Current
756	movq [edx], mm0 // Disparity = disparityInit
757
758	add eax, 8
759	add ebx, 8
760	add edx, 8
761	add esi, 8
762	dec ecx
763	jnz pixel_loop
764
765	jmp done
766
767
768	done:
769	emms;
770	}
771	}
772
// Plain C++ counterpart of findMinimumCorrelation_mmx (best + second).
// For every byte i in [0, bytecount):
//   - if CurrentCorrelation[i] < BestCorrelation[i], the old best becomes the
//     second best, the current value becomes the best and Disparity[i] is set
//     to CurrentDisparity;
//   - otherwise, if CurrentCorrelation[i] < SecondCorrelation[i], only the
//     second best is replaced (as the MMX version does).
// A candidate equal to the best does not change the disparity.
// Always returns 1.
inline int findMinimumCorrelation(
    const unsigned char *CurrentCorrelation,
    unsigned char CurrentDisparity,
    unsigned char *Disparity,
    unsigned char *BestCorrelation,
    unsigned char *SecondCorrelation,
    int bytecount)
{
    for (int i=0; i<bytecount; ++i,++CurrentCorrelation,++Disparity,++BestCorrelation, ++SecondCorrelation)
    {
        if (*CurrentCorrelation < *BestCorrelation) {
            *Disparity = CurrentDisparity;
            *SecondCorrelation = *BestCorrelation;   // old best is pushed down
            *BestCorrelation = *CurrentCorrelation;
        } else if (*CurrentCorrelation < *SecondCorrelation) {
            *SecondCorrelation = *CurrentCorrelation;
        }
    }
    return 1;
}
791
792	// ----------------------
793	// FULL IMAGE, BEST+SECOND .. Keith's code
794	inline int findMinimumCorrelation_mmx(
795	const unsigned char *CurrentCorrelation,
796	unsigned char CurrentDisparity,
797	unsigned char *Disparity,
798	unsigned char *BestCorrelation,
799	unsigned char *SecondCorrelation,
800	int bytecount)
801	{
802	if ((bytecount < 8) \|\| ((bytecount % 8) != 0)) {
803	return 0;
804	}
805
806	__asm {
807	// load ecx with the pixelblock count = bytecount / 8
808	mov ecx, bytecount
809	shr ecx, 3
810
811	// setup mm0 with 8 copies of the disparity constant
812	mov al, CurrentDisparity
813	mov ah, al
814	mov bx, ax
815	shl eax, 16
816	mov ax, bx
817	movd mm0, eax
818	movd mm1, eax
819	punpckldq mm0, mm1
820
821	// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
822	mov eax, 0x80808080
823	movd mm1, eax
824	movd mm2, eax
825	punpckldq mm1, mm2
826
827
828	// setup the image pointers
829	mov eax, BestCorrelation
830	mov ebx, SecondCorrelation
831	mov esi, CurrentCorrelation
832	mov edi, Disparity
833
834	pixel_loop:
835	movq mm2, [esi] // current correlation
836	movq mm4, [ebx] // second correlation
837
838	// convert the current correlation from unsigned range to signed range
839	movq mm5, mm2 // copy the current correlation
840	pxor mm5, mm1 // convert from unsigned range to signed range
841	movq mm7, mm5 // copy converted to mm7
842
843
844	// check for second correlation updates
845	movq mm6, mm4 // copy second best correlation
846	pxor mm6, mm1 // convert from unsigned range to signed range
847
848	pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask
849
850	// BYPASS 1
851	// skip remainder if second correlation is not to be updated
852	// this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes
853	pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
854	pxor mm6, mm7 // mm6 = mm7 xor 0xFFFFFFFF = not mm7
855	// 0 indicates current > second, so keep old value
856	// 1 indicates current <= second, so use new value
857
858
859	packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
860	// 11111111 11111111 => 11111111 some replaced
861	// 11111111 00000000 => 11111111 some replaced
862	// 00000000 11111111 => 11111111 some replaced
863	// 00000000 00000000 => 00000000 no replacements
864
865	// don't need to backup edx because its not used in this routine
866	// movd mm3, edx // make a backup of edx
867	movd edx, mm6 // get the saturated mask
868	test edx, edx // test edx => yields 0 iff no replacements will occur
869	// movd edx, mm3 // restore edx
870	jz bypass1
871
872
873	// direct update second correlation (get values from current)
874	// mm7 already has mask
875	// movq mm6, mm7 // mm6 := mask
876	// pand mm6, mm4 // second correlation values to keep
877	// pandn mm7, mm2 // current correlation values to move to second correlation
878	// por mm6, mm7 // merge value => direct updated second correlation
879	// movq mm4, mm6 // store values (*** this instruction could be eliminated!)
880
881	pand mm4, mm7 // second correlation values to keep
882	pandn mm7, mm2 // current correlation values to move to second correlation
883	por mm4, mm7 // merge value => direct updated second correlation
884
885
886	// check for best correlation updates
887	movq mm3, [eax] // best correlation
888	// mm5 has converted current correlation
889	movq mm6, mm3 // copy the best correlation
890	pxor mm6, mm1 // convert from unsigned range to signed range
891
892	pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
893	// 1 indicates current > best, so keep best
894	// 0 indicates current <= best, so use new value
895	// BYPASS 2
896	// this phase adds 8 additional instructions, but could skip 2 writes and 1 read
897	// abort remainder if not updating best correlation
898	pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
899	pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5
900	// 0 indicates current > best, so keep best
901	// 1 indicates current <= best, so use new value
902
903	packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
904	// 11111111 11111111 => 11111111 some replaced
905	// 11111111 00000000 => 11111111 some replaced
906	// 00000000 11111111 => 11111111 some replaced
907	// 00000000 00000000 => 00000000 no replacements
908
909	// don't need to backup edx because its not used in this routine
910	// movd mm7, edx // make a backup of edx
911	movd edx, mm6 // get the saturated mask
912	test edx, edx // test edx => yields 0 iff no substitutions will occur
913	// movd edx, mm7 // restore edx
914	jz bypass2 // store mm4 (second correlation) to [ebx]
915
916
917	// indirect update second correlation (pushed down from best)
918	movq mm6, mm5 // mm6 := mask
919	movq mm7, mm5 // mm7 := mask
920
921	pand mm6, mm4 // second correlation values to keep
922	pandn mm7, mm3 // best correlations to move to second correlation
923
924	por mm6, mm7 // merge values
925	movq [ebx], mm6 // store values
926
927	// direct Update best Correlation
928	movq mm6, mm5 // mm6 := mask
929	movq mm7, mm5 // mm7 := mask
930
931	pand mm6, mm3 // best correlation values to keep
932	pandn mm7, mm2 // current correlation value to move to best correlation
933
934	por mm6, mm7 // merge values
935	movq [eax], mm6 // store values
936
937	// update disparity
938	movq mm2, [edi] // get disparity map
939	movq mm6, mm5 // mm6 := mask
940
941	pand mm5, mm2 // select disparity map values to keep
942	pandn mm6, mm0 // select current disparity values to move to disparity map
943
944	por mm5, mm6 // merge values
945	movq [edi], mm5 // store values
946
947
948	bypass1:
949	next_pixel:
950	add eax, 8
951	add ebx, 8
952	add esi, 8
953	add edi, 8
954
955	dec ecx
956	jnz pixel_loop
957
958	jmp done
959
960	bypass2:
961	movq [ebx], mm4;
962	jmp next_pixel
963
964	done:
965	emms;
966	}
967
968	return 1;
969	}
970
971
972
973	inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
974	{
975	im += maskSize/2;
976	im_out += maskSize/2;
977	for (int i=0; i<rowSize; ++i) {
978	int s=0;
979	for (int j=-maskSize/2; j<=maskSize/2; ++j) {
980	s+=*(im+j);
981	}
982	*im_out=s/maskSize;
983	++im;++im_out;
984	}
985	}
986
987	inline void sum_Row_mmx(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
988	{
989	sum_Row_5_mmx(im, im_out, rowSize);
990	for (int i=0; i<(maskSize-5)/2; ++i)
991	sum_Row_5_mmx(im_out, im_out, rowSize);
992	}
993
994	inline void sum_Row_mmx(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize)
995	{
996	sum_Row_5_mmx(im, im_out, rowSize);
997	for (int i=0; i<(maskSize-5)/2; ++i)
998	sum_Row_5_mmx(im_out, im_out, rowSize);
999	}
1000
// aim_Sum_Words_In_MM1: running sums of the four 16-bit words held in mm1.
// With mm1 = [D C B A] on entry (A = lowest word), the macro leaves
//   mm4 = [D  C+D  B+C+D  A+B+C+D]   (word i = sum of words i..3)
//   mm3 = [A+B+C+D  A+B+C  A+B  A]   (word i = sum of words 0..i)
// mm1 and mm2 are used as shift scratch and do not keep their input values.
// Additions are paddw, i.e. 16-bit wrap-around without saturation.
1001	#define aim_Sum_Words_In_MM1 __asm \
1002	{ \
1003	__asm movq mm4, mm1 \
1004	__asm movq mm2, mm1 \
1005	\
1006	__asm movq mm3, mm1 \
1007	__asm psllq mm1, 16 \
1008	\
1009	__asm psrlq mm2, 16 \
1010	__asm paddw mm4, mm2 \
1011	\
1012	__asm paddw mm3, mm1 \
1013	__asm psrlq mm2, 16 \
1014	\
1015	__asm psllq mm1, 16 \
1016	__asm paddw mm4, mm2 \
1017	\
1018	__asm psrlq mm2, 16 \
1019	__asm paddw mm3, mm1 \
1020	\
1021	__asm psllq mm1, 16 \
1022	__asm paddw mm4, mm2 \
1023	\
1024	__asm paddw mm3, mm1 \
1025	}
1026
1027
1028
1029
1030
1031	// apply the mask [1 1 1 1 1] to the 1-D array im (bytes)
1032	// output : im_out (words)
1033	inline void sum_Row_5_mmx(uchar* im, unsigned short* im_out, int rowSize)
1034	{
1035	// temp: for debugging
1036	//return sum_Row_5(im,im_out,rowSize);
1037	__asm {
1038
1039	mov eax, rowSize
1040	mov ebx, im
1041	mov ecx, im_out
1042
1043	pxor mm6, mm6 // mm6 = x00000000
1044
1045	//Process the first quad word, but save only the second result"
1046	test eax, eax // Is there anything to do?"
1047	jz end_sum_loop // Jump out if necessary
1048
1049
1050	//Process low word
1051	movq mm1, [ebx] // Copy...
1052	punpcklbw mm1, mm6 // Expand low word bytes into words // mm1 =[D C B A]
1053
1054	aim_Sum_Words_In_MM1
1055
1056	//Store the result Only in the accumulator
1057	movq mm7, mm4 // Update accumulator mm4=[D C+D B+C+D A+B+C+D]
1058
1059	//Process high word
1060	movq mm1, [ebx] // Copy...
1061	punpckhbw mm1, mm6 // Expand high word bytes into words // mm1 =[H G F E]
1062	add ebx, 8 // Update input pointer
1063
1064	aim_Sum_Words_In_MM1
1065
1066	//Add to the previous data ...
1067	// mm3=[E+F+G+H E+F+G E+F E]
1068	// mm4=[H G+H F+G+H E+F+G+H]
1069	paddw mm7, mm3 // The current word of the accum // mm7=[D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
1070
1071	// translate everything to 2 words on the left
1072	movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
1073	psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
1074
1075	movq mm0, mm1 // mm0 = [D+E+F+G+H C+D+E+F+G]
1076
1077	psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
1078
1079	movq [ecx], mm7 // Store the final result
1080	add ecx, 8 // Update output pointer
1081
1082	movq mm7, mm4 // Update accumulator mm4=[H G+H F+G+H E+F+G+H]
1083	sub eax, 8 // Update the number of points left
1084
1085	// Start the loop
1086	row_sum_loop:
1087	test eax, eax // Is there anything to do?
1088	jz end_sum_loop // Jump out if necessary
1089
1090	movq mm1, [ebx] // Load data
1091
1092	//Process low word
1093	punpcklbw mm1, mm6 // Expand low word bytes into words
1094
1095	aim_Sum_Words_In_MM1
1096
1097	//Add to the previous data
1098	//prefetcht1 [ecx+16]
1099	paddw mm7, mm3 // The current word of the accum
1100
1101	// translate everything to 2 words on the left
1102	// mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7=[0 0 H G] [ecx]=[F E D C]
1103	punpckldq mm0, mm7 // mm0 = [F E D C]
1104
1105	movq [ecx], mm0
1106	sub eax, 8 // Update the number of points left
1107
1108	movq mm0, mm4 // Update accumulator
1109	psrlq mm7, 32 // mm7 = [0 0 H G]
1110
1111	//Process high word
1112	movq mm1, [ebx] // Copy...
1113	punpckhbw mm1, mm6 // Expand high word bytes into words
1114
1115	aim_Sum_Words_In_MM1
1116
1117	//Add to the previous data
1118	paddw mm0, mm3 // The current word of the accum
1119
1120	// translate everything to 2 words on the left
1121	// mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
1122	punpckldq mm7, mm0 // mm7 = [F E D C]
1123	add ebx, 8 // Update input pointer
1124
1125	movq [ecx+8], mm7
1126	psrlq mm0, 32 // mm0 = [0 0 H G]
1127
1128	movq mm7, mm4 // Update accumulator
1129	add ecx, 16 // Update output pointer
1130
1131	jmp row_sum_loop // Loop
1132
1133	//Cleanup
1134	end_sum_loop:
1135	emms
1136	}
1137	}
1138
1139
1140
// Horizontal 5-tap pass over a row of 16-bit words (MSVC x86 inline MMX).
//   im      : input words
//   im_out  : output words
//   rowSize : element count; it is decremented by 8 per step and the loop only
//             exits when it reaches exactly 0, so it has to be a multiple of 8.
// Register roles set up below: eax = elements left, ebx = input cursor,
// ecx = output cursor.
// aim_Sum_Words_In_MM1 is a macro defined outside this section.
// NOTE(review): from its use here it presumably consumes mm1 and leaves partial
// window sums in mm3 / mm4 -- confirm against the macro definition.
1141	// apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words)
1142	// output : im_out (words)
1143	inline void sum_Row_5_mmx(ushort* im, ushort* im_out, int rowSize)
1144	{
1145	// temp: for debugging
1146	//return sum_Row_5(im,im_out,rowSize);
1147	__asm {
1148
1149	mov eax, rowSize
1150	mov ebx, im
1151	mov ecx, im_out
1152
// Prologue: reads the first 8 input words (two movq loads) but stores only
// 4 output words, then lowers the element counter by 8.
// NOTE(review): this first store (and the mm0 carry-over) is not scaled by
// psrlw 2, unlike the main loop below -- confirm that is intended.
1153	//Process the first quad word, but save only the second result
1154	test eax, eax // Is there anything to do?
1155	jz end_sum_loop // Jump out if necessary
1156
1157	movq mm1, [ebx] // Load data (4 words)
1158	add ebx, 8 // Update input pointer
1159
1160	//Process low word
1161	aim_Sum_Words_In_MM1
1162
1163	//Store the result Only in the accumulator
1164	movq mm7, mm4 // Update accumulator
1165
1166	//Process high word
1167	movq mm1, [ebx] // Load the next 4 words
1168
1169	aim_Sum_Words_In_MM1
1170	add ebx, 8
1171
1172	//Add to the previous data
1173	paddw mm7, mm3 // The current word of the accum
1174
1175	// translate everything to 2 words on the left
1176	movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
1177	psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
1178	movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G]
1179	psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
1180
1181	movq [ecx], mm7 // Store the final result
1182	movq mm7, mm4 // Update accumulator
1183
1184	add ecx, 8 // Update output pointer
1185	sub eax, 8 // Update the number of points left
1186
// Main loop: each iteration reads 8 words ([ebx] and [ebx+8]), writes 8 words
// ([ecx] and [ecx+8]), advances both cursors by 16 bytes and lowers eax by 8.
// mm0 and mm7 alternate as "carry" (upper two results of the previous group)
// and "current accumulator".
1187	// Start the loop
1188	row_sum_loop:
1189	test eax, eax // Is there anything to do?
1190	jz end_sum_loop // Jump out if necessary
1191
1192	movq mm1, [ebx] // Load data
1193
1194	aim_Sum_Words_In_MM1
1195
1196	//Add to the previous data
1197	//prefetcht0 [ecx + 32]
1198	//prefetcht0 [ebx + 48]
1199	paddw mm7, mm3 // The current word of the accum
1200	psrlw mm7, 2 // divide result by 4 (the 1/4 factor of the mask)
1201
1202	// translate everything to 2 words on the left
1203	// mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C]
1204	punpckldq mm0, mm7 // mm0 = [F E D C]
1205
1206	movq [ecx], mm0
1207	sub eax, 8 // Update the number of points left
1208
1209	movq mm0, mm4 // Update accumulator
1210	psrlq mm7, 32 // mm7 =[0 0 H G]
1211
1212	//Process high word
1213	movq mm1, [ebx+8] // Load the next 4 words
1214
1215	aim_Sum_Words_In_MM1
1216
1217	//Add to the previous data
1218	paddw mm0, mm3 // The current word of the accum
1219	psrlw mm0, 2 // divide result by 4 (the 1/4 factor of the mask)
1220
1221	// translate everything to 2 words on the left
1222	// mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
1223	punpckldq mm7, mm0 // mm7 = [F E D C]
1224	add ebx, 16 // Update input pointer
1225
1226	movq [ecx+8], mm7
1227	psrlq mm0, 32 // mm0 = [0 0 H G]
1228
1229	movq mm7, mm4 // Update accumulator
1230	add ecx, 16 // Update output pointer
1231
1232	jmp row_sum_loop // Loop
1233
1234	//Cleanup
1235	end_sum_loop:
1236	emms // leave MMX state so x87 code can run afterwards
1237	}
1238	}
1239
// Plain C++ reference for the horizontal 5-tap mean.
// For k = 0 .. rowSize-6, writes to im_out[k+2] the integer mean (sum / 5) of
// im[k] .. im[k+4], i.e. the window centred on element k+2.
// Elements of im_out outside [2, rowSize-4] are left untouched.
template<class T> void sum_Row_5(T* im, ushort* im_out, int rowSize)
{
    const int outputs = rowSize - 5;

    for (int k = 0; k < outputs; ++k) {
        const T* window = im + k;      // leftmost tap of the window

        int acc = window[0];
        acc += window[1];
        acc += window[2];
        acc += window[3];
        acc += window[4];

        im_out[k + 2] = acc / 5;       // result goes to the window centre
    }
}
1254
1255	inline void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
1256	{
1257	int offset = width*(sizeMask/2);
1258	im += offset;
1259	im_out += offset;
1260	for (int i=0; i<dataSize-width*sizeMask; ++i, ++im, ++im_out) {
1261	int s = 0;
1262	for (int j=-sizeMask/2; j<=sizeMask/2; ++j) s += (im+jwidth);
1263	*im_out = s/(sizeMask);
1264	}
1265	}
1266
1267	// apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
1268	// result in 'im_out'
1269	inline void avg_Col_mmx(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
1270	{
1271	// temp: for debugging
1272	//return avg_Col(im,im_out,dataSize,width,sizeMask);
1273
1274	switch (sizeMask)
1275	{
1276	case 5: avg_Col_5(im,im_out,dataSize,width);
1277	break;
1278	case 7: avg_Col_7(im,im_out,dataSize,width);
1279	break;
1280	case 9: avg_Col_9(im,im_out,dataSize,width);
1281	break;
1282	case 11: avg_Col_11(im,im_out,dataSize,width);
1283	break;
1284	case 13: avg_Col_13(im,im_out,dataSize,width);
1285	break;
1286	case 15: avg_Col_15(im,im_out,dataSize,width);
1287	break;
1288	case 17: avg_Col_17(im,im_out,dataSize,width);
1289	break;
1290
1291	default: if (sizeMask<5) avg_Col_5(im,im_out,dataSize,width);
1292	else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
1293	break;
1294
1295	}
1296	}
1297
1298
1299
// macro_add: one accumulation step used inside the __asm blocks of the
// avg_Col_* / add_Col_* kernels in this file.
// Adds the 8 words at [edx]..[edx+15] into the running sums with unsigned
// saturation (paddusw): words 0-3 into mm3, words 4-7 into mm2.
// Then advances edx by edi; the kernels load edi with 2*width beforehand.
1300	#define macro_add __asm \
1301	{ \
1302	__asm paddusw mm3, [edx] \
1303	__asm paddusw mm2, [edx+8] \
1304	__asm add edx, edi \
1305	}
1306
1307
1308	inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width)
1309	{
1310	__asm {
1311
1312	mov edi, width
1313	shl edi, 1 // edi = 2*width
1314
1315	mov eax, dataSize
1316	mov ecx, im_out
1317
1318	mov ebx, im
1319	sub ebx, edi
1320	sub ebx, edi // ebx = ebx-4*width
1321
1322	test eax, eax // Is there anything to do?"
1323	jz end_sum_loop // Jump out if necessary
1324
1325	row_sum_loop:
1326
1327	test eax, eax // Is there anything to do?
1328	jz end_sum_loop // Jump out if necessary
1329
1330	mov edx, ebx
1331	add ebx, 16
1332
1333	// 1
1334	movq mm3, [edx] // mm3 = 4 words of im
1335	movq mm2, [edx+8] // mm2 = next 4 words of im
1336	add edx, edi
1337
1338	macro_add
1339	macro_add
1340	macro_add
1341	macro_add
1342
1343	// divide results by ...
1344	psrlw mm3, 3
1345	psrlw mm2, 3
1346
1347	// convert [mm2 mm3] as 8 bytes
1348	packuswb mm3,mm2
1349	movq [ecx], mm3
1350
1351	sub eax, 8 // Update the number of points left
1352	add ecx, 8 // Update output pointer
1353
1354	jmp row_sum_loop // Loop
1355
1356	//Cleanup
1357	end_sum_loop:
1358	emms
1359	}
1360	}
1361
1362	inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width)
1363	{
1364	__asm {
1365
1366	mov edi, width
1367	shl edi, 1 // edi = 2*width
1368
1369	mov eax, dataSize
1370	mov ecx, im_out
1371
1372	mov ebx, im
1373	sub ebx, edi
1374	sub ebx, edi
1375	sub ebx, edi // ebx = ebx-6*width
1376
1377	test eax, eax // Is there anything to do?"
1378	jz end_sum_loop // Jump out if necessary
1379
1380	row_sum_loop:
1381
1382	test eax, eax // Is there anything to do?
1383	jz end_sum_loop // Jump out if necessary
1384
1385	mov edx, ebx
1386
1387	// 1
1388	movq mm3, [edx] // mm3 = 4 words of im
1389	add ebx, 16
1390	movq mm2, [edx+8] // mm2 = next 4 words of im
1391	add edx, edi
1392
1393	macro_add
1394	macro_add
1395	macro_add
1396	macro_add
1397	macro_add
1398	macro_add
1399
1400	// divide results by ...
1401	psrlw mm3, 3
1402	psrlw mm2, 3
1403
1404	// convert [mm2 mm3] as 8 bytes
1405	packuswb mm3,mm2
1406	movq [ecx], mm3
1407
1408	sub eax, 8 // Update the number of points left
1409	add ecx, 8 // Update output pointer
1410
1411	jmp row_sum_loop // Loop
1412
1413	//Cleanup
1414	end_sum_loop:
1415	emms
1416	}
1417	}
1418
1419	inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width)
1420	{
1421	__asm {
1422
1423	mov edi, width
1424	shl edi, 1 // edi = 2*width
1425
1426	mov eax, dataSize
1427	mov ecx, im_out
1428
1429	mov ebx, im
1430	sub ebx, edi
1431	sub ebx, edi
1432	sub ebx, edi
1433	sub ebx, edi // ebx = ebx-8*width
1434
1435	test eax, eax // Is there anything to do?"
1436	jz end_sum_loop // Jump out if necessary
1437
1438	row_sum_loop:
1439
1440	test eax, eax // Is there anything to do?
1441	jz end_sum_loop // Jump out if necessary
1442
1443	mov edx, ebx
1444	add ebx, 16
1445
1446	// 1
1447	movq mm3, [edx] // mm3 = 4 words of im
1448	movq mm2, [edx+8] // mm2 = next 4 words of im
1449	add edx, edi
1450
1451	macro_add
1452	macro_add
1453	macro_add
1454	macro_add
1455	macro_add
1456	macro_add
1457	macro_add
1458	macro_add
1459
1460	// divide results by ...
1461	psrlw mm3, 3
1462	psrlw mm2, 3
1463
1464	// convert [mm2 mm3] as 8 bytes
1465	packuswb mm3,mm2
1466	movq [ecx], mm3
1467
1468	sub eax, 8 // Update the number of points left
1469	add ecx, 8 // Update output pointer
1470
1471	jmp row_sum_loop // Loop
1472
1473	//Cleanup
1474	end_sum_loop:
1475	emms
1476	}
1477	}
1478
1479	inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width)
1480	{
1481	__asm {
1482
1483	mov edi, width
1484	shl edi, 1 // edi = 2*width
1485
1486	mov eax, dataSize
1487	mov ecx, im_out
1488
1489	mov ebx, im
1490	sub ebx, edi
1491	sub ebx, edi
1492	sub ebx, edi
1493	sub ebx, edi
1494	sub ebx, edi // ebx = ebx-10*width
1495
1496	test eax, eax // Is there anything to do?"
1497	jz end_sum_loop // Jump out if necessary
1498
1499	row_sum_loop:
1500
1501	test eax, eax // Is there anything to do?
1502	jz end_sum_loop // Jump out if necessary
1503
1504	mov edx, ebx
1505	add ebx, 16
1506
1507	// 1
1508	movq mm3, [edx] // mm3 = 4 words of im
1509	movq mm2, [edx+8] // mm2 = next 4 words of im
1510	add edx, edi
1511
1512	macro_add
1513	macro_add
1514	macro_add
1515	macro_add
1516	macro_add
1517	macro_add
1518	macro_add
1519	macro_add
1520	macro_add
1521	macro_add
1522
1523	// divide results by ...
1524	psrlw mm3, 4
1525	psrlw mm2, 4
1526
1527	// convert [mm2 mm3] as 8 bytes
1528	packuswb mm3,mm2
1529	movq [ecx], mm3
1530
1531	sub eax, 8 // Update the number of points left
1532	add ecx, 8 // Update output pointer
1533
1534	jmp row_sum_loop // Loop
1535
1536	//Cleanup
1537	end_sum_loop:
1538	emms
1539	}
1540	}
1541
1542	inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width)
1543	{
1544	__asm {
1545
1546	mov edi, width
1547	shl edi, 1 // edi = 2*width
1548
1549	mov eax, dataSize
1550	mov ecx, im_out
1551
1552	mov ebx, im
1553	sub ebx, edi
1554	sub ebx, edi
1555	sub ebx, edi
1556	sub ebx, edi
1557	sub ebx, edi
1558	sub ebx, edi // ebx = ebx-12*width
1559
1560	test eax, eax // Is there anything to do?"
1561	jz end_sum_loop // Jump out if necessary
1562
1563	row_sum_loop:
1564
1565	test eax, eax // Is there anything to do?
1566	jz end_sum_loop // Jump out if necessary
1567
1568	mov edx, ebx
1569	add ebx, 16
1570
1571	// 1
1572	movq mm3, [edx] // mm3 = 4 words of im
1573	movq mm2, [edx+8] // mm2 = next 4 words of im
1574	add edx, edi
1575
1576	macro_add
1577	macro_add
1578	macro_add
1579	macro_add
1580	macro_add
1581	macro_add
1582	macro_add
1583	macro_add
1584	macro_add
1585	macro_add
1586	macro_add
1587	macro_add
1588
1589	// divide results by ...
1590	psrlw mm3, 4
1591	psrlw mm2, 4
1592
1593	// convert [mm2 mm3] as 8 bytes
1594	packuswb mm3,mm2
1595	movq [ecx], mm3
1596
1597	sub eax, 8 // Update the number of points left
1598	add ecx, 8 // Update output pointer
1599
1600	jmp row_sum_loop // Loop
1601
1602	//Cleanup
1603	end_sum_loop:
1604	emms
1605	}
1606	}
1607
1608	inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width)
1609	{
1610	__asm {
1611
1612	mov edi, width
1613	shl edi, 1 // edi = 2*width
1614
1615	mov eax, dataSize
1616	mov ecx, im_out
1617
1618	mov ebx, im
1619	sub ebx, edi
1620	sub ebx, edi
1621	sub ebx, edi
1622	sub ebx, edi
1623	sub ebx, edi
1624	sub ebx, edi
1625	sub ebx, edi // ebx = ebx-14*width
1626
1627	test eax, eax // Is there anything to do?"
1628	jz end_sum_loop // Jump out if necessary
1629
1630	row_sum_loop:
1631
1632	test eax, eax // Is there anything to do?
1633	jz end_sum_loop // Jump out if necessary
1634
1635	mov edx, ebx
1636	add ebx, 16
1637
1638	// 1
1639	movq mm3, [edx] // mm3 = 4 words of im
1640	movq mm2, [edx+8] // mm2 = next 4 words of im
1641	add edx, edi
1642
1643	macro_add
1644	macro_add
1645	macro_add
1646	macro_add
1647	macro_add
1648	macro_add
1649	macro_add
1650	macro_add
1651	macro_add
1652	macro_add
1653	macro_add
1654	macro_add
1655	macro_add
1656	macro_add
1657
1658	// divide results by ...
1659	psrlw mm3, 4
1660	psrlw mm2, 4
1661
1662	// convert [mm2 mm3] as 8 bytes
1663	packuswb mm3,mm2
1664	movq [ecx], mm3
1665
1666	sub eax, 8 // Update the number of points left
1667	add ecx, 8 // Update output pointer
1668
1669	jmp row_sum_loop // Loop
1670
1671	//Cleanup
1672	end_sum_loop:
1673	emms
1674	}
1675	}
1676
1677	inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width)
1678	{
1679	__asm {
1680
1681	mov edi, width
1682	shl edi, 1 // edi = 2*width
1683
1684	mov eax, dataSize
1685	mov ecx, im_out
1686
1687	mov ebx, im
1688	sub ebx, edi
1689	sub ebx, edi
1690	sub ebx, edi
1691	sub ebx, edi
1692	sub ebx, edi
1693	sub ebx, edi
1694	sub ebx, edi
1695	sub ebx, edi // ebx = ebx-16*width
1696
1697	test eax, eax // Is there anything to do?"
1698	jz end_sum_loop // Jump out if necessary
1699
1700	row_sum_loop:
1701
1702	test eax, eax // Is there anything to do?
1703	jz end_sum_loop // Jump out if necessary
1704
1705	mov edx, ebx
1706	add ebx, 16
1707
1708	// 1
1709	movq mm3, [edx] // mm3 = 4 words of im
1710	movq mm2, [edx+8] // mm2 = next 4 words of im
1711	add edx, edi
1712
1713	macro_add
1714	macro_add
1715	macro_add
1716	macro_add
1717	macro_add
1718	macro_add
1719	macro_add
1720	macro_add
1721	macro_add
1722	macro_add
1723	macro_add
1724	macro_add
1725	macro_add
1726	macro_add
1727	macro_add
1728	macro_add
1729
1730	// divide results by ...
1731	psrlw mm3, 4
1732	psrlw mm2, 4
1733
1734	// convert [mm2 mm3] as 8 bytes
1735	packuswb mm3,mm2
1736	movq [ecx], mm3
1737
1738	sub eax, 8 // Update the number of points left
1739	add ecx, 8 // Update output pointer
1740
1741	jmp row_sum_loop // Loop
1742
1743	//Cleanup
1744	end_sum_loop:
1745	emms
1746	}
1747	}
1748
1749
1750	inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width)
1751	{
1752	__asm {
1753
1754	mov edi, width
1755	shl edi, 1 // edi = 2*width
1756
1757	mov eax, dataSize
1758	mov ecx, im_out
1759
1760	mov ebx, im
1761	sub ebx, edi
1762	sub ebx, edi // ebx = ebx-4*width
1763
1764	test eax, eax // Is there anything to do?"
1765	jz end_sum_loop // Jump out if necessary
1766
1767	row_sum_loop:
1768
1769	test eax, eax // Is there anything to do?
1770	jz end_sum_loop // Jump out if necessary
1771
1772	mov edx, ebx
1773	add ebx, 16
1774
1775	// 1
1776	movq mm3, [edx] // mm3 = 4 words of im
1777	movq mm2, [edx+8] // mm2 = next 4 words of im
1778	add edx, edi
1779
1780	macro_add
1781	macro_add
1782	macro_add
1783	macro_add
1784
1785	// save [mm2 mm3] as 8 bytes
1786	packuswb mm3,mm2
1787	movq [ecx], mm3
1788
1789	sub eax, 8 // Update the number of points left
1790	add ecx, 8 // Update output pointer
1791
1792	jmp row_sum_loop // Loop
1793
1794	//Cleanup
1795	end_sum_loop:
1796	emms
1797	}
1798	}
1799
1800	inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width)
1801	{
1802	__asm {
1803
1804	mov edi, width
1805	shl edi, 1 // edi = 2*width
1806
1807	mov eax, dataSize
1808	mov ecx, im_out
1809
1810	mov ebx, im
1811	sub ebx, edi
1812	sub ebx, edi // ebx = ebx-4*width
1813
1814	test eax, eax // Is there anything to do?"
1815	jz end_sum_loop // Jump out if necessary
1816
1817	row_sum_loop:
1818
1819	test eax, eax // Is there anything to do?
1820	jz end_sum_loop // Jump out if necessary
1821
1822	mov edx, ebx
1823	add ebx, 16
1824
1825	// 1
1826	movq mm3, [edx] // mm3 = 4 words of im
1827	movq mm2, [edx+8] // mm2 = next 4 words of im
1828	add edx, edi
1829
1830	macro_add
1831	macro_add
1832	macro_add
1833	macro_add
1834
1835	// save [mm2 mm3] as words
1836	movq [ecx], mm3
1837	movq [ecx+8], mm2
1838
1839	sub eax, 8 // Update the number of points left
1840	add ecx, 16 // Update output pointer
1841
1842	jmp row_sum_loop // Loop
1843
1844	//Cleanup
1845	end_sum_loop:
1846	emms
1847	}
1848	}
1849
1850	// compare bestScores and secondScores. if second<best+'thresh' the disp.
1851	// is set to 'valForReplacement' (usually 0)
1852	inline void compareBestAndSecond(uchar* bestScores, uchar* secondScores, char thresh,
1853	uchar undefined_val,
1854	uchar* disp, int dataSize)
1855	{
1856	__asm {
1857
1858	// setup mm0 with 8 copies of 'thresh'
1859	mov al, thresh
1860	mov ah, al
1861	mov bx, ax
1862	shl eax, 16
1863	mov ax, bx
1864	movd mm0, eax
1865	movd mm1, eax
1866	punpckldq mm0, mm1
1867
1868	// setup mm7 with 8 copies of 'valForReplacement'
1869	mov al, undefined_val
1870	mov ah, al
1871	mov bx, ax
1872	shl eax, 16
1873	mov ax, bx
1874	movd mm7, eax
1875	movd mm1, eax
1876	punpckldq mm7, mm1
1877
1878	mov eax, dataSize
1879	mov ebx, bestScores
1880	mov ecx, secondScores
1881	mov edx, disp
1882
1883	test eax, eax // Is there anything to do?"
1884	jz end_loop // Jump out if necessary
1885
1886	comp_loop:
1887
1888	test eax, eax // Is there anything to do?
1889	jz end_loop // Jump out if necessary
1890
1891	movq mm2, [ecx]
1892	psubusb mm2, [ebx] // mm2 = secondScores - bestScores
1893
1894	movq mm3, [edx] // mm3 = disp
1895	pcmpgtb mm2, mm0 // mm2 = 1 if mm2>thresh
1896	// 0 otherwise
1897
1898	pand mm3, mm2
1899	pandn mm2, mm7
1900
1901	por mm3, mm2
1902	movq [edx], mm3
1903
1904	sub eax, 8 // Update the number of points left
1905	add ebx, 8 // Update output pointer
1906	add ecx, 8
1907	add edx, 8
1908
1909	jmp comp_loop // Loop
1910
1911	//Cleanup
1912	end_loop:
1913	emms
1914	}
1915	}
1916
1917	// windowWidth must be multiple of 8
1918	inline void cropImage(const uchar* imSrc, int width, int height,
1919	uchar* imDest, int x0, int y0, int windowWidth, int windowHeight)
1920	{
1921	int w8 = windowWidth/8;
1922
1923	int step = width-windowWidth;
1924	const uchar* srcNewOrigin = imSrc+x0+y0*width;
1925
1926	__asm {
1927
1928	mov ecx, windowHeight
1929
1930	mov edx, w8
1931	mov eax, srcNewOrigin
1932	mov ebx, imDest
1933
1934	pixel_loop:
1935
1936	movq mm1, [eax]
1937	movq [ebx], mm1
1938	add eax, 8
1939	add ebx, 8
1940
1941	dec edx
1942	jnz pixel_loop
1943
1944	mov edx, w8
1945	add eax, step
1946
1947	dec ecx
1948	jnz pixel_loop
1949
1950	jmp done
1951
1952	done:
1953	emms;
1954	}
1955	}
1956
1957	// return the average pixel value
1958	inline float pixelMean(const uchar* im, int imageSize)
1959	{
1960	int sum;
1961
1962	__asm {
1963
1964	mov ecx, imageSize
1965	shr ecx, 3
1966
1967	mov eax, im
1968	pxor mm7,mm7 // mm7 used as accumulator
1969	pxor mm0,mm0 // mm0 = 0
1970
1971	pixel_loop:
1972
1973	movq mm1, [eax]
1974	movq mm2,mm1
1975
1976	punpcklbw mm2, mm0
1977	punpckhbw mm1, mm0
1978
1979	paddw mm2,mm1
1980
1981	movq mm1,mm2
1982	punpcklwd mm2, mm0
1983	punpckhwd mm1, mm0
1984
1985	paddd mm2,mm1
1986	paddd mm7,mm2
1987
1988	add eax, 8
1989	dec ecx
1990	jnz pixel_loop
1991
1992	jmp done
1993
1994	done:
1995	movd ebx, mm7
1996	psrlq mm7, 32
1997	movd edx, mm7
1998	add ebx, edx
1999	mov sum, ebx
2000
2001	emms
2002	}
2003
2004	return sum / (float)imageSize;
2005	}
2006
2007
2008
2009
2010	// -------------------------------------------------------------
2011	// apply mask:
2012	// if mask[]=undefined_val im[]->im[]
2013	// otherwise, im[]->mask[]
2014	// ....... this one may not be exact :-(
2015	inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
2016	{
2017	__asm {
2018	// setup mm0 with 8 copies of 'undefined_val'
2019	mov al, undefined_val
2020	mov ah, al
2021	mov bx, ax
2022	shl eax, 16
2023	mov ax, bx
2024	movd mm0, eax
2025	movd mm1, eax
2026	punpckldq mm0, mm1
2027
2028	mov ecx, imageSize
2029	shr ecx, 3
2030
2031	mov eax, im
2032	mov ebx, mask
2033
2034	pixel_loop:
2035	movq mm1, [eax]
2036	movq mm2, [ebx]
2037
2038	movq mm3, mm2
2039	pcmpeqb mm3, mm0 // mm3[] -> xFF if mm2[]==undefined_val
2040	// -> x00 otherwise
2041	pand mm3, mm1 // mm3[] = mm1[] if mm2[]==undefined_val
2042	// = x00 otherwise
2043	por mm3, mm2
2044	movq [eax], mm3
2045
2046	add eax, 8
2047	add ebx, 8
2048	dec ecx
2049	jnz pixel_loop
2050
2051	jmp done
2052
2053	done:
2054	emms
2055	}
2056	}
2057
2058	inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
2059	{
2060	for (int i=0; i<imageSize; ++i, ++im,++mask)
2061	{
2062	if (mask != undefined_val) im=*mask;
2063	}
2064
2065	}
2066
2067
2068	inline void divide( ushort* im, uchar* div, uchar* result, int imageSize)
2069	{
2070	for (int i=0; i<imageSize; ++i,++im,++div,++result)
2071	{
2072	result = (div)?(uchar)(im / div):0;
2073	}
2074	}
2075
2076	// 5x5 sum filters
2077	inline void sum_5x5_mmx( uchar* im, ushort* im_out, int dataSize, int width, ushort* buff)
2078	{
2079	sum_Row_5_mmx(im, buff, dataSize);
2080	add_Col_5_ww(buff+2width, im_out+2width, dataSize-4*width , width);
2081	}
2082
2083	inline void sum_5x5_mmx( uchar* im, uchar* im_out, int dataSize, int width, ushort* buff)
2084	{
2085	sum_Row_5_mmx(im, buff, dataSize);
2086	add_Col_5_wb(buff+2width, im_out+2width, dataSize-4*width , width);
2087	}
2088
2089
2090	inline void binarize(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
2091	{
2092	for (int i=0; i<dataSize; ++i,++im,++im_out)
2093	{
2094	im_out = (im != undefined_val);
2095	}
2096	}
2097
2098	inline void set_undefined_to_zero(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
2099	{
2100	for (int i=0; i<dataSize; ++i,++im,++im_out)
2101	{
2102	if (im == undefined_val) im_out=0;
2103	}
2104	}
2105
2106	inline void set_zero_to_undefined(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
2107	{
2108	for (int i=0; i<dataSize; ++i,++im,++im_out)
2109	{
2110	if (im == 0) im_out=undefined_val;
2111	}
2112	}
2113
2114
2115
2116	inline void copyMMX(void* imDest, const void* imSrc, int dataSize)
2117	{
2118	__asm {
2119
2120	mov ecx, dataSize
2121	shr ecx, 3
2122
2123	mov eax, imSrc
2124	mov ebx, imDest
2125	sub ebx, 8
2126
2127	pixel_loop:
2128	movq mm1, [eax]
2129	add ebx, 8
2130
2131	movq [ebx], mm1
2132	add eax, 8
2133
2134	dec ecx
2135	jnz pixel_loop
2136
2137	jmp done
2138
2139	done:
2140	emms
2141	}
2142	}
2143
2144	inline void copySSE(void* imDest, const void* imSrc, int dataSize)
2145	{
2146	__asm {
2147
2148	mov ecx, dataSize
2149	shr ecx, 4
2150
2151	mov eax, imSrc
2152	mov ebx, imDest
2153	sub ebx, 16
2154
2155	pixel_loop:
2156	movdqa xmm1, [eax]
2157	add ebx, 16
2158
2159	movdqa [ebx], xmm1
2160	add eax, 16
2161
2162	dec ecx
2163	jnz pixel_loop
2164
2165	jmp done
2166
2167	done:
2168	emms
2169	}
2170	}
2171
2172	inline void setMMX(float* imDest, const float value, int dataSize)
2173	{
2174	__asm {
2175	// make 4 copies of the constant 'value' in xmm0
2176	movss xmm0, value
2177	movss xmm1, xmm0
2178	unpcklps xmm0, xmm1
2179	movlhps xmm0, xmm0
2180
2181	mov ecx, dataSize
2182	shr ecx, 2
2183
2184	mov ebx, imDest
2185
2186	pixel_loop:
2187	movaps [ebx], xmm0
2188	add ebx, 16
2189
2190	dec ecx
2191	jnz pixel_loop
2192
2193	jmp done
2194
2195	done:
2196	emms
2197	}
2198	}
2199
2200	inline void setMMX(char* imDest, const char value, int dataSize)
2201	{
2202	__asm {
2203	// setup mm0 with 8 copies of 'value'
2204	mov al, value
2205	mov ah, al
2206	mov bx, ax
2207	shl eax, 16
2208	mov ax, bx
2209	movd mm0, eax
2210	movd mm1, eax
2211	punpckldq mm0, mm1
2212
2213
2214	mov ecx, dataSize
2215	shr ecx, 3
2216
2217	mov ebx, imDest
2218
2219	pixel_loop:
2220	movq [ebx], mm0
2221	add ebx, 8
2222
2223	dec ecx
2224	jnz pixel_loop
2225
2226	jmp done
2227
2228	done:
2229	emms
2230	}
2231	}
2232
2233	/*
2234	void copyRGBAtoRGB(const uchar* imSrc, uchar* imred,uchar* imgreen,uchar* imblue, int dataSize)
2235	{
2236	__asm {
2237
2238	mov esi, dataSize
2239	shr esi, 3
2240
2241	mov eax, imSrc
2242	mov ebx, imred
2243	mov ecx, imred
2244	mov edx, imred
2245
2246	pixel_loop:
2247	movq mm1, [eax]
2248
2249
2250	movq [ebx], mm1
2251
2252	add eax, 8
2253	add ebx, 8
2254	add ecx, 8
2255	add edx, 8
2256
2257	dec esi
2258	jnz pixel_loop
2259
2260	jmp done
2261
2262	done:
2263	emms
2264	}
2265	}*/
2266
2267	inline void multiply(uchar* im, float fact, int imageSize)
2268	{
2269	__asm {
2270
2271	mov ecx, imageSize
2272	shr ecx, 3
2273
2274	// make 4 copies of the constant 'fact' in xmm0
2275	movss xmm0, fact
2276	movss xmm1, xmm0
2277	unpcklps xmm0, xmm1
2278	movlhps xmm0, xmm0
2279
2280
2281	mov eax, im
2282	pxor mm7,mm7 // mm7 = 0
2283
2284	pixel_loop:
2285	movq mm1, [eax]
2286	movq mm2, mm1
2287
2288	punpcklbw mm2, mm0
2289	punpckhbw mm1, mm0
2290
2291	movq mm3,mm2
2292	punpckhwd mm3, mm0
2293	punpcklwd mm2, mm0
2294
2295	movq mm4,mm1
2296	punpcklwd mm4, mm0
2297	punpckhwd mm1, mm0
2298
2299	// here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
2300	// --------
2301	cvtpi2ps xmm3, mm3 // put mm3 in low part of xmm3
2302	cvtpi2ps xmm2, mm2 // put mm2 in low part of xmm2
2303
2304	movlhps xmm2, xmm3 // xmm2 = [xmm3(low part) xmm2(low part)]
2305
2306	mulps xmm2, xmm0
2307
2308	cvtps2pi mm2, xmm2 // convert low 2 floats from xmm2 to mm2
2309	movhlps xmm3,xmm2 // mov high 2 floats from xmm2 to low 2 floats in xmm3
2310	cvtps2pi mm3, xmm3 // convert low 2 floats from xmm3 to mm3
2311
2312	packssdw mm2, mm3 // mm2 = (word)[mm2 mm3]
2313
2314	// --------
2315	cvtpi2ps xmm4, mm4
2316	cvtpi2ps xmm1, mm1
2317
2318	movlhps xmm4, xmm1
2319
2320	mulps xmm4, xmm0
2321
2322	cvtps2pi mm4, xmm4 // convert low 2 floats from xmm4 to mm4
2323	movhlps xmm1,xmm4 // mov high 2 floats from xmm4 to low 2 floats in xmm1
2324	cvtps2pi mm1, xmm1 // convert low 2 floats from xmm1 to mm1
2325
2326	packssdw mm4, mm1 // mm4 = (word)[mm1 mm4]
2327
2328
2329	// ------
2330	packuswb mm2, mm4 // mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
2331	movq [eax], mm2
2332
2333	add eax, 8
2334	dec ecx
2335	jnz pixel_loop
2336
2337	jmp done
2338
2339	done:
2340	emms
2341
2342	}
2343	}
2344
2345	inline void multiply(const uchar* imSrc, uchar* imDest, float fact, int imageSize)
2346	{
2347	__asm {
2348
2349	mov ecx, imageSize
2350	shr ecx, 3
2351
2352	// make 4 copies of the constant 'fact' in xmm0
2353	movss xmm0, fact
2354	movss xmm1, xmm0
2355	unpcklps xmm0, xmm1
2356	movlhps xmm0, xmm0
2357
2358
2359	mov eax, imSrc
2360	mov ebx, imDest
2361	pxor mm7,mm7 // mm7 = 0
2362
2363	pixel_loop:
2364	movq mm1, [eax]
2365	movq mm2, mm1
2366
2367	punpcklbw mm2, mm0
2368	punpckhbw mm1, mm0
2369
2370	movq mm3,mm2
2371	punpckhwd mm3, mm0
2372	punpcklwd mm2, mm0
2373
2374	movq mm4,mm1
2375	punpcklwd mm4, mm0
2376	punpckhwd mm1, mm0
2377
2378	// here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
2379	// --------
2380	cvtpi2ps xmm3, mm3 // put mm3 in low part of xmm3
2381	cvtpi2ps xmm2, mm2 // put mm2 in low part of xmm2
2382
2383	movlhps xmm2, xmm3 // xmm2 = [xmm3(low part) xmm2(low part)]
2384
2385	mulps xmm2, xmm0
2386
2387	cvtps2pi mm2, xmm2 // convert low 2 floats from xmm2 to mm2
2388	movhlps xmm3,xmm2 // mov high 2 floats from xmm2 to low 2 floats in xmm3
2389	cvtps2pi mm3, xmm3 // convert low 2 floats from xmm3 to mm3
2390
2391	packssdw mm2, mm3 // mm2 = (word)[mm2 mm3]
2392
2393	// --------
2394	cvtpi2ps xmm4, mm4
2395	cvtpi2ps xmm1, mm1
2396
2397	movlhps xmm4, xmm1
2398
2399	mulps xmm4, xmm0
2400
2401	cvtps2pi mm4, xmm4 // convert low 2 floats from xmm4 to mm4
2402	movhlps xmm1,xmm4 // mov high 2 floats from xmm4 to low 2 floats in xmm1
2403	cvtps2pi mm1, xmm1 // convert low 2 floats from xmm1 to mm1
2404
2405	packssdw mm4, mm1 // mm4 = (word)[mm1 mm4]
2406
2407
2408	// ------
2409	packuswb mm2, mm4 // mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
2410	movq [ebx], mm2
2411
2412	add eax, 8
2413	add ebx, 8
2414	dec ecx
2415	jnz pixel_loop
2416
2417	jmp done
2418
2419	done:
2420	emms
2421
2422	}
2423	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format