Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

processingMMX.inl@ 89

Last change on this file since 89 was 89, checked in by morasjul, 11 years ago

PACPUS 2.0 Beta deployed in new branch

Major changes:
-Add communication interface between components
-Add examples for communications interface (TestComponents)
-Move to Qt5 support

Property svn:executable set to *

File size: 54.1 KB

Rev	Line
[89]	1	/***************************************************************************
	2	*
	3	* Copyright 2000 by David Demirdjian. All rights reserved.
	4	*
	5	* Developed by David Demirdjian
	6	*
	7	* Permission to use, copy, or modify this software and its documentation
	8	* for educational and research purposes only and without fee is hereby
	9	* granted, provided that this copyright notice and the original authors's
	10	* names appear on all copies and supporting documentation. If individual
	11	* files are separated from this distribution directory structure, this
	12	* copyright notice must be included. For any other uses of this software,
	13	* in original or modified form, including but not limited to distribution
	14	* in whole or in part, specific prior permission must be obtained from
	15	* MIT. These programs shall not be used, rewritten, or adapted as the
	16	* basis of a commercial software or hardware product without first
	17	* obtaining appropriate licenses from David Demirdjian. The author makes
	18	* no representations about the suitability of this software for any purpose.
	19	* It is provided "as is" without express or implied warranty.
	20	*
	21	**************************************************************************/
	22	#include "stereoMatching.h"
	23	#include "processingMMX.h"
	24
	25	// ************************************************************
	26	// ************************************************************
	27	// *** List of functions (MMX) for image processing
	28	// ************************************************************
	29	// ************************************************************
	30
	31	// shrink images by a factor 'fact'. eg if fact = 2, out will be twice as small as src
	32	inline void shrinkImages(uchar* dst, const uchar* src, int width, int height, int fact)
	33	{
	34	int width_f = width/fact;
	35	int siz = widthheight/(factfact);
	36	if (fact>0) {
	37	for (int i=0,j=0; i<siz; ++i,++j,++dst,src+=fact) {
	38	dst = src;
	39	if (j==width_f-1) {
	40	src+=((fact-1)*width);
	41	j=0;
	42	}
	43	}
	44	}
	45	}
	46
	47	// translate image of 'tx' pixels to the right
	48	// (or left if tx<0)
	49	void translateImage(int tx, uchar* data, int siz)
	50	{
	51	if (tx==0) return;
	52	if (tx>0) {
	53	// dest. must be after src... to avoid overwriting data
	54	data += (siz-tx);
	55	uchar* dataDst = data+tx;
	56	for (int i=0; i<siz-tx; ++i,--dataDst,--data) {
	57	dataDst = data;
	58	}
	59	} else { // tx<0
	60	data -= tx;
	61	uchar* dataDst = data+tx;
	62	for (int i=0; i<siz-tx; ++i,++dataDst,++data) {
	63	dataDst = data;
	64	}
	65	}
	66	}
	67
	68	void normalizeImages(uchar* data1, uchar* data2, uchar* data3, int siz)
	69	{
	70	float a1 = pixelMean(data1,siz);
	71	float a2 = pixelMean(data2,siz);
	72	float a3 = pixelMean(data3,siz);
	73
	74	float minI = __min(a1, __min(a2,a3));
	75
	76	if (a2==minI) {
	77	multiply(data1, a2/a1, siz);
	78	multiply(data3, a2/a3, siz);
	79	} else if (a1==minI) {
	80	multiply(data2, a1/a2, siz);
	81	multiply(data3, a1/a3, siz);
	82	} else {
	83	multiply(data2, a3/a2, siz);
	84	multiply(data1, a3/a1, siz);
	85	}
	86	}
	87
	88	void normalizeImages(const uchar* data1, const uchar* data2, const uchar* data3,
	89	uchar* out1, uchar* out2, uchar* out3, int siz)
	90	{
	91	float a1 = pixelMean(data1,siz);
	92	float a2 = pixelMean(data2,siz);
	93	float a3 = pixelMean(data3,siz);
	94
	95	float minI = __min(a1, __min(a2,a3));
	96
	97	if (a2==minI) {
	98	multiply(data1, out1, a2/a1, siz);
	99	multiply(data3, out3, a2/a3, siz);
	100	copyMMX(out2, data2, siz);
	101	} else if (a1==minI) {
	102	multiply(data2, out2, a1/a2, siz);
	103	multiply(data3, out3, a1/a3, siz);
	104	copyMMX(out1, data1, siz);
	105	} else {
	106	multiply(data2, out2, a3/a2, siz);
	107	multiply(data1, out1, a3/a1, siz);
	108	copyMMX(out3, data3, siz);
	109	}
	110	}
	111
	112	void normalizeImages(uchar* data1, uchar* data2, int siz)
	113	{
	114	float a1 = pixelMean(data1,siz);
	115	float a2 = pixelMean(data2,siz);
	116
	117	// normalize the image which average intensity is the highest
	118	if (a1>a2)
	119	multiply(data1, a2/a1, siz);
	120	else
	121	multiply(data2, a1/a2, siz);
	122	}
	123
	124	void normalizeImages(const uchar* data1, const uchar* data2,
	125	uchar* out1, uchar* out2, int siz)
	126	{
	127	float a1 = pixelMean(data1,siz);
	128	float a2 = pixelMean(data2,siz);
	129
	130	// normalize the image which average intensity is the highest
	131	if (a1>a2) {
	132	multiply(data1, out1, a2/a1, siz);
	133	copyMMX(out2, data2, siz);
	134	} else {
	135	multiply(data2, out2, a1/a2, siz);
	136	copyMMX(out1, data1, siz);
	137	}
	138	}
	139
	140	// ImgSub2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
	141	// TODO? divide the result by 2 (shift)
	142	inline int ImgSubandAdd(const unsigned char Src1, const unsigned char Src2,
	143	const unsigned char Src3, unsigned char Dest, int l)
	144	{
	145
	146	if (l < 8) return 0; // image size must be at least 8 bytes
	147
	148	__asm
	149	{
	150	mov eax, Src1
	151	mov ebx, Src2
	152	mov edx, Src3
	153	mov edi, Dest
	154	mov ecx, l
	155	shr ecx, 3
	156
	157	align 16
	158	inner_loop:
	159	movq mm1,[eax] // mm1=src1
	160
	161	movq mm2,[ebx] // mm2=src2
	162
	163	movq mm4,mm1 // mm4=mm1
	164
	165	psubusb mm4,mm2 // mm4 = src1 - src2
	166
	167	movq mm3,[edx] // mm3=src3
	168	psubusb mm2,mm1 // mm2 = src2 - src1
	169
	170	movq mm5,mm1 // mm5=src1
	171	por mm2,mm4 // mm2=\|src1-src2\|
	172
	173	psubusb mm5,mm3 // mm4=src1-src3
	174
	175	psubusb mm3,mm1 // mm3=src3-src1
	176
	177	por mm3,mm5 // mm3=\|src1-src3\|
	178
	179	paddusb mm2,mm3 // mm2 = \|src1-src2\|+\|src1-src3\|
	180
	181	movq [edi], mm2
	182	add eax,8
	183	add ebx,8
	184	add edx,8
	185	add edi,8
	186	dec ecx
	187	jnz inner_loop
	188	emms
	189	}
	190
	191	return 1;
	192	}
	193
	194	// ImgSub2: D = saturation0(\|S1 - S2\|)
	195	// TODO? divide the result by 2 (shift)
	196	inline int ImgSubandAdd(const unsigned char Src1, const unsigned char Src2,
	197	const unsigned char *Dest, int l)
	198	{
	199
	200	if (l < 8) return 0; // image size must be at least 8 bytes
	201
	202	__asm
	203	{
	204	mov eax, Src1
	205	mov ebx, Src2
	206	mov edi, Dest
	207	mov ecx, l
	208	shr ecx, 3
	209
	210	align 16
	211	inner_loop:
	212	movq mm1,[eax] // mm1=src1
	213	movq mm2,[ebx] // mm2=src2
	214
	215	movq mm4,mm1 // mm4=mm1
	216	psubusb mm4,mm2 // mm4 = src1 - src2
	217
	218	psubusb mm2,mm1 // mm2 = src2 - src1
	219	por mm2,mm4 // mm2=\|src1-src2\|
	220
	221	movq [edi], mm2
	222	add eax,8
	223	add ebx,8
	224	add edi,8
	225	dec ecx
	226	jnz inner_loop
	227	emms
	228	}
	229
	230	return 1;
	231	}
	232
	233
	234
	235
	236
	237
	238	#define _ABS_DIFF_TRI(Z) __asm \
	239	{ \
	240	__asm movq mm4,mm1 /* mm4=mm1 */ \
	241	__asm add ebx, width \
	242	__asm add edi, imageSize \
	243	__asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \
	244	\
	245	__asm movq mm7, mm0 \
	246	__asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
	247	\
	248	__asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
	249	__asm psllq mm7,Z \
	250	\
	251	__asm movq mm5,mm1 /* mm5=src1 */ \
	252	__asm por mm4,mm2 /* mm2=\|src1-src2\| */ \
	253	\
	254	__asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
	255	__asm psubusb mm5,mm3 /* mm5=src1-src3*/ \
	256	\
	257	__asm movq mm6,mm3 /* mm6=src3*/ \
	258	__asm psubusb mm6,mm1 /* mm3=src3-src1*/ \
	259	\
	260	__asm por mm6,mm5 /* mm6=\|src1-src3\|*/ \
	261	__asm paddusb mm4,mm6 /* mm4 = \|src1-src2\|+\|src1-src3\|*/ \
	262	\
	263	__asm movq [edi], mm4 /* here mm1=src1*/ \
	264	__asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\
	265	}
	266
	267
	268	#define _ABS_DIFF_TRI_prefetch(Z, X) __asm \
	269	{ \
	270	__asm movq mm4,mm1 /* mm4=mm1 */ \
	271	__asm add ebx, width \
	272	__asm add edi, imageSize \
	273	__asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \
	274	\
	275	__asm movq mm7, mm0 \
	276	__asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
	277	\
	278	__asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
	279	__asm prefetcht0 [ebx + X] \
	280	__asm psllq mm7,Z \
	281	\
	282	__asm movq mm5,mm1 /* mm5=src1 */ \
	283	__asm por mm4,mm2 /* mm2=\|src1-src2\| */ \
	284	\
	285	\
	286	__asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
	287	__asm psubusb mm5,mm3 /* mm5=src1-src3*/ \
	288	\
	289	__asm movq mm6,mm3 /* mm6=src3*/ \
	290	__asm psubusb mm6,mm1 /* mm3=src3-src1*/ \
	291	\
	292	__asm por mm6,mm5 /* mm6=\|src1-src3\|*/ \
	293	__asm paddusb mm4,mm6 /* mm4 = \|src1-src2\|+\|src1-src3\|*/ \
	294	\
	295	__asm movq [edi], mm4 /* here mm1=src1*/ \
	296	__asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\
	297	}
	298
	299	// ImgSubandAdd2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
	300	// process 8 disparities at a time
	301	//
	302	// Src1: right
	303	// Src2: top
	304	// Src3: left
	305	//
	306	// TODO? divide the result by 2 (shift)
	307	inline int ImgSubandAdd2(const unsigned char Src1, const unsigned char Src2,
	308	const unsigned char *Src3,
	309	unsigned char* Dest1, int l, int imageSize, int width)
	310	{
	311	if (l < 8) return 0; // image size must be at least 8 bytes
	312	const int back_step1 = 7*width;
	313	const int back_step2 = 7*imageSize;
	314	__asm
	315	{
	316	mov eax, Src1
	317	mov ebx, Src2
	318	mov edx, Src3
	319	mov edi, Dest1
	320
	321	mov ecx, l
	322	shr ecx, 3
	323
	324	movq mm0,[edx] // mm0=src3
	325	movq mm0,[edx] // mm0=src3
	326	align 16
	327	inner_loop:
	328	movq mm1,[eax] // mm1=src1
	329	movq mm3,mm0 // mm3=src3
	330
	331	movq mm2,[ebx] // mm2=src2
	332	add eax,8
	333
	334	// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
	335	movq mm4,mm1 // mm4=mm1
	336
	337	add ebx,width
	338
	339	psubusb mm4,mm2 // mm4 = src1 - src2
	340	//prefetcht0 [ebx + 32 + 2*320]
	341
	342	movq mm0,[edx+8]
	343	psubusb mm2,mm1 // mm2 = src2 - src1
	344
	345	movq mm5,mm1 // mm5=src1
	346	por mm4,mm2 // mm2=\|src1-src2\|
	347
	348	movq mm2,[ebx] // mm2= src2 + 'width' = new src2
	349	psubusb mm5,mm3 // mm5=src1-src3
	350
	351	movq mm6,mm3 // mm6=src3
	352	psubusb mm6,mm1 // mm3=src3-src1
	353
	354	movq mm7, mm0
	355	psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
	356
	357	por mm6,mm5 // mm6=\|src1-src3\|
	358	paddusb mm4,mm6 // mm4 = \|src1-src2\|+\|src1-src3\|
	359
	360	movq [edi], mm4
	361	psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
	362	// -------------------------------------------------------------
	363
	364
	365	// - 2 ----------------
	366	_ABS_DIFF_TRI(48)
	367
	368	// - 3 ----------------
	369	_ABS_DIFF_TRI(40)
	370
	371	// - 4 ----------------
	372	_ABS_DIFF_TRI(32)
	373	// _ABS_DIFF_TRI_prefetch(32,24 + 3*320)
	374
	375	// - 5 ----------------
	376	_ABS_DIFF_TRI(24)
	377
	378	// - 6 ----------------
	379	_ABS_DIFF_TRI(16)
	380
	381	// - 7 ----------------
	382	_ABS_DIFF_TRI(8)
	383
	384
	385	// - 8 ----------------
	386	movq mm4,mm1 // mm4=mm1
	387	por mm3,mm7 // here mm2=new src2 mm3=new src3
	388
	389	psubusb mm4,mm2 // mm4 = src1 - src2
	390	psubusb mm2,mm1 // mm2 = src2 - src1
	391
	392	movq mm5,mm1 // mm5=src1
	393	por mm4,mm2 // mm2=\|src1-src2\|
	394
	395	psubusb mm5,mm3 // mm5=src1-src3
	396	psubusb mm3,mm1 // mm3=src3-src1
	397
	398	por mm3,mm5 // mm6=\|src1-src3\|
	399	paddusb mm4,mm3 // mm4 = \|src1-src2\|+\|src1-src3\|
	400
	401	add edi, imageSize
	402
	403	movq [edi], mm4 // here mm1=src1
	404	// -------------------------------------------------------------
	405	//
	406	sub ebx, back_step1
	407	add ebx,8
	408	add edx,8
	409	sub edi, back_step2
	410	add edi,8
	411	dec ecx
	412	jnz inner_loop
	413	emms
	414	}
	415
	416	return 1;
	417	}
	418
	419
	420	// macro: in: mm1,mm2
	421	#define _ABS_DIFF_ __asm \
	422	{ \
	423	__asm movq mm4,mm1 /* mm4=mm1 */ \
	424	__asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
	425	__asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
	426	__asm por mm4,mm2 /* mm2=\|src1-src2\| */ \
	427	__asm add ebx, width \
	428	__asm add edi, imageSize \
	429	__asm movq mm2,[ebx] \
	430	__asm movq [edi], mm4 /* here mm1=src1 */ \
	431	}
	432
	433	// ImgSubandAdd2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
	434	// process 8 disparities at a time
	435	// Src1: right
	436	// Src2: top
	437	// TODO? divide the result by 2 (shift)
	438	inline int ImgSubandAdd2_Vert(const unsigned char Src1, const unsigned char Src2,
	439	unsigned char* Dest1, int l, int imageSize, int width)
	440	{
	441
	442	if (l < 8) return 0; // image size must be at least 8 bytes
	443	const int back_step1 = 7*width;
	444	const int back_step2 = 7*imageSize;
	445	__asm
	446	{
	447	mov eax, Src1
	448	mov ebx, Src2
	449	mov edi, Dest1
	450
	451	mov ecx, l
	452	shr ecx, 3
	453
	454	align 16
	455	inner_loop:
	456
	457	movq mm1,[eax] // mm1=src1
	458	movq mm2,[ebx] // mm2=src2
	459	add eax,8
	460
	461	// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
	462	_ABS_DIFF_
	463	_ABS_DIFF_
	464	_ABS_DIFF_
	465	_ABS_DIFF_
	466	_ABS_DIFF_
	467	_ABS_DIFF_
	468	_ABS_DIFF_
	469
	470	// - 8 ----------------
	471	movq mm4,mm1 // mm4=mm1
	472
	473	psubusb mm4,mm2 // mm4 = src1 - src2
	474	psubusb mm2,mm1 // mm2 = src2 - src1
	475
	476	por mm4,mm2 // mm2=\|src1-src2\|
	477	add edi, imageSize
	478
	479	movq [edi], mm4 // here mm1=src1
	480	// -------------------------------------------------------------
	481	//
	482	sub ebx, back_step1
	483	add ebx,8
	484	sub edi, back_step2
	485	add edi,8
	486	dec ecx
	487	jnz inner_loop
	488	emms
	489	}
	490
	491	return 1;
	492	}
	493
	494	// macro: in: mm1,mm2
	495	#define _ABS_DIFF_HORIZ(Z) __asm \
	496	{ \
	497	__asm movq mm7, mm0 \
	498	__asm add edi, imageSize \
	499	__asm movq mm5,mm1 /* mm5=src1 */ \
	500	__asm psllq mm7, Z \
	501	__asm psubusb mm5,mm3 /* mm5=src1-src3 */ \
	502	__asm movq mm6,mm3 /* mm6=src3 */ \
	503	__asm psubusb mm6,mm1 /* mm3=src3-src1 */ \
	504	__asm por mm6,mm5 /* mm6=\|src1-src3\| */ \
	505	__asm movq [edi], mm6 /* here mm1=src1 */ \
	506	__asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end */ \
	507	__asm por mm3,mm7 /* here mm3=new src3 */ \
	508	}
	509
	510	// ImgSubandAdd2: D = saturation0(\|S1 - S2\| + \|S1 - S3\|)
	511	// process 8 disparities at a time
	512	//
	513	// Src1: right
	514	// Src2: top
	515	// Src3: left
	516	//
	517	// TODO? divide the result by 2 (shift)
	518	inline int ImgSubandAdd_Horiz(const unsigned char rightIm, const unsigned char leftIm,
	519	unsigned char* Dest, int l, int imageSize, int width)
	520	{
	521
	522	if (l < 8) return 0; // image size must be at least 8 bytes
	523	const int back_step2 = 7*imageSize;
	524	__asm
	525	{
	526	mov eax, rightIm
	527	mov edx, leftIm
	528	mov edi, Dest
	529
	530	mov ecx, l
	531	shr ecx, 3
	532
	533	movq mm0,[edx] // mm0=src3
	534	movq mm0,[edx] // mm0=src3
	535	align 16
	536	inner_loop:
	537
	538	movq mm1,[eax] // mm1=src1
	539	movq mm3,mm0 // mm3=src3
	540
	541	// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
	542	movq mm0,[edx+8]
	543	add eax,8
	544
	545	movq mm5,mm1 // mm5=src1
	546	psubusb mm5,mm3 // mm5=src1-src3
	547
	548	movq mm6,mm3 // mm6=src3
	549	psubusb mm6,mm1 // mm3=src3-src1
	550
	551	movq mm7, mm0
	552	psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
	553
	554	por mm6,mm5 // mm6=\|src1-src3\|
	555
	556	movq [edi], mm6
	557	psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
	558	por mm3,mm7 // here mm3=new src3
	559
	560	// - 2 ----------------
	561	_ABS_DIFF_HORIZ(48)
	562	_ABS_DIFF_HORIZ(40)
	563	_ABS_DIFF_HORIZ(32)
	564	_ABS_DIFF_HORIZ(24)
	565	_ABS_DIFF_HORIZ(16)
	566	_ABS_DIFF_HORIZ(8)
	567
	568	// - 8 ----------------
	569	movq mm5,mm1 // mm5=src1
	570	add edi, imageSize
	571
	572	psubusb mm5,mm3 // mm5=src1-src3
	573	psubusb mm3,mm1 // mm3=src3-src1
	574
	575	por mm3,mm5 // mm6=\|src1-src3\|
	576	movq [edi], mm3
	577	// -------------------------------------------------------------
	578	//
	579	add edx,8
	580	sub edi, back_step2
	581	add edi,8
	582	dec ecx
	583	jnz inner_loop
	584	emms
	585	}
	586
	587	return 1;
	588	}
	589
	590
	591	// ----------------------
	592	// FULL IMAGE, BEST ONLY : Keith's code
	593	inline int findMinimumCorrelation_mmx(
	594	const unsigned char *CurrentCorrelation,
	595	unsigned char CurrentDisparity,
	596	unsigned char *Disparity,
	597	unsigned char *BestCorrelation, int bytecount)
	598	{
	599	if ((bytecount < 8) \|\| ((bytecount % 8) != 0)) {
	600	return 0;
	601	}
	602
	603	__asm {
	604	// load ecx with the pixelblock count = bytecount / 8
	605	mov ecx, bytecount
	606	shr ecx, 3
	607
	608	// setup mm0 with 8 copies of the disparity constant
	609	mov al, CurrentDisparity
	610	mov ah, al
	611	mov bx, ax
	612	shl eax, 16
	613	mov ax, bx
	614	movd mm0, eax
	615	movd mm1, eax
	616	punpckldq mm0, mm1
	617
	618	// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
	619	mov eax, 0x80808080
	620	movd mm1, eax
	621	movd mm2, eax
	622	punpckldq mm1, mm2
	623
	624
	625	// setup the image pointers
	626	mov eax, BestCorrelation
	627	mov esi, CurrentCorrelation
	628	mov edi, Disparity
	629
	630	pixel_loop:
	631	movq mm2, [esi] // current correlation
	632	movq mm3, [eax] // best correlation
	633
	634	// check for updates
	635	movq mm5, mm2 // copy the current correlation
	636	pxor mm5, mm1 // convert from unsigned range to signed range
	637
	638	movq mm6, mm3 // copy the best correlation
	639	pxor mm6, mm1 // convert from unsigned range to signed range
	640
	641	pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
	642	// 1 indicates current > best, so keep best
	643	// 0 indicates current <= best, so use new value
	644
	645	// BYPASS
	646	// this phase adds 8 additional instructions, but could skip 2 writes and 1 read
	647	// abort remainder if not updating best correlation
	648	pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
	649	pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5
	650	// 0 indicates current > best, so keep best
	651	// 1 indicates current <= best, so use new value
	652
	653	packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
	654	// 11111111 11111111 => 11111111 some replaced
	655	// 11111111 00000000 => 11111111 some replaced
	656	// 00000000 11111111 => 11111111 some replaced
	657	// 00000000 00000000 => 00000000 no replacements
	658
	659	// we don't need to backup ebx because its not used in this routine
	660	// movd mm7, ebx // make a backup of eax
	661	movd ebx, mm6 // get the saturated mask
	662	test ebx, ebx // test ebx => yields 0 iff no substitutions will occur
	663	// movd ebx, mm7 // restore ebx
	664	jz bypass // store mm4 (second correlation) to [ebx]
	665
	666
	667	// Update best Correlation
	668	movq mm6, mm5 // mm6 := mask
	669	movq mm7, mm5 // mm7 := mask
	670
	671	pand mm6, mm3 // best correlation values to keep
	672	pandn mm7, mm2 // current correlation value to move to best correlation
	673
	674	por mm6, mm7 // merge values
	675	movq [eax], mm6 // store values
	676
	677	// update disparity
	678	movq mm2, [edi] // get disparity map
	679	movq mm6, mm5 // mm6 := mask
	680
	681	pand mm5, mm2 // select disparity map values to keep
	682	pandn mm6, mm0 // select current disparity values to move to disparity map
	683
	684	por mm5, mm6 // merge values
	685	movq [edi], mm5 // store values
	686
	687	bypass:
	688	add eax, 8
	689	add esi, 8
	690	add edi, 8
	691
	692	dec ecx
	693	jnz pixel_loop
	694
	695
	696	emms;
	697	}
	698
	699	return 1;
	700	}
	701
	702	/*int initMinimumCorrelation(
	703	const unsigned char *CurrentCorrelation,
	704	unsigned char disparityInit,
	705	unsigned char *Disparity,
	706	unsigned char *BestCorrelation,
	707	unsigned char *SecondCorrelation,
	708	int bytecount)
	709	{
	710	for (int i=0; i<bytecount; ++i)
	711	{
	712	BestCorrelation[i]=255;
	713	SecondCorrelation[i]=255;
	714	Disparity[i]=0;
	715	}
	716	return 0;
	717	}*/
	718
	719	inline int initMinimumCorrelation(
	720	const unsigned char *CurrentCorrelation,
	721	unsigned char disparityInit,
	722	unsigned char *Disparity,
	723	unsigned char *BestCorrelation,
	724	unsigned char *SecondCorrelation,
	725	int bytecount)
	726	{
	727	if ((bytecount < 8) \|\| ((bytecount % 8) != 0)) {
	728	return 0;
	729	}
	730
	731	__asm {
	732
	733	// setup mm0 with 8 copies of the disparity constant
	734	mov al, disparityInit
	735	mov ah, al
	736	mov bx, ax
	737	shl eax, 16
	738	mov ax, bx
	739	movd mm0, eax
	740	movd mm1, eax
	741	punpckldq mm0, mm1
	742
	743	// load ecx with the pixelblock count = bytecount / 8
	744	mov ecx, bytecount
	745	shr ecx, 3
	746
	747	mov eax, BestCorrelation
	748	mov ebx, SecondCorrelation
	749	mov esi, CurrentCorrelation
	750	mov edx, Disparity
	751
	752	pixel_loop:
	753	movq mm1, [esi]
	754	movq [eax], mm1 // Best = Current
	755	movq [ebx], mm1 // Second = Current
	756	movq [edx], mm0 // Disparity = disparityInit
	757
	758	add eax, 8
	759	add ebx, 8
	760	add edx, 8
	761	add esi, 8
	762	dec ecx
	763	jnz pixel_loop
	764
	765	jmp done
	766
	767
	768	done:
	769	emms;
	770	}
	771	}
	772
	773	inline int findMinimumCorrelation(
	774	const unsigned char *CurrentCorrelation,
	775	unsigned char CurrentDisparity,
	776	unsigned char *Disparity,
	777	unsigned char *BestCorrelation,
	778	unsigned char *SecondCorrelation,
	779	int bytecount)
	780	{
	781	for (int i=0; i<bytecount; ++i,++CurrentCorrelation,++Disparity,++BestCorrelation, ++SecondCorrelation)
	782	{
	783	if (CurrentCorrelation<BestCorrelation) {
	784	*Disparity = CurrentDisparity;
	785	SecondCorrelation = BestCorrelation;
	786	BestCorrelation = CurrentCorrelation;
	787	}
	788	}
	789	return 1;
	790	}
	791
	792	// ----------------------
	793	// FULL IMAGE, BEST+SECOND .. Keith's code
	794	inline int findMinimumCorrelation_mmx(
	795	const unsigned char *CurrentCorrelation,
	796	unsigned char CurrentDisparity,
	797	unsigned char *Disparity,
	798	unsigned char *BestCorrelation,
	799	unsigned char *SecondCorrelation,
	800	int bytecount)
	801	{
	802	if ((bytecount < 8) \|\| ((bytecount % 8) != 0)) {
	803	return 0;
	804	}
	805
	806	__asm {
	807	// load ecx with the pixelblock count = bytecount / 8
	808	mov ecx, bytecount
	809	shr ecx, 3
	810
	811	// setup mm0 with 8 copies of the disparity constant
	812	mov al, CurrentDisparity
	813	mov ah, al
	814	mov bx, ax
	815	shl eax, 16
	816	mov ax, bx
	817	movd mm0, eax
	818	movd mm1, eax
	819	punpckldq mm0, mm1
	820
	821	// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
	822	mov eax, 0x80808080
	823	movd mm1, eax
	824	movd mm2, eax
	825	punpckldq mm1, mm2
	826
	827
	828	// setup the image pointers
	829	mov eax, BestCorrelation
	830	mov ebx, SecondCorrelation
	831	mov esi, CurrentCorrelation
	832	mov edi, Disparity
	833
	834	pixel_loop:
	835	movq mm2, [esi] // current correlation
	836	movq mm4, [ebx] // second correlation
	837
	838	// convert the current correlation from unsigned range to signed range
	839	movq mm5, mm2 // copy the current correlation
	840	pxor mm5, mm1 // convert from unsigned range to signed range
	841	movq mm7, mm5 // copy converted to mm7
	842
	843
	844	// check for second correlation updates
	845	movq mm6, mm4 // copy second best correlation
	846	pxor mm6, mm1 // convert from unsigned range to signed range
	847
	848	pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask
	849
	850	// BYPASS 1
	851	// skip remainder if second correlation is not to be updated
	852	// this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes
	853	pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
	854	pxor mm6, mm7 // mm6 = mm7 xor 0xFFFFFFFF = not mm7
	855	// 0 indicates current > second, so keep old value
	856	// 1 indicates current <= second, so use new value
	857
	858
	859	packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
	860	// 11111111 11111111 => 11111111 some replaced
	861	// 11111111 00000000 => 11111111 some replaced
	862	// 00000000 11111111 => 11111111 some replaced
	863	// 00000000 00000000 => 00000000 no replacements
	864
	865	// don't need to backup edx because its not used in this routine
	866	// movd mm3, edx // make a backup of edx
	867	movd edx, mm6 // get the saturated mask
	868	test edx, edx // test edx => yields 0 iff no replacements will occur
	869	// movd edx, mm3 // restore edx
	870	jz bypass1
	871
	872
	873	// direct update second correlation (get values from current)
	874	// mm7 already has mask
	875	// movq mm6, mm7 // mm6 := mask
	876	// pand mm6, mm4 // second correlation values to keep
	877	// pandn mm7, mm2 // current correlation values to move to second correlation
	878	// por mm6, mm7 // merge value => direct updated second correlation
	879	// movq mm4, mm6 // store values (*** this instruction could be eliminated!)
	880
	881	pand mm4, mm7 // second correlation values to keep
	882	pandn mm7, mm2 // current correlation values to move to second correlation
	883	por mm4, mm7 // merge value => direct updated second correlation
	884
	885
	886	// check for best correlation updates
	887	movq mm3, [eax] // best correlation
	888	// mm5 has converted current correlation
	889	movq mm6, mm3 // copy the best correlation
	890	pxor mm6, mm1 // convert from unsigned range to signed range
	891
	892	pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
	893	// 1 indicates current > best, so keep best
	894	// 0 indicates current <= best, so use new value
	895	// BYPASS 2
	896	// this phase adds 8 additional instructions, but could skip 2 writes and 1 read
	897	// abort remainder if not updating best correlation
	898	pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
	899	pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5
	900	// 0 indicates current > best, so keep best
	901	// 1 indicates current <= best, so use new value
	902
	903	packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
	904	// 11111111 11111111 => 11111111 some replaced
	905	// 11111111 00000000 => 11111111 some replaced
	906	// 00000000 11111111 => 11111111 some replaced
	907	// 00000000 00000000 => 00000000 no replacements
	908
	909	// don't need to backup edx because its not used in this routine
	910	// movd mm7, edx // make a backup of edx
	911	movd edx, mm6 // get the saturated mask
	912	test edx, edx // test edx => yields 0 iff no substitutions will occur
	913	// movd edx, mm7 // restore edx
	914	jz bypass2 // store mm4 (second correlation) to [ebx]
	915
	916
	917	// indirect update second correlation (pushed down from best)
	918	movq mm6, mm5 // mm6 := mask
	919	movq mm7, mm5 // mm7 := mask
	920
	921	pand mm6, mm4 // second correlation values to keep
	922	pandn mm7, mm3 // best correlations to move to second correlation
	923
	924	por mm6, mm7 // merge values
	925	movq [ebx], mm6 // store values
	926
	927	// direct Update best Correlation
	928	movq mm6, mm5 // mm6 := mask
	929	movq mm7, mm5 // mm7 := mask
	930
	931	pand mm6, mm3 // best correlation values to keep
	932	pandn mm7, mm2 // current correlation value to move to best correlation
	933
	934	por mm6, mm7 // merge values
	935	movq [eax], mm6 // store values
	936
	937	// update disparity
	938	movq mm2, [edi] // get disparity map
	939	movq mm6, mm5 // mm6 := mask
	940
	941	pand mm5, mm2 // select disparity map values to keep
	942	pandn mm6, mm0 // select current disparity values to move to disparity map
	943
	944	por mm5, mm6 // merge values
	945	movq [edi], mm5 // store values
	946
	947
	948	bypass1:
	949	next_pixel:
	950	add eax, 8
	951	add ebx, 8
	952	add esi, 8
	953	add edi, 8
	954
	955	dec ecx
	956	jnz pixel_loop
	957
	958	jmp done
	959
	960	bypass2:
	961	movq [ebx], mm4;
	962	jmp next_pixel
	963
	964	done:
	965	emms;
	966	}
	967
	968	return 1;
	969	}
	970
	971
	972
	973	inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
	974	{
	975	im += maskSize/2;
	976	im_out += maskSize/2;
	977	for (int i=0; i<rowSize; ++i) {
	978	int s=0;
	979	for (int j=-maskSize/2; j<=maskSize/2; ++j) {
	980	s+=*(im+j);
	981	}
	982	*im_out=s/maskSize;
	983	++im;++im_out;
	984	}
	985	}
	986
	987	inline void sum_Row_mmx(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
	988	{
	989	sum_Row_5_mmx(im, im_out, rowSize);
	990	for (int i=0; i<(maskSize-5)/2; ++i)
	991	sum_Row_5_mmx(im_out, im_out, rowSize);
	992	}
	993
	994	inline void sum_Row_mmx(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize)
	995	{
	996	sum_Row_5_mmx(im, im_out, rowSize);
	997	for (int i=0; i<(maskSize-5)/2; ++i)
	998	sum_Row_5_mmx(im_out, im_out, rowSize);
	999	}
	1000
	1001	#define aim_Sum_Words_In_MM1 __asm \
	1002	{ \
	1003	__asm movq mm4, mm1 \
	1004	__asm movq mm2, mm1 \
	1005	\
	1006	__asm movq mm3, mm1 \
	1007	__asm psllq mm1, 16 \
	1008	\
	1009	__asm psrlq mm2, 16 \
	1010	__asm paddw mm4, mm2 \
	1011	\
	1012	__asm paddw mm3, mm1 \
	1013	__asm psrlq mm2, 16 \
	1014	\
	1015	__asm psllq mm1, 16 \
	1016	__asm paddw mm4, mm2 \
	1017	\
	1018	__asm psrlq mm2, 16 \
	1019	__asm paddw mm3, mm1 \
	1020	\
	1021	__asm psllq mm1, 16 \
	1022	__asm paddw mm4, mm2 \
	1023	\
	1024	__asm paddw mm3, mm1 \
	1025	}
	1026
	1027
	1028
	1029
	1030
	1031	// apply the mask [1 1 1 1 1] to the 1-D array im (bytes)
	1032	// output : im_out (words)
	1033	inline void sum_Row_5_mmx(uchar* im, unsigned short* im_out, int rowSize)
	1034	{
	1035	// temp: for debugging
	1036	//return sum_Row_5(im,im_out,rowSize);
	1037	__asm {
	1038
	1039	mov eax, rowSize
	1040	mov ebx, im
	1041	mov ecx, im_out
	1042
	1043	pxor mm6, mm6 // mm6 = x00000000
	1044
	1045	//Process the first quad word, but save only the second result"
	1046	test eax, eax // Is there anything to do?"
	1047	jz end_sum_loop // Jump out if necessary
	1048
	1049
	1050	//Process low word
	1051	movq mm1, [ebx] // Copy...
	1052	punpcklbw mm1, mm6 // Expand low word bytes into words // mm1 =[D C B A]
	1053
	1054	aim_Sum_Words_In_MM1
	1055
	1056	//Store the result Only in the accumulator
	1057	movq mm7, mm4 // Update accumulator mm4=[D C+D B+C+D A+B+C+D]
	1058
	1059	//Process high word
	1060	movq mm1, [ebx] // Copy...
	1061	punpckhbw mm1, mm6 // Expand high word bytes into words // mm1 =[H G F E]
	1062	add ebx, 8 // Update input pointer
	1063
	1064	aim_Sum_Words_In_MM1
	1065
	1066	//Add to the previous data ...
	1067	// mm3=[E+F+G+H E+F+G E+F E]
	1068	// mm4=[H G+H F+G+H E+F+G+H]
	1069	paddw mm7, mm3 // The current word of the accum // mm7=[D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
	1070
	1071	// translate everything to 2 words on the left
	1072	movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
	1073	psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
	1074
	1075	movq mm0, mm1 // mm0 = [D+E+F+G+H C+D+E+F+G]
	1076
	1077	psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
	1078
	1079	movq [ecx], mm7 // Store the final result
	1080	add ecx, 8 // Update output pointer
	1081
	1082	movq mm7, mm4 // Update accumulator mm4=[H G+H F+G+H E+F+G+H]
	1083	sub eax, 8 // Update the number of points left
	1084
	1085	// Start the loop
	1086	row_sum_loop:
	1087	test eax, eax // Is there anything to do?
	1088	jz end_sum_loop // Jump out if necessary
	1089
	1090	movq mm1, [ebx] // Load data
	1091
	1092	//Process low word
	1093	punpcklbw mm1, mm6 // Expand low word bytes into words
	1094
	1095	aim_Sum_Words_In_MM1
	1096
	1097	//Add to the previous data
	1098	//prefetcht1 [ecx+16]
	1099	paddw mm7, mm3 // The current word of the accum
	1100
	1101	// translate everything to 2 words on the left
	1102	// mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7=[0 0 H G] [ecx]=[F E D C]
	1103	punpckldq mm0, mm7 // mm0 = [F E D C]
	1104
	1105	movq [ecx], mm0
	1106	sub eax, 8 // Update the number of points left
	1107
	1108	movq mm0, mm4 // Update accumulator
	1109	psrlq mm7, 32 // mm7 = [0 0 H G]
	1110
	1111	//Process high word
	1112	movq mm1, [ebx] // Copy...
	1113	punpckhbw mm1, mm6 // Expand high word bytes into words
	1114
	1115	aim_Sum_Words_In_MM1
	1116
	1117	//Add to the previous data
	1118	paddw mm0, mm3 // The current word of the accum
	1119
	1120	// translate everything to 2 words on the left
	1121	// mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
	1122	punpckldq mm7, mm0 // mm7 = [F E D C]
	1123	add ebx, 8 // Update input pointer
	1124
	1125	movq [ecx+8], mm7
	1126	psrlq mm0, 32 // mm0 = [0 0 H G]
	1127
	1128	movq mm7, mm4 // Update accumulator
	1129	add ecx, 16 // Update output pointer
	1130
	1131	jmp row_sum_loop // Loop
	1132
	1133	//Cleanup
	1134	end_sum_loop:
	1135	emms
	1136	}
	1137	}
	1138
	1139
	1140
	1141	// apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words)
	1142	// output : im_out (words)
	1143	inline void sum_Row_5_mmx(ushort* im, ushort* im_out, int rowSize)
	1144	{
	1145	// temp: for debugging
	1146	//return sum_Row_5(im,im_out,rowSize);
	1147	__asm {
	1148
	1149	mov eax, rowSize
	1150	mov ebx, im
	1151	mov ecx, im_out
	1152
	1153	//Process the first quad word, but save only the second result"
	1154	test eax, eax // Is there anything to do?"
	1155	jz end_sum_loop // Jump out if necessary
	1156
	1157	movq mm1, [ebx] // Load data (4 words)
	1158	add ebx, 8 // Update input pointer
	1159
	1160	//Process low word
	1161	aim_Sum_Words_In_MM1
	1162
	1163	//Store the result Only in the accumulator
	1164	movq mm7, mm4 // Update accumulator
	1165
	1166	//Process high word
	1167	movq mm1, [ebx] // Copy...
	1168
	1169	aim_Sum_Words_In_MM1
	1170	add ebx, 8
	1171
	1172	//Add to the previous data
	1173	paddw mm7, mm3 // The current word of the accum
	1174
	1175	// translate everything to 2 words on the left
	1176	movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
	1177	psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
	1178	movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G]
	1179	psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
	1180
	1181	movq [ecx], mm7 // Store the final result
	1182	movq mm7, mm4 // Update accumulator
	1183
	1184	add ecx, 8 // Update output pointer
	1185	sub eax, 8 // Update the number of points left
	1186
	1187	// Start the loop
	1188	row_sum_loop:
	1189	test eax, eax // Is there anything to do?
	1190	jz end_sum_loop // Jump out if necessary
	1191
	1192	movq mm1, [ebx] // Load data
	1193
	1194	aim_Sum_Words_In_MM1
	1195
	1196	//Add to the previous data
	1197	//prefetcht0 [ecx + 32]
	1198	//prefetcht0 [ebx + 48]
	1199	paddw mm7, mm3 // The current word of the accum
	1200	psrlw mm7, 2 // divide result by ...
	1201
	1202	// translate everything to 2 words on the left
	1203	// mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C]
	1204	punpckldq mm0, mm7 // mm0 = [F E D C]
	1205
	1206	movq [ecx], mm0
	1207	sub eax, 8 // Update the number of points left
	1208
	1209	movq mm0, mm4 // Update accumulator
	1210	psrlq mm7, 32 // mm7 =[0 0 H G]
	1211
	1212	//Process high word
	1213	movq mm1, [ebx+8] // Copy...
	1214
	1215	aim_Sum_Words_In_MM1
	1216
	1217	//Add to the previous data
	1218	paddw mm0, mm3 // The current word of the accum
	1219	psrlw mm0, 2 // divide result by ...
	1220
	1221	// translate everything to 2 words on the left
	1222	// mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
	1223	punpckldq mm7, mm0 // mm7 = [F E D C]
	1224	add ebx, 16 // Update input pointer
	1225
	1226	movq [ecx+8], mm7
	1227	psrlq mm0, 32 // mm0 = [0 0 H G]
	1228
	1229	movq mm7, mm4 // Update accumulator
	1230	add ecx, 16 // Update output pointer */
	1231
	1232	jmp row_sum_loop // Loop
	1233
	1234	//Cleanup
	1235	end_sum_loop:
	1236	emms
	1237	}
	1238	}
	1239
	// Reference (non-MMX) 5-tap horizontal mean.
	// For k = 0 .. rowSize-6 the five consecutive inputs im[k] .. im[k+4] are
	// added up and their integer mean (sum / 5) is written to im_out[k+2],
	// i.e. to the position of the window centre. Outputs outside that range
	// are not written.
	template<class T> void sum_Row_5(T* im, ushort* im_out, int rowSize)
	{
		const int count = rowSize - 5;
		for (int k = 0; k < count; ++k) {
			const T* window = im + k;	// window[2] is the centre sample
			int total = window[0];
			total += window[1];
			total += window[2];
			total += window[3];
			total += window[4];
			im_out[k + 2] = total / 5;
		}
	}
	1254
	1255	inline void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
	1256	{
	1257	int offset = width*(sizeMask/2);
	1258	im += offset;
	1259	im_out += offset;
	1260	for (int i=0; i<dataSize-width*sizeMask; ++i, ++im, ++im_out) {
	1261	int s = 0;
	1262	for (int j=-sizeMask/2; j<=sizeMask/2; ++j) s += (im+jwidth);
	1263	*im_out = s/(sizeMask);
	1264	}
	1265	}
	1266
	1267	// apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
	1268	// result in 'im_out'
	1269	inline void avg_Col_mmx(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
	1270	{
	1271	// temp: for debugging
	1272	//return avg_Col(im,im_out,dataSize,width,sizeMask);
	1273
	1274	switch (sizeMask)
	1275	{
	1276	case 5: avg_Col_5(im,im_out,dataSize,width);
	1277	break;
	1278	case 7: avg_Col_7(im,im_out,dataSize,width);
	1279	break;
	1280	case 9: avg_Col_9(im,im_out,dataSize,width);
	1281	break;
	1282	case 11: avg_Col_11(im,im_out,dataSize,width);
	1283	break;
	1284	case 13: avg_Col_13(im,im_out,dataSize,width);
	1285	break;
	1286	case 15: avg_Col_15(im,im_out,dataSize,width);
	1287	break;
	1288	case 17: avg_Col_17(im,im_out,dataSize,width);
	1289	break;
	1290
	1291	default: if (sizeMask<5) avg_Col_5(im,im_out,dataSize,width);
	1292	else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
	1293	break;
	1294
	1295	}
	1296	}
	1297
	1298
	1299
	// macro_add: one accumulation step used inside the column kernels' __asm blocks.
	// Adds the 4 words at [edx] to mm3 and the 4 words at [edx+8] to mm2 with
	// unsigned saturation (paddusw), then advances edx by edi.
	// The kernels load edi with 2*width before using it, so each step moves edx
	// down one row of words. Every instruction carries its own __asm keyword
	// because the whole macro expands onto a single line.
	1300	#define macro_add __asm \
	1301	{ \
	1302	__asm paddusw mm3, [edx] \
	1303	__asm paddusw mm2, [edx+8] \
	1304	__asm add edx, edi \
	1305	}
	1306
	1307
	1308	inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width)
	1309	{
	1310	__asm {
	1311
	1312	mov edi, width
	1313	shl edi, 1 // edi = 2*width
	1314
	1315	mov eax, dataSize
	1316	mov ecx, im_out
	1317
	1318	mov ebx, im
	1319	sub ebx, edi
	1320	sub ebx, edi // ebx = ebx-4*width
	1321
	1322	test eax, eax // Is there anything to do?"
	1323	jz end_sum_loop // Jump out if necessary
	1324
	1325	row_sum_loop:
	1326
	1327	test eax, eax // Is there anything to do?
	1328	jz end_sum_loop // Jump out if necessary
	1329
	1330	mov edx, ebx
	1331	add ebx, 16
	1332
	1333	// 1
	1334	movq mm3, [edx] // mm3 = 4 words of im
	1335	movq mm2, [edx+8] // mm2 = next 4 words of im
	1336	add edx, edi
	1337
	1338	macro_add
	1339	macro_add
	1340	macro_add
	1341	macro_add
	1342
	1343	// divide results by ...
	1344	psrlw mm3, 3
	1345	psrlw mm2, 3
	1346
	1347	// convert [mm2 mm3] as 8 bytes
	1348	packuswb mm3,mm2
	1349	movq [ecx], mm3
	1350
	1351	sub eax, 8 // Update the number of points left
	1352	add ecx, 8 // Update output pointer
	1353
	1354	jmp row_sum_loop // Loop
	1355
	1356	//Cleanup
	1357	end_sum_loop:
	1358	emms
	1359	}
	1360	}
	1361
	1362	inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width)
	1363	{
	1364	__asm {
	1365
	1366	mov edi, width
	1367	shl edi, 1 // edi = 2*width
	1368
	1369	mov eax, dataSize
	1370	mov ecx, im_out
	1371
	1372	mov ebx, im
	1373	sub ebx, edi
	1374	sub ebx, edi
	1375	sub ebx, edi // ebx = ebx-6*width
	1376
	1377	test eax, eax // Is there anything to do?"
	1378	jz end_sum_loop // Jump out if necessary
	1379
	1380	row_sum_loop:
	1381
	1382	test eax, eax // Is there anything to do?
	1383	jz end_sum_loop // Jump out if necessary
	1384
	1385	mov edx, ebx
	1386
	1387	// 1
	1388	movq mm3, [edx] // mm3 = 4 words of im
	1389	add ebx, 16
	1390	movq mm2, [edx+8] // mm2 = next 4 words of im
	1391	add edx, edi
	1392
	1393	macro_add
	1394	macro_add
	1395	macro_add
	1396	macro_add
	1397	macro_add
	1398	macro_add
	1399
	1400	// divide results by ...
	1401	psrlw mm3, 3
	1402	psrlw mm2, 3
	1403
	1404	// convert [mm2 mm3] as 8 bytes
	1405	packuswb mm3,mm2
	1406	movq [ecx], mm3
	1407
	1408	sub eax, 8 // Update the number of points left
	1409	add ecx, 8 // Update output pointer
	1410
	1411	jmp row_sum_loop // Loop
	1412
	1413	//Cleanup
	1414	end_sum_loop:
	1415	emms
	1416	}
	1417	}
	1418
	1419	inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width)
	1420	{
	1421	__asm {
	1422
	1423	mov edi, width
	1424	shl edi, 1 // edi = 2*width
	1425
	1426	mov eax, dataSize
	1427	mov ecx, im_out
	1428
	1429	mov ebx, im
	1430	sub ebx, edi
	1431	sub ebx, edi
	1432	sub ebx, edi
	1433	sub ebx, edi // ebx = ebx-8*width
	1434
	1435	test eax, eax // Is there anything to do?"
	1436	jz end_sum_loop // Jump out if necessary
	1437
	1438	row_sum_loop:
	1439
	1440	test eax, eax // Is there anything to do?
	1441	jz end_sum_loop // Jump out if necessary
	1442
	1443	mov edx, ebx
	1444	add ebx, 16
	1445
	1446	// 1
	1447	movq mm3, [edx] // mm3 = 4 words of im
	1448	movq mm2, [edx+8] // mm2 = next 4 words of im
	1449	add edx, edi
	1450
	1451	macro_add
	1452	macro_add
	1453	macro_add
	1454	macro_add
	1455	macro_add
	1456	macro_add
	1457	macro_add
	1458	macro_add
	1459
	1460	// divide results by ...
	1461	psrlw mm3, 3
	1462	psrlw mm2, 3
	1463
	1464	// convert [mm2 mm3] as 8 bytes
	1465	packuswb mm3,mm2
	1466	movq [ecx], mm3
	1467
	1468	sub eax, 8 // Update the number of points left
	1469	add ecx, 8 // Update output pointer
	1470
	1471	jmp row_sum_loop // Loop
	1472
	1473	//Cleanup
	1474	end_sum_loop:
	1475	emms
	1476	}
	1477	}
	1478
	1479	inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width)
	1480	{
	1481	__asm {
	1482
	1483	mov edi, width
	1484	shl edi, 1 // edi = 2*width
	1485
	1486	mov eax, dataSize
	1487	mov ecx, im_out
	1488
	1489	mov ebx, im
	1490	sub ebx, edi
	1491	sub ebx, edi
	1492	sub ebx, edi
	1493	sub ebx, edi
	1494	sub ebx, edi // ebx = ebx-10*width
	1495
	1496	test eax, eax // Is there anything to do?"
	1497	jz end_sum_loop // Jump out if necessary
	1498
	1499	row_sum_loop:
	1500
	1501	test eax, eax // Is there anything to do?
	1502	jz end_sum_loop // Jump out if necessary
	1503
	1504	mov edx, ebx
	1505	add ebx, 16
	1506
	1507	// 1
	1508	movq mm3, [edx] // mm3 = 4 words of im
	1509	movq mm2, [edx+8] // mm2 = next 4 words of im
	1510	add edx, edi
	1511
	1512	macro_add
	1513	macro_add
	1514	macro_add
	1515	macro_add
	1516	macro_add
	1517	macro_add
	1518	macro_add
	1519	macro_add
	1520	macro_add
	1521	macro_add
	1522
	1523	// divide results by ...
	1524	psrlw mm3, 4
	1525	psrlw mm2, 4
	1526
	1527	// convert [mm2 mm3] as 8 bytes
	1528	packuswb mm3,mm2
	1529	movq [ecx], mm3
	1530
	1531	sub eax, 8 // Update the number of points left
	1532	add ecx, 8 // Update output pointer
	1533
	1534	jmp row_sum_loop // Loop
	1535
	1536	//Cleanup
	1537	end_sum_loop:
	1538	emms
	1539	}
	1540	}
	1541
	1542	inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width)
	1543	{
	1544	__asm {
	1545
	1546	mov edi, width
	1547	shl edi, 1 // edi = 2*width
	1548
	1549	mov eax, dataSize
	1550	mov ecx, im_out
	1551
	1552	mov ebx, im
	1553	sub ebx, edi
	1554	sub ebx, edi
	1555	sub ebx, edi
	1556	sub ebx, edi
	1557	sub ebx, edi
	1558	sub ebx, edi // ebx = ebx-12*width
	1559
	1560	test eax, eax // Is there anything to do?"
	1561	jz end_sum_loop // Jump out if necessary
	1562
	1563	row_sum_loop:
	1564
	1565	test eax, eax // Is there anything to do?
	1566	jz end_sum_loop // Jump out if necessary
	1567
	1568	mov edx, ebx
	1569	add ebx, 16
	1570
	1571	// 1
	1572	movq mm3, [edx] // mm3 = 4 words of im
	1573	movq mm2, [edx+8] // mm2 = next 4 words of im
	1574	add edx, edi
	1575
	1576	macro_add
	1577	macro_add
	1578	macro_add
	1579	macro_add
	1580	macro_add
	1581	macro_add
	1582	macro_add
	1583	macro_add
	1584	macro_add
	1585	macro_add
	1586	macro_add
	1587	macro_add
	1588
	1589	// divide results by ...
	1590	psrlw mm3, 4
	1591	psrlw mm2, 4
	1592
	1593	// convert [mm2 mm3] as 8 bytes
	1594	packuswb mm3,mm2
	1595	movq [ecx], mm3
	1596
	1597	sub eax, 8 // Update the number of points left
	1598	add ecx, 8 // Update output pointer
	1599
	1600	jmp row_sum_loop // Loop
	1601
	1602	//Cleanup
	1603	end_sum_loop:
	1604	emms
	1605	}
	1606	}
	1607
	1608	inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width)
	1609	{
	1610	__asm {
	1611
	1612	mov edi, width
	1613	shl edi, 1 // edi = 2*width
	1614
	1615	mov eax, dataSize
	1616	mov ecx, im_out
	1617
	1618	mov ebx, im
	1619	sub ebx, edi
	1620	sub ebx, edi
	1621	sub ebx, edi
	1622	sub ebx, edi
	1623	sub ebx, edi
	1624	sub ebx, edi
	1625	sub ebx, edi // ebx = ebx-14*width
	1626
	1627	test eax, eax // Is there anything to do?"
	1628	jz end_sum_loop // Jump out if necessary
	1629
	1630	row_sum_loop:
	1631
	1632	test eax, eax // Is there anything to do?
	1633	jz end_sum_loop // Jump out if necessary
	1634
	1635	mov edx, ebx
	1636	add ebx, 16
	1637
	1638	// 1
	1639	movq mm3, [edx] // mm3 = 4 words of im
	1640	movq mm2, [edx+8] // mm2 = next 4 words of im
	1641	add edx, edi
	1642
	1643	macro_add
	1644	macro_add
	1645	macro_add
	1646	macro_add
	1647	macro_add
	1648	macro_add
	1649	macro_add
	1650	macro_add
	1651	macro_add
	1652	macro_add
	1653	macro_add
	1654	macro_add
	1655	macro_add
	1656	macro_add
	1657
	1658	// divide results by ...
	1659	psrlw mm3, 4
	1660	psrlw mm2, 4
	1661
	1662	// convert [mm2 mm3] as 8 bytes
	1663	packuswb mm3,mm2
	1664	movq [ecx], mm3
	1665
	1666	sub eax, 8 // Update the number of points left
	1667	add ecx, 8 // Update output pointer
	1668
	1669	jmp row_sum_loop // Loop
	1670
	1671	//Cleanup
	1672	end_sum_loop:
	1673	emms
	1674	}
	1675	}
	1676
	1677	inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width)
	1678	{
	1679	__asm {
	1680
	1681	mov edi, width
	1682	shl edi, 1 // edi = 2*width
	1683
	1684	mov eax, dataSize
	1685	mov ecx, im_out
	1686
	1687	mov ebx, im
	1688	sub ebx, edi
	1689	sub ebx, edi
	1690	sub ebx, edi
	1691	sub ebx, edi
	1692	sub ebx, edi
	1693	sub ebx, edi
	1694	sub ebx, edi
	1695	sub ebx, edi // ebx = ebx-16*width
	1696
	1697	test eax, eax // Is there anything to do?"
	1698	jz end_sum_loop // Jump out if necessary
	1699
	1700	row_sum_loop:
	1701
	1702	test eax, eax // Is there anything to do?
	1703	jz end_sum_loop // Jump out if necessary
	1704
	1705	mov edx, ebx
	1706	add ebx, 16
	1707
	1708	// 1
	1709	movq mm3, [edx] // mm3 = 4 words of im
	1710	movq mm2, [edx+8] // mm2 = next 4 words of im
	1711	add edx, edi
	1712
	1713	macro_add
	1714	macro_add
	1715	macro_add
	1716	macro_add
	1717	macro_add
	1718	macro_add
	1719	macro_add
	1720	macro_add
	1721	macro_add
	1722	macro_add
	1723	macro_add
	1724	macro_add
	1725	macro_add
	1726	macro_add
	1727	macro_add
	1728	macro_add
	1729
	1730	// divide results by ...
	1731	psrlw mm3, 4
	1732	psrlw mm2, 4
	1733
	1734	// convert [mm2 mm3] as 8 bytes
	1735	packuswb mm3,mm2
	1736	movq [ecx], mm3
	1737
	1738	sub eax, 8 // Update the number of points left
	1739	add ecx, 8 // Update output pointer
	1740
	1741	jmp row_sum_loop // Loop
	1742
	1743	//Cleanup
	1744	end_sum_loop:
	1745	emms
	1746	}
	1747	}
	1748
	1749
	1750	inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width)
	1751	{
	1752	__asm {
	1753
	1754	mov edi, width
	1755	shl edi, 1 // edi = 2*width
	1756
	1757	mov eax, dataSize
	1758	mov ecx, im_out
	1759
	1760	mov ebx, im
	1761	sub ebx, edi
	1762	sub ebx, edi // ebx = ebx-4*width
	1763
	1764	test eax, eax // Is there anything to do?"
	1765	jz end_sum_loop // Jump out if necessary
	1766
	1767	row_sum_loop:
	1768
	1769	test eax, eax // Is there anything to do?
	1770	jz end_sum_loop // Jump out if necessary
	1771
	1772	mov edx, ebx
	1773	add ebx, 16
	1774
	1775	// 1
	1776	movq mm3, [edx] // mm3 = 4 words of im
	1777	movq mm2, [edx+8] // mm2 = next 4 words of im
	1778	add edx, edi
	1779
	1780	macro_add
	1781	macro_add
	1782	macro_add
	1783	macro_add
	1784
	1785	// save [mm2 mm3] as 8 bytes
	1786	packuswb mm3,mm2
	1787	movq [ecx], mm3
	1788
	1789	sub eax, 8 // Update the number of points left
	1790	add ecx, 8 // Update output pointer
	1791
	1792	jmp row_sum_loop // Loop
	1793
	1794	//Cleanup
	1795	end_sum_loop:
	1796	emms
	1797	}
	1798	}
	1799
	1800	inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width)
	1801	{
	1802	__asm {
	1803
	1804	mov edi, width
	1805	shl edi, 1 // edi = 2*width
	1806
	1807	mov eax, dataSize
	1808	mov ecx, im_out
	1809
	1810	mov ebx, im
	1811	sub ebx, edi
	1812	sub ebx, edi // ebx = ebx-4*width
	1813
	1814	test eax, eax // Is there anything to do?"
	1815	jz end_sum_loop // Jump out if necessary
	1816
	1817	row_sum_loop:
	1818
	1819	test eax, eax // Is there anything to do?
	1820	jz end_sum_loop // Jump out if necessary
	1821
	1822	mov edx, ebx
	1823	add ebx, 16
	1824
	1825	// 1
	1826	movq mm3, [edx] // mm3 = 4 words of im
	1827	movq mm2, [edx+8] // mm2 = next 4 words of im
	1828	add edx, edi
	1829
	1830	macro_add
	1831	macro_add
	1832	macro_add
	1833	macro_add
	1834
	1835	// save [mm2 mm3] as words
	1836	movq [ecx], mm3
	1837	movq [ecx+8], mm2
	1838
	1839	sub eax, 8 // Update the number of points left
	1840	add ecx, 16 // Update output pointer
	1841
	1842	jmp row_sum_loop // Loop
	1843
	1844	//Cleanup
	1845	end_sum_loop:
	1846	emms
	1847	}
	1848	}
	1849
	1850	// compare bestScores and secondScores. if second<best+'thresh' the disp.
	1851	// is set to 'valForReplacement' (usually 0)
	1852	inline void compareBestAndSecond(uchar* bestScores, uchar* secondScores, char thresh,
	1853	uchar undefined_val,
	1854	uchar* disp, int dataSize)
	1855	{
	1856	__asm {
	1857
	1858	// setup mm0 with 8 copies of 'thresh'
	1859	mov al, thresh
	1860	mov ah, al
	1861	mov bx, ax
	1862	shl eax, 16
	1863	mov ax, bx
	1864	movd mm0, eax
	1865	movd mm1, eax
	1866	punpckldq mm0, mm1
	1867
	1868	// setup mm7 with 8 copies of 'valForReplacement'
	1869	mov al, undefined_val
	1870	mov ah, al
	1871	mov bx, ax
	1872	shl eax, 16
	1873	mov ax, bx
	1874	movd mm7, eax
	1875	movd mm1, eax
	1876	punpckldq mm7, mm1
	1877
	1878	mov eax, dataSize
	1879	mov ebx, bestScores
	1880	mov ecx, secondScores
	1881	mov edx, disp
	1882
	1883	test eax, eax // Is there anything to do?"
	1884	jz end_loop // Jump out if necessary
	1885
	1886	comp_loop:
	1887
	1888	test eax, eax // Is there anything to do?
	1889	jz end_loop // Jump out if necessary
	1890
	1891	movq mm2, [ecx]
	1892	psubusb mm2, [ebx] // mm2 = secondScores - bestScores
	1893
	1894	movq mm3, [edx] // mm3 = disp
	1895	pcmpgtb mm2, mm0 // mm2 = 1 if mm2>thresh
	1896	// 0 otherwise
	1897
	1898	pand mm3, mm2
	1899	pandn mm2, mm7
	1900
	1901	por mm3, mm2
	1902	movq [edx], mm3
	1903
	1904	sub eax, 8 // Update the number of points left
	1905	add ebx, 8 // Update output pointer
	1906	add ecx, 8
	1907	add edx, 8
	1908
	1909	jmp comp_loop // Loop
	1910
	1911	//Cleanup
	1912	end_loop:
	1913	emms
	1914	}
	1915	}
	1916
	1917	// windowWidth must be multiple of 8
	1918	inline void cropImage(const uchar* imSrc, int width, int height,
	1919	uchar* imDest, int x0, int y0, int windowWidth, int windowHeight)
	1920	{
	1921	int w8 = windowWidth/8;
	1922
	1923	int step = width-windowWidth;
	1924	const uchar* srcNewOrigin = imSrc+x0+y0*width;
	1925
	1926	__asm {
	1927
	1928	mov ecx, windowHeight
	1929
	1930	mov edx, w8
	1931	mov eax, srcNewOrigin
	1932	mov ebx, imDest
	1933
	1934	pixel_loop:
	1935
	1936	movq mm1, [eax]
	1937	movq [ebx], mm1
	1938	add eax, 8
	1939	add ebx, 8
	1940
	1941	dec edx
	1942	jnz pixel_loop
	1943
	1944	mov edx, w8
	1945	add eax, step
	1946
	1947	dec ecx
	1948	jnz pixel_loop
	1949
	1950	jmp done
	1951
	1952	done:
	1953	emms;
	1954	}
	1955	}
	1956
	1957	// return the average pixel value
	1958	inline float pixelMean(const uchar* im, int imageSize)
	1959	{
	1960	int sum;
	1961
	1962	__asm {
	1963
	1964	mov ecx, imageSize
	1965	shr ecx, 3
	1966
	1967	mov eax, im
	1968	pxor mm7,mm7 // mm7 used as accumulator
	1969	pxor mm0,mm0 // mm0 = 0
	1970
	1971	pixel_loop:
	1972
	1973	movq mm1, [eax]
	1974	movq mm2,mm1
	1975
	1976	punpcklbw mm2, mm0
	1977	punpckhbw mm1, mm0
	1978
	1979	paddw mm2,mm1
	1980
	1981	movq mm1,mm2
	1982	punpcklwd mm2, mm0
	1983	punpckhwd mm1, mm0
	1984
	1985	paddd mm2,mm1
	1986	paddd mm7,mm2
	1987
	1988	add eax, 8
	1989	dec ecx
	1990	jnz pixel_loop
	1991
	1992	jmp done
	1993
	1994	done:
	1995	movd ebx, mm7
	1996	psrlq mm7, 32
	1997	movd edx, mm7
	1998	add ebx, edx
	1999	mov sum, ebx
	2000
	2001	emms
	2002	}
	2003
	2004	return sum / (float)imageSize;
	2005	}
	2006
	2007
	2008
	2009
	2010	// -------------------------------------------------------------
	2011	// apply mask:
	2012	// if mask[]=undefined_val im[]->im[]
	2013	// otherwise, im[]->mask[]
	2014	// ....... this one may not be exact :-(
	2015	inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
	2016	{
	2017	__asm {
	2018	// setup mm0 with 8 copies of 'undefined_val'
	2019	mov al, undefined_val
	2020	mov ah, al
	2021	mov bx, ax
	2022	shl eax, 16
	2023	mov ax, bx
	2024	movd mm0, eax
	2025	movd mm1, eax
	2026	punpckldq mm0, mm1
	2027
	2028	mov ecx, imageSize
	2029	shr ecx, 3
	2030
	2031	mov eax, im
	2032	mov ebx, mask
	2033
	2034	pixel_loop:
	2035	movq mm1, [eax]
	2036	movq mm2, [ebx]
	2037
	2038	movq mm3, mm2
	2039	pcmpeqb mm3, mm0 // mm3[] -> xFF if mm2[]==undefined_val
	2040	// -> x00 otherwise
	2041	pand mm3, mm1 // mm3[] = mm1[] if mm2[]==undefined_val
	2042	// = x00 otherwise
	2043	por mm3, mm2
	2044	movq [eax], mm3
	2045
	2046	add eax, 8
	2047	add ebx, 8
	2048	dec ecx
	2049	jnz pixel_loop
	2050
	2051	jmp done
	2052
	2053	done:
	2054	emms
	2055	}
	2056	}
	2057
	2058	inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
	2059	{
	2060	for (int i=0; i<imageSize; ++i, ++im,++mask)
	2061	{
	2062	if (mask != undefined_val) im=*mask;
	2063	}
	2064
	2065	}
	2066
	2067
	2068	inline void divide( ushort* im, uchar* div, uchar* result, int imageSize)
	2069	{
	2070	for (int i=0; i<imageSize; ++i,++im,++div,++result)
	2071	{
	2072	result = (div)?(uchar)(im / div):0;
	2073	}
	2074	}
	2075
	2076	// 5x5 sum filters
	2077	inline void sum_5x5_mmx( uchar* im, ushort* im_out, int dataSize, int width, ushort* buff)
	2078	{
	2079	sum_Row_5_mmx(im, buff, dataSize);
	2080	add_Col_5_ww(buff+2width, im_out+2width, dataSize-4*width , width);
	2081	}
	2082
	2083	inline void sum_5x5_mmx( uchar* im, uchar* im_out, int dataSize, int width, ushort* buff)
	2084	{
	2085	sum_Row_5_mmx(im, buff, dataSize);
	2086	add_Col_5_wb(buff+2width, im_out+2width, dataSize-4*width , width);
	2087	}
	2088
	2089
	2090	inline void binarize(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
	2091	{
	2092	for (int i=0; i<dataSize; ++i,++im,++im_out)
	2093	{
	2094	im_out = (im != undefined_val);
	2095	}
	2096	}
	2097
	2098	inline void set_undefined_to_zero(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
	2099	{
	2100	for (int i=0; i<dataSize; ++i,++im,++im_out)
	2101	{
	2102	if (im == undefined_val) im_out=0;
	2103	}
	2104	}
	2105
	2106	inline void set_zero_to_undefined(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
	2107	{
	2108	for (int i=0; i<dataSize; ++i,++im,++im_out)
	2109	{
	2110	if (im == 0) im_out=undefined_val;
	2111	}
	2112	}
	2113
	2114
	2115
	2116	inline void copyMMX(void* imDest, const void* imSrc, int dataSize)
	2117	{
	2118	__asm {
	2119
	2120	mov ecx, dataSize
	2121	shr ecx, 3
	2122
	2123	mov eax, imSrc
	2124	mov ebx, imDest
	2125	sub ebx, 8
	2126
	2127	pixel_loop:
	2128	movq mm1, [eax]
	2129	add ebx, 8
	2130
	2131	movq [ebx], mm1
	2132	add eax, 8
	2133
	2134	dec ecx
	2135	jnz pixel_loop
	2136
	2137	jmp done
	2138
	2139	done:
	2140	emms
	2141	}
	2142	}
	2143
	2144	inline void copySSE(void* imDest, const void* imSrc, int dataSize)
	2145	{
	2146	__asm {
	2147
	2148	mov ecx, dataSize
	2149	shr ecx, 4
	2150
	2151	mov eax, imSrc
	2152	mov ebx, imDest
	2153	sub ebx, 16
	2154
	2155	pixel_loop:
	2156	movdqa xmm1, [eax]
	2157	add ebx, 16
	2158
	2159	movdqa [ebx], xmm1
	2160	add eax, 16
	2161
	2162	dec ecx
	2163	jnz pixel_loop
	2164
	2165	jmp done
	2166
	2167	done:
	2168	emms
	2169	}
	2170	}
	2171
	2172	inline void setMMX(float* imDest, const float value, int dataSize)
	2173	{
	2174	__asm {
	2175	// make 4 copies of the constant 'value' in xmm0
	2176	movss xmm0, value
	2177	movss xmm1, xmm0
	2178	unpcklps xmm0, xmm1
	2179	movlhps xmm0, xmm0
	2180
	2181	mov ecx, dataSize
	2182	shr ecx, 2
	2183
	2184	mov ebx, imDest
	2185
	2186	pixel_loop:
	2187	movaps [ebx], xmm0
	2188	add ebx, 16
	2189
	2190	dec ecx
	2191	jnz pixel_loop
	2192
	2193	jmp done
	2194
	2195	done:
	2196	emms
	2197	}
	2198	}
	2199
	2200	inline void setMMX(char* imDest, const char value, int dataSize)
	2201	{
	2202	__asm {
	2203	// setup mm0 with 8 copies of 'value'
	2204	mov al, value
	2205	mov ah, al
	2206	mov bx, ax
	2207	shl eax, 16
	2208	mov ax, bx
	2209	movd mm0, eax
	2210	movd mm1, eax
	2211	punpckldq mm0, mm1
	2212
	2213
	2214	mov ecx, dataSize
	2215	shr ecx, 3
	2216
	2217	mov ebx, imDest
	2218
	2219	pixel_loop:
	2220	movq [ebx], mm0
	2221	add ebx, 8
	2222
	2223	dec ecx
	2224	jnz pixel_loop
	2225
	2226	jmp done
	2227
	2228	done:
	2229	emms
	2230	}
	2231	}
	2232
	2233	/*
	2234	void copyRGBAtoRGB(const uchar* imSrc, uchar* imred,uchar* imgreen,uchar* imblue, int dataSize)
	2235	{
	2236	__asm {
	2237
	2238	mov esi, dataSize
	2239	shr esi, 3
	2240
	2241	mov eax, imSrc
	2242	mov ebx, imred
	2243	mov ecx, imred
	2244	mov edx, imred
	2245
	2246	pixel_loop:
	2247	movq mm1, [eax]
	2248
	2249
	2250	movq [ebx], mm1
	2251
	2252	add eax, 8
	2253	add ebx, 8
	2254	add ecx, 8
	2255	add edx, 8
	2256
	2257	dec esi
	2258	jnz pixel_loop
	2259
	2260	jmp done
	2261
	2262	done:
	2263	emms
	2264	}
	2265	}*/
	2266
	2267	inline void multiply(uchar* im, float fact, int imageSize)
	2268	{
	2269	__asm {
	2270
	2271	mov ecx, imageSize
	2272	shr ecx, 3
	2273
	2274	// make 4 copies of the constant 'fact' in xmm0
	2275	movss xmm0, fact
	2276	movss xmm1, xmm0
	2277	unpcklps xmm0, xmm1
	2278	movlhps xmm0, xmm0
	2279
	2280
	2281	mov eax, im
	2282	pxor mm7,mm7 // mm7 = 0
	2283
	2284	pixel_loop:
	2285	movq mm1, [eax]
	2286	movq mm2, mm1
	2287
	2288	punpcklbw mm2, mm0
	2289	punpckhbw mm1, mm0
	2290
	2291	movq mm3,mm2
	2292	punpckhwd mm3, mm0
	2293	punpcklwd mm2, mm0
	2294
	2295	movq mm4,mm1
	2296	punpcklwd mm4, mm0
	2297	punpckhwd mm1, mm0
	2298
	2299	// here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
	2300	// --------
	2301	cvtpi2ps xmm3, mm3 // put mm3 in low part of xmm3
	2302	cvtpi2ps xmm2, mm2 // put mm2 in low part of xmm2
	2303
	2304	movlhps xmm2, xmm3 // xmm2 = [xmm3(low part) xmm2(low part)]
	2305
	2306	mulps xmm2, xmm0
	2307
	2308	cvtps2pi mm2, xmm2 // convert low 2 floats from xmm2 to mm2
	2309	movhlps xmm3,xmm2 // mov high 2 floats from xmm2 to low 2 floats in xmm3
	2310	cvtps2pi mm3, xmm3 // convert low 2 floats from xmm3 to mm3
	2311
	2312	packssdw mm2, mm3 // mm2 = (word)[mm2 mm3]
	2313
	2314	// --------
	2315	cvtpi2ps xmm4, mm4
	2316	cvtpi2ps xmm1, mm1
	2317
	2318	movlhps xmm4, xmm1
	2319
	2320	mulps xmm4, xmm0
	2321
	2322	cvtps2pi mm4, xmm4 // convert low 2 floats from xmm4 to mm4
	2323	movhlps xmm1,xmm4 // mov high 2 floats from xmm4 to low 2 floats in xmm1
	2324	cvtps2pi mm1, xmm1 // convert low 2 floats from xmm1 to mm1
	2325
	2326	packssdw mm4, mm1 // mm4 = (word)[mm1 mm4]
	2327
	2328
	2329	// ------
	2330	packuswb mm2, mm4 // mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
	2331	movq [eax], mm2
	2332
	2333	add eax, 8
	2334	dec ecx
	2335	jnz pixel_loop
	2336
	2337	jmp done
	2338
	2339	done:
	2340	emms
	2341
	2342	}
	2343	}
	2344
	2345	inline void multiply(const uchar* imSrc, uchar* imDest, float fact, int imageSize)
	2346	{
	2347	__asm {
	2348
	2349	mov ecx, imageSize
	2350	shr ecx, 3
	2351
	2352	// make 4 copies of the constant 'fact' in xmm0
	2353	movss xmm0, fact
	2354	movss xmm1, xmm0
	2355	unpcklps xmm0, xmm1
	2356	movlhps xmm0, xmm0
	2357
	2358
	2359	mov eax, imSrc
	2360	mov ebx, imDest
	2361	pxor mm7,mm7 // mm7 = 0
	2362
	2363	pixel_loop:
	2364	movq mm1, [eax]
	2365	movq mm2, mm1
	2366
	2367	punpcklbw mm2, mm0
	2368	punpckhbw mm1, mm0
	2369
	2370	movq mm3,mm2
	2371	punpckhwd mm3, mm0
	2372	punpcklwd mm2, mm0
	2373
	2374	movq mm4,mm1
	2375	punpcklwd mm4, mm0
	2376	punpckhwd mm1, mm0
	2377
	2378	// here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
	2379	// --------
	2380	cvtpi2ps xmm3, mm3 // put mm3 in low part of xmm3
	2381	cvtpi2ps xmm2, mm2 // put mm2 in low part of xmm2
	2382
	2383	movlhps xmm2, xmm3 // xmm2 = [xmm3(low part) xmm2(low part)]
	2384
	2385	mulps xmm2, xmm0
	2386
	2387	cvtps2pi mm2, xmm2 // convert low 2 floats from xmm2 to mm2
	2388	movhlps xmm3,xmm2 // mov high 2 floats from xmm2 to low 2 floats in xmm3
	2389	cvtps2pi mm3, xmm3 // convert low 2 floats from xmm3 to mm3
	2390
	2391	packssdw mm2, mm3 // mm2 = (word)[mm2 mm3]
	2392
	2393	// --------
	2394	cvtpi2ps xmm4, mm4
	2395	cvtpi2ps xmm1, mm1
	2396
	2397	movlhps xmm4, xmm1
	2398
	2399	mulps xmm4, xmm0
	2400
	2401	cvtps2pi mm4, xmm4 // convert low 2 floats from xmm4 to mm4
	2402	movhlps xmm1,xmm4 // mov high 2 floats from xmm4 to low 2 floats in xmm1
	2403	cvtps2pi mm1, xmm1 // convert low 2 floats from xmm1 to mm1
	2404
	2405	packssdw mm4, mm1 // mm4 = (word)[mm1 mm4]
	2406
	2407
	2408	// ------
	2409	packuswb mm2, mm4 // mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
	2410	movq [ebx], mm2
	2411
	2412	add eax, 8
	2413	add ebx, 8
	2414	dec ecx
	2415	jnz pixel_loop
	2416
	2417	jmp done
	2418
	2419	done:
	2420	emms
	2421
	2422	}
	2423	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format