/*************************************************************************** 
*
* Copyright 2000 by David Demirdjian.   All rights reserved. 
*  
* Developed  by David Demirdjian
*  
* Permission to use, copy, or modify this software and  its documentation 
* for  educational  and  research purposes only and without fee  is hereby 
* granted, provided  that this copyright notice and the original authors's 
* names appear  on all copies and supporting documentation.  If individual 
* files are  separated from  this  distribution directory  structure, this 
* copyright notice must be included.  For any other uses of this software, 
* in original or  modified form, including but not limited to distribution 
* in whole or in  part, specific  prior permission  must be  obtained from 
* MIT.  These programs shall not  be  used, rewritten, or  adapted as  the 
* basis  of  a  commercial  software  or  hardware product  without  first 
* obtaining appropriate  licenses from David Demirdjian.  The author makes 
* no representations about the suitability of this software for any purpose.  
* It is provided "as is" without express or implied warranty. 
*  
**************************************************************************/
#include "stereoMatching.h"
#include "processingMMX.h"

// ************************************************************
// ************************************************************
// *** List of functions (MMX) for image processing
// ************************************************************
// ************************************************************
	
// shrink images by a factor 'fact'. eg if fact = 2, out will be twice as small as src
inline void shrinkImages(uchar* dst, const uchar* src, int width, int height, int fact)
{
	int width_f = width/fact;
	int siz = width*height/(fact*fact);
	if (fact>0) {
		for (int i=0,j=0; i<siz; ++i,++j,++dst,src+=fact) {
			*dst = *src;
			if (j==width_f-1) {
				src+=((fact-1)*width);			
				j=0;
			}
		}
	}
}

// translate image of 'tx' pixels to the right
// (or left if tx<0)
void translateImage(int tx, uchar* data, int siz)
{
	if (tx==0) return; 
	if (tx>0) {
		// dest. must be after src... to avoid overwriting data
		data += (siz-tx);
		uchar* dataDst = data+tx;
		for (int i=0; i<siz-tx; ++i,--dataDst,--data) {
			*dataDst = *data;
		}
	} else { // tx<0
		data -= tx;
		uchar* dataDst = data+tx;
		for (int i=0; i<siz-tx; ++i,++dataDst,++data) {
			*dataDst = *data;
		}		
	}
}

void normalizeImages(uchar* data1, uchar* data2, uchar* data3, int siz)
{
	float a1 = pixelMean(data1,siz);
	float a2 = pixelMean(data2,siz);
	float a3 = pixelMean(data3,siz);

	float minI = __min(a1, __min(a2,a3));

	if (a2==minI) {
		multiply(data1, a2/a1, siz);
		multiply(data3, a2/a3, siz);
	} else if (a1==minI) {
		multiply(data2, a1/a2, siz);
		multiply(data3, a1/a3, siz);
	} else {
		multiply(data2, a3/a2, siz);
		multiply(data1, a3/a1, siz);
	}
}

void normalizeImages(const uchar* data1, const uchar* data2, const uchar* data3, 
					 uchar* out1, uchar* out2, uchar* out3, int siz)
{
	float a1 = pixelMean(data1,siz);
	float a2 = pixelMean(data2,siz);
	float a3 = pixelMean(data3,siz);

	float minI = __min(a1, __min(a2,a3));

	if (a2==minI) {
		multiply(data1, out1, a2/a1, siz);
		multiply(data3, out3, a2/a3, siz);
		copyMMX(out2, data2, siz);
	} else if (a1==minI) {
		multiply(data2, out2, a1/a2, siz);
		multiply(data3, out3, a1/a3, siz);
		copyMMX(out1, data1, siz);
	} else {
		multiply(data2, out2, a3/a2, siz);
		multiply(data1, out1, a3/a1, siz);
		copyMMX(out3, data3, siz);
	}
}

void normalizeImages(uchar* data1, uchar* data2, int siz)
{
	float a1 = pixelMean(data1,siz);
	float a2 = pixelMean(data2,siz);

	// normalize the image which average intensity is the highest
	if (a1>a2) 
		multiply(data1, a2/a1, siz);
	else 
		multiply(data2, a1/a2, siz);
}

void normalizeImages(const uchar* data1, const uchar* data2, 
					 uchar* out1, uchar* out2, int siz)
{
	float a1 = pixelMean(data1,siz);
	float a2 = pixelMean(data2,siz);

	// normalize the image which average intensity is the highest
	if (a1>a2) {
		multiply(data1, out1, a2/a1, siz);
		copyMMX(out2, data2, siz);
	} else {
		multiply(data2, out2, a1/a2, siz);
		copyMMX(out1, data1, siz);
	}
}

//  ImgSub2: D = saturation0(|S1 - S2| + |S1 - S3|)
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2, 
				 const unsigned char *Src3, unsigned char *Dest, int l)
{

	if (l < 8) return 0;              // image size must be at least 8 bytes 

  __asm 
  {		
        mov eax, Src1     
        mov ebx, Src2
		mov edx, Src3
        mov edi, Dest    
        mov	ecx, l   
        shr	ecx, 3	
	
align 16
inner_loop:
		movq	mm1,[eax]	// mm1=src1

		movq	mm2,[ebx]	// mm2=src2

		movq	mm4,mm1		// mm4=mm1

		psubusb	mm4,mm2		// mm4 = src1 - src2

		movq	mm3,[edx]	// mm3=src3
		psubusb	mm2,mm1		// mm2 = src2 - src1
        
		movq	mm5,mm1		// mm5=src1
		por		mm2,mm4		// mm2=|src1-src2|

        psubusb	mm5,mm3		// mm4=src1-src3

        psubusb	mm3,mm1	 	// mm3=src3-src1

		por		mm3,mm5		// mm3=|src1-src3|

		paddusb mm2,mm3		// mm2 = |src1-src2|+|src1-src3|

        movq    [edi], mm2	 
        add eax,8         
        add ebx,8     
        add edx,8     
        add edi,8		
        dec ecx      
        jnz inner_loop    
        emms   		
  }
	
  return 1;
}

//  ImgSub2: D = saturation0(|S1 - S2|)
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2, 
				 const unsigned char *Dest, int l)
{

	if (l < 8) return 0;              // image size must be at least 8 bytes 

  __asm 
  {		
        mov eax, Src1     
        mov ebx, Src2
        mov edi, Dest    
        mov	ecx, l   
        shr	ecx, 3	
	
align 16
inner_loop:
		movq	mm1,[eax]	// mm1=src1
		movq	mm2,[ebx]	// mm2=src2

		movq	mm4,mm1		// mm4=mm1
		psubusb	mm4,mm2		// mm4 = src1 - src2

		psubusb	mm2,mm1		// mm2 = src2 - src1  
		por		mm2,mm4		// mm2=|src1-src2|

        movq    [edi], mm2	 
        add eax,8         
        add ebx,8     
        add edi,8		
        dec ecx      
        jnz inner_loop    
        emms   		
  }
	
  return 1;
}






#define _ABS_DIFF_TRI(Z) __asm \
{ \
	__asm	movq	mm4,mm1		/* mm4=mm1 */ \
	__asm	add ebx, width  \
	__asm	add edi, imageSize \
	__asm	por		mm3,mm7		/* here mm2=new src2		mm3=new src3 */ \
\
	__asm	movq	mm7, mm0 \
	__asm	psubusb	mm4,mm2		/* mm4 = src1 - src2 */ \
\
	__asm	psubusb	mm2,mm1		/* mm2 = src2 - src1 */ \
	__asm	psllq	mm7,Z	\
\
	__asm	movq	mm5,mm1		/* mm5=src1 */ \
	__asm	por		mm4,mm2		/* mm2=|src1-src2| */ \
\
	__asm	movq	mm2,[ebx]	/* mm2= src2 + 'width' = new src2*/ \
	__asm	psubusb	mm5,mm3		/* mm5=src1-src3*/ \
\
	__asm	movq	mm6,mm3		/* mm6=src3*/ \
    __asm   psubusb	mm6,mm1	 	/* mm3=src3-src1*/ \
\
	__asm	por		mm6,mm5		/* mm6=|src1-src3|*/ \
	__asm	paddusb mm4,mm6		/* mm4 = |src1-src2|+|src1-src3|*/ \
\
	__asm	movq    [edi], mm4	 /* here mm1=src1*/	 \
	__asm	psrlq	mm3, 8		/* mm3 = src3 + '1' ... with [x00000000] at the end*/\
}


#define _ABS_DIFF_TRI_prefetch(Z, X) __asm \
{ \
	__asm	movq	mm4,mm1		/* mm4=mm1 */ \
	__asm	add ebx, width  \
	__asm	add edi, imageSize \
	__asm	por		mm3,mm7		/* here mm2=new src2		mm3=new src3 */ \
\
	__asm	movq	mm7, mm0 \
	__asm	psubusb	mm4,mm2		/* mm4 = src1 - src2 */ \
\
	__asm	psubusb	mm2,mm1		/* mm2 = src2 - src1 */ \
	__asm   prefetcht0 [ebx + X] \
	__asm	psllq	mm7,Z	\
\
	__asm	movq	mm5,mm1		/* mm5=src1 */ \
	__asm	por		mm4,mm2		/* mm2=|src1-src2| */ \
\
\
	__asm	movq	mm2,[ebx]	/* mm2= src2 + 'width' = new src2*/ \
	__asm	psubusb	mm5,mm3		/* mm5=src1-src3*/ \
\
	__asm	movq	mm6,mm3		/* mm6=src3*/ \
    __asm   psubusb	mm6,mm1	 	/* mm3=src3-src1*/ \
\
	__asm	por		mm6,mm5		/* mm6=|src1-src3|*/ \
	__asm	paddusb mm4,mm6		/* mm4 = |src1-src2|+|src1-src3|*/ \
\
	__asm	movq    [edi], mm4	 /* here mm1=src1*/	 \
	__asm	psrlq	mm3, 8		/* mm3 = src3 + '1' ... with [x00000000] at the end*/\
}

//  ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
// process 8 disparities at a time
//
//			Src1: right	
//			Src2: top	
//			Src3: left
//
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2, 
				 const unsigned char *Src3, 
				 unsigned char* Dest1, int l, int imageSize, int width)
{	
	if (l < 8) return 0;              // image size must be at least 8 bytes 
	const int back_step1 = 7*width;
	const int back_step2 = 7*imageSize;
  __asm 
  {		
        mov eax, Src1     
        mov ebx, Src2
		mov edx, Src3
        mov edi, Dest1   

        mov	ecx, l   
        shr	ecx, 3	
	
		movq	mm0,[edx]	// mm0=src3
		movq	mm0,[edx]	// mm0=src3
align 16
inner_loop:
		movq	mm1,[eax]	// mm1=src1
		movq	mm3,mm0		// mm3=src3

		movq	mm2,[ebx]	// mm2=src2
        add eax,8         
 
		// -- 1 --------- in : mm1,mm2,mm3     out: mm4=SAD  mm2=new mm2 --
		movq	mm4,mm1		// mm4=mm1

		add		ebx,width

		psubusb	mm4,mm2		// mm4 = src1 - src2
		//prefetcht0 [ebx + 32 + 2*320]

		movq	mm0,[edx+8]
		psubusb	mm2,mm1		// mm2 = src2 - src1
        
		movq	mm5,mm1		// mm5=src1
		por		mm4,mm2		// mm2=|src1-src2|

		movq	mm2,[ebx]	// mm2= src2 + 'width' = new src2
		psubusb	mm5,mm3		// mm5=src1-src3

		movq	mm6,mm3		// mm6=src3
        psubusb	mm6,mm1	 	// mm3=src3-src1

		movq	mm7, mm0
		psrlq	mm3, 8		// mm3 = src3 + '1' ... with [x00000000] at the end

		por		mm6,mm5		// mm6=|src1-src3|
		paddusb mm4,mm6		// mm4 = |src1-src2|+|src1-src3|

        movq    [edi], mm4	 
		psllq	mm7, 56		// here mm1=src1	mm2=NEW src2	mm3=begin of NEWsrc3 	  mm7=end of NEWsrc3
		// -------------------------------------------------------------
		

		// - 2 ----------------
		_ABS_DIFF_TRI(48)
	
 		// - 3 ----------------
		_ABS_DIFF_TRI(40)

 		// - 4 ----------------
		_ABS_DIFF_TRI(32)
//		_ABS_DIFF_TRI_prefetch(32,24 + 3*320)

		// - 5 ----------------
		_ABS_DIFF_TRI(24)
		
		// - 6 ----------------
		_ABS_DIFF_TRI(16)
		
		// - 7 ----------------
		_ABS_DIFF_TRI(8)
	

		// - 8 ----------------
		movq	mm4,mm1		// mm4=mm1
		por		mm3,mm7		// here mm2=new src2		mm3=new src3

		psubusb	mm4,mm2		// mm4 = src1 - src2
		psubusb	mm2,mm1		// mm2 = src2 - src1
        
		movq	mm5,mm1		// mm5=src1
		por		mm4,mm2		// mm2=|src1-src2|

		psubusb	mm5,mm3		// mm5=src1-src3
        psubusb	mm3,mm1	 	// mm3=src3-src1

		por		mm3,mm5		// mm6=|src1-src3|
		paddusb mm4,mm3		// mm4 = |src1-src2|+|src1-src3|
		
		add edi, imageSize

        movq    [edi], mm4	 // here mm1=src1	
		// -------------------------------------------------------------
 		// 
		sub ebx, back_step1
        add ebx,8
        add edx,8     
		sub edi, back_step2
        add edi,8		
        dec ecx      
        jnz inner_loop    
        emms   		
  }
	
  return 1;
}


// macro: in: mm1,mm2
#define _ABS_DIFF_ __asm \
{ \
	__asm  movq	mm4,mm1 	/* mm4=mm1 */ \
	__asm  psubusb mm4,mm2 	/* mm4 = src1 - src2 */ \
	__asm  psubusb mm2,mm1 	/* mm2 = src2 - src1 */ \
	__asm  por 	mm4,mm2 	/* mm2=|src1-src2| */ \
	__asm  add ebx, width \
	__asm  add edi, imageSize \
    __asm  movq mm2,[ebx] \
	__asm  movq [edi], mm4 	 /* here mm1=src1	 */ \
}

//  ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
// process 8 disparities at a time
//			Src1: right	
//			Src2: top	
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2, 
						unsigned char* Dest1, int l, int imageSize, int width)
{

	if (l < 8) return 0;              // image size must be at least 8 bytes 
	const int back_step1 = 7*width;
	const int back_step2 = 7*imageSize;
  __asm 
  {		
        mov eax, Src1     
        mov ebx, Src2
        mov edi, Dest1   

        mov	ecx, l   
        shr	ecx, 3	

align 16
inner_loop:

		movq	mm1,[eax]	// mm1=src1
		movq	mm2,[ebx]	// mm2=src2
        add eax,8         
 
		// -- 1 --------- in : mm1,mm2,mm3     out: mm4=SAD  mm2=new mm2 --
		_ABS_DIFF_
		_ABS_DIFF_
		_ABS_DIFF_
		_ABS_DIFF_
		_ABS_DIFF_
		_ABS_DIFF_
		_ABS_DIFF_

		// - 8 ----------------
		movq	mm4,mm1		// mm4=mm1

		psubusb	mm4,mm2		// mm4 = src1 - src2
		psubusb	mm2,mm1		// mm2 = src2 - src1
        
		por		mm4,mm2		// mm2=|src1-src2|
		add edi, imageSize

        movq    [edi], mm4	 // here mm1=src1	
		// -------------------------------------------------------------
 		// 
		sub ebx, back_step1
        add ebx,8
 		sub edi, back_step2
        add edi,8		
        dec ecx      
        jnz inner_loop    
        emms   		
  }
	
  return 1;
}

// macro: in: mm1,mm2
#define _ABS_DIFF_HORIZ(Z) __asm \
{ \
	__asm  movq	mm7, mm0 \
	__asm  add edi, imageSize \
	__asm  movq	mm5,mm1		/* mm5=src1 */ \
	__asm  psllq	mm7, Z \
	__asm  psubusb	mm5,mm3		/* mm5=src1-src3 */ \
	__asm  movq	mm6,mm3		/* mm6=src3 */ \
    __asm  psubusb	mm6,mm1	 	/* mm3=src3-src1 */ \
	__asm  por		mm6,mm5		/* mm6=|src1-src3| */ \
    __asm  movq    [edi], mm6	 /* here mm1=src1 */ \
	__asm  psrlq	mm3, 8		/* mm3 = src3 + '1' ... with [x00000000] at the end */ \
	__asm  por		mm3,mm7		/* here mm3=new src3 */ \
}	

//  ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
// process 8 disparities at a time
//
//			Src1: right	
//			Src2: top	
//			Src3: left
//
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm, 
						 unsigned char* Dest, int l, int imageSize, int width)
{

	if (l < 8) return 0;              // image size must be at least 8 bytes 
	const int back_step2 = 7*imageSize;
  __asm 
  {		
        mov eax, rightIm     
 		mov edx, leftIm
        mov edi, Dest 

        mov	ecx, l   
        shr	ecx, 3	
	
		movq	mm0,[edx]	// mm0=src3
		movq	mm0,[edx]	// mm0=src3
align 16
inner_loop:

		movq	mm1,[eax]	// mm1=src1
		movq	mm3,mm0		// mm3=src3

		// -- 1 --------- in : mm1,mm2,mm3     out: mm4=SAD  mm2=new mm2 --
		movq	mm0,[edx+8]
        add eax,8         
   
		movq	mm5,mm1		// mm5=src1
		psubusb	mm5,mm3		// mm5=src1-src3

		movq	mm6,mm3		// mm6=src3
        psubusb	mm6,mm1	 	// mm3=src3-src1

		movq	mm7, mm0
		psrlq	mm3, 8		// mm3 = src3 + '1' ... with [x00000000] at the end

		por		mm6,mm5		// mm6=|src1-src3|

        movq    [edi], mm6	 
		psllq	mm7, 56		// here mm1=src1	mm3=begin of NEWsrc3 	  mm7=end of NEWsrc3
		por		mm3,mm7		// here mm3=new src3

		// - 2 ----------------
		_ABS_DIFF_HORIZ(48)
		_ABS_DIFF_HORIZ(40)
		_ABS_DIFF_HORIZ(32)
		_ABS_DIFF_HORIZ(24)
		_ABS_DIFF_HORIZ(16)
		_ABS_DIFF_HORIZ(8)

		// - 8 ----------------
		movq	mm5,mm1		// mm5=src1
		add edi, imageSize

		psubusb	mm5,mm3		// mm5=src1-src3
        psubusb	mm3,mm1	 	// mm3=src3-src1

		por		mm3,mm5		// mm6=|src1-src3|		
        movq    [edi], mm3	 
		// -------------------------------------------------------------
 		// 
        add edx,8     
		sub edi, back_step2
        add edi,8		
        dec ecx      
        jnz inner_loop    
        emms   		
  }
	
  return 1;
}


// ----------------------
// FULL IMAGE, BEST ONLY : Keith's code
inline int findMinimumCorrelation_mmx(
	const unsigned char *CurrentCorrelation,
	unsigned char CurrentDisparity,
	unsigned char *Disparity,
	unsigned char *BestCorrelation, int bytecount) 
{	
	if ((bytecount < 8) || ((bytecount % 8) != 0)) {
		return 0;
	}
	
	__asm {
		// load ecx with the pixelblock count = bytecount / 8
		mov			ecx,	bytecount
		shr			ecx,	3
		
		// setup mm0 with 8 copies of the disparity constant
		mov			al, 	CurrentDisparity	
		mov			ah, 	al
		mov			bx, 	ax
		shl			eax, 	16
		mov			ax, 	bx
		movd		mm0, 	eax
		movd		mm1, 	eax
		punpckldq	mm0, 	mm1

		// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
		mov			eax,	0x80808080
		movd		mm1,	eax
		movd		mm2,	eax
		punpckldq	mm1,	mm2
						
		
		// setup the image pointers		
		mov		eax, 	BestCorrelation
		mov		esi, 	CurrentCorrelation
		mov		edi, 	Disparity
		
	pixel_loop:
			movq		mm2,	[esi]	// current correlation
			movq		mm3,	[eax]	// best correlation

			// check for updates			
			movq		mm5,	mm2		// copy the current correlation
			pxor		mm5,	mm1		// convert from unsigned range to signed range

			movq		mm6,	mm3		// copy the best correlation
			pxor		mm6,	mm1		// convert from unsigned range to signed range

			pcmpgtb		mm5,	mm6		// mm5 := (current signed> best) mask
										//		1 indicates current >  best, so keep best
										//		0 indicates	current <= best, so use new value

			// BYPASS
			// this phase adds 8 additional instructions, but could skip 2 writes and 1 read						
			// abort remainder if not updating best correlation
			pcmpeqb		mm6,	mm6		// mm6 = 0xFFFFFFFF			
			pxor		mm6, 	mm5		// mm6 = mm5 xor 0xFFFFFFFF = not mm5
								//		0 indicates current >  best, so keep best
								//		1 indicates current <= best, so use new value
						
			packsswb	mm6,	mm6		// pack it into the lower dword of mm6 (unsigned saturation)
								// 		11111111 11111111 => 11111111	some replaced
								//		11111111 00000000 => 11111111	some replaced
								//		00000000 11111111 => 11111111	some replaced
								//		00000000 00000000 => 00000000	no replacements
			
			// we don't need to backup ebx because its not used in this routine
			// movd		mm7,	ebx		// make a backup of eax
			movd		ebx,	mm6		// get the saturated mask			
			test		ebx,	ebx		// test ebx => yields 0 iff no substitutions will occur
			// movd		ebx,	mm7		// restore ebx			
			jz		bypass			// store mm4 (second correlation) to [ebx]		


			// Update best Correlation
			movq		mm6,	mm5		// mm6 := mask
			movq		mm7,	mm5		// mm7 := mask

			pand		mm6,	mm3		// best correlation values to keep
			pandn		mm7,	mm2		// current correlation value to move to best correlation

			por			mm6,	mm7		// merge values
			movq		[eax],	mm6		// store values

			// update disparity
			movq		mm2,	[edi]	// get disparity map
			movq		mm6,	mm5		// mm6 := mask

			pand		mm5,	mm2		// select disparity map values to keep
			pandn		mm6,	mm0		// select current disparity values to move to disparity map
			
			por			mm5,	mm6		// merge values
			movq		[edi],	mm5		// store values
			
		bypass:
			add		eax,	8
			add		esi,	8
			add		edi,	8

			dec		ecx			
		jnz pixel_loop

	
		emms;
	}
	
	return 1;
}

/*int initMinimumCorrelation(
	const unsigned char *CurrentCorrelation,
	unsigned char disparityInit,
	unsigned char *Disparity,
	unsigned char *BestCorrelation,
	unsigned char *SecondCorrelation,
	int bytecount) 
{
	for (int i=0; i<bytecount; ++i)
	{
		BestCorrelation[i]=255;
		SecondCorrelation[i]=255;
		Disparity[i]=0;
	}
	return 0;
}*/

inline int initMinimumCorrelation(
	const unsigned char *CurrentCorrelation,
	unsigned char disparityInit,
	unsigned char *Disparity,
	unsigned char *BestCorrelation,
	unsigned char *SecondCorrelation,
	int bytecount) 
{	
	if ((bytecount < 8) || ((bytecount % 8) != 0)) {
		return 0;
	}
	
	__asm {
				
		// setup mm0 with 8 copies of the disparity constant
		mov			al, 	disparityInit	
		mov			ah, 	al
		mov			bx, 	ax
		shl			eax, 	16
		mov			ax, 	bx
		movd		mm0, 	eax
		movd		mm1, 	eax
		punpckldq	mm0, 	mm1

		// load ecx with the pixelblock count = bytecount / 8
		mov			ecx,	bytecount
		shr			ecx,	3
		
		mov		eax, 	BestCorrelation
		mov		ebx, 	SecondCorrelation
		mov		esi, 	CurrentCorrelation
		mov		edx,	Disparity

	pixel_loop:
		movq		mm1, [esi]
		movq		[eax], mm1		// Best = Current
		movq		[ebx], mm1		// Second = Current
		movq		[edx], mm0		// Disparity = disparityInit

		add		eax, 8
		add		ebx, 8
		add		edx, 8
		add		esi, 8
		dec		ecx			
		jnz pixel_loop

		jmp done


	done:
		emms;
	}
}

inline int findMinimumCorrelation(
			const unsigned char *CurrentCorrelation,
			unsigned char CurrentDisparity,
			unsigned char *Disparity,
			unsigned char *BestCorrelation,
			unsigned char *SecondCorrelation,
			int bytecount) 
{
	for (int i=0; i<bytecount; ++i,++CurrentCorrelation,++Disparity,++BestCorrelation, ++SecondCorrelation) 
	{
		if (*CurrentCorrelation<*BestCorrelation) {
			*Disparity = CurrentDisparity;
			*SecondCorrelation = *BestCorrelation;
			*BestCorrelation = *CurrentCorrelation;
		}
	}
	return 1;
}

// ----------------------
// FULL IMAGE, BEST+SECOND .. Keith's code
inline int findMinimumCorrelation_mmx(
			const unsigned char *CurrentCorrelation,
			unsigned char CurrentDisparity,
			unsigned char *Disparity,
			unsigned char *BestCorrelation,
			unsigned char *SecondCorrelation,
			int bytecount) 
{	
	if ((bytecount < 8) || ((bytecount % 8) != 0)) {
		return 0;
	}
	
	__asm {
		// load ecx with the pixelblock count = bytecount / 8
		mov			ecx,	bytecount
		shr			ecx,	3
		
		// setup mm0 with 8 copies of the disparity constant
		mov			al, 	CurrentDisparity	
		mov			ah, 	al
		mov			bx, 	ax
		shl			eax, 	16
		mov			ax, 	bx
		movd		mm0, 	eax
		movd		mm1, 	eax
		punpckldq	mm0, 	mm1

		// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
		mov			eax,	0x80808080
		movd		mm1,	eax
		movd		mm2,	eax
		punpckldq	mm1,	mm2
						
		
		// setup the image pointers		
		mov		eax, 	BestCorrelation
		mov		ebx, 	SecondCorrelation
		mov		esi, 	CurrentCorrelation
		mov		edi, 	Disparity
		
	pixel_loop:
			movq		mm2,	[esi]	// current correlation
			movq		mm4,	[ebx]	// second correlation

			// convert the current correlation from unsigned range to signed range
			movq		mm5,	mm2		// copy the current correlation
			pxor		mm5,	mm1		// convert from unsigned range to signed range
			movq		mm7,	mm5		// copy converted to mm7


			// check for second correlation updates
			movq		mm6,	mm4		// copy second best correlation
			pxor		mm6,	mm1		// convert from unsigned range to signed range

			pcmpgtb		mm7,	mm6		// mm7 := (current signed> second best) mask

			// BYPASS 1
			// skip remainder if second correlation is not to be updated
			// this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes
			pcmpeqb		mm6,	mm6		// mm6 = 0xFFFFFFFF			
			pxor		mm6, 	mm7		// mm6 = mm7 xor 0xFFFFFFFF = not mm7
								//		0 indicates current >  second, so keep old value
								//		1 indicates current <= second, so use new value
								
						
			packsswb	mm6,	mm6		// pack it into the lower dword of mm6 (unsigned saturation)
								// 		11111111 11111111 => 11111111	some replaced
								//		11111111 00000000 => 11111111	some replaced
								//		00000000 11111111 => 11111111	some replaced
								//		00000000 00000000 => 00000000	no replacements
			
			// don't need to backup edx because its not used in this routine
			// movd		mm3,	edx		// make a backup of edx
			movd		edx,	mm6		// get the saturated mask			
			test		edx,	edx		// test edx => yields 0 iff no replacements will occur
			// movd		edx,	mm3		// restore edx			
			jz			bypass1


			// direct update second correlation (get values from current)
										// mm7 already has mask
//			movq		mm6,	mm7		// mm6 := mask
//			pand		mm6,	mm4		// second correlation values to keep
//			pandn		mm7,	mm2		// current correlation values to move to second correlation
//			por			mm6,	mm7		// merge value => direct updated second correlation
//			movq		mm4,	mm6		// store values (*** this instruction could be eliminated!) 

			pand		mm4,	mm7		// second correlation values to keep
			pandn		mm7,	mm2		// current correlation values to move to second correlation
			por			mm4,	mm7		// merge value => direct updated second correlation


			// check for best correlation updates			
			movq		mm3,	[eax]	// best correlation
			//		mm5 has converted current correlation
			movq		mm6,	mm3		// copy the best correlation
			pxor		mm6,	mm1		// convert from unsigned range to signed range

			pcmpgtb		mm5,	mm6		// mm5 := (current signed> best) mask
										//		1 indicates current >  best, so keep best
										//		0 indicates	current <= best, so use new value
			// BYPASS 2		
			// this phase adds 8 additional instructions, but could skip 2 writes and 1 read						
			// abort remainder if not updating best correlation
			pcmpeqb		mm6,	mm6		// mm6 = 0xFFFFFFFF			
			pxor		mm6, 	mm5		// mm6 = mm5 xor 0xFFFFFFFF = not mm5
								//		0 indicates current >  best, so keep best
								//		1 indicates current <= best, so use new value
						
			packsswb	mm6,	mm6		// pack it into the lower dword of mm6 (unsigned saturation)
								// 		11111111 11111111 => 11111111	some replaced
								//		11111111 00000000 => 11111111	some replaced
								//		00000000 11111111 => 11111111	some replaced
								//		00000000 00000000 => 00000000	no replacements
			
			// don't need to backup edx because its not used in this routine
			// movd		mm7,	edx		// make a backup of edx
			movd		edx,	mm6		// get the saturated mask			
			test		edx,	edx		// test edx => yields 0 iff no substitutions will occur
			// movd		edx,	mm7		// restore edx			
			jz		bypass2			// store mm4 (second correlation) to [ebx]		


			// indirect update second correlation (pushed down from best)
			movq		mm6,	mm5		// mm6 := mask
			movq		mm7,	mm5		// mm7 := mask

			pand		mm6,	mm4		// second correlation values to keep
			pandn		mm7,	mm3		// best correlations to move to second correlation

			por			mm6,	mm7		// merge values
			movq		[ebx],	mm6		// store values

			// direct Update best Correlation
			movq		mm6,	mm5		// mm6 := mask
			movq		mm7,	mm5		// mm7 := mask

			pand		mm6,	mm3		// best correlation values to keep
			pandn		mm7,	mm2		// current correlation value to move to best correlation

			por			mm6,	mm7		// merge values
			movq		[eax],	mm6		// store values

			// update disparity
			movq		mm2,	[edi]	// get disparity map
			movq		mm6,	mm5		// mm6 := mask

			pand		mm5,	mm2		// select disparity map values to keep
			pandn		mm6,	mm0		// select current disparity values to move to disparity map
			
			por			mm5,	mm6		// merge values
			movq		[edi],	mm5		// store values


		bypass1:
		next_pixel:			
			add		eax,	8
			add		ebx,	8
			add		esi,	8
			add		edi,	8

			dec		ecx			
		jnz pixel_loop

		jmp done

		bypass2:
			movq	[ebx], mm4;
			jmp next_pixel

	done:
		emms;
	}
	
	return 1;
}



inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize) 
{
	im += maskSize/2;
	im_out += maskSize/2;
	for (int i=0; i<rowSize; ++i) {
		int s=0;
		for (int j=-maskSize/2; j<=maskSize/2; ++j) {
			s+=*(im+j);
		}
		*im_out=s/maskSize;
		++im;++im_out;
	}		
}

inline void sum_Row_mmx(uchar* im, unsigned short* im_out, int rowSize, int maskSize) 
{
	sum_Row_5_mmx(im, im_out, rowSize);
	for (int i=0; i<(maskSize-5)/2; ++i)
		sum_Row_5_mmx(im_out, im_out, rowSize);
}

inline  void sum_Row_mmx(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize) 
{
	sum_Row_5_mmx(im, im_out, rowSize);
	for (int i=0; i<(maskSize-5)/2; ++i)
		sum_Row_5_mmx(im_out, im_out, rowSize);
}

#define aim_Sum_Words_In_MM1 __asm \
{						\
	__asm movq mm4, mm1 \
	__asm movq mm2, mm1 \
\
	__asm movq mm3, mm1 \
	__asm psllq mm1, 16 \
\
	__asm psrlq mm2, 16 \
	__asm paddw mm4, mm2 \
\
	__asm paddw mm3, mm1 \
	__asm psrlq mm2, 16 \
\
	__asm psllq mm1, 16 \
	__asm paddw mm4, mm2 \
\
	__asm psrlq mm2, 16 \
	__asm paddw mm3, mm1 \
\
	__asm psllq mm1, 16 \
	__asm paddw mm4, mm2 \
\
	__asm paddw mm3, mm1 \
}





// apply the mask [1 1 1 1 1] to the 1-D array im (bytes)
// output : im_out (words)
inline void sum_Row_5_mmx(uchar* im, unsigned short* im_out, int rowSize) 
{
	// temp: for debugging
	//return sum_Row_5(im,im_out,rowSize);
	__asm {

	mov eax, rowSize
	mov ebx, im
	mov ecx, im_out

	pxor mm6, mm6	// mm6 = x00000000

	//Process the first quad word, but save only the second result" 
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 


	//Process low word 
	movq mm1, [ebx] // Copy... 
	punpcklbw mm1, mm6 // Expand low word bytes into words		// mm1 =[D C B A]

	aim_Sum_Words_In_MM1

	//Store the result Only in the accumulator 
	movq mm7, mm4 // Update accumulator    mm4=[D  C+D   B+C+D   A+B+C+D]

	//Process high word 
	movq mm1, [ebx] // Copy... 
	punpckhbw mm1, mm6 // Expand high word bytes into words   // mm1 =[H G F E]
	add ebx, 8 // Update input pointer 

    aim_Sum_Words_In_MM1

	//Add to the previous data	... 
	// mm3=[E+F+G+H  E+F+G  E+F   E]
	// mm4=[H	G+H    F+G+H   E+F+G+H]
	paddw mm7, mm3 // The current word of the accum   // mm7=[D+E+F+G+H  C+D+E+F+G  B+C+D+E+F  A+B+C+D+E]

	// translate everything to 2 words on the left
	movq	mm1, mm7	// mm1 = [D+E+F+G+H  C+D+E+F+G  B+C+D+E+F  A+B+C+D+E]
	psrlq   mm1, 32		// mm1 = [0  0  D+E+F+G+H  C+D+E+F+G]

	movq	mm0, mm1	// mm0 = [D+E+F+G+H  C+D+E+F+G]

	psllq	mm7, 32		// mm7 = [B+C+D+E+F  A+B+C+D+E  0   0]

	movq [ecx], mm7 // Store the final result 
	add ecx, 8 // Update output pointer 

	movq mm7, mm4 // Update accumulator    mm4=[H	G+H    F+G+H   E+F+G+H]
	sub eax, 8 // Update the number of points left 

	// Start the loop 
	row_sum_loop:
		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		movq mm1, [ebx] // Load data 

		//Process low word 
		punpcklbw mm1, mm6 // Expand low word bytes into words 

		aim_Sum_Words_In_MM1

		//Add to the previous data 
		//prefetcht1 [ecx+16]
		paddw mm7, mm3	// The current word of the accum 

		// translate everything to 2 words on the left
		// mm0 = [0 0 D C]  mm7 = [H G F E] ----> mm7=[0 0 H G]  [ecx]=[F E D C]
		punpckldq	mm0, mm7		// mm0 = [F E D C]

		movq		[ecx], mm0
		sub eax, 8 // Update the number of points left 	

		movq mm0, mm4	// Update accumulator 
		psrlq		mm7, 32			// mm7 = [0 0 H G]

		//Process high word 
		movq mm1, [ebx]		// Copy... 
		punpckhbw mm1, mm6	// Expand high word bytes into words 
		
		aim_Sum_Words_In_MM1

		//Add to the previous data 
		paddw mm0, mm3	// The current word of the accum 

		// translate everything to 2 words on the left
		// mm7 = [0 0 D C]  mm0 = [H G F E] ----> mm0=[0 0 H G]  [ecx+8]=[F E D C]
		punpckldq	mm7, mm0		// mm7 = [F E D C]
		add ebx, 8 // Update input pointer 

		movq		[ecx+8], mm7
		psrlq		mm0, 32			// mm0 = [0 0 H G] 

		movq mm7, mm4	// Update accumulator 
		add ecx, 16 // Update output pointer 
		
		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}



// apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words)
// output : im_out (words)
inline void sum_Row_5_mmx(ushort* im, ushort* im_out, int rowSize) 
{
	// temp: for debugging
	//return sum_Row_5(im,im_out,rowSize);
	__asm {

	mov eax, rowSize
	mov ebx, im
	mov ecx, im_out

	//Process the first quad word, but save only the second result" 
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	movq mm1, [ebx] // Load data (4 words)
	add ebx, 8 // Update input pointer 

	//Process low word 
	aim_Sum_Words_In_MM1

	//Store the result Only in the accumulator 
	movq mm7, mm4 // Update accumulator 

	//Process high word 
	movq mm1, [ebx] // Copy... 

	aim_Sum_Words_In_MM1
	add ebx, 8

	//Add to the previous data 
	paddw mm7, mm3 // The current word of the accum 

	// translate everything to 2 words on the left
	movq	mm1, mm7	// mm1 = [D+E+F+G+H  C+D+E+F+G  B+C+D+E+F  A+B+C+D+E]
	psrlq   mm1, 32		// mm1 = [0  0  D+E+F+G+H  C+D+E+F+G]
	movq	mm0, mm1	// mm0 = [0  0  D+E+F+G+H  C+D+E+F+G]
	psllq	mm7, 32		// mm7 = [B+C+D+E+F  A+B+C+D+E  0   0]

	movq [ecx], mm7 // Store the final result 
	movq mm7, mm4 // Update accumulator 

	add ecx, 8 // Update output pointer 
	sub eax, 8 // Update the number of points left 

	// Start the loop 
	row_sum_loop:
		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		movq mm1, [ebx] // Load data 

		aim_Sum_Words_In_MM1

		//Add to the previous data 
		//prefetcht0	[ecx + 32]
		//prefetcht0	[ebx + 48]
		paddw mm7, mm3 // The current word of the accum 
		psrlw mm7, 2 // divide result by ...

		// translate everything to 2 words on the left
		// mm0 = [0 0 D C]  mm7 = [H G F E] ----> mm7 =[0 0 H G]  [ecx]=[F E D C]
		punpckldq	mm0, mm7  // mm0 = [F E D C]
		
		movq		[ecx], mm0
		sub eax, 8 // Update the number of points left 

		movq	mm0, mm4			// Update accumulator 
		psrlq	mm7, 32		// mm7 =[0 0 H G]
		
		//Process high word 
		movq mm1, [ebx+8] // Copy... 
		
		aim_Sum_Words_In_MM1

		//Add to the previous data 
		paddw mm0, mm3 // The current word of the accum 
		psrlw mm0, 2 // divide result by ...
		
		// translate everything to 2 words on the left
		// mm7 = [0 0 D C]  mm0 = [H G F E] ----> mm0=[0 0 H G]  [ecx+8]=[F E D C]
		punpckldq	mm7, mm0	// mm7 = [F E D C]
		add ebx, 16 // Update input pointer 

		movq		[ecx+8], mm7
		psrlq		mm0, 32		// mm0 = [0 0 H G]

		movq mm7, mm4	// Update accumulator 
		add ecx, 16 // Update output pointer */

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

template<class T> void sum_Row_5(T* im, ushort* im_out, int rowSize)
{
	im += 2;
	im_out +=2;
	int s = 0;
	for (int i=0; i<rowSize-5; ++i, ++im, ++im_out) {
		s = *(im-2);
		s += *(im-1);
		s += *(im);
		s += *(im+1);
		s += *(im+2);
		*im_out = s/5;
	}
}

inline void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
	int offset = width*(sizeMask/2);
	im += offset;
	im_out += offset;
	for (int i=0; i<dataSize-width*sizeMask; ++i, ++im, ++im_out) {
		int s = 0;
		for (int j=-sizeMask/2; j<=sizeMask/2; ++j) s += *(im+j*width);
		*im_out = s/(sizeMask);
	}
}

// apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
// result in 'im_out'
inline void avg_Col_mmx(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
	// temp: for debugging
	//return avg_Col(im,im_out,dataSize,width,sizeMask);

	switch (sizeMask)
	{
	case 5: avg_Col_5(im,im_out,dataSize,width);
		break;
	case 7: avg_Col_7(im,im_out,dataSize,width);
		break;
	case 9: avg_Col_9(im,im_out,dataSize,width);
		break;	
	case 11: avg_Col_11(im,im_out,dataSize,width);
		break;
	case 13: avg_Col_13(im,im_out,dataSize,width);
		break;
	case 15: avg_Col_15(im,im_out,dataSize,width);
		break;
	case 17: avg_Col_17(im,im_out,dataSize,width);
		break;

	default: if (sizeMask<5) avg_Col_5(im,im_out,dataSize,width);
			 else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
		break;

	}
}



#define macro_add __asm \
{						\
	__asm 	paddusw mm3, [edx]	\
	__asm	paddusw mm2, [edx+8]  \
	__asm	add edx, edi		\
}


inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi // ebx = ebx-4*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 3
		psrlw mm2, 3

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi // ebx = ebx-6*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		add ebx, 16
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 3
		psrlw mm2, 3

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi // ebx = ebx-8*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 3
		psrlw mm2, 3

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-10*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 4
		psrlw mm2, 4

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi 
	sub ebx, edi
	sub ebx, edi // ebx = ebx-12*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 4
		psrlw mm2, 4

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi 
	sub ebx, edi
	sub ebx, edi // ebx = ebx-14*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 4
		psrlw mm2, 4

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi 
	sub ebx, edi
	sub ebx, edi // ebx = ebx-16*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		macro_add
		
		// divide results by ...
		psrlw mm3, 4
		psrlw mm2, 4

		// convert [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}


inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi // ebx = ebx-4*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		
		// save [mm2 mm3] as 8 bytes
		packuswb mm3,mm2
		movq [ecx], mm3

		sub eax, 8 // Update the number of points left 
		add ecx, 8 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = 2*width

	mov eax, dataSize
	mov ecx, im_out

	mov ebx, im
	sub ebx, edi
	sub ebx, edi // ebx = ebx-4*width
	
	test eax, eax // Is there anything to do?" 
	jz end_sum_loop // Jump out if necessary 

	row_sum_loop:

		test eax, eax // Is there anything to do? 
		jz end_sum_loop // Jump out if necessary 

		mov edx, ebx
		add ebx, 16

		// 1
		movq mm3, [edx] // mm3 = 4 words of im
		movq mm2, [edx+8] // mm2 = next 4 words of im
		add edx, edi

		macro_add
		macro_add
		macro_add
		macro_add
		
		// save [mm2 mm3] as words
		movq [ecx], mm3
		movq [ecx+8], mm2

		sub eax, 8 // Update the number of points left 
		add ecx, 16 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms 
	}
}

// compare bestScores and secondScores. if second<best+'thresh' the disp.
// is set to 'valForReplacement' (usually 0)
inline void compareBestAndSecond(uchar* bestScores, uchar* secondScores, char thresh, 
						  uchar undefined_val, 
						  uchar* disp, int dataSize)
{
	__asm {
		
	// setup mm0 with 8 copies of 'thresh'
	mov			al, 	thresh	
	mov			ah, 	al
	mov			bx, 	ax
	shl			eax, 	16
	mov			ax, 	bx
	movd		mm0, 	eax
	movd		mm1, 	eax
	punpckldq	mm0, 	mm1

	// setup mm7 with 8 copies of 'valForReplacement'
	mov			al, 	undefined_val	
	mov			ah, 	al
	mov			bx, 	ax
	shl			eax, 	16
	mov			ax, 	bx
	movd		mm7, 	eax
	movd		mm1, 	eax
	punpckldq	mm7, 	mm1

	mov eax, dataSize
	mov ebx, bestScores
	mov ecx, secondScores
	mov edx, disp

	test eax, eax // Is there anything to do?" 
	jz end_loop // Jump out if necessary 

	comp_loop:

		test eax, eax // Is there anything to do? 
		jz end_loop // Jump out if necessary 

		movq	mm2, [ecx]
		psubusb mm2, [ebx] // mm2 = secondScores - bestScores

		movq	mm3, [edx]	// mm3 = disp
		pcmpgtb mm2, mm0	// mm2 = 1 if mm2>thresh
							//		 0 otherwise
			
		pand	mm3, mm2
		pandn	mm2, mm7

		por		mm3, mm2
		movq	[edx], mm3

		sub eax, 8 // Update the number of points left 
		add ebx, 8 // Update output pointer 
		add ecx, 8 
		add edx, 8

		jmp comp_loop // Loop 

		//Cleanup 
	end_loop:
	emms 
	}
}

// windowWidth must be multiple of 8
inline void cropImage(const uchar* imSrc, int width, int height, 
			   uchar* imDest, int x0, int y0, int windowWidth, int windowHeight)
{
	int w8 = windowWidth/8;

	int step = width-windowWidth;
	const uchar* srcNewOrigin = imSrc+x0+y0*width;

	__asm {

		mov		ecx,	windowHeight

		mov		edx,	w8
		mov		eax, 	srcNewOrigin
		mov		ebx, 	imDest

	pixel_loop:

		movq	mm1, [eax]
		movq	[ebx], mm1
		add		eax, 8
		add		ebx, 8
	
		dec		edx
		jnz	pixel_loop

		mov		edx, w8
		add		eax, step

		dec		ecx		
		jnz pixel_loop

		jmp done

	done:
		emms;
	}
}

// return the average pixel value
inline float pixelMean(const uchar* im, int imageSize)
{
	int sum;

	__asm {

		mov		ecx, imageSize
		shr		ecx, 3

		mov		eax, im
		pxor	mm7,mm7		// mm7 used as accumulator
		pxor	mm0,mm0		// mm0 = 0

	pixel_loop:
			
			movq	mm1, [eax]
			movq	mm2,mm1

			punpcklbw	mm2, mm0
			punpckhbw	mm1, mm0

			paddw	mm2,mm1
			
			movq	mm1,mm2
			punpcklwd	mm2, mm0
			punpckhwd	mm1, mm0

			paddd	mm2,mm1
			paddd	mm7,mm2

			add eax, 8
			dec	ecx
			jnz	pixel_loop

			jmp done

	done:
			movd	ebx, mm7
			psrlq	mm7, 32
			movd	edx, mm7
			add	ebx, edx
			mov sum, ebx

			emms
	}

	return sum / (float)imageSize;
}




// -------------------------------------------------------------
// apply mask:
// if mask[]=undefined_val	im[]->im[]
// otherwise,				im[]->mask[]
// ....... this one may not be exact :-(
inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
{
	__asm {
		// setup mm0 with 8 copies of 'undefined_val'
		mov			al, 	undefined_val	
		mov			ah, 	al
		mov			bx, 	ax
		shl			eax, 	16
		mov			ax, 	bx
		movd		mm0, 	eax
		movd		mm1, 	eax
		punpckldq	mm0, 	mm1
		
		mov		ecx, imageSize
		shr		ecx, 3

		mov		eax, im
		mov		ebx, mask

		pixel_loop:
			movq	mm1, [eax]
			movq	mm2, [ebx]

			movq	mm3, mm2
			pcmpeqb	mm3, mm0	// mm3[]	-> xFF if mm2[]==undefined_val
								//			-> x00 otherwise
			pand	mm3, mm1	// mm3[] = mm1[] if mm2[]==undefined_val
								//		 = x00 	 otherwise
			por		mm3, mm2
			movq	[eax], mm3

			add	eax, 8
			add ebx, 8
			dec ecx
			jnz pixel_loop

			jmp done

		done:
			emms
	}
}

inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
{
	for (int i=0; i<imageSize; ++i, ++im,++mask)
	{
		if (*mask != undefined_val) *im=*mask;
	}

}


inline void divide( ushort* im,  uchar* div, uchar* result, int imageSize)
{
	for (int i=0; i<imageSize; ++i,++im,++div,++result)
	{
		*result = (*div)?(uchar)(*im / *div):0;
	}
}

// 5x5 sum filters
inline void sum_5x5_mmx( uchar* im, ushort* im_out, int dataSize, int width, ushort* buff)
{
	sum_Row_5_mmx(im, buff, dataSize);
	add_Col_5_ww(buff+2*width, im_out+2*width, dataSize-4*width , width);
}

inline void sum_5x5_mmx( uchar* im, uchar* im_out, int dataSize, int width, ushort* buff)
{	
	sum_Row_5_mmx(im, buff, dataSize);
	add_Col_5_wb(buff+2*width, im_out+2*width, dataSize-4*width , width);
}


inline void binarize(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
{
	for (int i=0; i<dataSize; ++i,++im,++im_out)
	{
		*im_out = (*im != undefined_val);
	}
}

inline void set_undefined_to_zero(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
{
	for (int i=0; i<dataSize; ++i,++im,++im_out)
	{
		if (*im == undefined_val) *im_out=0;
	}
}

inline void set_zero_to_undefined(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
{
	for (int i=0; i<dataSize; ++i,++im,++im_out)
	{
		if (*im == 0) *im_out=undefined_val;
	}
}



inline void copyMMX(void* imDest, const void* imSrc, int dataSize)
{
	__asm {

		mov		ecx, dataSize
		shr		ecx, 3

		mov		eax, imSrc
		mov		ebx, imDest
		sub		ebx, 8

		pixel_loop:			
			movq	mm1, [eax]
			add ebx, 8
			
			movq	[ebx], mm1
			add eax, 8

			dec	ecx
			jnz	pixel_loop

			jmp done

		done:
			emms
	}
}

inline void copySSE(void* imDest, const void* imSrc, int dataSize)
{
	__asm {

		mov		ecx, dataSize
		shr		ecx, 4

		mov		eax, imSrc
		mov		ebx, imDest
		sub		ebx, 16

		pixel_loop:			
			movdqa	xmm1, [eax]
			add ebx, 16
			
			movdqa	[ebx], xmm1
			add eax, 16

			dec	ecx
			jnz	pixel_loop

			jmp done

		done:
			emms
	}
}

inline void setMMX(float* imDest, const float value, int dataSize)
{
	__asm {
		// make 4 copies of the constant 'value' in xmm0
		movss	xmm0, value
		movss	xmm1, xmm0
		unpcklps	xmm0, xmm1        
		movlhps		xmm0, xmm0

		mov		ecx, dataSize
		shr		ecx, 2

		mov		ebx, imDest

		pixel_loop:			
			movaps	[ebx], xmm0
			add ebx, 16

			dec	ecx
			jnz	pixel_loop

			jmp done

		done:
			emms
	}
}

inline void setMMX(char* imDest, const char value, int dataSize)
{
	__asm {
		// setup mm0 with 8 copies of 'value'
		mov			al, 	value	
		mov			ah, 	al
		mov			bx, 	ax
		shl			eax, 	16
		mov			ax, 	bx
		movd		mm0, 	eax
		movd		mm1, 	eax
		punpckldq	mm0, 	mm1


		mov		ecx, dataSize
		shr		ecx, 3

		mov		ebx, imDest

		pixel_loop:			
			movq	[ebx], mm0
			add ebx, 8

			dec	ecx
			jnz	pixel_loop

			jmp done

		done:
			emms
	}
}

/*
void copyRGBAtoRGB(const uchar* imSrc, uchar* imred,uchar* imgreen,uchar* imblue, int dataSize)
{
	__asm {

		mov		esi, dataSize
		shr		esi, 3

		mov		eax, imSrc
		mov		ebx, imred
		mov		ecx, imred
		mov		edx, imred

		pixel_loop:			
			movq	mm1, [eax]

			
			movq	[ebx], mm1

			add eax, 8
			add ebx, 8
			add ecx, 8
			add edx, 8

			dec	esi
			jnz	pixel_loop

			jmp done

		done:
			emms
	}
}*/

inline void multiply(uchar* im, float fact, int imageSize)
{
	__asm {

		mov		ecx, imageSize
		shr		ecx, 3

		// make 4 copies of the constant 'fact' in xmm0
		movss	xmm0, fact
		movss	xmm1, xmm0
		unpcklps	xmm0, xmm1        
		movlhps		xmm0, xmm0


		mov		eax, im
		pxor	mm7,mm7		// mm7 = 0 

		pixel_loop:			
			movq	mm1, [eax]
			movq	mm2, mm1

			punpcklbw	mm2, mm0
			punpckhbw	mm1, mm0

			movq	mm3,mm2
			punpckhwd	mm3, mm0
			punpcklwd	mm2, mm0

			movq	mm4,mm1
			punpcklwd	mm4, mm0
			punpckhwd	mm1, mm0

			// here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
			// --------
			cvtpi2ps	xmm3, mm3	// put mm3 in low part of xmm3
			cvtpi2ps	xmm2, mm2	// put mm2 in low part of xmm2

			movlhps		xmm2, xmm3	// xmm2 = [xmm3(low part) xmm2(low part)]

			mulps	xmm2, xmm0

			cvtps2pi	mm2, xmm2	// convert low 2 floats from xmm2 to mm2
			movhlps		xmm3,xmm2	// mov high 2 floats from xmm2 to low 2 floats in xmm3
			cvtps2pi	mm3, xmm3	// convert low 2 floats from xmm3 to mm3

			packssdw	mm2, mm3	// mm2 = (word)[mm2 mm3]

			// --------
			cvtpi2ps	xmm4, mm4
			cvtpi2ps	xmm1, mm1

			movlhps		xmm4, xmm1

			mulps	xmm4, xmm0
	
			cvtps2pi	mm4, xmm4	// convert low 2 floats from xmm4 to mm4
			movhlps		xmm1,xmm4	// mov high 2 floats from xmm4 to low 2 floats in xmm1
			cvtps2pi	mm1, xmm1	// convert low 2 floats from xmm1 to mm1

			packssdw	mm4, mm1	// mm4 = (word)[mm1 mm4]
			

			// ------
			packuswb	mm2, mm4	// mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
			movq	[eax], mm2

			add eax, 8
			dec	ecx
			jnz	pixel_loop

			jmp done

		done:
			emms
			
	}
}

inline void multiply(const uchar* imSrc, uchar* imDest, float fact, int imageSize)
{
	__asm {

		mov		ecx, imageSize
		shr		ecx, 3

		// make 4 copies of the constant 'fact' in xmm0
		movss	xmm0, fact
		movss	xmm1, xmm0
		unpcklps	xmm0, xmm1        
		movlhps		xmm0, xmm0


		mov		eax, imSrc
		mov		ebx, imDest
		pxor	mm7,mm7		// mm7 = 0 

		pixel_loop:			
			movq	mm1, [eax]
			movq	mm2, mm1

			punpcklbw	mm2, mm0
			punpckhbw	mm1, mm0

			movq	mm3,mm2
			punpckhwd	mm3, mm0
			punpcklwd	mm2, mm0

			movq	mm4,mm1
			punpcklwd	mm4, mm0
			punpckhwd	mm1, mm0

			// here, the first 8 bytes are in d-words [mm1 mm4 mm3 mm2]
			// --------
			cvtpi2ps	xmm3, mm3	// put mm3 in low part of xmm3
			cvtpi2ps	xmm2, mm2	// put mm2 in low part of xmm2

			movlhps		xmm2, xmm3	// xmm2 = [xmm3(low part) xmm2(low part)]

			mulps	xmm2, xmm0

			cvtps2pi	mm2, xmm2	// convert low 2 floats from xmm2 to mm2
			movhlps		xmm3,xmm2	// mov high 2 floats from xmm2 to low 2 floats in xmm3
			cvtps2pi	mm3, xmm3	// convert low 2 floats from xmm3 to mm3

			packssdw	mm2, mm3	// mm2 = (word)[mm2 mm3]

			// --------
			cvtpi2ps	xmm4, mm4
			cvtpi2ps	xmm1, mm1

			movlhps		xmm4, xmm1

			mulps	xmm4, xmm0
	
			cvtps2pi	mm4, xmm4	// convert low 2 floats from xmm4 to mm4
			movhlps		xmm1,xmm4	// mov high 2 floats from xmm4 to low 2 floats in xmm1
			cvtps2pi	mm1, xmm1	// convert low 2 floats from xmm1 to mm1

			packssdw	mm4, mm1	// mm4 = (word)[mm1 mm4]
			

			// ------
			packuswb	mm2, mm4	// mm2 = [[mm4] [mm2]] = [mm1 mm4 mm2 mm3]
			movq	[ebx], mm2

			add eax, 8
			add ebx, 8
			dec	ecx
			jnz	pixel_loop

			jmp done

		done:
			emms
			
	}
}