/*************************************************************************** 
*
* Copyright 2000 by David Demirdjian.   All rights reserved. 
*  
* Developed  by David Demirdjian
*  
* Permission to use, copy, or modify this software and  its documentation 
* for  educational  and  research purposes only and without fee  is hereby 
* granted, provided  that this copyright notice and the original authors's 
* names appear  on all copies and supporting documentation.  If individual 
* files are  separated from  this  distribution directory  structure, this 
* copyright notice must be included.  For any other uses of this software, 
* in original or  modified form, including but not limited to distribution 
* in whole or in  part, specific  prior permission  must be  obtained from 
* MIT.  These programs shall not  be  used, rewritten, or  adapted as  the 
* basis  of  a  commercial  software  or  hardware product  without  first 
* obtaining appropriate  licenses from David Demirdjian.  The author makes 
* no representations about the suitability of this software for any purpose.  
* It is provided "as is" without express or implied warranty. 
*  
**************************************************************************/
#include "stereoMatching.h"
#include "processingmmx.h"

// ************************************************************
// ************************************************************
// *** List of functions (SSE2) for image processing
// ************************************************************
// ************************************************************

// Computes, for unsigned bytes and 16 bytes per iteration,
//     Dest[i] = saturate255( |Src1[i]-Src2[i]| + |Src1[i]-Src3[i]| )
//
// Src1, Src2 and Dest are accessed with movdqa and must therefore point to
// 16-byte aligned memory. Src3 is read with movdqu and may be unaligned.
//
// l is the number of bytes. Only the l/16 complete 16-byte groups are
// processed; any trailing bytes of Dest are left untouched.
//
// Returns 0 (nothing is written) when l < 16, 1 otherwise.
inline int ImgSubandAdd_sse2(const unsigned char *Src1, const unsigned char *Src2, 
				 const unsigned char *Src3, unsigned char *Dest, int l)
{

	// The loop below is a do/while over l/16 groups. With fewer than 16 bytes
	// the counter would start at 0, wrap around on the first 'dec' and run
	// far past the end of the buffers, so reject such sizes up front.
	if (l < 16) return 0;             // image size must be at least 16 bytes

  __asm 
  {		
        mov eax, Src1     
        mov ebx, Src2
		mov edx, Src3
        mov edi, Dest    
        mov	ecx, l   
        shr	ecx, 4					// ecx = number of 16-byte groups (>= 1 here)
	
align 16
inner_loop:
		movdqa	xmm1,[eax]		// xmm1 = src1
		movdqa	xmm2,[ebx]		// xmm2 = src2

		movdqa	xmm4,xmm1		// xmm4 = src1

		psubusb	xmm4,xmm2		// xmm4 = max(src1-src2, 0)

		movdqu	xmm3,[edx]		// xmm3 = src3 (unaligned load)
		psubusb	xmm2,xmm1		// xmm2 = max(src2-src1, 0)
        
		movdqa	xmm5,xmm1		// xmm5 = src1
		por		xmm2,xmm4		// xmm2 = |src1-src2| (one of the two halves is 0)

        psubusb	xmm5,xmm3		// xmm5 = max(src1-src3, 0)

        psubusb	xmm3,xmm1	 	// xmm3 = max(src3-src1, 0)

		por		xmm3,xmm5		// xmm3 = |src1-src3|

		paddusb xmm2,xmm3		// xmm2 = |src1-src2|+|src1-src3|, saturated to 255

        movdqa    [edi], xmm2	 
        add eax,16        
        add ebx,16    
        add edx,16    
        add edi,16		
        dec ecx      
        jnz inner_loop    
        emms   					// kept from the original; the xmm code above does not use MMX registers
  }
	
  return 1;
}







// macro_add_sse2: one accumulation step, meant to be expanded inside an
// __asm block (see the avg_Col_*_sse2 functions in this file).
//   xmm3 += 8 words at [edx]       (paddusw: unsigned saturating 16-bit add)
//   xmm2 += 8 words at [edx+16]
//   edx  += edi                    (step the read pointer by edi bytes)
// The registers edx, edi, xmm2 and xmm3 must already be set up by the caller.
// Keep comments out of the macro body: every line ends in a continuation.
#define macro_add_sse2 __asm \
{						\
	__asm 	paddusw xmm3, [edx]	\
	__asm 	paddusw xmm2, [edx+16]	\
	__asm	add edx, edi		\
}


// Vertical 5-row sum of 16-bit data, scaled down and packed to bytes.
//
// For each output element i, adds the five words im[i + k*width], k = -2..2,
// with unsigned saturation, shifts the sum right by 3 (divide by 8) and
// stores it to im_out[i] as a byte saturated to 255.
//
//   im       - centre row of the window; rows im-2*width .. im+2*width are read
//   im_out   - destination bytes
//   dataSize - number of elements to produce; handled in groups of 16, a
//              trailing partial group (or a negative count) is not processed
//   width    - row length in elements (row stride is 2*width bytes)
//
// Uses movdqa and SSE memory operands: im, im_out and the row stride in bytes
// must all be 16-byte aligned.
inline void avg_Col_5_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize // eax = elements still to produce
	mov ecx, im_out   // ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi // ebx = im - 2 rows (top row of the window)

	row_sum_loop:

		// Signed check for a complete group. The earlier "== 0" test never
		// fired for counts that are negative or not a multiple of 16.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx // edx walks down the rows of the current column group
		add ebx, 32  // next group: 16 words = 32 bytes further along the row

		// row 1 of 5
		movdqa xmm3, [edx]    // xmm3 = words 0..7 of the group
		movdqa xmm2, [edx+16] // xmm2 = words 8..15 of the group
		add edx, edi

		// rows 2..5
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		
		// scale: sum >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16 // Update the number of points left 
		add ecx, 16 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms // kept from the original
	}
}

// Vertical 7-row sum of 16-bit data, scaled down and packed to bytes.
//
// For each output element i, adds the seven words im[i + k*width], k = -3..3,
// with unsigned saturation, shifts the sum right by 3 (divide by 8) and
// stores it to im_out[i] as a byte saturated to 255.
//
//   im       - centre row of the window; rows im-3*width .. im+3*width are read
//   im_out   - destination bytes
//   dataSize - number of elements to produce; handled in groups of 16, a
//              trailing partial group (or a negative count) is not processed
//   width    - row length in elements (row stride is 2*width bytes)
//
// Uses movdqa and SSE memory operands: im, im_out and the row stride in bytes
// must all be 16-byte aligned.
inline void avg_Col_7_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize // eax = elements still to produce
	mov ecx, im_out   // ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi // ebx = im - 3 rows (top row of the window)

	row_sum_loop:

		// Signed check for a complete group. The earlier "== 0" test never
		// fired for counts that are negative or not a multiple of 16.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx // edx walks down the rows of the current column group
		add ebx, 32  // next group: 16 words = 32 bytes further along the row

		// row 1 of 7
		movdqa xmm3, [edx]    // xmm3 = words 0..7 of the group
		movdqa xmm2, [edx+16] // xmm2 = words 8..15 of the group
		add edx, edi

		// rows 2..7
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		
		// scale: sum >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16 // Update the number of points left 
		add ecx, 16 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms // kept from the original
	}
}

// Vertical 9-row sum of 16-bit data, scaled down and packed to bytes.
//
// For each output element i, adds the nine words im[i + k*width], k = -4..4,
// with unsigned saturation, shifts the sum right by 3 (divide by 8) and
// stores it to im_out[i] as a byte saturated to 255.
//
//   im       - centre row of the window; rows im-4*width .. im+4*width are read
//   im_out   - destination bytes
//   dataSize - number of elements to produce; handled in groups of 16, a
//              trailing partial group (or a negative count) is not processed
//   width    - row length in elements (row stride is 2*width bytes)
//
// Uses movdqa and SSE memory operands: im, im_out and the row stride in bytes
// must all be 16-byte aligned.
inline void avg_Col_9_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize // eax = elements still to produce
	mov ecx, im_out   // ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi // ebx = im - 4 rows (top row of the window)

	row_sum_loop:

		// Signed check for a complete group. The earlier "== 0" test never
		// fired for counts that are negative or not a multiple of 16.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx // edx walks down the rows of the current column group
		add ebx, 32  // next group: 16 words = 32 bytes further along the row

		// row 1 of 9
		movdqa xmm3, [edx]    // xmm3 = words 0..7 of the group
		movdqa xmm2, [edx+16] // xmm2 = words 8..15 of the group
		add edx, edi

		// rows 2..9
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		
		// scale: sum >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16 // Update the number of points left 
		add ecx, 16 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms // kept from the original
	}
}

// Vertical 11-row sum of 16-bit data, scaled down and packed to bytes.
//
// For each output element i, adds the eleven words im[i + k*width], k = -5..5,
// with unsigned saturation, shifts the sum right by 3 (divide by 8) and
// stores it to im_out[i] as a byte saturated to 255.
//
//   im       - centre row of the window; rows im-5*width .. im+5*width are read
//   im_out   - destination bytes
//   dataSize - number of elements to produce; handled in groups of 16, a
//              trailing partial group (or a negative count) is not processed
//   width    - row length in elements (row stride is 2*width bytes)
//
// Uses movdqa and SSE memory operands: im, im_out and the row stride in bytes
// must all be 16-byte aligned.
inline void avg_Col_11_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize // eax = elements still to produce
	mov ecx, im_out   // ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi // ebx = im - 5 rows (top row of the window)

	row_sum_loop:

		// Signed check for a complete group. The earlier "== 0" test never
		// fired for counts that are negative or not a multiple of 16.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx // edx walks down the rows of the current column group
		add ebx, 32  // next group: 16 words = 32 bytes further along the row

		// row 1 of 11
		movdqa xmm3, [edx]    // xmm3 = words 0..7 of the group
		movdqa xmm2, [edx+16] // xmm2 = words 8..15 of the group
		add edx, edi

		// rows 2..11
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		
		// scale: sum >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16 // Update the number of points left 
		add ecx, 16 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms // kept from the original
	}
}

// Vertical 13-row sum of 16-bit data, scaled down and packed to bytes.
//
// For each output element i, adds the thirteen words im[i + k*width],
// k = -6..6, with unsigned saturation, shifts the sum right by 3 (divide by 8)
// and stores it to im_out[i] as a byte saturated to 255.
//
//   im       - centre row of the window; rows im-6*width .. im+6*width are read
//   im_out   - destination bytes
//   dataSize - number of elements to produce; handled in groups of 16, a
//              trailing partial group (or a negative count) is not processed
//   width    - row length in elements (row stride is 2*width bytes)
//
// Uses movdqa and SSE memory operands: im, im_out and the row stride in bytes
// must all be 16-byte aligned.
inline void avg_Col_13_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1  // edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize // eax = elements still to produce
	mov ecx, im_out   // ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi // ebx = im - 6 rows (top row of the window)

	row_sum_loop:

		// Signed check for a complete group. The earlier "== 0" test never
		// fired for counts that are negative or not a multiple of 16.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx // edx walks down the rows of the current column group
		add ebx, 32  // next group: 16 words = 32 bytes further along the row

		// row 1 of 13
		movdqa xmm3, [edx]    // xmm3 = words 0..7 of the group
		movdqa xmm2, [edx+16] // xmm2 = words 8..15 of the group
		add edx, edi

		// rows 2..13
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		
		// scale: sum >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16 // Update the number of points left 
		add ecx, 16 // Update output pointer 

		jmp row_sum_loop // Loop 

		//Cleanup 
	end_sum_loop:
	emms // kept from the original
	}
}

// Applies a vertical all-ones mask of height 'sizeMask' to 'im' and writes the
// result to 'im_out', by forwarding to the fixed-height routine for that size.
//
//   sizeMask <= 5          -> avg_Col_5_sse2
//   7, 9, 11, 13           -> avg_Col_<n>_sse2
//   15                     -> avg_Col_15
//   sizeMask >= 17         -> avg_Col_17
//   even values 6..16      -> no routine is called; im_out is left unchanged
//
// NOTE(review): the original note gave the mask scale as 1/16, while the
// avg_Col_*_sse2 routines in this file shift the sum right by 3. Confirm
// which scale is intended.
inline void avg_Col_sse2(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
	// Everything below the smallest supported height uses the 5-row routine.
	if (sizeMask <= 5) {
		avg_Col_5_sse2(im, im_out, dataSize, width);
		return;
	}

	// Everything above the largest supported height uses the 17-row routine.
	if (sizeMask >= 17) {
		avg_Col_17(im, im_out, dataSize, width);
		return;
	}

	if (sizeMask == 7)       avg_Col_7_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 9)  avg_Col_9_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 11) avg_Col_11_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 13) avg_Col_13_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 15) avg_Col_15(im, im_out, dataSize, width);
	// Remaining (even) sizes fall through without doing any work.
}