/***************************************************************************
*
* Copyright 2000 by David Demirdjian. All rights reserved.
*
* Developed by David Demirdjian
*
* Permission to use, copy, or modify this software and its documentation
* for educational and research purposes only and without fee is hereby
* granted, provided that this copyright notice and the original authors's
* names appear on all copies and supporting documentation. If individual
* files are separated from this distribution directory structure, this
* copyright notice must be included. For any other uses of this software,
* in original or modified form, including but not limited to distribution
* in whole or in part, specific prior permission must be obtained from
* MIT. These programs shall not be used, rewritten, or adapted as the
* basis of a commercial software or hardware product without first
* obtaining appropriate licenses from David Demirdjian. The author makes
* no representations about the suitability of this software for any purpose.
* It is provided "as is" without express or implied warranty.
*
**************************************************************************/
#include "stereoMatching.h"
#include "processingmmx.h"

// ************************************************************
// ************************************************************
// *** List of functions (SSE2) for image processing
// ************************************************************
// ************************************************************

// Per-byte sum of absolute differences against two images:
//
//     Dest[i] = sat8( |Src1[i] - Src2[i]| + |Src1[i] - Src3[i]| )
//
// computed 16 bytes at a time with unsigned saturating arithmetic.
//
// Src1, Src2 and Dest are accessed with movdqa and therefore must be
// 16-byte aligned. Src3 is loaded with movdqu and may be unaligned.
//
// Only the first (l / 16) * 16 bytes are processed; a trailing remainder
// of fewer than 16 bytes is left untouched.
//
// Returns 0 without touching Dest when l < 16, 1 otherwise.
inline int ImgSubandAdd_sse2(const unsigned char *Src1, const unsigned char *Src2,
                             const unsigned char *Src3, unsigned char *Dest, int l)
{
	// At least one full 16-byte block is required: the loop below is a
	// do/while on (l >> 4), so a zero count would wrap around and run
	// ~2^32 iterations past the end of the buffers.
	if (l < 16) return 0;

	__asm {
		mov eax, Src1
		mov ebx, Src2
		mov edx, Src3
		mov edi, Dest
		mov ecx, l
		shr ecx, 4              // ecx = number of 16-byte blocks (>= 1)

		align 16
	inner_loop:
		movdqa xmm1, [eax]      // xmm1 = src1
		movdqa xmm2, [ebx]      // xmm2 = src2
		movdqa xmm4, xmm1       // xmm4 = src1
		psubusb xmm4, xmm2      // xmm4 = sat(src1 - src2)

		movdqu xmm3, [edx]      // xmm3 = src3 (unaligned load)
		psubusb xmm2, xmm1      // xmm2 = sat(src2 - src1)

		movdqa xmm5, xmm1       // xmm5 = src1
		por xmm2, xmm4          // xmm2 = |src1 - src2|

		psubusb xmm5, xmm3      // xmm5 = sat(src1 - src3)
		psubusb xmm3, xmm1      // xmm3 = sat(src3 - src1)
		por xmm3, xmm5          // xmm3 = |src1 - src3|

		paddusb xmm2, xmm3      // xmm2 = sat(|src1-src2| + |src1-src3|)

		movdqa [edi], xmm2

		add eax, 16
		add ebx, 16
		add edx, 16
		add edi, 16

		dec ecx
		jnz inner_loop

		emms                    // kept from the original; the XMM code itself does not use MMX state
	}

	return 1;
}

// Adds one more source row to the two running column sums and advances to
// the next row:
//   xmm3 += 8 words at [edx], xmm2 += 8 words at [edx+16]  (unsigned saturating)
//   edx  += edi  (edi holds the row stride in bytes)
// The memory operands of paddusw must be 16-byte aligned.
#define macro_add_sse2 __asm \
{ \
	__asm paddusw xmm3, [edx] \
	__asm paddusw xmm2, [edx+16] \
	__asm add edx, edi \
}

// Vertical box sum over 5 rows (2 above .. 2 below 'im'), shifted right by 3
// and packed to unsigned bytes with saturation.
//
//   im       - pointer to the centre row of 16-bit samples; rows are 'width'
//              samples apart. Every row address read must be 16-byte aligned.
//   im_out   - destination bytes, 16-byte aligned.
//   dataSize - number of output bytes requested. Whole groups of 16 are
//              produced; a remainder of fewer than 16 is not processed and a
//              non-positive value does nothing.
//   width    - row stride of 'im' in samples.
inline void avg_Col_5_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov edi, width
		shl edi, 1              // edi = row stride in bytes (2 bytes per sample)

		mov eax, dataSize
		mov ecx, im_out

		mov ebx, im
		sub ebx, edi
		sub ebx, edi            // ebx = im - 2 rows

	row_sum_loop:
		cmp eax, 16             // fewer than 16 outputs left?
		jl end_sum_loop         // then stop (signed compare: never overruns)

		mov edx, ebx            // edx walks down the column for this group
		add ebx, 32             // next group: 16 samples = 32 bytes

		// row 1
		movdqa xmm3, [edx]      // xmm3 = samples 0..7
		movdqa xmm2, [edx+16]   // xmm2 = samples 8..15
		add edx, edi

		// rows 2..5
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: divide by 8
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 bytes
		packuswb xmm3, xmm2
		movdqa [ecx], xmm3

		sub eax, 16             // outputs left
		add ecx, 16             // advance output pointer
		jmp row_sum_loop

	end_sum_loop:
		emms
	}
}

// Same as avg_Col_5_sse2 but sums 7 rows (3 above .. 3 below 'im').
// The result is still shifted right by 3 (divided by 8).
inline void avg_Col_7_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov edi, width
		shl edi, 1              // edi = row stride in bytes

		mov eax, dataSize
		mov ecx, im_out

		mov ebx, im
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi            // ebx = im - 3 rows

	row_sum_loop:
		cmp eax, 16             // fewer than 16 outputs left?
		jl end_sum_loop

		mov edx, ebx
		add ebx, 32

		// row 1
		movdqa xmm3, [edx]      // xmm3 = samples 0..7
		movdqa xmm2, [edx+16]   // xmm2 = samples 8..15
		add edx, edi

		// rows 2..7
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: divide by 8
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 bytes
		packuswb xmm3, xmm2
		movdqa [ecx], xmm3

		sub eax, 16
		add ecx, 16
		jmp row_sum_loop

	end_sum_loop:
		emms
	}
}

// Same as avg_Col_5_sse2 but sums 9 rows (4 above .. 4 below 'im').
// The result is still shifted right by 3 (divided by 8).
inline void avg_Col_9_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov edi, width
		shl edi, 1              // edi = row stride in bytes

		mov eax, dataSize
		mov ecx, im_out

		mov ebx, im
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi            // ebx = im - 4 rows

	row_sum_loop:
		cmp eax, 16             // fewer than 16 outputs left?
		jl end_sum_loop

		mov edx, ebx
		add ebx, 32

		// row 1
		movdqa xmm3, [edx]      // xmm3 = samples 0..7
		movdqa xmm2, [edx+16]   // xmm2 = samples 8..15
		add edx, edi

		// rows 2..9
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: divide by 8
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 bytes
		packuswb xmm3, xmm2
		movdqa [ecx], xmm3

		sub eax, 16
		add ecx, 16
		jmp row_sum_loop

	end_sum_loop:
		emms
	}
}

// Same as avg_Col_5_sse2 but sums 11 rows (5 above .. 5 below 'im').
// The result is still shifted right by 3 (divided by 8).
inline void avg_Col_11_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov edi, width
		shl edi, 1              // edi = row stride in bytes

		mov eax, dataSize
		mov ecx, im_out

		mov ebx, im
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi            // ebx = im - 5 rows

	row_sum_loop:
		cmp eax, 16             // fewer than 16 outputs left?
		jl end_sum_loop

		mov edx, ebx
		add ebx, 32

		// row 1
		movdqa xmm3, [edx]      // xmm3 = samples 0..7
		movdqa xmm2, [edx+16]   // xmm2 = samples 8..15
		add edx, edi

		// rows 2..11
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: divide by 8
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 bytes
		packuswb xmm3, xmm2
		movdqa [ecx], xmm3

		sub eax, 16
		add ecx, 16
		jmp row_sum_loop

	end_sum_loop:
		emms
	}
}

// Same as avg_Col_5_sse2 but sums 13 rows (6 above .. 6 below 'im').
// The result is still shifted right by 3 (divided by 8).
inline void avg_Col_13_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov edi, width
		shl edi, 1              // edi = row stride in bytes

		mov eax, dataSize
		mov ecx, im_out

		mov ebx, im
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi
		sub ebx, edi            // ebx = im - 6 rows

	row_sum_loop:
		cmp eax, 16             // fewer than 16 outputs left?
		jl end_sum_loop

		mov edx, ebx
		add ebx, 32

		// row 1
		movdqa xmm3, [edx]      // xmm3 = samples 0..7
		movdqa xmm2, [edx+16]   // xmm2 = samples 8..15
		add edx, edi

		// rows 2..13
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: divide by 8
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words in [xmm3 xmm2] into 16 bytes
		packuswb xmm3, xmm2
		movdqa [ecx], xmm3

		sub eax, 16
		add ecx, 16
		jmp row_sum_loop

	end_sum_loop:
		emms
	}
}

// apply vertical mask [1 1 1 ... 1]^T of height 'sizeMask' to 'im'
// result in 'im_out'
//
// Dispatches to the fixed-height implementation matching 'sizeMask':
//   5, 7, 9, 11, 13 -> SSE2 variants in this file (sum scaled by >> 3)
//   15, 17          -> avg_Col_15 / avg_Col_17 (declared elsewhere)
//   < 5             -> treated as 5
//   > 17            -> treated as 17
//
// NOTE(review): the original header comment described the mask as
// 1/16*[1 ... 1]^T, but the SSE2 variants here divide by 8 -- confirm the
// intended scaling.
// NOTE(review): even sizes between 5 and 17 (6, 8, ..., 16) match no case
// and leave 'im_out' untouched -- confirm callers only pass odd sizes.
inline void avg_Col_sse2(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
	switch (sizeMask)
	{
		case 5:
			avg_Col_5_sse2(im,im_out,dataSize,width);
			break;
		case 7:
			avg_Col_7_sse2(im,im_out,dataSize,width);
			break;
		case 9:
			avg_Col_9_sse2(im,im_out,dataSize,width);
			break;
		case 11:
			avg_Col_11_sse2(im,im_out,dataSize,width);
			break;
		case 13:
			avg_Col_13_sse2(im,im_out,dataSize,width);
			break;
		case 15:
			avg_Col_15(im,im_out,dataSize,width);
			break;
		case 17:
			avg_Col_17(im,im_out,dataSize,width);
			break;
		default:
			// Clamp out-of-range sizes to the nearest supported mask.
			if (sizeMask<5) avg_Col_5_sse2(im,im_out,dataSize,width);
			else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
	}
}