/***************************************************************************
 *
 * Copyright 2000 by David Demirdjian.  All rights reserved.
 *
 * Developed by David Demirdjian
 *
 * Permission to use, copy, or modify this software and its documentation
 * for educational and research purposes only and without fee is hereby
 * granted, provided that this copyright notice and the original authors'
 * names appear on all copies and supporting documentation.  If individual
 * files are separated from this distribution directory structure, this
 * copyright notice must be included.  For any other uses of this software,
 * in original or modified form, including but not limited to distribution
 * in whole or in part, specific prior permission must be obtained from
 * MIT.  These programs shall not be used, rewritten, or adapted as the
 * basis of a commercial software or hardware product without first
 * obtaining appropriate licenses from David Demirdjian.  The author makes
 * no representations about the suitability of this software for any purpose.
 * It is provided "as is" without express or implied warranty.
 *
 **************************************************************************/

#include "stereoMatching.h"
#include "processingMMX.h"

// ************************************************************
// ************************************************************
// *** List of functions (MMX) for image processing
// ************************************************************
// ************************************************************

// shrink images by a factor 'fact'. e.g. if fact = 2, out will be twice as small as src
inline void shrinkImages(uchar* dst, const uchar* src, int width, int height, int fact)
{
  int width_f = width/fact;
  int siz = width*height/(fact*fact);
  if (fact>0) {
    // loop reconstructed from context (the original body is garbled in this copy):
    // copy every 'fact'-th pixel of every 'fact'-th row into the destination
    for (int i=0, j=0; i<siz; i++, j+=fact) {
      dst[i] = src[j];
      if ((i+1) % width_f == 0) j += (fact-1)*width;   // skip to the next source row
    }
  }
}

// translate the image 'data' by 'tx' pixels
// (name, signature and loop body reconstructed from context; the original text is garbled here)
void translateImg(uchar* data, int tx, int siz)
{
  if (tx>0) {
    // dest. must be after src... to avoid overwriting data
    data += (siz-tx);
    uchar* dataDst = data+tx;
    for (int i=0; i<siz-tx; i++) *dataDst-- = *data--;   // copy backwards
  }
}

// normalize images so that they have the same average intensity
// (in-place variant; signature reconstructed from context, the original text is garbled here)
void normalizeImages(uchar* data1, uchar* data2, int siz)
{
  float a1 = pixelMean(data1,siz);
  float a2 = pixelMean(data2,siz);
  // normalize the image whose average intensity is the highest
  if (a1>a2) multiply(data1, a2/a1, siz);
  else multiply(data2, a1/a2, siz);
}

void normalizeImages(const uchar* data1, const uchar* data2,
                     uchar* out1, uchar* out2, int siz)
{
  float a1 = pixelMean(data1,siz);
  float a2 = pixelMean(data2,siz);
  // normalize the image whose average intensity is the highest
  if (a1>a2) {
    multiply(data1, out1, a2/a1, siz);
    copyMMX(out2, data2, siz);
  } else {
    multiply(data2, out2, a1/a2, siz);
    copyMMX(out1, data1, siz);
  }
}

// ImgSub2: D = saturation0(|S1 - S2| + |S1 - S3|)
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
                        const unsigned char *Src3, unsigned char *Dest, int l)
{
  if (l < 8) return 0;   // image size must be at least 8 bytes
  __asm {
    mov eax, Src1
    mov ebx, Src2
    mov edx, Src3
    mov edi, Dest
    mov ecx, l
    shr ecx, 3
    align 16
inner_loop:
    movq mm1,[eax]      // mm1=src1
    movq mm2,[ebx]      // mm2=src2
    movq mm4,mm1        // mm4=mm1
    psubusb mm4,mm2     // mm4 = src1 - src2
    movq mm3,[edx]      // mm3=src3
    psubusb mm2,mm1     // mm2 = src2 - src1
    movq mm5,mm1        // mm5=src1
    por mm2,mm4         // mm2=|src1-src2|
    psubusb mm5,mm3     // mm5=src1-src3
    psubusb mm3,mm1     // mm3=src3-src1
    por mm3,mm5         // mm3=|src1-src3|
    paddusb mm2,mm3     // mm2 = |src1-src2|+|src1-src3|
    movq [edi], mm2
    add eax,8
    add ebx,8
    add edx,8
    add edi,8
    dec ecx
    jnz inner_loop
    emms
  }
  return 1;
}
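// ---------------------------------------------------------------------------
// Scalar reference for ImgSubandAdd above (a sketch added for clarity, not part
// of the original code; the name ImgSubandAdd_ref is introduced here).  It
// produces the same Dest[i] = min(255, |Src1[i]-Src2[i]| + |Src1[i]-Src3[i]|)
// that the MMX loop computes with psubusb/por (absolute difference) and
// paddusb (add with unsigned saturation).
inline void ImgSubandAdd_ref(const unsigned char* Src1, const unsigned char* Src2,
                             const unsigned char* Src3, unsigned char* Dest, int l)
{
    for (int i = 0; i < l; ++i) {
        int d12 = Src1[i] - Src2[i]; if (d12 < 0) d12 = -d12;
        int d13 = Src1[i] - Src3[i]; if (d13 < 0) d13 = -d13;
        int s = d12 + d13;
        Dest[i] = (unsigned char)(s > 255 ? 255 : s);   // unsigned saturation, like paddusb
    }
}
// ---------------------------------------------------------------------------

// ImgSub2: D = saturation0(|S1 - S2|)
// TODO?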
divide the result by 2 (shift) inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2, const unsigned char *Dest, int l) { if (l < 8) return 0; // image size must be at least 8 bytes __asm { mov eax, Src1 mov ebx, Src2 mov edi, Dest mov ecx, l shr ecx, 3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm2,[ebx] // mm2=src2 movq mm4,mm1 // mm4=mm1 psubusb mm4,mm2 // mm4 = src1 - src2 psubusb mm2,mm1 // mm2 = src2 - src1 por mm2,mm4 // mm2=|src1-src2| movq [edi], mm2 add eax,8 add ebx,8 add edi,8 dec ecx jnz inner_loop emms } return 1; } #define _ABS_DIFF_TRI(Z) __asm \ { \ __asm movq mm4,mm1 /* mm4=mm1 */ \ __asm add ebx, width \ __asm add edi, imageSize \ __asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \ \ __asm movq mm7, mm0 \ __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \ \ __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \ __asm psllq mm7,Z \ \ __asm movq mm5,mm1 /* mm5=src1 */ \ __asm por mm4,mm2 /* mm2=|src1-src2| */ \ \ __asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \ __asm psubusb mm5,mm3 /* mm5=src1-src3*/ \ \ __asm movq mm6,mm3 /* mm6=src3*/ \ __asm psubusb mm6,mm1 /* mm3=src3-src1*/ \ \ __asm por mm6,mm5 /* mm6=|src1-src3|*/ \ __asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \ \ __asm movq [edi], mm4 /* here mm1=src1*/ \ __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\ } #define _ABS_DIFF_TRI_prefetch(Z, X) __asm \ { \ __asm movq mm4,mm1 /* mm4=mm1 */ \ __asm add ebx, width \ __asm add edi, imageSize \ __asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \ \ __asm movq mm7, mm0 \ __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \ \ __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \ __asm prefetcht0 [ebx + X] \ __asm psllq mm7,Z \ \ __asm movq mm5,mm1 /* mm5=src1 */ \ __asm por mm4,mm2 /* mm2=|src1-src2| */ \ \ \ __asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \ __asm psubusb mm5,mm3 /* mm5=src1-src3*/ \ \ __asm movq mm6,mm3 /* mm6=src3*/ \ __asm psubusb mm6,mm1 /* mm3=src3-src1*/ \ \ __asm por mm6,mm5 /* mm6=|src1-src3|*/ \ __asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \ \ __asm movq [edi], mm4 /* here mm1=src1*/ \ __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\ } // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) // process 8 disparities at a time // // Src1: right // Src2: top // Src3: left // // TODO? divide the result by 2 (shift) inline int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2, const unsigned char *Src3, unsigned char* Dest1, int l, int imageSize, int width) { if (l < 8) return 0; // image size must be at least 8 bytes const int back_step1 = 7*width; const int back_step2 = 7*imageSize; __asm { mov eax, Src1 mov ebx, Src2 mov edx, Src3 mov edi, Dest1 mov ecx, l shr ecx, 3 movq mm0,[edx] // mm0=src3 movq mm0,[edx] // mm0=src3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm3,mm0 // mm3=src3 movq mm2,[ebx] // mm2=src2 add eax,8 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 -- movq mm4,mm1 // mm4=mm1 add ebx,width psubusb mm4,mm2 // mm4 = src1 - src2 //prefetcht0 [ebx + 32 + 2*320] movq mm0,[edx+8] psubusb mm2,mm1 // mm2 = src2 - src1 movq mm5,mm1 // mm5=src1 por mm4,mm2 // mm2=|src1-src2| movq mm2,[ebx] // mm2= src2 + 'width' = new src2 psubusb mm5,mm3 // mm5=src1-src3 movq mm6,mm3 // mm6=src3 psubusb mm6,mm1 // mm3=src3-src1 movq mm7, mm0 psrlq mm3, 8 // mm3 = src3 + '1' ... 
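// ---------------------------------------------------------------------------
// Sketch (added for clarity, not in the original) of the quadword "splice" that
// _ABS_DIFF_TRI performs with psrlq/psllq/por: to read the left image at one
// extra pixel of disparity per step without an unaligned load, the code keeps
// the current 8 pixels in mm3 and the next 8 pixels in mm0.  The net effect of
// each step, for a little-endian 64-bit word, is simply:
inline unsigned long long spliceNextByte(unsigned long long cur, unsigned long long next)
{
    // drop the lowest byte of 'cur' and feed in the lowest byte of 'next' at the top
    return (cur >> 8) | (next << 56);   // psrlq cur,8 ; psllq next,56 ; por
}
// ---------------------------------------------------------------------------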
with [x00000000] at the end por mm6,mm5 // mm6=|src1-src3| paddusb mm4,mm6 // mm4 = |src1-src2|+|src1-src3| movq [edi], mm4 psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3 // ------------------------------------------------------------- // - 2 ---------------- _ABS_DIFF_TRI(48) // - 3 ---------------- _ABS_DIFF_TRI(40) // - 4 ---------------- _ABS_DIFF_TRI(32) // _ABS_DIFF_TRI_prefetch(32,24 + 3*320) // - 5 ---------------- _ABS_DIFF_TRI(24) // - 6 ---------------- _ABS_DIFF_TRI(16) // - 7 ---------------- _ABS_DIFF_TRI(8) // - 8 ---------------- movq mm4,mm1 // mm4=mm1 por mm3,mm7 // here mm2=new src2 mm3=new src3 psubusb mm4,mm2 // mm4 = src1 - src2 psubusb mm2,mm1 // mm2 = src2 - src1 movq mm5,mm1 // mm5=src1 por mm4,mm2 // mm2=|src1-src2| psubusb mm5,mm3 // mm5=src1-src3 psubusb mm3,mm1 // mm3=src3-src1 por mm3,mm5 // mm6=|src1-src3| paddusb mm4,mm3 // mm4 = |src1-src2|+|src1-src3| add edi, imageSize movq [edi], mm4 // here mm1=src1 // ------------------------------------------------------------- // sub ebx, back_step1 add ebx,8 add edx,8 sub edi, back_step2 add edi,8 dec ecx jnz inner_loop emms } return 1; } // macro: in: mm1,mm2 #define _ABS_DIFF_ __asm \ { \ __asm movq mm4,mm1 /* mm4=mm1 */ \ __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \ __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \ __asm por mm4,mm2 /* mm2=|src1-src2| */ \ __asm add ebx, width \ __asm add edi, imageSize \ __asm movq mm2,[ebx] \ __asm movq [edi], mm4 /* here mm1=src1 */ \ } // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) // process 8 disparities at a time // Src1: right // Src2: top // TODO? divide the result by 2 (shift) inline int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2, unsigned char* Dest1, int l, int imageSize, int width) { if (l < 8) return 0; // image size must be at least 8 bytes const int back_step1 = 7*width; const int back_step2 = 7*imageSize; __asm { mov eax, Src1 mov ebx, Src2 mov edi, Dest1 mov ecx, l shr ecx, 3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm2,[ebx] // mm2=src2 add eax,8 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 -- _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ // - 8 ---------------- movq mm4,mm1 // mm4=mm1 psubusb mm4,mm2 // mm4 = src1 - src2 psubusb mm2,mm1 // mm2 = src2 - src1 por mm4,mm2 // mm2=|src1-src2| add edi, imageSize movq [edi], mm4 // here mm1=src1 // ------------------------------------------------------------- // sub ebx, back_step1 add ebx,8 sub edi, back_step2 add edi,8 dec ecx jnz inner_loop emms } return 1; } // macro: in: mm1,mm2 #define _ABS_DIFF_HORIZ(Z) __asm \ { \ __asm movq mm7, mm0 \ __asm add edi, imageSize \ __asm movq mm5,mm1 /* mm5=src1 */ \ __asm psllq mm7, Z \ __asm psubusb mm5,mm3 /* mm5=src1-src3 */ \ __asm movq mm6,mm3 /* mm6=src3 */ \ __asm psubusb mm6,mm1 /* mm3=src3-src1 */ \ __asm por mm6,mm5 /* mm6=|src1-src3| */ \ __asm movq [edi], mm6 /* here mm1=src1 */ \ __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end */ \ __asm por mm3,mm7 /* here mm3=new src3 */ \ } // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) // process 8 disparities at a time // // Src1: right // Src2: top // Src3: left // // TODO? 
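// ---------------------------------------------------------------------------
// The absolute-difference idiom used throughout the blocks above (illustrative
// sketch, not in the original): psubusb saturates at zero, so one of the two
// saturated differences is always zero and OR-ing them gives |a - b|.
inline unsigned char absDiffU8(unsigned char a, unsigned char b)
{
    unsigned char d1 = (unsigned char)(a > b ? a - b : 0);   // psubusb a,b
    unsigned char d2 = (unsigned char)(b > a ? b - a : 0);   // psubusb b,a
    return (unsigned char)(d1 | d2);                         // por d1,d2
}
// ---------------------------------------------------------------------------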
divide the result by 2 (shift) inline int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm, unsigned char* Dest, int l, int imageSize, int width) { if (l < 8) return 0; // image size must be at least 8 bytes const int back_step2 = 7*imageSize; __asm { mov eax, rightIm mov edx, leftIm mov edi, Dest mov ecx, l shr ecx, 3 movq mm0,[edx] // mm0=src3 movq mm0,[edx] // mm0=src3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm3,mm0 // mm3=src3 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 -- movq mm0,[edx+8] add eax,8 movq mm5,mm1 // mm5=src1 psubusb mm5,mm3 // mm5=src1-src3 movq mm6,mm3 // mm6=src3 psubusb mm6,mm1 // mm3=src3-src1 movq mm7, mm0 psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end por mm6,mm5 // mm6=|src1-src3| movq [edi], mm6 psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3 por mm3,mm7 // here mm3=new src3 // - 2 ---------------- _ABS_DIFF_HORIZ(48) _ABS_DIFF_HORIZ(40) _ABS_DIFF_HORIZ(32) _ABS_DIFF_HORIZ(24) _ABS_DIFF_HORIZ(16) _ABS_DIFF_HORIZ(8) // - 8 ---------------- movq mm5,mm1 // mm5=src1 add edi, imageSize psubusb mm5,mm3 // mm5=src1-src3 psubusb mm3,mm1 // mm3=src3-src1 por mm3,mm5 // mm6=|src1-src3| movq [edi], mm3 // ------------------------------------------------------------- // add edx,8 sub edi, back_step2 add edi,8 dec ecx jnz inner_loop emms } return 1; } // ---------------------- // FULL IMAGE, BEST ONLY : Keith's code inline int findMinimumCorrelation_mmx( const unsigned char *CurrentCorrelation, unsigned char CurrentDisparity, unsigned char *Disparity, unsigned char *BestCorrelation, int bytecount) { if ((bytecount < 8) || ((bytecount % 8) != 0)) { return 0; } __asm { // load ecx with the pixelblock count = bytecount / 8 mov ecx, bytecount shr ecx, 3 // setup mm0 with 8 copies of the disparity constant mov al, CurrentDisparity mov ah, al mov bx, ax shl eax, 16 mov ax, bx movd mm0, eax movd mm1, eax punpckldq mm0, mm1 // setup mm1 with 8 copies of the xor constant for unsigned => signed conversion mov eax, 0x80808080 movd mm1, eax movd mm2, eax punpckldq mm1, mm2 // setup the image pointers mov eax, BestCorrelation mov esi, CurrentCorrelation mov edi, Disparity pixel_loop: movq mm2, [esi] // current correlation movq mm3, [eax] // best correlation // check for updates movq mm5, mm2 // copy the current correlation pxor mm5, mm1 // convert from unsigned range to signed range movq mm6, mm3 // copy the best correlation pxor mm6, mm1 // convert from unsigned range to signed range pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask // 1 indicates current > best, so keep best // 0 indicates current <= best, so use new value // BYPASS // this phase adds 8 additional instructions, but could skip 2 writes and 1 read // abort remainder if not updating best correlation pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5 // 0 indicates current > best, so keep best // 1 indicates current <= best, so use new value packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation) // 11111111 11111111 => 11111111 some replaced // 11111111 00000000 => 11111111 some replaced // 00000000 11111111 => 11111111 some replaced // 00000000 00000000 => 00000000 no replacements // we don't need to backup ebx because its not used in this routine // movd mm7, ebx // make a backup of eax movd ebx, mm6 // get the saturated mask test ebx, ebx // test ebx => yields 0 iff no substitutions will occur // movd ebx, mm7 // restore ebx jz bypass // store mm4 (second 
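// ---------------------------------------------------------------------------
// Scalar sketch of what ImgSubandAdd_Horiz above computes (added for clarity;
// the _ref name is introduced here and border pixels are not handled): for 8
// consecutive horizontal disparities d it fills one correlation plane each,
//   Dest[d*imageSize + i] = |rightIm[i] - leftIm[i + d]|
inline void ImgSubandAdd_Horiz_ref(const unsigned char* rightIm, const unsigned char* leftIm,
                                   unsigned char* Dest, int l, int imageSize)
{
    for (int i = 0; i < l; ++i)
        for (int d = 0; d < 8; ++d) {
            int a = rightIm[i], b = leftIm[i + d];
            Dest[d*imageSize + i] = (unsigned char)(a > b ? a - b : b - a);
        }
}
// ---------------------------------------------------------------------------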
correlation) to [ebx] // Update best Correlation movq mm6, mm5 // mm6 := mask movq mm7, mm5 // mm7 := mask pand mm6, mm3 // best correlation values to keep pandn mm7, mm2 // current correlation value to move to best correlation por mm6, mm7 // merge values movq [eax], mm6 // store values // update disparity movq mm2, [edi] // get disparity map movq mm6, mm5 // mm6 := mask pand mm5, mm2 // select disparity map values to keep pandn mm6, mm0 // select current disparity values to move to disparity map por mm5, mm6 // merge values movq [edi], mm5 // store values bypass: add eax, 8 add esi, 8 add edi, 8 dec ecx jnz pixel_loop emms; } return 1; } /*int initMinimumCorrelation( const unsigned char *CurrentCorrelation, unsigned char disparityInit, unsigned char *Disparity, unsigned char *BestCorrelation, unsigned char *SecondCorrelation, int bytecount) { for (int i=0; i signed conversion mov eax, 0x80808080 movd mm1, eax movd mm2, eax punpckldq mm1, mm2 // setup the image pointers mov eax, BestCorrelation mov ebx, SecondCorrelation mov esi, CurrentCorrelation mov edi, Disparity pixel_loop: movq mm2, [esi] // current correlation movq mm4, [ebx] // second correlation // convert the current correlation from unsigned range to signed range movq mm5, mm2 // copy the current correlation pxor mm5, mm1 // convert from unsigned range to signed range movq mm7, mm5 // copy converted to mm7 // check for second correlation updates movq mm6, mm4 // copy second best correlation pxor mm6, mm1 // convert from unsigned range to signed range pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask // BYPASS 1 // skip remainder if second correlation is not to be updated // this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF pxor mm6, mm7 // mm6 = mm7 xor 0xFFFFFFFF = not mm7 // 0 indicates current > second, so keep old value // 1 indicates current <= second, so use new value packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation) // 11111111 11111111 => 11111111 some replaced // 11111111 00000000 => 11111111 some replaced // 00000000 11111111 => 11111111 some replaced // 00000000 00000000 => 00000000 no replacements // don't need to backup edx because its not used in this routine // movd mm3, edx // make a backup of edx movd edx, mm6 // get the saturated mask test edx, edx // test edx => yields 0 iff no replacements will occur // movd edx, mm3 // restore edx jz bypass1 // direct update second correlation (get values from current) // mm7 already has mask // movq mm6, mm7 // mm6 := mask // pand mm6, mm4 // second correlation values to keep // pandn mm7, mm2 // current correlation values to move to second correlation // por mm6, mm7 // merge value => direct updated second correlation // movq mm4, mm6 // store values (*** this instruction could be eliminated!) 
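// ---------------------------------------------------------------------------
// Scalar sketch of the winner-take-all update done by findMinimumCorrelation_mmx
// above (illustration only; the _ref name is introduced here).  The MMX version
// processes 8 pixels per iteration and XORs both operands with 0x80 so that the
// signed pcmpgtb behaves like an unsigned byte comparison.
inline void findMinimumCorrelation_ref(const unsigned char* CurrentCorrelation,
                                       unsigned char CurrentDisparity,
                                       unsigned char* Disparity,
                                       unsigned char* BestCorrelation,
                                       int bytecount)
{
    for (int i = 0; i < bytecount; ++i)
        if (CurrentCorrelation[i] <= BestCorrelation[i]) {   // "current <= best" wins
            BestCorrelation[i] = CurrentCorrelation[i];
            Disparity[i] = CurrentDisparity;
        }
}
// ---------------------------------------------------------------------------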
pand mm4, mm7 // second correlation values to keep pandn mm7, mm2 // current correlation values to move to second correlation por mm4, mm7 // merge value => direct updated second correlation // check for best correlation updates movq mm3, [eax] // best correlation // mm5 has converted current correlation movq mm6, mm3 // copy the best correlation pxor mm6, mm1 // convert from unsigned range to signed range pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask // 1 indicates current > best, so keep best // 0 indicates current <= best, so use new value // BYPASS 2 // this phase adds 8 additional instructions, but could skip 2 writes and 1 read // abort remainder if not updating best correlation pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5 // 0 indicates current > best, so keep best // 1 indicates current <= best, so use new value packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation) // 11111111 11111111 => 11111111 some replaced // 11111111 00000000 => 11111111 some replaced // 00000000 11111111 => 11111111 some replaced // 00000000 00000000 => 00000000 no replacements // don't need to backup edx because its not used in this routine // movd mm7, edx // make a backup of edx movd edx, mm6 // get the saturated mask test edx, edx // test edx => yields 0 iff no substitutions will occur // movd edx, mm7 // restore edx jz bypass2 // store mm4 (second correlation) to [ebx] // indirect update second correlation (pushed down from best) movq mm6, mm5 // mm6 := mask movq mm7, mm5 // mm7 := mask pand mm6, mm4 // second correlation values to keep pandn mm7, mm3 // best correlations to move to second correlation por mm6, mm7 // merge values movq [ebx], mm6 // store values // direct Update best Correlation movq mm6, mm5 // mm6 := mask movq mm7, mm5 // mm7 := mask pand mm6, mm3 // best correlation values to keep pandn mm7, mm2 // current correlation value to move to best correlation por mm6, mm7 // merge values movq [eax], mm6 // store values // update disparity movq mm2, [edi] // get disparity map movq mm6, mm5 // mm6 := mask pand mm5, mm2 // select disparity map values to keep pandn mm6, mm0 // select current disparity values to move to disparity map por mm5, mm6 // merge values movq [edi], mm5 // store values bypass1: next_pixel: add eax, 8 add ebx, 8 add esi, 8 add edi, 8 dec ecx jnz pixel_loop jmp done bypass2: movq [ebx], mm4; jmp next_pixel done: emms; } return 1; } inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize) { im += maskSize/2; im_out += maskSize/2; for (int i=0; i mm7=[0 0 H G] [ecx]=[F E D C] punpckldq mm0, mm7 // mm0 = [F E D C] movq [ecx], mm0 sub eax, 8 // Update the number of points left movq mm0, mm4 // Update accumulator psrlq mm7, 32 // mm7 = [0 0 H G] //Process high word movq mm1, [ebx] // Copy... 
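// ---------------------------------------------------------------------------
// Scalar sketch (illustration only, names introduced here) of the per-pixel
// rule implemented by the best/second-best minimum search above: a new
// correlation value can either displace the best score, pushing the old best
// down to second place, or displace only the second-best score.  Assumes the
// usual invariant best <= second.
inline void updateBestAndSecond_ref(unsigned char current, unsigned char currentDisparity,
                                    unsigned char& best, unsigned char& second,
                                    unsigned char& disparity)
{
    if (current <= best) {            // new best: old best becomes second best
        second = best;
        best = current;
        disparity = currentDisparity;
    } else if (current <= second) {   // only the second-best score is displaced
        second = current;
    }
}
// ---------------------------------------------------------------------------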
punpckhbw mm1, mm6 // Expand high word bytes into words aim_Sum_Words_In_MM1 //Add to the previous data paddw mm0, mm3 // The current word of the accum // translate everything to 2 words on the left // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C] punpckldq mm7, mm0 // mm7 = [F E D C] add ebx, 8 // Update input pointer movq [ecx+8], mm7 psrlq mm0, 32 // mm0 = [0 0 H G] movq mm7, mm4 // Update accumulator add ecx, 16 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } // apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words) // output : im_out (words) inline void sum_Row_5_mmx(ushort* im, ushort* im_out, int rowSize) { // temp: for debugging //return sum_Row_5(im,im_out,rowSize); __asm { mov eax, rowSize mov ebx, im mov ecx, im_out //Process the first quad word, but save only the second result" test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary movq mm1, [ebx] // Load data (4 words) add ebx, 8 // Update input pointer //Process low word aim_Sum_Words_In_MM1 //Store the result Only in the accumulator movq mm7, mm4 // Update accumulator //Process high word movq mm1, [ebx] // Copy... aim_Sum_Words_In_MM1 add ebx, 8 //Add to the previous data paddw mm7, mm3 // The current word of the accum // translate everything to 2 words on the left movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E] psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G] movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G] psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0] movq [ecx], mm7 // Store the final result movq mm7, mm4 // Update accumulator add ecx, 8 // Update output pointer sub eax, 8 // Update the number of points left // Start the loop row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary movq mm1, [ebx] // Load data aim_Sum_Words_In_MM1 //Add to the previous data //prefetcht0 [ecx + 32] //prefetcht0 [ebx + 48] paddw mm7, mm3 // The current word of the accum psrlw mm7, 2 // divide result by ... // translate everything to 2 words on the left // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C] punpckldq mm0, mm7 // mm0 = [F E D C] movq [ecx], mm0 sub eax, 8 // Update the number of points left movq mm0, mm4 // Update accumulator psrlq mm7, 32 // mm7 =[0 0 H G] //Process high word movq mm1, [ebx+8] // Copy... aim_Sum_Words_In_MM1 //Add to the previous data paddw mm0, mm3 // The current word of the accum psrlw mm0, 2 // divide result by ... // translate everything to 2 words on the left // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C] punpckldq mm7, mm0 // mm7 = [F E D C] add ebx, 16 // Update input pointer movq [ecx+8], mm7 psrlq mm0, 32 // mm0 = [0 0 H G] movq mm7, mm4 // Update accumulator add ecx, 16 // Update output pointer */ jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } template void sum_Row_5(T* im, ushort* im_out, int rowSize) { im += 2; im_out +=2; int s = 0; for (int i=0; i17) avg_Col_17(im,im_out,dataSize,width); break; } } #define macro_add __asm \ { \ __asm paddusw mm3, [edx] \ __asm paddusw mm2, [edx+8] \ __asm add edx, edi \ } inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi // ebx = ebx-4*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? 
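// ---------------------------------------------------------------------------
// Scalar sketch (assumption, added for clarity) of the horizontal 5-tap sum
// computed by sum_Row_5 / sum_Row_5_mmx above: each output word is the sum of
// the five neighbouring input words divided by 4 (the "psrlw mm7, 2" above).
// The two border words on each side are left untouched, as in the original.
inline void sumRow5_ref(const unsigned short* im, unsigned short* im_out, int rowSize)
{
    for (int i = 2; i < rowSize - 2; ++i) {
        unsigned int s = im[i-2] + im[i-1] + im[i] + im[i+1] + im[i+2];
        im_out[i] = (unsigned short)(s >> 2);   // divide by 4
    }
}
// ---------------------------------------------------------------------------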
jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 3 psrlw mm2, 3 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-6*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx // 1 movq mm3, [edx] // mm3 = 4 words of im add ebx, 16 movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 3 psrlw mm2, 3 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-8*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 3 psrlw mm2, 3 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-10*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-12*width test eax, eax // Is there anything to do?" 
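// ---------------------------------------------------------------------------
// Scalar sketch (illustration; the _ref name and the N/shift parameters are
// introduced here) of the avg_Col_N family above: each output byte is the sum
// of N vertically adjacent words in the same column, centred on the current
// row, shifted right (>>3 for the 5/7/9 variants, >>4 for 11..17) and clamped
// to 255 exactly as packuswb does.  The caller must provide N/2 rows of margin
// above and below, as the MMX code assumes.
inline void avgColN_ref(const unsigned short* im, unsigned char* im_out,
                        int dataSize, int width, int N, int shift)
{
    int half = N / 2;                                    // rows above/below the centre
    for (int i = 0; i < dataSize; ++i) {
        unsigned int s = 0;
        for (int k = -half; k <= half; ++k)
            s += im[i + k*width];                        // same column, N consecutive rows
        s >>= shift;
        im_out[i] = (unsigned char)(s > 255 ? 255 : s);  // saturate like packuswb
    }
}
// ---------------------------------------------------------------------------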
jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-14*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-16*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi // ebx = ebx-4*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? 
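// ---------------------------------------------------------------------------
// How these separable kernels are typically combined (a sketch under the
// assumption that the row and column passes are chained to build a 5x5 box
// window over a correlation plane; boxFilter5x5_ref and rowBuf are introduced
// here and rely on the sumRow5_ref / avgColN_ref sketches above).
inline void boxFilter5x5_ref(const unsigned short* plane, unsigned short* rowBuf,
                             unsigned char* out, int width, int height)
{
    for (int i = 0; i < width*height; ++i) rowBuf[i] = 0;  // defined values at the untouched borders
    for (int y = 0; y < height; ++y)                       // horizontal 5-tap pass
        sumRow5_ref(plane + y*width, rowBuf + y*width, width);
    // vertical 5-row pass on the interior rows (two rows of margin kept)
    avgColN_ref(rowBuf + 2*width, out + 2*width, width*(height - 4), width, 5, 3);
}
// ---------------------------------------------------------------------------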
jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add // save [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi // ebx = ebx-4*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add // save [mm2 mm3] as words movq [ecx], mm3 movq [ecx+8], mm2 sub eax, 8 // Update the number of points left add ecx, 16 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } // compare bestScores and secondScores. if secondthresh // 0 otherwise pand mm3, mm2 pandn mm2, mm7 por mm3, mm2 movq [edx], mm3 sub eax, 8 // Update the number of points left add ebx, 8 // Update output pointer add ecx, 8 add edx, 8 jmp comp_loop // Loop //Cleanup end_loop: emms } } // windowWidth must be multiple of 8 inline void cropImage(const uchar* imSrc, int width, int height, uchar* imDest, int x0, int y0, int windowWidth, int windowHeight) { int w8 = windowWidth/8; int step = width-windowWidth; const uchar* srcNewOrigin = imSrc+x0+y0*width; __asm { mov ecx, windowHeight mov edx, w8 mov eax, srcNewOrigin mov ebx, imDest pixel_loop: movq mm1, [eax] movq [ebx], mm1 add eax, 8 add ebx, 8 dec edx jnz pixel_loop mov edx, w8 add eax, step dec ecx jnz pixel_loop jmp done done: emms; } } // return the average pixel value inline float pixelMean(const uchar* im, int imageSize) { int sum; __asm { mov ecx, imageSize shr ecx, 3 mov eax, im pxor mm7,mm7 // mm7 used as accumulator pxor mm0,mm0 // mm0 = 0 pixel_loop: movq mm1, [eax] movq mm2,mm1 punpcklbw mm2, mm0 punpckhbw mm1, mm0 paddw mm2,mm1 movq mm1,mm2 punpcklwd mm2, mm0 punpckhwd mm1, mm0 paddd mm2,mm1 paddd mm7,mm2 add eax, 8 dec ecx jnz pixel_loop jmp done done: movd ebx, mm7 psrlq mm7, 32 movd edx, mm7 add ebx, edx mov sum, ebx emms } return sum / (float)imageSize; } // ------------------------------------------------------------- // apply mask: // if mask[]=undefined_val im[]->im[] // otherwise, im[]->mask[] // ....... this one may not be exact :-( inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize) { __asm { // setup mm0 with 8 copies of 'undefined_val' mov al, undefined_val mov ah, al mov bx, ax shl eax, 16 mov ax, bx movd mm0, eax movd mm1, eax punpckldq mm0, mm1 mov ecx, imageSize shr ecx, 3 mov eax, im mov ebx, mask pixel_loop: movq mm1, [eax] movq mm2, [ebx] movq mm3, mm2 pcmpeqb mm3, mm0 // mm3[] -> xFF if mm2[]==undefined_val // -> x00 otherwise pand mm3, mm1 // mm3[] = mm1[] if mm2[]==undefined_val // = x00 otherwise por mm3, mm2 movq [eax], mm3 add eax, 8 add ebx, 8 dec ecx jnz pixel_loop jmp done done: emms } } inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize) { for (int i=0; i
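// ---------------------------------------------------------------------------
// Scalar sketch of the masking rule that overrideImageMMX above implements, per
// its header comment (the _ref name is introduced here): pixels where the mask
// equals 'undefined_val' keep their value, all others are overwritten by the
// mask.  Note that the MMX version ORs the kept pixel with the mask value,
// which is why its comment warns that it "may not be exact".
inline void overrideImage_ref(unsigned char* im, const unsigned char* mask,
                              unsigned char undefined_val, int imageSize)
{
    for (int i = 0; i < imageSize; ++i)
        if (mask[i] != undefined_val)
            im[i] = mask[i];
}
// ---------------------------------------------------------------------------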