1 | /***************************************************************************
|
---|
2 | *
|
---|
3 | * Copyright 2000 by David Demirdjian. All rights reserved.
|
---|
4 | *
|
---|
5 | * Developed by David Demirdjian
|
---|
6 | *
|
---|
7 | * Permission to use, copy, or modify this software and its documentation
|
---|
8 | * for educational and research purposes only and without fee is hereby
|
---|
9 | * granted, provided that this copyright notice and the original authors's
|
---|
10 | * names appear on all copies and supporting documentation. If individual
|
---|
11 | * files are separated from this distribution directory structure, this
|
---|
12 | * copyright notice must be included. For any other uses of this software,
|
---|
13 | * in original or modified form, including but not limited to distribution
|
---|
14 | * in whole or in part, specific prior permission must be obtained from
|
---|
15 | * MIT. These programs shall not be used, rewritten, or adapted as the
|
---|
16 | * basis of a commercial software or hardware product without first
|
---|
17 | * obtaining appropriate licenses from David Demirdjian. The author makes
|
---|
18 | * no representations about the suitability of this software for any purpose.
|
---|
19 | * It is provided "as is" without express or implied warranty.
|
---|
20 | *
|
---|
21 | **************************************************************************/
|
---|
22 | #include "stereoMatching.h"
|
---|
23 | #include "processingmmx.h"
|
---|
24 |
|
---|
25 | // ************************************************************
|
---|
26 | // ************************************************************
|
---|
27 | // *** List of functions (SSE2) for image processing
|
---|
28 | // ************************************************************
|
---|
29 | // ************************************************************
|
---|
30 |
|
---|
31 | // Src1, Src2 and Dest suppose to point on 16-bytes memory block
|
---|
32 | inline int ImgSubandAdd_sse2(const unsigned char *Src1, const unsigned char *Src2,
|
---|
33 | const unsigned char *Src3, unsigned char *Dest, int l)
|
---|
34 | {
|
---|
35 |
|
---|
36 | if (l < 8) return 0; // image size must be at least 8 bytes
|
---|
37 |
|
---|
38 | __asm
|
---|
39 | {
|
---|
40 | mov eax, Src1
|
---|
41 | mov ebx, Src2
|
---|
42 | mov edx, Src3
|
---|
43 | mov edi, Dest
|
---|
44 | mov ecx, l
|
---|
45 | shr ecx, 4
|
---|
46 |
|
---|
47 | align 16
|
---|
48 | inner_loop:
|
---|
49 | movdqa xmm1,[eax] // xmm1=src1
|
---|
50 | movdqa xmm2,[ebx] // mm2=src2
|
---|
51 |
|
---|
52 | movdqa xmm4,xmm1 // mm4=mm1
|
---|
53 |
|
---|
54 | psubusb xmm4,xmm2 // mm4 = src1 - src2
|
---|
55 |
|
---|
56 | movdqu xmm3,[edx] // mm3=src3
|
---|
57 | psubusb xmm2,xmm1 // mm2 = src2 - src1
|
---|
58 |
|
---|
59 | movdqa xmm5,xmm1 // mm5=src1
|
---|
60 | por xmm2,xmm4 // mm2=|src1-src2|
|
---|
61 |
|
---|
62 | psubusb xmm5,xmm3 // mm4=src1-src3
|
---|
63 |
|
---|
64 | psubusb xmm3,xmm1 // mm3=src3-src1
|
---|
65 |
|
---|
66 | por xmm3,xmm5 // mm3=|src1-src3|
|
---|
67 |
|
---|
68 | paddusb xmm2,xmm3 // mm2 = |src1-src2|+|src1-src3|
|
---|
69 |
|
---|
70 | movdqa [edi], xmm2
|
---|
71 | add eax,16
|
---|
72 | add ebx,16
|
---|
73 | add edx,16
|
---|
74 | add edi,16
|
---|
75 | dec ecx
|
---|
76 | jnz inner_loop
|
---|
77 | emms
|
---|
78 | }
|
---|
79 |
|
---|
80 | return 1;
|
---|
81 | }
|
---|
82 |
|
---|
83 |
|
---|
84 |
|
---|
85 |
|
---|
86 |
|
---|
87 |
|
---|
88 |
|
---|
// Accumulation step used inside the __asm blocks of the avg_Col_*_sse2
// kernels: adds the 16 words at [edx] to the running sums (xmm3 += [edx],
// xmm2 += [edx+16], unsigned saturating) and then advances edx by edi bytes.
#define macro_add_sse2 __asm        \
{                                   \
    __asm paddusw xmm3, [edx]       \
    __asm paddusw xmm2, [edx+16]    \
    __asm add     edx, edi          \
}
95 |
|
---|
96 |
|
---|
97 | inline void avg_Col_5_sse2(ushort* im, uchar* im_out, int dataSize, int width)
|
---|
98 | {
|
---|
99 | __asm {
|
---|
100 |
|
---|
101 | mov edi, width
|
---|
102 | shl edi, 1 // edi = 2*width
|
---|
103 |
|
---|
104 | mov eax, dataSize
|
---|
105 | mov ecx, im_out
|
---|
106 |
|
---|
107 | mov ebx, im
|
---|
108 | sub ebx, edi
|
---|
109 | sub ebx, edi // ebx = ebx-4*width
|
---|
110 |
|
---|
111 | test eax, eax // Is there anything to do?"
|
---|
112 | jz end_sum_loop // Jump out if necessary
|
---|
113 |
|
---|
114 | row_sum_loop:
|
---|
115 |
|
---|
116 | test eax, eax // Is there anything to do?
|
---|
117 | jz end_sum_loop // Jump out if necessary
|
---|
118 |
|
---|
119 | mov edx, ebx
|
---|
120 | add ebx, 32
|
---|
121 |
|
---|
122 | // 1
|
---|
123 | movdqa xmm3, [edx] // xmm3 = 8 words of im
|
---|
124 | movdqa xmm2, [edx+16] // xmm3 = 8 words of im
|
---|
125 | add edx, edi
|
---|
126 |
|
---|
127 | macro_add_sse2
|
---|
128 | macro_add_sse2
|
---|
129 | macro_add_sse2
|
---|
130 | macro_add_sse2
|
---|
131 |
|
---|
132 | // divide results by ...
|
---|
133 | psrlw xmm3, 3
|
---|
134 | psrlw xmm2, 3
|
---|
135 |
|
---|
136 | // convert [xmm2 xmm3] as 8 words
|
---|
137 | packuswb xmm3,xmm2
|
---|
138 | movdqa [ecx], xmm3
|
---|
139 |
|
---|
140 | sub eax, 16 // Update the number of points left
|
---|
141 | add ecx, 16 // Update output pointer
|
---|
142 |
|
---|
143 | jmp row_sum_loop // Loop
|
---|
144 |
|
---|
145 | //Cleanup
|
---|
146 | end_sum_loop:
|
---|
147 | emms
|
---|
148 | }
|
---|
149 | }
|
---|
150 |
|
---|
151 | inline void avg_Col_7_sse2(ushort* im, uchar* im_out, int dataSize, int width)
|
---|
152 | {
|
---|
153 | __asm {
|
---|
154 |
|
---|
155 | mov edi, width
|
---|
156 | shl edi, 1 // edi = 2*width
|
---|
157 |
|
---|
158 | mov eax, dataSize
|
---|
159 | mov ecx, im_out
|
---|
160 |
|
---|
161 | mov ebx, im
|
---|
162 | sub ebx, edi
|
---|
163 | sub ebx, edi
|
---|
164 | sub ebx, edi // ebx = ebx-4*width
|
---|
165 |
|
---|
166 | test eax, eax // Is there anything to do?"
|
---|
167 | jz end_sum_loop // Jump out if necessary
|
---|
168 |
|
---|
169 | row_sum_loop:
|
---|
170 |
|
---|
171 | test eax, eax // Is there anything to do?
|
---|
172 | jz end_sum_loop // Jump out if necessary
|
---|
173 |
|
---|
174 | mov edx, ebx
|
---|
175 | add ebx, 32
|
---|
176 |
|
---|
177 | // 1
|
---|
178 | movdqa xmm3, [edx] // xmm3 = 8 words of im
|
---|
179 | movdqa xmm2, [edx+16] // xmm3 = 8 words of im
|
---|
180 | add edx, edi
|
---|
181 |
|
---|
182 | macro_add_sse2
|
---|
183 | macro_add_sse2
|
---|
184 | macro_add_sse2
|
---|
185 | macro_add_sse2
|
---|
186 | macro_add_sse2
|
---|
187 | macro_add_sse2
|
---|
188 |
|
---|
189 | // divide results by ...
|
---|
190 | psrlw xmm3, 3
|
---|
191 | psrlw xmm2, 3
|
---|
192 |
|
---|
193 | // convert [xmm2 xmm3] as 8 words
|
---|
194 | packuswb xmm3,xmm2
|
---|
195 | movdqa [ecx], xmm3
|
---|
196 |
|
---|
197 | sub eax, 16 // Update the number of points left
|
---|
198 | add ecx, 16 // Update output pointer
|
---|
199 |
|
---|
200 | jmp row_sum_loop // Loop
|
---|
201 |
|
---|
202 | //Cleanup
|
---|
203 | end_sum_loop:
|
---|
204 | emms
|
---|
205 | }
|
---|
206 | }
|
---|
207 |
|
---|
208 | inline void avg_Col_9_sse2(ushort* im, uchar* im_out, int dataSize, int width)
|
---|
209 | {
|
---|
210 | __asm {
|
---|
211 |
|
---|
212 | mov edi, width
|
---|
213 | shl edi, 1 // edi = 2*width
|
---|
214 |
|
---|
215 | mov eax, dataSize
|
---|
216 | mov ecx, im_out
|
---|
217 |
|
---|
218 | mov ebx, im
|
---|
219 | sub ebx, edi
|
---|
220 | sub ebx, edi
|
---|
221 | sub ebx, edi
|
---|
222 | sub ebx, edi // ebx = ebx-4*width
|
---|
223 |
|
---|
224 | test eax, eax // Is there anything to do?"
|
---|
225 | jz end_sum_loop // Jump out if necessary
|
---|
226 |
|
---|
227 | row_sum_loop:
|
---|
228 |
|
---|
229 | test eax, eax // Is there anything to do?
|
---|
230 | jz end_sum_loop // Jump out if necessary
|
---|
231 |
|
---|
232 | mov edx, ebx
|
---|
233 | add ebx, 32
|
---|
234 |
|
---|
235 | // 1
|
---|
236 | movdqa xmm3, [edx] // xmm3 = 8 words of im
|
---|
237 | movdqa xmm2, [edx+16] // xmm3 = 8 words of im
|
---|
238 | add edx, edi
|
---|
239 |
|
---|
240 | macro_add_sse2
|
---|
241 | macro_add_sse2
|
---|
242 | macro_add_sse2
|
---|
243 | macro_add_sse2
|
---|
244 | macro_add_sse2
|
---|
245 | macro_add_sse2
|
---|
246 | macro_add_sse2
|
---|
247 | macro_add_sse2
|
---|
248 |
|
---|
249 | // divide results by ...
|
---|
250 | psrlw xmm3, 3
|
---|
251 | psrlw xmm2, 3
|
---|
252 |
|
---|
253 | // convert [xmm2 xmm3] as 8 words
|
---|
254 | packuswb xmm3,xmm2
|
---|
255 | movdqa [ecx], xmm3
|
---|
256 |
|
---|
257 | sub eax, 16 // Update the number of points left
|
---|
258 | add ecx, 16 // Update output pointer
|
---|
259 |
|
---|
260 | jmp row_sum_loop // Loop
|
---|
261 |
|
---|
262 | //Cleanup
|
---|
263 | end_sum_loop:
|
---|
264 | emms
|
---|
265 | }
|
---|
266 | }
|
---|
267 |
|
---|
268 | inline void avg_Col_11_sse2(ushort* im, uchar* im_out, int dataSize, int width)
|
---|
269 | {
|
---|
270 | __asm {
|
---|
271 |
|
---|
272 | mov edi, width
|
---|
273 | shl edi, 1 // edi = 2*width
|
---|
274 |
|
---|
275 | mov eax, dataSize
|
---|
276 | mov ecx, im_out
|
---|
277 |
|
---|
278 | mov ebx, im
|
---|
279 | sub ebx, edi
|
---|
280 | sub ebx, edi
|
---|
281 | sub ebx, edi
|
---|
282 | sub ebx, edi
|
---|
283 | sub ebx, edi // ebx = ebx-4*width
|
---|
284 |
|
---|
285 | test eax, eax // Is there anything to do?"
|
---|
286 | jz end_sum_loop // Jump out if necessary
|
---|
287 |
|
---|
288 | row_sum_loop:
|
---|
289 |
|
---|
290 | test eax, eax // Is there anything to do?
|
---|
291 | jz end_sum_loop // Jump out if necessary
|
---|
292 |
|
---|
293 | mov edx, ebx
|
---|
294 | add ebx, 32
|
---|
295 |
|
---|
296 | // 1
|
---|
297 | movdqa xmm3, [edx] // xmm3 = 8 words of im
|
---|
298 | movdqa xmm2, [edx+16] // xmm3 = 8 words of im
|
---|
299 | add edx, edi
|
---|
300 |
|
---|
301 | macro_add_sse2
|
---|
302 | macro_add_sse2
|
---|
303 | macro_add_sse2
|
---|
304 | macro_add_sse2
|
---|
305 | macro_add_sse2
|
---|
306 | macro_add_sse2
|
---|
307 | macro_add_sse2
|
---|
308 | macro_add_sse2
|
---|
309 | macro_add_sse2
|
---|
310 | macro_add_sse2
|
---|
311 |
|
---|
312 | // divide results by ...
|
---|
313 | psrlw xmm3, 3
|
---|
314 | psrlw xmm2, 3
|
---|
315 |
|
---|
316 | // convert [xmm2 xmm3] as 8 words
|
---|
317 | packuswb xmm3,xmm2
|
---|
318 | movdqa [ecx], xmm3
|
---|
319 |
|
---|
320 | sub eax, 16 // Update the number of points left
|
---|
321 | add ecx, 16 // Update output pointer
|
---|
322 |
|
---|
323 | jmp row_sum_loop // Loop
|
---|
324 |
|
---|
325 | //Cleanup
|
---|
326 | end_sum_loop:
|
---|
327 | emms
|
---|
328 | }
|
---|
329 | }
|
---|
330 |
|
---|
331 | inline void avg_Col_13_sse2(ushort* im, uchar* im_out, int dataSize, int width)
|
---|
332 | {
|
---|
333 | __asm {
|
---|
334 |
|
---|
335 | mov edi, width
|
---|
336 | shl edi, 1 // edi = 2*width
|
---|
337 |
|
---|
338 | mov eax, dataSize
|
---|
339 | mov ecx, im_out
|
---|
340 |
|
---|
341 | mov ebx, im
|
---|
342 | sub ebx, edi
|
---|
343 | sub ebx, edi
|
---|
344 | sub ebx, edi
|
---|
345 | sub ebx, edi
|
---|
346 | sub ebx, edi
|
---|
347 | sub ebx, edi // ebx = ebx-4*width
|
---|
348 |
|
---|
349 | test eax, eax // Is there anything to do?"
|
---|
350 | jz end_sum_loop // Jump out if necessary
|
---|
351 |
|
---|
352 | row_sum_loop:
|
---|
353 |
|
---|
354 | test eax, eax // Is there anything to do?
|
---|
355 | jz end_sum_loop // Jump out if necessary
|
---|
356 |
|
---|
357 | mov edx, ebx
|
---|
358 | add ebx, 32
|
---|
359 |
|
---|
360 | // 1
|
---|
361 | movdqa xmm3, [edx] // xmm3 = 8 words of im
|
---|
362 | movdqa xmm2, [edx+16] // xmm3 = 8 words of im
|
---|
363 | add edx, edi
|
---|
364 |
|
---|
365 | macro_add_sse2
|
---|
366 | macro_add_sse2
|
---|
367 | macro_add_sse2
|
---|
368 | macro_add_sse2
|
---|
369 | macro_add_sse2
|
---|
370 | macro_add_sse2
|
---|
371 | macro_add_sse2
|
---|
372 | macro_add_sse2
|
---|
373 | macro_add_sse2
|
---|
374 | macro_add_sse2
|
---|
375 | macro_add_sse2
|
---|
376 | macro_add_sse2
|
---|
377 |
|
---|
378 | // divide results by ...
|
---|
379 | psrlw xmm3, 3
|
---|
380 | psrlw xmm2, 3
|
---|
381 |
|
---|
382 | // convert [xmm2 xmm3] as 8 words
|
---|
383 | packuswb xmm3,xmm2
|
---|
384 | movdqa [ecx], xmm3
|
---|
385 |
|
---|
386 | sub eax, 16 // Update the number of points left
|
---|
387 | add ecx, 16 // Update output pointer
|
---|
388 |
|
---|
389 | jmp row_sum_loop // Loop
|
---|
390 |
|
---|
391 | //Cleanup
|
---|
392 | end_sum_loop:
|
---|
393 | emms
|
---|
394 | }
|
---|
395 | }
|
---|
396 |
|
---|
397 | // apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
|
---|
398 | // result in 'im_out'
|
---|
399 | inline void avg_Col_sse2(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
|
---|
400 | {
|
---|
401 | switch (sizeMask)
|
---|
402 | {
|
---|
403 | case 5: avg_Col_5_sse2(im,im_out,dataSize,width);
|
---|
404 | break;
|
---|
405 | case 7: avg_Col_7_sse2(im,im_out,dataSize,width);
|
---|
406 | break;
|
---|
407 | case 9: avg_Col_9_sse2(im,im_out,dataSize,width);
|
---|
408 | break;
|
---|
409 | case 11: avg_Col_11_sse2(im,im_out,dataSize,width);
|
---|
410 | break;
|
---|
411 | case 13: avg_Col_13_sse2(im,im_out,dataSize,width);
|
---|
412 | break;
|
---|
413 | case 15: avg_Col_15(im,im_out,dataSize,width);
|
---|
414 | break;
|
---|
415 | case 17: avg_Col_17(im,im_out,dataSize,width);
|
---|
416 | break;
|
---|
417 |
|
---|
418 | default: if (sizeMask<5) avg_Col_5_sse2(im,im_out,dataSize,width);
|
---|
419 | else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
|
---|
420 | }
|
---|
421 | }
|
---|