Diffstat (limited to 'Src/h264dec/ldecod/src/prediction.asm')
-rw-r--r--  Src/h264dec/ldecod/src/prediction.asm | 1626
1 file changed, 1626 insertions(+), 0 deletions(-)
diff --git a/Src/h264dec/ldecod/src/prediction.asm b/Src/h264dec/ldecod/src/prediction.asm
new file mode 100644
index 00000000..33d3d499
--- /dev/null
+++ b/Src/h264dec/ldecod/src/prediction.asm
@@ -0,0 +1,1626 @@
+.686
+.XMM
+.model FLAT
+
+copy_image_data_16x16_stride@OptimizedFunctions = 32
+dec_picture@VideoParameters = 698192
+p_Slice@MacroBlock = 0
+plane_images@StorablePicture = 158512
+mb_rec@Slice = 1696
+mb_pred@Slice = 928
+cof@Slice = 2464
+
+CONST SEGMENT
+align 16
+const32 DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H
+CONST ENDS
+
+;
+;
+;
+;
+
+PUBLIC _weighted_bi_prediction4x4
+_TEXT SEGMENT
+mb_pred = 4
+block_l0 = 8
+wp_scale_l0 = 12
+wp_scale_l1 = 16
+wp_offset = 20
+weight_denom = 24
+_weighted_bi_prediction4x4 PROC ; COMDAT
+        mov eax, DWORD PTR weight_denom[esp]
+        pxor mm0, mm0
+        pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0
+        test eax, eax
+        pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0
+        pshufw mm3, MMWORD PTR wp_offset[esp], 0
+        jle BI_PRED4x4@LEFT_SHIFT
+
+        movd mm4, eax
+        lea ecx, DWORD PTR [eax-1] ;
+        mov edx, 1
+        shl edx, cl
+        movd mm5, edx
+        mov eax, mb_pred[esp]
+        mov edx, block_l0[esp]
+        pshufw mm5, mm5, 0
+        movd mm6, DWORD PTR 0[edx] ; block_l0
+        movd mm7, DWORD PTR 0[eax] ; mb_pred
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        movd mm7, DWORD PTR 16[eax] ; mb_pred
+        paddw mm6, mm5
+        psraw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 0[eax], mm6
+
+        movd mm6, DWORD PTR 16[edx] ; block_l0
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        movd mm7, DWORD PTR 32[eax] ; mb_pred
+        paddw mm6, mm5
+        psraw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 16[eax], mm6
+
+        movd mm6, DWORD PTR 32[edx] ; block_l0
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        movd mm7, DWORD PTR 48[eax] ; mb_pred
+        paddw mm6, mm5
+        psraw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 32[eax], mm6
+
+        movd mm6, DWORD PTR 48[edx] ; block_l0
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        paddw mm6, mm5
+        psraw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 48[eax], mm6
+        ret 0
+
+BI_PRED4x4@LEFT_SHIFT:
+        neg eax
+        movd mm4, eax
+        mov eax, mb_pred[esp]
+        mov edx, block_l0[esp]
+        movd mm6, DWORD PTR 0[edx] ; block_l0
+        movd mm7, DWORD PTR 0[eax] ; mb_pred
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        movd mm7, DWORD PTR 16[eax] ; mb_pred
+        psllw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 0[eax], mm6
+
+        movd mm6, DWORD PTR 16[edx] ; block_l0
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        movd mm7, DWORD PTR 32[eax] ; mb_pred
+        psllw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 16[eax], mm6
+
+        movd mm6, DWORD PTR 32[edx] ; block_l0
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        movd mm7, DWORD PTR 48[eax] ; mb_pred
+        psllw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 32[eax], mm6
+
+        movd mm6, DWORD PTR 48[edx] ; block_l0
+        punpcklbw mm6, mm0
+        punpcklbw mm7, mm0
+        pmullw mm6, mm1
+        pmullw mm7, mm2
+        paddw mm6, mm7
+        psllw mm6, mm4
+        paddw mm6, mm3
+        packuswb mm6, mm6
+        movd DWORD PTR 48[eax], mm6
+        ret 0
+_weighted_bi_prediction4x4 ENDP
+_TEXT ENDS
+
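+; Plain-C reference for the routine above (an illustrative sketch, not part
+; of the original source; it assumes the 16-byte row stride visible in the
+; 0/16/32/48 addressing and the H.264 explicit weighted bi-prediction
+; formula the code follows; names are hypothetical):
+;
+;   static void weighted_bi_prediction4x4_c(uint8_t *mb_pred, const uint8_t *block_l0,
+;                                           int wp_scale_l0, int wp_scale_l1,
+;                                           int wp_offset, int weight_denom)
+;   {
+;       for (int y = 0; y < 4; y++)
+;           for (int x = 0; x < 4; x++) {
+;               int v = block_l0[y * 16 + x] * wp_scale_l0
+;                     + mb_pred[y * 16 + x] * wp_scale_l1;
+;               v = (weight_denom > 0)
+;                     ? (v + (1 << (weight_denom - 1))) >> weight_denom
+;                     : v << -weight_denom;          // no rounding on left shift
+;               v += wp_offset;
+;               // packuswb performs this 0..255 saturation
+;               mb_pred[y * 16 + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
+;           }
+;   }
+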
+PUBLIC _itrans4x4_mmx
+_TEXT SEGMENT
+_tblock$ = 4 ; size = 4
+_mb_pred$ = 8 ; size = 4
+_mb_rec$ = 12 ; size = 4
+_pos_x$ = 16 ; size = 4
+_pos_y$ = 20 ; size = 4
+_itrans4x4_mmx PROC ; COMDAT
+
+        mov edx, DWORD PTR _pos_y$[esp]
+        shl edx, 4
+        add edx, DWORD PTR _pos_x$[esp]
+        mov eax, DWORD PTR _tblock$[esp]
+        mov ecx, DWORD PTR _mb_pred$[esp]
+        add ecx, edx
+        add edx, DWORD PTR _mb_rec$[esp]
+_itrans4x4_mmx_direct PROC ; COMDAT
+        ; load 4x4 matrix
+        movq mm0, MMWORD PTR 0[eax]
+        movq mm1, MMWORD PTR 8[eax]
+        movq mm2, MMWORD PTR 16[eax]
+        movq mm3, MMWORD PTR 24[eax]
+
+        ; rotate 4x4 matrix
+        movq mm4, mm0 ; p0 = mm4 (copy)
+        punpcklwd mm0, mm2 ; r0 = mm0
+        punpckhwd mm4, mm2 ; r2 = mm4
+        movq mm5, mm1 ; p1 = mm5 (copy)
+        punpcklwd mm1, mm3 ; r1 = mm1
+        punpckhwd mm5, mm3 ; r3 = mm5
+        movq mm6, mm0 ; r0 = mm6 (copy)
+        punpcklwd mm0, mm1 ; t0 = mm0
+        punpckhwd mm6, mm1 ; t1 = mm6
+        movq mm1, mm4 ; r2 = mm1 (copy)
+        punpcklwd mm1, mm5 ; t2 = mm1
+        punpckhwd mm4, mm5 ; t3 = mm4
+
+        movq mm2, mm0 ; mm2 = t0 (copy)
+        paddw mm0, mm1 ; mm0 = p0
+        psubw mm2, mm1 ; mm2 = p1, mm1 available
+        movq mm5, mm6 ; mm5 = t1 (copy)
+        psraw mm5, 1 ; mm5 = (t1 >> 1)
+        psubw mm5, mm4 ; mm5 = p2
+        psraw mm4, 1 ; mm4 = (t3 >> 1)
+        paddw mm6, mm4 ; mm6 = p3
+
+        movq mm3, mm0 ; mm3 = p0 (copy)
+        paddw mm0, mm6 ; mm0 = r0
+        movq mm1, mm2 ; mm1 = p1 (copy)
+        paddw mm1, mm5 ; mm1 = r1
+        psubw mm2, mm5 ; mm2 = r2, mm5 available
+        psubw mm3, mm6 ; mm3 = r3
+
+        ; rotate 4x4 matrix to set up for vertical
+        movq mm4, mm0 ; r0 = mm4 (copy)
+        punpcklwd mm0, mm2 ; p0 = mm0
+        punpckhwd mm4, mm2 ; p2 = mm4
+        movq mm5, mm1 ; r1 = mm5 (copy)
+        punpcklwd mm1, mm3 ; p1 = mm1
+        punpckhwd mm5, mm3 ; p3 = mm5
+        movq mm6, mm0 ; p0 = mm6 (copy)
+        punpcklwd mm0, mm1 ; t0 = mm0
+        punpckhwd mm6, mm1 ; t1 = mm6
+        movq mm1, mm4 ; p2 = mm1 (copy)
+        punpcklwd mm1, mm5 ; t2 = mm1
+        punpckhwd mm4, mm5 ; t3 = mm4
+
+        movq mm2, mm0 ; mm2 = t0 (copy)
+        paddw mm0, mm1 ; mm0 = p0
+        psubw mm2, mm1 ; mm2 = p1, mm1 available
+        movq mm5, mm6 ; mm5 = t1 (copy)
+        psraw mm5, 1 ; mm5 = (t1 >> 1)
+        psubw mm5, mm4 ; mm5 = p2
+        psraw mm4, 1 ; mm4 = (t3 >> 1)
+        paddw mm6, mm4 ; mm6 = p3
+        movq mm3, mm0 ; mm3 = p0 (copy)
+        paddw mm0, mm6 ; mm0 = r0
+        movq mm1, mm2 ; mm1 = p1 (copy)
+        paddw mm1, mm5 ; mm1 = r1
+        psubw mm2, mm5 ; mm2 = r2, mm5 available
+        psubw mm3, mm6 ; mm3 = r3
+
+
+; --- 4x4 iDCT done, now time to combine with mpr ---
+
+        movq mm7, MMWORD PTR const32
+
+        paddw mm0, mm7 ; rres + 32
+        psraw mm0, 6 ; (rres + 32) >> 6
+        paddw mm1, mm7 ; rres + 32
+        psraw mm1, 6 ; (rres + 32) >> 6
+        paddw mm2, mm7 ; rres + 32
+        psraw mm2, 6 ; (rres + 32) >> 6
+        paddw mm3, mm7 ; rres + 32
+        psraw mm3, 6 ; (rres + 32) >> 6
+
+        pxor mm7, mm7
+
+        ; convert mpr from unsigned char to short
+        movd mm4, DWORD PTR 0[ecx]
+        movd mm5, DWORD PTR 16[ecx]
+        movd mm6, DWORD PTR 32[ecx]
+        punpcklbw mm4, mm7
+        punpcklbw mm5, mm7
+        punpcklbw mm6, mm7
+        paddsw mm4, mm0 ; pred_row + rres_row
+        movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
+        paddsw mm5, mm1 ; pred_row + rres_row
+        punpcklbw mm0, mm7
+        paddsw mm6, mm2 ; pred_row + rres_row
+        paddsw mm0, mm3 ; pred_row + rres_row
+        ; results in mm4, mm5, mm6, mm0
+
+        ; move back to 8 bit
+        packuswb mm4, mm7
+        packuswb mm5, mm7
+        packuswb mm6, mm7
+        packuswb mm0, mm7
+        movd DWORD PTR 0[edx], mm4
+        movd DWORD PTR 16[edx], mm5
+        movd DWORD PTR 32[edx], mm6
+        movd DWORD PTR 48[edx], mm0
+        ret 0
+
+_itrans4x4_mmx_direct ENDP
+_itrans4x4_mmx ENDP
+_TEXT ENDS
+
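+; Plain-C reference for one pass of the transform above (a sketch of the
+; standard H.264 4x4 inverse integer transform; not part of the original
+; source). The MMX code runs this butterfly on the transposed coefficients,
+; transposes again, reruns it, then reconstructs:
+;
+;   /* one 1-D butterfly over (t0, t1, t2, t3) */
+;   p0 = t0 + t2;
+;   p1 = t0 - t2;
+;   p2 = (t1 >> 1) - t3;
+;   p3 = t1 + (t3 >> 1);
+;   r0 = p0 + p3;  r1 = p1 + p2;  r2 = p1 - p2;  r3 = p0 - p3;
+;
+;   /* reconstruction, per pixel (clip255 = saturate to 0..255) */
+;   mb_rec[y][x] = clip255(mb_pred[y][x] + ((rres[y][x] + 32) >> 6));
+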
+EXTRN _itrans_sp:PROC
+EXTRN _Inv_Residual_trans_4x4:PROC
+PUBLIC _iMBtrans4x4
+EXTRN _opt:BYTE
+_TEXT SEGMENT
+_currSlice$ = -4 ; size = 4
+_mb_rec$166704 = 8 ; size = 4
+_currMB$ = 8 ; size = 4
+_curr_img$ = 12 ; size = 4
+_pl$ = 8 ; second parameter
+_smb$ = 16 ; size = 4
+_iMBtrans4x4 PROC
+        push ecx
+        push ebx
+        push ebp
+        push esi
+STACKOFFSET = 16
+; 408 : VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;
+
+        mov esi, DWORD PTR _pl$[esp+STACKOFFSET]
+        push edi
+STACKOFFSET = STACKOFFSET + 4
+        mov edi, DWORD PTR _currMB$[esp+16]
+        mov ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
+        mov eax, DWORD PTR [edi+4]
+        mov eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
+        mov DWORD PTR _currSlice$[esp+20], ebp
+        mov ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture]
+        mov DWORD PTR _curr_img$[esp+16], ecx
+
+        cmp DWORD PTR _smb$[esp+16], 0 ; if (smb)
+; 413 : {
+; 414 : h264_short_block_t *blocks = currSlice->cof4[pl];
+; 415 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+; 416 :
+; 417 : itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);
+
+        je $LN4@iMBtrans4x
+        push 0
+        push 0
+        mov eax, esi
+        shl eax, 9
+        lea ebx, DWORD PTR [eax+ebp+cof@Slice]
+        mov ecx, esi
+        shl ecx, 8
+        lea ebp, DWORD PTR [ecx+ebp+mb_pred@Slice]
+        push esi
+        push ebp
+        push ebx
+        mov eax, edi
+        call _itrans_sp
+
+; 418 : itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);
+
+        push 0
+        push 4
+        push esi
+        lea edx, DWORD PTR [ebx+32]
+        push ebp
+        push edx
+        mov eax, edi
+        call _itrans_sp
+
+; 419 : itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);
+
+        push 4
+        push 0
+        push esi
+        lea eax, DWORD PTR [ebx+64]
+        push ebp
+        push eax
+        mov eax, edi
+        call _itrans_sp
+
+; 420 : itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);
+
+        push 4
+        push 4
+        push esi
+        lea ecx, DWORD PTR [ebx+96]
+        push ebp
+        push ecx
+        mov eax, edi
+        call _itrans_sp
+        add esp, 80 ; 00000050H
+
+; 421 : itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);
+
+        push 0
+        push 8
+        push esi
+        lea edx, DWORD PTR [ebx+128]
+        push ebp
+        push edx
+        mov eax, edi
+        call _itrans_sp
+
+; 422 : itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);
+
+        push 0
+        push 12 ; 0000000cH
+        push esi
+        lea eax, DWORD PTR [ebx+160]
+        push ebp
+        push eax
+        mov eax, edi
+        call _itrans_sp
+
+; 423 : itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);
+
+        push 4
+        push 8
+        push esi
+        lea ecx, DWORD PTR [ebx+192]
+        push ebp
+        push ecx
+        mov eax, edi
+        call _itrans_sp
+
+; 424 : itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);
+
+        push 4
+        push 12 ; 0000000cH
+        push esi
+        lea edx, DWORD PTR [ebx+224]
+        push ebp
+        push edx
+        mov eax, edi
+        call _itrans_sp
+        add esp, 80 ; 00000050H
+
+; 425 : itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);
+
+        push 8
+        push 0
+        push esi
+        lea eax, DWORD PTR [ebx+256]
+        push ebp
+        push eax
+        mov eax, edi
+        call _itrans_sp
+
+; 426 : itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);
+
+        push 8
+        push 4
+        push esi
+        push ebp
+        lea ecx, DWORD PTR [ebx+288]
+        push ecx
+        mov eax, edi
+        call _itrans_sp
+
+; 427 : itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);
+
+        push 12 ; 0000000cH
+        push 0
+        push esi
+        lea edx, DWORD PTR [ebx+320]
+        push ebp
+        push edx
+        mov eax, edi
+        call _itrans_sp
+
+; 428 : itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);
+
+        push 12 ; 0000000cH
+        push 4
+        push esi
+        lea eax, DWORD PTR [ebx+352]
+        push ebp
+        push eax
+        mov eax, edi
+        call _itrans_sp
+        add esp, 80 ; 00000050H
+
+; 429 : itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);
+
+        push 8
+        push 8
+        push esi
+        lea ecx, DWORD PTR [ebx+384]
+        push ebp
+        push ecx
+        mov eax, edi
+        call _itrans_sp
+
+; 430 : itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);
+
+        push 8
+        push 12 ; 0000000cH
+        push esi
+        lea edx, DWORD PTR [ebx+416]
+        push ebp
+        push edx
+        mov eax, edi
+        call _itrans_sp
+
+; 431 : itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);
+
+        push 12 ; 0000000cH
+        push 8
+        push esi
+        lea eax, DWORD PTR [ebx+448]
+        push ebp
+        push eax
+        mov eax, edi
+        call _itrans_sp
+
+; 432 : itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);
+
+        push 12 ; 0000000cH
+        push 12 ; 0000000cH
+        push esi
+        add ebx, 480 ; 000001e0H
+        push ebp
+        push ebx
+        mov eax, edi
+        call _itrans_sp
+        mov ebp, DWORD PTR _currSlice$[esp+100]
+        add esp, 80 ; 00000050H
+        jmp COPY_16x16
+
+$LN4@iMBtrans4x:
+
+; 433 : }
+; 434 : else if (currMB->is_lossless)
+
+        cmp DWORD PTR [edi+84], 0
+        je $LN2@iMBtrans4x
+
+        push 0
+        push 0
+
+; 435 : {
+; 436 : Inv_Residual_trans_4x4(currMB, pl, 0, 0);
+
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 437 : Inv_Residual_trans_4x4(currMB, pl, 4, 0);
+
+        push 0
+        push 4
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 438 : Inv_Residual_trans_4x4(currMB, pl, 0, 4);
+
+        push 4
+        push 0
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 439 : Inv_Residual_trans_4x4(currMB, pl, 4, 4);
+
+        push 4
+        push 4
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+        add esp, 64 ; 00000040H
+
+; 440 : Inv_Residual_trans_4x4(currMB, pl, 8, 0);
+
+        push 0
+        push 8
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 441 : Inv_Residual_trans_4x4(currMB, pl, 12, 0);
+
+        push 0
+        push 12 ; 0000000cH
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 442 : Inv_Residual_trans_4x4(currMB, pl, 8, 4);
+
+        push 4
+        push 8
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 443 : Inv_Residual_trans_4x4(currMB, pl, 12, 4);
+
+        push 4
+        push 12 ; 0000000cH
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+        add esp, 64 ; 00000040H
+
+; 444 : Inv_Residual_trans_4x4(currMB, pl, 0, 8);
+
+        push 8
+        push 0
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 445 : Inv_Residual_trans_4x4(currMB, pl, 4, 8);
+
+        push 8
+        push 4
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 446 : Inv_Residual_trans_4x4(currMB, pl, 0, 12);
+
+        push 12 ; 0000000cH
+        push 0
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 447 : Inv_Residual_trans_4x4(currMB, pl, 4, 12);
+
+        push 12 ; 0000000cH
+        push 4
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+        add esp, 64 ; 00000040H
+
+; 448 : Inv_Residual_trans_4x4(currMB, pl, 8, 8);
+
+        push 8
+        push 8
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 449 : Inv_Residual_trans_4x4(currMB, pl, 12, 8);
+
+        push 8
+        push 12 ; 0000000cH
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 450 : Inv_Residual_trans_4x4(currMB, pl, 8, 12);
+
+        push 12 ; 0000000cH
+        push 8
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+
+; 451 : Inv_Residual_trans_4x4(currMB, pl, 12, 12);
+
+        push 12 ; 0000000cH
+        push 12 ; 0000000cH
+        push esi
+        push edi
+        call _Inv_Residual_trans_4x4
+        add esp, 64 ; 00000040H
+
+; 452 : }
+; 453 : else
+
+        jmp COPY_16x16
+$LN2@iMBtrans4x:
+
+; 454 : {
+; 455 : const h264_short_block_t *blocks = currSlice->cof4[pl];
+; 456 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+
+        mov edx, esi
+        mov ecx, esi
+        shl edx, 8
+        shl ecx, 9
+        lea eax, DWORD PTR [edx+ebp]
+        lea ebx, DWORD PTR [ecx+ebp+cof@Slice]
+
+; 457 : h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+
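+; Layout of the 16 4x4 blocks inside the 16x16 macroblock (H.264 scan order;
+; each block is 32 bytes of coefficients, so blocks[n] sits at cof + n*32).
+; This diagram is derived from the (pos_x, pos_y) pairs in the calls below,
+; not from the original source:
+;
+;    0  1 |  4  5
+;    2  3 |  6  7
+;   ------+------
+;    8  9 | 12 13
+;   10 11 | 14 15
+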
+        ; put things in registers that itrans4x4_mmx_direct wants
+        lea edx, [eax + mb_rec@Slice] ; mb_rec
+        lea ecx, [eax + mb_pred@Slice] ; mb_pred
+        mov eax, ebx ; blocks
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+32]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+128]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+160]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);
+
+        ; second row
+        lea edx, [edx+52]
+        lea ecx, [ecx+52]
+        lea eax, [ebx+64]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+96]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+192]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+224]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);
+
+        ; third row
+        lea edx, [edx+52]
+        lea ecx, [ecx+52]
+        lea eax, [ebx+256]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+288]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+384]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+416]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);
+
+        ; fourth row
+        lea edx, [edx+52]
+        lea ecx, [ecx+52]
+        lea eax, [ebx+320]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+352]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+448]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);
+
+        lea edx, [edx+4]
+        lea ecx, [ecx+4]
+        lea eax, [ebx+480]
+        call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);
+COPY_16x16:
+
+; construct picture from 4x4 blocks
+; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
+
+        mov eax, DWORD PTR [edi+40]
+        mov ecx, DWORD PTR [edi+36]
+        shl esi, 8
+        lea edx, DWORD PTR [esi+ebp+mb_rec@Slice]
+        push edx
+        mov edx, DWORD PTR _curr_img$[esp+20]
+        push eax
+        push ecx
+        push edx
+        call DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
+        add esp, 16 ; 00000010H
+        pop edi
+        pop esi
+        pop ebp
+        pop ebx
+        pop ecx
+        ret 0
+_iMBtrans4x4 ENDP
+_TEXT ENDS
+
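+; Plain-C reference for one 1-D pass of the 8x8 transform below (a sketch
+; restating the butterfly already spelled out in the ";;;" intrinsics
+; comments; not part of the original source):
+;
+;   a0 = p0 + p4;            a1 = p0 - p4;
+;   a2 = p6 - (p2 >> 1);     a3 = p2 + (p6 >> 1);
+;   b0 = a0 + a3;  b2 = a1 - a2;  b4 = a1 + a2;  b6 = a0 - a3;
+;
+;   a0 = -p3 + p5 - p7 - (p7 >> 1);
+;   a1 =  p1 + p7 - p3 - (p3 >> 1);
+;   a2 = -p1 + p7 + p5 + (p5 >> 1);
+;   a3 =  p3 + p5 + p1 + (p1 >> 1);
+;   b1 = a0 + (a3 >> 2);  b3 = a1 + (a2 >> 2);
+;   b5 = a2 - (a1 >> 2);  b7 = a3 - (a0 >> 2);
+;
+;   r0 = b0 + b7;  r1 = b2 - b5;  r2 = b4 + b3;  r3 = b6 + b1;
+;   r4 = b6 - b1;  r5 = b4 - b3;  r6 = b2 + b5;  r7 = b0 - b7;
+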
+_TEXT SEGMENT
+        ALIGN 2
+        PUBLIC _itrans8x8_sse2
+_itrans8x8_sse2 PROC NEAR
+; parameter 1(mb_rec): 8 + ebp
+; parameter 2(mb_pred): 12 + ebp
+; parameter 3(block): 16 + ebp
+; parameter 4(pos_x): 20 + ebp
+        push ebp
+        mov ebp, esp
+        and esp, -16
+        sub esp, 176
+        mov edx, DWORD PTR [ebp+20]
+        mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
+        add ecx, edx
+        add edx, DWORD PTR [ebp+12] ; edx: mb_pred
+        mov eax, DWORD PTR [ebp+16] ; eax: block
+
+;;; __m128i a0, a1, a2, a3;
+;;; __m128i p0, p1, p2, p3, p4, p5 ,p6, p7;
+;;; __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+;;; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
+;;; __m128i const32, zero;
+;;; __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
+;;; __m128i pred0, pred1;
+;;;
+;;; const32 = _mm_load_si128((const __m128i *)c32);
+
+        movdqa xmm0, XMMWORD PTR const32
+
+;;; zero = _mm_setzero_si128();
+;;;
+;;; // Horizontal
+;;; b0 = _mm_load_si128((__m128i *)(block[0]));
+
+        movdqa xmm4, XMMWORD PTR [eax]
+
+;;; b1 = _mm_load_si128((__m128i *)(block[1]));
+
+        movdqa xmm7, XMMWORD PTR [eax+16]
+
+;;; b2 = _mm_load_si128((__m128i *)(block[2]));
+
+        movdqa xmm5, XMMWORD PTR [eax+32]
+
+;;; b3 = _mm_load_si128((__m128i *)(block[3]));
+
+        movdqa xmm3, XMMWORD PTR [eax+48]
+
+;;; b4 = _mm_load_si128((__m128i *)(block[4]));
+
+        movdqa xmm6, XMMWORD PTR [eax+64]
+
+;;; b5 = _mm_load_si128((__m128i *)(block[5]));
+;;; b6 = _mm_load_si128((__m128i *)(block[6]));
+
+        movdqa xmm1, XMMWORD PTR [eax+96]
+
+;;; b7 = _mm_load_si128((__m128i *)(block[7]));
+
+        movdqa xmm2, XMMWORD PTR [eax+112]
+        movdqa XMMWORD PTR [esp], xmm0
+        movdqa xmm0, XMMWORD PTR [eax+80]
+        movdqa XMMWORD PTR [esp+16], xmm2
+
+;;;
+;;; /* rotate 8x8 (ugh) */
+;;; r0 = _mm_unpacklo_epi16(b0, b2);
+
+        movdqa xmm2, xmm4
+        punpcklwd xmm2, xmm5
+
+;;; r1 = _mm_unpacklo_epi16(b1, b3);
+;;; r2 = _mm_unpackhi_epi16(b0, b2);
+
+        punpckhwd xmm4, xmm5
+
+;;; r3 = _mm_unpackhi_epi16(b1, b3);
+;;; r4 = _mm_unpacklo_epi16(b4, b6);
+;;; r5 = _mm_unpacklo_epi16(b5, b7);
+
+        movdqa xmm5, xmm0
+        movdqa XMMWORD PTR [esp+32], xmm2
+        movdqa xmm2, xmm7
+        punpcklwd xmm2, xmm3
+        punpckhwd xmm7, xmm3
+        movdqa xmm3, xmm6
+        punpcklwd xmm3, xmm1
+        movdqa XMMWORD PTR [esp+48], xmm3
+        movdqa xmm3, XMMWORD PTR [esp+16]
+        punpcklwd xmm5, xmm3
+
+;;; r6 = _mm_unpackhi_epi16(b4, b6);
+
+        punpckhwd xmm6, xmm1
+
+;;; r7 = _mm_unpackhi_epi16(b5, b7);
+
+        punpckhwd xmm0, xmm3
+
+;;;
+;;; b0 = _mm_unpacklo_epi16(r0, r1);
+
+        movdqa xmm3, XMMWORD PTR [esp+32]
+        movdqa xmm1, xmm3
+        punpcklwd xmm1, xmm2
+
+;;; b1 = _mm_unpackhi_epi16(r0, r1);
+
+        punpckhwd xmm3, xmm2
+
+;;; b2 = _mm_unpacklo_epi16(r2, r3);
+
+        movdqa xmm2, xmm4
+        punpcklwd xmm2, xmm7
+
+;;; b3 = _mm_unpackhi_epi16(r2, r3);
+
+        punpckhwd xmm4, xmm7
+        movdqa XMMWORD PTR [esp+64], xmm4
+
+;;; b4 = _mm_unpacklo_epi16(r4, r5);
+
+        movdqa xmm4, XMMWORD PTR [esp+48]
+        movdqa xmm7, xmm4
+        punpcklwd xmm7, xmm5
+
+;;; b5 = _mm_unpackhi_epi16(r4, r5);
+
+        punpckhwd xmm4, xmm5
+
+;;; b6 = _mm_unpacklo_epi16(r6, r7);
+
+        movdqa xmm5, xmm6
+        punpcklwd xmm5, xmm0
+
+;;; b7 = _mm_unpackhi_epi16(r6, r7);
+
+        punpckhwd xmm6, xmm0
+
+;;;
+;;; p0 = _mm_unpacklo_epi64(b0, b4);
+
+        movdqa xmm0, xmm1
+        punpcklqdq xmm0, xmm7
+
+;;; p1 = _mm_unpackhi_epi64(b0, b4);
+
+        punpckhqdq xmm1, xmm7
+        movdqa XMMWORD PTR [esp+16], xmm1
+
+;;; p2 = _mm_unpacklo_epi64(b1, b5);
+
+        movdqa xmm1, xmm3
+        punpcklqdq xmm1, xmm4
+
+;;; p3 = _mm_unpackhi_epi64(b1, b5);
+;;; p4 = _mm_unpacklo_epi64(b2, b6);
+;;; p5 = _mm_unpackhi_epi64(b2, b6);
+;;; p6 = _mm_unpacklo_epi64(b3, b7);
+;;; p7 = _mm_unpackhi_epi64(b3, b7);
+;;;
+;;; /* perform approx DCT */
+;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
+;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+
+        movdqa xmm7, xmm1
+        psraw xmm7, 1
+        punpckhqdq xmm3, xmm4
+        movdqa XMMWORD PTR [esp+32], xmm3
+        movdqa xmm3, xmm2
+        punpcklqdq xmm3, xmm5
+        punpckhqdq xmm2, xmm5
+        movdqa xmm5, XMMWORD PTR [esp+64]
+        movdqa xmm4, xmm5
+        punpcklqdq xmm4, xmm6
+        punpckhqdq xmm5, xmm6
+        movdqa xmm6, xmm0
+        paddw xmm6, xmm3
+        psubw xmm0, xmm3
+
+;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+
+        movdqa xmm3, xmm4
+
+;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+
+        psraw xmm4, 1
+        psubw xmm3, xmm7
+
+;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+        paddw xmm1, xmm4
+
+;;;
+;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
+
+        movdqa xmm4, xmm6
+
+;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
+
+        movdqa xmm7, xmm0
+        paddw xmm4, xmm1
+        psubw xmm7, xmm3
+        movdqa XMMWORD PTR [esp+48], xmm7
+
+;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
+
+        paddw xmm0, xmm3
+        movdqa XMMWORD PTR [esp+80], xmm0
+
+;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
+;;;
+;;; //-p3 + p5 - p7 - (p7 >> 1);
+;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+
+        movdqa xmm0, XMMWORD PTR [esp+32]
+        psubw xmm6, xmm1
+        movdqa xmm1, xmm5
+        psraw xmm1, 1
+        movdqa xmm3, xmm2
+
+;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+;;;
+;;; //p1 + p7 - p3 - (p3 >> 1);
+;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
+
+        movdqa xmm7, xmm0
+        movdqa XMMWORD PTR [esp+96], xmm6
+
+;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
+
+        movdqa xmm6, XMMWORD PTR [esp+16]
+        psubw xmm3, xmm0
+        psubw xmm3, xmm5
+        psraw xmm7, 1
+        psubw xmm3, xmm1
+        movdqa xmm1, xmm6
+        paddw xmm1, xmm5
+
+;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+
+        psubw xmm1, xmm0
+
+;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+        psubw xmm1, xmm7
+
+;;;
+;;; // -p1 + p7 + p5 + (p5 >> 1);
+;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
+
+        movdqa xmm7, xmm2
+        psraw xmm7, 1
+
+;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+
+        psubw xmm5, xmm6
+
+;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+
+        paddw xmm5, xmm2
+
+;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+        paddw xmm5, xmm7
+
+;;;
+;;; // p3 + p5 + p1 + (p1 >> 1);
+;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
+
+        paddw xmm0, xmm2
+
+;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+;;; p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
+;;; a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
+;;;
+;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
+;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
+;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
+;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
+;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
+;;;
+;;; p0 = _mm_add_epi16(b0, b7); // b0 + b7;
+;;; p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+;;; p2 = _mm_add_epi16(b4, b3); // b4 + b3;
+;;; p3 = _mm_add_epi16(b6, b1); // b6 + b1;
+
+        movdqa xmm2, XMMWORD PTR [esp+96]
+        paddw xmm0, xmm6
+        psraw xmm6, 1
+        paddw xmm0, xmm6
+        movdqa xmm7, xmm0
+        movdqa xmm6, xmm5
+        psraw xmm7, 2
+        paddw xmm7, xmm3
+        psraw xmm6, 2
+        paddw xmm6, xmm1
+        psraw xmm1, 2
+        psubw xmm5, xmm1
+        movdqa xmm1, xmm4
+        psraw xmm3, 2
+        psubw xmm0, xmm3
+        movdqa xmm3, XMMWORD PTR [esp+80]
+        movdqa XMMWORD PTR [esp+32], xmm0
+
+;;; p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+;;; p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+;;; p6 = _mm_add_epi16(b2, b5); // b2 + b5;
+;;; p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+        psubw xmm4, XMMWORD PTR [esp+32]
+        paddw xmm1, xmm0
+        movdqa XMMWORD PTR [esp+112], xmm1
+        movdqa xmm1, XMMWORD PTR [esp+48]
+        movdqa xmm0, xmm1
+        psubw xmm0, xmm5
+        movdqa XMMWORD PTR [esp+16], xmm0
+        movdqa xmm0, xmm3
+        paddw xmm0, xmm6
+        psubw xmm3, xmm6
+        movdqa XMMWORD PTR [esp+128], xmm0
+
+;;;
+;;; /* rotate 8x8 (ugh) */
+;;; r0 = _mm_unpacklo_epi16(p0, p2);
+
+        movdqa xmm6, XMMWORD PTR [esp+128]
+        movdqa xmm0, xmm2
+        paddw xmm0, xmm7
+        psubw xmm2, xmm7
+        paddw xmm1, xmm5
+        movdqa xmm5, XMMWORD PTR [esp+112]
+        movdqa XMMWORD PTR [esp+144], xmm4
+        movdqa xmm4, xmm5
+        punpcklwd xmm4, xmm6
+
+;;; r1 = _mm_unpacklo_epi16(p1, p3);
+;;; r2 = _mm_unpackhi_epi16(p0, p2);
+
+        punpckhwd xmm5, xmm6
+
+;;; r3 = _mm_unpackhi_epi16(p1, p3);
+;;; r4 = _mm_unpacklo_epi16(p4, p6);
+;;; r5 = _mm_unpacklo_epi16(p5, p7);
+
+        movdqa xmm6, xmm3
+        movdqa XMMWORD PTR [esp+64], xmm4
+        movdqa xmm4, XMMWORD PTR [esp+16]
+        movdqa xmm7, xmm4
+        punpcklwd xmm7, xmm0
+        punpckhwd xmm4, xmm0
+        movdqa xmm0, xmm2
+        punpcklwd xmm0, xmm1
+        movdqa XMMWORD PTR [esp+128], xmm0
+        movdqa xmm0, XMMWORD PTR [esp+144]
+        punpcklwd xmm6, xmm0
+
+;;; r6 = _mm_unpackhi_epi16(p4, p6);
+
+        punpckhwd xmm2, xmm1
+
+;;; r7 = _mm_unpackhi_epi16(p5, p7);
+;;;
+;;; b0 = _mm_unpacklo_epi16(r0, r1);
+
+        movdqa xmm1, XMMWORD PTR [esp+64]
+        punpckhwd xmm3, xmm0
+        movdqa xmm0, xmm1
+        punpcklwd xmm0, xmm7
+
+;;; b1 = _mm_unpackhi_epi16(r0, r1);
+
+        punpckhwd xmm1, xmm7
+
+;;; b2 = _mm_unpacklo_epi16(r2, r3);
+
+        movdqa xmm7, xmm5
+        punpcklwd xmm7, xmm4
+
+;;; b3 = _mm_unpackhi_epi16(r2, r3);
+
+        punpckhwd xmm5, xmm4
+        movdqa XMMWORD PTR [esp+112], xmm5
+
+;;; b4 = _mm_unpacklo_epi16(r4, r5);
+
+        movdqa xmm5, XMMWORD PTR [esp+128]
+        movdqa xmm4, xmm5
+        punpcklwd xmm4, xmm6
+
+;;; b5 = _mm_unpackhi_epi16(r4, r5);
+
+        punpckhwd xmm5, xmm6
+
+;;; b6 = _mm_unpacklo_epi16(r6, r7);
+
+        movdqa xmm6, xmm2
+        punpcklwd xmm6, xmm3
+
+;;; b7 = _mm_unpackhi_epi16(r6, r7);
+
+        punpckhwd xmm2, xmm3
+
+;;;
+;;; p0 = _mm_unpacklo_epi64(b0, b4);
+
+        movdqa xmm3, xmm0
+        punpcklqdq xmm3, xmm4
+
+;;; p1 = _mm_unpackhi_epi64(b0, b4);
+
+        punpckhqdq xmm0, xmm4
+        movdqa XMMWORD PTR [esp+144], xmm0
+
+;;; p2 = _mm_unpacklo_epi64(b1, b5);
+;;; p3 = _mm_unpackhi_epi64(b1, b5);
+;;; p4 = _mm_unpacklo_epi64(b2, b6);
+;;; p5 = _mm_unpackhi_epi64(b2, b6);
+;;; p6 = _mm_unpacklo_epi64(b3, b7);
+
+        movdqa xmm0, XMMWORD PTR [esp+112]
+        movdqa xmm4, xmm1
+        punpcklqdq xmm4, xmm5
+        punpckhqdq xmm1, xmm5
+        movdqa XMMWORD PTR [esp+64], xmm1
+        movdqa xmm1, xmm7
+        movdqa xmm5, xmm0
+        punpcklqdq xmm1, xmm6
+        punpckhqdq xmm7, xmm6
+
+;;; p7 = _mm_unpackhi_epi64(b3, b7);
+;;;
+;;;
+;;; /* Vertical */
+;;;
+;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
+;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+
+        movdqa xmm6, xmm4
+        psraw xmm6, 1
+        punpcklqdq xmm5, xmm2
+        punpckhqdq xmm0, xmm2
+        movdqa xmm2, xmm3
+        paddw xmm2, xmm1
+        psubw xmm3, xmm1
+
+;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+
+        movdqa xmm1, xmm5
+
+;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+
+        psraw xmm5, 1
+        psubw xmm1, xmm6
+
+;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+        paddw xmm4, xmm5
+
+;;;
+;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
+
+        movdqa xmm5, xmm2
+
+;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
+
+        movdqa xmm6, xmm3
+        paddw xmm5, xmm4
+        psubw xmm6, xmm1
+        movdqa XMMWORD PTR [esp+128], xmm6
+
+;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
+;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
+;;;
+;;; //-p3 + p5 - p7 - (p7 >> 1);
+;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+
+        movdqa xmm6, XMMWORD PTR [esp+64]
+        paddw xmm3, xmm1
+        movdqa XMMWORD PTR [esp+80], xmm3
+        psubw xmm2, xmm4
+        movdqa xmm1, xmm0
+        psraw xmm1, 1
+        movdqa xmm3, xmm7
+        movdqa XMMWORD PTR [esp+96], xmm2
+        psubw xmm3, xmm6
+
+;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+
+        psubw xmm3, xmm0
+
+;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+;;;
+;;; //p1 + p7 - p3 - (p3 >> 1);
+;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
+
+        movdqa xmm2, xmm6
+        psraw xmm2, 1
+        psubw xmm3, xmm1
+
+;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
+
+        movdqa xmm1, XMMWORD PTR [esp+144]
+        movdqa xmm4, xmm1
+        paddw xmm4, xmm0
+
+;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+
+        psubw xmm4, xmm6
+
+;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+        psubw xmm4, xmm2
+
+;;;
+;;; // -p1 + p7 + p5 + (p5 >> 1);
+;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
+
+        movdqa xmm2, xmm7
+        psraw xmm2, 1
+
+;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+
+        psubw xmm0, xmm1
+
+;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+
+        paddw xmm0, xmm7
+
+;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+        paddw xmm0, xmm2
+
+;;;
+;;; // p3 + p5 + p1 + (p1 >> 1);
+;;; r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
+
+        movdqa xmm2, xmm1
+        psraw xmm2, 1
+
+;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
+
+        paddw xmm6, xmm7
+
+;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+;;; a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
+;;;
+;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
+;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
+;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
+;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
+;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
+;;;
+;;; r0 = _mm_add_epi16(b0, b7); // b0 + b7;
+;;; r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+
+        movdqa xmm7, XMMWORD PTR [esp+128]
+        paddw xmm6, xmm1
+        paddw xmm6, xmm2
+        movdqa xmm1, xmm6
+        psraw xmm1, 2
+        movdqa xmm2, xmm0
+        paddw xmm1, xmm3
+        psraw xmm2, 2
+        paddw xmm2, xmm4
+        psraw xmm4, 2
+        psubw xmm0, xmm4
+        psraw xmm3, 2
+        psubw xmm6, xmm3
+        movdqa XMMWORD PTR [esp+64], xmm6
+        movdqa xmm3, xmm5
+
+;;; r2 = _mm_add_epi16(b4, b3); // b4 + b3;
+;;; r3 = _mm_add_epi16(b6, b1); // b6 + b1;
+;;; r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+;;; r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+;;; r6 = _mm_add_epi16(b2, b5); // b2 + b5;
+;;; r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+        psubw xmm5, XMMWORD PTR [esp+64]
+        paddw xmm3, xmm6
+        movdqa XMMWORD PTR [esp+144], xmm3
+        movdqa xmm3, xmm7
+        psubw xmm3, xmm0
+        movdqa XMMWORD PTR [esp+48], xmm3
+        movdqa xmm3, XMMWORD PTR [esp+80]
+        movdqa xmm4, xmm3
+        paddw xmm4, xmm2
+        psubw xmm3, xmm2
+
+;;;
+;;;
+;;; // add in prediction values
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
+;;; // (x + 32) >> 6
+;;; r0 = _mm_adds_epi16(r0, const32);
+
+        movdqa xmm2, XMMWORD PTR const32
+        movdqa XMMWORD PTR [esp+16], xmm4
+        movdqa xmm4, XMMWORD PTR [esp+96]
+        movdqa xmm6, xmm4
+        paddw xmm6, xmm1
+        psubw xmm4, xmm1
+
+;;; r0 = _mm_srai_epi16(r0, 6);
+;;; r1 = _mm_adds_epi16(r1, const32);
+
+        movdqa xmm1, XMMWORD PTR [esp+48]
+        paddw xmm7, xmm0
+        movdqa xmm0, XMMWORD PTR [esp+144]
+        movdqa XMMWORD PTR [esp+128], xmm7
+
+;;; r1 = _mm_srai_epi16(r1, 6);
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+        movq xmm7, QWORD PTR [edx+16]
+        movdqa XMMWORD PTR [esp+32], xmm5
+        paddsw xmm0, xmm2
+        psraw xmm0, 6
+        paddsw xmm1, xmm2
+        pxor xmm2, xmm2
+        punpcklbw xmm7, xmm2
+        movq xmm5, QWORD PTR [edx]
+        punpcklbw xmm5, xmm2
+        psraw xmm1, 6
+
+;;; pred0 = _mm_adds_epi16(pred0, r0);
+;;; pred1 = _mm_adds_epi16(pred1, r1);
+
+        paddsw xmm7, xmm1
+        paddsw xmm5, xmm0
+
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+        packuswb xmm5, xmm7
+
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
+
+        movdqa xmm0, XMMWORD PTR [esp+32]
+        movdqa xmm2, XMMWORD PTR [esp+128]
+        movq QWORD PTR [ecx], xmm5
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq xmm5, 8
+
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
+
+        movq QWORD PTR [ecx+16], xmm5
+
+;;;
+;;; /* --- */
+;;;
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
+
+        movq xmm1, QWORD PTR [edx+32]
+
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
+;;; // (x + 32) >> 6
+;;; r2 = _mm_adds_epi16(r2, const32);
+
+        movdqa xmm5, XMMWORD PTR [esp]
+        movdqa XMMWORD PTR [esp+32], xmm0 ;
+
+;;; r2 = _mm_srai_epi16(r2, 6);
+;;; r3 = _mm_adds_epi16(r3, const32);
+
+        paddsw xmm6, xmm5
+
+;;; r3 = _mm_srai_epi16(r3, 6);
+
+        psraw xmm6, 6
+
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+        pxor xmm7, xmm7
+        punpcklbw xmm1, xmm7
+        movdqa xmm0, XMMWORD PTR [esp+16]
+        paddsw xmm0, xmm5
+        psraw xmm0, 6
+
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+;;; pred0 = _mm_adds_epi16(pred0, r2);
+
+        paddsw xmm1, xmm0
+
+;;; pred1 = _mm_adds_epi16(pred1, r3);
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
+
+        movdqa xmm0, XMMWORD PTR [esp+32]
+        movq xmm5, QWORD PTR [edx+48]
+        punpcklbw xmm5, xmm7
+        paddsw xmm5, xmm6
+        packuswb xmm1, xmm5
+        movq QWORD PTR [ecx+32], xmm1
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq xmm1, 8
+
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
+
+        movq QWORD PTR [ecx+48], xmm1
+
+;;;
+;;; /* --- */
+;;;
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
+
+        movq xmm7, QWORD PTR [edx+64]
+
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
+
+        movq xmm6, QWORD PTR [edx+80]
+
+;;; // (x + 32) >> 6
+;;; r4 = _mm_adds_epi16(r4, const32);
+;;; r4 = _mm_srai_epi16(r4, 6);
+;;; r5 = _mm_adds_epi16(r5, const32);
+;;; r5 = _mm_srai_epi16(r5, 6);
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+        pxor xmm5, xmm5
+        punpcklbw xmm7, xmm5
+
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+        punpcklbw xmm6, xmm5
+        movdqa xmm1, XMMWORD PTR [esp]
+        paddsw xmm4, xmm1
+        psraw xmm4, 6
+        paddsw xmm3, xmm1
+        psraw xmm3, 6
+
+;;; pred0 = _mm_adds_epi16(pred0, r4);
+
+        paddsw xmm7, xmm4
+
+;;; pred1 = _mm_adds_epi16(pred1, r5);
+
+        paddsw xmm6, xmm3
+
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+        packuswb xmm7, xmm6
+
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
+
+        movq QWORD PTR [ecx+64], xmm7
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq xmm7, 8
+
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
+
+        movq QWORD PTR [ecx+80], xmm7
+
+
+;;;
+;;; /* --- */
+;;;
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
+
+        movq xmm5, QWORD PTR [edx+96]
+
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
+
+        movq xmm4, QWORD PTR [edx+112]
+
+;;; // (x + 32) >> 6
+;;; r6 = _mm_adds_epi16(r6, const32);
+;;; r6 = _mm_srai_epi16(r6, 6);
+;;; r7 = _mm_adds_epi16(r7, const32);
+;;; r7 = _mm_srai_epi16(r7, 6);
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+        pxor xmm3, xmm3
+        punpcklbw xmm5, xmm3
+
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+        punpcklbw xmm4, xmm3
+        movdqa xmm1, XMMWORD PTR [esp]
+        paddsw xmm2, xmm1
+        psraw xmm2, 6
+        paddsw xmm0, xmm1
+        psraw xmm0, 6
+
+;;; pred0 = _mm_adds_epi16(pred0, r6);
+
+        paddsw xmm5, xmm2
+
+;;; pred1 = _mm_adds_epi16(pred1, r7);
+
+        paddsw xmm4, xmm0
+
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+        packuswb xmm5, xmm4
+
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
+
+        movq QWORD PTR [ecx+96], xmm5
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq xmm5, 8
+
+;;; _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
+
+        movq QWORD PTR [ecx+112], xmm5
+        mov esp, ebp
+        pop ebp
+        ret
+        ALIGN 2
+_itrans8x8_sse2 ENDP
+
+
+END
\ No newline at end of file