Diffstat (limited to 'Src/h264dec/ldecod/src/prediction.asm')
-rw-r--r--  Src/h264dec/ldecod/src/prediction.asm  1626
1 file changed, 1626 insertions, 0 deletions
diff --git a/Src/h264dec/ldecod/src/prediction.asm b/Src/h264dec/ldecod/src/prediction.asm
new file mode 100644
index 00000000..33d3d499
--- /dev/null
+++ b/Src/h264dec/ldecod/src/prediction.asm
@@ -0,0 +1,1626 @@
+.686
+.XMM
+.model FLAT
+
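+; Byte offsets (field@Struct) into the decoder's C structures; these are
+; assumed to match the struct layout of the C build this file links against.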
+copy_image_data_16x16_stride@OptimizedFunctions = 32
+dec_picture@VideoParameters = 698192
+p_Slice@MacroBlock = 0
+plane_images@StorablePicture = 158512
+mb_rec@Slice = 1696
+mb_pred@Slice = 928
+cof@Slice = 2464
+
+CONST SEGMENT
+align 16
+const32 DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H
+CONST ENDS
+
+;
+; weighted_bi_prediction4x4(mb_pred, block_l0, wp_scale_l0, wp_scale_l1,
+;                           wp_offset, weight_denom)
+;
+; Weighted bi-prediction of one 4x4 block: combines the list-0 prediction in
+; block_l0 with the list-1 prediction already held in mb_pred and writes the
+; result back over mb_pred. Rows are 16 bytes apart.
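+;
+; Rough C equivalent (a sketch, not the shipped source; imgpel assumed to be
+; uint8_t, clip255() a hypothetical 0..255 clamp -- below, packuswb does the
+; clamping):
+;
+;;; for (j = 0; j < 4; j++)
+;;;     for (i = 0; i < 4; i++) {
+;;;         int w = block_l0[j][i] * wp_scale_l0 + mb_pred[j][i] * wp_scale_l1;
+;;;         if (weight_denom > 0)
+;;;             w = (w + (1 << (weight_denom - 1))) >> weight_denom;
+;;;         else
+;;;             w <<= -weight_denom;
+;;;         mb_pred[j][i] = clip255(w + wp_offset);
+;;;     }
+;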
+
+PUBLIC _weighted_bi_prediction4x4
+_TEXT SEGMENT
+mb_pred = 4
+block_l0 = 8
+wp_scale_l0 = 12
+wp_scale_l1 = 16
+wp_offset = 20
+weight_denom = 24
+_weighted_bi_prediction4x4 PROC ; COMDAT
+ mov eax, DWORD PTR weight_denom[esp]
+ pxor mm0, mm0
+ pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0
+ test eax, eax
+ pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0
+ pshufw mm3, MMWORD PTR wp_offset[esp], 0
+ jle BI_PRED4x4@LEFT_SHIFT
+
+ movd mm4, eax
+ lea ecx, DWORD PTR [eax-1] ;
+ mov edx, 1
+ shl edx, cl
+ movd mm5, edx
+ mov eax, mb_pred[esp]
+ mov edx, block_l0[esp]
+ pshufw mm5, mm5, 0
+ movd mm6, DWORD PTR 0[edx] ; block_l0
+ movd mm7, DWORD PTR 0[eax] ; mb_pred
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ movd mm7, DWORD PTR 16[eax] ; mb_pred
+ paddw mm6, mm5
+ psraw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+ movd DWORD PTR 0[eax], mm6
+
+ movd mm6, DWORD PTR 16[edx] ; block_l0
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ movd mm7, DWORD PTR 32[eax] ; mb_pred
+ paddw mm6, mm5
+ psraw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+ movd DWORD PTR 16[eax], mm6
+
+ movd mm6, DWORD PTR 32[edx] ; block_l0
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ movd mm7, DWORD PTR 48[eax] ; mb_pred
+ paddw mm6, mm5
+ psraw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+	movd	DWORD PTR 32[eax], mm6
+
+ movd mm6, DWORD PTR 48[edx] ; block_l0
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ paddw mm6, mm5
+ psraw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+ movd DWORD PTR 48[eax], mm6
+ ret 0
+
+BI_PRED4x4@LEFT_SHIFT:
+ neg eax
+ movd mm4, eax
+ mov eax, mb_pred[esp]
+ mov edx, block_l0[esp]
+ movd mm6, DWORD PTR 0[edx] ; block_l0
+ movd mm7, DWORD PTR 0[eax] ; mb_pred
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ movd mm7, DWORD PTR 16[eax] ; mb_pred
+ psllw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+ movd DWORD PTR 0[eax], mm6
+
+ movd mm6, DWORD PTR 16[edx] ; block_l0
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ movd mm7, DWORD PTR 32[eax] ; mb_pred
+ psllw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+ movd DWORD PTR 16[eax], mm6
+
+ movd mm6, DWORD PTR 32[edx] ; block_l0
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ movd mm7, DWORD PTR 48[eax] ; mb_pred
+ psllw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+	movd	DWORD PTR 32[eax], mm6
+
+ movd mm6, DWORD PTR 48[edx] ; block_l0
+ punpcklbw mm6, mm0
+ punpcklbw mm7, mm0
+ pmullw mm6, mm1
+ pmullw mm7, mm2
+ paddw mm6, mm7
+ psllw mm6, mm4
+ paddw mm6, mm3
+ packuswb mm6, mm6
+ movd DWORD PTR 48[eax], mm6
+ ret 0
+_weighted_bi_prediction4x4 ENDP
+_TEXT ENDS
+
+PUBLIC _itrans4x4_mmx
+_TEXT SEGMENT
+_tblock$ = 4 ; size = 4
+_mb_pred$ = 8 ; size = 4
+_mb_rec$ = 12 ; size = 4
+_pos_x$ = 16 ; size = 4
+_pos_y$ = 20 ; size = 4
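+;
+; One 4x4 inverse integer transform plus reconstruction. Rough C equivalent
+; (a sketch; t0..t3 name the rows after each transpose, as in the register
+; comments below, and clip255() is a hypothetical 0..255 clamp that packuswb
+; performs in the real code):
+;
+;;; p0 = t0 + t2;            p1 = t0 - t2;
+;;; p2 = (t1 >> 1) - t3;     p3 = t1 + (t3 >> 1);
+;;; r0 = p0 + p3;  r1 = p1 + p2;  r2 = p1 - p2;  r3 = p0 - p3;
+;;; // after the horizontal and vertical passes:
+;;; mb_rec[y][x] = clip255(mb_pred[y][x] + ((rres + 32) >> 6));
+;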
+_itrans4x4_mmx PROC ; COMDAT
+
+ mov edx, DWORD PTR _pos_y$[esp]
+ shl edx, 4
+ add edx, DWORD PTR _pos_x$[esp]
+ mov eax, DWORD PTR _tblock$[esp]
+ mov ecx, DWORD PTR _mb_pred$[esp]
+ add ecx, edx
+ add edx, DWORD PTR _mb_rec$[esp]
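+
+; Register-argument entry point: the wrapper above falls through into it, and
+; iMBtrans4x4 calls it directly. Expects:
+;   eax = tblock, ecx = mb_pred + offset, edx = mb_rec + offset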
+_itrans4x4_mmx_direct PROC ; COMDAT
+ ; load 4x4 matrix
+ movq mm0, MMWORD PTR 0[eax]
+ movq mm1, MMWORD PTR 8[eax]
+ movq mm2, MMWORD PTR 16[eax]
+ movq mm3, MMWORD PTR 24[eax]
+
+ ; rotate 4x4 matrix
+ movq mm4, mm0 ; p0 = mm4 (copy)
+ punpcklwd mm0, mm2 ; r0 = mm0
+ punpckhwd mm4, mm2 ; r2 = mm4
+ movq mm5, mm1 ; p1 = mm5 (copy)
+ punpcklwd mm1, mm3 ; r1 = mm1
+ punpckhwd mm5, mm3 ; r3 = mm5
+ movq mm6, mm0 ; r0 = mm6 (copy)
+ punpcklwd mm0, mm1 ; t0 = mm0
+ punpckhwd mm6, mm1 ; t1 = mm6
+ movq mm1, mm4 ; r2 = mm1 (copy)
+ punpcklwd mm1, mm5 ; t2 = mm1
+ punpckhwd mm4, mm5 ; t3 = mm4
+
+ movq mm2, mm0 ; mm2 = t0 (copy)
+ paddw mm0, mm1 ; mm0 = p0
+ psubw mm2, mm1 ; mm2 = p1, mm1 available
+ movq mm5, mm6 ; mm5 = t1 (copy)
+ psraw mm5, 1 ; mm5 = (t1 >> 1)
+ psubw mm5, mm4 ; mm5 = p2
+ psraw mm4, 1 ; mm4 = (t3 >> 1)
+ paddw mm6, mm4 ; mm6 = p3
+
+ movq mm3, mm0 ; mm3 = p0 (copy)
+ paddw mm0, mm6 ; mm0 = r0
+ movq mm1, mm2 ; mm1 = p1 (copy)
+ paddw mm1, mm5 ; mm1 = r1
+ psubw mm2, mm5 ; mm2 = r2, mm5 available
+ psubw mm3, mm6 ; mm3 = r3
+
+ ; rotate 4x4 matrix to set up for vertical
+ movq mm4, mm0 ; r0 = mm4 (copy)
+ punpcklwd mm0, mm2 ; p0 = mm0
+ punpckhwd mm4, mm2 ; p2 = mm4
+ movq mm5, mm1 ; r1 = mm5 (copy)
+ punpcklwd mm1, mm3 ; p1 = mm1
+ punpckhwd mm5, mm3 ; p3 = mm5
+ movq mm6, mm0 ; p0 = mm6 (copy)
+ punpcklwd mm0, mm1 ; t0 = mm0
+ punpckhwd mm6, mm1 ; t1 = mm6
+ movq mm1, mm4 ; p2 = mm1 (copy)
+ punpcklwd mm1, mm5 ; t2 = mm1
+ punpckhwd mm4, mm5 ; t3 = mm4
+
+ movq mm2, mm0 ; mm2 = t0 (copy)
+ paddw mm0, mm1 ; mm0 = p0
+ psubw mm2, mm1 ; mm2 = p1, mm1 available
+ movq mm5, mm6 ; mm5 = t1 (copy)
+ psraw mm5, 1 ; mm5 = (t1 >> 1)
+ psubw mm5, mm4 ; mm5 = p2
+ psraw mm4, 1 ; mm4 = (t3 >> 1)
+ paddw mm6, mm4 ; mm6 = p3
+ movq mm3, mm0 ; mm3 = p0 (copy)
+ paddw mm0, mm6 ; mm0 = r0
+ movq mm1, mm2 ; mm1 = p1 (copy)
+ paddw mm1, mm5 ; mm1 = r1
+ psubw mm2, mm5 ; mm2 = r2, mm5 available
+ psubw mm3, mm6 ; mm3 = r3
+
+
+; --- 4x4 iDCT done, now time to combine with mpr ---
+
+ movq mm7, MMWORD PTR const32
+
+ paddw mm0, mm7 ; rres + 32
+ psraw mm0, 6 ; (rres + 32) >> 6
+ paddw mm1, mm7 ; rres + 32
+ psraw mm1, 6 ; (rres + 32) >> 6
+ paddw mm2, mm7 ; rres + 32
+ psraw mm2, 6 ; (rres + 32) >> 6
+ paddw mm3, mm7 ; rres + 32
+ psraw mm3, 6 ; (rres + 32) >> 6
+
+ pxor mm7, mm7
+
+ ; convert mpr from unsigned char to short
+ movd mm4, DWORD PTR 0[ecx]
+ movd mm5, DWORD PTR 16[ecx]
+ movd mm6, DWORD PTR 32[ecx]
+ punpcklbw mm4, mm7
+ punpcklbw mm5, mm7
+ punpcklbw mm6, mm7
+ paddsw mm4, mm0 ; pred_row + rres_row
+ movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
+ paddsw mm5, mm1 ; pred_row + rres_row
+ punpcklbw mm0, mm7
+ paddsw mm6, mm2 ; pred_row + rres_row
+ paddsw mm0, mm3 ; pred_row + rres_row
+ ; results in mm4, mm5, mm6, mm0
+
+ ; move back to 8 bit
+ packuswb mm4, mm7
+ packuswb mm5, mm7
+ packuswb mm6, mm7
+ packuswb mm0, mm7
+ movd DWORD PTR 0[edx], mm4
+ movd DWORD PTR 16[edx], mm5
+ movd DWORD PTR 32[edx], mm6
+ movd DWORD PTR 48[edx], mm0
+ ret 0
+
+_itrans4x4_mmx_direct ENDP
+_itrans4x4_mmx ENDP
+_TEXT ENDS
+
+EXTRN _itrans_sp:PROC
+EXTRN _Inv_Residual_trans_4x4:PROC
+PUBLIC _iMBtrans4x4
+EXTRN _opt:BYTE
+_TEXT SEGMENT
+_currSlice$ = -4 ; size = 4 (local; slot created by the initial push ecx)
+_mb_rec$166704 = 8 ; size = 4 (compiler temp, not referenced below)
+_currMB$ = 8 ; size = 4 (first parameter)
+_curr_img$ = 12 ; size = 4 (local; reuses the pl argument slot)
+_pl$ = 8 ; second parameter
+_smb$ = 16 ; size = 4 (third parameter)
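+
+; iMBtrans4x4(currMB, pl, smb): inverse-transform all 16 4x4 blocks of one
+; macroblock plane, then copy mb_rec into the picture. Rough C shape of the
+; dispatch below (a sketch, reconstructed from the interleaved source lines):
+;
+;;; if (smb)                                              // SP transform path
+;;;     itrans_sp(blocks[k], mb_pred, currMB, pl, x, y);  // for all 16 blocks
+;;; else if (currMB->is_lossless)
+;;;     Inv_Residual_trans_4x4(currMB, pl, x, y);         // for all 16 blocks
+;;; else
+;;;     itrans4x4_mmx(blocks[k], mb_pred, mb_rec, x, y);  // for all 16 blocks
+;;; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y,
+;;;                                  currSlice->mb_rec[pl]);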
+_iMBtrans4x4 PROC
+ push ecx
+ push ebx
+ push ebp
+ push esi
+STACKOFFSET = 16
+; 408 : VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;
+
+ mov esi, DWORD PTR _pl$[esp+STACKOFFSET]
+ push edi
+STACKOFFSET = STACKOFFSET + 4
+ mov edi, DWORD PTR _currMB$[esp+16]
+ mov ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
+ mov eax, DWORD PTR [edi+4]
+ mov eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
+ mov DWORD PTR _currSlice$[esp+20], ebp
+ mov ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture]
+ mov DWORD PTR _curr_img$[esp+16], ecx
+
+ cmp DWORD PTR _smb$[esp+16], 0 ; if (smb)
+; 413 : {
+; 414 : h264_short_block_t *blocks = currSlice->cof4[pl];
+; 415 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+; 416 :
+; 417 : itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);
+
+ je $LN4@iMBtrans4x
+ push 0
+ push 0
+ mov eax, esi
+ shl eax, 9
+ lea ebx, DWORD PTR [eax+ebp+cof@Slice]
+ mov ecx, esi
+ shl ecx, 8
+ lea ebp, DWORD PTR [ecx+ebp+mb_pred@Slice]
+ push esi
+ push ebp
+ push ebx
+ mov eax, edi
+ call _itrans_sp
+
+; 418 : itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);
+
+ push 0
+ push 4
+ push esi
+ lea edx, DWORD PTR [ebx+32]
+ push ebp
+ push edx
+ mov eax, edi
+ call _itrans_sp
+
+; 419 : itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);
+
+ push 4
+ push 0
+ push esi
+ lea eax, DWORD PTR [ebx+64]
+ push ebp
+ push eax
+ mov eax, edi
+ call _itrans_sp
+
+; 420 : itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);
+
+ push 4
+ push 4
+ push esi
+ lea ecx, DWORD PTR [ebx+96]
+ push ebp
+ push ecx
+ mov eax, edi
+ call _itrans_sp
+ add esp, 80 ; 00000050H
+
+; 421 : itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);
+
+ push 0
+ push 8
+ push esi
+ lea edx, DWORD PTR [ebx+128]
+ push ebp
+ push edx
+ mov eax, edi
+ call _itrans_sp
+
+; 422 : itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);
+
+ push 0
+ push 12 ; 0000000cH
+ push esi
+ lea eax, DWORD PTR [ebx+160]
+ push ebp
+ push eax
+ mov eax, edi
+ call _itrans_sp
+
+; 423 : itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);
+
+ push 4
+ push 8
+ push esi
+ lea ecx, DWORD PTR [ebx+192]
+ push ebp
+ push ecx
+ mov eax, edi
+ call _itrans_sp
+
+; 424 : itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);
+
+ push 4
+ push 12 ; 0000000cH
+ push esi
+ lea edx, DWORD PTR [ebx+224]
+ push ebp
+ push edx
+ mov eax, edi
+ call _itrans_sp
+ add esp, 80 ; 00000050H
+
+; 425 : itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);
+
+ push 8
+ push 0
+ push esi
+ lea eax, DWORD PTR [ebx+256]
+ push ebp
+ push eax
+ mov eax, edi
+ call _itrans_sp
+
+; 426 : itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);
+
+ push 8
+ push 4
+ push esi
+ push ebp
+ lea ecx, DWORD PTR [ebx+288]
+ push ecx
+ mov eax, edi
+ call _itrans_sp
+
+; 427 : itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);
+
+ push 12 ; 0000000cH
+ push 0
+ push esi
+ lea edx, DWORD PTR [ebx+320]
+ push ebp
+ push edx
+ mov eax, edi
+ call _itrans_sp
+
+; 428 : itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);
+
+ push 12 ; 0000000cH
+ push 4
+ push esi
+ lea eax, DWORD PTR [ebx+352]
+ push ebp
+ push eax
+ mov eax, edi
+ call _itrans_sp
+ add esp, 80 ; 00000050H
+
+; 429 : itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);
+
+ push 8
+ push 8
+ push esi
+ lea ecx, DWORD PTR [ebx+384]
+ push ebp
+ push ecx
+ mov eax, edi
+ call _itrans_sp
+
+; 430 : itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);
+
+ push 8
+ push 12 ; 0000000cH
+ push esi
+ lea edx, DWORD PTR [ebx+416]
+ push ebp
+ push edx
+ mov eax, edi
+ call _itrans_sp
+
+; 431 : itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);
+
+ push 12 ; 0000000cH
+ push 8
+ push esi
+ lea eax, DWORD PTR [ebx+448]
+ push ebp
+ push eax
+ mov eax, edi
+ call _itrans_sp
+
+; 432 : itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);
+
+ push 12 ; 0000000cH
+ push 12 ; 0000000cH
+ push esi
+ add ebx, 480 ; 000001e0H
+ push ebp
+ push ebx
+ mov eax, edi
+ call _itrans_sp
+ mov ebp, DWORD PTR _currSlice$[esp+100]
+ add esp, 80 ; 00000050H
+ jmp COPY_16x16
+
+$LN4@iMBtrans4x:
+
+; 433 : }
+; 434 : else if (currMB->is_lossless)
+
+ cmp DWORD PTR [edi+84], 0
+ je $LN2@iMBtrans4x
+
+ push 0
+ push 0
+
+; 435 : {
+; 436 : Inv_Residual_trans_4x4(currMB, pl, 0, 0);
+
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 437 : Inv_Residual_trans_4x4(currMB, pl, 4, 0);
+
+ push 0
+ push 4
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 438 : Inv_Residual_trans_4x4(currMB, pl, 0, 4);
+
+ push 4
+ push 0
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 439 : Inv_Residual_trans_4x4(currMB, pl, 4, 4);
+
+ push 4
+ push 4
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+ add esp, 64 ; 00000040H
+
+; 440 : Inv_Residual_trans_4x4(currMB, pl, 8, 0);
+
+ push 0
+ push 8
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 441 : Inv_Residual_trans_4x4(currMB, pl, 12, 0);
+
+ push 0
+ push 12 ; 0000000cH
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 442 : Inv_Residual_trans_4x4(currMB, pl, 8, 4);
+
+ push 4
+ push 8
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 443 : Inv_Residual_trans_4x4(currMB, pl, 12, 4);
+
+ push 4
+ push 12 ; 0000000cH
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+ add esp, 64 ; 00000040H
+
+; 444 : Inv_Residual_trans_4x4(currMB, pl, 0, 8);
+
+ push 8
+ push 0
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 445 : Inv_Residual_trans_4x4(currMB, pl, 4, 8);
+
+ push 8
+ push 4
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 446 : Inv_Residual_trans_4x4(currMB, pl, 0, 12);
+
+ push 12 ; 0000000cH
+ push 0
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 447 : Inv_Residual_trans_4x4(currMB, pl, 4, 12);
+
+ push 12 ; 0000000cH
+ push 4
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+ add esp, 64 ; 00000040H
+
+; 448 : Inv_Residual_trans_4x4(currMB, pl, 8, 8);
+
+ push 8
+ push 8
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 449 : Inv_Residual_trans_4x4(currMB, pl, 12, 8);
+
+ push 8
+ push 12 ; 0000000cH
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 450 : Inv_Residual_trans_4x4(currMB, pl, 8, 12);
+
+ push 12 ; 0000000cH
+ push 8
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+
+; 451 : Inv_Residual_trans_4x4(currMB, pl, 12, 12);
+
+ push 12 ; 0000000cH
+ push 12 ; 0000000cH
+ push esi
+ push edi
+ call _Inv_Residual_trans_4x4
+ add esp, 64 ; 00000040H
+
+; 452 : }
+; 453 : else
+
+ jmp COPY_16x16
+$LN2@iMBtrans4x:
+
+; 454 : {
+; 455 : const h264_short_block_t *blocks = currSlice->cof4[pl];
+; 456 : const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+
+ mov edx, esi
+ mov ecx, esi
+ shl edx, 8
+ shl ecx, 9
+ lea eax, DWORD PTR [edx+ebp]
+ lea ebx, DWORD PTR [ecx+ebp+cof@Slice]
+
+; 457 : h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+
+ ; put things in registers that itrans4x4_mmx_direct wants
+ lea edx, [eax + mb_rec@Slice]; mb_rec
+ lea ecx, [eax + mb_pred@Slice] ; mb_pred
+ mov eax, ebx ; blocks
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+32]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+128]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+160]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);
+
+ ; second row
+ lea edx, [edx+52]
+ lea ecx, [ecx+52]
+ lea eax, [ebx+64]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+96]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+192]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+224]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);
+
+ ; third row
+ lea edx, [edx+52]
+ lea ecx, [ecx+52]
+ lea eax, [ebx+256]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+288]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+384]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+416]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);
+
+ ; fourth row
+ lea edx, [edx+52]
+ lea ecx, [ecx+52]
+ lea eax, [ebx+320]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+352]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+448]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);
+
+ lea edx, [edx+4]
+ lea ecx, [ecx+4]
+ lea eax, [ebx+480]
+ call _itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);
+COPY_16x16:
+
+; construct picture from 4x4 blocks
+; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
+
+ mov eax, DWORD PTR [edi+40]
+ mov ecx, DWORD PTR [edi+36]
+ shl esi, 8
+ lea edx, DWORD PTR [esi+ebp+mb_rec@Slice]
+ push edx
+ mov edx, DWORD PTR _curr_img$[esp+20]
+ push eax
+ push ecx
+ push edx
+ call DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
+ add esp, 16 ; 00000010H
+ pop edi
+ pop esi
+ pop ebp
+ pop ebx
+ pop ecx
+ ret 0
+_iMBtrans4x4 ENDP
+_TEXT ENDS
+
+_TEXT SEGMENT
+ ALIGN 2
+ PUBLIC _itrans8x8_sse2
+_itrans8x8_sse2 PROC NEAR
+; parameter 1(mb_rec): 8 + ebp
+; parameter 2(mb_pred): 12 + ebp
+; parameter 3(block): 16 + ebp
+; parameter 4(pos_x): 20 + ebp
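+;
+; 8x8 inverse integer transform plus reconstruction (SSE2). The original
+; intrinsics source is interleaved below as ';;;' comments: a horizontal
+; pass, an 8x8 transpose, a vertical pass, then, two rows at a time,
+; mb_rec = clip255(mb_pred + ((rres + 32) >> 6)) with packuswb doing the clamp.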
+ push ebp
+ mov ebp, esp
+ and esp, -16
+ sub esp, 176
+ mov edx, DWORD PTR [ebp+20]
+ mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
+ add ecx, edx
+ add edx, DWORD PTR [ebp+12] ; edx: mb_pred
+ mov eax, DWORD PTR [ebp+16] ; eax: block
+
+;;; __m128i a0, a1, a2, a3;
+;;; __m128i p0, p1, p2, p3, p4, p5 ,p6, p7;
+;;; __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+;;; __m128i r0, r1, r2, r3, r4, r5, r6, r7;
+;;; __m128i const32, zero;
+;;; __declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
+;;; __m128i pred0, pred1;
+;;;
+;;; const32 = _mm_load_si128((const __m128i *)c32);
+
+ movdqa xmm0, XMMWORD PTR const32
+
+;;; zero = _mm_setzero_si128();
+;;;
+;;; // Horizontal
+;;; b0 = _mm_load_si128((__m128i *)(block[0]));
+
+ movdqa xmm4, XMMWORD PTR [eax]
+
+;;; b1 = _mm_load_si128((__m128i *)(block[1]));
+
+ movdqa xmm7, XMMWORD PTR [eax+16]
+
+;;; b2 = _mm_load_si128((__m128i *)(block[2]));
+
+ movdqa xmm5, XMMWORD PTR [eax+32]
+
+;;; b3 = _mm_load_si128((__m128i *)(block[3]));
+
+ movdqa xmm3, XMMWORD PTR [eax+48]
+
+;;; b4 = _mm_load_si128((__m128i *)(block[4]));
+
+ movdqa xmm6, XMMWORD PTR [eax+64]
+
+;;; b5 = _mm_load_si128((__m128i *)(block[5]));
+;;; b6 = _mm_load_si128((__m128i *)(block[6]));
+
+ movdqa xmm1, XMMWORD PTR [eax+96]
+
+;;; b7 = _mm_load_si128((__m128i *)(block[7]));
+
+ movdqa xmm2, XMMWORD PTR [eax+112]
+ movdqa XMMWORD PTR [esp], xmm0
+ movdqa xmm0, XMMWORD PTR [eax+80]
+ movdqa XMMWORD PTR [esp+16], xmm2
+
+;;;
+;;; /* rotate 8x8 (ugh) */
+;;; r0 = _mm_unpacklo_epi16(b0, b2);
+
+ movdqa xmm2, xmm4
+ punpcklwd xmm2, xmm5
+
+;;; r1 = _mm_unpacklo_epi16(b1, b3);
+;;; r2 = _mm_unpackhi_epi16(b0, b2);
+
+ punpckhwd xmm4, xmm5
+
+;;; r3 = _mm_unpackhi_epi16(b1, b3);
+;;; r4 = _mm_unpacklo_epi16(b4, b6);
+;;; r5 = _mm_unpacklo_epi16(b5, b7);
+
+ movdqa xmm5, xmm0
+ movdqa XMMWORD PTR [esp+32], xmm2
+ movdqa xmm2, xmm7
+ punpcklwd xmm2, xmm3
+ punpckhwd xmm7, xmm3
+ movdqa xmm3, xmm6
+ punpcklwd xmm3, xmm1
+ movdqa XMMWORD PTR [esp+48], xmm3
+ movdqa xmm3, XMMWORD PTR [esp+16]
+ punpcklwd xmm5, xmm3
+
+;;; r6 = _mm_unpackhi_epi16(b4, b6);
+
+ punpckhwd xmm6, xmm1
+;;; r7 = _mm_unpackhi_epi16(b5, b7);
+
+ punpckhwd xmm0, xmm3
+
+;;;
+;;; b0 = _mm_unpacklo_epi16(r0, r1);
+
+ movdqa xmm3, XMMWORD PTR [esp+32]
+ movdqa xmm1, xmm3
+ punpcklwd xmm1, xmm2
+
+;;; b1 = _mm_unpackhi_epi16(r0, r1);
+
+ punpckhwd xmm3, xmm2
+
+;;; b2 = _mm_unpacklo_epi16(r2, r3);
+
+ movdqa xmm2, xmm4
+ punpcklwd xmm2, xmm7
+
+;;; b3 = _mm_unpackhi_epi16(r2, r3);
+
+ punpckhwd xmm4, xmm7
+ movdqa XMMWORD PTR [esp+64], xmm4
+
+;;; b4 = _mm_unpacklo_epi16(r4, r5);
+
+ movdqa xmm4, XMMWORD PTR [esp+48]
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm5
+
+;;; b5 = _mm_unpackhi_epi16(r4, r5);
+
+ punpckhwd xmm4, xmm5
+
+;;; b6 = _mm_unpacklo_epi16(r6, r7);
+
+ movdqa xmm5, xmm6
+ punpcklwd xmm5, xmm0
+
+;;; b7 = _mm_unpackhi_epi16(r6, r7);
+
+ punpckhwd xmm6, xmm0
+
+;;;
+;;; p0 = _mm_unpacklo_epi64(b0, b4);
+
+ movdqa xmm0, xmm1
+ punpcklqdq xmm0, xmm7
+
+;;; p1 = _mm_unpackhi_epi64(b0, b4);
+
+ punpckhqdq xmm1, xmm7
+ movdqa XMMWORD PTR [esp+16], xmm1
+
+;;; p2 = _mm_unpacklo_epi64(b1, b5);
+
+ movdqa xmm1, xmm3
+ punpcklqdq xmm1, xmm4
+
+;;; p3 = _mm_unpackhi_epi64(b1, b5);
+;;; p4 = _mm_unpacklo_epi64(b2, b6);
+;;; p5 = _mm_unpackhi_epi64(b2, b6);
+;;; p6 = _mm_unpacklo_epi64(b3, b7);
+;;; p7 = _mm_unpackhi_epi64(b3, b7);
+;;;
+;;; /* perform approx DCT */
+;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
+;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+
+ movdqa xmm7, xmm1
+ psraw xmm7, 1
+ punpckhqdq xmm3, xmm4
+ movdqa XMMWORD PTR [esp+32], xmm3
+ movdqa xmm3, xmm2
+ punpcklqdq xmm3, xmm5
+ punpckhqdq xmm2, xmm5
+ movdqa xmm5, XMMWORD PTR [esp+64]
+ movdqa xmm4, xmm5
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm6, xmm0
+ paddw xmm6, xmm3
+ psubw xmm0, xmm3
+
+;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+
+ movdqa xmm3, xmm4
+
+;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+
+ psraw xmm4, 1
+ psubw xmm3, xmm7
+
+;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+ paddw xmm1, xmm4
+
+;;;
+;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
+
+ movdqa xmm4, xmm6
+
+;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
+
+ movdqa xmm7, xmm0
+ paddw xmm4, xmm1
+ psubw xmm7, xmm3
+ movdqa XMMWORD PTR [esp+48], xmm7
+
+;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
+
+ paddw xmm0, xmm3
+ movdqa XMMWORD PTR [esp+80], xmm0
+
+;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
+;;;
+;;; //-p3 + p5 - p7 - (p7 >> 1);
+;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+
+ movdqa xmm0, XMMWORD PTR [esp+32]
+ psubw xmm6, xmm1
+ movdqa xmm1, xmm5
+ psraw xmm1, 1
+ movdqa xmm3, xmm2
+
+;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+;;;
+;;; //p1 + p7 - p3 - (p3 >> 1);
+;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
+
+ movdqa xmm7, xmm0
+ movdqa XMMWORD PTR [esp+96], xmm6
+
+;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
+
+ movdqa xmm6, XMMWORD PTR [esp+16]
+ psubw xmm3, xmm0
+ psubw xmm3, xmm5
+ psraw xmm7, 1
+ psubw xmm3, xmm1
+ movdqa xmm1, xmm6
+ paddw xmm1, xmm5
+
+;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+
+ psubw xmm1, xmm0
+
+;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+ psubw xmm1, xmm7
+
+;;;
+;;; // -p1 + p7 + p5 + (p5 >> 1);
+;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
+
+ movdqa xmm7, xmm2
+ psraw xmm7, 1
+
+;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+
+ psubw xmm5, xmm6
+
+;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+
+ paddw xmm5, xmm2
+
+;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+ paddw xmm5, xmm7
+
+;;;
+;;; // p3 + p5 + p1 + (p1 >> 1);
+;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
+
+ paddw xmm0, xmm2
+
+;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+;;; p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
+;;; a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
+;;;
+;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
+;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
+;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
+;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
+;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
+;;;
+;;; p0 = _mm_add_epi16(b0, b7); // b0 + b7;
+;;; p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+;;; p2 = _mm_add_epi16(b4, b3); // b4 + b3;
+;;; p3 = _mm_add_epi16(b6, b1); // b6 + b1;
+
+ movdqa xmm2, XMMWORD PTR [esp+96]
+ paddw xmm0, xmm6
+ psraw xmm6, 1
+ paddw xmm0, xmm6
+ movdqa xmm7, xmm0
+ movdqa xmm6, xmm5
+ psraw xmm7, 2
+ paddw xmm7, xmm3
+ psraw xmm6, 2
+ paddw xmm6, xmm1
+ psraw xmm1, 2
+ psubw xmm5, xmm1
+ movdqa xmm1, xmm4
+ psraw xmm3, 2
+ psubw xmm0, xmm3
+ movdqa xmm3, XMMWORD PTR [esp+80]
+ movdqa XMMWORD PTR [esp+32], xmm0
+
+;;; p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+;;; p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+;;; p6 = _mm_add_epi16(b2, b5); // b2 + b5;
+;;; p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+ psubw xmm4, XMMWORD PTR [esp+32]
+ paddw xmm1, xmm0
+ movdqa XMMWORD PTR [esp+112], xmm1
+ movdqa xmm1, XMMWORD PTR [esp+48]
+ movdqa xmm0, xmm1
+ psubw xmm0, xmm5
+ movdqa XMMWORD PTR [esp+16], xmm0
+ movdqa xmm0, xmm3
+ paddw xmm0, xmm6
+ psubw xmm3, xmm6
+ movdqa XMMWORD PTR [esp+128], xmm0
+
+;;;
+;;; /* rotate 8x8 (ugh) */
+;;; r0 = _mm_unpacklo_epi16(p0, p2);
+
+ movdqa xmm6, XMMWORD PTR [esp+128]
+ movdqa xmm0, xmm2
+ paddw xmm0, xmm7
+ psubw xmm2, xmm7
+ paddw xmm1, xmm5
+ movdqa xmm5, XMMWORD PTR [esp+112]
+ movdqa XMMWORD PTR [esp+144], xmm4
+ movdqa xmm4, xmm5
+ punpcklwd xmm4, xmm6
+
+;;; r1 = _mm_unpacklo_epi16(p1, p3);
+;;; r2 = _mm_unpackhi_epi16(p0, p2);
+
+ punpckhwd xmm5, xmm6
+
+;;; r3 = _mm_unpackhi_epi16(p1, p3);
+;;; r4 = _mm_unpacklo_epi16(p4, p6);
+;;; r5 = _mm_unpacklo_epi16(p5, p7);
+
+ movdqa xmm6, xmm3
+ movdqa XMMWORD PTR [esp+64], xmm4
+ movdqa xmm4, XMMWORD PTR [esp+16]
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm0
+ punpckhwd xmm4, xmm0
+ movdqa xmm0, xmm2
+ punpcklwd xmm0, xmm1
+ movdqa XMMWORD PTR [esp+128], xmm0
+ movdqa xmm0, XMMWORD PTR [esp+144]
+ punpcklwd xmm6, xmm0
+
+;;; r6 = _mm_unpackhi_epi16(p4, p6);
+
+ punpckhwd xmm2, xmm1
+
+;;; r7 = _mm_unpackhi_epi16(p5, p7);
+;;;
+;;; b0 = _mm_unpacklo_epi16(r0, r1);
+
+ movdqa xmm1, XMMWORD PTR [esp+64]
+ punpckhwd xmm3, xmm0
+ movdqa xmm0, xmm1
+ punpcklwd xmm0, xmm7
+
+;;; b1 = _mm_unpackhi_epi16(r0, r1);
+
+ punpckhwd xmm1, xmm7
+
+;;; b2 = _mm_unpacklo_epi16(r2, r3);
+
+ movdqa xmm7, xmm5
+ punpcklwd xmm7, xmm4
+
+;;; b3 = _mm_unpackhi_epi16(r2, r3);
+
+ punpckhwd xmm5, xmm4
+ movdqa XMMWORD PTR [esp+112], xmm5
+
+;;; b4 = _mm_unpacklo_epi16(r4, r5);
+
+ movdqa xmm5, XMMWORD PTR [esp+128]
+ movdqa xmm4, xmm5
+ punpcklwd xmm4, xmm6
+
+;;; b5 = _mm_unpackhi_epi16(r4, r5);
+
+ punpckhwd xmm5, xmm6
+
+;;; b6 = _mm_unpacklo_epi16(r6, r7);
+
+ movdqa xmm6, xmm2
+ punpcklwd xmm6, xmm3
+
+;;; b7 = _mm_unpackhi_epi16(r6, r7);
+
+ punpckhwd xmm2, xmm3
+
+;;;
+;;; p0 = _mm_unpacklo_epi64(b0, b4);
+
+ movdqa xmm3, xmm0
+ punpcklqdq xmm3, xmm4
+
+;;; p1 = _mm_unpackhi_epi64(b0, b4);
+
+ punpckhqdq xmm0, xmm4
+ movdqa XMMWORD PTR [esp+144], xmm0
+
+;;; p2 = _mm_unpacklo_epi64(b1, b5);
+;;; p3 = _mm_unpackhi_epi64(b1, b5);
+;;; p4 = _mm_unpacklo_epi64(b2, b6);
+;;; p5 = _mm_unpackhi_epi64(b2, b6);
+;;; p6 = _mm_unpacklo_epi64(b3, b7);
+
+ movdqa xmm0, XMMWORD PTR [esp+112]
+ movdqa xmm4, xmm1
+ punpcklqdq xmm4, xmm5
+ punpckhqdq xmm1, xmm5
+ movdqa XMMWORD PTR [esp+64], xmm1
+ movdqa xmm1, xmm7
+ movdqa xmm5, xmm0
+ punpcklqdq xmm1, xmm6
+ punpckhqdq xmm7, xmm6
+
+;;; p7 = _mm_unpackhi_epi64(b3, b7);
+;;;
+;;;
+;;; /* Vertical */
+;;;
+;;; a0 = _mm_add_epi16(p0, p4); // p0 + p4
+;;; a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+;;; r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+
+ movdqa xmm6, xmm4
+ psraw xmm6, 1
+ punpcklqdq xmm5, xmm2
+ punpckhqdq xmm0, xmm2
+ movdqa xmm2, xmm3
+ paddw xmm2, xmm1
+ psubw xmm3, xmm1
+
+;;; a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+
+ movdqa xmm1, xmm5
+
+;;; r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+
+ psraw xmm5, 1
+ psubw xmm1, xmm6
+
+;;; a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+ paddw xmm4, xmm5
+
+;;;
+;;; b0 = _mm_add_epi16(a0, a3); // a0 + a3;
+
+ movdqa xmm5, xmm2
+
+;;; b2 = _mm_sub_epi16(a1, a2); // a1 - a2;
+
+ movdqa xmm6, xmm3
+ paddw xmm5, xmm4
+ psubw xmm6, xmm1
+ movdqa XMMWORD PTR [esp+128], xmm6
+
+;;; b4 = _mm_add_epi16(a1, a2); // a1 + a2;
+;;; b6 = _mm_sub_epi16(a0, a3); // a0 - a3;
+;;;
+;;; //-p3 + p5 - p7 - (p7 >> 1);
+;;; r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+;;; a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+
+ movdqa xmm6, XMMWORD PTR [esp+64]
+ paddw xmm3, xmm1
+ movdqa XMMWORD PTR [esp+80], xmm3
+ psubw xmm2, xmm4
+ movdqa xmm1, xmm0
+ psraw xmm1, 1
+ movdqa xmm3, xmm7
+ movdqa XMMWORD PTR [esp+96], xmm2
+ psubw xmm3, xmm6
+
+;;; a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+
+ psubw xmm3, xmm0
+
+;;; a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+;;;
+;;; //p1 + p7 - p3 - (p3 >> 1);
+;;; r0 = _mm_srai_epi16(p3, 1); // (p3 >> 1)
+
+ movdqa xmm2, xmm6
+ psraw xmm2, 1
+ psubw xmm3, xmm1
+
+;;; a1 = _mm_add_epi16(p1, p7); // p1 + p7
+
+ movdqa xmm1, XMMWORD PTR [esp+144]
+ movdqa xmm4, xmm1
+ paddw xmm4, xmm0
+
+;;; a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+
+ psubw xmm4, xmm6
+
+;;; a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+ psubw xmm4, xmm2
+
+;;;
+;;; // -p1 + p7 + p5 + (p5 >> 1);
+;;; r0 = _mm_srai_epi16(p5, 1); // (p5 >> 1)
+
+ movdqa xmm2, xmm7
+ psraw xmm2, 1
+
+;;; a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+
+ psubw xmm0, xmm1
+
+;;; a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+
+ paddw xmm0, xmm7
+
+;;; a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+ paddw xmm0, xmm2
+
+;;;
+;;; // p3 + p5 + p1 + (p1 >> 1);
+;;; r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
+
+ movdqa xmm2, xmm1
+ psraw xmm2, 1
+
+;;; a3 = _mm_add_epi16(p3, p5); // p3+p5
+
+ paddw xmm6, xmm7
+
+;;; a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+;;; a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
+;;;
+;;; r0 = _mm_srai_epi16(a3, 2); // a3>>2
+;;; b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
+;;; r0 = _mm_srai_epi16(a2, 2); // a2>>2
+;;; b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
+;;; a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+;;; b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+;;; a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+;;; b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
+;;;
+;;; r0 = _mm_add_epi16(b0, b7); // b0 + b7;
+;;; r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+
+ movdqa xmm7, XMMWORD PTR [esp+128]
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ movdqa xmm1, xmm6
+ psraw xmm1, 2
+ movdqa xmm2, xmm0
+ paddw xmm1, xmm3
+ psraw xmm2, 2
+ paddw xmm2, xmm4
+ psraw xmm4, 2
+ psubw xmm0, xmm4
+ psraw xmm3, 2
+ psubw xmm6, xmm3
+ movdqa XMMWORD PTR [esp+64], xmm6
+ movdqa xmm3, xmm5
+
+;;; r2 = _mm_add_epi16(b4, b3); // b4 + b3;
+;;; r3 = _mm_add_epi16(b6, b1); // b6 + b1;
+;;; r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+;;; r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+;;; r6 = _mm_add_epi16(b2, b5); // b2 + b5;
+;;; r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+ psubw xmm5, XMMWORD PTR [esp+64]
+ paddw xmm3, xmm6
+ movdqa XMMWORD PTR [esp+144], xmm3
+ movdqa xmm3, xmm7
+ psubw xmm3, xmm0
+ movdqa XMMWORD PTR [esp+48], xmm3
+ movdqa xmm3, XMMWORD PTR [esp+80]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm2
+ psubw xmm3, xmm2
+
+;;;
+;;;
+;;; // add in prediction values
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
+;;; // (x + 32) >> 6
+;;; r0 = _mm_adds_epi16(r0, const32);
+
+ movdqa xmm2, XMMWORD PTR const32
+ movdqa XMMWORD PTR [esp+16], xmm4
+ movdqa xmm4, XMMWORD PTR [esp+96]
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm1
+ psubw xmm4, xmm1
+
+;;; r0 = _mm_srai_epi16(r0, 6);
+;;; r1 = _mm_adds_epi16(r1, const32);
+
+ movdqa xmm1, XMMWORD PTR [esp+48]
+ paddw xmm7, xmm0
+ movdqa xmm0, XMMWORD PTR [esp+144]
+ movdqa XMMWORD PTR [esp+128], xmm7
+
+;;; r1 = _mm_srai_epi16(r1, 6);
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+ movq xmm7, QWORD PTR [edx+16]
+ movdqa XMMWORD PTR [esp+32], xmm5
+ paddsw xmm0, xmm2
+ psraw xmm0, 6
+ paddsw xmm1, xmm2
+ pxor xmm2, xmm2
+ punpcklbw xmm7, xmm2
+ movq xmm5, QWORD PTR [edx]
+ punpcklbw xmm5, xmm2
+ psraw xmm1, 6
+
+;;; pred0 = _mm_adds_epi16(pred0, r0);
+;;; pred1 = _mm_adds_epi16(pred1, r1);
+
+ paddsw xmm7, xmm1
+ paddsw xmm5, xmm0
+
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+ packuswb xmm5, xmm7
+
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
+
+ movdqa xmm0, XMMWORD PTR [esp+32]
+ movdqa xmm2, XMMWORD PTR [esp+128]
+ movq QWORD PTR [ecx], xmm5
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+ psrldq xmm5, 8
+
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
+
+ movq QWORD PTR [ecx+16], xmm5
+
+;;;
+;;; /* --- */
+;;;
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
+
+ movq xmm1, QWORD PTR [edx+32]
+
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
+;;; // (x + 32) >> 6
+;;; r2 = _mm_adds_epi16(r2, const32);
+
+ movdqa xmm5, XMMWORD PTR [esp]
+	movdqa	XMMWORD PTR [esp+32], xmm0
+
+;;; r2 = _mm_srai_epi16(r2, 6);
+;;; r3 = _mm_adds_epi16(r3, const32);
+
+ paddsw xmm6, xmm5
+
+;;; r3 = _mm_srai_epi16(r3, 6);
+
+ psraw xmm6, 6
+
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+ pxor xmm7, xmm7
+ punpcklbw xmm1, xmm7
+ movdqa xmm0, XMMWORD PTR [esp+16]
+ paddsw xmm0, xmm5
+ psraw xmm0, 6
+
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+;;; pred0 = _mm_adds_epi16(pred0, r2);
+
+ paddsw xmm1, xmm0
+
+;;; pred1 = _mm_adds_epi16(pred1, r3);
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
+
+ movdqa xmm0, XMMWORD PTR [esp+32]
+ movq xmm5, QWORD PTR [edx+48]
+ punpcklbw xmm5, xmm7
+ paddsw xmm5, xmm6
+ packuswb xmm1, xmm5
+ movq QWORD PTR [ecx+32], xmm1
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+ psrldq xmm1, 8
+
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
+
+ movq QWORD PTR [ecx+48], xmm1
+
+;;;
+;;; /* --- */
+;;;
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
+
+ movq xmm7, QWORD PTR [edx+64]
+
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
+
+ movq xmm6, QWORD PTR [edx+80]
+
+;;; // (x + 32) >> 6
+;;; r4 = _mm_adds_epi16(r4, const32);
+;;; r4 = _mm_srai_epi16(r4, 6);
+;;; r5 = _mm_adds_epi16(r5, const32);
+;;; r5 = _mm_srai_epi16(r5, 6);
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+ pxor xmm5, xmm5
+ punpcklbw xmm7, xmm5
+
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+ punpcklbw xmm6, xmm5
+ movdqa xmm1, XMMWORD PTR [esp]
+ paddsw xmm4, xmm1
+ psraw xmm4, 6
+ paddsw xmm3, xmm1
+ psraw xmm3, 6
+
+;;; pred0 = _mm_adds_epi16(pred0, r4);
+
+ paddsw xmm7, xmm4
+
+;;; pred1 = _mm_adds_epi16(pred1, r5);
+
+ paddsw xmm6, xmm3
+
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+ packuswb xmm7, xmm6
+
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
+
+ movq QWORD PTR [ecx+64], xmm7
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+ psrldq xmm7, 8
+
+;;; _mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
+
+ movq QWORD PTR [ecx+80], xmm7
+
+
+;;;
+;;; /* --- */
+;;;
+;;; pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
+
+ movq xmm5, QWORD PTR [edx+96]
+
+;;; pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
+
+ movq xmm4, QWORD PTR [edx+112]
+
+;;; // (x + 32) >> 6
+;;; r6 = _mm_adds_epi16(r6, const32);
+;;; r6 = _mm_srai_epi16(r6, 6);
+;;; r7 = _mm_adds_epi16(r7, const32);
+;;; r7 = _mm_srai_epi16(r7, 6);
+;;; pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+ pxor xmm3, xmm3
+ punpcklbw xmm5, xmm3
+
+;;; pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+ punpcklbw xmm4, xmm3
+ movdqa xmm1, XMMWORD PTR [esp]
+ paddsw xmm2, xmm1
+ psraw xmm2, 6
+ paddsw xmm0, xmm1
+ psraw xmm0, 6
+
+;;; pred0 = _mm_adds_epi16(pred0, r6);
+
+ paddsw xmm5, xmm2
+
+;;; pred1 = _mm_adds_epi16(pred1, r7);
+
+ paddsw xmm4, xmm0
+
+;;;
+;;; pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+ packuswb xmm5, xmm4
+
+;;;
+;;; // store
+;;; _mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
+
+ movq QWORD PTR [ecx+96], xmm5
+
+;;; // TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; pred0 = _mm_srli_si128(pred0, 8);
+
+ psrldq xmm5, 8
+
+;;; _mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
+
+ movq QWORD PTR [ecx+112], xmm5
+ mov esp, ebp
+ pop ebp
+ ret
+ ALIGN 2
+_itrans8x8_sse2 ENDP
+
+
+END
\ No newline at end of file