Diffstat (limited to 'Src/h264dec/lcommon/src/transform.c')
-rw-r--r-- | Src/h264dec/lcommon/src/transform.c | 809 |
1 files changed, 809 insertions, 0 deletions
diff --git a/Src/h264dec/lcommon/src/transform.c b/Src/h264dec/lcommon/src/transform.c
new file mode 100644
index 00000000..617ca7c1
--- /dev/null
+++ b/Src/h264dec/lcommon/src/transform.c
@@ -0,0 +1,809 @@
+/*!
+***************************************************************************
+* \file transform.c
+*
+* \brief
+*    Transform functions
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Alexis Michael Tourapis
+* \date
+*    01. July 2007
+**************************************************************************
+*/
+#include "global.h"
+#include "transform.h"
+#include <emmintrin.h>
+
+void forward4x4(int **block, int **tblock, int pos_y, int pos_x)
+{
+  int i, ii;
+  int tmp[16];
+  int *pTmp = tmp, *pblock;
+  int p0,p1,p2,p3;
+  int t0,t1,t2,t3;
+
+  // Horizontal
+  for (i=pos_y; i < pos_y + BLOCK_SIZE; i++)
+  {
+    pblock = &block[i][pos_x];
+    p0 = *(pblock++);
+    p1 = *(pblock++);
+    p2 = *(pblock++);
+    p3 = *(pblock  );
+
+    t0 = p0 + p3;
+    t1 = p1 + p2;
+    t2 = p1 - p2;
+    t3 = p0 - p3;
+
+    *(pTmp++) = t0 + t1;
+    *(pTmp++) = (t3 << 1) + t2;
+    *(pTmp++) = t0 - t1;
+    *(pTmp++) = t3 - (t2 << 1);
+  }
+
+  // Vertical
+  for (i=0; i < BLOCK_SIZE; i++)
+  {
+    pTmp = tmp + i;
+    p0 = *pTmp;
+    p1 = *(pTmp += BLOCK_SIZE);
+    p2 = *(pTmp += BLOCK_SIZE);
+    p3 = *(pTmp += BLOCK_SIZE);
+
+    t0 = p0 + p3;
+    t1 = p1 + p2;
+    t2 = p1 - p2;
+    t3 = p0 - p3;
+
+    ii = pos_x + i;
+    tblock[pos_y    ][ii] = t0 + t1;
+    tblock[pos_y + 1][ii] = t2 + (t3 << 1);
+    tblock[pos_y + 2][ii] = t0 - t1;
+    tblock[pos_y + 3][ii] = t3 - (t2 << 1);
+  }
+}
+
+static void inverse4x4(const h264_short_block_t tblock, h264_short_block_t block, int pos_y, int pos_x)
+{
+  int i;
+  short tmp[16];
+  short *pTmp = tmp;
+  int p0,p1,p2,p3;
+  int t0,t1,t2,t3;
+
+  // Horizontal
+  for (i = 0; i < BLOCK_SIZE; i++)
+  {
+    t0 = tblock[i][0];
+    t1 = tblock[i][1];
+    t2 = tblock[i][2];
+    t3 = tblock[i][3];
+
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = (t1 >> 1) - t3;
+    p3 = t1 + (t3 >> 1);
+
+    *(pTmp++) = p0 + p3;
+    *(pTmp++) = p1 + p2;
+    *(pTmp++) = p1 - p2;
+    *(pTmp++) = p0 - p3;
+  }
+
+  // Vertical
+  for (i = 0; i < BLOCK_SIZE; i++)
+  {
+    pTmp = tmp + i;
+    t0 = *pTmp;
+    t1 = *(pTmp += BLOCK_SIZE);
+    t2 = *(pTmp += BLOCK_SIZE);
+    t3 = *(pTmp += BLOCK_SIZE);
+
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = (t1 >> 1) - t3;
+    p3 = t1 + (t3 >> 1);
+
+    block[0][i] = p0 + p3;
+    block[1][i] = p1 + p2;
+    block[2][i] = p1 - p2;
+    block[3][i] = p0 - p3;
+  }
+}
+
+#ifdef _M_IX86
+// benski> this exists just for conformance testing. not used in production code
+static void inverse4x4_sse2_x86(const h264_short_macroblock_t tblock, h264_short_macroblock_t block, int pos_y, int pos_x)
+{
+  __asm
+  {
+    mov edx, pos_y
+    shl edx, 4 // 16 step stride
+    add edx, pos_x
+    shl edx, 1 // * sizeof(short)
+
+    // eax: pointer to the start of tblock (offset by passed pos_y, pos_x)
+    mov eax, edx
+    add eax, tblock
+
+    // esi: results
+    mov esi, edx
+    add esi, block
+
+    // load 4x4 matrix
+    movq mm0, MMWORD PTR 0[eax]
+    movq mm1, MMWORD PTR 32[eax]
+    movq mm2, MMWORD PTR 64[eax]
+    movq mm3, MMWORD PTR 96[eax]
+
+    // rotate 4x4 matrix
+    movq mm4, mm0      // p0 = mm4 (copy)
+    punpcklwd mm0, mm2 // r0 = mm0
+    punpckhwd mm4, mm2 // r2 = mm4
+    movq mm5, mm1      // p1 = mm5 (copy)
+    punpcklwd mm1, mm3 // r1 = mm1
+    punpckhwd mm5, mm3 // r3 = mm5
+    movq mm6, mm0      // r0 = mm6 (copy)
+    punpcklwd mm0, mm1 // t0 = mm0
+    punpckhwd mm6, mm1 // t1 = mm6
+    movq mm1, mm4      // r2 = mm1 (copy)
+    punpcklwd mm1, mm5 // t2 = mm1
+    punpckhwd mm4, mm5 // t3 = mm4
+
+    /* register state:
+    mm0: t0
+    mm1: t2
+    mm2:
+    mm3:
+    mm4: t3
+    mm5:
+    mm6: t1
+    mm7:
+    */
+
+    /*
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = (t1 >> 1) - t3;
+    p3 = t1 + (t3 >> 1);
+    */
+    movq mm2, mm0  // mm2 = t0 (copy)
+    paddw mm0, mm1 // mm0 = p0
+    psubw mm2, mm1 // mm2 = p1, mm1 available
+    movq mm5, mm6  // mm5 = t1 (copy)
+    psraw mm5, 1   // mm5 = (t1 >> 1)
+    psubw mm5, mm4 // mm5 = p2
+    psraw mm4, 1   // mm4 = (t3 >> 1)
+    paddw mm6, mm4 // mm6 = p3
+
+    /* register state:
+    mm0: p0
+    mm1:
+    mm2: p1
+    mm3:
+    mm4:
+    mm5: p2
+    mm6: p3
+    mm7:
+    */
+
+    /*
+    *(pTmp++) = p0 + p3;
+    *(pTmp++) = p1 + p2;
+    *(pTmp++) = p1 - p2;
+    *(pTmp++) = p0 - p3;
+    */
+
+    movq mm3, mm0  // mm3 = p0 (copy)
+    paddw mm0, mm6 // mm0 = r0
+    movq mm1, mm2  // mm1 = p1 (copy)
+    paddw mm1, mm5 // mm1 = r1
+    psubw mm2, mm5 // mm2 = r2, mm5 available
+    psubw mm3, mm6 // mm3 = r3
+
+    /* register state:
+    mm0: r0
+    mm1: r1
+    mm2: r2
+    mm3: r3
+    mm4:
+    mm5:
+    mm6:
+    mm7:
+    */
+
+    // rotate 4x4 matrix to set up for vertical
+    movq mm4, mm0      // r0 = mm4 (copy)
+    punpcklwd mm0, mm2 // p0 = mm0
+    punpckhwd mm4, mm2 // p2 = mm4
+    movq mm5, mm1      // r1 = mm5 (copy)
+    punpcklwd mm1, mm3 // p1 = mm1
+    punpckhwd mm5, mm3 // p3 = mm5
+    movq mm6, mm0      // p0 = mm6 (copy)
+    punpcklwd mm0, mm1 // t0 = mm0
+    punpckhwd mm6, mm1 // t1 = mm6
+    movq mm1, mm4      // p2 = mm1 (copy)
+    punpcklwd mm1, mm5 // t2 = mm1
+    punpckhwd mm4, mm5 // t3 = mm4
+
+    /* register state:
+    mm0: t0
+    mm1: t2
+    mm2:
+    mm3:
+    mm4: t3
+    mm5:
+    mm6: t1
+    mm7:
+    */
+    /*
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = (t1 >> 1) - t3;
+    p3 = t1 + (t3 >> 1);
+    */
+    movq mm2, mm0  // mm2 = t0 (copy)
+    paddw mm0, mm1 // mm0 = p0
+    psubw mm2, mm1 // mm2 = p1, mm1 available
+    movq mm5, mm6  // mm5 = t1 (copy)
+    psraw mm5, 1   // mm5 = (t1 >> 1)
+    psubw mm5, mm4 // mm5 = p2
+    psraw mm4, 1   // mm4 = (t3 >> 1)
+    paddw mm6, mm4 // mm6 = p3
+
+    /* register state:
+    mm0: p0
+    mm1:
+    mm2: p1
+    mm3:
+    mm4:
+    mm5: p2
+    mm6: p3
+    mm7:
+    */
+
+    /*
+    *(pTmp++) = p0 + p3;
+    *(pTmp++) = p1 + p2;
+    *(pTmp++) = p1 - p2;
+    *(pTmp++) = p0 - p3;
+    */
+
+    movq mm3, mm0  // mm3 = p0 (copy)
+    paddw mm0, mm6 // mm0 = r0
+    movq mm1, mm2  // mm1 = p1 (copy)
+    paddw mm1, mm5 // mm1 = r1
+    psubw mm2, mm5 // mm2 = r2, mm5 available
+    psubw mm3, mm6 // mm3 = r3
+
+    /* register state:
+    mm0: r0
+    mm1: r1
+    mm2: r2
+    mm3: r3
+    mm4:
+    mm5:
+    mm6:
+    mm7:
+    */
+    movq MMWORD PTR 0[esi], mm0
+    movq MMWORD PTR 32[esi], mm1
+    movq MMWORD PTR 64[esi], mm2
+    movq MMWORD PTR 96[esi], mm3
+  }
+}
+#endif
+
+static void sample_reconstruct(h264_imgpel_macroblock_t curImg, const h264_imgpel_macroblock_t mpr, const h264_short_block_t tblock, int joff, int mb_x, int max_imgpel_value)
+{
+#ifdef _M_IX86
+  __asm
+  {
+    // mm0 : constant value 32
+    mov edx, 0x00200020
+    movd mm0, edx
+    punpckldq mm0, mm0
+
+    // ecx: y offset
+    mov ecx, joff
+    shl ecx, 4 // imgpel stuff is going to be 16 byte stride
+    add ecx, mb_x
+
+    // eax: curImg
+    mov eax, curImg
+    add eax, ecx
+
+    // edx: mpr
+    mov edx, mpr
+    add edx, ecx
+
+    // ecx: tblock (which is short, not byte)
+    mov ecx, tblock
+
+    // mm7: zero
+    pxor mm7, mm7
+
+    // load coefficients
+    movq mm1, MMWORD PTR 0[ecx]
+    movq mm2, MMWORD PTR 8[ecx]
+    movq mm3, MMWORD PTR 16[ecx]
+    movq mm4, MMWORD PTR 24[ecx]
+    paddw mm1, mm0 // rres + 32
+    paddw mm2, mm0 // rres + 32
+    paddw mm3, mm0 // rres + 32
+    paddw mm0, mm4 // rres + 32
+    psraw mm1, 6   // (rres + 32) >> 6
+    psraw mm2, 6   // (rres + 32) >> 6
+    psraw mm3, 6   // (rres + 32) >> 6
+    psraw mm0, 6   // (rres + 32) >> 6
+    // mm1-mm3: tblock[0] - tblock[2], mm0: tblock[3]
+
+    // convert mpr from unsigned char to short
+    movd mm4, DWORD PTR 0[edx]
+    movd mm5, DWORD PTR 16[edx]
+    movd mm6, DWORD PTR 32[edx]
+    punpcklbw mm4, mm7
+    punpcklbw mm5, mm7
+    punpcklbw mm6, mm7
+    paddsw mm4, mm1 // pred_row + rres_row
+    movd mm1, DWORD PTR 48[edx] // reuse mm1 for mpr[3]
+    paddsw mm5, mm2 // pred_row + rres_row
+    punpcklbw mm1, mm7
+    paddsw mm6, mm3 // pred_row + rres_row
+    paddsw mm1, mm0 // pred_row + rres_row
+    // results in mm4, mm5, mm6, mm1
+
+    // move back to 8 bit
+    packuswb mm4, mm7
+    packuswb mm5, mm7
+    packuswb mm6, mm7
+    packuswb mm1, mm7
+    movd DWORD PTR 0[eax], mm4
+    movd DWORD PTR 16[eax], mm5
+    movd DWORD PTR 32[eax], mm6
+    movd DWORD PTR 48[eax], mm1
+  }
+#else
+  int i, j;
+
+  for (j = 0; j < BLOCK_SIZE; j++)
+  {
+    for (i=0;i<BLOCK_SIZE;i++)
+      curImg[j+joff][mb_x+i] = (imgpel) iClip1( max_imgpel_value, rshift_rnd_sf(tblock[j][i], DQ_BITS) + mpr[j+joff][mb_x+i]);
+  }
+#endif
+}
+
+#if defined(_M_IX86) && defined(_DEBUG)
+void itrans4x4_sse2(const h264_short_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+  __declspec(align(32)) static const short const32[4] = {32, 32, 32, 32};
+  __asm
+  {
+    mov edx, pos_y
+    shl edx, 4 // imgpel stuff is going to be 16 byte stride
+    add edx, pos_x
+
+    // eax: tblock
+    lea eax, [edx*2]
+    add eax, tblock
+
+    // ecx: mpr
+    mov ecx, mb_pred
+    add ecx, edx
+
+    // edx: results
+    add edx, mb_rec
+
+    // load 4x4 matrix
+    movq mm0, MMWORD PTR 0[eax]
+    movq mm1, MMWORD PTR 32[eax]
+    movq mm2, MMWORD PTR 64[eax]
+    movq mm3, MMWORD PTR 96[eax]
+
+    // rotate 4x4 matrix
+    movq mm4, mm0      // p0 = mm4 (copy)
+    punpcklwd mm0, mm2 // r0 = mm0
+    punpckhwd mm4, mm2 // r2 = mm4
+    movq mm5, mm1      // p1 = mm5 (copy)
+    punpcklwd mm1, mm3 // r1 = mm1
+    punpckhwd mm5, mm3 // r3 = mm5
+    movq mm6, mm0      // r0 = mm6 (copy)
+    punpcklwd mm0, mm1 // t0 = mm0
+    punpckhwd mm6, mm1 // t1 = mm6
+    movq mm1, mm4      // r2 = mm1 (copy)
+    punpcklwd mm1, mm5 // t2 = mm1
+    punpckhwd mm4, mm5 // t3 = mm4
+
+    /* register state:
+    mm0: t0
+    mm1: t2
+    mm2:
+    mm3:
+    mm4: t3
+    mm5:
+    mm6: t1
+    mm7:
+    */
+
+    /*
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = (t1 >> 1) - t3;
+    p3 = t1 + (t3 >> 1);
+    */
+    movq mm2, mm0  // mm2 = t0 (copy)
+    paddw mm0, mm1 // mm0 = p0
+    psubw mm2, mm1 // mm2 = p1, mm1 available
+    movq mm5, mm6  // mm5 = t1 (copy)
+    psraw mm5, 1   // mm5 = (t1 >> 1)
+    psubw mm5, mm4 // mm5 = p2
+    psraw mm4, 1   // mm4 = (t3 >> 1)
+    paddw mm6, mm4 // mm6 = p3
+
+    /* register state:
+    mm0: p0
+    mm1:
+    mm2: p1
+    mm3:
+    mm4:
+    mm5: p2
+    mm6: p3
+    mm7:
+    */
+
+    /*
+    *(pTmp++) = p0 + p3;
+    *(pTmp++) = p1 + p2;
+    *(pTmp++) = p1 - p2;
+    *(pTmp++) = p0 - p3;
+    */
+
+    movq mm3, mm0  // mm3 = p0 (copy)
+    paddw mm0, mm6 // mm0 = r0
+    movq mm1, mm2  // mm1 = p1 (copy)
+    paddw mm1, mm5 // mm1 = r1
+    psubw mm2, mm5 // mm2 = r2, mm5 available
+    psubw mm3, mm6 // mm3 = r3
+
+    /* register state:
+    mm0: r0
+    mm1: r1
+    mm2: r2
+    mm3: r3
+    mm4:
+    mm5:
+    mm6:
+    mm7:
+    */
+
+    // rotate 4x4 matrix to set up for vertical
+    movq mm4, mm0      // r0 = mm4 (copy)
+    punpcklwd mm0, mm2 // p0 = mm0
+    punpckhwd mm4, mm2 // p2 = mm4
+    movq mm5, mm1      // r1 = mm5 (copy)
+    punpcklwd mm1, mm3 // p1 = mm1
+    punpckhwd mm5, mm3 // p3 = mm5
+    movq mm6, mm0      // p0 = mm6 (copy)
+    punpcklwd mm0, mm1 // t0 = mm0
+    punpckhwd mm6, mm1 // t1 = mm6
+    movq mm1, mm4      // p2 = mm1 (copy)
+    punpcklwd mm1, mm5 // t2 = mm1
+    punpckhwd mm4, mm5 // t3 = mm4
+
+    /* register state:
+    mm0: t0
+    mm1: t2
+    mm2:
+    mm3:
+    mm4: t3
+    mm5:
+    mm6: t1
+    mm7:
+    */
+    /*
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = (t1 >> 1) - t3;
+    p3 = t1 + (t3 >> 1);
+    */
+    movq mm2, mm0  // mm2 = t0 (copy)
+    paddw mm0, mm1 // mm0 = p0
+    psubw mm2, mm1 // mm2 = p1, mm1 available
+    movq mm5, mm6  // mm5 = t1 (copy)
+    psraw mm5, 1   // mm5 = (t1 >> 1)
+    psubw mm5, mm4 // mm5 = p2
+    psraw mm4, 1   // mm4 = (t3 >> 1)
+    paddw mm6, mm4 // mm6 = p3
+
+    /* register state:
+    mm0: p0
+    mm1:
+    mm2: p1
+    mm3:
+    mm4:
+    mm5: p2
+    mm6: p3
+    mm7:
+    */
+
+    /*
+    *(pTmp++) = p0 + p3;
+    *(pTmp++) = p1 + p2;
+    *(pTmp++) = p1 - p2;
+    *(pTmp++) = p0 - p3;
+    */
+
+    movq mm3, mm0  // mm3 = p0 (copy)
+    paddw mm0, mm6 // mm0 = r0
+    movq mm1, mm2  // mm1 = p1 (copy)
+    paddw mm1, mm5 // mm1 = r1
+    psubw mm2, mm5 // mm2 = r2, mm5 available
+    psubw mm3, mm6 // mm3 = r3
+
+    /* register state:
+    mm0: r0
+    mm1: r1
+    mm2: r2
+    mm3: r3
+    mm4:
+    mm5:
+    mm6:
+    mm7:
+    */
+    /* --- 4x4 iDCT done, now time to combine with mpr --- */
+    // mm7 : constant value 32
+    movq mm7, const32
+
+    paddw mm0, mm7 // rres + 32
+    psraw mm0, 6   // (rres + 32) >> 6
+    paddw mm1, mm7 // rres + 32
+    psraw mm1, 6   // (rres + 32) >> 6
+    paddw mm2, mm7 // rres + 32
+    psraw mm2, 6   // (rres + 32) >> 6
+    paddw mm3, mm7 // rres + 32
+    psraw mm3, 6   // (rres + 32) >> 6
+
+    pxor mm7, mm7
+
+    // convert mpr from unsigned char to short
+    movd mm4, DWORD PTR 0[ecx]
+    movd mm5, DWORD PTR 16[ecx]
+    movd mm6, DWORD PTR 32[ecx]
+    punpcklbw mm4, mm7
+    punpcklbw mm5, mm7
+    punpcklbw mm6, mm7
+    paddsw mm4, mm0 // pred_row + rres_row
+    movd mm0, DWORD PTR 48[ecx] // reuse mm0 for mpr[3]
+    paddsw mm5, mm1 // pred_row + rres_row
+    punpcklbw mm0, mm7
+    paddsw mm6, mm2 // pred_row + rres_row
+    paddsw mm0, mm3 // pred_row + rres_row
+    // results in mm4, mm5, mm6, mm0
+
+    // move back to 8 bit
+    packuswb mm4, mm7
+    packuswb mm5, mm7
+    packuswb mm6, mm7
+    packuswb mm0, mm7
+    movd DWORD PTR 0[edx], mm4
+    movd DWORD PTR 16[edx], mm5
+    movd DWORD PTR 32[edx], mm6
+    movd DWORD PTR 48[edx], mm0
+  }
+}
+#elif defined(_M_X64)
+static void itrans4x4_sse2(const h264_int_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+  __declspec(align(32)) static const int const32[4] = {32, 32, 32, 32};
+  __m128i p0,p1,p2,p3;
+  __m128i t0,t1,t2,t3;
+  __m128i r0,r1,r2,r3;
+  __m128i c32, zero;
+
+  // horizontal
+  // load registers in vertical mode, we'll rotate them next
+  p0 = _mm_loadu_si128((__m128i *)&tblock[pos_y][pos_x]);   // 00 01 02 03
+  p1 = _mm_loadu_si128((__m128i *)&tblock[pos_y+1][pos_x]); // 10 11 12 13
+  p2 = _mm_loadu_si128((__m128i *)&tblock[pos_y+2][pos_x]); // 20 21 22 23
+  p3 = _mm_loadu_si128((__m128i *)&tblock[pos_y+3][pos_x]); // 30 31 32 33
+
+  // rotate 4x4 matrix
+  r0 = _mm_unpacklo_epi32(p0, p2); // 00 20 01 21
+  r1 = _mm_unpacklo_epi32(p1, p3); // 10 30 11 31
+  r2 = _mm_unpackhi_epi32(p0, p2); // 02 22 03 23
+  r3 = _mm_unpackhi_epi32(p1, p3); // 12 32 13 33
+  t0 = _mm_unpacklo_epi32(r0, r1); // 00 10 20 30
+  t1 = _mm_unpackhi_epi32(r0, r1); // 01 11 21 31
+  t2 = _mm_unpacklo_epi32(r2, r3); // 02 12 22 32
+  t3 = _mm_unpackhi_epi32(r2, r3); // 03 13 23 33
+
+  p0 = _mm_add_epi32(t0, t2);  // t0 + t2;
+  p1 = _mm_sub_epi32(t0, t2);  // t0 - t2;
+  p2 = _mm_srai_epi32(t1, 1);  // t1 >> 1
+  p2 = _mm_sub_epi32(p2, t3);  // (t1 >> 1) - t3;
+  p3 = _mm_srai_epi32(t3, 1);  // (t3 >> 1)
+  p3 = _mm_add_epi32(p3, t1);  // t1 + (t3 >> 1);
+
+  t0 = _mm_add_epi32(p0, p3);  // p0 + p3;
+  t1 = _mm_add_epi32(p1, p2);  // p1 + p2;
+  t2 = _mm_sub_epi32(p1, p2);  // p1 - p2;
+  t3 = _mm_sub_epi32(p0, p3);  // p0 - p3;
+
+  // rotate 4x4 matrix to set up for vertical
+  r0 = _mm_unpacklo_epi32(t0, t2);
+  r1 = _mm_unpacklo_epi32(t1, t3);
+  r2 = _mm_unpackhi_epi32(t0, t2);
+  r3 = _mm_unpackhi_epi32(t1, t3);
+  t0 = _mm_unpacklo_epi32(r0, r1);
+  t1 = _mm_unpackhi_epi32(r0, r1);
+  t2 = _mm_unpacklo_epi32(r2, r3);
+  t3 = _mm_unpackhi_epi32(r2, r3);
+
+  // vertical
+  p0 = _mm_add_epi32(t0, t2);  // t0 + t2;
+  p3 = _mm_srai_epi32(t3, 1);  // (t3 >> 1)
+  p3 = _mm_add_epi32(p3, t1);  // t1 + (t3 >> 1);
+  r0 = _mm_add_epi32(p0, p3);  // p0 + p3;
+  r3 = _mm_sub_epi32(p0, p3);  // p0 - p3;
+  p1 = _mm_sub_epi32(t0, t2);  // t0 - t2;
+  p2 = _mm_srai_epi32(t1, 1);  // t1 >> 1
+  p2 = _mm_sub_epi32(p2, t3);  // (t1 >> 1) - t3;
+  r1 = _mm_add_epi32(p1, p2);  // p1 + p2;
+  r2 = _mm_sub_epi32(p1, p2);  // p1 - p2;
+
+  c32 = _mm_load_si128((const __m128i *)const32);
+  zero = _mm_setzero_si128();
+
+  // (x + 32) >> 6
+  r0 = _mm_add_epi32(r0, c32);
+  r0 = _mm_srai_epi32(r0, 6);
+  r1 = _mm_add_epi32(r1, c32);
+  r1 = _mm_srai_epi32(r1, 6);
+  r2 = _mm_add_epi32(r2, c32);
+  r2 = _mm_srai_epi32(r2, 6);
+  r3 = _mm_add_epi32(r3, c32);
+  r3 = _mm_srai_epi32(r3, 6);
+
+  // convert to 16bit values
+  r0 = _mm_packs_epi32(r0, r1);
+  r2 = _mm_packs_epi32(r2, r3);
+
+  // convert mpr from unsigned char to short
+  p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y][pos_x]);
+  p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+1][pos_x]);
+  p0 = _mm_unpacklo_epi32(p0, p1);
+  p0 = _mm_unpacklo_epi8(p0, zero); // convert to short
+  r0 = _mm_add_epi16(r0, p0);
+
+  p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+2][pos_x]);
+  p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+3][pos_x]);
+  p0 = _mm_unpacklo_epi32(p0, p1);
+  p0 = _mm_unpacklo_epi8(p0, zero); // convert to short
+  r2 = _mm_add_epi16(r2, p0);
+
+  r0 = _mm_packus_epi16(r0, r2); // convert to unsigned char
+  *(int32_t *)&mb_rec[pos_y][pos_x] = _mm_cvtsi128_si32(r0);
+  r0 = _mm_srli_si128(r0, 4);
+  *(int32_t *)&mb_rec[pos_y+1][pos_x] = _mm_cvtsi128_si32(r0);
+  r0 = _mm_srli_si128(r0, 4);
+  *(int32_t *)&mb_rec[pos_y+2][pos_x] = _mm_cvtsi128_si32(r0);
+  r0 = _mm_srli_si128(r0, 4);
+  *(int32_t *)&mb_rec[pos_y+3][pos_x] = _mm_cvtsi128_si32(r0);
+}
+#endif
+
+void itrans4x4_c(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+  inverse4x4(tblock, (h264_short_block_row_t *)tblock, pos_y, pos_x);
+  sample_reconstruct(mb_rec, mb_pred, tblock, pos_y, pos_x, 255);
+}
+
+void ihadamard4x4(int block[4][4])
+{
+  int i;
+  int tmp[16];
+  int *pTmp = tmp;
+  int p0,p1,p2,p3;
+  int t0,t1,t2,t3;
+
+  // Horizontal
+  for (i = 0; i < BLOCK_SIZE; i++)
+  {
+    t0 = block[i][0];
+    t1 = block[i][1];
+    t2 = block[i][2];
+    t3 = block[i][3];
+
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = t1 - t3;
+    p3 = t1 + t3;
+
+    *(pTmp++) = p0 + p3;
+    *(pTmp++) = p1 + p2;
+    *(pTmp++) = p1 - p2;
+    *(pTmp++) = p0 - p3;
+  }
+
+  // Vertical
+  for (i = 0; i < BLOCK_SIZE; i++)
+  {
+    pTmp = tmp + i;
+    t0 = *pTmp;
+    t1 = *(pTmp += BLOCK_SIZE);
+    t2 = *(pTmp += BLOCK_SIZE);
+    t3 = *(pTmp += BLOCK_SIZE);
+
+    p0 = t0 + t2;
+    p1 = t0 - t2;
+    p2 = t1 - t3;
+    p3 = t1 + t3;
+
+    block[0][i] = p0 + p3;
+    block[1][i] = p1 + p2;
+    block[2][i] = p1 - p2;
+    block[3][i] = p0 - p3;
+  }
+}
+
+void ihadamard4x2(int **tblock, int **block)
+{
+  int i;
+  int tmp[8];
+  int *pTmp = tmp;
+  int p0,p1,p2,p3;
+  int t0,t1,t2,t3;
+
+  // Horizontal
+  *(pTmp++) = tblock[0][0] + tblock[1][0];
+  *(pTmp++) = tblock[0][1] + tblock[1][1];
+  *(pTmp++) = tblock[0][2] + tblock[1][2];
+  *(pTmp++) = tblock[0][3] + tblock[1][3];
+
+  *(pTmp++) = tblock[0][0] - tblock[1][0];
+  *(pTmp++) = tblock[0][1] - tblock[1][1];
+  *(pTmp++) = tblock[0][2] - tblock[1][2];
+  *(pTmp  ) = tblock[0][3] - tblock[1][3];
+
+  // Vertical
+  pTmp = tmp;
+  for (i = 0; i < 2; i++)
+  {
+    p0 = *(pTmp++);
+    p1 = *(pTmp++);
+    p2 = *(pTmp++);
+    p3 = *(pTmp++);
+
+    t0 = p0 + p2;
+    t1 = p0 - p2;
+    t2 = p1 - p3;
+    t3 = p1 + p3;
+
+    // coefficients (transposed)
+    block[0][i] = t0 + t3;
+    block[1][i] = t1 + t2;
+    block[2][i] = t1 - t2;
+    block[3][i] = t0 - t3;
+  }
+}
+
+// The following function performs 8 additions and 8 assignments; it should be a bit faster.
+void ihadamard2x2(int tblock[4], int block[4])
+{
+  int t0,t1,t2,t3;
+
+  t0 = tblock[0] + tblock[1];
+  t1 = tblock[0] - tblock[1];
+  t2 = tblock[2] + tblock[3];
+  t3 = tblock[2] - tblock[3];
+
+  block[0] = (t0 + t2);
+  block[1] = (t1 + t3);
+  block[2] = (t0 - t2);
+  block[3] = (t1 - t3);
+}
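For anyone who wants to sanity-check the arithmetic without building the decoder, below is a minimal standalone sketch of the same inverse-4x4 butterfly and (x + 32) >> 6 reconstruction that itrans4x4_c performs. It deliberately uses plain fixed-size arrays in place of the project's h264_short_block_t / h264_imgpel_macroblock_t typedefs, a hard-coded clip to [0, 255], and an invented DC-only test block with a flat prediction of 100; those choices are illustrative assumptions, not part of the decoder. Because a DC-only coefficient block comes out flat after the inverse transform, every reconstructed sample should equal pred + ((dc + 32) >> 6).

/* inverse4x4_sketch.c - standalone illustration, not decoder code */
#include <stdio.h>

static int clip255(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

/* Same horizontal/vertical butterfly as inverse4x4() above, on plain arrays. */
static void inverse4x4_ref(const short in[4][4], short out[4][4])
{
    short tmp[4][4];
    int i, t0, t1, t2, t3, p0, p1, p2, p3;

    for (i = 0; i < 4; i++)  /* horizontal */
    {
        t0 = in[i][0]; t1 = in[i][1]; t2 = in[i][2]; t3 = in[i][3];
        p0 = t0 + t2;           p1 = t0 - t2;
        p2 = (t1 >> 1) - t3;    p3 = t1 + (t3 >> 1);
        tmp[i][0] = (short)(p0 + p3); tmp[i][1] = (short)(p1 + p2);
        tmp[i][2] = (short)(p1 - p2); tmp[i][3] = (short)(p0 - p3);
    }
    for (i = 0; i < 4; i++)  /* vertical */
    {
        t0 = tmp[0][i]; t1 = tmp[1][i]; t2 = tmp[2][i]; t3 = tmp[3][i];
        p0 = t0 + t2;           p1 = t0 - t2;
        p2 = (t1 >> 1) - t3;    p3 = t1 + (t3 >> 1);
        out[0][i] = (short)(p0 + p3); out[1][i] = (short)(p1 + p2);
        out[2][i] = (short)(p1 - p2); out[3][i] = (short)(p0 - p3);
    }
}

int main(void)
{
    short coef[4][4] = {{640, 0, 0, 0}};   /* dequantized DC-only block (assumed input) */
    short rres[4][4];
    unsigned char pred[4][4], rec[4][4];
    int i, j;

    for (j = 0; j < 4; j++)
        for (i = 0; i < 4; i++)
            pred[j][i] = 100;              /* flat prediction, chosen for illustration */

    inverse4x4_ref(coef, rres);
    for (j = 0; j < 4; j++)
        for (i = 0; i < 4; i++)            /* same rounding/clip as sample_reconstruct's C path */
            rec[j][i] = (unsigned char)clip255(((rres[j][i] + 32) >> 6) + pred[j][i]);

    for (j = 0; j < 4; j++, putchar('\n'))
        for (i = 0; i < 4; i++)
            printf("%4d", rec[j][i]);
    return 0;
}

Compiled and run, the sketch prints a 4x4 block of 110, i.e. 100 + ((640 + 32) >> 6), which matches the flat-block property of a DC-only residual.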