103 files changed, 58239 insertions, 0 deletions
diff --git a/Src/h264dec/dec_api.c b/Src/h264dec/dec_api.c
new file mode 100644
index 00000000..aaaeb46a
--- /dev/null
+++ b/Src/h264dec/dec_api.c
@@ -0,0 +1,393 @@
+#include "dec_api.h"
+#include "global.h"
+#include "nalu.h"
+#include "image.h"
+#include "meminput.h"
+#include "output.h"
+#include "fmo.h"
+#include "erc_api.h"
+#include "parset.h"
+#include "memcache.h"
+#include "block.h"
+#include "optim.h"
+#include "mc_prediction.h"
+#include "vlc.h"
+#include <stddef.h> // for offsetof
+
+// Compile-time assertion: when `condition` is non-zero the array size becomes 1 - 2 = -1,
+// which is a compile error; when it is zero the size is 1 and the sizeof is discarded.
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+// Table of function pointers selected by H264_Init() according to the detected CPU features.
+OptimizedFunctions opt;
+
+// Forward declarations of helpers used below; none of them is defined in this file.
+DecoderParams *alloc_decoder();
+void Configure(VideoParameters *p_Vid, InputParameters *p_Inp);
+void malloc_slice(InputParameters *p_Inp, VideoParameters *p_Vid);
+void init        (VideoParameters *p_Vid);
+void free_slice  (Slice *currSlice);
+void free_img( VideoParameters *p_Vid);
+
+// CPU feature flags filled in by H264_Init(). On 32-bit x86 they hold the raw masked CPUID bit
+// (non-zero == present, not normalised to 1); on x64 only sse2_flag is set (to 1).
+int sse2_flag = 0, mmx_flag=0, sse_flag=0, sse3_flag=0, sse4_1_flag=0;
+/*
+ * One-time library initialisation.
+ *
+ * Detects the SIMD level of the CPU, stores it in the global *_flag variables and fills the
+ * global `opt` function table with matching implementations.
+ *
+ * Returns 1 when `opt` was populated (SSE2, or SSE together with MMX, is available) and
+ * 0 otherwise; in the 0 case `opt` is left untouched.
+ *
+ * On builds that define neither _M_IX86 nor _M_X64 no flag is ever set, so this returns 0.
+ */
+int H264_Init()
+{
+	int flags_edx, flags_ecx;
+	#ifdef H264_IPP
+	ippStaticInit();
+	#endif
+
+#ifdef _M_IX86
+	 // CPUID leaf 1: feature bits are returned in EDX and ECX.
+	 _asm {
+		 mov eax, 1
+			 cpuid
+			 mov flags_edx, edx
+			 mov flags_ecx, ecx
+	 }
+	 mmx_flag = flags_edx & 0x00800000;   // EDX bit 23
+	 sse_flag = flags_edx & 0x02000000;   // EDX bit 25
+	 sse2_flag = flags_edx & 0x04000000;  // EDX bit 26
+	 sse3_flag = flags_ecx & 0x00000001;  // ECX bit 0
+	 sse4_1_flag= flags_ecx & (1 << 19);  // ECX bit 19
+	 
+#elif defined(_M_X64)
+	 // No CPUID query on x64: SSE2 is simply assumed, the other flags stay 0.
+	 sse2_flag = 1;
+#endif
+
+#ifdef _M_IX86
+	 /* if you get any compile errors here, you need to change biari.asm */
+	 // The offsets below are compile-time checks only (see BUILD_BUG_ON); they generate no code
+	 // and are evaluated on 32-bit x86 builds only.
+	 BUILD_BUG_ON(offsetof(TextureInfoContexts, map_contexts) != 436);
+	 BUILD_BUG_ON(offsetof(TextureInfoContexts, last_contexts) != 3252);
+	 BUILD_BUG_ON(offsetof(TextureInfoContexts, one_contexts) != 6068);	 
+	 BUILD_BUG_ON(offsetof(TextureInfoContexts, abs_contexts) != 6508);	 	 
+	 
+	 BUILD_BUG_ON(offsetof(Macroblock, p_Slice) != 0);
+	 BUILD_BUG_ON(offsetof(Macroblock, p_Vid) != 4);	 
+ 	 BUILD_BUG_ON(offsetof(Macroblock, qp) != 60);	 
+	 BUILD_BUG_ON(offsetof(Macroblock, qpc) != 64);	 
+	 BUILD_BUG_ON(offsetof(Macroblock, qp_scaled) != 72);	 	 
+	 BUILD_BUG_ON(offsetof(Macroblock, cbp_blk) != 248);	 
+	 BUILD_BUG_ON(offsetof(Macroblock, mb_field) != 344);
+	 BUILD_BUG_ON(offsetof(Macroblock, read_and_store_CBP_block_bit) != 400);	 
+	  
+	 BUILD_BUG_ON(offsetof(Slice, tex_ctx) != 100);	
+	 BUILD_BUG_ON(offsetof(Slice, mb_rec) != 1696);	
+	 BUILD_BUG_ON(offsetof(Slice, mb_pred) != 928);	
+	 BUILD_BUG_ON(offsetof(Slice, coeff) != 15632);	 
+	 BUILD_BUG_ON(offsetof(Slice, coeff_ctr) != 15760);	 
+	 BUILD_BUG_ON(offsetof(Slice, pos) != 15764);	 
+	 BUILD_BUG_ON(offsetof(Slice, cof) != 2464);	 
+	 BUILD_BUG_ON(offsetof(Slice, last_dquant) != 88);	 
+	 BUILD_BUG_ON(offsetof(Slice, mot_ctx) != 96);	 
+	 BUILD_BUG_ON(offsetof(Slice, slice_type) != 64);	 
+	 
+	 
+	 BUILD_BUG_ON(offsetof(StorablePicture, structure) != 0);
+	 BUILD_BUG_ON(offsetof(StorablePicture, chroma_qp_offset) != 158688);	 	 
+	 BUILD_BUG_ON(offsetof(StorablePicture, motion) != 158524);
+	 BUILD_BUG_ON(offsetof(StorablePicture, plane_images) != 158512);
+	 BUILD_BUG_ON(offsetof(StorablePicture, imgY) != 158512);
+
+	 
+	 BUILD_BUG_ON(offsetof(VideoParameters, structure) != 697200);
+	 BUILD_BUG_ON(offsetof(VideoParameters, bitdepth_chroma_qp_scale) != 697456);
+	 BUILD_BUG_ON(offsetof(VideoParameters, dec_picture) != 698192);
+
+	 BUILD_BUG_ON(offsetof(DecodingEnvironment, Dcodestrm_len) != 16);
+	 BUILD_BUG_ON(offsetof(DecodingEnvironment, Dcodestrm) != 12);
+	 BUILD_BUG_ON(offsetof(DecodingEnvironment, DbitsLeft) != 8);
+	 BUILD_BUG_ON(offsetof(DecodingEnvironment, Dvalue) != 4);
+	 BUILD_BUG_ON(offsetof(DecodingEnvironment, Drange) != 0);
+
+	 BUILD_BUG_ON(sizeof(BiContextType) != 4);
+	 BUILD_BUG_ON(offsetof(BiContextType, state) != 0);
+	 BUILD_BUG_ON(offsetof(BiContextType, MPS) != 2);
+
+	BUILD_BUG_ON(offsetof(OptimizedFunctions, copy_image_data_16x16_stride) != 32);
+#endif
+
+	 // Preferred path: SSE2 implementations for everything this function assigns.
+	 if (sse2_flag)
+	 {
+		 //opt.itrans4x4 = itrans4x4_mmx;
+		 opt.itrans8x8 = itrans8x8_sse2;
+		 opt.weighted_mc_prediction16x16 = weighted_mc_prediction16x16_sse2;
+		 opt.weighted_mc_prediction16x8 = weighted_mc_prediction16x8_sse2;
+		 opt.weighted_mc_prediction8x8 = weighted_mc_prediction8x8_sse2;
+
+		 opt.weighted_bi_prediction16x16 = weighted_bi_prediction16x16_sse2;
+		 opt.weighted_bi_prediction16x8 = weighted_bi_prediction16x8_sse2;
+		 opt.weighted_bi_prediction8x8 = weighted_bi_prediction8x8_sse2;
+		 
+		 opt.bi_prediction8x8 = bi_prediction8x8_sse2;
+		 opt.copy_image_data_16x16_stride = copy_image_data_16x16_stride_sse;
+		 opt.code_from_bitstream_2d_5_4 = code_from_bitstream_2d_5_4_sse2;
+		 opt.code_from_bitstream_2d_17_4 = code_from_bitstream_2d_17_4_sse2;
+		 opt.code_from_bitstream_2d_16_1 = code_from_bitstream_2d_16_1_sse2;
+	 }
+	 // Fallback path: no SSE2, but SSE and MMX; uses the _c and _ipp variants plus the SSE copy.
+	 else if (sse_flag && mmx_flag)
+	 {
+		 //opt.itrans4x4 = itrans4x4_mmx;
+		 opt.itrans8x8 = itrans8x8_c;//itrans8x8_mmx;
+
+		 opt.weighted_mc_prediction16x16 = weighted_mc_prediction16x16_ipp;
+		 opt.weighted_mc_prediction16x8 = weighted_mc_prediction16x8_ipp;
+		 opt.weighted_mc_prediction8x8 = weighted_mc_prediction8x8_ipp;
+
+		 opt.weighted_bi_prediction16x16 = weighted_bi_prediction16x16_ipp;
+		 opt.weighted_bi_prediction16x8 = weighted_bi_prediction16x8_ipp;
+		 opt.weighted_bi_prediction8x8 = weighted_bi_prediction8x8_ipp;
+		 
+		 opt.bi_prediction8x8 = bi_prediction8x8_ipp;
+		 opt.copy_image_data_16x16_stride = copy_image_data_16x16_stride_sse;
+		 opt.code_from_bitstream_2d_5_4 = code_from_bitstream_2d_5_4_c;
+		 opt.code_from_bitstream_2d_17_4 = code_from_bitstream_2d_17_4_c;
+		 opt.code_from_bitstream_2d_16_1 = code_from_bitstream_2d_16_1_c;
+	 }
+	 else
+		 return 0; // unsupported CPU: `opt` has not been filled in
+
+	 return 1;
+}
+
+/*
+ * Allocates a decoder instance and runs its start-up sequence.
+ *
+ * Returns the new handle, or a null handle when alloc_decoder() fails.
+ */
+h264_decoder_t H264_CreateDecoder()
+{
+	DecoderParams *dec = alloc_decoder();
+	VideoParameters *vid;
+	InputParameters *inp;
+
+	if (!dec)
+		return dec;
+
+	vid = dec->p_Vid;
+	inp = dec->p_Inp;
+
+	Configure(vid, inp);
+	inp->intra_profile_deblocking = 1;
+
+	initBitsFile(vid);
+
+	malloc_slice(inp, vid);
+	init_old_slice(vid->old_slice);
+
+	init(vid);
+
+	init_out_buffer(vid);
+
+	// Deliberately impossible value, handy when debugging; the real one comes from the slice header.
+	vid->current_mb_nr = -4711;
+
+	return dec;
+}
+
+/*
+ * Tears down a decoder created by H264_CreateDecoder() and frees the handle itself.
+ * A null handle is ignored.
+ */
+void H264_DestroyDecoder(h264_decoder_t d)
+{
+	DecoderParams *dec = (DecoderParams *)d;
+	VideoParameters *vid;
+
+	if (!dec)
+		return;
+
+	vid = dec->p_Vid;
+
+	free_slice(vid->currentSlice);
+	FmoFinit(vid);
+
+	free_global_buffers(vid);
+	flush_dpb(vid);
+
+#if (PAIR_FIELDS_IN_OUTPUT)
+	flush_pending_output(vid);
+#endif
+
+	out_storable_pictures_destroy(vid);
+
+	ercClose(vid, vid->erc_errorVar);
+
+	CleanUpPPS(vid);
+	free_dpb(vid);
+	uninit_out_buffer(vid);
+
+	// Both image caches first, then the motion cache.
+	image_cache_flush(&vid->image_cache[0]);
+	image_cache_flush(&vid->image_cache[1]);
+	motion_cache_flush(&vid->motion_cache);
+
+	FreeNALU(vid->nalu);
+
+	// The containers go last: input parameters, video parameters, then the handle.
+	free(dec->p_Inp);
+	free_img(vid);
+	free(dec);
+}
+
+/*
+ * Feeds one chunk of bitstream to the decoder and decodes a frame from it.
+ *
+ * d          decoder handle; a null handle (or one without video/input state) is ignored
+ * buffer     bitstream bytes; only the pointer is stored here, no copy is made
+ * bufferlen  number of bytes available at `buffer`
+ * time_code  passed through unchanged to decode_one_frame()
+ *
+ * Structured exceptions raised while decoding are caught and dropped so that a damaged
+ * stream cannot take the host process down. Decoded pictures are fetched separately with
+ * H264_GetPicture().
+ */
+void H264_DecodeFrame(h264_decoder_t d, const void *buffer, size_t bufferlen, uint64_t time_code)
+{
+	DecoderParams *decoder = (DecoderParams *)d;
+	memory_input_t *mem_input;
+
+	// Same defensive checks as the other entry points (e.g. H264_HurryUp).
+	if (!decoder || !decoder->p_Vid)
+		return;
+
+	mem_input = decoder->p_Vid->mem_input;
+	if (!mem_input)
+		return;
+
+	mem_input->user_buffer=buffer;
+	mem_input->user_buffer_size=bufferlen;
+	mem_input->user_buffer_read=0;
+	__try
+	{
+		// The status code is not reported to the caller (the API returns void).
+		(void)decode_one_frame(decoder->p_Vid, time_code);
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER)
+	{
+		// Intentionally empty: best-effort decoding, the faulting frame is simply dropped.
+	}
+#ifdef _M_IX86
+	// Reset the x87/MMX state in case MMX code ran during decoding.
+	_mm_empty();
+#endif
+}
+
+/*
+ * Retrieves the next decoded picture, if any, via out_storable_picture_get().
+ *
+ * d    decoder handle
+ * pic  output slot; a null `pic` makes the call a no-op
+ *
+ * When the handle is null or has no video state, *pic is set to null so the caller
+ * sees "no picture" instead of the call dereferencing a null decoder.
+ */
+void H264_GetPicture(h264_decoder_t d, StorablePicture **pic)
+{
+	DecoderParams *decoder = (DecoderParams *)d;
+
+	if (!pic)
+		return;
+
+	if (!decoder || !decoder->p_Vid)
+	{
+		*pic = 0;
+		return;
+	}
+
+	out_storable_picture_get(decoder->p_Vid, pic);
+}
+
+/*
+ * Converts the VUI aspect-ratio signalling into a width/height ratio.
+ *
+ * vui  VUI parameters of the active SPS (must not be null)
+ *
+ * Returns 1.0 when no aspect-ratio information is present. For the extended-SAR / default
+ * case the explicit sar_width:sar_height is used, unless either value is not positive:
+ * that combination would yield 0, infinity or NaN, so it is treated as 1:1 instead.
+ */
+static double GetAspectRatio(const vui_seq_parameters_t *vui)
+{
+	int aspect_ratio_width=1, aspect_ratio_height=1;
+
+	if (vui->aspect_ratio_info_present_flag) 
+	{
+		switch(vui->aspect_ratio_idc)
+		{
+		case VUI_AR_UNDEFINED:
+		case VUI_AR_SQUARE:
+			aspect_ratio_width  = 1;
+			aspect_ratio_height = 1;
+			break;
+		case VUI_AR_12_11:
+			aspect_ratio_width  = 12;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_10_11:
+			aspect_ratio_width  = 10;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_16_11:
+			aspect_ratio_width  = 16;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_40_33:
+			aspect_ratio_width  = 40;
+			aspect_ratio_height = 33;
+			break;
+		case VUI_AR_24_11:
+			aspect_ratio_width  = 24;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_20_11:
+			aspect_ratio_width  = 20;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_32_11:
+			aspect_ratio_width  = 32;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_80_33:
+			aspect_ratio_width  = 80;
+			aspect_ratio_height = 33;
+			break;
+		case VUI_AR_18_11:
+			aspect_ratio_width  = 18;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_15_11:
+			aspect_ratio_width  = 15;
+			aspect_ratio_height = 11;
+			break;
+		case VUI_AR_64_33:
+			aspect_ratio_width  = 64;
+			aspect_ratio_height = 33;
+			break;
+		case VUI_AR_160_99:
+			aspect_ratio_width  = 160;
+			aspect_ratio_height = 99;
+			break;
+		case VUI_AR_4_3:
+			aspect_ratio_width  = 4;
+			aspect_ratio_height = 3;
+			break;
+		case VUI_AR_3_2:
+			aspect_ratio_width  = 3;
+			aspect_ratio_height = 2;
+			break;
+		case VUI_AR_2_1:
+			aspect_ratio_width  = 2;
+			aspect_ratio_height = 1;
+			break;
+		case VUI_EXTENDED_SAR:
+		default:
+			// Only trust the explicit SAR when both terms are positive; otherwise keep
+			// the 1:1 initial values rather than dividing by (or returning) zero.
+			if (vui->sar_width > 0 && vui->sar_height > 0)
+			{
+				aspect_ratio_width  = vui->sar_width;
+				aspect_ratio_height = vui->sar_height;
+			}
+			break;
+		}
+	}
+	return (double)aspect_ratio_width / (double)aspect_ratio_height;
+}
+
+/*
+ * Returns the decoder's output frame format and, optionally, the sample aspect ratio.
+ *
+ * d             decoder handle
+ * aspect_ratio  optional output; written only when an active SPS exists, otherwise the
+ *               caller's value is left untouched. May be null.
+ *
+ * Returns a pointer into the decoder's input parameters (valid while the decoder lives),
+ * or null when the handle is null or has no input parameters.
+ */
+const FrameFormat *H264_GetOutputFormat(h264_decoder_t d, double *aspect_ratio)
+{
+	DecoderParams *decoder = (DecoderParams *)d;
+
+	if (!decoder || !decoder->p_Inp)
+		return 0;
+
+	// p_Vid and the output pointer are checked here as well: the guard above only covers p_Inp.
+	if (aspect_ratio && decoder->p_Vid && decoder->p_Vid->active_sps)
+		*aspect_ratio = GetAspectRatio(&decoder->p_Vid->active_sps->vui_seq_parameters);
+
+	return &decoder->p_Inp->output;
+}
+
+/*
+ * Resets the decoder, e.g. for a seek: finishes the current picture, resets the frame-number
+ * and picture-order-count tracking, flushes the DPB, discards every picture still waiting
+ * for output and marks the memory input as resetting.
+ *
+ * A null handle, or one without video state, is ignored.
+ */
+void H264_Flush(h264_decoder_t d)
+{
+	DecoderParams *decoder = (DecoderParams *)d;
+	VideoParameters *p_Vid;
+	StorablePicture *pic;
+
+	if (!decoder || !decoder->p_Vid)
+		return;
+
+	p_Vid = decoder->p_Vid;
+
+	// NOTE(review): H264_EndOfStream only calls exit_picture() when dec_picture is non-null;
+	// this path has always called it unconditionally - confirm exit_picture() accepts that.
+	exit_picture(p_Vid, &p_Vid->dec_picture);
+
+	p_Vid->frame_num = 0;
+	p_Vid->pre_frame_num = INT_MIN;
+	p_Vid->PreviousFrameNum=0;
+	p_Vid->PreviousFrameNumOffset = 0;
+	p_Vid->PrevPicOrderCntLsb = 0;
+	p_Vid->PrevPicOrderCntMsb = 0;
+	flush_dpb(p_Vid);
+
+	// Drain the output queue. The emptiness test happens before the picture is freed, so
+	// the pointer is never inspected after free_storable_picture().
+	for (;;)
+	{
+		pic = 0;
+		out_storable_picture_get(p_Vid, &pic);
+		if (!pic)
+			break;
+		free_storable_picture(p_Vid, pic);
+	}
+
+	if (p_Vid->mem_input)
+		p_Vid->mem_input->resetting = 1;
+}
+
+/*
+ * Releases a picture with free_storable_picture().
+ * Does nothing when the handle, its video state or the picture is null.
+ */
+void H264_FreePicture(h264_decoder_t d, StorablePicture *p)
+{
+	DecoderParams *dec = (DecoderParams *)d;
+
+	if (!dec || !dec->p_Vid || !p)
+		return;
+
+	free_storable_picture(dec->p_Vid, p);
+}
+
+/*
+ * Signals that no more input will arrive: finishes the picture in progress when there is
+ * one, otherwise flushes the DPB. A null handle, or one without video state, is ignored.
+ */
+void H264_EndOfStream(h264_decoder_t d)
+{
+	DecoderParams *dec = (DecoderParams *)d;
+	VideoParameters *vid;
+
+	if (!dec)
+		return;
+
+	vid = dec->p_Vid;
+	if (!vid)
+		return;
+
+	if (!vid->dec_picture)
+	{
+		flush_dpb(vid);
+		return;
+	}
+
+	exit_picture(vid, &vid->dec_picture);
+}
+
+/*
+ * Stores `state` in the memory input's skip_b_frames field.
+ * Does nothing when the handle, its video state or its memory input is null.
+ */
+void H264_HurryUp(h264_decoder_t d, int state)
+{
+	DecoderParams *dec = (DecoderParams *)d;
+	memory_input_t *input;
+
+	if (dec == 0 || dec->p_Vid == 0)
+		return;
+
+	input = dec->p_Vid->mem_input;
+	if (input != 0)
+		input->skip_b_frames = state;
+}
+\ No newline at end of file
diff --git a/Src/h264dec/dec_api.h b/Src/h264dec/dec_api.h
new file mode 100644
index 00000000..41811666
--- /dev/null
+++ b/Src/h264dec/dec_api.h
@@ -0,0 +1,25 @@
+#pragma once
+#include <bfc/platform/types.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+	#include "ldecod/inc/mbuffer.h"
+#include "lcommon/inc/frame.h"
+
+// Opaque decoder handle; dec_api.c casts it to its internal DecoderParams structure.
+typedef void *h264_decoder_t;
+
+int H264_Init(); // CPU feature check + selection of optimized routines; returns 1 on success, 0 if the CPU is unsupported
+// Allocates and initializes a decoder; returns a null handle if allocation fails.
+h264_decoder_t H264_CreateDecoder();
+// Releases everything owned by the decoder, including the handle. Null handles are ignored.
+void H264_DestroyDecoder(h264_decoder_t decoder);
+
+// Points the decoder at `buffer` (the pointer is stored, no copy is made by this call) and decodes one frame.
+void H264_DecodeFrame(h264_decoder_t decoder, const void *buffer, size_t bufferlen, uint64_t time_code);
+// Fetches the next output picture into *pic; does nothing when `pic` is null.
+void H264_GetPicture(h264_decoder_t decoder, StorablePicture **pic);
+// Frees a picture; null arguments are ignored.
+void H264_FreePicture(h264_decoder_t decoder, StorablePicture *pic);
+// Resets frame-number/POC tracking, flushes the DPB and discards all pending output pictures.
+void H264_Flush(h264_decoder_t decoder);
+// Finishes the picture in progress if there is one, otherwise flushes the DPB.
+void H264_EndOfStream(h264_decoder_t decoder);
+// Stores `state` in the memory input's skip_b_frames field.
+void H264_HurryUp(h264_decoder_t decoder, int state);
+// Returns the output frame format (null for an invalid handle); *aspect_ratio is written only when an active SPS exists.
+const FrameFormat *H264_GetOutputFormat(h264_decoder_t decoder, double *aspect_ratio);
+
+#ifdef __cplusplus
+}
+#endif
+\ No newline at end of file
diff --git a/Src/h264dec/jm_vc9.sln b/Src/h264dec/jm_vc9.sln
new file mode 100644
index 00000000..9d057c83
--- /dev/null
+++ b/Src/h264dec/jm_vc9.sln
@@ -0,0 +1,19 @@
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ldecod", "ldecod_vc9.vcproj", "{5499B067-CF32-4141-A757-E0A29866994A}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{5499B067-CF32-4141-A757-E0A29866994A}.Debug|Win32.ActiveCfg = Debug|Win32
+		{5499B067-CF32-4141-A757-E0A29866994A}.Debug|Win32.Build.0 = Debug|Win32
+		{5499B067-CF32-4141-A757-E0A29866994A}.Release|Win32.ActiveCfg = Release|Win32
+		{5499B067-CF32-4141-A757-E0A29866994A}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/Src/h264dec/lcommon/inc/ctx_tables.h b/Src/h264dec/lcommon/inc/ctx_tables.h
new file mode 100644
index 00000000..28d622f3
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/ctx_tables.h
@@ -0,0 +1,994 @@
+
+/*!
+ *************************************************************************************
+ * \file ctx_tables.h
+ *
+ * \brief
+ *    CABAC context initialization tables
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Detlev Marpe                    <marpe@hhi.de>
+ *    - Heiko Schwarz                   <hschwarz@hhi.de>
+ **************************************************************************************
+ */
+
+// Placeholder pair used in the tables below for context slots marked as unused.
+#define CTX_UNUSED          {0,64}
+// Second placeholder pair; not referenced by the tables visible in this part of the file.
+#define CTX_UNDEF           {0,63}
+
+// The tables are only emitted in the translation unit that defines CONTEXT_INI_C.
+#ifdef CONTEXT_INI_C
+
+
+// Number of models; these match the first dimension of the *_I and *_P tables respectively.
+#define NUM_CTX_MODELS_I     1
+#define NUM_CTX_MODELS_P     3
+
+
+// Init pairs for the MB_TYPE contexts, 1 model (presumably macroblock type in I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+// NOTE(review): negative values are stored in plain `char`, which assumes char is signed on the target.
+static const char INIT_MB_TYPE_I[1][3][11][2] =
+{
+  //----- model 0 -----
+  {
+    { {  20, -15} , {   2,  54} , {   3,  74} ,  CTX_UNUSED , { -28, 127} , { -23, 104} , {  -6,  53} , {  -1,  54} , {   7,  51} ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  20, -15} , {   2,  54} , {   3,  74} , {  20, -15} , {   2,  54} , {   3,  74} , { -28, 127} , { -23, 104} , {  -6,  53} , {  -1,  54} , {   7,  51} }, // SI (unused at the moment)
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+// Init pairs for the MB_TYPE contexts, 3 models (presumably macroblock type in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2]; row 0 is entirely CTX_UNUSED in every model.
+static const char INIT_MB_TYPE_P[3][3][11][2] =
+{
+  //----- model 0 -----
+  {
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+  { {  23,  33} , {  23,   2} , {  21,   0} ,  CTX_UNUSED , {   1,   9} , {   0,  49} , { -37, 118} , {   5,  57} , { -13,  78} , { -11,  65} , {   1,  62} },
+  { {  26,  67} , {  16,  90} , {   9, 104} ,  CTX_UNUSED , { -46, 127} , { -20, 104} , {   1,  67} , {  18,  64} , {   9,  43} , {  29,   0} ,  CTX_UNUSED }
+  },
+  //----- model 1 -----
+  {
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  22,  25} , {  34,   0} , {  16,   0} ,  CTX_UNUSED , {  -2,   9} , {   4,  41} , { -29, 118} , {   2,  65} , {  -6,  71} , { -13,  79} , {   5,  52} },
+    { {  57,   2} , {  41,  36} , {  26,  69} ,  CTX_UNUSED , { -45, 127} , { -15, 101} , {  -4,  76} , {  26,  34} , {  19,  22} , {  40,   0} ,  CTX_UNUSED }
+  },
+  //----- model 2 -----
+  {
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  29,  16} , {  25,   0} , {  14,   0} ,  CTX_UNUSED , { -10,  51} , {  -3,  62} , { -27,  99} , {  26,  16} , {  -4,  85} , { -24, 102} , {   5,  57} },
+  { {  54,   0} , {  37,  42} , {  12,  97} ,  CTX_UNUSED , { -32, 127} , { -22, 117} , {  -2,  74} , {  20,  40} , {  20,  10} , {  29,   0} ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the B8_TYPE contexts, 1 model; every slot is CTX_UNUSED in this variant.
+// Layout [model][row][context][2].
+static const char INIT_B8_TYPE_I[1][2][9][2] =
+{
+  //----- model 0 -----
+  {
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the B8_TYPE contexts, 3 models (presumably 8x8 sub-block type in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_B8_TYPE_P[3][2][9][2] =
+{
+  //----- model 0 -----
+  {
+    {  CTX_UNUSED , {  12,  49} ,  CTX_UNUSED , {  -4,  73} , {  17,  50} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -6,  86} , { -17,  95} , {  -6,  61} , {   9,  45} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 1 -----
+  {
+    {  CTX_UNUSED , {   9,  50} ,  CTX_UNUSED , {  -3,  70} , {  10,  54} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   6,  69} , { -13,  90} , {   0,  52} , {   8,  43} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 2 -----
+  {
+    {  CTX_UNUSED , {   6,  57} ,  CTX_UNUSED , { -17,  73} , {  14,  57} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -6,  93} , { -14,  88} , {  -6,  44} , {   4,  55} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the MV_RES contexts, 1 model; every slot is CTX_UNUSED in this variant.
+// Layout [model][row][context][2].
+static const char INIT_MV_RES_I[1][2][10][2] =
+{
+  //----- model 0 -----
+  {
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the MV_RES contexts, 3 models (presumably motion-vector residuals in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_MV_RES_P[3][2][10][2] =
+{
+  //----- model 0 -----
+  {
+    { {  -3,  69} ,  CTX_UNUSED , {  -6,  81} , { -11,  96} ,  CTX_UNUSED , {   0,  58} ,  CTX_UNUSED , {  -3,  76} , { -10,  94} ,  CTX_UNUSED },
+    { {   6,  55} , {   7,  67} , {  -5,  86} , {   2,  88} ,  CTX_UNUSED , {   5,  54} , {   4,  69} , {  -3,  81} , {   0,  88} ,  CTX_UNUSED }
+  },
+  //----- model 1 -----
+  {
+    { {  -2,  69} ,  CTX_UNUSED , {  -5,  82} , { -10,  96} ,  CTX_UNUSED , {   1,  56} ,  CTX_UNUSED , {  -3,  74} , {  -6,  85} ,  CTX_UNUSED },
+    { {   2,  59} , {   2,  75} , {  -3,  87} , {  -3, 100} ,  CTX_UNUSED , {   0,  59} , {  -3,  81} , {  -7,  86} , {  -5,  95} ,  CTX_UNUSED }
+  },
+  //----- model 2 -----
+  {
+    { { -11,  89} ,  CTX_UNUSED , { -15, 103} , { -21, 116} ,  CTX_UNUSED , {   1,  63} ,  CTX_UNUSED , {  -5,  85} , { -13, 106} ,  CTX_UNUSED },
+    { {  19,  57} , {  20,  58} , {   4,  84} , {   6,  96} ,  CTX_UNUSED , {   5,  63} , {   6,  75} , {  -3,  90} , {  -1, 101} ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the REF_NO contexts, 1 model; every slot is CTX_UNUSED in this variant.
+// Layout [model][row][context][2].
+static const char INIT_REF_NO_I[1][2][6][2] =
+{
+  //----- model 0 -----
+  {
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the REF_NO contexts, 3 models (presumably reference index in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2]; row 1 is entirely CTX_UNUSED in every model.
+static const char INIT_REF_NO_P[3][2][6][2] =
+{
+  //----- model 0 -----
+  {
+    { {  -7,  67} , {  -5,  74} , {  -4,  74} , {  -5,  80} , {  -7,  72} , {   1,  58} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 1 -----
+  {
+    { {  -1,  66} , {  -1,  77} , {   1,  70} , {  -2,  86} , {  -5,  72} , {   0,  61} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 2 -----
+  {
+    { {   3,  55} , {  -4,  79} , {  -2,  75} , { -12,  97} , {  -7,  50} , {   1,  60} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+
+// Init pairs for the TRANSFORM_SIZE contexts, 1 model (presumably the transform-size flag in I slices - confirm with the consumer).
+// Layout [model][row][context][2]; the commented-out row is an alternative value set that is not compiled.
+static const char INIT_TRANSFORM_SIZE_I[1][1][3][2]=
+{
+  //----- model 0 -----
+  {
+    {  {  31,  21} , {  31,  31} , {  25,  50} },
+//    { {   0,  41} , {   0,  63} , {   0,  63} },
+  }
+};
+
+// Init pairs for the TRANSFORM_SIZE contexts, 3 models (presumably the transform-size flag in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2]; the commented-out rows are alternative value sets that are not compiled.
+static const char INIT_TRANSFORM_SIZE_P[3][1][3][2]=
+{
+  //----- model 0 -----
+  {
+    {  {  12,  40} , {  11,  51} , {  14,  59} },
+//    { {   0,  41} , {   0,  63} , {   0,  63} },
+  },
+  //----- model 1 -----
+  {
+    {  {  25,  32} , {  21,  49} , {  21,  54} },
+//    { {   0,  41} , {   0,  63} , {   0,  63} },
+  },
+  //----- model 2 -----
+  {
+    {  {  21,  33} , {  19,  50} , {  17,  61} },
+//    { {   0,  41} , {   0,  63} , {   0,  63} },
+  }
+};
+
+// Init pairs for the DELTA_QP contexts, 1 model (presumably the QP delta in I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_DELTA_QP_I[1][1][4][2]=
+{
+  //----- model 0 -----
+  {
+    { {   0,  41} , {   0,  63} , {   0,  63} , {   0,  63} },
+  }
+};
+// Init pairs for the DELTA_QP contexts, 3 models; all three models carry the same values.
+// Layout [model][row][context][2].
+static const char INIT_DELTA_QP_P[3][1][4][2]=
+{
+  //----- model 0 -----
+  {
+    { {   0,  41} , {   0,  63} , {   0,  63} , {   0,  63} },
+  },
+  //----- model 1 -----
+  {
+    { {   0,  41} , {   0,  63} , {   0,  63} , {   0,  63} },
+  },
+  //----- model 2 -----
+  {
+    { {   0,  41} , {   0,  63} , {   0,  63} , {   0,  63} },
+  }
+};
+
+// Init pairs for the MB_AFF contexts, 1 model (presumably the MB-adaptive frame/field flag in I slices - confirm with the consumer).
+// Layout [model][row][context][2]; the last slot is CTX_UNUSED.
+static const char INIT_MB_AFF_I[1][1][4][2] =
+{
+  //----- model 0 -----
+  {
+    { {   0,  11} , {   1,  55} , {   0,  69} ,  CTX_UNUSED }
+  }
+};
+// Init pairs for the MB_AFF contexts, 3 models (presumably the MB-adaptive frame/field flag in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2]; the last slot of each row is CTX_UNUSED.
+static const char INIT_MB_AFF_P[3][1][4][2] =
+{
+  //----- model 0 -----
+  {
+    { {   0,  45} , {  -4,  78} , {  -3,  96} ,  CTX_UNUSED }
+  },
+  //----- model 1 -----
+  {
+    { {  13,  15} , {   7,  51} , {   2,  80} ,  CTX_UNUSED }
+  },
+  //----- model 2 -----
+  {
+    { {   7,  34} , {  -9,  88} , { -20, 127} ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the IPR contexts, 1 model (presumably intra prediction mode in I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_IPR_I[1][1][2][2] =
+{
+  //----- model 0 -----
+  {
+    { { 13,  41} , {   3,  62} }
+  }
+};
+
+// Init pairs for the IPR contexts, 3 models; all three models carry the same values as INIT_IPR_I.
+// Layout [model][row][context][2].
+static const char INIT_IPR_P[3][1][2][2] =
+{
+  //----- model 0 -----
+  {
+    { { 13,  41} , {   3,  62} }
+  },
+  //----- model 1 -----
+  {
+    { { 13,  41} , {   3,  62} }
+  },
+  //----- model 2 -----
+  {
+    { { 13,  41} , {   3,  62} }
+  }
+};
+
+// Init pairs for the CIPR contexts, 1 model (presumably chroma intra prediction mode in I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_CIPR_I[1][1][4][2] =
+{
+  //----- model 0 -----
+  {
+    { {  -9,  83} , {   4,  86} , {   0,  97} , {  -7,  72} }
+  }
+};
+
+// Init pairs for the CIPR contexts, 3 models; all three models carry the same values as INIT_CIPR_I.
+// Layout [model][row][context][2].
+static const char INIT_CIPR_P[3][1][4][2] =
+{
+  //----- model 0 -----
+  {
+    { {  -9,  83} , {   4,  86} , {   0,  97} , {  -7,  72} }
+  },
+  //----- model 1 -----
+  {
+    { {  -9,  83} , {   4,  86} , {   0,  97} , {  -7,  72} }
+  },
+  //----- model 2 -----
+  {
+    { {  -9,  83} , {   4,  86} , {   0,  97} , {  -7,  72} }
+  }
+};
+
+// Init pairs for the CBP contexts, 1 model (presumably coded block pattern in I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_CBP_I[1][3][4][2] =
+{
+  //----- model 0 -----
+  {
+    { { -17, 127} , { -13, 102} , {   0,  82} , {  -7,  74} },
+    { { -21, 107} , { -27, 127} , { -31, 127} , { -24, 127} },
+    { { -18,  95} , { -27, 127} , { -21, 114} , { -30, 127} }
+  }
+};
+
+// Init pairs for the CBP contexts, 3 models (presumably coded block pattern in non-I slices - confirm with the consumer).
+// Layout [model][row][context][2].
+static const char INIT_CBP_P[3][3][4][2] =
+{
+  //----- model 0 -----
+  {
+    { { -27, 126} , { -28,  98} , { -25, 101} , { -23,  67} },
+    { { -28,  82} , { -20,  94} , { -16,  83} , { -22, 110} },
+    { { -21,  91} , { -18, 102} , { -13,  93} , { -29, 127} }
+  },
+  //----- model 1 -----
+  {
+    { { -39, 127} , { -18,  91} , { -17,  96} , { -26,  81} },
+    { { -35,  98} , { -24, 102} , { -23,  97} , { -27, 119} },
+    { { -24,  99} , { -21, 110} , { -18, 102} , { -36, 127} }
+  },
+  //----- model 2 -----
+  {
+    { { -36, 127} , { -17,  91} , { -14,  95} , { -25,  84} },
+    { { -25,  86} , { -12,  89} , { -17,  91} , { -31, 127} },
+    { { -14,  76} , { -18, 103} , { -13,  90} , { -37, 127} }
+  }
+};
+
+// Init pairs for the BCBP contexts, 1 model (presumably the per-block coded-block bit in I slices - confirm with the consumer).
+// Layout [model][22 rows][context][2]: rows 0-9 first, then 6 rows for Cb and 6 rows for Cr in the 4:4:4 common mode.
+static const char INIT_BCBP_I[1][22][4][2] = 
+{
+  //----- model 0 -----
+  {
+    { { -17, 123} , { -12, 115} , { -16, 122} , { -11, 115} },
+    { { -12,  63} , {  -2,  68} , { -15,  84} , { -13, 104} },
+    { {  -3,  70} , {  -8,  93} , { -10,  90} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -3,  70} , {  -8,  93} , { -10,  90} , { -30, 127} },
+    { {  -1,  74} , {  -6,  97} , {  -7,  91} , { -20, 127} },
+    { {  -4,  56} , {  -5,  82} , {  -7,  76} , { -22, 125} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cb in the 4:4:4 common mode
+    { { -17, 123} , { -12, 115} , { -16, 122} , { -11, 115} },
+    { { -12,  63} , {  -2,  68} , { -15,  84} , { -13, 104} },
+    { {  -3,  70} , {  -8,  93} , { -10,  90} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -3,  70} , {  -8,  93} , { -10,  90} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cr in the 4:4:4 common mode   
+    { { -17, 123} , { -12, 115} , { -16, 122} , { -11, 115} },
+    { { -12,  63} , {  -2,  68} , { -15,  84} , { -13, 104} },
+    { {  -3,  70} , {  -8,  93} , { -10,  90} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -3,  70} , {  -8,  93} , { -10,  90} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+// Init pairs for the BCBP contexts, 3 models (presumably the per-block coded-block bit in non-I slices - confirm with the consumer).
+// Layout [model][22 rows][context][2]: rows 0-9 first, then 6 rows for Cb and 6 rows for Cr in the 4:4:4 common mode.
+static const char INIT_BCBP_P[3][22][4][2] =
+{
+  //----- model 0 -----
+  {
+    { {  -7,  92} , {  -5,  89} , {  -7,  96} , { -13, 108} },
+    { {  -3,  46} , {  -1,  65} , {  -1,  57} , {  -9,  93} },
+    { {  -3,  74} , {  -9,  92} , {  -8,  87} , { -23, 126} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -3,  74} , {  -9,  92} , {  -8,  87} , { -23, 126} },
+    { {   5,  54} , {   6,  60} , {   6,  59} , {   6,  69} },
+    { {  -1,  48} , {   0,  68} , {  -4,  69} , {  -8,  88} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cb in the 4:4:4 common mode
+    { {  -7,  92} , {  -5,  89} , {  -7,  96} , { -13, 108} },
+    { {  -3,  46} , {  -1,  65} , {  -1,  57} , {  -9,  93} },
+    { {  -3,  74} , {  -9,  92} , {  -8,  87} , { -23, 126} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -3,  74} , {  -9,  92} , {  -8,  87} , { -23, 126} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cr in the 4:4:4 common mode
+    { {  -7,  92} , {  -5,  89} , {  -7,  96} , { -13, 108} },
+    { {  -3,  46} , {  -1,  65} , {  -1,  57} , {  -9,  93} },
+    { {  -3,  74} , {  -9,  92} , {  -8,  87} , { -23, 126} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -3,  74} , {  -9,  92} , {  -8,  87} , { -23, 126} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 1 -----
+  {
+    { {   0,  80} , {  -5,  89} , {  -7,  94} , {  -4,  92} },
+    { {   0,  39} , {   0,  65} , { -15,  84} , { -35, 127} },
+    { {  -2,  73} , { -12, 104} , {  -9,  91} , { -31, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -2,  73} , { -12, 104} , {  -9,  91} , { -31, 127} },
+    { {   3,  55} , {   7,  56} , {   7,  55} , {   8,  61} },
+    { {  -3,  53} , {   0,  68} , {  -7,  74} , {  -9,  88} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cb in the 4:4:4 common mode 
+    { {   0,  80} , {  -5,  89} , {  -7,  94} , {  -4,  92} },
+    { {   0,  39} , {   0,  65} , { -15,  84} , { -35, 127} },
+    { {  -2,  73} , { -12, 104} , {  -9,  91} , { -31, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -2,  73} , { -12, 104} , {  -9,  91} , { -31, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cr in the 4:4:4 common mode 
+    { {   0,  80} , {  -5,  89} , {  -7,  94} , {  -4,  92} },
+    { {   0,  39} , {   0,  65} , { -15,  84} , { -35, 127} },
+    { {  -2,  73} , { -12, 104} , {  -9,  91} , { -31, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -2,  73} , { -12, 104} , {  -9,  91} , { -31, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 2 -----
+  {
+    { {  11,  80} , {   5,  76} , {   2,  84} , {   5,  78} },
+    { {  -6,  55} , {   4,  61} , { -14,  83} , { -37, 127} },
+    { {  -5,  79} , { -11, 104} , { -11,  91} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -5,  79} , { -11, 104} , { -11,  91} , { -30, 127} },
+    { {   0,  65} , {  -2,  79} , {   0,  72} , {  -4,  92} },
+    { {  -6,  56} , {   3,  68} , {  -8,  71} , { -13,  98} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cb in the 4:4:4 common mode 
+    { {  11,  80} , {   5,  76} , {   2,  84} , {   5,  78} },
+    { {  -6,  55} , {   4,  61} , { -14,  83} , { -37, 127} },
+    { {  -5,  79} , { -11, 104} , { -11,  91} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -5,  79} , { -11, 104} , { -11,  91} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // Cr in the 4:4:4 common mode 
+    { {  11,  80} , {   5,  76} , {   2,  84} , {   5,  78} },
+    { {  -6,  55} , {   4,  61} , { -14,  83} , { -37, 127} },
+    { {  -5,  79} , { -11, 104} , { -11,  91} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -5,  79} , { -11, 104} , { -11,  91} , { -30, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+static const char INIT_MAP_I[1][22][15][2] = // indexed [model][row][entry][2]: 1 model x 22 rows x 15 two-value pairs; presumably seeds TextureInfoContexts.map_contexts -- TODO confirm against the consumer
+{ // NOTE(review): several entries are negative, so this relies on plain char being signed (implementation-defined) -- confirm for every target compiler
+  //----- model 0 ----- (the only model in this table)
+  { // rows 0-9 follow; CTX_UNUSED is a macro defined outside this chunk, presumably a placeholder pair for slots that are never read -- TODO confirm
+  { {  -7,  93} , { -11,  87} , {  -3,  77} , {  -5,  71} , {  -4,  63} , {  -4,  68} , { -12,  84} , {  -7,  62} , {  -7,  65} , {   8,  61} , {   5,  56} , {  -2,  66} , {   1,  64} , {   0,  61} , {  -2,  78} },
+    {  CTX_UNUSED , {   1,  50} , {   7,  52} , {  10,  35} , {   0,  44} , {  11,  38} , {   1,  45} , {   0,  46} , {   5,  44} , {  31,  17} , {   1,  51} , {   7,  50} , {  28,  19} , {  16,  33} , {  14,  62} },
+    { { -17, 120} , { -20, 112} , { -18, 114} , { -11,  85} , { -15,  92} , { -14,  89} , { -26,  71} , { -15,  81} , { -14,  80} , {   0,  68} , { -14,  70} , { -24,  56} , { -23,  68} , { -24,  50} , { -11,  74} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -13, 108} , { -15, 100} , { -13, 101} , { -13,  91} , { -12,  94} , { -10,  88} , { -16,  84} , { -10,  86} , {  -7,  83} , { -13,  87} , { -19,  94} , {   1,  70} , {   0,  72} , {  -5,  74} , {  18,  59} },
+    { {  -8, 102} , { -15, 100} , {   0,  95} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  -4,  75} , {   2,  72} , { -11,  75} , {  -3,  71} , {  15,  46} , { -13,  69} , {   0,  62} , {   0,  65} , {  21,  37} , { -15,  72} , {   9,  57} , {  16,  54} , {   0,  62} , {  12,  72} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // rows 10-15: Cb in the 4:4:4 common mode (same values as rows 0-5 above)
+    { {  -7,  93} , { -11,  87} , {  -3,  77} , {  -5,  71} , {  -4,  63} , {  -4,  68} , { -12,  84} , {  -7,  62} , {  -7,  65} , {   8,  61} , {   5,  56} , {  -2,  66} , {   1,  64} , {   0,  61} , {  -2,  78} },
+    {  CTX_UNUSED , {   1,  50} , {   7,  52} , {  10,  35} , {   0,  44} , {  11,  38} , {   1,  45} , {   0,  46} , {   5,  44} , {  31,  17} , {   1,  51} , {   7,  50} , {  28,  19} , {  16,  33} , {  14,  62} },
+    { { -17, 120} , { -20, 112} , { -18, 114} , { -11,  85} , { -15,  92} , { -14,  89} , { -26,  71} , { -15,  81} , { -14,  80} , {   0,  68} , { -14,  70} , { -24,  56} , { -23,  68} , { -24,  50} , { -11,  74} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -13, 108} , { -15, 100} , { -13, 101} , { -13,  91} , { -12,  94} , { -10,  88} , { -16,  84} , { -10,  86} , {  -7,  83} , { -13,  87} , { -19,  94} , {   1,  70} , {   0,  72} , {  -5,  74} , {  18,  59} },
+    // rows 16-21: Cr in the 4:4:4 common mode (same values as rows 0-5 above)
+    { {  -7,  93} , { -11,  87} , {  -3,  77} , {  -5,  71} , {  -4,  63} , {  -4,  68} , { -12,  84} , {  -7,  62} , {  -7,  65} , {   8,  61} , {   5,  56} , {  -2,  66} , {   1,  64} , {   0,  61} , {  -2,  78} },
+    {  CTX_UNUSED , {   1,  50} , {   7,  52} , {  10,  35} , {   0,  44} , {  11,  38} , {   1,  45} , {   0,  46} , {   5,  44} , {  31,  17} , {   1,  51} , {   7,  50} , {  28,  19} , {  16,  33} , {  14,  62} },
+    { { -17, 120} , { -20, 112} , { -18, 114} , { -11,  85} , { -15,  92} , { -14,  89} , { -26,  71} , { -15,  81} , { -14,  80} , {   0,  68} , { -14,  70} , { -24,  56} , { -23,  68} , { -24,  50} , { -11,  74} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -13, 108} , { -15, 100} , { -13, 101} , { -13,  91} , { -12,  94} , { -10,  88} , { -16,  84} , { -10,  86} , {  -7,  83} , { -13,  87} , { -19,  94} , {   1,  70} , {   0,  72} , {  -5,  74} , {  18,  59} }
+  }
+};
+
+static const char INIT_MAP_P[3][22][15][2] = // indexed [model][row][entry][2]: 3 models x 22 rows x 15 two-value pairs; presumably the non-I-slice counterpart of INIT_MAP_I -- TODO confirm against the consumer
+{ // NOTE(review): several entries are negative, so this relies on plain char being signed (implementation-defined) -- confirm for every target compiler
+  //----- model 0 -----
+  { // rows 0-9 follow; CTX_UNUSED is a macro defined outside this chunk, presumably a placeholder pair for slots that are never read -- TODO confirm
+    { {  -2,  85} , {  -6,  78} , {  -1,  75} , {  -7,  77} , {   2,  54} , {   5,  50} , {  -3,  68} , {   1,  50} , {   6,  42} , {  -4,  81} , {   1,  63} , {  -4,  70} , {   0,  67} , {   2,  57} , {  -2,  76} },
+    {  CTX_UNUSED , {  11,  35} , {   4,  64} , {   1,  61} , {  11,  35} , {  18,  25} , {  12,  24} , {  13,  29} , {  13,  36} , { -10,  93} , {  -7,  73} , {  -2,  73} , {  13,  46} , {   9,  49} , {  -7, 100} },
+    { {  -4,  79} , {  -7,  71} , {  -5,  69} , {  -9,  70} , {  -8,  66} , { -10,  68} , { -19,  73} , { -12,  69} , { -16,  70} , { -15,  67} , { -20,  62} , { -19,  70} , { -16,  66} , { -22,  65} , { -20,  63} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   9,  53} , {   2,  53} , {   5,  53} , {  -2,  61} , {   0,  56} , {   0,  56} , { -13,  63} , {  -5,  60} , {  -1,  62} , {   4,  57} , {  -6,  69} , {   4,  57} , {  14,  39} , {   4,  51} , {  13,  68} },
+    { {   3,  64} , {   1,  61} , {   9,  63} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {   7,  50} , {  16,  39} , {   5,  44} , {   4,  52} , {  11,  48} , {  -5,  60} , {  -1,  59} , {   0,  59} , {  22,  33} , {   5,  44} , {  14,  43} , {  -1,  78} , {   0,  60} , {   9,  69} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // rows 10-15: Cb in the 4:4:4 common mode (same values as rows 0-5 of this model)
+    { {  -2,  85} , {  -6,  78} , {  -1,  75} , {  -7,  77} , {   2,  54} , {   5,  50} , {  -3,  68} , {   1,  50} , {   6,  42} , {  -4,  81} , {   1,  63} , {  -4,  70} , {   0,  67} , {   2,  57} , {  -2,  76} },
+    {  CTX_UNUSED , {  11,  35} , {   4,  64} , {   1,  61} , {  11,  35} , {  18,  25} , {  12,  24} , {  13,  29} , {  13,  36} , { -10,  93} , {  -7,  73} , {  -2,  73} , {  13,  46} , {   9,  49} , {  -7, 100} },
+    { {  -4,  79} , {  -7,  71} , {  -5,  69} , {  -9,  70} , {  -8,  66} , { -10,  68} , { -19,  73} , { -12,  69} , { -16,  70} , { -15,  67} , { -20,  62} , { -19,  70} , { -16,  66} , { -22,  65} , { -20,  63} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   9,  53} , {   2,  53} , {   5,  53} , {  -2,  61} , {   0,  56} , {   0,  56} , { -13,  63} , {  -5,  60} , {  -1,  62} , {   4,  57} , {  -6,  69} , {   4,  57} , {  14,  39} , {   4,  51} , {  13,  68} },
+    // rows 16-21: Cr in the 4:4:4 common mode (same values as rows 0-5 of this model)
+    { {  -2,  85} , {  -6,  78} , {  -1,  75} , {  -7,  77} , {   2,  54} , {   5,  50} , {  -3,  68} , {   1,  50} , {   6,  42} , {  -4,  81} , {   1,  63} , {  -4,  70} , {   0,  67} , {   2,  57} , {  -2,  76} },
+    {  CTX_UNUSED , {  11,  35} , {   4,  64} , {   1,  61} , {  11,  35} , {  18,  25} , {  12,  24} , {  13,  29} , {  13,  36} , { -10,  93} , {  -7,  73} , {  -2,  73} , {  13,  46} , {   9,  49} , {  -7, 100} },
+    { {  -4,  79} , {  -7,  71} , {  -5,  69} , {  -9,  70} , {  -8,  66} , { -10,  68} , { -19,  73} , { -12,  69} , { -16,  70} , { -15,  67} , { -20,  62} , { -19,  70} , { -16,  66} , { -22,  65} , { -20,  63} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   9,  53} , {   2,  53} , {   5,  53} , {  -2,  61} , {   0,  56} , {   0,  56} , { -13,  63} , {  -5,  60} , {  -1,  62} , {   4,  57} , {  -6,  69} , {   4,  57} , {  14,  39} , {   4,  51} , {  13,  68} }
+  },
+  //----- model 1 -----
+  { // rows 0-9 follow, same row layout as model 0
+    { { -13, 103} , { -13,  91} , {  -9,  89} , { -14,  92} , {  -8,  76} , { -12,  87} , { -23, 110} , { -24, 105} , { -10,  78} , { -20, 112} , { -17,  99} , { -78, 127} , { -70, 127} , { -50, 127} , { -46, 127} },
+    {  CTX_UNUSED , {  -4,  66} , {  -5,  78} , {  -4,  71} , {  -8,  72} , {   2,  59} , {  -1,  55} , {  -7,  70} , {  -6,  75} , {  -8,  89} , { -34, 119} , {  -3,  75} , {  32,  20} , {  30,  22} , { -44, 127} },
+    { {  -5,  85} , {  -6,  81} , { -10,  77} , {  -7,  81} , { -17,  80} , { -18,  73} , {  -4,  74} , { -10,  83} , {  -9,  71} , {  -9,  67} , {  -1,  61} , {  -8,  66} , { -14,  66} , {   0,  59} , {   2,  59} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   0,  54} , {  -5,  61} , {   0,  58} , {  -1,  60} , {  -3,  61} , {  -8,  67} , { -25,  84} , { -14,  74} , {  -5,  65} , {   5,  52} , {   2,  57} , {   0,  61} , {  -9,  69} , { -11,  70} , {  18,  55} },
+    { {  -4,  71} , {   0,  58} , {   7,  61} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {   9,  41} , {  18,  25} , {   9,  32} , {   5,  43} , {   9,  47} , {   0,  44} , {   0,  51} , {   2,  46} , {  19,  38} , {  -4,  66} , {  15,  38} , {  12,  42} , {   9,  34} , {   0,  89} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // rows 10-15: Cb in the 4:4:4 common mode (same values as rows 0-5 of this model)
+    { { -13, 103} , { -13,  91} , {  -9,  89} , { -14,  92} , {  -8,  76} , { -12,  87} , { -23, 110} , { -24, 105} , { -10,  78} , { -20, 112} , { -17,  99} , { -78, 127} , { -70, 127} , { -50, 127} , { -46, 127} },
+    {  CTX_UNUSED , {  -4,  66} , {  -5,  78} , {  -4,  71} , {  -8,  72} , {   2,  59} , {  -1,  55} , {  -7,  70} , {  -6,  75} , {  -8,  89} , { -34, 119} , {  -3,  75} , {  32,  20} , {  30,  22} , { -44, 127} },
+    { {  -5,  85} , {  -6,  81} , { -10,  77} , {  -7,  81} , { -17,  80} , { -18,  73} , {  -4,  74} , { -10,  83} , {  -9,  71} , {  -9,  67} , {  -1,  61} , {  -8,  66} , { -14,  66} , {   0,  59} , {   2,  59} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   0,  54} , {  -5,  61} , {   0,  58} , {  -1,  60} , {  -3,  61} , {  -8,  67} , { -25,  84} , { -14,  74} , {  -5,  65} , {   5,  52} , {   2,  57} , {   0,  61} , {  -9,  69} , { -11,  70} , {  18,  55} },
+    // rows 16-21: Cr in the 4:4:4 common mode (same values as rows 0-5 of this model)
+    { { -13, 103} , { -13,  91} , {  -9,  89} , { -14,  92} , {  -8,  76} , { -12,  87} , { -23, 110} , { -24, 105} , { -10,  78} , { -20, 112} , { -17,  99} , { -78, 127} , { -70, 127} , { -50, 127} , { -46, 127} },
+    {  CTX_UNUSED , {  -4,  66} , {  -5,  78} , {  -4,  71} , {  -8,  72} , {   2,  59} , {  -1,  55} , {  -7,  70} , {  -6,  75} , {  -8,  89} , { -34, 119} , {  -3,  75} , {  32,  20} , {  30,  22} , { -44, 127} },
+    { {  -5,  85} , {  -6,  81} , { -10,  77} , {  -7,  81} , { -17,  80} , { -18,  73} , {  -4,  74} , { -10,  83} , {  -9,  71} , {  -9,  67} , {  -1,  61} , {  -8,  66} , { -14,  66} , {   0,  59} , {   2,  59} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   0,  54} , {  -5,  61} , {   0,  58} , {  -1,  60} , {  -3,  61} , {  -8,  67} , { -25,  84} , { -14,  74} , {  -5,  65} , {   5,  52} , {   2,  57} , {   0,  61} , {  -9,  69} , { -11,  70} , {  18,  55} }
+  },
+  //----- model 2 -----
+  { // rows 0-9 follow, same row layout as model 0
+    { {  -4,  86} , { -12,  88} , {  -5,  82} , {  -3,  72} , {  -4,  67} , {  -8,  72} , { -16,  89} , {  -9,  69} , {  -1,  59} , {   5,  66} , {   4,  57} , {  -4,  71} , {  -2,  71} , {   2,  58} , {  -1,  74} },
+    {  CTX_UNUSED , {  -4,  44} , {  -1,  69} , {   0,  62} , {  -7,  51} , {  -4,  47} , {  -6,  42} , {  -3,  41} , {  -6,  53} , {   8,  76} , {  -9,  78} , { -11,  83} , {   9,  52} , {   0,  67} , {  -5,  90} },
+    {  {  -3,  78} , {  -8,  74} , {  -9,  72} , { -10,  72} , { -18,  75} , { -12,  71} , { -11,  63} , {  -5,  70} , { -17,  75} , { -14,  72} , { -16,  67} , {  -8,  53} , { -14,  59} , {  -9,  52} , { -11,  68} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  67} , { -15,  72} , {  -5,  75} , {  -8,  80} , { -21,  83} , { -21,  64} , { -13,  31} , { -25,  64} , { -29,  94} , {   9,  75} , {  17,  63} , {  -8,  74} , {  -5,  35} , {  -2,  27} , {  13,  91} },
+    { {   3,  65} , {  -7,  69} , {   8,  77} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , { -10,  66} , {   3,  62} , {  -3,  68} , { -20,  81} , {   0,  30} , {   1,   7} , {  -3,  23} , { -21,  74} , {  16,  66} , { -23, 124} , {  17,  37} , {  44, -18} , {  50, -34} , { -22, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // rows 10-15: Cb in the 4:4:4 common mode (same values as rows 0-5 of this model)
+    { {  -4,  86} , { -12,  88} , {  -5,  82} , {  -3,  72} , {  -4,  67} , {  -8,  72} , { -16,  89} , {  -9,  69} , {  -1,  59} , {   5,  66} , {   4,  57} , {  -4,  71} , {  -2,  71} , {   2,  58} , {  -1,  74} },
+    {  CTX_UNUSED , {  -4,  44} , {  -1,  69} , {   0,  62} , {  -7,  51} , {  -4,  47} , {  -6,  42} , {  -3,  41} , {  -6,  53} , {   8,  76} , {  -9,  78} , { -11,  83} , {   9,  52} , {   0,  67} , {  -5,  90} },
+    { {  -3,  78} , {  -8,  74} , {  -9,  72} , { -10,  72} , { -18,  75} , { -12,  71} , { -11,  63} , {  -5,  70} , { -17,  75} , { -14,  72} , { -16,  67} , {  -8,  53} , { -14,  59} , {  -9,  52} , { -11,  68} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  67} , { -15,  72} , {  -5,  75} , {  -8,  80} , { -21,  83} , { -21,  64} , { -13,  31} , { -25,  64} , { -29,  94} , {   9,  75} , {  17,  63} , {  -8,  74} , {  -5,  35} , {  -2,  27} , {  13,  91} },
+    // rows 16-21: Cr in the 4:4:4 common mode (same values as rows 0-5 of this model)
+    { {  -4,  86} , { -12,  88} , {  -5,  82} , {  -3,  72} , {  -4,  67} , {  -8,  72} , { -16,  89} , {  -9,  69} , {  -1,  59} , {   5,  66} , {   4,  57} , {  -4,  71} , {  -2,  71} , {   2,  58} , {  -1,  74} },
+    {  CTX_UNUSED , {  -4,  44} , {  -1,  69} , {   0,  62} , {  -7,  51} , {  -4,  47} , {  -6,  42} , {  -3,  41} , {  -6,  53} , {   8,  76} , {  -9,  78} , { -11,  83} , {   9,  52} , {   0,  67} , {  -5,  90} },
+    { {  -3,  78} , {  -8,  74} , {  -9,  72} , { -10,  72} , { -18,  75} , { -12,  71} , { -11,  63} , {  -5,  70} , { -17,  75} , { -14,  72} , { -16,  67} , {  -8,  53} , { -14,  59} , {  -9,  52} , { -11,  68} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  67} , { -15,  72} , {  -5,  75} , {  -8,  80} , { -21,  83} , { -21,  64} , { -13,  31} , { -25,  64} , { -29,  94} , {   9,  75} , {  17,  63} , {  -8,  74} , {  -5,  35} , {  -2,  27} , {  13,  91} }
+  }
+};
+
+static const char INIT_LAST_I[1][22][15][2] = // indexed [model][row][entry][2]: 1 model x 22 rows x 15 two-value pairs; presumably seeds TextureInfoContexts.last_contexts -- TODO confirm against the consumer
+{ // NOTE(review): several entries are negative, so this relies on plain char being signed (implementation-defined) -- confirm for every target compiler
+  //----- model 0 ----- (the only model in this table)
+  { // rows 0-9 follow; CTX_UNUSED is a macro defined outside this chunk, presumably a placeholder pair for slots that are never read -- TODO confirm
+    { {  24,   0} , {  15,   9} , {   8,  25} , {  13,  18} , {  15,   9} , {  13,  19} , {  10,  37} , {  12,  18} , {   6,  29} , {  20,  33} , {  15,  30} , {   4,  45} , {   1,  58} , {   0,  62} , {   7,  61} },
+    {  CTX_UNUSED , {  12,  38} , {  11,  45} , {  15,  39} , {  11,  42} , {  13,  44} , {  16,  45} , {  12,  41} , {  10,  49} , {  30,  34} , {  18,  42} , {  10,  55} , {  17,  51} , {  17,  46} , {   0,  89} },
+    {  {  23, -13} , {  26, -13} , {  40, -15} , {  49, -14} , {  44,   3} , {  45,   6} , {  44,  34} , {  33,  54} , {  19,  82} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  26, -19} , {  22, -17} , {  26, -17} , {  30, -25} , {  28, -20} , {  33, -23} , {  37, -27} , {  33, -23} , {  40, -28} , {  38, -17} , {  33, -11} , {  40, -15} , {  41,  -6} , {  38,   1} , {  41,  17} },
+    { {  30,  -6} , {  27,   3} , {  26,  22} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  37, -16} , {  35,  -4} , {  38,  -8} , {  38,  -3} , {  37,   3} , {  38,   5} , {  42,   0} , {  35,  16} , {  39,  22} , {  14,  48} , {  27,  37} , {  21,  60} , {  12,  68} , {   2,  97} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    // rows 10-15: Cb in the 4:4:4 common mode (same values as rows 0-5 above)
+    { {  24,   0} , {  15,   9} , {   8,  25} , {  13,  18} , {  15,   9} , {  13,  19} , {  10,  37} , {  12,  18} , {   6,  29} , {  20,  33} , {  15,  30} , {   4,  45} , {   1,  58} , {   0,  62} , {   7,  61} },
+    {  CTX_UNUSED , {  12,  38} , {  11,  45} , {  15,  39} , {  11,  42} , {  13,  44} , {  16,  45} , {  12,  41} , {  10,  49} , {  30,  34} , {  18,  42} , {  10,  55} , {  17,  51} , {  17,  46} , {   0,  89} },
+    {  {  23, -13} , {  26, -13} , {  40, -15} , {  49, -14} , {  44,   3} , {  45,   6} , {  44,  34} , {  33,  54} , {  19,  82} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  26, -19} , {  22, -17} , {  26, -17} , {  30, -25} , {  28, -20} , {  33, -23} , {  37, -27} , {  33, -23} , {  40, -28} , {  38, -17} , {  33, -11} , {  40, -15} , {  41,  -6} , {  38,   1} , {  41,  17} },
+    // rows 16-21: Cr in the 4:4:4 common mode (same values as rows 0-5 above)
+    { {  24,   0} , {  15,   9} , {   8,  25} , {  13,  18} , {  15,   9} , {  13,  19} , {  10,  37} , {  12,  18} , {   6,  29} , {  20,  33} , {  15,  30} , {   4,  45} , {   1,  58} , {   0,  62} , {   7,  61} },
+    {  CTX_UNUSED , {  12,  38} , {  11,  45} , {  15,  39} , {  11,  42} , {  13,  44} , {  16,  45} , {  12,  41} , {  10,  49} , {  30,  34} , {  18,  42} , {  10,  55} , {  17,  51} , {  17,  46} , {   0,  89} },
+    {  {  23, -13} , {  26, -13} , {  40, -15} , {  49, -14} , {  44,   3} , {  45,   6} , {  44,  34} , {  33,  54} , {  19,  82} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  26, -19} , {  22, -17} , {  26, -17} , {  30, -25} , {  28, -20} , {  33, -23} , {  37, -27} , {  33, -23} , {  40, -28} , {  38, -17} , {  33, -11} , {  40, -15} , {  41,  -6} , {  38,   1} , {  41,  17} }
+  }
+};
+
+static const char INIT_LAST_P[3][22][15][2] =
+{
+  //----- model 0 -----
+  {
+    { {  11,  28} , {   2,  40} , {   3,  44} , {   0,  49} , {   0,  46} , {   2,  44} , {   2,  51} , {   0,  47} , {   4,  39} , {   2,  62} , {   6,  46} , {   0,  54} , {   3,  54} , {   2,  58} , {   4,  63} },
+    {  CTX_UNUSED , {   6,  51} , {   6,  57} , {   7,  53} , {   6,  52} , {   6,  55} , {  11,  45} , {  14,  36} , {   8,  53} , {  -1,  82} , {   7,  55} , {  -3,  78} , {  15,  46} , {  22,  31} , {  -1,  84} },
+    {  {   9,  -2} , {  26,  -9} , {  33,  -9} , {  39,  -7} , {  41,  -2} , {  45,   3} , {  49,   9} , {  45,  27} , {  36,  59} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  25,   7} , {  30,  -7} , {  28,   3} , {  28,   4} , {  32,   0} , {  34,  -1} , {  30,   6} , {  30,   6} , {  32,   9} , {  31,  19} , {  26,  27} , {  26,  30} , {  37,  20} , {  28,  34} , {  17,  70} },
+    { {   1,  67} , {   5,  59} , {   9,  67} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  16,  30} , {  18,  32} , {  18,  35} , {  22,  29} , {  24,  31} , {  23,  38} , {  18,  43} , {  20,  41} , {  11,  63} , {   9,  59} , {   9,  64} , {  -1,  94} , {  -2,  89} , {  -9, 108} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode
+    { {  11,  28} , {   2,  40} , {   3,  44} , {   0,  49} , {   0,  46} , {   2,  44} , {   2,  51} , {   0,  47} , {   4,  39} , {   2,  62} , {   6,  46} , {   0,  54} , {   3,  54} , {   2,  58} , {   4,  63} },
+    {  CTX_UNUSED , {   6,  51} , {   6,  57} , {   7,  53} , {   6,  52} , {   6,  55} , {  11,  45} , {  14,  36} , {   8,  53} , {  -1,  82} , {   7,  55} , {  -3,  78} , {  15,  46} , {  22,  31} , {  -1,  84} },
+    {  {   9,  -2} , {  26,  -9} , {  33,  -9} , {  39,  -7} , {  41,  -2} , {  45,   3} , {  49,   9} , {  45,  27} , {  36,  59} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  25,   7} , {  30,  -7} , {  28,   3} , {  28,   4} , {  32,   0} , {  34,  -1} , {  30,   6} , {  30,   6} , {  32,   9} , {  31,  19} , {  26,  27} , {  26,  30} , {  37,  20} , {  28,  34} , {  17,  70} },
+    //Cr in the 4:4:4 common mode
+    { {  11,  28} , {   2,  40} , {   3,  44} , {   0,  49} , {   0,  46} , {   2,  44} , {   2,  51} , {   0,  47} , {   4,  39} , {   2,  62} , {   6,  46} , {   0,  54} , {   3,  54} , {   2,  58} , {   4,  63} },
+    {  CTX_UNUSED , {   6,  51} , {   6,  57} , {   7,  53} , {   6,  52} , {   6,  55} , {  11,  45} , {  14,  36} , {   8,  53} , {  -1,  82} , {   7,  55} , {  -3,  78} , {  15,  46} , {  22,  31} , {  -1,  84} },
+    {  {   9,  -2} , {  26,  -9} , {  33,  -9} , {  39,  -7} , {  41,  -2} , {  45,   3} , {  49,   9} , {  45,  27} , {  36,  59} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  25,   7} , {  30,  -7} , {  28,   3} , {  28,   4} , {  32,   0} , {  34,  -1} , {  30,   6} , {  30,   6} , {  32,   9} , {  31,  19} , {  26,  27} , {  26,  30} , {  37,  20} , {  28,  34} , {  17,  70} }
+  },
+  //----- model 1 -----
+  {
+    { {   4,  45} , {  10,  28} , {  10,  31} , {  33, -11} , {  52, -43} , {  18,  15} , {  28,   0} , {  35, -22} , {  38, -25} , {  34,   0} , {  39, -18} , {  32, -12} , { 102, -94} , {   0,   0} , {  56, -15} },
+    {  CTX_UNUSED , {  33,  -4} , {  29,  10} , {  37,  -5} , {  51, -29} , {  39,  -9} , {  52, -34} , {  69, -58} , {  67, -63} , {  44,  -5} , {  32,   7} , {  55, -29} , {  32,   1} , {   0,   0} , {  27,  36} },
+    {  {  17, -10} , {  32, -13} , {  42,  -9} , {  49,  -5} , {  53,   0} , {  64,   3} , {  68,  10} , {  66,  27} , {  47,  57} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  33, -25} , {  34, -30} , {  36, -28} , {  38, -28} , {  38, -27} , {  34, -18} , {  35, -16} , {  34, -14} , {  32,  -8} , {  37,  -6} , {  35,   0} , {  30,  10} , {  28,  18} , {  26,  25} , {  29,  41} },
+    { {   0,  75} , {   2,  72} , {   8,  77} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  14,  35} , {  18,  31} , {  17,  35} , {  21,  30} , {  17,  45} , {  20,  42} , {  18,  45} , {  27,  26} , {  16,  54} , {   7,  66} , {  16,  56} , {  11,  73} , {  10,  67} , { -10, 116} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode
+    { {   4,  45} , {  10,  28} , {  10,  31} , {  33, -11} , {  52, -43} , {  18,  15} , {  28,   0} , {  35, -22} , {  38, -25} , {  34,   0} , {  39, -18} , {  32, -12} , { 102, -94} , {   0,   0} , {  56, -15} },
+    {  CTX_UNUSED , {  33,  -4} , {  29,  10} , {  37,  -5} , {  51, -29} , {  39,  -9} , {  52, -34} , {  69, -58} , {  67, -63} , {  44,  -5} , {  32,   7} , {  55, -29} , {  32,   1} , {   0,   0} , {  27,  36} },
+    {  {  17, -10} , {  32, -13} , {  42,  -9} , {  49,  -5} , {  53,   0} , {  64,   3} , {  68,  10} , {  66,  27} , {  47,  57} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  33, -25} , {  34, -30} , {  36, -28} , {  38, -28} , {  38, -27} , {  34, -18} , {  35, -16} , {  34, -14} , {  32,  -8} , {  37,  -6} , {  35,   0} , {  30,  10} , {  28,  18} , {  26,  25} , {  29,  41} },
+    //Cr in the 4:4:4 common mode
+    { {   4,  45} , {  10,  28} , {  10,  31} , {  33, -11} , {  52, -43} , {  18,  15} , {  28,   0} , {  35, -22} , {  38, -25} , {  34,   0} , {  39, -18} , {  32, -12} , { 102, -94} , {   0,   0} , {  56, -15} },
+    {  CTX_UNUSED , {  33,  -4} , {  29,  10} , {  37,  -5} , {  51, -29} , {  39,  -9} , {  52, -34} , {  69, -58} , {  67, -63} , {  44,  -5} , {  32,   7} , {  55, -29} , {  32,   1} , {   0,   0} , {  27,  36} },
+    {  {  17, -10} , {  32, -13} , {  42,  -9} , {  49,  -5} , {  53,   0} , {  64,   3} , {  68,  10} , {  66,  27} , {  47,  57} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  33, -25} , {  34, -30} , {  36, -28} , {  38, -28} , {  38, -27} , {  34, -18} , {  35, -16} , {  34, -14} , {  32,  -8} , {  37,  -6} , {  35,   0} , {  30,  10} , {  28,  18} , {  26,  25} , {  29,  41} }
+  },
+  //----- model 2 -----
+  {
+    { {   4,  39} , {   0,  42} , {   7,  34} , {  11,  29} , {   8,  31} , {   6,  37} , {   7,  42} , {   3,  40} , {   8,  33} , {  13,  43} , {  13,  36} , {   4,  47} , {   3,  55} , {   2,  58} , {   6,  60} },
+    {  CTX_UNUSED , {   8,  44} , {  11,  44} , {  14,  42} , {   7,  48} , {   4,  56} , {   4,  52} , {  13,  37} , {   9,  49} , {  19,  58} , {  10,  48} , {  12,  45} , {   0,  69} , {  20,  33} , {   8,  63} },
+    {  {   9,  -2} , {  30, -10} , {  31,  -4} , {  33,  -1} , {  33,   7} , {  31,  12} , {  37,  23} , {  31,  38} , {  20,  64} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  35, -18} , {  33, -25} , {  28,  -3} , {  24,  10} , {  27,   0} , {  34, -14} , {  52, -44} , {  39, -24} , {  19,  17} , {  31,  25} , {  36,  29} , {  24,  33} , {  34,  15} , {  30,  20} , {  22,  73} },
+    { {  20,  34} , {  19,  31} , {  27,  44} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  19,  16} , {  15,  36} , {  15,  36} , {  21,  28} , {  25,  21} , {  30,  20} , {  31,  12} , {  27,  16} , {  24,  42} , {   0,  93} , {  14,  56} , {  15,  57} , {  26,  38} , { -24, 127} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode
+    { {   4,  39} , {   0,  42} , {   7,  34} , {  11,  29} , {   8,  31} , {   6,  37} , {   7,  42} , {   3,  40} , {   8,  33} , {  13,  43} , {  13,  36} , {   4,  47} , {   3,  55} , {   2,  58} , {   6,  60} },
+    {  CTX_UNUSED , {   8,  44} , {  11,  44} , {  14,  42} , {   7,  48} , {   4,  56} , {   4,  52} , {  13,  37} , {   9,  49} , {  19,  58} , {  10,  48} , {  12,  45} , {   0,  69} , {  20,  33} , {   8,  63} },
+    {  {   9,  -2} , {  30, -10} , {  31,  -4} , {  33,  -1} , {  33,   7} , {  31,  12} , {  37,  23} , {  31,  38} , {  20,  64} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  35, -18} , {  33, -25} , {  28,  -3} , {  24,  10} , {  27,   0} , {  34, -14} , {  52, -44} , {  39, -24} , {  19,  17} , {  31,  25} , {  36,  29} , {  24,  33} , {  34,  15} , {  30,  20} , {  22,  73} },
+    //Cr in the 4:4:4 common mode
+    { {   4,  39} , {   0,  42} , {   7,  34} , {  11,  29} , {   8,  31} , {   6,  37} , {   7,  42} , {   3,  40} , {   8,  33} , {  13,  43} , {  13,  36} , {   4,  47} , {   3,  55} , {   2,  58} , {   6,  60} },
+    {  CTX_UNUSED , {   8,  44} , {  11,  44} , {  14,  42} , {   7,  48} , {   4,  56} , {   4,  52} , {  13,  37} , {   9,  49} , {  19,  58} , {  10,  48} , {  12,  45} , {   0,  69} , {  20,  33} , {   8,  63} },
+    {  {   9,  -2} , {  30, -10} , {  31,  -4} , {  33,  -1} , {  33,   7} , {  31,  12} , {  37,  23} , {  31,  38} , {  20,  64} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  35, -18} , {  33, -25} , {  28,  -3} , {  24,  10} , {  27,   0} , {  34, -14} , {  52, -44} , {  39, -24} , {  19,  17} , {  31,  25} , {  36,  29} , {  24,  33} , {  34,  15} , {  30,  20} , {  22,  73} }
+  }
+};
+
+static const char INIT_ONE_I[1][22][5][2] = // Indexed [model][row][entry][pair]. Presumably CABAC context-init pairs for I slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (the only model: first dimension is 1) -----
+  { // 22 rows = 10 unlabelled rows + 6 Cb rows + 6 Cr rows; CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { {  -3,  71} , {  -6,  42} , {  -5,  50} , {  -3,  54} , {  -2,  62} },
+    { {  -5,  67} , {  -5,  27} , {  -3,  39} , {  -2,  44} , {   0,  46} },
+    {  {  -3,  75} , {  -1,  23} , {   1,  34} , {   1,  43} , {   0,  54} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12,  92} , { -15,  55} , { -10,  60} , {  -6,  62} , {  -4,  65} },
+    { { -11,  97} , { -20,  84} , { -11,  79} , {  -6,  73} , {  -4,  74} },
+    { {  -8,  78} , {  -5,  33} , {  -4,  48} , {  -2,  53} , {  -3,  62} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 above, then one all-CTX_UNUSED row)
+    { {  -3,  71} , {  -6,  42} , {  -5,  50} , {  -3,  54} , {  -2,  62} },
+    { {  -5,  67} , {  -5,  27} , {  -3,  39} , {  -2,  44} , {   0,  46} },
+    { {  -3,  75} , {  -1,  23} , {   1,  34} , {   1,  43} , {   0,  54} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12,  92} , { -15,  55} , { -10,  60} , {  -6,  62} , {  -4,  65} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { {  -3,  71} , {  -6,  42} , {  -5,  50} , {  -3,  54} , {  -2,  62} },
+    { {  -5,  67} , {  -5,  27} , {  -3,  39} , {  -2,  44} , {   0,  46} },
+    { {  -3,  75} , {  -1,  23} , {   1,  34} , {   1,  43} , {   0,  54} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12,  92} , { -15,  55} , { -10,  60} , {  -6,  62} , {  -4,  65} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+static const char INIT_ONE_P[3][22][5][2] = // Indexed [model][row][entry][pair]; three models. Presumably CABAC context-init pairs for P/B slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (first of 3) -----
+  { // Each model: 22 rows = 10 unlabelled rows + 6 Cb rows + 6 Cr rows; CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { {  -6,  76} , {  -2,  44} , {   0,  45} , {   0,  52} , {  -3,  64} },
+    { {  -9,  77} , {   3,  24} , {   0,  42} , {   0,  48} , {   0,  55} },
+    {  {  -6,  66} , {  -7,  35} , {  -7,  42} , {  -8,  45} , {  -5,  48} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  58} , {  -3,  29} , {  -1,  36} , {   1,  38} , {   2,  43} },
+    { {   0,  70} , {  -4,  29} , {   5,  31} , {   7,  42} , {   1,  59} },
+    { {   0,  58} , {   8,   5} , {  10,  14} , {  14,  18} , {  13,  27} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 of this model, then one all-CTX_UNUSED row)
+    { {  -6,  76} , {  -2,  44} , {   0,  45} , {   0,  52} , {  -3,  64} },
+    { {  -9,  77} , {   3,  24} , {   0,  42} , {   0,  48} , {   0,  55} },
+    {  {  -6,  66} , {  -7,  35} , {  -7,  42} , {  -8,  45} , {  -5,  48} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  58} , {  -3,  29} , {  -1,  36} , {   1,  38} , {   2,  43} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { {  -6,  76} , {  -2,  44} , {   0,  45} , {   0,  52} , {  -3,  64} },
+    { {  -9,  77} , {   3,  24} , {   0,  42} , {   0,  48} , {   0,  55} },
+    {  {  -6,  66} , {  -7,  35} , {  -7,  42} , {  -8,  45} , {  -5,  48} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  58} , {  -3,  29} , {  -1,  36} , {   1,  38} , {   2,  43} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 1 (second of 3) -----
+  {
+    { { -23, 112} , { -15,  71} , {  -7,  61} , {   0,  53} , {  -5,  66} },
+    { { -21, 101} , {  -3,  39} , {  -5,  53} , {  -7,  61} , { -11,  75} },
+    {  {  -5,  71} , {   0,  24} , {  -1,  36} , {  -2,  42} , {  -2,  52} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -11,  76} , { -10,  44} , { -10,  52} , { -10,  57} , {  -9,  58} },
+    { {   2,  66} , {  -9,  34} , {   1,  32} , {  11,  31} , {   5,  52} },
+    { {   3,  52} , {   7,   4} , {  10,   8} , {  17,   8} , {  16,  19} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 of this model, then one all-CTX_UNUSED row)
+    { { -23, 112} , { -15,  71} , {  -7,  61} , {   0,  53} , {  -5,  66} },
+    { { -21, 101} , {  -3,  39} , {  -5,  53} , {  -7,  61} , { -11,  75} },
+    {  {  -5,  71} , {   0,  24} , {  -1,  36} , {  -2,  42} , {  -2,  52} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -11,  76} , { -10,  44} , { -10,  52} , { -10,  57} , {  -9,  58} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { { -23, 112} , { -15,  71} , {  -7,  61} , {   0,  53} , {  -5,  66} },
+    { { -21, 101} , {  -3,  39} , {  -5,  53} , {  -7,  61} , { -11,  75} },
+    {  {  -5,  71} , {   0,  24} , {  -1,  36} , {  -2,  42} , {  -2,  52} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -11,  76} , { -10,  44} , { -10,  52} , { -10,  57} , {  -9,  58} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  },
+  //----- model 2 (third of 3) -----
+  {
+    { { -24, 115} , { -22,  82} , {  -9,  62} , {   0,  53} , {   0,  59} },
+    { { -21, 100} , { -14,  57} , { -12,  67} , { -11,  71} , { -10,  77} },
+    {  {  -9,  71} , {  -7,  37} , {  -8,  44} , { -11,  49} , { -10,  56} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -10,  82} , {  -8,  48} , {  -8,  61} , {  -8,  66} , {  -7,  70} },
+    { {  -4,  79} , { -22,  69} , { -16,  75} , {  -2,  58} , {   1,  58} },
+    { { -13,  81} , {  -6,  38} , { -13,  62} , {  -6,  58} , {  -2,  59} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 of this model, then one all-CTX_UNUSED row)
+    { { -24, 115} , { -22,  82} , {  -9,  62} , {   0,  53} , {   0,  59} },
+    { { -21, 100} , { -14,  57} , { -12,  67} , { -11,  71} , { -10,  77} },
+    {  {  -9,  71} , {  -7,  37} , {  -8,  44} , { -11,  49} , { -10,  56} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -10,  82} , {  -8,  48} , {  -8,  61} , {  -8,  66} , {  -7,  70} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { { -24, 115} , { -22,  82} , {  -9,  62} , {   0,  53} , {   0,  59} },
+    { { -21, 100} , { -14,  57} , { -12,  67} , { -11,  71} , { -10,  77} },
+    {  {  -9,  71} , {  -7,  37} , {  -8,  44} , { -11,  49} , { -10,  56} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -10,  82} , {  -8,  48} , {  -8,  61} , {  -8,  66} , {  -7,  70} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+static const char INIT_ABS_I[1][22][5][2] = // Indexed [model][row][entry][pair]. Presumably CABAC context-init pairs for I slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (the only model: first dimension is 1) -----
+  { // 22 rows = 10 unlabelled rows + 6 Cb rows + 6 Cr rows; CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { {   0,  58} , {   1,  63} , {  -2,  72} , {  -1,  74} , {  -9,  91} },
+    { { -16,  64} , {  -8,  68} , { -10,  78} , {  -6,  77} , { -10,  86} },
+    {  {  -2,  55} , {   0,  61} , {   1,  64} , {   0,  68} , {  -9,  92} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12,  73} , {  -8,  76} , {  -7,  80} , {  -9,  88} , { -17, 110} },
+    { { -13,  86} , { -13,  96} , { -11,  97} , { -19, 117} ,  CTX_UNUSED },
+    { { -13,  71} , { -10,  79} , { -12,  86} , { -13,  90} , { -14,  97} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 above, then one all-CTX_UNUSED row)
+    { {   0,  58} , {   1,  63} , {  -2,  72} , {  -1,  74} , {  -9,  91} },
+    { { -16,  64} , {  -8,  68} , { -10,  78} , {  -6,  77} , { -10,  86} },
+    {  {  -2,  55} , {   0,  61} , {   1,  64} , {   0,  68} , {  -9,  92} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12,  73} , {  -8,  76} , {  -7,  80} , {  -9,  88} , { -17, 110} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { {   0,  58} , {   1,  63} , {  -2,  72} , {  -1,  74} , {  -9,  91} },
+    { { -16,  64} , {  -8,  68} , { -10,  78} , {  -6,  77} , { -10,  86} },
+    {  {  -2,  55} , {   0,  61} , {   1,  64} , {   0,  68} , {  -9,  92} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12,  73} , {  -8,  76} , {  -7,  80} , {  -9,  88} , { -17, 110} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+static const char INIT_ABS_P[3][22][5][2] = // Indexed [model][row][entry][pair]; three models. Presumably CABAC context-init pairs for P/B slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (first of 3) -----
+  { // Each model: 22 rows = 10 unlabelled rows + 6 Cb rows + 6 Cr rows; CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { {  -2,  59} , {  -4,  70} , {  -4,  75} , {  -8,  82} , { -17, 102} },
+    { {  -6,  59} , {  -7,  71} , { -12,  83} , { -11,  87} , { -30, 119} },
+    {  { -12,  56} , {  -6,  60} , {  -5,  62} , {  -8,  66} , {  -8,  76} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -6,  55} , {   0,  58} , {   0,  64} , {  -3,  74} , { -10,  90} },
+    { {  -2,  58} , {  -3,  72} , {  -3,  81} , { -11,  97} ,  CTX_UNUSED },
+    { {   2,  40} , {   0,  58} , {  -3,  70} , {  -6,  79} , {  -8,  85} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 of this model, then one all-CTX_UNUSED row)
+    { {  -2,  59} , {  -4,  70} , {  -4,  75} , {  -8,  82} , { -17, 102} },
+    { {  -6,  59} , {  -7,  71} , { -12,  83} , { -11,  87} , { -30, 119} },
+    {  { -12,  56} , {  -6,  60} , {  -5,  62} , {  -8,  66} , {  -8,  76} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -6,  55} , {   0,  58} , {   0,  64} , {  -3,  74} , { -10,  90} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { {  -2,  59} , {  -4,  70} , {  -4,  75} , {  -8,  82} , { -17, 102} },
+    { {  -6,  59} , {  -7,  71} , { -12,  83} , { -11,  87} , { -30, 119} },
+    {  { -12,  56} , {  -6,  60} , {  -5,  62} , {  -8,  66} , {  -8,  76} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -6,  55} , {   0,  58} , {   0,  64} , {  -3,  74} , { -10,  90} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+  },
+  //----- model 1 (second of 3) -----
+  {
+    { { -11,  77} , {  -9,  80} , {  -9,  84} , { -10,  87} , { -34, 127} },
+    { { -15,  77} , { -17,  91} , { -25, 107} , { -25, 111} , { -28, 122} },
+    {  {  -9,  57} , {  -6,  63} , {  -4,  65} , {  -4,  67} , {  -7,  82} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -16,  72} , {  -7,  69} , {  -4,  69} , {  -5,  74} , {  -9,  86} },
+    { {  -2,  55} , {  -2,  67} , {   0,  73} , {  -8,  89} ,  CTX_UNUSED },
+    { {   3,  37} , {  -1,  61} , {  -5,  73} , {  -1,  70} , {  -4,  78} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 of this model, then one all-CTX_UNUSED row)
+    { { -11,  77} , {  -9,  80} , {  -9,  84} , { -10,  87} , { -34, 127} },
+    { { -15,  77} , { -17,  91} , { -25, 107} , { -25, 111} , { -28, 122} },
+    {  {  -9,  57} , {  -6,  63} , {  -4,  65} , {  -4,  67} , {  -7,  82} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -16,  72} , {  -7,  69} , {  -4,  69} , {  -5,  74} , {  -9,  86} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { { -11,  77} , {  -9,  80} , {  -9,  84} , { -10,  87} , { -34, 127} },
+    { { -15,  77} , { -17,  91} , { -25, 107} , { -25, 111} , { -28, 122} },
+    {  {  -9,  57} , {  -6,  63} , {  -4,  65} , {  -4,  67} , {  -7,  82} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -16,  72} , {  -7,  69} , {  -4,  69} , {  -5,  74} , {  -9,  86} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+  },
+  //----- model 2 (third of 3) -----
+  {
+    { { -14,  85} , { -13,  89} , { -13,  94} , { -11,  92} , { -29, 127} },
+    { { -21,  85} , { -16,  88} , { -23, 104} , { -15,  98} , { -37, 127} },
+    {  { -12,  59} , {  -8,  63} , {  -9,  67} , {  -6,  68} , { -10,  79} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -14,  75} , { -10,  79} , {  -9,  83} , { -12,  92} , { -18, 108} },
+    { { -13,  78} , {  -9,  83} , {  -4,  81} , { -13,  99} ,  CTX_UNUSED },
+    { { -16,  73} , { -10,  76} , { -13,  86} , {  -9,  83} , { -10,  87} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cb in the 4:4:4 common mode (6 rows: repeat rows 0-4 of this model, then one all-CTX_UNUSED row)
+    { { -14,  85} , { -13,  89} , { -13,  94} , { -11,  92} , { -29, 127} },
+    { { -21,  85} , { -16,  88} , { -23, 104} , { -15,  98} , { -37, 127} },
+    {  { -12,  59} , {  -8,  63} , {  -9,  67} , {  -6,  68} , { -10,  79} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -14,  75} , { -10,  79} , {  -9,  83} , { -12,  92} , { -18, 108} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    //Cr in the 4:4:4 common mode (same 6 rows as the Cb group)
+    { { -14,  85} , { -13,  89} , { -13,  94} , { -11,  92} , { -29, 127} },
+    { { -21,  85} , { -16,  88} , { -23, 104} , { -15,  98} , { -37, 127} },
+    {  { -12,  59} , {  -8,  63} , {  -9,  67} , {  -6,  68} , { -10,  79} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -14,  75} , { -10,  79} , {  -9,  83} , { -12,  92} , { -18, 108} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED }
+  }
+};
+
+
+
+#if ENABLE_FIELD_CTX
+static const char INIT_FLD_MAP_I[1][8][15][2] = // Indexed [model][row][entry][pair]; compiled only under #if ENABLE_FIELD_CTX. Presumably field-coded CABAC context-init pairs for I slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (the only model: first dimension is 1) -----
+  { // 8 active rows of 15 entries; the //-commented row below is disabled and is not part of the table. CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { {  -6,  93} , {  -6,  84} , {  -8,  79} , {   0,  66} , {  -1,  71} , {   0,  62} , {  -2,  60} , {  -2,  59} , {  -5,  75} , {  -3,  62} , {  -4,  58} , {  -9,  66} , {  -1,  79} , {   0,  71} , {   3,  68} },
+    {  CTX_UNUSED , {  10,  44} , {  -7,  62} , {  15,  36} , {  14,  40} , {  16,  27} , {  12,  29} , {   1,  44} , {  20,  36} , {  18,  32} , {   5,  42} , {   1,  48} , {  10,  62} , {  17,  46} , {   9,  64} },
+    {  { -14, 106} , { -13,  97} , { -15,  90} , { -12,  90} , { -18,  88} , { -10,  73} , {  -9,  79} , { -14,  86} , { -10,  73} , { -10,  70} , { -10,  69} , {  -5,  66} , {  -9,  64} , {  -5,  58} , {   2,  59} },
+//    { {  -1,  73} , {  -7,  73} , {  -6,  76} , {  -7,  71} , {  -9,  72} , {  -5,  65} , { -14,  83} , {  -8,  72} , { -10,  75} , {  -5,  64} , {  -4,  59} , { -13,  79} , {  -9,  69} , {  -8,  66} , {   3,  55} },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { { -12, 104} , { -11,  97} , { -16,  96} , {  -7,  88} , {  -8,  85} , {  -7,  85} , {  -9,  85} , { -13,  88} , {   4,  66} , {  -3,  77} , {  -3,  76} , {  -6,  76} , {  10,  58} , {  -1,  76} , {  -1,  83} },
+    { {  -7,  99} , { -14,  95} , {   2,  95} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {   0,  76} , {  -5,  74} , {   0,  70} , { -11,  75} , {   1,  68} , {   0,  65} , { -14,  73} , {   3,  62} , {   4,  62} , {  -1,  68} , { -13,  75} , {  11,  55} , {   5,  64} , {  12,  70} }
+  }
+};
+
+static const char INIT_FLD_MAP_P[3][8][15][2] = // Indexed [model][row][entry][pair]; three models; compiled only under #if ENABLE_FIELD_CTX. Presumably field-coded CABAC context-init pairs for P/B slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (first of 3) -----
+  { // Each model: 8 active rows of 15 entries; the //-commented row is disabled and is not part of the table. CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { { -13, 106} , { -16, 106} , { -10,  87} , { -21, 114} , { -18, 110} , { -14,  98} , { -22, 110} , { -21, 106} , { -18, 103} , { -21, 107} , { -23, 108} , { -26, 112} , { -10,  96} , { -12,  95} , {  -5,  91} },
+    {  CTX_UNUSED , {  -9,  93} , { -22,  94} , {  -5,  86} , {   9,  67} , {  -4,  80} , { -10,  85} , {  -1,  70} , {   7,  60} , {   9,  58} , {   5,  61} , {  12,  50} , {  15,  50} , {  18,  49} , {  17,  54} },
+    {  {  -5,  85} , {  -6,  81} , { -10,  77} , {  -7,  81} , { -17,  80} , { -18,  73} , {  -4,  74} , { -10,  83} , {  -9,  71} , {  -9,  67} , {  -1,  61} , {  -8,  66} , { -14,  66} , {   0,  59} , {   2,  59} },
+//    { {  -4,  60} , {  -3,  49} , {  -2,  50} , {  -4,  49} , {  -5,  48} , {  -2,  46} , {  -7,  54} , {  -1,  45} , {  -4,  49} , {   4,  39} , {   0,  42} , {   2,  43} , {   0,  44} , {   5,  32} , {  15,  30}  },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  10,  41} , {   7,  46} , {  -1,  51} , {   7,  49} , {   8,  52} , {   9,  41} , {   6,  47} , {   2,  55} , {  13,  41} , {  10,  44} , {   6,  50} , {   5,  53} , {  13,  49} , {   4,  63} , {   6,  64} },
+    { {  -2,  69} , {  -2,  59} , {   6,  70} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  10,  44} , {   9,  31} , {  12,  43} , {   3,  53} , {  14,  34} , {  10,  38} , {  -3,  52} , {  13,  40} , {  17,  32} , {   7,  44} , {   7,  38} , {  13,  50} , {  10,  57} , {  26,  43} }
+  },
+  //----- model 1 (second of 3) -----
+  {
+    { { -21, 126} , { -23, 124} , { -20, 110} , { -26, 126} , { -25, 124} , { -17, 105} , { -27, 121} , { -27, 117} , { -17, 102} , { -26, 117} , { -27, 116} , { -33, 122} , { -10,  95} , { -14, 100} , {  -8,  95} },
+    {  CTX_UNUSED , { -17, 111} , { -28, 114} , {  -6,  89} , {  -2,  80} , {  -4,  82} , {  -9,  85} , {  -8,  81} , {  -1,  72} , {   5,  64} , {   1,  67} , {   9,  56} , {   0,  69} , {   1,  69} , {   7,  69} },
+    {  {  -3,  81} , {  -3,  76} , {  -7,  72} , {  -6,  78} , { -12,  72} , { -14,  68} , {  -3,  70} , {  -6,  76} , {  -5,  66} , {  -5,  62} , {   0,  57} , {  -4,  61} , {  -9,  60} , {   1,  54} , {   2,  58} },
+//    { {  -4,  60} , {  -3,  49} , {  -2,  50} , {  -4,  49} , {  -5,  48} , {  -2,  46} , {  -7,  54} , {  -1,  45} , {  -4,  49} , {   4,  39} , {   0,  42} , {   2,  43} , {   0,  44} , {   5,  32} , {  15,  30}  },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  -7,  69} , {  -6,  67} , { -16,  77} , {  -2,  64} , {   2,  61} , {  -6,  67} , {  -3,  64} , {   2,  57} , {  -3,  65} , {  -3,  66} , {   0,  62} , {   9,  51} , {  -1,  66} , {  -2,  71} , {  -2,  75} },
+    { {  -1,  70} , {  -9,  72} , {  14,  60} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  16,  37} , {   0,  47} , {  18,  35} , {  11,  37} , {  12,  41} , {  10,  41} , {   2,  48} , {  12,  41} , {  13,  41} , {   0,  59} , {   3,  50} , {  19,  40} , {   3,  66} , {  18,  50} }
+  },
+  //----- model 2 (third of 3) -----
+  {
+    { { -22, 127} , { -25, 127} , { -25, 120} , { -27, 127} , { -19, 114} , { -23, 117} , { -25, 118} , { -26, 117} , { -24, 113} , { -28, 118} , { -31, 120} , { -37, 124} , { -10,  94} , { -15, 102} , { -10,  99} },
+    {  CTX_UNUSED , { -13, 106} , { -50, 127} , {  -5,  92} , {  17,  57} , {  -5,  86} , { -13,  94} , { -12,  91} , {  -2,  77} , {   0,  71} , {  -1,  73} , {   4,  64} , {  -7,  81} , {   5,  64} , {  15,  57} },
+    {  {  -3,  78} , {  -8,  74} , {  -9,  72} , { -10,  72} , { -18,  75} , { -12,  71} , { -11,  63} , {  -5,  70} , { -17,  75} , { -14,  72} , { -16,  67} , {  -8,  53} , { -14,  59} , {  -9,  52} , { -11,  68} },
+//    { {  -4,  60} , {  -3,  49} , {  -2,  50} , {  -4,  49} , {  -5,  48} , {  -2,  46} , {  -7,  54} , {  -1,  45} , {  -4,  49} , {   4,  39} , {   0,  42} , {   2,  43} , {   0,  44} , {   5,  32} , {  15,  30}  },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {   1,  67} , {   0,  68} , { -10,  67} , {   1,  68} , {   0,  77} , {   2,  64} , {   0,  68} , {  -5,  78} , {   7,  55} , {   5,  59} , {   2,  65} , {  14,  54} , {  15,  44} , {   5,  60} , {   2,  70} },
+    { {  -2,  76} , { -18,  86} , {  12,  70} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {   5,  64} , { -12,  70} , {  11,  55} , {   5,  56} , {   0,  69} , {   2,  65} , {  -6,  74} , {   5,  54} , {   7,  54} , {  -6,  76} , { -11,  82} , {  -2,  77} , {  -2,  77} , {  25,  42} }
+  }
+};
+
+static const char INIT_FLD_LAST_I[1][8][15][2] = // Indexed [model][row][entry][pair]; compiled only under #if ENABLE_FIELD_CTX. Presumably field-coded CABAC context-init pairs for I slices -- TODO confirm against the code that reads this table.
+{ // NOTE(review): entries go negative but the element type is plain char, whose signedness is implementation-defined; confirm char is signed on every target.
+  //----- model 0 (the only model: first dimension is 1) -----
+  { // 8 active rows of 15 entries; the //-commented row below is disabled and is not part of the table. CTX_UNUSED is a macro defined outside this chunk and fills one entry slot.
+    { {  15,   6} , {   6,  19} , {   7,  16} , {  12,  14} , {  18,  13} , {  13,  11} , {  13,  15} , {  15,  16} , {  12,  23} , {  13,  23} , {  15,  20} , {  14,  26} , {  14,  44} , {  17,  40} , {  17,  47} },
+    {  CTX_UNUSED , {  24,  17} , {  21,  21} , {  25,  22} , {  31,  27} , {  22,  29} , {  19,  35} , {  14,  50} , {  10,  57} , {   7,  63} , {  -2,  77} , {  -4,  82} , {  -3,  94} , {   9,  69} , { -12, 109} },
+    {  {  21, -10} , {  24, -11} , {  28,  -8} , {  28,  -1} , {  29,   3} , {  29,   9} , {  35,  20} , {  29,  36} , {  14,  67} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+//    { {  12,  33} , {   5,  38} , {   9,  34} , {  18,  22} , {  19,  22} , {  23,  19} , {  26,  16} , {  14,  44} , {  40,  14} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  36, -35} , {  36, -34} , {  32, -26} , {  37, -30} , {  44, -32} , {  34, -18} , {  34, -15} , {  40, -15} , {  33,  -7} , {  35,  -5} , {  33,   0} , {  38,   2} , {  33,  13} , {  23,  35} , {  13,  58} },
+    { {  29,  -3} , {  26,   0} , {  22,  30} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  31,  -7} , {  35, -15} , {  34,  -3} , {  34,   3} , {  36,  -1} , {  34,   5} , {  32,  11} , {  35,   5} , {  34,  12} , {  39,  11} , {  30,  29} , {  34,  26} , {  29,  39} , {  19,  66} }
+  }
+};
+//! Value pairs laid out as [model 0..2][8 rows][15 slots][2]; CTX_UNUSED fills slots without an entry. NOTE(review): the name suggests field-coded P-slice "last" context initialisation - confirm.
+static const char INIT_FLD_LAST_P[3][8][15][2] =
+{
+  //----- model 0 -----
+  {
+    { {  14,  11} , {  11,  14} , {   9,  11} , {  18,  11} , {  21,   9} , {  23,  -2} , {  32, -15} , {  32, -15} , {  34, -21} , {  39, -23} , {  42, -33} , {  41, -31} , {  46, -28} , {  38, -12} , {  21,  29} },
+    {  CTX_UNUSED , {  45, -24} , {  53, -45} , {  48, -26} , {  65, -43} , {  43, -19} , {  39, -10} , {  30,   9} , {  18,  26} , {  20,  27} , {   0,  57} , { -14,  82} , {  -5,  75} , { -19,  97} , { -35, 125} },
+    {  {  21, -13} , {  33, -14} , {  39,  -7} , {  46,  -2} , {  51,   2} , {  60,   6} , {  61,  17} , {  55,  34} , {  42,  62} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+//    { {  17,  27} , {  23,  13} , {  24,  16} , {  22,  25} , {  23,  27} , {  23,  32} , {  17,  43} , {  17,  49} , {   2,  70} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  27,   0} , {  28,   0} , {  31,  -4} , {  27,   6} , {  34,   8} , {  30,  10} , {  24,  22} , {  33,  19} , {  22,  32} , {  26,  31} , {  21,  41} , {  26,  44} , {  23,  47} , {  16,  65} , {  14,  71} },
+    { {   8,  60} , {   6,  63} , {  17,  65} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  21,  24} , {  23,  20} , {  26,  23} , {  27,  32} , {  28,  23} , {  28,  24} , {  23,  40} , {  24,  32} , {  28,  29} , {  23,  42} , {  19,  57} , {  22,  53} , {  22,  61} , {  11,  86} }
+  },
+  //----- model 1 -----
+  {
+    { {  19,  -6} , {  18,  -6} , {  14,   0} , {  26, -12} , {  31, -16} , {  33, -25} , {  33, -22} , {  37, -28} , {  39, -30} , {  42, -30} , {  47, -42} , {  45, -36} , {  49, -34} , {  41, -17} , {  32,   9} },
+    {  CTX_UNUSED , {  69, -71} , {  63, -63} , {  66, -64} , {  77, -74} , {  54, -39} , {  52, -35} , {  41, -10} , {  36,   0} , {  40,  -1} , {  30,  14} , {  28,  26} , {  23,  37} , {  12,  55} , {  11,  65} },
+    {  {  17, -10} , {  32, -13} , {  42,  -9} , {  49,  -5} , {  53,   0} , {  64,   3} , {  68,  10} , {  66,  27} , {  47,  57} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+//    { {  17,  27} , {  23,  13} , {  24,  16} , {  22,  25} , {  23,  27} , {  23,  32} , {  17,  43} , {  17,  49} , {   2,  70} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  37, -33} , {  39, -36} , {  40, -37} , {  38, -30} , {  46, -33} , {  42, -30} , {  40, -24} , {  49, -29} , {  38, -12} , {  40, -10} , {  38,  -3} , {  46,  -5} , {  31,  20} , {  29,  30} , {  25,  44} },
+    { {  12,  48} , {  11,  49} , {  26,  45} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  22,  22} , {  23,  22} , {  27,  21} , {  33,  20} , {  26,  28} , {  30,  24} , {  27,  34} , {  18,  42} , {  25,  39} , {  18,  50} , {  12,  70} , {  21,  54} , {  14,  71} , {  11,  83} }
+  },
+  //----- model 2 -----
+  {
+    { {  17, -13} , {  16,  -9} , {  17, -12} , {  27, -21} , {  37, -30} , {  41, -40} , {  42, -41} , {  48, -47} , {  39, -32} , {  46, -40} , {  52, -51} , {  46, -41} , {  52, -39} , {  43, -19} , {  32,  11} },
+    {  CTX_UNUSED , {  61, -55} , {  56, -46} , {  62, -50} , {  81, -67} , {  45, -20} , {  35,  -2} , {  28,  15} , {  34,   1} , {  39,   1} , {  30,  17} , {  20,  38} , {  18,  45} , {  15,  54} , {   0,  79} },
+    {  {   9,  -2} , {  30, -10} , {  31,  -4} , {  33,  -1} , {  33,   7} , {  31,  12} , {  37,  23} , {  31,  38} , {  20,  64} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+//    { {  17,  27} , {  23,  13} , {  24,  16} , {  22,  25} , {  23,  27} , {  23,  32} , {  17,  43} , {  17,  49} , {   2,  70} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    { {  36, -16} , {  37, -14} , {  37, -17} , {  32,   1} , {  34,  15} , {  29,  15} , {  24,  25} , {  34,  22} , {  31,  16} , {  35,  18} , {  31,  28} , {  33,  41} , {  36,  28} , {  27,  47} , {  21,  62} },
+    { {  18,  31} , {  19,  26} , {  36,  24} ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED ,  CTX_UNUSED },
+    {  CTX_UNUSED , {  24,  23} , {  27,  16} , {  24,  30} , {  31,  29} , {  22,  41} , {  22,  42} , {  16,  60} , {  15,  52} , {  14,  60} , {   3,  78} , { -16, 123} , {  21,  53} , {  22,  56} , {  25,  61} }
+  }
+};
+#endif
+
+
+#endif
+
diff --git a/Src/h264dec/lcommon/inc/enc_statistics.h b/Src/h264dec/lcommon/inc/enc_statistics.h
new file mode 100644
index 00000000..534a7d4c
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/enc_statistics.h
@@ -0,0 +1,60 @@
+/*!
+ **************************************************************************
+ *  \file enc_statistics.h
+ *
+ *  \brief
+ *     statistics reports for the encoding process.
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Alexis Tourapis                 <alexismt@ieee.org>
+ *      - Karsten Suehring                <suehring@hhi.de>
+ *
+ **************************************************************************
+ */
+
+#ifndef _ENC_STATISTICS_H_
+#define _ENC_STATISTICS_H_
+#include "global.h"
+//! Bit-rate figures plus bit and mode usage counters; most arrays are indexed by slice type (NUM_SLICE_TYPES).
+struct stat_parameters
+{
+  float  bitr;                        //!< bit rate for current frame, used only for output to terminal
+  float  bitrate;                     //!< average bit rate for the sequence except first frame
+  int64  bit_ctr;                     //!< counter for bit usage
+  int64  bit_ctr_n;                   //!< bit usage for the current frame
+  int64  bit_ctr_emulationprevention; //!< stored bits needed to prevent start code emulation
+  int    bit_slice;                   //!< number of bits in current slice
+  int    stored_bit_slice;            //!< keep number of bits in current slice (to restore status in case of MB re-encoding)  
+  int    b8_mode_0_use     [NUM_SLICE_TYPES][2];
+  int64  mode_use_transform[NUM_SLICE_TYPES][MAXMODE][2];
+  int64  intra_chroma_mode[4];
+
+  // B pictures
+  int     NumberBFrames;
+
+  int     frame_counter;
+  int64   quant               [NUM_SLICE_TYPES];
+  int64   num_macroblocks     [NUM_SLICE_TYPES];
+  int     frame_ctr           [NUM_SLICE_TYPES];
+  int64   bit_counter         [NUM_SLICE_TYPES];
+  float   bitrate_st          [NUM_SLICE_TYPES];
+  int64   mode_use            [NUM_SLICE_TYPES][MAXMODE]; //!< Macroblock mode usage, one row per slice type (the array is not limited to Intra)
+  int64   bit_use_mode        [NUM_SLICE_TYPES][MAXMODE]; //!< statistics of bit usage  
+  int64   bit_use_mb_type     [NUM_SLICE_TYPES];
+  int64   bit_use_header      [NUM_SLICE_TYPES];
+  int64   tmp_bit_use_cbp     [NUM_SLICE_TYPES];
+  int64   bit_use_coeffC      [NUM_SLICE_TYPES];
+  int64   bit_use_coeff    [3][NUM_SLICE_TYPES];  
+  int64   bit_use_delta_quant [NUM_SLICE_TYPES];
+  int64   bit_use_stuffingBits[NUM_SLICE_TYPES];
+
+  int   bit_ctr_parametersets;
+  int   bit_ctr_parametersets_n;
+  int64 bit_ctr_filler_data;
+  int64 bit_ctr_filler_data_n;
+
+};
+typedef struct stat_parameters StatParameters;
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/frame.h b/Src/h264dec/lcommon/inc/frame.h
new file mode 100644
index 00000000..25507d51
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/frame.h
@@ -0,0 +1,50 @@
+
+/*!
+ ************************************************************************
+ * \file frame.h
+ *
+ * \brief
+ *    headers for frame format related information
+ *
+ * \author
+ *
+ ************************************************************************
+ */
+#ifndef H264_FRAME_H_
+#define H264_FRAME_H_
+#pragma once
+//! Color model identifiers
+typedef enum {
+  CM_UNKNOWN = -1,     //!< Unknown color model
+  CM_YUV     =  0,     //!< YUV
+  CM_RGB     =  1,     //!< RGB
+  CM_XYZ     =  2      //!< XYZ
+} ColorModel;
+//! Chroma sampling format identifiers
+typedef enum {
+  CF_UNKNOWN = -1,     //!< Unknown color format
+  YUV400     =  0,     //!< Monochrome (4:0:0)
+  YUV420     =  1,     //!< 4:2:0
+  YUV422     =  2,     //!< 4:2:2
+  YUV444     =  3      //!< 4:4:4
+} ColorFormat;
+//! Format, dimensions, sizes and bit depths describing one frame
+typedef struct frame_format
+{  
+  ColorFormat yuv_format;                    //!< YUV format (0=4:0:0, 1=4:2:0, 2=4:2:2, 3=4:4:4)
+  int         width;                         //!< luma component frame width
+  int         height;                        //!< luma component frame height    
+  int         height_cr;                     //!< chroma component frame height
+  int         width_cr;                      //!< chroma component frame width
+  int         width_crop;                    //!< width after cropping consideration
+  int         height_crop;                   //!< height after cropping consideration
+  int         mb_width;                      //!< luma frame width, presumably in macroblock units (TODO confirm)
+  int         mb_height;                     //!< luma frame height, presumably in macroblock units (TODO confirm)
+  int         size_cmp[3];                   //!< component sizes  
+  int         size;                          //!< total image size
+  int         bit_depth[3];                  //!< component bit depth  
+  int         max_value[3];                  //!< component max value
+  int         max_value_sq[3];               //!< component max value squared
+} FrameFormat;
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/ifunctions.h b/Src/h264dec/lcommon/inc/ifunctions.h
new file mode 100644
index 00000000..0d0e86d8
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/ifunctions.h
@@ -0,0 +1,251 @@
+
+/*!
+ ************************************************************************
+ *  \file
+ *     ifunctions.h
+ *
+ *  \brief
+ *     define some inline functions that are used within the encoder.
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Karsten Suehring                <suehring@hhi.de>
+ *      - Alexis Tourapis                 <alexismt@ieee.org>
+ *
+ ************************************************************************
+ */
+#ifndef _IFUNCTIONS_H_
+#define _IFUNCTIONS_H_
+
+#include <math.h>
+#include <limits.h>
+# if !defined(WIN32) && (__STDC_VERSION__ < 199901L)  /* pre-C99, non-WIN32 build: blank out both keywords */
+  #define static
+  #define inline
+#endif  /* kept below the standard headers so they are never parsed with keywords redefined */
+
+//! Smaller of two shorts (b is returned when the two compare equal).
+static inline short smin(short a, short b)
+{
+  return (short) ((b <= a) ? b : a);
+}
+//! Larger of two shorts (b is returned when the two compare equal).
+static inline short smax(short a, short b)
+{
+  return (short) ((b >= a) ? b : a);
+}
+/*!
+ ************************************************************************
+ * \brief
+ *    Minimum of two ints.
+ * \param a  first candidate
+ * \param b  second candidate
+ * \return a when a < b, otherwise b
+ ************************************************************************
+ */
+static inline int imin(int a, int b)
+{
+  if (a < b)
+    return a;
+  return b;
+}
+//! Larger of two ints (b is returned when the two compare equal).
+static inline int imax(int a, int b)
+{
+  return (b >= a) ? b : a;
+}
+//! Smaller of two doubles; b is returned whenever a < b is false, which includes NaN operands.
+static inline double dmin(double a, double b)
+{
+  return !(a < b) ? b : a;
+}
+//! Larger of two doubles; b is returned whenever a > b is false, which includes NaN operands.
+static inline double dmax(double a, double b)
+{
+  return !(a > b) ? b : a;
+}
+//! Smaller of two int64 values (b is returned when the two compare equal).
+static inline int64 i64min(int64 a, int64 b)
+{
+  return (b <= a) ? b : a;
+}
+//! Larger of two int64 values (b is returned when the two compare equal).
+static inline int64 i64max(int64 a, int64 b)
+{
+  return (b >= a) ? b : a;
+}
+
+//! Absolute value of a short, computed without a branch (xor with the sign mask, then subtract it).
+static inline short sabs(short x)
+{
+  const int   top_bit = (int) (sizeof(short) * CHAR_BIT) - 1;
+  const short mask    = (short) (x >> top_bit);  // all ones for negative x, assuming an arithmetic right shift
+  return (short) ((x ^ mask) - mask);
+}
+//! Absolute value of an int, computed without a branch (xor with the sign mask, then subtract it).
+static inline int iabs(int x)
+{
+  const int top_bit = (int) (sizeof(int) * CHAR_BIT) - 1;
+  const int mask    = x >> top_bit;  // all ones for negative x, assuming an arithmetic right shift
+  return (x ^ mask) - mask;
+}
+//! Absolute value of a double; the input is returned unchanged unless x < 0 holds.
+static inline double dabs(double x)
+{
+  return !(x < 0) ? x : -x;
+}
+//! Absolute value of an int64, computed without a branch (xor with the sign mask, then subtract it).
+static inline int64 i64abs(int64 x)
+{
+  const int   top_bit = (int) (sizeof(int64) * CHAR_BIT) - 1;
+  const int64 mask    = x >> top_bit;  // all ones for negative x, assuming an arithmetic right shift
+  return (x ^ mask) - mask;
+}
+//! Square of a double (x * x).
+static inline double dabs2(double x)
+{
+  return x * x;
+}
+//! Square of an int (x * x); the product is computed in int, so large inputs overflow.
+static inline int iabs2(int x)
+{
+  return x * x;
+}
+//! Square of an int64 (x * x).
+static inline int64 i64abs2(int64 x)
+{
+  return x * x;
+}
+//! Sign of x: 1 for positive, -1 for negative, 0 for zero.
+static inline int isign(int x)
+{
+  return (x > 0) ? 1 : ((x < 0) ? -1 : 0);
+}
+//! Magnitude of a carrying the sign of b: -iabs(a) when b is negative, iabs(a) otherwise.
+static inline int isignab(int a, int b)
+{
+  return (b >= 0) ? iabs(a) : -iabs(a);
+}
+//! Rounding right shift by a when a > 0 (adds half of 2^a first); otherwise a plain left shift by -a.
+static inline int rshift_rnd(int x, int a)
+{
+  return (a <= 0) ? (x << (-a)) : ((x + (1 << (a - 1))) >> a);
+}
+//! Rounding right shift with no check on a; the expression shifts by a - 1, so it only makes sense for a > 0.
+static inline int rshift_rnd_pos(int x, int a)
+{
+  return (x + (1 << (a - 1))) >> a;
+}
+//! Plain left shift of x by a, the counterpart of the a <= 0 branch of rshift_rnd.
+//! The caller flips the sign of the shift amount before calling.
+static inline int rshift_rnd_nonpos(int x, int a)
+{
+  return x << a;
+}
+//! Rounding right shift applied to the magnitude of x, with the sign of x restored afterwards.
+static inline int rshift_rnd_sign(int x, int a)
+{
+  return (x <= 0) ? -((iabs(x) + (1 << (a - 1))) >> a) : ((x + (1 << (a - 1))) >> a);
+}
+//! Unsigned rounding right shift; x is returned unchanged when a is 0.
+static inline unsigned int rshift_rnd_us(unsigned int x, unsigned int a)
+{
+  return (a == 0) ? x : ((x + (1 << (a - 1))) >> a);
+}
+//! Rounding right shift with no check on a; the expression shifts by a - 1, so it only makes sense for a > 0.
+static inline int rshift_rnd_sf(int x, int a)
+{
+  return (x + (1 << (a - 1))) >> a;
+}
+//! Unsigned rounding right shift with no check on a; the expression shifts by a - 1, so it only makes sense for a > 0.
+static inline unsigned int rshift_rnd_us_sf(unsigned int x, unsigned int a)
+{
+  return (x + (1 << (a - 1))) >> a;
+}
+/*!
+ ************************************************************************
+ * \brief
+ *    Clamp x to the range [0, high].
+ * \param high  upper bound
+ * \param x     value to clamp
+ * \return 0 when x is negative, high when x exceeds high, otherwise x
+ ************************************************************************
+ */
+static inline int iClip1(int high, int x)
+{
+  const int capped = (x > high) ? high : x;
+  return (x < 0) ? 0 : capped;
+}
+/*!
+ ************************************************************************
+ * \brief Clamp x to the range [low, high].
+ * \param low   lower bound (checked first)
+ * \param high  upper bound
+ * \param x     value to clamp
+ * \return low when x < low, high when x > high, otherwise x
+ ************************************************************************
+ */
+static inline int iClip3(int low, int high, int x)
+{
+  const int capped = (x > high) ? high : x;
+  return (x < low) ? low : capped;
+}
+//! Clamp a short to [low, high]: raised to low first, then capped at high.
+static inline short sClip3(short low, short high, short x)
+{
+  /* same comparisons as smax(x, low) followed by smin(.., high) */
+  const short floored = (short) ((x > low) ? x : low);
+
+  return (short) ((floored < high) ? floored : high);
+}
+//! Clamp a double to [low, high]: raised to low first, then capped at high.
+static inline double dClip3(double low, double high, double x)
+{
+  /* same comparisons as dmax(x, low) followed by dmin(.., high) */
+  const double floored = (x > low) ? x : low;
+
+  return (floored < high) ? floored : high;
+}
+//! Product of factor and bits, scaled down by LAMBDA_ACCURACY_BITS bits.
+static inline int weighted_cost(int factor, int bits)
+{
+  return (factor * bits) >> LAMBDA_ACCURACY_BITS;
+}
+//! Copy bit 1 of x into bit 0: the low bit is set when (x & 2) is set and cleared otherwise.
+static inline int RSD(int x)
+{
+  return (x & ~1) | ((x & 2) >> 1);
+}
+//! Two raised to the power x, computed as a left shift of 1.
+static inline int power2(int x)
+{
+  return 1 << x;
+}
+//! Round a float to the nearest int: add 0.5 with the sign of x, then truncate.
+static inline int float2int (float x)
+{
+  return (int) (x + ((x < 0) ? -0.5f : 0.5f));
+}
+
+
+//! PSNR in dB, 10 * log10(max_sample_sq * samples / sse); ZEROSNR selects how a tiny error is handled.
+static inline float psnr(int max_sample_sq, int samples, float sse_distortion)
+{
+#if ZEROSNR
+  const double sse = (sse_distortion < 1.0) ? 1.0 : sse_distortion;  // error below 1.0 is raised to 1.0
+#else
+  const double sse = sse_distortion;
+  if (sse == 0.0)  // no error at all: report 0 dB instead of dividing by zero
+    return 0.0f;
+#endif
+  return (float) (10.0 * log10(max_sample_sq * ((double) samples / sse)));
+}
+
+
+# if !defined(WIN32) && (__STDC_VERSION__ < 199901L)
+  #undef static
+  #undef inline
+#endif
+
+#endif
+
diff --git a/Src/h264dec/lcommon/inc/img_io.h b/Src/h264dec/lcommon/inc/img_io.h
new file mode 100644
index 00000000..7d57d03f
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/img_io.h
@@ -0,0 +1,28 @@
+/*!
+ *************************************************************************************
+ * \file img_io.h
+ *
+ * \brief
+ *    image I/O related functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>
+ *************************************************************************************
+ */
+#include "global.h"
+
+#ifndef _IMG_IO_H_
+#define _IMG_IO_H_
+
+#include "io_video.h"
+
+extern int ParseSizeFromString           (VideoDataFile *input_file, int *xlen, int *ylen, double *fps);
+extern void ParseFrameNoFormatFromString (VideoDataFile *input_file);
+extern void OpenFrameFile                (VideoDataFile *input_file, int FrameNumberInFile);
+extern void OpenFiles                    (VideoDataFile *input_file);
+extern void CloseFiles                   (VideoDataFile *input_file);
+extern VideoFileType ParseVideoType      (VideoDataFile *input_file);
+
+#endif
+
diff --git a/Src/h264dec/lcommon/inc/mb_access.h b/Src/h264dec/lcommon/inc/mb_access.h
new file mode 100644
index 00000000..0bad3bca
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/mb_access.h
@@ -0,0 +1,70 @@
+
+/*!
+ *************************************************************************************
+ * \file mb_access.h
+ *
+ * \brief
+ *    Functions for macroblock neighborhoods
+ *
+ * \author
+ *     Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Karsten Suehring                <suehring@hhi.de> 
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>  
+ *************************************************************************************
+ */
+
+#ifndef _MB_ACCESS_H_
+#define _MB_ACCESS_H_
+
+extern void CheckAvailabilityOfNeighbors(Macroblock *currMB);
+
+/* MB Aff */
+extern void getAffNeighbour         (const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+extern void getAffNeighbourLuma     (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getAffNeighbourXPLuma   (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getAffNeighbourPPLumaNB (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getAffNeighbourNPLuma   (const Macroblock *currMB, int yN, PixelPos *pix);
+extern void getAffNeighbourN0Luma   (const Macroblock *currMB, PixelPos *pix);
+extern void getAffNeighbourNXLuma   (const Macroblock *currMB, int xN, PixelPos *pix);
+extern void getAffNeighbour0X       (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix);
+extern void getAffNeighbour0XLuma   (const Macroblock *currMB, int yN, PixelPos *pix);
+extern void getAffNeighbour0N       (const Macroblock *currMB, const int mb_size[2], PixelPos *pix);
+extern void getAffNeighbourX0       (const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix);
+extern void getAffNeighbourNX       (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix);
+extern void getAffNeighbourN0       (const Macroblock *currMB, const int mb_size[2], PixelPos *pix);
+extern void getAffNeighbour0NLuma   (const Macroblock *currMB, PixelPos *pix);
+extern void getAffNeighbourPXLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getAffNeighbourPXLumaNB_NoPos(const Macroblock *currMB, int yN, PixelPos *pix);
+/* normal */
+extern void getNonAffNeighbour      (const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbourXP_NoPos(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbourPX_NoPos(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbourLuma  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourXPLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourPXLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourN0Luma(const Macroblock *currMB, PixelPos *pix);
+extern void getNonAffNeighbourNXLuma(const Macroblock *currMB, int yN, PixelPos *pix);
+extern void getNonAffNeighbourN0    (const Macroblock *currMB, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbour0N    (const Macroblock *currMB, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbourNX    (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbourNP    (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbourNPChromaNB(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbour0X    (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbour0XLuma(const Macroblock *currMB, int yN, PixelPos *pix);
+extern void getNonAffNeighbourX0    (const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix);
+extern void getNonAffNeighbour0NLuma(const Macroblock *currMB, PixelPos *pix);
+extern void getNonAffNeighbourNPLumaNB(const Macroblock *currMB, int yN, PixelPos *pix);
+extern void getNonAffNeighbourXPLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourPPLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourXPLumaNB_NoPos(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourPXLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+extern void getNonAffNeighbourPXLumaNB_NoPos(const Macroblock *currMB, int yN, PixelPos *pix);
+extern void get4x4Neighbour         (const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+extern void get4x4NeighbourLuma     (const Macroblock *currMB, int block_x, int block_y, PixelPos *pix);
+extern Boolean mb_is_available      (int mbAddr, const Macroblock *currMB);
+extern void get_mb_pos              (VideoParameters *p_Vid, int mb_addr, const int mb_size[2], short *x, short *y);
+extern void get_mb_block_pos_normal (const h264_pic_position *PicPos, int mb_addr, short *x, short *y);
+extern void get_mb_block_pos_mbaff  (const h264_pic_position *PicPos, int mb_addr, short *x, short *y);
+
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/memalloc.h b/Src/h264dec/lcommon/inc/memalloc.h
new file mode 100644
index 00000000..fb4c3132
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/memalloc.h
@@ -0,0 +1,71 @@
+
+/*!
+ ************************************************************************
+ * \file  memalloc.h
+ *
+ * \brief
+ *    Memory allocation and free helper functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Karsten Suehring                <suehring@hhi.de> 
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org> 
+ *
+ ************************************************************************
+ */
+
+#ifndef _MEMALLOC_H_
+#define _MEMALLOC_H_
+
+#include "global.h"
+#include "quant_params.h"
+
+#if defined(USEMMX) // && (IMGTYPE == 0) // MMX, SSE, SSE2 intrinsic support
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER) // ICC
+#  include <emmintrin.h>
+# else
+#  include <xmmintrin.h>
+# endif
+#endif
+
+extern int  get_mem2D(byte ***array2D, int dim0, int dim1);
+extern int  get_mem3D(byte ****array3D, int dim0, int dim1, int dim2);
+extern int  get_mem4D(byte *****array4D, int dim0, int dim1, int dim2, int dim3);
+
+extern int  get_mem2Dint(int ***array2D, int rows, int columns);
+extern int  get_mem3Dint(int ****array3D, int frames, int rows, int columns);
+extern int  get_mem4Dint(int *****array4D, int idx, int frames, int rows, int columns );
+
+extern int  get_mem2DPicMotion(struct pic_motion ***array2D, int rows, int columns); // 2D allocator: parameter named like the other get_mem2D* prototypes
+extern int  get_mem3Dref(h264_ref_t ****array3D, int frames, int rows, int columns);
+
+extern int  get_mem2Dshort(short ***array2D, int dim0, int dim1);
+extern MotionVector ***get_mem3DMotionVector(int dim0, int dim1, int dim2);
+extern int  get_mem4Dshort(short *****array4D, int dim0, int dim1, int dim2, int dim3);
+extern int  get_mem2Dpel(imgpel ***array2D, int rows, int columns);
+
+extern struct video_image *get_memImage(int width, int height);
+extern void free_memImage(struct video_image *image);
+
+extern void free_mem2D     (byte      **array2D);
+extern void free_mem3D     (byte     ***array3D);
+extern void free_mem4D     (byte    ****array4D);
+//
+extern void free_mem2Dint  (int       **array2D);
+extern void free_mem3Dint  (int      ***array3D);
+
+extern void free_mem3Dref(h264_ref_t    ***array3D);
+extern void free_mem2DPicMotion(struct pic_motion    **array2D); // 2D free: parameter named like the other free_mem2D* prototypes
+//
+extern void free_mem2Dshort(short      **array2D);
+
+extern void free_mem3DMotionVector(MotionVector ***);
+
+extern void free_mem2Dpel  (imgpel    **array2D);
+extern int init_top_bot_planes(imgpel **imgFrame, int height, imgpel ***imgTopField, imgpel ***imgBotField);
+extern void free_top_bot_planes(imgpel **imgTopField, imgpel **imgBotField);
+
+extern void no_mem_exit(char *where);
+
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/memcache.h b/Src/h264dec/lcommon/inc/memcache.h
new file mode 100644
index 00000000..26131827
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/memcache.h
@@ -0,0 +1,24 @@
+#pragma once
+typedef struct image_cache
+{
+	int size_x, size_y;        //!< cache dimensions (see image_cache_set_dimensions / image_cache_dimensions_match)
+	struct video_image *head;  //!< first cached image - NOTE(review): presumably the head of a list of reusable images; confirm
+} ImageCache;
+
+void image_cache_set_dimensions(ImageCache *cache, int width, int height);
+int image_cache_dimensions_match(ImageCache *cache, int width, int height);
+void image_cache_add(ImageCache *cache, struct video_image *image);
+struct video_image *image_cache_get(ImageCache *cache);
+void image_cache_flush(ImageCache *cache);
+
+typedef struct motion_cache
+{
+	int size_x, size_y;        //!< cache dimensions (see motion_cache_set_dimensions / motion_cache_dimensions_match)
+	struct pic_motion **head;  //!< first cached motion array - NOTE(review): presumably the head of a list of reusable arrays; confirm
+} MotionCache;
+
+void motion_cache_set_dimensions(MotionCache *cache, int width, int height);
+int motion_cache_dimensions_match(MotionCache *cache, int width, int height);
+void motion_cache_add(MotionCache *cache, struct pic_motion **image);
+struct pic_motion **motion_cache_get(MotionCache *cache);
+void motion_cache_flush(MotionCache *cache); // NOTE(review): presumably releases everything held by the cache - confirm
diff --git a/Src/h264dec/lcommon/inc/mv_prediction.h b/Src/h264dec/lcommon/inc/mv_prediction.h
new file mode 100644
index 00000000..0f2a13e2
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/mv_prediction.h
@@ -0,0 +1,19 @@
+/*!
+ *************************************************************************************
+ * \file mv_prediction.h
+ *
+ * \brief
+ *    Declarations for Motion Vector Prediction
+ *
+ * \author
+ *     Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>  
+ *************************************************************************************
+ */
+
+#ifndef _MV_PREDICTION_H_
+#define _MV_PREDICTION_H_
+
+extern void init_motion_vector_prediction(Macroblock *currMB, int MbaffFrameFlag);
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/nalucommon.h b/Src/h264dec/lcommon/inc/nalucommon.h
new file mode 100644
index 00000000..f0288ac5
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/nalucommon.h
@@ -0,0 +1,64 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    nalucommon.h
+ * \brief
+ *    NALU handling common to encoder and decoder
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Stephan Wenger        <stewe@cs.tu-berlin.de>
+ *      - Karsten Suehring      <suehring@hhi.de>
+ ***************************************************************************************
+ */
+
+#ifndef _NALUCOMMON_H_
+#define _NALUCOMMON_H_
+
+#define MAXRBSPSIZE 64000
+#define MAXNALUSIZE 64000
+
+//! values for nal_unit_type (per-value descriptions follow the H.264 nal_unit_type table, not anything in this file)
+typedef enum {
+ NALU_TYPE_SLICE    = 1,   //!< coded slice of a non-IDR picture
+ NALU_TYPE_DPA      = 2,   //!< coded slice data partition A
+ NALU_TYPE_DPB      = 3,   //!< coded slice data partition B
+ NALU_TYPE_DPC      = 4,   //!< coded slice data partition C
+ NALU_TYPE_IDR      = 5,   //!< coded slice of an IDR picture
+ NALU_TYPE_SEI      = 6,   //!< supplemental enhancement information
+ NALU_TYPE_SPS      = 7,   //!< sequence parameter set
+ NALU_TYPE_PPS      = 8,   //!< picture parameter set
+ NALU_TYPE_AUD      = 9,   //!< access unit delimiter
+ NALU_TYPE_EOSEQ    = 10,  //!< end of sequence
+ NALU_TYPE_EOSTREAM = 11,  //!< end of stream
+ NALU_TYPE_FILL     = 12   //!< filler data
+} NaluType;
+
+//! values for nal_ref_idc (listed from highest to lowest priority)
+typedef enum {
+ NALU_PRIORITY_HIGHEST     = 3,
+ NALU_PRIORITY_HIGH        = 2,
+ NALU_PRIORITY_LOW         = 1,
+ NALU_PRIORITY_DISPOSABLE  = 0   //!< per H.264, nal_ref_idc 0 marks content not used for reference
+} NalRefIdc;
+
+//! NAL unit structure
+typedef struct nalu_t
+{
+  int       startcodeprefix_len;   //!< 4 for parameter sets and first slice in picture, 3 for everything else (suggested)
+  unsigned  len;                   //!< Length of the NAL unit (Excluding the start code, which does not belong to the NALU)
+  unsigned  max_size;              //!< NAL Unit Buffer size (capacity, as opposed to len)
+  int       forbidden_bit;         //!< should be always FALSE
+  NaluType  nal_unit_type;         //!< NALU_TYPE_xxxx
+  NalRefIdc nal_reference_idc;     //!< NALU_PRIORITY_xxxx  
+  byte     *buf;                   //!< contains the first byte followed by the EBSP
+  uint16    lost_packets;     //!< true, if packet loss is detected - NOTE(review): declared uint16, confirm whether it is a flag or a count
+} NALU_t;
+
+//! allocate one NAL Unit
+extern NALU_t *AllocNALU(int);
+
+//! free one NAL Unit
+extern void FreeNALU(NALU_t *n);
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/quant_params.h b/Src/h264dec/lcommon/inc/quant_params.h
new file mode 100644
index 00000000..c35682c0
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/quant_params.h
@@ -0,0 +1,55 @@
+/*!
+ ***************************************************************************
+ * \file
+ *    quant_params.h
+ *
+ * \author
+ *    Alexis Michael Tourapis
+ *
+ * \brief
+ *    Headerfile for Quantization parameters
+ **************************************************************************
+ */
+
+#ifndef _QUANT_PARAMS_H_
+#define _QUANT_PARAMS_H_
+//! One set of offset / scale / inverse-scale values. NOTE(review): meanings inferred from the field names only - confirm.
+struct level_quant_params {
+  int   OffsetComp;
+  int    ScaleComp;
+  int InvScaleComp;
+};
+
+typedef struct level_quant_params LevelQuantParams;
+//! Quantization tables and offset lists; every pointer member refers to storage set up outside this header.
+struct quant_params {
+  int AdaptRndWeight;
+  int AdaptRndCrWeight;
+
+  LevelQuantParams *****q_params_4x4;   //!< NOTE(review): index order of the five levels is not visible here - confirm
+  LevelQuantParams *****q_params_8x8;   //!< same shape as q_params_4x4
+
+  int *qp_per_matrix;
+  int *qp_rem_matrix;
+
+  short **OffsetList4x4input;
+  short **OffsetList8x8input;
+  short ***OffsetList4x4;
+  short ***OffsetList8x8;
+};
+//! Argument bundle for one quantization call. NOTE(review): field roles are inferred from names only - confirm.
+struct quant_methods {
+  int   block_y; 
+  int   block_x;
+  int   qp; 
+  int*  ACLevel;
+  int*  ACRun;
+  int **fadjust; 
+  LevelQuantParams **q_params;
+  int *coeff_cost;
+  const byte (*pos_scan)[2];   //!< pointer to rows of two bytes each
+  const byte *c_cost;
+};
+
+#endif
+
diff --git a/Src/h264dec/lcommon/inc/transform.h b/Src/h264dec/lcommon/inc/transform.h
new file mode 100644
index 00000000..61942004
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/transform.h
@@ -0,0 +1,27 @@
+
+/*!
+ ***************************************************************************
+ *
+ * \file transform.h
+ *
+ * \brief
+ *    prototypes of transform functions
+ *
+ * \date
+ *    10 July 2007
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    Alexis Michael Tourapis
+ **************************************************************************/
+
+#ifndef _TRANSFORM_H_
+#define _TRANSFORM_H_
+
+#include "global.h"
+
+extern void forward4x4   (int **block , int **tblock, int pos_y, int pos_x);
+extern void ihadamard4x4 (int block[4][4]);
+extern void ihadamard2x2 (int block[4], int tblock[4]);
+
+#endif //_TRANSFORM_H_
diff --git a/Src/h264dec/lcommon/inc/typedefs.h b/Src/h264dec/lcommon/inc/typedefs.h
new file mode 100644
index 00000000..58806bef
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/typedefs.h
@@ -0,0 +1,71 @@
+/*!
+ *************************************************************************************
+ * \file typedefs.h
+ *
+ * \brief
+ *    Common type definitions
+ *    Currently only supports Windows and Linux operating systems. 
+ *    Need to add support for other "older" systems such as VAX, DECC, Unix Alpha etc
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>
+ *************************************************************************************
+ */
+
+#ifndef _TYPEDEFS_H_
+#define _TYPEDEFS_H_
+
+#include "win32.h"
+
+typedef unsigned char  byte;     //!< byte type definition
+typedef unsigned char  uint8;    //!< type definition for unsigned char (same as byte, 8 bits)
+typedef unsigned short uint16;   //!< type definition for unsigned short (16 bits)
+typedef unsigned int   uint32;   //!< type definition for unsigned int (32 bits)
+
+typedef          char  int8;     //!< plain char: signedness is implementation-defined, so int8 is not guaranteed to be signed
+typedef          short int16;    //!< type definition for short
+typedef          int   int32;    //!< type definition for int
+//! Sample type (imgpel) and distortion types (distpel, distblk), selected at compile time by IMGTYPE.
+#if (IMGTYPE == 0)     // byte samples
+typedef byte   imgpel;
+typedef uint16 distpel;
+typedef int32  distblk;
+#elif (IMGTYPE == 2)   // float samples
+typedef float imgpel;
+typedef float distpel;
+typedef float distblk;
+#else                  // any other IMGTYPE value: uint16 samples
+typedef uint16 imgpel;
+typedef uint32 distpel;
+typedef int64  distblk;
+#endif
+
+//! Boolean type: when FALSE is already a macro the enum below is skipped and Boolean becomes an alias for int.
+#ifdef FALSE
+#  define Boolean int
+#else
+typedef enum {
+  FALSE,   //!< 0
+  TRUE     //!< 1
+} Boolean;
+#endif
+
+/*
+#define MAXUINT8     0xff
+#define MAXUINT16    0xffff
+#define MAXUINT32    0xffffffff
+#define MAXUINT64    0xffffffffffffffff
+
+#define MAXINT8      0x7f
+#define MININT8      (-MAXINT8)
+#define MAXINT16     0x7fff
+#define MININT16     (-MAXINT16)
+#define MAXINT32     0x7fffffff
+#define MININT32     (-MAXINT32)
+#define MAXINT64     0x7fffffffffffffff
+#define MININT64     (-MAXINT64)
+*/
+
+#endif
+
diff --git a/Src/h264dec/lcommon/inc/types.h b/Src/h264dec/lcommon/inc/types.h
new file mode 100644
index 00000000..6088f8e4
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/types.h
@@ -0,0 +1,204 @@
+/*!
+ ************************************************************************
+ *  \file
+ *     types.h
+ *
+ *  \brief
+ *     type definitions.
+ *
+ *  \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *
+ ************************************************************************
+ */
+#ifndef _TYPES_H_
+#define _TYPES_H_
+
+/***********************************************************************
+ * T y p e    d e f i n i t i o n s    f o r    T M L
+ ***********************************************************************
+ */
+
+//! Colour plane index; the RGB names alias the YUV plane indices.
+typedef enum
+{
+  PLANE_Y = 0,         //!< YUV: luma
+  PLANE_U,             //!< YUV: Cb
+  PLANE_V,             //!< YUV: Cr
+  PLANE_G = PLANE_Y,   //!< RGB: same index as PLANE_Y
+  PLANE_B = PLANE_U,   //!< RGB: same index as PLANE_U
+  PLANE_R = PLANE_V    //!< RGB: same index as PLANE_V
+} ColorPlane;
+
+//! Reference list / prediction direction indices (values 0..4).
+enum {
+  LIST_0,
+  LIST_1,
+  BI_PRED,
+  BI_PRED_L0,
+  BI_PRED_L1
+};
+
+enum {
+  ERROR_SAD = 0,
+  ERROR_SSE = 1,
+  ERROR_SATD = 2,
+  ERROR_PSATD = 3
+};
+
+//! ME_* mode selectors (values 0..2).
+enum {
+  ME_Y_ONLY,
+  ME_YUV_FP,
+  ME_YUV_FP_SP
+};
+
+
+//! Distortion measure selector; only one value is defined.
+enum {
+  DISTORTION_MSE
+};
+
+
+//! Data partitioning modes
+typedef enum
+{
+  PAR_DP_1 = 0,   //!< no data partitioning is supported
+  PAR_DP_3 = 1    //!< data partitioning with 3 partitions
+} PAR_DP_TYPE;
+
+
+//! Output file types
+typedef enum
+{
+  PAR_OF_ANNEXB = 0,   //!< Annex B byte stream format
+  PAR_OF_RTP    = 1,   //!< RTP packets in outfile
+  PAR_OF_MEMORY = 2    //!< NOTE(review): undocumented in the original; presumably an in-memory stream -- confirm
+} PAR_OF_TYPE;
+
+//! Field coding types (values 0..3)
+typedef enum
+{
+  FRAME_CODING         = 0,
+  FIELD_CODING         = 1,
+  ADAPTIVE_CODING      = 2,
+  FRAME_MB_PAIR_CODING = 3
+} CodingType;
+
+//! Definition of H.264 syntax elements.
+//! Elements are numbered consecutively from 0; SE_MAX_ELEMENTS is fixed at 20,
+//! which is larger than the number of elements listed.
+typedef enum
+{
+  SE_HEADER        = 0,
+  SE_PTYPE         = 1,
+  SE_MBTYPE        = 2,
+  SE_REFFRAME      = 3,
+  SE_INTRAPREDMODE = 4,
+  SE_MVD           = 5,
+  SE_CBP           = 6,
+  SE_LUM_DC_INTRA  = 7,
+  SE_CHR_DC_INTRA  = 8,
+  SE_LUM_AC_INTRA  = 9,
+  SE_CHR_AC_INTRA  = 10,
+  SE_LUM_DC_INTER  = 11,
+  SE_CHR_DC_INTER  = 12,
+  SE_LUM_AC_INTER  = 13,
+  SE_CHR_AC_INTER  = 14,
+  SE_DELTA_QUANT   = 15,
+  SE_BFRAME        = 16,
+  SE_EOS           = 17,
+  SE_MAX_ELEMENTS  = 20  //!< number of maximum syntax elements
+} SE_type;               // substituting the definitions in elements.h
+
+
+//! Slice mode selectors (values 0..3)
+typedef enum
+{
+  NO_SLICES  = 0,
+  FIXED_MB   = 1,
+  FIXED_RATE = 2,
+  CALL_BACK  = 3
+} SliceMode;
+
+
+//! Entropy coding mode selector
+typedef enum
+{
+  CAVLC = 0,
+  CABAC = 1
+} SymbolMode;
+
+//! Search type selectors; numbering starts at -1 and increases by one.
+typedef enum
+{
+  FULL_SEARCH = -1,
+  FAST_FULL_SEARCH,   //  0
+  UM_HEX,             //  1
+  UM_HEX_SIMPLE,      //  2
+  EPZS                //  3
+} SearchType;
+
+
+//! New enum for field processing (values 0..2)
+typedef enum
+{
+  FRAME        = 0,
+  TOP_FIELD    = 1,
+  BOTTOM_FIELD = 2
+} PictureStructure;
+
+//! Slice types; NUM_SLICE_TYPES is the count of the preceding entries.
+typedef enum
+{
+  P_SLICE,          // 0
+  B_SLICE,          // 1
+  I_SLICE,          // 2
+  SP_SLICE,         // 3
+  SI_SLICE,         // 4
+  NUM_SLICE_TYPES   // 5
+} SliceType;
+
+//! Motion estimation levels
+typedef enum
+{
+  F_PEL = 0,   //!< Full Pel refinement
+  H_PEL = 1,   //!< Half Pel refinement
+  Q_PEL = 2    //!< Quarter Pel refinement
+} MELevel;
+
+//! Reference access modes
+typedef enum
+{
+  FAST_ACCESS,   //!< Fast/safe reference access (0)
+  UMV_ACCESS     //!< unconstrained reference access (1)
+} REF_ACCESS_TYPE;
+
+//! Component selector: luma (0) or chroma (1)
+typedef enum
+{
+  IS_LUMA,
+  IS_CHROMA
+} Component_Type;
+
+//! RC mode identifiers 0..3
+typedef enum
+{
+  RC_MODE_0,
+  RC_MODE_1,
+  RC_MODE_2,
+  RC_MODE_3
+} RCModeType;
+
+
+//! Distortion metric identifiers; TOTAL_DIST_TYPES is the count of the
+//! preceding entries.
+typedef enum {
+  SSE,               // 0
+  SSE_RGB,           // 1
+  PSNR,              // 2
+  PSNR_RGB,          // 3
+  SSIM,              // 4
+  SSIM_RGB,          // 5
+  MS_SSIM,           // 6
+  MS_SSIM_RGB,       // 7
+  TOTAL_DIST_TYPES   // 8
+} distortion_types;
+
+//! Weighted prediction type identifiers; numbering starts at 4 and is
+//! consecutive up to WP_REGULAR (9).
+typedef enum {
+  WP_MCPREC_PLUS0 = 4,
+  WP_MCPREC_PLUS1,         // 5
+  WP_MCPREC_MINUS0,        // 6
+  WP_MCPREC_MINUS1,        // 7
+  WP_MCPREC_MINUS_PLUS0,   // 8
+  WP_REGULAR               // 9
+} weighted_prediction_types;
+
+
+#endif
diff --git a/Src/h264dec/lcommon/inc/win32.h b/Src/h264dec/lcommon/inc/win32.h
new file mode 100644
index 00000000..09ffef61
--- /dev/null
+++ b/Src/h264dec/lcommon/inc/win32.h
@@ -0,0 +1,92 @@
+
+/*!
+ ************************************************************************
+ *  \file
+ *     win32.h
+ *
+ *  \brief
+ *     win32 definitions for H.264 encoder.
+ *
+ *  \author
+ *
+ ************************************************************************
+ */
+#ifndef _H264_WIN32_H_
+#define _H264_WIN32_H_
+#pragma once
+
+# include <fcntl.h>
+# include <stdlib.h>
+# include <stdio.h>
+# include <string.h>
+# include <assert.h>
+
+/*
+ * Platform abstraction: maps POSIX-style I/O names onto the MSVC runtime on
+ * WIN32 and supplies matching definitions for other platforms.
+ *
+ * The OPENFLAGS_* / OPEN_PERMISSIONS macros are fully parenthesized so they
+ * can be combined with other operators (e.g. `OPENFLAGS_WRITE & mask`)
+ * without precedence surprises.
+ */
+#if defined(WIN32)
+# include <io.h>
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <windows.h>
+#ifndef strcasecmp
+# define strcasecmp _strcmpi
+#endif
+
+/* NOTE: _snprintf does not NUL-terminate the buffer when the output is truncated. */
+# define  snprintf _snprintf
+# define  open     _open
+# define  close    _close
+# define  read     _read
+# define  write    _write
+#ifndef lseek
+# define  lseek    _lseeki64
+#endif
+# define  fsync    _commit
+# define  tell     _telli64
+# define  TIMEB    _timeb
+# define  TIME_T    LARGE_INTEGER
+# define  OPENFLAGS_WRITE (_O_WRONLY|_O_CREAT|_O_BINARY|_O_TRUNC)
+# define  OPEN_PERMISSIONS (_S_IREAD | _S_IWRITE)
+# define  OPENFLAGS_READ  (_O_RDONLY|_O_BINARY)
+# define  inline   _inline
+# define  forceinline __forceinline
+#else
+# include <unistd.h>
+# include <sys/time.h>
+# include <sys/stat.h>
+# include <time.h>
+
+# define  TIMEB    timeb
+# define  TIME_T   struct timeval
+# define  tell(fd) lseek(fd, 0, SEEK_CUR)
+# define  OPENFLAGS_WRITE (O_WRONLY|O_CREAT|O_TRUNC)
+# define  OPENFLAGS_READ  (O_RDONLY)
+# define  OPEN_PERMISSIONS (S_IRUSR | S_IWUSR)
+
+/* Pre-C99 compilers have no "inline" keyword; compile it away there. */
+# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+   /* "inline" is a keyword */
+# else
+#  define inline /* nothing */
+# endif
+# define  forceinline inline
+#endif
+
+/* 64-bit integer types, their printf length modifier, and an INT64_MIN
+ * fallback. MSVC-style builds (WIN32 without GCC) use __int64; every other
+ * toolchain uses long long. */
+#if !defined(WIN32) || defined(__GNUC__)
+
+typedef long long           int64;
+typedef unsigned long long  uint64;
+# define FORMAT_OFF_T "lld"
+# ifndef INT64_MIN
+#  define INT64_MIN        (-9223372036854775807LL - 1LL)
+# endif
+#else
+typedef __int64            int64;
+typedef unsigned __int64   uint64;
+# define FORMAT_OFF_T "I64d"
+# ifndef INT64_MIN
+#  define INT64_MIN        (-9223372036854775807i64 - 1i64)
+# endif
+#endif
+
+/* Timing helpers operating on the platform-specific TIME_T defined above
+ * (LARGE_INTEGER on WIN32, struct timeval elsewhere). Their definitions are
+ * not part of this header.
+ * NOTE(review): the unit of the int64 results is not visible here -- confirm
+ * against the implementation. */
+void   gettime(TIME_T* time);
+int64 timediff(TIME_T* start, TIME_T* end);
+int64 timenorm(int64 cur_time);
+
+#endif
diff --git a/Src/h264dec/lcommon/src/img_io.c b/Src/h264dec/lcommon/src/img_io.c
new file mode 100644
index 00000000..c0520218
--- /dev/null
+++ b/Src/h264dec/lcommon/src/img_io.c
@@ -0,0 +1,327 @@
+
+/*!
+ *************************************************************************************
+ * \file img_io.c
+ *
+ * \brief
+ *    image I/O related functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>
+ *************************************************************************************
+ */
+#include "contributors.h"
+#include "global.h"
+#include "img_io.h"
+#include "report.h"
+
+//! Table of named video resolutions: { name, x_size, y_size }.
+//! The list is terminated by an entry whose name is NULL and is scanned in
+//! order by ParseSizeFromString().
+static const VIDEO_SIZE VideoRes[] = {
+  { "qcif"  ,  176,  144},
+  { "qqvga" ,  160,  128},
+  { "qvga"  ,  320,  240},
+  { "sif"   ,  352,  240},
+  { "cif"   ,  352,  288},
+  { "vga"   ,  640,  480},
+  { "sd1"   ,  720,  480},
+  { "sd2"   ,  704,  576},
+  { "sd3"   ,  720,  576},
+  { "720p"  , 1280,  720},
+  { "1080p" , 1920, 1080},
+  { NULL, 0, 0}
+};
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Case-insensitive (ASCII only) substring test.
+ *
+ * \return
+ *    1 if needle occurs anywhere in haystack, 0 otherwise
+ ************************************************************************
+ */
+static int img_io_contains_nocase (const char *haystack, const char *needle)
+{
+  size_t i;
+
+  if (*needle == '\0')
+    return 1;
+
+  for (; *haystack != '\0'; haystack++)
+  {
+    for (i = 0; needle[i] != '\0'; i++)
+    {
+      char a = haystack[i];
+      char b = needle[i];
+      if (a >= 'A' && a <= 'Z')
+        a = (char) (a - 'A' + 'a');
+      if (b >= 'A' && b <= 'Z')
+        b = (char) (b - 'A' + 'a');
+      // A terminating NUL in haystack never equals a needle character,
+      // so this also stops the comparison at the end of haystack.
+      if (a != b)
+        break;
+    }
+    if (needle[i] == '\0')
+      return 1;
+  }
+  return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse picture size (and optionally frame rate) from a file name
+ *
+ *    Looks for a "_<width>x<height>" pattern followed by '_' or '.', and,
+ *    directly after it, an optional "<fps>i" / "<fps>p" field. If no
+ *    explicit size is found, the name is searched (case-insensitively) for
+ *    one of the resolution names in VideoRes.
+ *
+ * \param input_file
+ *    file descriptor structure; only fname is read. The name is modified
+ *    temporarily while parsing and restored before returning.
+ * \param x_size
+ *    receives the width, or -1 if none was found
+ * \param y_size
+ *    receives the height, or -1 if none was found
+ * \param fps
+ *    written only when a frame rate field was parsed successfully
+ *
+ * \return
+ *    1 if both width and height were determined, 0 otherwise
+ ************************************************************************
+ */
+int ParseSizeFromString (VideoDataFile *input_file, int *x_size, int *y_size, double *fps) 
+{
+  char *p1, *p2, *tail;
+  char *fn = input_file->fname;
+  char c;
+  int i;
+  int width, height;
+  double rate;
+
+  *x_size = *y_size = -1;
+  p1 = p2 = fn;
+  while (p1 != NULL && p2 != NULL) 
+  {
+    // Search for next '_'
+    p1 = strstr( p1, "_");
+    if (p1 == NULL)
+      break;
+
+    // Search for end character of x_size (first 'x' after the '_')
+    p2 = strstr( p1, "x");
+
+    // If no 'x' is found, exit
+    if (p2 == NULL)    
+      break;
+
+    // Try conversion of number (terminate the candidate field temporarily)
+    *p2 = 0;
+    width = strtol( p1 + 1, &tail, 10);
+
+    // If there are characters left in the field, or the field is empty, discard conversion
+    if (*tail != '\0' || *(p1 + 1) == '\0') 
+    {
+      *p2 = 'x';
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p2 = 'x';
+
+    // Search for end character of y_size (first '_' or '.' after the 'x')
+    p1 = strpbrk( p2 + 1, "_.");
+    // If no '_' or '.' is found, try again from current position
+    if (p1 == NULL) 
+    {
+      p1 = p2 + 1;
+      continue;
+    }
+
+    // Try conversion of number
+    c = *p1;
+    *p1 = 0;
+    height = strtol( p2 + 1, &tail, 10);
+
+    // If there are characters left in the field, or the field is empty, discard conversion
+    if (*tail != '\0' || *(p2 + 1) == '\0') 
+    {
+      *p1 = c;
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p1 = c;
+
+    // Only a complete width/height pair is reported to the caller
+    *x_size = width;
+    *y_size = height;
+
+    // Search for end character of the frame rate (first 'i' or 'p' after the '_')
+    p2 = strpbrk( p1 + 1, "ip");
+
+    // If no 'i' or 'p' is found, exit
+    if (p2 == NULL)      
+      break;
+
+    // Try conversion of number
+    c = *p2;
+    *p2 = 0;
+    rate = strtod( p1 + 1, &tail);
+
+    // If there are characters left in the field, or the field is empty, discard conversion
+    if (*tail != '\0' || *(p1 + 1) == '\0') 
+    {
+      *p2 = c;
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p2 = c;
+    *fps = rate;
+    break;
+  }
+
+  // No explicit size found: test some common video format names.
+  // An explicit WxH in the name always takes precedence over this table.
+  if (*x_size == -1 || *y_size == -1)
+  {       
+    for (i = 0; VideoRes[i].name != NULL; i++) 
+    {
+      if (img_io_contains_nocase (fn, VideoRes[i].name)) 
+      {
+        *x_size = VideoRes[i].x_size;
+        *y_size = VideoRes[i].y_size;       
+        // Should add frame rate support as well
+        break;
+      }
+    }
+  }
+
+  return (*x_size == -1 || *y_size == -1) ? 0 : 1; 
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse a printf-style frame number field ("%<n>d") from the file name
+ *
+ *    On success the text before the '%' is stored in fhead, the text after
+ *    the 'd' in ftail (both NUL-terminated), num_digits receives <n> and
+ *    zero_pad is set when <n> starts with '0'. When no valid field is
+ *    found, num_digits stays -1 and zero_pad stays 0.
+ *
+ *    is_concatenated is cleared for VIDEO_TIFF input; otherwise it is set
+ *    exactly when no frame number field was found.
+ *
+ * \param input_file
+ *    file descriptor structure; fname and vdtype are read.
+ *    NOTE(review): fhead and ftail are assumed to be at least as large as
+ *    fname -- confirm against the VideoDataFile declaration.
+ ************************************************************************
+ */
+void ParseFrameNoFormatFromString (VideoDataFile *input_file)
+{
+  char *p1, *p2, *tail;  
+  char *fn         = input_file->fname;
+  char *fhead      = input_file->fhead;
+  char *ftail      = input_file->ftail;
+  int  *zero_pad   = &input_file->zero_pad;
+  int  *num_digits = &input_file->num_digits;
+  int  digits;
+
+  *zero_pad = 0;
+  *num_digits = -1;
+  p1 = p2 = fn;
+  while (p1 != NULL && p2 != NULL) 
+  {
+    // Search for next '%'
+    p1 = strstr( p1, "%");
+    if (p1 == NULL)
+      break;
+
+    // Search for end character of the field (first 'd' after the '%')
+    p2 = strstr( p1, "d");
+
+    // If no 'd' is found, exit
+    if (p2 == NULL)    
+      break;
+    
+    // Try conversion of number (terminate the candidate field temporarily)
+    *p2 = 0;
+    digits = strtol( p1 + 1, &tail, 10);
+
+    // If there are characters left in the field, or the field is empty,
+    // discard conversion without touching any output
+    if (*tail != '\0' || *(p1 + 1) == '\0') 
+    {
+      *p2 = 'd';
+      p1 = tail;
+      continue;
+    }
+
+    // Conversion was correct. Restore string
+    *p2 = 'd';
+
+    // Commit the result; both name parts are explicitly NUL-terminated
+    memcpy(fhead, fn, p1 - fn);
+    fhead[p1 - fn] = '\0';
+    *zero_pad = (*(p1 + 1) == '0') ? 1 : 0;
+    *num_digits = digits;
+    memcpy(ftail, p2 + 1, strlen(p2 + 1) + 1);
+    break;
+  }
+
+  if (input_file->vdtype == VIDEO_TIFF)
+  {
+    input_file->is_concatenated = 0;
+  }
+  else
+    input_file->is_concatenated = (*num_digits == -1) ? 1 : 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Open file containing a single frame
+ *
+ *    The file name is assembled as fhead + frame number + ftail, where the
+ *    number is printed with a minimum width of num_digits and zero padded
+ *    when zero_pad is set. The resulting descriptor is stored in f_num.
+ *    If the name does not fit the buffer or the file cannot be opened, a
+ *    message is printed, f_num is -1 and report_stats_on_error() is called.
+ *
+ * \param input_file
+ *    file descriptor structure (fhead, ftail, zero_pad, num_digits are read)
+ * \param FrameNumberInFile
+ *    frame number to insert into the file name
+ ************************************************************************
+ */
+void OpenFrameFile( VideoDataFile *input_file, int FrameNumberInFile)
+{
+  char infile [FILE_NAME_SIZE];
+  int written;
+
+  // Build the complete name in one bounded step
+  if (input_file->zero_pad)       
+    written = snprintf(infile, sizeof(infile), "%s%0*d%s", input_file->fhead, input_file->num_digits, FrameNumberInFile, input_file->ftail);
+  else
+    written = snprintf(infile, sizeof(infile), "%s%*d%s", input_file->fhead, input_file->num_digits, FrameNumberInFile, input_file->ftail);
+
+  // snprintf may be mapped to _snprintf, which reports truncation with a
+  // negative value and leaves the buffer unterminated; C99 snprintf reports
+  // it with a value >= the buffer size. Handle both.
+  if (written < 0 || written >= (int) sizeof(infile))
+  {
+    infile[sizeof(infile) - 1] = '\0';
+    input_file->f_num = -1;
+    printf ("OpenFrameFile: file name too long %s\n", infile);
+    report_stats_on_error();
+    return;
+  }
+
+  if ((input_file->f_num = open(infile, OPENFLAGS_READ)) == -1)
+  {
+    printf ("OpenFrameFile: cannot open file %s\n", infile);
+    report_stats_on_error();
+  }    
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Open the input file when the whole sequence is stored in one file
+ *
+ *    Does nothing unless is_concatenated is 1. An empty file name and a
+ *    failing open() are both reported through error() with code 500.
+ ************************************************************************
+ */
+void OpenFiles( VideoDataFile *input_file)
+{
+  if (input_file->is_concatenated != 1)
+    return;
+
+  // An empty name is reported first; the open below is still attempted
+  // if error() returns.
+  if (input_file->fname[0] == '\0')
+  {
+    snprintf(errortext, ET_SIZE, "No input sequence name was provided. Please check settings.");
+    error (errortext, 500);
+  }
+
+  input_file->f_num = open(input_file->fname, OPENFLAGS_READ);
+  if (input_file->f_num == -1)
+  {
+    snprintf(errortext, ET_SIZE, "Input file %s does not exist",input_file->fname);
+    error (errortext, 500);
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Close the input file if one is open and mark the descriptor as
+ *    closed (-1)
+ ************************************************************************
+ */
+void CloseFiles(VideoDataFile *input_file)
+{
+  const int is_open = (input_file->f_num != -1);
+
+  if (is_open)
+  {
+    close(input_file->f_num);
+  }
+
+  input_file->f_num = -1;
+}
+
+/* ==========================================================================
+ *
+ * ParseVideoType
+ *
+ * Derives the video file type from the last three characters of the file
+ * name (compared case-insensitively) and stores it in input_file->vdtype:
+ *   "yuv" -> VIDEO_YUV  (yuv_format = YUV420)
+ *   "rgb" -> VIDEO_RGB  (yuv_format = YUV444)
+ *   "tif" -> VIDEO_TIFF
+ *   "avi" -> VIDEO_AVI
+ * Anything else, including names shorter than three characters, is treated
+ * as VIDEO_YUV / YUV420. input_file->avi is reset to NULL for every type
+ * except VIDEO_AVI.
+ *
+ * Returns the detected type.
+ *
+ * ==========================================================================
+*/
+VideoFileType ParseVideoType (VideoDataFile *input_file)
+{
+  size_t name_len = strlen(input_file->fname);
+  char *format;
+
+  // Never step in front of the buffer when the name is shorter than the suffix
+  format = input_file->fname + ((name_len >= 3) ? (name_len - 3) : 0);
+
+  if (strcasecmp (format, "yuv") == 0)
+  {
+    input_file->vdtype = VIDEO_YUV;
+    input_file->format.yuv_format = YUV420;
+    input_file->avi = NULL;
+  }
+  else if (strcasecmp (format, "rgb") == 0)
+  {
+    input_file->vdtype = VIDEO_RGB;
+    input_file->format.yuv_format = YUV444;
+    input_file->avi = NULL;
+  }
+  else if (strcasecmp (format, "tif") == 0)
+  {
+    input_file->vdtype = VIDEO_TIFF;
+    input_file->avi = NULL;
+  }
+  else if (strcasecmp (format, "avi") == 0) 
+  {
+    input_file->vdtype = VIDEO_AVI;
+  }
+  else
+  {
+    // Unknown suffix: fall back to raw YUV 4:2:0 instead of raising an error
+    input_file->vdtype = VIDEO_YUV;
+    input_file->format.yuv_format = YUV420;
+    input_file->avi = NULL;
+  }
+
+  return input_file->vdtype;
+}
diff --git a/Src/h264dec/lcommon/src/memalloc.c b/Src/h264dec/lcommon/src/memalloc.c
new file mode 100644
index 00000000..da5872ed
--- /dev/null
+++ b/Src/h264dec/lcommon/src/memalloc.c
@@ -0,0 +1,1280 @@
+
+/*!
+ ************************************************************************
+ * \file  memalloc.c
+ *
+ * \brief
+ *    Memory allocation and free helper functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org> 
+ *     - Karsten S�hring                 <suehring@hhi.de> 
+ *
+ ************************************************************************
+ */
+
+#include "global.h"
+#include "memalloc.h"
+#include "mbuffer.h"
+
+/* Round size up to the next multiple of 16 (values already a multiple of 16
+ * are returned unchanged). The argument is evaluated once. */
+#define ROUNDUP16(size) (((size)+15) & ~15)
+
+#if !defined(USEMMX)
+ /*!
+ ************************************************************************
+ * \brief
+ *    Build row-pointer tables for the top and bottom field of a frame.
+ *    Even frame rows go to the top field, odd rows to the bottom field;
+ *    no pixel data is copied.
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************/
+int init_top_bot_planes(imgpel **imgFrame, int dim0, imgpel ***imgTopField, imgpel ***imgBotField)
+{
+  const int field_rows = dim0 >> 1;
+  int row;
+
+  *imgTopField = (imgpel**) malloc(field_rows * sizeof(imgpel*));
+  if (*imgTopField == NULL)
+    no_mem_exit("init_top_bot_planes: imgTopField");
+
+  *imgBotField = (imgpel**) malloc(field_rows * sizeof(imgpel*));
+  if (*imgBotField == NULL)
+    no_mem_exit("init_top_bot_planes: imgBotField");
+
+  for (row = 0; row < field_rows; ++row)
+  {
+    imgpel **frame_pair = imgFrame + 2 * row;
+
+    (*imgTopField)[row] = frame_pair[0];
+    (*imgBotField)[row] = frame_pair[1];
+  }
+
+  return dim0 * sizeof(imgpel*);
+}
+
+ /*!
+ ************************************************************************
+ * \brief
+ *    Release the two row-pointer tables created by init_top_bot_planes().
+ *    The rows they point to belong to the frame and are left untouched.
+ ************************************************************************/
+void free_top_bot_planes(imgpel **imgTopField, imgpel **imgBotField)
+{
+  /* only the pointer tables are owned here */
+  free(imgTopField);
+  free(imgBotField);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 1D memory array -> imgpel array1D[dim0]
+ *    The array is zero-initialized. Allocation failure is reported through
+ *    no_mem_exit().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************/
+int get_mem1Dpel(imgpel **array1D, int dim0)
+{
+  if((*array1D    = (imgpel*)calloc(dim0,       sizeof(imgpel))) == NULL)
+    no_mem_exit("get_mem1Dpel: array1D");
+
+  return (sizeof(imgpel*) + dim0 * sizeof(imgpel));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 2D memory array -> imgpel array2D[dim0][dim1]
+ *    A table of dim0 row pointers indexes one contiguous, zero-initialized
+ *    block of dim0 * dim1 samples.
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************/
+int get_mem2Dpel(imgpel ***array2D, int dim0, int dim1)
+{
+  imgpel **rows;
+  int r;
+
+  rows = (imgpel**)malloc(dim0 * sizeof(imgpel*));
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2Dpel: array2D");
+
+  rows[0] = (imgpel* )calloc(dim0 * dim1,sizeof(imgpel ));
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2Dpel: array2D");
+
+  /* every further row starts dim1 samples after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(imgpel*) + dim1 * sizeof(imgpel));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate a VideoImage of width x height samples together with its
+ *    row-pointer table.
+ *
+ *    With H264_IPP the pixel buffer comes from ippiMalloc_8u_C1 (with one
+ *    extra row so over-reading is harmless) and the stride is whatever IPP
+ *    reports; otherwise the stride is width rounded up to a multiple of 16
+ *    and the buffer is zero-initialized.
+ *
+ * \return
+ *    the new image, or 0 if any allocation failed (nothing is leaked in
+ *    that case). Release with free_memImage().
+ ************************************************************************/
+VideoImage *get_memImage(int width, int height)
+{
+	int i, stride;
+	VideoImage *image = (VideoImage *)calloc(1, sizeof(VideoImage));
+
+	if (!image)
+		return 0;
+
+	if((image->img   = (imgpel**)malloc(height * sizeof(imgpel*))) == NULL)
+	{
+		free(image);
+		return 0;
+	}
+
+#ifdef H264_IPP
+	image->base_address = (imgpel* )ippiMalloc_8u_C1(width, height+1, &stride); // height+1 so we can deal with overreading
+#else
+	stride = ROUNDUP16(width);
+	// calloc both zero-fills the buffer and guards the size multiplication
+	image->base_address = (imgpel* )calloc((size_t)stride * height, sizeof(imgpel));
+#endif
+	if (!image->base_address)
+	{
+		free(image->img);
+		free(image);
+		return 0;
+	}
+
+	image->stride = stride;
+
+	for(i = 0 ; i < height; i++)
+		image->img[i] = image->base_address + stride*i;
+
+#ifdef H264_IPP
+	image->next = 0;
+#endif
+
+	return image;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Release an image allocated with get_memImage(): the row-pointer
+ *    table, the pixel buffer and the image structure itself.
+ *    A null image is ignored.
+ ************************************************************************/
+void free_memImage(VideoImage *image)
+{
+	if (!image)
+		return;
+
+	free(image->img);
+	#ifdef H264_IPP
+	ippiFree(image->base_address);
+	#else
+	free(image->base_address);
+	#endif
+	free(image);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 3D memory array -> imgpel array3D[dim0][dim1][dim2]
+ *    The top level is a table of dim0 pointers into one 2D array of
+ *    dim0 * dim1 rows obtained from get_mem2Dpel().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem3Dpel(imgpel ****array3D, int dim0, int dim1, int dim2)
+{
+  int plane;
+  int total_bytes = dim0 * sizeof(imgpel**);
+
+  *array3D = (imgpel***)malloc(dim0 * sizeof(imgpel**));
+  if (*array3D == NULL)
+    no_mem_exit("get_mem3Dpel: array3D");
+
+  total_bytes += get_mem2Dpel(*array3D, dim0 * dim1, dim2);
+
+  /* each plane owns dim1 consecutive entries of the row table */
+  for (plane = 1; plane < dim0; ++plane)
+    (*array3D)[plane] = (*array3D)[plane - 1] + dim1;
+  
+  return total_bytes;
+}
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 4D memory array -> imgpel array4D[dim0][dim1][dim2][dim3]
+ *    The top level is a table of dim0 pointers into one 3D array of
+ *    dim0 * dim1 planes obtained from get_mem3Dpel().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem4Dpel(imgpel *****array4D, int dim0, int dim1, int dim2, int dim3)
+{  
+  int cube;
+  int total_bytes = dim0 * sizeof(imgpel***);
+
+  *array4D = (imgpel****)malloc(dim0 * sizeof(imgpel***));
+  if (*array4D == NULL)
+    no_mem_exit("get_mem4Dpel: array4D");
+
+  total_bytes += get_mem3Dpel(*array4D, dim0 * dim1, dim2, dim3);
+
+  /* each top-level entry owns dim1 consecutive planes */
+  for (cube = 1; cube < dim0; ++cube)
+    (*array4D)[cube] = (*array4D)[cube - 1] + dim1;
+
+  return total_bytes;
+}
+
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 1D memory array
+ *    which was allocated with get_mem1Dpel().
+ *    A null pointer is reported through error() with code 100.
+ ************************************************************************
+ */
+void free_mem1Dpel(imgpel *array1D)
+{
+  if (array1D == NULL)
+  {
+    error ("free_mem1Dpel: trying to free unused memory",100);
+    return;
+  }
+
+  free (array1D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D memory array
+ *    which was allocated with get_mem2Dpel().
+ *    A null table or a null sample block is reported through error() with
+ *    code 100; the table itself is still freed when only the sample block
+ *    is missing.
+ ************************************************************************
+ */
+void free_mem2Dpel(imgpel **array2D)
+{
+  if (array2D == NULL)
+  {
+    error ("free_mem2Dpel: trying to free unused memory",100);
+    return;
+  }
+
+  /* row 0 points at the start of the contiguous sample block */
+  if (*array2D == NULL)
+    error ("free_mem2Dpel: trying to free unused memory",100);
+  else
+    free (*array2D);
+
+  free (array2D);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D memory array
+ *    which was allocated with get_mem3Dpel().
+ *    A null pointer is reported through error() with code 100.
+ ************************************************************************
+ */
+void free_mem3Dpel(imgpel ***array3D)
+{
+  if (array3D == NULL)
+  {
+    error ("free_mem3Dpel: trying to free unused memory",100);
+    return;
+  }
+
+  /* entry 0 is the underlying 2D array */
+  free_mem2Dpel(*array3D);
+  free (array3D);
+}
+/*!
+ ************************************************************************
+ * \brief
+ *    free 4D memory array
+ *    which was allocated with get_mem4Dpel().
+ *    A null pointer is reported through error() with code 100.
+ ************************************************************************
+ */
+void free_mem4Dpel(imgpel ****array4D)
+{
+  if (array4D == NULL)
+  {
+    error ("free_mem4Dpel: trying to free unused memory",100);
+    return;
+  }
+
+  /* entry 0 is the underlying 3D array */
+  free_mem3Dpel(*array4D);
+  free (array4D);
+}
+/*!
+ ************************************************************************
+ * \brief
+ *    free 5D memory array
+ *    which was allocated with get_mem5Dpel().
+ *    A null pointer is reported through error() with code 100.
+ ************************************************************************
+ */
+void free_mem5Dpel(imgpel *****array5D)
+{
+  if (array5D == NULL)
+  {
+    error ("free_mem5Dpel: trying to free unused memory",100);
+    return;
+  }
+
+  /* entry 0 is the underlying 4D array */
+  free_mem4Dpel(*array5D);
+  free (array5D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 2D memory array -> unsigned char array2D[dim0][dim1]
+ *    A table of dim0 row pointers indexes one contiguous, zero-initialized
+ *    block of dim0 * dim1 bytes.
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************/
+int get_mem2D(byte ***array2D, int dim0, int dim1)
+{
+  byte **rows;
+  int r;
+
+  rows = (byte**)malloc(dim0 *      sizeof(byte*));
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2D: array2D");
+
+  rows[0] = (byte* )calloc(dim0 * dim1,sizeof(byte ));
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2D: array2D");
+
+  /* every further row starts dim1 bytes after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(byte*) + dim1 * sizeof(byte));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 2D memory array -> int array2D[dim0][dim1]
+ *    A table of dim0 row pointers indexes one contiguous, zero-initialized
+ *    block of dim0 * dim1 ints.
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem2Dint(int ***array2D, int dim0, int dim1)
+{
+  int **rows;
+  int r;
+
+  rows = (int**)malloc(dim0 *       sizeof(int*));
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2Dint: array2D");
+
+  rows[0] = (int* )calloc(dim0 * dim1, sizeof(int ));
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2Dint: array2D");
+
+  /* every further row starts dim1 ints after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(int*) + dim1 * sizeof(int));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 2D memory array -> h264_ref_t array2D[dim0][dim1]
+ *    The element block is contiguous, zero-initialized, 32-byte aligned
+ *    and its byte size is rounded up to a multiple of 16. It must be
+ *    released with free_mem2Dref().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+static int get_mem2Dref(h264_ref_t ***array2D, int dim0, int dim1)
+{
+  int i;
+	// widen before multiplying so the element count is not computed in int
+	size_t malloc_size = ROUNDUP16((size_t)dim0 * dim1 * sizeof(h264_ref_t));
+  if((*array2D    = (h264_ref_t**)malloc(dim0 *      sizeof(h264_ref_t*))) == NULL)
+    no_mem_exit("get_mem2Dref: array2D");
+  if((*(*array2D) = (h264_ref_t* )_aligned_malloc(malloc_size, 32)) == NULL)
+    no_mem_exit("get_mem2Dref: array2D");
+	memset((*array2D)[0], 0, malloc_size);
+
+  for(i = 1; i < dim0; i++)
+    (*array2D)[i] =  (*array2D)[i-1] + dim1;
+
+  return dim0 * (sizeof(h264_ref_t*) + dim1 * sizeof(h264_ref_t));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 2D memory array -> PicMotion array2D[dim0][dim1]
+ *    The element block is contiguous, zero-initialized, 32-byte aligned
+ *    and its byte size is rounded up to a multiple of 16. The row-pointer
+ *    table has one extra entry, array2D[dim0], which is set to 0. Release
+ *    with free_mem2DPicMotion().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem2DPicMotion(PicMotion ***array2D, int dim0, int dim1)
+{
+	// we allocate with one extra position in the first dimension
+	// so the motion_cache can use it as a next pointer
+  int i;
+	// widen before multiplying so the element count is not computed in int
+	size_t malloc_size = ROUNDUP16((size_t)dim0 * dim1 * sizeof(PicMotion));
+  if((*array2D    = (PicMotion**)malloc((dim0+1) *      sizeof(PicMotion*))) == NULL)
+    no_mem_exit("get_mem2DPicMotion: array2D");
+  if((*(*array2D) = (PicMotion* )_aligned_malloc(malloc_size, 32)) == NULL)
+    no_mem_exit("get_mem2DPicMotion: array2D");
+	memset((*array2D)[0], 0, malloc_size);
+
+  for(i = 1; i < dim0; i++)
+    (*array2D)[i] =  (*array2D)[i-1] + dim1;
+	(*array2D)[dim0] = 0;
+
+  return dim0 * (sizeof(PicMotion*) + dim1 * sizeof(PicMotion));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 3D memory array -> unsigned char array3D[dim0][dim1][dim2]
+ *    The top level is a table of dim0 pointers into one 2D array of
+ *    dim0 * dim1 rows obtained from get_mem2D().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem3D(byte ****array3D, int dim0, int dim1, int dim2)
+{
+  int plane;
+  int total_bytes = dim0 * sizeof(byte**);
+
+  *array3D = (byte***)malloc(dim0 * sizeof(byte**));
+  if (*array3D == NULL)
+    no_mem_exit("get_mem3D: array3D");
+
+  total_bytes += get_mem2D(*array3D, dim0 * dim1, dim2);
+
+  /* each plane owns dim1 consecutive entries of the row table */
+  for (plane = 1; plane < dim0; ++plane)
+    (*array3D)[plane] = (*array3D)[plane - 1] + dim1;
+
+  return total_bytes;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 4D memory array -> unsigned char array4D[dim0][dim1][dim2][dim3]
+ *    The top level is a table of dim0 pointers into one 3D array of
+ *    dim0 * dim1 planes obtained from get_mem3D().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem4D(byte *****array4D, int dim0, int dim1, int dim2, int dim3)
+{
+  int cube;
+  int total_bytes = dim0 * sizeof(byte***);
+
+  *array4D = (byte****)malloc(dim0 * sizeof(byte***));
+  if (*array4D == NULL)
+    no_mem_exit("get_mem4D: array4D");
+
+  total_bytes += get_mem3D(*array4D, dim0 * dim1, dim2, dim3);
+
+  /* each top-level entry owns dim1 consecutive planes */
+  for (cube = 1; cube < dim0; ++cube)
+    (*array4D)[cube] = (*array4D)[cube - 1] + dim1;
+
+  return total_bytes;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 3D memory array -> int array3D[dim0][dim1][dim2]
+ *    The top level is a table of dim0 pointers into one 2D array of
+ *    dim0 * dim1 rows obtained from get_mem2Dint().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem3Dint(int ****array3D, int dim0, int dim1, int dim2)
+{
+  int plane;
+  int total_bytes = dim0 * sizeof(int**);
+
+  *array3D = (int***)malloc(dim0 * sizeof(int**));
+  if (*array3D == NULL)
+    no_mem_exit("get_mem3Dint: array3D");
+
+  total_bytes += get_mem2Dint(*array3D, dim0 * dim1, dim2);
+
+  /* each plane owns dim1 consecutive entries of the row table */
+  for (plane = 1; plane < dim0; ++plane)
+    (*array3D)[plane] = (*array3D)[plane - 1] + dim1;
+
+  return total_bytes;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 3D memory array -> h264_ref_t array3D[dim0][dim1][dim2]
+ *    The top level is a table of dim0 pointers into one 2D array of
+ *    dim0 * dim1 rows obtained from get_mem2Dref(). Release with
+ *    free_mem3Dref().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem3Dref(h264_ref_t ****array3D, int dim0, int dim1, int dim2)
+{
+  int  i, mem_size = dim0 * sizeof(h264_ref_t**);
+
+  if(((*array3D) = (h264_ref_t***)malloc(dim0 * sizeof(h264_ref_t**))) == NULL)
+    no_mem_exit("get_mem3Dref: array3D");
+
+  mem_size += get_mem2Dref(*array3D, dim0 * dim1, dim2);
+
+  // each plane owns dim1 consecutive entries of the row table
+  for(i = 1; i < dim0; i++)
+    (*array3D)[i] =  (*array3D)[i-1] + dim1;
+
+  return mem_size;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 4D memory array -> int array4D[dim0][dim1][dim2][dim3]
+ *    The top level is a table of dim0 pointers into one 3D array of
+ *    dim0 * dim1 planes obtained from get_mem3Dint().
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_mem4Dint(int *****array4D, int dim0, int dim1, int dim2, int dim3)
+{
+  int cube;
+  int total_bytes = dim0 * sizeof(int***);
+
+  *array4D = (int****)malloc(dim0 * sizeof(int***));
+  if (*array4D == NULL)
+    no_mem_exit("get_mem4Dint: array4D");
+
+  total_bytes += get_mem3Dint(*array4D, dim0 * dim1, dim2, dim3);
+
+  /* each top-level entry owns dim1 consecutive planes */
+  for (cube = 1; cube < dim0; ++cube)
+    (*array4D)[cube] = (*array4D)[cube - 1] + dim1;
+
+  return total_bytes;
+}
+
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D memory array
+ *    which was allocated with get_mem2D().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem2D(byte **array2D)
+{
+  if (array2D == NULL)
+    return;
+
+  free (*array2D);   /* contiguous data block, addressed by row 0 */
+  free (array2D);    /* row-pointer table */
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D memory array
+ *    which was allocated with get_mem2Dint().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem2Dint(int **array2D)
+{
+  if (array2D == NULL)
+    return;
+
+  free (*array2D);   /* contiguous data block, addressed by row 0 */
+  free (array2D);    /* row-pointer table */
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D memory array
+ *    which was allocated with get_mem2Dref().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem2Dref(h264_ref_t **array2D)
+{
+	if (array2D == NULL)
+		return;
+
+	_aligned_free (*array2D);   /* aligned data block, addressed by row 0 */
+	free (array2D);             /* row-pointer table */
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D memory array
+ *    which was allocated with get_mem2DPicMotion().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem2DPicMotion(PicMotion **array2D)
+{
+	if (array2D == NULL)
+		return;
+
+	_aligned_free (*array2D);   /* aligned data block, addressed by row 0 */
+	free (array2D);             /* row-pointer table */
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D memory array
+ *    which was allocated with get_mem3D().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem3D(byte ***array3D)
+{
+  if (array3D == NULL)
+    return;
+
+  free_mem2D(*array3D);   /* entry 0 is the underlying 2D array */
+  free (array3D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 4D memory array
+ *    which was allocated with get_mem4D().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem4D(byte ****array4D)
+{
+  if (array4D == NULL)
+    return;
+
+  free_mem3D(*array4D);   /* entry 0 is the underlying 3D array */
+  free (array4D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D memory array
+ *    which was allocated with get_mem3Dint().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem3Dint(int ***array3D)
+{
+  if (array3D == NULL)
+    return;
+
+  free_mem2Dint(*array3D);   /* entry 0 is the underlying 2D array */
+  free (array3D);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D memory array
+ *    which was allocated with get_mem3Dref().
+ *    A null pointer is silently ignored.
+ ************************************************************************
+ */
+void free_mem3Dref(h264_ref_t ***array3D)
+{
+  if (array3D == NULL)
+    return;
+
+  free_mem2Dref(*array3D);   /* entry 0 is the underlying 2D array */
+  free (array3D);
+}
+
+void free_mem3DPicMotion(PicMotion ***array3D)
+{
+  /* 2D level goes to free_mem2DPicMotion(), plane table to free() */
+  if (array3D == NULL)
+    return;
+  free_mem2DPicMotion(array3D[0]);
+  free (array3D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 4D int array: the 3D level reached through the first entry
+ *    is handed to free_mem3Dint(), then the outermost table is freed
+ ************************************************************************
+ */
+void free_mem4Dint(int ****array4D)
+{
+  /* a NULL array is silently ignored */
+  if (array4D == NULL)
+    return;
+  free_mem3Dint(array4D[0]);
+  free (array4D);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Format an out-of-memory message into errortext and hand it to error()
+ * \param where
+ *    text naming the allocation that failed; appended to the message
+ ************************************************************************
+ */
+void no_mem_exit(char *where)
+{
+  snprintf(errortext, ET_SIZE, "Could not allocate memory: %s", where);
+  error(errortext, 100);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate uint16 array2D[dim0][dim1]: a row-pointer table plus one
+ *    zero-filled block of dim0*dim1 elements that the rows point into
+ *
+ * \return
+ *    dim0 * (sizeof(uint16*) + dim1 * sizeof(uint16))
+ ************************************************************************
+ */
+int get_mem2Duint16(uint16 ***array2D, int dim0, int dim1)
+{
+  uint16 **rows = (uint16**)malloc(dim0 * sizeof(uint16*));
+  int r;
+
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2Duint16: array2D");
+
+  rows[0] = (uint16*)calloc(dim0 * dim1, sizeof(uint16));
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2Duint16: array2D");
+
+  /* every following row starts dim1 elements after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(uint16*) + dim1 * sizeof(uint16));
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate short array2D[dim0][dim1]. The element block is requested
+ *    from _aligned_malloc() with alignment 32, its byte size is passed
+ *    through ROUNDUP16, and the whole block is zero-filled.
+ *
+ * \return
+ *    dim0 * (sizeof(short*) + dim1 * sizeof(short)); the rounding added
+ *    by ROUNDUP16 is not included
+ ************************************************************************
+ */
+int get_mem2Dshort(short ***array2D, int dim0, int dim1)
+{
+  const size_t bytes = ROUNDUP16(dim0 * dim1*sizeof(short));
+  short **rows = (short**)malloc(dim0 * sizeof(short*));
+  int r;
+
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2Dshort: array2D");
+
+  rows[0] = (short*)_aligned_malloc(bytes, 32);
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2Dshort: array2D");
+  memset(rows[0], 0, bytes);
+
+  /* every following row starts dim1 elements after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(short*) + dim1 * sizeof(short));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate short array3D[dim0][dim1][dim2]: a plane table of dim0
+ *    entries on top of one get_mem2Dshort() array of dim0*dim1 rows
+ *
+ * \return
+ *    plane-table size plus the value returned by get_mem2Dshort()
+ ************************************************************************
+ */
+int get_mem3Dshort(short ****array3D,int dim0, int dim1, int dim2)
+{
+  short ***planes = (short***)malloc(dim0 * sizeof(short**));
+  int mem_size = dim0 * sizeof(short**);
+  int p;
+
+  *array3D = planes;
+  if (planes == NULL)
+    no_mem_exit("get_mem3Dshort: array3D");
+
+  /* the 2D allocator stores its row table in planes[0] */
+  mem_size += get_mem2Dshort(planes, dim0 * dim1, dim2);
+
+  /* each plane owns dim1 consecutive rows of that table */
+  for (p = 1; p < dim0; ++p)
+    planes[p] = planes[p - 1] + dim1;
+
+  return mem_size;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate MotionVector array2D[dim0][dim1]. The element block comes
+ *    from _aligned_malloc() with alignment 32, holds
+ *    ROUNDUP16(dim0 * dim1) elements, and is zero-filled.
+ *
+ * \return
+ *    the row-pointer table, or 0 if either allocation failed (the table
+ *    is freed again when only the element block could not be obtained)
+ ************************************************************************
+ */
+static MotionVector **get_mem2DMotionVector(int dim0, int dim1)
+{
+  const size_t bytes = ROUNDUP16(dim0 * dim1)*sizeof(MotionVector);
+  MotionVector **rows = (MotionVector**)malloc(dim0 * sizeof(MotionVector*));
+  int r;
+
+  if (rows == NULL)
+    return 0;
+
+  rows[0] = (MotionVector*)_aligned_malloc(bytes, 32);
+  if (rows[0] == NULL)
+  {
+    free(rows);
+    return 0;
+  }
+  memset(rows[0], 0, bytes);
+
+  /* every following row starts dim1 elements after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return rows;
+}
+
+/* Allocate MotionVector array3D[dim0][dim1][dim2] on top of a single
+ * get_mem2DMotionVector() array of dim0*dim1 rows.
+ * Returns the plane table, or 0 when an allocation fails. */
+MotionVector ***get_mem3DMotionVector(int dim0, int dim1, int dim2)
+{
+  MotionVector ***planes = (MotionVector***)malloc(dim0 * sizeof(MotionVector **));
+  int p;
+
+  if (planes == NULL)
+    return 0;
+
+  planes[0] = get_mem2DMotionVector(dim0 * dim1, dim2);
+  if (planes[0] == NULL)
+  {
+    /* do not leak the plane table when the 2D level is missing */
+    free(planes);
+    return 0;
+  }
+
+  /* each plane owns dim1 consecutive rows of the 2D array */
+  for (p = 1; p < dim0; ++p)
+    planes[p] = planes[p - 1] + dim1;
+
+  return planes;
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D uint16 array: both the block behind the first row pointer
+ *    and the row-pointer table are released with free()
+ ************************************************************************
+ */
+void free_mem2Duint16(uint16 **array2D)
+{
+  /* a NULL table means there is nothing to release */
+  if (array2D == NULL)
+    return;
+
+  free (array2D[0]);
+  free (array2D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D short array: the block behind the first row pointer is
+ *    released with _aligned_free(), the row-pointer table with free()
+ ************************************************************************
+ */
+void free_mem2Dshort(short **array2D)
+{
+  /* a NULL table means there is nothing to release */
+  if (array2D == NULL)
+    return;
+  _aligned_free (array2D[0]);
+  free (array2D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D MotionVector array: the block behind the first row pointer
+ *    is released with _aligned_free(), the row-pointer table with free()
+ ************************************************************************
+ */
+
+static void free_mem2DMotionVector(MotionVector **array2D)
+{
+  /* a NULL table means there is nothing to release */
+  if (array2D == NULL)
+    return;
+
+  _aligned_free(array2D[0]);
+  free (array2D);
+}
+
+void free_mem3DMotionVector(MotionVector ***array3D)
+{
+  /* 2D level goes to free_mem2DMotionVector(), plane table to free() */
+  if (array3D == NULL)
+    return;
+  free_mem2DMotionVector(array3D[0]);
+  free (array3D);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate double array2D[dim0][dim1]: a row-pointer table plus one
+ *    zero-filled block of dim0*dim1 elements that the rows point into
+ *
+ * \return
+ *    dim0 * (sizeof(double*) + dim1 * sizeof(double))
+ ************************************************************************
+ */
+int get_mem2Ddouble(double ***array2D, int dim0, int dim1)
+{
+  double **rows = (double**)malloc(dim0 * sizeof(double*));
+  int r;
+
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2Ddouble: array2D");
+
+  rows[0] = (double*)calloc(dim0 * dim1, sizeof(double));
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2Ddouble: array2D");
+
+  /* every following row starts dim1 elements after the previous one */
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(double*) + dim1 * sizeof(double));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate double array2D[dim0][dim1] whose first row pointer is
+ *    advanced by offset elements; the remaining rows are chained from
+ *    that shifted pointer, dim1 elements apart. free_mem2Dodouble()
+ *    must be given the same offset to undo the shift.
+ *
+ * \return
+ *    dim0 * (sizeof(double*) + dim1 * sizeof(double))
+ ************************************************************************
+ */
+int get_mem2Dodouble(double ***array2D, int dim0, int dim1, int offset)
+{
+  double **rows = (double**)malloc(dim0 * sizeof(double*));
+  int r;
+
+  *array2D = rows;
+  if (rows == NULL)
+    no_mem_exit("get_mem2Dodouble: array2D");
+
+  rows[0] = (double*)calloc(dim0 * dim1, sizeof(double));
+  if (rows[0] == NULL)
+    no_mem_exit("get_mem2Dodouble: array2D");
+
+  /* shift row 0 first so that every chained row inherits the offset */
+  rows[0] += offset;
+  for (r = 1; r < dim0; ++r)
+    rows[r] = rows[r - 1] + dim1;
+
+  return dim0 * (sizeof(double*) + dim1 * sizeof(double));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate double array3D[dim0][dim1][dim2] with the second index
+ *    shifted by offset: valid second indices run from -offset up to
+ *    dim1 - offset - 1. Every innermost vector is its own zero-filled
+ *    calloc() block.
+ *
+ * \return
+ *    dim0 * (sizeof(double**) + dim1 * (sizeof(double*) + dim2 * sizeof(double)))
+ ************************************************************************
+ */
+int get_mem3Dodouble(double ****array3D, int dim0, int dim1, int dim2, int offset)
+{
+  double ***planes = (double***)malloc(dim0 * sizeof(double**));
+  int p, q;
+
+  *array3D = planes;
+  if (planes == NULL)
+    no_mem_exit("get_mem3Dodouble: array3D");
+
+  planes[0] = (double**)calloc(dim0 * dim1, sizeof(double*));
+  if (planes[0] == NULL)
+    no_mem_exit("get_mem3Dodouble: array3D");
+
+  /* shift plane 0 first so that every chained plane inherits the offset */
+  planes[0] += offset;
+  for (p = 1; p < dim0; ++p)
+    planes[p] = planes[p - 1] + dim1;
+
+  for (p = 0; p < dim0; ++p)
+  {
+    for (q = -offset; q < dim1 - offset; ++q)
+    {
+      planes[p][q] = (double*)calloc(dim2, sizeof(double));
+      if (planes[p][q] == NULL)
+        no_mem_exit("get_mem3Dodouble: array3D");
+    }
+  }
+
+  return dim0*( sizeof(double**) + dim1 * ( sizeof(double*) + dim2 * sizeof(double)));
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate 2D memory array -> short array2D[dim0][dim1]
+ *    Row 0 is shifted by offset_x + offset_y * dim1 elements into the
+ *    zero-filled element block, and row pointers are then written both
+ *    below and above index 0.
+ *
+ *    NOTE(review): the row-pointer table is allocated with dim0 entries
+ *    and is not itself shifted, yet the first loop stores entries at
+ *    indices -1 .. -offset_y. For offset_y > 0 that looks like a write
+ *    in front of the malloc() block -- confirm against the callers.
+ *    NOTE(review): the second loop is bounded by dim1 - offset_y, not by
+ *    dim0; for dim1 - offset_y > dim0 it would write past the table --
+ *    confirm whether dim0 was intended.
+ *
+ * \par Output:
+ *    memory size in bytes
+ ************************************************************************
+ */
+int get_offset_mem2Dshort(short ***array2D, int dim0, int dim1, int offset_y, int offset_x)
+{
+  int i;
+
+  if((*array2D      = (short**)malloc(dim0 * sizeof(short*))) == NULL)
+    no_mem_exit("get_offset_mem2Dshort: array2D");
+
+  if(((*array2D)[0] = (short* )calloc(dim0 * dim1, sizeof(short))) == NULL)
+    no_mem_exit("get_offset_mem2Dshort: array2D");
+  /* free_offset_mem2Dshort() subtracts this same amount before freeing */
+  (*array2D)[0] += offset_x + offset_y * dim1;
+
+  /* rows at negative indices: each one dim1 elements before its successor */
+  for(i=-1 ; i > -offset_y - 1; i--)
+  {
+    (*array2D)[i] =  (*array2D)[i+1] - dim1;
+  }
+
+  /* rows at positive indices: each one dim1 elements after its predecessor */
+  for(i=1 ; i < dim1 - offset_y; i++)
+    (*array2D)[i] =  (*array2D)[i-1] + dim1;
+
+  return dim0 * (sizeof(short*) + dim1 * sizeof(short));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate int array3D[dim0][dim1][dim2] with the second index
+ *    shifted by offset: valid second indices run from -offset up to
+ *    dim1 - offset - 1. Every innermost vector is its own zero-filled
+ *    calloc() block.
+ *
+ * \return
+ *    dim0 * (sizeof(int**) + dim1 * (sizeof(int*) + dim2 * sizeof(int)))
+ ************************************************************************
+ */
+int get_mem3Doint(int ****array3D, int dim0, int dim1, int dim2, int offset)
+{
+  int ***planes = (int***)malloc(dim0 * sizeof(int**));
+  int p, q;
+
+  *array3D = planes;
+  if (planes == NULL)
+    no_mem_exit("get_mem3Doint: array3D");
+
+  planes[0] = (int**)calloc(dim0 * dim1, sizeof(int*));
+  if (planes[0] == NULL)
+    no_mem_exit("get_mem3Doint: array3D");
+
+  /* shift plane 0 first so that every chained plane inherits the offset */
+  planes[0] += offset;
+  for (p = 1; p < dim0; ++p)
+    planes[p] = planes[p - 1] + dim1;
+
+  for (p = 0; p < dim0; ++p)
+  {
+    for (q = -offset; q < dim1 - offset; ++q)
+    {
+      planes[p][q] = (int*)calloc(dim2, sizeof(int));
+      if (planes[p][q] == NULL)
+        no_mem_exit("get_mem3Doint: array3D");
+    }
+  }
+
+  return dim0 * (sizeof(int**) + dim1 * (sizeof(int*) + dim2 * sizeof(int)));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate int array2D[dim0][dim1] whose first row pointer is
+ *    advanced by offset elements; the remaining rows are chained from
+ *    that shifted pointer, dim1 elements apart. free_mem2Doint() must
+ *    be given the same offset to undo the shift.
+ *
+ * \return
+ *    dim0 * (sizeof(int*) + dim1 * sizeof(int))
+ ************************************************************************
+ */
+int get_mem2Doint(int ***array2D, int dim0, int dim1, int offset)
+{
+  int i;
+
+  /* failure messages now name this function rather than get_mem2Dint */
+  if((*array2D      = (int**)malloc(dim0 * sizeof(int*))) == NULL)
+    no_mem_exit("get_mem2Doint: array2D");
+  if(((*array2D)[0] = (int* )calloc(dim0 * dim1, sizeof(int))) == NULL)
+    no_mem_exit("get_mem2Doint: array2D");
+
+  /* shift row 0 first so that every chained row inherits the offset */
+  (*array2D)[0] += offset;
+
+  for(i=1 ; i<dim0 ; i++)
+    (*array2D)[i] =  (*array2D)[i-1] + dim1  ;
+
+  return dim0 * (sizeof(int*) + dim1 * sizeof(int));
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate double array3D[dim0][dim1][dim2]: a plane table of dim0
+ *    entries on top of one get_mem2Ddouble() array of dim0*dim1 rows
+ *
+ * \return
+ *    plane-table size plus the value returned by get_mem2Ddouble()
+ ************************************************************************
+ */
+// same change as in get_mem3Dint
+int get_mem3Ddouble(double ****array3D, int dim0, int dim1, int dim2)
+{
+  double ***planes = (double***)malloc(dim0 * sizeof(double**));
+  double **rows;
+  int mem_size = dim0 * sizeof(double**);
+  int p;
+
+  *array3D = planes;
+  if (planes == NULL)
+    no_mem_exit("get_mem3Ddouble: array3D");
+
+  mem_size += get_mem2Ddouble(&rows, dim0 * dim1, dim2);
+
+  /* plane p starts at row p * dim1 of the shared 2D array */
+  for (p = 0; p < dim0; ++p)
+    planes[p] = rows + p * dim1;
+
+  return mem_size;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 2D double array. A NULL table, or a NULL first row pointer,
+ *    is reported through error() instead of being freed.
+ ************************************************************************
+ */
+void free_mem2Ddouble(double **array2D)
+{
+  if (array2D == NULL)
+  {
+    error ("free_mem2Ddouble: trying to free unused memory",100);
+    return;
+  }
+
+  if (array2D[0] == NULL)
+    error ("free_mem2Ddouble: trying to free unused memory",100);
+  else
+    free (array2D[0]);
+
+  free (array2D);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    free 2D double array with offset: the first row pointer is moved
+*    back by offset elements before it is freed. A NULL table, or a
+*    NULL result after the shift, is reported through error().
+************************************************************************
+*/
+void free_mem2Dodouble(double **array2D, int offset)
+{
+  if (array2D == NULL)
+  {
+    error ("free_mem2Dodouble: trying to free unused memory",100);
+    return;
+  }
+
+  /* undo the shift applied to row 0 at allocation time */
+  array2D[0] -= offset;
+  if (array2D[0] == NULL)
+    error ("free_mem2Dodouble: trying to free unused memory",100);
+  else
+    free (array2D[0]);
+
+  free (array2D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D double array with offset: every innermost vector at second
+ *    index -offset .. dim1 - offset - 1 is freed, then the shifted 2D
+ *    level, then the plane table. NULL pointers met on the way are
+ *    reported through error().
+ ************************************************************************
+ */
+void free_mem3Dodouble(double ***array3D, int dim0, int dim1, int offset)
+{
+  int p, q;
+
+  if (array3D == NULL)
+  {
+    error ("free_mem3Dodouble: trying to free unused memory",100);
+    return;
+  }
+
+  for (p = 0; p < dim0; ++p)
+  {
+    for (q = -offset; q < dim1 - offset; ++q)
+    {
+      if (array3D[p][q] == NULL)
+        error ("free_mem3Dodouble: trying to free unused memory",100);
+      else
+        free(array3D[p][q]);
+    }
+  }
+
+  /* undo the shift applied to plane 0 before releasing the 2D level */
+  array3D[0] -= offset;
+  if (array3D[0] == NULL)
+    error ("free_mem3Dodouble: trying to free unused memory",100);
+  else
+    free(array3D[0]);
+
+  free (array3D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D int array with offset: every innermost vector at second
+ *    index -offset .. dim1 - offset - 1 is freed, then the shifted 2D
+ *    level, then the plane table. NULL pointers met on the way are
+ *    reported through error().
+ ************************************************************************
+ */
+void free_mem3Doint(int ***array3D, int dim0, int dim1, int offset)
+{
+  int p, q;
+
+  if (array3D == NULL)
+  {
+    error ("free_mem3Doint: trying to free unused memory",100);
+    return;
+  }
+
+  for (p = 0; p < dim0; ++p)
+  {
+    for (q = -offset; q < dim1 - offset; ++q)
+    {
+      if (array3D[p][q] == NULL)
+        error ("free_mem3Doint: trying to free unused memory",100);
+      else
+        free(array3D[p][q]);
+    }
+  }
+
+  /* undo the shift applied to plane 0 before releasing the 2D level */
+  array3D[0] -= offset;
+  if (array3D[0] == NULL)
+    error ("free_mem3Doint: trying to free unused memory",100);
+  else
+    free(array3D[0]);
+
+  free (array3D);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    free 2D int array with offset: the first row pointer is moved back
+*    by offset elements before it is freed. A NULL table, or a NULL
+*    result after the shift, is reported through error().
+************************************************************************
+*/
+void free_mem2Doint(int **array2D, int offset)
+{
+  if (array2D == NULL)
+  {
+    error ("free_mem2Doint: trying to free unused memory",100);
+    return;
+  }
+
+  /* undo the shift applied to row 0 at allocation time */
+  array2D[0] -= offset;
+  if (array2D[0] == NULL)
+    error ("free_mem2Doint: trying to free unused memory",100);
+  else
+    free (array2D[0]);
+
+  free (array2D);
+}
+
+/*!
+************************************************************************
+* \brief
+*    free 2D short array with x/y offset: the first row pointer is moved
+*    back by offset_x + offset_y * dim1 elements before it is freed.
+*    A NULL table, or a NULL result after the shift, is reported
+*    through error().
+************************************************************************
+*/
+void free_offset_mem2Dshort(short **array2D, int dim1, int offset_y, int offset_x)
+{
+  if (array2D == NULL)
+  {
+    error ("free_offset_mem2Dshort: trying to free unused memory",100);
+    return;
+  }
+
+  /* undo the shift applied to row 0 at allocation time */
+  array2D[0] -= offset_x + offset_y * dim1;
+  if (array2D[0] == NULL)
+    error ("free_offset_mem2Dshort: trying to free unused memory",100);
+  else
+    free (array2D[0]);
+
+  free (array2D);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free 3D double array: the 2D level reached through the first plane
+ *    pointer is handed to free_mem2Ddouble(), then the plane table is
+ *    freed. A NULL argument is reported through error().
+ ************************************************************************
+ */
+void free_mem3Ddouble(double ***array3D)
+{
+  if (array3D)
+  {
+    free_mem2Ddouble(*array3D);
+    free (array3D);
+  } 
+  else
+  {
+    /* message now names this function rather than free_mem3D */
+    error ("free_mem3Ddouble: trying to free unused memory",100);
+  }
+}
+
+
+#endif
diff --git a/Src/h264dec/lcommon/src/memcache.c b/Src/h264dec/lcommon/src/memcache.c
new file mode 100644
index 00000000..ce3b29d1
--- /dev/null
+++ b/Src/h264dec/lcommon/src/memcache.c
@@ -0,0 +1,106 @@
+#include "memcache.h"
+#include "mbuffer.h"
+#include "memalloc.h"
+
+/* Free every image on the cache list and reset the cached dimensions to 0. */
+void image_cache_flush(ImageCache *cache)
+{
+	VideoImage *following;
+
+	/* fetch the successor before the current head is freed */
+	for (; cache->head; cache->head = following)
+	{
+		following = cache->head->next;
+		free_memImage(cache->head);
+	}
+	cache->size_x = 0;
+	cache->size_y = 0;
+}
+
+/* Record new dimensions; when they differ from the current ones the cache
+ * is flushed first. */
+void image_cache_set_dimensions(ImageCache *cache, int width, int height)
+{
+	if (width == cache->size_x && height == cache->size_y)
+		return; /* unchanged: keep the cached images */
+
+	image_cache_flush(cache);
+	cache->size_x = width;
+	cache->size_y = height;
+}
+
+/* Returns 1 when both width and height equal the cache's dimensions, else 0. */
+int image_cache_dimensions_match(ImageCache *cache, int width, int height)
+{
+	if (width == cache->size_x && height == cache->size_y)
+		return 1;
+
+	return 0;
+}
+
+/* Push image onto the front of the cache list. */
+void image_cache_add(ImageCache *cache, VideoImage *image)
+{
+	VideoImage *previous_head = cache->head;
+
+	image->next = previous_head;
+	cache->head = image;
+}
+
+/* Pop the front image off the cache list; returns 0 when the list is empty. */
+struct video_image *image_cache_get(ImageCache *cache)
+{
+	VideoImage *entry = cache->head;
+
+	if (!entry)
+		return 0;
+
+	cache->head = entry->next;
+	entry->next = 0; /* detach the returned image from the list */
+	return entry;
+}
+
+/* ------------- 
+
+PicMotion arrays are allocated with one extra slot in the first dimension
+(index size_y), which the cache uses to store the next pointer
+------------- */
+
+
+/* Free every PicMotion array on the cache list and reset the cached
+ * dimensions to 0. The link to the next array is read from slot size_y. */
+void motion_cache_flush(MotionCache *cache)
+{
+	PicMotion **following;
+
+	/* fetch the successor before the current head is freed */
+	for (; cache->head; cache->head = following)
+	{
+		following = (PicMotion **)cache->head[cache->size_y];
+		free_mem2DPicMotion(cache->head);
+	}
+	cache->size_x = 0;
+	cache->size_y = 0;
+}
+
+/* Record new dimensions; when they differ from the current ones the cache
+ * is flushed first. */
+void motion_cache_set_dimensions(MotionCache *cache, int width, int height)
+{
+	if (width == cache->size_x && height == cache->size_y)
+		return; /* unchanged: keep the cached arrays */
+
+	motion_cache_flush(cache);
+	cache->size_x = width;
+	cache->size_y = height;
+}
+
+/* Returns 1 when both width and height equal the cache's dimensions, else 0. */
+int motion_cache_dimensions_match(MotionCache *cache, int width, int height)
+{
+	if (width == cache->size_x && height == cache->size_y)
+		return 1;
+
+	return 0;
+}
+
+/* Push a PicMotion array onto the front of the cache list; the previous
+ * head is stored in the array's slot size_y. */
+void motion_cache_add(MotionCache *cache, PicMotion **image)
+{
+	PicMotion **previous_head = cache->head;
+
+	image[cache->size_y] = (PicMotion *)previous_head;
+	cache->head = image;
+}
+
+/* Pop the front PicMotion array off the cache list; returns 0 when the
+ * list is empty. Slot size_y of the returned array is cleared. */
+struct pic_motion **motion_cache_get(MotionCache *cache)
+{
+	PicMotion **entry = cache->head;
+
+	if (!entry)
+		return 0;
+
+	cache->head = (PicMotion **)entry[cache->size_y];
+	entry[cache->size_y] = 0; /* detach the returned array from the list */
+	return entry;
+}
diff --git a/Src/h264dec/lcommon/src/mv_prediction.c b/Src/h264dec/lcommon/src/mv_prediction.c
new file mode 100644
index 00000000..b4638d6d
--- /dev/null
+++ b/Src/h264dec/lcommon/src/mv_prediction.c
@@ -0,0 +1,250 @@
+/*!
+ *************************************************************************************
+ * \file mv_prediction.c
+ *
+ * \brief
+ *    Motion Vector Prediction Functions
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *      - Karsten Sühring          <suehring@hhi.de>
+ *************************************************************************************
+ */
+
+#include "global.h"
+#include "mbuffer.h"
+/* Reference index of one neighbour as seen from the current macroblock:
+ * -1 when the neighbour is unavailable; doubled when a non-field neighbour
+ * is read from a field macroblock; shifted right by one when a field
+ * neighbour is read from a non-field macroblock; unchanged otherwise. */
+static int mbaff_neighbor_ref_idx(VideoParameters *p_Vid, const PixelPos *nb,
+                                  PicMotion **motion, int cur_is_field)
+{
+  if (!nb->available)
+    return -1;
+
+  if (cur_is_field)
+    return p_Vid->mb_data[nb->mb_addr].mb_field
+      ? motion[nb->pos_y][nb->pos_x].ref_idx
+      : motion[nb->pos_y][nb->pos_x].ref_idx * 2;
+
+  return p_Vid->mb_data[nb->mb_addr].mb_field
+    ? motion[nb->pos_y][nb->pos_x].ref_idx >> 1
+    : motion[nb->pos_y][nb->pos_x].ref_idx;
+}
+
+/* Component hv of one neighbour's motion vector, 0 when the neighbour is
+ * unavailable. Component 0 is taken as stored. For the other component a
+ * non-field neighbour is halved when read from a field macroblock and a
+ * field neighbour is doubled when read from a non-field macroblock. */
+static int mbaff_neighbor_mv(VideoParameters *p_Vid, const PixelPos *nb,
+                             PicMotion **motion, int hv, int cur_is_field)
+{
+  if (!nb->available)
+    return 0;
+
+  if (hv == 0)
+    return motion[nb->pos_y][nb->pos_x].mv[hv];
+
+  if (cur_is_field)
+    return p_Vid->mb_data[nb->mb_addr].mb_field
+      ? motion[nb->pos_y][nb->pos_x].mv[hv]
+      : motion[nb->pos_y][nb->pos_x].mv[hv] / 2;
+
+  return p_Vid->mb_data[nb->mb_addr].mb_field
+    ? motion[nb->pos_y][nb->pos_x].mv[hv] * 2
+    : motion[nb->pos_y][nb->pos_x].mv[hv];
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Get motion vector predictor (variant installed by
+ *    init_motion_vector_prediction() when mb_aff_frame_flag is set).
+ *    block[0], block[1], block[2] are the left, upper and upper-right
+ *    neighbours. The predictor is one neighbour's vector when the
+ *    reference indices or the 8x16 / 16x8 block shape single it out,
+ *    otherwise the component-wise median of the three.
+ ************************************************************************
+ */
+static void GetMotionVectorPredictorMBAFF (Macroblock *currMB, 
+                                    PixelPos *block,        // <--> block neighbors
+                                    short  pmv[2],
+                                    short  ref_frame,
+                                    PicMotion **motion, 
+                                    int    mb_x,
+                                    int    mb_y,
+                                    int    blockshape_x,
+                                    int    blockshape_y)
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  const int cur_is_field = currMB->mb_field ? 1 : 0;
+
+  const int rFrameL  = mbaff_neighbor_ref_idx(p_Vid, &block[0], motion, cur_is_field);
+  const int rFrameU  = mbaff_neighbor_ref_idx(p_Vid, &block[1], motion, cur_is_field);
+  const int rFrameUR = mbaff_neighbor_ref_idx(p_Vid, &block[2], motion, cur_is_field);
+
+  const int hitL  = (rFrameL  == ref_frame);
+  const int hitU  = (rFrameU  == ref_frame);
+  const int hitUR = (rFrameUR == ref_frame);
+
+  int mvPredType = MVPRED_MEDIAN;
+  int pred_vec = 0;
+  int hv;
+
+  /* Prediction if only one of the neighbors uses the reference frame
+  *  we are checking
+  */
+  if (hitL && !hitU && !hitUR)
+    mvPredType = MVPRED_L;
+  else if (!hitL && hitU && !hitUR)
+    mvPredType = MVPRED_U;
+  else if (!hitL && !hitU && hitUR)
+    mvPredType = MVPRED_UR;
+
+  // Directional predictions
+  if (blockshape_x == 8 && blockshape_y == 16)
+  {
+    if (mb_x == 0 && hitL)
+      mvPredType = MVPRED_L;
+    else if (mb_x != 0 && hitUR)
+      mvPredType = MVPRED_UR;
+  }
+  else if (blockshape_x == 16 && blockshape_y == 8)
+  {
+    if (mb_y == 0 && hitU)
+      mvPredType = MVPRED_U;
+    else if (mb_y != 0 && hitL)
+      mvPredType = MVPRED_L;
+  }
+
+  for (hv = 0; hv < 2; hv++)
+  {
+    const int mv_a = mbaff_neighbor_mv(p_Vid, &block[0], motion, hv, cur_is_field);
+    const int mv_b = mbaff_neighbor_mv(p_Vid, &block[1], motion, hv, cur_is_field);
+    const int mv_c = mbaff_neighbor_mv(p_Vid, &block[2], motion, hv, cur_is_field);
+
+    switch (mvPredType)
+    {
+    case MVPRED_L:
+      pred_vec = mv_a;
+      break;
+    case MVPRED_U:
+      pred_vec = mv_b;
+      break;
+    case MVPRED_UR:
+      pred_vec = mv_c;
+      break;
+    case MVPRED_MEDIAN:
+      /* with neither upper neighbour available the left vector is used */
+      if (block[1].available || block[2].available)
+        pred_vec = mv_a + mv_b + mv_c - imin(mv_a, imin(mv_b, mv_c)) - imax(mv_a, imax(mv_b ,mv_c));
+      else
+        pred_vec = mv_a;
+      break;
+    default:
+      break;
+    }
+
+    pmv[hv] = (short) pred_vec;
+  }
+}
+
+/* Copy both vector components of a neighbour into pmv; a NULL neighbour
+ * (unavailable) yields a zero vector. */
+static void mvp_copy_neighbor_mv(short pmv[2], const PicMotion *nb)
+{
+  pmv[0] = nb ? nb->mv[0] : 0;
+  pmv[1] = nb ? nb->mv[1] : 0;
+}
+
+/* Median of three values, written as sum minus minimum minus maximum. */
+static int mvp_median3(int mv_a, int mv_b, int mv_c)
+{
+  return mv_a + mv_b + mv_c - imin(mv_a, imin(mv_b, mv_c)) - imax(mv_a, imax(mv_b ,mv_c));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Get motion vector predictor (variant installed by
+ *    init_motion_vector_prediction() when mb_aff_frame_flag is not set).
+ *    block[0], block[1], block[2] are the left, upper and upper-right
+ *    neighbours; an unavailable neighbour counts as reference index -1
+ *    and a zero vector.
+ ************************************************************************
+ */
+// TODO: benski> make SSE3/MMX version
+static void GetMotionVectorPredictorNormal (Macroblock *currMB, 
+                                            PixelPos *block,      // <--> block neighbors
+                                            short  pmv[2],
+                                            short  ref_frame,
+                                            PicMotion **motion, 
+                                            int    mb_x,
+                                            int    mb_y,
+                                            int    blockshape_x,
+                                            int    blockshape_y)
+{
+  const PicMotion *nbL  = block[0].available ? &motion[block[0].pos_y][block[0].pos_x] : 0;
+  const PicMotion *nbU  = block[1].available ? &motion[block[1].pos_y][block[1].pos_x] : 0;
+  const PicMotion *nbUR = block[2].available ? &motion[block[2].pos_y][block[2].pos_x] : 0;
+
+  const int rFrameL  = block[0].available ? nbL->ref_idx  : -1;
+  const int rFrameU  = block[1].available ? nbU->ref_idx  : -1;
+  const int rFrameUR = block[2].available ? nbUR->ref_idx : -1;
+
+  const int is_8x16 = (blockshape_x == 8 && blockshape_y == 16);
+  const int is_16x8 = (blockshape_x == 16 && blockshape_y == 8);
+
+  /* Prediction if only one of the neighbors uses the reference frame
+  *  we are checking, or if the block shape asks for a direction
+  */
+  if (rFrameL == ref_frame &&
+    ((rFrameU != ref_frame && rFrameUR != ref_frame) || (is_8x16 && mb_x == 0) || (is_16x8 && mb_y != 0)))
+  { // left
+    mvp_copy_neighbor_mv(pmv, nbL);
+    return;
+  }
+
+  if (rFrameU == ref_frame &&
+    ((rFrameL != ref_frame && rFrameUR != ref_frame) || (is_16x8 && mb_y == 0)))
+  { // up
+    mvp_copy_neighbor_mv(pmv, nbU);
+    return;
+  }
+
+  if (rFrameUR == ref_frame &&
+    ((rFrameL != ref_frame && rFrameU != ref_frame) || (is_8x16 && mb_x != 0)))
+  { // upper right
+    mvp_copy_neighbor_mv(pmv, nbUR);
+    return;
+  }
+
+  // median
+  if (!(block[1].available || block[2].available))
+  {
+    /* with neither upper neighbour available the left vector is used */
+    mvp_copy_neighbor_mv(pmv, nbL);
+  }
+  else
+  {
+    int c;
+    for (c = 0; c < 2; c++)
+    {
+      const int mv_a = nbL  ? nbL->mv[c]  : 0;
+      const int mv_b = nbU  ? nbU->mv[c]  : 0;
+      const int mv_c = nbUR ? nbUR->mv[c] : 0;
+      pmv[c] = mvp_median3(mv_a, mv_b, mv_c);
+    }
+  }
+}
+
+/* Install the predictor callback on currMB: the MBAFF variant when
+ * mb_aff_frame_flag is non-zero, the normal variant otherwise. */
+void init_motion_vector_prediction(Macroblock *currMB, int mb_aff_frame_flag)
+{
+  currMB->GetMVPredictor = mb_aff_frame_flag
+    ? GetMotionVectorPredictorMBAFF
+    : GetMotionVectorPredictorNormal;
+}
diff --git a/Src/h264dec/lcommon/src/parsetcommon.c b/Src/h264dec/lcommon/src/parsetcommon.c
new file mode 100644
index 00000000..fe3f0e9a
--- /dev/null
+++ b/Src/h264dec/lcommon/src/parsetcommon.c
@@ -0,0 +1,244 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    parsetcommon.c
+ * \brief
+ *    Picture and Sequence Parameter set generation and handling
+ *  \date 25 November 2002
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Stephan Wenger        <stewe@cs.tu-berlin.de>
+ *
+ **************************************************************************************
+ */
+
+#include "global.h"
+#include "parsetcommon.h"
+#include "memalloc.h"
+/*!
+ *************************************************************************************
+ * \brief
+ *    Allocates a zero-initialized picture parameter set
+ *
+ * \return
+ *    pointer to the new pps; no_mem_exit() is called when the allocation fails
+ *************************************************************************************
+ */
+
+pic_parameter_set_rbsp_t *AllocPPS ()
+ {
+   pic_parameter_set_rbsp_t *pps = calloc (1, sizeof (pic_parameter_set_rbsp_t));
+
+   if (NULL == pps)
+     no_mem_exit ("AllocPPS: PPS");
+   pps->slice_group_id = NULL;
+   return pps;
+ }
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Allocates a zero-initialized sequence parameter set
+ *
+ * \return
+ *    pointer to the new sps; no_mem_exit() is called when the allocation fails
+ *************************************************************************************
+ */
+
+seq_parameter_set_rbsp_t *AllocSPS ()
+ {
+   seq_parameter_set_rbsp_t *sps = calloc (1, sizeof (seq_parameter_set_rbsp_t));
+
+   if (NULL == sps)
+     no_mem_exit ("AllocSPS: SPS");
+   return sps;
+ }
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Frees a picture parameter set together with its slice_group_id array
+ *
+ * \param pps
+ *   Picture parameter set to be freed (asserted to be non-NULL)
+ *************************************************************************************
+ */
+
+ void FreePPS (pic_parameter_set_rbsp_t *pps)
+ {
+   assert (pps != NULL);
+   free (pps->slice_group_id);   /* free(NULL) is a no-op, so no guard is needed */
+   free (pps);
+ }
+
+
+ /*!
+ *************************************************************************************
+ * \brief
+ *    Releases the memory of a sequence parameter set
+ *
+ * \param sps
+ *   Sequence parameter set to be freed (asserted to be non-NULL)
+ *************************************************************************************
+ */
+
+ void FreeSPS (seq_parameter_set_rbsp_t *sps)
+ {
+   assert (NULL != sps);
+   free (sps);
+ }
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Compares two sequence parameter sets field by field
+ *
+ * \return
+ *    1 if both sets are marked Valid and every compared field matches, 0 otherwise.
+ *    Conditional fields (POC type 0/1 data, MBAFF flag, cropping rectangle) are only
+ *    compared when the controlling field of sps1 enables them. For VUI only the
+ *    presence flag is compared.
+ *************************************************************************************
+ */
+int sps_is_equal(seq_parameter_set_rbsp_t *sps1, seq_parameter_set_rbsp_t *sps2)
+{
+  unsigned idx;
+
+  if (!sps1->Valid || !sps2->Valid)
+    return 0;
+
+  if (sps1->profile_idc != sps2->profile_idc
+   || sps1->constrained_set0_flag != sps2->constrained_set0_flag
+   || sps1->constrained_set1_flag != sps2->constrained_set1_flag
+   || sps1->constrained_set2_flag != sps2->constrained_set2_flag
+   || sps1->level_idc != sps2->level_idc
+   || sps1->seq_parameter_set_id != sps2->seq_parameter_set_id
+   || sps1->log2_max_frame_num_minus4 != sps2->log2_max_frame_num_minus4
+   || sps1->pic_order_cnt_type != sps2->pic_order_cnt_type)
+    return 0;
+
+  // picture order count data depends on the (already matching) POC type
+  if( sps1->pic_order_cnt_type == 0 )
+  {
+    if (sps1->log2_max_pic_order_cnt_lsb_minus4 != sps2->log2_max_pic_order_cnt_lsb_minus4)
+      return 0;
+  }
+  else if( sps1->pic_order_cnt_type == 1 )
+  {
+    if (sps1->delta_pic_order_always_zero_flag != sps2->delta_pic_order_always_zero_flag
+     || sps1->offset_for_non_ref_pic != sps2->offset_for_non_ref_pic
+     || sps1->offset_for_top_to_bottom_field != sps2->offset_for_top_to_bottom_field
+     || sps1->num_ref_frames_in_pic_order_cnt_cycle != sps2->num_ref_frames_in_pic_order_cnt_cycle)
+      return 0;
+
+    for ( idx = 0 ; idx < sps1->num_ref_frames_in_pic_order_cnt_cycle ; idx++)
+    {
+      if (sps1->offset_for_ref_frame[idx] != sps2->offset_for_ref_frame[idx])
+        return 0;
+    }
+  }
+
+  if (sps1->num_ref_frames != sps2->num_ref_frames
+   || sps1->gaps_in_frame_num_value_allowed_flag != sps2->gaps_in_frame_num_value_allowed_flag
+   || sps1->pic_width_in_mbs_minus1 != sps2->pic_width_in_mbs_minus1
+   || sps1->pic_height_in_map_units_minus1 != sps2->pic_height_in_map_units_minus1
+   || sps1->frame_mbs_only_flag != sps2->frame_mbs_only_flag)
+    return 0;
+
+  if( !sps1->frame_mbs_only_flag
+   && sps1->mb_adaptive_frame_field_flag != sps2->mb_adaptive_frame_field_flag)
+    return 0;
+
+  if (sps1->direct_8x8_inference_flag != sps2->direct_8x8_inference_flag
+   || sps1->frame_cropping_flag != sps2->frame_cropping_flag)
+    return 0;
+
+  if (sps1->frame_cropping_flag)
+  {
+    if (sps1->frame_cropping_rect_left_offset != sps2->frame_cropping_rect_left_offset
+     || sps1->frame_cropping_rect_right_offset != sps2->frame_cropping_rect_right_offset
+     || sps1->frame_cropping_rect_top_offset != sps2->frame_cropping_rect_top_offset
+     || sps1->frame_cropping_rect_bottom_offset != sps2->frame_cropping_rect_bottom_offset)
+      return 0;
+  }
+
+  if (sps1->vui_parameters_present_flag != sps2->vui_parameters_present_flag)
+    return 0;
+
+  return 1;
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Compares two picture parameter sets field by field
+ *
+ * \return
+ *    1 if both sets are marked Valid and every compared field matches, 0 otherwise.
+ *    Slice group data and scaling lists are only compared when the controlling
+ *    fields of pps1 enable them.
+ *************************************************************************************
+ */
+int pps_is_equal(pic_parameter_set_rbsp_t *pps1, pic_parameter_set_rbsp_t *pps2)
+{
+  unsigned n, list, coef;
+  unsigned list_count;
+
+  if (!pps1->Valid || !pps2->Valid)
+    return 0;
+
+  if (pps1->pic_parameter_set_id != pps2->pic_parameter_set_id
+   || pps1->seq_parameter_set_id != pps2->seq_parameter_set_id
+   || pps1->entropy_coding_mode_flag != pps2->entropy_coding_mode_flag
+   || pps1->bottom_field_pic_order_in_frame_present_flag != pps2->bottom_field_pic_order_in_frame_present_flag
+   || pps1->num_slice_groups_minus1 != pps2->num_slice_groups_minus1)
+    return 0;
+
+  // slice group (FMO) description, present only with more than one slice group
+  if (pps1->num_slice_groups_minus1>0)
+  {
+    if (pps1->slice_group_map_type != pps2->slice_group_map_type)
+      return 0;
+
+    switch (pps1->slice_group_map_type)
+    {
+    case 0:
+      for (n=0; n<=pps1->num_slice_groups_minus1; n++)
+      {
+        if (pps1->run_length_minus1[n] != pps2->run_length_minus1[n])
+          return 0;
+      }
+      break;
+
+    case 2:
+      for (n=0; n<pps1->num_slice_groups_minus1; n++)
+      {
+        if (pps1->top_left[n] != pps2->top_left[n]
+         || pps1->bottom_right[n] != pps2->bottom_right[n])
+          return 0;
+      }
+      break;
+
+    case 3:
+    case 4:
+    case 5:
+      if (pps1->slice_group_change_direction_flag != pps2->slice_group_change_direction_flag
+       || pps1->slice_group_change_rate_minus1 != pps2->slice_group_change_rate_minus1)
+        return 0;
+      break;
+
+    case 6:
+      if (pps1->pic_size_in_map_units_minus1 != pps2->pic_size_in_map_units_minus1)
+        return 0;
+      for (n=0; n<=pps1->pic_size_in_map_units_minus1; n++)
+      {
+        if (pps1->slice_group_id[n] != pps2->slice_group_id[n])
+          return 0;
+      }
+      break;
+
+    default:
+      // other map types carry no extra data that is compared here
+      break;
+    }
+  }
+
+  if (pps1->num_ref_idx_l0_active_minus1 != pps2->num_ref_idx_l0_active_minus1
+   || pps1->num_ref_idx_l1_active_minus1 != pps2->num_ref_idx_l1_active_minus1
+   || pps1->weighted_pred_flag != pps2->weighted_pred_flag
+   || pps1->weighted_bipred_idc != pps2->weighted_bipred_idc
+   || pps1->pic_init_qp_minus26 != pps2->pic_init_qp_minus26
+   || pps1->pic_init_qs_minus26 != pps2->pic_init_qs_minus26
+   || pps1->chroma_qp_index_offset != pps2->chroma_qp_index_offset
+   || pps1->deblocking_filter_control_present_flag != pps2->deblocking_filter_control_present_flag
+   || pps1->constrained_intra_pred_flag != pps2->constrained_intra_pred_flag
+   || pps1->redundant_pic_cnt_present_flag != pps2->redundant_pic_cnt_present_flag)
+    return 0;
+
+  //Fidelity Range Extensions Stuff
+  //It is initialized to zero, so should be ok to check all the time.
+  if (pps1->transform_8x8_mode_flag != pps2->transform_8x8_mode_flag
+   || pps1->pic_scaling_matrix_present_flag != pps2->pic_scaling_matrix_present_flag)
+    return 0;
+
+  if(pps1->pic_scaling_matrix_present_flag)
+  {
+    // 6 lists of 4x4 coefficients, plus 2 lists of 8x8 when the 8x8 transform is enabled
+    list_count = 6 + ((unsigned)pps1->transform_8x8_mode_flag << 1);
+
+    for(list = 0; list < list_count; list++)
+    {
+      if (pps1->pic_scaling_list_present_flag[list] != pps2->pic_scaling_list_present_flag[list])
+        return 0;
+      if (!pps1->pic_scaling_list_present_flag[list])
+        continue;
+
+      if(list < 6)
+      {
+        for (coef = 0; coef < 16; coef++)
+        {
+          if (pps1->ScalingList4x4[list][coef] != pps2->ScalingList4x4[list][coef])
+            return 0;
+        }
+      }
+      else
+      {
+        for (coef = 0; coef < 64; coef++)
+        {
+          if (pps1->ScalingList8x8[list-6][coef] != pps2->ScalingList8x8[list-6][coef])
+            return 0;
+        }
+      }
+    }
+  }
+
+  if (pps1->second_chroma_qp_index_offset != pps2->second_chroma_qp_index_offset)
+    return 0;
+
+  return 1;
+}
diff --git a/Src/h264dec/lcommon/src/transform.c b/Src/h264dec/lcommon/src/transform.c
new file mode 100644
index 00000000..617ca7c1
--- /dev/null
+++ b/Src/h264dec/lcommon/src/transform.c
@@ -0,0 +1,809 @@
+/*!
+***************************************************************************
+* \file transform.c
+*
+* \brief
+*    Transform functions
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Alexis Michael Tourapis
+* \date
+*    01. July 2007
+**************************************************************************
+*/
+#include "global.h"
+#include "transform.h"
+#include <emmintrin.h>
+
+/*!
+ * \brief
+ *    Forward 4x4 integer transform of the sub-block whose top-left sample is
+ *    block[pos_y][pos_x]; the result is written to the same position in tblock.
+ *
+ * \param block   source rows (read only)
+ * \param tblock  destination rows; all source samples are read into a temporary
+ *                before anything is written
+ * \param pos_y   first row of the 4x4 sub-block
+ * \param pos_x   first column of the 4x4 sub-block
+ *
+ * The doubling steps use "* 2" rather than "<< 1": the operands are differences
+ * of samples and can be negative, and left-shifting a negative int is undefined
+ * behaviour in C.
+ */
+void forward4x4(int **block, int **tblock, int pos_y, int pos_x)
+{
+	int i, ii;  
+	int tmp[16];
+	int *pTmp = tmp, *pblock;
+	int p0,p1,p2,p3;
+	int t0,t1,t2,t3;
+
+	// Horizontal: transform each of the 4 rows into tmp (row-major)
+	for (i=pos_y; i < pos_y + BLOCK_SIZE; i++)
+	{
+		pblock = &block[i][pos_x];
+		p0 = *(pblock++);
+		p1 = *(pblock++);
+		p2 = *(pblock++);
+		p3 = *(pblock  );
+
+		t0 = p0 + p3;
+		t1 = p1 + p2;
+		t2 = p1 - p2;
+		t3 = p0 - p3;
+
+		*(pTmp++) =  t0 + t1;
+		*(pTmp++) = (t3 * 2) + t2;
+		*(pTmp++) =  t0 - t1;    
+		*(pTmp++) =  t3 - (t2 * 2);
+	}
+
+	// Vertical: transform each column of tmp and store it into tblock
+	for (i=0; i < BLOCK_SIZE; i++)
+	{
+		pTmp = tmp + i;
+		p0 = *pTmp;
+		p1 = *(pTmp += BLOCK_SIZE);
+		p2 = *(pTmp += BLOCK_SIZE);
+		p3 = *(pTmp += BLOCK_SIZE);
+
+		t0 = p0 + p3;
+		t1 = p1 + p2;
+		t2 = p1 - p2;
+		t3 = p0 - p3;
+
+		ii = pos_x + i;
+		tblock[pos_y    ][ii] = t0 +  t1;
+		tblock[pos_y + 1][ii] = t2 + (t3 * 2);
+		tblock[pos_y + 2][ii] = t0 -  t1;
+		tblock[pos_y + 3][ii] = t3 - (t2 * 2);
+	}
+}
+
+/*
+ * Inverse 4x4 integer transform: reads the 4x4 coefficients in tblock and writes
+ * the (unscaled) result to block. All of tblock is consumed into a temporary before
+ * block is written. The intermediate is held in 16-bit storage, as in the original.
+ * pos_y and pos_x are accepted for interface compatibility but not used.
+ */
+static void inverse4x4(const h264_short_block_t tblock, h264_short_block_t block, int pos_y, int pos_x)
+{
+		short tmp[16];
+		short *out = tmp;
+		int n;
+
+		// Horizontal: one butterfly per coefficient row
+		for (n = 0; n < BLOCK_SIZE; n++)
+		{
+			const int c0 = tblock[n][0];
+			const int c1 = tblock[n][1];
+			const int c2 = tblock[n][2];
+			const int c3 = tblock[n][3];
+
+			const int even_sum  = c0 + c2;
+			const int even_diff = c0 - c2;
+			const int odd_diff  = (c1 >> 1) - c3;
+			const int odd_sum   = c1 + (c3 >> 1);
+
+			*out++ = even_sum  + odd_sum;
+			*out++ = even_diff + odd_diff;
+			*out++ = even_diff - odd_diff;
+			*out++ = even_sum  - odd_sum;
+		}
+
+		// Vertical: one butterfly per column of the intermediate
+		for (n = 0; n < BLOCK_SIZE; n++)
+		{
+			const int c0 = tmp[n];
+			const int c1 = tmp[n + BLOCK_SIZE];
+			const int c2 = tmp[n + 2 * BLOCK_SIZE];
+			const int c3 = tmp[n + 3 * BLOCK_SIZE];
+
+			const int even_sum  = c0 + c2;
+			const int even_diff = c0 - c2;
+			const int odd_diff  = (c1 >> 1) - c3;
+			const int odd_sum   = c1 + (c3 >> 1);
+
+			block[0][n] = even_sum  + odd_sum;
+			block[1][n] = even_diff + odd_diff;
+			block[2][n] = even_diff - odd_diff;
+			block[3][n] = even_sum  - odd_sum;
+		}
+}
+
+#ifdef _M_IX86
+// benski> this exists just for conformance testing. not used in production code
+/*
+ * 4x4 inverse transform written as MSVC inline assembly.
+ *
+ * The 4x4 sub-block is addressed inside buffers with a 16-short row stride:
+ * the byte offset (pos_y * 16 + pos_x) * 2 is added to both tblock (input)
+ * and block (output), and the four rows are read/written 32 bytes apart.
+ * The data is transposed, run through the butterfly, transposed again and run
+ * through the butterfly a second time, then stored.
+ *
+ * Despite the "sse2" in the name, only the 64-bit MMX registers mm0-mm6 are used.
+ * NOTE(review): no emms is executed in this block - presumably the caller clears
+ * the MMX state before any x87 floating point code runs; confirm.
+ */
+static void inverse4x4_sse2_x86(const h264_short_macroblock_t tblock, h264_short_macroblock_t block, int pos_y, int pos_x)
+{
+		__asm
+		{
+			mov edx, pos_y
+			shl edx, 4 // 16 step stride
+			add edx, pos_x
+			shl edx, 1 // * sizeof(short)
+
+			// eax: pointer to the start of tblock (offset by passed pos_y, pos_x)
+			mov eax, edx
+			add eax, tblock
+
+			// esi: results
+			mov esi, edx
+			add esi, block
+
+			// load 4x4 matrix
+			movq mm0, MMWORD PTR 0[eax]
+			movq mm1, MMWORD PTR 32[eax]
+			movq mm2, MMWORD PTR 64[eax]
+			movq mm3, MMWORD PTR 96[eax]
+
+			// rotate 4x4 matrix
+			movq mm4, mm0 // p0 = mm4 (copy)
+			punpcklwd mm0, mm2 // r0 = mm0
+			punpckhwd mm4, mm2 // r2 = mm4
+			movq mm5, mm1 // p1 = mm5 (copy)
+			punpcklwd mm1, mm3 // r1 = mm1
+			punpckhwd mm5, mm3 // r3 = mm5
+			movq mm6, mm0 // r0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // r2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+
+			/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+
+			// rotate 4x4 matrix to set up for vertical
+			movq mm4, mm0 // r0 = mm4 (copy)
+			punpcklwd mm0, mm2 // p0 = mm0
+			punpckhwd mm4, mm2 // p2 = mm4
+			movq mm5, mm1 // r1 = mm5 (copy)
+			punpcklwd mm1, mm3 // p1 = mm1
+			punpckhwd mm5, mm3 // p3 = mm5
+			movq mm6, mm0 // p0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // p2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+					/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+			// store the four result rows, 32 bytes apart
+			// NOTE(review): the operand size is spelled XMMWORD PTR although the source is a
+			// 64-bit MMX register (the loads above use MMWORD PTR) - looks like MMWORD PTR
+			// was intended; confirm the assembler emits a 64-bit store here.
+			movq XMMWORD PTR 0[esi], mm0
+			movq XMMWORD PTR 32[esi], mm1
+			movq XMMWORD PTR 64[esi], mm2
+			movq XMMWORD PTR 96[esi], mm3
+		}
+}
+#endif
+
+/*
+ * Adds a rounded, down-shifted 4x4 residual to the prediction and stores the
+ * clipped result in curImg.
+ *
+ * curImg / mpr          destination and prediction, addressed at row joff, column mb_x
+ * tblock                4x4 residual (16-bit values)
+ * max_imgpel_value      upper clipping bound used by the C path
+ *
+ * x86 path (MSVC inline assembly, MMX): computes (residual + 32) >> 6, adds the
+ * zero-extended prediction bytes with a saturating add and packs back to bytes with
+ * unsigned saturation. It addresses curImg and mpr with a 16-byte row stride and
+ * never reads max_imgpel_value.
+ * NOTE(review): the two paths presumably agree only when max_imgpel_value == 255
+ * and DQ_BITS == 6 - confirm.
+ * NOTE(review): no emms is executed in the assembly path - presumably handled by
+ * the caller; confirm.
+ */
+static void sample_reconstruct(h264_imgpel_macroblock_t curImg, const h264_imgpel_macroblock_t mpr, const h264_short_block_t tblock, int joff, int mb_x, int max_imgpel_value)
+{
+	#ifdef _M_IX86
+		__asm
+		{
+			// mm0 : constant value 32
+			mov edx, 0x00200020
+			movd mm0, edx
+			punpckldq	mm0, mm0
+
+			// ecx: y offset
+			mov ecx, joff
+			shl ecx, 4 // imgpel stuff is going to be 16 byte stride
+			add ecx, mb_x
+
+			// eax: curImg
+			mov eax, curImg
+			add eax, ecx
+
+			// edx: mpr
+			mov edx, mpr
+			add edx, ecx
+
+			// ecx: tblock (which is short, not byte)
+			mov ecx, tblock
+			
+			// mm7: zero
+			pxor mm7, mm7
+
+			// load coefficients
+			movq	mm1, MMWORD PTR 0[ecx]
+			movq	mm2, MMWORD PTR 8[ecx]
+			movq	mm3, MMWORD PTR 16[ecx]
+			movq	mm4, MMWORD PTR 24[ecx]
+			paddw mm1, mm0 // rres + 32
+			paddw mm2, mm0 // rres + 32
+			paddw mm3, mm0 // rres + 32
+			paddw mm0, mm4 // rres + 32
+			// (the last add overwrites the constant in mm0; it is not needed afterwards)
+			psraw mm1, 6 // (rres + 32) >> 6
+			psraw mm2, 6 // (rres + 32) >> 6
+			psraw mm3, 6 // (rres + 32) >> 6
+			psraw mm0, 6 // (rres + 32) >> 6
+			// mm1-mm3: tblock[0] - tblock[2], mm0: tblock[3]
+
+			// convert mpr from unsigned char to short
+			movd mm4, DWORD PTR 0[edx]
+			movd mm5, DWORD PTR 16[edx]
+			movd mm6, DWORD PTR 32[edx]
+			punpcklbw mm4, mm7
+			punpcklbw mm5, mm7
+			punpcklbw mm6, mm7
+			paddsw mm4, mm1 // pred_row + rres_row
+			movd mm1, DWORD PTR 48[edx] // reuse mm1 for mpr[3]
+			paddsw mm5, mm2 // pred_row + rres_row
+			punpcklbw mm1, mm7
+			paddsw mm6, mm3 // pred_row + rres_row			
+			paddsw mm1, mm0 // pred_row + rres_row
+			// results in mm4, mm5, mm6, mm1
+			
+			// move back to 8 bit
+			// (packuswb saturates to 0..255, which is the only clipping done on this path)
+			packuswb mm4, mm7
+			packuswb mm5, mm7
+			packuswb mm6, mm7
+			packuswb mm1, mm7
+			movd DWORD PTR 0[eax], mm4
+			movd DWORD PTR 16[eax], mm5
+			movd DWORD PTR 32[eax], mm6
+			movd DWORD PTR 48[eax], mm1
+		}
+#else
+  int i, j;
+
+  // portable path: per sample, shift the residual down by DQ_BITS with rounding,
+  // add the prediction and clip with iClip1 against max_imgpel_value
+  for (j = 0; j < BLOCK_SIZE; j++)
+  {
+    for (i=0;i<BLOCK_SIZE;i++)
+      curImg[j+joff][mb_x+i] = (imgpel) iClip1( max_imgpel_value, rshift_rnd_sf(tblock[j][i], DQ_BITS) + mpr[j+joff][mb_x+i]);
+  }
+#endif
+}
+
+#if defined(_M_IX86) && defined(_DEBUG)
+/*
+ * Combined 4x4 inverse transform and reconstruction, written as MSVC inline
+ * assembly (this variant is only built for x86 debug builds, see the #if above).
+ *
+ * tblock   16-bit coefficients, 16 shorts per row; the sub-block at
+ *          (pos_y, pos_x) is read
+ * mb_pred  prediction samples, 16 bytes per row
+ * mb_rec   reconstructed output, 16 bytes per row
+ *
+ * The coefficients go through the same transpose/butterfly/transpose/butterfly
+ * sequence as the stand-alone inverse transform, then (x + 32) >> 6 is applied,
+ * the prediction is added with saturation and the result is packed to bytes
+ * with unsigned saturation.
+ *
+ * Note the parameter order is (pos_x, pos_y). Despite the "sse2" in the name,
+ * only MMX registers are used.
+ * NOTE(review): no emms is executed in this block - presumably handled by the
+ * caller; confirm.
+ */
+void itrans4x4_sse2(const h264_short_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+	__declspec(align(32)) static const short const32[4] = {32, 32, 32, 32};
+		__asm
+		{
+			mov edx, pos_y
+			shl edx, 4 // imgpel stuff is going to be 16 byte stride
+			add edx, pos_x
+
+			// eax: tblock
+			lea eax, [edx*2]
+			add eax, tblock
+
+			// ecx: mpr
+			mov ecx, mb_pred
+			add ecx, edx
+
+			// edx: results
+			add edx, mb_rec
+
+			// load 4x4 matrix
+			movq mm0, MMWORD PTR 0[eax]
+			movq mm1, MMWORD PTR 32[eax]
+			movq mm2, MMWORD PTR 64[eax]
+			movq mm3, MMWORD PTR 96[eax]
+
+			// rotate 4x4 matrix
+			movq mm4, mm0 // p0 = mm4 (copy)
+			punpcklwd mm0, mm2 // r0 = mm0
+			punpckhwd mm4, mm2 // r2 = mm4
+			movq mm5, mm1 // p1 = mm5 (copy)
+			punpcklwd mm1, mm3 // r1 = mm1
+			punpckhwd mm5, mm3 // r3 = mm5
+			movq mm6, mm0 // r0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // r2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+
+			/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+
+			// rotate 4x4 matrix to set up for vertical
+			movq mm4, mm0 // r0 = mm4 (copy)
+			punpcklwd mm0, mm2 // p0 = mm0
+			punpckhwd mm4, mm2 // p2 = mm4
+			movq mm5, mm1 // r1 = mm5 (copy)
+			punpcklwd mm1, mm3 // p1 = mm1
+			punpckhwd mm5, mm3 // p3 = mm5
+			movq mm6, mm0 // p0 = mm6 (copy)
+			punpcklwd mm0, mm1 // t0 = mm0
+			punpckhwd mm6, mm1 // t1 = mm6
+			movq mm1, mm4 // p2 = mm1 (copy)
+			punpcklwd mm1, mm5 // t2 = mm1
+			punpckhwd mm4, mm5 // t3 = mm4
+
+			/* register state:
+			mm0: t0
+			mm1: t2
+			mm2: 
+			mm3: 
+			mm4: t3
+			mm5: 
+			mm6: t1
+			mm7: 
+			*/
+					/*
+			p0 =  t0 + t2;
+			p1 =  t0 - t2;
+			p2 = (t1 >> 1) - t3;
+			p3 =  t1 + (t3 >> 1);
+			*/
+			movq mm2, mm0 // mm2 = t0 (copy)
+			paddw mm0, mm1 // mm0 = p0
+			psubw mm2, mm1 // mm2 = p1, mm1 available
+			movq mm5, mm6 // mm5 = t1 (copy)
+			psraw mm5, 1 // mm5 = (t1 >> 1)
+			psubw mm5, mm4 // mm5 = p2
+			psraw mm4, 1 // mm4 = (t3 >> 1)
+			paddw mm6, mm4 // mm6 = p3
+
+			/* register state:
+			mm0: p0
+			mm1: 
+			mm2: p1
+			mm3: 
+			mm4: 
+			mm5: p2
+			mm6: p3
+			mm7: 
+			*/
+
+			/*
+			*(pTmp++) = p0 + p3;
+			*(pTmp++) = p1 + p2;
+			*(pTmp++) = p1 - p2;
+			*(pTmp++) = p0 - p3;
+			*/
+
+			movq mm3, mm0 // mm3 = p0 (copy)
+			paddw mm0, mm6 // mm0 = r0
+			movq mm1, mm2 // mm1 = p1 (copy)
+			paddw mm1, mm5 // mm1 = r1
+			psubw mm2, mm5 // mm2 = r2, mm5 available
+			psubw mm3, mm6 // mm3 = r3
+
+			/* register state:
+			mm0: r0
+			mm1: r1
+			mm2: r2
+			mm3: r3
+			mm4: 
+			mm5: 
+			mm6: 
+			mm7: 
+			*/
+/* --- 4x4 iDCT done, now time to combine with mpr --- */
+			// mm7 : constant value 32 (mm0-mm3 hold the transform result rows)
+			movq	mm7, const32
+
+			paddw mm0, mm7 // rres + 32
+			psraw mm0, 6 // (rres + 32) >> 6
+			paddw mm1, mm7 // rres + 32
+			psraw mm1, 6 // (rres + 32) >> 6
+			paddw mm2, mm7 // rres + 32
+			psraw mm2, 6 // (rres + 32) >> 6
+			paddw mm3, mm7 // rres + 32
+			psraw mm3, 6 // (rres + 32) >> 6
+
+			// mm7: zero, used to widen bytes to shorts and as the upper half when packing
+			pxor mm7, mm7
+
+			// convert mpr from unsigned char to short
+			movd mm4, DWORD PTR 0[ecx]
+			movd mm5, DWORD PTR 16[ecx]
+			movd mm6, DWORD PTR 32[ecx]
+			punpcklbw mm4, mm7
+			punpcklbw mm5, mm7
+			punpcklbw mm6, mm7
+			paddsw mm4, mm0 // pred_row + rres_row
+			movd mm0, DWORD PTR 48[ecx] // reuse mm0 for mpr[3]
+			paddsw mm5, mm1 // pred_row + rres_row
+			punpcklbw mm0, mm7
+			paddsw mm6, mm2 // pred_row + rres_row			
+			paddsw mm0, mm3 // pred_row + rres_row
+			// results in mm4, mm5, mm6, mm0
+			
+			// move back to 8 bit
+			packuswb mm4, mm7
+			packuswb mm5, mm7
+			packuswb mm6, mm7
+			packuswb mm0, mm7
+			movd DWORD PTR 0[edx], mm4
+			movd DWORD PTR 16[edx], mm5
+			movd DWORD PTR 32[edx], mm6
+			movd DWORD PTR 48[edx], mm0
+		}
+}
+#elif defined(_M_X64)
+/*
+ * Combined 4x4 inverse transform and reconstruction using SSE2 intrinsics
+ * (x64 build).
+ *
+ * tblock   32-bit coefficients; the 4x4 sub-block at (pos_y, pos_x) is read
+ * mb_pred  prediction samples; 4 bytes are read from each of 4 rows
+ * mb_rec   reconstructed output; 4 bytes are written to each of 4 rows
+ *
+ * The transform is done in 32-bit lanes, (x + 32) >> 6 is applied, the residual
+ * is narrowed to 16 bit with signed saturation, the prediction is added and the
+ * sum is packed to bytes with unsigned saturation (clip to 0..255).
+ *
+ * The prediction is added with a saturating add (_mm_adds_epi16). A wrapping add
+ * would turn a residual that was saturated to 32767 into a negative value once
+ * the prediction is added, and the pixel would be reconstructed as 0 instead
+ * of 255.
+ *
+ * Note the parameter order is (pos_x, pos_y).
+ */
+static void itrans4x4_sse2(const h264_int_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+	__declspec(align(32)) static const int const32[4] = {32, 32, 32, 32};
+	__m128i p0,p1,p2,p3;
+	__m128i t0,t1,t2,t3;
+	__m128i r0,r1,r2,r3;
+	__m128i c32, zero;
+
+	// horizontal 
+	// load registers in vertical mode, we'll rotate them next
+	p0 = _mm_loadu_si128((__m128i *)&tblock[pos_y][pos_x]); // 00 01 02 03
+	p1 = _mm_loadu_si128((__m128i *)&tblock[pos_y+1][pos_x]); // 10 11 12 13
+	p2 = _mm_loadu_si128((__m128i *)&tblock[pos_y+2][pos_x]); // 20 21 22 23
+	p3 = _mm_loadu_si128((__m128i *)&tblock[pos_y+3][pos_x]); // 30 31 32 33
+
+	// rotate 4x4 matrix
+	r0 = _mm_unpacklo_epi32(p0, p2); // 00 20 01 21
+	r1 = _mm_unpacklo_epi32(p1, p3); // 10 30 11 31
+	r2 = _mm_unpackhi_epi32(p0, p2); // 02 22 03 23
+	r3 = _mm_unpackhi_epi32(p1, p3); // 12 32 13 33
+	t0 = _mm_unpacklo_epi32(r0, r1); // 00 10 20 30
+	t1 = _mm_unpackhi_epi32(r0, r1); // 01 11 21 31
+	t2 = _mm_unpacklo_epi32(r2, r3); // 02 12 22 32
+	t3 = _mm_unpackhi_epi32(r2, r3); // 03 13 23 33
+
+	p0 = _mm_add_epi32(t0, t2); //t0 + t2;
+	p1 = _mm_sub_epi32(t0, t2); // t0 - t2;
+	p2 = _mm_srai_epi32(t1, 1); // t1 >> 1
+	p2 = _mm_sub_epi32(p2, t3); // (t1 >> 1) - t3;
+	p3 = _mm_srai_epi32(t3, 1); // (t3 >> 1)
+	p3 = _mm_add_epi32(p3, t1); // t1 + (t3 >> 1);
+
+	t0 = _mm_add_epi32(p0, p3); //p0 + p3;
+	t1 = _mm_add_epi32(p1, p2);//p1 + p2;
+	t2 = _mm_sub_epi32(p1, p2); //p1 - p2;
+	t3 = _mm_sub_epi32(p0, p3); //p0 - p3;
+
+	// rotate 4x4 matrix to set up for vertical
+	r0 = _mm_unpacklo_epi32(t0, t2); 
+	r1 = _mm_unpacklo_epi32(t1, t3); 
+	r2 = _mm_unpackhi_epi32(t0, t2); 
+	r3 = _mm_unpackhi_epi32(t1, t3); 
+	t0 = _mm_unpacklo_epi32(r0, r1); 
+	t1 = _mm_unpackhi_epi32(r0, r1); 
+	t2 = _mm_unpacklo_epi32(r2, r3); 
+	t3 = _mm_unpackhi_epi32(r2, r3); 
+
+	// vertical
+	p0 = _mm_add_epi32(t0, t2); //t0 + t2;
+	p3 = _mm_srai_epi32(t3, 1); // (t3 >> 1)
+	p3 = _mm_add_epi32(p3, t1); // t1 + (t3 >> 1);
+	r0 = _mm_add_epi32(p0, p3); //p0 + p3;
+	r3 = _mm_sub_epi32(p0, p3); //p0 - p3;
+	p1 = _mm_sub_epi32(t0, t2); // t0 - t2;
+	p2 = _mm_srai_epi32(t1, 1); // t1 >> 1
+	p2 = _mm_sub_epi32(p2, t3); // (t1 >> 1) - t3;
+	r1 = _mm_add_epi32(p1, p2);//p1 + p2;
+	r2 = _mm_sub_epi32(p1, p2); //p1 - p2;
+
+	c32 = _mm_load_si128((const __m128i *)const32);
+	zero = _mm_setzero_si128();
+
+	// (x + 32) >> 6
+	r0 = _mm_add_epi32(r0, c32);
+	r0 = _mm_srai_epi32(r0, 6);
+	r1 = _mm_add_epi32(r1, c32);
+	r1 = _mm_srai_epi32(r1, 6);
+	r2 = _mm_add_epi32(r2, c32);
+	r2 = _mm_srai_epi32(r2, 6);
+	r3 = _mm_add_epi32(r3, c32);
+	r3 = _mm_srai_epi32(r3, 6);
+
+	// convert to 16bit values (signed saturation): r0 = rows 0|1, r2 = rows 2|3
+	r0 = _mm_packs_epi32(r0, r1);
+	r2 = _mm_packs_epi32(r2, r3);
+
+	// convert mpr from unsigned char to short
+	p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y][pos_x]);
+	p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+1][pos_x]);
+	p0 = _mm_unpacklo_epi32(p0, p1);
+	p0 = _mm_unpacklo_epi8(p0, zero); // convert to short
+	r0 = _mm_adds_epi16(r0, p0); // saturating: must not wrap before the final clip
+
+	p0 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+2][pos_x]);
+	p1 = _mm_cvtsi32_si128(*(int32_t *)&mb_pred[pos_y+3][pos_x]);
+	p0 = _mm_unpacklo_epi32(p0, p1);
+	p0 = _mm_unpacklo_epi8(p0, zero); // convert to short
+	r2 = _mm_adds_epi16(r2, p0); // saturating: must not wrap before the final clip
+
+	r0 = _mm_packus_epi16(r0, r2); // convert to unsigned char
+	// r0 now holds the four output rows, 4 bytes each; peel them off one by one
+	*(int32_t *)&mb_rec[pos_y][pos_x] = _mm_cvtsi128_si32(r0);
+	r0 = _mm_srli_si128(r0, 4);
+	*(int32_t *)&mb_rec[pos_y+1][pos_x] = _mm_cvtsi128_si32(r0);
+	r0 = _mm_srli_si128(r0, 4);
+	*(int32_t *)&mb_rec[pos_y+2][pos_x] = _mm_cvtsi128_si32(r0);
+	r0 = _mm_srli_si128(r0, 4);
+	*(int32_t *)&mb_rec[pos_y+3][pos_x] = _mm_cvtsi128_si32(r0);
+}
+#endif
+
+/*
+ * Portable 4x4 inverse transform + reconstruction.
+ * The coefficients in tblock are transformed in place (the const qualifier is
+ * cast away for the output argument), then combined with mb_pred and written
+ * to mb_rec at row pos_y, column pos_x with a clipping bound of 255.
+ */
+void itrans4x4_c(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y)
+{
+	h264_short_block_row_t *residual = (h264_short_block_row_t *)tblock;
+
+	inverse4x4(tblock, residual, pos_y, pos_x);
+	sample_reconstruct(mb_rec, mb_pred, tblock, pos_y, pos_x, 255);
+}
+
+/*
+ * In-place 4x4 inverse Hadamard transform: a butterfly over every row
+ * (into a temporary), then a butterfly over every column (back into block).
+ */
+void ihadamard4x4(int block[4][4])
+{
+	int tmp[16];
+	int *out = tmp;
+	int n;
+
+	// Horizontal
+	for (n = 0; n < BLOCK_SIZE; n++)
+	{
+		const int *row = block[n];
+		const int sum02 = row[0] + row[2];
+		const int dif02 = row[0] - row[2];
+		const int dif13 = row[1] - row[3];
+		const int sum13 = row[1] + row[3];
+
+		*out++ = sum02 + sum13;
+		*out++ = dif02 + dif13;
+		*out++ = dif02 - dif13;
+		*out++ = sum02 - sum13;
+	}
+
+	//  Vertical 
+	for (n = 0; n < BLOCK_SIZE; n++)
+	{
+		const int v0 = tmp[n];
+		const int v1 = tmp[n + BLOCK_SIZE];
+		const int v2 = tmp[n + 2 * BLOCK_SIZE];
+		const int v3 = tmp[n + 3 * BLOCK_SIZE];
+
+		const int sum02 = v0 + v2;
+		const int dif02 = v0 - v2;
+		const int dif13 = v1 - v3;
+		const int sum13 = v1 + v3;
+
+		block[0][n] = sum02 + sum13;
+		block[1][n] = dif02 + dif13;
+		block[2][n] = dif02 - dif13;
+		block[3][n] = sum02 - sum13;
+	}
+}
+
+/*
+ * Inverse Hadamard transform of a block stored as 2 rows of 4 values (tblock).
+ * The result is written transposed: block must provide 4 rows with at least
+ * 2 values each. All of tblock is read before block is written.
+ */
+void ihadamard4x2(int **tblock, int **block)
+{
+	int tmp[8];
+	int k;
+
+	// Horizontal: tmp[0..3] = row0 + row1, tmp[4..7] = row0 - row1
+	for (k = 0; k < 4; k++)
+		tmp[k] = tblock[0][k] + tblock[1][k];
+	for (k = 0; k < 4; k++)
+		tmp[4 + k] = tblock[0][k] - tblock[1][k];
+
+	// Vertical: 4-point butterfly over each half of tmp
+	for (k = 0; k < 2; k++)
+	{
+		const int *src = tmp + 4 * k;
+		const int sum02 = src[0] + src[2];
+		const int dif02 = src[0] - src[2];
+		const int dif13 = src[1] - src[3];
+		const int sum13 = src[1] + src[3];
+
+		// coefficients (transposed)
+		block[0][k] = sum02 + sum13;
+		block[1][k] = dif02 + dif13;
+		block[2][k] = dif02 - dif13;
+		block[3][k] = sum02 - sum13;
+	}
+}
+
+//following functions perform 8 additions, 8 assignments. Should be a bit faster
+/*
+ * 2x2 inverse Hadamard transform on four values.
+ * All inputs are read before any output is written, so tblock and block may be
+ * the same array.
+ */
+void ihadamard2x2(int tblock[4], int block[4])
+{
+	const int top_sum  = tblock[0] + tblock[1];
+	const int top_diff = tblock[0] - tblock[1];
+	const int bot_sum  = tblock[2] + tblock[3];
+	const int bot_diff = tblock[2] - tblock[3];
+
+	block[0] = top_sum  + bot_sum;
+	block[1] = top_diff + bot_diff;
+	block[2] = top_sum  - bot_sum;
+	block[3] = top_diff - bot_diff;
+}
+
diff --git a/Src/h264dec/lcommon/src/win32.c b/Src/h264dec/lcommon/src/win32.c
new file mode 100644
index 00000000..7d921e1e
--- /dev/null
+++ b/Src/h264dec/lcommon/src/win32.c
@@ -0,0 +1,67 @@
+
+/*!
+ *************************************************************************************
+ * \file win32.c
+ *
+ * \brief
+ *    Platform dependent code
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Karsten Suehring                  <suehring@hhi.de>
+ *************************************************************************************
+ */
+
+#include "global.h"
+
+
+#ifdef _WIN32
+
+static LARGE_INTEGER freq;
+
+/* Win32: stores the current QueryPerformanceCounter() reading in *time. */
+void gettime(TIME_T* time)
+{
+  QueryPerformanceCounter(time);
+}
+
+/* Win32: returns end - start in raw performance-counter ticks. */
+int64 timediff(TIME_T* start, TIME_T* end)
+{
+  const int64 elapsed_ticks = (int64)(end->QuadPart - start->QuadPart);
+
+  return elapsed_ticks;
+}
+
+/*
+ * Win32: converts performance-counter ticks to milliseconds
+ * (ticks * 1000 / counter frequency). The frequency is queried on the
+ * first call and cached in the file-scope variable freq.
+ */
+int64 timenorm(int64  cur_time)
+{
+  static int freq_known = 0;
+
+  if(!freq_known) 
+  {
+    QueryPerformanceFrequency(&freq);
+    freq_known = 1;
+  }
+
+  return (int64)(cur_time * 1000 /(freq.QuadPart));
+}
+
+#else
+
+/*
+ * Non-Windows: stores the current wall-clock time in *time via gettimeofday().
+ * The timezone argument is passed as NULL: POSIX leaves the behaviour
+ * unspecified when it is non-null, and the value was never used here.
+ */
+void gettime(TIME_T* time)
+{
+  gettimeofday(time, NULL);
+}
+
+/*
+ * Non-Windows: returns end - start in microseconds.
+ * The subtraction is carried out in int64 so that the seconds difference is
+ * not narrowed to int before it is scaled.
+ */
+int64 timediff(TIME_T* start, TIME_T* end)
+{
+  const int64 sec_delta  = (int64) end->tv_sec  - (int64) start->tv_sec;
+  const int64 usec_delta = (int64) end->tv_usec - (int64) start->tv_usec;
+
+  return usec_delta + sec_delta * (int64) 1000000;
+}
+
+/* Non-Windows: divides the given time value by 1000 (integer division). */
+int64 timenorm(int64 cur_time)
+{
+  const int64 divisor = 1000;
+
+  return cur_time / divisor;
+}
+#endif
diff --git a/Src/h264dec/ldecod/inc/biaridecod.h b/Src/h264dec/ldecod/inc/biaridecod.h
new file mode 100644
index 00000000..9364632b
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/biaridecod.h
@@ -0,0 +1,157 @@
+
+/*!
+ ***************************************************************************
+ * \file
+ *    biaridecod.h
+ *
+ * \brief
+ *    Headerfile for binary arithmetic decoder routines
+ *
+ * \author
+ *    Detlev Marpe,
+ *    Gabi Blaettermann
+ *    Copyright (C) 2000 HEINRICH HERTZ INSTITUTE All Rights Reserved.
+ *
+ * \date
+ *    21. Oct 2000
+ **************************************************************************
+ */
+
+#ifndef _BIARIDECOD_H_
+#define _BIARIDECOD_H_
+
+
+/************************************************************************
+ * D e f i n i t i o n s
+ ***********************************************************************
+ */
+
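+/* Range table for LPS: the original [64][4] layout is kept (commented out) for reference; the active table below is its transpose */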
+/*
+static const byte rLPS_table_64x4[64][4]=
+{
+  { 128, 176, 208, 240},
+  { 128, 167, 197, 227},
+  { 128, 158, 187, 216},
+  { 123, 150, 178, 205},
+  { 116, 142, 169, 195},
+  { 111, 135, 160, 185},
+  { 105, 128, 152, 175},
+  { 100, 122, 144, 166},
+  {  95, 116, 137, 158},
+  {  90, 110, 130, 150},
+  {  85, 104, 123, 142},
+  {  81,  99, 117, 135},
+  {  77,  94, 111, 128},
+  {  73,  89, 105, 122},
+  {  69,  85, 100, 116},
+  {  66,  80,  95, 110},
+  {  62,  76,  90, 104},
+  {  59,  72,  86,  99},
+  {  56,  69,  81,  94},
+  {  53,  65,  77,  89},
+  {  51,  62,  73,  85},
+  {  48,  59,  69,  80},
+  {  46,  56,  66,  76},
+  {  43,  53,  63,  72},
+  {  41,  50,  59,  69},
+  {  39,  48,  56,  65},
+  {  37,  45,  54,  62},
+  {  35,  43,  51,  59},
+  {  33,  41,  48,  56},
+  {  32,  39,  46,  53},
+  {  30,  37,  43,  50},
+  {  29,  35,  41,  48},
+  {  27,  33,  39,  45},
+  {  26,  31,  37,  43},
+  {  24,  30,  35,  41},
+  {  23,  28,  33,  39},
+  {  22,  27,  32,  37},
+  {  21,  26,  30,  35},
+  {  20,  24,  29,  33},
+  {  19,  23,  27,  31},
+  {  18,  22,  26,  30},
+  {  17,  21,  25,  28},
+  {  16,  20,  23,  27},
+  {  15,  19,  22,  25},
+  {  14,  18,  21,  24},
+  {  14,  17,  20,  23},
+  {  13,  16,  19,  22},
+  {  12,  15,  18,  21},
+  {  12,  14,  17,  20},
+  {  11,  14,  16,  19},
+  {  11,  13,  15,  18},
+  {  10,  12,  15,  17},
+  {  10,  12,  14,  16},
+  {   9,  11,  13,  15},
+  {   9,  11,  12,  14},
+  {   8,  10,  12,  14},
+  {   8,   9,  11,  13},
+  {   7,   9,  11,  12},
+  {   7,   9,  10,  12},
+  {   7,   8,  10,  11},
+  {   6,   8,   9,  11},
+  {   6,   7,   9,  10},
+  {   6,   7,   8,   9},
+  {   2,   2,   2,   2}
+};*/
+static const byte rLPS_table_64x4[4][64]={ // NOTE: despite the "64x4" name the dimensions are [4][64]; row r holds column r of the commented-out [64][4] table above
+{128, 128, 128, 123, 116, 111, 105, 100,  95,  90,  85,  81,  77,  73,  69,  66,
+  62,  59,  56,  53,  51,  48,  46,  43,  41,  39,  37,  35,  33,  32,  30,  29,
+  27,  26,  24,  23,  22,  21,  20,  19,  18,  17,  16,  15,  14,  14,  13,  12,
+  12,  11,  11,  10,  10,   9,   9,   8,   8,   7,   7,   7,   6,   6,   6,   2,
+ }, // row 0
+{176, 167, 158, 150, 142, 135, 128, 122, 116, 110, 104,  99,  94,  89,  85,  80,
+  76,  72,  69,  65,  62,  59,  56,  53,  50,  48,  45,  43,  41,  39,  37,  35,
+  33,  31,  30,  28,  27,  26,  24,  23,  22,  21,  20,  19,  18,  17,  16,  15,
+  14,  14,  13,  12,  12,  11,  11,  10,   9,   9,   9,   8,   8,   7,   7,   2,
+ }, // row 1
+{208, 197, 187, 178, 169, 160, 152, 144, 137, 130, 123, 117, 111, 105, 100,  95,
+  90,  86,  81,  77,  73,  69,  66,  63,  59,  56,  54,  51,  48,  46,  43,  41,
+  39,  37,  35,  33,  32,  30,  29,  27,  26,  25,  23,  22,  21,  20,  19,  18,
+  17,  16,  15,  15,  14,  13,  12,  12,  11,  11,  10,  10,   9,   9,   8,   2,
+ }, // row 2
+{240, 227, 216, 205, 195, 185, 175, 166, 158, 150, 142, 135, 128, 122, 116, 110,
+ 104,  99,  94,  89,  85,  80,  76,  72,  69,  65,  62,  59,  56,  53,  50,  48,
+  45,  43,  41,  39,  37,  35,  33,  31,  30,  28,  27,  25,  24,  23,  22,  21,
+  20,  19,  18,  17,  16,  15,  14,  14,  13,  12,  12,  11,  11,  10,   9,   2,
+}}; // row 3
+
+// Next-state lookup indexed by the current state (0..63): entry i is i+1 for i <= 61; 62 and 63 map to themselves. TODO (original note): make uint16 to match biari_decode_symbol — the element type is currently byte
+static const byte AC_next_state_MPS_64[64] =    
+{
+  1,2,3,4,5,6,7,8,9,10,
+  11,12,13,14,15,16,17,18,19,20,
+  21,22,23,24,25,26,27,28,29,30,
+  31,32,33,34,35,36,37,38,39,40,
+  41,42,43,44,45,46,47,48,49,50,
+  51,52,53,54,55,56,57,58,59,60,
+  61,62,62,63
+};
+
+// Next-state lookup indexed by the current state (0..63): no entry exceeds its index, and 63 maps to itself. TODO (original note): make uint16 to match biari_decode_symbol — the element type is currently byte
+static const byte AC_next_state_LPS_64[64] =    
+{
+  0, 0, 1, 2, 2, 4, 4, 5, 6, 7,
+  8, 9, 9,11,11,12,13,13,15,15,
+  16,16,18,18,19,19,21,21,22,22,
+  23,24,24,25,26,26,27,27,28,29,
+  29,30,30,30,31,32,32,33,33,33,
+  34,34,35,35,35,36,36,36,37,37,
+  37,38,38,63
+};
+
+static const byte renorm_table_32[32]={6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; // values step down 6,5,4,3,2,1 as the index grows (run lengths 1,1,2,4,8,16)
+static const byte renorm_table_256[256]={6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, };
+
+extern void arideco_start_decoding(DecodingEnvironmentPtr eep, unsigned char *code_buffer, int firstbyte, int *code_len); // NOTE: the environment parameter is named eep here and dep in the prototypes below
+//extern int  arideco_bits_read(const DecodingEnvironmentPtr dep);
+extern void arideco_done_decoding(DecodingEnvironmentPtr dep);
+extern void biari_init_context (int qp, BiContextTypePtr ctx, const char* ini);
+extern unsigned int biari_decode_symbol(DecodingEnvironmentPtr dep, BiContextTypePtr bi_ct ); // takes a context pointer in addition to the decoding environment
+extern unsigned int biari_decode_symbol_eq_prob(DecodingEnvironmentPtr dep); // no context argument (by its name, the equal-probability path)
+extern unsigned int biari_decode_final(DecodingEnvironmentPtr dep);
+extern unsigned int getbyte(DecodingEnvironmentPtr dep);
+extern unsigned int getword(DecodingEnvironmentPtr dep);
+#endif  // _BIARIDECOD_H_
+
diff --git a/Src/h264dec/ldecod/inc/block.h b/Src/h264dec/ldecod/inc/block.h
new file mode 100644
index 00000000..d819b13c
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/block.h
@@ -0,0 +1,133 @@
+
+/*!
+ ************************************************************************
+ * \file block.h
+ *
+ * \brief
+ *    definitions for block decoding functions
+ *
+ * \author
+ *  Inge Lille-Langoy               <inge.lille-langoy@telenor.com>    \n
+ *  Telenor Satellite Services                                         \n
+ *  P.O.Box 6914 St.Olavs plass                                        \n
+ *  N-0130 Oslo, Norway
+ *
+ ************************************************************************
+ */
+
+#ifndef _BLOCK_H_
+#define _BLOCK_H_
+
+#include "global.h"
+#include "transform8x8.h"
+
+static const byte QP_SCALE_CR[52]= // 52-entry lookup: identity for indices 0..29, then grows more slowly and ends at 39 (presumably a QP index -> chroma QP mapping; confirm against callers)
+{
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
+   12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
+   28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
+   37,38,38,38,39,39,39,39
+
+};
+
+//! look up tables for FRExt_chroma support: [layout][group][sub-block] -> x offset (0, 4, 8 or 12); layouts 0/1/2 fill 2/4/8 groups, the remaining rows are all zero (layout presumably follows the chroma format — confirm)
+static const unsigned char subblk_offset_x[3][8][4] =
+{
+  {
+    {0, 4, 0, 4},
+    {0, 4, 0, 4},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0}, 
+  },
+  { 
+    {0, 4, 0, 4},
+    {0, 4, 0, 4},
+    {0, 4, 0, 4},
+    {0, 4, 0, 4},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0}, 
+  },
+  {
+    {0, 4, 0, 4},
+    {8,12, 8,12},
+    {0, 4, 0, 4},
+    {8,12, 8,12},
+    {0, 4, 0, 4},
+    {8,12, 8,12},
+    {0, 4, 0, 4},
+    {8,12, 8,12}  
+  }
+};
+
+
+static const unsigned char subblk_offset_y[3][8][4] = // y-offset counterpart of subblk_offset_x, same [3][8][4] indexing; values are 0, 4, 8 or 12
+{
+  { // layout 0: two populated groups
+    {0, 0, 4, 4},
+    {0, 0, 4, 4},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0}
+  },
+  { // layout 1: four populated groups
+    {0, 0, 4, 4},
+    {8, 8,12,12},
+    {0, 0, 4, 4},
+    {8, 8,12,12},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0}
+  },
+  { // layout 2: all eight groups populated
+    {0, 0, 4, 4},
+    {0, 0, 4, 4},
+    {8, 8,12,12},
+    {8, 8,12,12},
+    {0, 0, 4, 4},
+    {0, 0, 4, 4},
+    {8, 8,12,12},
+    {8, 8,12,12}
+  }
+};
+
+static const byte decode_block_scan[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}; // flat form of the same 16 values as cof4_pos_to_subblock (element type byte here, int there)
+static const int cof4_pos_to_subblock[4][4] =  {	 { 0, 1, 4, 5},	 {2,3,6,7},	 {8,9,12,13},	 {10,11,14,15} }; // row i holds entries 4*i .. 4*i+3 of decode_block_scan
+
+extern void iMBtrans4x4(Macroblock *currMB, ColorPlane pl, int smb);
+extern void iMBtrans8x8(Macroblock *currMB, ColorPlane pl);
+
+extern void itrans_sp_cr(Macroblock *currMB, int uv);
+
+extern void intrapred_chroma      (Macroblock *currMB, int uv);
+
+extern void Inv_Residual_trans_4x4(Macroblock *currMB, ColorPlane pl, int ioff, int joff);
+extern void Inv_Residual_trans_8x8(Macroblock *currMB, ColorPlane pl, int ioff,int joff);
+
+extern void itrans4x4_c(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y); // _c and _mmx variants share one signature; only mb_rec is non-const
+extern void itrans4x4_mmx(const h264_short_block_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y);
+extern int  intrapred   (Macroblock *currMB, ColorPlane pl, int ioff,int joff,int i4,int j4);
+extern void itrans_2    (Macroblock *currMB, ColorPlane pl);
+extern void iTransform  (Macroblock *currMB, ColorPlane pl, int smb);
+
+extern void copy_image_data       (imgpel  **imgBuf1, imgpel  **imgBuf2, int dest_x, int src_x, int width, int height); // NOTE(review): which of imgBuf1/imgBuf2 is the destination is not visible from the prototype — check the definition
+extern void copy_image_data_16x16 (imgpel  **imgBuf1, imgpel  **imgBuf2, int dest_x, int src_x);
+
+extern void copy_image_data_16x16_stride_c(struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source); // _c and _sse variants share one signature
+extern void copy_image_data_16x16_stride_sse(struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source);
+extern void copy_image_data_8x8_stride  (struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source);
+extern void copy_image_data_8x8_stride2  (struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source, int src_x, int src_y); // like copy_image_data_8x8_stride but with an explicit source position
+extern void copy_image_data_4x4_stride  (struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source, int src_x, int src_y);
+extern void copy_image_data_stride      (struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source, int width, int height);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/cabac.h b/Src/h264dec/ldecod/inc/cabac.h
new file mode 100644
index 00000000..5458d2be
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/cabac.h
@@ -0,0 +1,64 @@
+
+/*!
+ ***************************************************************************
+ * \file
+ *    cabac.h
+ *
+ * \brief
+ *    Header file for entropy coding routines
+ *
+ * \author
+ *    Detlev Marpe                                                         \n
+ *    Copyright (C) 2000 HEINRICH HERTZ INSTITUTE All Rights Reserved.
+ *
+ * \date
+ *    21. Oct 2000 (Changes by Tobias Oelbaum 28.08.2001)
+ ***************************************************************************
+ */
+
+#ifndef _CABAC_H_
+#define _CABAC_H_
+
+#include "global.h"
+
+typedef struct Run_Level // (level, run) pair; returned by value from readRunLevel_CABAC
+{
+	int level;
+	int run;
+} RunLevel;
+extern MotionInfoContexts*  create_contexts_MotionInfo(void);
+extern TextureInfoContexts* create_contexts_TextureInfo(void);
+extern void delete_contexts_MotionInfo(MotionInfoContexts *enco_ctx); // counterpart of create_contexts_MotionInfo
+extern void delete_contexts_TextureInfo(TextureInfoContexts *enco_ctx); // counterpart of create_contexts_TextureInfo
+
+extern void cabac_new_slice(Slice *currSlice);
+
+extern int readMB_typeInfo_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp);
+extern int readB8_typeInfo_CABAC(Slice *currSlice, DecodingEnvironmentPtr dep_dp);
+extern int readIntraPredMode_CABAC(Slice *currSlice, DecodingEnvironmentPtr dep_dp);
+extern char readRefFrame_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int list, int x, int y);
+extern char readRefFrame_CABAC0(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int list, int y); // same as readRefFrame_CABAC minus the x argument
+extern int readMVD_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int k, int list, int x, int y);
+extern int readCBP_CABAC                   (Macroblock *currMB, DecodingEnvironmentPtr dep_dp);
+
+// readRunLevel_CABAC returns both level and run by value in a RunLevel struct (there is no separate run output parameter)
+extern RunLevel readRunLevel_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int context);
+extern short readDquant_CABAC(Slice *currSlice, DecodingEnvironmentPtr dep_dp);
+extern char readCIPredMode_CABAC            (Macroblock *currMB, DecodingEnvironmentPtr dep_dp);
+extern int readMB_skip_flagInfo_CABAC      (Macroblock *currMB, DecodingEnvironmentPtr dep_dp);
+extern Boolean readFieldModeInfo_CABAC         (Macroblock *currMB, DecodingEnvironmentPtr dep_dp);
+extern Boolean readMB_transform_size_flag_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp);
+
+extern void readIPCM_CABAC(Slice *currSlice, struct datapartition *dP);
+
+extern int  cabac_startcode_follows(Slice *currSlice, int eos_bit);
+
+
+extern int check_next_mb_and_get_field_mode_CABAC(Slice *currSlice, DataPartition  *act_dp);
+
+extern void CheckAvailabilityOfNeighborsCABAC(Macroblock *currMB);
+
+extern void set_read_and_store_CBP(Macroblock **currMB, int chroma_format_idc); // note: takes a pointer to the Macroblock pointer, unlike the other prototypes here
+
+#endif  // _CABAC_H_
+
diff --git a/Src/h264dec/ldecod/inc/context_ini.h b/Src/h264dec/ldecod/inc/context_ini.h
new file mode 100644
index 00000000..73977be6
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/context_ini.h
@@ -0,0 +1,23 @@
+
+/*!
+ *************************************************************************************
+ * \file context_ini.h
+ *
+ * \brief
+ *    CABAC context initializations
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Detlev Marpe                    <marpe@hhi.de>
+ *    - Heiko Schwarz                   <hschwarz@hhi.de>
+ **************************************************************************************
+ */
+
+
+#ifndef _CONTEXT_INI_
+#define _CONTEXT_INI_
+
+extern void  init_contexts  (Slice *currslice); // NOTE: parameter is spelled currslice here; the CABAC prototypes spell it currSlice
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/contributors.h b/Src/h264dec/ldecod/inc/contributors.h
new file mode 100644
index 00000000..3e462d0b
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/contributors.h
@@ -0,0 +1,223 @@
+
+/*! \file
+ *     contributors.h
+ *  \brief
+ *     List of contributors and copyright information.
+ *
+ *  \par Copyright statements
+    \verbatim
+   H.264 JM coder/decoder
+
+   Copyright (C) 2000 by
+      Telenor Satellite Services, Norway
+      Ericsson Radio Systems, Sweden
+      TELES AG, Germany
+      Nokia Inc., USA
+      Nokia Corporation, Finland
+      Siemens AG, Germany
+      Fraunhofer-Institute for Telecommunications Heinrich-Hertz-Institut (HHI), Germany
+      University of Hannover, Institut of Communication Theory and Signal Processing, Germany
+      TICSP, Tampere University of Technology, Finland
+      Munich University of Technology, Institute for Communications Engineering, Germany
+      Videolocus, Canada
+      Motorola Inc., USA
+      Microsoft Corp., USA
+      Apple Computer, Inc.
+      RealNetworks, Inc., USA
+      Thomson, Inc., USA
+      Sejong Univ., Digital Media System Lab., Korea
+  \endverbatim
+  \par Full Contact Information
+  \verbatim
+
+      Lowell Winger                   <lwinger@videolocus.com><lwinger@uwaterloo.ca>
+      Guy Cote                        <gcote@videolocus.com>
+      Michael Gallant                 <mgallant@videolocus.com>
+      VideoLocus Inc.
+      97 Randall Dr.
+      Waterloo, ON, Canada  N2V1C5
+
+      Inge Lille-Langoy               <inge.lille-langoy@telenor.com>
+      Telenor Satellite Services
+      P.O.Box 6914 St.Olavs plass
+      N-0130 Oslo, Norway
+
+      Rickard Sjoberg                 <rickard.sjoberg@era.ericsson.se>
+      Ericsson Radio Systems
+      KI/ERA/T/VV
+      164 80 Stockholm, Sweden
+
+      Stephan Wenger                  <stewe@cs.tu-berlin.de>
+      TU Berlin / TELES AG
+      Sekr. FR 6-3
+      Franklinstr. 28-29
+      D-10587 Berlin, Germany
+
+      Jani Lainema                    <jani.lainema@nokia.com>
+      Nokia Inc. / Nokia Research Center
+      6000 Connection Drive
+      Irving, TX 75039, USA
+
+      Sebastian Purreiter             <sebastian.purreiter@mch.siemens.de>
+      Siemens AG
+      ICM MD MP RD MCH 83
+      P.O.Box 80 17 07
+      D-81617 Munich, Germany
+
+      Thomas Wedi                     <wedi@tnt.uni-hannover.de>
+      University of Hannover
+      Institut of Communication Theory and Signal Processing
+      Appelstr. 9a
+      30167 Hannover, Germany
+
+      Guido Heising
+      Fraunhofer-Institute for Telecommunications
+      Heinrich-Hertz-Institut (HHI)
+      Einsteinufer 37
+      10587 Berlin
+      Germany
+
+      Gabi Blaettermann
+      Fraunhofer-Institute for Telecommunications
+      Heinrich-Hertz-Institut (HHI)
+      Einsteinufer 37
+      10587 Berlin
+      Germany
+
+      Detlev Marpe                    <marpe@hhi.de>
+      Fraunhofer-Institute for Telecommunications
+      Heinrich-Hertz-Institut (HHI)
+      Einsteinufer 37
+      10587 Berlin
+      Germany
+
+      Ragip Kurceren                  <ragip.kurceren@nokia.com>
+      Nokia Inc. / Nokia Research Center
+      6000 Connection Drive
+      Irving, TX 75039, USA
+
+      Viktor Varsa                    <viktor.varsa@nokia.com>
+      Nokia Inc. / Nokia Research Center
+      6000 Connection Drive
+      Irving, TX 75039, USA
+
+      Ye-Kui Wang                     <wyk@ieee.org>
+      Tampere University of Technology
+      Tampere International Center for Signal Processing
+      33720 Tampere, Finland
+
+      Ari Hourunranta                 <ari.hourunranta@nokia.com>
+      Nokia Corporation / Nokia Mobile Phones
+      P.O. Box 88
+      33721 Tampere, Finland
+
+      Yann Le Maguet                  <yann.lemaguet@philips.com>
+      Philips Research France
+
+      Dong Tian                       <tian@cs.tut.fi>
+      Tampere University of Technology
+      Tampere International Center for Signal Processing
+      33720 Tampere, Finland
+
+      Miska M. Hannuksela             <miska.hannuksela@nokia.com>
+      Nokia Corporation / Nokia Mobile Phones
+      P.O. Box 88
+      33721 Tampere, Finland
+
+      Karsten Suehring                <suehring@hhi.de>
+      Fraunhofer-Institute for Telecommunications
+      Heinrich-Hertz-Institut (HHI)
+      Einsteinufer 37
+      10587 Berlin
+      Germany
+
+      Heiko Schwarz                   <hschwarz@hhi.de>
+      Fraunhofer-Institute for Telecommunications
+      Heinrich-Hertz-Institut (HHI)
+      Einsteinufer 37
+      10587 Berlin
+      Germany
+
+      Tobias Oelbaum                  <drehvial@gmx.net>
+      Institute for Communications Engineering
+      Munich University of Technology
+      Germany
+
+      Limin Wang                      <liwang@gi.com>
+      Krit Panusopone                 <kpanusopone@gi.com>
+      Rajeev Gandhi                   <rgandhi@gi.com>
+      Yue Yu                          <yyu@gi.com>
+      Motorola Inc.
+      6450 Sequence Drive
+      San Diego, CA 92121 USA
+
+      Feng Wu                         <fengwu@microsoft.com>
+      Xiaoyan Sun                     <sunxiaoyan@msrchina.research.microsoft.com>
+      Microsoft Research Asia
+      3/F, Beijing Sigma Center
+      No.49, Zhichun Road, Hai Dian District,
+      Beijing China 100080
+
+      Yoshihiro Kikuchi               <yoshihiro.kikuchi@toshiba.co.jp>
+      Takeshi Chujoh                  <takeshi.chujoh@toshiba.co.jp>
+      Toshiba Corporation
+      Research and Development Center
+      Kawasaki 212-8582, Japan
+
+      Shinya Kadono                   <kadono@drl.mei.co.jp>
+      Matsushita Electric Industrial Co., Ltd.
+      1006 Kadoma, Kadoma
+      Osaka 663-8113, Japan
+
+      Dzung Hoang                     <dthoang@yahoo.com>
+      10533 Roy Butler Dr.
+      Austin, TX 78717
+
+      Eric Viscito                    <eric@ev-consulting.com>
+      eV Consulting
+      52 Tracy Ln
+      Shelburne, VT 05482 USA
+
+      Barry Haskell
+      Apple Computer, Inc.            <bhaskell@apple.com>
+      2 Infinite Loop
+      Cupertino, California 95014
+
+      Greg Conklin
+      RealNetworks, Inc.              <gregc@real.com>
+      2601 Elliott Ave
+      Seattle, WA 98101
+
+      Jill Boyce                      <jill.boyce@thomson.net>
+      Cristina Gomila                 <cristina.gomila@thomson.net>
+      Thomson
+      2 Independence Way
+      Princeton, NJ 08540
+
+      Alexis Michael Tourapis         <alexismt@ieee.org><atour@dolby.com>
+      Athanasios Leontaris            <aleon@dolby.com>
+      Dolby Laboratories Inc.
+      3601 West Alameda Ave.
+      Burbank, CA 91505
+
+      Saurav K Bandyopadhyay          <saurav@ieee.org>
+      Purvin Pandit                   <Purvin.Pandit@thomson.net>
+      Zhenyu Wu                       <Zhenyu.Wu@thomson.net>
+      Thomson Inc.
+      2 Independence Way
+      Princeton, NJ 08540
+
+      Shun-ichi Sekiguchi             <Sekiguchi.Shunichi@eb.MitsubishiElectric.co.jp>
+      Information Technology R&D Center, 
+      Mitsubishi Electric Corporation
+      5-1-1, Ofuna, Kamakura, Japan
+
+      Yung-Lyul Lee                   <yllee@sejong.ac.kr>
+      Ki-Hun Han                      <khhan@dms.sejong.ac.kr>
+      Department of Computer Engineering, 
+      Sejong University
+      98 Kunja-Dong, Kwangjin-Gu, Seoul 143-747, Korea
+
+  \endverbatim
+*/
+
diff --git a/Src/h264dec/ldecod/inc/defines.h b/Src/h264dec/ldecod/inc/defines.h
new file mode 100644
index 00000000..edc50563
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/defines.h
@@ -0,0 +1,273 @@
+
+/*!
+ **************************************************************************
+ * \file defines.h
+ *
+ * \brief
+ *    Header file containing some useful global definitions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Detlev Marpe
+ *     - Karsten Suehring                <suehring@hhi.de> 
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org> 
+ *   
+ *
+ * \date
+ *    21. March 2001
+ **************************************************************************
+ */
+
+
+#ifndef H264_DEFINES_H_
+#define H264_DEFINES_H_
+#pragma once
+
+#if defined _DEBUG
+# define TRACE           0      //!< 0:Trace off 1:Trace on 2:detailed CABAC context information (debug builds; currently the same value as the non-debug branch)
+#else
+# define TRACE           0      //!< 0:Trace off 1:Trace on 2:detailed CABAC context information
+#endif
+
+#define JM                  "16.1 (FRExt)"
+#define VERSION             "16.1"
+#define EXT_VERSION         "(FRExt)"
+
+#define DUMP_DPB                  0    //!< Dump DPB info for debug purposes
+#define PAIR_FIELDS_IN_OUTPUT     0    //!< Pair field pictures for output purposes
+#define IMGTYPE                   0    //!< Define imgpel size type. 0 implies byte (cannot handle >8 bit depths) and 1 implies unsigned short
+#define ENABLE_FIELD_CTX          1    //!< Enables Field mode related context types for CABAC
+#define ENABLE_HIGH444_CTX        1    //!< Enables High 444 profile context types for CABAC. 
+#define ZEROSNR                   0    //!< PSNR computation method
+#define ENABLE_OUTPUT_TONEMAPPING 1    //!< enable tone map the output if tone mapping SEI present
+
+
+#include "typedefs.h"
+
+
+#define H264_MEMORY_ALIGNMENT      32
+
+//#define MAX_NUM_SLICES 150
+#define MAX_NUM_SLICES 50
+#define MAX_REFERENCE_PICTURES 32               //!< H.264 allows 32 fields
+#define MAX_CODED_FRAME_SIZE 8000000         //!< bytes for one frame
+
+//AVC Profile IDC definitions
+#define BASELINE         66      //!< YUV 4:2:0/8  "Baseline"
+#define MAIN             77      //!< YUV 4:2:0/8  "Main"
+#define EXTENDED         88      //!< YUV 4:2:0/8  "Extended"
+#define FREXT_HP        100      //!< YUV 4:2:0/8 "High"
+#define FREXT_Hi10P     110      //!< YUV 4:2:0/10 "High 10"
+#define FREXT_Hi422     122      //!< YUV 4:2:2/10 "High 4:2:2"
+#define FREXT_Hi444     244      //!< YUV 4:4:4/14 "High 4:4:4"
+#define FREXT_CAVLC444   44      //!< YUV 4:4:4/14 "CAVLC 4:4:4"
+
+
+#define FILE_NAME_SIZE  255
+#define INPUT_TEXT_SIZE 1024
+
+#if (ENABLE_HIGH444_CTX == 1)
+# define NUM_BLOCK_TYPES 22  // equals the number of entries in the CABAC block type enum in this header (LUMA_16DC = 0 .. CR_4x4 = 21)
+#else
+# define NUM_BLOCK_TYPES 10  // equals the entries LUMA_16DC = 0 .. CHROMA_DC_4x4 = 9 of that enum
+#endif
+
+
+//#define _LEAKYBUCKET_
+
+#define BLOCK_SHIFT            2 // log2(BLOCK_SIZE)
+#define BLOCK_SIZE             4
+#define BLOCK_SIZE_8x8         8
+#define SMB_BLOCK_SIZE         8
+#define BLOCK_PIXELS          16 // BLOCK_SIZE * BLOCK_SIZE
+#define MB_BLOCK_SIZE         16
+#define MB_PIXELS            256 // MB_BLOCK_SIZE * MB_BLOCK_SIZE
+#define MB_PIXELS_SHIFT        8 // log2(MB_BLOCK_SIZE * MB_BLOCK_SIZE)
+#define MB_BLOCK_SHIFT         4 // log2(MB_BLOCK_SIZE)
+#define BLOCK_MULTIPLE         4 // (MB_BLOCK_SIZE/BLOCK_SIZE)
+#define MB_BLOCK_PARTITIONS   16 // (BLOCK_MULTIPLE * BLOCK_MULTIPLE)
+#define BLOCK_CONTEXT         64 // (4 * MB_BLOCK_PARTITIONS)
+
+// These constants relate to the subpel accuracy supported by the software (1/4)
+#define BLOCK_SIZE_SP      16  // BLOCK_SIZE << 2
+#define BLOCK_SIZE_8x8_SP  32  // BLOCK_SIZE_8x8 << 2
+
+//  Available MB modes (PSKIP and BSKIP_DIRECT share the value 0; the IS_SKIP / IS_DIRECT macros in this header tell them apart by slice type)
+enum {
+  PSKIP        =  0,
+  BSKIP_DIRECT =  0,
+  P16x16       =  1,
+  P16x8        =  2,
+  P8x16        =  3,
+  SMB8x8       =  4,
+  SMB8x4       =  5,
+  SMB4x8       =  6,
+  SMB4x4       =  7,
+  P8x8         =  8,
+  I4MB         =  9,
+  I16MB        = 10,
+  IBLOCK       = 11,
+  SI4MB        = 12,
+  I8MB         = 13,
+  IPCM         = 14,
+  MAXMODE      = 15
+} ;//MBModeTypes;
+
+// number of intra prediction modes
+#define NO_INTRA_PMODE  9
+
+// Direct Mode types
+enum {
+  DIR_TEMPORAL = 0, //!< Temporal Direct Mode
+  DIR_SPATIAL  = 1 //!< Spatial Direct Mode
+} ;//DirectModes;
+
+// CAVLC block types (values 6 and 7 are not assigned; the CR group starts at 8)
+enum {
+  LUMA              =  0,
+  LUMA_INTRA16x16DC =  1,
+  LUMA_INTRA16x16AC =  2,
+  CB                =  3,
+  CB_INTRA16x16DC   =  4,
+  CB_INTRA16x16AC   =  5,
+  CR                =  8,
+  CR_INTRA16x16DC   =  9,
+  CR_INTRA16x16AC   = 10
+} ;//CAVLCBlockTypes;
+
+// CABAC block types: 22 contiguous values (0..21) — luma/chroma 0..9, CB_* 10..15, CR_* 16..21
+enum {
+  LUMA_16DC     =   0,
+  LUMA_16AC     =   1,
+  LUMA_8x8      =   2,
+  LUMA_8x4      =   3,
+  LUMA_4x8      =   4,
+  LUMA_4x4      =   5,
+  CHROMA_DC     =   6,
+  CHROMA_AC     =   7,
+  CHROMA_DC_2x4 =   8,
+  CHROMA_DC_4x4 =   9,
+  CB_16DC       =  10,
+  CB_16AC       =  11,
+  CB_8x8        =  12,
+  CB_8x4        =  13,
+  CB_4x8        =  14,
+  CB_4x4        =  15,
+  CR_16DC       =  16,
+  CR_16AC       =  17,
+  CR_8x8        =  18,
+  CR_8x4        =  19,
+  CR_4x8        =  20,
+  CR_4x4        =  21
+} ;//CABACBlockTypes;
+
+// Macro defines
+#define Q_BITS          15
+#define DQ_BITS          6
+#define Q_BITS_8        16
+#define DQ_BITS_8        6 
+
+//#define IS_INTRA(MB)    ((MB)->mb_type==I4MB  || (MB)->mb_type==I16MB ||(MB)->mb_type==IPCM || (MB)->mb_type==I8MB || (MB)->mb_type==SI4MB)
+#define IS_INTRA(MB)    (!!((1 << (MB)->mb_type) & ((1<<I4MB) | (1<<I16MB) | (1<<IPCM) | (1<<I8MB) | (1<<SI4MB)))) // bitmask form of the comparison chain above; yields 0 or 1. NOTE(review): shifts by mb_type, so mb_type must stay within the mode enum range — confirm
+#define IS_I16MB(MB)    ((MB)->mb_type==I16MB  || (MB)->mb_type==IPCM) // true for I16MB and also for IPCM
+
+#define IS_INTER(MB)    (!IS_INTRA(MB))
+//#define IS_INTER(MB)    ((MB)->mb_type!=SI4MB && (MB)->mb_type!=I4MB  && (MB)->mb_type!=I16MB && (MB)->mb_type!=I8MB  && (MB)->mb_type!=IPCM)
+#define IS_INTERMV(MB)  ((MB)->mb_type!=I4MB  && (MB)->mb_type!=I16MB && (MB)->mb_type!=I8MB  && (MB)->mb_type!=0 && (MB)->mb_type!=IPCM) // NOTE(review): unlike IS_INTER, this list does not exclude SI4MB — confirm that is intended
+#define IS_DIRECT(MB)   ((MB)->mb_type==0     && (currSlice->slice_type == B_SLICE )) // requires a variable named currSlice in scope at the point of use
+#define IS_SKIP(MB)     ((MB)->mb_type==0     && (currSlice->slice_type == P_SLICE || currSlice->slice_type == SP_SLICE)) // requires a variable named currSlice in scope at the point of use
+
+#define TOTRUN_NUM       15
+#define RUNBEFORE_NUM     7
+#define RUNBEFORE_NUM_M1  6
+
+// Quantization parameter range
+#define MIN_QP          0
+#define MAX_QP          51
+// 4x4 intra prediction modes 
+enum {
+  VERT_PRED            = 0,
+  HOR_PRED             = 1,
+  DC_PRED              = 2,
+  DIAG_DOWN_LEFT_PRED  = 3,
+  DIAG_DOWN_RIGHT_PRED = 4,
+  VERT_RIGHT_PRED      = 5,
+  HOR_DOWN_PRED        = 6,
+  VERT_LEFT_PRED       = 7,
+  HOR_UP_PRED          = 8
+} ;//I4x4PredModes;
+
+// 16x16 intra prediction modes
+enum {
+  VERT_PRED_16   = 0,
+  HOR_PRED_16    = 1,
+  DC_PRED_16     = 2,
+  PLANE_16       = 3
+} ;//I16x16PredModes;
+
+// 8x8 chroma intra prediction modes (numbering differs from the 4x4 and 16x16 enums: DC is 0 and VERT is 2 here, versus VERT 0 / DC 2 there)
+enum {
+  DC_PRED_8     =  0,
+  HOR_PRED_8    =  1,
+  VERT_PRED_8   =  2,
+  PLANE_8       =  3
+} ;//I8x8PredModes;
+
+enum {
+  EOS = 1,    //!< End Of Sequence
+  SOP = 2,    //!< Start Of Picture
+  SOS = 3     //!< Start Of Slice
+};
+
+// MV Prediction types
+enum {
+  MVPRED_MEDIAN   = 0,
+  MVPRED_L        = 1,
+  MVPRED_U        = 2,
+  MVPRED_UR       = 3
+} ;//MVPredTypes;
+
+enum {
+  DECODING_OK     = 0,
+  SEARCH_SYNC     = 1,
+  PICTURE_DECODED = 2
+};
+
+#define  LAMBDA_ACCURACY_BITS         16
+#define INVALIDINDEX  (-135792468)
+
+#define RC_MAX_TEMPORAL_LEVELS   5
+
+//Start code and Emulation Prevention need this to be defined in identical manner at encoder and decoder
+#define ZEROBYTES_SHORTSTARTCODE 2 //indicates the number of zero bytes in the short start-code prefix
+
+#define MAX_PLANE       3
+#define IS_INDEPENDENT(IMG)           ((IMG)->separate_colour_plane_flag) // evaluates to IMG's separate_colour_plane_flag
+#define IS_FREXT_PROFILE(profile_idc) ( (profile_idc)>=FREXT_HP || (profile_idc) == FREXT_CAVLC444 ) // argument is parenthesized so expression arguments (e.g. a ?: or a bitwise-or) expand correctly
+#define HI_INTRA_ONLY_PROFILE (((p_Vid->active_sps->profile_idc>=FREXT_Hi10P)&&(p_Vid->active_sps->constrained_set3_flag))||(p_Vid->active_sps->profile_idc==FREXT_CAVLC444)) // requires a pointer named p_Vid in scope at the point of use
+
+enum // sample aspect ratio codes; per-entry comments give width:height. Numbering appears to follow aspect_ratio_idc in the H.264 VUI syntax — confirm against the spec table
+{
+	VUI_AR_UNDEFINED = 0,
+	VUI_AR_SQUARE = 1, // 1:1
+	VUI_AR_12_11 = 2, // 12:11
+	VUI_AR_10_11 = 3, // 10:11
+	VUI_AR_16_11 = 4, // 16:11
+	VUI_AR_40_33 = 5, // 40:33
+	VUI_AR_24_11 = 6, // 24:11
+	VUI_AR_20_11 = 7, // 20:11
+	VUI_AR_32_11 = 8, // 32:11
+	VUI_AR_80_33 = 9, // 80:33
+	VUI_AR_18_11 = 10, // 18:11
+	VUI_AR_15_11 = 11, // 15:11
+	VUI_AR_64_33 = 12, // 64:33
+VUI_AR_160_99 = 13, // 160:99
+VUI_AR_4_3 = 14, // 4:3
+VUI_AR_3_2 = 15, // 3:2
+VUI_AR_2_1 = 16, // 2:1
+	
+
+	VUI_EXTENDED_SAR = 255, // values 17..254 are not assigned here
+};
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/elements.h b/Src/h264dec/ldecod/inc/elements.h
new file mode 100644
index 00000000..f115bff6
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/elements.h
@@ -0,0 +1,112 @@
+
+/*!
+ *************************************************************************************
+ * \file elements.h
+ *
+ * \brief
+ *    Header file for elements in H.264 streams
+ *
+ * \date
+ *    6.10.2000
+ *
+ * \version
+ *    1.0
+ *
+ *  \author
+ *     Sebastian Purreiter     <sebastian.purreiter@mch.siemens.de>  \n
+ *     Siemens AG, Information and Communication Mobile              \n
+ *     P.O.Box 80 17 07                                              \n
+ *     D-81617 Munich, Germany                                       \n
+ *************************************************************************************
+ */
+
+#ifndef _ELEMENTS_H_
+#define _ELEMENTS_H_
+
+/*!
+ *  definition of H.264 syntax elements
+ *  order of elements follow dependencies for picture reconstruction
+ */
+/*!
+ * \brief   Assignment of old TYPE partition elements to new
+ *          elements
+ *
+ *  old element     | new elements
+ *  ----------------+-------------------------------------------------------------------
+ *  TYPE_HEADER     | SE_HEADER, SE_PTYPE
+ *  TYPE_MBHEADER   | SE_MBTYPE, SE_REFFRAME, SE_INTRAPREDMODE
+ *  TYPE_MVD        | SE_MVD
+ *  TYPE_CBP        | SE_CBP_INTRA, SE_CBP_INTER
+ *                  | SE_DELTA_QUANT_INTER,
+ *                  | SE_DELTA_QUANT_INTRA
+ *  TYPE_COEFF_Y    | SE_LUM_DC_INTRA, SE_LUM_AC_INTRA, SE_LUM_DC_INTER, SE_LUM_AC_INTER
+ *  TYPE_2x2DC      | SE_CHR_DC_INTRA, SE_CHR_DC_INTER
+ *  TYPE_COEFF_C    | SE_CHR_AC_INTRA, SE_CHR_AC_INTER
+ *  TYPE_EOS        | SE_EOS
+*/
+
+#define SE_HEADER           0   // the SE_* values index the columns of assignSE2partition[][] below
+#define SE_PTYPE            1
+#define SE_MBTYPE           2
+#define SE_REFFRAME         3
+#define SE_INTRAPREDMODE    4
+#define SE_MVD              5
+#define SE_CBP_INTRA        6
+#define SE_LUM_DC_INTRA     7
+#define SE_CHR_DC_INTRA     8
+#define SE_LUM_AC_INTRA     9
+#define SE_CHR_AC_INTRA     10
+#define SE_CBP_INTER        11
+#define SE_LUM_DC_INTER     12
+#define SE_CHR_DC_INTER     13
+#define SE_LUM_AC_INTER     14
+#define SE_CHR_AC_INTER     15
+#define SE_DELTA_QUANT_INTER      16
+#define SE_DELTA_QUANT_INTRA      17
+#define SE_BFRAME           18
+#define SE_EOS              19
+#define SE_MAX_ELEMENTS     20  //!< count of SE_* values above; row length of assignSE2partition[][]
+
+
+#define NO_EC               0   //!< no error concealment necessary
+#define EC_REQ              1   //!< error concealment required
+#define EC_SYNC             2   //!< search and sync on next header element
+
+#define MAXPARTITIONMODES   2   //!< maximum possible partition modes as defined in assignSE2partition[][] (that table has two rows)
+
+/*!
+ *  \brief  lookup-table to assign different elements to partition
+ *
+ *  \note   here we defined up to 6 different partitions similar to
+ *          document Q15-k-18 described in the PROGFRAMEMODE.
+ *          The Sliceheader contains the PSYNC information. \par
+ *
+ *          Elements inside a partition are not ordered. They are
+ *          ordered by occurrence in the stream.
+ *          Assumption: Only partition losses are considered. \par
+ *
+ *          The texture elements luminance and chrominance are
+ *          not ordered in the progressive form
+ *          This may be changed in image.c \par
+ *
+ *          We also defined the proposed internet partition mode
+ *          of Stephan Wenger here. To select the desired mode
+ *          uncomment one of the two following lines. \par
+ *
+ *  -IMPORTANT:
+ *          Picture- or Sliceheaders must be assigned to partition 0. \par
+ *          Furthermore partitions must follow syntax dependencies as
+ *          outlined in document Q15-J-23.
+ */
+
+
+static const byte assignSE2partition[MAXPARTITIONMODES][SE_MAX_ELEMENTS] = {
+  /* column = SE_* element number (this ruler is a comment, keep it one):
+     0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 */
+  {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },   /* row 0: all elements in one partition, no data partitioning */
+  {  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0 }    /* row 1: three partitions per slice */
+};
+
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/erc_api.h b/Src/h264dec/ldecod/inc/erc_api.h
new file mode 100644
index 00000000..428c4ed3
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/erc_api.h
@@ -0,0 +1,159 @@
+
+/*!
+ ************************************************************************
+ * \file  erc_api.h
+ *
+ * \brief
+ *      External (still inside video decoder) interface for error concealment module
+ *
+ * \author
+ *      - Ari Hourunranta                <ari.hourunranta@nokia.com>
+ *      - Ye-Kui Wang                    <wyk@ieee.org>
+ *      - Jill Boyce                     <jill.boyce@thomson.net>
+ *      - Saurav K Bandyopadhyay         <saurav@ieee.org>
+ *      - Zhenyu Wu                      <Zhenyu.Wu@thomson.net>
+ *      - Purvin Pandit                  <Purvin.Pandit@thomson.net>
+ *
+ * ************************************************************************
+ */
+
+
+#ifndef _ERC_API_H_
+#define _ERC_API_H_
+
+#include "erc_globals.h"
+
+/*
+* Defines
+*/
+
+/* If the average motion vector of the correctly received macroblocks is less than the
+threshold, concealByCopy is used, otherwise concealByTrial is used. */
+#define MVPERMB_THR 8
+
+/* used to determine the size of the allocated memory for a temporal Region (MB) */
+#define DEF_REGION_SIZE 384  /* 8*8*6 */
+
+#define ERC_BLOCK_OK                3  /* block condition codes; presumably the values kept in the */
+#define ERC_BLOCK_CONCEALED         2  /* yCondition/uCondition/vCondition arrays of ercVariables_t */
+#define ERC_BLOCK_CORRUPTED         1  /* - TODO confirm against erc_api.c */
+#define ERC_BLOCK_EMPTY             0
+
+
+/*
+* Functions to convert MBNum representation to blockNum
+*/
+
+/* column of an 8x8 Y block; (picSizeX)>>3 is the number of 8-wide columns */
+#define xPosYBlock(currYBlockNum,picSizeX) ((currYBlockNum)%((picSizeX)>>3))
+
+/* row of an 8x8 Y block */
+#define yPosYBlock(currYBlockNum,picSizeX) ((currYBlockNum)/((picSizeX)>>3))
+
+/* column of a macroblock; (picSizeX)>>4 is the number of 16-wide columns */
+#define xPosMB(currMBNum,picSizeX) ((currMBNum)%((picSizeX)>>4))
+
+/* row of a macroblock */
+#define yPosMB(currMBNum,picSizeX) ((currMBNum)/((picSizeX)>>4))
+
+/* Y block number of sub-block comp (bit 1 = row, bit 0 = column) of the MB at (x, y) */
+#define MBxy2YBlock(currXPos,currYPos,comp,picSizeX) ((((currYPos)<<1)+((comp)>>1))*((picSizeX)>>3)+((currXPos)<<1)+((comp)&1))
+
+/* same, starting from a macroblock number instead of MB coordinates */
+#define MBNum2YBlock(currMBNum,comp,picSizeX) MBxy2YBlock(xPosMB((currMBNum),(picSizeX)),yPosMB((currMBNum),(picSizeX)),(comp),(picSizeX))
+
+
+/*
+* typedefs
+*/
+
+/* segment data structure: one entry of ercVariables_t.segments (described there as slice level information) */
+typedef struct ercSegment_s
+{
+  int      startMBPos;   /* presumably the first MB number of the segment - TODO confirm */
+  int      endMBPos;     /* presumably the last MB number of the segment - TODO confirm */
+  int      fCorrupted;   /* flag; going by the name, non-zero marks the segment as corrupted */
+} ercSegment_t;
+
+/* Error detector & concealment instance data structure (returned by ercOpen, passed to the other erc* calls) */
+typedef struct ercVariables_s
+{
+  /*  Number of macroblocks (size or size/4 of the arrays) */
+  int   nOfMBs;
+  /* Number of segments (slices) in frame */
+  int     nOfSegments;
+
+  /*  Array for conditions of Y blocks */
+  int     *yCondition;
+  /*  Array for conditions of U blocks */
+  int     *uCondition;
+  /*  Array for conditions of V blocks */
+  int     *vCondition;
+
+  /* Array for Slice level information */
+  ercSegment_t *segments;
+  int     currSegment;    /* presumably an index into segments - TODO confirm */
+
+  /* Conditions of the MBs of the previous frame */
+  int   *prevFrameYCondition;
+
+  /* Flag telling if the current segment was found to be corrupted */
+  int   currSegmentCorrupted;
+  /* Counter for corrupted segments per picture */
+  int   nOfCorruptedSegments;
+
+  /* State variables for error detector and concealer */
+  int   concealment;      /* presumably the value given to ercSetErrorConcealment - TODO confirm */
+
+} ercVariables_t;
+
+/*
+* External function interface
+*/
+
+void ercInit(VideoParameters *p_Vid, int pic_sizex, int pic_sizey, int flag);
+ercVariables_t *ercOpen( void );
+void ercReset( ercVariables_t *errorVar, int nOfMBs, int numOfSegments, int picSizeX );
+void ercClose( VideoParameters *p_Vid, ercVariables_t *errorVar );
+void ercSetErrorConcealment( ercVariables_t *errorVar, int value );
+
+void ercStartSegment( int currMBNum, int segment, unsigned int bitPos, ercVariables_t *errorVar );
+void ercStopSegment( int currMBNum, int segment, unsigned int bitPos, ercVariables_t *errorVar );
+void ercMarkCurrSegmentLost(int picSizeX, ercVariables_t *errorVar );
+void ercMarkCurrSegmentOK(int picSizeX, ercVariables_t *errorVar );
+void ercMarkCurrMBConcealed( int currMBNum, int comp, int picSizeX, ercVariables_t *errorVar );
+
+int ercConcealIntraFrame( VideoParameters *p_Vid, frame *recfr, int picSizeX, int picSizeY, ercVariables_t *errorVar );
+int ercConcealInterFrame( frame *recfr, objectBuffer_t *object_list,
+                          int picSizeX, int picSizeY, ercVariables_t *errorVar, int chroma_format_idc );
+
+
+/* Thomson APIs for concealing entire frame loss */
+
+#include "mbuffer.h"
+#include "output.h"
+
+struct concealment_node {              /* singly linked list node (see next) */
+    StorablePicture* picture;          /* picture carried by this node */
+    int  missingpocs;                  /* presumably the POC of a missing picture - TODO confirm */
+    struct concealment_node *next;     /* following node in the list */
+};
+
+extern struct concealment_node * init_node(StorablePicture* , int );
+extern void print_node( struct concealment_node * );
+extern void print_list( struct concealment_node * );
+extern void init_lists_for_non_reference_loss(VideoParameters *p_Vid, int , PictureStructure );
+
+extern void conceal_non_ref_pics(VideoParameters *p_Vid, int diff);
+extern void conceal_lost_frames(VideoParameters *p_Vid);
+
+extern void sliding_window_poc_management(DecodedPictureBuffer *p_Dpb, StorablePicture *p);
+
+extern void write_lost_non_ref_pic(VideoParameters *p_Vid, int poc);
+extern void write_lost_ref_after_idr(VideoParameters *p_Vid, int pos);
+
+extern int comp(const void *, const void *);
+
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/erc_do.h b/Src/h264dec/ldecod/inc/erc_do.h
new file mode 100644
index 00000000..9879222f
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/erc_do.h
@@ -0,0 +1,44 @@
+
+/*!
+ ************************************************************************
+ * \file  erc_do.h
+ *
+ * \brief
+ *      Header for the I & P frame error concealment common functions
+ *
+ * \author
+ *      - Viktor Varsa                     <viktor.varsa@nokia.com>
+ *      - Ye-Kui Wang                   <wyk@ieee.org>
+ *
+ ************************************************************************
+ */
+
+#ifndef _ERC_DO_H_
+#define _ERC_DO_H_
+
+
+#include "erc_api.h"
+
+void ercPixConcealIMB    (VideoParameters *p_Vid, imgpel *currFrame, int row, int column, int predBlocks[], int frameWidth, int mbWidthInBlocks);
+
+int ercCollect8PredBlocks( int predBlocks[], int currRow, int currColumn, int *condition,
+                          int maxRow, int maxColumn, int step, byte fNoCornerNeigh );
+int ercCollectColumnBlocks( int predBlocks[], int currRow, int currColumn, int *condition, int maxRow, int maxColumn, int step );
+/* object_list helpers: four entries per macroblock, hence (currMBNum)<<2; object_list is parenthesized so any pointer expression can be passed */
+#define isSplitted(object_list,currMBNum) \
+    (((object_list)+((currMBNum)<<2))->regionMode >= REGMODE_SPLITTED)
+
+/* this can be used as isBlock(...,INTRA) or isBlock(...,INTER_COPY); regMode is pasted onto REGMODE_ */
+#define isBlock(object_list,currMBNum,comp,regMode) \
+    (isSplitted(object_list,currMBNum) ? \
+     (((object_list)+((currMBNum)<<2)+(comp))->regionMode == REGMODE_##regMode##_8x8) : \
+     (((object_list)+((currMBNum)<<2))->regionMode == REGMODE_##regMode))
+
+/* this can be used as getParam(...,mv) or getParam(...,xMin) or getParam(...,yMin); comp is only used for split MBs */
+#define getParam(object_list,currMBNum,comp,param) \
+    (isSplitted(object_list,currMBNum) ? \
+     (((object_list)+((currMBNum)<<2)+(comp))->param) : \
+     (((object_list)+((currMBNum)<<2))->param))
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/erc_globals.h b/Src/h264dec/ldecod/inc/erc_globals.h
new file mode 100644
index 00000000..63ba4e2e
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/erc_globals.h
@@ -0,0 +1,52 @@
+
+/*!
+ ************************************************************************
+ * \file erc_globals.h
+ *
+ * \brief
+ *      global header file for error concealment module
+ *
+ * \author
+ *      - Viktor Varsa                     <viktor.varsa@nokia.com>
+ *      - Ye-Kui Wang                   <wyk@ieee.org>
+ ************************************************************************
+ */
+
+#ifndef _ERC_GLOBALS_H_
+#define _ERC_GLOBALS_H_
+
+#include "defines.h"
+
+/* "block" means an 8x8 pixel area */
+
+/* Region modes */
+#define REGMODE_INTER_COPY       0  //!< Copy region
+#define REGMODE_INTER_PRED       1  //!< Inter region with motion vectors
+#define REGMODE_INTRA            2  //!< Intra region
+#define REGMODE_SPLITTED         3  //!< Any region mode at or above this value counts as split (isSplitted in erc_do.h
+                                    //!< tests >=), which means the region is an 8x8 block
+#define REGMODE_INTER_COPY_8x8   4  //!< the _8x8 names are formed by token pasting in isBlock (erc_do.h)
+#define REGMODE_INTER_PRED_8x8   5
+#define REGMODE_INTRA_8x8        6
+
+//! YUV pixel domain image arrays for a video frame, plus the decoder instance they belong to
+typedef struct frame_s
+{
+  VideoParameters *p_Vid;  //!< decoder instance
+  imgpel *yptr;            //!< Y samples (layout not visible here)
+  imgpel *uptr;            //!< U samples
+  imgpel *vptr;            //!< V samples
+} frame;
+
+//! region structure stores information about a region that is needed for concealment (indexed four per MB by the erc_do.h macros)
+typedef struct object_buffer
+{
+  byte regionMode;  //!< region mode as above (one of the REGMODE_* values)
+  int xMin;         //!< X coordinate of the pixel position of the top-left corner of the region
+  int yMin;         //!< Y coordinate of the pixel position of the top-left corner of the region
+  short mv[3];        //!< motion vectors in 1/4 pixel units: mvx = mv[0], mvy = mv[1],
+                    //!< and ref_frame = mv[2]
+} objectBuffer_t;
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/errorconcealment.h b/Src/h264dec/ldecod/inc/errorconcealment.h
new file mode 100644
index 00000000..36650e25
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/errorconcealment.h
@@ -0,0 +1,20 @@
+
+
+/*!
+ ****************************************************************************
+ * \file errorconcealment.h
+ *
+ * \brief
+ *    Header file for errorconcealment.c
+ *
+ ****************************************************************************
+ */
+
+#ifndef _ERRORCONCEALMENT_H_
+#define _ERRORCONCEALMENT_H_
+
+extern int set_ec_flag(VideoParameters *p_Vid, int se);
+extern void reset_ec_flags(VideoParameters *p_Vid);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/fmo.h b/Src/h264dec/ldecod/inc/fmo.h
new file mode 100644
index 00000000..df749bba
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/fmo.h
@@ -0,0 +1,30 @@
+
+/*!
+ ***************************************************************************
+ *
+ * \file fmo.h
+ *
+ * \brief
+ *    Support for Flexible Macroblock Ordering (FMO)
+ *
+ * \date
+ *    19 June, 2002
+ *
+ * \author
+ *    Stephan Wenger   stewe@cs.tu-berlin.de
+ **************************************************************************/
+
+#ifndef _FMO_H_
+#define _FMO_H_
+
+
+extern int fmo_init (VideoParameters *p_Vid);
+extern int FmoFinit (VideoParameters *p_Vid);
+
+extern int FmoGetNumberOfSliceGroup(VideoParameters *p_Vid);
+extern int FmoGetLastMBOfPicture   (VideoParameters *p_Vid);
+extern int FmoGetLastMBInSliceGroup(VideoParameters *p_Vid, int SliceGroup);
+extern int FmoGetSliceGroupId      (VideoParameters *p_Vid, int mb);
+extern int FmoGetNextMBNr          (VideoParameters *p_Vid, int CurrentMbNr);
+
+#endif
diff --git a/Src/h264dec/ldecod/inc/global.h b/Src/h264dec/ldecod/inc/global.h
new file mode 100644
index 00000000..6d2677e6
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/global.h
@@ -0,0 +1,1230 @@
+
+/*!
+ ************************************************************************
+ *  \file
+ *     global.h
+ *  \brief
+ *     global definitions for H.264 decoder.
+ *  \author
+ *     Copyright (C) 1999  Telenor Satellite Services,Norway
+ *                         Ericsson Radio Systems, Sweden
+ *
+ *     Inge Lille-Langoy               <inge.lille-langoy@telenor.com>
+ *
+ *     Telenor Satellite Services
+ *     Keysers gt.13                       tel.:   +47 23 13 86 98
+ *     N-0130 Oslo,Norway                  fax.:   +47 22 77 79 80
+ *
+ *     Rickard Sjoberg                 <rickard.sjoberg@era.ericsson.se>
+ *
+ *     Ericsson Radio Systems
+ *     KI/ERA/T/VV
+ *     164 80 Stockholm, Sweden
+ *
+ ************************************************************************
+ */
+#ifndef _GLOBAL_H_
+#define _GLOBAL_H_
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/timeb.h>
+
+#include <bfc/platform/types.h>
+#include "win32.h"
+#include "defines.h"
+#include "ifunctions.h"
+#include "parsetcommon.h"
+#include "types.h"
+#include "frame.h"
+#include "nalucommon.h"
+#include "memcache.h"
+#include <mmintrin.h>
+#ifdef H264_IPP
+//#include "../tools/staticlib/ipp_px.h"
+#include "ippdefs.h"
+#include "ippcore.h"
+#include "ipps.h"
+#include "ippi.h"
+#include "ippvc.h"
+#endif
+/* benski> not the best place for this but it works for now */
+#ifdef _M_IX86
+// NOTE(review): this note used to read "must be a multiple of 16"; the loops below store 32 bytes per pass, so a count that is not a multiple of 32 is rounded up
+#pragma warning(disable: 4799) // 4799 is the MSVC "function has no EMMS instruction" warning; memset_fast_end() below calls _mm_empty()
+static inline void memzero_cache32(void *dst, unsigned long i) // zero i bytes at dst, 32 per pass, with ordinary (cached) MMX stores
+{
+	
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+loopwrite: // the body runs before the count is tested, so at least 32 bytes are always written
+
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		lea edi, [edi+32]
+		sub i, 32 // the parameter itself is decremented in place
+		jg loopwrite // signed test: continue while the remaining count is above zero
+
+	}
+}
+
+static inline void memzero_fast32(void *dst, unsigned long i) // like memzero_cache32 but with movntq stores; no sfence or EMMS is issued here
+{
+	
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+loopwrite: // the body runs before the count is tested, so at least 32 bytes are always written
+
+		movntq 0[edi], mm0
+		movntq 8[edi], mm0
+		movntq 16[edi], mm0
+		movntq 24[edi], mm0
+
+		lea edi, [edi+32]
+		sub i, 32 // the parameter itself is decremented in place
+		jg loopwrite // signed test: continue while the remaining count is above zero
+
+	}
+}
+
+static inline void memzero64(void *dst) // zero 64 bytes at dst with eight 8-byte MMX stores; no EMMS here (see memset_fast_end)
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		movq 32[edi], mm0
+		movq 40[edi], mm0
+		movq 48[edi], mm0
+		movq 56[edi], mm0
+	}
+}
+
+static inline void memzero128(void *dst) // zero 128 bytes at dst with sixteen 8-byte MMX stores; no EMMS here (see memset_fast_end)
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		movq 32[edi], mm0
+		movq 40[edi], mm0
+		movq 48[edi], mm0
+		movq 56[edi], mm0
+		movq 64[edi], mm0
+		movq 72[edi], mm0
+		movq 80[edi], mm0
+		movq 88[edi], mm0
+		movq 96[edi], mm0
+		movq 104[edi], mm0
+		movq 112[edi], mm0
+		movq 120[edi], mm0
+	}
+}
+
+static inline void memzero24(void *dst) // zero 24 bytes at dst with three 8-byte MMX stores; no EMMS here (see memset_fast_end)
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+	}
+}
+
+static inline void memzero48(void *dst) // zero 48 bytes at dst with six 8-byte MMX stores; no EMMS here (see memset_fast_end)
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		movq 32[edi], mm0
+		movq 40[edi], mm0
+	}
+}
+
+static inline void memzero16(void *dst) // zero 16 bytes at dst with two 8-byte MMX stores; no EMMS here (see memset_fast_end)
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+	}
+}
+
+static inline void memzero8(void *dst) // zero 8 bytes at dst with one MMX store; no EMMS here (see memset_fast_end)
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0
+	}
+}
+
+static inline void memset_fast_end(void) // call after the MMX helpers in this header, none of which issues EMMS itself
+{
+	_mm_empty(); // intrinsic from <mmintrin.h>, which this header includes
+}
+
+// Very optimized memcpy() routine for all AMD Athlon and Duron family.
+// This code uses any of FOUR different basic copy methods, depending
+// on the transfer size.
+// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
+// "Streaming Store"), and also uses the software prefetchnta instructions,
+// be sure you are running on Athlon/Duron or other recent CPU before calling!
+
+#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
+// The smallest copy uses the X86 "movsd" instruction, in an optimized
+// form which is an "unrolled loop".
+
+#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch (not parenthesized: memcpy_amd uses IN_CACHE_COPY/64, i.e. 64 * 1024/64)
+// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
+// also using the "unrolled loop" optimization.   This code uses
+// the software prefetch instruction to get the data into the cache.
+
+#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch (not parenthesized: memcpy_amd uses UNCACHED_COPY/64)
+// For larger blocks, which will spill beyond the cache, it is faster to
+// use the Streaming Store instruction MOVNTQ.   This write instruction
+// bypasses the cache and writes straight to main memory.  This code also
+// uses the software prefetch instruction to pre-read the data.
+// USE 64 * 1024 FOR THIS VALUE IF YOU ARE ALWAYS FILLING A "CLEAN CACHE"
+
+#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch (placeholder; memcpy_amd below does not reference it)
+#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch; assembler hex literal, so usable only inside __asm
+// For the largest size blocks, a special technique called Block Prefetch
+// can be used to accelerate the read operations.   Block Prefetch reads
+// one address per cache line, for a series of cache lines, in a short loop.
+// This is faster than using software prefetch.  The technique is great for
+// getting maximum read bandwidth, especially in DDR memory systems.
+
+// Inline assembly syntax for use with Visual C++
+
+
+static void * memcpy_amd(void *dest, const void *src, size_t n) // copy n bytes from src to dest with MMX, choosing a strategy by size; the result (dest) is left in EAX
+{
+  __asm {
+
+	mov		ecx, [n]		// number of bytes to copy
+	mov		edi, [dest]		// destination
+	mov		esi, [src]		// source
+	mov		ebx, ecx		// keep a copy of count
+
+	cld
+	cmp		ecx, TINY_BLOCK_COPY
+	jb		$memcpy_ic_3	// tiny? skip mmx copy
+
+	cmp		ecx, 32*1024		// dont align between 32k-64k because
+	jbe		$memcpy_do_align	//  it appears to be slower
+	cmp		ecx, 64*1024
+	jbe		$memcpy_align_done
+$memcpy_do_align:
+	mov		ecx, 8			// a trick thats faster than rep movsb...
+	sub		ecx, edi		// align destination to qword
+	and		ecx, 111b		// get the low bits
+	sub		ebx, ecx		// update copy count
+	neg		ecx				// set up to jump into the array
+	add		ecx, offset $memcpy_align_done
+	jmp		ecx				// jump to array of movsbs
+
+align 4
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+
+$memcpy_align_done:			// destination is qword aligned if the alignment step above ran (it is skipped for 32k-64k)
+	mov		ecx, ebx		// number of bytes left to copy
+	shr		ecx, 6			// get 64-byte block count
+	jz		$memcpy_ic_2	// finish the last few bytes
+
+	cmp		ecx, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
+	jae		$memcpy_uc_test
+
+// This is small block copy that uses the MMX registers to copy 8 bytes
+// at a time.  It uses the "unrolled loop" optimization, and also uses
+// the software prefetch instruction to get the data into the cache.
+align 16
+$memcpy_ic_1:			// 64-byte block copies, in-cache copy
+
+	prefetchnta [esi + (200*64/34+192)]		// start reading ahead
+
+	movq	mm0, [esi+0]	// read 64 bits
+	movq	mm1, [esi+8]
+	movq	[edi+0], mm0	// write 64 bits
+	movq	[edi+8], mm1	//    note:  the normal movq writes the
+	movq	mm2, [esi+16]	//    data to cache// a cache line will be
+	movq	mm3, [esi+24]	//    allocated as needed, to store the data
+	movq	[edi+16], mm2
+	movq	[edi+24], mm3
+	movq	mm0, [esi+32]
+	movq	mm1, [esi+40]
+	movq	[edi+32], mm0
+	movq	[edi+40], mm1
+	movq	mm2, [esi+48]
+	movq	mm3, [esi+56]
+	movq	[edi+48], mm2
+	movq	[edi+56], mm3
+
+	add		esi, 64			// update source pointer
+	add		edi, 64			// update destination pointer
+	dec		ecx				// count down
+	jnz		$memcpy_ic_1	// last 64-byte block?
+
+$memcpy_ic_2:
+	mov		ecx, ebx		// has valid low 6 bits of the byte count
+$memcpy_ic_3:
+	shr		ecx, 2			// dword count
+	and		ecx, 1111b		// only look at the "remainder" bits
+	neg		ecx				// set up to jump into the array
+	add		ecx, offset $memcpy_last_few
+	jmp		ecx				// jump to array of movsds
+
+$memcpy_uc_test:
+	cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
+	jae		$memcpy_bp_1
+
+$memcpy_64_test:
+	or		ecx, ecx		// tail end of block prefetch will jump here
+	jz		$memcpy_ic_2	// no more 64-byte blocks left
+
+// For larger blocks, which will spill beyond the cache, its faster to
+// use the Streaming Store instruction MOVNTQ.   This write instruction
+// bypasses the cache and writes straight to main memory.  This code also
+// uses the software prefetch instruction to pre-read the data.
+align 16
+$memcpy_uc_1:				// 64-byte blocks, uncached copy
+
+	prefetchnta [esi + (200*64/34+192)]		// start reading ahead
+
+	movq	mm0,[esi+0]		// read 64 bits
+	add		edi,64			// update destination pointer
+	movq	mm1,[esi+8]
+	add		esi,64			// update source pointer
+	movq	mm2,[esi-48]
+	movntq	[edi-64], mm0	// write 64 bits, bypassing the cache
+	movq	mm0,[esi-40]	//    note: movntq also prevents the CPU
+	movntq	[edi-56], mm1	//    from READING the destination address
+	movq	mm1,[esi-32]	//    into the cache, only to be over-written
+	movntq	[edi-48], mm2	//    so that also helps performance
+	movq	mm2,[esi-24]
+	movntq	[edi-40], mm0
+	movq	mm0,[esi-16]
+	movntq	[edi-32], mm1
+	movq	mm1,[esi-8]
+	movntq	[edi-24], mm2
+	movntq	[edi-16], mm0
+	dec		ecx
+	movntq	[edi-8], mm1
+	jnz		$memcpy_uc_1	// last 64-byte block?
+
+	jmp		$memcpy_ic_2		// almost done
+
+// For the largest size blocks, a special technique called Block Prefetch
+// can be used to accelerate the read operations.   Block Prefetch reads
+// one address per cache line, for a series of cache lines, in a short loop.
+// This is faster than using software prefetch, in this case.
+// The technique is great for getting maximum read bandwidth,
+// especially in DDR memory systems.
+$memcpy_bp_1:			// large blocks, block prefetch copy
+
+	cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
+	jl		$memcpy_64_test			// no, back to regular uncached copy
+
+	mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
+	add		esi, CACHEBLOCK * 64	// move to the top of the block
+align 16
+$memcpy_bp_2:
+	mov		edx, [esi-64]		// grab one address per cache line
+	mov		edx, [esi-128]		// grab one address per cache line
+	sub		esi, 128			// go reverse order
+	dec		eax					// count down the cache lines
+	jnz		$memcpy_bp_2		// keep grabbing more lines into cache
+
+	mov		eax, CACHEBLOCK		// now that its in cache, do the copy
+align 16
+$memcpy_bp_3:
+	movq	mm0, [esi   ]		// read 64 bits
+	movq	mm1, [esi+ 8]
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	mm4, [esi+32]
+	movq	mm5, [esi+40]
+	movq	mm6, [esi+48]
+	movq	mm7, [esi+56]
+	add		esi, 64				// update source pointer
+	movntq	[edi   ], mm0		// write 64 bits, bypassing cache
+	movntq	[edi+ 8], mm1		//    note: movntq also prevents the CPU
+	movntq	[edi+16], mm2		//    from READING the destination address 
+	movntq	[edi+24], mm3		//    into the cache, only to be over-written,
+	movntq	[edi+32], mm4		//    so that also helps performance
+	movntq	[edi+40], mm5
+	movntq	[edi+48], mm6
+	movntq	[edi+56], mm7
+	add		edi, 64				// update dest pointer
+
+	dec		eax					// count down
+
+	jnz		$memcpy_bp_3		// keep copying
+	sub		ecx, CACHEBLOCK		// update the 64-byte block count
+	jmp		$memcpy_bp_1		// keep processing chunks
+
+// The smallest copy uses the X86 "movsd" instruction, in an optimized
+// form which is an "unrolled loop".   Then it handles the last few bytes.
+align 4
+	movsd
+	movsd			// perform last 1-15 dword copies
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd			// perform last 1-7 dword copies
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+
+$memcpy_last_few:		// dword aligned from before movsds
+	mov		ecx, ebx	// has valid low 2 bits of the byte count
+	and		ecx, 11b	// the last few cows must come home
+	jz		$memcpy_final	// no more, lets leave
+	rep		movsb		// the last 1, 2, or 3 bytes
+
+$memcpy_final: 
+//	emms				// clean up the MMX state (disabled here, so the MMX state is left as-is; see memset_fast_end)
+	sfence				// flush the write buffer
+	mov		eax, [dest]	// ret value = destination pointer (left in EAX; there is no C return statement)
+
+    }
+}
+
+#elif defined(_M_X64)
+static inline void memzero24(void *dst)
+{
+	/* Zero exactly 24 bytes at dst.
+	 * The previous loop used a byte offset (j = 0, 4, ..., 20) as an index
+	 * into an int32_t array, so it stored one word every 16 bytes: bytes
+	 * 4-15 and 20-23 stayed untouched and writes reached byte 83.
+	 * memset comes from <string.h>, which this header already includes. */
+	memset(dst, 0, 24);
+}
+static inline void memset_fast_end(void) {} /* no-op: this branch contains no MMX code, so there is no state to reset */
+#else
+static inline void memzero_fast16(void *dst,  unsigned long i)
+{
+	/* Zero i bytes at dst (i is taken to be a byte count, as in memzero_cache32).
+	 * The previous loop compared a signed j with the unsigned i and used
+	 * the byte offset j as an int32_t index, so it stored one word every
+	 * 16 bytes and wrote far past the end (to roughly byte 4*i).
+	 * memset comes from <string.h>, which this header already includes. */
+	memset(dst, 0, i);
+}
+static inline void memzero24(void *dst)
+{
+	/* Zero exactly 24 bytes at dst.
+	 * The previous loop used a byte offset (j = 0, 4, ..., 20) as an index
+	 * into an int32_t array, so it stored one word every 16 bytes: bytes
+	 * 4-15 and 20-23 stayed untouched and writes reached byte 83.
+	 * memset comes from <string.h>, which this header already includes. */
+	memset(dst, 0, 24);
+}
+static inline void memset_fast_end(void) {} /* no-op: this branch contains no MMX code, so there is no state to reset */
+#endif
+
+#define UNDEFINED_REFERENCE ((int)0x80000000) // bit pattern 0x80000000 converted to int; presumably the "no reference" value for h264_ref_t - TODO confirm
+typedef int32_t h264_ref_t; // type of the ref_pic_id / ref_id fields below
+
+#define ET_SIZE 300      //!< size of error text buffer
+extern char errortext[ET_SIZE]; //!< buffer for error message for exit with error()
+extern int sse2_flag, mmx_flag, sse_flag, sse3_flag, sse4_1_flag; // defined in dec_api.c and set by H264_Init()
+/***********************************************************************
+ * T y p e    d e f i n i t i o n s    f o r    J M
+ ***********************************************************************
+ */
+
+typedef enum
+{
+  LumaComp,   /* = 0 */
+  CrComp,     /* = 1 */
+  CbComp      /* = 2 */
+} Color_Component;
+
+/***********************************************************************
+ * D a t a    t y p e s   f o r  C A B A C
+ ***********************************************************************
+ */
+
+typedef struct pix_pos  /* a pixel position plus the macroblock it falls in; field notes are name-based - TODO confirm */
+{
+  int   available;  /* presumably non-zero when the position can be used */
+  int   mb_addr;    /* presumably the address of the containing macroblock */
+  short x;          /* x, y: presumably coordinates relative to that macroblock */
+  short y;
+  short pos_x;      /* pos_x, pos_y: presumably coordinates within the picture */
+  short pos_y;
+} PixelPos;
+
+//! struct to characterize the state of the arithmetic coding engine (field notes are name-based - TODO confirm)
+typedef struct
+{
+  unsigned int    Drange;          //!< presumably the current coding interval range
+  unsigned int    Dvalue;          //!< presumably the current value read from the stream
+  int             DbitsLeft;       //!< presumably the number of bits still available in Dvalue
+  byte            *Dcodestrm;      //!< byte stream being decoded
+  int             *Dcodestrm_len;  //!< pointer to an int kept outside this struct; presumably the read position
+} DecodingEnvironment;
+
+typedef DecodingEnvironment *DecodingEnvironmentPtr;
+
+typedef short MotionVector[2]; //!< two shorts; presumably [0] = x and [1] = y - TODO confirm
+
+//! definition of motion parameters (one entry of the MotionParams.motion arrays)
+typedef struct pic_motion
+{
+	h264_ref_t ref_pic_id; //!< reference picture identifier
+	h264_ref_t ref_id;     //!< reference picture identifier (difference from ref_pic_id is not visible here)
+	MotionVector mv;       //!< motion vector
+	char ref_idx;          //!< reference index; plain char, so its signedness depends on the compiler
+} PicMotion;
+
+// TODO: benski> might be more efficient to make a [list][subblock_y][subblock_x] array of these values instead of parallel arrays
+typedef struct motion_params
+{
+	PicMotion **motion[2];     // one 2-D PicMotion array per list (two lists)
+  byte **     moving_block;  // 2-D byte array; presumably indexed like motion[list] - TODO confirm
+} MotionParams;
+
+//! struct for context management; the TextureInfoContexts offsets checked in dec_api.c depend on this layout
+typedef struct
+{
+  uint16_t state;         // index into state-table CP
+  unsigned char  MPS;           // 0/1 CP; NOTE(review): this comment used to say "Least Probable Symbol", but the field name suggests the most probable symbol - confirm
+  unsigned char dummy;          // for alignment
+} BiContextType;
+
+typedef BiContextType *BiContextTypePtr;
+
+
+/**********************************************************************
+ * C O N T E X T S   F O R   T M L   S Y N T A X   E L E M E N T S
+ **********************************************************************
+ */
+
+#define NUM_MB_TYPE_CTX  11  // the NUM_*_CTX values are the array lengths used in the context structs below
+#define NUM_B8_TYPE_CTX  9
+#define NUM_MV_RES_CTX   10
+#define NUM_REF_NO_CTX   6
+#define NUM_DELTA_QP_CTX 4
+#define NUM_MB_AFF_CTX 4
+#define NUM_TRANSFORM_SIZE_CTX 3  // used by TextureInfoContexts, not by MotionInfoContexts
+
+// structures that will be declared somewhere else
+struct storable_picture;
+struct datapartition;
+struct syntaxelement;
+
+typedef struct  // context models grouped by syntax element; the leading dimension of each 2-D array is not explained here
+{
+  BiContextType mb_type_contexts [3][NUM_MB_TYPE_CTX];
+  BiContextType b8_type_contexts [2][NUM_B8_TYPE_CTX];
+  BiContextType mv_res_contexts  [2][NUM_MV_RES_CTX];
+  BiContextType ref_no_contexts  [2][NUM_REF_NO_CTX];
+  BiContextType delta_qp_contexts[NUM_DELTA_QP_CTX];
+  BiContextType mb_aff_contexts  [NUM_MB_AFF_CTX];
+} MotionInfoContexts;
+
+#define NUM_IPR_CTX    2   // array lengths for TextureInfoContexts below
+#define NUM_CIPR_CTX   4
+#define NUM_CBP_CTX    4
+#define NUM_BCBP_CTX   4
+#define NUM_MAP_CTX   15
+#define NUM_LAST_CTX  15
+#define NUM_ONE_CTX    5
+#define NUM_ABS_CTX    5
+
+// Layout is fixed: H264_Init() in dec_api.c checks the offsets of map/last/one/abs_contexts with BUILD_BUG_ON, and biari.asm must match.
+typedef struct
+{
+  BiContextType  transform_size_contexts [NUM_TRANSFORM_SIZE_CTX];
+  BiContextType  ipr_contexts [NUM_IPR_CTX];
+  BiContextType  cipr_contexts[NUM_CIPR_CTX];
+  BiContextType  cbp_contexts [3][NUM_CBP_CTX];
+  BiContextType  bcbp_contexts[NUM_BLOCK_TYPES][NUM_BCBP_CTX];
+  BiContextType  map_contexts [2][NUM_BLOCK_TYPES][NUM_MAP_CTX+1]; // +1 for better alignment
+  BiContextType  last_contexts[2][NUM_BLOCK_TYPES][NUM_LAST_CTX+1]; // +1 for better alignment
+  BiContextType  one_contexts [NUM_BLOCK_TYPES][NUM_ONE_CTX];
+  BiContextType  abs_contexts [NUM_BLOCK_TYPES][NUM_ABS_CTX];
+} TextureInfoContexts;
+
+
+//*********************** end of data type definition for CABAC *******************
+
+/***********************************************************************
+ * N e w   D a t a    t y p e s   f o r    T M L
+ ***********************************************************************
+ */
+
+/*! Buffer structure for decoded reference picture marking commands; entries are chained through Next */
+typedef struct DecRefPicMarking_s
+{
+  int memory_management_control_operation;  //!< operation code of this command
+  int difference_of_pic_nums_minus1;        //!< operands; which ones apply to a given operation is not visible here
+  int long_term_pic_num;
+  int long_term_frame_idx;
+  int max_long_term_frame_idx_plus1;
+  struct DecRefPicMarking_s *Next;          //!< next command in the list
+} DecRefPicMarking_t;
+
+
+//! definition of pic motion parameters (a single entry; the bracketed indices below describe how the containing arrays are addressed)
+typedef struct pic_motion_params2
+{
+  h264_ref_t    ref_pic_id;    //!< reference picture identifier [list][subblock_y][subblock_x]
+  h264_ref_t    ref_id;        //!< reference picture identifier [list][subblock_y][subblock_x]
+  short    mv[2];         //!< motion vector       [list][subblock_x][subblock_y][component]
+  char     ref_idx;       //!< reference picture   [list][subblock_y][subblock_x]
+  byte     mb_field;      //!< field macroblock indicator
+  byte     field_frame;   //!< indicates if co_located is field or frame.
+} PicMotionParams2;
+
+//! Macroblock
+typedef struct macroblock  // per-macroblock decoding state; VideoParameters::mb_data is an array of these
+{  // Layout-sensitive: on x86 builds dec_api.c asserts offsetof(p_Slice)==0, offsetof(p_Vid)==4 and offsetof(qp)==60 (tied to biari.asm); do not insert or reorder members above qp
+  struct slice       *p_Slice;                    //!< pointer to the current slice
+  struct img_par     *p_Vid;                      //!< pointer to VideoParameters
+  struct inp_par     *p_Inp;                      //!< pointer to InputParameters
+  int                 mbAddrX;                    //!< current MB address
+  int mb_x;
+  int mb_y;
+  int block_x;
+  int block_y;
+  int block_y_aff;
+  int pix_x;
+  int pix_y;
+  int pix_c_x;
+  int pix_c_y;
+
+  int subblock_x;
+  int subblock_y;
+
+  int           qp;                    //!< QP luma
+  int           qpc[2];                //!< QP chroma
+  int           qp_scaled[MAX_PLANE];  //!< QP scaled for all comps.
+  Boolean       is_lossless;
+  Boolean       is_intra_block;
+  Boolean       is_v_block;
+
+  short         slice_nr;
+  short         delta_quant;          //!< for rate control
+
+  struct macroblock   *mb_up;   //!< pointer to neighboring MB (CABAC)
+  struct macroblock   *mb_left; //!< pointer to neighboring MB (CABAC)
+
+  // some storage of macroblock syntax elements for global access
+  int           mb_type;
+  short         mvd[2][BLOCK_MULTIPLE][BLOCK_MULTIPLE][2];      //!< indices correspond to [forw,backw][block_y][block_x][x,y]
+  int           cbp;
+  int64         cbp_blk     [3];
+  int64         cbp_bits    [3];
+  int64         cbp_bits_8x8[3];
+
+  int           i16mode;
+  char          b8mode[4];
+  char          b8pdir[4];
+  char          ei_flag;             //!< error indicator flag that enables concealment
+  char          dpl_flag;            //!< error indicator flag that signals a missing data partition
+  char          ipmode_DPCM;
+
+  short         DFDisableIdc;        // same three deblocking-filter field names also appear in Slice
+  short         DFAlphaC0Offset;
+  short         DFBetaOffset;
+
+  char          c_ipred_mode;       //!< chroma intra prediction mode
+  Boolean       mb_field;
+
+  int          skip_flag;
+
+  int mb_addr_left, mb_addr_up, mb_addr_upper_right, mb_addr_upper_left;
+  Boolean mb_avail_left, mb_avail_up, mb_avail_upper_right, mb_avail_upper_left;
+
+  Boolean       luma_transform_size_8x8_flag;
+  Boolean       NoMbPartLessThan8x8Flag;
+
+  void (*itrans_8x8)(struct macroblock *currMB, ColorPlane pl, int ioff, int joff);  // function-pointer hooks from here on; they are assigned outside this header
+
+  void (*GetMVPredictor) (struct macroblock *currMB, PixelPos *block, 
+    short pmv[2], short ref_frame, struct pic_motion **motion, int mb_x, int mb_y, int blockshape_x, int blockshape_y);
+
+  int (*read_and_store_CBP_block_bit) (struct macroblock *currMB, DecodingEnvironmentPtr  dep_dp, int type);
+  char (*readRefPictureIdx)   (struct syntaxelement *currSE, struct datapartition *dP, int list);
+
+} Macroblock;
+
+//! Syntaxelement
+typedef struct syntaxelement  // one decoded syntax element plus an optional CAVLC mapping callback
+{
+  int           value1;                //!< numerical value of syntax element
+  int           value2;                //!< for blocked symbols, e.g. run/level
+  int           len;                   //!< length of code
+  //int           inf;                   //!< info part of CAVLC code
+
+#if TRACE  // the trace buffer (and the TRACESTRING_SIZE macro) only exist when TRACE evaluates non-zero, so sizeof(SyntaxElement) depends on it
+  #define       TRACESTRING_SIZE 100           //!< size of trace string
+  char          tracestring[TRACESTRING_SIZE]; //!< trace string
+#endif
+
+  //! for mapping of CAVLC to syntaxElement
+  void  (*mapping)(int len, int info, int *value1, int *value2);
+} SyntaxElement;
+
+
+//! Bitstream
+typedef struct  // read cursor(s) and backing buffer for one coded bitstream; DataPartition::bitstream points at one
+{
+  // CABAC Decoding
+  int           read_len;           //!< actual position in the codebuffer, CABAC only
+  int           code_len;           //!< overall codebuffer length, CABAC only
+  // CAVLC Decoding
+  int           frame_bitoffset;    //!< actual position in the codebuffer, bit-oriented, CAVLC only
+  int           bitstream_length;   //!< overall codebuffer length, byte oriented, CAVLC only
+
+  byte          *streamBuffer;      //!< actual codebuffer for read bytes
+} Bitstream;
+
+
+/* === 4x4 block typedefs: each *_row_t is one row, each *_block_t is an array of such rows === */
+// 32 bit precision
+typedef int h264_int_block_row_t[BLOCK_SIZE];
+typedef h264_int_block_row_t h264_int_block_t[BLOCK_SIZE];
+// 16 bit precision
+typedef int16_t h264_short_block_row_t[BLOCK_SIZE];
+typedef h264_short_block_row_t h264_short_block_t[BLOCK_SIZE];
+// 8 bit precision (no imgpel-based 4x4 typedef is declared in this group)
+
+/* === 8x8 block typedefs === */
+// 32 bit precision
+typedef int h264_int_8x8block_row_t[BLOCK_SIZE_8x8];
+typedef h264_int_8x8block_row_t h264_int_8x8block_t[BLOCK_SIZE_8x8];
+// 16 bit precision
+typedef int16_t h264_short_8x8block_row_t[BLOCK_SIZE_8x8];
+typedef h264_short_8x8block_row_t h264_short_8x8block_t[BLOCK_SIZE_8x8];
+// 8 bit precision (element type is imgpel)
+typedef imgpel h264_imgpel_8x8block_row_t[BLOCK_SIZE_8x8];
+typedef h264_imgpel_8x8block_row_t h264_imgpel_8x8block_t[BLOCK_SIZE_8x8];
+
+/* === 16x16 block typedefs === */
+// 32 bit precision
+typedef int h264_int_macroblock_row_t[MB_BLOCK_SIZE];
+typedef h264_int_macroblock_row_t h264_int_macroblock_t[MB_BLOCK_SIZE];
+// 16 bit precision
+typedef int16_t h264_short_macroblock_row_t[MB_BLOCK_SIZE];
+typedef h264_short_macroblock_row_t h264_short_macroblock_t[MB_BLOCK_SIZE];
+// 8 bit precision (element type is imgpel)
+typedef imgpel h264_imgpel_macroblock_row_t[MB_BLOCK_SIZE];
+typedef h264_imgpel_macroblock_row_t h264_imgpel_macroblock_t[MB_BLOCK_SIZE];
+
+
+
+
+typedef int h264_pic_position[2];  // pair of ints; element type of VideoParameters::PicPos
+typedef byte h264_4x4_byte[BLOCK_SIZE][BLOCK_SIZE];  // BLOCK_SIZE x BLOCK_SIZE grid of bytes
+typedef h264_4x4_byte h264_nz_coefficient[3];  // three such grids; element type of VideoParameters::nz_coeff
+
+//! DataPartition
+typedef struct datapartition  // one data partition: a bitstream pointer plus an embedded CABAC decoding environment
+{
+
+  Bitstream           *bitstream;   //!< bitstream of this partition (pointer; ownership is not visible in this header)
+  DecodingEnvironment de_cabac;     //!< CABAC decoding environment, stored by value
+
+} DataPartition;
+
+//! Slice
+typedef struct slice  // per-slice decoding state: header values, CABAC contexts, working buffers and dispatch hooks
+{
+  struct img_par      *p_Vid;
+  struct inp_par      *p_Inp;
+  pic_parameter_set_rbsp_t *active_pps;
+  seq_parameter_set_rbsp_t *active_sps;
+
+  struct colocated_params *p_colocated;
+  struct colocated_params *Co_located_JV[MAX_PLANE];  //!< p_colocated to be used during 4:4:4 independent mode decoding
+
+  int                 mb_aff_frame_flag;
+  int                 direct_spatial_mv_pred_flag;       //!< Indicator for direct mode type (1 for Spatial, 0 for Temporal)
+  int                 num_ref_idx_l0_active;             //!< number of available list 0 references
+  int                 num_ref_idx_l1_active;             //!< number of available list 1 references
+
+  int                 qp;
+  int                 slice_qp_delta;
+  int                 qs;
+  int                 slice_qs_delta;
+  int                 slice_type;    //!< slice type
+  int                 model_number;  //!< cabac model number
+  PictureStructure    structure;     //!< Identify picture structure type
+  int                 start_mb_nr;   //!< MUST be set by NAL even in case of ei_flag == 1
+  int                 max_part_nr;
+  int                 dp_mode;       //!< data partitioning mode
+  int                 last_dquant;
+
+  //  int                 last_mb_nr;    //!< only valid when entropy coding == CABAC
+  DataPartition       *partArr;      //!< array of partitions
+  MotionInfoContexts  *mot_ctx;      //!< pointer to struct of context models for use in CABAC
+  TextureInfoContexts *tex_ctx;      //!< pointer to struct of context models for use in CABAC
+
+  int mvscale[6][MAX_REFERENCE_PICTURES];
+
+  int                 ref_pic_list_reordering_flag_l0;
+  int                 *reordering_of_pic_nums_idc_l0;
+  int                 *abs_diff_pic_num_minus1_l0;
+  int                 *long_term_pic_idx_l0;
+  int                 ref_pic_list_reordering_flag_l1;
+  int                 *reordering_of_pic_nums_idc_l1;
+  int                 *abs_diff_pic_num_minus1_l1;
+  int                 *long_term_pic_idx_l1;
+  
+
+  short               DFDisableIdc;     //!< Disable deblocking filter on slice
+  short               DFAlphaC0Offset;  //!< Alpha and C0 offset for filtering slice
+  short               DFBetaOffset;     //!< Beta offset for filtering slice
+
+  int                 pic_parameter_set_id;   //!<the ID of the picture parameter set the slice is referring to
+
+  int                 dpB_NotPresent;    //!< non-zero, if data partition B is lost
+  int                 dpC_NotPresent;    //!< non-zero, if data partition C is lost
+
+
+	__declspec(align(32)) h264_imgpel_macroblock_t mb_pred[MAX_PLANE];
+	__declspec(align(32)) h264_imgpel_macroblock_t mb_rec[MAX_PLANE];
+	__declspec(align(32)) union
+	{ // anonymous union: the four members below are alternative views of the same storage; __declspec(align) is MSVC syntax
+		__declspec(align(32)) h264_short_8x8block_t mb_rres8[MAX_PLANE][4];
+		__declspec(align(32)) h264_short_macroblock_t cof[MAX_PLANE];
+		__declspec(align(32)) h264_short_block_t cof4[MAX_PLANE][16]; // TODO: get this to work, one of these days
+		__declspec(align(32)) h264_short_macroblock_t ipcm[MAX_PLANE];
+	};
+
+  int cofu[16];
+
+  // Scaling matrix info
+  int  InvLevelScale4x4_Intra[3][6][4][4];
+  int  InvLevelScale4x4_Inter[3][6][4][4];
+  int  InvLevelScale8x8_Intra[3][6][64];
+  int  InvLevelScale8x8_Inter[3][6][64];
+
+  int  *qmatrix[12];
+
+  // Cabac
+	// TODO: we could optimize coefficient reading by storing the levels/runs instead of coefficients
+  int16_t  coeff[64]; // one more for EOB -- NOTE(review): the array holds exactly 64 entries; confirm whether the extra EOB slot is already included
+  int  coeff_ctr;
+  int  pos;
+
+  //weighted prediction
+  unsigned int apply_weights;
+  unsigned int luma_log2_weight_denom;
+  unsigned int chroma_log2_weight_denom;
+  int wp_weight[2][MAX_REFERENCE_PICTURES][3];  // weight in [list][index][component] order
+  int wp_offset[6][MAX_REFERENCE_PICTURES][3];  // offset in [list][index][component] order
+  int wbp_weight[6][MAX_REFERENCE_PICTURES][MAX_REFERENCE_PICTURES][3]; //weight in [list][fw_index][bw_index][component] order
+  int wp_round_luma;
+  int wp_round_chroma;
+
+  void (*read_CBP_and_coeffs_from_NAL) (Macroblock *currMB);  // per-slice dispatch hooks from here on; they are assigned outside this header
+  int  (*decode_one_component     ) (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, struct storable_picture *dec_picture);
+  int  (*readSlice                ) (struct img_par *, struct inp_par *);  
+  int  (*nal_startcode_follows    ) (struct slice*, int );
+  void (*read_motion_info_from_NAL) (Macroblock *currMB);
+  void (*read_one_macroblock      ) (Macroblock *currMB);
+  void (*interpret_mb_mode        ) (Macroblock *currMB);
+	void (*compute_colocated        ) (struct slice *currSlice, struct colocated_params *p, struct storable_picture **listX[6]);
+
+  void (*linfo_cbp_intra) (int len,int info,int *cbp, int *dummy);
+  void (*linfo_cbp_inter) (int len,int info,int *cbp, int *dummy);
+} Slice;
+
+//****************************** ~DM ***********************************
+
+// image parameters
+typedef struct img_par  // decoder-wide video state (typedef'd as VideoParameters); Macroblock and Slice both point back to it via p_Vid
+{
+  struct inp_par      *p_Inp;
+  pic_parameter_set_rbsp_t *active_pps;
+  seq_parameter_set_rbsp_t *active_sps;
+  seq_parameter_set_rbsp_t SeqParSet[MAXSPS];
+  pic_parameter_set_rbsp_t PicParSet[MAXPPS];
+
+  struct sei_params        *p_SEI;
+
+  struct old_slice_par *old_slice;
+  int number;                                 //!< frame number
+  unsigned int current_mb_nr; // bitstream order
+  unsigned int num_dec_mb;
+  short        current_slice_nr;
+  int *intra_block;
+  
+  int qp;                                     //!< quant for the current frame
+
+  int sp_switch;                              //!< 1 for switching sp, 0 for normal sp  
+  int type;                                   //!< image type INTER/INTRA
+  int width;
+  int height;
+  int width_cr;                               //!< width chroma  
+  int height_cr;                              //!< height chroma
+  int mb_x;
+  int mb_y;
+  int block_x;
+  int block_y;
+  int pix_c_x;
+  int pix_c_y;
+
+  int allrefzero;
+
+  byte **ipredmode;                  //!< prediction type [90][74]
+  h264_nz_coefficient *nz_coeff;
+  int **siblock;
+  int cod_counter;                   //!< Current count of number of skipped macroblocks in a row
+
+  int structure;                     //!< Identify picture structure type
+
+  Slice      *currentSlice;          //!< pointer to current Slice data struct
+  Macroblock *mb_data;               //!< array containing all MBs of a whole frame
+  Macroblock *mb_data_JV[MAX_PLANE]; //!< mb_data to be used for 4:4:4 independent mode
+  int colour_plane_id;               //!< colour_plane_id of the current coded slice
+  int ChromaArrayType;
+
+  // For MB level frame/field coding
+  int mb_aff_frame_flag;
+
+  // for signalling to the neighbour logic that this is a deblocker call
+  int DeblockCall;
+  byte mixedModeEdgeFlag;
+
+  // picture error concealment
+  // concealment_head points to first node in list, concealment_end points to
+  // last node in list. Initialize both to NULL, meaning no nodes in list yet
+  struct concealment_node *concealment_head;
+  struct concealment_node *concealment_end;
+
+  DecRefPicMarking_t *dec_ref_pic_marking_buffer;                    //!< stores the memory management control operations
+
+  int num_ref_idx_l0_active;             //!< number of forward reference
+  int num_ref_idx_l1_active;             //!< number of backward reference
+
+  int slice_group_change_cycle;
+
+  int redundant_pic_cnt;
+
+  unsigned int pre_frame_num;           //!< store the frame_num in the last decoded slice. For detecting gap in frame_num.
+  int non_conforming_stream;
+
+  // End JVT-D101
+  // POC200301: from unsigned int to int
+  int toppoc;      //poc for this top field // POC200301
+  int bottompoc;   //poc of bottom field of frame
+  int framepoc;    //poc of this frame // POC200301
+  unsigned int frame_num;   //frame_num for this frame
+  unsigned int field_pic_flag;
+  byte         bottom_field_flag;
+
+  //the following is for slice header syntax elements of poc
+  // for poc mode 0.
+  unsigned int pic_order_cnt_lsb;
+  int delta_pic_order_cnt_bottom;
+  // for poc mode 1.
+  int delta_pic_order_cnt[3];
+
+  // ////////////////////////
+  // for POC mode 0:
+  signed   int PrevPicOrderCntMsb;
+  unsigned int PrevPicOrderCntLsb;
+  signed   int PicOrderCntMsb;
+
+  // for POC mode 1:
+  unsigned int AbsFrameNum;
+  signed int ExpectedPicOrderCnt, PicOrderCntCycleCnt, FrameNumInPicOrderCntCycle;
+  unsigned int PreviousFrameNum, FrameNumOffset;
+  int ExpectedDeltaPerPicOrderCntCycle;
+  int PreviousPOC, ThisPOC;
+  int PreviousFrameNumOffset;
+  // /////////////////////////
+
+  int idr_flag;
+  int nal_reference_idc;                       //!< nal_reference_idc from NAL unit
+
+  int idr_pic_id;
+
+  int MaxFrameNum;
+
+  unsigned int PicWidthInMbs;
+  unsigned int PicHeightInMapUnits;
+  unsigned int FrameHeightInMbs;
+  unsigned int PicHeightInMbs;
+  unsigned int PicSizeInMbs;
+  unsigned int FrameSizeInMbs;
+  unsigned int oldFrameSizeInMbs;
+
+  int no_output_of_prior_pics_flag;
+  int long_term_reference_flag;
+  int adaptive_ref_pic_buffering_flag;
+
+  int last_has_mmco_5;
+  int last_pic_bottom_field;
+
+  // Fidelity Range Extensions Stuff
+  short bitdepth_luma;
+  short bitdepth_chroma;
+  int bitdepth_scale[2];
+  int bitdepth_luma_qp_scale;
+  int bitdepth_chroma_qp_scale;
+  unsigned int dc_pred_value_comp[MAX_PLANE]; //!< component value for DC prediction (depends on component pel bit depth)
+  int max_pel_value_comp[MAX_PLANE];       //!< max value that one picture element (pixel) can take (depends on pic_unit_bitdepth)
+  int Transform8x8Mode;
+  int profile_idc;
+  int yuv_format;
+  int lossless_qpprime_flag;
+  int num_blk8x8_uv;
+  int num_uv_blocks;
+  int num_cdc_coeff;
+  int mb_cr_size_x;
+  int mb_cr_size_y;
+  int mb_cr_size_x_blk;
+  int mb_cr_size_y_blk;
+  int mb_size[3][2];                         //!< component macroblock dimensions
+  int mb_size_blk[3][2];                     //!< component macroblock dimensions 
+  int mb_size_shift[3][2];
+  int subpel_x;
+  int subpel_y;
+  int shiftpel_x;
+  int shiftpel_y;
+
+  int max_vmv_r;                             //!< maximum vertical motion vector range in luma quarter frame pixel units for the current level_idc
+  int max_mb_vmv_r;                          //!< maximum vertical motion vector range in luma quarter pixel units for the current level_idc
+
+  // picture error concealment
+  int last_ref_pic_poc;
+  int ref_poc_gap;
+  int poc_gap;
+  int earlier_missing_poc;
+  unsigned int frame_to_conceal;
+  int IDR_concealment_flag;
+  int conceal_slice_type;
+
+  // random access point decoding
+  int recovery_point;
+  int recovery_point_found;
+  int recovery_frame_cnt;
+  int recovery_frame_num;
+  int recovery_poc;
+
+  int separate_colour_plane_flag;
+
+  int frame_number;
+  int init_bl_done;
+
+  // Redundant slices. Should be moved to another structure and allocated only if extended profile
+  unsigned int previous_frame_num; //!< frame number of previous slice
+  int ref_flag[17];                //!< 0: i-th previous frame is incorrect
+  //!< non-zero: i-th previous frame is correct
+  int Is_primary_correct;          //!< if primary frame is correct, 0: incorrect
+  int Is_redundant_correct;        //!< if redundant frame is correct, 0:incorrect
+  int redundant_slice_ref_idx;     //!< reference index of redundant slice
+
+  //FILE *p_log;                     //!< SNR file
+  int LastAccessUnitExists;
+  int NALUCount;
+
+  Boolean global_init_done;
+
+  int *qp_per_matrix;
+  int *qp_rem_matrix;
+
+  struct frame_store *last_out_fs;
+  int pocs_in_dpb[100];
+
+
+  struct storable_picture *dec_picture;
+  struct storable_picture *dec_picture_JV[MAX_PLANE];  //!< dec_picture to be used during 4:4:4 independent mode decoding
+  struct storable_picture *no_reference_picture; //!< dummy storable picture for recovery point
+  struct storable_picture **listX[6];
+
+  // Error parameters
+  struct object_buffer  *erc_object_list;
+  struct ercVariables_s *erc_errorVar;
+
+  int erc_mvperMB;
+  struct img_par *erc_img;
+  int ec_flag[SE_MAX_ELEMENTS];        //!< array to set errorconcealment
+
+	struct memory_input_struct *mem_input;
+
+  struct frame_store *out_buffer;
+
+  struct storable_picture *pending_output;
+  int    pending_output_state;
+  int    recovery_flag;
+
+  // dpb
+  struct decoded_picture_buffer *p_Dpb;
+
+  char listXsize[6];
+  // report
+  char cslice_type[9];  
+  // FMO
+  int *MbToSliceGroupMap;
+  int *MapUnitToSliceGroupMap;
+  int  NumberOfSliceGroups;    // the number of slice groups -1 (0 == scan order, 7 == maximum)
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  struct tone_mapping_struct_s *seiToneMapping;
+#endif
+
+	// benski> buffer of storable pictures ready for output.
+	// might be able to optimize a tad by making a ringbuffer, but i doubt it matters
+	struct storable_picture **out_pictures;
+	size_t size_out_pictures;  // presumably the allocated capacity of out_pictures - confirm against the allocator
+	size_t num_out_pictures;   // presumably the number of entries currently queued - confirm
+
+	ImageCache image_cache[2]; // [0] is luma [1] is chroma (shared for both planes)
+	MotionCache motion_cache;
+
+	h264_pic_position *PicPos; //! Helper array to access macroblock positions. 
+
+	NALU_t *nalu; // a cache so we don't re-alloc every time
+
+  void (*getNeighbour)        (const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);  // neighbour-lookup hooks follow; trailing comments give the (xN, yN) range each variant is declared for
+	void (*getNeighbourPX_NoPos)(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+	void (*getNeighbourXP_NoPos)(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+	void (*getNeighbourLuma)    (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourPXLuma)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourXPLuma)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourLeftLuma)(const Macroblock *currMB, PixelPos *pix);
+	void (*getNeighbourNXLuma)  (const Macroblock *currMB, int yN, PixelPos *pix); // xN<0, yN full range
+	void (*getNeighbourLeft)    (const Macroblock *currMB, const int mb_size[2], PixelPos *pix); // xN<0, yN=0
+	void (*getNeighbourUp)      (const Macroblock *currMB, const int mb_size[2], PixelPos *pix); // xN=0, yN<0
+	void (*getNeighbourNX)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN full range
+	void (*getNeighbourNP)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN>=0
+	void (*getNeighbourNPChromaNB)(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN>=0
+	void (*getNeighbour0X)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN=0, yN full range
+	void (*getNeighbour0XLuma)      (const Macroblock *currMB, int yN, PixelPos *pix); // xN=0, yN full range
+	void (*getNeighbourX0)      (const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix); // xN full range, yN = 0
+	void (*getNeighbourUpLuma)  (const Macroblock *currMB, PixelPos *pix); // xN=0, yN<0
+	void (*getNeighbourNPLumaNB)(const Macroblock *currMB, int yN, PixelPos *pix);
+	void (*getNeighbourPXLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourPXLumaNB_NoPos)(const Macroblock *currMB, int yN, PixelPos *pix);
+	void (*getNeighbourPPLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourXPLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourXPLumaNB_NoPos)(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+  void (*get_mb_block_pos) (const h264_pic_position *PicPos, int mb_addr, short *x, short *y);
+  void (*GetStrength)      (byte Strength[16], Macroblock *MbQ, int dir,int edge, int mvlimit, struct storable_picture *p);
+  void (*EdgeLoopLuma)     (ColorPlane pl, struct video_image *image, const byte Strength[16], Macroblock *MbQ, int dir, int edge, struct storable_picture *p);
+  void (*EdgeLoopChroma)   (struct video_image *image, byte Strength[16], Macroblock *MbQ, int dir, int edge, int uv, struct storable_picture *p);
+} VideoParameters;
+
+// input parameters from configuration file
+typedef struct inp_par  // decoder input/configuration parameters (typedef'd as InputParameters)
+{
+  int intra_profile_deblocking;               //!< Loop filter usage determined by flags and parameters in bitstream 
+
+  // Output sequence format related variables
+  FrameFormat output;                   //!< output related information
+
+#ifdef _LEAKYBUCKET_  // these members exist only when _LEAKYBUCKET_ is defined, so the struct layout depends on that macro
+  unsigned long R_decoder;                //!< Decoder Rate in HRD Model
+  unsigned long B_decoder;                //!< Decoder Buffer size in HRD model
+  unsigned long F_decoder;                //!< Decoder Initial buffer fullness in HRD model
+  char LeakyBucketParamFile[FILE_NAME_SIZE];         //!< LeakyBucketParamFile
+#endif
+
+  // picture error concealment (fields with the same names also exist in VideoParameters)
+  int ref_poc_gap;
+  int poc_gap;
+} InputParameters;
+
+typedef struct old_slice_par  // snapshot of slice-header values; passed to is_new_picture()/init_old_slice() declared in image.h
+{
+   unsigned field_pic_flag;   
+   unsigned frame_num;
+   int      nal_ref_idc;
+   unsigned pic_oder_cnt_lsb;            // sic: "oder"; the misspelled name is part of the interface, do not rename here
+   int      delta_pic_oder_cnt_bottom;   // sic: "oder"; see above
+   int      delta_pic_order_cnt[2];      // NOTE(review): VideoParameters declares delta_pic_order_cnt with 3 entries; confirm that the size difference is intended
+   byte     bottom_field_flag;
+   byte     idr_flag;
+   int      idr_pic_id;
+   int      pps_id;   
+} OldSliceParams;
+
+typedef struct decoder_params  // top-level decoder handle bundling both parameter blocks; alloc_decoder() in dec_api.c returns a pointer to one
+{
+  InputParameters   *p_Inp;          //!< Input Parameters
+  VideoParameters   *p_Vid;          //!< Image Parameters
+  
+} DecoderParams;
+
+#ifdef TRACE  // NOTE(review): SyntaxElement is guarded with "#if TRACE"; if TRACE is defined as 0 the two guards disagree - confirm which form is intended
+extern FILE *p_trace; //!< Trace file
+extern int bitcounter;
+#endif
+
+// prototypes (declarations only; the definitions live in other translation units)
+
+extern void error(char *text, int code);
+
+// dynamic mem allocation
+extern int  init_global_buffers(VideoParameters *p_Vid);
+extern void free_global_buffers(VideoParameters *p_Vid);
+
+extern int RBSPtoSODB(byte *streamBuffer, int last_byte_pos);
+extern int EBSPtoRBSP(byte *streamBuffer, int end_bytepos);
+
+void FreePartition (DataPartition *dp, int n);
+DataPartition *AllocPartition(int n);
+
+void tracebits(const char *trace_str,  int len,  int info,int value1);  // declared unconditionally, unlike the TRACE-guarded externs above
+void tracebits2(const char *trace_str, int len, int info);
+
+unsigned CeilLog2   ( unsigned uiVal);
+unsigned CeilLog2_sf( unsigned uiVal);
+
+// For 4:4:4 independent mode
+extern void change_plane_JV( VideoParameters *p_Vid, int nplane );
+extern void make_frame_picture_JV(VideoParameters *p_Vid);
+
+
+#endif
+
+
diff --git a/Src/h264dec/ldecod/inc/header.h b/Src/h264dec/ldecod/inc/header.h
new file mode 100644
index 00000000..f3185b07
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/header.h
@@ -0,0 +1,22 @@
+/*!
+ *************************************************************************************
+ * \file header.h
+ *
+ * \brief
+ *    Prototypes for header.c
+ *************************************************************************************
+ */
+
+#ifndef _HEADER_H_
+#define _HEADER_H_
+
+extern void FirstPartOfSliceHeader(Slice *currSlice);   // presumably parses the leading slice-header fields into currSlice - confirm in header.c
+extern void RestOfSliceHeader     (Slice *currSlice);   // presumably parses the remaining slice-header fields - confirm in header.c
+
+extern void dec_ref_pic_marking(VideoParameters *p_Vid, Bitstream *currStream);  // takes the bitstream to read from; results presumably land in p_Vid->dec_ref_pic_marking_buffer - confirm
+
+extern void decode_poc(VideoParameters *p_Vid);
+extern int dumppoc(VideoParameters *p_Vid);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/image.h b/Src/h264dec/ldecod/inc/image.h
new file mode 100644
index 00000000..2540f3c5
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/image.h
@@ -0,0 +1,33 @@
+
+/*!
+ ************************************************************************
+ * \file image.h
+ *
+ * \brief
+ *    prototypes for image.c
+ *
+ ************************************************************************
+ */
+
+#ifndef _IMAGE_H_
+#define _IMAGE_H_
+
+#include "mbuffer.h"
+
+extern int  picture_order(VideoParameters *p_Vid);
+
+extern void decode_one_slice (Slice *currSlice);
+
+extern void exit_picture(VideoParameters *p_Vid, StorablePicture **dec_picture);  // takes the picture by pointer-to-pointer, so the callee can replace or clear the caller's pointer
+extern int  decode_one_frame(VideoParameters *p_Vid, uint64_t time_code);
+
+extern int  is_new_picture(StorablePicture *dec_picture, Slice *currSlice, OldSliceParams *p_old_slice);  // presumably compares currSlice against the saved p_old_slice values - confirm in image.c
+extern void init_old_slice(OldSliceParams *p_old_slice);
+// For 4:4:4 independent mode
+extern void copy_dec_picture_JV( VideoParameters *p_Vid, StorablePicture *dst, StorablePicture *src );
+
+extern void frame_postprocessing(VideoParameters *p_Vid);
+extern void field_postprocessing(VideoParameters *p_Vid);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/intra16x16_pred.h b/Src/h264dec/ldecod/inc/intra16x16_pred.h
new file mode 100644
index 00000000..6680280a
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/intra16x16_pred.h
@@ -0,0 +1,25 @@
+/*!
+ *************************************************************************************
+ * \file intra16x16_pred.h
+ *
+ * \brief
+ *    definitions for intra 16x16 prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+
+#ifndef _INTRA16x16_PRED_H_
+#define _INTRA16x16_PRED_H_
+
+#include "global.h"
+#include "mbuffer.h"
+
+extern int intrapred16x16(Macroblock *currMB, ColorPlane pl, int b8);  // 16x16 intra prediction entry point; meaning of the int result and of b8 is defined by the implementation, not visible here
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/intra4x4_pred.h b/Src/h264dec/ldecod/inc/intra4x4_pred.h
new file mode 100644
index 00000000..a50c5262
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/intra4x4_pred.h
@@ -0,0 +1,25 @@
+/*!
+ *************************************************************************************
+ * \file intra4x4_pred.h
+ *
+ * \brief
+ *    definitions for intra 4x4 prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+
+#ifndef _INTRA4x4_PRED_H_
+#define _INTRA4x4_PRED_H_
+
+#include "global.h"
+#include "mbuffer.h"
+
+extern int intrapred(Macroblock *currMB, ColorPlane pl, int ioff, int joff, int img_block_x, int img_block_y);  // 4x4 intra prediction entry point; ioff/joff are presumably offsets inside the macroblock - confirm in the implementation
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/intra8x8_pred.h b/Src/h264dec/ldecod/inc/intra8x8_pred.h
new file mode 100644
index 00000000..ff238460
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/intra8x8_pred.h
@@ -0,0 +1,25 @@
+/*!
+ *************************************************************************************
+ * \file intra8x8_pred.h
+ *
+ * \brief
+ *    definitions for intra 8x8 prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+
+#ifndef _INTRA8x8_PRED_H_
+#define _INTRA8x8_PRED_H_
+
+#include "global.h"
+#include "mbuffer.h"
+
+extern int intrapred8x8(Macroblock *currMB, ColorPlane pl, int ioff, int joff);  // 8x8 intra prediction entry point; ioff/joff are presumably offsets inside the macroblock - confirm in the implementation
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/leaky_bucket.h b/Src/h264dec/ldecod/inc/leaky_bucket.h
new file mode 100644
index 00000000..ad605a6a
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/leaky_bucket.h
@@ -0,0 +1,26 @@
+
+/*!
+ *************************************************************************************
+ * \file leaky_bucket.h
+ *
+ * \brief
+ *    Header for Leaky Buffer parameters
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Shankar Regunathan                   <shanre@microsoft.com>
+ *************************************************************************************
+ */
+#ifndef _LEAKY_BUCKET_H_
+#define _LEAKY_BUCKET_H_
+
+#include "global.h"
+
+#ifdef _LEAKYBUCKET_  // same macro that gates the HRD fields of InputParameters in global.h
+// Leaky Bucket functions (FILE must already be declared by an earlier include; this header only includes global.h)
+unsigned long GetBigDoubleWord(FILE *fp);
+void calc_buffer(InputParameters *p_Inp);
+#endif
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/loopfilter.h b/Src/h264dec/ldecod/inc/loopfilter.h
new file mode 100644
index 00000000..c8b739fa
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/loopfilter.h
@@ -0,0 +1,23 @@
+/*!
+ ************************************************************************
+ *  \file
+ *     loopfilter.h
+ *  \brief
+ *     external deblocking filter interface
+ ************************************************************************
+ */
+
+#ifndef _LOOPFILTER_H_
+#define _LOOPFILTER_H_
+
+#include "global.h"
+#include "mbuffer.h"
+
+extern void DeblockPicture(VideoParameters *p_Vid, StorablePicture *p) ;  // picture-level deblocking entry point
+
+// Horizontal-edge filter variants; unlike the EdgeLoopLuma/EdgeLoopChroma hooks in VideoParameters these take no 'dir' argument. The luma prototypes size Strength with MB_BLOCK_SIZE, the chroma one with a literal 16.
+extern void EdgeLoopLumaNormal_Horiz(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, StorablePicture *p);
+extern void EdgeLoopLumaNormal_Horiz_sse2(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, StorablePicture *p);  // same signature as the line above; presumably the SSE2 implementation - confirm
+extern void EdgeLoopChromaNormal_Horiz(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, int uv, StorablePicture *p);
+
+#endif //_LOOPFILTER_H_
diff --git a/Src/h264dec/ldecod/inc/macroblock.h b/Src/h264dec/ldecod/inc/macroblock.h
new file mode 100644
index 00000000..d11547a4
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/macroblock.h
@@ -0,0 +1,177 @@
+/*!
+ ************************************************************************
+ * \file macroblock.h
+ *
+ * \brief
+ *    Arrays for macroblock encoding
+ *
+ * \author
+ *    Inge Lille-Langoy               <inge.lille-langoy@telenor.com>
+ *    Copyright (C) 1999 Telenor Satellite Services, Norway
+ ************************************************************************
+ */
+
+#ifndef _MACROBLOCK_H_
+#define _MACROBLOCK_H_
+
+#include "global.h"
+#include "mbuffer.h"
+#include "block.h"
+
+//! single scan pattern for a 4x4 block, stored as 16 {a,b} pairs
+static const byte SNGL_SCAN[16][2] =
+{
+  {0,0},{1,0},{0,1},{0,2},
+  {1,1},{2,0},{3,0},{2,1},
+  {1,2},{0,3},{1,3},{2,2},
+  {3,1},{3,2},{2,3},{3,3}
+};
+
+static const byte SNGL_SCAN_1D[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15}; // same sequence as SNGL_SCAN, flattened as a + 4*b
+static const byte SNGL_SCAN_DC[16] = { 0, 1, 2, 8, 3, 4, 5, 6, 9, 10, 11, 12, 7, 13, 14, 15}; // a different permutation of 0..15; NOTE(review): presumably the ordering used for DC coefficients -- confirm
+//! field scan pattern for a 4x4 block, stored as 16 {a,b} pairs
+static const byte FIELD_SCAN[16][2] =
+{
+  {0,0},{0,1},{1,0},{0,2},
+  {0,3},{1,1},{1,2},{1,3},
+  {2,0},{2,1},{2,2},{2,3},
+  {3,0},{3,1},{3,2},{3,3}
+};
+
+static const byte FIELD_SCAN_1D[16] = {0, 4, 1, 8, 12, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; // same sequence as FIELD_SCAN, flattened as a + 4*b
+static const byte FIELD_SCAN_DC[16] = {0, 2, 1, 8, 10, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15}; // a different permutation of 0..15; NOTE(review): presumably the ordering used for DC coefficients -- confirm
+//! used to control block sizes : Not used/16x16/16x8/8x16/8x8/8x4/4x8/4x4; each entry is {width/4, height/4}
+static const int BLOCK_STEP[8][2]=
+{
+  {0,0},{4,4},{4,2},{2,4},{2,2},{2,1},{1,2},{1,1}
+};
+
+//! single scan pattern for an 8x8 block, stored as 64 {a,b} pairs
+static const byte SNGL_SCAN8x8[64][2] = {
+  {0,0}, {1,0}, {0,1}, {0,2}, {1,1}, {2,0}, {3,0}, {2,1}, {1,2}, {0,3}, {0,4}, {1,3}, {2,2}, {3,1}, {4,0}, {5,0},
+  {4,1}, {3,2}, {2,3}, {1,4}, {0,5}, {0,6}, {1,5}, {2,4}, {3,3}, {4,2}, {5,1}, {6,0}, {7,0}, {6,1}, {5,2}, {4,3},
+  {3,4}, {2,5}, {1,6}, {0,7}, {1,7}, {2,6}, {3,5}, {4,4}, {5,3}, {6,2}, {7,1}, {7,2}, {6,3}, {5,4}, {4,5}, {3,6},
+  {2,7}, {3,7}, {4,6}, {5,5}, {6,4}, {7,3}, {7,4}, {6,5}, {5,6}, {4,7}, {5,7}, {6,6}, {7,5}, {7,6}, {6,7}, {7,7}
+};
+//! same sequence as SNGL_SCAN8x8, flattened as a + 8*b
+static const byte SNGL_SCAN8x8_1D[64] = 
+{
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13,  6,  7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+//! field scan pattern for an 8x8 block, stored as 64 {a,b} pairs
+static const byte FIELD_SCAN8x8[64][2] = {   // 8x8
+  {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {0,3}, {0,4}, {1,2}, {2,0}, {1,3}, {0,5}, {0,6}, {0,7}, {1,4}, {2,1}, {3,0},
+  {2,2}, {1,5}, {1,6}, {1,7}, {2,3}, {3,1}, {4,0}, {3,2}, {2,4}, {2,5}, {2,6}, {2,7}, {3,3}, {4,1}, {5,0}, {4,2},
+  {3,4}, {3,5}, {3,6}, {3,7}, {4,3}, {5,1}, {6,0}, {5,2}, {4,4}, {4,5}, {4,6}, {4,7}, {5,3}, {6,1}, {6,2}, {5,4},
+  {5,5}, {5,6}, {5,7}, {6,3}, {7,0}, {7,1}, {6,4}, {6,5}, {6,6}, {6,7}, {7,2}, {7,3}, {7,4}, {7,5}, {7,6}, {7,7}
+};
+//! same sequence as FIELD_SCAN8x8, flattened as a + 8*b
+static const byte FIELD_SCAN8x8_1D[64] =
+{
+  0,  8, 16,  1,  9, 24, 32, 17,
+  2, 25, 40, 48, 56, 33, 10,  3,
+ 18, 41, 49, 57, 26, 11,  4, 19,
+ 34, 42, 50, 58, 27, 12,  5, 20,
+ 35, 43, 51, 59, 28, 13,  6, 21,
+ 36, 44, 52, 60, 29, 14, 22, 37,
+ 45, 53, 61, 30,  7, 15, 38, 46,
+ 54, 62, 23, 31, 39, 47, 55, 63
+};
+//! scan pattern of 8 {a,b} pairs with a in 0..1 and b in 0..3; NOTE(review): the name suggests the 4:2:2 chroma case -- confirm
+static const byte SCAN_YUV422[8][2] =
+{
+  {0,0},{0,1},
+  {1,0},{0,2},
+  {0,3},{1,1},
+  {1,2},{1,3}
+};
+//! consecutive values 16..47: cbp_blk_chroma[i][j] == 16 + 4*i + j
+static const unsigned char cbp_blk_chroma[8][4] =
+{ {16, 17, 18, 19},
+  {20, 21, 22, 23},
+  {24, 25, 26, 27},
+  {28, 29, 30, 31},
+  {32, 33, 34, 35},
+  {36, 37, 38, 39},
+  {40, 41, 42, 43},
+  {44, 45, 46, 47} 
+};
+//! three [8][4] tables; only the first 2, 4 and 8 rows of tables 0, 1 and 2 are non-zero. NOTE(review): the first index presumably selects the chroma format -- confirm
+static const unsigned char cofuv_blk_x[3][8][4] =
+{ { {0, 1, 0, 1},
+    {0, 1, 0, 1},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0} },
+
+  { {0, 1, 0, 1},
+    {0, 1, 0, 1},
+    {0, 1, 0, 1},
+    {0, 1, 0, 1},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0},
+    {0, 0, 0, 0} },
+
+  { {0, 1, 0, 1},
+    {2, 3, 2, 3},
+    {0, 1, 0, 1},
+    {2, 3, 2, 3},
+    {0, 1, 0, 1},
+    {2, 3, 2, 3},
+    {0, 1, 0, 1},
+    {2, 3, 2, 3} }
+};
+//! same [3][8][4] shape as cofuv_blk_x, with the same number of non-zero rows per table (2, 4 and 8). NOTE(review): presumably the matching vertical offsets -- confirm
+static const unsigned char cofuv_blk_y[3][8][4] =
+{
+  { { 0, 0, 1, 1},
+    { 0, 0, 1, 1},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0} },
+
+  { { 0, 0, 1, 1},
+    { 2, 2, 3, 3},
+    { 0, 0, 1, 1},
+    { 2, 2, 3, 3},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 0} },
+
+  { { 0, 0, 1, 1},
+    { 0, 0, 1, 1},
+    { 2, 2, 3, 3},
+    { 2, 2, 3, 3},
+    { 0, 0, 1, 1},
+    { 0, 0, 1, 1},
+    { 2, 2, 3, 3},
+    { 2, 2, 3, 3}}
+};
+
+
+extern void setup_slice_methods(Slice *currSlice);
+extern void get_neighbors(Macroblock *currMB, PixelPos *block, int mb_x, int mb_y, int blockshape_x);
+extern void get_neighbors0016(Macroblock *currMB, PixelPos *block); // NOTE(review): presumably get_neighbors with mb_x=0, mb_y=0, blockshape_x=16 fixed -- confirm
+// start_macroblock takes Macroblock ** (presumably so it can set the caller's pointer -- confirm); exit_macroblock returns a Boolean
+extern void start_macroblock     (Slice *currSlice, Macroblock **currMB);
+extern int  decode_one_macroblock(Macroblock *currMB, StorablePicture *dec_picture);
+extern Boolean  exit_macroblock  (Slice *currSlice, int eos_bit);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/mb_prediction.h b/Src/h264dec/ldecod/inc/mb_prediction.h
new file mode 100644
index 00000000..cea3bd45
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/mb_prediction.h
@@ -0,0 +1,33 @@
+
+/*!
+ *************************************************************************************
+ * \file mb_prediction.h
+ *
+ * \brief
+ *    Functions for macroblock prediction
+ *
+ * \author
+ *     Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Alexis Michael Tourapis         <alexismt@ieee.org>  
+ *************************************************************************************
+ */
+
+#ifndef _MB_PREDICTION_H_
+#define _MB_PREDICTION_H_
+// NOTE: this header includes nothing itself; Macroblock, ColorPlane and StorablePicture must already be declared by the including file.
+extern int mb_pred_intra4x4    (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern int mb_pred_intra16x16  (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern int mb_pred_intra8x8    (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+// the intra predictors above return int; the predictors below return void, and mb_pred_ipcm takes only the macroblock
+extern void mb_pred_skip        (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_sp_skip     (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_p_inter8x8  (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_p_inter16x16(Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_p_inter16x8 (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_p_inter8x16 (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_b_dspatial  (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_b_dtemporal (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_b_inter8x8  (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture);
+extern void mb_pred_ipcm        (Macroblock *currMB);
+
+#endif
diff --git a/Src/h264dec/ldecod/inc/mbuffer.h b/Src/h264dec/ldecod/inc/mbuffer.h
new file mode 100644
index 00000000..c06e9fb0
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/mbuffer.h
@@ -0,0 +1,235 @@
+
+/*!
+ ***********************************************************************
+ *  \file
+ *      mbuffer.h
+ *
+ *  \brief
+ *      Frame buffer functions
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Karsten Suehring         <suehring@hhi.de>
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ 
+ *      - Jill Boyce               <jill.boyce@thomson.net>
+ *      - Saurav K Bandyopadhyay   <saurav@ieee.org>
+ *      - Zhenyu Wu                <Zhenyu.Wu@thomson.net>
+ *      - Purvin Pandit            <Purvin.Pandit@thomson.net>
+ *
+ ***********************************************************************
+ */
+#ifndef _MBUFFER_H_
+#define _MBUFFER_H_
+
+#include "global.h"
+#include <bfc/platform/types.h>
+
+#define MAX_LIST_SIZE 33
+//! definition of pic motion parameters (embedded by value in StorablePicture as `motion` and `JVmotion[]`)
+
+typedef struct pic_motion_params
+{
+	PicMotion **motion[2];  // two tables; NOTE(review): presumably one per reference list -- confirm
+	h264_ref_t ***field_references;
+  byte *      mb_field;      //!< field macroblock indicator
+  byte **     field_frame;   //!< indicates if co_located is field or frame.
+	int padding[1];  // NOTE(review): explicit padding word; presumably keeps a size/alignment expected elsewhere -- confirm before removing
+} PicMotionParams;
+//! one image plane (StorablePicture holds three VideoImage pointers), plus a link used by the memory cacher
+typedef struct video_image
+{
+	imgpel **img;  // NOTE(review): presumably per-row pointers into base_address -- confirm
+	imgpel *base_address;
+	size_t stride;  // NOTE(review): units (bytes vs. imgpel elements) are not visible here -- confirm
+	struct video_image *next; // for the memory cacher
+} VideoImage;
+//! definition of a picture (field or frame)
+typedef struct storable_picture
+{
+  PictureStructure structure;
+
+  int         poc;
+  int         top_poc;
+  int         bottom_poc;
+  int         frame_poc;
+  h264_ref_t       ref_pic_num        [MAX_NUM_SLICES][6][MAX_LIST_SIZE]; // NOTE(review): presumably [slice][list][reference index] -- confirm
+  h264_ref_t       frm_ref_pic_num    [MAX_NUM_SLICES][6][MAX_LIST_SIZE];
+  h264_ref_t       top_ref_pic_num    [MAX_NUM_SLICES][6][MAX_LIST_SIZE];
+  h264_ref_t       bottom_ref_pic_num [MAX_NUM_SLICES][6][MAX_LIST_SIZE];
+  unsigned    frame_num;
+  unsigned    recovery_frame;
+
+  int         pic_num;
+  int         long_term_pic_num;
+  int         long_term_frame_idx;
+
+  byte        is_long_term;
+  int         used_for_reference;
+  int         is_output;
+  int         non_existing;
+
+  short       max_slice_id;
+
+  int         size_x, size_y, size_x_cr, size_y_cr;
+  int         size_x_m1, size_y_m1, size_x_cr_m1, size_y_cr_m1; // NOTE(review): presumably the sizes above minus one -- confirm
+  int         chroma_vector_adjustment;
+  int         coded_frame;
+  int         mb_aff_frame_flag;
+  unsigned    PicWidthInMbs;
+  unsigned    PicSizeInMbs;
+
+  //imgpel **     imgY;         //!< Y picture component
+	union
+	{
+		VideoImage *plane_images[3]; // to ensure array alignment
+		struct
+		{
+			VideoImage *imgY;            // shares storage with plane_images[0]
+			VideoImage *imgUV[2];        //!< U and V picture components; share storage with plane_images[1] and [2]
+		};
+	};
+  
+  struct pic_motion_params motion;              //!< Motion info
+  struct pic_motion_params JVmotion[MAX_PLANE]; //!< Motion info for 4:4:4 independent mode decoding
+
+  short **     slice_id;      //!< reference picture   [mb_x][mb_y]
+
+  struct storable_picture *top_field;     // for mb aff, if frame for referencing the top field
+  struct storable_picture *bottom_field;  // for mb aff, if frame for referencing the bottom field
+  struct storable_picture *frame;         // for mb aff, if field for referencing the combined frame
+
+  int         slice_type;
+  int         idr_flag;
+  int         no_output_of_prior_pics_flag;
+  int         long_term_reference_flag;
+  int         adaptive_ref_pic_buffering_flag;
+
+  int         chroma_format_idc;
+  int         frame_mbs_only_flag;
+  int         frame_cropping_flag;
+  int         frame_cropping_rect_left_offset;
+  int         frame_cropping_rect_right_offset;
+  int         frame_cropping_rect_top_offset;
+  int         frame_cropping_rect_bottom_offset;
+  int         qp;
+  int         chroma_qp_offset[2];
+  int         slice_qp_delta;
+  DecRefPicMarking_t *dec_ref_pic_marking_buffer;                    //!< stores the memory management control operations
+
+  // picture error concealment
+  int         concealed_pic; //indicates if this is a concealed picture
+  
+  // variables for tone mapping
+  int         seiHasTone_mapping;
+  int         tone_mapping_model_id;
+  int         tonemapped_bit_depth;  
+  imgpel*     tone_mapping_lut;                //!< tone mapping look up table
+
+	int retain_count; // benski> we're going to reference count these things
+	uint64_t time_code; // user-passed timecode for this frame
+} StorablePicture;
+
+//! co-located parameters: picture-level flags plus one MotionParams each for frame, top and bottom
+typedef struct colocated_params
+{
+  int         mb_adaptive_frame_field_flag;
+  int         size_x, size_y;
+  byte        is_long_term;
+
+  MotionParams frame;
+  MotionParams top;
+  MotionParams bottom;
+
+} ColocatedParams;
+
+//! Frame Stores for Decoded Picture Buffer: status fields plus pointers to the frame and its two field pictures
+typedef struct frame_store
+{
+  int       is_used;                //!< 0=empty; 1=top; 2=bottom; 3=both fields (or frame)
+  int       is_reference;           //!< 0=not used for ref; 1=top used; 2=bottom used; 3=both fields (or frame) used
+  int       is_long_term;           //!< NOTE(review): presumably long-term marking with the same encoding as is_reference (0=none; 1=top; 2=bottom; 3=both fields or frame) -- confirm
+  int       is_orig_reference;      //!< original marking by nal_ref_idc: 0=not used for ref; 1=top used; 2=bottom used; 3=both fields (or frame) used
+
+  int       is_non_existent;
+
+  unsigned  frame_num;
+  unsigned  recovery_frame;
+
+  int       frame_num_wrap;
+  int       long_term_frame_idx;
+  int       is_output;
+  int       poc;
+
+  // picture error concealment
+  int concealment_reference;
+
+  StorablePicture *frame;
+  StorablePicture *top_field;
+  StorablePicture *bottom_field;
+
+} FrameStore;
+
+
+//! Decoded Picture Buffer: three FrameStore pointer arrays with their counters, plus back-pointers to the decoder parameters
+typedef struct decoded_picture_buffer
+{
+  VideoParameters *p_Vid;
+  InputParameters *p_Inp;
+  FrameStore  **fs;
+  FrameStore  **fs_ref;      // NOTE(review): presumably the short-term reference subset, counted by ref_frames_in_buffer -- confirm
+  FrameStore  **fs_ltref;    // NOTE(review): presumably the long-term reference subset, counted by ltref_frames_in_buffer -- confirm
+  unsigned      size;        // NOTE(review): presumably the capacity of fs, with used_size the entries in use -- confirm
+  unsigned      used_size;
+  unsigned      ref_frames_in_buffer;
+  unsigned      ltref_frames_in_buffer;
+  int           last_output_poc;
+  int           max_long_term_pic_idx;
+
+  int           init_done;
+  int           num_ref_frames;
+
+  FrameStore   *last_picture;
+} DecodedPictureBuffer;
+
+extern void             init_dpb(VideoParameters *p_Vid);
+extern void             free_dpb(VideoParameters *p_Vid);
+extern FrameStore*      alloc_frame_store(void);
+extern void             free_frame_store(VideoParameters *p_Vid, FrameStore* f);
+extern StorablePicture* alloc_storable_picture(VideoParameters *p_Vid, PictureStructure type, int size_x, int size_y, int size_x_cr, int size_y_cr);
+extern void             free_storable_picture(VideoParameters *p_Vid, StorablePicture* p);
+extern void             store_picture_in_dpb(VideoParameters *p_Vid, StorablePicture* p);
+extern void             flush_dpb(VideoParameters *p_Vid);
+
+extern void             dpb_split_field  (VideoParameters *p_Vid, FrameStore *fs);
+extern void             dpb_combine_field(VideoParameters *p_Vid, FrameStore *fs);
+extern void             dpb_combine_field_yuv(VideoParameters *p_Vid, FrameStore *fs);
+
+extern void             init_lists          (Slice *currSlice);
+extern void             reorder_ref_pic_list(VideoParameters *p_Vid, StorablePicture **list, char *list_size,
+                                      int num_ref_idx_lX_active_minus1, int *reordering_of_pic_nums_idc,
+                                      int *abs_diff_pic_num_minus1, int *long_term_pic_idx);
+
+extern void             init_mbaff_lists(VideoParameters *p_Vid);
+extern void             alloc_ref_pic_list_reordering_buffer(Slice *currSlice);
+extern void             free_ref_pic_list_reordering_buffer(Slice *currSlice);
+
+extern void             fill_frame_num_gap(VideoParameters *p_Vid);
+
+extern ColocatedParams* alloc_colocated(VideoParameters *p_Vid, int size_x, int size_y,int mb_adaptive_frame_field_flag);
+extern void free_colocated(VideoParameters *p_Vid, ColocatedParams* p);
+extern void compute_colocated            (Slice *currSlice, ColocatedParams* p, StorablePicture **listX[6]);
+extern void compute_colocated_frames_mbs (Slice *currSlice, ColocatedParams* p, StorablePicture **listX[6]);
+
+// For 4:4:4 independent mode
+extern void compute_colocated_JV  ( Slice *currSlice, ColocatedParams* p, StorablePicture **listX[6]);
+extern void copy_storable_param_JV( VideoParameters *p_Vid, PicMotionParams  *JVplane, PicMotionParams  *motion );
+
+// benski> decoded output pictures (declared without the `extern` keyword used above; function declarations have external linkage either way)
+void out_storable_picture_get(VideoParameters *img, StorablePicture **pic);
+void out_storable_picture_add(VideoParameters *img, StorablePicture *pic);
+void out_storable_pictures_init(VideoParameters *img, size_t count);
+void out_storable_pictures_destroy(VideoParameters *img);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/mc_prediction.h b/Src/h264dec/ldecod/inc/mc_prediction.h
new file mode 100644
index 00000000..e3165e27
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/mc_prediction.h
@@ -0,0 +1,61 @@
+
+/*!
+ *************************************************************************************
+ * \file mc_prediction.h
+ *
+ * \brief
+ *    definitions for motion compensated prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+
+#ifndef _MC_PREDICTION_H_
+#define _MC_PREDICTION_H_
+
+#include "global.h"
+#include "mbuffer.h"
+
+extern void get_block_luma  (Macroblock *currMB, ColorPlane pl, StorablePicture *list, int x_pos, int y_pos, const short *motion_vector, int ver_block_size, int hor_block_size, h264_imgpel_macroblock_t block);
+// perform_mc is the general form taking block_size_x/block_size_y; the 16x16, 16x8, 8x8 and 8x16 entry points omit them (16x16 also omits i/j)
+extern void intra_cr_decoding    (Macroblock *currMB, int yuv);
+extern void prepare_direct_params(Macroblock *currMB, StorablePicture *dec_picture, short pmvl0[2], short pmvl1[2],char *l0_rFrame, char *l1_rFrame);
+extern void perform_mc           (Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset,   int block_size_x, int block_size_y, int curr_mb_field);
+extern void perform_mc16x16      (Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int list_offset, int curr_mb_field);
+extern void perform_mc16x8       (Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset,   int curr_mb_field);
+extern void perform_mc8x8        (Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset,   int curr_mb_field);
+extern void perform_mc8x16       (Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset,   int curr_mb_field);
+// Kernel variants (_sse2 / _ipp / _c); each group's signature matches the same-named function-pointer slot of OptimizedFunctions in optim.h
+void weighted_mc_prediction16x16_sse2(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+void weighted_mc_prediction16x16_ipp(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+void weighted_mc_prediction16x16_c(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+
+void weighted_mc_prediction16x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+void weighted_mc_prediction16x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+void weighted_mc_prediction16x8_c(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+
+void weighted_mc_prediction8x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+void weighted_mc_prediction8x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+void weighted_mc_prediction8x8_c(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+
+void weighted_bi_prediction8x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+void weighted_bi_prediction8x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+void weighted_bi_prediction8x8_c(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+
+void weighted_bi_prediction16x16_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+void weighted_bi_prediction16x16_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+void weighted_bi_prediction16x16_c(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+
+void weighted_bi_prediction16x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+void weighted_bi_prediction16x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+void weighted_bi_prediction16x8_c(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+// NOTE(review): only the _sse2 and _ipp variants of bi_prediction8x8 are declared here; there is no _c declaration
+void bi_prediction8x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0);
+void bi_prediction8x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/meminput.h b/Src/h264dec/ldecod/inc/meminput.h
new file mode 100644
index 00000000..babaf2f0
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/meminput.h
@@ -0,0 +1,26 @@
+#ifndef _MEMINPUT_H
+#define _MEMINPUT_H
+#pragma once
+
+#include "nalucommon.h"
+#include <bfc/platform/types.h>
+
+typedef struct memory_input_struct 
+{
+	const uint8_t *user_buffer;  // NOTE(review): presumably caller-supplied input bytes (const, so not written through this pointer) -- confirm ownership
+	size_t user_buffer_size;
+	size_t user_buffer_read;  // NOTE(review): presumably how much of user_buffer has been consumed so far -- confirm
+
+  uint8_t *Buf;
+	int resetting;
+	int skip_b_frames;
+} memory_input_t;
+
+int  GetMemoryNALU (VideoParameters *p_Vid, NALU_t *nalu);
+void OpenMemory(VideoParameters *p_Vid, const char *fn);
+void CloseMemory(VideoParameters *p_Vid);
+void malloc_mem_input(VideoParameters *p_Vid);
+void free_mem_input(VideoParameters *p_Vid);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/nalu.h b/Src/h264dec/ldecod/inc/nalu.h
new file mode 100644
index 00000000..d10e6bfa
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/nalu.h
@@ -0,0 +1,28 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    nalu.h
+ * \brief
+ *    Common NALU support functions
+ *
+ * \date 25 November 2002
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Stephan Wenger        <stewe@cs.tu-berlin.de>
+ ***************************************************************************************
+ */
+
+
+#ifndef _NALU_H_
+#define _NALU_H_
+
+#include "nalucommon.h"
+
+extern void initBitsFile (VideoParameters *p_Vid);
+extern void CheckZeroByteNonVCL(VideoParameters *p_Vid, NALU_t *nalu);
+extern void CheckZeroByteVCL   (VideoParameters *p_Vid, NALU_t *nalu);
+
+extern int read_next_nalu(VideoParameters *p_Vid, NALU_t *nalu);
+
+#endif
diff --git a/Src/h264dec/ldecod/inc/optim.h b/Src/h264dec/ldecod/inc/optim.h
new file mode 100644
index 00000000..bfcc07f8
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/optim.h
@@ -0,0 +1,46 @@
+#pragma once
+// Table of function pointers; the opt_* macros later in this header expand to these slots (via the global `opt`).
+typedef struct optimized_functions
+{
+	//void (*itrans4x4)(const h264_short_macroblock_t tblock, const h264_imgpel_macroblock_t mb_pred, h264_imgpel_macroblock_t mb_rec, int pos_x, int pos_y); -- disabled: the 4x4 routine is selected at compile time by the opt_itrans4x4 macro
+	void (*itrans8x8)(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x);
+
+	void (*weighted_mc_prediction16x16)(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+	void (*weighted_mc_prediction16x8)(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+	void (*weighted_mc_prediction8x8)(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom);
+
+	void (*weighted_bi_prediction16x16)(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+	void (*weighted_bi_prediction16x8)(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+	void (*weighted_bi_prediction8x8)(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+
+	void (*bi_prediction8x8)(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0);
+
+	void (*copy_image_data_16x16_stride)(struct video_image *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source);
+	int (*code_from_bitstream_2d_5_4)(SyntaxElement *sym, Bitstream *currStream, const uint8_t *lentab, const uint8_t *codtab, const uint8_t *masktab);
+	int (*code_from_bitstream_2d_17_4)(SyntaxElement *sym, Bitstream *currStream, const uint16_t *lentab, const uint16_t *codtab, const uint16_t *masktab);
+	int (*code_from_bitstream_2d_16_1)(Bitstream *currStream, const uint16_t *lentab,	const uint16_t *codtab, const uint16_t *masktab);
+} OptimizedFunctions;
+
+extern OptimizedFunctions opt;
+
+/* define macros for these function calls.  this way we could do specific builds that call the functions directly, if we have the need.  opt_itrans4x4 is already direct (itrans4x4_c when _DEBUG is defined, itrans4x4_mmx otherwise); every other macro expands to a slot of the global `opt` table */
+#ifdef _DEBUG
+#define opt_itrans4x4 (itrans4x4_c)
+#else
+#define opt_itrans4x4 (itrans4x4_mmx)
+#endif
+#define opt_itrans8x8 (opt.itrans8x8)
+
+#define opt_weighted_mc_prediction16x16 (opt.weighted_mc_prediction16x16)
+#define opt_weighted_mc_prediction16x8 (opt.weighted_mc_prediction16x8)
+#define opt_weighted_mc_prediction8x8 (opt.weighted_mc_prediction8x8)
+
+#define opt_weighted_bi_prediction16x16 (opt.weighted_bi_prediction16x16)
+#define opt_weighted_bi_prediction16x8 (opt.weighted_bi_prediction16x8)
+#define opt_weighted_bi_prediction8x8 (opt.weighted_bi_prediction8x8)
+
+#define opt_bi_prediction8x8 (opt.bi_prediction8x8)
+#define opt_copy_image_data_16x16_stride (opt.copy_image_data_16x16_stride)
+#define opt_code_from_bitstream_2d_5_4 (opt.code_from_bitstream_2d_5_4)
+#define opt_code_from_bitstream_2d_17_4 (opt.code_from_bitstream_2d_17_4)
+#define opt_code_from_bitstream_2d_16_1 (opt.code_from_bitstream_2d_16_1)
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod/inc/output.h b/Src/h264dec/ldecod/inc/output.h
new file mode 100644
index 00000000..69e06f85
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/output.h
@@ -0,0 +1,27 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    output.h
+ * \brief
+ *    Picture writing routine headers
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Karsten Suehring        <suehring@hhi.de>
+ ***************************************************************************************
+ */
+
+#ifndef _OUTPUT_H_
+#define _OUTPUT_H_
+
+// NOTE: this header includes nothing itself; VideoParameters, FrameStore and StorablePicture must already be declared by the including file.
+extern void write_stored_frame(VideoParameters *p_Vid, FrameStore *fs);
+extern void direct_output     (VideoParameters *p_Vid, StorablePicture *p);
+extern void init_out_buffer   (VideoParameters *p_Vid);
+extern void uninit_out_buffer (VideoParameters *p_Vid);
+// flush_pending_output is declared only when PAIR_FIELDS_IN_OUTPUT evaluates to non-zero
+#if (PAIR_FIELDS_IN_OUTPUT)
+extern void flush_pending_output(VideoParameters *p_Vid);
+#endif
+
+#endif //_OUTPUT_H_
diff --git a/Src/h264dec/ldecod/inc/parset.h b/Src/h264dec/ldecod/inc/parset.h
new file mode 100644
index 00000000..c433c26d
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/parset.h
@@ -0,0 +1,56 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    parset.h
+ * \brief
+ *    Picture and Sequence Parameter Sets, decoder operations
+ * 
+ * \date 25 November 2002
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Stephan Wenger        <stewe@cs.tu-berlin.de>
+ ***************************************************************************************
+ */
+
+
+#ifndef _PARSET_H_
+#define _PARSET_H_
+
+#include "parsetcommon.h"
+#include "nalucommon.h"
+//! 16-entry scan order as linear indices; the values are identical to SNGL_SCAN_1D in macroblock.h
+static const byte ZZ_SCAN[16]  =
+{  0,  1,  4,  8,  5,  2,  3,  6,  9, 12, 13, 10,  7, 11, 14, 15
+};
+//! 64-entry scan order as linear indices; the values are identical to SNGL_SCAN8x8_1D in macroblock.h
+static const byte ZZ_SCAN8[64] =
+{  0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+extern void Scaling_List(int *scalingList, int sizeOfScalingList, Boolean *UseDefaultScalingMatrix, Bitstream *s);
+
+extern void InitVUI(seq_parameter_set_rbsp_t *sps);
+extern int  ReadVUI(DataPartition *p, seq_parameter_set_rbsp_t *sps);
+extern int  ReadHRDParameters(DataPartition *p, hrd_parameters_t *hrd);
+
+extern void PPSConsistencyCheck (pic_parameter_set_rbsp_t *pps);
+extern void SPSConsistencyCheck (seq_parameter_set_rbsp_t *sps);
+
+extern void MakePPSavailable (VideoParameters *p_Vid, int id, pic_parameter_set_rbsp_t *pps);
+extern void MakeSPSavailable (VideoParameters *p_Vid, int id, seq_parameter_set_rbsp_t *sps);
+
+extern void ProcessSPS (VideoParameters *p_Vid, NALU_t *nalu);
+extern void ProcessPPS (VideoParameters *p_Vid, NALU_t *nalu);
+
+extern void UseParameterSet (Slice *currSlice, int PicParsetId);
+
+extern void CleanUpPPS(VideoParameters *p_Vid);
+
+extern void activate_sps (VideoParameters *p_Vid, seq_parameter_set_rbsp_t *sps);
+extern void activate_pps (VideoParameters *p_Vid, pic_parameter_set_rbsp_t *pps);
+
+#endif
diff --git a/Src/h264dec/ldecod/inc/parsetcommon.h b/Src/h264dec/ldecod/inc/parsetcommon.h
new file mode 100644
index 00000000..16a64098
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/parsetcommon.h
@@ -0,0 +1,202 @@
+
+/*!
+ **************************************************************************************
+ * \file
+ *    parsetcommon.h
+ * \brief
+ *    Picture and Sequence Parameter Sets, structures common to encoder and decoder
+ *
+ * \date 25 November 2002
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Stephan Wenger        <stewe@cs.tu-berlin.de>
+ ***************************************************************************************
+ */
+
+
+
+// In the JVT syntax, frequently flags are used that indicate the presence of
+// certain pieces of information in the NALU.  Here, these flags are also
+// present.  In the encoder, those bits indicate that the values signaled to
+// be present are meaningful and that this part of the syntax should be
+// written to the NALU.  In the decoder, the flag indicates that information
+// was received from the decoded NALU and should be used henceforth.
+// The structure names were chosen as indicated in the JVT syntax
+
+#ifndef _PARSETCOMMON_H_
+#define _PARSETCOMMON_H_
+
+#include "defines.h"
+
+#define MAXIMUMPARSETRBSPSIZE   1500   // size limit for a parameter-set RBSP (presumably bytes)
+#define MAXIMUMPARSETNALUSIZE   1500   // size limit for a parameter-set NALU (presumably bytes)
+// Table capacities; presumably indexed by seq_parameter_set_id / pic_parameter_set_id — confirm at use sites.
+#define MAXSPS  32
+#define MAXPPS  256
+
+#define MAXIMUMVALUEOFcpb_cnt   32   // capacity of the three per-CPB arrays in hrd_parameters_t
+typedef struct   // HRD parameters; the trailing comment on each field is its bitstream syntax descriptor
+{
+  unsigned int cpb_cnt_minus1;                                   // ue(v)
+  unsigned int bit_rate_scale;                                   // u(4)
+  unsigned int cpb_size_scale;                                   // u(4)
+  unsigned int bit_rate_value_minus1 [MAXIMUMVALUEOFcpb_cnt];    // ue(v)
+  unsigned int cpb_size_value_minus1 [MAXIMUMVALUEOFcpb_cnt];    // ue(v)
+  unsigned int cbr_flag              [MAXIMUMVALUEOFcpb_cnt];    // u(1)
+  unsigned int initial_cpb_removal_delay_length_minus1;          // u(5)
+  unsigned int cpb_removal_delay_length_minus1;                  // u(5)
+  unsigned int dpb_output_delay_length_minus1;                   // u(5)
+  unsigned int time_offset_length;                               // u(5)
+} hrd_parameters_t;
+
+
+typedef struct   // VUI fields of a sequence parameter set; trailing comments give each field's syntax descriptor
+{
+  Boolean      aspect_ratio_info_present_flag;                   // u(1)
+  unsigned int aspect_ratio_idc;                                 // u(8)
+  unsigned short sar_width;                                        // u(16)
+  unsigned short sar_height;                                       // u(16)
+  Boolean      overscan_info_present_flag;                       // u(1)
+  Boolean      overscan_appropriate_flag;                        // u(1)
+  Boolean      video_signal_type_present_flag;                   // u(1)
+  unsigned int video_format;                                     // u(3)
+  Boolean      video_full_range_flag;                            // u(1)
+  Boolean      colour_description_present_flag;                  // u(1)
+  unsigned int colour_primaries;                                 // u(8)
+  unsigned int transfer_characteristics;                         // u(8)
+  unsigned int matrix_coefficients;                              // u(8)
+  Boolean      chroma_location_info_present_flag;                // u(1)
+  unsigned int  chroma_sample_loc_type_top_field;                // ue(v)
+  unsigned int  chroma_sample_loc_type_bottom_field;             // ue(v)
+  Boolean      timing_info_present_flag;                         // u(1)
+  unsigned int num_units_in_tick;                                // u(32)
+  unsigned int time_scale;                                       // u(32)
+  Boolean      fixed_frame_rate_flag;                            // u(1)
+  Boolean      nal_hrd_parameters_present_flag;                  // u(1)
+  hrd_parameters_t nal_hrd_parameters;                           // hrd_parameters_t
+  Boolean      vcl_hrd_parameters_present_flag;                  // u(1)
+  hrd_parameters_t vcl_hrd_parameters;                           // hrd_parameters_t
+  // if( nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag )
+  Boolean      low_delay_hrd_flag;                               // u(1)
+  Boolean      pic_struct_present_flag;                          // u(1)
+  Boolean      bitstream_restriction_flag;                       // u(1)
+  Boolean      motion_vectors_over_pic_boundaries_flag;          // u(1)
+  unsigned int max_bytes_per_pic_denom;                          // ue(v)
+  unsigned int max_bits_per_mb_denom;                            // ue(v)
+  unsigned int log2_max_mv_length_vertical;                      // ue(v)
+  unsigned int log2_max_mv_length_horizontal;                    // ue(v)
+  unsigned int num_reorder_frames;                               // ue(v)
+  unsigned int max_dec_frame_buffering;                          // ue(v)
+} vui_seq_parameters_t;
+
+
+#define MAXnum_slice_groups_minus1  8   // capacity of run_length_minus1[], top_left[] and bottom_right[] below
+typedef struct   // picture parameter set; trailing comments give each field's syntax descriptor
+{
+  Boolean   Valid;                  // indicates the parameter set is valid
+  unsigned int pic_parameter_set_id;                             // ue(v)
+  unsigned int seq_parameter_set_id;                             // ue(v)
+  Boolean   entropy_coding_mode_flag;                            // u(1)
+  Boolean   transform_8x8_mode_flag;                             // u(1)
+
+  Boolean   pic_scaling_matrix_present_flag;                     // u(1)
+  int       pic_scaling_list_present_flag[12];                   // u(1)
+  int       ScalingList4x4[6][16];                               // se(v)
+  int       ScalingList8x8[6][64];                               // se(v)
+  Boolean   UseDefaultScalingMatrix4x4Flag[6];
+  Boolean   UseDefaultScalingMatrix8x8Flag[6];
+
+  // if( pic_order_cnt_type < 2 )  in the sequence parameter set
+  Boolean      bottom_field_pic_order_in_frame_present_flag;                           // u(1)
+  unsigned int num_slice_groups_minus1;                          // ue(v)
+  unsigned int slice_group_map_type;                        // ue(v)
+  // if( slice_group_map_type == 0 )
+  unsigned int run_length_minus1[MAXnum_slice_groups_minus1]; // ue(v)
+  // else if( slice_group_map_type == 2 )
+  unsigned int top_left[MAXnum_slice_groups_minus1];         // ue(v)
+  unsigned int bottom_right[MAXnum_slice_groups_minus1];     // ue(v)
+  // else if( slice_group_map_type == 3 || 4 || 5 )
+  Boolean   slice_group_change_direction_flag;            // u(1)
+  unsigned int slice_group_change_rate_minus1;               // ue(v)
+  // else if( slice_group_map_type == 6 )
+  unsigned int pic_size_in_map_units_minus1;             // ue(v)
+  byte      *slice_group_id;                              // complete MBAmap u(v); pointer member, ownership is not visible in this header
+
+  int num_ref_idx_l0_active_minus1;                     // ue(v)
+  int num_ref_idx_l1_active_minus1;                     // ue(v)
+  Boolean   weighted_pred_flag;                               // u(1)
+  unsigned int  weighted_bipred_idc;                              // u(2)
+  int       pic_init_qp_minus26;                              // se(v)
+  int       pic_init_qs_minus26;                              // se(v)
+  int       chroma_qp_index_offset;                           // se(v)
+
+  int       second_chroma_qp_index_offset;                    // se(v)
+
+  Boolean   deblocking_filter_control_present_flag;           // u(1)
+  Boolean   constrained_intra_pred_flag;                      // u(1)
+  Boolean   redundant_pic_cnt_present_flag;                   // u(1)
+} pic_parameter_set_rbsp_t;
+
+
+#define MAXnum_ref_frames_in_pic_order_cnt_cycle  256   // capacity of offset_for_ref_frame[] below
+typedef struct   // sequence parameter set; trailing comments give each field's syntax descriptor
+{
+  Boolean   Valid;                  // indicates the parameter set is valid
+
+  unsigned int profile_idc;                                       // u(8)
+  Boolean   constrained_set0_flag;                                // u(1)
+  Boolean   constrained_set1_flag;                                // u(1)
+  Boolean   constrained_set2_flag;                                // u(1)
+  Boolean   constrained_set3_flag;                                // u(1)
+  unsigned  int level_idc;                                        // u(8)
+  unsigned  int seq_parameter_set_id;                             // ue(v)
+  unsigned  int chroma_format_idc;                                // ue(v)
+
+  Boolean   seq_scaling_matrix_present_flag;                   // u(1)
+  int       seq_scaling_list_present_flag[12];                 // u(1)
+  int       ScalingList4x4[6][16];                             // se(v)
+  int       ScalingList8x8[6][64];                             // se(v)
+  Boolean   UseDefaultScalingMatrix4x4Flag[6];
+  Boolean   UseDefaultScalingMatrix8x8Flag[6];
+
+  unsigned int bit_depth_luma_minus8;                            // ue(v)
+  unsigned int bit_depth_chroma_minus8;                          // ue(v)
+  unsigned int log2_max_frame_num_minus4;                        // ue(v)
+  unsigned int pic_order_cnt_type;                               // ue(v) per the H.264 SPS syntax table
+  // if( pic_order_cnt_type == 0 )
+  unsigned int log2_max_pic_order_cnt_lsb_minus4;                 // ue(v)
+  // else if( pic_order_cnt_type == 1 )
+    Boolean delta_pic_order_always_zero_flag;               // u(1)
+    int     offset_for_non_ref_pic;                         // se(v)
+    int     offset_for_top_to_bottom_field;                 // se(v)
+    unsigned int num_ref_frames_in_pic_order_cnt_cycle;          // ue(v)
+    // for( i = 0; i < num_ref_frames_in_pic_order_cnt_cycle; i++ )
+      int   offset_for_ref_frame[MAXnum_ref_frames_in_pic_order_cnt_cycle];   // se(v)
+  unsigned int num_ref_frames;                                   // ue(v)
+  Boolean   gaps_in_frame_num_value_allowed_flag;             // u(1)
+  unsigned int pic_width_in_mbs_minus1;                          // ue(v)
+  unsigned int pic_height_in_map_units_minus1;                   // ue(v)
+  Boolean   frame_mbs_only_flag;                              // u(1)
+  // if( !frame_mbs_only_flag )
+    Boolean   mb_adaptive_frame_field_flag;                   // u(1)
+  Boolean   direct_8x8_inference_flag;                        // u(1)
+  Boolean   frame_cropping_flag;                              // u(1)
+    unsigned int frame_cropping_rect_left_offset;                // ue(v)
+    unsigned int frame_cropping_rect_right_offset;               // ue(v)
+    unsigned int frame_cropping_rect_top_offset;                 // ue(v)
+    unsigned int frame_cropping_rect_bottom_offset;              // ue(v)
+  Boolean   vui_parameters_present_flag;                      // u(1)
+    vui_seq_parameters_t vui_seq_parameters;                  // vui_seq_parameters_t
+    unsigned  separate_colour_plane_flag;                       // u(1); stored as plain unsigned, unlike the Boolean flags above
+} seq_parameter_set_rbsp_t;
+
+pic_parameter_set_rbsp_t *AllocPPS (void);   // returns a pointer to a PPS object; presumably released with FreePPS
+seq_parameter_set_rbsp_t *AllocSPS (void);   // returns a pointer to an SPS object; presumably released with FreeSPS
+// Release functions (pairing with AllocPPS/AllocSPS is inferred from the names — confirm in the implementation).
+void FreePPS (pic_parameter_set_rbsp_t *pps);
+void FreeSPS (seq_parameter_set_rbsp_t *sps);
+// Comparison helpers; the meaning of the int result is not visible in this header — check the implementation.
+int sps_is_equal(seq_parameter_set_rbsp_t *sps1, seq_parameter_set_rbsp_t *sps2);
+int pps_is_equal(pic_parameter_set_rbsp_t *pps1, pic_parameter_set_rbsp_t *pps2);
+
+#endif
diff --git a/Src/h264dec/ldecod/inc/quant.h b/Src/h264dec/ldecod/inc/quant.h
new file mode 100644
index 00000000..01bde65f
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/quant.h
@@ -0,0 +1,169 @@
+
+/*!
+ ************************************************************************
+ * \file quant.h
+ *
+ * \brief
+ *    definitions for quantization functions
+ *
+ * \author
+ *
+ ************************************************************************
+ */
+
+#ifndef _QUANT_H_
+#define _QUANT_H_
+
+// Lookup tables. They are 'static const' in a header, so each translation unit that includes it gets its own copy.
+static const int dequant_coef8[6][64] =   // six 64-entry tables; first index presumably qp % 6, entries laid out 8 per row — confirm at use sites
+{
+  {
+    20,  19, 25, 19, 20, 19, 25, 19,
+    19,  18, 24, 18, 19, 18, 24, 18,
+    25,  24, 32, 24, 25, 24, 32, 24,
+    19,  18, 24, 18, 19, 18, 24, 18,
+    20,  19, 25, 19, 20, 19, 25, 19,
+    19,  18, 24, 18, 19, 18, 24, 18,
+    25,  24, 32, 24, 25, 24, 32, 24,
+    19,  18, 24, 18, 19, 18, 24, 18
+  },
+  {
+    22,  21, 28, 21, 22, 21, 28, 21,
+    21,  19, 26, 19, 21, 19, 26, 19,
+    28,  26, 35, 26, 28, 26, 35, 26,
+    21,  19, 26, 19, 21, 19, 26, 19,
+    22,  21, 28, 21, 22, 21, 28, 21,
+    21,  19, 26, 19, 21, 19, 26, 19,
+    28,  26, 35, 26, 28, 26, 35, 26,
+    21,  19, 26, 19, 21, 19, 26, 19
+  },
+  {
+    26,  24, 33, 24, 26, 24, 33, 24,
+    24,  23, 31, 23, 24, 23, 31, 23,
+    33,  31, 42, 31, 33, 31, 42, 31,
+    24,  23, 31, 23, 24, 23, 31, 23,
+    26,  24, 33, 24, 26, 24, 33, 24,
+    24,  23, 31, 23, 24, 23, 31, 23,
+    33,  31, 42, 31, 33, 31, 42, 31,
+    24,  23, 31, 23, 24, 23, 31, 23
+  },
+  {
+    28,  26, 35, 26, 28, 26, 35, 26,
+    26,  25, 33, 25, 26, 25, 33, 25,
+    35,  33, 45, 33, 35, 33, 45, 33,
+    26,  25, 33, 25, 26, 25, 33, 25,
+    28,  26, 35, 26, 28, 26, 35, 26,
+    26,  25, 33, 25, 26, 25, 33, 25,
+    35,  33, 45, 33, 35, 33, 45, 33,
+    26,  25, 33, 25, 26, 25, 33, 25
+  },
+  {
+    32,  30, 40, 30, 32, 30, 40, 30,
+    30,  28, 38, 28, 30, 28, 38, 28,
+    40,  38, 51, 38, 40, 38, 51, 38,
+    30,  28, 38, 28, 30, 28, 38, 28,
+    32,  30, 40, 30, 32, 30, 40, 30,
+    30,  28, 38, 28, 30, 28, 38, 28,
+    40,  38, 51, 38, 40, 38, 51, 38,
+    30,  28, 38, 28, 30, 28, 38, 28
+  },
+  {
+    36,  34, 46, 34, 36, 34, 46, 34,
+    34,  32, 43, 32, 34, 32, 43, 32,
+    46,  43, 58, 43, 46, 43, 58, 43,
+    34,  32, 43, 32, 34, 32, 43, 32,
+    36,  34, 46, 34, 36, 34, 46, 34,
+    34,  32, 43, 32, 34, 32, 43, 32,
+    46,  43, 58, 43, 46, 43, 58, 43,
+    34,  32, 43, 32, 34, 32, 43, 32
+  }
+};
+
+
+//! Dequantization coefficients: six 4x4 tables (first index presumably qp % 6 — confirm at use sites)
+static const int dequant_coef[6][4][4] = {
+  {
+    { 10, 13, 10, 13},
+    { 13, 16, 13, 16},
+    { 10, 13, 10, 13},
+    { 13, 16, 13, 16}},
+  {
+    { 11, 14, 11, 14},
+    { 14, 18, 14, 18},
+    { 11, 14, 11, 14},
+    { 14, 18, 14, 18}},
+  {
+    { 13, 16, 13, 16},
+    { 16, 20, 16, 20},
+    { 13, 16, 13, 16},
+    { 16, 20, 16, 20}},
+  {
+    { 14, 18, 14, 18},
+    { 18, 23, 18, 23},
+    { 14, 18, 14, 18},
+    { 18, 23, 18, 23}},
+  {
+    { 16, 20, 16, 20},
+    { 20, 25, 20, 25},
+    { 16, 20, 16, 20},
+    { 20, 25, 20, 25}},
+  {
+    { 18, 23, 18, 23},
+    { 23, 29, 23, 29},
+    { 18, 23, 18, 23},
+    { 23, 29, 23, 29}}
+};
+
+static const int quant_coef[6][4][4] = {   // forward-quantization counterpart (per the name): six 4x4 tables, same shape as dequant_coef
+  {
+    { 13107,  8066, 13107,  8066},
+    {  8066,  5243,  8066,  5243},
+    { 13107,  8066, 13107,  8066},
+    {  8066,  5243,  8066,  5243}},
+  {
+    { 11916,  7490, 11916,  7490},
+    {  7490,  4660,  7490,  4660},
+    { 11916,  7490, 11916,  7490},
+    {  7490,  4660,  7490,  4660}},
+  {
+    { 10082,  6554, 10082,  6554},
+    {  6554,  4194,  6554,  4194},
+    { 10082,  6554, 10082,  6554},
+    {  6554,  4194,  6554,  4194}},
+  {
+    {  9362,  5825,  9362,  5825},
+    {  5825,  3647,  5825,  3647},
+    {  9362,  5825,  9362,  5825},
+    {  5825,  3647,  5825,  3647}},
+  {
+    {  8192,  5243,  8192,  5243},
+    {  5243,  3355,  5243,  3355},
+    {  8192,  5243,  8192,  5243},
+    {  5243,  3355,  5243,  3355}},
+  {
+    {  7282,  4559,  7282,  4559},
+    {  4559,  2893,  4559,  2893},
+    {  7282,  4559,  7282,  4559},
+    {  4559,  2893,  4559,  2893}}
+};
+
+// SP decoding parameter (EQ. 8-425); the values are identical to dequant_coef[4]. NOTE(review): one-letter file-scope name in a header.
+static const int A[4][4] = {
+  { 16, 20, 16, 20},
+  { 20, 25, 20, 25},
+  { 16, 20, 16, 20},
+  { 20, 25, 20, 25}
+};
+
+// exported functions. NOTE(review): quant.h includes no other header, so VideoParameters and Slice must be declared by the includer first.
+// quantization setup / teardown for a VideoParameters instance (pairing inferred from the names)
+extern void init_qp_process (VideoParameters *p_Vid);
+extern void free_qp_matrices(VideoParameters *p_Vid);
+
+// For Q-matrix: per-slice helpers (what they write into the Slice is not visible in this header)
+extern void assign_quant_params   (Slice *currslice);
+extern void CalculateQuant4x4Param(Slice *currslice);
+
+
+#endif
+
diff --git a/Src/h264dec/ldecod/inc/sei.h b/Src/h264dec/ldecod/inc/sei.h
new file mode 100644
index 00000000..943428fd
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/sei.h
@@ -0,0 +1,100 @@
+
+/*!
+ *************************************************************************************
+ * \file sei.h
+ *
+ * \brief
+ *    Prototypes for sei.c
+ *************************************************************************************
+ */
+
+#ifndef SEI_H
+#define SEI_H
+
+typedef enum {   // SEI message kinds, numbered consecutively from 0
+  SEI_BUFFERING_PERIOD = 0,
+  SEI_PIC_TIMING,
+  SEI_PAN_SCAN_RECT,
+  SEI_FILLER_PAYLOAD,
+  SEI_USER_DATA_REGISTERED_ITU_T_T35,
+  SEI_USER_DATA_UNREGISTERED,
+  SEI_RECOVERY_POINT,
+  SEI_DEC_REF_PIC_MARKING_REPETITION,
+  SEI_SPARE_PIC,
+  SEI_SCENE_INFO,
+  SEI_SUB_SEQ_INFO,
+  SEI_SUB_SEQ_LAYER_CHARACTERISTICS,
+  SEI_SUB_SEQ_CHARACTERISTICS,
+  SEI_FULL_FRAME_FREEZE,
+  SEI_FULL_FRAME_FREEZE_RELEASE,
+  SEI_FULL_FRAME_SNAPSHOT,
+  SEI_PROGRESSIVE_REFINEMENT_SEGMENT_START,
+  SEI_PROGRESSIVE_REFINEMENT_SEGMENT_END,
+  SEI_MOTION_CONSTRAINED_SLICE_GROUP_SET,
+  SEI_FILM_GRAIN_CHARACTERISTICS,
+  SEI_DEBLOCKING_FILTER_DISPLAY_PREFERENCE,
+  SEI_STEREO_VIDEO_INFO,
+  SEI_POST_FILTER_HINTS,
+  SEI_TONE_MAPPING,
+
+  SEI_MAX_ELEMENTS  //!< count of the message kinds above (evaluates to 24); not a message kind itself
+} SEI_type;
+
+#define MAX_FN 256
+// tone mapping information
+#define MAX_CODED_BIT_DEPTH  12
+#define MAX_SEI_BIT_DEPTH    12
+#define MAX_NUM_PIVOTS     (1<<MAX_CODED_BIT_DEPTH)   // 4096
+
+#if (ENABLE_OUTPUT_TONEMAPPING)   // the struct below is only declared when this macro is non-zero
+typedef struct tone_mapping_struct_s
+{
+  Boolean seiHasTone_mapping;
+  unsigned int  tone_map_repetition_period;
+  unsigned char coded_data_bit_depth;
+  unsigned char sei_bit_depth;
+  unsigned int  model_id;
+  unsigned int count;
+  
+  imgpel lut[1<<MAX_CODED_BIT_DEPTH];                 //!< look up table for mapping the coded data value to output data value (4096 entries)
+
+  Bitstream *data;
+  int payloadSize;
+} ToneMappingSEI;
+
+#endif
+
+void InterpretSEIMessage(byte* msg, int size, VideoParameters *p_Vid);   // top-level entry; presumably dispatches to the interpret_* handlers below — confirm in sei.c
+void interpret_spare_pic( byte* payload, int size, VideoParameters *p_Vid );   // every interpret_* handler shares this (payload, size, p_Vid) signature
+void interpret_subsequence_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_subsequence_layer_characteristics_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_subsequence_characteristics_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_scene_information( byte* payload, int size, VideoParameters *p_Vid ); // JVT-D099
+void interpret_user_data_registered_itu_t_t35_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_user_data_unregistered_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_pan_scan_rect_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_recovery_point_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_filler_payload_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_dec_ref_pic_marking_repetition_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_full_frame_freeze_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_full_frame_freeze_release_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_full_frame_snapshot_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_progressive_refinement_start_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_progressive_refinement_end_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_motion_constrained_slice_group_set_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_reserved_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_buffering_period_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_picture_timing_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_film_grain_characteristics_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_deblocking_filter_display_preference_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_stereo_video_info_info( byte* payload, int size, VideoParameters *p_Vid );
+void interpret_post_filter_hints_info( byte* payload, int size, VideoParameters *p_Vid );
+// functions for tone mapping SEI message (the handler is always declared; the helpers further down only when ENABLE_OUTPUT_TONEMAPPING is non-zero)
+void interpret_tone_mapping( byte* payload, int size, VideoParameters *p_Vid );
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+void tone_map(imgpel** imgX, imgpel* lut, int size_x, int size_y);
+void init_tone_mapping_sei(ToneMappingSEI *seiToneMapping);
+void update_tone_mapping_sei(ToneMappingSEI *seiToneMapping);
+#endif
+#endif
diff --git a/Src/h264dec/ldecod/inc/transform8x8.h b/Src/h264dec/ldecod/inc/transform8x8.h
new file mode 100644
index 00000000..88cfafee
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/transform8x8.h
@@ -0,0 +1,24 @@
+/*!
+ ***************************************************************************
+ *
+ * \file transform8x8.h
+ *
+ * \brief
+ *    prototypes of 8x8 transform functions
+ *
+ * \date
+ *    9. October 2003
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Yuri Vatis
+ **************************************************************************/
+
+#ifndef _TRANSFORM8X8_H_
+#define _TRANSFORM8X8_H_
+
+extern void itrans8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x); // all four variants share this signature; only mb_rec is non-const
+extern void itrans8x8_sse2(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x); // presumably the SSE2 implementation, per the suffix
+extern void itrans8x8_c(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x); // presumably the portable C implementation, per the suffix
+extern void itrans8x8_lossless(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x); // presumably the lossless-mode variant, per the suffix
+#endif
diff --git a/Src/h264dec/ldecod/inc/vlc.h b/Src/h264dec/ldecod/inc/vlc.h
new file mode 100644
index 00000000..9a75ad3a
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/vlc.h
@@ -0,0 +1,122 @@
+
+/*!
+ ************************************************************************
+ * \file vlc.h
+ *
+ * \brief
+ *    header for (CA)VLC coding functions
+ *
+ * \author
+ *    Karsten Suehring
+ *
+ ************************************************************************
+ */
+
+#ifndef _VLC_H_
+#define _VLC_H_
+
+#include <bfc/platform/types.h>
+
+//! gives CBP value from codeword number, both for intra and inter: NCBP[table][codeword][k]; k presumably selects intra vs. inter — confirm in the linfo_cbp_* routines
+static const byte NCBP[2][48][2]=
+{
+  {  // 0      1        2       3       4       5       6       7       8       9      10      11
+    {15, 0},{ 0, 1},{ 7, 2},{11, 4},{13, 8},{14, 3},{ 3, 5},{ 5,10},{10,12},{12,15},{ 1, 7},{ 2,11},
+    { 4,13},{ 8,14},{ 6, 6},{ 9, 9},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},
+    { 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},
+    { 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0},{ 0, 0}
+  },   // first table: only codewords 0..15 carry values, entries 16..47 are {0,0}
+  {   // second table: entries for all 48 codewords
+    {47, 0},{31,16},{15, 1},{ 0, 2},{23, 4},{27, 8},{29,32},{30, 3},{ 7, 5},{11,10},{13,12},{14,15},
+    {39,47},{43, 7},{45,11},{46,13},{16,14},{ 3, 6},{ 5, 9},{10,31},{12,35},{19,37},{21,42},{26,44},
+    {28,33},{35,34},{37,36},{42,40},{44,39},{ 1,43},{ 2,45},{ 4,46},{ 8,17},{17,18},{18,20},{20,24},
+    {24,19},{ 6,21},{ 9,26},{22,28},{25,23},{32,27},{33,29},{34,30},{36,22},{40,25},{38,38},{41,41}
+  }
+};
+
+//! for the linfo_levrun_inter routine: 4 rows of 8 two-byte pairs (pair meaning presumably {level, run} — confirm in that routine)
+static const byte NTAB1[4][8][2] =
+{
+  {{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+  {{1,1},{1,2},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+  {{2,0},{1,3},{1,4},{1,5},{0,0},{0,0},{0,0},{0,0}},
+  {{3,0},{2,1},{2,2},{1,6},{1,7},{1,8},{1,9},{4,0}},
+};
+
+static const byte LEVRUN1[16]=   // 16-entry companion table to NTAB1 (its index meaning is not visible in this header)
+{
+  4,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,
+};
+
+
+static const byte NTAB2[4][8][2] =   // same [4][8][2] shape as NTAB1; which routine consumes it is not stated in this header
+{
+  {{1,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+  {{1,1},{2,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0}},
+  {{1,2},{3,0},{4,0},{5,0},{0,0},{0,0},{0,0},{0,0}},
+  {{1,3},{1,4},{2,1},{3,1},{6,0},{7,0},{8,0},{9,0}},
+};
+
+//! for the linfo_levrun_c2x2 routine
+static const byte LEVRUN3[4] =
+{
+  2,1,0,0
+};
+
+static const byte NTAB3[2][2][2] =   // 2x2 table of two-byte pairs; presumably paired with LEVRUN3 for linfo_levrun_c2x2 — confirm in vlc.c
+{
+  {{1,0},{0,0}},
+  {{2,0},{1,1}},
+};
+
+extern int se_v (const char *tracestring, Bitstream *bitstream);   // se_v/ue_v/u_1/u_v/i_v: bitstream readers; names presumably mirror the se(v)/ue(v)/u(1)/u(v)/i(v) syntax descriptors
+extern int ue_v (const char *tracestring, Bitstream *bitstream);
+extern Boolean u_1 (const char *tracestring, Bitstream *bitstream);
+extern int u_v (int LenInBits, const char *tracestring, Bitstream *bitstream);
+extern int i_v (int LenInBits, const char *tracestring, Bitstream *bitstream);
+
+// CAVLC mapping
+extern void linfo_ue(int len, int info, int *value1, int *dummy);
+extern void linfo_se(int len, int info, int *value1, int *dummy);
+// CBP mappings: same (len, info, out, dummy) argument shape as linfo_ue / linfo_se
+extern void linfo_cbp_intra_normal(int len,int info,int *cbp, int *dummy);
+extern void linfo_cbp_inter_normal(int len,int info,int *cbp, int *dummy);
+extern void linfo_cbp_intra_other(int len,int info,int *cbp, int *dummy);
+extern void linfo_cbp_inter_other(int len,int info,int *cbp, int *dummy);
+// (level, run) mappings: these use both output pointers
+extern void linfo_levrun_inter(int len,int info,int *level,int *irun);
+extern void linfo_levrun_c2x2(int len,int info,int *level,int *irun);
+
+extern int  uvlc_startcode_follows(Slice *currSlice, int dummy);
+
+extern int  readSyntaxElement_VLC (SyntaxElement *sym, Bitstream *currStream);
+extern int  readSyntaxElement_UVLC(SyntaxElement *, struct datapartition *);   // parameter names are omitted in this prototype only
+extern int  readSyntaxElement_Intra4x4PredictionMode(SyntaxElement *sym, Bitstream   *currStream);
+// Raw-buffer helpers: operate on a byte buffer plus a bit offset instead of a Bitstream object
+extern int  GetVLCSymbol (const uint8_t buffer[],int totbitoffset,int *info, int bytecount);
+extern int  GetVLCSymbol_IntraMode (const uint8_t buffer[],int totbitoffset,int *info, int bytecount);
+
+extern int readSyntaxElement_FLC                         (Bitstream *currStream, int numbits);
+extern int readSyntaxElement_NumCoeffTrailingOnes        (SyntaxElement *sym,  Bitstream *currStream, int vlcnum);
+extern int readSyntaxElement_NumCoeffTrailingOnesChromaDC(VideoParameters *p_Vid, SyntaxElement *sym, Bitstream *currStream);
+extern int readSyntaxElement_Level_VLC0                  (Bitstream *currStream);
+extern int readSyntaxElement_Level_VLCN                  (int vlc, Bitstream *currStream);
+extern int readSyntaxElement_TotalZeros                  (Bitstream *currStream, int vlcnum);
+extern int readSyntaxElement_TotalZerosChromaDC          (VideoParameters *p_Vid, Bitstream *currStream, int vlcnum);
+extern int readSyntaxElement_Run                         (Bitstream *currStream, int vlcnum);
+extern int GetBits  (const uint8_t buffer[],int totbitoffset,int *info, int bitcount, int numbits);
+
+
+extern int more_rbsp_data (const uint8_t buffer[],int totbitoffset,int bytecount);
+// Table-driven code readers: each comes as a _c and an _sse2 variant with identical signatures.
+int code_from_bitstream_2d_17_4_c(SyntaxElement *sym, Bitstream *currStream, const uint16_t *lentab, const uint16_t *codtab, const uint16_t *masktab);
+int code_from_bitstream_2d_17_4_sse2(SyntaxElement *sym, Bitstream *currStream, const uint16_t *lentab, const uint16_t *codtab, const uint16_t *masktab);
+
+int code_from_bitstream_2d_5_4_c(SyntaxElement *sym, Bitstream *currStream, const uint8_t *lentab, const uint8_t *codtab, const uint8_t *masktab);
+int code_from_bitstream_2d_5_4_sse2(SyntaxElement *sym, Bitstream *currStream, const uint8_t *lentab, const uint8_t *codtab, const uint8_t *masktab);
+
+int code_from_bitstream_2d_16_1_c(Bitstream *currStream, const uint16_t *lentab,	const uint16_t *codtab, const uint16_t *masktab);
+int code_from_bitstream_2d_16_1_sse2(Bitstream *currStream, const uint16_t *lentab,	const uint16_t *codtab, const uint16_t *masktab);
+
+#endif
+
diff --git a/Src/h264dec/ldecod/src/biari.asm b/Src/h264dec/ldecod/src/biari.asm
new file mode 100644
index 00000000..ca5f3d6d
--- /dev/null
+++ b/Src/h264dec/ldecod/src/biari.asm
@@ -0,0 +1,2540 @@
+.686                ; enable the Pentium Pro (P6) instruction set
+.XMM                ; enable SSE (XMM register) instructions
+.model FLAT         ; 32-bit flat memory model
+; Hard-coded byte offsets of C struct members, named <member>@<struct>; they have to be kept in sync with the C struct layouts by hand.
+; Slice
+tex_ctx@Slice = 100
+coeff@Slice = 15632
+coeff_ctr@Slice = 15760
+pos@Slice = 15764
+last_dquant@Slice = 88
+mot_ctx@Slice = 96
+slice_type@Slice = 64
+
+; VideoParameters
+structure@VideoParameters = 697200
+dec_picture@VideoParameters = 698192
+bitdepth_chroma_qp_scale@VideoParameters = 697456
+
+; Macroblock (p_Slice, p_Vid and qp are asserted with BUILD_BUG_ON/offsetof in H264_Init, dec_api.c)
+p_Slice@Macroblock = 0
+p_Vid@Macroblock = 4
+qp@macroblock = 60           ; NOTE(review): lowercase 'macroblock', unlike the sibling equates — matters if symbols are assembled case-sensitively
+qp_scaled@Macroblock = 72
+mb_field@Macroblock = 344
+read_and_store_CBP_block_bit@Macroblock = 400
+
+; StorablePicture
+structure@StorablePicture = 0
+chroma_qp_offset@StorablePicture = 158688
+
+; TextureInfoContexts (all four are asserted with BUILD_BUG_ON/offsetof in H264_Init, dec_api.c)
+map_contexts@TextureInfoContexts = 436
+last_contexts@TextureInfoContexts = 3252
+one_contexts@TextureInfoContexts = 6068
+abs_contexts@TextureInfoContexts = 6508
+
+_DATA	SEGMENT                          ; three pointer tables of 22 dwords each; the tables they point to are defined outside this view
+_pos2ctx_map DD	FLAT:_pos2ctx_map4x4   ; 22 slots; presumably one per CABAC block type — confirm against the C definition
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map8x8
+	DD	FLAT:_pos2ctx_map8x4
+	DD	FLAT:_pos2ctx_map8x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map2x4c
+	DD	FLAT:_pos2ctx_map4x4c
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map8x8
+	DD	FLAT:_pos2ctx_map8x4
+	DD	FLAT:_pos2ctx_map8x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map8x8
+	DD	FLAT:_pos2ctx_map8x4
+	DD	FLAT:_pos2ctx_map8x4
+	DD	FLAT:_pos2ctx_map4x4
+_pos2ctx_map_int DD FLAT:_pos2ctx_map4x4   ; 22 slots, same layout as _pos2ctx_map but pointing at the ...i tables for the 8x8/8x4/4x8 slots
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map8x8i
+	DD	FLAT:_pos2ctx_map8x4i
+	DD	FLAT:_pos2ctx_map4x8i              ; NOTE(review): only this slot uses the 4x8i table; the matching slots further down use 8x4i — confirm intentional
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map2x4c
+	DD	FLAT:_pos2ctx_map4x4c
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map8x8i
+	DD	FLAT:_pos2ctx_map8x4i
+	DD	FLAT:_pos2ctx_map8x4i
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map4x4
+	DD	FLAT:_pos2ctx_map8x8i
+	DD	FLAT:_pos2ctx_map8x4i
+	DD	FLAT:_pos2ctx_map8x4i
+	DD	FLAT:_pos2ctx_map4x4
+_pos2ctx_last DD FLAT:_pos2ctx_last4x4   ; 22 slots, same slot pattern as _pos2ctx_map but pointing at the _pos2ctx_last* tables
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last8x8
+	DD	FLAT:_pos2ctx_last8x4
+	DD	FLAT:_pos2ctx_last8x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last2x4c
+	DD	FLAT:_pos2ctx_last4x4c
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last8x8
+	DD	FLAT:_pos2ctx_last8x4
+	DD	FLAT:_pos2ctx_last8x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last4x4
+	DD	FLAT:_pos2ctx_last8x8
+	DD	FLAT:_pos2ctx_last8x4
+	DD	FLAT:_pos2ctx_last8x4
+	DD	FLAT:_pos2ctx_last4x4
+_DATA	ENDS
+
+CONST SEGMENT
+_rLPS_table_64x4 DB 080H
+	DB	080H
+	DB	080H
+	DB	07bH
+	DB	074H
+	DB	06fH
+	DB	069H
+	DB	064H
+	DB	05fH
+	DB	05aH
+	DB	055H
+	DB	051H
+	DB	04dH
+	DB	049H
+	DB	045H
+	DB	042H
+	DB	03eH
+	DB	03bH
+	DB	038H
+	DB	035H
+	DB	033H
+	DB	030H
+	DB	02eH
+	DB	02bH
+	DB	029H
+	DB	027H
+	DB	025H
+	DB	023H
+	DB	021H
+	DB	020H
+	DB	01eH
+	DB	01dH
+	DB	01bH
+	DB	01aH
+	DB	018H
+	DB	017H
+	DB	016H
+	DB	015H
+	DB	014H
+	DB	013H
+	DB	012H
+	DB	011H
+	DB	010H
+	DB	0fH
+	DB	0eH
+	DB	0eH
+	DB	0dH
+	DB	0cH
+	DB	0cH
+	DB	0bH
+	DB	0bH
+	DB	0aH
+	DB	0aH
+	DB	09H
+	DB	09H
+	DB	08H
+	DB	08H
+	DB	07H
+	DB	07H
+	DB	07H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	02H
+	DB	0b0H
+	DB	0a7H
+	DB	09eH
+	DB	096H
+	DB	08eH
+	DB	087H
+	DB	080H
+	DB	07aH
+	DB	074H
+	DB	06eH
+	DB	068H
+	DB	063H
+	DB	05eH
+	DB	059H
+	DB	055H
+	DB	050H
+	DB	04cH
+	DB	048H
+	DB	045H
+	DB	041H
+	DB	03eH
+	DB	03bH
+	DB	038H
+	DB	035H
+	DB	032H
+	DB	030H
+	DB	02dH
+	DB	02bH
+	DB	029H
+	DB	027H
+	DB	025H
+	DB	023H
+	DB	021H
+	DB	01fH
+	DB	01eH
+	DB	01cH
+	DB	01bH
+	DB	01aH
+	DB	018H
+	DB	017H
+	DB	016H
+	DB	015H
+	DB	014H
+	DB	013H
+	DB	012H
+	DB	011H
+	DB	010H
+	DB	0fH
+	DB	0eH
+	DB	0eH
+	DB	0dH
+	DB	0cH
+	DB	0cH
+	DB	0bH
+	DB	0bH
+	DB	0aH
+	DB	09H
+	DB	09H
+	DB	09H
+	DB	08H
+	DB	08H
+	DB	07H
+	DB	07H
+	DB	02H
+	DB	0d0H
+	DB	0c5H
+	DB	0bbH
+	DB	0b2H
+	DB	0a9H
+	DB	0a0H
+	DB	098H
+	DB	090H
+	DB	089H
+	DB	082H
+	DB	07bH
+	DB	075H
+	DB	06fH
+	DB	069H
+	DB	064H
+	DB	05fH
+	DB	05aH
+	DB	056H
+	DB	051H
+	DB	04dH
+	DB	049H
+	DB	045H
+	DB	042H
+	DB	03fH
+	DB	03bH
+	DB	038H
+	DB	036H
+	DB	033H
+	DB	030H
+	DB	02eH
+	DB	02bH
+	DB	029H
+	DB	027H
+	DB	025H
+	DB	023H
+	DB	021H
+	DB	020H
+	DB	01eH
+	DB	01dH
+	DB	01bH
+	DB	01aH
+	DB	019H
+	DB	017H
+	DB	016H
+	DB	015H
+	DB	014H
+	DB	013H
+	DB	012H
+	DB	011H
+	DB	010H
+	DB	0fH
+	DB	0fH
+	DB	0eH
+	DB	0dH
+	DB	0cH
+	DB	0cH
+	DB	0bH
+	DB	0bH
+	DB	0aH
+	DB	0aH
+	DB	09H
+	DB	09H
+	DB	08H
+	DB	02H
+	DB	0f0H
+	DB	0e3H
+	DB	0d8H
+	DB	0cdH
+	DB	0c3H
+	DB	0b9H
+	DB	0afH
+	DB	0a6H
+	DB	09eH
+	DB	096H
+	DB	08eH
+	DB	087H
+	DB	080H
+	DB	07aH
+	DB	074H
+	DB	06eH
+	DB	068H
+	DB	063H
+	DB	05eH
+	DB	059H
+	DB	055H
+	DB	050H
+	DB	04cH
+	DB	048H
+	DB	045H
+	DB	041H
+	DB	03eH
+	DB	03bH
+	DB	038H
+	DB	035H
+	DB	032H
+	DB	030H
+	DB	02dH
+	DB	02bH
+	DB	029H
+	DB	027H
+	DB	025H
+	DB	023H
+	DB	021H
+	DB	01fH
+	DB	01eH
+	DB	01cH
+	DB	01bH
+	DB	019H
+	DB	018H
+	DB	017H
+	DB	016H
+	DB	015H
+	DB	014H
+	DB	013H
+	DB	012H
+	DB	011H
+	DB	010H
+	DB	0fH
+	DB	0eH
+	DB	0eH
+	DB	0dH
+	DB	0cH
+	DB	0cH
+	DB	0bH
+	DB	0bH
+	DB	0aH
+	DB	09H
+	DB	02H
+_AC_next_state_MPS_64 DB 01H
+ DB 02H
+ DB 03H
+ DB 04H
+ DB 05H
+ DB 06H
+ DB 07H
+ DB 08H
+ DB 09H
+ DB 0aH
+ DB 0bH
+ DB 0cH
+ DB 0dH
+ DB 0eH
+ DB 0fH
+ DB 010H
+ DB 011H
+ DB 012H
+ DB 013H
+ DB 014H
+ DB 015H
+ DB 016H
+ DB 017H
+ DB 018H
+ DB 019H
+ DB 01aH
+ DB 01bH
+ DB 01cH
+ DB 01dH
+ DB 01eH
+ DB 01fH
+ DB 020H
+ DB 021H
+ DB 022H
+ DB 023H
+ DB 024H
+ DB 025H
+ DB 026H
+ DB 027H
+ DB 028H
+ DB 029H
+ DB 02aH
+ DB 02bH
+ DB 02cH
+ DB 02dH
+ DB 02eH
+ DB 02fH
+ DB 030H
+ DB 031H
+ DB 032H
+ DB 033H
+ DB 034H
+ DB 035H
+ DB 036H
+ DB 037H
+ DB 038H
+ DB 039H
+ DB 03aH
+ DB 03bH
+ DB 03cH
+ DB 03dH
+ DB 03eH
+ DB 03eH
+ DB 03fH
+_AC_next_state_LPS_64 DB 00H
+ DB 00H
+ DB 01H
+ DB 02H
+ DB 02H
+ DB 04H
+ DB 04H
+ DB 05H
+ DB 06H
+ DB 07H
+ DB 08H
+ DB 09H
+ DB 09H
+ DB 0bH
+ DB 0bH
+ DB 0cH
+ DB 0dH
+ DB 0dH
+ DB 0fH
+ DB 0fH
+ DB 010H
+ DB 010H
+ DB 012H
+ DB 012H
+ DB 013H
+ DB 013H
+ DB 015H
+ DB 015H
+ DB 016H
+ DB 016H
+ DB 017H
+ DB 018H
+ DB 018H
+ DB 019H
+ DB 01aH
+ DB 01aH
+ DB 01bH
+ DB 01bH
+ DB 01cH
+ DB 01dH
+ DB 01dH
+ DB 01eH
+ DB 01eH
+ DB 01eH
+ DB 01fH
+ DB 020H
+ DB 020H
+ DB 021H
+ DB 021H
+ DB 021H
+ DB 022H
+ DB 022H
+ DB 023H
+ DB 023H
+ DB 023H
+ DB 024H
+ DB 024H
+ DB 024H
+ DB 025H
+ DB 025H
+ DB 025H
+ DB 026H
+ DB 026H
+ DB 03fH
+_renorm_table_32 DB 06H
+ DB 05H
+ DB 04H
+ DB 04H
+ DB 03H
+ DB 03H
+ DB 03H
+ DB 03H
+ DB 02H
+ DB 02H
+ DB 02H
+ DB 02H
+ DB 02H
+ DB 02H
+ DB 02H
+ DB 02H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ DB 01H
+ _renorm_table_256 DB 06H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+_maxpos	DB	0fH
+	DB	0eH
+	DB	03fH
+	DB	01fH
+	DB	01fH
+	DB	0fH
+	DB	03H
+	DB	0eH
+	DB	07H
+	DB	0fH
+	DB	0fH
+	DB	0eH
+	DB	03fH
+	DB	01fH
+	DB	01fH
+	DB	0fH
+	DB	0fH
+	DB	0eH
+	DB	03fH
+	DB	01fH
+	DB	01fH
+	DB	0fH
+	ORG $+2
+_c1isdc	DB	01H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	ORG $+2
+_type2ctx_bcbp DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	05H
+	DB	05H
+	DB	0aH
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0dH
+	DB	0eH
+	DB	010H
+	DB	011H
+	DB	012H
+	DB	013H
+	DB	013H
+	DB	014H
+	ORG $+2
+_type2ctx_map DW 00H
+DW 010H
+DW 020H
+DW 030H
+DW 040H
+DW 050H
+DW 060H
+DW 070H
+DW 060H
+DW 060H
+DW 0A0H
+DW 0B0H
+DW 0C0H
+DW 0D0H
+DW 0E0H
+DW 0F0H
+DW 0100H
+DW 0110H
+DW 0120H
+DW 0130H
+DW 0140H
+DW 0150H
+	ORG $+2
+_type2ctx_last DW 00H
+DW 010H
+DW 020H
+DW 030H
+DW 040H
+DW 050H
+DW 060H
+DW 070H
+DW 060H
+DW 060H
+DW 0A0H
+DW 0B0H
+DW 0C0H
+DW 0D0H
+DW 0E0H
+DW 0F0H
+DW 0100H
+DW 0110H
+DW 0120H
+DW 0130H
+DW 0140H
+DW 0150H
+	ORG $+2
+_type2ctx_one DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	05H
+	DB	05H
+	DB	0aH
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0dH
+	DB	0eH
+	DB	010H
+	DB	011H
+	DB	012H
+	DB	013H
+	DB	013H
+	DB	014H
+	ORG $+2
+_type2ctx_abs DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	05H
+	DB	05H
+	DB	0aH
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0dH
+	DB	0eH
+	DB	010H
+	DB	011H
+	DB	012H
+	DB	013H
+	DB	013H
+	DB	014H
+	ORG $+2
+plus_one_clip4 DD 1,2,3,4,4 ; successor table: entry i holds i+1 capped at 4; passed as MaxC2 to SigCoefFunction, where c2 = MaxC2[c2]
+plus_one_clip3 DD 1,2,3,3 ; successor table: entry i holds i+1 capped at 3; passed as MaxC2 by read_significant_coefficients6 and 8
+_max_c2	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip3
+	DD	plus_one_clip4
+	DD	plus_one_clip3
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	DD	plus_one_clip4
+	ORG $+6
+_pos2ctx_map8x8 DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	05H
+	DB	04H
+	DB	04H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	05H
+	DB	05H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	03H
+	DB	03H
+	DB	06H
+	DB	07H
+	DB	07H
+	DB	07H
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	09H
+	DB	08H
+	DB	07H
+	DB	07H
+	DB	06H
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0bH
+	DB	06H
+	DB	07H
+	DB	08H
+	DB	09H
+	DB	0eH
+	DB	0aH
+	DB	09H
+	DB	08H
+	DB	06H
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0bH
+	DB	06H
+	DB	09H
+	DB	0eH
+	DB	0aH
+	DB	09H
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0bH
+	DB	0eH
+	DB	0aH
+	DB	0cH
+	DB	0eH
+_pos2ctx_map8x4 DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	07H
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	0bH
+	DB	09H
+	DB	08H
+	DB	06H
+	DB	07H
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	0bH
+	DB	09H
+	DB	08H
+	DB	06H
+	DB	0cH
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	0bH
+	DB	09H
+	DB	0dH
+	DB	0dH
+	DB	0eH
+	DB	0eH
+_pos2ctx_map4x4 DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	07H
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0eH
+	DB	0eH
+_pos2ctx_map2x4c DB 00H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+_pos2ctx_map4x4c DB 00H
+	DB	00H
+	DB	00H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+_pos2ctx_map8x8i DB 00H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	07H
+	DB	07H
+	DB	07H
+	DB	08H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	08H
+	DB	0bH
+	DB	0cH
+	DB	0bH
+	DB	09H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	08H
+	DB	0bH
+	DB	0cH
+	DB	0bH
+	DB	09H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	08H
+	DB	0bH
+	DB	0cH
+	DB	0bH
+	DB	09H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	08H
+	DB	0dH
+	DB	0dH
+	DB	09H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	08H
+	DB	0dH
+	DB	0dH
+	DB	09H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	0eH
+	DB	0eH
+	DB	0eH
+	DB	0eH
+	DB	0eH
+	DB	0eH
+_pos2ctx_map8x4i DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	03H
+	DB	04H
+	DB	07H
+	DB	06H
+	DB	08H
+	DB	09H
+	DB	07H
+	DB	06H
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	0bH
+	DB	0cH
+	DB	0cH
+	DB	0aH
+	DB	0bH
+	DB	0dH
+	DB	0dH
+	DB	0eH
+	DB	0eH
+	DB	0eH
+_pos2ctx_map4x8i DB 00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	02H
+	DB	07H
+	DB	07H
+	DB	08H
+	DB	08H
+	DB	08H
+	DB	05H
+	DB	06H
+	DB	09H
+	DB	0aH
+	DB	0aH
+	DB	0bH
+	DB	0bH
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0dH
+	DB	0eH
+	DB	0eH
+	DB	0eH
+_pos2ctx_last8x8 DB 00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	05H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	06H
+	DB	07H
+	DB	07H
+	DB	07H
+	DB	07H
+	DB	08H
+	DB	08H
+	DB	08H
+	DB	08H
+_pos2ctx_last8x4 DB 00H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	03H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	04H
+	DB	05H
+	DB	05H
+	DB	06H
+	DB	06H
+	DB	07H
+	DB	07H
+	DB	08H
+	DB	08H
+_pos2ctx_last4x4 DB 00H
+	DB	01H
+	DB	02H
+	DB	03H
+	DB	04H
+	DB	05H
+	DB	06H
+	DB	07H
+	DB	08H
+	DB	09H
+	DB	0aH
+	DB	0bH
+	DB	0cH
+	DB	0dH
+	DB	0eH
+	DB	0fH
+_pos2ctx_last2x4c DB 00H
+	DB	00H
+	DB	01H
+	DB	01H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+	DB	02H
+_pos2ctx_last4x4c DB 00, 00, 00, 00, 01, 01, 01, 01, 02, 02, 02, 02, 02, 02, 02, 02
+plus_one_clip0_4 DD 0,2,3,4,4 ; c1 transition used by SigCoefFunction: 0 stays 0, any other value becomes value+1 capped at 4
+
+align 16
+_QP_SCALE_CR DD	00H
+	DD	01H
+	DD	02H
+	DD	03H
+	DD	04H
+	DD	05H
+	DD	06H
+	DD	07H
+	DD	08H
+	DD	09H
+	DD	0aH
+	DD	0bH
+	DD	0cH
+	DD	0dH
+	DD	0eH
+	DD	0fH
+	DD	010H
+	DD	011H
+	DD	012H
+	DD	013H
+	DD	014H
+	DD	015H
+	DD	016H
+	DD	017H
+	DD	018H
+	DD	019H
+	DD	01aH
+	DD	01bH
+	DD	01cH
+	DD	01dH
+	DD	01dH
+	DD	01eH
+	DD	01fH
+	DD	020H
+	DD	020H
+	DD	021H
+	DD	022H
+	DD	022H
+	DD	023H
+	DD	023H
+	DD	024H
+	DD	024H
+	DD	025H
+	DD	025H
+	DD	025H
+	DD	026H
+	DD	026H
+	DD	026H
+	DD	027H
+	DD	027H
+	DD	027H
+	DD	027H
+	align 16
+_51 DD 51
+CONST ENDS
+
+
+PUBLIC _biari_decode_symbol ; stack-argument entry point: decodes one context-coded bin and returns the bit in eax
+_TEXT SEGMENT
+dep = 4      ; esp offset at entry of the 1st argument: pointer to the decoding-engine state
+bi_ct = 8      ; esp offset at entry of the 2nd argument: pointer to the context (WORD state at +0, BYTE bit at +2)
+_biari_decode_symbol PROC
+ STACKOFFSET=0 ; assembly-time count of bytes pushed so far; added to the argument offsets above
+ mov edx, DWORD PTR dep[esp+STACKOFFSET] ; edx = dep
+ STACKOFFSET=STACKOFFSET+4
+ push ebx 
+ mov ebx, DWORD PTR bi_ct[esp+STACKOFFSET] ; ebx = bi_ct
+ movzx eax, WORD PTR [ebx] ; eax = state
+ push ebp 
+ push edi 
+ STACKOFFSET = STACKOFFSET+8
+
+ mov edi, DWORD PTR [edx] ; edi = range
+ mov ecx, edi ; ecx = range
+ and ecx, 0C0H ; ecx = range & 0C0H: bits 6-7 stay in place, so ecx is 0/64/128/192 = byte offset of a 64-entry row
+ movzx ebp, BYTE PTR _rLPS_table_64x4[ecx+eax] ; ebp = rLPS = table[row offset + state]
+ 
+ ; register state:
+ ; eax: state (bi_ct->state)
+ ; ebx: bi_ct
+ ; edx: dep
+ ; edi: range
+ ; ebp: rLPS
+
+ mov ecx, DWORD PTR [edx+8] ; ecx = bitsleft
+ sub edi, ebp ; range -= rLPS
+ shl edi, cl ; range << bitsleft
+ cmp DWORD PTR [edx+4], edi ; value < (range << bitsleft)
+ jge SHORT CABAC@LPS ; signed value >= (range << bitsleft): take the LPS path
+
+ movzx ax, BYTE PTR _AC_next_state_MPS_64[eax] ; ax = new state = AC_next_state_MPS_64[state]
+ shr edi, cl ; undo earlier shift
+ mov WORD PTR [ebx], ax ; store the new state back into the context (WORD at +0)
+ cmp edi, 256    ; range below 256 (00000100H) needs one renormalization shift
+ setb cl ; cl = 1 if range < 256, else 0
+ 
+ ; register state
+ ; eax: state (new)
+ ; ebx: bi_ct
+ ; ecx: renorm shift, 0 or 1 (only cl was written; the upper bits are whatever bitsleft left there)
+ ; edx: dep
+ ; edi: range
+ ; ebp: rLPS
+
+ shl edi, cl
+ sub DWORD PTR [edx+8], ecx ; dep->DbitsLeft -= renorm shift (0 or 1); sets ZF for the jz below
+ mov DWORD PTR [edx], edi ; dep->Drange = range
+ movzx eax, BYTE PTR [ebx+2] ; return value on this path = byte stored at bi_ct+2
+ jz SHORT READ_TWO_BYTES; flags are still those of the sub (mov/movzx leave them alone): if (dep->DbitsLeft==0)
+ 
+ pop edi
+ pop ebp
+ pop ebx
+ ret 0
+align 16
+CABAC@LPS:
+ sub DWORD PTR [edx+4], edi ; value -= (range << bitsleft)
+ movzx cx, BYTE PTR _AC_next_state_LPS_64[eax] ; cx: state = AC_next_state_LPS_64[state]
+ mov WORD PTR [ebx], cx ; store the new state back into the context (WORD at +0)
+
+ ; register state:
+ ; eax: state (old)
+ ; ebx: bi_ct
+ ; ecx: state (new)
+ ; edx: dep
+ ; edi: range << bitsleft (no longer needed)
+ ; ebp: rLPS
+ 
+ mov edi, ebx ; edi = bi_ct, so that bl can hold the flag computed below
+ test eax, eax ; if state(old) == 0
+ movzx ecx, BYTE PTR _renorm_table_256[ebp] ; ecx = renorm shift = renorm_table_256[rLPS]
+ sete bl ; bl = 1  [ if state(old) == 0 ]
+ movzx eax, BYTE PTR [edi+2] ; eax = byte stored at bi_ct+2 ...
+ xor eax, 1 ; ... inverted: this is the return value on the LPS path
+ xor BYTE PTR [edi+2], bl  ; flip the stored byte at bi_ct+2 only when state(old) was 0
+ 
+ ; register state:
+ ; eax: return value (inverse of the byte that was stored at bi_ct+2)
+ ; ebx: no longer bi_ct; bl holds the state(old)==0 flag
+ ; ecx: renorm
+ ; edx: dep
+ ; edi: bi_ct
+ ; ebp: rLPS
+
+ shl ebp, cl ; ebp = range = rLPS <<= renorm
+ sub DWORD PTR [edx+8], ecx ; dep->DbitsLeft -= renorm;
+ mov DWORD PTR [edx], ebp ; dep->Drange = range;
+ jle SHORT READ_TWO_BYTES ; if( dep->DbitsLeft <= 0 )
+
+ ; register state:
+ ; eax: return value
+ ; ebx: scratch
+ ; ecx: renorm
+ ; edx: dep
+ ; edi: bi_ct
+ ; ebp: range = rLPS <<= renorm
+ 
+ pop edi
+ pop ebp
+ pop ebx
+ ret 0
+align 16
+READ_TWO_BYTES:
+
+ ; refill: both the MPS and the LPS path jump here once DbitsLeft has run out
+ ; eax: return value, must not be touched from here on
+ ; edx: dep
+ ; ebx, ecx, edi: scratch (the caller's values are restored by the pops below)
+ ; the two bytes are read at dep->Dcodestrm + *dep->Dcodestrm_len
+ ; NOTE(review): no check against the end of the stream buffer is visible in this routine
+ 
+ mov ebx, DWORD PTR [edx+16] ; ebx = dep->Dcodestrm_len (a pointer to the read position)
+ mov ecx, DWORD PTR [ebx] ; ecx = *dep->Dcodestrm_len
+ lea edi, DWORD PTR [ecx+2] ; edi = *dep->Dcodestrm_len + 2
+ mov DWORD PTR [ebx], edi ; *dep->Dcodestrm_len += 2
+ mov ebx, DWORD PTR [edx+12] ; ebx = dep->Dcodestrm
+ movzx ecx, WORD PTR [ebx+ecx] ; ecx = the next two stream bytes
+ xchg cl, ch ; swap them so the first stream byte becomes the high byte of cx
+ shl DWORD PTR [edx+4], 16 ; value <<= 16
+ mov WORD PTR [edx+4], cx ; low 16 bits of value = the two new bytes
+ 
+ add DWORD PTR [edx+8], 16   ; dep->DbitsLeft += 16
+ ; (eax already holds the return value; nothing to reload here)
+ pop edi
+ pop ebp
+ pop ebx
+ ret 0
+_biari_decode_symbol ENDP
+_TEXT	ENDS
+
+;
+;
+; register-argument variant of biari_decode_symbol, called by the macros and helpers further down
+; in: edx = dep, eax = context pointer.  out: eax = decoded bit; edx still holds dep on exit
+; ebx, ebp and edi are saved and restored; ecx is overwritten; esi is never touched
+_TEXT SEGMENT
+_biari_decode_symbol_map PROC NEAR
+ STACKOFFSET=0
+ push ebx 
+ STACKOFFSET=4
+ movzx ebx, WORD PTR [eax] ; ebx = state
+ push ebp 
+ push edi 
+ STACKOFFSET = 12
+
+ mov edi, DWORD PTR [edx] ; edi = range
+ mov ecx, edi ; ecx = range
+ and ecx, 0C0H ; ecx = range & 0C0H: bits 6-7 stay in place, so ecx is 0/64/128/192 = byte offset of a 64-entry row
+ movzx ebp, BYTE PTR _rLPS_table_64x4[ecx+ebx] ; ebp = rLPS = table[row offset + state]
+ 
+ ; register state:
+ ; ebx: state (bi_ct->state)
+ ; eax: bi_ct
+ ; edx: dep
+ ; edi: range
+ ; ebp: rLPS
+
+ mov ecx, DWORD PTR [edx+8] ; ecx = bitsleft
+ sub edi, ebp ; range -= rLPS
+ shl edi, cl ; range << bitsleft
+ cmp DWORD PTR [edx+4], edi ; value < (range << bitsleft)
+ jge SHORT CABAC_OPT@LPS ; signed value >= (range << bitsleft): take the LPS path
+; MPS path
+ movzx bx, BYTE PTR _AC_next_state_MPS_64[ebx] ; bx = new state = AC_next_state_MPS_64[state]
+ shr edi, cl ; undo earlier shift
+ mov WORD PTR [eax], bx ; store the new state back into the context (WORD at +0)
+ cmp edi, 256    ; range below 256 (00000100H) needs one renormalization shift
+ setb cl ; cl = 1 if range < 256, else 0
+ 
+ ; register state
+ ; ebx: state (new)
+ ; eax: bi_ct
+ ; ecx: renorm shift, 0 or 1 (only cl was written; the upper bits are whatever bitsleft left there)
+ ; edx: dep
+ ; edi: range
+ ; ebp: rLPS
+
+ shl edi, cl
+ sub DWORD PTR [edx+8], ecx ; dep->DbitsLeft -= renorm shift (0 or 1); sets ZF for the jz below
+ mov DWORD PTR [edx], edi ; dep->Drange = range
+ movzx eax, BYTE PTR [eax+2] ; return value on this path = byte stored at bi_ct+2
+ jz SHORT READ_TWO_BYTES ; flags are still those of the sub: if (dep->DbitsLeft==0)
+ 
+ ; register state
+ ; ebx: state (new)
+ ; eax: return value (no longer bi_ct)
+ ; ecx: renorm shift, 0 or 1
+ ; edx: dep
+ ; edi: range
+ ; ebp: rLPS
+ 
+ pop edi
+ pop ebp
+ pop ebx
+ ret 0
+align 16
+CABAC_OPT@LPS:
+ sub DWORD PTR [edx+4], edi ; value -= (range << bitsleft)
+ movzx cx, BYTE PTR _AC_next_state_LPS_64[ebx] ; cx: state = AC_next_state_LPS_64[state]
+ mov WORD PTR [eax], cx ; store the new state back into the context (WORD at +0)
+
+ ; register state:
+ ; ebx: state (old)
+ ; eax: bi_ct
+ ; ecx: state (new)
+ ; edx: dep
+ ; edi: range << bitsleft (no longer needed)
+ ; ebp: rLPS
+ 
+ mov edi, eax ; edi = bi_ct, because eax is about to become the return value
+ movzx eax, BYTE PTR [eax+2] ; eax = byte stored at bi_ct+2 ...
+ xor eax, 1 ; ... inverted: this is the return value on the LPS path
+ test ebx, ebx ; if state(old) == 0
+ movzx ecx, BYTE PTR _renorm_table_256[ebp] ; ecx = renorm shift = renorm_table_256[rLPS]
+ sete bl ; bl = 1  [ if state(old) == 0 ]
+ xor BYTE PTR [edi+2], bl  ; flip the stored byte at bi_ct+2 only when state(old) was 0
+  
+ ; register state:
+ ; ebx: bl holds the state(old)==0 flag
+ ; eax: return value (inverse of the byte that was stored at bi_ct+2)
+ ; ecx: renorm
+ ; edx: dep
+ ; edi: bi_ct
+ ; ebp: rLPS
+
+ shl ebp, cl ; ebp = range = rLPS <<= renorm
+ sub DWORD PTR [edx+8], ecx ; dep->DbitsLeft -= renorm;
+ mov DWORD PTR [edx], ebp ; dep->Drange = range;
+ jle SHORT READ_TWO_BYTES ; if( dep->DbitsLeft <= 0 )
+
+ ; register state:
+ ; ebx: scratch
+ ; eax: return value
+ ; ecx: renorm
+ ; edx: dep
+ ; edi: bi_ct
+ ; ebp: range = rLPS <<= renorm
+ 
+ pop edi
+ pop ebp
+ pop ebx
+ ret 0
+align 16
+READ_TWO_BYTES:
+
+ ; refill: both the MPS and the LPS path jump here once DbitsLeft has run out
+ ; eax: return value, must not be touched from here on
+ ; edx: dep
+ ; ebx, ecx, edi: scratch (the caller's values are restored by the pops below)
+ ; the two bytes are read at dep->Dcodestrm + *dep->Dcodestrm_len
+ ; NOTE(review): no check against the end of the stream buffer is visible in this routine
+ 
+ mov ebx, DWORD PTR [edx+16] ; ebx = dep->Dcodestrm_len
+ mov ecx, DWORD PTR [ebx] ; ecx = *dep->Dcodestrm_len
+ lea edi, DWORD PTR [ecx+2] ; edi = *dep->Dcodestrm_len + 2
+ mov DWORD PTR [ebx], edi ; *dep->Dcodestrm_len += 2
+ mov ebx, DWORD PTR [edx+12] ; ebx = dep->Dcodestrm
+ movzx ecx, WORD PTR [ebx+ecx] ; ecx = the next two stream bytes
+ xchg cl, ch ; swap them so the first stream byte becomes the high byte of cx
+ shl DWORD PTR [edx+4], 16 ; value <<= 16
+ mov WORD PTR [edx+4], cx ; low 16 bits of value = the two new bytes
+ 
+ add DWORD PTR [edx+8], 16   ; dep->DbitsLeft += 16
+ pop edi
+ pop ebp
+ pop ebx
+ ret 0
+
+_biari_decode_symbol_map ENDP
+_TEXT	ENDS
+
+
+; SigCoefFunction: body of every read_significant_coefficients<N> routine; SigMapFunction reaches it with jmp, not call
+; in: ebp = tex_ctx, edx = dep, edi = coeff (WORD array), esi = index of the first coefficient to visit (counts down to 0)
+; MaxC2 = table giving the next c2; TypeCtxOne/TypeCtxAbs = context row numbers; MaxPos is accepted but not referenced in the body
+; for every nonzero coeff[i] the magnitude and the sign are decoded and written back in place
+; ebx and ebp are overwritten (c2 and c1); edi is unchanged; the final pop esi consumes the esi pushed by SigMapFunction
+SigCoefFunction MACRO MaxC2, TypeCtxOne, TypeCtxAbs, MaxPos
+_abs_contexts$ = 28 ; esp-relative scratch slot outside this routine's own frame (safe only because of how the function is reached)
+_one_contexts$ = 32	; esp-relative scratch slot outside this routine's own frame (safe only because of how the function is reached)
+STACKOFFSET=0
+	lea	eax, DWORD PTR [ebp+TypeCtxOne*20+6068] ; 6068 = offsetof(tex_ctx, one_contexts); each row is 20 bytes
+	mov	DWORD PTR _one_contexts$[esp+STACKOFFSET], eax ; one_contexts = tex_ctx->one_contexts[type2ctx_one[type]];
+	; (esi is not pushed here: the copy pushed by SigMapFunction is still on top of the stack)
+	STACKOFFSET=STACKOFFSET+0
+
+	;esi: i (loop variable); its starting value is whatever SigMapFunction left in esi
+	lea	ecx, DWORD PTR [ebp+TypeCtxAbs*20+6508] ; 6508 = offsetof(tex_ctx, abs_contexts); each row is 20 bytes
+	mov	ebp, 1 ; ebp: c1
+	xor	ebx, ebx ; ebx: c2
+	mov	DWORD PTR _abs_contexts$[esp+STACKOFFSET], ecx ; abs_contexts = tex_ctx->abs_contexts[type2ctx_abs[type]];
+SIGN_COEFF@LOOP_AGAIN:
+	cmp	WORD PTR [edi+esi*2], 0 ;if (coeff[i]!=0)
+	je	SHORT SIGN_COEFF@LOOP_ITR
+	mov	ecx, DWORD PTR _one_contexts$[esp+STACKOFFSET]
+	lea	eax, DWORD PTR [ecx+ebp*4] ; eax = one_contexts + c1 (4 bytes per context), taken before c1 is advanced
+	mov	ebp, DWORD PTR plus_one_clip0_4[ebp*4] ; c1 = plus_one_clip0_4[c1];
+	call	_biari_decode_symbol_map ; biari_decode_symbol (dep_dp, one_contexts + c1);
+	test eax, eax
+	jz SHORT SIGN_COEFF@DECODE_EQ_PROB ; bit 0: coeff[i] keeps its current value, go straight to the sign
+	; bit 1: decode the rest of the magnitude with the abs contexts
+	mov	ecx, DWORD PTR _abs_contexts$[esp+STACKOFFSET]
+	lea	eax, DWORD PTR [ecx+ebx*4] ; eax = abs_contexts + c2
+	call _unary_exp_golomb_level_decode ;unary_exp_golomb_level_decode (dep_dp, abs_contexts + c2); destroys ebp
+	inc eax
+	add	WORD PTR [edi+esi*2], ax ; coeff[i] += return val + 1
+	xor	ebp, ebp ; c1 = 0
+	mov	ebx, DWORD PTR MaxC2[ebx*4]	; c2 = MaxC2[c2]
+SIGN_COEFF@DECODE_EQ_PROB:
+	call	_biari_decode_symbol_eq_prob_asm ; biari_decode_symbol_eq_prob(dep_dp); result comes back in SF
+	js	SHORT SIGN_COEFF@LOOP_ITR ; SF set: leave coeff[i] as it is
+	neg WORD PTR [edi+esi*2] ; SF clear: coeff[i] = -coeff[i]
+SIGN_COEFF@LOOP_ITR:
+	sub	esi, 1
+	jns	SHORT SIGN_COEFF@LOOP_AGAIN ; keep going while i >= 0
+	pop	esi ; restores the esi that SigMapFunction pushed
+	ret	0
+ENDM
+
+_TEXT SEGMENT ; one expansion of SigCoefFunction per block type N = 0..21; arguments are MaxC2, TypeCtxOne, TypeCtxAbs, MaxPos
+_read_significant_coefficients0 PROC ; NOTE(review): every expansion defines the same label names; this relies on labels being local to their PROC
+SigCoefFunction plus_one_clip4, 0, 0, 15
+_read_significant_coefficients0 ENDP
+_read_significant_coefficients1 PROC
+SigCoefFunction plus_one_clip4, 1, 1, 14
+_read_significant_coefficients1 ENDP
+_read_significant_coefficients2 PROC
+SigCoefFunction plus_one_clip4, 2, 2, 63
+_read_significant_coefficients2 ENDP
+_read_significant_coefficients3 PROC
+SigCoefFunction plus_one_clip4, 3, 3, 31
+_read_significant_coefficients3 ENDP
+_read_significant_coefficients4 PROC ; same macro arguments as N = 3
+SigCoefFunction plus_one_clip4, 3, 3, 31
+_read_significant_coefficients4 ENDP
+_read_significant_coefficients5 PROC
+SigCoefFunction plus_one_clip4, 4, 4, 15
+_read_significant_coefficients5 ENDP
+_read_significant_coefficients6 PROC ; c2 is capped at 3 for this type
+SigCoefFunction plus_one_clip3, 5, 5, 3
+_read_significant_coefficients6 ENDP
+_read_significant_coefficients7 PROC
+SigCoefFunction plus_one_clip4, 6, 6, 14
+_read_significant_coefficients7 ENDP
+_read_significant_coefficients8 PROC ; c2 is capped at 3 for this type; shares context row 5 with N = 6 and N = 9
+SigCoefFunction plus_one_clip3, 5, 5, 7
+_read_significant_coefficients8 ENDP
+_read_significant_coefficients9 PROC
+SigCoefFunction plus_one_clip4, 5, 5, 15
+_read_significant_coefficients9 ENDP
+_read_significant_coefficients10 PROC
+SigCoefFunction plus_one_clip4, 10, 10, 15
+_read_significant_coefficients10 ENDP
+_read_significant_coefficients11 PROC
+SigCoefFunction plus_one_clip4, 11, 11, 14
+_read_significant_coefficients11 ENDP
+_read_significant_coefficients12 PROC
+SigCoefFunction plus_one_clip4, 12, 12, 63
+_read_significant_coefficients12 ENDP
+_read_significant_coefficients13 PROC
+SigCoefFunction plus_one_clip4, 13, 13, 31
+_read_significant_coefficients13 ENDP
+_read_significant_coefficients14 PROC ; same macro arguments as N = 13
+SigCoefFunction plus_one_clip4, 13, 13, 31
+_read_significant_coefficients14 ENDP
+_read_significant_coefficients15 PROC
+SigCoefFunction plus_one_clip4, 14, 14, 15
+_read_significant_coefficients15 ENDP
+_read_significant_coefficients16 PROC
+SigCoefFunction plus_one_clip4, 16, 16, 15
+_read_significant_coefficients16 ENDP
+_read_significant_coefficients17 PROC
+SigCoefFunction plus_one_clip4, 17, 17, 14
+_read_significant_coefficients17 ENDP
+_read_significant_coefficients18 PROC
+SigCoefFunction plus_one_clip4, 18, 18, 63
+_read_significant_coefficients18 ENDP
+_read_significant_coefficients19 PROC
+SigCoefFunction plus_one_clip4, 19, 19, 31
+_read_significant_coefficients19 ENDP
+_read_significant_coefficients20 PROC ; same macro arguments as N = 19
+SigCoefFunction plus_one_clip4, 19, 19, 31
+_read_significant_coefficients20 ENDP
+_read_significant_coefficients21 PROC
+SigCoefFunction plus_one_clip4, 20, 20, 15
+_read_significant_coefficients21 ENDP
+_TEXT ENDS
+
+
+;
+; SigMapFunction: body of every read_significance_map<N> routine (those are called through sigmap_functions)
+; pass currSlice->coeff in eax and tex_ctx in ecx (in registers; nothing is pushed by the caller)
+; pass currMB in edi, dep in ebp, type in ebx; esi must hold currSlice (it is pushed here and re-read from [esp])
+; decodes significance/last flags for positions 0..MaxPos-1 and writes 0 or 1 into coeff[i]
+; if no "last" flag ends the scan early, coeff[MaxPos] is set to 1 without decoding anything
+; the number of significant coefficients is stored in currSlice->coeff_ctr
+; exit is a jmp to Func with edi = coeff, edx = dep, ebp = tex_ctx, esi = last position written
+; the Func targets used in this file perform the matching pop esi and the ret
+; IsDC = 0 makes both position-to-context lookups start one table entry later
+SigMapFunction MACRO  PosCtxMap, TypeCtxLast, IsDC, MaxPos, PosCtxLast, TypeCtxMap, Func
+last_ctx$ = 24 ; local variable (cheating and using stack space from _readRunLevel_CABAC)
+coeff_ctr$ = 28	 ; local variable (cheating and using stack space from _readRunLevel_CABAC)
+	STACKOFFSET=0
+	mov	edx, DWORD PTR [edi+p_Vid@Macroblock] ; edx: p_Vid
+	push	esi ; kept on the stack until Func pops it; also re-read below through [esp]
+	xor esi, esi ; esi: i = 0 (doubles as the zero source for the cmovz below)
+	STACKOFFSET=STACKOFFSET+4
+	mov edx, DWORD PTR [edx+structure@VideoParameters] ; edx: p_Vid->structure
+	add edx, DWORD PTR [edi+mb_field@Macroblock] ; + currMB->mb_field; ZF ends up set only when the sum is 0
+	mov edi, eax ; edi: coeff
+	mov eax, 1408 ; 16 * 22 * sizeof(BiContextType)
+	cmovz eax, esi ; sum == 0: context offset 0 instead of 1408
+	mov	edx, OFFSET PosCtxMap
+	cmovnz edx, DWORD PTR _pos2ctx_map_int[ebx*4] ; sum != 0: use _pos2ctx_map_int[type] instead of PosCtxMap
+	IF IsDC EQ 0 
+	lea ebx, [edx + 1] ; start one entry into the table
+	ELSE
+	mov ebx, edx ; pos2ctx_Map = (fld) ? pos2ctx_map_int[type] : pos2ctx_map[type];
+	ENDIF
+	mov	edx, ebp 	; edx: dep (the register _biari_decode_symbol_map expects it in)
+	lea ebp, [eax+ecx+TypeCtxMap*64+map_contexts@TextureInfoContexts] ; map_ctx   = tex_ctx->map_contexts[fld][type2ctx_map [type]];
+	lea	ecx, DWORD PTR [eax+ecx+TypeCtxLast*64+last_contexts@TextureInfoContexts]
+	mov	DWORD PTR last_ctx$[esp+STACKOFFSET], ecx ; last_ctx  = tex_ctx->last_contexts[fld][type2ctx_last[type]];
+	mov	DWORD PTR coeff_ctr$[esp+STACKOFFSET], esi; coeff_ctr = 0
+	; fall through into the loop
+
+	; esi: i
+	; ebx: pos2ctx_Map (byte table indexed by i); the loop end is the constant MaxPos
+	; ebp: map_ctx (dep is in edx)
+	; edi: coeff
+; for (i=i0; i < i1; ++i) // if last coeff is reached, it has to be significant
+LOOP_AGAIN:
+
+; --- read significance symbol ---
+; if (biari_decode_symbol   (dep_dp, map_ctx + pos2ctx_Map[i]))
+
+	movzx	eax, BYTE PTR [esi+ebx]	; eax = pos2ctx_Map[i]
+	lea	eax, DWORD PTR [ebp+eax*4] ; eax = map_ctx + pos2ctx_Map[i] (4 bytes per context)
+	call	_biari_decode_symbol_map
+	test	eax, eax
+	mov	WORD PTR [edi+esi*2], ax ; coeff[i] = biari_decode_symbol(); the mov leaves the flags from test intact
+	je	SHORT LOOP_ITR
+
+; --- read last coefficient symbol ---
+; if (biari_decode_symbol (dep_dp, last_ctx + last[i]))
+
+	inc	DWORD PTR coeff_ctr$[esp+STACKOFFSET] ; coeff_ctr++
+	IF IsDC EQ 0 
+      movzx ecx, BYTE PTR PosCtxLast[esi+1]
+	ELSE
+	  movzx ecx, BYTE PTR PosCtxLast[esi]
+	ENDIF
+	mov	eax, DWORD PTR last_ctx$[esp+STACKOFFSET]	
+	lea	eax, DWORD PTR [eax+ecx*4] ; eax = last_ctx + last[i]
+	call	_biari_decode_symbol_map
+	test	eax, eax
+	je	SHORT LOOP_ITR
+	
+	mov	eax, DWORD PTR coeff_ctr$[esp+STACKOFFSET]; return coeff_ctr;
+	mov ecx, DWORD PTR [esp] ; ecx = the esi pushed on entry (currSlice)
+	mov	ebp, DWORD PTR [ecx+tex_ctx@Slice] ; ebp = currSlice->tex_ctx, as Func expects
+	mov	DWORD PTR [ecx+coeff_ctr@Slice], eax ; currSlice->coeff_ctr = return value (read_significance_map)
+	jmp Func
+	align 16
+LOOP_ITR:
+	inc	esi
+	cmp	esi, MaxPos
+	jl	SHORT LOOP_AGAIN
+	mov	eax, DWORD PTR coeff_ctr$[esp+STACKOFFSET]
+	mov	WORD PTR [edi+esi*2], 1 ; scan ran to the end: coeff[MaxPos] is significant by implication
+	inc	eax ; and it counts towards coeff_ctr
+	mov ecx, DWORD PTR [esp] ; ecx = the esi pushed on entry (currSlice)
+	mov	ebp, DWORD PTR [ecx+tex_ctx@Slice] ; ebp = currSlice->tex_ctx, as Func expects
+	mov	DWORD PTR [ecx+coeff_ctr@Slice], eax ; currSlice->coeff_ctr = return value (read_significance_map)
+	jmp Func
+ENDM
+
+_TEXT SEGMENT ; one expansion of SigMapFunction per block type N = 0..21; arguments are PosCtxMap, TypeCtxLast, IsDC, MaxPos, PosCtxLast, TypeCtxMap, Func
+_read_significance_map0 PROC ; each routine ends by jumping to the read_significant_coefficients routine with the same N
+SigMapFunction _pos2ctx_map4x4, 0, 1, 15, _pos2ctx_last4x4, 0, _read_significant_coefficients0
+_read_significance_map0 ENDP
+_read_significance_map1 PROC
+SigMapFunction _pos2ctx_map4x4, 1, 0, 14, _pos2ctx_last4x4, 1, _read_significant_coefficients1
+_read_significance_map1 ENDP
+_read_significance_map2 PROC
+SigMapFunction _pos2ctx_map8x8, 2, 1, 63, _pos2ctx_last8x8, 2, _read_significant_coefficients2
+_read_significance_map2 ENDP
+_read_significance_map3 PROC
+SigMapFunction _pos2ctx_map8x4, 3, 1, 31, _pos2ctx_last8x4, 3, _read_significant_coefficients3
+_read_significance_map3 ENDP
+_read_significance_map4 PROC
+SigMapFunction _pos2ctx_map8x4, 4, 1, 31, _pos2ctx_last8x4, 4, _read_significant_coefficients4
+_read_significance_map4 ENDP
+_read_significance_map5 PROC
+SigMapFunction _pos2ctx_map4x4, 5, 1, 15, _pos2ctx_last4x4, 5, _read_significant_coefficients5
+_read_significance_map5 ENDP
+_read_significance_map6 PROC
+SigMapFunction _pos2ctx_map4x4, 6, 1, 3, _pos2ctx_last4x4, 6, _read_significant_coefficients6
+_read_significance_map6 ENDP
+_read_significance_map7 PROC
+SigMapFunction _pos2ctx_map4x4, 7, 0, 14, _pos2ctx_last4x4, 7, _read_significant_coefficients7
+_read_significance_map7 ENDP
+_read_significance_map8 PROC ; uses context row 6, the same row as N = 6
+SigMapFunction _pos2ctx_map2x4c, 6, 1, 7, _pos2ctx_last2x4c, 6, _read_significant_coefficients8
+_read_significance_map8 ENDP
+_read_significance_map9 PROC ; uses context row 6, the same row as N = 6
+SigMapFunction _pos2ctx_map4x4c, 6, 1, 15, _pos2ctx_last4x4c, 6, _read_significant_coefficients9
+_read_significance_map9 ENDP
+_read_significance_map10 PROC
+SigMapFunction _pos2ctx_map4x4, 10, 1, 15, _pos2ctx_last4x4, 10, _read_significant_coefficients10
+_read_significance_map10 ENDP
+_read_significance_map11 PROC
+SigMapFunction _pos2ctx_map4x4, 11, 0, 14, _pos2ctx_last4x4, 11, _read_significant_coefficients11
+_read_significance_map11 ENDP
+_read_significance_map12 PROC
+SigMapFunction _pos2ctx_map8x8, 12, 1, 63, _pos2ctx_last8x8, 12, _read_significant_coefficients12
+_read_significance_map12 ENDP
+_read_significance_map13 PROC
+SigMapFunction _pos2ctx_map8x4, 13, 1, 31, _pos2ctx_last8x4, 13, _read_significant_coefficients13
+_read_significance_map13 ENDP
+_read_significance_map14 PROC
+SigMapFunction _pos2ctx_map8x4, 14, 1, 31, _pos2ctx_last8x4, 14, _read_significant_coefficients14
+_read_significance_map14 ENDP
+_read_significance_map15 PROC
+SigMapFunction _pos2ctx_map4x4, 15, 1, 15, _pos2ctx_last4x4, 15, _read_significant_coefficients15
+_read_significance_map15 ENDP
+_read_significance_map16 PROC
+SigMapFunction _pos2ctx_map4x4, 16, 1, 15, _pos2ctx_last4x4, 16, _read_significant_coefficients16
+_read_significance_map16 ENDP
+_read_significance_map17 PROC
+SigMapFunction _pos2ctx_map4x4, 17, 0, 14, _pos2ctx_last4x4, 17, _read_significant_coefficients17
+_read_significance_map17 ENDP
+_read_significance_map18 PROC
+SigMapFunction _pos2ctx_map8x8, 18, 1, 63, _pos2ctx_last8x8, 18, _read_significant_coefficients18
+_read_significance_map18 ENDP
+_read_significance_map19 PROC
+SigMapFunction _pos2ctx_map8x4, 19, 1, 31, _pos2ctx_last8x4, 19, _read_significant_coefficients19
+_read_significance_map19 ENDP
+_read_significance_map20 PROC
+SigMapFunction _pos2ctx_map8x4, 20, 1, 31, _pos2ctx_last8x4, 20, _read_significant_coefficients20
+_read_significance_map20 ENDP
+_read_significance_map21 PROC
+SigMapFunction _pos2ctx_map4x4, 21, 1, 15, _pos2ctx_last4x4, 21, _read_significant_coefficients21
+_read_significance_map21 ENDP
+_TEXT ENDS
+
+
+_TEXT	SEGMENT
+; decodes one equal-probability bin. in: edx = dep (unchanged). eax and ecx are overwritten; esi is saved and restored
+; the result is returned in SF: SF = 1 when value < (range << bitsleft) (value kept), SF = 0 otherwise (value reduced)
+_biari_decode_symbol_eq_prob_asm PROC
+	mov ecx, DWORD PTR [edx+8]; dep->DbitsLeft
+	dec ecx ; dep->DbitsLeft-- (kept in ecx for now, stored back further down)
+	mov	eax, DWORD PTR [edx+4] ; eax: dep->DValue
+	push	esi
+	jnz	SHORT $LN3@biari_deco; ZF is still that of the dec: skip the refill unless --DbitsLeft == 0
+
+	mov	ecx, DWORD PTR [edx+16] ; ecx: dep->Dcodestrm_len
+	mov	esi, DWORD PTR [ecx] ; esi: *dep->Dcodestrm_len
+	add	DWORD PTR [ecx], 2 ; *dep->Dcodestrm_len += 2
+	mov	ecx, DWORD PTR [edx+12] ; ecx: dep->Dcodestrm
+	shl eax, 16
+	mov	ax, WORD PTR [ecx+esi]  ; value = (value << 16) | getword( dep )
+	xchg ah, al ; swap so the first stream byte becomes the high byte of ax
+	mov	ecx, 16			; dep->DbitsLeft = 16;
+$LN3@biari_deco:
+	mov	esi, DWORD PTR [edx] ; dep->Drange
+	shl	esi, cl ; (dep->Drange << dep->DbitsLeft)
+	mov DWORD PTR [edx+8], ecx ; store the updated DbitsLeft
+	mov	ecx, eax
+	sub	ecx, esi ; tmp_value = value - (range << bitsleft); its SF is the routine's result
+	pop	esi
+	
+	cmovns eax, ecx ; if (tmp_value >= 0) value = tmp_value
+	mov	DWORD PTR [edx+4], eax ; dep->Dvalue = value;
+	ret	0
+_biari_decode_symbol_eq_prob_asm ENDP
+_TEXT	ENDS
+
+_TEXT	SEGMENT
+; edx: dep.  retained on return.  result in eax = symbol + binary_symbol + 13; edi is saved and restored
+; esi and ebp are NOT retained, because the (only) calling function doesn't need them to be
+_exp_golomb_decode_eq_prob0 PROC
+STACKOFFSET=0
+	xor esi, esi ; esi: binary_symbol
+	xor	ebp, ebp ; ebp: symbol
+	push	edi
+	mov edi, 1 ; edi: the weight 1<<k, starting at k = 0
+DECODE_EQ@LOOP_AGAIN:
+	call	_biari_decode_symbol_eq_prob_asm ; l = biari_decode_symbol_eq_prob(dep_dp);
+	js	SHORT DECODE_EQ@LOOP_DONE ; SF set ends the prefix
+	add	ebp, edi ; symbol += 1<<k
+	shl edi, 1 ; ++k
+	jmp	SHORT DECODE_EQ@LOOP_AGAIN
+	align 16
+DECODE_EQ@LOOP_DONE:
+	shr edi, 1 ; --k; the weight reaches 0 once every suffix bit has been read
+	jz	SHORT DECODE_EQ@RETURN
+	call	_biari_decode_symbol_eq_prob_asm ; if (biari_decode_symbol_eq_prob(dep_dp)==1)
+	js	SHORT DECODE_EQ@LOOP_DONE ; SF set: this suffix bit stays 0
+	or	esi, edi ; binary_symbol |= (1<<k);
+	jmp SHORT DECODE_EQ@LOOP_DONE
+	align 16
+DECODE_EQ@RETURN:
+	lea	eax, DWORD PTR [esi+ebp+13] ; return symbol + binary_symbol + 13 (the caller's offset is folded in here)
+	pop	edi
+	ret	0
+_exp_golomb_decode_eq_prob0 ENDP
+_TEXT	ENDS
+
+;
+;
+; pass dep in edx, context in eax; result in eax: 0, or the count of bins read after the first one, or the escape value
+; edx is retained on return; esi is saved and restored
+; ebp is destroyed
+
+_TEXT	SEGMENT
+ctx = 4 ; NOTE(review): not referenced in this procedure; the context arrives in eax
+_unary_exp_golomb_level_decode PROC	
+	STACKOFFSET=0
+	mov ebp, eax ; eax (and now ebp also) contains the context pointer
+	call	_biari_decode_symbol_map
+	test	eax, eax ; if (symbol==0)
+	jne	SHORT SYMBOL_NOT_ZERO
+	ret	0 ; first bin was 0: return 0 (eax is already 0)
+align 16
+SYMBOL_NOT_ZERO:
+	push	esi
+	xor	esi, esi ; esi: symbol = 0
+LEVEL_DECODE@LOOP_AGAIN:
+
+	mov eax, ebp ; _biari_decode_symbol_map wants ctx in eax
+	inc	esi ; ++symbol;
+	call	_biari_decode_symbol_map ; l = biari_decode_symbol(dep_dp, ctx);
+
+	test	eax, eax ; if (!l)
+	je	SHORT LEVEL_IS_ZERO ; a 0 bin ends the unary part
+	cmp	esi, 12				; exp_start-1
+	jb	SHORT LEVEL_DECODE@LOOP_AGAIN ; keep reading until 12 bins have been read in this loop
+
+	call _exp_golomb_decode_eq_prob0 ; exp_golomb_decode_eq_prob(dep_dp,0); its result already includes the +13
+	pop	esi
+	ret	0
+align 16
+LEVEL_IS_ZERO:
+	mov	eax, esi ; return symbol; (reached when the last bin read was 0, not when the level is 0)
+	pop	esi
+	ret	0
+_unary_exp_golomb_level_decode ENDP
+_TEXT	ENDS
+
+CONST SEGMENT
+sigmap_functions DD FLAT:_read_significance_map0 ; dispatch table of 22 routines; _readRunLevel_CABAC calls sigmap_functions[context*4]
+DD FLAT:_read_significance_map1
+DD FLAT:_read_significance_map2
+DD FLAT:_read_significance_map3
+DD FLAT:_read_significance_map4
+DD FLAT:_read_significance_map5
+DD FLAT:_read_significance_map6
+DD FLAT:_read_significance_map7
+DD FLAT:_read_significance_map8
+DD FLAT:_read_significance_map9
+DD FLAT:_read_significance_map10
+DD FLAT:_read_significance_map11
+DD FLAT:_read_significance_map12
+DD FLAT:_read_significance_map13
+DD FLAT:_read_significance_map14
+DD FLAT:_read_significance_map15
+DD FLAT:_read_significance_map16
+DD FLAT:_read_significance_map17
+DD FLAT:_read_significance_map18
+DD FLAT:_read_significance_map19
+DD FLAT:_read_significance_map20
+DD FLAT:_read_significance_map21 ; NOTE(review): no range check on the index is visible at the call site
+CONST ENDS
+
+PUBLIC	_readRunLevel_CABAC
+_TEXT	SEGMENT
+_currMB$ = 4 ; first parameter
+_dep_dp$ = 8 ; second parameter
+_context$ = 12 ; third parameter
+;
+; _readRunLevel_CABAC(currMB, dep_dp, context)
+;
+; Stack arguments; the routine returns with "ret 0", so the caller removes them.
+; Returns the next (level, run) pair of the current block:
+;   eax = level (sign-extended 16-bit coefficient)
+;   edx = run   (number of zero coefficients skipped in front of it)
+; At end of block (EOB) eax = 0, edx = 0 and currSlice->pos is reset to 0.
+; When currSlice->coeff_ctr is negative on entry, the coded-block bit and the
+; significance map are decoded first.
+; esi and edi are preserved; ebx and ebp are saved around the decode path.
+;
+_readRunLevel_CABAC PROC
+	push	esi
+	push	edi
+STACKOFFSET=8
+	mov	edi, DWORD PTR _currMB$[esp+STACKOFFSET] ; edi: currMB
+	mov	esi, DWORD PTR [edi] ; esi: currSlice = currMB->p_Slice;
+
+	cmp	DWORD PTR [esi+coeff_ctr@Slice], 0 ; if (currSlice->coeff_ctr >= 0)
+	jge	SHORT SET_RUN_AND_LEVEL ; coefficients already decoded, just hand out the next one
+
+; ===== decode CBP-BIT =====
+	mov	eax, DWORD PTR [edi+read_and_store_CBP_block_bit@Macroblock] ; eax: currMB->read_and_store_CBP_block_bit
+	push	ebx
+STACKOFFSET=STACKOFFSET+4
+	mov	ebx, DWORD PTR _context$[esp+STACKOFFSET] ; ebx: context
+	push	ebp
+STACKOFFSET=STACKOFFSET+4
+	mov	ebp, DWORD PTR _dep_dp$[esp+STACKOFFSET] ; ebp: dep
+	push	ebx ; context
+	push	ebp ; dep
+	push	edi ; currMB
+	call	eax ; currMB->read_and_store_CBP_block_bit(currMB, dep_dp, context)
+	add	esp, 12
+	mov	DWORD PTR [esi+coeff_ctr@Slice], eax ; currSlice->coeff_ctr = return value
+	test	eax, eax ; if (currSlice->coeff_ctr == 0)
+	je	SHORT SET_RUN_AND_LEVEL_POP
+
+; ===== decode significance coefficients =====
+	mov	ecx, DWORD PTR [esi+tex_ctx@Slice] ; ecx:  currSlice->tex_ctx
+	lea	eax, DWORD PTR [esi+coeff@Slice] ; eax: currSlice->coeff
+	; Stands in for
+	;   read_significance_map(currSlice->tex_ctx, currMB, dep_dp, context, currSlice->coeff);
+	; by dispatching on context to a specialised reader.  Nothing is pushed for it.
+	; NOTE(review): the register contract of the _read_significance_mapN routines,
+	; and whether they set currSlice->coeff_ctr, is defined outside this procedure.
+	call sigmap_functions[ebx*4]
+SET_RUN_AND_LEVEL_POP:
+	pop	ebp
+	pop	ebx
+STACKOFFSET=STACKOFFSET-8
+SET_RUN_AND_LEVEL:
+
+; --- set run and level ---
+	xor	edx, edx ; edx: 0
+
+	dec DWORD PTR [esi+coeff_ctr@Slice] ; if (currSlice->coeff_ctr--)
+	js 	SHORT EOB ; counter was 0: no coefficients left
+
+; --- set run and level (coefficient) ---
+	mov	ecx, DWORD PTR [esi+pos@Slice] ; ecx: currSlice->pos
+	xor edi, edi ; edi: run=0
+	cmp	WORD PTR [esi+ecx*2+coeff@Slice], dx ; currSlice->coeff[currSlice->pos] == 0
+	jne	SHORT LOOP_END
+LOOP_ITR:
+	; coeff[] holds 16-bit entries, so the next entry lies 2 bytes further on.
+	; A displacement of 1 would compare a word straddling two entries and would
+	; treat any coefficient whose low byte is 0 (e.g. 256) as a zero.
+	; NOTE(review): the scan has no bound of its own; it relies on a non-zero
+	; entry remaining whenever coeff_ctr was positive.
+	cmp	WORD PTR [esi+ecx*2+2+coeff@Slice], dx ; currSlice->coeff[currSlice->pos + 1] == 0
+	lea ecx, [ecx+1] ; ++pos (lea leaves the flags of the cmp untouched)
+	lea edi, [edi+1] ; ++run
+	je	SHORT LOOP_ITR
+LOOP_END:
+	movsx eax, WORD PTR [esi+ecx*2+coeff@Slice] ; eax: value = currSlice->coeff[currSlice->pos]
+	inc	ecx ; currSlice->pos++
+
+; --- hand back run and store the new position ---
+; pos is not reset here when the last coefficient is consumed; the EOB path
+; below resets it on the following call.
+	mov edx, edi ; edx: run
+	pop	edi
+	mov	DWORD PTR [esi+pos@Slice], ecx ; store currSlice->pos
+	pop	esi
+	ret	0 ; eax contains value
+	align 16
+EOB:
+	xor	eax, eax ; return 0
+	mov	DWORD PTR [esi+pos@Slice], edx ; currSlice->pos = 0;
+	pop	edi
+	pop	esi
+	ret	0
+_readRunLevel_CABAC ENDP
+_TEXT	ENDS
+
+;
+; _unary_exp_golomb_mv_decode3
+;
+; Decodes a motion-vector-difference magnitude: a unary prefix of context-coded
+; bins followed, once the prefix reaches 8, by an exp-golomb suffix of
+; equal-probability bins (max_bin = 3, exp_start = 8).
+; Register calling convention (no stack arguments are read):
+;   pass dep_dp in edx, retained on exit
+;   pass ctx in edi
+;   return value in esi
+; edi is not saved
+; ebp is saved and restored
+
+PUBLIC	_unary_exp_golomb_mv_decode3
+_TEXT	SEGMENT
+_ctx$ = 4 ; second parameter (not referenced below; ctx arrives in edi)
+_unary_exp_golomb_mv_decode3 PROC
+STACKOFFSET=0
+	mov eax, edi
+	call	_biari_decode_symbol_map ; pass dep in edx and ctx in eax.  edx retains dep on exit
+	test	eax, eax ; if (symbol)
+	jne	SHORT SYMBOL_NOT_ZERO
+	xor esi, esi ; first bin was 0: return 0
+	ret	0
+	align 16
+SYMBOL_NOT_ZERO:
+	push	ebp
+STACKOFFSET=STACKOFFSET+4
+	mov	ebp, 3 ; ebp: max_bin
+	add	edi, 4 ; ctx++
+	mov	esi, 1 ; esi: symbol
+LOOP_START:
+	mov eax, edi
+	call	_biari_decode_symbol_map ; pass dep in edx and ctx in eax.  edx retains dep on exit
+	test	eax, eax
+	je	SHORT SYMBOL_ZERO_RETURN
+
+	inc	esi
+	cmp	esi, 2 ; if (symbol == 2)
+	sete al ; eax will be 1, so this is safe to do
+	lea edi, [edi + eax*4] ; ctx += (symbol == 2)
+
+	cmp	esi, ebp ; if (symbol == max_bin)
+	sete al ; eax will have nothing set high, so this is safe to do
+	lea edi, [edi + eax*4] ; ctx += (symbol == max_bin)
+
+	cmp	esi, 8 ; if (symbol < exp_start)
+	jb	SHORT LOOP_START
+
+; return exp_start + exp_golomb_decode_eq_prob(dep_dp,3);
+; esi is exactly 8 here, which is also (1 << k) for the starting k = 3, so it
+; is reused as the running power of two.
+	xor	ebp, ebp ; ebp: symbol
+	mov	edi, ebp ; edi: binary_symbol
+DECODE_EQ3@LOOP1:
+	call	_biari_decode_symbol_eq_prob_asm ; edx holds dep_dp
+	js	SHORT DECODE_EQ3@LOOP2 ; NOTE(review): the callee appears to report the bin in the sign flag - defined elsewhere
+	or ebp, esi; symbol += (l<<k)  (or == add: each power of two is added once)
+	shl esi, 1 ; k <<= 1
+	jmp	SHORT DECODE_EQ3@LOOP1
+	align 16
+DECODE_EQ3@LOOP2:
+	shr esi, 1 ; next lower bit; zero once all suffix bits are read
+	jz	SHORT DECODE_EQ3@RETURN
+	call	_biari_decode_symbol_eq_prob_asm
+	js	SHORT DECODE_EQ3@LOOP2
+	or	edi, esi ; binary_symbol |= (1<<k);
+	jmp	SHORT DECODE_EQ3@LOOP2
+	align 16
+DECODE_EQ3@RETURN:
+	; return (unsigned int) (symbol + binary_symbol), plus exp_start (8)
+	lea esi, [edi+ebp+8]
+	pop	ebp
+	ret	0
+	align 16
+SYMBOL_ZERO_RETURN:
+	; return symbol is in esi
+	pop	ebp
+	ret	0
+_unary_exp_golomb_mv_decode3 ENDP
+_TEXT	ENDS
+
+;
+; _unary_bin_decode1
+;
+; Unary decode with a context offset of one entry, folded together with the
+; dquant mapping its caller needs.
+; Register calling convention:
+;   in : eax = ctx, edx = dep_dp
+;   out: eax = (symbol + 2) >> 1
+;        CF  = bit 0 of (symbol + 2); _readDquant_CABAC tests it with jnc
+; edi is saved and restored.  esi is overwritten without being saved when the
+; first bin is non-zero.
+_TEXT	SEGMENT
+_unary_bin_decode1 PROC
+; _ctx$ = eax
+; _dep_dp$ = edx
+	push edi
+	mov	edi, eax ; edi: ctx, kept across the calls
+	call	_biari_decode_symbol_map ; biari_decode_symbol(dep_dp, ctx );
+	test	eax, eax ; if (symbol)
+	jne	SHORT $LN5@unary_bin_@2
+	mov eax, 2 ; symbol == 0: (0 + 2) >> 1 = 1 ...
+	shr	eax, 1 ; ... computed with shr so that CF is set the same way as below (CF = 0)
+	pop	edi
+	ret	0
+align 16
+$LN5@unary_bin_@2:
+	xor	esi, esi ; symbol = 0;
+$LL3@unary_bin_@2:
+	inc	esi ; ++symbol;
+	lea	eax, DWORD PTR [edi+4] ; ctx + ctx_offset
+	call	_biari_decode_symbol_map ; biari_decode_symbol(dep_dp, ctx);
+	test	eax, eax ; while( l != 0 );
+	jne	SHORT $LL3@unary_bin_@2
+	lea eax, [esi + 2]; return symbol+2;
+	shr	eax, 1 ; halve; the bit shifted out lands in CF
+	pop edi
+	ret	0
+_unary_bin_decode1 ENDP
+_TEXT	ENDS
+
+
+PUBLIC	_readDquant_CABAC
+_TEXT	SEGMENT
+_currSlice$ = 4 ; first parameter
+_dep_dp$ = 8 ; second parameter
+;
+; _readDquant_CABAC(currSlice, dep_dp)
+;
+; Stack arguments, "ret 0" (caller removes them).
+; Decodes the delta-QP value, returns it in ax and stores the sign-extended
+; value in currSlice->last_dquant.
+; esi and edi are preserved; edx is overwritten.
+;
+_readDquant_CABAC PROC
+STACKOFFSET=0
+; 815  : 	MotionInfoContexts *ctx = currSlice->mot_ctx;
+; 816  : 	short dquant;
+; 817  : 	int act_ctx = ((currSlice->last_dquant != 0) ? 1 : 0);
+; 818  : 	int act_sym = biari_decode_symbol(dep_dp,ctx->delta_qp_contexts + act_ctx );
+
+	mov	edx, DWORD PTR _dep_dp$[esp+STACKOFFSET]
+	push	esi
+	push	edi
+STACKOFFSET = STACKOFFSET + 8
+	mov	edi, DWORD PTR _currSlice$[esp+STACKOFFSET] ; edi: currSlice
+	mov	esi, DWORD PTR [edi+mot_ctx@Slice] ; esi: ctx
+	xor	eax, eax
+	cmp	DWORD PTR [edi+last_dquant@Slice], eax
+	setne	al ; eax: act_ctx
+	; 332 is used as the byte offset of delta_qp_contexts inside the context
+	; struct, 4 bytes per context entry (hard-coded - TODO confirm against the struct)
+	lea	eax, DWORD PTR [esi+eax*4+332]
+	; pass dep in edx and ctx in eax.  edx retains dep on exit
+	call	_biari_decode_symbol_map
+
+	test	eax, eax ; if (!act_sym)
+	jz	SHORT $LN2@readDquant ; dquant = 0 (eax is already 0)
+
+	lea	eax, DWORD PTR [esi+340] ; unary_bin_decode(dep_dp,ctx->delta_qp_contexts + 2,1);
+	call	_unary_bin_decode1 ; eax: magnitude, CF: sign bit.  esi (ctx) is not valid after this call
+	
+	jnc	SHORT $LN2@readDquant ; lsb is signed bit
+
+	neg	eax ; dquant = -dquant;
+	movzx	eax, ax ; keep only the 16-bit result in eax
+$LN2@readDquant:
+	movsx	edx, ax ; widen the short result for the store
+	mov	DWORD PTR [edi+last_dquant@Slice], edx ; currSlice->last_dquant = dquant;
+	pop	edi
+	pop	esi
+	;mov	ax, cx ; return dquant;
+	ret	0 ; ax contains dquant
+_readDquant_CABAC ENDP
+_TEXT	ENDS
+
+PUBLIC	_readIntraPredMode_CABAC
+_TEXT	SEGMENT
+_currSlice$ = 4 ; first parameter
+_dep_dp$ = 8 ; second parameter
+;
+; _readIntraPredMode_CABAC(currSlice, dep_dp)
+;
+; Stack arguments, "ret 0" (caller removes them).
+; Returns in eax: -1 when the first decoded bin is non-zero, otherwise a 3-bit
+; pred_mode assembled least-significant bit first from three further bins.
+; esi and ebx are preserved; edx is overwritten with dep_dp.
+;
+_readIntraPredMode_CABAC PROC
+; 720  : 	TextureInfoContexts *ctx     =
+STACKOFFSET=0
+	mov	eax, DWORD PTR _currSlice$[esp + STACKOFFSET]
+	push	esi
+	; NOTE(review): literal offset 100 here, whereas other routines in this file
+	; use the tex_ctx@Slice symbol - confirm the two agree.
+	mov	esi, DWORD PTR [eax+100] ;  currSlice->tex_ctx;
+STACKOFFSET=4
+; 721  : 	int act_sym;
+; 722  : 
+; 723  : 	// use_most_probable_mode
+; 724  : 	act_sym = biari_decode_symbol(dep_dp, ctx->ipr_contexts);
+
+	mov	edx, DWORD PTR _dep_dp$[esp+STACKOFFSET]
+	lea	eax, DWORD PTR [esi+12] ; ipr_contexts taken to start at byte offset 12 of the context struct
+	call	_biari_decode_symbol_map
+
+	; remaining_mode_selector
+	test	eax, eax ; if (act_sym == 0)
+	jz	SHORT $LN2@readIntraP ; read the three mode bits
+
+	or	eax, -1 ; return -1;
+	pop	esi
+	ret	0
+align 16
+$LN2@readIntraP:
+	push	ebx
+	add	esi, 16					; 00000010H  esi: ctx->ipr_contexts+1, used for all three bins
+	mov eax, esi
+	call	_biari_decode_symbol_map
+	mov	ebx, eax ; pred_mode = bit 0
+; 735  : 		pred_mode |= (biari_decode_symbol(dep_dp, ctx->ipr_contexts+1) << 1);
+
+	mov eax, esi
+	call	_biari_decode_symbol_map
+	lea ebx, [ebx+2*eax]
+; 736  : 		pred_mode |= (biari_decode_symbol(dep_dp, ctx->ipr_contexts+1) << 2);
+
+	mov eax, esi
+	call	_biari_decode_symbol_map
+	lea eax, [ebx+4*eax] ; return pred_mode;
+
+	pop	ebx
+	pop	esi
+	ret	0
+_readIntraPredMode_CABAC ENDP
+_TEXT	ENDS
+
+PUBLIC	_readMB_skip_flagInfo_CABAC
+_TEXT	SEGMENT
+_currMB$ = 4 ; first parameter
+_dep_dp$ = 12						; size = 4
+; _dep_dp$ is only used as _dep_dp$[esp+8] after three pushes (12 bytes), i.e.
+; it addresses the stack argument at entry offset 8 - the second parameter.
+;
+; _readMB_skip_flagInfo_CABAC(currMB, dep_dp)
+;
+; Stack arguments, "ret 0" (caller removes them).
+; Decodes one bin with a context chosen from the slice type and the skip_flag
+; of the left/upper neighbour macroblocks.  Returns the decoded bin in eax and
+; clears currSlice->last_dquant when that bin is non-zero.
+; ebp, esi and edi are preserved; ecx and edx are overwritten.
+;
+_readMB_skip_flagInfo_CABAC PROC
+
+; 406  : 	Slice *currSlice = currMB->p_Slice;
+STACKOFFSET=0
+	mov	ecx, DWORD PTR _currMB$[esp + STACKOFFSET]
+	push	ebp
+
+	xor	eax, eax
+	push	esi
+	mov	esi, DWORD PTR [ecx + p_Slice@Macroblock] ; esi: currSlice
+	cmp	DWORD PTR [esi+slice_type@Slice], 1 ; int bframe=(currSlice->slice_type == B_SLICE);
+	push	edi
+
+	mov	edi, DWORD PTR [esi+mot_ctx@Slice] ; edi: ctx = currSlice->mot_ctx;  
+	sete	al ; int bframe=(currSlice->slice_type == B_SLICE);  (push/mov above leave the cmp flags intact)
+
+; 409  : 	int a = (currMB->mb_left != NULL) ? (currMB->mb_left->skip_flag == 0) : 0;
+; (literal offsets: 104 for mb_left, 100 for mb_up, 348 for skip_flag - TODO confirm against Macroblock)
+
+	xor	edx, edx
+	mov	ebp, eax ; ebp: bframe
+	mov	eax, DWORD PTR [ecx+104]
+	test	eax, eax
+	je	SHORT READ_B
+	cmp	DWORD PTR [eax+348], edx
+	sete	dl ; edx: a
+
+; 410  : 	int b = (currMB->mb_up   != NULL) ? (currMB->mb_up  ->skip_flag == 0) : 0;
+
+READ_B:
+	mov	ecx, DWORD PTR [ecx+100]
+	xor	eax, eax
+	test	ecx, ecx
+	je	SHORT $LN9@readMB_ski
+	cmp	DWORD PTR [ecx+348], eax
+	sete	al ; eax: b
+$LN9@readMB_ski:
+
+; 414  : 	if (bframe)
+; 415  : 	{
+; 416  : 		act_ctx = 7 + a + b;
+; 418  : 		skip = biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][act_ctx]);
+
+	add	eax, edx ; eax: a + b
+	test	ebp, ebp
+	mov	edx, DWORD PTR _dep_dp$[esp+8] ; edx: dep_dp (mov does not disturb the flags of the test)
+	je	SHORT $LN3@readMB_ski
+	lea	eax, DWORD PTR [edi+eax*4+116] ; &ctx->mb_type_contexts[2][7 + a + b], as a literal byte offset
+	jmp	SHORT $LN11@readMB_ski
+align 16
+$LN3@readMB_ski:
+
+; 422  : 		act_ctx = a + b;
+; 424  : 		skip = biari_decode_symbol(dep_dp, &ctx->mb_type_contexts[1][act_ctx]);
+
+	lea	eax, DWORD PTR [edi+eax*4+44] ; &ctx->mb_type_contexts[1][a + b], as a literal byte offset
+$LN11@readMB_ski:
+	call	_biari_decode_symbol_map
+
+	test	eax, eax ; skip the store below when the decoded bin is 0
+	je	SHORT $LN1@readMB_ski
+
+; 429  : 		currSlice->last_dquant = 0;
+
+	mov	DWORD PTR [esi + last_dquant@Slice], 0
+$LN1@readMB_ski:
+	pop	edi
+	pop	esi
+	pop	ebp
+	ret	0 ; eax still holds the decoded bin
+_readMB_skip_flagInfo_CABAC ENDP
+_TEXT	ENDS
+
+
+PUBLIC	_set_chroma_qp
+_TEXT	SEGMENT
+_currMB$ = 4 ; first parameter
+;
+; _set_chroma_qp(currMB)
+;
+; Stack argument, "ret 0" (caller removes it).
+; For each of the two chroma components k = 0, 1:
+;   q = clamp(dec_picture->chroma_qp_offset[k] + currMB->qp,
+;             -p_Vid->bitdepth_chroma_qp_scale, 51)
+;   if (q >= 0) q = QP_SCALE_CR[q]
+;   store q at [currMB+64+4*k] and q + bitdepth_chroma_qp_scale in the
+;   matching qp_scaled slot
+; edi is preserved; eax, ecx and edx are overwritten.
+; NOTE(review): the stores use literal offsets 64 and 72+8 next to the
+; qp_scaled@Macroblock symbol; presumably 64 is qpc and 72 is qp_scaled - confirm.
+; NOTE(review): qp@macroblock is spelled with a lower-case 'm' while other
+; symbols use @Macroblock; this only resolves if such a symbol is defined or
+; the file is assembled case-insensitively.
+;
+_set_chroma_qp PROC
+	mov	eax, DWORD PTR _currMB$[esp] ; eax: currMB
+	mov	ecx, DWORD PTR [eax+4] ; ecx: currMB->p_Vid
+	mov	edx, DWORD PTR [ecx+bitdepth_chroma_qp_scale@VideoParameters] ; edx: p_Vid->bitdepth_chroma_qp_scale;
+	push	edi
+	mov	edi, DWORD PTR [ecx+dec_picture@VideoParameters] ; edi: p_Vid->dec_picture
+	mov	ecx, DWORD PTR [edi+chroma_qp_offset@StorablePicture] ; ecx: dec_picture->chroma_qp_offset[0]
+	add	ecx, DWORD PTR [eax+qp@macroblock] ; ecx: dec_picture->chroma_qp_offset[0] + currMB->qp
+	neg	edx ; edx: -p_Vid->bitdepth_chroma_qp_scale;
+	cmp	ecx, edx
+	cmovl ecx, edx ; clamp from below
+	cmp	ecx, 51
+	cmovg ecx, DWORD PTR _51 ; cmov doesn't allow for immediates
+	test	ecx, ecx
+	; NOTE(review): a cmov with a memory source may still perform the load when
+	; the condition is false, so a negative ecx can read in front of
+	; _QP_SCALE_CR here - check the instruction reference.
+	cmovge	ecx, DWORD PTR _QP_SCALE_CR[ecx*4]
+	mov	DWORD PTR [eax+64], ecx ; first chroma qp
+
+	sub ecx, edx; currMB->qpc[0] + p_Vid->bitdepth_chroma_qp_scale;  (edx holds the negated scale)
+	mov	DWORD PTR [eax+qp_scaled@Macroblock + 4], ecx ; currMB->qp_scaled[1]
+	mov	ecx, DWORD PTR [edi+chroma_qp_offset@StorablePicture + 4] ; ecx: dec_picture->chroma_qp_offset[1]
+	add	ecx, DWORD PTR [eax+qp@macroblock]
+	cmp	ecx, edx
+	cmovl ecx, edx ; clamp from below
+	cmp	ecx, 51
+	cmovg ecx, DWORD PTR _51 ; cmov doesn't allow for immediates
+	test	ecx, ecx
+	cmovge	ecx, DWORD PTR _QP_SCALE_CR[ecx*4] ; same caveat as for the first component
+	mov	DWORD PTR [eax+64+4], ecx ; second chroma qp
+	sub ecx, edx ; add the scale back, as for the first component
+	pop	edi
+	mov	DWORD PTR [eax+72 + 8], ecx ; scaled qp of the second chroma component
+	ret	0
+_set_chroma_qp ENDP
+_TEXT	ENDS
+
+PUBLIC	_decodeMVD_CABAC
+_TEXT	SEGMENT
+_dep_dp$ = 4 ; first parameter
+_mv_ctx$ = 8 ; second parameter
+_act_ctx$ = 12; third parameter
+_err$ = 16 ; 4th parameter
+;
+; _decodeMVD_CABAC(dep_dp, mv_ctx, act_ctx, err)
+;
+; Stack arguments, "ret 0" (caller removes them).
+; Returns the signed motion-vector difference in eax: 0 when the first bin is
+; 0, otherwise +/-(1 + unary_exp_golomb_mv_decode3 result), with the sign taken
+; from one equal-probability bin.
+; edi and esi are preserved; edx is overwritten with dep_dp.
+;
+_decodeMVD_CABAC PROC
+STACKOFFSET = 0
+	mov	eax, DWORD PTR _act_ctx$[esp+STACKOFFSET]
+	push	edi
+STACKOFFSET = STACKOFFSET + 4
+	mov	edi, DWORD PTR _mv_ctx$[esp+STACKOFFSET]
+	lea edi, [edi+eax*4] ; mv_ctx[0][act_ctx]
+	mov	eax, DWORD PTR _err$[esp+STACKOFFSET]
+	lea	eax, DWORD PTR [edi+eax*4] ; &mv_ctx[0][act_ctx+err]
+	mov	edx, DWORD PTR _dep_dp$[esp+STACKOFFSET]
+	call	_biari_decode_symbol_map ; int act_sym = biari_decode_symbol(dep_dp,&mv_ctx[0][act_ctx+err] );
+
+	test	eax, eax ; if (act_sym != 0)
+	je	SHORT SYMBOL_ZERO ; return 0 (eax is already 0)
+	push	esi
+STACKOFFSET = STACKOFFSET + 4
+	lea	edi, [edi + 40] ; mv_ctx[1]+act_ctx  (40 bytes taken as one row of mv_ctx)
+	; ctx is passed in edi, dep_dp in edx; the result comes back in esi
+	call	_unary_exp_golomb_mv_decode3 ; act_sym = unary_exp_golomb_mv_decode3(dep_dp,mv_ctx[1]+act_ctx);
+	inc	esi ; ++act_sym;
+	call	_biari_decode_symbol_eq_prob_asm ; mv_sign = biari_decode_symbol_eq_prob(dep_dp);
+	; NOTE(review): the callee appears to report the bin in the sign flag - defined elsewhere
+	js	SHORT SKIP_NEGATE; if(mv_sign)
+	neg	esi ; act_sym = -act_sym;
+SKIP_NEGATE:
+	mov	eax, esi
+	pop	esi
+SYMBOL_ZERO:
+	pop	edi
+	ret	0
+_decodeMVD_CABAC ENDP
+_TEXT	ENDS
+
+END
+
diff --git a/Src/h264dec/ldecod/src/biaridecod.c b/Src/h264dec/ldecod/src/biaridecod.c
new file mode 100644
index 00000000..8b1d44f3
--- /dev/null
+++ b/Src/h264dec/ldecod/src/biaridecod.c
@@ -0,0 +1,322 @@
+/*!
+ *************************************************************************************
+ * \file biaridecod.c
+ *
+ * \brief
+ *   Binary arithmetic decoder routines.
+ *
+ *   This modified implementation of the M Coder is based on JVT-U084 
+ *   with the choice of M_BITS = 16.
+ *
+ * \date
+ *    21. Oct 2000
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Detlev Marpe                    <marpe@hhi.de>
+ *    - Gabi Blaettermann
+ *    - Gunnar Marten
+ *************************************************************************************
+ */
+
+#include "global.h"
+#include "memalloc.h"
+#include "biaridecod.h"
+
+
+#define B_BITS    10      // Number of bits to represent the whole coding interval
+#define HALF      0x01FE  //(1 << (B_BITS-1)) - 2
+#define QUARTER   0x0100  //(1 << (B_BITS-2))
+
+
+/************************************************************************
+ ************************************************************************
+                      init / exit decoder
+ ************************************************************************
+ ************************************************************************/
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocates memory for the DecodingEnvironment struct
+ * \return DecodingEnvironmentPtr
+ *    pointer to the newly allocated, zero-initialized DecodingEnvironment
+ ************************************************************************
+ */
+DecodingEnvironmentPtr arideco_create_decoding_environment()
+{
+  // calloc hands back a zero-filled DecodingEnvironment
+  DecodingEnvironmentPtr env = calloc(1, sizeof(DecodingEnvironment));
+
+  if (env == NULL)
+    no_mem_exit("arideco_create_decoding_environment: dep");
+
+  return env;
+}
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Frees memory of the DecodingEnvironment struct
+ ***********************************************************************
+ */
+void arideco_delete_decoding_environment(DecodingEnvironmentPtr dep)
+{
+  if (dep != NULL)
+  {
+    free(dep);
+    return;
+  }
+
+  // a NULL environment is reported through the common error handler
+  snprintf(errortext, ET_SIZE, "Error freeing dep (NULL pointer)");
+  error (errortext, 200);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    finalize arithmetic decoding():
+ ************************************************************************
+ */
+void arideco_done_decoding(DecodingEnvironmentPtr dep)
+{
+  // step the shared read position forward by one byte
+  *dep->Dcodestrm_len += 1;
+#if(TRACE==2)
+  fprintf(p_trace, "done_decoding: %d\n", *dep->Dcodestrm_len);
+#endif
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read one byte from the bitstream
+ ************************************************************************
+ */
+unsigned int getbyte(DecodingEnvironmentPtr dep)
+{
+  int pos;
+#if(TRACE==2)
+  fprintf(p_trace, "get_byte: %d\n", (*dep->Dcodestrm_len));
+#endif
+  // fetch the byte at the current read position, then advance the position
+  pos = *dep->Dcodestrm_len;
+  *dep->Dcodestrm_len = pos + 1;
+  return dep->Dcodestrm[pos];
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read two bytes from the bitstream
+ ************************************************************************
+ */
+
+static unsigned int getword(DecodingEnvironmentPtr dep)
+{
+  const int pos = *dep->Dcodestrm_len;
+
+  // consume two bytes; the first one is the more significant
+  *dep->Dcodestrm_len = pos + 2;
+  return ((dep->Dcodestrm[pos] << 8) | dep->Dcodestrm[pos + 1]);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Initializes the DecodingEnvironment for the arithmetic coder
+ ************************************************************************
+ */
+void arideco_start_decoding(DecodingEnvironmentPtr dep, unsigned char *code_buffer,
+                            int firstbyte, int *code_len)
+{
+  // attach the bitstream and place the shared read position at firstbyte
+  dep->Dcodestrm     = code_buffer;
+  dep->Dcodestrm_len = code_len;
+  *code_len          = firstbyte;
+
+  // prime the value register with three bytes: one byte, then a 16-bit word.
+  // lookahead of 2 bytes: always make sure that bitstream buffer
+  // contains 2 more bytes than actual bitstream
+  dep->Dvalue   = getbyte(dep);
+  dep->Dvalue <<= 16;
+  dep->Dvalue  |= getword(dep);
+
+  dep->Drange    = HALF;
+  dep->DbitsLeft = 15;
+
+#if (2==TRACE)
+  fprintf(p_trace, "value: %d firstbyte: %d code_len: %d\n", dep->Dvalue >> dep->DbitsLeft, firstbyte, *code_len);
+#endif
+}
+
+
+
+
+/*!
+************************************************************************
+* \brief
+*    biari_decode_symbol():
+* \return
+*    the decoded symbol
+************************************************************************
+*/
+/* random notes 
+max rLPS = 240  1111 1   111
+max state = 63
+max renorm = 6, min 1
+max bitsleft = 16
+max range = (1<<10) ?????  (1024)
+*/
+#if !defined(_M_IX86) || defined(_DEBUG)
+unsigned int biari_decode_symbol(DecodingEnvironmentPtr dep, BiContextTypePtr bi_ct )
+{
+	const unsigned int pstate = bi_ct->state;
+	unsigned int symbol = bi_ct->MPS;
+	unsigned int code = dep->Dvalue;
+	unsigned int interval = dep->Drange;
+	const unsigned int lps_range = rLPS_table_64x4[(interval>>6)&3][pstate];
+	unsigned int threshold;
+
+	// sub-interval that belongs to the most probable symbol
+	interval -= lps_range;
+	threshold = interval << dep->DbitsLeft;
+
+	if (code < threshold)
+	{	// MPS
+		bi_ct->state = AC_next_state_MPS_64[pstate]; // next state
+
+		if (interval >= QUARTER)
+		{	// no renormalisation needed
+			dep->Drange = interval;
+			return symbol;
+		}
+
+		dep->Drange = interval << 1;
+		dep->DbitsLeft -= 1;
+		if (dep->DbitsLeft > 0)
+			return symbol;
+	}
+	else
+	{	// LPS
+		const int shift = renorm_table_256[lps_range];
+
+		bi_ct->state = AC_next_state_LPS_64[pstate]; // next state
+		code -= threshold;
+		symbol ^= 0x01;
+		bi_ct->MPS ^= !pstate; // switch meaning of MPS when leaving state 0
+
+		dep->Drange = (lps_range << shift);
+		dep->DbitsLeft -= shift;
+		if (dep->DbitsLeft > 0)
+		{
+			dep->Dvalue = code;
+			return symbol;
+		}
+	}
+
+	// bit budget used up on either path: pull in the next 16 bits.
+	// lookahead of 2 bytes: always make sure that bitstream buffer
+	// contains 2 more bytes than actual bitstream
+	dep->Dvalue = (code << 16) | getword(dep);
+	dep->DbitsLeft += 16;
+	return symbol;
+}
+#endif
+/*!
+ ************************************************************************
+ * \brief
+ *    biari_decode_symbol_eq_prob():
+ * \return
+ *    the decoded symbol
+ ************************************************************************
+ */
+unsigned int biari_decode_symbol_eq_prob(DecodingEnvironmentPtr dep)
+{
+  int code = dep->Dvalue;
+  int remainder;
+
+  dep->DbitsLeft -= 1;
+  if (dep->DbitsLeft == 0)
+  {
+    // lookahead of 2 bytes: always make sure that bitstream buffer
+    // contains 2 more bytes than actual bitstream
+    code = (code << 16) | getword( dep );
+    dep->DbitsLeft = 16;
+  }
+
+  // bypass bin: 1 when the scaled range fits into the value, else 0
+  remainder = code - (dep->Drange << dep->DbitsLeft);
+  if (remainder >= 0)
+  {
+    dep->Dvalue = remainder;
+    return 1;
+  }
+
+  dep->Dvalue = code;
+  return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    biari_decode_final():
+ * \return
+ *    the decoded symbol
+ ************************************************************************
+ */
+unsigned int biari_decode_final(DecodingEnvironmentPtr dep)
+{
+  const unsigned int reduced = dep->Drange - 2;
+  int code = dep->Dvalue;
+
+  code -= (reduced << dep->DbitsLeft);
+
+  // terminating bin: decoder state is left untouched
+  if (code >= 0)
+    return 1;
+
+  // non-terminating bin, range still large enough: no renormalisation
+  if (reduced >= QUARTER)
+  {
+    dep->Drange = reduced;
+    return 0;
+  }
+
+  // renormalise by one bit, refilling the value register when it runs dry
+  dep->Drange = (reduced << 1);
+  dep->DbitsLeft -= 1;
+  if (dep->DbitsLeft <= 0)
+  {
+    // lookahead of 2 bytes: always make sure that bitstream buffer
+    // contains 2 more bytes than actual bitstream
+    dep->Dvalue = (dep->Dvalue << 16) | getword( dep );
+    dep->DbitsLeft = 16;
+  }
+  return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Initializes a given context with some pre-defined probability state
+ ************************************************************************
+ */
+void biari_init_context (int qp, BiContextTypePtr ctx, const char* ini)
+{
+  // linear model in qp: slope ini[0] (scaled by 1/16) plus offset ini[1]
+  int pstate = ((ini[0]* qp )>>4) + ini[1];
+
+  if ( pstate < 64 )
+  {
+    // lower half: MPS is 0, state counts down from 63 - 1
+    pstate = imax(1, pstate);
+    ctx->MPS   = 0;
+    ctx->state = (uint16) (63 - pstate);
+    return;
+  }
+
+  // upper half: MPS is 1, state counts up to 126 - 64
+  pstate = imin(126, pstate);
+  ctx->MPS   = 1;
+  ctx->state = (uint16) (pstate - 64);
+}
+
diff --git a/Src/h264dec/ldecod/src/block.c b/Src/h264dec/ldecod/src/block.c
new file mode 100644
index 00000000..d048f956
--- /dev/null
+++ b/Src/h264dec/ldecod/src/block.c
@@ -0,0 +1,929 @@
+
+/*!
+ ***********************************************************************
+ *  \file
+ *      block.c
+ *
+ *  \brief
+ *      Block functions
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Inge Lille-Langoy          <inge.lille-langoy@telenor.com>
+ *      - Rickard Sjoberg            <rickard.sjoberg@era.ericsson.se>
+ ***********************************************************************
+ */
+
+#include "contributors.h"
+
+#include "global.h"
+#include "block.h"
+#include "image.h"
+#include "mb_access.h"
+#include "transform.h"
+#include "quant.h"
+#include "memalloc.h"
+#include "optim.h"
+
+
+/*!
+ ****************************************************************************
+ * \brief
+ *    Inverse 4x4 lossless_qpprime transformation, transforms cof to mb_rres
+ ****************************************************************************
+ */
+void itrans4x4_ls(const h264_short_block_row_t *tblock,
+									const h264_imgpel_macroblock_row_t *mb_pred,
+									h264_imgpel_macroblock_row_t *mb_rec,
+                  int ioff,             //!< index to 4x4 block
+                  int joff)             //!< index to 4x4 block
+{
+  int row, col;
+
+  // no transform here: reconstruction = clip(prediction + residual) with a fixed 8-bit maximum
+  for (row = 0; row < BLOCK_SIZE; ++row)
+  {
+    const int y = row + joff;
+
+    for (col = 0; col < BLOCK_SIZE; ++col)
+    {
+      const int x = col + ioff;
+
+      mb_rec[y][x] = (imgpel) iClip1(255/*max_imgpel_value*/, mb_pred[y][x] + tblock[row][col]);
+    }
+  }
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Inverse residual DPCM for Intra lossless coding
+*
+************************************************************************
+*/
+void Inv_Residual_trans_4x4(Macroblock *currMB,   //!< current macroblock
+														ColorPlane pl,        //!< used color plane
+														int ioff,             //!< index to 4x4 block
+														int joff)             //!< index to 4x4 block
+{
+	int row, col;
+	h264_short_block_t acc;
+	Slice *currSlice = currMB->p_Slice;
+	int subblock = cof4_pos_to_subblock[joff>>2][ioff>>2];
+
+	h264_short_block_row_t *residual = currSlice->cof4[pl][subblock];
+
+	if(currMB->ipmode_DPCM == VERT_PRED)
+	{
+		// vertical DPCM: running sum down each column
+		for (col = 0; col < 4; ++col)
+		{
+			acc[0][col] = residual[0][col];
+			for (row = 1; row < 4; ++row)
+				acc[row][col] = residual[row][col] + acc[row-1][col];
+		}
+	}
+	else if(currMB->ipmode_DPCM == HOR_PRED)
+	{
+		// horizontal DPCM: running sum along each row
+		for (row = 0; row < 4; ++row)
+		{
+			acc[row][0] = residual[row][0];
+			for (col = 1; col < 4; ++col)
+				acc[row][col] = residual[row][col] + acc[row][col-1];
+		}
+	}
+	else
+	{
+		// any other mode: the residual is used as it is
+		for (row = 0; row < BLOCK_SIZE; ++row)
+		{
+			for (col = 0; col < BLOCK_SIZE; ++col)
+			{
+				acc[row][col] = residual[row][col];
+			}
+		}
+	}
+
+	// reconstruction = residual + prediction (cast only, no clipping)
+	for (row = 0; row < BLOCK_SIZE; ++row)
+	{
+		const int y = row + joff;
+
+		for (col = 0; col < BLOCK_SIZE; ++col)
+		{
+			const int x = col + ioff;
+
+			currSlice->mb_rec[pl][y][x] = (imgpel) (acc[row][col] + currSlice->mb_pred[pl][y][x]);
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Inverse residual DPCM for Intra lossless coding
+*
+* \par Input:
+*    ioff_x,joff_y: Block position inside a macro block (0,8).
+************************************************************************
+*/
+//For residual DPCM
+void Inv_Residual_trans_8x8(Macroblock *currMB, ColorPlane pl, int ioff,int joff)
+{
+  Slice *currSlice = currMB->p_Slice;
+  int row, col;
+	h264_short_8x8block_t acc;
+
+	int block = (joff>>2) + (ioff>>3);
+
+  if(currMB->ipmode_DPCM == VERT_PRED)
+  {
+    // vertical DPCM: running sum down each column, staged in acc
+    for (col = 0; col < 8; ++col)
+    {
+      acc[0][col] = currSlice->mb_rres8[pl][block][0][col];
+      for (row = 1; row < 8; ++row)
+        acc[row][col] = currSlice->mb_rres8[pl][block][row][col] + acc[row-1][col];
+    }
+    // write the accumulated residual back only after every column is done
+    for (col = 0; col < 8; ++col)
+    {
+      for (row = 0; row < 8; ++row)
+        currSlice->mb_rres8[pl][block][row][col] = acc[row][col];
+    }
+  }
+  else if(currMB->ipmode_DPCM == HOR_PRED)//HOR_PRED
+  {
+    // horizontal DPCM: running sum along each row, staged in acc
+    for (row = 0; row < 8; ++row)
+    {
+      acc[row][0] = currSlice->mb_rres8[pl][block][row][0];
+      for (col = 1; col < 8; ++col)
+        acc[row][col] = currSlice->mb_rres8[pl][block][row][col] + acc[row][col-1];
+    }
+    for (row = 0; row < 8; ++row)
+    {
+      for (col = 0; col < 8; ++col)
+        currSlice->mb_rres8[pl][block][row][col] = acc[row][col];
+    }
+  }
+
+  // reconstruction = residual + prediction (cast only, no clipping)
+  for (row = 0; row < BLOCK_SIZE_8x8; ++row)
+  {
+    const int y = joff + row;
+
+    for (col = 0; col < BLOCK_SIZE_8x8; ++col)
+    {
+      const int x = ioff + col;
+
+      currSlice->mb_rec[pl][y][x]  = (imgpel) (currSlice->mb_rres8[pl][block][row][col] + currSlice->mb_pred[pl][y][x]);
+    }
+  }
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Luma DC inverse transform
+ ***********************************************************************
+ */ 
+void itrans_2(Macroblock *currMB, ColorPlane pl)
+{
+  // index of the 4x4 sub-block whose [0][0] entry holds the DC coefficient
+  // for position [row][col] of the 4x4 DC matrix
+  static const int dc_block[4][4] =
+  {
+    {  0,  1,  4,  5 },
+    {  2,  3,  6,  7 },
+    {  8,  9, 12, 13 },
+    { 10, 11, 14, 15 }
+  };
+
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  int transform_pl = IS_INDEPENDENT(p_Vid) ? PLANE_Y /*p_Vid->colour_plane_id*/ : pl;
+  h264_short_block_t *blocks = currSlice->cof4[transform_pl];
+  int qp_scaled = currMB->qp_scaled[pl];
+
+  int qp_per = p_Vid->qp_per_matrix[ qp_scaled ];
+  int qp_rem = p_Vid->qp_rem_matrix[ qp_scaled ];
+
+  int invLevelScale = currSlice->InvLevelScale4x4_Intra[pl][qp_rem][0][0];
+  h264_int_block_t M4;
+  int row, col;
+
+  // gather the 16 DC coefficients
+  for (row = 0; row < 4; ++row)
+  {
+    for (col = 0; col < 4; ++col)
+      M4[row][col] = blocks[dc_block[row][col]][0][0];
+  }
+
+  ihadamard4x4(M4);
+
+  // scale, round and scatter the results back to where they came from
+  for (row = 0; row < 4; ++row)
+  {
+    for (col = 0; col < 4; ++col)
+      blocks[dc_block[row][col]][0][0] = rshift_rnd((( M4[row][col] * invLevelScale) << qp_per), 6);
+  }
+}
+
+
+void itrans_sp(h264_short_block_row_t *tblock, const h264_imgpel_macroblock_row_t *mb_pred, Macroblock *currMB, ColorPlane pl, int ioff, int joff) 
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  Slice *currSlice = currMB->p_Slice;
+  int row, col;
+  int level, coef;
+  int switching;
+
+  int qp = (currSlice->slice_type == SI_SLICE) ? currSlice->qs : p_Vid->qp;
+  int qp_per = p_Vid->qp_per_matrix[ qp ];
+  int qp_rem = p_Vid->qp_rem_matrix[ qp ];
+
+  int qp_per_sp = p_Vid->qp_per_matrix[ currSlice->qs ];
+  int qp_rem_sp = p_Vid->qp_rem_matrix[ currSlice->qs ];
+  int q_bits_sp = Q_BITS + qp_per_sp;
+  int max_imgpel_value = p_Vid->max_pel_value_comp[pl];
+
+  const int (*InvLevelScale4x4)  [4] = dequant_coef[qp_rem];
+  const int (*InvLevelScale4x4SP)[4] = dequant_coef[qp_rem_sp];
+  int **pred_coef;
+
+  get_mem2Dint(&pred_coef, MB_BLOCK_SIZE, MB_BLOCK_SIZE);
+
+  // forward-transform the 4x4 prediction block in place
+  for (row = 0; row < BLOCK_SIZE; ++row)
+  {
+    for (col = 0; col < BLOCK_SIZE; ++col)
+    {
+      pred_coef[row][col] = mb_pred[row+joff][col+ioff];
+    }
+  }
+
+  forward4x4(pred_coef, pred_coef, 0, 0);
+
+  // the same choice applies to every coefficient of the block
+  switching = (p_Vid->sp_switch || currSlice->slice_type==SI_SLICE);
+
+  for (row = 0; row < BLOCK_SIZE; ++row)
+  {
+    for (col = 0; col < BLOCK_SIZE; ++col)
+    {
+      // recovering coefficient since they are already dequantized earlier
+      coef = (tblock[row][col] >> qp_per) / InvLevelScale4x4[row][col];
+
+      if (switching)
+      {
+        // quantize the prediction coefficient, then add the residual level
+        level = rshift_rnd_sf(iabs(pred_coef[row][col]) * quant_coef[qp_rem_sp][row][col], q_bits_sp);
+        level = isignab(level, pred_coef[row][col]) + coef;
+      }
+      else
+      {
+        // add the rescaled residual to the prediction coefficient, then quantize the sum
+        level = pred_coef[row][col] + ((coef * InvLevelScale4x4[row][col] * A[row][col] <<  qp_per) >> 6);
+        level = isign(level) * rshift_rnd_sf(iabs(level) * quant_coef[qp_rem_sp][row][col], q_bits_sp);
+      }
+
+      tblock[row][col] = level * InvLevelScale4x4SP[row][col] << qp_per_sp;
+    }
+  }
+
+	{
+		h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+		opt_itrans4x4(tblock, mb_pred, mb_rec, ioff, joff);
+	}
+
+  free_mem2Dint(pred_coef);
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    SP/SI requantization of the chroma coefficients of one macroblock.
+ *
+ *    Forward-transforms the chroma prediction of plane uv + 1, combines it
+ *    with the residual held in currSlice->cof4[uv + 1], requantizes the
+ *    result with the quantizer derived from currSlice->qs and writes the
+ *    dequantized coefficients back into currSlice->cof4[uv + 1].
+ *
+ * \param currMB
+ *    macroblock being reconstructed (supplies p_Slice and p_Vid)
+ * \param uv
+ *    chroma component index; plane uv + 1 of mb_pred / cof4 is processed
+ *
+ * \note
+ *    currSlice->mb_pred[uv + 1] is zeroed over the chroma block area as a
+ *    side effect.
+ ***********************************************************************
+ */
+void itrans_sp_cr(Macroblock *currMB, int uv)
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  int i,j,ilev, icof, n2,n1;
+  int mp1[BLOCK_SIZE];
+  int qp_per,qp_rem;
+  int qp_per_sp,qp_rem_sp,q_bits_sp;
+  int **PBlock;  
+
+  // scratch buffer for the forward-transformed prediction; freed on exit
+  get_mem2Dint(&PBlock, MB_BLOCK_SIZE, MB_BLOCK_SIZE);
+
+
+  // negative QP values index the per/rem tables directly, non-negative
+  // ones are first mapped through QP_SCALE_CR
+  qp_per    = p_Vid->qp_per_matrix[ ((p_Vid->qp < 0 ? p_Vid->qp : QP_SCALE_CR[p_Vid->qp]))];
+  qp_rem    = p_Vid->qp_rem_matrix[ ((p_Vid->qp < 0 ? p_Vid->qp : QP_SCALE_CR[p_Vid->qp]))];
+
+  qp_per_sp = p_Vid->qp_per_matrix[ ((currSlice->qs < 0 ? currSlice->qs : QP_SCALE_CR[currSlice->qs]))];
+  qp_rem_sp = p_Vid->qp_rem_matrix[ ((currSlice->qs < 0 ? currSlice->qs : QP_SCALE_CR[currSlice->qs]))];
+  q_bits_sp = Q_BITS + qp_per_sp;  
+
+  // SI slices use the SP quantizer for the residual as well
+  if (currSlice->slice_type == SI_SLICE)
+  {
+    qp_per = qp_per_sp;
+    qp_rem = qp_rem_sp;
+  }
+
+  // move the prediction into the scratch buffer and clear it in the slice
+  for (j=0; j < p_Vid->mb_cr_size_y; ++j)
+  {
+    for (i=0; i < p_Vid->mb_cr_size_x; ++i)
+    {
+      PBlock[j][i] = currSlice->mb_pred[uv + 1][j][i];
+      currSlice->mb_pred[uv + 1][j][i] = 0;
+    }
+  }
+
+  // forward 4x4 transform of every chroma block, in place
+  for (n2=0; n2 < p_Vid->mb_cr_size_y; n2 += BLOCK_SIZE)
+  {
+    for (n1=0; n1 < p_Vid->mb_cr_size_x; n1 += BLOCK_SIZE)
+    {
+      forward4x4(PBlock, PBlock, n2, n1);
+    }
+  }
+
+  //     2X2 transform of DC coeffs.
+  // NOTE(review): only the DC terms at rows/columns 0 and 4 are read, even
+  // when mb_cr_size_y is larger than 8 - confirm this is intended for 4:2:2
+  mp1[0] = (PBlock[0][0] + PBlock[4][0] + PBlock[0][4] + PBlock[4][4]);
+  mp1[1] = (PBlock[0][0] - PBlock[4][0] + PBlock[0][4] - PBlock[4][4]);
+  mp1[2] = (PBlock[0][0] + PBlock[4][0] - PBlock[0][4] - PBlock[4][4]);
+  mp1[3] = (PBlock[0][0] - PBlock[4][0] - PBlock[0][4] + PBlock[4][4]);
+
+  if (p_Vid->sp_switch || currSlice->slice_type == SI_SLICE)  
+  {        
+    // DC coefficients: quantize the prediction, add the residual level, dequantize
+    for (n2=0; n2 < 2; ++n2 )
+    {
+      for (n1=0; n1 < 2; ++n1 )
+      {
+        //quantization of predicted block
+        ilev = rshift_rnd_sf(iabs (mp1[n1+n2*2]) * quant_coef[qp_rem_sp][0][0], q_bits_sp + 1);
+        //addition
+        ilev = isignab(ilev, mp1[n1+n2*2]) + currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2][n1]][0][0];
+        //dequantization
+        mp1[n1+n2*2] =ilev * dequant_coef[qp_rem_sp][0][0] << qp_per_sp;
+      }
+    }
+
+    // all coefficients; position [0][0] of sub-blocks 0..3 is overwritten
+    // again with the DC result at the end of the function
+    for (n2 = 0; n2 < p_Vid->mb_cr_size_y; n2 += BLOCK_SIZE)
+    {
+      for (n1 = 0; n1 < p_Vid->mb_cr_size_x; n1 += BLOCK_SIZE)
+      {
+        for (j = 0; j < BLOCK_SIZE; ++j)
+        {
+          for (i = 0; i < BLOCK_SIZE; ++i)
+          {
+            // recovering coefficient since they are already dequantized earlier
+            currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2>>2][n1>>2]][j][i] = (currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2>>2][n1>>2]][j][i] >> qp_per) / dequant_coef[qp_rem][j][i];
+
+            //quantization of the predicted block
+            ilev = rshift_rnd_sf(iabs(PBlock[n2 + j][n1 + i]) * quant_coef[qp_rem_sp][j][i], q_bits_sp);
+            //addition of the residual
+            ilev = isignab(ilev,PBlock[n2 + j][n1 + i]) + currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2>>2][n1>>2]][j][i] ;
+            // Inverse quantization
+            currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2>>2][n1>>2]][j][i]  = ilev * dequant_coef[qp_rem_sp][j][i] << qp_per_sp;
+          }
+        }
+      }
+    }
+  }
+  else
+  {
+    // DC coefficients: add the rescaled residual to the prediction, then
+    // quantize and dequantize with the SP quantizer
+    for (n2=0; n2 < 2; ++n2 )
+    {
+      for (n1=0; n1 < 2; ++n1 )
+      {
+        ilev = mp1[n1+n2*2] + (((currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2][n1]][0][0] * dequant_coef[qp_rem][0][0] * A[0][0]) << qp_per) >> 5);
+        ilev = isign(ilev) * rshift_rnd_sf(iabs(ilev) * quant_coef[qp_rem_sp][0][0], q_bits_sp + 1);
+        //ilev = isignab(rshift_rnd_sf(iabs(ilev)* quant_coef[qp_rem_sp][0][0], q_bits_sp + 1), ilev);
+        mp1[n1+n2*2] = ilev * dequant_coef[qp_rem_sp][0][0] << qp_per_sp;
+      }
+    }
+
+    // all coefficients; position [0][0] of sub-blocks 0..3 is overwritten
+    // again with the DC result at the end of the function
+    for (n2 = 0; n2 < p_Vid->mb_cr_size_y; n2 += BLOCK_SIZE)
+    {
+      for (n1 = 0; n1 < p_Vid->mb_cr_size_x; n1 += BLOCK_SIZE)
+      {
+        for (j = 0; j< BLOCK_SIZE; ++j)
+        {
+          for (i = 0; i< BLOCK_SIZE; ++i)
+          {
+            // recovering coefficient since they are already dequantized earlier
+            icof = (currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2>>2][n1>>2]][j][i] >> qp_per) / dequant_coef[qp_rem][j][i];
+            //dequantization and addition of the predicted block      
+            ilev = PBlock[n2 + j][n1 + i] + ((icof * dequant_coef[qp_rem][j][i] * A[j][i] << qp_per) >> 6);
+            //quantization and dequantization
+            ilev = isign(ilev) * rshift_rnd_sf(iabs(ilev) * quant_coef[qp_rem_sp][j][i], q_bits_sp);
+            currSlice->cof4[uv + 1][cof4_pos_to_subblock[n2>>2][n1>>2]][j][i] = ilev * dequant_coef[qp_rem_sp][j][i] << qp_per_sp;
+          }
+        }
+      }
+    }
+  }
+
+  // inverse 2x2 transform of the requantized DC values, stored at
+  // position [0][0] of sub-blocks 0..3
+  currSlice->cof4[uv + 1][0][0][0] = (mp1[0] + mp1[1] + mp1[2] + mp1[3]) >> 1;
+  currSlice->cof4[uv + 1][1][0][0] = (mp1[0] + mp1[1] - mp1[2] - mp1[3]) >> 1;
+  currSlice->cof4[uv + 1][2][0][0] = (mp1[0] - mp1[1] + mp1[2] - mp1[3]) >> 1;
+  currSlice->cof4[uv + 1][3][0][0] = (mp1[0] - mp1[1] - mp1[2] + mp1[3]) >> 1;
+
+  free_mem2Dint(PBlock);
+}
+
+#if defined(_DEBUG) || !defined(_M_IX86)
+/*!
+ ***********************************************************************
+ * \brief
+ *    Reconstruct one 16x16 plane of a macroblock from its sixteen 4x4
+ *    blocks and copy the result into the decoded picture.
+ *
+ * \param currMB  macroblock being reconstructed
+ * \param pl      colour plane; 0 selects imgY, otherwise imgUV[pl - 1]
+ * \param smb     non-zero selects the SP/SI path (itrans_sp) for every block
+ ***********************************************************************
+ */
+void iMBtrans4x4(Macroblock *currMB, ColorPlane pl, int smb)
+{
+  // pixel offsets of the sixteen 4x4 blocks, listed in coefficient-block order
+  static const int blk_ioff[16] = { 0, 4, 0, 4, 8, 12, 8, 12, 0, 4,  0,  4, 8, 12,  8, 12 };
+  static const int blk_joff[16] = { 0, 0, 4, 4, 0,  0, 4,  4, 8, 8, 12, 12, 8,  8, 12, 12 };
+
+  Slice *currSlice = currMB->p_Slice;
+  StorablePicture *dec_picture = currMB->p_Vid->dec_picture;
+  VideoImage *curr_img;
+  int k;
+
+  if (pl)
+    curr_img = dec_picture->imgUV[pl - 1];
+  else
+    curr_img = dec_picture->imgY;
+
+  if (smb)
+  {
+    // SP/SI macroblock: requantize against the prediction, then inverse transform
+    h264_short_block_t *blocks = currSlice->cof4[pl];
+    const h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[pl];
+
+    for (k = 0; k < 16; ++k)
+      itrans_sp(blocks[k], mb_pred, currMB, pl, blk_ioff[k], blk_joff[k]);
+  }
+  else if (currMB->is_lossless)
+  {
+    for (k = 0; k < 16; ++k)
+      Inv_Residual_trans_4x4(currMB, pl, blk_ioff[k], blk_joff[k]);
+  }
+  else
+  {
+    const h264_short_block_t *blocks = currSlice->cof4[pl];
+    const h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[pl];
+    h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+
+    for (k = 0; k < 16; ++k)
+      opt_itrans4x4(blocks[k], mb_pred, mb_rec, blk_ioff[k], blk_joff[k]);
+  }
+
+  // construct picture from 4x4 blocks
+  opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
+}
+#endif
+/*!
+ ***********************************************************************
+ * \brief
+ *    Reconstruct one 16x16 plane of a macroblock from its four 8x8
+ *    blocks and copy the result into the decoded picture.
+ *
+ * \param currMB  macroblock being reconstructed
+ * \param pl      colour plane; 0 selects imgY, otherwise imgUV[pl - 1]
+ ***********************************************************************
+ */
+void iMBtrans8x8(Macroblock *currMB, ColorPlane pl)
+{
+	Slice *currSlice = currMB->p_Slice;
+	StorablePicture *dec_picture = currMB->p_Vid->dec_picture;
+	VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1] : dec_picture->imgY;
+
+	h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+	h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[pl];
+	h264_short_8x8block_t *mb_rres8 = currSlice->mb_rres8[pl];
+	int k;
+
+	// block k sits at row offset 8 * (k / 2) and column offset 8 * (k % 2)
+	for (k = 0; k < 4; ++k)
+	{
+		const int row = (k >> 1) * 8;
+		const int col = (k & 1) * 8;
+
+		if (currMB->is_lossless == FALSE)
+			opt_itrans8x8(mb_rec + row, mb_pred + row, mb_rres8[k], col);
+		else
+			itrans8x8_lossless(mb_rec + row, mb_pred + row, mb_rres8[k], col);
+	}
+
+	opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, mb_rec);
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Reconstruct a macroblock: inverse transform plane pl and, for 4:2:0
+ *    and 4:2:2 pictures, both chroma planes, writing the result into the
+ *    decoded picture.
+ *
+ * \param currMB  macroblock being reconstructed
+ * \param pl      colour plane handled by the 16x16 part
+ * \param smb     non-zero for SP/SI macroblocks
+ ***********************************************************************
+ */
+void iTransform(Macroblock *currMB, ColorPlane pl, int smb)
+{
+  // chroma 4x4 blocks: coefficient-block index and pixel offsets.
+  // 4:2:0 uses the first four entries, 4:2:2 all eight.
+  static const int cr_blk[8]  = { 0, 1, 2, 3, 8, 9, 10, 11 };
+  static const int cr_ioff[8] = { 0, 4, 0, 4, 0, 4,  0,  4 };
+  static const int cr_joff[8] = { 0, 0, 4, 4, 8, 8, 12, 12 };
+
+  Slice *currSlice = currMB->p_Slice;
+  StorablePicture *dec_picture = currMB->p_Vid->dec_picture;
+  int is_420, num_blocks;
+  int uv, k;
+
+  // ---------------- 16x16 plane ----------------
+  if ((currMB->cbp & 15) == 0 && !smb)
+  {
+    // no coded residual: the prediction is the reconstruction
+    VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1] : dec_picture->imgY;
+    opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_pred[pl]);
+  }
+  else if (currMB->luma_transform_size_8x8_flag == 0)
+  {
+    iMBtrans4x4(currMB, pl, smb); // 4x4 inverse transform
+  }
+  else
+  {
+    iMBtrans8x8(currMB, pl);      // 8x8 inverse transform
+  }
+
+  // ---------------- chroma planes ----------------
+  // TODO: fix 4x4 lossless
+  if (dec_picture->chroma_format_idc == YUV420)
+  {
+    is_420 = 1;
+    num_blocks = 4;
+  }
+  else if (dec_picture->chroma_format_idc == YUV422)
+  {
+    is_420 = 0;
+    num_blocks = 8;
+  }
+  else
+  {
+    return;
+  }
+
+  for (uv = 0; uv < 2; ++uv)
+  {
+    const int chroma_pl = uv + 1;
+    const h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[chroma_pl];
+    const h264_short_block_t *blocks = currSlice->cof4[chroma_pl];
+    h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[chroma_pl];
+    VideoImage *curUV = dec_picture->imgUV[uv];
+    const h264_imgpel_macroblock_row_t *result;
+
+    if (!smb && (currMB->cbp >> 4))
+    {
+      // the lossless variant is only used for 4:2:0
+      if (is_420 && currMB->is_lossless != FALSE)
+      {
+        for (k = 0; k < num_blocks; ++k)
+          itrans4x4_ls(blocks[cr_blk[k]], mb_pred, mb_rec, cr_ioff[k], cr_joff[k]);
+      }
+      else
+      {
+        for (k = 0; k < num_blocks; ++k)
+          opt_itrans4x4(blocks[cr_blk[k]], mb_pred, mb_rec, cr_ioff[k], cr_joff[k]);
+      }
+      result = mb_rec;
+    }
+    else if (smb)
+    {
+      // SP/SI: requantize the chroma coefficients first
+      itrans_sp_cr(currMB, uv);
+
+      for (k = 0; k < num_blocks; ++k)
+        opt_itrans4x4(blocks[cr_blk[k]], mb_pred, mb_rec, cr_ioff[k], cr_joff[k]);
+      result = mb_rec;
+    }
+    else
+    {
+      // no chroma residual: output the prediction
+      result = mb_pred;
+    }
+
+    if (is_420)
+      copy_image_data_8x8_stride(curUV, currMB->pix_c_x, currMB->pix_c_y, result);
+    else
+      copy_image_data_stride(curUV, currMB->pix_c_x, currMB->pix_c_y, result, 8, 16);
+  }
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy a 16x16 block of pixels between two row-pointer images.
+ *
+ * \param imgBuf1  destination rows; rows 0..15 are written starting at column dest_x
+ * \param imgBuf2  source rows; rows 0..15 are read starting at column src_x
+ * \param dest_x   first destination column
+ * \param src_x    first source column
+ *************************************************************************************
+ */
+void copy_image_data_16x16(imgpel  **imgBuf1, imgpel  **imgBuf2, int dest_x, int src_x)
+{
+  const size_t row_bytes = MB_BLOCK_SIZE * sizeof (imgpel);
+  int row = 0;
+
+  while (row < MB_BLOCK_SIZE)
+  {
+    memcpy(imgBuf1[row] + dest_x, imgBuf2[row] + src_x, row_bytes);
+    ++row;
+  }
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy a 16x16 macroblock buffer into a strided image (SSE, 32-bit x86 only).
+ *
+ *    Source rows are read 16 bytes apart with aligned loads (movaps) and
+ *    written to the destination with unaligned stores (movups).
+ *
+ * \param destination  target image (base_address and stride are used)
+ * \param dest_x       destination column of the top-left pixel
+ * \param dest_y       destination row of the top-left pixel
+ * \param source       16x16 source buffer; movaps requires it to be 16-byte aligned
+ *************************************************************************************
+ */
+#ifdef _M_IX86
+void copy_image_data_16x16_stride_sse(VideoImage *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source)
+{
+		ptrdiff_t destination_stride = destination->stride; // in case the compiler doesn't optimize this
+	imgpel *dest = destination->base_address + destination_stride * dest_y + dest_x;
+		_asm
+		{
+			// eax = current destination row, ecx = destination stride, edx = source
+			mov eax, dest
+			mov ecx, destination_stride
+			mov edx, source;
+			// source rows 0, 1, 2 and 4
+			movaps xmm0, 0[edx]
+			movaps xmm1, 16[edx]
+			movaps xmm2, 32[edx]
+			movaps xmm3, 64[edx]
+			movups [eax], xmm0 // dest[0]
+			movups [eax+ecx], xmm1 // dest[1]
+			movups [eax+2*ecx], xmm2 // dest[2]
+			movups [eax+4*ecx], xmm3 // dest[4]
+
+			// from here on eax advances while the remaining rows are reached
+			// with the x1, x2 and x4 stride scale factors
+			movaps xmm0, 48[edx] 
+			movaps xmm1, 96[edx] 
+			lea eax, [eax+2*ecx] // dest = &dest[2]
+			movups [eax+ecx], xmm0 // dest[3]
+			movups [eax+4*ecx], xmm1 // dest[6]
+
+			movaps xmm0, 80[edx] 
+			movaps xmm1, 128[edx] 
+			lea eax, [eax+2*ecx] // dest = &dest[2] (dest[4] from start)
+			movups [eax+ecx], xmm0 // dest[5]
+			movups [eax+4*ecx], xmm1 // dest[8]
+
+			movaps xmm0, 112[edx] 
+			movaps xmm1, 160[edx] 
+			lea eax, [eax+2*ecx] // dest = &dest[2] (dest[6] from start)
+			movups [eax+ecx], xmm0 // dest[7]
+			movups [eax+4*ecx], xmm1 // dest[10]
+
+			movaps xmm0, 144[edx] 
+			movaps xmm1, 192[edx] 
+			lea eax, [eax+2*ecx] // dest = &dest[2] (dest[8] from start)
+			movups [eax+ecx], xmm0 // dest[9]
+			movups [eax+4*ecx], xmm1 // dest[12]
+
+			movaps xmm0, 176[edx] 
+			movaps xmm1, 224[edx] 
+			lea eax, [eax+2*ecx] // dest = &dest[2] (dest[10] from start)
+			movups [eax+ecx], xmm0 // dest[11]
+			movups [eax+4*ecx], xmm1 // dest[14]
+
+			movaps xmm0, 208[edx] 
+			movaps xmm1, 240[edx] 
+			lea eax, [eax+ecx] // dest = &dest[1] (dest[11] from start)
+			movups [eax+2*ecx], xmm0 // dest[13]
+			movups [eax+4*ecx], xmm1 // dest[15]
+		}
+}
+#endif
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy a 16x16 macroblock buffer into a strided image (portable C version).
+ *
+ * \param destination  target image (base_address and stride are used)
+ * \param dest_x       destination column of the top-left pixel
+ * \param dest_y       destination row of the top-left pixel
+ * \param source       16x16 source buffer
+ *************************************************************************************
+ */
+void copy_image_data_16x16_stride_c(VideoImage *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source)
+{
+	const ptrdiff_t pitch = destination->stride;
+	imgpel *out = destination->base_address + pitch * dest_y + dest_x;
+	int row;
+
+	for (row = 0; row < MB_BLOCK_SIZE; ++row, out += pitch)
+		memcpy(out, source[row], MB_BLOCK_SIZE * sizeof (imgpel));
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy an 8x8 block from position (src_x, src_y) of a macroblock buffer
+ *    into a strided image.
+ *
+ *    On 32-bit x86 the copy uses MMX loads and non-temporal stores (movntq);
+ *    elsewhere it is a row-by-row memcpy.
+ *
+ * \param destination  target image (base_address and stride are used)
+ * \param dest_x       destination column of the top-left pixel
+ * \param dest_y       destination row of the top-left pixel
+ * \param imgBuf2      source macroblock buffer (rows are 16 bytes apart in the asm path)
+ * \param src_x        source column of the top-left pixel
+ * \param src_y        source row of the top-left pixel
+ *
+ * \note
+ *    NOTE(review): the asm path leaves the MMX state live (no emms) and does
+ *    not fence the non-temporal stores - confirm callers take care of this.
+ *************************************************************************************
+ */
+void copy_image_data_8x8_stride2(VideoImage *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t imgBuf2, int src_x, int src_y)
+{
+#ifdef _M_IX86
+	ptrdiff_t destination_stride = destination->stride;
+	imgpel *dest = destination->base_address + destination_stride * dest_y + dest_x;
+	_asm
+	{
+		// eax = imgBuf2 + src_y * 16 + src_x
+		mov eax, src_y
+		shl eax, 4
+		add eax, src_x
+		add eax, imgBuf2
+
+		mov edx, dest
+		mov ecx, destination_stride
+
+		// load the eight source rows (8 bytes each)
+		movq mm0, MMWORD PTR 0[eax]
+		movq mm1, MMWORD PTR 16[eax]
+		movq mm2, MMWORD PTR 32[eax]
+		movq mm3, MMWORD PTR 48[eax]
+		movq mm4, MMWORD PTR 64[eax]
+		movq mm5, MMWORD PTR 80[eax]
+		movq mm6, MMWORD PTR 96[eax]
+		movq mm7, MMWORD PTR 112[eax]
+
+		// rows 0, 1, 2, 4 relative to dest
+		movntq [edx], mm0
+		movntq [edx+ecx], mm1
+		movntq [edx+2*ecx], mm2
+		movntq [edx+4*ecx], mm4
+		add edx, ecx // edx = &dest[1]
+		movntq 0[edx+2*ecx], mm3 // dest[3]
+		movntq 0[edx+4*ecx], mm5 // dest[5]
+		add edx, ecx // edx = &dest[2]
+		movntq 0[edx+4*ecx], mm6 // dest[6]
+		add edx, ecx // edx = &dest[3]
+		movntq 0[edx+4*ecx], mm7 // dest[7]
+	}
+#else
+ 	ptrdiff_t destination_stride = destination->stride; // in case the compiler doesn't optimize this
+	imgpel *dest = destination->base_address + destination_stride * dest_y + dest_x;
+	int j;
+	for(j = 0; j < BLOCK_SIZE_8x8; ++j)
+	{
+		memcpy(dest, &imgBuf2[src_y+j][src_x], BLOCK_SIZE_8x8 * sizeof (imgpel));
+		dest+=destination_stride;
+	}
+#endif
+
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy the top-left 8x8 block of a macroblock buffer into a strided image.
+ *
+ *    On 32-bit x86 the copy uses MMX loads and non-temporal stores (movntq);
+ *    elsewhere it is a row-by-row memcpy.
+ *
+ * \param destination  target image (base_address and stride are used)
+ * \param dest_x       destination column of the top-left pixel
+ * \param dest_y       destination row of the top-left pixel
+ * \param imgBuf2      source macroblock buffer (rows are 16 bytes apart in the asm path)
+ *
+ * \note
+ *    NOTE(review): the asm path leaves the MMX state live (no emms) and does
+ *    not fence the non-temporal stores - confirm callers take care of this.
+ *************************************************************************************
+ */
+void copy_image_data_8x8_stride(VideoImage *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t imgBuf2)
+{
+#ifdef _M_IX86
+	ptrdiff_t destination_stride = destination->stride;
+	imgpel *dest = destination->base_address + destination_stride * dest_y + dest_x;
+	_asm
+	{
+		mov eax, imgBuf2
+		mov edx, dest
+		mov ecx, destination_stride
+
+		// load the eight source rows (8 bytes each)
+		movq mm0, MMWORD PTR 0[eax]
+		movq mm1, MMWORD PTR 16[eax]
+		movq mm2, MMWORD PTR 32[eax]
+		movq mm3, MMWORD PTR 48[eax]
+		movq mm4, MMWORD PTR 64[eax]
+		movq mm5, MMWORD PTR 80[eax]
+		movq mm6, MMWORD PTR 96[eax]
+		movq mm7, MMWORD PTR 112[eax]
+
+		// rows 0, 1, 2, 4 relative to dest
+		movntq [edx], mm0
+		movntq [edx+ecx], mm1
+		movntq [edx+2*ecx], mm2
+		movntq [edx+4*ecx], mm4
+		add edx, ecx // edx = &dest[1]
+		movntq 0[edx+2*ecx], mm3 // dest[3]
+		movntq 0[edx+4*ecx], mm5 // dest[5]
+		add edx, ecx // edx = &dest[2]
+		movntq 0[edx+4*ecx], mm6 // dest[6]
+		add edx, ecx // edx = &dest[3]
+		movntq 0[edx+4*ecx], mm7 // dest[7]
+	}
+#else
+ 	ptrdiff_t destination_stride = destination->stride; // in case the compiler doesn't optimize this
+	imgpel *dest = destination->base_address + destination_stride * dest_y + dest_x;
+	int j;
+	for(j = 0; j < BLOCK_SIZE_8x8; ++j)
+	{
+		memcpy(dest, &imgBuf2[j][0], BLOCK_SIZE_8x8 * sizeof (imgpel));
+		dest+=destination_stride;
+	}
+#endif
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy a 4x4 block from position (src_x, src_y) of a macroblock buffer
+ *    into a strided image.
+ *
+ * \param destination  target image (base_address and stride are used)
+ * \param dest_x       destination column of the top-left pixel
+ * \param dest_y       destination row of the top-left pixel
+ * \param source       source macroblock buffer
+ * \param src_x        source column of the top-left pixel
+ * \param src_y        source row of the top-left pixel
+ *************************************************************************************
+ */
+
+void copy_image_data_4x4_stride(VideoImage *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t source, int src_x, int src_y)
+{
+	const ptrdiff_t pitch = destination->stride;
+	imgpel *out = destination->base_address + pitch * dest_y + dest_x;
+	int row;
+
+	for (row = 0; row < BLOCK_SIZE; ++row, out += pitch)
+		memcpy(out, &source[src_y + row][src_x], BLOCK_SIZE * sizeof (imgpel));
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy a width x height block of pixels between two row-pointer images.
+ *
+ * \param imgBuf1  destination rows; rows 0..height-1 are written starting at column dest_x
+ * \param imgBuf2  source rows; rows 0..height-1 are read starting at column src_x
+ * \param dest_x   first destination column
+ * \param src_x    first source column
+ * \param width    number of pixels copied per row
+ * \param height   number of rows copied
+ *************************************************************************************
+ */
+void copy_image_data(imgpel  **imgBuf1, imgpel  **imgBuf2, int dest_x, int src_x, int width, int height)
+{
+  int row = 0;
+
+  while (row < height)
+  {
+    memcpy(imgBuf1[row] + dest_x, imgBuf2[row] + src_x, width * sizeof (imgpel));
+    ++row;
+  }
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Copy the top-left width x height area of a macroblock buffer into a
+ *    strided image.
+ *
+ * \param destination  target image (base_address and stride are used)
+ * \param dest_x       destination column of the top-left pixel
+ * \param dest_y       destination row of the top-left pixel
+ * \param imgBuf2      source macroblock buffer
+ * \param width        number of pixels copied per row
+ * \param height       number of rows copied
+ *************************************************************************************
+ */
+void copy_image_data_stride(VideoImage *destination, int dest_x, int dest_y, const h264_imgpel_macroblock_t imgBuf2, int width, int height)
+{
+	const ptrdiff_t pitch = destination->stride;
+	imgpel *out = destination->base_address + pitch * dest_y + dest_x;
+#ifdef H264_IPP
+	// source step is the size of one macroblock-buffer row
+	IppiSize roi = {width,height};
+	ippiCopy_8u_C1R(imgBuf2[0], sizeof(imgBuf2[0]), out, pitch, roi);
+#else
+	int row;
+
+	for (row = 0; row < height; ++row, out += pitch)
+		memcpy(out, imgBuf2[row], width * sizeof (imgpel));
+#endif
+}
diff --git a/Src/h264dec/ldecod/src/cabac.c b/Src/h264dec/ldecod/src/cabac.c
new file mode 100644
index 00000000..a3c43513
--- /dev/null
+++ b/Src/h264dec/ldecod/src/cabac.c
@@ -0,0 +1,2123 @@
+/*!
+*************************************************************************************
+* \file cabac.c
+*
+* \brief
+*    CABAC entropy coding routines
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Detlev Marpe                    <marpe@hhi.de>
+**************************************************************************************
+*/
+
+#include "global.h"
+#include "cabac.h"
+#include "memalloc.h"
+#include "elements.h"
+#include "image.h"
+#include "biaridecod.h"
+#include "mb_access.h"
+#include "vlc.h"
+#include <mmintrin.h>
+/* get_bit(x, n): bit n of the 64-bit lvalue x, as 0 or 1.
+ * x is reinterpreted as an __m64 and shifted right with an MMX intrinsic, so
+ * it must be an addressable object.
+ * NOTE(review): uses MMX state without emms - confirm callers handle this. */
+#define get_bit(x, n) (_mm_cvtsi64_si32(_mm_srli_si64(*(__m64 *)&(x), n)) & 1)
+/* plain C equivalent, kept for reference: */
+/*static inline int get_bit(int64 x,int n)
+{
+return (int)(((x >> n) & 1));
+}*/
+
+/* OR (mask << position) into the first 32-bit word of *x; the shift is done
+ * in int, so only that word is touched. */
+static __forceinline void or_bits_low(int64 *x, int mask, int position)
+{
+	int32_t *first_word = (int32_t *)x;
+
+	*first_word |= (mask << position);
+}
+
+/* OR (mask << position) into the 64-bit value *x. 32-bit x86 builds do the
+ * shift and OR with MMX intrinsics; other builds widen mask to int64 first. */
+static inline void or_bits(int64 *x, int mask, int position)
+{
+#ifdef _M_IX86
+	const __m64 shifted = _mm_slli_si64(_mm_cvtsi32_si64(mask), position);
+
+	*(__m64 *)x = _mm_or_si64(*(__m64 *)x, shifted);
+#else
+	const int64 shifted = (int64) mask << position;
+
+	*x |= shifted;
+#endif
+}
+#if TRACE
+int symbolCount = 0;
+#endif
+
+/***********************************************************************
+* L O C A L L Y   D E F I N E D   F U N C T I O N   P R O T O T Y P E S
+***********************************************************************
+*/
+static unsigned int unary_bin_decode(DecodingEnvironmentPtr dep_dp,
+																		 BiContextTypePtr ctx,
+																		 int ctx_offset);
+static unsigned int unary_bin_max_decode(DecodingEnvironmentPtr dep_dp,
+																				 BiContextTypePtr ctx,
+																				 int ctx_offset,
+																				 unsigned int max_symbol);
+
+unsigned int unary_exp_golomb_mv_decode(DecodingEnvironmentPtr dep_dp, BiContextTypePtr ctx, unsigned int max_bin);
+unsigned int unary_exp_golomb_mv_decode3(DecodingEnvironmentPtr dep_dp, BiContextTypePtr ctx);
+
+/*!
+************************************************************************
+* \brief
+*    Set currMB->mb_left and currMB->mb_up to the left / upper luma
+*    neighbour macroblock, or to NULL when that neighbour is not available.
+************************************************************************
+*/
+void CheckAvailabilityOfNeighborsCABAC(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos left_pos, up_pos;
+
+	p_Vid->getNeighbourLeftLuma(currMB, &left_pos);
+	p_Vid->getNeighbourUpLuma(currMB, &up_pos);
+
+	currMB->mb_up   = up_pos.available   ? &p_Vid->mb_data[up_pos.mb_addr]   : NULL;
+	currMB->mb_left = left_pos.available ? &p_Vid->mb_data[left_pos.mb_addr] : NULL;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reset the per-slice CABAC state: clears currSlice->last_dquant.
+************************************************************************
+*/
+void cabac_new_slice(Slice *currSlice)
+{
+	currSlice->last_dquant = 0;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Allocate a zero-initialized set of context models for the motion
+*    info used in arithmetic decoding.
+*
+* \return
+*    the new context set; no_mem_exit() is called if allocation fails
+************************************************************************
+*/
+MotionInfoContexts* create_contexts_MotionInfo(void)
+{
+	MotionInfoContexts *deco_ctx = (MotionInfoContexts*) calloc(1, sizeof(MotionInfoContexts) );
+
+	if (!deco_ctx)
+		no_mem_exit("create_contexts_MotionInfo: deco_ctx");
+
+	return deco_ctx;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Allocate a zero-initialized set of context models for the texture
+*    info used in arithmetic decoding.
+*
+* \return
+*    the new context set; no_mem_exit() is called if allocation fails
+************************************************************************
+*/
+TextureInfoContexts* create_contexts_TextureInfo(void)
+{
+	TextureInfoContexts *deco_ctx = (TextureInfoContexts*) calloc(1, sizeof(TextureInfoContexts) );
+
+	if (!deco_ctx)
+		no_mem_exit("create_contexts_TextureInfo: deco_ctx");
+
+	return deco_ctx;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Free the context models used for arithmetic decoding of the motion
+*    info. Passing NULL is allowed and does nothing.
+************************************************************************
+*/
+void delete_contexts_MotionInfo(MotionInfoContexts *deco_ctx)
+{
+	if (deco_ctx != NULL)
+		free(deco_ctx);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Free the context models used for arithmetic decoding of the texture
+*    info. Passing NULL is allowed and does nothing.
+************************************************************************
+*/
+void delete_contexts_TextureInfo(TextureInfoContexts *deco_ctx)
+{
+	if (deco_ctx != NULL)
+		free(deco_ctx);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Decode the field-mode flag of a macroblock. The context index is the
+*    sum of the mb_field values of the available left and upper neighbours.
+************************************************************************
+*/
+Boolean readFieldModeInfo_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp)
+{  
+	VideoParameters *p_Vid = currMB->p_Vid;
+	MotionInfoContexts *ctx = currMB->p_Slice->mot_ctx;
+	int left_field = 0;
+	int up_field = 0;
+
+	if (currMB->mb_avail_left)
+		left_field = p_Vid->mb_data[currMB->mb_addr_left].mb_field;
+	if (currMB->mb_avail_up)
+		up_field = p_Vid->mb_data[currMB->mb_addr_up].mb_field;
+
+	return biari_decode_symbol (dep_dp, &ctx->mb_aff_contexts[left_field + up_field]);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Look ahead at the next macroblock: decode its skip flag and, when it
+*    is not skipped, its field-mode flag, then restore the decoder state
+*    so that the bitstream position and contexts are unchanged.
+*
+*    The decoded field mode is stored in the mb_field member of the
+*    macroblock preceding the next one.
+*
+* \param currSlice  slice being decoded
+* \param act_dp     data partition whose CABAC engine is read from
+*
+* \return
+*    the skip flag of the next macroblock
+************************************************************************
+*/
+int check_next_mb_and_get_field_mode_CABAC(Slice *currSlice, DataPartition  *act_dp)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+	DecodingEnvironmentPtr dep_dp = &(act_dp->de_cabac);
+
+	// snapshots of everything the look-ahead modifies; fixed size, so they
+	// live on the stack and cannot fail to allocate
+	DecodingEnvironment saved_dep;
+	BiContextType       saved_mb_type[3][NUM_MB_TYPE_CTX];
+	BiContextType       saved_mb_aff[NUM_MB_AFF_CTX];
+	int                 saved_len;
+
+	int skip  = 0;
+	int field = 0;
+	int i;
+
+	Macroblock *currMB;
+
+	//get next MB
+	++p_Vid->current_mb_nr;
+
+	currMB = &p_Vid->mb_data[p_Vid->current_mb_nr];
+	currMB->p_Vid    = p_Vid;
+	currMB->p_Slice  = currSlice; 
+	currMB->slice_nr = p_Vid->current_slice_nr;
+	currMB->mb_field = p_Vid->mb_data[p_Vid->current_mb_nr-1].mb_field;
+	currMB->mbAddrX  = p_Vid->current_mb_nr;
+
+	CheckAvailabilityOfNeighbors(currMB);
+	CheckAvailabilityOfNeighborsCABAC(currMB);
+
+	//save
+	memcpy(&saved_dep, dep_dp, sizeof(DecodingEnvironment));
+	saved_len = *(dep_dp->Dcodestrm_len);
+	for (i=0;i<3;++i)
+		memcpy(saved_mb_type[i], currSlice->mot_ctx->mb_type_contexts[i], NUM_MB_TYPE_CTX*sizeof(BiContextType) );
+	memcpy(saved_mb_aff, currSlice->mot_ctx->mb_aff_contexts, NUM_MB_AFF_CTX*sizeof(BiContextType) );
+
+	//check_next_mb
+	currSlice->last_dquant = 0;
+	skip = readMB_skip_flagInfo_CABAC(currMB, dep_dp);
+
+	if (!skip)
+	{
+		field = readFieldModeInfo_CABAC(currMB, dep_dp);
+		p_Vid->mb_data[p_Vid->current_mb_nr-1].mb_field = field;
+	}
+
+	//reset
+	p_Vid->current_mb_nr--;
+
+	memcpy(dep_dp, &saved_dep, sizeof(DecodingEnvironment));
+	*(dep_dp->Dcodestrm_len) = saved_len;
+	for (i=0;i<3;++i)
+		memcpy(currSlice->mot_ctx->mb_type_contexts[i], saved_mb_type[i], NUM_MB_TYPE_CTX*sizeof(BiContextType) );
+	memcpy(currSlice->mot_ctx->mb_aff_contexts, saved_mb_aff, NUM_MB_AFF_CTX*sizeof(BiContextType) );
+
+	CheckAvailabilityOfNeighborsCABAC(currMB);
+
+	return skip;
+}
+
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode one signed motion vector difference component.
+*
+* \param dep_dp   CABAC decoding engine
+* \param mv_ctx   motion vector residual contexts
+* \param act_ctx  base context offset for this component
+* \param err      additional offset applied to the first bin's context
+*
+* \return
+*    the decoded value; 0 when the first bin is 0
+************************************************************************
+*/
+#if defined(_DEBUG) || !defined(_M_IX86)
+int decodeMVD_CABAC(DecodingEnvironmentPtr dep_dp, BiContextType mv_ctx[2][NUM_MV_RES_CTX], int act_ctx, int err)
+{
+	int magnitude;
+
+	// first bin: zero / non-zero
+	if (biari_decode_symbol(dep_dp, &mv_ctx[0][act_ctx+err]) == 0)
+		return 0;
+
+	// remaining magnitude, then the bypass-coded sign
+	magnitude = unary_exp_golomb_mv_decode3(dep_dp, mv_ctx[1]+act_ctx);
+	++magnitude;
+
+	return biari_decode_symbol_eq_prob(dep_dp) ? -magnitude : magnitude;
+}
+#else
+int decodeMVD_CABAC(DecodingEnvironmentPtr dep_dp, BiContextType mv_ctx[2][NUM_MV_RES_CTX], int act_ctx, int err);
+#endif
+
+/*!
+************************************************************************
+* \brief
+*    Read one motion vector difference component with CABAC. The context
+*    of the first bin is selected from the summed absolute MVDs of the
+*    upper and left neighbouring blocks.
+*
+* \param currMB    current macroblock
+* \param dep_dp    CABAC decoding engine
+* \param k         MVD component index; component 1 is rescaled between
+*                  frame and field neighbours when mb_aff_frame_flag is set
+* \param list_idx  reference list index into the mvd arrays
+* \param x         luma x position of the block inside the macroblock
+* \param y         luma y position of the block inside the macroblock
+*
+* \return
+*    the decoded motion vector difference
+************************************************************************
+*/
+int readMVD_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int k, int list_idx, int x, int y)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+	const int rescale = currSlice->mb_aff_frame_flag && (k==1);
+	int abs_left = 0, abs_up = 0;
+	int sum;
+	int err;
+
+	PixelPos left_blk, up_blk;
+
+	// upper neighbour: same column as the current block
+	p_Vid->getNeighbourPXLumaNB_NoPos(currMB, y - 1, &up_blk);
+	if (up_blk.available)
+	{
+		abs_up = abs(p_Vid->mb_data[up_blk.mb_addr].mvd[list_idx][up_blk.y>>2][x>>2][k]);
+		if (rescale)
+		{
+			const int nb_field = p_Vid->mb_data[up_blk.mb_addr].mb_field;
+
+			if ((currMB->mb_field==0) && (nb_field==1))
+				abs_up *= 2;
+			else if ((currMB->mb_field==1) && (nb_field==0))
+				abs_up /= 2;
+		}
+	}
+
+	// left neighbour
+	p_Vid->getNeighbourXPLumaNB_NoPos(currMB, x - 1, y    , &left_blk);
+	if (left_blk.available)
+	{
+		abs_left = abs(p_Vid->mb_data[left_blk.mb_addr].mvd[list_idx][left_blk.y>>2][left_blk.x>>2][k]);
+		if (rescale)
+		{
+			const int nb_field = p_Vid->mb_data[left_blk.mb_addr].mb_field;
+
+			if ((currMB->mb_field==0) && (nb_field==1))
+				abs_left *= 2;
+			else if ((currMB->mb_field==1) && (nb_field==0))
+				abs_left /= 2;
+		}
+	}
+
+	// context offset: 0 below 3, 3 above 32, otherwise 2
+	sum = abs_left + abs_up;
+	if (sum < 3)
+		err = 0;
+	else if (sum > 32)
+		err = 3;
+	else
+		err = 2;
+
+	return decodeMVD_CABAC(dep_dp, ctx->mv_res_contexts, 5*k, err);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode the 8x8 block (sub-macroblock) type.
+*
+*    Slices other than B_SLICE use b8_type_contexts[0] and yield a
+*    value in 0..3. B slices use b8_type_contexts[1] and yield 0 when
+*    the first bin is zero, otherwise a value in 1..12.
+************************************************************************
+*/
+int readB8_typeInfo_CABAC(Slice *currSlice, DecodingEnvironmentPtr dep_dp)
+{
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+	int sym;
+
+	if (currSlice->slice_type != B_SLICE)
+	{
+		if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[0][1]))
+			return 0;
+		if (!biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[0][3]))
+			return 1;
+		return biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[0][4]) ? 2 : 3;
+	}
+
+	/* B slice: a zero first bin means type 0 */
+	if (!biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][0]))
+		return 0;
+
+	if (!biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][1]))
+	{
+		sym = biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]) ? 1 : 0;
+	}
+	else if (!biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][2]))
+	{
+		sym = 2;
+		if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]))
+			sym += 2;
+		if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]))
+			sym += 1;
+	}
+	else if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]))
+	{
+		sym = 10;
+		if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]))
+			sym += 1;
+	}
+	else
+	{
+		sym = 6;
+		if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]))
+			sym += 2;
+		if (biari_decode_symbol(dep_dp, &ctx->b8_type_contexts[1][3]))
+			sym += 1;
+	}
+
+	/* every non-zero B type is shifted up by one */
+	return sym + 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode the skip flag of a given MB.
+*
+*    The context increment counts how many of the left and upper
+*    macroblocks exist and have skip_flag == 0. B slices read the bin
+*    from mb_type_contexts[2] at offset 7 + increment, all other slices
+*    from mb_type_contexts[1] at offset increment.
+*    When the decoded bin is non-zero, currSlice->last_dquant is reset
+*    to 0.
+*
+*    Only compiled for debug or non-x86 builds.
+************************************************************************
+*/
+#if defined(_DEBUG) || !defined(_M_IX86)
+int readMB_skip_flagInfo_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp)
+{
+	Slice *currSlice = currMB->p_Slice;
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+	int ctx_inc = 0;
+	int skip;
+
+	if ((currMB->mb_left != NULL) && (currMB->mb_left->skip_flag == 0))
+		++ctx_inc;
+	if ((currMB->mb_up != NULL) && (currMB->mb_up->skip_flag == 0))
+		++ctx_inc;
+
+	if (currSlice->slice_type == B_SLICE)
+		skip = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[2] + 7 + ctx_inc);
+	else
+		skip = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + ctx_inc);
+
+	if (skip)
+		currSlice->last_dquant = 0;
+
+	return skip;
+}
+#endif
+
+/*!
+***************************************************************************
+* \brief
+*    Arithmetically decode the transform size flag of a given MB.
+*
+*    The context index is the sum of luma_transform_size_8x8_flag of the
+*    left and upper macroblocks; a missing neighbour contributes 0.
+***************************************************************************
+*/
+
+Boolean readMB_transform_size_flag_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp)
+{
+	TextureInfoContexts *ctx = currMB->p_Slice->tex_ctx;
+	int ctx_inc = 0;
+	int flag;
+
+	if (currMB->mb_left != NULL)
+		ctx_inc += currMB->mb_left->luma_transform_size_8x8_flag;
+	if (currMB->mb_up != NULL)
+		ctx_inc += currMB->mb_up->luma_transform_size_8x8_flag;
+
+	flag = biari_decode_symbol(dep_dp, &ctx->transform_size_contexts[ctx_inc]);
+	return flag;
+}
+
+/*!
+************************************************************************
+* \brief
+*    This function is used to arithmetically decode the macroblock
+*    type info of a given MB.
+*
+*    Three paths exist, selected by currSlice->slice_type:
+*      - I_SLICE:  first bin from mb_type_contexts[0], context chosen
+*                  from the mb_type of the upper/left macroblocks.
+*      - SI_SLICE: a prefix bin from mb_type_contexts[1], then the same
+*                  scheme as the I path with all results shifted by one.
+*      - others:   a bin tree over mb_type_contexts[2] (B slices) or
+*                  mb_type_contexts[1] (non-B), optionally followed by
+*                  extra bins for the 16x16 intra types.
+*    Missing neighbours (mb_up / mb_left == NULL) contribute 0 to the
+*    context increment.
+*
+* \param currMB   macroblock being decoded (supplies p_Slice, mb_up, mb_left)
+* \param dep_dp   decoding environment the bins are read from
+* \return         decoded macroblock type code (curr_mb_type)
+************************************************************************
+*/
+int readMB_typeInfo_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp)
+{
+	Slice *currSlice = currMB->p_Slice;
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+
+	int a = 0, b = 0; // context contributions of the left (a) and upper (b) macroblock
+	int act_ctx;
+	int act_sym;
+	int bframe=(currSlice->slice_type == B_SLICE);
+	int mode_sym;
+	int curr_mb_type;
+
+	if(currSlice->slice_type == I_SLICE)  // INTRA-frame
+	{
+		if (currMB->mb_up != NULL)
+			b = (((currMB->mb_up)->mb_type != I4MB && currMB->mb_up->mb_type != I8MB) ? 1 : 0 ); // 1 when the MB above is neither I4MB nor I8MB
+
+		if (currMB->mb_left != NULL)
+			a = (((currMB->mb_left)->mb_type != I4MB && currMB->mb_left->mb_type != I8MB) ? 1 : 0 ); // same test for the MB to the left
+
+		act_ctx = a + b;
+		act_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx);
+
+		if (act_sym==0) // 4x4 Intra
+		{
+			curr_mb_type = act_sym; // act_sym is 0 here
+		}
+		else // 16x16 Intra
+		{
+			mode_sym = biari_decode_final(dep_dp); // bin read via biari_decode_final(); 1 selects type 25
+			if(mode_sym == 1)
+			{
+				curr_mb_type = 25; // presumably the PCM type of an I slice - TODO confirm
+			}
+			else
+			{
+				act_sym = 1; // base value; the bins below add 12, 4/8, 2 and 1
+				act_ctx = 4;
+				mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx ); // decoding of AC/no AC
+				act_sym += mode_sym*12;
+				act_ctx = 5;
+				// decoding of cbp: 0,1,2
+				mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+				if (mode_sym!=0)
+				{
+					act_ctx=6;
+					mode_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+					act_sym+=4;
+					if (mode_sym!=0)
+						act_sym+=4;
+				}
+				// decoding of I pred-mode: 0,1,2,3
+				act_ctx = 7;
+				mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+				act_sym += mode_sym*2;
+				act_ctx = 8;
+				mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+				act_sym += mode_sym;
+				curr_mb_type = act_sym;
+			}
+		}
+	}
+	else if(currSlice->slice_type == SI_SLICE)  // SI-frame
+	{
+		// special ctx's for SI4MB
+		if (currMB->mb_up != NULL)
+			b = (( (currMB->mb_up)->mb_type != SI4MB) ? 1 : 0 );
+
+		if (currMB->mb_left != NULL)
+			a = (( (currMB->mb_left)->mb_type != SI4MB) ? 1 : 0 );
+
+		act_ctx = a + b;
+		act_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + act_ctx);
+
+		if (act_sym==0) //  SI 4x4 Intra
+		{
+			curr_mb_type = 0;
+		}
+		else // analog INTRA_IMG
+		{
+			if (currMB->mb_up != NULL)
+				b = (( (currMB->mb_up)->mb_type != I4MB) ? 1 : 0 ); // unlike the I_SLICE path, I8MB is not tested here
+
+			if (currMB->mb_left != NULL)
+				a = (( (currMB->mb_left)->mb_type != I4MB) ? 1 : 0 );
+
+			act_ctx = a + b;
+			act_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx);
+
+			if (act_sym==0) // 4x4 Intra
+			{
+				curr_mb_type = 1;
+			}
+			else // 16x16 Intra
+			{
+				mode_sym = biari_decode_final(dep_dp);
+				if( mode_sym==1 )
+				{
+					curr_mb_type = 26; // one above the value used in the I_SLICE path
+				}
+				else
+				{
+					act_sym = 2; // base is one higher than in the I_SLICE path; 0 and 1 are assigned above
+					act_ctx = 4;
+					mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx ); // decoding of AC/no AC
+					act_sym += mode_sym*12;
+					act_ctx = 5;
+					// decoding of cbp: 0,1,2
+					mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+					if (mode_sym!=0)
+					{
+						act_ctx=6;
+						mode_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+						act_sym+=4;
+						if (mode_sym!=0)
+							act_sym+=4;
+					}
+					// decoding of I pred-mode: 0,1,2,3
+					act_ctx = 7;
+					mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+					act_sym += mode_sym*2;
+					act_ctx = 8;
+					mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[0] + act_ctx );
+					act_sym += mode_sym;
+					curr_mb_type = act_sym;
+				}
+			}
+		}
+	}
+	else
+	{
+		if (bframe)
+		{
+			if (currMB->mb_up != NULL)
+				b = (( (currMB->mb_up)->mb_type != 0) ? 1 : 0 );
+
+			if (currMB->mb_left != NULL)
+				a = (( (currMB->mb_left)->mb_type != 0) ? 1 : 0 );
+
+			act_ctx = a + b;
+
+			if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][act_ctx]))
+			{
+				if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][4]))
+				{
+					if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][5]))
+					{
+						act_sym=12; // three bins add 8, 4 and 2; the results are remapped below
+						if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=8;
+						if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=4;
+						if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=2;
+
+						if      (act_sym==24)  act_sym=11; // 24 and 26 are final values: no further bin is read
+						else if (act_sym==26)  act_sym=22;
+						else
+						{
+							if (act_sym==22)     act_sym=23; // 22 becomes 23 before the last bin, so it can end as 23 or 24
+							if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=1;
+						}
+					}
+					else
+					{
+						act_sym=3;
+						if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=4;
+						if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=2;
+						if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym+=1;
+					}
+				}
+				else
+				{
+					if (biari_decode_symbol (dep_dp, &ctx->mb_type_contexts[2][6])) act_sym=2;
+					else                                                            act_sym=1;
+				}
+			}
+			else
+			{
+				act_sym = 0;
+			}
+		}
+		else // P-frame
+		{
+			{
+				if (biari_decode_symbol(dep_dp, &ctx->mb_type_contexts[1][4] ))
+				{
+					if (biari_decode_symbol(dep_dp, &ctx->mb_type_contexts[1][7] ))   act_sym = 7;
+					else                                                              act_sym = 6;
+				}
+				else
+				{
+					if (biari_decode_symbol(dep_dp, &ctx->mb_type_contexts[1][5] ))
+					{
+						if (biari_decode_symbol(dep_dp, &ctx->mb_type_contexts[1][7] )) act_sym = 2;
+						else                                                            act_sym = 3;
+					}
+					else
+					{
+						if (biari_decode_symbol(dep_dp, &ctx->mb_type_contexts[1][6] )) act_sym = 4;
+						else                                                            act_sym = 1;
+					}
+				}
+			}
+		}
+
+		if (act_sym<=6 || (((currSlice->slice_type == B_SLICE) ? 1 : 0) && act_sym<=23)) // the B_SLICE test repeats what bframe already holds
+		{
+			curr_mb_type = act_sym;
+		}
+		else  // additional info for 16x16 Intra-mode
+		{
+			mode_sym = biari_decode_final(dep_dp);
+			if( mode_sym==1 )
+			{
+				if(bframe)  // B frame
+					curr_mb_type = 48;
+				else      // P frame
+					curr_mb_type = 31;
+			}
+			else
+			{
+				act_ctx = 8; // this path uses contexts 8..10 of mb_type_contexts[1]
+				mode_sym =  biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + act_ctx ); // decoding of AC/no AC
+				act_sym += mode_sym*12;
+
+				// decoding of cbp: 0,1,2
+				act_ctx = 9;
+				mode_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + act_ctx );
+				if (mode_sym != 0)
+				{
+					act_sym+=4;
+					mode_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + act_ctx ); // act_ctx is still 9: both cbp bins share one context here
+					if (mode_sym != 0)
+						act_sym+=4;
+				}
+
+				// decoding of I pred-mode: 0,1,2,3
+				act_ctx = 10;
+				mode_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + act_ctx );
+				act_sym += mode_sym*2;
+				mode_sym = biari_decode_symbol(dep_dp, ctx->mb_type_contexts[1] + act_ctx ); // context 10 is reused for the second pred-mode bin
+				act_sym += mode_sym;
+				curr_mb_type = act_sym;
+			}
+		}
+	}
+	return curr_mb_type;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode one intra prediction mode.
+*
+*    The first bin (ipr_contexts[0]) tells whether the most probable
+*    mode is used; in that case -1 is returned. Otherwise three bins
+*    read with ipr_contexts[1] form the result, least significant bit
+*    first (range 0..7).
+*
+*    Only compiled for debug or non-x86 builds.
+************************************************************************
+*/
+#if defined(_DEBUG) || !defined(_M_IX86)
+int readIntraPredMode_CABAC(Slice *currSlice, DecodingEnvironmentPtr dep_dp)
+{
+	TextureInfoContexts *ctx = currSlice->tex_ctx;
+	int use_most_probable = biari_decode_symbol(dep_dp, ctx->ipr_contexts);
+	int pred_mode = 0;
+	int shift;
+
+	if (use_most_probable == 1)
+		return -1;
+
+	// remaining mode selector: 3 bins, LSB first
+	for (shift = 0; shift < 3; ++shift)
+		pred_mode |= (biari_decode_symbol(dep_dp, ctx->ipr_contexts + 1) << shift);
+
+	return pred_mode;
+}
+#endif
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode the reference index for the block at (x, y)
+*    of a given MB.
+*
+*    The context increment of the first bin is +2 for the block above
+*    and +1 for the block to the left when that neighbour's stored
+*    ref_idx exceeds a threshold. A neighbour contributes nothing when
+*    it is unavailable, is an IPCM or direct macroblock, or its 8x8
+*    partition has b8mode 0 together with b8pdir 2. The threshold is 1
+*    when mb_aff_frame_flag is set, the current MB is a frame MB and the
+*    neighbour a field MB; otherwise it is 0.
+*
+* \return 0 when the first bin is zero, otherwise the value from
+*         unary_bin_decode() plus one.
+************************************************************************
+*/
+char readRefFrame_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int list, int x, int y)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+	PicMotion **ref_grid = p_Vid->dec_picture->motion.motion[list];
+	PixelPos up_blk, left_blk;
+	int ctx_inc = 0;
+	int ref;
+
+	// TODO: callers pass x << 2 and y << 2, so the internal >> 2 could be undone by passing x and y directly
+	p_Vid->getNeighbourPXLuma(currMB, x, y - 1, &up_blk);
+	if (up_blk.available)
+	{
+		Macroblock *nb = &p_Vid->mb_data[up_blk.mb_addr];
+		int part = ((up_blk.x >> 3) & 0x01) + ((up_blk.y >> 2) & 0x02);
+
+		if ((nb->mb_type != IPCM) && !(IS_DIRECT(nb)) && !(nb->b8mode[part] == 0 && nb->b8pdir[part] == 2))
+		{
+			int min_ref = (currSlice->mb_aff_frame_flag && (currMB->mb_field == FALSE) && (nb->mb_field == TRUE)) ? 1 : 0;
+
+			if (ref_grid[up_blk.pos_y >> 2][up_blk.pos_x >> 2].ref_idx > min_ref)
+				ctx_inc += 2;
+		}
+	}
+
+	p_Vid->getNeighbourXPLuma(currMB, x - 1, y, &left_blk);
+	if (left_blk.available)
+	{
+		Macroblock *nb = &p_Vid->mb_data[left_blk.mb_addr];
+		int part = ((left_blk.x >> 3) & 0x01) + ((left_blk.y >> 2) & 0x02);
+
+		if ((nb->mb_type != IPCM) && !(IS_DIRECT(nb)) && !(nb->b8mode[part] == 0 && nb->b8pdir[part] == 2))
+		{
+			int min_ref = (currSlice->mb_aff_frame_flag && (currMB->mb_field == FALSE) && (nb->mb_field == 1)) ? 1 : 0;
+
+			if (ref_grid[left_blk.pos_y >> 2][left_blk.pos_x >> 2].ref_idx > min_ref)
+				ctx_inc += 1;
+		}
+	}
+
+	if (biari_decode_symbol(dep_dp, ctx->ref_no_contexts[0] + ctx_inc) == 0)
+		return 0;
+
+	// remaining bins are read starting at context 4
+	ref = unary_bin_decode(dep_dp, ctx->ref_no_contexts[0] + 4, 1);
+	return (char)(ref + 1);
+}
+
+/*
+ * Variant of readRefFrame_CABAC() for x == 0.
+ *
+ * The block above is fetched with getNeighbour0XLuma() and the block to
+ * the left with getNeighbourNXLuma(); the horizontal part of the 8x8
+ * partition index is fixed (0 for the upper neighbour, 1 for the left
+ * one). Everything else follows the same rules: +2 / +1 context increment
+ * when the neighbour's ref_idx exceeds the threshold, then an optional
+ * unary part whose result is incremented by one.
+ */
+char readRefFrame_CABAC0(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int list, int y)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+	PicMotion **ref_grid = p_Vid->dec_picture->motion.motion[list];
+	PixelPos up_blk, left_blk;
+	int ctx_inc = 0;
+	int ref;
+
+	// TODO: callers pass x << 2 and y << 2, so the internal >> 2 could be undone by passing x and y directly
+	p_Vid->getNeighbour0XLuma(currMB, y - 1, &up_blk);
+	if (up_blk.available)
+	{
+		Macroblock *nb = &p_Vid->mb_data[up_blk.mb_addr];
+		int part = 0 + ((up_blk.y >> 2) & 0x02);
+
+		if ((nb->mb_type != IPCM) && !(IS_DIRECT(nb)) && !(nb->b8mode[part] == 0 && nb->b8pdir[part] == 2))
+		{
+			int min_ref = (currSlice->mb_aff_frame_flag && (currMB->mb_field == FALSE) && (nb->mb_field == TRUE)) ? 1 : 0;
+
+			if (ref_grid[up_blk.pos_y >> 2][up_blk.pos_x >> 2].ref_idx > min_ref)
+				ctx_inc += 2;
+		}
+	}
+
+	p_Vid->getNeighbourNXLuma(currMB, y, &left_blk);
+	if (left_blk.available)
+	{
+		Macroblock *nb = &p_Vid->mb_data[left_blk.mb_addr];
+		int part = ((15 >> 3) & 0x01) + ((left_blk.y >> 2) & 0x02);
+
+		if ((nb->mb_type != IPCM) && !(IS_DIRECT(nb)) && !(nb->b8mode[part] == 0 && nb->b8pdir[part] == 2))
+		{
+			int min_ref = (currSlice->mb_aff_frame_flag && (currMB->mb_field == FALSE) && (nb->mb_field == 1)) ? 1 : 0;
+
+			if (ref_grid[left_blk.pos_y >> 2][left_blk.pos_x >> 2].ref_idx > min_ref)
+				ctx_inc += 1;
+		}
+	}
+
+	if (biari_decode_symbol(dep_dp, ctx->ref_no_contexts[0] + ctx_inc) == 0)
+		return 0;
+
+	// remaining bins are read starting at context 4
+	ref = unary_bin_decode(dep_dp, ctx->ref_no_contexts[0] + 4, 1);
+	return (char)(ref + 1);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode the delta qp of a given MB.
+*
+*    The first bin uses delta_qp_contexts[0] or [1] depending on
+*    whether currSlice->last_dquant is non-zero. A non-zero bin is
+*    followed by a unary part (contexts from index 2) whose value is
+*    incremented by one. The resulting code k maps to a signed value
+*    of magnitude (k + 1) >> 1, positive for odd k and negative for
+*    even k. The result is also stored in currSlice->last_dquant.
+*
+*    Only compiled for debug or non-x86 builds.
+************************************************************************
+*/
+#if defined(_DEBUG) || !defined(_M_IX86)
+short readDquant_CABAC(Slice *currSlice, DecodingEnvironmentPtr dep_dp)
+{
+	MotionInfoContexts *ctx = currSlice->mot_ctx;
+	int first_ctx = (currSlice->last_dquant != 0) ? 1 : 0;
+	int code = biari_decode_symbol(dep_dp, &ctx->delta_qp_contexts[first_ctx]);
+	short magnitude;
+	short dquant;
+
+	if (code != 0)
+	{
+		code = unary_bin_decode(dep_dp, &ctx->delta_qp_contexts[2], 1);
+		code += 1;
+	}
+
+	magnitude = (code + 1) >> 1;
+	// lsb of the code carries the sign: odd -> positive, even -> negative
+	dquant = (code & 0x01) ? magnitude : (short)-magnitude;
+
+	currSlice->last_dquant = dquant;
+	return dquant;
+}
+#endif
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode the coded block pattern of a given MB.
+*
+*    Four luma bins (cbp bits 0..3) are read with cbp_contexts[0]. For
+*    each bin the context is "left" (+1) plus "up" (+2), where a
+*    contribution is made when the corresponding neighbouring cbp bit is
+*    zero. Neighbours inside the current MB are taken from the bits
+*    decoded so far; neighbours outside come from the upper MB (mb_up)
+*    and from the left luma neighbour lookups. Unavailable or IPCM
+*    neighbours contribute 0.
+*
+*    Unless chroma_format_idc is YUV400 or YUV444, a chroma bin is read
+*    with cbp_contexts[1]; when it is set, a second bin read with
+*    cbp_contexts[2] decides whether 16 or 32 is added to cbp.
+*
+*    currSlice->last_dquant is reset to 0 when the final cbp is zero.
+************************************************************************
+*/
+int readCBP_CABAC(Macroblock *currMB,  DecodingEnvironmentPtr dep_dp)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	Slice *currSlice = currMB->p_Slice;
+	TextureInfoContexts *ctx = currSlice->tex_ctx;
+	Macroblock *mb_above = currMB->mb_up;
+	Macroblock *mb_beside;
+	PixelPos left_blk;
+	int cbp = 0;
+	int bin;
+	int inc_left;
+	int inc_up;
+
+	// ---- luma bit 0: up neighbour is bit 2 of the MB above ----
+	inc_up = 0;
+	if ((mb_above != NULL) && (mb_above->mb_type != IPCM) && ((mb_above->cbp & 4) == 0))
+		inc_up = 2;
+
+	inc_left = 0;
+	p_Vid->getNeighbourLeftLuma(currMB, &left_blk);
+	if (left_blk.available)
+	{
+		mb_beside = &p_Vid->mb_data[left_blk.mb_addr];
+		if ((mb_beside->mb_type != IPCM) && ((mb_beside->cbp & (1 << (2 * (left_blk.y >> 3) + 1))) == 0))
+			inc_left = 1;
+	}
+
+	bin = biari_decode_symbol(dep_dp, ctx->cbp_contexts[0] + (inc_left + inc_up));
+	cbp += bin;
+
+	// ---- luma bit 1: up neighbour is bit 3 of the MB above, left neighbour is bit 0 ----
+	inc_up = 0;
+	if ((mb_above != NULL) && (mb_above->mb_type != IPCM) && ((mb_above->cbp & 8) == 0))
+		inc_up = 2;
+	inc_left = ((cbp & 1) == 0) ? 1 : 0;
+
+	bin = biari_decode_symbol(dep_dp, ctx->cbp_contexts[0] + (inc_left + inc_up));
+	cbp += (bin << 1);
+
+	// ---- luma bit 2: up neighbour is bit 0, left neighbour comes from the lookup ----
+	inc_up = ((cbp & 1) == 0) ? 2 : 0;
+
+	inc_left = 0;
+	p_Vid->getNeighbourNPLumaNB(currMB, 8, &left_blk);
+	if (left_blk.available)
+	{
+		mb_beside = &p_Vid->mb_data[left_blk.mb_addr];
+		if ((mb_beside->mb_type != IPCM) && ((mb_beside->cbp & (1 << (2 * (left_blk.y >> 3) + 1))) == 0))
+			inc_left = 1;
+	}
+
+	bin = biari_decode_symbol(dep_dp, ctx->cbp_contexts[0] + (inc_left + inc_up));
+	cbp += (bin << 2);
+
+	// ---- luma bit 3: both neighbours lie inside the current MB ----
+	inc_up = ((cbp & 2) == 0) ? 2 : 0;
+	inc_left = ((cbp & 4) == 0) ? 1 : 0;
+
+	bin = biari_decode_symbol(dep_dp, ctx->cbp_contexts[0] + (inc_left + inc_up));
+	cbp += (bin << 3);
+
+	// ---- chroma part ----
+	if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444))
+	{
+		// BinIdx 0: a neighbour counts when it is IPCM or has any chroma bits set
+		mb_above = currMB->mb_up;
+		inc_up = 0;
+		if ((mb_above != NULL) && ((mb_above->mb_type == IPCM) || (mb_above->cbp > 15)))
+			inc_up = 2;
+
+		mb_beside = currMB->mb_left;
+		inc_left = 0;
+		if ((mb_beside != NULL) && ((mb_beside->mb_type == IPCM) || (mb_beside->cbp > 15)))
+			inc_left = 1;
+
+		bin = biari_decode_symbol(dep_dp, ctx->cbp_contexts[1] + (inc_left + inc_up));
+
+		if (bin)
+		{
+			// BinIdx 1: a neighbour counts when it is IPCM or its chroma cbp equals 2
+			mb_above = currMB->mb_up;
+			inc_up = 0;
+			if ((mb_above != NULL) && ((mb_above->mb_type == IPCM) || ((mb_above->cbp >> 4) == 2)))
+				inc_up = 2;
+
+			mb_beside = currMB->mb_left;
+			inc_left = 0;
+			if ((mb_beside != NULL) && ((mb_beside->mb_type == IPCM) || ((mb_beside->cbp >> 4) == 2)))
+				inc_left = 1;
+
+			bin = biari_decode_symbol(dep_dp, ctx->cbp_contexts[2] + (inc_left + inc_up));
+			cbp += (16 << bin); // adds 16 for bin 0, 32 for bin 1
+		}
+	}
+
+	if (cbp == 0)
+		currSlice->last_dquant = 0;
+
+	return cbp;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Arithmetically decode the chroma intra prediction mode of a
+*    given MB.
+*
+*    The first-bin context counts the upper and left macroblocks that
+*    exist, are not IPCM and have a non-zero c_ipred_mode. When the
+*    first bin is set, the mode is the result of unary_bin_max_decode()
+*    (context cipr_contexts + 3) plus one; otherwise it is 0.
+************************************************************************
+*/
+char readCIPredMode_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp)
+{
+	TextureInfoContexts *ctx = currMB->p_Slice->tex_ctx;
+	Macroblock *nb_up = currMB->mb_up;
+	Macroblock *nb_left = currMB->mb_left;
+	int ctx_inc = 0;
+	int mode;
+
+	if ((nb_up != NULL) && (nb_up->c_ipred_mode != 0) && (nb_up->mb_type != IPCM))
+		++ctx_inc;
+	if ((nb_left != NULL) && (nb_left->c_ipred_mode != 0) && (nb_left->mb_type != IPCM))
+		++ctx_inc;
+
+	mode = biari_decode_symbol(dep_dp, ctx->cipr_contexts + ctx_inc);
+	if (mode != 0)
+		mode = unary_bin_max_decode(dep_dp, ctx->cipr_contexts + 3, 0, 1) + 1;
+
+	return mode;
+}
+
+static const byte maxpos       [] = {15, 14, 63, 31, 31, 15,  3, 14,  7, 15, 15, 14, 63, 31, 31, 15, 15, 14, 63, 31, 31, 15}; // 22 entries; presumably indexed by block type like type2ctx_bcbp, giving the highest coefficient position - TODO confirm
+static const byte c1isdc       [] = { 1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1}; // 22 entries; presumably 1 when the first coefficient of that block type is a DC coefficient - TODO confirm
+static const byte type2ctx_bcbp[] = { 0,  1,  2,  3,  3,  4,  5,  6,  5,  5, 10, 11, 12, 13, 13, 14, 16, 17, 18, 19, 19, 20}; // indexed by block type; selects the row of tex_ctx->bcbp_contexts used for the coded-block bit
+static const byte type2ctx_map [] = { 0,  1,  2,  3,  4,  5,  6,  7,  6,  6, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}; // 8 (original note; meaning not visible here - presumably a context-set count)
+static const byte type2ctx_last[] = { 0,  1,  2,  3,  4,  5,  6,  7,  6,  6, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}; // 8 (same values as type2ctx_map)
+static const byte type2ctx_one [] = { 0,  1,  2,  3,  3,  4,  5,  6,  5,  5, 10, 11, 12, 13, 13, 14, 16, 17, 18, 19, 19, 20}; // 7 (same values as type2ctx_bcbp)
+static const byte type2ctx_abs [] = { 0,  1,  2,  3,  3,  4,  5,  6,  5,  5, 10, 11, 12, 13, 13, 14, 16, 17, 18, 19, 19, 20}; // 7 (same values as type2ctx_bcbp)
+static const byte max_c2       [] = { 4,  4,  4,  4,  4,  4,  3,  4,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4}; // 9 (original note); 22 entries, presumably a per-type upper bound used by the level decoder - TODO confirm
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Read CBP4-BIT
+*
+*    Decodes the coded-block bit for the block of the given type at
+*    (currMB->subblock_x, currMB->subblock_y) and records it in
+*    currMB->cbp_bits (and, for the 8x8 types, cbp_bits_8x8).
+*
+*    The context is 2 * upper_bit + left_bit, where each neighbour bit
+*    defaults to 1 for an intra block and 0 otherwise, is 1 for an IPCM
+*    neighbour, and is otherwise read from the neighbour's stored bits.
+*
+*    Three cases are distinguished:
+*      - chroma_format_idc != YUV444, or IS_INDEPENDENT(p_Vid): all
+*        neighbour bits come from cbp_bits[0]; for LUMA_8x8 no bin is
+*        decoded and the bit is taken as 1.
+*      - otherwise: a bin is always decoded and neighbour bits come
+*        from the per-plane cbp_bits / cbp_bits_8x8 entries
+*        (index 0, 1, 2 for the luma, CB_* and CR_* types).
+*
+* \param currMB   macroblock being decoded
+* \param dep_dp   decoding environment the bin is read from
+* \param type     block type; also indexes type2ctx_bcbp
+* \return         the coded-block bit (0 or 1 as produced by the decoder,
+*                 or the preset 1 when no bin is read)
+************************************************************************
+*/
+static int read_and_store_CBP_block_bit_444(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int type)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	TextureInfoContexts *tex_ctx = currSlice->tex_ctx;
+
+	int y_ac        = (type==LUMA_16AC || type==LUMA_8x8 || type==LUMA_8x4 || type==LUMA_4x8 || type==LUMA_4x4
+		|| type==CB_16AC || type==CB_8x8 || type==CB_8x4 || type==CB_4x8 || type==CB_4x4
+		|| type==CR_16AC || type==CR_8x8 || type==CR_8x4 || type==CR_4x8 || type==CR_4x4); // CB_*/CR_* types are classified together with the luma ones
+	int y_dc        = (type==LUMA_16DC || type==CB_16DC || type==CR_16DC); 
+	int u_ac        = (type==CHROMA_AC && !currMB->is_v_block);
+	int v_ac        = (type==CHROMA_AC &&  currMB->is_v_block);
+	int chroma_dc   = (type==CHROMA_DC || type==CHROMA_DC_2x4 || type==CHROMA_DC_4x4);
+	int u_dc        = (chroma_dc && !currMB->is_v_block);
+	int v_dc        = (chroma_dc &&  currMB->is_v_block);
+	int j           = (y_ac || u_ac || v_ac ? currMB->subblock_y : 0); // DC types use position (0,0)
+	int i           = (y_ac || u_ac || v_ac ? currMB->subblock_x : 0);
+	int bit         = (y_dc ? 0 : y_ac ? 1 : u_dc ? 17 : v_dc ? 18 : u_ac ? 19 : 35); // base bit index of this block class inside cbp_bits
+	int default_bit = (currMB->is_intra_block ? 1 : 0);
+	int upper_bit   = default_bit;
+	int left_bit    = default_bit;
+	int cbp_bit     = 1;  // always one for 8x8 mode
+	int ctx;
+	int bit_pos_a   = 0;
+	int bit_pos_b   = 0;
+
+	PixelPos block_a, block_b; // a: left neighbour, b: upper neighbour
+	if (y_ac)
+	{
+		get4x4NeighbourLuma(currMB, i - 1, j    , &block_a);
+		get4x4NeighbourLuma(currMB, i    , j - 1,  &block_b);
+		if (block_a.available)
+			bit_pos_a = 4*block_a.y + block_a.x;
+		if (block_b.available)
+			bit_pos_b = 4*block_b.y + block_b.x;
+	}
+	else if (y_dc)
+	{
+		get4x4NeighbourLuma(currMB, i - 1, j    , &block_a);
+		get4x4NeighbourLuma(currMB, i    , j - 1, &block_b);
+	}
+	else if (u_ac||v_ac)
+	{
+		get4x4Neighbour(currMB, i - 1, j    , p_Vid->mb_size[IS_CHROMA], &block_a);
+		get4x4Neighbour(currMB, i    , j - 1, p_Vid->mb_size[IS_CHROMA], &block_b);
+		if (block_a.available)
+			bit_pos_a = 4*block_a.y + block_a.x;
+		if (block_b.available)
+			bit_pos_b = 4*block_b.y + block_b.x;
+	}
+	else
+	{
+		get4x4Neighbour(currMB, i - 1, j    , p_Vid->mb_size[IS_CHROMA], &block_a);
+		get4x4Neighbour(currMB, i    , j - 1, p_Vid->mb_size[IS_CHROMA], &block_b);
+	}
+
+	if (dec_picture->chroma_format_idc!=YUV444)
+	{
+		if (type!=LUMA_8x8) // for LUMA_8x8 no bin is read; cbp_bit keeps its preset 1
+		{
+			//--- get bits from neighboring blocks ---
+			if (block_b.available)
+			{
+				if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+					upper_bit=1;
+				else
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0], bit + bit_pos_b);
+			}
+
+			if (block_a.available)
+			{
+				if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+					left_bit=1;
+				else
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],bit + bit_pos_a);
+			}
+
+
+			ctx = 2 * upper_bit + left_bit;     
+			//===== decode symbol =====
+			cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[type]] + ctx);
+		}
+	}
+	else if( IS_INDEPENDENT(p_Vid) )
+	{
+		if (type!=LUMA_8x8) // same handling as the non-4:4:4 branch above
+		{
+			//--- get bits from neighbouring blocks ---
+			if (block_b.available)
+			{
+				if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+					upper_bit = 1;
+				else
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0],bit+bit_pos_b);
+			}
+
+
+			if (block_a.available)
+			{
+				if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+					left_bit = 1;
+				else
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],bit+bit_pos_a);
+			}
+
+
+			ctx = 2 * upper_bit + left_bit;     
+			//===== decode symbol =====
+			cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[type]] + ctx);
+		}
+	}
+	else { // YUV444 without IS_INDEPENDENT: a bin is decoded for every type, including LUMA_8x8
+		if (block_b.available)
+		{
+			if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+				upper_bit=1;
+			else
+			{
+				if(type==LUMA_8x8)
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits_8x8[0], bit + bit_pos_b);
+				else if (type==CB_8x8)
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits_8x8[1], bit + bit_pos_b);
+				else if (type==CR_8x8)
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits_8x8[2], bit + bit_pos_b);
+				else if ((type==CB_4x4)||(type==CB_4x8)||(type==CB_8x4)||(type==CB_16AC)||(type==CB_16DC))
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[1],bit+bit_pos_b);
+				else if ((type==CR_4x4)||(type==CR_4x8)||(type==CR_8x4)||(type==CR_16AC)||(type==CR_16DC))
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[2],bit+bit_pos_b);
+				else
+					upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0],bit+bit_pos_b);
+			}
+		}
+
+
+		if (block_a.available)
+		{
+			if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+				left_bit=1;
+			else
+			{
+				if(type==LUMA_8x8)
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits_8x8[0],bit+bit_pos_a);
+				else if (type==CB_8x8)
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits_8x8[1],bit+bit_pos_a);
+				else if (type==CR_8x8)
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits_8x8[2],bit+bit_pos_a);
+				else if ((type==CB_4x4)||(type==CB_4x8)||(type==CB_8x4)||(type==CB_16AC)||(type==CB_16DC))
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[1],bit+bit_pos_a);
+				else if ((type==CR_4x4)||(type==CR_4x8)||(type==CR_8x4)||(type==CR_16AC)||(type==CR_16DC))
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[2],bit+bit_pos_a);
+				else
+					left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],bit+bit_pos_a);
+			}
+		}
+
+		ctx = 2 * upper_bit + left_bit;
+		//===== decode symbol =====
+		cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[type]] + ctx);
+	}
+
+	//--- set bits for current block ---
+	bit = (y_dc ? 0 : y_ac ? 1 + j + (i >> 2) : u_dc ? 17 : v_dc ? 18 : u_ac ? 19 + j + (i >> 2) : 35 + j + (i >> 2)); 
+
+	if (cbp_bit) // masks below: 0x33 marks a 2x2 group of 4x4 bits, 0x03 two horizontally adjacent bits, 0x11 two vertically adjacent bits, 0x01 a single bit
+	{  
+		if (type==LUMA_8x8) 
+		{      
+			currMB->cbp_bits[0] |= ((int64) 0x33 << bit   );
+
+			if (dec_picture->chroma_format_idc==YUV444)
+			{
+				currMB->cbp_bits_8x8[0]   |= ((int64) 0x33 << bit   );
+			}
+		}
+		else if (type==CB_8x8)
+		{
+			currMB->cbp_bits_8x8[1]   |= ((int64) 0x33 << bit   );      
+			currMB->cbp_bits[1]   |= ((int64) 0x33 << bit   );
+		}
+		else if (type==CR_8x8)
+		{
+			currMB->cbp_bits_8x8[2]   |= ((int64) 0x33 << bit   );      
+			currMB->cbp_bits[2]   |= ((int64) 0x33 << bit   );
+		}
+		else if (type==LUMA_8x4)
+		{
+			currMB->cbp_bits[0]   |= ((int64) 0x03 << bit   );
+		}
+		else if (type==CB_8x4)
+		{
+			currMB->cbp_bits[1]   |= ((int64) 0x03 << bit   );
+		}
+		else if (type==CR_8x4)
+		{
+			currMB->cbp_bits[2]   |= ((int64) 0x03 << bit   );
+		}
+		else if (type==LUMA_4x8)
+		{
+			currMB->cbp_bits[0]   |= ((int64) 0x11<< bit   );
+		}
+		else if (type==CB_4x8)
+		{
+			currMB->cbp_bits[1]   |= ((int64)0x11<< bit   );
+		}
+		else if (type==CR_4x8)
+		{
+			currMB->cbp_bits[2]   |= ((int64)0x11<< bit   );
+		}
+		else if ((type==CB_4x4)||(type==CB_16AC)||(type==CB_16DC))
+		{
+			currMB->cbp_bits[1]   |= ((int64)0x01<<bit);
+		}
+		else if ((type==CR_4x4)||(type==CR_16AC)||(type==CR_16DC))
+		{
+			currMB->cbp_bits[2]   |= ((int64)0x01<<bit);
+		}
+		else
+		{
+			currMB->cbp_bits[0]   |= ((int64)0x01<<bit); // remaining types (luma 4x4/16AC/16DC and the CHROMA_* types) share plane 0
+		}
+	}
+	return cbp_bit;
+}
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Read CBP4-BIT
+*
+*    Decodes the coded-block flag of one block of the given type and
+*    records it in currMB->cbp_bits[0]. This is the variant installed by
+*    set_read_and_store_CBP() when chroma_format_idc is not YUV444.
+*
+*    For every type except LUMA_8x8 the flag is decoded with
+*    biari_decode_symbol() using context index 2 * upper_bit + left_bit,
+*    where the two bits are read from the cbp_bits[0] of the neighbouring
+*    macroblocks (forced to 1 when the neighbour is IPCM). For LUMA_8x8
+*    nothing is decoded and the flag is always 1.
+*
+*    Bit positions used in cbp_bits[0] by this function:
+*      0                  LUMA_16DC
+*      1 + j + (i >> 2)   luma blocks (j = subblock_y, i = subblock_x)
+*      17 / 18            chroma DC, selected by is_v_block (0 / 1)
+*      19 + ...           CHROMA_AC when is_v_block is 0
+*      35 + ...           CHROMA_AC when is_v_block is 1
+*      35                 all remaining types (final else branch)
+*
+* \param currMB  macroblock being decoded; its cbp_bits[0] is updated
+* \param dep_dp  arithmetic decoding environment to read from
+* \param type    block type selecting the branch and the context set
+* \return
+*    the coded-block flag (always 1 for LUMA_8x8)
+************************************************************************
+*/
+static int read_and_store_CBP_block_bit_normal(Macroblock *currMB, DecodingEnvironmentPtr  dep_dp, int type)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	TextureInfoContexts *tex_ctx = currSlice->tex_ctx;
+	int cbp_bit     = 1;  // always one for 8x8 mode
+
+	if (type==LUMA_16DC)
+	{
+
+		// defaults used when a neighbour is unavailable (hard-coded to 1 in this branch)
+		int upper_bit   = 1;
+		int left_bit    = 1;
+		int ctx;
+
+		PixelPos block_a, block_b;
+
+		//--- get bits from neighboring blocks ---
+		// block_b feeds upper_bit, block_a feeds left_bit; only bit 0 (the DC flag) is consulted
+		p_Vid->getNeighbour0X(currMB,  -1, p_Vid->mb_size[IS_LUMA], &block_b);
+		if (block_b.available)
+		{
+			if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+				upper_bit=1;
+			else
+				upper_bit = (int)p_Vid->mb_data[block_b.mb_addr].cbp_bits[0]&1;
+		}
+
+		p_Vid->getNeighbourX0(currMB, -1, p_Vid->mb_size[IS_LUMA], &block_a);
+		if (block_a.available)
+		{
+			if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+				left_bit=1;
+			else
+				left_bit = (int)p_Vid->mb_data[block_a.mb_addr].cbp_bits[0]&1;
+		}
+
+		ctx = 2 * upper_bit + left_bit;     
+		//===== decode symbol =====
+		cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[LUMA_16DC]] + ctx);
+
+		//--- set bits for current block ---
+
+		if (cbp_bit)
+		{  
+			currMB->cbp_bits[0]   |= 0x01;
+		}
+	}
+	else if (type == LUMA_8x8) 
+	{
+		// no symbol is decoded for 8x8 luma blocks: cbp_bit keeps its initial value 1
+		int j           = currMB->subblock_y;
+		int i           = currMB->subblock_x;
+
+		//--- set bits for current block ---
+		int bit = 1 + j + (i >> 2); 
+
+		// NOTE(review): presumably ORs (0x33 << bit) into cbp_bits[0], i.e. marks a 2x2 group
+		// of 4x4 flags, by analogy with the "|= (int64) 0x33 << bit" statements of the 444
+		// variant -- confirm against the definition of or_bits()
+		or_bits(&currMB->cbp_bits[0], 0x33, bit);
+	}
+	else if (type <= LUMA_4x4) // type==LUMA_16AC ||  type==LUMA_8x4 || type==LUMA_4x8 || type==LUMA_4x4)
+	{
+		int j           = currMB->subblock_y;
+		int i           = currMB->subblock_x;
+		int bit;
+		// value assumed for a neighbour that is not available: 1 for intra blocks, 0 otherwise
+		int default_bit = (currMB->is_intra_block ? 1 : 0);
+		int upper_bit   = default_bit;
+		int left_bit    = default_bit;
+		int ctx;
+
+		//--- get bits from neighboring blocks ---
+		PixelPos block_a, block_b;
+		p_Vid->getNeighbourPXLumaNB_NoPos(currMB, j-1, &block_b);
+		if (block_b.available)
+		{
+			// row part comes from the neighbour position, column part from the current block's i
+			int bit_pos_b = (block_b.y&((short)~3)) + (i>>2);   
+			if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+				upper_bit=1;
+			else
+				upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0], 1 + bit_pos_b);
+		}
+
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, i-1, j, &block_a);
+		if (block_a.available)
+		{
+			int bit_pos_a = (block_a.y&((short)~3)) + (block_a.x>>2);
+			if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+				left_bit=1;
+			else
+				left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],1 + bit_pos_a);
+		}
+
+		ctx = 2 * upper_bit + left_bit;     
+		//===== decode symbol =====
+		cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[type]] + ctx);
+
+
+		//--- set bits for current block ---
+		bit = 1 + j + (i >> 2); 
+
+		if (cbp_bit)
+		{  
+			// the mask depends on the block shape: 0x03 for 8x4, 0x11 for 4x8, 0x01 otherwise
+			if (type==LUMA_8x4)
+			{
+				or_bits_low(&currMB->cbp_bits[0], 0x03, bit);
+			}
+			else if (type==LUMA_4x8)
+			{
+				or_bits_low(&currMB->cbp_bits[0], 0x011, bit);
+			}
+			else
+			{
+				or_bits_low(&currMB->cbp_bits[0], 0x01, bit);
+			}
+		}
+	}
+	else if (type == CHROMA_AC)
+	{
+		// u_ac is set when the block is not flagged as a V block
+		int u_ac        = !currMB->is_v_block;
+
+		int default_bit = (currMB->is_intra_block ? 1 : 0);
+		int upper_bit   = default_bit;
+		int left_bit    = default_bit;
+		int ctx;
+
+		PixelPos block_a, block_b;
+
+		int j           = currMB->subblock_y;
+		int i           = currMB->subblock_x;
+		// base bit of the AC flags: 19 when u_ac is set, 35 otherwise
+		int bit         = (u_ac ? 19 : 35);
+		
+		p_Vid->getNeighbourXP_NoPos(currMB, i - 1, j    , p_Vid->mb_size[IS_CHROMA], &block_a);
+		p_Vid->getNeighbourPX_NoPos(currMB, i    , j - 1, p_Vid->mb_size[IS_CHROMA], &block_b);
+
+		//--- get bits from neighboring blocks ---
+		if (block_b.available)
+		{
+			if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+				upper_bit=1;
+			else
+				upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0], bit + (block_b.y&((short)~3)) + (block_b.x>>2));
+		}
+
+		if (block_a.available)
+		{
+			if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+				left_bit=1;
+			else
+				left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],bit + (block_a.y&((short)~3)) + (block_a.x>>2));
+		}
+
+		ctx = 2 * upper_bit + left_bit;     
+		//===== decode symbol =====
+		cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[CHROMA_AC]] + ctx);
+
+
+		//--- set bits for current block ---
+		if (cbp_bit)
+		{  
+			or_bits(&currMB->cbp_bits[0], 0x01, bit + j + (i >> 2));
+		}
+
+	}
+	else if (type <= CHROMA_DC_4x4)
+	{
+		// reached for the types not handled above whose value is <= CHROMA_DC_4x4
+		// (presumably the chroma DC types -- confirm against the block-type enum)
+		int v_dc        = currMB->is_v_block;
+		int default_bit = (currMB->is_intra_block ? 1 : 0);
+		int upper_bit   = default_bit;
+		int left_bit    = default_bit;
+		int ctx;
+
+
+		PixelPos block_a, block_b;
+
+		// a single flag per component: bit 18 when is_v_block is set, bit 17 otherwise
+		int bit         = (v_dc ? 18 : 17);
+		p_Vid->getNeighbourLeft(currMB, p_Vid->mb_size[IS_CHROMA], &block_a);
+		p_Vid->getNeighbourUp(currMB, p_Vid->mb_size[IS_CHROMA], &block_b);
+		//--- get bits from neighboring blocks ---
+		if (block_b.available)
+		{
+			if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+				upper_bit=1;
+			else
+				upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0], bit);
+		}
+
+		if (block_a.available)
+		{
+			if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+				left_bit=1;
+			else
+				left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],bit);
+		}
+
+		ctx = 2 * upper_bit + left_bit;     
+		//===== decode symbol =====
+		cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[type]] + ctx);
+
+
+		//--- set bits for current block ---
+		if (cbp_bit)
+		{  
+			or_bits(&currMB->cbp_bits[0], 0x01, bit);
+		}
+
+
+	}
+	else
+	{
+		// every remaining type (value above CHROMA_DC_4x4): a single flag at fixed bit 35
+		// NOTE(review): bit 35 is also the base of the V chroma AC flags used in the
+		// CHROMA_AC branch -- confirm that the two uses cannot occur for the same macroblock
+		int default_bit = (currMB->is_intra_block ? 1 : 0);
+		int upper_bit   = default_bit;
+		int left_bit    = default_bit;
+		int ctx;
+
+
+		PixelPos block_a, block_b;
+
+		p_Vid->getNeighbourLeft(currMB, p_Vid->mb_size[IS_CHROMA], &block_a);
+		p_Vid->getNeighbourUp(currMB, p_Vid->mb_size[IS_CHROMA], &block_b);
+		//--- get bits from neighboring blocks ---
+		if (block_b.available)
+		{
+			if(p_Vid->mb_data[block_b.mb_addr].mb_type==IPCM)
+				upper_bit=1;
+			else
+				upper_bit = get_bit(p_Vid->mb_data[block_b.mb_addr].cbp_bits[0], 35);
+		}
+
+		if (block_a.available)
+		{
+			if(p_Vid->mb_data[block_a.mb_addr].mb_type==IPCM)
+				left_bit=1;
+			else
+				left_bit = get_bit(p_Vid->mb_data[block_a.mb_addr].cbp_bits[0],35);
+		}
+
+		ctx = 2 * upper_bit + left_bit;     
+		//===== decode symbol =====
+		cbp_bit = biari_decode_symbol (dep_dp, tex_ctx->bcbp_contexts[type2ctx_bcbp[type]] + ctx);
+
+
+		//--- set bits for current block ---
+		if (cbp_bit)
+		{  
+			or_bits(&currMB->cbp_bits[0], 0x01, 35);
+		}
+
+
+	}
+	return cbp_bit;
+}
+
+
+/*
+ * Installs the coded-block-flag reader on the macroblock that *currMB points to:
+ * the 4:4:4 variant when chroma_format_idc equals YUV444, the normal variant
+ * for every other chroma format.
+ */
+void set_read_and_store_CBP(Macroblock **currMB, int chroma_format_idc)
+{
+	Macroblock *mb = *currMB;
+
+	mb->read_and_store_CBP_block_bit = (chroma_format_idc == YUV444)
+		? read_and_store_CBP_block_bit_444
+		: read_and_store_CBP_block_bit_normal;
+}
+
+
+
+
+
+//===== position -> ctx for MAP =====
+// Each byte table below maps a coefficient scan position to a context offset; the
+// offset is added to the base of the significance-map context array in
+// read_significance_map(). The pointer tables pos2ctx_map / pos2ctx_map_int /
+// pos2ctx_last are indexed by block type and hold 22 entries each.
+//--- zig-zag scan ----
+static const byte  pos2ctx_map8x8 [] = { 0,  1,  2,  3,  4,  5,  5,  4,  4,  3,  3,  4,  4,  4,  5,  5,
+4,  4,  4,  4,  3,  3,  6,  7,  7,  7,  8,  9, 10,  9,  8,  7,
+7,  6, 11, 12, 13, 11,  6,  7,  8,  9, 14, 10,  9,  8,  6, 11,
+12, 13, 11,  6,  9, 14, 10,  9, 11, 12, 13, 11 ,14, 10, 12, 14}; // 15 CTX
+static const byte  pos2ctx_map8x4 [] = { 0,  1,  2,  3,  4,  5,  7,  8,  9, 10, 11,  9,  8,  6,  7,  8,
+9, 10, 11,  9,  8,  6, 12,  8,  9, 10, 11,  9, 13, 13, 14, 14}; // 15 CTX
+static const byte  pos2ctx_map4x4 [] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 14}; // 15 CTX
+// NOTE(review): the two chroma tables below only contain the values 0..2 although their
+// trailing comments say "15 CTX" -- the comments look copied from the luma tables
+static const byte  pos2ctx_map2x4c[] = { 0,  0,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2}; // 15 CTX
+static const byte  pos2ctx_map4x4c[] = { 0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2}; // 15 CTX
+// per-block-type table used when the macroblock is decoded in frame mode
+static const byte* pos2ctx_map    [] = {pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map8x8, pos2ctx_map8x4,
+pos2ctx_map8x4, pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map4x4,
+pos2ctx_map2x4c, pos2ctx_map4x4c, 
+pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map8x8,pos2ctx_map8x4,
+pos2ctx_map8x4, pos2ctx_map4x4,
+pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map8x8,pos2ctx_map8x4,
+pos2ctx_map8x4,pos2ctx_map4x4};
+//--- interlace scan ----
+//taken from ABT
+static const byte  pos2ctx_map8x8i[] = { 0,  1,  1,  2,  2,  3,  3,  4,  5,  6,  7,  7,  7,  8,  4,  5,
+6,  9, 10, 10,  8, 11, 12, 11,  9,  9, 10, 10,  8, 11, 12, 11,
+9,  9, 10, 10,  8, 11, 12, 11,  9,  9, 10, 10,  8, 13, 13,  9,
+9, 10, 10,  8, 13, 13,  9,  9, 10, 10, 14, 14, 14, 14, 14, 14}; // 15 CTX
+static const byte  pos2ctx_map8x4i[] = { 0,  1,  2,  3,  4,  5,  6,  3,  4,  5,  6,  3,  4,  7,  6,  8,
+9,  7,  6,  8,  9, 10, 11, 12, 12, 10, 11, 13, 13, 14, 14, 14}; // 15 CTX
+static const byte  pos2ctx_map4x8i[] = { 0,  1,  1,  1,  2,  3,  3,  4,  4,  4,  5,  6,  2,  7,  7,  8,
+8,  8,  5,  6,  9, 10, 10, 11, 11, 11, 12, 13, 13, 14, 14, 14}; // 15 CTX
+// per-block-type table used when the picture structure is not FRAME or the macroblock
+// is a field macroblock (see the "fld" flag in read_significance_map())
+// NOTE(review): entry 4 uses pos2ctx_map4x8i, but the corresponding entries 14 and 20 of
+// the two later groups use pos2ctx_map8x4i -- confirm that this asymmetry is intended
+static const byte* pos2ctx_map_int[] = {pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map8x8i,pos2ctx_map8x4i,
+pos2ctx_map4x8i,pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map4x4,
+pos2ctx_map2x4c, pos2ctx_map4x4c,
+pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map8x8i,pos2ctx_map8x4i,
+pos2ctx_map8x4i,pos2ctx_map4x4,
+pos2ctx_map4x4, pos2ctx_map4x4, pos2ctx_map8x8i,pos2ctx_map8x4i,
+pos2ctx_map8x4i,pos2ctx_map4x4};
+
+//===== position -> ctx for LAST =====
+// Same idea as the MAP tables, but for the "last significant coefficient" flag.
+// read_significance_map() uses this single set for both frame and field decoding.
+static const byte  pos2ctx_last8x8 [] = { 0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,
+5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8}; //  9 CTX
+static const byte  pos2ctx_last8x4 [] = { 0,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,
+3,  3,  3,  3,  4,  4,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8}; //  9 CTX
+
+static const byte  pos2ctx_last4x4 [] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15}; // 15 CTX
+static const byte  pos2ctx_last2x4c[] = { 0,  0,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2}; // 15 CTX
+static const byte  pos2ctx_last4x4c[] = { 0,  0,  0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2}; // 15 CTX
+static const byte* pos2ctx_last    [] = {pos2ctx_last4x4, pos2ctx_last4x4, pos2ctx_last8x8, pos2ctx_last8x4,
+pos2ctx_last8x4, pos2ctx_last4x4, pos2ctx_last4x4, pos2ctx_last4x4,
+pos2ctx_last2x4c, pos2ctx_last4x4c,
+pos2ctx_last4x4, pos2ctx_last4x4, pos2ctx_last8x8,pos2ctx_last8x4,
+pos2ctx_last8x4, pos2ctx_last4x4,
+pos2ctx_last4x4, pos2ctx_last4x4, pos2ctx_last8x8,pos2ctx_last8x4,
+pos2ctx_last8x4, pos2ctx_last4x4};
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Read Significance MAP
+*
+*    Decodes, for scan positions 0 .. maxpos[type], which coefficients of
+*    the block are non-zero. coeff[i] is set to 1 for a significant
+*    position and to 0 otherwise, so coeff must provide at least
+*    maxpos[type] + 1 entries.
+*
+* \param tex_ctx  texture contexts holding the map/last context arrays
+* \param currMB   current macroblock (selects frame or field contexts)
+* \param dep_dp   arithmetic decoding environment to read from
+* \param type     block type; indexes the pos2ctx_*, type2ctx_* and maxpos tables
+* \param coeff    output array receiving the 0/1 significance flags
+* \return
+*    number of significant coefficients
+************************************************************************
+*/
+
+#if defined(_DEBUG) || defined(_M_X64)
+static int read_significance_map(TextureInfoContexts *tex_ctx, const Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int type, int16_t coeff[])
+{
+	int   i;
+	int   coeff_ctr = 0;
+	int   i1        = maxpos[type];
+	const VideoParameters *p_Vid = currMB->p_Vid;
+
+	// field contexts are used for field pictures and for field macroblocks
+	int               fld       = ( p_Vid->structure!=FRAME || currMB->mb_field );
+	const byte *pos2ctx_Map = (fld) ? pos2ctx_map_int[type] : pos2ctx_map[type];
+	const byte *last = pos2ctx_last[type];
+
+	BiContextTypePtr  map_ctx   = tex_ctx->map_contexts[fld][type2ctx_map [type]];
+	BiContextTypePtr  last_ctx  = tex_ctx->last_contexts[fld][type2ctx_last[type]];
+
+	// block types flagged 0 in c1isdc skip the first entry of the position tables
+	if (!c1isdc[type])
+	{
+		pos2ctx_Map++;
+		last++;
+	}
+
+	for (i=0; i < i1; ++i) // if last coeff is reached, it has to be significant
+	{
+		//--- read significance symbol ---
+		if (biari_decode_symbol   (dep_dp, map_ctx + pos2ctx_Map[i]))
+		{
+			coeff[i] = 1;
+			++coeff_ctr;
+			//--- read last coefficient symbol ---
+			if (biari_decode_symbol (dep_dp, last_ctx + last[i]))
+			{
+				// this was the last significant coefficient: clear positions i+1 .. i1
+				while (i++ < i1)
+				{
+					coeff[i] = 0;
+				}
+				return coeff_ctr;
+			}
+		}
+		else
+		{
+			coeff[i] = 0;
+		}
+	}
+	//--- last coefficient must be significant if no last symbol was received ---
+	coeff[i] = 1;
+
+
+	return coeff_ctr+1;
+}
+#endif
+/*!
+************************************************************************
+* \brief
+*    Read Levels
+************************************************************************
+*/
+#if defined(_DEBUG) || defined(_M_X64)
+/*!
+************************************************************************
+* \brief
+*    Exp-Golomb decoding for LEVELS
+*
+*    Decodes a value coded as a truncated unary prefix (all bins use the
+*    same context ctx) followed, when the prefix is exhausted, by a 0th
+*    order Exp-Golomb suffix of equiprobable bins.
+*
+*    With n = number of consecutive 1-bins read (at most 13):
+*      n <= 12 : the terminating 0-bin was seen, the result is n
+*      n == 13 : the result is 13 + exp_golomb_decode_eq_prob(dep_dp, 0)
+*
+*    The unary loop counts from 1 so that the bin strings "0" and "10"
+*    yield the distinct values 0 and 1, and so that 12 is reachable just
+*    below the escape value 13.
+*
+* \param dep_dp  arithmetic decoding environment to read from
+* \param ctx     context used for every bin of the unary prefix
+* \return
+*    the decoded value
+***********************************************************************
+*/
+unsigned int exp_golomb_decode_eq_prob( DecodingEnvironmentPtr dep_dp, int k);
+static unsigned int unary_exp_golomb_level_decode( DecodingEnvironmentPtr dep_dp,
+																									BiContextTypePtr ctx)
+{
+	// number of 1-bins after which the Exp-Golomb suffix takes over
+	const unsigned int exp_start = 13;
+	unsigned int symbol;
+
+	if (biari_decode_symbol(dep_dp, ctx) == 0)
+		return 0;
+
+	// one 1-bin has been consumed already, hence the count starts at 1
+	for (symbol = 1; symbol < exp_start; symbol++)
+	{
+		if (!biari_decode_symbol(dep_dp, ctx))
+			return symbol;
+	}
+	return exp_golomb_decode_eq_prob(dep_dp, 0) + exp_start;
+}
+
+/*
+ * Turns the 0/1 significance flags in coeff[] into signed levels.
+ *
+ * Positions are visited from maxpos[type] down to 0. For every non-zero entry one
+ * bin is decoded with the "one" contexts and added to the entry; if the entry
+ * becomes 2, the remainder of the magnitude is decoded with the "abs" contexts.
+ * A final equiprobable bin selects the sign. Two small state counters pick the
+ * context inside each set and are advanced through saturating lookup tables.
+ */
+static void read_significant_coefficients (TextureInfoContexts    *tex_ctx,
+																					 DecodingEnvironmentPtr  dep_dp,
+																					 int                     type,
+																					 int16_t                 coeff[])
+{
+	/* entry n holds n + 1, saturated at the last value of the table */
+	static const int plus_one_clip4[5] = { 1, 2, 3, 4, 4 };
+	static const int plus_one_clip3[4] = { 1, 2, 3, 3 };
+	BiContextType *ones_ctx = tex_ctx->one_contexts[type2ctx_one[type]];
+	BiContextType *abs_ctx  = tex_ctx->abs_contexts[type2ctx_abs[type]];
+	const int *next_abs_state = plus_one_clip3;
+	int ones_state = 1;
+	int abs_state  = 0;
+	int pos;
+
+	if (max_c2[type] == 4)
+	{
+		next_abs_state = plus_one_clip4;
+	}
+
+	for (pos = maxpos[type]; pos >= 0; --pos)
+	{
+		int16_t *level = &coeff[pos];
+
+		if (*level == 0)
+		{
+			continue;
+		}
+
+		*level += biari_decode_symbol (dep_dp, ones_ctx + ones_state);
+		if (*level == 2)
+		{
+			/* magnitude is at least 2: read the remainder and reset the "ones" state */
+			*level += unary_exp_golomb_level_decode (dep_dp, abs_ctx + abs_state);
+			abs_state  = next_abs_state[abs_state];
+			ones_state = 0;
+		}
+		else if (ones_state != 0)
+		{
+			ones_state = plus_one_clip4[ones_state];
+		}
+
+		/* sign bin */
+		if (biari_decode_symbol_eq_prob(dep_dp))
+		{
+			*level *= -1;
+		}
+	}
+}
+#else
+/* Declaration used when neither _DEBUG nor _M_X64 is defined; the definition is not in
+ * this part of the file (presumably the assembly implementation -- confirm).
+ * NOTE(review): this prototype declares "int coeff[]" while the C implementation in the
+ * other branch of this #if takes "int16_t coeff[]"; verify which element type the
+ * external implementation really expects. */
+void read_significant_coefficients (TextureInfoContexts    *tex_ctx,
+																		DecodingEnvironmentPtr  dep_dp,
+																		int                     type,
+																		int                     coeff[]);
+#endif
+
+/*!
+************************************************************************
+* \brief
+*    Read Block-Transform Coefficients
+*
+*    Returns the next (run, level) pair of the current block. When the
+*    slice's coeff_ctr is negative, the whole block is decoded first:
+*    coded-block flag, then significance map, then the levels. Subsequent
+*    calls walk through the slice's coeff[] buffer; a pair with level 0
+*    signals the end of the block.
+*
+* \param currMB   current macroblock
+* \param dep_dp   arithmetic decoding environment to read from
+* \param context  block type passed on to the CBP-bit/map/level readers
+* \return
+*    run (number of skipped zero coefficients) and level
+************************************************************************
+*/
+#if defined(_DEBUG) || defined(_M_X64)
+RunLevel readRunLevel_CABAC(Macroblock *currMB, DecodingEnvironmentPtr dep_dp, int context)
+{
+	Slice *slice = currMB->p_Slice;
+	RunLevel result;
+
+	if (slice->coeff_ctr < 0)
+	{
+		/* nothing buffered yet: decode the coded-block flag first */
+		slice->coeff_ctr = currMB->read_and_store_CBP_block_bit (currMB, dep_dp, context);
+		if (slice->coeff_ctr != 0)
+		{
+			/* block has coefficients: significance map, then the levels */
+			slice->coeff_ctr = read_significance_map (slice->tex_ctx, currMB, dep_dp, context, slice->coeff);
+			read_significant_coefficients (slice->tex_ctx, dep_dp, context, slice->coeff);
+		}
+	}
+
+	result.run = 0;
+
+	if (slice->coeff_ctr-- == 0)
+	{
+		/* end of block: the counter is now negative, so the next call decodes a new block */
+		slice->pos = 0;
+		result.level = 0;
+		return result;
+	}
+
+	/* skip zero coefficients, counting them as the run */
+	while (slice->coeff[slice->pos] == 0)
+	{
+		++slice->pos;
+		++result.run;
+	}
+	result.level = slice->coeff[slice->pos++];
+
+	/* rewind once the last coefficient of the block has been handed out */
+	if (slice->coeff_ctr == 0)
+	{
+		slice->pos = 0;
+	}
+	return result;
+}
+#endif
+/*!
+************************************************************************
+* \brief
+*    arideco_bits_read
+*
+*    Computes 8 * (*dep->Dcodestrm_len) - dep->DbitsLeft, i.e. the byte
+*    position of the decoder expressed in bits minus the bits it still
+*    holds.
+************************************************************************
+*/
+static int arideco_bits_read(const DecodingEnvironmentPtr dep)
+{
+	const int bits_consumed = ((*dep->Dcodestrm_len) << 3) - dep->DbitsLeft;
+
+#if (2==TRACE)
+	fprintf(p_trace, "tmp: %d\n", bits_consumed);
+#endif
+
+	return bits_consumed;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Decodes a unary-binarized symbol with an upper limit. The first bin
+*    uses ctx, all further bins use ctx + ctx_offset. There is no
+*    terminating "0" for max_symbol: once max_symbol further 1-bins have
+*    been read, decoding stops and max_symbol + 1 is returned.
+*
+* \return
+*    the first bin if it is 0 or if max_symbol is 0; otherwise the number
+*    of bins read after the first one (plus one if the limit was hit)
+***********************************************************************
+*/
+static unsigned int unary_bin_max_decode(DecodingEnvironmentPtr dep_dp,
+																				 BiContextTypePtr ctx,
+																				 int ctx_offset,
+																				 unsigned int max_symbol)
+{
+	const unsigned int first_bin = biari_decode_symbol(dep_dp, ctx );
+	unsigned int count;
+
+	if (first_bin == 0 || max_symbol == 0)
+	{
+		return first_bin;
+	}
+
+	/* remaining bins share the second context model */
+	ctx += ctx_offset;
+
+	for (count = 1; ; ++count)
+	{
+		if (biari_decode_symbol(dep_dp, ctx) == 0)
+		{
+			return count;
+		}
+		if (count >= max_symbol)
+		{
+			/* limit reached with a 1-bin: no terminating zero follows */
+			return count + 1;
+		}
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Decodes an unbounded unary-binarized symbol. The first bin uses ctx,
+*    all further bins use ctx + ctx_offset.
+*
+* \return
+*    0 if the first bin is 0; otherwise the number of bins read after the
+*    first one, including the terminating 0-bin
+***********************************************************************
+*/
+static unsigned int unary_bin_decode(DecodingEnvironmentPtr dep_dp,
+																		 BiContextTypePtr ctx,
+																		 int ctx_offset)
+{
+	unsigned int count = 0;
+
+	if (biari_decode_symbol(dep_dp, ctx ) == 0)
+	{
+		return 0;
+	}
+
+	/* remaining bins share the second context model */
+	ctx += ctx_offset;
+	do
+	{
+		++count;
+	}
+	while (biari_decode_symbol(dep_dp, ctx) != 0);
+
+	return count;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    finding end of a slice in case this is not the end of a frame
+*
+*    When eos_bit is non-zero, one terminating bin is decoded from the
+*    partition that carries SE_MBTYPE and returned; otherwise 0 is
+*    returned without touching the bitstream.
+*
+* Historical note (StW, 8.7.02): it is unclear whether the "correction"
+* applied here fixes an off-by-one problem or introduces one in some
+* cases; with it the bit stream format works with CABAC again.
+************************************************************************
+*/
+int cabac_startcode_follows(Slice *currSlice, int eos_bit)
+{
+	unsigned int  bit = 0;
+
+	if (eos_bit)
+	{
+		DataPartition *dP = &(currSlice->partArr[assignSE2partition[currSlice->dp_mode][SE_MBTYPE]]);
+
+		bit = biari_decode_final (&(dP->de_cabac)); //GB
+
+#if TRACE
+		fprintf(p_trace, "@%-6d %-63s (%3d)\n",symbolCount++, "end_of_slice_flag", bit);
+		fflush(p_trace);
+#endif
+	}
+
+	return bit;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Exp Golomb binarization and decoding of a symbol
+*    with prob. of 0.5
+*
+*    Prefix: every 1-bin adds 2^k to the result and increments k, until a
+*    0-bin is read. Suffix: k further bins are read, most significant
+*    first, and OR-ed into the result.
+*
+*    NOTE(review): k is not bounded. A long run of 1-bins (malformed
+*    input) would push the shift count past the width of int -- consider
+*    adding a limit together with an error path.
+*
+* \param dep_dp  arithmetic decoding environment to read from
+* \param k       order of the Exp-Golomb code
+************************************************************************
+*/
+unsigned int exp_golomb_decode_eq_prob( DecodingEnvironmentPtr dep_dp, int k)
+{
+	unsigned int bin;
+	int prefix_value = 0;
+	int suffix_value = 0;
+
+	/* unary prefix */
+	for (;;)
+	{
+		bin = biari_decode_symbol_eq_prob(dep_dp);
+		if (bin == 0)
+		{
+			break;
+		}
+		prefix_value += (bin << k); // bin is guaranteed to be one
+		++k;
+	}
+
+	/* binary suffix of k bits */
+	while (k != 0)
+	{
+		--k;
+		if (biari_decode_symbol_eq_prob(dep_dp)==1)
+		{
+			suffix_value |= (1<<k);
+		}
+	}
+
+	return (unsigned int) (prefix_value + suffix_value);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Exp-Golomb decoding for Motion Vectors
+*
+*    Truncated unary prefix of up to 8 context-coded bins followed, when
+*    all of them are 1, by a 3rd order Exp-Golomb suffix. The context
+*    pointer advances after the first bin, again when the running count
+*    reaches 2, and once more when it reaches max_bin.
+*
+* \return
+*    0 .. 7 when a 0-bin terminates the prefix, otherwise
+*    8 + exp_golomb_decode_eq_prob(dep_dp, 3)
+***********************************************************************
+*/
+#if defined(_DEBUG) || defined(_M_X64)
+unsigned int unary_exp_golomb_mv_decode(DecodingEnvironmentPtr dep_dp,
+																							 BiContextTypePtr ctx,
+																							 unsigned int max_bin)
+{
+	const unsigned int escape_start = 8;
+	unsigned int ones = 1;
+
+	if (biari_decode_symbol(dep_dp, ctx ) == 0)
+	{
+		return 0;
+	}
+
+	ctx++;
+	while (ones < escape_start)
+	{
+		if (biari_decode_symbol(dep_dp, ctx) == 0)
+		{
+			return ones;
+		}
+		++ones;
+		if (ones == 2)
+		{
+			++ctx;
+		}
+		if (ones == max_bin)
+		{
+			++ctx;
+		}
+	}
+
+	return escape_start + exp_golomb_decode_eq_prob(dep_dp,3);
+}
+/*
+ * Same decoding as unary_exp_golomb_mv_decode() with max_bin fixed to 3: the
+ * context pointer advances after the first bin and again when the running
+ * count reaches 2 and 3.
+ */
+unsigned int unary_exp_golomb_mv_decode3(DecodingEnvironmentPtr dep_dp,
+																							 BiContextTypePtr ctx)
+{
+	const unsigned int escape_start = 8;
+	unsigned int ones = 1;
+
+	if (biari_decode_symbol(dep_dp, ctx ) == 0)
+	{
+		return 0;
+	}
+
+	ctx++;
+	while (ones < escape_start)
+	{
+		if (biari_decode_symbol(dep_dp, ctx) == 0)
+		{
+			return ones;
+		}
+		++ones;
+		if (ones == 2 || ones == 3)
+		{
+			++ctx;
+		}
+	}
+
+	return escape_start + exp_golomb_decode_eq_prob(dep_dp,3);
+}
+#endif
+
+/*!
+************************************************************************
+* \brief
+*    Read I_PCM macroblock 
+*
+*    Reads the raw samples of an I_PCM macroblock from the partition's
+*    byte buffer into currSlice->ipcm: MB_BLOCK_SIZE x MB_BLOCK_SIZE luma
+*    samples of bitdepth_luma bits each, then (unless the picture is
+*    YUV400 or IS_INDEPENDENT holds) mb_cr_size_y x mb_cr_size_x samples
+*    of bitdepth_chroma bits for each of the two chroma planes.
+*    Afterwards the decoder's byte position is advanced by the number of
+*    bits read, rounded up to whole bytes.
+*
+* \param currSlice  slice receiving the samples
+* \param dP         data partition providing the bitstream and decoder state
+************************************************************************
+*/
+void readIPCM_CABAC(Slice *currSlice, struct datapartition *dP)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	DecodingEnvironmentPtr dep = &(dP->de_cabac);
+	byte *buf = dP->bitstream->streamBuffer;
+	const int stream_bits = (dP->bitstream->bitstream_length << 3) + 7;
+	int sample = 0;
+	int consumed = 0;
+	int cursor;
+	int depth;
+	int plane, row, col;
+
+	/* for every 8 bits still held by the decoder, drop them from Dvalue and
+	   step the byte position back by one */
+	while (dep->DbitsLeft >= 8)
+	{
+		(*dep->Dcodestrm_len)--;
+		dep->DbitsLeft -= 8;
+		dep->Dvalue   >>= 8;
+	}
+
+	/* bit position at which the raw samples start */
+	cursor = (*dep->Dcodestrm_len) << 3;
+
+	/* luma samples */
+	depth = p_Vid->bitdepth_luma;
+	for (row = 0; row < MB_BLOCK_SIZE; ++row)
+	{
+		for (col = 0; col < MB_BLOCK_SIZE; ++col)
+		{
+			consumed += GetBits(buf, cursor, &sample, stream_bits, depth);
+			currSlice->ipcm[0][row][col] = sample;
+			cursor += depth;
+		}
+	}
+
+	/* chroma samples, planes 1 and 2 */
+	depth = p_Vid->bitdepth_chroma;
+	if ((dec_picture->chroma_format_idc != YUV400) && !IS_INDEPENDENT(p_Vid))
+	{
+		for (plane = 1; plane < 3; ++plane)
+		{
+			for (row = 0; row < p_Vid->mb_cr_size_y; ++row)
+			{
+				for (col = 0; col < p_Vid->mb_cr_size_x; ++col)
+				{
+					consumed += GetBits(buf, cursor, &sample, stream_bits, depth);
+					currSlice->ipcm[plane][row][col] = sample;
+					cursor += depth;
+				}
+			}
+		}
+	}
+
+	/* advance the byte position, counting a partially used byte as a whole one */
+	(*dep->Dcodestrm_len) += (consumed >> 3) + ((consumed & 7) ? 1 : 0);
+}
+
diff --git a/Src/h264dec/ldecod/src/context_ini.c b/Src/h264dec/ldecod/src/context_ini.c
new file mode 100644
index 00000000..2ffcfeef
--- /dev/null
+++ b/Src/h264dec/ldecod/src/context_ini.c
@@ -0,0 +1,123 @@
+
+/*!
+ *************************************************************************************
+ * \file context_ini.c
+ *
+ * \brief
+ *    CABAC context initializations
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Detlev Marpe                    <marpe@hhi.de>
+ *    - Heiko Schwarz                   <hschwarz@hhi.de>
+ **************************************************************************************
+ */
+
+#define CONTEXT_INI_C
+
+#include "defines.h"
+#include "global.h"
+#include "biaridecod.h"
+#include "ctx_tables.h"
+
+
+#define IBIARI_CTX_INIT2(ii,jj,ctx,tab,num, qp) \
+{ \
+  for (i=0; i<ii; ++i) \
+  for (j=0; j<jj; ++j) \
+  { \
+  biari_init_context (qp, &(ctx[i][j]), tab ## _I[num][i][j]); \
+  } \
+}
+
+#define PBIARI_CTX_INIT2(ii,jj,ctx,tab,num, qp) \
+{ \
+  for (i=0; i<ii; ++i) \
+  for (j=0; j<jj; ++j) \
+  { \
+  biari_init_context (qp, &(ctx[i][j]), tab ## _P[num][i][j]); \
+  } \
+}
+
+
+#define IBIARI_CTX_INIT1(jj,ctx,tab,num, qp) \
+{ \
+  for (j=0; j<jj; ++j) \
+  { \
+  biari_init_context (qp, &(ctx[j]), tab ## _I[num][0][j]); \
+  } \
+}
+
+
+#define PBIARI_CTX_INIT1(jj,ctx,tab,num, qp) \
+{ \
+  { \
+  for (j=0; j<jj; ++j) \
+  { \
+  biari_init_context (qp, &(ctx[j]), tab ## _P[num][0][j]); \
+  } \
+  } \
+}
+
+/*
+ * Initialises every motion and texture coding context of the slice.
+ *
+ * The initial values come from the "_I" tables for I and SI slices and from the
+ * "_P" tables for all other slice types, selected by the slice's model_number
+ * and by the picture QP clamped to be non-negative.
+ */
+void init_contexts (Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  MotionInfoContexts*  motion  = currSlice->mot_ctx;
+  TextureInfoContexts* texture = currSlice->tex_ctx;
+  const int model_number = currSlice->model_number;
+  const int qp = imax(0, p_Vid->qp);
+  const int use_intra_tables = (currSlice->slice_type == I_SLICE) || (currSlice->slice_type == SI_SLICE);
+  int i, j; /* loop counters referenced by name inside the *_CTX_INIT macros */
+
+  if (!use_intra_tables)
+  {
+    //--- motion coding contexts ---
+    PBIARI_CTX_INIT2 (3, NUM_MB_TYPE_CTX,   motion->mb_type_contexts,     INIT_MB_TYPE,    model_number, qp);
+    PBIARI_CTX_INIT2 (2, NUM_B8_TYPE_CTX,   motion->b8_type_contexts,     INIT_B8_TYPE,    model_number, qp);
+    PBIARI_CTX_INIT2 (2, NUM_MV_RES_CTX,    motion->mv_res_contexts,      INIT_MV_RES,     model_number, qp);
+    PBIARI_CTX_INIT2 (2, NUM_REF_NO_CTX,    motion->ref_no_contexts,      INIT_REF_NO,     model_number, qp);
+    PBIARI_CTX_INIT1 (   NUM_DELTA_QP_CTX,  motion->delta_qp_contexts,    INIT_DELTA_QP,   model_number, qp);
+    PBIARI_CTX_INIT1 (   NUM_MB_AFF_CTX,    motion->mb_aff_contexts,      INIT_MB_AFF,     model_number, qp);
+
+    //--- texture coding contexts ---
+    PBIARI_CTX_INIT1 (   NUM_TRANSFORM_SIZE_CTX, texture->transform_size_contexts, INIT_TRANSFORM_SIZE, model_number, qp);
+    PBIARI_CTX_INIT1 (                 NUM_IPR_CTX,  texture->ipr_contexts,     INIT_IPR,       model_number, qp);
+    PBIARI_CTX_INIT1 (                 NUM_CIPR_CTX, texture->cipr_contexts,    INIT_CIPR,      model_number, qp);
+    PBIARI_CTX_INIT2 (3,               NUM_CBP_CTX,  texture->cbp_contexts,     INIT_CBP,       model_number, qp);
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_BCBP_CTX, texture->bcbp_contexts,    INIT_BCBP,      model_number, qp);
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_MAP_CTX,  texture->map_contexts[0],  INIT_MAP,       model_number, qp);
+#if ENABLE_FIELD_CTX
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_MAP_CTX,  texture->map_contexts[1],  INIT_FLD_MAP,   model_number, qp);
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_LAST_CTX, texture->last_contexts[1], INIT_FLD_LAST,  model_number, qp);
+#endif
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_LAST_CTX, texture->last_contexts[0], INIT_LAST,      model_number, qp);
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_ONE_CTX,  texture->one_contexts,     INIT_ONE,       model_number, qp);
+    PBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_ABS_CTX,  texture->abs_contexts,     INIT_ABS,       model_number, qp);
+    return;
+  }
+
+  //--- motion coding contexts ---
+  IBIARI_CTX_INIT2 (3, NUM_MB_TYPE_CTX,   motion->mb_type_contexts,     INIT_MB_TYPE,    model_number, qp);
+  IBIARI_CTX_INIT2 (2, NUM_B8_TYPE_CTX,   motion->b8_type_contexts,     INIT_B8_TYPE,    model_number, qp);
+  IBIARI_CTX_INIT2 (2, NUM_MV_RES_CTX,    motion->mv_res_contexts,      INIT_MV_RES,     model_number, qp);
+  IBIARI_CTX_INIT2 (2, NUM_REF_NO_CTX,    motion->ref_no_contexts,      INIT_REF_NO,     model_number, qp);
+  IBIARI_CTX_INIT1 (   NUM_DELTA_QP_CTX,  motion->delta_qp_contexts,    INIT_DELTA_QP,   model_number, qp);
+  IBIARI_CTX_INIT1 (   NUM_MB_AFF_CTX,    motion->mb_aff_contexts,      INIT_MB_AFF,     model_number, qp);
+
+  //--- texture coding contexts ---
+  IBIARI_CTX_INIT1 (   NUM_TRANSFORM_SIZE_CTX, texture->transform_size_contexts, INIT_TRANSFORM_SIZE, model_number, qp);
+  IBIARI_CTX_INIT1 (                 NUM_IPR_CTX,  texture->ipr_contexts,     INIT_IPR,       model_number, qp);
+  IBIARI_CTX_INIT1 (                 NUM_CIPR_CTX, texture->cipr_contexts,    INIT_CIPR,      model_number, qp);
+  IBIARI_CTX_INIT2 (3,               NUM_CBP_CTX,  texture->cbp_contexts,     INIT_CBP,       model_number, qp);
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_BCBP_CTX, texture->bcbp_contexts,    INIT_BCBP,      model_number, qp);
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_MAP_CTX,  texture->map_contexts[0],  INIT_MAP,       model_number, qp);
+#if ENABLE_FIELD_CTX
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_MAP_CTX,  texture->map_contexts[1],  INIT_FLD_MAP,   model_number, qp);
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_LAST_CTX, texture->last_contexts[1], INIT_FLD_LAST,  model_number, qp);
+#endif
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_LAST_CTX, texture->last_contexts[0], INIT_LAST,      model_number, qp);
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_ONE_CTX,  texture->one_contexts,     INIT_ONE,       model_number, qp);
+  IBIARI_CTX_INIT2 (NUM_BLOCK_TYPES, NUM_ABS_CTX,  texture->abs_contexts,     INIT_ABS,       model_number, qp);
+}
+
diff --git a/Src/h264dec/ldecod/src/erc_api.c b/Src/h264dec/ldecod/src/erc_api.c
new file mode 100644
index 00000000..48e827a0
--- /dev/null
+++ b/Src/h264dec/ldecod/src/erc_api.c
@@ -0,0 +1,371 @@
+
+/*!
+ *************************************************************************************
+ * \file erc_api.c
+ *
+ * \brief
+ *    External (still inside video decoder) interface for error concealment module
+ *
+ *  \author
+ *     - Ari Hourunranta                <ari.hourunranta@nokia.com>
+ *     - Viktor Varsa                     <viktor.varsa@nokia.com>
+ *     - Ye-Kui Wang                   <wyk@ieee.org>
+ *
+ *************************************************************************************
+ */
+
+
+#include "global.h"
+#include "memalloc.h"
+#include "erc_api.h"
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Initialize the error concealment module (any previous state is released first)
+ ************************************************************************
+ */
+void ercInit(VideoParameters *p_Vid, int pic_sizex, int pic_sizey, int flag)
+{
+	// one object entry per 64 pixels of picture area
+	const int numObjects = (pic_sizex * pic_sizey) >> 6;
+	ercClose(p_Vid, p_Vid->erc_errorVar);
+	p_Vid->erc_object_list = (objectBuffer_t *) calloc(numObjects, sizeof(objectBuffer_t));
+	if (!p_Vid->erc_object_list) no_mem_exit("ercInit: erc_object_list");
+
+	// fresh concealment instance, with concealment switched as requested by flag
+	p_Vid->erc_errorVar = ercOpen();
+	ercSetErrorConcealment(p_Vid->erc_errorVar, flag);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Allocates the error concealment instance. The memory is zero-filled, so the
+ *      fields not assigned below (nOfSegments, nOfCorruptedSegments, ...) start at 0.
+ *\return
+ *      The allocated ercVariables_t, with no tables attached and concealment on.
+ ************************************************************************
+ */
+ercVariables_t *ercOpen( void )
+{
+	// calloc instead of malloc: no field is ever left indeterminate
+	ercVariables_t *errorVar = (ercVariables_t *)calloc( 1, sizeof(ercVariables_t));
+	if ( errorVar == NULL ) no_mem_exit("ercOpen: errorVar");
+
+	errorVar->nOfMBs = 0;
+	errorVar->segments = NULL;
+	errorVar->currSegment = 0;
+	errorVar->yCondition = NULL;
+	errorVar->uCondition = NULL;
+	errorVar->vCondition = NULL;
+	errorVar->prevFrameYCondition = NULL;
+
+	errorVar->concealment = 1;
+
+	return errorVar;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Resets the variables used in error detection: (re)allocates the tables when
+ *      needed, clears them and marks every segment as corrupted. Call once per frame.
+ * \param errorVar
+ *      Variables for error concealment (nothing is done if NULL or concealment is off)
+ * \param nOfMBs
+ *      Number of macroblocks in a frame
+ * \param numOfSegments
+ *    Estimated number of segments (memory reserved)
+ * \param picSizeX
+ *      Width of the frame in pixels (not referenced by this function).
+ ************************************************************************
+ */
+void ercReset( ercVariables_t *errorVar, int nOfMBs, int numOfSegments, int picSizeX )
+{
+	if ( errorVar && errorVar->concealment )
+	{
+		int i = 0;
+
+		// If frame size has been changed: drop every table so that it is rebuilt below
+		if ( nOfMBs != errorVar->nOfMBs && errorVar->yCondition != NULL )
+		{
+			free( errorVar->yCondition );
+			errorVar->yCondition = NULL;
+			free( errorVar->prevFrameYCondition );
+			errorVar->prevFrameYCondition = NULL;
+			free( errorVar->uCondition );
+			errorVar->uCondition = NULL;
+			free( errorVar->vCondition );
+			errorVar->vCondition = NULL;
+			free( errorVar->segments );
+			errorVar->segments = NULL;
+		}
+
+		// If the structures are uninitialized (first frame, or frame size is changed)
+		if ( errorVar->yCondition == NULL )
+		{
+			errorVar->segments = (ercSegment_t *)malloc( numOfSegments*sizeof(ercSegment_t) );
+			if ( errorVar->segments == NULL ) no_mem_exit("ercReset: errorVar->segments");
+			memset( errorVar->segments, 0, numOfSegments*sizeof(ercSegment_t));
+			errorVar->nOfSegments = numOfSegments;
+
+			errorVar->yCondition = (int *)malloc( 4*nOfMBs*sizeof(int) ); // four entries per MB
+			if ( errorVar->yCondition == NULL ) no_mem_exit("ercReset: errorVar->yCondition");
+			errorVar->prevFrameYCondition = (int *)malloc( 4*nOfMBs*sizeof(int) ); // not cleared in this function
+			if ( errorVar->prevFrameYCondition == NULL ) no_mem_exit("ercReset: errorVar->prevFrameYCondition");
+			errorVar->uCondition = (int *)malloc( nOfMBs*sizeof(int) ); // one entry per MB
+			if ( errorVar->uCondition == NULL ) no_mem_exit("ercReset: errorVar->uCondition");
+			errorVar->vCondition = (int *)malloc( nOfMBs*sizeof(int) ); // one entry per MB
+			if ( errorVar->vCondition == NULL ) no_mem_exit("ercReset: errorVar->vCondition");
+			errorVar->nOfMBs = nOfMBs;
+		}
+		else
+		{
+			// Keep the previous frame's yCondition table by swapping the two buffers
+			int *tmp = errorVar->prevFrameYCondition;
+			errorVar->prevFrameYCondition = errorVar->yCondition;
+			errorVar->yCondition = tmp;
+		}
+
+		// Reset the condition tables of the current frame to 0
+		memset( errorVar->yCondition, 0, 4*nOfMBs*sizeof(*errorVar->yCondition));
+		memset( errorVar->uCondition, 0,   nOfMBs*sizeof(*errorVar->uCondition));
+		memset( errorVar->vCondition, 0,   nOfMBs*sizeof(*errorVar->vCondition));
+
+		if (errorVar->nOfSegments != numOfSegments) // segment estimate changed: replace the list
+		{
+			free( errorVar->segments );
+			errorVar->segments = NULL;
+			errorVar->segments = (ercSegment_t *)malloc( numOfSegments*sizeof(ercSegment_t) );
+			if ( errorVar->segments == NULL ) no_mem_exit("ercReset: errorVar->segments");
+			errorVar->nOfSegments = numOfSegments;
+		}
+
+		memset( errorVar->segments, 0, errorVar->nOfSegments*sizeof(ercSegment_t));
+
+		for ( ; i < errorVar->nOfSegments; i++ )
+		{
+			errorVar->segments[i].fCorrupted = 1; //! mark segments as corrupted
+			errorVar->segments[i].startMBPos = 0; // default span: first MB ...
+			errorVar->segments[i].endMBPos = nOfMBs - 1; // ... up to the last MB of the frame
+		}
+
+		errorVar->currSegment = 0;
+		errorVar->nOfCorruptedSegments = 0;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Frees the error concealment instance, its condition/segment tables and
+ *      the object list of p_Vid. Either argument may be NULL.
+ * \param p_Vid
+ *      VideoParameters variable
+ * \param errorVar
+ *      Variables for error concealment
+ ************************************************************************
+ */
+void ercClose(VideoParameters *p_Vid,  ercVariables_t *errorVar )
+{
+	if ( errorVar != NULL )
+	{
+		// the by-value parameter cannot be reset for the caller, so clear the
+		// owner's handle here (before the free) to keep it from dangling
+		if ( p_Vid && p_Vid->erc_errorVar == errorVar )
+			p_Vid->erc_errorVar = NULL;
+		free( errorVar->segments );
+		free( errorVar->yCondition );
+		free( errorVar->uCondition );
+		free( errorVar->vCondition );
+		free( errorVar->prevFrameYCondition );
+		free( errorVar );
+	}
+
+	if ( p_Vid && p_Vid->erc_object_list)
+	{
+		free(p_Vid->erc_object_list);
+		p_Vid->erc_object_list=NULL;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Switches error concealment on or off (between frames only, not during a frame)
+ * \param errorVar
+ *      Variables for error concealment; a NULL pointer is ignored
+ * \param value
+ *      New value of the concealment flag
+ ************************************************************************
+ */
+void ercSetErrorConcealment( ercVariables_t *errorVar, int value )
+{
+	if ( !errorVar ) return;
+	errorVar->concealment = value;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Opens the given segment: clears its corrupted flag and records the start MB.
+ *      Segment numbers outside the segment list are ignored; the list is not resized.
+ * \param currMBNum
+ *      The MB number where the new slice/segment starts
+ * \param segment
+ *      Segment/Slice No. counted by the caller
+ * \param bitPos
+ *      Bitstream pointer: number of bits read from the buffer (not used here).
+ * \param errorVar
+ *      Variables for error detector
+ ************************************************************************
+ */
+void ercStartSegment( int currMBNum, int segment, unsigned int bitPos, ercVariables_t *errorVar )
+{
+	if ( errorVar && errorVar->concealment )
+	{
+		errorVar->currSegmentCorrupted = 0;
+		if ( errorVar->segments && segment >= 0 && segment < errorVar->nOfSegments ) // stay inside the list
+		{
+			errorVar->segments[ segment ].fCorrupted = 0;
+			errorVar->segments[ segment ].startMBPos = currMBNum;
+		}
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Marks the end position of a segment and advances the internal segment counter.
+ * \param currMBNum
+ *      The last MB number of the previous segment
+ * \param segment
+ *      Segment/Slice No. counted by the caller; ignored if outside the segment list.
+ * \param bitPos
+ *      Bitstream pointer: number of bits read from the buffer (not used here).
+ * \param errorVar
+ *      Variables for error detector
+ ************************************************************************
+ */
+void ercStopSegment( int currMBNum, int segment, unsigned int bitPos, ercVariables_t *errorVar )
+{
+	if ( errorVar && errorVar->concealment )
+	{
+		if ( errorVar->segments && segment >= 0 && segment < errorVar->nOfSegments ) // stay inside the list
+			errorVar->segments[ segment ].endMBPos = currMBNum; //! Changed TO 12.11.2001
+		errorVar->currSegment++;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Marks the most recently stopped segment (index currSegment-1) as lost: all the
+ *      blocks of its MBs become corrupted. An index outside the segment list is skipped.
+ * \param picSizeX
+ *      Width of the frame in pixels.
+ * \param errorVar
+ *      Variables for error detector
+ ************************************************************************
+ */
+void ercMarkCurrSegmentLost(int picSizeX, ercVariables_t *errorVar )
+{
+	if ( errorVar && errorVar->concealment )
+	{
+		int current_segment = errorVar->currSegment-1, j;
+		if (errorVar->currSegmentCorrupted == 0)
+		{
+			errorVar->nOfCorruptedSegments++;
+			errorVar->currSegmentCorrupted = 1;
+		}
+		if ( !errorVar->segments || current_segment < 0 || current_segment >= errorVar->nOfSegments )
+			return; // e.g. no segment stopped yet: segments[-1] must not be touched
+		for ( j = errorVar->segments[current_segment].startMBPos; j <= errorVar->segments[current_segment].endMBPos; j++ )
+		{
+			errorVar->yCondition[MBNum2YBlock (j, 0, picSizeX)] = ERC_BLOCK_CORRUPTED;
+			errorVar->yCondition[MBNum2YBlock (j, 1, picSizeX)] = ERC_BLOCK_CORRUPTED;
+			errorVar->yCondition[MBNum2YBlock (j, 2, picSizeX)] = ERC_BLOCK_CORRUPTED;
+			errorVar->yCondition[MBNum2YBlock (j, 3, picSizeX)] = ERC_BLOCK_CORRUPTED;
+			errorVar->uCondition[j] = ERC_BLOCK_CORRUPTED;
+			errorVar->vCondition[j] = ERC_BLOCK_CORRUPTED;
+		}
+		errorVar->segments[current_segment].fCorrupted = 1;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Marks the most recently stopped segment (index currSegment-1) as OK: all the
+ *      blocks of its MBs become OK. An index outside the segment list is skipped.
+ * \param picSizeX
+ *      Width of the frame in pixels.
+ * \param errorVar
+ *      Variables for error detector
+ ************************************************************************
+ */
+void ercMarkCurrSegmentOK(int picSizeX, ercVariables_t *errorVar )
+{
+	if ( errorVar && errorVar->concealment )
+	{
+		int current_segment = errorVar->currSegment-1, j;
+		if ( !errorVar->segments || current_segment < 0 || current_segment >= errorVar->nOfSegments )
+			return; // e.g. no segment stopped yet: segments[-1] must not be touched
+		for ( j = errorVar->segments[current_segment].startMBPos; j <= errorVar->segments[current_segment].endMBPos; j++ )
+		{
+			errorVar->yCondition[MBNum2YBlock (j, 0, picSizeX)] = ERC_BLOCK_OK;
+			errorVar->yCondition[MBNum2YBlock (j, 1, picSizeX)] = ERC_BLOCK_OK;
+			errorVar->yCondition[MBNum2YBlock (j, 2, picSizeX)] = ERC_BLOCK_OK;
+			errorVar->yCondition[MBNum2YBlock (j, 3, picSizeX)] = ERC_BLOCK_OK;
+			errorVar->uCondition[j] = ERC_BLOCK_OK;
+			errorVar->vCondition[j] = ERC_BLOCK_OK;
+		}
+		errorVar->segments[current_segment].fCorrupted = 0;
+	}
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Marks the condition entries of the given component (YUV) of the current MB
+ *      as concealed. Nothing happens when errorVar is NULL or concealment is off.
+ * \param currMBNum
+ *      Number of the MB whose entries are marked.
+ * \param comp
+ *      Component to mark (0:Y, 1:U, 2:V, <0:All)
+ * \param picSizeX
+ *      Width of the frame in pixels.
+ * \param errorVar
+ *      Variables for error detector
+ ************************************************************************
+ */
+void ercMarkCurrMBConcealed( int currMBNum, int comp, int picSizeX, ercVariables_t *errorVar )
+{
+	int markY, markU, markV;
+
+	if ( !errorVar || !errorVar->concealment )
+		return;
+
+	// a negative comp selects all three components; values above 2 select none
+	markY = ( comp <= 0 );
+	markU = ( comp < 0 || comp == 1 );
+	markV = ( comp < 0 || comp == 2 );
+
+	// Y: four condition entries per MB
+	if ( markY )
+	{
+		errorVar->yCondition[MBNum2YBlock (currMBNum, 0, picSizeX)] = ERC_BLOCK_CONCEALED;
+		errorVar->yCondition[MBNum2YBlock (currMBNum, 1, picSizeX)] = ERC_BLOCK_CONCEALED;
+		errorVar->yCondition[MBNum2YBlock (currMBNum, 2, picSizeX)] = ERC_BLOCK_CONCEALED;
+		errorVar->yCondition[MBNum2YBlock (currMBNum, 3, picSizeX)] = ERC_BLOCK_CONCEALED;
+	}
+
+	// U: one condition entry per MB
+	if ( markU )
+		errorVar->uCondition[currMBNum] = ERC_BLOCK_CONCEALED;
+
+	// V: one condition entry per MB
+	if ( markV )
+		errorVar->vCondition[currMBNum] = ERC_BLOCK_CONCEALED;
+}
diff --git a/Src/h264dec/ldecod/src/erc_do_i.c b/Src/h264dec/ldecod/src/erc_do_i.c
new file mode 100644
index 00000000..55d2a38f
--- /dev/null
+++ b/Src/h264dec/ldecod/src/erc_do_i.c
@@ -0,0 +1,544 @@
+
+/*!
+ *************************************************************************************
+ * \file
+ *      erc_do_i.c
+ *
+ * \brief
+ *      Intra (I) frame error concealment algorithms for decoder
+ *
+ *  \author
+ *      - Ari Hourunranta              <ari.hourunranta@nokia.com>
+ *      - Viktor Varsa                 <viktor.varsa@nokia.com>
+ *      - Ye-Kui Wang                  <wyk@ieee.org>
+ *
+ *************************************************************************************
+ */
+
+#include "global.h"
+#include "erc_do.h"
+
+static void concealBlocks          ( VideoParameters *p_Vid, int lastColumn, int lastRow, int comp, frame *recfr, int picSizeX, int *condition );
+static void pixMeanInterpolateBlock( VideoParameters *p_Vid, imgpel *src[], imgpel *block, int blockSize, int frameWidth );
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Entry point of the Intra frame concealment.
+ *      Runs "concealBlocks" once per color component (Y, U, V).
+ * \return
+ *      0, if concealment is off or errorVar is NULL (simple concealment should be used)
+ *      1, otherwise (even if none of the blocks were concealed)
+ * \param p_Vid
+ *      video parameters, handed on to concealBlocks
+ * \param recfr
+ *      Reconstructed frame buffer
+ * \param picSizeX
+ *      Width of the frame in pixels
+ * \param picSizeY
+ *      Height of the frame in pixels
+ * \param errorVar
+ *      Variables for error concealment
+ ************************************************************************
+ */
+int ercConcealIntraFrame( VideoParameters *p_Vid, frame *recfr, int picSizeX, int picSizeY, ercVariables_t *errorVar )
+{
+  int blockCols = 0, blockRows = 0;
+
+  // concealment switched off (or no instance at all):
+  // the caller is expected to fall back to simple concealment
+  if ( !errorVar || !errorVar->concealment )
+    return 0;
+
+  // no corrupted segment in this frame, nothing to conceal
+  if ( !errorVar->nOfCorruptedSegments )
+    return 1;
+
+  // Y
+  blockRows = (int) (picSizeY>>3);
+  blockCols = (int) (picSizeX>>3);
+  concealBlocks( p_Vid, blockCols, blockRows, 0, recfr, picSizeX, errorVar->yCondition );
+
+  // U (dimensions halved compared to Y)
+  blockRows = (int) (picSizeY>>4);
+  blockCols = (int) (picSizeX>>4);
+  concealBlocks( p_Vid, blockCols, blockRows, 1, recfr, picSizeX, errorVar->uCondition );
+
+  // V reuses the U dimensions
+  concealBlocks( p_Vid, blockCols, blockRows, 2, recfr, picSizeX, errorVar->vCondition );
+
+  return 1;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Conceals the MB at position (row, column): gathers the pixel addresses of
+ *      the neighbors selected in predBlocks[] and runs pixMeanInterpolateBlock()
+ * \param p_Vid
+ *      video parameters, handed on to pixMeanInterpolateBlock
+ * \param currFrame
+ *      current frame
+ * \param row
+ *      y coordinate in blocks
+ * \param column
+ *      x coordinate in blocks
+ * \param predBlocks[]
+ *      list of neighboring source blocks (numbering 0 to 7, nonzero means: use the neighbor)
+ * \param frameWidth
+ *      width of frame in pixels
+ * \param mbWidthInBlocks
+ *      2 for Y, 1 for U/V components
+ ************************************************************************
+ */
+void ercPixConcealIMB(VideoParameters *p_Vid, imgpel *currFrame, int row, int column, int predBlocks[], int frameWidth, int mbWidthInBlocks)
+{
+   // per-neighbor direction (in units of mbWidthInBlocks), indexed like
+   // predBlocks[], whose layout around the current block x is:
+   //    1 4 0
+   //    5 x 7
+   //    2 6 3
+   static const int rowDir[8] = { -1, -1,  1,  1, -1,  0,  1,  0 };
+   static const int colDir[8] = {  1, -1, -1,  1,  0, -1,  0,  1 };
+   imgpel *src[8];
+   imgpel *currBlock = NULL;
+   int n;
+
+   // collect the reliable neighboring blocks; the others stay NULL
+   for ( n = 0; n < 8; n++ )
+   {
+      if ( predBlocks[n] )
+         src[n] = currFrame + (row + rowDir[n]*mbWidthInBlocks)*frameWidth*8 + (column + colDir[n]*mbWidthInBlocks)*8;
+      else
+         src[n] = NULL;
+   }
+
+   // the block to be rebuilt starts at (row, column) itself
+   currBlock = currFrame + row*frameWidth*8 + column*8;
+   pixMeanInterpolateBlock( p_Vid, src, currBlock, mbWidthInBlocks*8, frameWidth );
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      This function checks the neighbors of a Macroblock for usability in
+ *      concealment. First the OK macroblocks are marked, and if there is not
+ *      enough of them, then the CONCEALED ones as well.
+ *      A usable neighbor gets its condition value (nonzero) in the output array, others stay 0.
+ *      The block order in "predBlocks":
+ *              1 4 0
+ *              5 x 7
+ *              2 6 3
+ *      i.e., corners first.
+ * \return
+ *      Number of useable neighbor macroblocks counted in the last search round.
+ * \param predBlocks[]
+ *      Array for indicating the valid neighbor blocks (8 entries, cleared on entry)
+ * \param currRow
+ *      Current block row in the frame
+ * \param currColumn
+ *      Current block column in the frame
+ * \param condition
+ *      The block condition (ok, lost) table
+ * \param maxRow
+ *      Number of block rows in the frame
+ * \param maxColumn
+ *      Number of block columns in the frame
+ * \param step
+ *      Number of blocks belonging to a MB, when counting
+ *      in vertical/horizontal direction. (Y:2 U,V:1)
+ * \param fNoCornerNeigh
+ *      No corner neighbors are considered
+ ************************************************************************
+ */
+int ercCollect8PredBlocks( int predBlocks[], int currRow, int currColumn, int *condition,
+                           int maxRow, int maxColumn, int step, byte fNoCornerNeigh )
+{
+  int srcCounter  = 0;
+  int srcCountMin = (fNoCornerNeigh ? 2 : 4); // neighbors wanted before the search stops
+  int threshold   = ERC_BLOCK_OK;             // first round accepts OK blocks only
+
+  memset( predBlocks, 0, 8*sizeof(int) );
+
+  // collect the reliable neighboring blocks; entries set in one round are kept in the next
+  do
+  {
+    srcCounter = 0;
+    // top (one block row up, while bottom/right look "step" blocks ahead)
+    if (currRow > 0 && condition[ (currRow-1)*maxColumn + currColumn ] >= threshold )
+    {                           //ERC_BLOCK_OK (3) or ERC_BLOCK_CONCEALED (2)
+      predBlocks[4] = condition[ (currRow-1)*maxColumn + currColumn ];
+      srcCounter++;
+    }
+    // bottom
+    if ( currRow < (maxRow-step) && condition[ (currRow+step)*maxColumn + currColumn ] >= threshold )
+    {
+      predBlocks[6] = condition[ (currRow+step)*maxColumn + currColumn ];
+      srcCounter++;
+    }
+
+    if ( currColumn > 0 )
+    {
+      // left
+      if ( condition[ currRow*maxColumn + currColumn - 1 ] >= threshold )
+      {
+        predBlocks[5] = condition[ currRow*maxColumn + currColumn - 1 ];
+        srcCounter++;
+      }
+
+      if ( !fNoCornerNeigh )
+      {
+        // top-left
+        if ( currRow > 0 && condition[ (currRow-1)*maxColumn + currColumn - 1 ] >= threshold )
+        {
+          predBlocks[1] = condition[ (currRow-1)*maxColumn + currColumn - 1 ];
+          srcCounter++;
+        }
+        // bottom-left
+        if ( currRow < (maxRow-step) && condition[ (currRow+step)*maxColumn + currColumn - 1 ] >= threshold )
+        {
+          predBlocks[2] = condition[ (currRow+step)*maxColumn + currColumn - 1 ];
+          srcCounter++;
+        }
+      }
+    }
+
+    if ( currColumn < (maxColumn-step) )
+    {
+      // right
+      if ( condition[ currRow*maxColumn+currColumn + step ] >= threshold )
+      {
+        predBlocks[7] = condition[ currRow*maxColumn+currColumn + step ];
+        srcCounter++;
+      }
+
+      if ( !fNoCornerNeigh )
+      {
+        // top-right
+        if ( currRow > 0 && condition[ (currRow-1)*maxColumn + currColumn + step ] >= threshold )
+        {
+          predBlocks[0] = condition[ (currRow-1)*maxColumn + currColumn + step ];
+          srcCounter++;
+        }
+        // bottom-right
+        if ( currRow < (maxRow-step) && condition[ (currRow+step)*maxColumn + currColumn + step ] >= threshold )
+        {
+          predBlocks[3] = condition[ (currRow+step)*maxColumn + currColumn + step ];
+          srcCounter++;
+        }
+      }
+    }
+    // prepare for the next round: lower the bar, but never below CONCEALED
+    threshold--;
+    if (threshold < ERC_BLOCK_CONCEALED)
+      break;
+  } while ( srcCounter < srcCountMin);
+
+  return srcCounter;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      collects prediction blocks only from the current column
+ * \return
+ *      Number of usable neighbour Macroblocks for concealment (0 to 2).
+ * \param predBlocks[]
+ *      Array for indicating the valid neighbor blocks (only entries 4 and 6 can get set)
+ * \param currRow
+ *      Current block row in the frame
+ * \param currColumn
+ *      Current block column in the frame
+ * \param condition
+ *      The block condition (ok, lost) table
+ * \param maxRow
+ *      Number of block rows in the frame
+ * \param maxColumn
+ *      Number of block columns in the frame
+ * \param step
+ *      Number of blocks belonging to a MB, when counting
+ *      in vertical/horizontal direction. (Y:2 U,V:1)
+ ************************************************************************
+ */
+int ercCollectColumnBlocks( int predBlocks[], int currRow, int currColumn, int *condition, int maxRow, int maxColumn, int step )
+{
+  int srcCounter = 0, threshold = ERC_BLOCK_CORRUPTED;
+
+  memset( predBlocks, 0, 8*sizeof(int) );
+
+  // a neighbor is only looked up when its row lies inside the condition table
+  if ( currRow > 0 && condition[ (currRow-1)*maxColumn + currColumn ] > threshold )
+  {
+    predBlocks[4] = 1;
+    srcCounter++;
+  }
+  if ( currRow < (maxRow-step) && condition[ (currRow+step)*maxColumn + currColumn ] > threshold )
+  {
+    predBlocks[6] = 1;
+    srcCounter++;
+  }
+
+  return srcCounter;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Core for the Intra blocks concealment.
+ *      It is called for each color component (Y,U,V) separately
+ *      Finds the corrupted blocks and calls pixel interpolation functions
+ *      to correct them, one block at a time.
+ *      Scanning is done vertically and each corrupted column is corrected
+ *      bi-directionally, i.e., first block, last block, first block+1, last block -1 ...
+ * \param p_Vid
+ *      image encoding parameters for current picture
+ * \param lastColumn
+ *      Number of block columns in the frame
+ * \param lastRow
+ *      Number of block rows in the frame
+ * \param comp
+ *      color component
+ * \param recfr
+ *      Reconstructed frame buffer
+ * \param picSizeX
+ *      Width of the frame in pixels
+ * \param condition
+ *      The block condition (ok, lost) table
+ ************************************************************************
+ */
+static void concealBlocks( VideoParameters *p_Vid, int lastColumn, int lastRow, int comp, frame *recfr, int picSizeX, int *condition )
+{
+  int row, column, srcCounter = 0,  thr = ERC_BLOCK_CORRUPTED,
+      lastCorruptedRow = -1, firstCorruptedRow = -1, currRow = 0,
+      areaHeight = 0, i = 0, smoothColumn = 0;
+  int predBlocks[8], step = 1;
+
+  // in the Y component do the concealment MB-wise (not block-wise):
+  // this is useful if only whole MBs can be damaged or lost
+  if ( comp == 0 )
+    step = 2;
+  else
+    step = 1;
+
+  for ( column = 0; column < lastColumn; column += step )
+  {
+    for ( row = 0; row < lastRow; row += step )
+    {
+      if ( condition[row*lastColumn+column] <= thr )
+      {
+        firstCorruptedRow = row;
+        // find the last row which has corrupted blocks (in same continuous area)
+        for ( lastCorruptedRow = row+step; lastCorruptedRow < lastRow; lastCorruptedRow += step )
+        {
+          // check blocks in the current column
+          if ( condition[ lastCorruptedRow*lastColumn + column ] > thr )
+          {
+            // current one is already OK, so the last was the previous one
+            lastCorruptedRow -= step;
+            break;
+          }
+        }
+        if ( lastCorruptedRow >= lastRow )
+        {
+          // the corrupted run reaches the bottom of the frame: correct only from above
+          lastCorruptedRow = lastRow-step;
+          for ( currRow = firstCorruptedRow; currRow < lastRow; currRow += step )
+          {
+            srcCounter = ercCollect8PredBlocks( predBlocks, currRow, column, condition, lastRow, lastColumn, step, 1 );
+
+            switch( comp )
+            {
+            case 0 :
+              ercPixConcealIMB( p_Vid, recfr->yptr, currRow, column, predBlocks, picSizeX, 2 );
+              break;
+            case 1 :
+              ercPixConcealIMB( p_Vid, recfr->uptr, currRow, column, predBlocks, (picSizeX>>1), 1 );
+              break;
+            case 2 :
+              ercPixConcealIMB( p_Vid, recfr->vptr, currRow, column, predBlocks, (picSizeX>>1), 1 );
+              break;
+            }
+
+            if ( comp == 0 )
+            {
+              condition[ currRow*lastColumn+column] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + 1] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + lastColumn] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + lastColumn + 1] = ERC_BLOCK_CONCEALED;
+            }
+            else
+            {
+              condition[ currRow*lastColumn+column] = ERC_BLOCK_CONCEALED;
+            }
+
+          }
+          row = lastRow;
+        }
+        else if ( firstCorruptedRow == 0 )
+        {
+          // the corrupted run starts at the top of the frame: correct only from below
+          for ( currRow = lastCorruptedRow; currRow >= 0; currRow -= step )
+          {
+            srcCounter = ercCollect8PredBlocks( predBlocks, currRow, column, condition, lastRow, lastColumn, step, 1 );
+
+            switch( comp )
+            {
+            case 0 :
+              ercPixConcealIMB( p_Vid, recfr->yptr, currRow, column, predBlocks, picSizeX, 2 );
+              break;
+            case 1 :
+              ercPixConcealIMB( p_Vid, recfr->uptr, currRow, column, predBlocks, (picSizeX>>1), 1 );
+              break;
+            case 2 :
+              ercPixConcealIMB( p_Vid, recfr->vptr, currRow, column, predBlocks, (picSizeX>>1), 1 );
+              break;
+            }
+
+            if ( comp == 0 )
+            {
+              condition[ currRow*lastColumn+column] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + 1] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + lastColumn] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + lastColumn + 1] = ERC_BLOCK_CONCEALED;
+            }
+            else
+            {
+              condition[ currRow*lastColumn+column] = ERC_BLOCK_CONCEALED;
+            }
+
+          }
+
+          row = lastCorruptedRow+step;
+        }
+        else
+        {
+          // correct bi-directionally
+          // (intact blocks exist both above and below the corrupted run)
+          row = lastCorruptedRow+step;
+          areaHeight = lastCorruptedRow-firstCorruptedRow+step;
+
+          // Conceal the corrupted area switching between the top and the bottom rows
+          for ( i = 0; i < areaHeight; i += step )
+          {
+            if ( (i / step) % 2 )  // parity of the iteration count; i itself is always even for Y (step 2)
+            {
+              currRow = lastCorruptedRow;
+              lastCorruptedRow -= step;
+            }
+            else
+            {
+              currRow = firstCorruptedRow;
+              firstCorruptedRow += step;
+            }
+
+            if (smoothColumn > 0)
+            {
+              srcCounter = ercCollectColumnBlocks( predBlocks, currRow, column, condition, lastRow, lastColumn, step );
+            }
+            else
+            {
+              srcCounter = ercCollect8PredBlocks( predBlocks, currRow, column, condition, lastRow, lastColumn, step, 1 );
+            }
+
+            switch( comp )
+            {
+            case 0 :
+              ercPixConcealIMB( p_Vid, recfr->yptr, currRow, column, predBlocks, picSizeX, 2 );
+              break;
+
+            case 1 :
+              ercPixConcealIMB( p_Vid, recfr->uptr, currRow, column, predBlocks, (picSizeX>>1), 1 );
+              break;
+
+            case 2 :
+              ercPixConcealIMB( p_Vid, recfr->vptr, currRow, column, predBlocks, (picSizeX>>1), 1 );
+              break;
+            }
+
+            if ( comp == 0 )
+            {
+              condition[ currRow*lastColumn+column] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + 1] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + lastColumn] = ERC_BLOCK_CONCEALED;
+              condition[ currRow*lastColumn+column + lastColumn + 1] = ERC_BLOCK_CONCEALED;
+            }
+            else
+            {
+              condition[ currRow*lastColumn+column ] = ERC_BLOCK_CONCEALED;
+            }
+          }
+        }
+
+        lastCorruptedRow = -1;
+        firstCorruptedRow = -1;
+
+      }
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Does the actual pixel based interpolation for block[]
+ *      using weighted average
+ * \param p_Vid
+ *      image encoding parameters for current picture
+ * \param src[]
+ *      pointers to neighboring source blocks
+ * \param block
+ *      destination block
+ * \param blockSize
+ *      16 for Y, 8 for U/V components
+ * \param frameWidth
+ *      Width of the frame in pixels
+ ************************************************************************
+ */
+static void pixMeanInterpolateBlock( VideoParameters *p_Vid, imgpel *src[], imgpel *block, int blockSize, int frameWidth )
+{
+  int row, column, k, tmp, srcCounter = 0, weight = 0, bmax = blockSize - 1;
+
+  k = 0;  // offset of the current pixel row inside "block"
+  for ( row = 0; row < blockSize; row++ )
+  {
+    for ( column = 0; column < blockSize; column++ )
+    {
+      tmp = 0;         // weighted pixel sum
+      srcCounter = 0;  // sum of the weights used
+      // above: last pixel row of the upper neighbor, weight shrinks with the row index
+      if ( src[4] != NULL )
+      {
+        weight = blockSize-row;
+        tmp += weight * (*(src[4]+bmax*frameWidth+column));
+        srcCounter += weight;
+      }
+      // left: last pixel column of the left neighbor, weight shrinks with the column index
+      if ( src[5] != NULL )
+      {
+        weight = blockSize-column;
+        tmp += weight * (*(src[5]+row*frameWidth+bmax));
+        srcCounter += weight;
+      }
+      // below: first pixel row of the lower neighbor, weight grows with the row index
+      if ( src[6] != NULL )
+      {
+        weight = row+1;
+        tmp += weight * (*(src[6]+column));
+        srcCounter += weight;
+      }
+      // right: first pixel column of the right neighbor, weight grows with the column index
+      if ( src[7] != NULL )
+      {
+        weight = column+1;
+        tmp += weight * (*(src[7]+row*frameWidth));
+        srcCounter += weight;
+      }
+
+      if ( srcCounter > 0 )  // cast to the pixel type itself, so nothing is cut off if imgpel is wider than byte
+        block[ k + column ] = (imgpel)(tmp/srcCounter);
+      else                   // no neighbor available: DC value, index 1 for 8-pixel blocks, else index 0
+        block[ k + column ] = blockSize == 8 ? p_Vid->dc_pred_value_comp[1] : p_Vid->dc_pred_value_comp[0];
+    }
+    k += frameWidth;
+  }
+}
diff --git a/Src/h264dec/ldecod/src/erc_do_p.c b/Src/h264dec/ldecod/src/erc_do_p.c
new file mode 100644
index 00000000..69727d2b
--- /dev/null
+++ b/Src/h264dec/ldecod/src/erc_do_p.c
@@ -0,0 +1,1742 @@
+
+/*!
+ *************************************************************************************
+ * \file
+ *      erc_do_p.c
+ *
+ * \brief
+ *      Inter (P) frame error concealment algorithms for decoder
+ *
+ *  \author
+ *      - Viktor Varsa                     <viktor.varsa@nokia.com>
+ *      - Ye-Kui Wang                      <wyk@ieee.org>
+ *      - Jill Boyce                       <jill.boyce@thomson.net>
+ *      - Saurav K Bandyopadhyay           <saurav@ieee.org>
+ *      - Zhenyu Wu                        <Zhenyu.Wu@thomson.net>
+ *      - Purvin Pandit                    <Purvin.Pandit@thomson.net>
+ *
+ *************************************************************************************
+ */
+
+#include "global.h"
+#include "mbuffer.h"
+#include "memalloc.h"
+#include "erc_do.h"
+#include "image.h"
+#include "mc_prediction.h"
+#include "macroblock.h"
+
+
+// forward declarations of the file-local (static) helpers defined further down
+static int concealByCopy(frame *recfr, int currMBNum, objectBuffer_t *object_list, int picSizeX);
+static int concealByTrial(frame *recfr, imgpel *predMB,
+                          int currMBNum, objectBuffer_t *object_list, int predBlocks[],
+                          int picSizeX, int picSizeY, int *yCondition);
+static int edgeDistortion (int predBlocks[], int currYBlockNum, imgpel *predMB,
+                           imgpel *recY, int picSizeX, int regionSize);
+static void copyBetweenFrames (frame *recfr, int currYBlockNum, int picSizeX, int regionSize);
+static void buildPredRegionYUV(VideoParameters *p_Vid, const short *mv, int x, int y, imgpel *predMB);
+
+// helpers for picture error concealment
+static void buildPredblockRegionYUV(VideoParameters *p_Vid, const short *mv,
+                                    int x, int y, imgpel *predMB, int list);
+static void CopyImgData(imgpel **inputY, imgpel ***inputUV, imgpel **outputY, imgpel ***outputUV, 
+                        int img_width, int img_height, int img_width_cr, int img_height_cr);
+
+static void copyPredMB (int currYBlockNum, imgpel *predMB, frame *recfr,
+                        int picSizeX, int regionSize);
+static void add_node   ( VideoParameters *p_Vid, struct concealment_node *ptr );
+static void delete_node( VideoParameters *p_Vid, struct concealment_node *ptr );
+
+static const int uv_div[2][4] = {{0, 1, 1, 0}, {0, 1, 0, 0}}; // right-shift from luma to chroma coordinates: [0]=x, [1]=y; second index is chroma_format_idc
+
+/*!
+ ************************************************************************
+ * \brief
+ *      The main function for Inter (P) frame concealment.
+ * \return
+ *      0, if errorVar is NULL or concealment is switched off (simple concealment should be used)
+ *      1, otherwise (even if none of the blocks were concealed)
+ * \param recfr
+ *      Reconstructed frame buffer
+ * \param object_list
+ *      Motion info for all MBs in the frame
+ * \param picSizeX
+ *      Width of the frame in pixels
+ * \param picSizeY
+ *      Height of the frame in pixels
+ * \param errorVar
+ *      Variables for error concealment; its yCondition array is scanned for corrupted/empty MBs
+ * \param chroma_format_idc
+ *      Chroma format IDC; only used here to size the temporary prediction buffer
+ ************************************************************************
+ */
+int ercConcealInterFrame(frame *recfr, objectBuffer_t *object_list,
+                         int picSizeX, int picSizeY, ercVariables_t *errorVar, int chroma_format_idc )
+{
+  VideoParameters *p_Vid = recfr->p_Vid;
+  int lastColumn = 0, lastRow = 0, predBlocks[8];
+  int lastCorruptedRow = -1, firstCorruptedRow = -1;
+  int currRow = 0, row, column, columnInd, areaHeight = 0, i = 0;
+  imgpel *predMB;
+
+  /* if concealment is on */
+  if ( errorVar && errorVar->concealment )
+  {
+    /* if there are segments to be concealed */
+    if ( errorVar->nOfCorruptedSegments )
+    {
+      /* scratch buffer for one predicted MB: 256 luma values plus, unless monochrome, two chroma blocks */
+      if (chroma_format_idc != YUV400)
+        predMB = (imgpel *) malloc ( (256 + (p_Vid->mb_cr_size_x * p_Vid->mb_cr_size_y)*2) * sizeof (imgpel));
+      else
+        predMB = (imgpel *) malloc(256 * sizeof (imgpel));
+
+      if ( predMB == NULL ) no_mem_exit("ercConcealInterFrame: predMB");
+
+      lastRow = (int) (picSizeY>>4);      /* number of MB rows */
+      lastColumn = (int) (picSizeX>>4);   /* number of MB columns */
+
+      for ( columnInd = 0; columnInd < lastColumn; columnInd ++)
+      {
+        column = ((columnInd%2) ? (lastColumn - columnInd/2 -1) : (columnInd/2)); /* visit columns from both picture edges inwards: 0, last, 1, last-1, ... */
+
+        for ( row = 0; row < lastRow; row++)
+        {
+
+          if ( errorVar->yCondition[MBxy2YBlock(column, row, 0, picSizeX)] <= ERC_BLOCK_CORRUPTED )
+          {                           // ERC_BLOCK_CORRUPTED (1) or ERC_BLOCK_EMPTY (0)
+            firstCorruptedRow = row;
+            /* find the last row which has corrupted blocks (in same continuous area) */
+            for ( lastCorruptedRow = row+1; lastCorruptedRow < lastRow; lastCorruptedRow++)
+            {
+              /* check blocks in the current column */
+              if (errorVar->yCondition[MBxy2YBlock(column, lastCorruptedRow, 0, picSizeX)] > ERC_BLOCK_CORRUPTED)
+              {
+                /* current one is already OK, so the last was the previous one */
+                lastCorruptedRow --;
+                break;
+              }
+            }
+            if ( lastCorruptedRow >= lastRow ) /* no break above: the corrupted area reaches the bottom row */
+            {
+              /* correct only from above */
+              lastCorruptedRow = lastRow-1;
+              for ( currRow = firstCorruptedRow; currRow < lastRow; currRow++ )
+              {
+
+                ercCollect8PredBlocks (predBlocks, (currRow<<1), (column<<1),
+                  errorVar->yCondition, (lastRow<<1), (lastColumn<<1), 2, 0);
+
+                if(p_Vid->erc_mvperMB >= MVPERMB_THR) /* at or above the threshold try neighbour MVs, otherwise plain copy */
+                  concealByTrial(recfr, predMB,
+                    currRow*lastColumn+column, object_list, predBlocks,
+                    picSizeX, picSizeY,
+                    errorVar->yCondition);
+                else
+                  concealByCopy(recfr, currRow*lastColumn+column,
+                    object_list, picSizeX);
+
+                ercMarkCurrMBConcealed (currRow*lastColumn+column, -1, picSizeX, errorVar);
+              }
+              row = lastRow; /* nothing left in this column: ends the row loop */
+            }
+            else if ( firstCorruptedRow == 0 )
+            {
+              /* correct only from below */
+              for ( currRow = lastCorruptedRow; currRow >= 0; currRow-- )
+              {
+
+                ercCollect8PredBlocks (predBlocks, (currRow<<1), (column<<1),
+                  errorVar->yCondition, (lastRow<<1), (lastColumn<<1), 2, 0);
+
+                if(p_Vid->erc_mvperMB >= MVPERMB_THR)
+                  concealByTrial(recfr, predMB,
+                    currRow*lastColumn+column, object_list, predBlocks,
+                    picSizeX, picSizeY,
+                    errorVar->yCondition);
+                else
+                  concealByCopy(recfr, currRow*lastColumn+column,
+                    object_list, picSizeX);
+
+                ercMarkCurrMBConcealed (currRow*lastColumn+column, -1, picSizeX, errorVar);
+              }
+
+              row = lastCorruptedRow+1; /* this row was found OK above; the loop's row++ moves past it */
+            }
+            else
+            {
+              /* correct bi-directionally */
+
+              row = lastCorruptedRow+1; /* set before the loop below modifies lastCorruptedRow */
+
+              areaHeight = lastCorruptedRow-firstCorruptedRow+1;
+
+              /*
+              *  Conceal the corrupted area switching between the up and the bottom rows
+              */
+              for ( i = 0; i < areaHeight; i++)
+              {
+                if ( i % 2 )
+                {
+                  currRow = lastCorruptedRow; /* odd step: take the lowest remaining row */
+                  lastCorruptedRow --;
+                }
+                else
+                {
+                  currRow = firstCorruptedRow; /* even step: take the highest remaining row */
+                  firstCorruptedRow ++;
+                }
+
+                ercCollect8PredBlocks (predBlocks, (currRow<<1), (column<<1),
+                  errorVar->yCondition, (lastRow<<1), (lastColumn<<1), 2, 0);
+
+                if(p_Vid->erc_mvperMB >= MVPERMB_THR)
+                  concealByTrial(recfr, predMB,
+                    currRow*lastColumn+column, object_list, predBlocks,
+                    picSizeX, picSizeY,
+                    errorVar->yCondition);
+                else
+                  concealByCopy(recfr, currRow*lastColumn+column,
+                    object_list, picSizeX);
+
+                ercMarkCurrMBConcealed (currRow*lastColumn+column, -1, picSizeX, errorVar);
+
+              }
+            }
+            lastCorruptedRow = -1;
+            firstCorruptedRow = -1;
+          }
+        }
+      }
+
+      free(predMB);
+    }
+    return 1;
+  }
+  else
+    return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Conceals one MB by copying the co-located 16x16 pixel area from the reference
+ *      picture (via copyBetweenFrames) and records the MB in object_list as a
+ *      REGMODE_INTER_COPY region. This corresponds to COPY MBs.
+ * \return
+ *      Always zero (0).
+ * \param recfr
+ *      Reconstructed frame buffer
+ * \param currMBNum
+ *      current MB index
+ * \param object_list
+ *      Motion info for all MBs in the frame
+ * \param picSizeX
+ *      Width of the frame in pixels
+ ************************************************************************
+ */
+static int concealByCopy(frame *recfr, int currMBNum,
+                         objectBuffer_t *object_list, int picSizeX)
+{
+  /* object_list is indexed with four entries per MB; the first one is filled in here */
+  objectBuffer_t *mbRegion = &object_list[currMBNum << 2];
+
+  mbRegion->regionMode = REGMODE_INTER_COPY;
+  mbRegion->xMin = (xPosMB(currMBNum,picSizeX) << 4);
+  mbRegion->yMin = (yPosMB(currMBNum,picSizeX) << 4);
+
+  /* copy the whole 16x16 region, addressed by the MB's first Y block */
+  copyBetweenFrames (recfr, MBNum2YBlock(currMBNum,0,picSizeX), picSizeX, 16);
+
+  return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Copies the co-located pixel values from the reference picture p_Vid->listX[0][0]
+ *      into the planes of recfr. Chroma is skipped for YUV400. Used by concealByCopy
+ * \param recfr
+ *      Reconstructed frame buffer (destination: yptr, uptr, vptr)
+ * \param currYBlockNum
+ *      index of the block (8x8) in the Y plane
+ * \param picSizeX
+ *      Width of the frame in pixels
+ * \param regionSize
+ *      can be 16 or 8 to tell the dimension of the region to copy
+ ************************************************************************
+ */
+static void copyBetweenFrames (frame *recfr, int currYBlockNum, int picSizeX, int regionSize)
+{
+  VideoParameters *p_Vid = recfr->p_Vid;
+  StorablePicture *refPic = p_Vid->listX[0][0];
+  int chroma_format = p_Vid->dec_picture->chroma_format_idc;
+  int uv_x = uv_div[0][chroma_format];   /* horizontal luma-to-chroma shift */
+  int uv_y = uv_div[1][chroma_format];   /* vertical luma-to-chroma shift */
+  int j, k, location, xmin, ymin;
+
+  /* set the position of the region to be copied */
+  xmin = (xPosYBlock(currYBlockNum,picSizeX)<<3);
+  ymin = (yPosYBlock(currYBlockNum,picSizeX)<<3);
+
+  for (j = ymin; j < ymin + regionSize; j++)
+    for (k = xmin; k < xmin + regionSize; k++)
+    {
+      location = j * picSizeX + k;
+      recfr->yptr[location] = refPic->imgY->img[j][k];
+    }
+
+  /* monochrome pictures carry no chroma samples: stop after luma (same guard as in copyPredMB) */
+  if (chroma_format == YUV400)
+    return;
+  for (j = ymin >> uv_y; j < (ymin + regionSize) >> uv_y; j++)
+    for (k = xmin >> uv_x; k < (xmin + regionSize) >> uv_x; k++)
+    {
+      location = ((j * picSizeX) >> uv_x) + k;   /* chroma row stride is picSizeX >> uv_x */
+      recfr->uptr[location] = refPic->imgUV[0]->img[j][k];
+      recfr->vptr[location] = refPic->imgUV[1]->img[j][k];
+    }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      It conceals a given MB by using the motion vectors of one reliable neighbor. That MV of a
+ *      neighbor is selected which gives the lowest pixel difference at the edges of the MB
+ *      (see function edgeDistortion). This corresponds to a spatial smoothness criterion.
+ * \return
+ *      Always zero (0).
+ * \param recfr
+ *      Reconstructed frame buffer
+ * \param predMB
+ *      memory area for storing temporary pixel values for a macroblock
+ *      the Y,U,V planes are concatenated y = predMB, u = predMB+256, v = predMB+320
+ * \param currMBNum
+ *      current MB index
+ * \param object_list
+ *      array of region structures storing region mode and mv for each region
+ * \param predBlocks
+ *      status array of the neighboring blocks (if they are OK, concealed or lost)
+ * \param picSizeX
+ *      Width of the frame in pixels
+ * \param picSizeY
+ *      Height of the frame in pixels (not referenced in the function body)
+ * \param yCondition
+ *      array for conditions of Y blocks from ercVariables_t; the concealed block is marked here
+ ************************************************************************
+ */
+static int concealByTrial(frame *recfr, imgpel *predMB,
+                          int currMBNum, objectBuffer_t *object_list, int predBlocks[],
+                          int picSizeX, int picSizeY, int *yCondition)
+{
+  VideoParameters *p_Vid = recfr->p_Vid;
+  int predMBNum = 0, numMBPerLine,
+      compSplit1 = 0, compSplit2 = 0, compLeft = 1, comp = 0, compPred, order = 1, // compLeft = 1: the outer block loop runs once
+      fInterNeighborExists, numIntraNeighbours,
+      fZeroMotionChecked, predSplitted = 0,
+      threshold = ERC_BLOCK_OK,
+      minDist, currDist, i, k, bestDir;
+  int regionSize;
+  objectBuffer_t *currRegion;
+  short mvBest[3] = {0, 0, 0}, mvPred[3] = {0, 0, 0}, *mvptr;
+
+  numMBPerLine = (int) (picSizeX>>4);
+
+  p_Vid->current_mb_nr = currMBNum; // read back by buildPredRegionYUV to find the current MB
+
+  comp = 0;
+  regionSize = 16;
+
+  do
+  { /* 4 blocks loop */
+
+    currRegion = object_list+(currMBNum<<2)+comp;
+
+    /* set the position of the region to be concealed */
+
+    currRegion->xMin = (xPosYBlock(MBNum2YBlock(currMBNum,comp,picSizeX),picSizeX)<<3);
+    currRegion->yMin = (yPosYBlock(MBNum2YBlock(currMBNum,comp,picSizeX),picSizeX)<<3);
+
+    do
+    { /* reliability loop */
+
+      minDist = 0;
+      fInterNeighborExists = 0;
+      numIntraNeighbours = 0;
+      fZeroMotionChecked = 0;
+
+      /* loop the 4 neighbours */
+      for (i = 4; i < 8; i++)
+      {
+
+        /* if reliable, try it */
+        if (predBlocks[i] >= threshold)
+        {
+          switch (i)
+          {
+          case 4: // MB above: its two blocks 2 and 3 are examined
+            predMBNum = currMBNum-numMBPerLine;
+            compSplit1 = 2;
+            compSplit2 = 3;
+            break;
+
+          case 5: // MB to the left: blocks 1 and 3
+            predMBNum = currMBNum-1;
+            compSplit1 = 1;
+            compSplit2 = 3;
+            break;
+
+          case 6: // MB below: blocks 0 and 1
+            predMBNum = currMBNum+numMBPerLine;
+            compSplit1 = 0;
+            compSplit2 = 1;
+            break;
+
+          case 7: // MB to the right: blocks 0 and 2
+            predMBNum = currMBNum+1;
+            compSplit1 = 0;
+            compSplit2 = 2;
+            break;
+          }
+
+          /* try the concealment with the Motion Info of the current neighbour
+          only try if the neighbour is not Intra */
+          if (isBlock(object_list,predMBNum,compSplit1,INTRA) ||
+            isBlock(object_list,predMBNum,compSplit2,INTRA))
+          {
+            numIntraNeighbours++;
+          }
+          else
+          {
+            /* if neighbour MB is split, try both neighbour blocks */
+            for (predSplitted = isSplitted(object_list, predMBNum),
+              compPred = compSplit1;
+              predSplitted >= 0;
+              compPred = compSplit2,
+              predSplitted -= ((compSplit1 == compSplit2) ? 2 : 1))
+            {
+
+              /* if Zero Motion Block, do the copying. This option is tried only once */
+              if (isBlock(object_list, predMBNum, compPred, INTER_COPY))
+              {
+
+                if (fZeroMotionChecked)
+                {
+                  continue;
+                }
+                else
+                {
+                  fZeroMotionChecked = 1;
+
+                  mvPred[0] = mvPred[1] = 0;
+                  mvPred[2] = 0;
+
+                  buildPredRegionYUV(p_Vid->erc_img, mvPred, currRegion->xMin, currRegion->yMin, predMB);
+                }
+              }
+              /* build motion using the neighbour's Motion Parameters */
+              else if (isBlock(object_list,predMBNum,compPred,INTRA))
+              {
+                continue;
+              }
+              else
+              {
+                mvptr = getParam(object_list, predMBNum, compPred, mv);
+
+                mvPred[0] = mvptr[0];
+                mvPred[1] = mvptr[1];
+                mvPred[2] = mvptr[2];
+
+                buildPredRegionYUV(p_Vid->erc_img, mvPred, currRegion->xMin, currRegion->yMin, predMB);
+              }
+
+              /* measure absolute boundary pixel difference */
+              currDist = edgeDistortion(predBlocks,
+                MBNum2YBlock(currMBNum,comp,picSizeX),
+                predMB, recfr->yptr, picSizeX, regionSize);
+
+              /* if so far best -> store the pixels as the best concealment */
+              if (currDist < minDist || !fInterNeighborExists)
+              {
+
+                minDist = currDist;
+                bestDir = i; // NOTE: bestDir is written here but never read in this function
+
+                for (k=0;k<3;k++)
+                  mvBest[k] = mvPred[k];
+
+                currRegion->regionMode =
+                  (isBlock(object_list, predMBNum, compPred, INTER_COPY)) ?
+                  ((regionSize == 16) ? REGMODE_INTER_COPY : REGMODE_INTER_COPY_8x8) :
+                  ((regionSize == 16) ? REGMODE_INTER_PRED : REGMODE_INTER_PRED_8x8);
+
+                copyPredMB(MBNum2YBlock(currMBNum,comp,picSizeX), predMB, recfr,
+                  picSizeX, regionSize);
+              }
+
+              fInterNeighborExists = 1;
+            }
+          }
+        }
+    } /* end of the loop over the 4 neighbours */
+
+    threshold--; /* next pass also accepts less reliable neighbours */
+
+    } while ((threshold >= ERC_BLOCK_CONCEALED) && (fInterNeighborExists == 0));
+
+    /* always try zero motion */
+    if (!fZeroMotionChecked)
+    {
+      mvPred[0] = mvPred[1] = 0;
+      mvPred[2] = 0;
+
+      buildPredRegionYUV(p_Vid->erc_img, mvPred, currRegion->xMin, currRegion->yMin, predMB);
+
+      currDist = edgeDistortion(predBlocks,
+        MBNum2YBlock(currMBNum,comp,picSizeX),
+        predMB, recfr->yptr, picSizeX, regionSize);
+
+      if (currDist < minDist || !fInterNeighborExists)
+      {
+
+        minDist = currDist;
+        for (k=0;k<3;k++)
+          mvBest[k] = mvPred[k];
+
+        currRegion->regionMode =
+          ((regionSize == 16) ? REGMODE_INTER_COPY : REGMODE_INTER_COPY_8x8);
+
+        copyPredMB(MBNum2YBlock(currMBNum,comp,picSizeX), predMB, recfr,
+          picSizeX, regionSize);
+      }
+    }
+
+    for (i=0; i<3; i++)
+      currRegion->mv[i] = mvBest[i]; /* remember the chosen motion for this region */
+
+    yCondition[MBNum2YBlock(currMBNum,comp,picSizeX)] = ERC_BLOCK_CONCEALED;
+    comp = (comp+order+4)%4;
+    compLeft--;
+
+    } while (compLeft);
+
+    return 0;
+}
+
+/*!
+************************************************************************
+* \brief
+*      Builds the motion prediction pixels from the given location (in 1/4 pixel units)
+*      of the reference frame. It not only copies the pixel values but builds the interpolation
+*      when the pixel positions to be copied from is not full pixel (any 1/4 pixel position).
+*      It copies the resulting pixel values into predMB.
+* \param p_Vid
+*      The pointer of img_par structure of current frame
+* \param mv
+*      The pointer of the predicted MV of the current (being concealed) MB; mv[2] is the reference index
+* \param x
+*      The x-coordinate of the above-left corner pixel of the current MB
+* \param y
+*      The y-coordinate of the above-left corner pixel of the current MB
+* \param predMB
+*      memory area for storing temporary pixel values for a macroblock
+*      the Y,U,V planes are concatenated y = predMB, u = predMB+256, v = predMB+320
+************************************************************************
+*/
+static void buildPredRegionYUV(VideoParameters *p_Vid, const short *mv, int x, int y, imgpel *predMB)
+{
+  int i=0, j=0, ii=0, jj=0,i1=0,j1=0,j4=0,i4=0;
+  int jf=0;
+  int uv;
+  int vec1_x=0,vec1_y=0;
+  int ioff,joff;
+  imgpel *pMB = predMB;
+
+  StorablePicture *dec_picture = p_Vid->dec_picture;
+  int ii0,jj0,ii1,jj1,if1,jf1,if0,jf0;
+  int mv_mul;
+
+  //FRExt
+  int f1_x, f1_y, f2_x, f2_y, f3, f4, ifx;
+  int b8, b4;
+  int yuv = dec_picture->chroma_format_idc - 1;
+
+  int ref_frame = imax (mv[2], 0); // !!KS: quick fix, we sometimes seem to get negative ref_pic here, so restrict to zero and above
+  int mb_nr = p_Vid->current_mb_nr;
+  
+  Macroblock *currMB = &p_Vid->mb_data[mb_nr];   // initialization code deleted, see below, StW
+  Slice *currSlice = currMB->p_Slice;
+
+  h264_imgpel_macroblock_t tmp_block;
+
+  /* Update coordinates of the current concealed macroblock */
+  p_Vid->mb_x = x/MB_BLOCK_SIZE;
+  p_Vid->mb_y = y/MB_BLOCK_SIZE;
+  p_Vid->block_y = p_Vid->mb_y * BLOCK_SIZE;
+  p_Vid->pix_c_y = p_Vid->mb_y * p_Vid->mb_cr_size_y;
+  p_Vid->block_x = p_Vid->mb_x * BLOCK_SIZE;
+  p_Vid->pix_c_x = p_Vid->mb_x * p_Vid->mb_cr_size_x;
+
+  mv_mul=4;
+
+  // luma: predict the MB block by block into currSlice->mb_pred[LumaComp]
+
+  for(j=0;j<MB_BLOCK_SIZE/BLOCK_SIZE;j++)
+  {
+    joff=j*4;
+    j4=p_Vid->block_y+j;
+    for(i=0;i<MB_BLOCK_SIZE/BLOCK_SIZE;i++)
+    {
+      ioff=i*4;
+      i4=p_Vid->block_x+i;
+
+      vec1_x = i4*4*mv_mul + mv[0]; // NOTE: vec1_x / vec1_y are computed but not used afterwards
+      vec1_y = j4*4*mv_mul + mv[1];
+
+      get_block_luma(currMB, PLANE_Y, p_Vid->listX[0][ref_frame], i4, j4, mv, BLOCK_SIZE, BLOCK_SIZE, tmp_block);  
+
+      for(ii=0;ii<BLOCK_SIZE;ii++)
+        for(jj=0;jj<MB_BLOCK_SIZE/BLOCK_SIZE;jj++)
+          currSlice->mb_pred[LumaComp][jj+joff][ii+ioff]=tmp_block[jj][ii];
+    }
+  }
+
+  // copy the 16x16 luma prediction into predMB, row by row
+  for (j = 0; j < 16; j++)
+  {
+    for (i = 0; i < 16; i++)
+    {
+      pMB[j*16+i] = currSlice->mb_pred[LumaComp][j][i];
+    }
+  }
+  pMB += 256;
+
+  if (dec_picture->chroma_format_idc != YUV400)
+  {
+    // chroma *******************************************************
+    f1_x = 64/p_Vid->mb_cr_size_x; // sub-sample positions per chroma sample; f1_x*f1_y is the divisor of the weighted sum below
+    f2_x=f1_x-1;
+
+    f1_y = 64/p_Vid->mb_cr_size_y;
+    f2_y=f1_y-1;
+
+    f3=f1_x*f1_y;
+    f4=f3>>1; // rounding offset added before dividing by f3
+
+    for(uv=0;uv<2;uv++)
+    {
+      for (b8=0;b8<(p_Vid->num_uv_blocks);b8++)
+      {
+        for(b4=0;b4<4;b4++)
+        {
+          joff = subblk_offset_y[yuv][b8][b4];
+          j4=p_Vid->pix_c_y+joff;
+          ioff = subblk_offset_x[yuv][b8][b4];
+          i4=p_Vid->pix_c_x+ioff;
+
+          for(jj=0;jj<4;jj++)
+          {
+            jf=(j4+jj)/(p_Vid->mb_cr_size_y/4);     // jf  = Subblock_y-coordinate (not used afterwards)
+            for(ii=0;ii<4;ii++)
+            {
+              ifx=(i4+ii)/(p_Vid->mb_cr_size_x/4);  // ifx = Subblock_x-coordinate (not used afterwards)
+
+              i1=(i4+ii)*f1_x + mv[0];
+              j1=(j4+jj)*f1_y + mv[1];
+
+              ii0=iClip3 (0, dec_picture->size_x_cr-1, i1/f1_x); // the four surrounding chroma samples, clipped to the picture
+              jj0=iClip3 (0, dec_picture->size_y_cr-1, j1/f1_y);
+              ii1=iClip3 (0, dec_picture->size_x_cr-1, ((i1+f2_x)/f1_x));
+              jj1=iClip3 (0, dec_picture->size_y_cr-1, ((j1+f2_y)/f1_y));
+
+              if1=(i1 & f2_x); // fractional parts and their complements: the four sample weights
+              jf1=(j1 & f2_y);
+              if0=f1_x-if1;
+              jf0=f1_y-jf1;
+
+              currSlice->mb_pred[uv + 1][jj+joff][ii+ioff]=(if0*jf0*p_Vid->listX[0][ref_frame]->imgUV[uv]->img[jj0][ii0]+
+                if1*jf0*p_Vid->listX[0][ref_frame]->imgUV[uv]->img[jj0][ii1]+
+                if0*jf1*p_Vid->listX[0][ref_frame]->imgUV[uv]->img[jj1][ii0]+
+                if1*jf1*p_Vid->listX[0][ref_frame]->imgUV[uv]->img[jj1][ii1]+f4)/f3;
+            }
+          }
+        }
+      }
+
+      for (j = 0; j < 8; j++) // NOTE: always copies an 8x8 area (64 values) per chroma plane, whatever mb_cr_size_x/y are
+      {
+        for (i = 0; i < 8; i++)
+        {
+          pMB[j*8+i] = currSlice->mb_pred[uv + 1][j][i];
+        }
+      }
+      pMB += 64;
+
+    }
+  }
+}
+/*!
+ ************************************************************************
+ * \brief
+ *      Copies the predicted pixel values stored in predMB into the picture being decoded
+ *      (p_Vid->dec_picture) at the position of the given block: luma always, chroma unless the
+ *      picture is YUV400.
+ * \param currYBlockNum
+ *      index of the block (8x8) in the Y plane
+ * \param predMB
+ *      memory area where the temporary pixel values are stored
+ *      the Y,U,V planes are concatenated y = predMB, u = predMB+256, v = predMB+320
+ * \param recfr
+ *      pointer to a YUV frame; only used to reach p_Vid
+ * \param picSizeX
+ *      picture width in pixels
+ * \param regionSize
+ *      can be 16 or 8 to tell the dimension of the region to copy
+ ************************************************************************
+ */
+static void copyPredMB (int currYBlockNum, imgpel *predMB, frame *recfr,
+                        int picSizeX, int regionSize)
+{
+  VideoParameters *p_Vid = recfr->p_Vid;
+  StorablePicture *dec_picture = p_Vid->dec_picture;
+  int j, k, xmin, ymin, xmax, ymax;
+  int locationTmp;
+  int uv_x = uv_div[0][dec_picture->chroma_format_idc];
+  int uv_y = uv_div[1][dec_picture->chroma_format_idc];
+
+  xmin = (xPosYBlock(currYBlockNum,picSizeX)<<3);
+  ymin = (yPosYBlock(currYBlockNum,picSizeX)<<3);
+  xmax = xmin + regionSize -1;
+  ymax = ymin + regionSize -1;
+
+  for (j = ymin; j <= ymax; j++)
+  {
+    for (k = xmin; k <= xmax; k++)
+    {
+      /* luma is read from predMB row by row with a fixed stride of 16 */
+      locationTmp = (j-ymin) * 16 + (k-xmin);
+      dec_picture->imgY->img[j][k] = predMB[locationTmp];
+    }
+  }
+
+  if (dec_picture->chroma_format_idc != YUV400)
+  {
+    for (j = (ymin>>uv_y); j <= (ymax>>uv_y); j++)
+    {
+      for (k = (xmin>>uv_x); k <= (xmax>>uv_x); k++)
+      {
+        /* offsets are taken relative to the loop start values, so the column must use uv_x (not a fixed >>1) */
+        locationTmp = (j-(ymin>>uv_y)) * p_Vid->mb_cr_size_x + (k-(xmin>>uv_x)) + 256;
+        dec_picture->imgUV[0]->img[j][k] = predMB[locationTmp];
+        /* the V values are read 64 entries after the U values */
+        locationTmp += 64;
+
+        dec_picture->imgUV[1]->img[j][k] = predMB[locationTmp];
+      }
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Measures how well the luma block stored in predMB would fit into the frame at block
+ *      position currYBlockNum. For every usable neighbour (above, left, below, right) the absolute
+ *      differences between the edge pixels of predMB and the adjacent pixels of recY are summed;
+ *      the sum is then divided by the number of edges that contributed. Correctly received
+ *      neighbours (status stored in predBlocks) are used first; only if there are none, a second
+ *      pass with a lowered threshold also accepts already concealed neighbours. This
+ *      "edge distortion" is the spatial smoothness measure used by concealByTrial.
+ * \return
+ *      The averaged boundary pixel difference, or 0 if no neighbour could be used.
+ * \param predBlocks
+ *      status array of the neighboring blocks (if they are OK, concealed or lost)
+ * \param currYBlockNum
+ *      index of the block (8x8) in the Y plane
+ * \param predMB
+ *      memory area where the temporary pixel values are stored
+ *      the Y,U,V planes are concatenated y = predMB, u = predMB+256, v = predMB+320
+ * \param recY
+ *      pointer to a Y plane of a YUV frame
+ * \param picSizeX
+ *      picture width in pixels
+ * \param regionSize
+ *      can be 16 or 8 to tell the dimension of the region to compare
+ ************************************************************************
+ */
+static int edgeDistortion (int predBlocks[], int currYBlockNum, imgpel *predMB,
+                           imgpel *recY, int picSizeX, int regionSize)
+{
+  /* top-left pixel of the region inside the luma plane recY */
+  imgpel *region = recY + (yPosYBlock(currYBlockNum,picSizeX)<<3)*picSizeX + (xPosYBlock(currYBlockNum,picSizeX)<<3);
+  int minStatus = ERC_BLOCK_OK;   /* lowest neighbour status accepted in the current pass */
+  int sum, usedSides;             /* accumulated difference / number of edges that contributed */
+  int side, n;
+
+  do
+  {
+    sum = 0;
+    usedSides = 0;
+
+    /* predBlocks[4..7] describe the above, left, below and right neighbours */
+    for (side = 4; side < 8; side++)
+    {
+      imgpel *edge;       /* first predMB pixel on the edge facing this neighbour */
+      imgpel *outside;    /* first frame pixel just outside the region on that side */
+      int edgeStep;       /* distance between consecutive edge pixels in predMB (row stride 16) */
+      int outsideStep;    /* distance between consecutive neighbour pixels in the frame */
+
+      /* not reliable enough for this pass */
+      if (predBlocks[side] < minStatus)
+        continue;
+
+      if (side == 4)
+      {
+        /* above: first predMB row against the frame row right above the region */
+        edge = predMB;
+        edgeStep = 1;
+        outside = region - picSizeX;
+        outsideStep = 1;
+      }
+      else if (side == 5)
+      {
+        /* left: first predMB column against the frame column left of the region */
+        edge = predMB;
+        edgeStep = 16;
+        outside = region - 1;
+        outsideStep = picSizeX;
+      }
+      else if (side == 6)
+      {
+        /* below: last predMB row of the region against the frame row underneath */
+        edge = predMB + (regionSize-1)*16;
+        edgeStep = 1;
+        outside = region + regionSize*picSizeX;
+        outsideStep = 1;
+      }
+      else
+      {
+        /* right: last predMB column of the region against the frame column to its right */
+        edge = predMB + (regionSize-1);
+        edgeStep = 16;
+        outside = region + regionSize;
+        outsideStep = picSizeX;
+      }
+
+      for (n = 0; n < regionSize; n++)
+        sum += iabs((int)(edge[n*edgeStep] - outside[n*outsideStep]));
+
+      usedSides++;
+    }
+
+    minStatus--;
+  } while (minStatus >= ERC_BLOCK_CONCEALED && usedSides == 0);
+
+  /* no usable neighbour at all: report zero distortion instead of dividing by zero */
+  return (usedSides == 0) ? 0 : (sum / usedSides);
+}
+
+// picture error concealment below
+
+/*!
+************************************************************************
+* \brief
+*    Builds the prediction of one 4x4 luma block (plus 2x2 chroma samples per plane) from reference
+*    picture listX[list][mv[2]], displaced by mv[0]/mv[1], and stores it in predMB as 16 luma values
+*    followed by 4 U and 4 V values. A negative mv[2] is clamped to 0, as in buildPredRegionYUV.
+* \param p_Vid, mv, x, y, predMB, list  decoder state, {mvx, mvy, ref idx}, block position, output, ref list
+*************************************************************************
+*/
+static void buildPredblockRegionYUV(VideoParameters *p_Vid, const short *mv,
+                                    int x, int y, imgpel *predMB, int list)
+{
+  int i=0,j=0,ii=0,jj=0,i1=0,j1=0,j4=0,i4=0;
+  int jf=0;
+  int uv;
+  int vec1_x=0,vec1_y=0;
+  int ioff,joff;
+
+  StorablePicture *dec_picture = p_Vid->dec_picture;
+  imgpel *pMB = predMB;
+
+  int ii0,jj0,ii1,jj1,if1,jf1,if0,jf0;
+  int mv_mul;
+
+  //FRExt
+  int f1_x, f1_y, f2_x, f2_y, f3, f4, ifx;
+  int yuv = dec_picture->chroma_format_idc - 1;
+
+  int ref_frame = imax (mv[2], 0); // a negative reference index must never be used to index listX[list][]
+  int mb_nr = p_Vid->current_mb_nr;
+  
+  Macroblock *currMB = &p_Vid->mb_data[mb_nr];   // initialization code deleted, see below, StW
+  Slice *currSlice = currMB->p_Slice;
+
+  h264_imgpel_macroblock_t tmp_block;
+
+  /* Update coordinates of the current concealed macroblock */
+
+  p_Vid->mb_x = x/BLOCK_SIZE;
+  p_Vid->mb_y = y/BLOCK_SIZE;
+  p_Vid->block_y = p_Vid->mb_y * BLOCK_SIZE;
+  p_Vid->pix_c_y = p_Vid->mb_y * p_Vid->mb_cr_size_y/4;
+  p_Vid->block_x = p_Vid->mb_x * BLOCK_SIZE;
+  p_Vid->pix_c_x = p_Vid->mb_x * p_Vid->mb_cr_size_x/4;
+
+  mv_mul=4;
+
+  // luma *******************************************************
+
+  vec1_x = x*mv_mul + mv[0];
+  vec1_y = y*mv_mul + mv[1];
+  get_block_luma(currMB, PLANE_Y, p_Vid->listX[list][ref_frame], x,y, mv, BLOCK_SIZE, BLOCK_SIZE, tmp_block);
+
+  for(jj=0;jj<MB_BLOCK_SIZE/BLOCK_SIZE;jj++)
+    for(ii=0;ii<BLOCK_SIZE;ii++)
+      currSlice->mb_pred[LumaComp][jj][ii]=tmp_block[jj][ii];
+
+  // copy the 4x4 luma prediction into predMB, row by row
+  for (j = 0; j < 4; j++)
+  {
+    for (i = 0; i < 4; i++)
+    {
+      pMB[j*4+i] = currSlice->mb_pred[LumaComp][j][i];
+    }
+  }
+  pMB += 16;
+
+  if (dec_picture->chroma_format_idc != YUV400)
+  {
+    // chroma *******************************************************
+    f1_x = 64/(p_Vid->mb_cr_size_x);
+    f2_x=f1_x-1;
+
+    f1_y = 64/(p_Vid->mb_cr_size_y);
+    f2_y=f1_y-1;
+
+    f3=f1_x*f1_y;
+    f4=f3>>1; // rounding offset added before dividing by f3
+
+    for(uv=0;uv<2;uv++)
+    {
+      joff = subblk_offset_y[yuv][0][0];
+      j4=p_Vid->pix_c_y+joff;
+      ioff = subblk_offset_x[yuv][0][0];
+      i4=p_Vid->pix_c_x+ioff;
+
+      for(jj=0;jj<2;jj++)
+      {
+        jf=(j4+jj)/(p_Vid->mb_cr_size_y/4);     // jf  = Subblock_y-coordinate
+        for(ii=0;ii<2;ii++)
+        {
+          ifx=(i4+ii)/(p_Vid->mb_cr_size_x/4);  // ifx = Subblock_x-coordinate
+
+          i1=(i4+ii)*f1_x + mv[0];
+          j1=(j4+jj)*f1_y + mv[1];
+
+          ii0=iClip3 (0, dec_picture->size_x_cr-1, i1/f1_x); // the four surrounding chroma samples, clipped to the picture
+          jj0=iClip3 (0, dec_picture->size_y_cr-1, j1/f1_y);
+          ii1=iClip3 (0, dec_picture->size_x_cr-1, ((i1+f2_x)/f1_x));
+          jj1=iClip3 (0, dec_picture->size_y_cr-1, ((j1+f2_y)/f1_y));
+
+          if1=(i1 & f2_x); // fractional parts and their complements: the four sample weights
+          jf1=(j1 & f2_y);
+          if0=f1_x-if1;
+          jf0=f1_y-jf1;
+
+          currSlice->mb_pred[uv + 1][jj][ii]=(if0*jf0*p_Vid->listX[list][ref_frame]->imgUV[uv]->img[jj0][ii0]+
+            if1*jf0*p_Vid->listX[list][ref_frame]->imgUV[uv]->img[jj0][ii1]+
+            if0*jf1*p_Vid->listX[list][ref_frame]->imgUV[uv]->img[jj1][ii0]+
+            if1*jf1*p_Vid->listX[list][ref_frame]->imgUV[uv]->img[jj1][ii1]+f4)/f3;
+        }
+      }
+
+      for (j = 0; j < 2; j++) // copy the 2x2 chroma prediction of this plane behind the luma values
+      {
+        for (i = 0; i < 2; i++)
+        {
+          pMB[j*2+i] = currSlice->mb_pred[uv + 1][j][i];
+        }
+      }
+      pMB += 4;
+
+    }
+  }
+}
+
+/*!
+************************************************************************
+* \brief
+*    qsort comparator that orders StorablePicture pointers by pic_num,
+*    largest pic_num first.
+*
+* \param arg1
+*    address of the first array element (a StorablePicture pointer)
+* \param arg2
+*    address of the second array element (a StorablePicture pointer)
+* \return
+*    1 if the first pic_num is smaller, -1 if it is larger, 0 if equal
+************************************************************************
+*/
+static inline int compare_pic_by_pic_num_desc( const void *arg1, const void *arg2 )
+{
+  const int lhs = (*(StorablePicture**)arg1)->pic_num;
+  const int rhs = (*(StorablePicture**)arg2)->pic_num;
+
+  // each relational result is 0 or 1, so the difference is -1, 0 or 1
+  return (lhs < rhs) - (lhs > rhs);
+}
+
+/*!
+************************************************************************
+* \brief
+*    qsort comparator that orders StorablePicture pointers by
+*    long_term_pic_num, smallest long_term_pic_num first.
+*
+* \param arg1
+*    address of the first array element (a StorablePicture pointer)
+* \param arg2
+*    address of the second array element (a StorablePicture pointer)
+* \return
+*    -1 if the first long_term_pic_num is smaller, 1 if it is larger,
+*    0 if equal
+************************************************************************
+*/
+static inline int compare_pic_by_lt_pic_num_asc( const void *arg1, const void *arg2 )
+{
+  const int lhs = (*(StorablePicture**)arg1)->long_term_pic_num;
+  const int rhs = (*(StorablePicture**)arg2)->long_term_pic_num;
+
+  if (lhs == rhs)
+    return 0;
+  return (lhs < rhs) ? -1 : 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    qsort comparator that orders StorablePicture pointers by poc,
+*    smallest poc first.
+*
+* \param arg1
+*    address of the first array element (a StorablePicture pointer)
+* \param arg2
+*    address of the second array element (a StorablePicture pointer)
+* \return
+*    -1 if the first poc is smaller, 1 if it is larger, 0 if equal
+************************************************************************
+*/
+static inline int compare_pic_by_poc_asc( const void *arg1, const void *arg2 )
+{
+  const int lhs = (*(StorablePicture**)arg1)->poc;
+  const int rhs = (*(StorablePicture**)arg2)->poc;
+
+  // each relational result is 0 or 1, so the difference is -1, 0 or 1
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    qsort comparator that orders StorablePicture pointers by poc,
+*    largest poc first.
+*
+* \param arg1
+*    address of the first array element (a StorablePicture pointer)
+* \param arg2
+*    address of the second array element (a StorablePicture pointer)
+* \return
+*    1 if the first poc is smaller, -1 if it is larger, 0 if equal
+************************************************************************
+*/
+static inline int compare_pic_by_poc_desc( const void *arg1, const void *arg2 )
+{
+  const int lhs = (*(StorablePicture**)arg1)->poc;
+  const int rhs = (*(StorablePicture**)arg2)->poc;
+
+  if (lhs == rhs)
+    return 0;
+  return (lhs < rhs) ? 1 : -1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Copies one luma plane and two chroma planes sample by sample.
+*
+* \param inputY / outputY
+*    source / destination luma plane, indexed [row][column]
+* \param inputUV / outputUV
+*    source / destination chroma planes, indexed [plane][row][column]
+*    with plane 0 and plane 1
+* \param img_width, img_height
+*    number of luma columns / rows to copy
+* \param img_width_cr, img_height_cr
+*    number of chroma columns / rows to copy (applied to both planes)
+************************************************************************
+*/
+
+static void CopyImgData(imgpel **inputY, imgpel ***inputUV, imgpel **outputY, imgpel ***outputUV, 
+                        int img_width, int img_height, int img_width_cr, int img_height_cr)
+{
+  int row, col;
+
+  // luma plane
+  for (row = 0; row < img_height; ++row)
+  {
+    for (col = 0; col < img_width; ++col)
+    {
+      outputY[row][col] = inputY[row][col];
+    }
+  }
+
+  // chroma: both planes are written for a position before moving on
+  for (row = 0; row < img_height_cr; ++row)
+  {
+    for (col = 0; col < img_width_cr; ++col)
+    {
+      int plane;
+      for (plane = 0; plane < 2; ++plane)
+        outputUV[plane][row][col] = inputUV[plane][row][col];
+    }
+  }
+}
+
+/*!
+************************************************************************
+* \brief
+*    Searches the decoded picture buffer from the last used entry down to
+*    entry 0 and returns the first frame that is used for reference and is
+*    not long-term. Only frame stores with is_used == 3 are considered.
+*
+* \param p_Dpb
+*    decoded picture buffer to search
+* \return
+*    the matching frame, or NULL if no entry qualifies
+************************************************************************
+*/
+
+static StorablePicture* get_last_ref_pic_from_dpb(DecodedPictureBuffer *p_Dpb)
+{
+  int idx;
+
+  for (idx = p_Dpb->used_size - 1; idx >= 0; idx--)
+  {
+    StorablePicture *candidate;
+
+    if (p_Dpb->fs[idx]->is_used != 3)
+      continue;
+
+    candidate = p_Dpb->fs[idx]->frame;
+    if (candidate->used_for_reference && !candidate->is_long_term)
+      return candidate;
+  }
+
+  return NULL;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Prepares a concealment picture by copying picture-level header fields
+*    from a source picture. Only the fields assigned below are transferred;
+*    this function copies no sample or motion data.
+*
+* \param src
+*    picture to conceal from. Side effect: its slice_type is overwritten
+*    with p_Vid->conceal_slice_type and its
+*    adaptive_ref_pic_buffering_flag is cleared.
+* \param dst
+*    picture being concealed; receives the copied fields, with idr_flag
+*    forced to FALSE
+* \param p_Vid
+*    decoder state; conceal_slice_type is read and current_mb_nr is
+*    reset to 0
+************************************************************************
+*/
+
+static void copy_to_conceal(StorablePicture *src, StorablePicture *dst, VideoParameters *p_Vid)
+{
+  p_Vid->current_mb_nr = 0;
+
+  dst->PicSizeInMbs  = src->PicSizeInMbs;
+
+  // both pictures take the slice type chosen by the caller
+  dst->slice_type = src->slice_type = p_Vid->conceal_slice_type;
+
+  dst->idr_flag = FALSE; // since we do not want to clear the ref list
+
+  dst->no_output_of_prior_pics_flag = src->no_output_of_prior_pics_flag;
+  dst->long_term_reference_flag = src->long_term_reference_flag;
+  dst->adaptive_ref_pic_buffering_flag = src->adaptive_ref_pic_buffering_flag = 0;
+  dst->chroma_format_idc = src->chroma_format_idc;
+  dst->frame_mbs_only_flag = src->frame_mbs_only_flag;
+  dst->frame_cropping_flag = src->frame_cropping_flag;
+  dst->frame_cropping_rect_left_offset = src->frame_cropping_rect_left_offset;
+  dst->frame_cropping_rect_right_offset = src->frame_cropping_rect_right_offset;
+  dst->frame_cropping_rect_bottom_offset = src->frame_cropping_rect_bottom_offset;
+  dst->frame_cropping_rect_top_offset = src->frame_cropping_rect_top_offset;
+  dst->qp = src->qp;
+  dst->slice_qp_delta = src->slice_qp_delta;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Uses the last short-term reference picture in the dpb as the source
+*    for concealing a lost reference frame. Sets
+*    p_Vid->conceal_slice_type to P_SLICE and copies the header fields of
+*    that picture into the concealment picture.
+*
+* \param picture
+*    concealment picture to fill
+* \param p_Vid
+*    decoder state (provides the dpb)
+*
+* \note
+*    If the dpb holds no suitable reference picture the assert fires in
+*    debug builds; when asserts are compiled out the picture is left
+*    untouched instead of dereferencing a NULL source.
+************************************************************************
+*/
+
+static void
+copy_prev_pic_to_concealed_pic(StorablePicture *picture, VideoParameters *p_Vid)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+
+  /* get the last ref pic in dpb */
+  StorablePicture *ref_pic = get_last_ref_pic_from_dpb(p_Dpb);
+
+  assert(ref_pic != NULL);
+
+  p_Vid->conceal_slice_type = P_SLICE;
+
+  /* release builds: nothing to copy from, so do not dereference NULL */
+  if (ref_pic == NULL)
+    return;
+
+  /* copy the header fields of the reference to the concealment pic */
+  copy_to_conceal(ref_pic, picture, p_Vid);
+}
+
+
+/*!
+************************************************************************
+* \brief
+* This function conceals a missing reference frame. The routine is called
+* based on the difference in frame number. It conceals an IDR frame loss
+* based on the sudden decrease in frame number.
+*
+* For every frame number between the last decoded one and
+* p_Vid->frame_num a new picture is allocated, marked as a concealed
+* reference picture, filled via copy_prev_pic_to_concealed_pic() and
+* stored in the dpb.
+*
+* \param p_Vid
+*    decoder state. delta_pic_order_cnt[0..1] and frame_num are modified
+*    while the gap is filled and restored before returning;
+*    pre_frame_num, last_ref_pic_poc and ref_flag[] are updated.
+************************************************************************
+*/
+// TODO: benski> pass timecode
+void conceal_lost_frames(VideoParameters *p_Vid)
+{
+  int CurrFrameNum;
+  int UnusedShortTermFrameNum;
+  StorablePicture *picture = NULL;
+  // saved so they can be restored once the gap has been filled
+  int tmp1 = p_Vid->delta_pic_order_cnt[0];
+  int tmp2 = p_Vid->delta_pic_order_cnt[1];
+  int i;
+
+  p_Vid->delta_pic_order_cnt[0] = p_Vid->delta_pic_order_cnt[1] = 0;
+
+  // printf("A gap in frame number is found, try to fill it.\n");
+
+  if(p_Vid->IDR_concealment_flag == 1)
+  {
+    // Conceals an IDR frame loss. Uses the reference frame in the previous
+    // GOP for concealment.
+    UnusedShortTermFrameNum = 0;
+    p_Vid->last_ref_pic_poc = -p_Vid->poc_gap;
+    p_Vid->earlier_missing_poc = 0;
+  }
+  else
+    UnusedShortTermFrameNum = (p_Vid->pre_frame_num + 1) % p_Vid->MaxFrameNum;
+
+  CurrFrameNum = p_Vid->frame_num;
+
+  // one concealed picture per missing frame number, wrapping at MaxFrameNum
+  while (CurrFrameNum != UnusedShortTermFrameNum)
+  {
+    // NOTE(review): the result is used without a NULL check -- confirm
+    // that alloc_storable_picture cannot return NULL
+    picture = alloc_storable_picture (p_Vid, FRAME, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+
+    picture->coded_frame = 1;
+    picture->pic_num = UnusedShortTermFrameNum;
+    picture->frame_num = UnusedShortTermFrameNum;
+    picture->non_existing = 0;
+    picture->is_output = 0;
+    picture->used_for_reference = 1;
+    picture->concealed_pic = 1;
+
+    picture->adaptive_ref_pic_buffering_flag = 0;
+
+    p_Vid->frame_num = UnusedShortTermFrameNum;
+
+    // POC continues from the previous reference picture in steps of ref_poc_gap
+    picture->top_poc=p_Vid->last_ref_pic_poc + p_Vid->ref_poc_gap;
+    picture->bottom_poc=picture->top_poc;
+    picture->frame_poc=picture->top_poc;
+    picture->poc=picture->top_poc;
+    p_Vid->last_ref_pic_poc = picture->poc;
+
+    copy_prev_pic_to_concealed_pic(picture, p_Vid);
+
+    //if (UnusedShortTermFrameNum == 0)
+    // the flag is not cleared in this function, so while it is 1 this
+    // branch runs for every concealed picture of the gap
+    if(p_Vid->IDR_concealment_flag == 1)
+    {
+      // treat the concealed picture as an IDR: flush the dpb and restart POC at 0
+      picture->slice_type = I_SLICE;
+      picture->idr_flag = TRUE;
+      flush_dpb(p_Vid);
+      picture->top_poc= 0;
+      picture->bottom_poc=picture->top_poc;
+      picture->frame_poc=picture->top_poc;
+      picture->poc=picture->top_poc;
+      p_Vid->last_ref_pic_poc = picture->poc;
+    }
+
+    store_picture_in_dpb(p_Vid, picture);
+
+    // the local pointer is dropped after the picture is handed to the dpb
+    picture=NULL;
+
+    p_Vid->pre_frame_num = UnusedShortTermFrameNum;
+    UnusedShortTermFrameNum = (UnusedShortTermFrameNum + 1) % p_Vid->MaxFrameNum;
+
+    // update reference flags and set current flag.
+    // shifts entries 0..15 to 1..16, so ref_flag needs at least 17 entries
+    for(i=16;i>0;i--)
+    {
+      p_Vid->ref_flag[i] = p_Vid->ref_flag[i-1];
+    }
+    p_Vid->ref_flag[0] = 0;
+  }
+  // restore the values that were overridden while filling the gap
+  p_Vid->delta_pic_order_cnt[0] = tmp1;
+  p_Vid->delta_pic_order_cnt[1] = tmp2;
+  p_Vid->frame_num = CurrFrameNum;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Rebuilds p_Dpb->fs_ref for motion vector copy concealment of a lost
+*    non-reference frame: every used frame store whose
+*    concealment_reference flag is set is appended to fs_ref, in dpb order.
+*
+* \param p_Dpb
+*    decoded picture buffer whose fs_ref and ref_frames_in_buffer are
+*    updated
+*
+* \note
+*    ref_frames_in_buffer is taken from the active PPS
+*    (num_ref_idx_l0_active_minus1), not from the number of entries
+*    copied above. NOTE(review): confirm this is intended.
+************************************************************************
+*/
+
+void update_ref_list_for_concealment(DecodedPictureBuffer *p_Dpb)
+{
+  VideoParameters *p_Vid = p_Dpb->p_Vid;
+  unsigned src_idx;
+  unsigned dst_idx = 0;
+
+  for (src_idx = 0; src_idx < p_Dpb->used_size; src_idx++)
+  {
+    if (!p_Dpb->fs[src_idx]->concealment_reference)
+      continue;
+
+    p_Dpb->fs_ref[dst_idx] = p_Dpb->fs[src_idx];
+    dst_idx++;
+  }
+
+  p_Dpb->ref_frames_in_buffer = p_Vid->active_pps->num_ref_idx_l0_active_minus1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Initialize the list based on the B frame or non reference 'p' frame
+*    to be concealed. The function initialize p_Vid->listX[0] and list 1 depending
+*    on current picture type
+*
+* \param p_Vid
+*    decoder state; listX[0], listX[1] and listXsize[0..1] are written
+* \param currSliceType
+*    P_SLICE or B_SLICE selects how the lists are built; other values
+*    leave the list contents as they were before the final clamping
+* \param currPicStructure
+*    lists are only built when this is FRAME
+*
+* \note
+*    On return listXsize[1] is always 0 and every entry of listX[1] is
+*    NULL, because the size is reset just before the unused entries are
+*    cleared.
+************************************************************************
+*/
+void init_lists_for_non_reference_loss(VideoParameters *p_Vid, int currSliceType, PictureStructure currPicStructure)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+  unsigned i;
+  int j;
+  int MaxFrameNum = 1 << (active_sps->log2_max_frame_num_minus4 + 4);
+  int diff;
+
+  int list0idx = 0;
+  int list0idx_1 = 0;
+
+  StorablePicture *tmp_s;
+
+  if (currPicStructure == FRAME)
+  {
+    // compute frame_num_wrap / pic_num relative to the frame being concealed
+    // NOTE(review): the test reads fs[i] but the result is written to
+    // fs_ref[i] -- confirm both arrays are meant to share the index here
+    for(i=0;i<p_Dpb->ref_frames_in_buffer; i++)
+    {
+      if(p_Dpb->fs[i]->concealment_reference == 1)
+      {
+        if(p_Dpb->fs[i]->frame_num > p_Vid->frame_to_conceal)
+          p_Dpb->fs_ref[i]->frame_num_wrap = p_Dpb->fs[i]->frame_num - MaxFrameNum;
+        else
+          p_Dpb->fs_ref[i]->frame_num_wrap = p_Dpb->fs[i]->frame_num;
+        p_Dpb->fs_ref[i]->frame->pic_num = p_Dpb->fs_ref[i]->frame_num_wrap;
+      }
+    }
+  }
+
+  if (currSliceType == P_SLICE)
+  {
+    // Calculate FrameNumWrap and PicNum
+    if (currPicStructure == FRAME)
+    {
+      // list 0 = all concealment references, highest pic_num first
+      for(i=0;i<p_Dpb->used_size; i++)
+      {
+        if(p_Dpb->fs[i]->concealment_reference == 1)
+        {
+          p_Vid->listX[0][list0idx++] = p_Dpb->fs[i]->frame;
+        }
+      }
+      // order list 0 by PicNum
+      qsort((void *)p_Vid->listX[0], list0idx, sizeof(StorablePicture*), compare_pic_by_pic_num_desc);
+      p_Vid->listXsize[0] = list0idx;
+    }
+  }
+
+  if (currSliceType == B_SLICE)
+  {
+    if (currPicStructure == FRAME)
+    {
+      // first part of list 0: references with POC below the missing POC,
+      // sorted by descending POC
+      //      for(i=0;i<p_Dpb->ref_frames_in_buffer; i++)
+      for(i=0;i<p_Dpb->used_size; i++)
+      {
+        if(p_Dpb->fs[i]->concealment_reference == 1)
+        {
+          if(p_Vid->earlier_missing_poc > p_Dpb->fs[i]->frame->poc)
+            p_Vid->listX[0][list0idx++] = p_Dpb->fs[i]->frame;
+        }
+      }
+
+      qsort((void *)p_Vid->listX[0], list0idx, sizeof(StorablePicture*), compare_pic_by_poc_desc);
+      list0idx_1 = list0idx;
+
+      // second part of list 0: references with POC above the missing POC,
+      // sorted by ascending POC
+      //      for(i=0;i<p_Dpb->ref_frames_in_buffer; i++)
+      for(i=0;i<p_Dpb->used_size; i++)
+      {
+        if(p_Dpb->fs[i]->concealment_reference == 1)
+        {
+          if(p_Vid->earlier_missing_poc < p_Dpb->fs[i]->frame->poc)
+            p_Vid->listX[0][list0idx++] = p_Dpb->fs[i]->frame;
+        }
+      }
+
+      qsort((void *)&p_Vid->listX[0][list0idx_1], list0idx-list0idx_1, sizeof(StorablePicture*), compare_pic_by_poc_asc);
+
+      // list 1 holds the same pictures with the two parts swapped:
+      // the higher-POC part first, then the lower-POC part
+      for (j=0; j<list0idx_1; j++)
+      {
+        p_Vid->listX[1][list0idx-list0idx_1+j]=p_Vid->listX[0][j];
+      }
+      for (j=list0idx_1; j<list0idx; j++)
+      {
+        p_Vid->listX[1][j-list0idx_1]=p_Vid->listX[0][j];
+      }
+
+      p_Vid->listXsize[0] = p_Vid->listXsize[1] = list0idx;
+
+      // listXsize[0] equals list0idx at this point, so both calls below
+      // sort zero elements
+      qsort((void *)&p_Vid->listX[0][(short) p_Vid->listXsize[0]], list0idx-p_Vid->listXsize[0], sizeof(StorablePicture*), compare_pic_by_lt_pic_num_asc);
+      qsort((void *)&p_Vid->listX[1][(short) p_Vid->listXsize[0]], list0idx-p_Vid->listXsize[0], sizeof(StorablePicture*), compare_pic_by_lt_pic_num_asc);
+      p_Vid->listXsize[0] = p_Vid->listXsize[1] = list0idx;
+    }
+  }
+
+  if ((p_Vid->listXsize[0] == p_Vid->listXsize[1]) && (p_Vid->listXsize[0] > 1))
+  {
+    // check if lists are identical, if yes swap first two elements of listX[1]
+    diff=0;
+    for (j = 0; j< p_Vid->listXsize[0]; j++)
+    {
+      if (p_Vid->listX[0][j]!=p_Vid->listX[1][j])
+        diff=1;
+    }
+    if (!diff)
+    {
+      tmp_s = p_Vid->listX[1][0];
+      p_Vid->listX[1][0]=p_Vid->listX[1][1];
+      p_Vid->listX[1][1]=tmp_s;
+    }
+  }
+
+  // set max size
+  p_Vid->listXsize[0] = imin (p_Vid->listXsize[0], (int)active_sps->num_ref_frames);
+  p_Vid->listXsize[1] = imin (p_Vid->listXsize[1], (int)active_sps->num_ref_frames);
+
+  // list 1 is emptied unconditionally; the clamp above has no lasting effect on it
+  p_Vid->listXsize[1] = 0;
+  // set the unused list entries to NULL
+  for (i=p_Vid->listXsize[0]; i< (MAX_LIST_SIZE) ; i++)
+  {
+    p_Vid->listX[0][i] = NULL;
+  }
+  for (i=p_Vid->listXsize[1]; i< (MAX_LIST_SIZE) ; i++)
+  {
+    p_Vid->listX[1][i] = NULL;
+  }
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Searches the dpb from the last used entry down to entry 0 for a frame
+*    store whose poc equals the POC to conceal from, and returns its frame.
+*
+* \param p_Vid
+*    decoder state (provides the dpb)
+* \param missingpoc
+*    POC of the missing picture; not used by the current implementation
+* \param pos
+*    receives the dpb index of the match; left untouched when nothing
+*    is found
+* \return
+*    the frame of the matching frame store, or NULL if none matches
+*
+* \note
+*    The POC to conceal from is fixed at 0 here.
+*    NOTE(review): confirm that ignoring missingpoc is intended.
+************************************************************************
+*/
+
+StorablePicture *get_pic_from_dpb(VideoParameters *p_Vid, int missingpoc, unsigned int *pos)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  const int concealfrom = 0;
+  int idx = p_Dpb->used_size - 1;
+
+  while (idx >= 0)
+  {
+    if (p_Dpb->fs[idx]->poc == concealfrom)
+    {
+      *pos = idx;
+      return p_Dpb->fs[idx]->frame;
+    }
+    idx--;
+  }
+
+  return NULL;
+}
+
+/*!
+************************************************************************
+* \brief
+*    qsort comparator for int values in ascending order. Used to sort the
+*    POC array so that the lowest POC comes first.
+*
+* \param i
+*    address of the first int
+* \param j
+*    address of the second int
+* \return
+*    -1 if *i < *j, 1 if *i > *j, 0 if equal
+*
+* \note
+*    The result is derived from comparisons rather than from *i - *j,
+*    because the subtraction overflows (undefined behaviour, wrong sign)
+*    when the operands are far apart, e.g. INT_MIN and INT_MAX.
+************************************************************************
+*/
+
+int comp(const void *i, const void *j)
+{
+  const int lhs = *(const int *)i;
+  const int rhs = *(const int *)j;
+
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Allocates a zero-initialised concealment list node and fills in its
+*    picture and missing POC. The node is not linked into any list.
+*
+* \param picture
+*    picture stored in the node (the pointer is stored, not copied)
+* \param missingpoc
+*    POC recorded in the node
+* \return
+*    the new node, or NULL if the allocation failed
+************************************************************************
+*/
+
+struct concealment_node * init_node( StorablePicture* picture, int missingpoc )
+{
+  struct concealment_node *node =
+    (struct concealment_node *) calloc( 1, sizeof(struct concealment_node ) );
+
+  if( node != NULL )
+  {
+    node->picture = picture;
+    node->missingpocs = missingpoc;
+    node->next = NULL;
+  }
+
+  return node;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Diagnostic helper: prints the missing POC stored in one node to
+*    stdout.
+*
+* \param ptr
+*    node to print; must not be NULL
+************************************************************************
+*/
+
+void print_node( struct concealment_node *ptr )
+{
+  printf( "Missing POC=%d\n",
+          ptr->missingpocs );
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Diagnostic helper: prints every node from the given one to the end
+*    of the list by following the next pointers.
+*
+* \param ptr
+*    first node to print; NULL prints nothing
+************************************************************************
+*/
+
+void print_list( struct concealment_node *ptr )
+{
+  struct concealment_node *cursor;
+
+  for( cursor = ptr; cursor != NULL; cursor = cursor->next )
+    print_node( cursor );
+}
+
+/*!
+************************************************************************
+* \brief
+*    Appends a node to the end of the concealment list kept in
+*    p_Vid->concealment_head / p_Vid->concealment_end.
+*
+* \param p_Vid
+*    decoder state owning the list
+* \param concealment_new
+*    node to append. A NULL node (for example a failed init_node()) is
+*    ignored: appending it would set concealment_end to NULL while the
+*    list is non-empty, and the next append would dereference NULL.
+************************************************************************
+*/
+
+
+static void add_node( VideoParameters *p_Vid, struct concealment_node *concealment_new )
+{
+  if( concealment_new == NULL )
+    return;
+
+  if( p_Vid->concealment_head == NULL )
+  {
+    // empty list: the new node is both head and tail
+    p_Vid->concealment_end = p_Vid->concealment_head = concealment_new;
+    return;
+  }
+  p_Vid->concealment_end->next = concealment_new;
+  p_Vid->concealment_end = concealment_new;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Removes and frees the first node of the concealment list. Does
+*    nothing unless 'ptr' is the current head of the list.
+*
+* \param p_Vid
+*    decoder state owning the list
+* \param ptr
+*    node to delete; only the node itself is freed, not the picture it
+*    points to
+************************************************************************
+*/
+
+
+static void delete_node( VideoParameters *p_Vid, struct concealment_node *ptr )
+{
+  struct concealment_node *successor;
+
+  // We only need to delete the first node in the linked list
+  if( ptr != p_Vid->concealment_head )
+    return;
+
+  successor = ptr->next;
+  p_Vid->concealment_head = successor;
+  // when the head was also the tail, the tail moves along with it
+  if( p_Vid->concealment_end == ptr )
+    p_Vid->concealment_end = successor;
+  free(ptr);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Frees every node from 'ptr' to the end of the concealment list and
+*    truncates the list just before 'ptr'.
+*
+* \param p_Vid
+*    decoder state owning the list
+* \param ptr
+*    first node to delete. If it is the head, the list becomes empty.
+*    If it is not part of the list, nothing is freed.
+*
+* \note
+*    The node preceding 'ptr' becomes the new tail and its next pointer
+*    is cleared, so the list never refers to freed nodes. Only the nodes
+*    are freed, not the pictures they point to.
+************************************************************************
+*/
+
+void delete_list( VideoParameters *p_Vid, struct concealment_node *ptr )
+{
+  struct concealment_node *temp;
+
+  if( p_Vid->concealment_head == NULL ) return;
+
+  if( ptr == p_Vid->concealment_head ) 
+  {
+    p_Vid->concealment_head = NULL;
+    p_Vid->concealment_end = NULL;
+  }
+  else
+  {
+    // find the node in front of ptr
+    temp = p_Vid->concealment_head;
+
+    while( temp != NULL && temp->next != ptr )
+      temp = temp->next;
+
+    // ptr is not in this list: leave the list untouched
+    if( temp == NULL )
+      return;
+
+    // detach the tail before freeing it so no dangling link remains
+    temp->next = NULL;
+    p_Vid->concealment_end = temp;
+  }
+
+  while( ptr != NULL ) 
+  {
+    temp = ptr->next;
+    free( ptr );
+    ptr = temp;
+  }
+}
+
+/*!
+************************************************************************
+* \brief
+* Stores the missing non reference frames in the concealment buffer. The
+* detection is based on the POC difference in the sorted POC array. A missing
+* non reference frame is detected when the dpb is full. A singly linked list
+* is maintained for storing the missing non reference frames.
+*
+* \param p_Vid
+*    decoder state; pocs_in_dpb is sorted in place, earlier_missing_poc,
+*    frame_to_conceal and conceal_slice_type are updated, and new nodes
+*    are appended to the concealment list
+* \param diff
+*    number of trailing entries of pocs_in_dpb that are not examined
+*
+* \note
+*    The concealment picture is allocated only after the missing POC has
+*    been accepted and a source picture has been found, so a rejected gap
+*    no longer leaks a picture. p_Dpb->used_size is changed temporarily
+*    and restored before returning.
+************************************************************************
+*/
+// TODO: benski> pass timecode
+void conceal_non_ref_pics(VideoParameters *p_Vid, int diff)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  int missingpoc = 0;
+  unsigned int i, pos = 0;
+  StorablePicture *conceal_from_picture = NULL;
+  StorablePicture *conceal_to_picture = NULL;
+  struct concealment_node *concealment_ptr = NULL;
+  int temp_used_size = p_Dpb->used_size;
+
+  if(p_Dpb->used_size == 0 )
+    return;
+
+  qsort(p_Vid->pocs_in_dpb, p_Dpb->size, sizeof(int), comp);
+
+  for(i=0;i<p_Dpb->size-diff;i++)
+  {
+    // search the whole dpb; used_size is narrowed again further down
+    p_Dpb->used_size = p_Dpb->size;
+    if((p_Vid->pocs_in_dpb[i+1] - p_Vid->pocs_in_dpb[i]) > p_Vid->poc_gap)
+    {
+      missingpoc = p_Vid->pocs_in_dpb[i] + p_Vid->poc_gap;
+      // Diagnostics
+      // printf("\n missingpoc = %d\n",missingpoc);
+
+      if(missingpoc > p_Vid->earlier_missing_poc)
+      {
+        p_Vid->earlier_missing_poc = missingpoc;
+
+        conceal_from_picture = get_pic_from_dpb(p_Vid, missingpoc, &pos);
+
+        assert(conceal_from_picture != NULL);
+        // release builds: without a source picture there is nothing to conceal from
+        if(conceal_from_picture == NULL)
+          continue;
+
+        conceal_to_picture = alloc_storable_picture (p_Vid, FRAME, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+        if(conceal_to_picture == NULL)
+          continue;
+
+        conceal_to_picture->top_poc= missingpoc;
+        conceal_to_picture->bottom_poc=missingpoc;
+        conceal_to_picture->frame_poc=missingpoc;
+        conceal_to_picture->poc=missingpoc;
+
+        p_Dpb->used_size = pos+1;
+
+        p_Vid->frame_to_conceal = conceal_from_picture->frame_num + 1;
+
+        update_ref_list_for_concealment(p_Dpb);
+        p_Vid->conceal_slice_type = B_SLICE;
+        copy_to_conceal(conceal_from_picture, conceal_to_picture, p_Vid);
+        concealment_ptr = init_node( conceal_to_picture, missingpoc );
+        // NOTE(review): if init_node fails the picture is not released,
+        // because no release function is visible from here
+        if(concealment_ptr != NULL)
+          add_node(p_Vid, concealment_ptr);
+        // Diagnostics
+        // print_node(concealment_ptr);
+      }
+    }
+  }
+
+  //restore the original value
+  p_Dpb->used_size = temp_used_size;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Sliding-window upkeep of the POC array: when the dpb is full
+*    (used_size equals size), every entry of p_Vid->pocs_in_dpb is moved
+*    one slot towards index 0, dropping the entry at index 0. The last
+*    slot keeps its previous value.
+*
+* \param p_Dpb
+*    decoded picture buffer
+* \param p
+*    picture being stored; not used here, the new POC is not appended
+*    by this function
+************************************************************************
+*/
+
+void sliding_window_poc_management(DecodedPictureBuffer *p_Dpb, StorablePicture *p)
+{    
+  VideoParameters *p_Vid;
+  unsigned int slot;
+
+  // nothing to slide while the dpb still has room
+  if (p_Dpb->used_size != p_Dpb->size)
+    return;
+
+  p_Vid = p_Dpb->p_Vid;
+
+  for (slot = 0; slot < p_Dpb->size-1; slot++)
+  {
+    p_Vid->pocs_in_dpb[slot] = p_Vid->pocs_in_dpb[slot+1];
+  }
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Outputs the first picture of the concealment list when 'poc' is
+*    positive and lies more than p_Vid->poc_gap after the last output POC.
+*    The picture is wrapped in a temporary frame store, written with
+*    write_stored_frame(), and its list node is then deleted.
+*
+* \param p_Vid
+*    decoder state (provides the dpb and the concealment list)
+* \param poc
+*    POC of the picture about to be output
+*
+* \note
+*    Nothing is written when the concealment list is empty. The temporary
+*    frame store is zero-initialised so that fields not set here have a
+*    defined value.
+************************************************************************
+*/
+
+void write_lost_non_ref_pic(VideoParameters *p_Vid, int poc)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  FrameStore concealment_fs = {0};
+
+  if(poc <= 0)
+    return;
+
+  if((poc - p_Dpb->last_output_poc) <= p_Vid->poc_gap)
+    return;
+
+  // no concealed picture queued: avoid dereferencing a NULL head
+  if(p_Vid->concealment_head == NULL)
+    return;
+
+  concealment_fs.frame = p_Vid->concealment_head->picture;
+  concealment_fs.is_output = 0;
+  concealment_fs.is_reference = 0;
+  concealment_fs.is_used = 3;
+
+  write_stored_frame(p_Vid, &concealment_fs);
+  delete_node(p_Vid, p_Vid->concealment_head);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Conceals frame loss immediately after the IDR. Makes sure
+*    p_Vid->last_out_fs holds a frame (allocating one and setting is_used
+*    to 3 if necessary) and copies the header fields of the dpb frame at
+*    'pos' into it.
+*
+* \param p_Vid
+*    decoder state (provides the dpb and last_out_fs)
+* \param pos
+*    index into p_Dpb->fs of the frame to conceal from
+*
+* \note
+*    If the frame cannot be allocated the function returns without
+*    copying instead of writing through a NULL pointer.
+************************************************************************
+*/
+// TODO: benski> pass timecode
+void write_lost_ref_after_idr(VideoParameters *p_Vid, int pos)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+
+  if(p_Vid->last_out_fs->frame == NULL)
+  {
+    p_Vid->last_out_fs->frame = alloc_storable_picture (p_Vid, FRAME, p_Vid->width, p_Vid->height,
+      p_Vid->width_cr, p_Vid->height_cr);
+    if(p_Vid->last_out_fs->frame == NULL)
+      return;
+    p_Vid->last_out_fs->is_used = 3;
+  }
+
+  copy_to_conceal(p_Dpb->fs[pos]->frame, p_Vid->last_out_fs->frame, p_Vid);
+}
+
diff --git a/Src/h264dec/ldecod/src/errorconcealment.c b/Src/h264dec/ldecod/src/errorconcealment.c
new file mode 100644
index 00000000..6b1b47bf
--- /dev/null
+++ b/Src/h264dec/ldecod/src/errorconcealment.c
@@ -0,0 +1,138 @@
+
+/*!
+ ***********************************************************************
+ * \file errorconcealment.c
+ *
+ * \brief
+ *    Implements error concealment scheme for H.264 decoder
+ *
+ * \date
+ *    6.10.2000
+ *
+ * \version
+ *    1.0
+ *
+ * \note
+ *    The simple error concealment implemented in this decoder relies on
+ *    the existing dependencies between syntax elements.
+ *    If an element is detected as erroneous, that element and all
+ *    dependent elements are marked for concealment in the p_Vid->ec_flag[]
+ *    array. When the decoder requests a new element through
+ *    readSyntaxElement_xxxx(), this array is checked first to see whether
+ *    error concealment has to be applied to that element.
+ *    If an error occurred, a concealed element is handed to the
+ *    decoding function in macroblock().
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Sebastian Purreiter   <sebastian.purreiter@mch.siemens.de>
+ ***********************************************************************
+ */
+
+#include "contributors.h"
+#include "global.h"
+#include "elements.h"
+
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Marks a syntax element type, and the element types that depend on
+ *    it, as requiring error concealment by setting the corresponding
+ *    entries of p_Vid->ec_flag[] to EC_REQ.
+ *
+ *    The first switch handles header, type, reference and motion
+ *    elements and redirects 'se' to the matching texture group; the
+ *    second switch marks the texture elements. Both rely on case
+ *    fallthrough to mark every element that follows in the dependency
+ *    chain.
+ * \param p_Vid
+ *      decoder state holding the ec_flag[] array
+ * \param se
+ *      type of syntax element to conceal
+ * \return
+ *    always EC_REQ
+ ***********************************************************************
+ */
+int set_ec_flag(VideoParameters *p_Vid, int se)
+{
+
+  /*
+  if (p_Vid->ec_flag[se] == NO_EC)
+    printf("Error concealment on element %s\n",SEtypes[se]);
+  */
+  switch (se)
+  {
+  case SE_HEADER :
+    p_Vid->ec_flag[SE_HEADER] = EC_REQ;
+    // fallthrough
+  case SE_PTYPE :
+    p_Vid->ec_flag[SE_PTYPE] = EC_REQ;
+    // fallthrough
+  case SE_MBTYPE :
+    p_Vid->ec_flag[SE_MBTYPE] = EC_REQ;
+    // fallthrough
+
+  case SE_REFFRAME :
+    p_Vid->ec_flag[SE_REFFRAME] = EC_REQ;
+    p_Vid->ec_flag[SE_MVD] = EC_REQ; // set all motion vectors to zero length
+    se = SE_CBP_INTER;      // conceal also Inter texture elements
+    break;
+
+  case SE_INTRAPREDMODE :
+    p_Vid->ec_flag[SE_INTRAPREDMODE] = EC_REQ;
+    se = SE_CBP_INTRA;      // conceal also Intra texture elements
+    break;
+  case SE_MVD :
+    p_Vid->ec_flag[SE_MVD] = EC_REQ;
+    se = SE_CBP_INTER;      // conceal also Inter texture elements
+    break;
+
+  default:
+    break;
+  }
+
+  // 'se' may have been redirected above to SE_CBP_INTRA or SE_CBP_INTER
+  switch (se)
+  {
+  case SE_CBP_INTRA :
+    p_Vid->ec_flag[SE_CBP_INTRA] = EC_REQ;
+    // fallthrough
+  case SE_LUM_DC_INTRA :
+    p_Vid->ec_flag[SE_LUM_DC_INTRA] = EC_REQ;
+    // fallthrough
+  case SE_CHR_DC_INTRA :
+    p_Vid->ec_flag[SE_CHR_DC_INTRA] = EC_REQ;
+    // fallthrough
+  case SE_LUM_AC_INTRA :
+    p_Vid->ec_flag[SE_LUM_AC_INTRA] = EC_REQ;
+    // fallthrough
+  case SE_CHR_AC_INTRA :
+    p_Vid->ec_flag[SE_CHR_AC_INTRA] = EC_REQ;
+    break;
+
+  case SE_CBP_INTER :
+    p_Vid->ec_flag[SE_CBP_INTER] = EC_REQ;
+    // fallthrough
+  case SE_LUM_DC_INTER :
+    p_Vid->ec_flag[SE_LUM_DC_INTER] = EC_REQ;
+    // fallthrough
+  case SE_CHR_DC_INTER :
+    p_Vid->ec_flag[SE_CHR_DC_INTER] = EC_REQ;
+    // fallthrough
+  case SE_LUM_AC_INTER :
+    p_Vid->ec_flag[SE_LUM_AC_INTER] = EC_REQ;
+    // fallthrough
+  case SE_CHR_AC_INTER :
+    p_Vid->ec_flag[SE_CHR_AC_INTER] = EC_REQ;
+    break;
+  case SE_DELTA_QUANT_INTER :
+    p_Vid->ec_flag[SE_DELTA_QUANT_INTER] = EC_REQ;
+    break;
+  case SE_DELTA_QUANT_INTRA :
+    p_Vid->ec_flag[SE_DELTA_QUANT_INTRA] = EC_REQ;
+    break;
+  default:
+    break;
+
+  }
+  return EC_REQ;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Clears all error concealment requests: every one of the
+ *    SE_MAX_ELEMENTS entries of p_Vid->ec_flag[] is set to NO_EC.
+ *    Called at the start of each slice.
+ * \param p_Vid
+ *      decoder state holding the ec_flag[] array
+ ***********************************************************************
+ */
+void reset_ec_flags(VideoParameters *p_Vid)
+{
+  int se = 0;
+
+  while (se < SE_MAX_ELEMENTS)
+  {
+    p_Vid->ec_flag[se] = NO_EC;
+    se++;
+  }
+}
+
diff --git a/Src/h264dec/ldecod/src/filter_chroma_horiz.c b/Src/h264dec/ldecod/src/filter_chroma_horiz.c
new file mode 100644
index 00000000..dbed3e15
--- /dev/null
+++ b/Src/h264dec/ldecod/src/filter_chroma_horiz.c
@@ -0,0 +1,533 @@
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+#include <mmintrin.h>
+#include <emmintrin.h>
+
+// Threshold tables used by the chroma deblocking routines in this file.
+// ALPHA_TABLE is read with indexA and BETA_TABLE with indexB; the callers clip both
+// indices to 0..MAX_QP before the lookup.
+static const byte ALPHA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,4,4,5,6,  7,8,9,10,12,13,15,17,  20,22,25,28,32,36,40,45,  50,56,63,71,80,90,101,113,  127,144,162,182,203,226,255,255} ;
+static const byte  BETA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,2,2,2,3,  3,3,3, 4, 4, 4, 6, 6,   7, 7, 8, 8, 9, 9,10,10,  11,11,12,12,13,13, 14, 14,   15, 15, 16, 16, 17, 17, 18, 18} ;
+// CLIP_TAB[indexA][strength]: clipping value from which the filters derive tc0.
+// NOTE(review): column 0 is written as -1 although the element type is `byte`. If `byte`
+// is unsigned the stored value is 255; FilterChroma8_Horiz_sse reads the row through a
+// `const char *` and adds 1, so presumably the -1 is meant to give tc0 == 0 (no change)
+// for strength 0 there. That only holds where plain `char` is signed - confirm.
+static const byte CLIP_TAB[52][5]  =
+{
+	{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},
+	{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},
+	{ -1, 0, 0, 0, 0},{ -1, 0, 0, 1, 1},{ -1, 0, 0, 1, 1},{ -1, 0, 0, 1, 1},{ -1, 0, 0, 1, 1},{ -1, 0, 1, 1, 1},{ -1, 0, 1, 1, 1},{ -1, 1, 1, 1, 1},
+	{ -1, 1, 1, 1, 1},{ -1, 1, 1, 1, 1},{ -1, 1, 1, 1, 1},{ -1, 1, 1, 2, 2},{ -1, 1, 1, 2, 2},{ -1, 1, 1, 2, 2},{ -1, 1, 1, 2, 2},{ -1, 1, 2, 3, 3},
+	{ -1, 1, 2, 3, 3},{ -1, 2, 2, 3, 3},{ -1, 2, 2, 4, 4},{ -1, 2, 3, 4, 4},{ -1, 2, 3, 4, 4},{ -1, 3, 3, 5, 5},{ -1, 3, 4, 6, 6},{ -1, 3, 4, 6, 6},
+	{ -1, 4, 5, 7, 7},{ -1, 4, 5, 8, 8},{ -1, 4, 6, 9, 9},{ -1, 5, 7,10,10},{ -1, 6, 8,11,11},{ -1, 6, 8,13,13},{ -1, 7,10,14,14},{ -1, 8,11,16,16},
+	{ -1, 9,12,18,18},{ -1,10,13,20,20},{ -1,11,15,23,23},{ -1,13,17,25,25}
+};
+
+// Number of chroma samples along an edge, per edge direction and chroma format.
+static const int pelnum_cr[2][4] =  {{0,8,16,16}, {0,8, 8,16}};  //[dir:0=vert, 1=hor.][yuv_format]
+
+// Loads 8 bytes from ptr and zero-extends them into the eight 16-bit lanes of reg.
+// Relies on a variable named xmm_zero (all zero bits) being in scope at the point of use.
+#define LOAD_LINE_EPI16(reg, ptr) { reg = _mm_loadl_epi64((__m128i *)(ptr));	reg = _mm_unpacklo_epi8(reg, xmm_zero); }
+// SSE2 filter for one 8-sample horizontal chroma edge segment. It mirrors the scalar
+// FilterChroma8_Horiz but evaluates all 8 samples at once in 16-bit lanes.
+//   inc_dim          - offset (in imgpel units) from an edge row to the next row away from the edge
+//   SrcPtrP, SrcPtrQ - the 8 L0 samples and the 8 R0 samples; L1 is read at SrcPtrP-inc_dim,
+//                      R1 at SrcPtrQ+inc_dim
+//   Strength         - filter strengths (0 = leave sample untouched, 4 = strong filter)
+//   ClipTab          - clipping values indexed by strength; tc0 = ClipTab[s] * bitdepth_scale + 1
+//   Alpha, Beta      - activity thresholds, compared per 16-bit lane
+//   max_imgpel_value - not referenced here; results are clamped to 0..255 by the saturating pack
+// Both rows are rewritten in place (8 bytes each) once all three threshold tests leave at
+// least one lane enabled; lanes that fail a test are written back unchanged.
+// NOTE(review): _mm_load_si128 needs Strength to be 16-byte aligned - confirm all callers guarantee it.
+// NOTE(review): rows are loaded/stored as 8 bytes, so this assumes imgpel is one byte wide - confirm.
+static void FilterChroma8_Horiz_sse2(int inc_dim, imgpel *SrcPtrP, imgpel *SrcPtrQ, const byte Strength[16], const byte *ClipTab, int Alpha, int Beta, int bitdepth_scale, int max_imgpel_value)
+{
+	__m128i xmm_L1, xmm_L0, xmm_R0, xmm_R1;
+	__m128i xmm_strength;
+	__m128i xmm_absdiff, xmm_diff, xmm_acc;
+	__m128i xmm_127, xmm_zero;
+	__m128i xmm_alpha, xmm_beta;
+
+			int match;
+			xmm_zero = _mm_setzero_si128();
+			// Build a per-lane enable mask from the strengths: the saturating +127 sets bit 7 of
+			// every non-zero strength byte, and the arithmetic shift smears the sign bit of each
+			// 16-bit lane (bit 7 of its odd-indexed byte) over the whole lane.
+			// NOTE(review): lane k is therefore governed by Strength[2k+1], while the scalar
+			// version indexes Strength[((pel>>1)<<2) + (pel&1)]; the two agree only when the
+			// strengths are constant within each group of four entries - confirm.
+			xmm_strength = _mm_load_si128((__m128i *)Strength);
+			xmm_127 = _mm_set1_epi8(127);
+			xmm_strength = _mm_adds_epu8(xmm_strength, xmm_127);
+			xmm_strength = _mm_srai_epi16(xmm_strength, 15); // shift so it's all 0xFFFF or 0x0000
+
+			LOAD_LINE_EPI16(xmm_R0, SrcPtrQ);
+			LOAD_LINE_EPI16(xmm_L0, SrcPtrP);
+
+			xmm_alpha = _mm_set1_epi16((uint16_t)Alpha);
+
+			// if ( abs( R0 - L0 ) < Alpha ) 
+			// |a-b| is formed as (a -sat b) | (b -sat a): one of the two saturating differences is 0
+			xmm_diff=_mm_subs_epu16(xmm_R0, xmm_L0);
+			xmm_absdiff =_mm_subs_epu16(xmm_L0, xmm_R0);
+			xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+			xmm_absdiff = _mm_cmplt_epi16(xmm_absdiff, xmm_alpha);
+			xmm_strength = _mm_and_si128(xmm_strength, xmm_absdiff);
+			match = _mm_movemask_epi8(xmm_strength);
+			// no lane left enabled: nothing to filter, skip the remaining loads
+			if (match == 0)
+				return;
+
+			LOAD_LINE_EPI16(xmm_R1, SrcPtrQ+inc_dim);
+
+			xmm_beta = _mm_set1_epi16((uint16_t)Beta);
+
+			// if ( abs(R0 - R1) < Beta )  
+			xmm_diff=_mm_subs_epu16(xmm_R0, xmm_R1);
+			xmm_absdiff =_mm_subs_epu16(xmm_R1, xmm_R0);
+			xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+			xmm_absdiff = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+			xmm_strength = _mm_and_si128(xmm_strength, xmm_absdiff);
+			match = _mm_movemask_epi8(xmm_strength);
+			if (match == 0)
+				return;
+
+			LOAD_LINE_EPI16(xmm_L1, SrcPtrP-inc_dim);
+
+			// if ( abs(L0 - L1) < Beta )
+			xmm_diff=_mm_subs_epu16(xmm_L0, xmm_L1);
+			xmm_absdiff =_mm_subs_epu16(xmm_L1, xmm_L0);
+			xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+			xmm_absdiff = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+			xmm_strength = _mm_and_si128(xmm_strength, xmm_absdiff);
+			match = _mm_movemask_epi8(xmm_strength);
+			if (match == 0)
+				return;
+
+			// from here on xmm_strength is the final per-lane "filter this sample" mask
+			if (Strength[0] == 4) // if strong filter is in use, ALL strengths will be 4
+			{
+				// *SrcPtrP = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+				__m128i xmm_2  = _mm_set1_epi16(2);
+
+				xmm_acc = xmm_L1;
+				xmm_acc = _mm_slli_epi16(xmm_acc, 1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_L0);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_R1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_2);
+				xmm_acc = _mm_srai_epi16(xmm_acc, 2);
+				// blend: filtered value in enabled lanes, original L0 elsewhere
+				xmm_acc = _mm_and_si128(xmm_acc, xmm_strength);
+				xmm_L0  = _mm_andnot_si128(xmm_strength, xmm_L0);
+				xmm_L0  = _mm_or_si128(xmm_L0, xmm_acc);
+				xmm_L0 = _mm_packus_epi16(xmm_L0, xmm_L0);
+				_mm_storel_epi64((__m128i *)(SrcPtrP), xmm_L0);
+
+				// *SrcPtrQ = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+				xmm_acc = xmm_R1;
+				xmm_acc = _mm_slli_epi16(xmm_acc, 1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_R0);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_L1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_2);
+				xmm_acc = _mm_srai_epi16(xmm_acc, 2);
+				xmm_acc = _mm_and_si128(xmm_acc, xmm_strength);
+				xmm_R0  = _mm_andnot_si128(xmm_strength, xmm_R0);
+				xmm_R0  = _mm_or_si128(xmm_R0, xmm_acc);
+				xmm_R0 = _mm_packus_epi16(xmm_R0, xmm_R0);
+				_mm_storel_epi64((__m128i *)(SrcPtrQ), xmm_R0);
+			}
+			else
+			{
+				// one tc0 per pair of samples: lanes 0-1 use Strength[0], 2-3 Strength[4],
+				// 4-5 Strength[8], 6-7 Strength[12]
+				int C0 = ClipTab[ Strength[0] ] * bitdepth_scale + 1;
+				int C1 = ClipTab[ Strength[4] ] * bitdepth_scale + 1;
+				int C2 = ClipTab[ Strength[8] ] * bitdepth_scale + 1;
+				int C3 = ClipTab[ Strength[12] ] * bitdepth_scale + 1;
+				__m128i xmm_tc0 = _mm_setr_epi16(C0, C0, C1, C1, C2, C2, C3, C3); // TODO: benski> probably a better way to do this.
+				__m128i xmm_negative_tc0 = _mm_sub_epi16(xmm_zero, xmm_tc0);
+				__m128i xmm_4 = _mm_set1_epi16(4);
+				//int dif = iClip3( -tc0, tc0, ( ((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+				xmm_acc = xmm_R0;
+				xmm_acc = _mm_sub_epi16(xmm_acc, xmm_L0);
+				xmm_acc = _mm_slli_epi16(xmm_acc, 2);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_L1);
+				xmm_acc = _mm_sub_epi16(xmm_acc, xmm_R1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_4);
+				xmm_acc = _mm_srai_epi16(xmm_acc, 3);
+				xmm_acc = _mm_min_epi16(xmm_acc, xmm_tc0);
+				xmm_acc = _mm_max_epi16(xmm_acc, xmm_negative_tc0);
+				// disabled lanes get dif == 0, so the add/sub below leaves them unchanged
+				xmm_acc = _mm_and_si128(xmm_acc, xmm_strength);
+
+				// *SrcPtrP = (imgpel) iClip1 ( max_imgpel_value, L0 + dif) ;
+				// the unsigned-saturating pack performs the clamp to 0..255
+				xmm_L0 = _mm_add_epi16(xmm_L0, xmm_acc);
+				xmm_L0 = _mm_packus_epi16(xmm_L0, xmm_L0);
+				_mm_storel_epi64((__m128i *)(SrcPtrP), xmm_L0);
+
+				// *SrcPtrQ = (imgpel) iClip1 ( max_imgpel_value, R0 - dif) ;	
+				xmm_R0 = _mm_sub_epi16(xmm_R0, xmm_acc);
+				xmm_R0 = _mm_packus_epi16(xmm_R0, xmm_R0);
+				_mm_storel_epi64((__m128i *)(SrcPtrQ), xmm_R0);
+			}
+	
+	
+}
+
+// SSE2 strong filter for one 8-sample horizontal chroma edge.
+//   inc_dim - offset (in imgpel units) between vertically adjacent rows
+//   SrcPtrP - the 8 L0 samples; R0 is read at SrcPtrP+inc_dim, R1 at SrcPtrP+2*inc_dim and
+//             L1 at SrcPtrP-inc_dim
+//   Alpha, Beta - activity thresholds, compared per 16-bit lane
+// There is no strength input: every lane that passes the three threshold tests gets the
+// strong-filter formula. The L0 and R0 rows are rewritten in place (8 bytes each).
+// NOTE(review): rows are loaded/stored as 8 bytes, so this assumes imgpel is one byte wide - confirm.
+static void IntraStrongFilter_Chroma8_Horiz_YUV420_sse2(int inc_dim, imgpel *SrcPtrP, int Alpha, int Beta)
+{
+	__m128i xmm_L1, xmm_L0, xmm_R0, xmm_R1;
+	__m128i xmm_strength;
+	__m128i xmm_absdiff, xmm_diff, xmm_acc;
+	__m128i  xmm_zero;
+	__m128i xmm_alpha, xmm_beta;
+__m128i xmm_2;
+
+			int match;
+			xmm_zero = _mm_setzero_si128();
+
+			LOAD_LINE_EPI16(xmm_L0, SrcPtrP);
+			LOAD_LINE_EPI16(xmm_R0, SrcPtrP+inc_dim);
+
+			xmm_alpha = _mm_set1_epi16((uint16_t)Alpha);
+
+			// if ( abs( R0 - L0 ) < Alpha ) 
+			// |a-b| is formed as (a -sat b) | (b -sat a); the compare result seeds the lane mask
+			xmm_diff=_mm_subs_epu16(xmm_R0, xmm_L0);
+			xmm_absdiff =_mm_subs_epu16(xmm_L0, xmm_R0);
+			xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+			xmm_strength = _mm_cmplt_epi16(xmm_absdiff, xmm_alpha);
+			match = _mm_movemask_epi8(xmm_strength);
+			// no lane passed: leave both rows untouched
+			if (match == 0)
+				return;
+
+			LOAD_LINE_EPI16(xmm_R1, SrcPtrP+2*inc_dim);
+
+			xmm_beta = _mm_set1_epi16((uint16_t)Beta);
+
+			// if ( abs(R0 - R1) < Beta )  
+			xmm_diff=_mm_subs_epu16(xmm_R0, xmm_R1);
+			xmm_absdiff =_mm_subs_epu16(xmm_R1, xmm_R0);
+			xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+			xmm_absdiff = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+			xmm_strength = _mm_and_si128(xmm_strength, xmm_absdiff);
+			match = _mm_movemask_epi8(xmm_strength);
+			if (match == 0)
+				return;
+
+			LOAD_LINE_EPI16(xmm_L1, SrcPtrP-inc_dim);
+
+			// if ( abs(L0 - L1) < Beta )
+			xmm_diff=_mm_subs_epu16(xmm_L0, xmm_L1);
+			xmm_absdiff =_mm_subs_epu16(xmm_L1, xmm_L0);
+			xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+			xmm_absdiff = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+			xmm_strength = _mm_and_si128(xmm_strength, xmm_absdiff);
+			match = _mm_movemask_epi8(xmm_strength);
+			if (match == 0)
+				return;
+
+			
+				// *SrcPtrP = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+				xmm_2  = _mm_set1_epi16(2);
+
+				xmm_acc = xmm_L1;
+				xmm_acc = _mm_slli_epi16(xmm_acc, 1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_L0);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_R1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_2);
+				xmm_acc = _mm_srai_epi16(xmm_acc, 2);
+				// blend: filtered value in lanes that passed all tests, original L0 elsewhere
+				xmm_acc = _mm_and_si128(xmm_acc, xmm_strength);
+				xmm_L0  = _mm_andnot_si128(xmm_strength, xmm_L0);
+				xmm_L0  = _mm_or_si128(xmm_L0, xmm_acc);
+				xmm_L0 = _mm_packus_epi16(xmm_L0, xmm_L0);
+				_mm_storel_epi64((__m128i *)(SrcPtrP), xmm_L0);
+
+				// R0 row (at SrcPtrP+inc_dim) = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+				xmm_acc = xmm_R1;
+				xmm_acc = _mm_slli_epi16(xmm_acc, 1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_R0);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_L1);
+				xmm_acc = _mm_add_epi16(xmm_acc, xmm_2);
+				xmm_acc = _mm_srai_epi16(xmm_acc, 2);
+				xmm_acc = _mm_and_si128(xmm_acc, xmm_strength);
+				xmm_R0  = _mm_andnot_si128(xmm_strength, xmm_R0);
+				xmm_R0  = _mm_or_si128(xmm_R0, xmm_acc);
+				xmm_R0 = _mm_packus_epi16(xmm_R0, xmm_R0);
+				_mm_storel_epi64((__m128i *)(SrcPtrP+inc_dim), xmm_R0);
+			
+	
+	
+}
+
+
+// Scalar filter for one 8-sample horizontal chroma edge segment (kept as its own
+// function so it can be unit tested on its own).
+//   inc_dim          - offset (in imgpel units) from an edge row to the next row away from the edge
+//   SrcPtrP, SrcPtrQ - first L0 / R0 sample; both rows are updated in place
+//   Strength         - samples 2k and 2k+1 use Strength[4k] and Strength[4k+1];
+//                      0 skips the sample, 4 selects the strong filter
+//   ClipTab          - clipping values indexed by strength (normal filter only)
+//   Alpha, Beta      - activity thresholds
+//   bitdepth_scale, max_imgpel_value - scale for tc0 and upper clamp for the normal filter
+static void FilterChroma8_Horiz(int inc_dim, imgpel *SrcPtrP, imgpel *SrcPtrQ, const byte Strength[16], const byte *ClipTab, int Alpha, int Beta, int bitdepth_scale, int max_imgpel_value)
+{
+	int idx;
+
+	for (idx = 0; idx < 8; idx++)
+	{
+		imgpel *pP = SrcPtrP + idx;
+		imgpel *pQ = SrcPtrQ + idx;
+		const int bS = Strength[(((idx >> 1) << 2) + (idx & 0x01))];
+		imgpel L0, R0, L1, R1;
+
+		if (bS == 0)
+			continue;
+
+		L0 = *pP;
+		R0 = *pQ;
+		if (abs( R0 - L0 ) >= Alpha)
+			continue;
+
+		// the outer samples are only fetched once the previous test has passed
+		R1 = *(pQ + inc_dim);
+		if (abs(R0 - R1) >= Beta)
+			continue;
+
+		L1 = *(pP - inc_dim);
+		if (abs(L0 - L1) >= Beta)
+			continue;
+
+		if (bS == 4)
+		{
+			// INTRA strong filtering
+			*pP = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+			*pQ = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+		}
+		else
+		{
+			// normal filtering: the correction is limited to +/- tc0
+			const int tc0 = ClipTab[ bS ] * bitdepth_scale + 1;
+			const int dif = iClip3( -tc0, tc0, ( ((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+
+			*pP = (imgpel) iClip1 ( max_imgpel_value, L0 + dif) ;
+			*pQ = (imgpel) iClip1 ( max_imgpel_value, R0 - dif) ;
+		}
+	}
+}
+
+/*
+ * Deblocks one horizontal chroma edge (dir == 1) of macroblock MbQ.
+ *
+ *   image    - chroma plane to filter (rows via image->img, row pitch via image->stride)
+ *   Strength - filter strengths; 0 skips a sample, 4 selects the strong filter
+ *   MbQ      - macroblock whose edge is filtered; supplies p_Vid and the deblocking offsets
+ *   edge     - edge row inside the macroblock; the P row is edge-1 (row 0 when edge >= 16)
+ *              and the Q row is the one directly after it
+ *   uv       - chroma component index; selects qpc[uv] and max_pel_value_comp[uv + 1]
+ *   p        - picture; only chroma_format_idc is read, to pick the sample count on the edge
+ *
+ * When the edge holds 8 samples the work is handed to FilterChroma8_Horiz_sse2 (if
+ * sse2_flag is set) or FilterChroma8_Horiz; otherwise the samples are filtered one at a
+ * time here, with Strength indexed directly by sample.
+ */
+void EdgeLoopChromaNormal_Horiz(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, int uv, StorablePicture *p)
+{ 
+	// dir == 1
+	imgpel** Img = image->img;
+	VideoParameters *p_Vid = MbQ->p_Vid;  
+
+	int yQ = (edge < 16 ? edge - 1: 0);  
+	PixelPos pixP;
+
+	p_Vid->getNeighbour0X(MbQ, yQ, p_Vid->mb_size[IS_CHROMA], &pixP);
+
+	if (pixP.available || (MbQ->DFDisableIdc == 0))
+	{
+		int      bitdepth_scale   = p_Vid->bitdepth_scale[IS_CHROMA];
+		int      max_imgpel_value = p_Vid->max_pel_value_comp[uv + 1];
+
+		int AlphaC0Offset = MbQ->DFAlphaC0Offset;
+		int BetaOffset = MbQ->DFBetaOffset;
+		Macroblock *MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+
+		// Average QP of the two blocks
+		int QP = (MbP->qpc[uv] + MbQ->qpc[uv] + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+		int indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+		int Alpha   = ALPHA_TABLE[indexA] * bitdepth_scale;
+		int Beta    = BETA_TABLE [indexB] * bitdepth_scale;
+
+		// a zero threshold can never be undercut, so the edge would stay untouched anyway
+		if (Alpha !=0 && Beta != 0)
+		{
+			const int PelNum = pelnum_cr[1][p->chroma_format_idc];
+			const     byte *ClipTab = CLIP_TAB[indexA];
+			int       inc_dim = image->stride;
+			int pel;
+			PixelPos pixQ;
+
+			p_Vid->getNeighbour0X(MbQ, ++yQ, p_Vid->mb_size[IS_CHROMA], &pixQ);
+
+			if (PelNum == 8)
+			{
+				imgpel *SrcPtrP = &(Img[pixP.pos_y][pixP.pos_x]);
+				imgpel *SrcPtrQ = &(Img[pixQ.pos_y][pixQ.pos_x]);
+				if (sse2_flag)
+					FilterChroma8_Horiz_sse2(inc_dim, SrcPtrP, SrcPtrQ, Strength, ClipTab, Alpha, Beta, bitdepth_scale, max_imgpel_value);
+				else
+					FilterChroma8_Horiz(inc_dim, SrcPtrP, SrcPtrQ, Strength, ClipTab, Alpha, Beta, bitdepth_scale, max_imgpel_value);
+			}
+			else
+			{
+				// PelNum != 8 on this path, so the strength index is simply the sample index
+				for( pel = 0 ; pel < PelNum ; ++pel )
+				{
+					const int Strng = Strength[pel];
+					imgpel *SrcPtrP, *SrcPtrQ;
+					imgpel L0, R0, L1, R1;
+
+					if( Strng == 0)
+						continue;
+
+					// the picture is only touched for samples that have a non-zero strength
+					SrcPtrP = &(Img[pixP.pos_y][pixP.pos_x + pel]);
+					SrcPtrQ = &(Img[pixQ.pos_y][pixQ.pos_x + pel]);
+					L0 = *SrcPtrP;
+					R0 = *SrcPtrQ;
+					if ( abs( R0 - L0 ) >= Alpha )
+						continue;
+
+					R1 = *(SrcPtrQ + inc_dim);
+					if ( abs(R0 - R1) >= Beta )
+						continue;
+
+					L1 = *(SrcPtrP - inc_dim);
+					if ( abs(L0 - L1) >= Beta )
+						continue;
+
+					if( Strng == 4 )    // INTRA strong filtering
+					{
+						*SrcPtrP = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+						*SrcPtrQ = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+					}
+					else
+					{
+						int tc0  = ClipTab[ Strng ] * bitdepth_scale + 1;
+						int dif = iClip3( -tc0, tc0, ( ((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+
+						*SrcPtrP = (imgpel) iClip1 ( max_imgpel_value, L0 + dif) ;
+						*SrcPtrQ = (imgpel) iClip1 ( max_imgpel_value, R0 - dif) ;
+					}
+				}
+			}
+		}
+	}
+}
+
+
+// MMX/SSE normal (non-strong) filter for one 8-sample horizontal chroma edge.
+// The 8 samples are processed as two stages of 4; `i` is the current stage (0 or 1).
+//   p_step      - offset (in imgpel units) between vertically adjacent rows
+//   SrcPtrP     - first L0 sample; R0 is read at SrcPtrP+p_step, R1 at SrcPtrP+2*p_step
+//                 and L1 at SrcPtrP-p_step
+//   Alpha, Beta - activity thresholds
+//   Strength    - Strength[2*i] and Strength[2*i+1] belong to the first and second pair of
+//                 samples of stage i; a stage whose two strengths are both 0 is skipped
+//   ClipTab     - clipping values indexed by strength; tc0 = ClipTab[s] + 1
+// NOTE(review): samples are moved with 4-byte `int` loads/stores through casted pointers,
+// which assumes imgpel is one byte wide and that such accesses are acceptable on the target - confirm.
+// NOTE(review): a pair with strength 0 whose stage partner is non-zero is not masked out;
+// it is filtered with tc0 = ClipTab[0] + 1. Presumably ClipTab[0] reads as -1 through the
+// `const char` view so that tc0 becomes 0 - confirm (depends on plain char being signed).
+// NOTE(review): no _mm_empty() is issued after the MMX work; presumably the caller clears
+// the MMX state before any x87 floating-point code runs - confirm.
+static void FilterChroma8_Horiz_sse(int p_step, imgpel *SrcPtrP, int Alpha, int Beta, const uint8_t Strength[4], const char *ClipTab)
+{
+	__m64 mmx_alpha_minus_one = _mm_set1_pi16(Alpha-1), mmx_beta_minus_one = _mm_set1_pi16(Beta-1);
+	__m64 mmx_zero	= _mm_setzero_si64(), mmx_four=_mm_set1_pi16(4);
+	__m64 mmx_minus_one;
+	__m64 mmx_absdiff, mmx_diff;
+	__m64 mmx_L0, mmx_L1;
+	__m64 mmx_R0, mmx_R1;
+	__m64 mmx_C0, mmx_negative_C0,  mmx_dif,  mmx_match;
+	int match;
+	int i=0;
+
+	mmx_minus_one = _mm_set1_pi32(-1);
+
+	STAGE:
+
+	// skip stages in which neither pair of samples is to be filtered
+	while (!Strength[i*2] && !Strength[i*2+1])
+	{
+		SrcPtrP += 4;
+		if (i++ == 1) // last stage
+			return;
+	}
+
+		mmx_L0 = _mm_cvtsi32_si64(*(int *)(SrcPtrP));        
+		mmx_R0 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+p_step));
+
+		// abs( R0 - L0 ) < Alpha
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// "x < t" is evaluated as NOT (x > t-1): compare-greater, then invert with the all-ones xor
+		mmx_L0 = _mm_unpacklo_pi8(mmx_L0, mmx_zero);
+		mmx_R0 = _mm_unpacklo_pi8(mmx_R0, mmx_zero);
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_L0);
+		mmx_absdiff =_mm_subs_pu16(mmx_L0, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_match = _mm_cmpgt_pi16(mmx_absdiff, mmx_alpha_minus_one); // 1's in any words we don't have to do
+		mmx_match = _mm_xor_si64(mmx_match, mmx_minus_one);
+		match = _mm_movemask_pi8(mmx_match); 
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+			SrcPtrP += 4;
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs( R0 - R1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_R0 already populated
+		mmx_R1 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+2*p_step));
+		mmx_R1 = _mm_unpacklo_pi8(mmx_R1, mmx_zero);
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_R1);
+		mmx_absdiff =_mm_subs_pu16(mmx_R1, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+				SrcPtrP += 4;
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs(L0 - L1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_L0 already populated
+		mmx_L1 = _mm_cvtsi32_si64(*(int *)(SrcPtrP-p_step));
+		mmx_L1 = _mm_unpacklo_pi8(mmx_L1, mmx_zero);    
+		mmx_diff=_mm_subs_pu16(mmx_L0, mmx_L1);
+		mmx_absdiff =_mm_subs_pu16(mmx_L1, mmx_L0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+SrcPtrP += 4;
+			goto STAGE; // start the process over from next position
+		}
+
+		// ok, now time to perform the actual calculation. hope it was worth it!!
+
+		// tc0  = ClipTab[ Strng ]  + 1
+		// lanes 0-1 take the first strength of this stage, lanes 2-3 the second
+		mmx_C0 = _mm_setr_pi16(ClipTab[Strength[i*2]]+1, ClipTab[Strength[i*2]]+1, ClipTab[Strength[i*2+1]]+1, ClipTab[Strength[i*2+1]]+1);
+		mmx_negative_C0 = _mm_sub_pi16(mmx_zero, mmx_C0);
+
+		// dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+		mmx_dif = mmx_R0;
+		mmx_dif = _mm_sub_pi16(mmx_dif, mmx_L0);
+		mmx_dif = _mm_slli_pi16(mmx_dif, 2);
+		mmx_dif = _mm_add_pi16(mmx_dif, mmx_L1);
+		mmx_dif = _mm_sub_pi16(mmx_dif, mmx_R1);
+		mmx_dif = _mm_add_pi16(mmx_dif, mmx_four);
+		mmx_dif = _mm_srai_pi16(mmx_dif, 3);
+		mmx_dif = _mm_min_pi16(mmx_dif, mmx_C0);
+		mmx_dif = _mm_max_pi16(mmx_dif, mmx_negative_C0);
+		// lanes that failed a threshold test get dif == 0 and are written back unchanged
+		mmx_dif = _mm_and_si64(mmx_dif, mmx_match);
+
+		// L0 = (imgpel) iClip1(max_imgpel_value, L0 + dif);
+		mmx_L0 = _mm_add_pi16(mmx_L0, mmx_dif);
+
+		// R0 = (imgpel) iClip1(max_imgpel_value, R0 - dif);
+		mmx_R0 = _mm_sub_pi16(mmx_R0, mmx_dif);
+
+		// store
+		// the unsigned-saturating pack performs the clamp to 0..255
+		mmx_R0 = _mm_packs_pu16(mmx_R0, mmx_R0); 
+		mmx_L0 = _mm_packs_pu16(mmx_L0, mmx_L0); 
+
+		*(int *)SrcPtrP = _mm_cvtsi64_si32(mmx_L0);
+		*(int *)(SrcPtrP+p_step) = _mm_cvtsi64_si32(mmx_R0);
+
+		if (i++ == 1)
+			return;
+
+		SrcPtrP += 4;
+		goto STAGE; // next stage
+}
+
+
+/*
+ * Deblocks one horizontal chroma edge (dir == 1) using the 8-sample SIMD helpers.
+ *
+ *   image    - chroma plane to filter (rows via image->img, row pitch via image->stride)
+ *   strength - four filter strengths; strength[0] == 4 selects the strong filter for the
+ *              whole edge, otherwise the normal filter is applied
+ *   MbQ, MbP - macroblocks on either side of the edge; supply qpc[uv] and the deblocking offsets
+ *   uv       - chroma component index into qpc[]
+ *   pixMB    - position of the first P-side sample; its coordinates are halved to address
+ *              the chroma plane (presumably they are given in luma units - confirm)
+ *
+ * Alpha/Beta are taken from the tables without a bit-depth scale factor.
+ * NOTE(review): both helpers are called without consulting sse2_flag, unlike
+ * EdgeLoopChromaNormal_Horiz; presumably the caller only selects this routine on CPUs
+ * with SSE2 support - confirm.
+ */
+void EdgeLoopChroma_Horiz_YUV420(VideoImage *image, const byte strength[4], Macroblock *MbQ, int uv, PixelPos pixMB, Macroblock *MbP)
+{ 
+	// dir == 1
+	imgpel** Img = image->img;
+
+	if (pixMB.available || (MbQ->DFDisableIdc == 0))
+	{
+		int AlphaC0Offset = MbQ->DFAlphaC0Offset;
+		int BetaOffset = MbQ->DFBetaOffset;
+
+		// Average QP of the two blocks
+		int QP = (MbP->qpc[uv] + MbQ->qpc[uv] + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+		int indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+		int Alpha   = ALPHA_TABLE[indexA] ;
+		int Beta    = BETA_TABLE [indexB] ;
+
+		// a zero threshold can never be undercut, so the edge would stay untouched anyway
+		if (Alpha !=0 && Beta != 0)
+		{
+			int       inc_dim = image->stride;
+			imgpel *SrcPtrP = &(Img[pixMB.pos_y>>1][pixMB.pos_x>>1]);
+
+			if (strength[0] == 4) // if strong filter is used, all blocks will be strong
+			{
+				IntraStrongFilter_Chroma8_Horiz_YUV420_sse2(inc_dim, SrcPtrP,   Alpha, Beta);
+			}
+			else
+			{
+				// the helper reads the table row through `const char *`; make the
+				// pointer conversion explicit instead of relying on an implicit one
+				const char *ClipTab = (const char *)CLIP_TAB[indexA];
+				FilterChroma8_Horiz_sse(inc_dim, SrcPtrP, Alpha, Beta, strength, ClipTab);
+			}
+		}
+	}
+}
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod/src/filter_chroma_vert.c b/Src/h264dec/ldecod/src/filter_chroma_vert.c
new file mode 100644
index 00000000..8c4a4c8c
--- /dev/null
+++ b/Src/h264dec/ldecod/src/filter_chroma_vert.c
@@ -0,0 +1,570 @@
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+#include <emmintrin.h>
+// Threshold tables used by the chroma deblocking routines in this file.
+// ALPHA_TABLE is read with indexA and BETA_TABLE with indexB; the caller clips both
+// indices to 0..MAX_QP before the lookup.
+static const byte ALPHA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,4,4,5,6,  7,8,9,10,12,13,15,17,  20,22,25,28,32,36,40,45,  50,56,63,71,80,90,101,113,  127,144,162,182,203,226,255,255} ;
+static const byte  BETA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,2,2,2,3,  3,3,3, 4, 4, 4, 6, 6,   7, 7, 8, 8, 9, 9,10,10,  11,11,12,12,13,13, 14, 14,   15, 15, 16, 16, 17, 17, 18, 18} ;
+// CLIP_TAB[indexA][strength]: clipping value from which the filters derive tc0.
+// NOTE(review): column 0 is written as -1 although the element type is `byte`; if `byte`
+// is unsigned the stored value is 255. The MMX helpers in this file take the row as
+// `const char *` and add 1, so presumably the -1 is meant to produce tc0 == 0 for
+// strength 0 - confirm (depends on plain char being signed).
+static const byte CLIP_TAB[52][5]  =
+{
+	{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},
+	{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},{ -1, 0, 0, 0, 0},
+	{ -1, 0, 0, 0, 0},{ -1, 0, 0, 1, 1},{ -1, 0, 0, 1, 1},{ -1, 0, 0, 1, 1},{ -1, 0, 0, 1, 1},{ -1, 0, 1, 1, 1},{ -1, 0, 1, 1, 1},{ -1, 1, 1, 1, 1},
+	{ -1, 1, 1, 1, 1},{ -1, 1, 1, 1, 1},{ -1, 1, 1, 1, 1},{ -1, 1, 1, 2, 2},{ -1, 1, 1, 2, 2},{ -1, 1, 1, 2, 2},{ -1, 1, 1, 2, 2},{ -1, 1, 2, 3, 3},
+	{ -1, 1, 2, 3, 3},{ -1, 2, 2, 3, 3},{ -1, 2, 2, 4, 4},{ -1, 2, 3, 4, 4},{ -1, 2, 3, 4, 4},{ -1, 3, 3, 5, 5},{ -1, 3, 4, 6, 6},{ -1, 3, 4, 6, 6},
+	{ -1, 4, 5, 7, 7},{ -1, 4, 5, 8, 8},{ -1, 4, 6, 9, 9},{ -1, 5, 7,10,10},{ -1, 6, 8,11,11},{ -1, 6, 8,13,13},{ -1, 7,10,14,14},{ -1, 8,11,16,16},
+	{ -1, 9,12,18,18},{ -1,10,13,20,20},{ -1,11,15,23,23},{ -1,13,17,25,25}
+} ;
+
+// Number of chroma samples along an edge, per edge direction and chroma format.
+static const int pelnum_cr[2][4] =  {{0,8,16,16}, {0,8, 8,16}};  //[dir:0=vert, 1=hor.][yuv_format]
+
+/*
+ * Deblocks one vertical chroma edge (dir == 0) of macroblock MbQ, one sample per row.
+ *
+ *   image    - chroma plane to filter (rows via image->img)
+ *   Strength - filter strengths; 0 skips a row, 4 selects the strong filter. With 8 rows
+ *              per edge, rows 2k and 2k+1 use Strength[4k] and Strength[4k+1]; otherwise
+ *              the row number is the index
+ *   MbQ      - macroblock whose edge is filtered; supplies p_Vid and the deblocking offsets
+ *   edge     - column of the first Q-side sample; the P-side sample is at edge-1
+ *   uv       - chroma component index; selects qpc[uv] and max_pel_value_comp[uv + 1]
+ *   p        - picture; only chroma_format_idc is read, to pick the number of rows
+ */
+void EdgeLoopChromaNormal_Vert(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, int uv, StorablePicture *p)
+{ 
+	// dir == 0
+	imgpel** Img = image->img;
+	VideoParameters *p_Vid = MbQ->p_Vid;  
+	PixelPos pixP;
+
+	p_Vid->getNeighbourX0(MbQ, edge - 1, p_Vid->mb_size[IS_CHROMA], &pixP);
+
+	if (pixP.available || (MbQ->DFDisableIdc == 0))
+	{
+		int      bitdepth_scale   = p_Vid->bitdepth_scale[IS_CHROMA];
+		int      max_imgpel_value = p_Vid->max_pel_value_comp[uv + 1];
+
+		int AlphaC0Offset = MbQ->DFAlphaC0Offset;
+		int BetaOffset = MbQ->DFBetaOffset;
+		Macroblock *MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+
+		// Average QP of the two blocks
+		int QP = (MbP->qpc[uv] + MbQ->qpc[uv] + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+		int indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+		int Alpha   = ALPHA_TABLE[indexA] * bitdepth_scale;
+		int Beta    = BETA_TABLE [indexB] * bitdepth_scale;
+
+		// a zero threshold can never be undercut, so the edge would stay untouched anyway
+		if (Alpha !=0 && Beta != 0)
+		{
+			const int PelNum = pelnum_cr[0][p->chroma_format_idc];
+			const     byte *ClipTab = CLIP_TAB[indexA];
+			const int inc_dim = 1;                     // neighbours across a vertical edge are adjacent in memory
+			const int paired_strength = (PelNum == 8); // loop-invariant choice of strength indexing
+			int pel;
+			PixelPos pixQ;
+
+			p_Vid->getNeighbourX0(MbQ, edge, p_Vid->mb_size[IS_CHROMA], &pixQ);
+
+			for( pel = 0 ; pel < PelNum ; ++pel )
+			{
+				const int Strng = Strength[paired_strength ? (((pel >> 1) << 2) + (pel & 0x01)) : pel];
+				imgpel *SrcPtrP, *SrcPtrQ;
+				imgpel L0, R0, L1, R1;
+
+				if( Strng == 0)
+					continue;
+
+				// the picture is only touched for rows that have a non-zero strength
+				SrcPtrP = &(Img[pixP.pos_y + pel][pixP.pos_x]);
+				SrcPtrQ = &(Img[pixQ.pos_y + pel][pixQ.pos_x]);
+				L0 = *SrcPtrP;
+				R0 = *SrcPtrQ;
+				if ( abs( R0 - L0 ) >= Alpha )
+					continue;
+
+				R1 = *(SrcPtrQ + inc_dim);
+				if ( abs(R0 - R1) >= Beta )
+					continue;
+
+				L1 = *(SrcPtrP - inc_dim);
+				if ( abs(L0 - L1) >= Beta )
+					continue;
+
+				if( Strng == 4 )    // INTRA strong filtering
+				{
+					*SrcPtrP = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+					*SrcPtrQ = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+				}
+				else
+				{
+					int tc0  = ClipTab[ Strng ] * bitdepth_scale + 1;
+					int dif = iClip3( -tc0, tc0, ( ((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+
+					*SrcPtrP = (imgpel) iClip1 ( max_imgpel_value, L0 + dif) ;
+					*SrcPtrQ = (imgpel) iClip1 ( max_imgpel_value, R0 - dif) ;
+				}
+			}
+		}
+	}
+}
+
+// MMX/SSE strong filter for a vertical chroma edge, processed as two stages of 4 rows
+// (`i` is the stage index, 0 or 1).
+//   p_step      - offset (in imgpel units) from one row to the next
+//   SrcPtrP     - L0 sample of the first row; the pointer is moved back by one sample so
+//                 that each 4-byte row load holds L1 L0 R0 R1
+//   Alpha, Beta - activity thresholds
+// Each stage loads four rows, transposes them so that L1/L0/R0/R1 each sit in their own
+// register (one 16-bit lane per row), filters, transposes back and stores the four rows.
+// NOTE(review): rows are moved with 4-byte `int` loads/stores through casted pointers,
+// which assumes imgpel is one byte wide and that such accesses are acceptable on the target - confirm.
+// NOTE(review): no _mm_empty() is issued after the MMX work; presumably the caller clears
+// the MMX state before any x87 floating-point code runs - confirm.
+static void FilterStrongChroma_Vert_sse(int p_step, imgpel *SrcPtrP, int Alpha, int Beta)
+{
+	__m64 mmx_alpha_minus_one = _mm_set1_pi16(Alpha-1), mmx_beta_minus_one = _mm_set1_pi16(Beta-1);
+	__m64 mmx_zero	= _mm_setzero_si64(), mmx_two=_mm_set1_pi16(2);
+	__m64 mmx_minus_one;
+	__m64 mmx_absdiff, mmx_diff;
+	__m64 mmx_L0, mmx_L1, mmx_L1_L0;
+	__m64 mmx_R0, mmx_R0_R1, mmx_R1;
+	__m64 mmx_load0, mmx_load1, mmx_load2, mmx_load3, mmx_load4, mmx_load5, mmx_load6, mmx_load7, mmx_load8;
+	__m64 mmx_match, mmx_L0_new, mmx_R0_new;
+	int match;
+	int i=0;
+
+	mmx_minus_one = _mm_set1_pi32(-1);
+	SrcPtrP -= 1;
+
+	STAGE:
+		// the loads advance SrcPtrP by four rows in total, so on an early exit from this
+		// stage the pointer already addresses the first row of the next stage
+		mmx_load0 = _mm_cvtsi32_si64(*(int *)(SrcPtrP));          // La1 La0 Ra0 Ra1 --- --- --- ---
+		mmx_load1 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+=p_step));  // Lb1 Lb0 Rb0 Rb1 --- --- --- ---
+		mmx_load4 = _mm_unpacklo_pi8(mmx_load0, mmx_load1);     // La1 Lb1 La0 Lb0 Ra0 Rb0 Ra1 Rb1
+		mmx_load2 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+=p_step));  // Lc1 Lc0 Rc0 Rc1 --- --- --- ---
+		mmx_load3 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+=p_step));  // Ld1 Ld0 Rd0 Rd1 --- --- --- ---
+		SrcPtrP+=p_step;
+		mmx_load5 = _mm_unpacklo_pi8(mmx_load2, mmx_load3);     // Lc1 Ld1 Lc0 Ld0 Rc0 Rd0 Rc1 Rd1
+		mmx_L1_L0 = _mm_unpacklo_pi16(mmx_load4, mmx_load5);    // La1 Lb1 Lc1 Ld1 La0 Lb0 Lc0 Ld0
+		mmx_R0_R1 = _mm_unpackhi_pi16(mmx_load4, mmx_load5);    // Ra0 Rb0 Rc0 Rd0 Ra1 Rb1 Rc1 Rd1
+
+		// abs( R0 - L0 ) < Alpha
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// "x < t" is evaluated as NOT (x > t-1): compare-greater, then invert with the all-ones xor
+		mmx_L0 = _mm_unpackhi_pi8(mmx_L1_L0, mmx_zero);             // La0 Lb0 Lc0 Ld0
+		mmx_R0 = _mm_unpacklo_pi8(mmx_R0_R1, mmx_zero);          // Ra0 Rb0 Rc0 Rd0
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_L0);
+		mmx_absdiff =_mm_subs_pu16(mmx_L0, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_match = _mm_cmpgt_pi16(mmx_absdiff, mmx_alpha_minus_one); // 1's in any words we don't have to do
+		mmx_match = _mm_xor_si64(mmx_match, mmx_minus_one);
+		match = _mm_movemask_pi8(mmx_match); 
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+				
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs( R0 - R1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_R0 already populated
+		mmx_R1 = _mm_unpackhi_pi8(mmx_R0_R1, mmx_zero);           // Ra1 Rb1 Rc1 Rd1
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_R1);
+		mmx_absdiff =_mm_subs_pu16(mmx_R1, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+				
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs(L0 - L1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_L0 already populated
+		mmx_L1 = _mm_unpacklo_pi8(mmx_L1_L0, mmx_zero);      // La1 Lb1 Lc1 Ld1
+		mmx_diff=_mm_subs_pu16(mmx_L0, mmx_L1);
+		mmx_absdiff =_mm_subs_pu16(mmx_L1, mmx_L0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+
+			goto STAGE; // start the process over from next position
+		}
+
+		// ok, now time to perform the actual calculation. hope it was worth it!!
+		
+				// L0 = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+		mmx_L0_new = mmx_L1;
+		mmx_L0_new = _mm_slli_pi16(mmx_L0_new, 1);
+		mmx_L0_new = _mm_add_pi16(mmx_L0_new, mmx_L0);
+		mmx_L0_new = _mm_add_pi16(mmx_L0_new, mmx_R1);
+		mmx_L0_new = _mm_add_pi16(mmx_L0_new, mmx_two);
+		mmx_L0_new = _mm_srai_pi16(mmx_L0_new, 2);
+		// blend: filtered value in rows that passed all tests, original L0 elsewhere
+		mmx_L0_new = _mm_and_si64(mmx_L0_new, mmx_match);
+		mmx_L0 = _mm_andnot_si64(mmx_match, mmx_L0);
+		mmx_L0 = _mm_or_si64(mmx_L0, mmx_L0_new);
+
+		// R0 = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+		mmx_R0_new = mmx_R1;
+		mmx_R0_new = _mm_slli_pi16(mmx_R0_new, 1);
+		mmx_R0_new = _mm_add_pi16(mmx_R0_new, mmx_R0);
+		mmx_R0_new = _mm_add_pi16(mmx_R0_new, mmx_L1);
+		mmx_R0_new = _mm_add_pi16(mmx_R0_new, mmx_two);
+		mmx_R0_new = _mm_srai_pi16(mmx_R0_new, 2);
+		mmx_R0_new = _mm_and_si64(mmx_R0_new, mmx_match);
+		mmx_R0 = _mm_andnot_si64(mmx_match, mmx_R0);
+		mmx_R0 = _mm_or_si64(mmx_R0, mmx_R0_new);
+
+	// now for the super-exciting fun of getting this data back into memory
+		// rewind to the first row of this stage (the loads above moved four rows ahead)
+		SrcPtrP -= 4*p_step;
+
+				// rotate 4x4 matrix
+		mmx_load1 = _mm_unpacklo_pi16(mmx_L1, mmx_R0); // 00 20 01 21
+		mmx_load3 = _mm_unpackhi_pi16(mmx_L1, mmx_R0); // 02 22 03 23
+		mmx_load2 = _mm_unpacklo_pi16(mmx_L0, mmx_R1); // 10 30 11 31
+		mmx_load4 = _mm_unpackhi_pi16(mmx_L0, mmx_R1); // 12 32 13 33
+		mmx_load5 = _mm_unpacklo_pi16(mmx_load1, mmx_load2); // 00 10 20 30
+		mmx_load6 = _mm_unpackhi_pi16(mmx_load1, mmx_load2); // 01 11 21 31
+		mmx_load7 = _mm_unpacklo_pi16(mmx_load3, mmx_load4); // 02 12 22 32
+		mmx_load8 = _mm_unpackhi_pi16(mmx_load3, mmx_load4); // 03 13 23 33
+		// narrow each row from 16-bit lanes back to bytes (unsigned saturation)
+		mmx_load5 = _mm_packs_pu16(mmx_load5, mmx_load5); 
+		mmx_load6 = _mm_packs_pu16(mmx_load6, mmx_load6); 
+		mmx_load7 = _mm_packs_pu16(mmx_load7, mmx_load7); 
+		mmx_load8 = _mm_packs_pu16(mmx_load8, mmx_load8); 
+
+		//mmx_load1 = _mm_setr_pi16(0x8080, 0x80, 0, 0);
+		*(int *)SrcPtrP = _mm_cvtsi64_si32(mmx_load5);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load6);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load7);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load8);
+
+		if (i++ == 1)
+			return;
+
+		// the stores left SrcPtrP on the fourth row; step to the first row of the next stage
+		SrcPtrP += p_step;
+		goto STAGE; // next stage
+}
+
+static void FilterChroma_Vert_sse(int p_step, imgpel *SrcPtrP, int Alpha, int Beta, const uint8_t Strength[4], const char *ClipTab)
+{
+	__m64 mmx_alpha_minus_one = _mm_set1_pi16(Alpha-1), mmx_beta_minus_one = _mm_set1_pi16(Beta-1);
+	__m64 mmx_zero	= _mm_setzero_si64(), mmx_four=_mm_set1_pi16(4);
+	__m64 mmx_minus_one;
+	__m64 mmx_absdiff, mmx_diff;
+	__m64 mmx_L0, mmx_L1, mmx_L1_L0;
+	__m64 mmx_R0, mmx_R0_R1, mmx_R1;
+	__m64 mmx_load0, mmx_load1, mmx_load2, mmx_load3, mmx_load4, mmx_load5, mmx_load6, mmx_load7, mmx_load8;
+	__m64 mmx_C0, mmx_negative_C0,  mmx_dif,  mmx_match;
+	int match;
+	int i=0;
+
+	mmx_minus_one = _mm_set1_pi32(-1);
+	SrcPtrP -= 1;
+
+	STAGE:
+
+	while (!Strength[i*2] && !Strength[i*2+1])
+	{
+		SrcPtrP += p_step*4;
+		if (i++ == 1) // last stage
+			return;
+	}
+
+		mmx_load0 = _mm_cvtsi32_si64(*(int *)(SrcPtrP));          // La1 La0 Ra0 Ra1 --- --- --- ---
+		mmx_load1 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+=p_step));  // Lb1 Lb0 Rb0 Rb1 --- --- --- ---
+		mmx_load4 = _mm_unpacklo_pi8(mmx_load0, mmx_load1);     // La1 Lb1 La0 Lb0 Ra0 Rb0 Ra1 Rb1
+		mmx_load2 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+=p_step));  // Lc1 Lc0 Rc0 Rc1 --- --- --- ---
+		mmx_load3 = _mm_cvtsi32_si64(*(int *)(SrcPtrP+=p_step));  // Ld1 Ld0 Rd0 Rd1 --- --- --- ---
+		SrcPtrP+=p_step;
+		mmx_load5 = _mm_unpacklo_pi8(mmx_load2, mmx_load3);     // Lc1 Ld1 Lc0 Ld0 Rc0 Rd0 Rc1 Rd1
+		mmx_L1_L0 = _mm_unpacklo_pi16(mmx_load4, mmx_load5);    // La1 Lb1 Lc1 Ld1 La0 Lb0 Lc0 Ld0
+		mmx_R0_R1 = _mm_unpackhi_pi16(mmx_load4, mmx_load5);    // Ra0 Rb0 Rc0 Rd0 Ra1 Rb1 Rc1 Rd1
+
+		// abs( R0 - L0 ) < Alpha
+		// MMX doesn't have unsigned compare, so we have to go to short
+		mmx_L0 = _mm_unpackhi_pi8(mmx_L1_L0, mmx_zero);             // La0 Lb0 Lc0 Ld0
+		mmx_R0 = _mm_unpacklo_pi8(mmx_R0_R1, mmx_zero);          // Ra0 Rb0 Rc0 Rd0
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_L0);
+		mmx_absdiff =_mm_subs_pu16(mmx_L0, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_match = _mm_cmpgt_pi16(mmx_absdiff, mmx_alpha_minus_one); // 1's in any words we don't have to do
+		mmx_match = _mm_xor_si64(mmx_match, mmx_minus_one);
+		match = _mm_movemask_pi8(mmx_match); 
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+				
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs( R0 - R1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_R0 already populated
+		mmx_R1 = _mm_unpackhi_pi8(mmx_R0_R1, mmx_zero);           // Ra1 Rb1 Rc1 Rd1
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_R1);
+		mmx_absdiff =_mm_subs_pu16(mmx_R1, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+				
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs(L0 - L1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_L0 already populated
+		mmx_L1 = _mm_unpacklo_pi8(mmx_L1_L0, mmx_zero);      // La1 Lb1 Lc1 Ld1
+		mmx_diff=_mm_subs_pu16(mmx_L0, mmx_L1);
+		mmx_absdiff =_mm_subs_pu16(mmx_L1, mmx_L0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 1) // last stage
+				return;
+
+			goto STAGE; // start the process over from next position
+		}
+
+		// ok, now time to performn the actual calculation. hope it was worth it!!
+
+		// tc0  = ClipTab[ Strng ]  + 1
+		mmx_C0 = _mm_setr_pi16(ClipTab[Strength[i*2]]+1, ClipTab[Strength[i*2]]+1, ClipTab[Strength[i*2+1]]+1, ClipTab[Strength[i*2+1]]+1);
+		mmx_negative_C0 = _mm_sub_pi16(mmx_zero, mmx_C0);
+
+		// dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+		mmx_dif = mmx_R0;
+		mmx_dif = _mm_sub_pi16(mmx_dif, mmx_L0);
+		mmx_dif = _mm_slli_pi16(mmx_dif, 2);
+		mmx_dif = _mm_add_pi16(mmx_dif, mmx_L1);
+		mmx_dif = _mm_sub_pi16(mmx_dif, mmx_R1);
+		mmx_dif = _mm_add_pi16(mmx_dif, mmx_four);
+		mmx_dif = _mm_srai_pi16(mmx_dif, 3);
+		mmx_dif = _mm_min_pi16(mmx_dif, mmx_C0);
+		mmx_dif = _mm_max_pi16(mmx_dif, mmx_negative_C0);
+		mmx_dif = _mm_and_si64(mmx_dif, mmx_match);
+
+		// L0 = (imgpel) iClip1(max_imgpel_value, L0 + dif);
+		mmx_L0 = _mm_add_pi16(mmx_L0, mmx_dif);
+
+		// R0 = (imgpel) iClip1(max_imgpel_value, R0 - dif);
+		mmx_R0 = _mm_sub_pi16(mmx_R0, mmx_dif);
+
+	// now for the super-exciting fun of getting this data back into memory
+		SrcPtrP -= 4*p_step;
+
+				// rotate 4x4 matrix
+		mmx_load1 = _mm_unpacklo_pi16(mmx_L1, mmx_R0); // 00 20 01 21
+		mmx_load2 = _mm_unpacklo_pi16(mmx_L0, mmx_R1); // 10 30 11 31
+		mmx_load3 = _mm_unpackhi_pi16(mmx_L1, mmx_R0); // 02 22 03 23
+		mmx_load4 = _mm_unpackhi_pi16(mmx_L0, mmx_R1); // 12 32 13 33
+		mmx_load5 = _mm_unpacklo_pi16(mmx_load1, mmx_load2); // 00 10 20 30
+		mmx_load6 = _mm_unpackhi_pi16(mmx_load1, mmx_load2); // 01 11 21 31
+		mmx_load7 = _mm_unpacklo_pi16(mmx_load3, mmx_load4); // 02 12 22 32
+		mmx_load8 = _mm_unpackhi_pi16(mmx_load3, mmx_load4); // 03 13 23 33
+		mmx_load5 = _mm_packs_pu16(mmx_load5, mmx_load5); 
+		mmx_load6 = _mm_packs_pu16(mmx_load6, mmx_load6); 
+		mmx_load7 = _mm_packs_pu16(mmx_load7, mmx_load7); 
+		mmx_load8 = _mm_packs_pu16(mmx_load8, mmx_load8); 
+
+		*(int *)SrcPtrP = _mm_cvtsi64_si32(mmx_load5);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load6);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load7);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load8);
+
+		if (i++ == 1)
+			return;
+
+		SrcPtrP += p_step;
+		goto STAGE; // next stage
+}
+
+/*
+ * Plain-C strong chroma filter for a vertical edge, 8 rows tall.
+ *
+ *   p_step  - distance, in imgpel units, from one row to the next
+ *   SrcPtrP - pixel immediately left of the edge in the first row
+ *             (SrcPtrP[-1], [0] | [1], [2] are L1, L0 | R0, R1)
+ *   Alpha   - threshold for |R0 - L0|
+ *   Beta    - threshold for |R0 - R1| and |L0 - L1|
+ *
+ * A row is modified only when all three differences are below their
+ * threshold; then L0 and R0 are replaced by 3-tap averages.
+ * Not called anywhere in the visible part of this file.
+ */
+static void FilterStrongChroma_Vert_c(int p_step, imgpel *SrcPtrP, int Alpha, int Beta)
+{
+	imgpel *row = SrcPtrP;
+	int remaining;
+
+	for (remaining = 8; remaining > 0; remaining--, row += p_step)
+	{
+		const int p0 = row[0];
+		const int q0 = row[1];
+		int p1, q1;
+
+		if (abs(q0 - p0) >= Alpha)
+			continue;
+
+		q1 = row[2];
+		if (abs(q0 - q1) >= Beta)
+			continue;
+
+		p1 = row[-1];
+		if (abs(p0 - p1) >= Beta)
+			continue;
+
+		// both outputs are computed from the unmodified neighbourhood
+		row[0] = (imgpel) (((p1 << 1) + p0 + q1 + 2) >> 2);
+		row[1] = (imgpel) (((q1 << 1) + q0 + p1 + 2) >> 2);
+	}
+}
+
+/*
+ * Plain-C normal (non-strong) chroma filter for a vertical edge, 8 rows tall.
+ *
+ *   p_step   - distance, in imgpel units, from one row to the next
+ *   SrcPtrP  - pixel immediately left of the edge in the first row
+ *   Alpha    - threshold for |R0 - L0|
+ *   Beta     - threshold for |R0 - R1| and |L0 - L1|
+ *   Strength - four boundary strengths, one per pair of rows; 0 skips the pair
+ *   ClipTab  - clipping row, indexed by boundary strength
+ *
+ * Results are clipped with iClip1(255, ...).
+ */
+static void FilterChroma_Vert_c(int p_step, imgpel *SrcPtrP, int Alpha, int Beta, const uint8_t Strength[4], const byte *ClipTab)
+{
+	imgpel *row = SrcPtrP;
+	int y;
+
+	for (y = 0; y < 8; y++, row += p_step)
+	{
+		const uint8_t bS = Strength[y >> 1];
+		int p0, q0, p1, q1;
+		int tc, delta;
+
+		if (bS == 0)
+			continue;
+
+		p0 = row[0];
+		q0 = row[1];
+		if (abs(q0 - p0) >= Alpha)
+			continue;
+
+		q1 = row[2];
+		if (abs(q0 - q1) >= Beta)
+			continue;
+
+		p1 = row[-1];
+		if (abs(p0 - p1) >= Beta)
+			continue;
+
+		// clipping bound is the table entry for this strength plus one
+		tc = ClipTab[bS] + 1;
+		delta = iClip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+
+		row[0] = (imgpel) iClip1(255, p0 + delta);
+		row[1] = (imgpel) iClip1(255, q0 - delta);
+	}
+}
+
+/*
+ * Filters one vertical chroma edge between macroblocks MbP and MbQ.
+ *
+ *   image    - chroma plane (img row pointers and stride are used)
+ *   Strength - four boundary strengths; Strength[0] == 4 selects the strong filter
+ *   MbQ, MbP - macroblocks on either side; their qpc[uv] values are averaged
+ *   uv       - index into qpc[]
+ *   pixMB1   - position of the P-side pixel; pos_x/pos_y are halved before
+ *              indexing the plane (presumably luma coordinates in 4:2:0 - confirm)
+ *
+ * Does nothing when the neighbour is unavailable and DFDisableIdc is non-zero,
+ * or when the Alpha or Beta threshold for the averaged QP is zero.
+ */
+void EdgeLoopChroma_Vert_YUV420(VideoImage *image, const uint8_t Strength[4], Macroblock *MbQ, int uv, PixelPos pixMB1, Macroblock *MbP)
+{
+	// dir == 0
+	int alpha_offset, beta_offset;
+	int avg_qp, idx_alpha, idx_beta;
+	int alpha, beta;
+	int row_step;
+	imgpel *edge_pixel;
+
+	if (!pixMB1.available && MbQ->DFDisableIdc != 0)
+		return;
+
+	alpha_offset = MbQ->DFAlphaC0Offset;
+	beta_offset = MbQ->DFBetaOffset;
+
+	// Average QP of the two blocks
+	avg_qp = (MbP->qpc[uv] + MbQ->qpc[uv] + 1) >> 1;
+
+	idx_alpha = iClip3(0, MAX_QP, avg_qp + alpha_offset);
+	alpha = ALPHA_TABLE[idx_alpha];
+	if (alpha == 0)
+		return;
+
+	idx_beta = iClip3(0, MAX_QP, avg_qp + beta_offset);
+	beta = BETA_TABLE[idx_beta];
+	if (beta == 0)
+		return;
+
+	row_step = image->stride;
+	edge_pixel = &(image->img[pixMB1.pos_y >> 1][pixMB1.pos_x >> 1]);
+
+	if (Strength[0] == 4)
+		FilterStrongChroma_Vert_sse(row_step, edge_pixel, alpha, beta);
+	else
+		FilterChroma_Vert_sse(row_step, edge_pixel, alpha, beta, Strength, CLIP_TAB[idx_alpha]);
+}
+
+/*
+ * Per-row chroma filter for a vertical edge, locating both sides of the edge
+ * with getAffNeighbour() for each of the 8 rows.
+ *
+ *   image    - chroma plane (img row pointers are used)
+ *   Strength - boundary strengths; the index used for a row depends on whether
+ *              MbQ is a field macroblock while its neighbour is not
+ *   MbQ      - current macroblock
+ *   edge     - x position of the edge inside MbQ
+ *   uv       - index into qpc[]; uv + 1 indexes max_pel_value_comp[]
+ *   p        - not used
+ *
+ * A strength of 4 applies the 3-tap strong filter, any other non-zero
+ * strength the clipped normal filter.
+ */
+void EdgeLoopChromaMBAff_Vert_YUV420(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, int uv, StorablePicture *p)
+{
+	// dir == 0
+	imgpel **plane_rows = image->img;
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	const int scale = p_Vid->bitdepth_scale[IS_CHROMA];
+	const int pel_limit = p_Vid->max_pel_value_comp[uv + 1];
+	const int alpha_offset = MbQ->DFAlphaC0Offset;
+	const int beta_offset = MbQ->DFBetaOffset;
+	PixelPos pixP, pixQ;	// kept at function scope; refilled by getAffNeighbour on every row
+	int row;
+
+	for (row = 0; row < 8; ++row)
+	{
+		Macroblock *MbP;
+		imgpel *edgeP, *edgeQ;
+		const byte *clip_row;
+		int bs_index, bS;
+		int avg_qp, idx_alpha, idx_beta, alpha, beta;
+		int p0, p1, q0, q1;
+
+		getAffNeighbour(MbQ, edge, row, p_Vid->mb_size[IS_CHROMA], &pixQ);
+		getAffNeighbour(MbQ, edge - 1, row, p_Vid->mb_size[IS_CHROMA], &pixP);
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+
+		if (MbQ->mb_field && !MbP->mb_field)
+			bs_index = row << 1;
+		else
+			bs_index = ((row >> 1) << 2) + (row & 0x01);
+
+		if (!pixP.available && MbQ->DFDisableIdc != 0)
+			continue;
+
+		bS = Strength[bs_index];
+		if (bS == 0)
+			continue;
+
+		edgeQ = &(plane_rows[pixQ.pos_y][pixQ.pos_x]);
+		edgeP = &(plane_rows[pixP.pos_y][pixP.pos_x]);
+
+		// Average QP of the two blocks
+		avg_qp = (MbP->qpc[uv] + MbQ->qpc[uv] + 1) >> 1;
+
+		idx_alpha = iClip3(0, MAX_QP, avg_qp + alpha_offset);
+		idx_beta = iClip3(0, MAX_QP, avg_qp + beta_offset);
+
+		alpha = ALPHA_TABLE[idx_alpha] * scale;
+		beta = BETA_TABLE[idx_beta] * scale;
+		clip_row = CLIP_TAB[idx_alpha];
+
+		p0 = edgeP[0];
+		q0 = edgeQ[0];
+		if (abs(q0 - p0) >= alpha)
+			continue;
+
+		p1 = edgeP[-1];
+		q1 = edgeQ[1];
+		if (abs(q0 - q1) >= beta || abs(p0 - p1) >= beta)
+			continue;
+
+		if (bS == 4)
+		{
+			// INTRA strong filtering
+			edgeQ[0] = (imgpel) (((q1 << 1) + q0 + p1 + 2) >> 2);
+			edgeP[0] = (imgpel) (((p1 << 1) + p0 + q1 + 2) >> 2);
+		}
+		else
+		{
+			const int tc = clip_row[bS] * scale + 1;
+			const int delta = iClip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+
+			edgeP[0] = (imgpel) iClip1(pel_limit, p0 + delta);
+			edgeQ[0] = (imgpel) iClip1(pel_limit, q0 - delta);
+		}
+	}
+}
diff --git a/Src/h264dec/ldecod/src/filter_luma_horiz.c b/Src/h264dec/ldecod/src/filter_luma_horiz.c
new file mode 100644
index 00000000..97438dfb
--- /dev/null
+++ b/Src/h264dec/ldecod/src/filter_luma_horiz.c
@@ -0,0 +1,871 @@
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+#include <mmintrin.h>
+#include <emmintrin.h>
+/*
+ * Threshold tables for the deblocking filter, private to this file.
+ *
+ * ALPHA_TABLE and BETA_TABLE are indexed by indexA / indexB, i.e. the averaged
+ * QP of the two macroblocks plus DFAlphaC0Offset / DFBetaOffset, clipped to
+ * [0, MAX_QP]. With 52 entries each, MAX_QP is presumably 51 - confirm.
+ * An entry of 0 makes the callers in this file skip the edge entirely.
+ *
+ * CLIP_TAB is indexed [indexA][boundary strength 0..4] and supplies the C0
+ * clipping bound used by the normal (non-strong) filter.
+ *
+ * NOTE(review): the values look like the alpha / beta / tC0 tables of the
+ * H.264 deblocking filter - verify against the standard before editing.
+ */
+static const byte ALPHA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,4,4,5,6,  7,8,9,10,12,13,15,17,  20,22,25,28,32,36,40,45,  50,56,63,71,80,90,101,113,  127,144,162,182,203,226,255,255} ;
+static const byte  BETA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,2,2,2,3,  3,3,3, 4, 4, 4, 6, 6,   7, 7, 8, 8, 9, 9,10,10,  11,11,12,12,13,13, 14, 14,   15, 15, 16, 16, 17, 17, 18, 18} ;
+static const byte CLIP_TAB[52][5]  =
+{
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 1, 1, 1},{ 0, 0, 1, 1, 1},{ 0, 1, 1, 1, 1},
+	{ 0, 1, 1, 1, 1},{ 0, 1, 1, 1, 1},{ 0, 1, 1, 1, 1},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 2, 3, 3},
+	{ 0, 1, 2, 3, 3},{ 0, 2, 2, 3, 3},{ 0, 2, 2, 4, 4},{ 0, 2, 3, 4, 4},{ 0, 2, 3, 4, 4},{ 0, 3, 3, 5, 5},{ 0, 3, 4, 6, 6},{ 0, 3, 4, 6, 6},
+	{ 0, 4, 5, 7, 7},{ 0, 4, 5, 8, 8},{ 0, 4, 6, 9, 9},{ 0, 5, 7,10,10},{ 0, 6, 8,11,11},{ 0, 6, 8,13,13},{ 0, 7,10,14,14},{ 0, 8,11,16,16},
+	{ 0, 9,12,18,18},{ 0,10,13,20,20},{ 0,11,15,23,23},{ 0,13,17,25,25}
+} ;
+
+// benski> used for unit testing, not in production code
+/*
+ * Scalar check of which columns along a horizontal edge pass the three
+ * filter-enable tests (|R0-L0| < Alpha, |R0-R1| < Beta, |L0-L1| < Beta).
+ *
+ *   inc_dim - distance in imgpel units between vertically adjacent rows
+ *   SrcPtrP - first pixel of the L0 row; L1 is read at SrcPtrP - inc_dim
+ *   SrcPtrQ - first pixel of the R0 row; R1 is read at SrcPtrQ + inc_dim
+ *
+ * Returns a bitmask in which bits 2*c and 2*c+1 are set for every passing
+ * column c in [0, BLOCK_SIZE).
+ */
+static int CalculateMatches(int inc_dim, const imgpel *SrcPtrP, const imgpel *SrcPtrQ, int Alpha, int Beta)
+{
+	int mask = 0;
+	int col;
+
+	for (col = 0; col < BLOCK_SIZE; col++)
+	{
+		const int p0 = SrcPtrP[col];
+		const int q0 = SrcPtrQ[col];
+
+		// short-circuit keeps the R1 / L1 reads conditional on the earlier tests
+		if (abs(q0 - p0) < Alpha
+			&& abs(q0 - *(SrcPtrQ + inc_dim + col)) < Beta
+			&& abs(p0 - *(SrcPtrP - inc_dim + col)) < Beta)
+		{
+			mask |= 3 << (col * 2);
+		}
+	}
+
+	return mask;
+}
+
+/*
+ * Scalar strong (intra) luma filter across a horizontal edge, BLOCK_SIZE
+ * columns wide.
+ *
+ *   inc_dim - distance in imgpel units between vertically adjacent rows
+ *   SrcPtrP - first pixel of the L0 row; L1..L3 lie at SrcPtrP - k*inc_dim
+ *   SrcPtrQ - first pixel of the R0 row; R1..R3 lie at SrcPtrQ + k*inc_dim
+ *   Alpha   - threshold for |R0 - L0|
+ *   Beta    - threshold for the side differences
+ *
+ * A column is touched only if |R0-L0| < Alpha, |R0-R1| < Beta and
+ * |L0-L1| < Beta. On each side, three pixels are rewritten when that side is
+ * flat enough (|x0 - x2| < Beta together with |R0-L0| < (Alpha>>2)+2);
+ * otherwise only the pixel next to the edge is rewritten.
+ */
+static void IntraStrongFilter_Luma_Horiz(int inc_dim, imgpel *SrcPtrP, imgpel *SrcPtrQ, int Alpha, int Beta)
+{
+	const int tight_alpha = (Alpha >> 2) + 2;
+	int col;
+
+	for (col = 0; col < BLOCK_SIZE; col++)
+	{
+		imgpel *pP = SrcPtrP + col;
+		imgpel *pQ = SrcPtrQ + col;
+		const int p0 = pP[0];
+		const int q0 = pQ[0];
+		int p1, p2, p3, q1, q2, q3;
+		int sum_pq0, sum_p1pq0, tight, strongP, strongQ;
+
+		if (abs(q0 - p0) >= Alpha)
+			continue;
+
+		q1 = *(pQ + inc_dim);
+		if (abs(q0 - q1) >= Beta)
+			continue;
+
+		p1 = *(pP - inc_dim);
+		if (abs(p0 - p1) >= Beta)
+			continue;
+
+		q2 = *(pQ + inc_dim + inc_dim);
+		p2 = *(pP - inc_dim - inc_dim);
+
+		sum_pq0 = p0 + q0;
+		tight = (abs(q0 - p0) < tight_alpha);
+		strongQ = (abs(q0 - q2) < Beta) & tight;
+		strongP = (abs(p0 - p2) < Beta) & tight;
+
+		// P side: every formula uses the values sampled above, not the freshly written ones
+		if (strongP)
+		{
+			sum_p1pq0 = p1 + sum_pq0;
+			p3 = *(pP - inc_dim - inc_dim - inc_dim);
+			*pP = (imgpel) ((q1 + (sum_p1pq0 << 1) + p2 + 4) >> 3);
+			*(pP - inc_dim) = (imgpel) ((p2 + sum_p1pq0 + 2) >> 2);
+			*(pP - inc_dim - inc_dim) = (imgpel) ((((p3 + p2) << 1) + p2 + sum_p1pq0 + 4) >> 3);
+		}
+		else
+		{
+			*pP = (imgpel) (((p1 << 1) + p0 + q1 + 2) >> 2);
+		}
+
+		// Q side
+		if (strongQ)
+		{
+			q3 = *(pQ + inc_dim + inc_dim + inc_dim);
+			*pQ = (imgpel) ((p1 + ((q1 + sum_pq0) << 1) + q2 + 4) >> 3);
+			*(pQ + inc_dim) = (imgpel) ((q2 + q0 + p0 + q1 + 2) >> 2);
+			*(pQ + inc_dim + inc_dim) = (imgpel) ((((q3 + q2) << 1) + q2 + q1 + sum_pq0 + 4) >> 3);
+		}
+		else
+		{
+			*pQ = (imgpel) (((q1 << 1) + q0 + p1 + 2) >> 2);
+		}
+	}
+}
+/*
+ * LOAD_LINE_EPI16(reg, ptr): loads 8 bytes from ptr and zero-extends them into
+ * the eight 16-bit lanes of reg. Expands to two statements inside braces and
+ * needs an all-zero __m128i named xmm_zero in the calling scope.
+ *
+ * IntraStrongFilter_Luma_Horiz_sse2 (defined right after the macro) applies
+ * the strong (intra) luma filter to 8 adjacent columns of a horizontal edge.
+ *   inc_dim   - distance in imgpel units between vertically adjacent rows
+ *   SrcPtrP   - first pixel of the L0 row; L1..L3 are read at SrcPtrP - k*inc_dim
+ *   SrcPtrQ   - first pixel of the R0 row; R1..R3 are read at SrcPtrQ + k*inc_dim
+ *   xmm_alpha - Alpha in every 16-bit lane (as passed by the visible caller)
+ *   xmm_beta  - Beta in every 16-bit lane (as passed by the visible caller)
+ *   xmm_match - per-column 16-bit mask: all ones where the column passed the
+ *               filter-enable tests, zero where it must stay untouched
+ * Rows L2, L1, L0, R0, R1 and R2 are always stored back, 8 bytes each; masked
+ * columns receive their original value. The byte loads and stores assume
+ * imgpel is one byte wide.
+ */
+#define LOAD_LINE_EPI16(reg, ptr) { reg = _mm_loadl_epi64((__m128i *)(ptr));	reg = _mm_unpacklo_epi8(reg, xmm_zero); }
+static void IntraStrongFilter_Luma_Horiz_sse2(int inc_dim, imgpel *SrcPtrP, imgpel *SrcPtrQ, __m128i xmm_alpha, __m128i xmm_beta, __m128i xmm_match)
+{
+	__m128i xmm_zero = _mm_setzero_si128(); // also referenced by name inside LOAD_LINE_EPI16
+	__m128i xmm_smallgap;
+	__m128i xmm_ap, xmm_aq;
+	__m128i xmm_L3, xmm_L2, xmm_L1, xmm_L0, xmm_R0, xmm_R1, xmm_R2, xmm_R3;
+	__m128i xmm_4 = _mm_set1_epi16(4), xmm_2 = _mm_set1_epi16(2); // rounding terms for >>3 and >>2
+	__m128i xmm_add, xmm_add2, xmm_acc, xmm_match_and_an;
+	__m128i xmm_absdiff, xmm_diff;
+
+	LOAD_LINE_EPI16(xmm_L0, SrcPtrP);
+	LOAD_LINE_EPI16(xmm_R0, SrcPtrQ);
+
+	// small_gap = (abs( R0 - L0 ) < ((Alpha >> 2) + 2));
+	xmm_alpha = _mm_srai_epi16(xmm_alpha, 2); // the by-value parameter is reused to hold (Alpha >> 2) + 2
+	xmm_alpha = _mm_add_epi16(xmm_alpha, xmm_2);
+	xmm_diff=_mm_subs_epu16(xmm_R0, xmm_L0); // saturating a-b and b-a: one of them is 0, so OR-ing gives |a-b|
+	xmm_absdiff =_mm_subs_epu16(xmm_L0, xmm_R0);
+	xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+	xmm_smallgap = _mm_cmplt_epi16(xmm_absdiff, xmm_alpha);
+
+	LOAD_LINE_EPI16(xmm_R2, SrcPtrQ + 2*inc_dim);
+
+	// (abs(R0 - R2) < Beta) & small_gap;
+	xmm_diff=_mm_subs_epu16(xmm_R0, xmm_R2);
+	xmm_absdiff =_mm_subs_epu16(xmm_R2, xmm_R0);
+	xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+	xmm_aq = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+	xmm_aq = _mm_and_si128(xmm_aq, xmm_smallgap);
+
+	LOAD_LINE_EPI16(xmm_L2, SrcPtrP - 2*inc_dim);
+
+	//  (abs(L0 - L2) < Beta) & small_gap;
+	xmm_diff=_mm_subs_epu16(xmm_L0, xmm_L2);
+	xmm_absdiff =_mm_subs_epu16(xmm_L2, xmm_L0);
+	xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+	xmm_ap = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+	xmm_ap = _mm_and_si128(xmm_ap, xmm_smallgap);
+
+	LOAD_LINE_EPI16(xmm_L1, SrcPtrP - inc_dim);
+	LOAD_LINE_EPI16(xmm_R1, SrcPtrQ + inc_dim);
+	LOAD_LINE_EPI16(xmm_L3, SrcPtrP - 3*inc_dim);
+	LOAD_LINE_EPI16(xmm_R3, SrcPtrQ + 3*inc_dim);
+
+	xmm_match_and_an=_mm_and_si128(xmm_match, xmm_ap); // columns that are enabled AND take the "ap" branch
+
+	// if(ap) SrcPtrP   = (imgpel)  (( R1 + ((L1 + L0 + R0) << 1) +  L2 + 4) >> 3)
+	xmm_add = xmm_L1;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_slli_epi16(xmm_add, 1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_4);
+	xmm_add = _mm_srai_epi16(xmm_add, 3);
+	xmm_acc = _mm_and_si128(xmm_add, xmm_match_and_an); // parked here; merged into the L0 row further down
+
+	// if (ap) *P_L1 = (imgpel)  (( L2 + L1 + L0 + R0 + 2) >> 2);
+	xmm_add = xmm_L2;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_2);
+	xmm_add = _mm_srai_epi16(xmm_add, 2);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match_and_an);
+	xmm_add2= xmm_L1;
+	xmm_add2= _mm_andnot_si128(xmm_match_and_an, xmm_add2); // original L1 where the mask is clear
+	xmm_add=_mm_add_epi16(xmm_add, xmm_add2); // the two operands have disjoint masks, so the add acts as a select
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add); // narrow back to bytes with unsigned saturation
+	_mm_storel_epi64((__m128i *)(SrcPtrP-inc_dim), xmm_add); 
+
+
+	// if (ap) *P_L2 = (imgpel) ((((L3 + L2) <<1) + L2 + L1 + L0 + R0 + 4) >> 3);
+	xmm_add = xmm_L3;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L2);
+	xmm_add = _mm_slli_epi16(xmm_add, 1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_4);
+	xmm_add = _mm_srai_epi16(xmm_add, 3);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match_and_an);
+	xmm_add2= xmm_L2;
+	xmm_add2= _mm_andnot_si128(xmm_match_and_an, xmm_add2);
+	xmm_add=_mm_add_epi16(xmm_add, xmm_add2);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+	_mm_storel_epi64((__m128i *)(SrcPtrP-2*inc_dim), xmm_add); 
+
+	// if (!ap) *SrcPtrP = (imgpel) (((L1 << 1) + L0 + R1 + 2) >> 2) ; 
+	xmm_add = xmm_L1;
+	xmm_add = _mm_slli_epi16(xmm_add, 1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_2);
+	xmm_add = _mm_srai_epi16(xmm_add, 2);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match);
+	xmm_add = _mm_andnot_si128(xmm_ap, xmm_add); // keep only enabled columns where ap is false
+	xmm_add2= xmm_L0;
+	//xmm_match_and_an=_mm_or_si128(xmm_match, xmm_ap);
+	xmm_add2=_mm_andnot_si128(xmm_match, xmm_add2); // untouched L0 for disabled columns
+	xmm_add = _mm_add_epi16(xmm_add, xmm_add2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_acc); // merge in the ap-branch result computed earlier
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+	_mm_storel_epi64((__m128i *)(SrcPtrP), xmm_add); 
+
+	xmm_match_and_an=_mm_and_si128(xmm_match, xmm_aq); // same scheme for the Q side, using aq
+
+	// if (aq) *(SrcPtrQ            ) = (imgpel) (( L1 + ((R1 + L0 + R0) << 1) +  R2 + 4) >> 3);
+	xmm_add = xmm_R1;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_slli_epi16(xmm_add, 1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_4);
+	xmm_add = _mm_srai_epi16(xmm_add, 3);
+	xmm_acc = _mm_and_si128(xmm_add, xmm_match_and_an); // parked; merged into the R0 row at the end
+
+	// if (aq) *Q_R1 = (imgpel) (( R2 + R0 + L0 + R1 + 2) >> 2);
+	xmm_add = xmm_R2;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_2);
+	xmm_add = _mm_srai_epi16(xmm_add, 2);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match_and_an);
+	xmm_add2= xmm_R1;
+	xmm_add2= _mm_andnot_si128(xmm_match_and_an, xmm_add2);
+	xmm_add=_mm_add_epi16(xmm_add, xmm_add2);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+	_mm_storel_epi64((__m128i *)(SrcPtrQ+inc_dim), xmm_add); 
+
+	// if (aq) *Q_R2 = (imgpel) ((((R3 + R2) <<1) + R2 + R1 + L0 + R0 + 4) >> 3);
+	xmm_add = xmm_R3;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R2);
+	xmm_add = _mm_slli_epi16(xmm_add, 1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_4);
+	xmm_add = _mm_srai_epi16(xmm_add, 3);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match_and_an);
+	xmm_add2= xmm_R2;
+	xmm_add2= _mm_andnot_si128(xmm_match_and_an, xmm_add2);
+	xmm_add=_mm_add_epi16(xmm_add, xmm_add2);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+
+	_mm_storel_epi64((__m128i *)(SrcPtrQ+2*inc_dim), xmm_add); 
+
+	// if (!aq) *SrcPtrQ = (imgpel) (((R1 << 1) + R0 + L1 + 2) >> 2);
+	xmm_add = xmm_R1;
+	xmm_add = _mm_slli_epi16(xmm_add, 1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_2);
+	xmm_add = _mm_srai_epi16(xmm_add, 2);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match);
+	xmm_add = _mm_andnot_si128(xmm_aq, xmm_add);
+	xmm_add2= xmm_R0;
+	//xmm_match_and_an=_mm_or_si128(xmm_match, xmm_aq);
+	xmm_add2=_mm_andnot_si128(xmm_match, xmm_add2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_add2);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_acc);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+
+	_mm_storel_epi64((__m128i *)(SrcPtrQ), xmm_add); 
+
+
+}
+
+// benski> for some reason, Visual Studio 2008 only allows for 3 __m128i parameters, or else we'd pass a whole lot more for optimization reasons
+/* benski> we could put this function straight into EdgeLoopLumaNormal_Horiz_sse2 if we think it's worth it
+ *
+ * Normal (non-strong) luma filter for 8 adjacent columns of a horizontal edge.
+ *   inc_dim   - distance in imgpel units between vertically adjacent rows
+ *   SrcPtrP   - first pixel of the L0 row; L1 / L2 are read at SrcPtrP - k*inc_dim
+ *   SrcPtrQ   - first pixel of the R0 row; R1 / R2 are read at SrcPtrQ + k*inc_dim
+ *   xmm_beta  - Beta in every 16-bit lane (as passed by the visible caller)
+ *   C0        - clipping bounds: C0[0] for columns 0-3, C0[1] for columns 4-7
+ *   xmm_match - per-column 16-bit mask: all ones where the column is filtered
+ * There is no Alpha parameter; the Alpha test is expected to be folded into
+ * xmm_match already. Rows L1, L0, R0 and R1 are always stored back, 8 bytes
+ * each; masked columns receive their original value. The byte loads and
+ * stores assume imgpel is one byte wide.
+ */
+static void FilterLuma_Horiz_sse2(int inc_dim, imgpel *SrcPtrP, imgpel *SrcPtrQ, __m128i xmm_beta, int C0[2], __m128i xmm_match)
+{
+	__m128i xmm_zero = _mm_setzero_si128();
+	__m128i xmm_C0 = _mm_setr_epi16(C0[0], C0[0], C0[0], C0[0], C0[1], C0[1], C0[1], C0[1]); // TODO: benski> probably a better way to do this.
+	__m128i xmm_negative_C0;
+	__m128i xmm_tc0;
+	__m128i xmm_L2, xmm_L1, xmm_L0, xmm_R0, xmm_R1, xmm_R2;
+	__m128i xmm_absdiff, xmm_diff;
+	__m128i xmm_dif;
+	__m128i xmm_4 = _mm_set1_epi16(4), xmm_1 = _mm_set1_epi16(1);
+	__m128i xmm_add;
+	__m128i xmm_ap, xmm_aq;
+
+	xmm_negative_C0 = _mm_sub_epi16(xmm_zero, xmm_C0);
+	xmm_tc0 = xmm_C0; // starts at C0; ap and aq are added below
+
+	xmm_R2 = _mm_loadl_epi64((__m128i *)(SrcPtrQ + 2*inc_dim));
+	xmm_R2 = _mm_unpacklo_epi8(xmm_R2, xmm_zero);
+
+	xmm_R0 = _mm_loadl_epi64((__m128i *)(SrcPtrQ));
+	xmm_R0 = _mm_unpacklo_epi8(xmm_R0, xmm_zero);
+
+	// (abs(R0 - R2) < Beta);
+	xmm_diff=_mm_subs_epu16(xmm_R0, xmm_R2); // saturating a-b and b-a: one of them is 0, so OR-ing gives |a-b|
+	xmm_absdiff =_mm_subs_epu16(xmm_R2, xmm_R0);
+	xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+	xmm_aq = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+	xmm_add = _mm_srli_epi16(xmm_aq, 15); // convert 0xFFFF to 1 and 0x0000 to 0
+	xmm_tc0 = _mm_adds_epu16(xmm_tc0, xmm_add); 	// tc0  = (C0 + ap + aq) ;
+
+	xmm_L2 = _mm_loadl_epi64((__m128i *)(SrcPtrP - 2*inc_dim));
+	xmm_L2 = _mm_unpacklo_epi8(xmm_L2, xmm_zero);
+
+	xmm_L0 = _mm_loadl_epi64((__m128i *)(SrcPtrP));
+	xmm_L0 = _mm_unpacklo_epi8(xmm_L0, xmm_zero);
+
+	//  (abs(L0 - L2) < Beta);
+	xmm_diff=_mm_subs_epu16(xmm_L2, xmm_L0);
+	xmm_absdiff =_mm_subs_epu16(xmm_L0, xmm_L2);
+	xmm_absdiff =_mm_or_si128(xmm_absdiff, xmm_diff);
+	xmm_ap = _mm_cmplt_epi16(xmm_absdiff, xmm_beta);
+	xmm_add = _mm_srli_epi16(xmm_ap, 15); // convert 0xFFFF to 1 and 0x0000 to 0
+	xmm_tc0 = _mm_adds_epu16(xmm_tc0, xmm_add); 	// tc0  = (C0 + ap + aq) ;
+
+	xmm_L1 = _mm_loadl_epi64((__m128i *)(SrcPtrP - inc_dim));
+	xmm_L1 = _mm_unpacklo_epi8(xmm_L1, xmm_zero);
+
+	xmm_R1 = _mm_loadl_epi64((__m128i *)(SrcPtrQ + inc_dim));
+	xmm_R1 = _mm_unpacklo_epi8(xmm_R1, xmm_zero);
+
+	// dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + L1 - R1 + 4) >> 3 );
+	xmm_dif = xmm_R0;
+	xmm_dif = _mm_sub_epi16(xmm_dif, xmm_L0);
+	xmm_dif = _mm_slli_epi16(xmm_dif, 2);
+	xmm_dif = _mm_add_epi16(xmm_dif, xmm_L1);
+	xmm_dif = _mm_sub_epi16(xmm_dif, xmm_R1);
+	xmm_dif = _mm_add_epi16(xmm_dif, xmm_4);
+	xmm_dif = _mm_srai_epi16(xmm_dif, 3);
+	xmm_dif = _mm_min_epi16(xmm_dif, xmm_tc0); // upper clip at +tc0
+	xmm_tc0 = _mm_sub_epi16(xmm_zero, xmm_tc0); // xmm_tc0 now holds -tc0 and is not used as +tc0 again
+	xmm_dif = _mm_max_epi16(xmm_dif, xmm_tc0); // lower clip at -tc0
+	xmm_dif = _mm_and_si128(xmm_dif, xmm_match); // zero delta for columns that are not filtered
+
+	//	if( ap ) *P_L1 += iClip3( -C0,  C0, (L2 + ((L0 + R0 + 1) >> 1) - (L1<<1)) >> 1 );
+	xmm_add = xmm_L0;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_1);
+	xmm_add = _mm_srai_epi16(xmm_add, 1);
+	xmm_add = _mm_sub_epi16(xmm_add, xmm_L1); // subtracting L1 twice implements - (L1 << 1)
+	xmm_add = _mm_sub_epi16(xmm_add, xmm_L1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L2);
+	xmm_add = _mm_srai_epi16(xmm_add, 1);
+	xmm_add = _mm_min_epi16(xmm_add, xmm_C0);
+	xmm_add = _mm_max_epi16(xmm_add, xmm_negative_C0);
+	xmm_add = _mm_and_si128(xmm_add, xmm_ap);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_L1); // masked columns end up as plain L1
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add); // narrow back to bytes with unsigned saturation
+	_mm_storel_epi64((__m128i *)(SrcPtrP-inc_dim), xmm_add); 
+
+	// *SrcPtrP = (imgpel) iClip1(max_imgpel_value, L0 + dif);   (the clip is done by the saturating pack)
+	xmm_add = _mm_add_epi16(xmm_dif, xmm_L0);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+	_mm_storel_epi64((__m128i *)(SrcPtrP), xmm_add);
+
+	// *SrcPtrQ = (imgpel) iClip1(max_imgpel_value, R0 - dif);
+	xmm_add = _mm_sub_epi16(xmm_R0, xmm_dif);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+	_mm_storel_epi64((__m128i *)(SrcPtrQ), xmm_add); 
+
+	// if (aq) *Q_R1 += iClip3( -C0,  C0, (R2 + ((L0 + R0 + 1) >> 1) - (R1<<1)) >> 1 );
+	xmm_add = xmm_L0;
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R0);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_1);
+	xmm_add = _mm_srai_epi16(xmm_add, 1);
+	xmm_add = _mm_sub_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_sub_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R2);
+	xmm_add = _mm_srai_epi16(xmm_add, 1);
+	xmm_add = _mm_min_epi16(xmm_add, xmm_C0);
+	xmm_add = _mm_max_epi16(xmm_add, xmm_negative_C0);
+	xmm_add = _mm_and_si128(xmm_add, xmm_aq);
+	xmm_add = _mm_and_si128(xmm_add, xmm_match);
+	xmm_add = _mm_add_epi16(xmm_add, xmm_R1);
+	xmm_add = _mm_packus_epi16(xmm_add, xmm_add);
+	_mm_storel_epi64((__m128i *)(SrcPtrQ+inc_dim), xmm_add);
+}
+
+/*
+ * Scalar normal (non-strong) luma filter across a horizontal edge,
+ * BLOCK_SIZE columns wide.
+ *
+ *   inc_dim          - distance in imgpel units between vertically adjacent rows
+ *   SrcPtrP          - first pixel of the L0 row; L1 / L2 lie at SrcPtrP - k*inc_dim
+ *   SrcPtrQ          - first pixel of the R0 row; R1 / R2 lie at SrcPtrQ + k*inc_dim
+ *   Alpha, Beta      - filter-enable thresholds
+ *   C0               - clipping bound for the L1 / R1 corrections; the bound for
+ *                      L0 / R0 is C0 plus one for each flat side
+ *   max_imgpel_value - upper limit handed to iClip1 for L0 and R0
+ */
+static void FilterLuma_Horiz(int inc_dim, imgpel *SrcPtrP, imgpel *SrcPtrQ, int Alpha, int Beta, int C0, int max_imgpel_value)
+{
+	int col;
+
+	for (col = 0; col < BLOCK_SIZE; col++)
+	{
+		imgpel *pP = SrcPtrP + col;
+		imgpel *pQ = SrcPtrQ + col;
+		imgpel *pP1 = pP - inc_dim;
+		imgpel *pQ1 = pQ + inc_dim;
+		const int p0 = *pP;
+		const int q0 = *pQ;
+		int p1, p2, q1, q2;
+		int mid, flatP, flatQ, tc, delta;
+
+		if (abs(q0 - p0) >= Alpha)
+			continue;
+
+		q1 = *pQ1;
+		if (abs(q0 - q1) >= Beta)
+			continue;
+
+		p1 = *pP1;
+		if (abs(p0 - p1) >= Beta)
+			continue;
+
+		q2 = *(pQ1 + inc_dim);
+		p2 = *(pP1 - inc_dim);
+
+		mid = (p0 + q0 + 1) >> 1;
+		flatQ = (abs(q0 - q2) < Beta);
+		flatP = (abs(p0 - p2) < Beta);
+
+		tc = C0 + flatP + flatQ;
+		delta = iClip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3);
+
+		// write order matches the row order: L1, L0, R0, R1
+		if (flatP)
+			*pP1 += iClip3(-C0, C0, (p2 + mid - (p1 << 1)) >> 1);
+		*pP = (imgpel) iClip1(max_imgpel_value, p0 + delta);
+
+		*pQ = (imgpel) iClip1(max_imgpel_value, q0 - delta);
+		if (flatQ)
+			*pQ1 += iClip3(-C0, C0, (q2 + mid - (q1 << 1)) >> 1);
+	}
+}
+
+
+/* benski> this exists for unit testing, not used in production code */
+/*
+ * SSE2 check of which of 8 adjacent columns along a horizontal edge pass the
+ * three filter-enable tests (|R0-L0| < Alpha, |R0-R1| < Beta, |L0-L1| < Beta).
+ *
+ *   inc_dim    - distance in imgpel units between vertically adjacent rows
+ *   SrcPtrP    - first pixel of the L0 row; L1 is read at SrcPtrP - inc_dim
+ *   SrcPtrQ    - first pixel of the R0 row; R1 is read at SrcPtrQ + inc_dim
+ *   xmm_result - receives the per-column 16-bit mask, but only when at least
+ *                one column passes all three tests
+ *
+ * Returns the byte movemask of the final mask (two bits per column), or 0 as
+ * soon as a test leaves no column enabled.
+ */
+static int CalculateMatches_sse2(int inc_dim, const imgpel *SrcPtrP, const imgpel *SrcPtrQ, int Alpha, int Beta, __m128i *xmm_result)
+{
+	const __m128i zeros = _mm_setzero_si128();
+	const __m128i alpha_lim = _mm_set1_epi16((uint16_t)Alpha);
+	const __m128i beta_lim = _mm_set1_epi16((uint16_t)Beta);
+	__m128i p0, q0, p1, q1;
+	__m128i gap, enabled;
+	int bits;
+
+	// widen 8 bytes of each row to 16-bit lanes
+	p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(SrcPtrP)), zeros);
+	q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(SrcPtrQ)), zeros);
+
+	// |R0 - L0| < Alpha
+	gap = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
+	enabled = _mm_cmplt_epi16(gap, alpha_lim);
+	bits = _mm_movemask_epi8(enabled);
+	if (bits == 0)
+		return 0;
+
+	// |R0 - R1| < Beta
+	q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(SrcPtrQ+inc_dim)), zeros);
+	gap = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
+	enabled = _mm_and_si128(enabled, _mm_cmplt_epi16(gap, beta_lim));
+	bits = _mm_movemask_epi8(enabled);
+	if (bits == 0)
+		return 0;
+
+	// |L0 - L1| < Beta
+	p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(SrcPtrP-inc_dim)), zeros);
+	gap = _mm_or_si128(_mm_subs_epu16(p0, p1), _mm_subs_epu16(p1, p0));
+	enabled = _mm_and_si128(enabled, _mm_cmplt_epi16(gap, beta_lim));
+	bits = _mm_movemask_epi8(enabled);
+	if (bits == 0)
+		return 0;
+
+	*xmm_result = enabled;
+	return bits;
+}
+/*
+ * Deblocks one horizontal edge (dir == 1) of macroblock MbQ, 16 columns wide.
+ *
+ *   pl       - colour plane: 0 uses the qp fields and bitdepth_scale[IS_LUMA],
+ *              non-zero uses qpc[pl-1] and bitdepth_scale[IS_CHROMA]
+ *   image    - plane being filtered (base_address and stride are used)
+ *   Strength - 16 per-column boundary strengths. Read with _mm_load_si128, so
+ *              the array must be 16-byte aligned.
+ *   MbQ      - macroblock containing the Q side of the edge
+ *   edge     - row of the edge inside MbQ; the P row is edge - 1, or row 0
+ *              when edge >= MB_BLOCK_SIZE
+ *   p        - not used
+ *
+ * Outline: build a per-column byte mask from Strength, AND it with the three
+ * threshold tests (returning as soon as the mask is empty), then hand each
+ * 8-column half that still has enabled columns to the strong or normal
+ * filter, chosen by Strength[0] / Strength[8]. The byte-wise arithmetic
+ * assumes imgpel is one byte wide.
+ */
+void EdgeLoopLumaNormal_Horiz_sse2(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, StorablePicture *p)
+{
+	// dir == 1
+	__m128i xmm_L0, xmm_R0, xmm_R1, xmm_L1;
+	__m128i xmm_absdiff, xmm_diff, xmm_alpha, xmm_beta, xmm_comphi, xmm_complo, xmm_zero, xmm_127;
+	__m128i xmm_strength;
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	int      yQ = (edge < MB_BLOCK_SIZE ? edge - 1: 0);
+	int pelmatch;
+
+	PixelPos pixMB1;
+	p_Vid->getNeighbour0X(MbQ, yQ, p_Vid->mb_size[IS_LUMA], &pixMB1); // locate the P-side row
+
+	if (pixMB1.available || (MbQ->DFDisableIdc== 0))
+	{   
+		int bitdepth_scale   = pl ? p_Vid->bitdepth_scale[IS_CHROMA] : p_Vid->bitdepth_scale[IS_LUMA];
+
+		Macroblock *MbP  = &(p_Vid->mb_data[pixMB1.mb_addr]);
+
+		// Average QP of the two blocks
+		int QP = pl? ((MbP->qpc[pl-1] + MbQ->qpc[pl-1] + 1) >> 1) : (MbP->qp + MbQ->qp + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + MbQ->DFAlphaC0Offset);
+		int Alpha  = ALPHA_TABLE[indexA] * bitdepth_scale;
+		if (Alpha)
+		{
+			int indexB = iClip3(0, MAX_QP, QP + MbQ->DFBetaOffset);
+			int Beta   = BETA_TABLE [indexB] * bitdepth_scale;
+
+			if (Beta !=0)
+			{
+				int match;
+				PixelPos pixMB2;
+				const byte *ClipTab = CLIP_TAB   [indexA];
+				int max_imgpel_value = p_Vid->max_pel_value_comp[pl]; // not read below; clipping is done by the saturating packs in the helpers
+				int inc_dim = image->stride;
+				imgpel *SrcPtrQ;
+				imgpel *SrcPtrP = image->base_address + pixMB1.pos_y * image->stride + pixMB1.pos_x;
+
+				p_Vid->getNeighbour0X(MbQ, yQ+1, p_Vid->mb_size[IS_LUMA], &pixMB2); // locate the Q-side row
+				SrcPtrQ = image->base_address + pixMB2.pos_y * image->stride + pixMB2.pos_x;
+
+				xmm_strength = _mm_load_si128((__m128i *)Strength); // aligned load - NOTE(review): confirm every caller passes a 16-byte aligned array
+				xmm_127 = _mm_set1_epi8(127);
+				xmm_strength = _mm_adds_epu8(xmm_strength, xmm_127); // any non-zero strength now has its top bit set
+				xmm_strength = _mm_srai_epi16(xmm_strength, 15); // shift so it's all 0xFFFF or 0x0000; each 16-bit lane copies the top bit of its odd-indexed byte, so Strength[2k] is presumed to agree with Strength[2k+1] - confirm
+
+				// abs( R0 - L0 )
+				xmm_R0 = _mm_loadu_si128((__m128i *)SrcPtrQ);
+				xmm_L0 = _mm_loadu_si128((__m128i *)SrcPtrP);
+				xmm_diff=_mm_subs_epu8(xmm_R0, xmm_L0); // saturating a-b and b-a: one of them is 0, so OR-ing gives |a-b|
+				xmm_absdiff=_mm_subs_epu8(xmm_L0, xmm_R0);
+				xmm_absdiff=_mm_or_si128(xmm_absdiff, xmm_diff);
+
+				// if( abs( R0 - L0 ) < Alpha )
+				// SSE2 doesn't have unsigned <, so we have to go to short
+				xmm_zero = _mm_setzero_si128();
+				xmm_comphi = _mm_unpackhi_epi8(xmm_absdiff, xmm_zero);
+				xmm_complo = _mm_unpacklo_epi8(xmm_absdiff, xmm_zero);
+				xmm_alpha = _mm_set1_epi16((uint16_t)Alpha);
+				xmm_comphi = _mm_cmplt_epi16(xmm_comphi, xmm_alpha);
+				xmm_complo = _mm_cmplt_epi16(xmm_complo, xmm_alpha);
+				xmm_complo = _mm_packs_epi16(xmm_complo, xmm_comphi); // 0xFFFF / 0x0000 words pack to 0xFF / 0x00 bytes
+				xmm_strength = _mm_and_si128(xmm_strength, xmm_complo);
+				match = _mm_movemask_epi8(xmm_strength); // one bit per column
+				if (match == 0)
+					return;
+
+				// abs(R0 - R1)
+				xmm_R1 = _mm_loadu_si128((__m128i *)(SrcPtrQ+inc_dim));
+				xmm_diff=_mm_subs_epu8(xmm_R0, xmm_R1);
+				xmm_absdiff=_mm_subs_epu8(xmm_R1, xmm_R0);
+				xmm_absdiff=_mm_or_si128(xmm_absdiff, xmm_diff);
+
+				// 			if (abs( R0 - R1) < Beta)
+				// SSE2 doesn't have unsigned <, so we have to go to short
+				xmm_comphi = _mm_unpackhi_epi8(xmm_absdiff, xmm_zero);
+				xmm_complo = _mm_unpacklo_epi8(xmm_absdiff, xmm_zero);
+				xmm_beta= _mm_set1_epi16((uint16_t)Beta);
+				xmm_comphi = _mm_cmplt_epi16(xmm_comphi, xmm_beta);
+				xmm_complo = _mm_cmplt_epi16(xmm_complo, xmm_beta);
+				xmm_complo = _mm_packs_epi16(xmm_complo, xmm_comphi);
+				xmm_strength = _mm_and_si128(xmm_strength, xmm_complo);
+				match = _mm_movemask_epi8(xmm_strength);
+				if (match == 0)
+					return;
+
+				// abs(L0 - L1)
+				xmm_L1 = _mm_loadu_si128((__m128i *)(SrcPtrP-inc_dim));
+				xmm_diff=_mm_subs_epu8(xmm_L1, xmm_L0);
+				xmm_absdiff=_mm_subs_epu8(xmm_L0, xmm_L1);
+				xmm_absdiff=_mm_or_si128(xmm_absdiff, xmm_diff);
+
+				// if ((abs(L0 - L1) < Beta))
+				// SSE2 doesn't have unsigned <, so we have to go to short
+				xmm_comphi = _mm_unpackhi_epi8(xmm_absdiff, xmm_zero);
+				xmm_complo = _mm_unpacklo_epi8(xmm_absdiff, xmm_zero);
+				xmm_comphi = _mm_cmplt_epi16(xmm_comphi, xmm_beta);
+				xmm_complo = _mm_cmplt_epi16(xmm_complo, xmm_beta);
+				xmm_complo = _mm_packs_epi16(xmm_complo, xmm_comphi);
+				xmm_strength = _mm_and_si128(xmm_strength, xmm_complo);
+				match = _mm_movemask_epi8(xmm_strength);
+				if (match == 0)
+					return;
+
+				pelmatch = match & 0xFF; // columns 0-7
+				if (pelmatch)
+				{
+					byte strength = Strength[0];
+
+					xmm_complo = _mm_unpacklo_epi8(xmm_strength, xmm_strength); // widen the byte mask of columns 0-7 to 16-bit lanes
+
+					switch(strength)
+					{
+					case 4: // INTRA strong
+						{
+							assert(Strength[4] == 4);
+							IntraStrongFilter_Luma_Horiz_sse2(inc_dim, SrcPtrP, SrcPtrQ, xmm_alpha, xmm_beta, xmm_complo);
+						}
+						break;
+					default:
+						{
+							int C[2]  = { ClipTab[strength] * bitdepth_scale, ClipTab[Strength[4]] * bitdepth_scale }; // bounds for columns 0-3 and 4-7
+							FilterLuma_Horiz_sse2(inc_dim, SrcPtrP, SrcPtrQ, xmm_beta, C, xmm_complo);
+						}
+						break;
+					}
+				}
+				pelmatch = match & 0xFF00; // columns 8-15
+				if (pelmatch)
+				{
+					byte strength = Strength[8];
+
+					xmm_comphi = _mm_unpackhi_epi8(xmm_strength, xmm_strength); // widen the byte mask of columns 8-15 to 16-bit lanes
+
+					switch(strength)
+					{
+					case 4: // INTRA strong
+						{
+							assert(Strength[12] == 4);
+							IntraStrongFilter_Luma_Horiz_sse2(inc_dim, SrcPtrP+8, SrcPtrQ+8, xmm_alpha, xmm_beta, xmm_comphi);
+						}
+						break;
+					default:
+						{
+							int C[2]  = { ClipTab[strength] * bitdepth_scale, ClipTab[Strength[12]] * bitdepth_scale }; // bounds for columns 8-11 and 12-15
+							FilterLuma_Horiz_sse2(inc_dim, SrcPtrP+8, SrcPtrQ+8, xmm_beta, C, xmm_comphi);
+						}
+						break;
+					}
+				}
+			}
+		}
+	}
+}
+
+
+/*
+ * Deblocks one horizontal luma macroblock edge (16 pixels wide) with SSE2.
+ *
+ * image    : plane being filtered; base_address and stride are read.
+ * strength : one boundary strength per group of four pixels along the edge.
+ * MbQ      : macroblock on the Q side of the edge (DFDisableIdc, qp and the
+ *            alpha/beta offsets are read from it).
+ * pixMB1   : position of the P-side row next to the edge; the Q-side row is
+ *            taken to be exactly one stride further on.
+ * MbP      : macroblock on the P side of the edge; only its qp is read.
+ *
+ * Alpha, Beta and the clip values are used without bitdepth_scale and each row
+ * is processed as 16 bytes -- presumably this path is for 8-bit samples only;
+ * confirm against the imgpel typedef.
+ * The function returns as soon as no pixel passes the threshold tests.
+ */
+void EdgeLoopLuma_Horiz_YUV420(VideoImage *image, const byte strength[4], Macroblock *MbQ, PixelPos pixMB1, Macroblock *MbP)
+{
+	// dir == 1
+	__m128i xmm_L0, xmm_R0, xmm_R1, xmm_L1;
+	__m128i xmm_absdiff, xmm_diff, xmm_alpha, xmm_beta, xmm_comphi, xmm_complo, xmm_zero, xmm_127;
+	__m128i xmm_strength;
+	int pelmatch;
+	int i;
+	__declspec(align(32)) uint8_t Strength[16];
+
+	// one strength byte per pixel: every input value covers four pixels
+	for (i = 0; i < 16; i++)
+	{
+		Strength[i] = strength[i / 4];
+	}
+
+	if (pixMB1.available || (MbQ->DFDisableIdc == 0))
+	{
+		// Average QP of the two blocks
+		int QP = (MbP->qp + MbQ->qp + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + MbQ->DFAlphaC0Offset);
+		int Alpha  = ALPHA_TABLE[indexA];
+		if (Alpha)
+		{
+			int indexB = iClip3(0, MAX_QP, QP + MbQ->DFBetaOffset);
+			int Beta   = BETA_TABLE [indexB];
+
+			if (Beta != 0)
+			{
+				int match;
+				const byte *ClipTab = CLIP_TAB   [indexA];
+				int inc_dim = image->stride;
+
+				imgpel *SrcPtrP = image->base_address + pixMB1.pos_y * inc_dim + pixMB1.pos_x;
+				imgpel *SrcPtrQ = SrcPtrP + inc_dim;
+
+				// Turn the strengths into a mask: the saturating +127 sets the top bit of
+				// every non-zero strength, and the 16-bit arithmetic shift then fills each
+				// lane with that bit (both bytes of a lane carry the same strength).
+				xmm_strength = _mm_load_si128((__m128i *)Strength);
+				xmm_127 = _mm_set1_epi8(127);
+				xmm_strength = _mm_adds_epu8(xmm_strength, xmm_127);
+				xmm_strength = _mm_srai_epi16(xmm_strength, 15); // shift so it's all 0xFFFF or 0x0000
+
+				// abs( R0 - L0 ): OR of the two saturating differences
+				xmm_L0 = _mm_loadu_si128((__m128i *)SrcPtrP);
+				xmm_R0 = _mm_loadu_si128((__m128i *)SrcPtrQ);
+				xmm_diff = _mm_subs_epu8(xmm_R0, xmm_L0);
+				xmm_absdiff = _mm_subs_epu8(xmm_L0, xmm_R0);
+				xmm_absdiff = _mm_or_si128(xmm_absdiff, xmm_diff);
+
+				// if( abs( R0 - L0 ) < Alpha )
+				// SSE2 doesn't have unsigned <, so we have to go to short
+				xmm_zero = _mm_setzero_si128();
+				xmm_comphi = _mm_unpackhi_epi8(xmm_absdiff, xmm_zero);
+				xmm_complo = _mm_unpacklo_epi8(xmm_absdiff, xmm_zero);
+				xmm_alpha = _mm_set1_epi16((uint16_t)Alpha);
+				xmm_comphi = _mm_cmplt_epi16(xmm_comphi, xmm_alpha);
+				xmm_complo = _mm_cmplt_epi16(xmm_complo, xmm_alpha);
+				xmm_complo = _mm_packs_epi16(xmm_complo, xmm_comphi);
+				xmm_strength = _mm_and_si128(xmm_strength, xmm_complo);
+				match = _mm_movemask_epi8(xmm_strength);
+				if (match == 0)
+					return;
+
+				// abs(R0 - R1)
+				xmm_R1 = _mm_loadu_si128((__m128i *)(SrcPtrQ+inc_dim));
+				xmm_diff = _mm_subs_epu8(xmm_R0, xmm_R1);
+				xmm_absdiff = _mm_subs_epu8(xmm_R1, xmm_R0);
+				xmm_absdiff = _mm_or_si128(xmm_absdiff, xmm_diff);
+
+				// if (abs( R0 - R1) < Beta)
+				// SSE2 doesn't have unsigned <, so we have to go to short
+				xmm_comphi = _mm_unpackhi_epi8(xmm_absdiff, xmm_zero);
+				xmm_complo = _mm_unpacklo_epi8(xmm_absdiff, xmm_zero);
+				xmm_beta = _mm_set1_epi16((uint16_t)Beta);
+				xmm_comphi = _mm_cmplt_epi16(xmm_comphi, xmm_beta);
+				xmm_complo = _mm_cmplt_epi16(xmm_complo, xmm_beta);
+				xmm_complo = _mm_packs_epi16(xmm_complo, xmm_comphi);
+				xmm_strength = _mm_and_si128(xmm_strength, xmm_complo);
+				match = _mm_movemask_epi8(xmm_strength);
+				if (match == 0)
+					return;
+
+				// abs(L0 - L1)
+				xmm_L1 = _mm_loadu_si128((__m128i *)(SrcPtrP-inc_dim));
+				xmm_diff = _mm_subs_epu8(xmm_L1, xmm_L0);
+				xmm_absdiff = _mm_subs_epu8(xmm_L0, xmm_L1);
+				xmm_absdiff = _mm_or_si128(xmm_absdiff, xmm_diff);
+
+				// if ((abs(L0 - L1) < Beta))
+				// SSE2 doesn't have unsigned <, so we have to go to short
+				xmm_comphi = _mm_unpackhi_epi8(xmm_absdiff, xmm_zero);
+				xmm_complo = _mm_unpacklo_epi8(xmm_absdiff, xmm_zero);
+				xmm_comphi = _mm_cmplt_epi16(xmm_comphi, xmm_beta);
+				xmm_complo = _mm_cmplt_epi16(xmm_complo, xmm_beta);
+				xmm_complo = _mm_packs_epi16(xmm_complo, xmm_comphi);
+				xmm_strength = _mm_and_si128(xmm_strength, xmm_complo);
+				match = _mm_movemask_epi8(xmm_strength);
+				if (match == 0)
+					return;
+
+				// left eight pixels: mask bits 0..7, strengths Strength[0] and Strength[4]
+				pelmatch = match & 0xFF;
+				if (pelmatch)
+				{
+					byte bS = Strength[0];
+
+					// widen the byte mask of the low eight pixels to one word per pixel
+					xmm_complo = _mm_unpacklo_epi8(xmm_strength, xmm_strength);
+
+					switch(bS)
+					{
+					case 4: // INTRA strong
+						{
+							assert(Strength[4] == 4);
+							IntraStrongFilter_Luma_Horiz_sse2(inc_dim, SrcPtrP, SrcPtrQ, xmm_alpha, xmm_beta, xmm_complo);
+						}
+						break;
+					default:
+						{
+							int C[2]  = { ClipTab[bS], ClipTab[Strength[4]] };
+							FilterLuma_Horiz_sse2(inc_dim, SrcPtrP, SrcPtrQ, xmm_beta, C, xmm_complo);
+						}
+						break;
+					}
+				}
+
+				// right eight pixels: mask bits 8..15, strengths Strength[8] and Strength[12]
+				pelmatch = match & 0xFF00;
+				if (pelmatch)
+				{
+					byte bS = Strength[8];
+
+					// widen the byte mask of the high eight pixels to one word per pixel
+					xmm_comphi = _mm_unpackhi_epi8(xmm_strength, xmm_strength);
+
+					switch(bS)
+					{
+					case 4: // INTRA strong
+						{
+							assert(Strength[12] == 4);
+							IntraStrongFilter_Luma_Horiz_sse2(inc_dim, SrcPtrP+8, SrcPtrQ+8, xmm_alpha, xmm_beta, xmm_comphi);
+						}
+						break;
+					default:
+						{
+							int C[2]  = { ClipTab[bS], ClipTab[Strength[12]] };
+							FilterLuma_Horiz_sse2(inc_dim, SrcPtrP+8, SrcPtrQ+8, xmm_beta, C, xmm_comphi);
+						}
+						break;
+					}
+				}
+			}
+		}
+	}
+}
+
+
+/*
+ * Scalar deblocking of one horizontal edge of plane `pl` (dir == 1).
+ *
+ * The P-side row is located with getNeighbour0X at row (edge - 1), or row 0
+ * when edge is not below MB_BLOCK_SIZE; the Q-side row is the following row.
+ * Strength is sampled once per BLOCK_SIZE pixels: 0 skips the segment, 4 runs
+ * the strong filter, anything else the normal filter with a clip value taken
+ * from CLIP_TAB. Nothing is done when the P neighbour is unavailable and
+ * DFDisableIdc is non-zero, or when Alpha or Beta comes out as zero.
+ * The parameter `p` is not used.
+ */
+void EdgeLoopLumaNormal_Horiz(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, StorablePicture *p)
+{
+	VideoParameters *vid = MbQ->p_Vid;
+	int rowInMb = (edge < MB_BLOCK_SIZE) ? (edge - 1) : 0;
+	PixelPos posP, posQ;
+	Macroblock *MbP;
+	const byte *clipRow;
+	imgpel *rowP, *rowQ;
+	int scale, avgQP, idxA, idxB, Alpha, Beta, maxPel, stride, x;
+
+	vid->getNeighbour0X(MbQ, rowInMb, vid->mb_size[IS_LUMA], &posP);
+	if (!posP.available && MbQ->DFDisableIdc != 0)
+		return;
+
+	scale = vid->bitdepth_scale[pl ? IS_CHROMA : IS_LUMA];
+	MbP = &(vid->mb_data[posP.mb_addr]);
+
+	// average QP of the two macroblocks, per plane
+	if (pl)
+		avgQP = (MbP->qpc[pl-1] + MbQ->qpc[pl-1] + 1) >> 1;
+	else
+		avgQP = (MbP->qp + MbQ->qp + 1) >> 1;
+
+	idxA = iClip3(0, MAX_QP, avgQP + MbQ->DFAlphaC0Offset);
+	Alpha = ALPHA_TABLE[idxA] * scale;
+	if (Alpha == 0)
+		return;
+
+	idxB = iClip3(0, MAX_QP, avgQP + MbQ->DFBetaOffset);
+	Beta = BETA_TABLE[idxB] * scale;
+	if (Beta == 0)
+		return;
+
+	clipRow = CLIP_TAB[idxA];
+	maxPel = vid->max_pel_value_comp[pl];
+	stride = image->stride;
+	rowP = image->base_address + posP.pos_y * image->stride + posP.pos_x;
+
+	// the Q row is the one right after the P row
+	vid->getNeighbour0X(MbQ, rowInMb + 1, vid->mb_size[IS_LUMA], &posQ);
+	rowQ = image->base_address + posQ.pos_y * image->stride + posQ.pos_x;
+
+	for (x = 0; x < MB_BLOCK_SIZE; x += BLOCK_SIZE)
+	{
+		const byte bS = Strength[x];
+
+		if (bS == 0)
+			continue;
+
+		if (bS == 4) // INTRA strong
+		{
+			IntraStrongFilter_Luma_Horiz(stride, rowP + x, rowQ + x, Alpha, Beta);
+		}
+		else
+		{
+			int C0 = clipRow[bS] * scale;
+			FilterLuma_Horiz(stride, rowP + x, rowQ + x, Alpha, Beta, C0, maxPel);
+		}
+	}
+}
diff --git a/Src/h264dec/ldecod/src/filter_luma_vert.c b/Src/h264dec/ldecod/src/filter_luma_vert.c
new file mode 100644
index 00000000..6d994217
--- /dev/null
+++ b/Src/h264dec/ldecod/src/filter_luma_vert.c
@@ -0,0 +1,554 @@
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+#include <mmintrin.h>
+#include <emmintrin.h>
+
+/* Alpha threshold per clipped index (indexA, 0..51) as used by the EdgeLoop* routines in this file. */
+static const byte ALPHA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,4,4,5,6,  7,8,9,10,12,13,15,17,  20,22,25,28,32,36,40,45,  50,56,63,71,80,90,101,113,  127,144,162,182,203,226,255,255} ;
+/* Beta threshold per clipped index (indexB, 0..51). */
+static const byte  BETA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,2,2,2,3,  3,3,3, 4, 4, 4, 6, 6,   7, 7, 8, 8, 9, 9,10,10,  11,11,12,12,13,13, 14, 14,   15, 15, 16, 16, 17, 17, 18, 18} ;
+/* Clip value C0: the row is selected by indexA, the column by the boundary strength (0..4). */
+static const byte CLIP_TAB[52][5]  =
+{
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 1, 1, 1},{ 0, 0, 1, 1, 1},{ 0, 1, 1, 1, 1},
+	{ 0, 1, 1, 1, 1},{ 0, 1, 1, 1, 1},{ 0, 1, 1, 1, 1},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 2, 3, 3},
+	{ 0, 1, 2, 3, 3},{ 0, 2, 2, 3, 3},{ 0, 2, 2, 4, 4},{ 0, 2, 3, 4, 4},{ 0, 2, 3, 4, 4},{ 0, 3, 3, 5, 5},{ 0, 3, 4, 6, 6},{ 0, 3, 4, 6, 6},
+	{ 0, 4, 5, 7, 7},{ 0, 4, 5, 8, 8},{ 0, 4, 6, 9, 9},{ 0, 5, 7,10,10},{ 0, 6, 8,11,11},{ 0, 6, 8,13,13},{ 0, 7,10,14,14},{ 0, 8,11,16,16},
+	{ 0, 9,12,18,18},{ 0,10,13,20,20},{ 0,11,15,23,23},{ 0,13,17,25,25}
+} ;
+
+/*
+ * Strong (strength 4) luma filter for BLOCK_SIZE consecutive rows of a
+ * vertical edge.
+ *
+ * SrcPtrP addresses L0 and SrcPtrQ addresses R0 of the first row; further
+ * samples are reached as SrcPtrP[-1..-3] and SrcPtrQ[1..3], and p_step is
+ * added to both pointers to reach the next row. A row is left untouched
+ * unless |R0-L0| < Alpha, |R0-R1| < Beta and |L0-L1| < Beta.
+ */
+static void IntraStrongFilter_Luma_Vert(int p_step, imgpel *SrcPtrP, imgpel *SrcPtrQ, int Alpha, int Beta)
+{
+	imgpel *sideP = SrcPtrP;
+	imgpel *sideQ = SrcPtrQ;
+	int rowsLeft;
+
+	for (rowsLeft = BLOCK_SIZE; rowsLeft > 0; --rowsLeft, sideP += p_step, sideQ += p_step)
+	{
+		const imgpel L0 = sideP[0];
+		const imgpel R0 = sideQ[0];
+		imgpel L1, R1, L2, R2;
+		int sum0, small_gap, ap, aq;
+
+		if (abs(R0 - L0) >= Alpha)
+			continue;
+
+		R1 = sideQ[1];
+		L1 = sideP[-1];
+		if (abs(R0 - R1) >= Beta || abs(L0 - L1) >= Beta)
+			continue;
+
+		R2 = sideQ[2];
+		L2 = sideP[-2];
+
+		sum0 = L0 + R0;
+		// the three-tap variants are only allowed for a small step across the edge
+		small_gap = (abs(R0 - L0) < ((Alpha >> 2) + 2));
+		aq = (abs(R0 - R2) < Beta) & small_gap;
+		ap = (abs(L0 - L2) < Beta) & small_gap;
+
+		if (ap)
+		{
+			const int sum1 = L1 + sum0;
+			const imgpel L3 = sideP[-3];
+
+			sideP[0]  = (imgpel) ((R1 + (sum1 << 1) + L2 + 4) >> 3);
+			sideP[-1] = (imgpel) ((L2 + sum1 + 2) >> 2);
+			sideP[-2] = (imgpel) ((((L3 + L2) << 1) + L2 + sum1 + 4) >> 3);
+		}
+		else
+		{
+			sideP[0] = (imgpel) (((L1 << 1) + L0 + R1 + 2) >> 2);
+		}
+
+		if (aq)
+		{
+			const imgpel R3 = sideQ[3];
+
+			sideQ[0] = (imgpel) ((L1 + ((R1 + sum0) << 1) + R2 + 4) >> 3);
+			sideQ[1] = (imgpel) ((R2 + R0 + L0 + R1 + 2) >> 2);
+			sideQ[2] = (imgpel) ((((R3 + R2) << 1) + R2 + R1 + sum0 + 4) >> 3);
+		}
+		else
+		{
+			sideQ[0] = (imgpel) (((R1 << 1) + R0 + L1 + 2) >> 2);
+		}
+	}
+}
+
+/*
+ * Normal (strength 1..3) luma filter for BLOCK_SIZE consecutive rows of a
+ * vertical edge.
+ *
+ * SrcPtrP addresses L0 and SrcPtrQ addresses R0 of the first row; p_step is
+ * added to both to reach the next row. C0 is the already scaled clip value
+ * and max_imgpel_value is the upper bound handed to iClip1 for L0/R0.
+ * A row is only modified when |R0-L0| < Alpha, |R0-R1| < Beta and
+ * |L0-L1| < Beta all hold.
+ */
+static void FilterLuma_Vert(int p_step, imgpel *SrcPtrP, imgpel *SrcPtrQ, int Alpha, int Beta, int C0, int max_imgpel_value)
+{
+	imgpel *sideP = SrcPtrP;
+	imgpel *sideQ = SrcPtrQ;
+	int rowsLeft;
+
+	for (rowsLeft = BLOCK_SIZE; rowsLeft > 0; --rowsLeft, sideP += p_step, sideQ += p_step)
+	{
+		const imgpel L0 = sideP[0];
+		const imgpel R0 = sideQ[0];
+		imgpel L1, R1, L2, R2;
+		int avg0, ap, aq, tc0, dif;
+
+		if (abs(R0 - L0) >= Alpha)
+			continue;
+
+		R1 = sideQ[1];
+		if (abs(R0 - R1) >= Beta)
+			continue;
+
+		L1 = sideP[-1];
+		if (abs(L0 - L1) >= Beta)
+			continue;
+
+		R2 = sideQ[2];
+		L2 = sideP[-2];
+
+		avg0 = (L0 + R0 + 1) >> 1;
+		aq = (abs(R0 - R2) < Beta);
+		ap = (abs(L0 - L2) < Beta);
+
+		// the clip range for L0/R0 grows by one for each side that is smooth
+		tc0 = C0 + ap + aq;
+		dif = iClip3(-tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3);
+
+		if (ap)
+			sideP[-1] += iClip3(-C0, C0, (L2 + avg0 - (L1 << 1)) >> 1);
+		sideP[0] = (imgpel) iClip1(max_imgpel_value, L0 + dif);
+
+		sideQ[0] = (imgpel) iClip1(max_imgpel_value, R0 - dif);
+		if (aq)
+			sideQ[1] += iClip3(-C0, C0, (R2 + avg0 - (R1 << 1)) >> 1);
+	}
+}
+
+/*
+ * Scalar deblocking of one vertical edge of plane `pl` (dir == 0).
+ *
+ * The P-side column is located with getNeighbourX0 at column (edge - 1) and
+ * the Q-side column at column edge. Strength is sampled once per BLOCK_SIZE
+ * rows: 0 skips the segment, 4 runs the strong filter, anything else the
+ * normal filter with a clip value taken from CLIP_TAB. Nothing is done when
+ * the P neighbour is unavailable and DFDisableIdc is non-zero, or when Alpha
+ * or Beta is zero. The parameter `p` is not used.
+ */
+void EdgeLoopLumaNormal_Vert(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, StorablePicture *p)
+{
+	// dir == 0
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	int      xQ = edge - 1;
+
+	PixelPos pixMB1;
+	p_Vid->getNeighbourX0(MbQ, xQ, p_Vid->mb_size[IS_LUMA], &pixMB1);
+
+	if (pixMB1.available || (MbQ->DFDisableIdc == 0))
+	{
+		int bitdepth_scale   = pl ? p_Vid->bitdepth_scale[IS_CHROMA] : p_Vid->bitdepth_scale[IS_LUMA];
+		ptrdiff_t p_step = image->stride;
+
+		Macroblock *MbP  = &(p_Vid->mb_data[pixMB1.mb_addr]);
+
+		// Average QP of the two blocks
+		int QP = pl? ((MbP->qpc[pl-1] + MbQ->qpc[pl-1] + 1) >> 1) : (MbP->qp + MbQ->qp + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + MbQ->DFAlphaC0Offset);
+		int indexB = iClip3(0, MAX_QP, QP + MbQ->DFBetaOffset);
+
+		int Alpha  = ALPHA_TABLE[indexA] * bitdepth_scale;
+		int Beta   = BETA_TABLE [indexB] * bitdepth_scale;
+
+		if (Alpha != 0 && Beta != 0)
+		{
+			PixelPos pixMB2;
+			const byte *ClipTab = CLIP_TAB   [indexA];
+			int max_imgpel_value = p_Vid->max_pel_value_comp[pl];
+			int pel;
+			imgpel *SrcPtrQ;
+			imgpel *SrcPtrP = image->base_address + pixMB1.pos_y * image->stride + pixMB1.pos_x;
+
+			// the Q column is the one right after the P column
+			p_Vid->getNeighbourX0(MbQ, ++xQ, p_Vid->mb_size[IS_LUMA], &pixMB2);
+			SrcPtrQ = image->base_address + pixMB2.pos_y * image->stride + pixMB2.pos_x;
+
+			for( pel = 0 ; pel < MB_BLOCK_SIZE ; pel+=BLOCK_SIZE)
+			{
+				byte strength = Strength[pel];
+
+				switch(strength)
+				{
+				case 0:
+					break;
+				case 4: // INTRA strong
+					{
+						IntraStrongFilter_Luma_Vert(p_step, SrcPtrP, SrcPtrQ, Alpha, Beta);
+					}
+					break;
+				default:
+					{
+						int C0  = ClipTab[strength] * bitdepth_scale;
+						FilterLuma_Vert(p_step, SrcPtrP, SrcPtrQ, Alpha, Beta, C0, max_imgpel_value);
+					}
+					break;
+				}
+				// each filter call covers BLOCK_SIZE rows; move on to the next segment
+				SrcPtrP += p_step * BLOCK_SIZE;
+				SrcPtrQ += p_step * BLOCK_SIZE;
+			}
+		}
+	}
+}
+
+
+/*
+ * Normal-strength luma filter for one vertical macroblock edge, processed as
+ * up to four stages of four rows each. Despite the name, the body works on
+ * 64-bit __m64 values.
+ *
+ * p_step   : distance between consecutive rows.
+ * SrcPtrP  : L0 of the first row; eight bytes are read per row starting two
+ *            samples earlier (L2 L1 L0 R0 R1 R2 plus two unused bytes).
+ * Alpha/Beta : thresholds, compared as 16-bit values.
+ * Strength : one strength per stage; a stage with strength 0 is skipped.
+ * ClipTab  : clip table row, indexed with the stage strength to obtain C0.
+ *
+ * Per stage the four rows are transposed so each vector holds one sample
+ * position of all four rows, the three threshold tests are applied, and
+ * L1 L0 R0 R1 are written back as four bytes per row.
+ *
+ * NOTE(review): no _mm_empty() is issued in this function; presumably the
+ * MMX state is cleared by a caller before any x87 code runs -- confirm.
+ */
+static void FilterLuma_Vert_sse2(int p_step, imgpel *SrcPtrP, int Alpha, int Beta, const uint8_t Strength[4], const byte *ClipTab)
+{
+
+	__m64 mmx_alpha_minus_one = _mm_set1_pi16(Alpha-1), mmx_beta_minus_one = _mm_set1_pi16(Beta-1);
+	__m64 mmx_zero	= _mm_setzero_si64(), mmx_one, mmx_four=_mm_set1_pi16(4);
+	__m64 mmx_minus_one;
+	__m64 mmx_absdiff, mmx_diff;
+	__m64 mmx_L0, mmx_L1, mmx_L2, mmx_L0_R0;
+	__m64 mmx_R0, mmx_R1_R2, mmx_R1, mmx_R2;
+	__m64 mmx_load0, mmx_load1, mmx_load2, mmx_load3, mmx_load4, mmx_load5, mmx_load6, mmx_load7, mmx_load8;
+	__m64 mmx_ap, mmx_aq, mmx_C0, mmx_negative_C0, mmx_tc0, mmx_dif, mmx_acc, mmx_match;
+	int match;
+	int i=0;
+
+	mmx_minus_one = _mm_set1_pi32(-1);
+	mmx_one = _mm_sub_pi16(mmx_zero, mmx_minus_one); // dunno if this'll be faster than _mm_set1_pi16 or not
+	SrcPtrP -= 2; // step back from L0 to L2 so one 8-byte load covers L2..R2
+
+	STAGE:
+
+	// skip stages whose strength is zero, advancing four rows for each
+	while (!Strength[i])
+	{
+		SrcPtrP += p_step << 2;
+		if (i++ == 3) // last stage
+			return; 
+	}
+
+		mmx_load0 = (*(__m64 *)(SrcPtrP));                    // La2 La1 La0 Ra0 Ra1 Ra2 --- ---
+		mmx_load1 = (*(__m64 *)(SrcPtrP+=p_step));            // Lb2 Lb1 Lb0 Rb0 Rb1 Rb2 --- ---
+		mmx_load4 = _mm_unpacklo_pi8(mmx_load0, mmx_load1);   // La2 Lb2 La1 Lb1 La0 Lb0 Ra0 Rb0 *
+		mmx_load2 = (*(__m64 *)(SrcPtrP+=p_step));            // Lc2 Lc1 Lc0 Rc0 Rc1 Rc2 --- ---
+		mmx_load3 = (*(__m64 *)(SrcPtrP+=p_step));            // Ld2 Ld1 Ld0 Rd0 Rd1 Rd2 --- ---
+		SrcPtrP+=p_step;                                      // now at the first row of the next stage
+		mmx_load5 = _mm_unpacklo_pi8(mmx_load2, mmx_load3);   // Lc2 Ld2 Lc1 Ld1 Lc0 Ld0 Rc0 Rd0 *
+		mmx_L0_R0 = _mm_unpackhi_pi16(mmx_load4, mmx_load5);  // La0 Lb0 Lc0 Ld0 Ra0 Rb0 Rc0 Rd0
+
+		// abs( R0 - L0 ) < Alpha
+		// MMX doesn't have unsigned compare, so we have to go to short
+		mmx_L0 = _mm_unpacklo_pi8(mmx_L0_R0, mmx_zero);             // La0 Lb0 Lc0 Ld0
+		mmx_R0 = _mm_unpackhi_pi8(mmx_L0_R0, mmx_zero);          // Ra0 Rb0 Rc0 Rd0
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_L0);
+		mmx_absdiff =_mm_subs_pu16(mmx_L0, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_match = _mm_cmpgt_pi16(mmx_absdiff, mmx_alpha_minus_one); // 1's in any words we don't have to do
+		mmx_match = _mm_xor_si64(mmx_match, mmx_minus_one);           // invert: 1's in the words that pass the test
+		match = _mm_movemask_pi8(mmx_match); 
+		if (match == 0)
+		{
+			if (i++ == 3) // last stage
+				return;
+				
+			goto STAGE; // start the process over from next position
+		}
+
+		mmx_load6 = _mm_unpackhi_pi8(mmx_load0, mmx_load1);       // Ra1 Rb1 Ra2 Rb2 --- --- --- --- *
+		mmx_load7 = _mm_unpackhi_pi8(mmx_load2, mmx_load3);       // Rc1 Rd1 Rc2 Rd2 --- --- --- --- *
+		mmx_R1_R2 = _mm_unpacklo_pi16(mmx_load6, mmx_load7);      // Ra1 Rb1 Rc1 Rd1 Ra2 Rb2 Rc2 Rd2
+
+		// abs( R0 - R1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_R0 already populated
+		mmx_R1 = _mm_unpacklo_pi8(mmx_R1_R2, mmx_zero);           // Ra1 Rb1 Rc1 Rd1
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_R1);
+		mmx_absdiff =_mm_subs_pu16(mmx_R1, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 3) // last stage
+				return;
+				
+			goto STAGE; // start the process over from next position
+		}
+
+		// abs(L0 - L1) < Beta
+		// MMX doesn't have unsigned compare, so we have to go to short
+		// mmx_L0 already populated
+		// mmx_load4: La2 Lb2 La1 Lb1 La0 Lb0 --- ---
+		// mmx_load5: Lc2 Ld2 Lc1 Ld1 Lc0 Ld0 --- ---
+		mmx_load4 = _mm_unpacklo_pi16(mmx_load4, mmx_load5); // La2 Lb2 Lc2 Ld2 La1 Lb1 Lc1 Ld1 
+		mmx_L1 = _mm_unpackhi_pi8(mmx_load4, mmx_zero);      // La1 Lb1 Lc1 Ld1
+		mmx_diff=_mm_subs_pu16(mmx_L0, mmx_L1);
+		mmx_absdiff =_mm_subs_pu16(mmx_L1, mmx_L0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_absdiff = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one);
+		mmx_absdiff = _mm_xor_si64(mmx_absdiff, mmx_minus_one);
+		mmx_match = _mm_and_si64(mmx_match, mmx_absdiff);
+		match = _mm_movemask_pi8(mmx_match);
+		if (match == 0)
+		{
+			if (i++ == 3) // last stage
+				return;
+
+			goto STAGE; // start the process over from next position
+		}
+
+		// ok, now time to perform the actual calculation. hope it was worth it!!
+		
+		// ap  = (abs(L0 - L2) < Beta);
+		// finish loading L2 
+		mmx_L2 = _mm_unpacklo_pi8(mmx_load4, mmx_zero);      // La2 Lb2 Lc2 Ld2
+		mmx_diff=_mm_subs_pu16(mmx_L0, mmx_L2);
+		mmx_absdiff =_mm_subs_pu16(mmx_L2, mmx_L0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_ap = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one); // inverted test: 0xFFFF where ap is FALSE
+		mmx_tc0 = _mm_add_pi16(mmx_ap, mmx_one); // 0xFFFF + 1 == 0 and 0 + 1 == 1, so this is the 0/1 value of ap
+
+		// aq  = (abs(R0 - R2) < Beta);
+		// finish loading R2
+		// mmx_R1_R2: Ra1 Rb1 Rc1 Rd1 Ra2 Rb2 Rc2 Rd2
+		mmx_R2 = _mm_unpackhi_pi8(mmx_R1_R2, mmx_zero);   // Ra2 Rb2 Rc2 Rd2
+		mmx_diff=_mm_subs_pu16(mmx_R0, mmx_R2);
+		mmx_absdiff =_mm_subs_pu16(mmx_R2, mmx_R0);
+		mmx_absdiff =_mm_or_si64(mmx_absdiff, mmx_diff);
+		mmx_aq = _mm_cmpgt_pi16(mmx_absdiff, mmx_beta_minus_one); // inverted test: 0xFFFF where aq is FALSE
+		mmx_tc0 = _mm_add_pi16(mmx_tc0, _mm_add_pi16(mmx_aq, mmx_one)); // same +1 trick: adds the 0/1 value of aq
+
+		// tc0  = (C0 + ap + aq) ;
+		mmx_C0 = _mm_set1_pi16(ClipTab[Strength[i]]);
+		mmx_negative_C0 = _mm_sub_pi16(mmx_zero, mmx_C0);
+		mmx_tc0 = _mm_add_pi16(mmx_tc0, mmx_C0);
+
+
+		// dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+		mmx_dif = mmx_R0;
+		mmx_dif = _mm_sub_pi16(mmx_dif, mmx_L0);
+		mmx_dif = _mm_slli_pi16(mmx_dif, 2);
+		mmx_dif = _mm_add_pi16(mmx_dif, mmx_L1);
+		mmx_dif = _mm_sub_pi16(mmx_dif, mmx_R1);
+		mmx_dif = _mm_add_pi16(mmx_dif, mmx_four);
+		mmx_dif = _mm_srai_pi16(mmx_dif, 3);
+		mmx_dif = _mm_min_pi16(mmx_dif, mmx_tc0);
+		mmx_tc0 = _mm_sub_pi16(mmx_zero, mmx_tc0);
+		mmx_dif = _mm_max_pi16(mmx_dif, mmx_tc0);
+		mmx_dif = _mm_and_si64(mmx_dif, mmx_match); // zero the correction for rows that failed a threshold test
+
+		// TODO: benski> is it worth checking for _mm_movemask_pi8(ap) to see if we can skip this?
+		// if( ap ) L1 += iClip3( -C0,  C0, (L2 + ((L0 + R0 + 1) >> 1) - (L1<<1)) >> 1 );
+		mmx_acc = mmx_L0;
+		mmx_acc = _mm_add_pi16(mmx_acc, mmx_R0);
+		mmx_acc = _mm_add_pi16(mmx_acc, mmx_one);
+		mmx_acc = _mm_srai_pi16(mmx_acc, 1);
+		mmx_acc = _mm_sub_pi16(mmx_acc, mmx_L1);
+		mmx_acc = _mm_sub_pi16(mmx_acc, mmx_L1);
+		mmx_acc = _mm_add_pi16(mmx_acc, mmx_L2);
+		mmx_acc = _mm_srai_pi16(mmx_acc, 1);
+		mmx_acc = _mm_min_pi16(mmx_acc, mmx_C0);
+		mmx_acc = _mm_max_pi16(mmx_acc, mmx_negative_C0);
+		mmx_acc = _mm_andnot_si64(mmx_ap, mmx_acc); // mmx_ap is the inverted test, so this keeps acc where ap holds
+		mmx_acc = _mm_and_si64(mmx_acc, mmx_match);
+		mmx_L1 = _mm_add_pi16(mmx_L1, mmx_acc);
+
+
+			// if( aq ) R1 += iClip3( -C0,  C0, (R2 + RL0 - (R1<<1)) >> 1 );
+			mmx_acc = mmx_L0;
+		mmx_acc = _mm_add_pi16(mmx_acc, mmx_R0);
+		mmx_acc = _mm_add_pi16(mmx_acc, mmx_one);
+		mmx_acc = _mm_srai_pi16(mmx_acc, 1);
+		mmx_acc = _mm_sub_pi16(mmx_acc, mmx_R1);
+		mmx_acc = _mm_sub_pi16(mmx_acc, mmx_R1);
+		mmx_acc = _mm_add_pi16(mmx_acc, mmx_R2);
+		mmx_acc = _mm_srai_pi16(mmx_acc, 1);
+		mmx_acc = _mm_min_pi16(mmx_acc, mmx_C0);
+		mmx_acc = _mm_max_pi16(mmx_acc, mmx_negative_C0);
+		mmx_acc = _mm_andnot_si64(mmx_aq, mmx_acc); // mmx_aq is the inverted test, so this keeps acc where aq holds
+		mmx_acc = _mm_and_si64(mmx_acc, mmx_match);
+		mmx_R1 = _mm_add_pi16(mmx_R1, mmx_acc);
+
+		// L0 = (imgpel) iClip1(max_imgpel_value, L0 + dif);
+		// (the clamp happens later, in the unsigned-saturating pack)
+		mmx_L0 = _mm_add_pi16(mmx_L0, mmx_dif);
+
+		// R0 = (imgpel) iClip1(max_imgpel_value, R0 - dif);
+		mmx_R0 = _mm_sub_pi16(mmx_R0, mmx_dif);
+
+
+	// now for the super-exciting fun of getting this data back into memory
+		SrcPtrP -= 4*p_step; // back to the first row of this stage (still at L2)
+		//SrcPtrQ -= 4*p_step;
+		SrcPtrP++;           // L2 is never modified, so the stores start at L1
+
+				// rotate 4x4 matrix
+		mmx_load1 = _mm_unpacklo_pi16(mmx_L1, mmx_R0); // 00 20 01 21
+		mmx_load2 = _mm_unpacklo_pi16(mmx_L0, mmx_R1); // 10 30 11 31
+		mmx_load3 = _mm_unpackhi_pi16(mmx_L1, mmx_R0); // 02 22 03 23
+		mmx_load4 = _mm_unpackhi_pi16(mmx_L0, mmx_R1); // 12 32 13 33
+		mmx_load5 = _mm_unpacklo_pi16(mmx_load1, mmx_load2); // 00 10 20 30
+		mmx_load6 = _mm_unpackhi_pi16(mmx_load1, mmx_load2); // 01 11 21 31
+		mmx_load7 = _mm_unpacklo_pi16(mmx_load3, mmx_load4); // 02 12 22 32
+		mmx_load8 = _mm_unpackhi_pi16(mmx_load3, mmx_load4); // 03 13 23 33
+		// narrow the words back to bytes with unsigned saturation
+		mmx_load5 = _mm_packs_pu16(mmx_load5, mmx_load5); 
+		mmx_load6 = _mm_packs_pu16(mmx_load6, mmx_load6); 
+		mmx_load7 = _mm_packs_pu16(mmx_load7, mmx_load7); 
+		mmx_load8 = _mm_packs_pu16(mmx_load8, mmx_load8); 
+
+		//mmx_load1 = _mm_setr_pi16(0x8080, 0x80, 0, 0);
+		// one 4-byte store per row: L1 L0 R0 R1
+		*(int *)SrcPtrP = _mm_cvtsi64_si32(mmx_load5);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load6);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load7);
+		*(int *)(SrcPtrP+=p_step) = _mm_cvtsi64_si32(mmx_load8);
+
+		if (i++ == 3)
+			return;
+
+		//SrcPtrQ += 2;
+		SrcPtrP += p_step; // first row of the next stage
+		//SrcPtrQ += p_step;
+		SrcPtrP--;         // and back from L1 to L2 for the next 8-byte loads
+		goto STAGE; // next stage
+}
+
+/*
+ * Deblocks one vertical luma macroblock edge (16 rows).
+ * assumptions: YUV 420, getNonAffNeighbour
+ *
+ * image    : plane being filtered; base_address and stride are read.
+ * Strength : one boundary strength per group of four rows.
+ * MbQ      : macroblock on the Q side of the edge (DFDisableIdc, qp and the
+ *            alpha/beta offsets are read from it).
+ * pixMB1   : position of the P-side sample (L0) of the first row; the Q-side
+ *            sample is the next one in memory.
+ * MbP      : macroblock on the P side of the edge; only its qp is read.
+ *
+ * Alpha, Beta and the clip values are used without bitdepth_scale.
+ */
+void EdgeLoopLuma_Vert_YUV420(VideoImage *image, const uint8_t Strength[4], Macroblock *MbQ, PixelPos pixMB1, Macroblock *MbP)
+{
+	// dir == 0
+	// Use the same gate as the other EdgeLoop* routines in this file. Testing only
+	// DFDisableIdc == 0 skipped every vertical edge of a macroblock with another
+	// DFDisableIdc value, even when the P neighbour was available.
+	// NOTE(review): relies on the caller filling pixMB1.available -- confirm.
+	if (pixMB1.available || (MbQ->DFDisableIdc == 0))
+	{
+		ptrdiff_t p_step = image->stride;
+
+		// Average QP of the two blocks
+		int QP = (MbP->qp + MbQ->qp + 1) >> 1;
+
+		int indexA = iClip3(0, MAX_QP, QP + MbQ->DFAlphaC0Offset);
+		int indexB = iClip3(0, MAX_QP, QP + MbQ->DFBetaOffset);
+
+		int Alpha  = ALPHA_TABLE[indexA];
+		int Beta   = BETA_TABLE [indexB];
+
+		if (Alpha != 0 && Beta != 0)
+		{
+			imgpel *SrcPtrP = image->base_address + pixMB1.pos_y * image->stride + pixMB1.pos_x;
+
+			if (Strength[0] == 4) // if strong filter is used, all blocks will be strong
+			{
+				imgpel *SrcPtrQ = SrcPtrP+1;
+				int blk;
+				// BLOCK_SIZE calls, each of which filters BLOCK_SIZE rows
+				for( blk = 0 ; blk < BLOCK_SIZE ; blk++)
+				{
+					IntraStrongFilter_Luma_Vert(p_step, SrcPtrP, SrcPtrQ, Alpha, Beta);
+					SrcPtrP += p_step * BLOCK_SIZE;
+					SrcPtrQ += p_step * BLOCK_SIZE;
+				}
+			}
+			else
+			{
+				const byte *ClipTab = CLIP_TAB   [indexA];
+				FilterLuma_Vert_sse2(p_step, SrcPtrP, Alpha, Beta, Strength, ClipTab);
+			}
+		}
+	}
+}
+
+/*
+ * Scalar deblocking of one vertical luma edge, one row at a time (dir == 0).
+ *
+ * For every row the P and Q samples are located with getAffNeighbourXPLuma
+ * (columns edge - 1 and edge), and QP, Alpha, Beta and the clip table row are
+ * recomputed from the P-side macroblock found for that row. Strength holds
+ * one value per row: 0 skips the row, 4 selects the strong filter, any other
+ * value the normal filter. A row is also skipped when the P neighbour is
+ * unavailable and DFDisableIdc is non-zero. The parameter `p` is not used.
+ */
+void EdgeLoopLumaMBAff_Vert_YUV420(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, StorablePicture *p)
+{
+	// dir == 0
+	imgpel **Img = image->img;
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	int      bitdepth_scale = p_Vid->bitdepth_scale[IS_LUMA];
+	int      max_imgpel_value = p_Vid->max_pel_value_comp[PLANE_Y];
+	int      AlphaC0Offset = MbQ->DFAlphaC0Offset;
+	int      BetaOffset = MbQ->DFBetaOffset;
+	int      pel;
+
+	for( pel = 0 ; pel < MB_BLOCK_SIZE ; ++pel )
+	{
+		PixelPos pixP, pixQ;
+		Macroblock *MbP;
+		imgpel   *SrcPtrP, *SrcPtrQ;
+		imgpel   L0, R0, L1, R1, L2, R2;
+		const byte *ClipTab;
+		int      Strng, QP, indexA, indexB, Alpha, Beta;
+
+		getAffNeighbourXPLuma(MbQ, edge - 1, pel, &pixP);
+		if (!pixP.available && MbQ->DFDisableIdc != 0)
+			continue;
+
+		Strng = Strength[pel];
+		if (Strng == 0)
+			continue;
+
+		getAffNeighbourXPLuma(MbQ, edge, pel, &pixQ); // TODO: PP
+
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+
+		SrcPtrQ = &(Img[pixQ.pos_y][pixQ.pos_x]);
+		SrcPtrP = &(Img[pixP.pos_y][pixP.pos_x]);
+
+		// Average QP of the two blocks
+		QP = (MbP->qp + MbQ->qp + 1) >> 1;
+
+		indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+		indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+		Alpha   = ALPHA_TABLE[indexA] * bitdepth_scale;
+		Beta    = BETA_TABLE [indexB] * bitdepth_scale;
+		ClipTab = CLIP_TAB[indexA];
+
+		L0 = SrcPtrP[0];
+		R0 = SrcPtrQ[0];
+		if (abs(R0 - L0) >= Alpha)
+			continue;
+
+		L1 = SrcPtrP[-1];
+		R1 = SrcPtrQ[ 1];
+		if (abs(R0 - R1) >= Beta || abs(L0 - L1) >= Beta)
+			continue;
+
+		L2 = SrcPtrP[-2];
+		R2 = SrcPtrQ[ 2];
+
+		if (Strng == 4)    // INTRA strong filtering
+		{
+			int RL0 = L0 + R0;
+			// the three-tap variants are only allowed for a small step across the edge
+			int small_gap = (abs(R0 - L0) < ((Alpha >> 2) + 2));
+			int aq = (abs(R0 - R2) < Beta) & small_gap;
+			int ap = (abs(L0 - L2) < Beta) & small_gap;
+
+			if (ap)
+			{
+				imgpel L3 = SrcPtrP[-3];
+				SrcPtrP[-2] = (imgpel) ((((L3 + L2) << 1) + L2 + L1 + RL0 + 4) >> 3);
+				SrcPtrP[-1] = (imgpel) ((L2 + L1 + L0 + R0 + 2) >> 2);
+				SrcPtrP[ 0] = (imgpel) ((R1 + ((L1 + RL0) << 1) + L2 + 4) >> 3);
+			}
+			else
+			{
+				SrcPtrP[ 0] = (imgpel) (((L1 << 1) + L0 + R1 + 2) >> 2);
+			}
+
+			if (aq)
+			{
+				imgpel R3 = SrcPtrQ[ 3];
+				SrcPtrQ[ 0] = (imgpel) ((L1 + ((R1 + RL0) << 1) + R2 + 4) >> 3);
+				SrcPtrQ[ 1] = (imgpel) ((R2 + R0 + R1 + L0 + 2) >> 2);
+				SrcPtrQ[ 2] = (imgpel) ((((R3 + R2) << 1) + R2 + R1 + RL0 + 4) >> 3);
+			}
+			else
+			{
+				SrcPtrQ[ 0] = (imgpel) (((R1 << 1) + R0 + L1 + 2) >> 2);
+			}
+		}
+		else   // normal filtering
+		{
+			int RL0 = (L0 + R0 + 1) >> 1;
+			int aq  = (abs(R0 - R2) < Beta);
+			int ap  = (abs(L0 - L2) < Beta);
+
+			int C0  = ClipTab[ Strng ] * bitdepth_scale;
+			// the clip range for L0/R0 grows by one for each side that is smooth
+			int tc0 = (C0 + ap + aq);
+			int dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3);
+
+			if( ap )
+				*(SrcPtrP - 1) += iClip3( -C0,  C0, ( L2 + RL0 - (L1 << 1)) >> 1 );
+
+			*SrcPtrP  = (imgpel) iClip1 (max_imgpel_value, L0 + dif);
+			*SrcPtrQ  = (imgpel) iClip1 (max_imgpel_value, R0 - dif);
+
+			if( aq )
+				*(SrcPtrQ + 1) += iClip3( -C0,  C0, ( R2 + RL0 - (R1 << 1)) >> 1 );
+		}
+	}
+}
diff --git a/Src/h264dec/ldecod/src/fmo.c b/Src/h264dec/ldecod/src/fmo.c
new file mode 100644
index 00000000..cb32230e
--- /dev/null
+++ b/Src/h264dec/ldecod/src/fmo.c
@@ -0,0 +1,552 @@
+
+/*!
+ *****************************************************************************
+ *
+ * \file fmo.c
+ *
+ * \brief
+ *    Support for Flexible Macroblock Ordering (FMO)
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Stephan Wenger      stewe@cs.tu-berlin.de
+ *    - Karsten Suehring    suehring@hhi.de
+ ******************************************************************************
+ */
+
+#include "global.h"
+#include "elements.h"
+#include "defines.h"
+#include "header.h"
+#include "fmo.h"
+
+//#define PRINT_FMO_MAPS
+
+// Forward declarations of the per-type map generators (one per slice_group_map_type 0..6).
+// They are defined further down in this file and dispatched from FmoGenerateMapUnitToSliceGroupMap().
+static void FmoGenerateType0MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+static void FmoGenerateType1MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+static void FmoGenerateType2MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+static void FmoGenerateType3MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+static void FmoGenerateType4MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+static void FmoGenerateType5MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+static void FmoGenerateType6MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits );
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generates p_Vid->MapUnitToSliceGroupMap
+ *    Has to be called every time a new Picture Parameter Set is used
+ *
+ * \param p_Vid
+ *      video parameters; active_sps and active_pps are read and
+ *      MapUnitToSliceGroupMap is freed, re-allocated and filled
+ *
+ * \return
+ *    always 0. An allocation failure or an illegal slice_group_map_type
+ *    prints a message and calls exit(-1); a type 6 size mismatch is
+ *    reported through error().
+ ************************************************************************
+ */
+static int FmoGenerateMapUnitToSliceGroupMap (VideoParameters *p_Vid)
+{
+  seq_parameter_set_rbsp_t* sps = p_Vid->active_sps;
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+
+  unsigned int NumSliceGroupMapUnits;
+
+  // the map has one entry per map unit of the picture described by the SPS
+  NumSliceGroupMapUnits = (sps->pic_height_in_map_units_minus1+1)* (sps->pic_width_in_mbs_minus1+1);
+
+  // an explicit (type 6) map must carry exactly one id per map unit
+  if (pps->slice_group_map_type == 6)
+  {
+    if ((pps->pic_size_in_map_units_minus1 + 1) != NumSliceGroupMapUnits)
+    {
+      error ("wrong pps->pic_size_in_map_units_minus1 for used SPS and FMO type 6", 500);
+    }
+  }
+
+  // (re)allocate memory for p_Vid->MapUnitToSliceGroupMap; free(NULL) is a no-op
+  free (p_Vid->MapUnitToSliceGroupMap);
+  if ((p_Vid->MapUnitToSliceGroupMap = malloc ((NumSliceGroupMapUnits) * sizeof (int))) == NULL)
+  {
+    // report the size that was actually requested, not the PPS field that is
+    // only meaningful for map type 6
+    printf ("cannot allocate %d bytes for p_Vid->MapUnitToSliceGroupMap, exit\n", (int) (NumSliceGroupMapUnits * sizeof (int)));
+    exit (-1);
+  }
+
+  if (pps->num_slice_groups_minus1 == 0)    // only one slice group
+  {
+    memset (p_Vid->MapUnitToSliceGroupMap, 0, NumSliceGroupMapUnits * sizeof (int));
+    return 0;
+  }
+
+  // several slice groups: dispatch on the map type signalled in the PPS
+  switch (pps->slice_group_map_type)
+  {
+  case 0:
+    FmoGenerateType0MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  case 1:
+    FmoGenerateType1MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  case 2:
+    FmoGenerateType2MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  case 3:
+    FmoGenerateType3MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  case 4:
+    FmoGenerateType4MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  case 5:
+    FmoGenerateType5MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  case 6:
+    FmoGenerateType6MapUnitMap (p_Vid, NumSliceGroupMapUnits);
+    break;
+  default:
+    printf ("Illegal slice_group_map_type %d , exit \n", (int) pps->slice_group_map_type);
+    exit (-1);
+  }
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Derives p_Vid->MbToSliceGroupMap (one entry per macroblock) from
+ *    p_Vid->MapUnitToSliceGroupMap
+ *
+ * \param p_Vid
+ *      video parameters; PicSizeInMbs, PicWidthInMbs, field_pic_flag and
+ *      the active SPS select how macroblocks are matched to map units
+ *
+ * \return
+ *    always 0; an allocation failure prints a message and calls exit(-1)
+ ************************************************************************
+ */
+static int FmoGenerateMbToSliceGroupMap (VideoParameters *p_Vid)
+{
+  seq_parameter_set_rbsp_t* sps = p_Vid->active_sps;
+  unsigned mb;
+
+  // release the previous map before allocating the one for this picture
+  if (p_Vid->MbToSliceGroupMap != NULL)
+    free (p_Vid->MbToSliceGroupMap);
+
+  p_Vid->MbToSliceGroupMap = malloc ((p_Vid->PicSizeInMbs) * sizeof (int));
+  if (p_Vid->MbToSliceGroupMap == NULL)
+  {
+    printf ("cannot allocate %d bytes for p_Vid->MbToSliceGroupMap, exit\n", (int) ((p_Vid->PicSizeInMbs) * sizeof (int)));
+    exit (-1);
+  }
+
+  if ((sps->frame_mbs_only_flag) || p_Vid->field_pic_flag)
+  {
+    // macroblock index and map unit index coincide
+    mb = 0;
+    while (mb < p_Vid->PicSizeInMbs)
+    {
+      p_Vid->MbToSliceGroupMap[mb] = p_Vid->MapUnitToSliceGroupMap[mb];
+      mb++;
+    }
+  }
+  else if (sps->mb_adaptive_frame_field_flag  &&  (!p_Vid->field_pic_flag))
+  {
+    // two consecutive macroblock indices share one map unit
+    mb = 0;
+    while (mb < p_Vid->PicSizeInMbs)
+    {
+      p_Vid->MbToSliceGroupMap[mb] = p_Vid->MapUnitToSliceGroupMap[mb >> 1];
+      mb++;
+    }
+  }
+  else
+  {
+    // each row of map units covers two rows of macroblocks
+    mb = 0;
+    while (mb < p_Vid->PicSizeInMbs)
+    {
+      p_Vid->MbToSliceGroupMap[mb] =
+        p_Vid->MapUnitToSliceGroupMap[(mb / (2 * p_Vid->PicWidthInMbs)) * p_Vid->PicWidthInMbs + (mb % p_Vid->PicWidthInMbs)];
+      mb++;
+    }
+  }
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    FMO initialization: builds p_Vid->MapUnitToSliceGroupMap, then
+ *    p_Vid->MbToSliceGroupMap, and stores the number of slice groups.
+ *    With PRINT_FMO_MAPS defined both maps are also printed to stdout.
+ *
+ * \param p_Vid
+ *      video parameters of the current picture (active SPS/PPS are used)
+ *
+ * \return
+ *    always 0
+ ************************************************************************
+ */
+int fmo_init(VideoParameters *p_Vid)
+{
+#ifdef PRINT_FMO_MAPS
+  unsigned col, row;
+#endif
+
+  // the macroblock map is derived from the map-unit map, so build that one first
+  FmoGenerateMapUnitToSliceGroupMap(p_Vid);
+  FmoGenerateMbToSliceGroupMap(p_Vid);
+
+  p_Vid->NumberOfSliceGroups = p_Vid->active_pps->num_slice_groups_minus1 + 1;
+
+#ifdef PRINT_FMO_MAPS
+  printf("\n");
+  printf("FMO Map (Units):\n");
+
+  for (row = 0; row < p_Vid->PicHeightInMapUnits; row++)
+  {
+    for (col = 0; col < p_Vid->PicWidthInMbs; col++)
+    {
+      // 48 is ASCII '0': the slice group id is printed as a single character
+      printf("%c",48+p_Vid->MapUnitToSliceGroupMap[col + row * p_Vid->PicWidthInMbs]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+  printf("FMO Map (Mb):\n");
+
+  for (row = 0; row < p_Vid->PicHeightInMbs; row++)
+  {
+    for (col = 0; col < p_Vid->PicWidthInMbs; col++)
+    {
+      printf("%c",48 + p_Vid->MbToSliceGroupMap[col + row * p_Vid->PicWidthInMbs]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+
+#endif
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Releases both FMO maps and resets their pointers to NULL, so the
+ *    function can be called again without harm
+ *
+ * \param p_Vid
+ *      video parameters owning MbToSliceGroupMap and MapUnitToSliceGroupMap
+ *
+ * \return
+ *    always 0
+ ************************************************************************
+ */
+int FmoFinit(VideoParameters *p_Vid)
+{
+  // free(NULL) is a no-op, so no guard is needed around either call
+  free (p_Vid->MbToSliceGroupMap);
+  p_Vid->MbToSliceGroupMap = NULL;
+
+  free (p_Vid->MapUnitToSliceGroupMap);
+  p_Vid->MapUnitToSliceGroupMap = NULL;
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Returns the slice group count stored in p_Vid->NumberOfSliceGroups
+ *    (written by fmo_init)
+ *
+ * \param p_Vid
+ *      video parameters
+ ************************************************************************
+ */
+int FmoGetNumberOfSliceGroup(VideoParameters *p_Vid)
+{
+  const int groupCount = p_Vid->NumberOfSliceGroups;
+
+  return groupCount;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Returns the number of the last macroblock that belongs to the
+ *    highest-numbered slice group. With a single slice group this is
+ *    the last macroblock of the picture.
+ *
+ * \param p_Vid
+ *      video parameters
+ *
+ * \return
+ *    macroblock number, or -1 if no macroblock belongs to that group
+ ************************************************************************
+ */
+int FmoGetLastMBOfPicture(VideoParameters *p_Vid)
+{
+  const int lastGroup = FmoGetNumberOfSliceGroup(p_Vid) - 1;
+
+  return FmoGetLastMBInSliceGroup (p_Vid, lastGroup);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    FmoGetLastMBInSliceGroup: returns the number of the last macroblock
+ *    that belongs to the given slice group
+ *
+ * \param p_Vid
+ *      video parameters
+ * \param SliceGroup
+ *      slice group id to look for
+ *
+ * \return
+ *    macroblock number, or -1 if no macroblock belongs to SliceGroup
+ ************************************************************************
+ */
+
+int FmoGetLastMBInSliceGroup (VideoParameters *p_Vid, int SliceGroup)
+{
+  // walk backwards so the first hit is the last macroblock of the group
+  int mb = p_Vid->PicSizeInMbs-1;
+
+  while (mb >= 0)
+  {
+    if (FmoGetSliceGroupId (p_Vid, mb) == SliceGroup)
+      return mb;
+    --mb;
+  }
+
+  return -1;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Looks up the slice group id of a macroblock in
+ *    p_Vid->MbToSliceGroupMap
+ *
+ * \param p_Vid
+ *      video parameters holding the map
+ * \param mb
+ *    Macroblock number (in scan order); asserted to be below PicSizeInMbs
+ *
+ * \return
+ *    slice group id stored for that macroblock
+ ************************************************************************
+ */
+int FmoGetSliceGroupId (VideoParameters *p_Vid, int mb)
+{
+  int groupId;
+
+  assert (mb < (int) p_Vid->PicSizeInMbs);
+  assert (p_Vid->MbToSliceGroupMap != NULL);
+
+  groupId = p_Vid->MbToSliceGroupMap[mb];
+  return groupId;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    FmoGetNextMBNr: returns the number (in scan order) of the next
+ *    macroblock after CurrentMbNr that lies in the same slice group,
+ *    or -1 if there is none
+ *
+ * \param p_Vid
+ *      video parameters holding MbToSliceGroupMap
+ *
+ * \param CurrentMbNr
+ *    number of the current macroblock
+ ************************************************************************
+ */
+int FmoGetNextMBNr (VideoParameters *p_Vid, int CurrentMbNr)
+{
+  const int wantedGroup = FmoGetSliceGroupId (p_Vid, CurrentMbNr);
+  int next;
+
+  for (next = CurrentMbNr + 1; next < (int)p_Vid->PicSizeInMbs; next++)
+  {
+    if (p_Vid->MbToSliceGroupMap [next] == wantedGroup)
+      return next;
+  }
+
+  return -1;    // No further MB in this slice group (could be end of picture)
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate interleaved slice group map type MapUnit map (type 0):
+ *    runs of run_length_minus1[g]+1 map units are handed to groups
+ *    0..num_slice_groups_minus1 in turn until the map is full
+ *
+ * \param p_Vid
+ *      video parameters; active_pps is read, MapUnitToSliceGroupMap written
+ * \param PicSizeInMapUnits
+ *      number of entries in the map
+ ************************************************************************
+ */
+static void FmoGenerateType0MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+  unsigned group, offset;
+  unsigned pos = 0;
+
+  do
+  {
+    // one pass over all groups; start again if map units are left over
+    group = 0;
+    while( (group <= pps->num_slice_groups_minus1) && (pos < PicSizeInMapUnits) )
+    {
+      // fill this group's run, clipped at the end of the map
+      offset = 0;
+      while( offset <= pps->run_length_minus1[ group ] && pos + offset < PicSizeInMapUnits )
+      {
+        p_Vid->MapUnitToSliceGroupMap[pos+offset] = group;
+        offset++;
+      }
+
+      pos += pps->run_length_minus1[ group ] + 1;
+      group++;
+    }
+  }
+  while( pos < PicSizeInMapUnits );
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate dispersed slice group map type MapUnit map (type 1):
+ *    group = (column + (row * numGroups) / 2) % numGroups
+ *
+ * \param p_Vid
+ *      video parameters; PicWidthInMbs and active_pps are read,
+ *      MapUnitToSliceGroupMap is written
+ * \param PicSizeInMapUnits
+ *      number of entries in the map
+ ************************************************************************
+ */
+static void FmoGenerateType1MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+  const unsigned numGroups = pps->num_slice_groups_minus1 + 1;
+  unsigned unit;
+
+  for( unit = 0; unit < PicSizeInMapUnits; unit++ )
+  {
+    const unsigned col = unit % p_Vid->PicWidthInMbs;
+    const unsigned row = unit / p_Vid->PicWidthInMbs;
+
+    p_Vid->MapUnitToSliceGroupMap[unit] = (col + (row * numGroups) / 2) % numGroups;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate foreground with left-over slice group map type MapUnit map (type 2)
+ *
+ *    Every map unit first gets the left-over group num_slice_groups_minus1.
+ *    The rectangles top_left[g]..bottom_right[g] are then painted from the
+ *    highest foreground group down to group 0, so lower ids win on overlap.
+ *
+ * \param p_Vid
+ *      video parameters; PicWidthInMbs and active_pps are read,
+ *      MapUnitToSliceGroupMap is written
+ * \param PicSizeInMapUnits
+ *      number of entries in the map; no write goes past this bound
+ ************************************************************************
+ */
+static void FmoGenerateType2MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+  int iGroup;
+  unsigned i, x, y;
+  unsigned yTopLeft, xTopLeft, yBottomRight, xBottomRight;
+
+  for( i = 0; i < PicSizeInMapUnits; i++ )
+    p_Vid->MapUnitToSliceGroupMap[ i ] = pps->num_slice_groups_minus1;
+
+  for( iGroup = pps->num_slice_groups_minus1 - 1 ; iGroup >= 0; iGroup-- )
+  {
+    yTopLeft = pps->top_left[ iGroup ] / p_Vid->PicWidthInMbs;
+    xTopLeft = pps->top_left[ iGroup ] % p_Vid->PicWidthInMbs;
+    yBottomRight = pps->bottom_right[ iGroup ] / p_Vid->PicWidthInMbs;
+    xBottomRight = pps->bottom_right[ iGroup ] % p_Vid->PicWidthInMbs;
+
+    for( y = yTopLeft; y <= yBottomRight; y++ )
+    {
+      // the corners come from the PPS; a row starting beyond the map cannot
+      // hold any valid map unit, and neither can the rows after it
+      if( y * p_Vid->PicWidthInMbs >= PicSizeInMapUnits )
+        break;
+
+      for( x = xTopLeft; x <= xBottomRight; x++ )
+      {
+        unsigned mapUnit = y * p_Vid->PicWidthInMbs + x;
+
+        // never write outside the PicSizeInMapUnits-element map
+        if( mapUnit < PicSizeInMapUnits )
+          p_Vid->MapUnitToSliceGroupMap[ mapUnit ] = iGroup;
+      }
+    }
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate box-out slice group map type MapUnit map (type 3)
+ *
+ *    Starting near the picture centre, the map is walked in a growing
+ *    spiral. The first mapUnitsInSliceGroup0 units visited go to group 0,
+ *    all later ones to group 1. The value 2 marks a unit as not yet visited.
+ *
+ * \param p_Vid
+ *      video parameters; PicWidthInMbs, PicHeightInMapUnits,
+ *      slice_group_change_cycle and active_pps are read,
+ *      MapUnitToSliceGroupMap is written
+ * \param PicSizeInMapUnits
+ *      number of entries in the map
+ ************************************************************************
+ */
+static void FmoGenerateType3MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+  unsigned n, assigned;
+  int leftBound, topBound, rightBound, bottomBound;
+  int x, y, xDir, yDir;
+  int isVacant;
+
+  unsigned mapUnitsInSliceGroup0 = imin((pps->slice_group_change_rate_minus1 + 1) * p_Vid->slice_group_change_cycle, PicSizeInMapUnits);
+
+  // mark every map unit as vacant
+  n = 0;
+  while( n < PicSizeInMapUnits )
+    p_Vid->MapUnitToSliceGroupMap[ n++ ] = 2;
+
+  // start position and the box visited so far (a single unit at first)
+  x = ( p_Vid->PicWidthInMbs - pps->slice_group_change_direction_flag ) / 2;
+  y = ( p_Vid->PicHeightInMapUnits - pps->slice_group_change_direction_flag ) / 2;
+
+  leftBound   = rightBound  = x;
+  topBound    = bottomBound = y;
+
+  xDir =  pps->slice_group_change_direction_flag - 1;
+  yDir =  pps->slice_group_change_direction_flag;
+
+  // "assigned" only advances when a vacant unit was claimed in this step
+  assigned = 0;
+  while( assigned < PicSizeInMapUnits )
+  {
+    isVacant = ( p_Vid->MapUnitToSliceGroupMap[ y * p_Vid->PicWidthInMbs + x ]  ==  2 );
+    if( isVacant )
+       p_Vid->MapUnitToSliceGroupMap[ y * p_Vid->PicWidthInMbs + x ] = ( assigned >= mapUnitsInSliceGroup0 );
+
+    if( xDir  ==  -1  &&  x  ==  leftBound )
+    {
+      // reached the left edge of the box: widen it and turn
+      leftBound = imax( leftBound - 1, 0 );
+      x = leftBound;
+      xDir = 0;
+      yDir = 2 * pps->slice_group_change_direction_flag - 1;
+    }
+    else if( xDir  ==  1  &&  x  ==  rightBound )
+    {
+      // reached the right edge of the box: widen it and turn
+      rightBound = imin( rightBound + 1, (int)p_Vid->PicWidthInMbs - 1 );
+      x = rightBound;
+      xDir = 0;
+      yDir = 1 - 2 * pps->slice_group_change_direction_flag;
+    }
+    else if( yDir  ==  -1  &&  y  ==  topBound )
+    {
+      // reached the top edge of the box: grow it and turn
+      topBound = imax( topBound - 1, 0 );
+      y = topBound;
+      xDir = 1 - 2 * pps->slice_group_change_direction_flag;
+      yDir = 0;
+    }
+    else if( yDir  ==  1  &&  y  ==  bottomBound )
+    {
+      // reached the bottom edge of the box: grow it and turn
+      bottomBound = imin( bottomBound + 1, (int)p_Vid->PicHeightInMapUnits - 1 );
+      y = bottomBound;
+      xDir = 2 * pps->slice_group_change_direction_flag - 1;
+      yDir = 0;
+    }
+    else
+    {
+      // inside the box: keep moving in the current direction
+      x = x + xDir;
+      y = y + yDir;
+    }
+
+    assigned += isVacant;
+  }
+
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate raster scan slice group map type MapUnit map (type 4):
+ *    the first sizeOfUpperLeftGroup map units in raster order get the
+ *    value of slice_group_change_direction_flag, the rest get 1 minus it
+ *
+ * \param p_Vid
+ *      video parameters; slice_group_change_cycle and active_pps are read,
+ *      MapUnitToSliceGroupMap is written
+ * \param PicSizeInMapUnits
+ *      number of entries in the map
+ ************************************************************************
+ */
+static void FmoGenerateType4MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+
+  unsigned mapUnitsInSliceGroup0 = imin((pps->slice_group_change_rate_minus1 + 1) * p_Vid->slice_group_change_cycle, PicSizeInMapUnits);
+  unsigned sizeOfUpperLeftGroup = pps->slice_group_change_direction_flag ? ( PicSizeInMapUnits - mapUnitsInSliceGroup0 ) : mapUnitsInSliceGroup0;
+
+  unsigned unit = 0;
+
+  while( unit < PicSizeInMapUnits )
+  {
+    p_Vid->MapUnitToSliceGroupMap[ unit ] = ( unit < sizeOfUpperLeftGroup )
+                                          ? pps->slice_group_change_direction_flag
+                                          : 1 - pps->slice_group_change_direction_flag;
+    unit++;
+  }
+
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate wipe slice group map type MapUnit map (type 5):
+ *    same split as type 4, but the map units are counted column by
+ *    column (top to bottom, then left to right)
+ *
+ * \param p_Vid
+ *      video parameters; PicWidthInMbs, PicHeightInMapUnits,
+ *      slice_group_change_cycle and active_pps are read,
+ *      MapUnitToSliceGroupMap is written
+ * \param PicSizeInMapUnits
+ *      number of entries in the map
+ ************************************************************************
+ */
+static void FmoGenerateType5MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+
+  unsigned mapUnitsInSliceGroup0 = imin((pps->slice_group_change_rate_minus1 + 1) * p_Vid->slice_group_change_cycle, PicSizeInMapUnits);
+  unsigned sizeOfUpperLeftGroup = pps->slice_group_change_direction_flag ? ( PicSizeInMapUnits - mapUnitsInSliceGroup0 ) : mapUnitsInSliceGroup0;
+
+  unsigned row, col;
+  unsigned visited = 0;   // number of map units handled so far, in column order
+
+  for( col = 0; col < p_Vid->PicWidthInMbs; col++ )
+  {
+    for( row = 0; row < p_Vid->PicHeightInMapUnits; row++ )
+    {
+      p_Vid->MapUnitToSliceGroupMap[ row * p_Vid->PicWidthInMbs + col ] = ( visited < sizeOfUpperLeftGroup )
+                                                                        ? pps->slice_group_change_direction_flag
+                                                                        : 1 - pps->slice_group_change_direction_flag;
+      visited++;
+    }
+  }
+
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generate explicit slice group map type MapUnit map (type 6):
+ *    every map unit takes the id stored for it in pps->slice_group_id
+ *
+ * \param p_Vid
+ *      video parameters; active_pps is read, MapUnitToSliceGroupMap written
+ * \param PicSizeInMapUnits
+ *      number of entries to copy
+ ************************************************************************
+ */
+static void FmoGenerateType6MapUnitMap (VideoParameters *p_Vid, unsigned PicSizeInMapUnits )
+{
+  pic_parameter_set_rbsp_t* pps = p_Vid->active_pps;
+  unsigned unit = 0;
+
+  while (unit < PicSizeInMapUnits)
+  {
+    p_Vid->MapUnitToSliceGroupMap[unit] = pps->slice_group_id[unit];
+    unit++;
+  }
+}
+
diff --git a/Src/h264dec/ldecod/src/header.c b/Src/h264dec/ldecod/src/header.c
new file mode 100644
index 00000000..1823d82b
--- /dev/null
+++ b/Src/h264dec/ldecod/src/header.c
@@ -0,0 +1,857 @@
+
+/*!
+ *************************************************************************************
+ * \file header.c
+ *
+ * \brief
+ *    H.264 Slice headers
+ *
+ *************************************************************************************
+ */
+
+#include "global.h"
+#include "elements.h"
+#include "defines.h"
+#include "fmo.h"
+#include "vlc.h"
+#include "mbuffer.h"
+#include "header.h"
+
+#include "ctx_tables.h"
+
+
+// SYMTRACESTRING(s): with TRACE enabled, copies s into sym.tracestring
+// (at most TRACESTRING_SIZE characters); otherwise it expands to nothing.
+#if TRACE
+#define SYMTRACESTRING(s) strncpy(sym.tracestring,s,TRACESTRING_SIZE)
+#else
+#define SYMTRACESTRING(s) // do nothing
+#endif
+
+// slice header helpers defined later in this file
+static void ref_pic_list_reordering(Slice *currSlice);
+static void pred_weight_table(Slice *currSlice);
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    calculate Ceil(Log2(uiVal)) by counting how many right shifts it
+ *    takes to bring uiVal-1 down to zero
+ *
+ * \param uiVal
+ *    value to take the logarithm of
+ *
+ * \return
+ *    number of significant bits of uiVal-1 (0 for uiVal == 1)
+ ************************************************************************
+ */
+unsigned CeilLog2( unsigned uiVal)
+{
+  unsigned bits = 0;
+  unsigned rest;
+
+  for( rest = uiVal-1; rest != 0; rest >>= 1 )
+    bits++;
+
+  return bits;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    calculate Ceil(Log2(uiVal)); same shift-counting loop as CeilLog2
+ *
+ * \param uiVal
+ *    value to take the logarithm of
+ *
+ * \return
+ *    number of significant bits of uiVal-1 (0 for uiVal == 1)
+ ************************************************************************
+ */
+unsigned CeilLog2_sf( unsigned uiVal)
+{
+  unsigned rest = uiVal-1;
+  unsigned bits;
+
+  for( bits = 0; rest > 0; bits++ )
+    rest >>= 1;
+
+  return bits;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read the first part of the slice header: first_mb_in_slice,
+ *    slice_type, pic_parameter_set_id and, when separate colour planes
+ *    are in use, colour_plane_id
+ *
+ * \param currSlice
+ *    slice being parsed; results are stored in it and in currSlice->p_Vid
+ ************************************************************************
+ */
+void FirstPartOfSliceHeader(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  byte headerPartition = assignSE2partition[currSlice->dp_mode][SE_HEADER];
+  Bitstream *stream = currSlice->partArr[headerPartition].bitstream;
+  int sliceTypeCode;
+
+  // Get first_mb_in_slice
+  currSlice->start_mb_nr = ue_v ("SH: first_mb_in_slice", stream);
+
+  // codes above 4 are folded onto the same slice types as codes 0..4
+  sliceTypeCode = ue_v ("SH: slice_type", stream);
+  if (sliceTypeCode >= 5)
+    sliceTypeCode -= 5;
+
+  currSlice->slice_type = (SliceType) sliceTypeCode;
+  p_Vid->type = currSlice->slice_type;
+
+  currSlice->pic_parameter_set_id = ue_v ("SH: pic_parameter_set_id", stream);
+
+  // colour_plane_id is only coded for separately coded colour planes
+  p_Vid->colour_plane_id = p_Vid->separate_colour_plane_flag
+                         ? u_v (2, "SH: colour_plane_id", stream)
+                         : PLANE_Y;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read the second part of the slice header (everything after the
+ *    pic_parameter_set_id) and store the values in currSlice and in
+ *    currSlice->p_Vid
+ *
+ * \param currSlice
+ *    slice being parsed; its header partition bitstream is consumed
+ *
+ * \note
+ *    The syntax elements are read in bitstream order, so the order of
+ *    the reads below must not be changed. Out-of-range QP/QS values are
+ *    reported through error().
+ ************************************************************************
+ */
+void RestOfSliceHeader(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  InputParameters *p_Inp = currSlice->p_Inp;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+  // the slice header lives in the partition assigned to SE_HEADER
+  byte dP_nr = assignSE2partition[currSlice->dp_mode][SE_HEADER];
+  DataPartition *partition = &(currSlice->partArr[dP_nr]);
+  Bitstream *currStream = partition->bitstream;
+
+  int val, len;
+
+  p_Vid->frame_num = u_v (active_sps->log2_max_frame_num_minus4 + 4, "SH: frame_num", currStream);
+
+  /* Tian Dong: frame_num gap processing, if found */
+  if (p_Vid->idr_flag)
+  {
+    p_Vid->pre_frame_num = p_Vid->frame_num;
+    // picture error concealment
+    p_Vid->last_ref_pic_poc = 0;
+    // NOTE(review): asserts on a value read from the bitstream; compiled out with NDEBUG
+    assert(p_Vid->frame_num == 0);
+  }
+
+  // picture structure: frame, top field or bottom field
+  if (active_sps->frame_mbs_only_flag)
+  {
+    p_Vid->structure = FRAME;
+    p_Vid->field_pic_flag=0;
+  }
+  else
+  {
+    // field_pic_flag   u(1)
+    p_Vid->field_pic_flag = u_1("SH: field_pic_flag", currStream);
+    if (p_Vid->field_pic_flag)
+    {
+      // bottom_field_flag  u(1)
+      p_Vid->bottom_field_flag = (byte)u_1("SH: bottom_field_flag", currStream);
+      p_Vid->structure = p_Vid->bottom_field_flag ? BOTTOM_FIELD : TOP_FIELD;
+    }
+    else
+    {
+      p_Vid->structure = FRAME;
+      p_Vid->bottom_field_flag = FALSE;
+    }
+  }
+
+  currSlice->structure = (PictureStructure) p_Vid->structure;
+
+  // MBAFF applies only to frame pictures of a stream that enables it in the SPS
+  p_Vid->mb_aff_frame_flag=(active_sps->mb_adaptive_frame_field_flag && (p_Vid->field_pic_flag==0));
+  currSlice->mb_aff_frame_flag = p_Vid->mb_aff_frame_flag;
+
+  // consistency checks between structure and the two field flags
+  if (p_Vid->structure == FRAME       ) 
+    assert (p_Vid->field_pic_flag == 0);
+  if (p_Vid->structure == TOP_FIELD   ) 
+    assert (p_Vid->field_pic_flag == 1 && (p_Vid->bottom_field_flag == FALSE));
+  if (p_Vid->structure == BOTTOM_FIELD) 
+    assert (p_Vid->field_pic_flag == 1 && (p_Vid->bottom_field_flag == TRUE ));
+
+  if (p_Vid->idr_flag)
+  {
+    p_Vid->idr_pic_id = ue_v("SH: idr_pic_id", currStream);
+  }
+
+  // picture order count syntax for pic_order_cnt_type 0
+  if (active_sps->pic_order_cnt_type == 0)
+  {
+    p_Vid->pic_order_cnt_lsb = u_v(active_sps->log2_max_pic_order_cnt_lsb_minus4 + 4, "SH: pic_order_cnt_lsb", currStream);
+    if( p_Vid->active_pps->bottom_field_pic_order_in_frame_present_flag  ==  1 &&  !p_Vid->field_pic_flag )
+      p_Vid->delta_pic_order_cnt_bottom = se_v("SH: delta_pic_order_cnt_bottom", currStream);
+    else
+      p_Vid->delta_pic_order_cnt_bottom = 0;
+  }
+  // picture order count syntax for pic_order_cnt_type 1; the deltas are zeroed when they are not coded
+  if( active_sps->pic_order_cnt_type == 1 && !active_sps->delta_pic_order_always_zero_flag )
+  {
+    p_Vid->delta_pic_order_cnt[ 0 ] = se_v("SH: delta_pic_order_cnt[0]", currStream);
+    if( p_Vid->active_pps->bottom_field_pic_order_in_frame_present_flag  ==  1  &&  !p_Vid->field_pic_flag )
+      p_Vid->delta_pic_order_cnt[ 1 ] = se_v("SH: delta_pic_order_cnt[1]", currStream);
+  }else
+  {
+    if (active_sps->pic_order_cnt_type == 1)
+    {
+      p_Vid->delta_pic_order_cnt[ 0 ] = 0;
+      p_Vid->delta_pic_order_cnt[ 1 ] = 0;
+    }
+  }
+
+  // redundant_pic_cnt is only coded when the PPS signals its presence
+  if (p_Vid->active_pps->redundant_pic_cnt_present_flag)
+  {
+    p_Vid->redundant_pic_cnt = ue_v ("SH: redundant_pic_cnt", currStream);
+  }
+
+  if(currSlice->slice_type == B_SLICE)
+  {
+    currSlice->direct_spatial_mv_pred_flag = u_1 ("SH: direct_spatial_mv_pred_flag", currStream);
+  }
+
+  // start from the PPS defaults; the slice header may override them below
+  currSlice->num_ref_idx_l0_active = p_Vid->active_pps->num_ref_idx_l0_active_minus1 + 1;
+  currSlice->num_ref_idx_l1_active = p_Vid->active_pps->num_ref_idx_l1_active_minus1 + 1;
+
+  if(p_Vid->type==P_SLICE || p_Vid->type == SP_SLICE || p_Vid->type==B_SLICE)
+  {
+    val = u_1 ("SH: num_ref_idx_override_flag", currStream);
+    if (val)
+    {
+      currSlice->num_ref_idx_l0_active = 1 + ue_v ("SH: num_ref_idx_l0_active_minus1", currStream);
+
+      if(p_Vid->type==B_SLICE)
+      {
+        currSlice->num_ref_idx_l1_active = 1 + ue_v ("SH: num_ref_idx_l1_active_minus1", currStream);
+      }
+    }
+  }
+  // list 1 is only used by B slices
+  if (currSlice->slice_type!=B_SLICE)
+  {
+    currSlice->num_ref_idx_l1_active = 0;
+  }
+
+  ref_pic_list_reordering(currSlice);
+
+  // weights are applied for P/SP slices with weighted_pred_flag and for B slices with any non-zero weighted_bipred_idc
+  currSlice->apply_weights = ((p_Vid->active_pps->weighted_pred_flag && (currSlice->slice_type == P_SLICE || currSlice->slice_type == SP_SLICE) )
+          || ((p_Vid->active_pps->weighted_bipred_idc > 0 ) && (currSlice->slice_type == B_SLICE)));
+
+  // the weight table itself is only read for P/SP slices, or for B slices when weighted_bipred_idc == 1
+  if ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+      (p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE)))
+  {
+    pred_weight_table(currSlice);
+  }
+
+  // reference picture marking is only read when nal_reference_idc is non-zero
+  if (p_Vid->nal_reference_idc)
+    dec_ref_pic_marking(p_Vid, currStream);
+
+  // cabac_init_idc is only coded for non-intra slices of entropy_coding_mode_flag streams
+  if (p_Vid->active_pps->entropy_coding_mode_flag && p_Vid->type!=I_SLICE && p_Vid->type!=SI_SLICE)
+  {
+    currSlice->model_number = ue_v("SH: cabac_init_idc", currStream);
+  }
+  else
+  {
+    currSlice->model_number = 0;
+  }
+
+  // slice QP = 26 + pic_init_qp_minus26 + slice_qp_delta
+  currSlice->slice_qp_delta = val = se_v("SH: slice_qp_delta", currStream);
+  currSlice->qp = p_Vid->qp = 26 + p_Vid->active_pps->pic_init_qp_minus26 + val;
+
+  if ((p_Vid->qp < -p_Vid->bitdepth_luma_qp_scale) || (p_Vid->qp > 51))
+    error ("slice_qp_delta makes slice_qp_y out of range", 500);
+
+  if(p_Vid->type==SP_SLICE || p_Vid->type == SI_SLICE)
+  {
+    if(p_Vid->type==SP_SLICE)
+    {
+      p_Vid->sp_switch = u_1 ("SH: sp_for_switch_flag", currStream);
+    }
+    // slice QS = 26 + pic_init_qs_minus26 + slice_qs_delta
+    currSlice->slice_qs_delta = val = se_v("SH: slice_qs_delta", currStream);
+    currSlice->qs = 26 + p_Vid->active_pps->pic_init_qs_minus26 + val;    
+    if ((currSlice->qs < 0) || (currSlice->qs > 51))
+      error ("slice_qs_delta makes slice_qs_y out of range", 500);
+  }
+
+  // NOTE(review): HI_INTRA_ONLY_PROFILE is defined outside this file - presumably true for the intra-only high profiles; confirm
+  if ( !HI_INTRA_ONLY_PROFILE || (HI_INTRA_ONLY_PROFILE && (p_Inp->intra_profile_deblocking == 1) ))
+  // then read flags and parameters from the bitstream
+  {
+    if (p_Vid->active_pps->deblocking_filter_control_present_flag)
+    {
+      currSlice->DFDisableIdc = (short)ue_v ("SH: disable_deblocking_filter_idc", currStream);
+
+      // the offsets are coded divided by two and are only present when the filter is not disabled
+      if (currSlice->DFDisableIdc!=1)
+      {
+        currSlice->DFAlphaC0Offset = (short) (2 * se_v("SH: slice_alpha_c0_offset_div2", currStream));
+        currSlice->DFBetaOffset    = (short) (2 * se_v("SH: slice_beta_offset_div2", currStream));
+      }
+      else
+      {
+        currSlice->DFAlphaC0Offset = currSlice->DFBetaOffset = 0;
+      }
+    }
+    else
+    {
+      currSlice->DFDisableIdc = currSlice->DFAlphaC0Offset = currSlice->DFBetaOffset = 0;
+    }
+  }
+  else //By default the Loop Filter is Off
+  { //444_TEMP_NOTE: change made below. 08/07/07
+    // the syntax elements must still be parsed (read flags and parameters from the bitstream) but their values are ignored
+    if (p_Vid->active_pps->deblocking_filter_control_present_flag)
+    {
+      currSlice->DFDisableIdc = (short) ue_v ("SH: disable_deblocking_filter_idc", currStream);
+
+      if (currSlice->DFDisableIdc!=1)
+      {
+        currSlice->DFAlphaC0Offset = (short) (2 * se_v("SH: slice_alpha_c0_offset_div2", currStream));
+        currSlice->DFBetaOffset    = (short) (2 * se_v("SH: slice_beta_offset_div2", currStream));
+      }
+    }//444_TEMP_NOTE. the end of change. 08/07/07
+    //Ignore the SEs, by default the Loop Filter is Off
+    currSlice->DFDisableIdc =1;
+    currSlice->DFAlphaC0Offset = currSlice->DFBetaOffset = 0;
+  }
+
+
+  // slice_group_change_cycle is only coded for the changing map types 3..5 with more than one slice group
+  if (p_Vid->active_pps->num_slice_groups_minus1>0 && p_Vid->active_pps->slice_group_map_type>=3 &&
+      p_Vid->active_pps->slice_group_map_type<=5)
+  {
+    // len = Ceil(PicSizeInMapUnits / SliceGroupChangeRate): integer division, plus one if there is a remainder
+    len = (active_sps->pic_height_in_map_units_minus1+1)*(active_sps->pic_width_in_mbs_minus1+1)/
+          (p_Vid->active_pps->slice_group_change_rate_minus1+1);
+    if (((active_sps->pic_height_in_map_units_minus1+1)*(active_sps->pic_width_in_mbs_minus1+1))%
+          (p_Vid->active_pps->slice_group_change_rate_minus1+1))
+          len +=1;
+
+    // width in bits of the slice_group_change_cycle field
+    len = CeilLog2(len+1);
+
+    p_Vid->slice_group_change_cycle = u_v (len, "SH: slice_group_change_cycle", currStream);
+  }
+  // a field picture has half the macroblock rows of the frame
+  p_Vid->PicHeightInMbs = p_Vid->FrameHeightInMbs / ( 1 + p_Vid->field_pic_flag );
+  p_Vid->PicSizeInMbs   = p_Vid->PicWidthInMbs * p_Vid->PicHeightInMbs;
+  p_Vid->FrameSizeInMbs = p_Vid->PicWidthInMbs * p_Vid->FrameHeightInMbs;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read the reference picture reordering information
+ *
+ *    For list 0 (all slices except I/SI) and list 1 (B slices) a flag is
+ *    read and, if set, a sequence of reordering commands terminated by
+ *    the value 3. The commands are stored in the per-list arrays of
+ *    currSlice.
+ *
+ * \param currSlice
+ *    slice being parsed; num_ref_idx_l0_active / num_ref_idx_l1_active
+ *    must already be set because they bound the number of commands
+ *
+ * \note
+ *    H.264 allows at most num_ref_idx_lX_active commands before the
+ *    terminating 3. A stream that exceeds this is reported through
+ *    error() and parsing of that list stops, instead of writing past the
+ *    arrays (presumably sized by alloc_ref_pic_list_reordering_buffer()
+ *    for exactly that many entries - confirm against mbuffer.c).
+ ************************************************************************
+ */
+static void ref_pic_list_reordering(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  byte dP_nr = assignSE2partition[currSlice->dp_mode][SE_HEADER];
+  DataPartition *partition = &(currSlice->partArr[dP_nr]);
+  Bitstream *currStream = partition->bitstream;
+  int i, val;
+
+  alloc_ref_pic_list_reordering_buffer(currSlice);
+
+  if (p_Vid->type!=I_SLICE && p_Vid->type!=SI_SLICE)
+  {
+    val = currSlice->ref_pic_list_reordering_flag_l0 = u_1 ("SH: ref_pic_list_reordering_flag_l0", currStream);
+
+    if (val)
+    {
+      i=0;
+      do
+      {
+        // entry i would be command number i+1; only num_ref_idx_l0_active
+        // commands plus the terminator are legal
+        if (i > currSlice->num_ref_idx_l0_active)
+        {
+          error ("too many ref_pic_list_reordering commands for list 0", 500);
+          break;  // only reached if error() returns
+        }
+
+        val = currSlice->reordering_of_pic_nums_idc_l0[i] = ue_v("SH: reordering_of_pic_nums_idc_l0", currStream);
+        if (val==0 || val==1)
+        {
+          currSlice->abs_diff_pic_num_minus1_l0[i] = ue_v("SH: abs_diff_pic_num_minus1_l0", currStream);
+        }
+        else
+        {
+          if (val==2)
+          {
+            currSlice->long_term_pic_idx_l0[i] = ue_v("SH: long_term_pic_idx_l0", currStream);
+          }
+        }
+        i++;
+      } while (val != 3);
+    }
+  }
+
+  if (p_Vid->type==B_SLICE)
+  {
+    val = currSlice->ref_pic_list_reordering_flag_l1 = u_1 ("SH: ref_pic_list_reordering_flag_l1", currStream);
+
+    if (val)
+    {
+      i=0;
+      do
+      {
+        // same limit as for list 0, using the list 1 reference count
+        if (i > currSlice->num_ref_idx_l1_active)
+        {
+          error ("too many ref_pic_list_reordering commands for list 1", 500);
+          break;  // only reached if error() returns
+        }
+
+        val = currSlice->reordering_of_pic_nums_idc_l1[i] = ue_v("SH: reordering_of_pic_nums_idc_l1", currStream);
+        if (val==0 || val==1)
+        {
+          currSlice->abs_diff_pic_num_minus1_l1[i] = ue_v("SH: abs_diff_pic_num_minus1_l1", currStream);
+        }
+        else
+        {
+          if (val==2)
+          {
+            currSlice->long_term_pic_idx_l1[i] = ue_v("SH: long_term_pic_idx_l1", currStream);
+          }
+        }
+        i++;
+      } while (val != 3);
+    }
+  }
+
+  // set reference index of redundant slices.
+  if(p_Vid->redundant_pic_cnt && (p_Vid->type != I_SLICE) )
+  {
+    p_Vid->redundant_slice_ref_idx = currSlice->abs_diff_pic_num_minus1_l0[0] + 1;
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    reset the weights of both lists to their default, 1 << log2 denominator:
+ *    luma_log2_weight_denom for component 0, chroma_log2_weight_denom
+ *    for components 1 and 2
+ *
+ * \param currSlice
+ *    slice whose wp_weight table is overwritten
+ ************************************************************************
+ */
+static void reset_wp_params(Slice *currSlice)
+{
+  const int lumaDefault   = 1 << currSlice->luma_log2_weight_denom;
+  const int chromaDefault = 1 << currSlice->chroma_log2_weight_denom;
+  int ref, list;
+
+  for (ref = 0; ref < MAX_REFERENCE_PICTURES; ref++)
+  {
+    for (list = 0; list < 2; list++)
+    {
+      currSlice->wp_weight[list][ref][0] = lumaDefault;
+      currSlice->wp_weight[list][ref][1] = chromaDefault;
+      currSlice->wp_weight[list][ref][2] = chromaDefault;
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read the weighted prediction tables
+ *
+ *    Parses, from the header partition of the slice, the luma (and, when
+ *    chroma_format_idc != 0, chroma) log2 weight denominators, followed by
+ *    one weight/offset entry per active list 0 reference and, for B slices
+ *    with weighted_bipred_idc == 1, per active list 1 reference.  Entries
+ *    whose flag is 0 get the default weight (1 << denominator) and offset 0.
+ *
+ *    The denominators are used as shift counts, so values outside 0..7 are
+ *    reported through error() instead of being shifted by.
+ *
+ * \param currSlice
+ *    slice whose wp_weight / wp_offset / wp_round_* fields are filled in
+ ************************************************************************
+ */
+static void pred_weight_table(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+  byte dP_nr = assignSE2partition[currSlice->dp_mode][SE_HEADER];
+  DataPartition *partition = &(currSlice->partArr[dP_nr]);
+  Bitstream *currStream = partition->bitstream;
+  int luma_weight_flag_l0, luma_weight_flag_l1, chroma_weight_flag_l0, chroma_weight_flag_l1;
+  int i,j;
+  int denom;
+  const int max_log2_weight_denom = 7; // upper bound of both *_log2_weight_denom syntax elements
+
+  denom = ue_v ("SH: luma_log2_weight_denom", currStream);
+  if (denom < 0 || denom > max_log2_weight_denom)
+  {
+    error("luma_log2_weight_denom out of range (0..7)", 500);
+    denom = max_log2_weight_denom; // only reached if error() returns: keep the shifts below defined
+  }
+  currSlice->luma_log2_weight_denom = denom;
+  currSlice->wp_round_luma = denom ? 1<<(denom - 1): 0;
+
+  if ( 0 != active_sps->chroma_format_idc)
+  {
+    denom = ue_v ("SH: chroma_log2_weight_denom", currStream);
+    if (denom < 0 || denom > max_log2_weight_denom)
+    {
+      error("chroma_log2_weight_denom out of range (0..7)", 500);
+      denom = max_log2_weight_denom; // only reached if error() returns
+    }
+    currSlice->chroma_log2_weight_denom = denom;
+    currSlice->wp_round_chroma = denom ? 1<<(denom - 1): 0;
+  }
+  // NOTE(review): for chroma_format_idc == 0 the chroma denominator keeps whatever
+  // value it had before, and reset_wp_params() still reads it -- confirm this is intended.
+
+  reset_wp_params(currSlice);
+
+  // list 0 entries
+  for (i=0; i<currSlice->num_ref_idx_l0_active; i++)
+  {
+    luma_weight_flag_l0 = u_1("SH: luma_weight_flag_l0", currStream);
+
+    if (luma_weight_flag_l0)
+    {
+      currSlice->wp_weight[0][i][0] = se_v ("SH: luma_weight_l0", currStream);
+      currSlice->wp_offset[0][i][0] = se_v ("SH: luma_offset_l0", currStream);
+      // scale the offset to the coded bit depth; multiply rather than shift because the offset may be negative
+      currSlice->wp_offset[0][i][0] = currSlice->wp_offset[0][i][0] * (1 << (p_Vid->bitdepth_luma - 8));
+    }
+    else
+    {
+      currSlice->wp_weight[0][i][0] = 1 << currSlice->luma_log2_weight_denom;
+      currSlice->wp_offset[0][i][0] = 0;
+    }
+
+    if (active_sps->chroma_format_idc != 0)
+    {
+      // a single flag covers both chroma components (j = 1, 2)
+      chroma_weight_flag_l0 = u_1 ("SH: chroma_weight_flag_l0", currStream);
+
+      for (j=1; j<3; j++)
+      {
+        if (chroma_weight_flag_l0)
+        {
+          currSlice->wp_weight[0][i][j] = se_v("SH: chroma_weight_l0", currStream);
+          currSlice->wp_offset[0][i][j] = se_v("SH: chroma_offset_l0", currStream);
+          currSlice->wp_offset[0][i][j] = currSlice->wp_offset[0][i][j] * (1 << (p_Vid->bitdepth_chroma - 8));
+        }
+        else
+        {
+          currSlice->wp_weight[0][i][j] = 1<<currSlice->chroma_log2_weight_denom;
+          currSlice->wp_offset[0][i][j] = 0;
+        }
+      }
+    }
+  }
+
+  // list 1 entries are only present for B slices using explicit weighting
+  if ((p_Vid->type == B_SLICE) && p_Vid->active_pps->weighted_bipred_idc == 1)
+  {
+    for (i=0; i<currSlice->num_ref_idx_l1_active; i++)
+    {
+      luma_weight_flag_l1 = u_1("SH: luma_weight_flag_l1", currStream);
+
+      if (luma_weight_flag_l1)
+      {
+        currSlice->wp_weight[1][i][0] = se_v ("SH: luma_weight_l1", currStream);
+        currSlice->wp_offset[1][i][0] = se_v ("SH: luma_offset_l1", currStream);
+        currSlice->wp_offset[1][i][0] = currSlice->wp_offset[1][i][0] * (1 << (p_Vid->bitdepth_luma - 8));
+      }
+      else
+      {
+        currSlice->wp_weight[1][i][0] = 1<<currSlice->luma_log2_weight_denom;
+        currSlice->wp_offset[1][i][0] = 0;
+      }
+
+      if (active_sps->chroma_format_idc != 0)
+      {
+        chroma_weight_flag_l1 = u_1 ("SH: chroma_weight_flag_l1", currStream);
+
+        for (j=1; j<3; j++)
+        {
+          if (chroma_weight_flag_l1)
+          {
+            currSlice->wp_weight[1][i][j] = se_v("SH: chroma_weight_l1", currStream);
+            currSlice->wp_offset[1][i][j] = se_v("SH: chroma_offset_l1", currStream);
+            currSlice->wp_offset[1][i][j] = currSlice->wp_offset[1][i][j] * (1 << (p_Vid->bitdepth_chroma - 8));
+          }
+          else
+          {
+            currSlice->wp_weight[1][i][j] = 1<<currSlice->chroma_log2_weight_denom;
+            currSlice->wp_offset[1][i][j] = 0;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    read the memory control operations
+ *
+ *    Frees any commands still queued in p_Vid->dec_ref_pic_marking_buffer,
+ *    then reads the marking syntax from the stream.  For IDR pictures only
+ *    no_output_of_prior_pics_flag and long_term_reference_flag are read.
+ *    Otherwise, when adaptive_ref_pic_buffering_flag is set, one
+ *    DecRefPicMarking_t node is allocated per memory management control
+ *    operation and appended, in stream order, to
+ *    p_Vid->dec_ref_pic_marking_buffer; the terminating operation 0 is
+ *    stored as the last node.
+ *
+ * \param p_Vid
+ *    decoder state receiving the flags and the command list
+ * \param currStream
+ *    bitstream positioned at the dec_ref_pic_marking syntax
+ ************************************************************************
+ */
+void dec_ref_pic_marking(VideoParameters *p_Vid, Bitstream *currStream)
+{
+  int val;
+
+  DecRefPicMarking_t *tmp_drpm;
+  DecRefPicMarking_t *tail = NULL;   // last node appended so far (list is empty after the purge below)
+
+  // free old buffer content
+  while (p_Vid->dec_ref_pic_marking_buffer)
+  {
+    tmp_drpm=p_Vid->dec_ref_pic_marking_buffer;
+
+    p_Vid->dec_ref_pic_marking_buffer=tmp_drpm->Next;
+    free (tmp_drpm);
+  }
+
+  if (p_Vid->idr_flag)
+  {
+    p_Vid->no_output_of_prior_pics_flag = u_1("SH: no_output_of_prior_pics_flag", currStream);
+    p_Vid->long_term_reference_flag = u_1("SH: long_term_reference_flag", currStream);
+  }
+  else
+  {
+    p_Vid->adaptive_ref_pic_buffering_flag = u_1("SH: adaptive_ref_pic_buffering_flag", currStream);
+    if (p_Vid->adaptive_ref_pic_buffering_flag)
+    {
+      // read Memory Management Control Operation
+      do
+      {
+        tmp_drpm=(DecRefPicMarking_t*)calloc (1,sizeof (DecRefPicMarking_t));
+        if (tmp_drpm == NULL)
+        {
+          // do not dereference a failed allocation; the commands read so far stay queued
+          error("dec_ref_pic_marking: out of memory", 100);
+          return;
+        }
+        tmp_drpm->Next=NULL;
+
+        val = tmp_drpm->memory_management_control_operation = ue_v("SH: memory_management_control_operation", currStream);
+
+        // operand layout depends on the operation code
+        if ((val==1)||(val==3))
+        {
+          tmp_drpm->difference_of_pic_nums_minus1 = ue_v("SH: difference_of_pic_nums_minus1", currStream);
+        }
+        if (val==2)
+        {
+          tmp_drpm->long_term_pic_num = ue_v("SH: long_term_pic_num", currStream);
+        }
+
+        if ((val==3)||(val==6))
+        {
+          tmp_drpm->long_term_frame_idx = ue_v("SH: long_term_frame_idx", currStream);
+        }
+        if (val==4)
+        {
+          tmp_drpm->max_long_term_frame_idx_plus1 = ue_v("SH: max_long_term_pic_idx_plus1", currStream);
+        }
+
+        // add command at the end of the list, without re-walking it
+        if (tail == NULL)
+        {
+          p_Vid->dec_ref_pic_marking_buffer=tmp_drpm;
+        }
+        else
+        {
+          tail->Next=tmp_drpm;
+        }
+        tail = tmp_drpm;
+
+      }
+      while (val != 0);
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    To calculate the poc values
+ *        based upon JVT-F100d2
+ *  POC200301: Until Jan 2003, this function will calculate the correct POC
+ *    values, but the management of POCs in buffered pictures may need more work.
+ *
+ *    Depending on active_sps->pic_order_cnt_type (0, 1 or 2) this derives
+ *    p_Vid->toppoc / bottompoc / framepoc / ThisPOC for the current picture
+ *    and updates the "previous picture" state (PrevPicOrderCnt*,
+ *    PreviousFrameNum, PreviousFrameNumOffset) used for the next call.
+ * \param p_Vid
+ *    decoder state; read and modified in place
+ * \return
+ *    none
+ ************************************************************************
+ */
+void decode_poc(VideoParameters *p_Vid)
+{
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+  int i;
+  // for POC mode 0:
+  unsigned int MaxPicOrderCntLsb = (1<<(active_sps->log2_max_pic_order_cnt_lsb_minus4+4));
+
+  switch ( active_sps->pic_order_cnt_type )
+  {
+  case 0: // POC MODE 0
+    // 1st: reset the previous-picture MSB/LSB on an IDR picture or after an MMCO 5
+    if(p_Vid->idr_flag)
+    {
+      p_Vid->PrevPicOrderCntMsb = 0;
+      p_Vid->PrevPicOrderCntLsb = 0;
+    }
+    else
+    {
+      if (p_Vid->last_has_mmco_5)
+      {
+        if (p_Vid->last_pic_bottom_field)
+        {
+          p_Vid->PrevPicOrderCntMsb = 0;
+          p_Vid->PrevPicOrderCntLsb = 0;
+        }
+        else
+        {
+          p_Vid->PrevPicOrderCntMsb = 0;
+          p_Vid->PrevPicOrderCntLsb = p_Vid->toppoc;
+        }
+      }
+    }
+    // Calculate the MSBs of current picture:
+    // an LSB jump of at least / more than half the LSB range is treated as a wrap-around
+    if( p_Vid->pic_order_cnt_lsb  <  p_Vid->PrevPicOrderCntLsb  &&
+      ( p_Vid->PrevPicOrderCntLsb - p_Vid->pic_order_cnt_lsb )  >=  ( MaxPicOrderCntLsb / 2 ) )
+      p_Vid->PicOrderCntMsb = p_Vid->PrevPicOrderCntMsb + MaxPicOrderCntLsb;
+    else if ( p_Vid->pic_order_cnt_lsb  >  p_Vid->PrevPicOrderCntLsb  &&
+      ( p_Vid->pic_order_cnt_lsb - p_Vid->PrevPicOrderCntLsb )  >  ( MaxPicOrderCntLsb / 2 ) )
+      p_Vid->PicOrderCntMsb = p_Vid->PrevPicOrderCntMsb - MaxPicOrderCntLsb;
+    else
+      p_Vid->PicOrderCntMsb = p_Vid->PrevPicOrderCntMsb;
+
+    // 2nd: combine MSB and LSB into the top/bottom/frame POC
+
+    if(p_Vid->field_pic_flag==0)
+    {           //frame pix
+      p_Vid->toppoc = p_Vid->PicOrderCntMsb + p_Vid->pic_order_cnt_lsb;
+      p_Vid->bottompoc = p_Vid->toppoc + p_Vid->delta_pic_order_cnt_bottom;
+      p_Vid->ThisPOC = p_Vid->framepoc = (p_Vid->toppoc < p_Vid->bottompoc)? p_Vid->toppoc : p_Vid->bottompoc; // POC200301
+    }
+    else if (p_Vid->bottom_field_flag == FALSE)
+    {  //top field
+      p_Vid->ThisPOC= p_Vid->toppoc = p_Vid->PicOrderCntMsb + p_Vid->pic_order_cnt_lsb;
+    }
+    else
+    {  //bottom field
+      p_Vid->ThisPOC= p_Vid->bottompoc = p_Vid->PicOrderCntMsb + p_Vid->pic_order_cnt_lsb;
+    }
+    // for fields this is the only assignment of framepoc; for frames it repeats the value set above
+    p_Vid->framepoc=p_Vid->ThisPOC;
+
+    if ( p_Vid->frame_num!=p_Vid->PreviousFrameNum)
+      p_Vid->PreviousFrameNum=p_Vid->frame_num;
+
+    // only pictures with nal_reference_idc != 0 become the predictor for the next picture
+    if(p_Vid->nal_reference_idc)
+    {
+      p_Vid->PrevPicOrderCntLsb = p_Vid->pic_order_cnt_lsb;
+      p_Vid->PrevPicOrderCntMsb = p_Vid->PicOrderCntMsb;
+    }
+
+    break;
+
+  case 1: // POC MODE 1
+    // 1st: derive FrameNumOffset (adds MaxFrameNum each time frame_num wraps)
+    if(p_Vid->idr_flag)
+    {
+      p_Vid->FrameNumOffset=0;     //  first pix of IDRGOP,
+      p_Vid->delta_pic_order_cnt[0]=0;                        //ignore first delta
+      if(p_Vid->frame_num)
+        error("frame_num not equal to zero in IDR picture", -1020);
+    }
+    else
+    {
+      if (p_Vid->last_has_mmco_5)
+      {
+        p_Vid->PreviousFrameNumOffset = 0;
+        p_Vid->PreviousFrameNum = 0;
+      }
+      if (p_Vid->frame_num<p_Vid->PreviousFrameNum)
+      {             //not first pix of IDRGOP
+        p_Vid->FrameNumOffset = p_Vid->PreviousFrameNumOffset + p_Vid->MaxFrameNum;
+      }
+      else
+      {
+        p_Vid->FrameNumOffset = p_Vid->PreviousFrameNumOffset;
+      }
+    }
+
+    // 2nd: AbsFrameNum; stays 0 when the SPS defines no POC cycle,
+    // and is decremented by one for non-reference pictures
+    if(active_sps->num_ref_frames_in_pic_order_cnt_cycle)
+      p_Vid->AbsFrameNum = p_Vid->FrameNumOffset+p_Vid->frame_num;
+    else
+      p_Vid->AbsFrameNum=0;
+    if( (!p_Vid->nal_reference_idc) && p_Vid->AbsFrameNum > 0)
+      p_Vid->AbsFrameNum--;
+
+    // 3rd: sum of all per-frame offsets of one POC cycle
+    p_Vid->ExpectedDeltaPerPicOrderCntCycle=0;
+
+    if(active_sps->num_ref_frames_in_pic_order_cnt_cycle)
+    for(i=0;i<(int) active_sps->num_ref_frames_in_pic_order_cnt_cycle;i++)
+      p_Vid->ExpectedDeltaPerPicOrderCntCycle += active_sps->offset_for_ref_frame[i];
+
+    // AbsFrameNum can only be non-zero when num_ref_frames_in_pic_order_cnt_cycle is
+    // non-zero (see 2nd step), so the division and modulo below have a non-zero divisor
+    if(p_Vid->AbsFrameNum)
+    {
+      p_Vid->PicOrderCntCycleCnt = (p_Vid->AbsFrameNum-1)/active_sps->num_ref_frames_in_pic_order_cnt_cycle;
+      p_Vid->FrameNumInPicOrderCntCycle = (p_Vid->AbsFrameNum-1)%active_sps->num_ref_frames_in_pic_order_cnt_cycle;
+      p_Vid->ExpectedPicOrderCnt = p_Vid->PicOrderCntCycleCnt*p_Vid->ExpectedDeltaPerPicOrderCntCycle;
+      for(i=0;i<=(int)p_Vid->FrameNumInPicOrderCntCycle;i++)
+        p_Vid->ExpectedPicOrderCnt += active_sps->offset_for_ref_frame[i];
+    }
+    else
+      p_Vid->ExpectedPicOrderCnt=0;
+
+    if(!p_Vid->nal_reference_idc)
+      p_Vid->ExpectedPicOrderCnt += active_sps->offset_for_non_ref_pic;
+
+    // add the slice-level deltas to the expected POC
+    if(p_Vid->field_pic_flag==0)
+    {           //frame pix
+      p_Vid->toppoc = p_Vid->ExpectedPicOrderCnt + p_Vid->delta_pic_order_cnt[0];
+      p_Vid->bottompoc = p_Vid->toppoc + active_sps->offset_for_top_to_bottom_field + p_Vid->delta_pic_order_cnt[1];
+      p_Vid->ThisPOC = p_Vid->framepoc = (p_Vid->toppoc < p_Vid->bottompoc)? p_Vid->toppoc : p_Vid->bottompoc; // POC200301
+    }
+    else if (p_Vid->bottom_field_flag == FALSE)
+    {  //top field
+      p_Vid->ThisPOC = p_Vid->toppoc = p_Vid->ExpectedPicOrderCnt + p_Vid->delta_pic_order_cnt[0];
+    }
+    else
+    {  //bottom field
+      p_Vid->ThisPOC = p_Vid->bottompoc = p_Vid->ExpectedPicOrderCnt + active_sps->offset_for_top_to_bottom_field + p_Vid->delta_pic_order_cnt[0];
+    }
+    p_Vid->framepoc=p_Vid->ThisPOC;
+
+    // remember frame_num state for the wrap detection of the next picture
+    p_Vid->PreviousFrameNum=p_Vid->frame_num;
+    p_Vid->PreviousFrameNumOffset=p_Vid->FrameNumOffset;
+
+    break;
+
+
+  case 2: // POC MODE 2
+    // POC follows directly from the frame number: 2*AbsFrameNum for reference
+    // pictures, 2*AbsFrameNum - 1 for non-reference pictures
+    if(p_Vid->idr_flag) // IDR picture
+    {
+      p_Vid->FrameNumOffset=0;     //  first pix of IDRGOP,
+      p_Vid->ThisPOC = p_Vid->framepoc = p_Vid->toppoc = p_Vid->bottompoc = 0;
+      if(p_Vid->frame_num)
+        error("frame_num not equal to zero in IDR picture", -1020);
+    }
+    else
+    {
+      if (p_Vid->last_has_mmco_5)
+      {
+        p_Vid->PreviousFrameNum = 0;
+        p_Vid->PreviousFrameNumOffset = 0;
+      }
+      if (p_Vid->frame_num<p_Vid->PreviousFrameNum)
+        p_Vid->FrameNumOffset = p_Vid->PreviousFrameNumOffset + p_Vid->MaxFrameNum;
+      else
+        p_Vid->FrameNumOffset = p_Vid->PreviousFrameNumOffset;
+
+
+      p_Vid->AbsFrameNum = p_Vid->FrameNumOffset+p_Vid->frame_num;
+      if(!p_Vid->nal_reference_idc)
+        p_Vid->ThisPOC = (2*p_Vid->AbsFrameNum - 1);
+      else
+        p_Vid->ThisPOC = (2*p_Vid->AbsFrameNum);
+
+      // a field only updates the POC of its own parity
+      if (p_Vid->field_pic_flag==0)
+        p_Vid->toppoc = p_Vid->bottompoc = p_Vid->framepoc = p_Vid->ThisPOC;
+      else if (p_Vid->bottom_field_flag == FALSE)
+         p_Vid->toppoc = p_Vid->framepoc = p_Vid->ThisPOC;
+      else p_Vid->bottompoc = p_Vid->framepoc = p_Vid->ThisPOC;
+    }
+
+    p_Vid->PreviousFrameNum=p_Vid->frame_num;
+    p_Vid->PreviousFrameNumOffset=p_Vid->FrameNumOffset;
+    break;
+
+
+  default:
+    // unknown pic_order_cnt_type: must never happen (assert is a no-op when NDEBUG is defined)
+    assert( 1==0 );
+    break;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    A little helper for the debugging of POC code
+ * \return
+ *    none
+ ************************************************************************
+ */
+int dumppoc(VideoParameters *p_Vid) 
+{
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+  printf ("\nPOC locals...\n");
+  printf ("toppoc                                %d\n", (int) p_Vid->toppoc);
+  printf ("bottompoc                             %d\n", (int) p_Vid->bottompoc);
+  printf ("frame_num                             %d\n", (int) p_Vid->frame_num);
+  printf ("field_pic_flag                        %d\n", (int) p_Vid->field_pic_flag);
+  printf ("bottom_field_flag                     %d\n", (int) p_Vid->bottom_field_flag);
+  printf ("POC SPS\n");
+  printf ("log2_max_frame_num_minus4             %d\n", (int) active_sps->log2_max_frame_num_minus4);         // POC200301
+  printf ("log2_max_pic_order_cnt_lsb_minus4     %d\n", (int) active_sps->log2_max_pic_order_cnt_lsb_minus4);
+  printf ("pic_order_cnt_type                    %d\n", (int) active_sps->pic_order_cnt_type);
+  printf ("num_ref_frames_in_pic_order_cnt_cycle %d\n", (int) active_sps->num_ref_frames_in_pic_order_cnt_cycle);
+  printf ("delta_pic_order_always_zero_flag      %d\n", (int) active_sps->delta_pic_order_always_zero_flag);
+  printf ("offset_for_non_ref_pic                %d\n", (int) active_sps->offset_for_non_ref_pic);
+  printf ("offset_for_top_to_bottom_field        %d\n", (int) active_sps->offset_for_top_to_bottom_field);
+  printf ("offset_for_ref_frame[0]               %d\n", (int) active_sps->offset_for_ref_frame[0]);
+  printf ("offset_for_ref_frame[1]               %d\n", (int) active_sps->offset_for_ref_frame[1]);
+  printf ("POC in SLice Header\n");
+  printf ("bottom_field_pic_order_in_frame_present_flag                %d\n", (int) p_Vid->active_pps->bottom_field_pic_order_in_frame_present_flag);
+  printf ("delta_pic_order_cnt[0]                %d\n", (int) p_Vid->delta_pic_order_cnt[0]);
+  printf ("delta_pic_order_cnt[1]                %d\n", (int) p_Vid->delta_pic_order_cnt[1]);
+  printf ("delta_pic_order_cnt[2]                %d\n", (int) p_Vid->delta_pic_order_cnt[2]);
+  printf ("idr_flag                              %d\n", (int) p_Vid->idr_flag);
+  printf ("MaxFrameNum                           %d\n", (int) p_Vid->MaxFrameNum);
+
+  return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    return the poc of p_Vid as per (8-1) JVT-F100d2
+ *  POC200301
+ * \return
+ *    framepoc for a frame (field_pic_flag == 0), otherwise toppoc for a
+ *    top field and bottompoc for a bottom field
+ ************************************************************************
+ */
+int picture_order(VideoParameters *p_Vid)
+{
+  if (p_Vid->field_pic_flag != 0)
+  {
+    // field picture: choose by parity
+    if (p_Vid->bottom_field_flag != FALSE)
+      return p_Vid->bottompoc;
+
+    return p_Vid->toppoc;
+  }
+
+  // frame picture
+  return p_Vid->framepoc;
+}
+
diff --git a/Src/h264dec/ldecod/src/image.c b/Src/h264dec/ldecod/src/image.c
new file mode 100644
index 00000000..a75ff7ea
--- /dev/null
+++ b/Src/h264dec/ldecod/src/image.c
@@ -0,0 +1,1699 @@
+
+/*!
+ ***********************************************************************
+ * \file image.c
+ *
+ * \brief
+ *    Decode a Slice
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Inge Lille-Langoy               <inge.lille-langoy@telenor.com>
+ *    - Rickard Sjoberg                 <rickard.sjoberg@era.ericsson.se>
+ *    - Jani Lainema                    <jani.lainema@nokia.com>
+ *    - Sebastian Purreiter             <sebastian.purreiter@mch.siemens.de>
+ *    - Byeong-Moon Jeon                <jeonbm@lge.com>
+ *    - Thomas Wedi                     <wedi@tnt.uni-hannover.de>
+ *    - Gabi Blaettermann
+ *    - Ye-Kui Wang                     <wyk@ieee.org>
+ *    - Antti Hallapuro                 <antti.hallapuro@nokia.com>
+ *    - Alexis Tourapis                 <alexismt@ieee.org>
+ *    - Jill Boyce                      <jill.boyce@thomson.net>
+ *    - Saurav K Bandyopadhyay          <saurav@ieee.org>
+ *    - Zhenyu Wu                       <Zhenyu.Wu@thomson.net>
+ *    - Purvin Pandit                   <Purvin.Pandit@thomson.net>
+ *
+ ***********************************************************************
+ */
+
+#include "contributors.h"
+
+#include <math.h>
+#include <limits.h>
+
+#include "global.h"
+#include "image.h"
+#include "fmo.h"
+#include "nalu.h"
+#include "parset.h"
+#include "header.h"
+
+#include "sei.h"
+#include "output.h"
+#include "mb_access.h"
+#include "memalloc.h"
+#include "macroblock.h"
+
+#include "loopfilter.h"
+
+#include "biaridecod.h"
+#include "context_ini.h"
+#include "cabac.h"
+#include "vlc.h"
+#include "quant.h"
+
+#include "errorconcealment.h"
+#include "erc_api.h"
+
+/*!
+ ************************************************************************
+ * \brief
+ *      Detects the byte order of the host by inspecting the first byte
+ *      in memory of a short holding the value 1
+ * \return
+ *      0, little-endian (e.g. Intel architectures)
+ *      1, big-endian (e.g. SPARC, MIPS, PowerPC)
+ ************************************************************************
+ */
+int testEndian(void)
+{
+  short probe = 1;
+  byte *first_byte = (byte*)&probe;
+
+  // on a big-endian host the lowest-addressed byte of the value 1 is zero
+  if (first_byte[0] == 0)
+    return 1;
+
+  return 0;
+}
+
+static int read_new_slice(Slice *currSlice, uint64_t time_code);
+/*!
+ ************************************************************************
+ * \brief
+ *    Initializes the parameters for a new picture
+ *
+ *    Finishes a still-pending picture, handles frame_num gaps, computes the
+ *    POC, allocates p_Vid->dec_picture and copies the per-picture state
+ *    (POCs, QP, marking commands, cropping, ...) into it, resets error
+ *    concealment and the per-macroblock bookkeeping, and selects the
+ *    MBAFF / non-MBAFF neighbour access functions.
+ *
+ * \param p_Vid
+ *    decoder state
+ * \param currSlice
+ *    first slice of the new picture (only slice_qp_delta is read here)
+ * \param p_Inp
+ *    input parameters (not referenced in this function)
+ * \param time_code
+ *    user-passed value, stored unchanged in dec_picture->time_code
+ ************************************************************************
+ */
+// benski> time_code is user-passed value
+static void init_picture(VideoParameters *p_Vid, Slice *currSlice, InputParameters *p_Inp, uint64_t time_code)
+{
+  int i;
+  int nplane;
+
+  if (p_Vid->dec_picture)
+  {
+    // this may only happen on slice loss
+    exit_picture(p_Vid, &p_Vid->dec_picture);
+  }
+  // frame_num at which recovery is reached (modulo MaxFrameNum); an IDR picture recovers immediately
+  if (p_Vid->recovery_point)
+    p_Vid->recovery_frame_num = (p_Vid->frame_num + p_Vid->recovery_frame_cnt) % p_Vid->MaxFrameNum;
+
+  if (p_Vid->idr_flag)
+    p_Vid->recovery_frame_num = p_Vid->frame_num;
+
+  // frame_num is neither the previous reference frame_num nor its successor: a gap.
+  // pre_frame_num == INT_MIN is skipped (presumably "no previous frame yet" -- confirm against the initialization)
+  if (p_Vid->recovery_point == 0 &&
+		p_Vid->pre_frame_num != INT_MIN &&
+    p_Vid->frame_num != p_Vid->pre_frame_num &&
+    p_Vid->frame_num != (p_Vid->pre_frame_num + 1) % p_Vid->MaxFrameNum)
+  {
+    if (p_Vid->active_sps->gaps_in_frame_num_value_allowed_flag == 0)
+    {
+         /* Advanced Error Concealment would be called here to combat unintentional loss of pictures. */
+        error("An unintentional loss of pictures occurs! Exit\n", 100);
+      
+    }
+      // NOTE(review): this call is outside the if above, so it also runs for
+      // disallowed gaps whenever error() returns -- confirm that is intended
+      fill_frame_num_gap(p_Vid);
+  }
+
+  if(p_Vid->nal_reference_idc)
+  {
+    p_Vid->pre_frame_num = p_Vid->frame_num;
+  }
+
+  p_Vid->num_dec_mb = 0;
+
+  //calculate POC
+  decode_poc(p_Vid);
+
+  // 0x7fffffff marks recovery_poc as not yet set
+  if (p_Vid->recovery_frame_num == p_Vid->frame_num &&
+    p_Vid->recovery_poc == 0x7fffffff)
+    p_Vid->recovery_poc = p_Vid->framepoc;
+
+  if(p_Vid->nal_reference_idc)
+    p_Vid->last_ref_pic_poc = p_Vid->framepoc;
+
+  //  dumppoc (p_Vid);
+
+  // NOTE(review): the result of alloc_storable_picture() is dereferenced without a NULL
+  // check here -- confirm it cannot return NULL
+  p_Vid->dec_picture = alloc_storable_picture (p_Vid, (PictureStructure) p_Vid->structure, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+	p_Vid->dec_picture->time_code = time_code;
+  p_Vid->dec_picture->top_poc=p_Vid->toppoc;
+  p_Vid->dec_picture->bottom_poc=p_Vid->bottompoc;
+  p_Vid->dec_picture->frame_poc=p_Vid->framepoc;
+  p_Vid->dec_picture->qp = p_Vid->qp;
+  p_Vid->dec_picture->slice_qp_delta = currSlice->slice_qp_delta;
+  p_Vid->dec_picture->chroma_qp_offset[0] = p_Vid->active_pps->chroma_qp_index_offset;
+  p_Vid->dec_picture->chroma_qp_offset[1] = p_Vid->active_pps->second_chroma_qp_index_offset;
+
+  // reset all variables of the error concealment instance before decoding of every frame.
+  // here the third parameter should, if perfectly, be equal to the number of slices per frame.
+  // using little value is ok, the code will allocate more memory if the slice number is larger
+  ercReset(p_Vid->erc_errorVar, p_Vid->PicSizeInMbs, p_Vid->PicSizeInMbs, p_Vid->dec_picture->size_x);
+  p_Vid->erc_mvperMB = 0;
+
+  // pick the POC matching the picture structure; fields also renumber p_Vid->number
+  // (top field: 2n, bottom field: 2n + 1)
+  switch (p_Vid->structure )
+  {
+  case TOP_FIELD:
+    {
+      p_Vid->dec_picture->poc=p_Vid->toppoc;
+      p_Vid->number *= 2;
+      break;
+    }
+  case BOTTOM_FIELD:
+    {
+      p_Vid->dec_picture->poc=p_Vid->bottompoc;
+      p_Vid->number = p_Vid->number * 2 + 1;
+      break;
+    }
+  case FRAME:
+    {
+      p_Vid->dec_picture->poc=p_Vid->framepoc;
+      break;
+    }
+  default:
+    error("p_Vid->structure not initialized", 235);
+  }
+
+  p_Vid->current_slice_nr=0;
+
+  // slice types above SI_SLICE are flagged for concealment and handled as P slices
+  if (p_Vid->type > SI_SLICE)
+  {
+    set_ec_flag(p_Vid, SE_PTYPE);
+    p_Vid->type = P_SLICE;  // concealed element
+  }
+
+  // CAVLC init
+  if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+  {
+    memset(p_Vid->nz_coeff[0], -1, p_Vid->PicSizeInMbs * 48 *sizeof(byte)); // 3 * 4 * 4
+  }
+
+  // with constrained intra prediction every macroblock starts out marked as intra
+  if(p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0; i<(int)p_Vid->PicSizeInMbs; ++i)
+    {
+      p_Vid->intra_block[i] = 1;
+    }
+  }
+
+  // Set the slice_nr member of each MB to -1, so that packet loss is handled correctly
+  // Initialize the Macroblock Map (mark all MBs as 'have to be concealed')
+  if( IS_INDEPENDENT(p_Vid) )
+  {
+    for( nplane=0; nplane<MAX_PLANE; ++nplane )
+    {      
+      for(i=0; i<(int)p_Vid->PicSizeInMbs; ++i)
+      {
+        p_Vid->mb_data_JV[nplane][i].slice_nr = -1; 
+        p_Vid->mb_data_JV[nplane][i].ei_flag = 1;
+        p_Vid->mb_data_JV[nplane][i].dpl_flag = 0;
+      }
+    }
+  }
+  else
+  {
+    for(i=0; i<(int)p_Vid->PicSizeInMbs; ++i)
+    {
+      p_Vid->mb_data[i].slice_nr = -1; 
+      p_Vid->mb_data[i].ei_flag = 1;
+      p_Vid->mb_data[i].dpl_flag = 0;
+    }
+  }
+
+  p_Vid->dec_picture->slice_type = p_Vid->type;
+  p_Vid->dec_picture->used_for_reference = (p_Vid->nal_reference_idc != 0);
+  p_Vid->dec_picture->idr_flag = p_Vid->idr_flag;
+  p_Vid->dec_picture->no_output_of_prior_pics_flag = p_Vid->no_output_of_prior_pics_flag;
+  p_Vid->dec_picture->long_term_reference_flag = p_Vid->long_term_reference_flag;
+  p_Vid->dec_picture->adaptive_ref_pic_buffering_flag = p_Vid->adaptive_ref_pic_buffering_flag;
+
+  // hand the queued marking commands over to the picture; p_Vid no longer references them
+  p_Vid->dec_picture->dec_ref_pic_marking_buffer = p_Vid->dec_ref_pic_marking_buffer;
+  p_Vid->dec_ref_pic_marking_buffer = NULL;
+
+  p_Vid->dec_picture->mb_aff_frame_flag = p_Vid->mb_aff_frame_flag;
+  p_Vid->dec_picture->PicWidthInMbs = p_Vid->PicWidthInMbs;
+
+	// select the neighbour-access function set; in the MBAFF case several slots
+	// share one general routine instead of a specialized one
+	if (p_Vid->dec_picture->mb_aff_frame_flag)
+	{
+		p_Vid->get_mb_block_pos = get_mb_block_pos_mbaff;
+		p_Vid->getNeighbour = getAffNeighbour;
+		p_Vid->getNeighbourXP_NoPos = getAffNeighbour;
+		p_Vid->getNeighbourPX_NoPos = getAffNeighbour;
+		p_Vid->getNeighbourLuma = getAffNeighbourLuma;
+		p_Vid->getNeighbourPXLuma = getAffNeighbourLuma;
+		p_Vid->getNeighbourXPLuma = getAffNeighbourXPLuma;
+		p_Vid->getNeighbourLeftLuma = getAffNeighbourN0Luma;
+		p_Vid->getNeighbourNXLuma = getAffNeighbourNXLuma;
+		p_Vid->getNeighbourLeft = getAffNeighbourN0;
+		p_Vid->getNeighbourUp = getAffNeighbour0N;
+		p_Vid->getNeighbourUpLuma = getAffNeighbour0NLuma;
+		p_Vid->getNeighbourNX = getAffNeighbourNX;
+		p_Vid->getNeighbourNP = getAffNeighbourNX;
+		p_Vid->getNeighbourNPChromaNB = getAffNeighbourNX;
+		p_Vid->getNeighbour0X = getAffNeighbour0X;
+		p_Vid->getNeighbour0XLuma = getAffNeighbour0XLuma;
+		p_Vid->getNeighbourX0 = getAffNeighbourX0;
+		p_Vid->getNeighbourNPLumaNB = getAffNeighbourNPLuma;
+		p_Vid->getNeighbourPXLumaNB = getAffNeighbourPXLumaNB;
+		p_Vid->getNeighbourPXLumaNB_NoPos = getAffNeighbourPXLumaNB_NoPos;
+		p_Vid->getNeighbourXPLumaNB = getAffNeighbourXPLuma;		
+		p_Vid->getNeighbourPPLumaNB = getAffNeighbourPPLumaNB;		
+		p_Vid->getNeighbourXPLumaNB_NoPos = getAffNeighbourXPLuma;
+	}
+	else
+	{
+		p_Vid->get_mb_block_pos = get_mb_block_pos_normal;
+		p_Vid->getNeighbour = getNonAffNeighbour;
+		p_Vid->getNeighbourXP_NoPos = getNonAffNeighbourXP_NoPos;
+		p_Vid->getNeighbourPX_NoPos = getNonAffNeighbourPX_NoPos;
+		p_Vid->getNeighbourLuma = getNonAffNeighbourLuma;
+		p_Vid->getNeighbourPXLuma = getNonAffNeighbourPXLuma;
+		p_Vid->getNeighbourXPLuma = getNonAffNeighbourXPLuma;
+		p_Vid->getNeighbourLeftLuma = getNonAffNeighbourN0Luma;
+		p_Vid->getNeighbourNXLuma = getNonAffNeighbourNXLuma;
+		p_Vid->getNeighbourLeft = getNonAffNeighbourN0;
+		p_Vid->getNeighbourUp = getNonAffNeighbour0N;
+		p_Vid->getNeighbourUpLuma = getNonAffNeighbour0NLuma;
+		p_Vid->getNeighbourNX = getNonAffNeighbourNX;
+		p_Vid->getNeighbourNP = getNonAffNeighbourNP;
+		p_Vid->getNeighbourNPChromaNB = getNonAffNeighbourNPChromaNB;
+		p_Vid->getNeighbour0X = getNonAffNeighbour0X;
+		p_Vid->getNeighbour0XLuma = getNonAffNeighbour0XLuma;
+		p_Vid->getNeighbourX0 = getNonAffNeighbourX0;
+		p_Vid->getNeighbourNPLumaNB = getNonAffNeighbourNPLumaNB;
+		p_Vid->getNeighbourPXLumaNB = getNonAffNeighbourPXLumaNB;
+		p_Vid->getNeighbourPXLumaNB_NoPos = getNonAffNeighbourPXLumaNB_NoPos;
+		p_Vid->getNeighbourXPLumaNB = getNonAffNeighbourXPLumaNB;
+		p_Vid->getNeighbourPPLumaNB = getNonAffNeighbourPPLumaNB;
+		p_Vid->getNeighbourXPLumaNB_NoPos = getNonAffNeighbourXPLumaNB_NoPos;
+	}
+
+  p_Vid->dec_picture->pic_num = p_Vid->frame_num;
+  p_Vid->dec_picture->frame_num = p_Vid->frame_num;
+
+  p_Vid->dec_picture->recovery_frame = (unsigned int) (p_Vid->frame_num == p_Vid->recovery_frame_num);
+
+  p_Vid->dec_picture->coded_frame = (p_Vid->structure==FRAME);
+
+  p_Vid->dec_picture->chroma_format_idc = p_Vid->active_sps->chroma_format_idc;
+
+  p_Vid->dec_picture->frame_mbs_only_flag = p_Vid->active_sps->frame_mbs_only_flag;
+  p_Vid->dec_picture->frame_cropping_flag = p_Vid->active_sps->frame_cropping_flag;
+
+  // the cropping rectangle is only copied when the SPS enables cropping
+  if (p_Vid->dec_picture->frame_cropping_flag)
+  {
+    p_Vid->dec_picture->frame_cropping_rect_left_offset   = p_Vid->active_sps->frame_cropping_rect_left_offset;
+    p_Vid->dec_picture->frame_cropping_rect_right_offset  = p_Vid->active_sps->frame_cropping_rect_right_offset;
+    p_Vid->dec_picture->frame_cropping_rect_top_offset    = p_Vid->active_sps->frame_cropping_rect_top_offset;
+    p_Vid->dec_picture->frame_cropping_rect_bottom_offset = p_Vid->active_sps->frame_cropping_rect_bottom_offset;
+  }
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  // store the necessary tone mapping sei into StorablePicture structure
+  p_Vid->dec_picture->seiHasTone_mapping = 0;
+
+  if (p_Vid->seiToneMapping->seiHasTone_mapping)
+  {
+    p_Vid->dec_picture->seiHasTone_mapping    = 1;
+    p_Vid->dec_picture->tone_mapping_model_id = p_Vid->seiToneMapping->model_id;
+    p_Vid->dec_picture->tonemapped_bit_depth  = p_Vid->seiToneMapping->sei_bit_depth;
+    // NOTE(review): the buffer is sized with sizeof(int) but filled with sizeof(imgpel)
+    // elements below -- harmless only while sizeof(imgpel) <= sizeof(int); confirm
+    p_Vid->dec_picture->tone_mapping_lut      = malloc(sizeof(int)*(1<<p_Vid->seiToneMapping->coded_data_bit_depth));
+    if (NULL == p_Vid->dec_picture->tone_mapping_lut)
+    {
+      no_mem_exit("init_picture: tone_mapping_lut");
+    }
+    memcpy(p_Vid->dec_picture->tone_mapping_lut, p_Vid->seiToneMapping->lut, sizeof(imgpel)*(1<<p_Vid->seiToneMapping->coded_data_bit_depth));
+    update_tone_mapping_sei(p_Vid->seiToneMapping);
+  }
+#endif
+
+  // independent colour planes: plane 0 reuses dec_picture, planes 1 and 2 get their own
+  // pictures, each passed through copy_dec_picture_JV() with plane 0 as the source
+  if( IS_INDEPENDENT(p_Vid) )
+  {
+    p_Vid->dec_picture_JV[0] = p_Vid->dec_picture;
+    p_Vid->dec_picture_JV[1] = alloc_storable_picture (p_Vid, (PictureStructure) p_Vid->structure, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+    copy_dec_picture_JV( p_Vid, p_Vid->dec_picture_JV[1], p_Vid->dec_picture_JV[0] );
+    p_Vid->dec_picture_JV[2] = alloc_storable_picture (p_Vid, (PictureStructure) p_Vid->structure, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+    copy_dec_picture_JV( p_Vid, p_Vid->dec_picture_JV[2], p_Vid->dec_picture_JV[0] );
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Re-interleave the rows of field-coded macroblock pairs in the
+ *    decoded picture.
+ *
+ *    For every even macroblock index whose motion.mb_field entry is set,
+ *    the 2*MB_BLOCK_SIZE luma rows of the pair are copied to a scratch
+ *    buffer and written back so that scratch row y lands on picture row
+ *    2*y and scratch row y+MB_BLOCK_SIZE on picture row 2*y+1.  Unless the
+ *    picture is YUV400, the same is done for both chroma planes using the
+ *    chroma macroblock dimensions.
+ *
+ * \param p_Vid
+ *    decoder state; the sample arrays of p_Vid->dec_picture are modified
+ *    in place
+ ************************************************************************
+ */
+void MbAffPostProc(VideoParameters *p_Vid)
+{
+  imgpel temp[32][16];
+
+  StorablePicture *dec_picture = p_Vid->dec_picture;
+  imgpel ** imgY  = dec_picture->imgY->img;
+  imgpel **imgUV[2] = {dec_picture->imgUV[0]->img, dec_picture->imgUV[1]->img};
+
+  short x0, y0;   // sample position of the pair, filled in by get_mb_pos()
+  int i, y, uv;   // int counters: a short macroblock index wraps once PicSizeInMbs exceeds 32767
+
+  for (i=0; i<(int)dec_picture->PicSizeInMbs; i+=2)
+  {
+    if (dec_picture->motion.mb_field[i])
+    {
+      get_mb_pos(p_Vid, i, p_Vid->mb_size[IS_LUMA], &x0, &y0);
+
+      // save the pair, then write the two halves back on alternating rows
+      for (y=0; y<(2*MB_BLOCK_SIZE);++y)
+        memcpy(temp[y], &imgY[y0+y][x0], MB_BLOCK_SIZE * sizeof(imgpel));
+
+      for (y=0; y<MB_BLOCK_SIZE; ++y)
+      {
+         memcpy(&imgY[y0+(2*y  )][x0], temp[y              ], MB_BLOCK_SIZE * sizeof(imgpel));
+         memcpy(&imgY[y0+(2*y+1)][x0], temp[y+MB_BLOCK_SIZE], MB_BLOCK_SIZE * sizeof(imgpel));
+      }
+
+
+      if (dec_picture->chroma_format_idc != YUV400)
+      {
+        // convert the luma position to chroma sample coordinates
+        x0 = (short) (x0 / (16/p_Vid->mb_cr_size_x));
+        y0 = (short) (y0 / (16/p_Vid->mb_cr_size_y));
+
+        for (uv=0; uv<2; ++uv)
+        {
+          for (y=0; y<(2*p_Vid->mb_cr_size_y);++y)
+            memcpy(temp[y], &imgUV[uv][y0+y][x0], p_Vid->mb_cr_size_x * sizeof(imgpel));
+
+          for (y=0; y<p_Vid->mb_cr_size_y;++y)
+          {
+            memcpy(&imgUV[uv][y0+(2*y  )][x0], temp[y                  ], p_Vid->mb_cr_size_x * sizeof(imgpel));
+            memcpy(&imgUV[uv][y0+(2*y+1)][x0], temp[y+p_Vid->mb_cr_size_y], p_Vid->mb_cr_size_x * sizeof(imgpel));
+          }
+        }
+      }
+    }
+  }
+}
+
+static void fill_wp_params(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  
+  int i, j, k;
+  int comp;
+  int log_weight_denom;
+  int tb, td;
+  int tx,DistScaleFactor;
+
+  int max_l0_ref = currSlice->num_ref_idx_l0_active;
+  int max_l1_ref = currSlice->num_ref_idx_l1_active;
+
+  if (p_Vid->active_pps->weighted_bipred_idc == 2)
+  {
+    currSlice->luma_log2_weight_denom = 5;
+    currSlice->chroma_log2_weight_denom = 5;
+    currSlice->wp_round_luma   = 16;
+    currSlice->wp_round_chroma = 16;
+
+    for (i=0; i<MAX_REFERENCE_PICTURES; ++i)
+    {
+      for (comp=0; comp<3; ++comp)
+      {
+        log_weight_denom = (comp == 0) ? currSlice->luma_log2_weight_denom : currSlice->chroma_log2_weight_denom;
+        currSlice->wp_weight[0][i][comp] = 1<<log_weight_denom;
+        currSlice->wp_weight[1][i][comp] = 1<<log_weight_denom;
+        currSlice->wp_offset[0][i][comp] = 0;
+        currSlice->wp_offset[1][i][comp] = 0;
+      }
+    }
+  }
+
+ 
+    for (i=0; i<max_l0_ref; ++i)
+    {
+      for (j=0; j<max_l1_ref; ++j)
+      {
+        for (comp = 0; comp<3; ++comp)
+        {
+          log_weight_denom = (comp == 0) ? currSlice->luma_log2_weight_denom : currSlice->chroma_log2_weight_denom;
+          if (p_Vid->active_pps->weighted_bipred_idc == 1)
+          {
+            currSlice->wbp_weight[0][i][j][comp] =  currSlice->wp_weight[0][i][comp];
+            currSlice->wbp_weight[1][i][j][comp] =  currSlice->wp_weight[1][j][comp];
+          }
+          else if (p_Vid->active_pps->weighted_bipred_idc == 2)
+          {
+            td = iClip3(-128,127,p_Vid->listX[LIST_1][j]->poc - p_Vid->listX[LIST_0][i]->poc);
+            if (td == 0 || p_Vid->listX[LIST_1][j]->is_long_term || p_Vid->listX[LIST_0][i]->is_long_term)
+            {
+              currSlice->wbp_weight[0][i][j][comp] = 32;
+              currSlice->wbp_weight[1][i][j][comp] = 32;
+            }
+            else
+            {
+              tb = iClip3(-128,127,p_Vid->ThisPOC - p_Vid->listX[LIST_0][i]->poc);
+
+              tx = (16384 + iabs(td/2))/td;
+              DistScaleFactor = iClip3(-1024, 1023, (tx*tb + 32 )>>6);
+
+              currSlice->wbp_weight[1][i][j][comp] = DistScaleFactor >> 2;
+              currSlice->wbp_weight[0][i][j][comp] = 64 - currSlice->wbp_weight[1][i][j][comp];
+              if (currSlice->wbp_weight[1][i][j][comp] < -64 || currSlice->wbp_weight[1][i][j][comp] > 128)
+              {
+                currSlice->wbp_weight[0][i][j][comp] = 32;
+                currSlice->wbp_weight[1][i][j][comp] = 32;
+                currSlice->wp_offset[0][i][comp] = 0;
+                currSlice->wp_offset[1][j][comp] = 0;
+              }
+            }
+          }
+        }
+      }
+   }
+ 
+
+  if (currSlice->mb_aff_frame_flag)
+  {
+    for (i=0; i<2*max_l0_ref; ++i)
+    {
+      for (j=0; j<2*max_l1_ref; ++j)
+      {
+        for (comp = 0; comp<3; ++comp)
+        {
+          for (k=2; k<6; k+=2)
+          {
+            currSlice->wp_offset[k+0][i][comp] = currSlice->wp_offset[0][i>>1][comp];
+            currSlice->wp_offset[k+1][j][comp] = currSlice->wp_offset[1][j>>1][comp];
+
+            log_weight_denom = (comp == 0) ? currSlice->luma_log2_weight_denom : currSlice->chroma_log2_weight_denom;
+            if (p_Vid->active_pps->weighted_bipred_idc == 1)
+            {
+              currSlice->wbp_weight[k+0][i][j][comp] =  currSlice->wp_weight[0][i>>1][comp];
+              currSlice->wbp_weight[k+1][i][j][comp] =  currSlice->wp_weight[1][j>>1][comp];
+            }
+            else if (p_Vid->active_pps->weighted_bipred_idc == 2)
+            {
+              td = iClip3(-128,127,p_Vid->listX[k+LIST_1][j]->poc - p_Vid->listX[k+LIST_0][i]->poc);
+              if (td == 0 || p_Vid->listX[k+LIST_1][j]->is_long_term || p_Vid->listX[k+LIST_0][i]->is_long_term)
+              {
+                currSlice->wbp_weight[k+0][i][j][comp] =   32;
+                currSlice->wbp_weight[k+1][i][j][comp] =   32;
+              }
+              else
+              {
+                tb = iClip3(-128,127,((k==2)?p_Vid->toppoc:p_Vid->bottompoc) - p_Vid->listX[k+LIST_0][i]->poc);
+
+                tx = (16384 + iabs(td/2))/td;
+                DistScaleFactor = iClip3(-1024, 1023, (tx*tb + 32 )>>6);
+
+                currSlice->wbp_weight[k+1][i][j][comp] = DistScaleFactor >> 2;
+                currSlice->wbp_weight[k+0][i][j][comp] = 64 - currSlice->wbp_weight[k+1][i][j][comp];
+                if (currSlice->wbp_weight[k+1][i][j][comp] < -64 || currSlice->wbp_weight[k+1][i][j][comp] > 128)
+                {
+                  currSlice->wbp_weight[k+1][i][j][comp] = 32;
+                  currSlice->wbp_weight[k+0][i][j][comp] = 32;
+                  currSlice->wp_offset[k+0][i][comp] = 0;
+                  currSlice->wp_offset[k+1][j][comp] = 0;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+static void decode_slice(Slice *currSlice, int current_header)
+{
+  // Prepares per-slice decoder state, then decodes the slice data itself.
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  int needs_wp;
+
+  if (p_Vid->active_pps->entropy_coding_mode_flag)
+  {
+    init_contexts(currSlice);   // only done when entropy_coding_mode_flag is set
+    cabac_new_slice(currSlice);
+  }
+
+  // weight tables: non-I slice with weighted_pred_flag, or B slice with weighted_bipred_idc > 0
+  needs_wp = (p_Vid->active_pps->weighted_pred_flag && currSlice->slice_type != I_SLICE);
+  if (!needs_wp)
+    needs_wp = (currSlice->slice_type == B_SLICE && p_Vid->active_pps->weighted_bipred_idc > 0);
+  if (needs_wp)
+    fill_wp_params(currSlice);
+
+  // slice data is decoded only when the header was reported as SOP or SOS
+  if (current_header == SOP || current_header == SOS)
+    decode_one_slice(currSlice);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Error tracking for primary and redundant slices: updates
+ *    Is_primary_correct / Is_redundant_correct from the ref_flag history.
+ * \param p_Vid
+ *    decoder state (redundant_pic_cnt, type, ref_flag, currentSlice are read)
+ ************************************************************************
+ */
+static void Error_tracking(VideoParameters *p_Vid)
+{
+  Slice *currSlice = p_Vid->currentSlice;
+  int ref;
+
+  if (p_Vid->redundant_pic_cnt == 0)
+  {
+    p_Vid->Is_redundant_correct = 1;   // a primary slice starts with both flags set
+    p_Vid->Is_primary_correct   = 1;
+  }
+
+  if (p_Vid->type == I_SLICE)   // no reference flags are examined for I slices
+    return;
+  if (p_Vid->redundant_pic_cnt != 0)
+  {
+    // redundant slice: only the flag at redundant_slice_ref_idx matters
+    if (p_Vid->ref_flag[p_Vid->redundant_slice_ref_idx] == 0)
+      p_Vid->Is_redundant_correct = 0;
+    return;
+  }
+
+  // primary slice: any cleared flag among the first num_ref_idx_l0_active entries marks it incorrect
+  for (ref = 0; ref < currSlice->num_ref_idx_l0_active; ++ref)
+    if (p_Vid->ref_flag[ref] == 0)
+      p_Vid->Is_primary_correct = 0;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Slice decoding loop: fetches slices with read_new_slice() and
+ *    decodes each one until read_new_slice() reports EOS.
+ * \param p_Vid
+ *    decoder state
+ * \param time_code
+ *    caller-supplied value passed through to read_new_slice()
+ * \return
+ *    EOS (the loop has no other exit)
+ ***********************************************************************
+ */
+int decode_one_frame(VideoParameters *p_Vid, uint64_t time_code)
+{
+  InputParameters *p_Inp = p_Vid->p_Inp;
+
+  for (;;)
+  {
+    Slice *currSlice = p_Vid->currentSlice;
+    int current_header;
+    int same_frame_redundant;
+    int idx;
+
+    currSlice->p_Vid = p_Vid;
+    currSlice->p_Inp = p_Inp;
+
+    current_header = read_new_slice(currSlice, time_code);
+    if (current_header == EOS)
+      return EOS;
+
+    // error tracking of primary and redundant slices
+    Error_tracking(p_Vid);
+
+    // a redundant slice carrying the same frame_num as the previous slice
+    same_frame_redundant = (p_Vid->redundant_pic_cnt != 0 && p_Vid->previous_frame_num == p_Vid->frame_num);
+    // the primary slice is correct, so the redundant one is discarded
+    if (same_frame_redundant && p_Vid->Is_primary_correct != 0)
+      continue;
+
+    // shift the reference flag history by one, except when the redundant slice replaces the primary
+    if (!same_frame_redundant)
+    {
+      idx = 16;
+      while (idx > 0)
+      {
+        p_Vid->ref_flag[idx] = p_Vid->ref_flag[idx - 1];
+        --idx;
+      }
+    }
+    if (p_Vid->redundant_pic_cnt == 0)
+      p_Vid->ref_flag[0] = p_Vid->Is_primary_correct;
+    else
+      p_Vid->ref_flag[0] = p_Vid->Is_redundant_correct;
+    p_Vid->previous_frame_num = p_Vid->frame_num;
+
+    // chroma_format_idc 0 and 3 use the "other" CBP mapping, everything else the "normal" one
+    switch (p_Vid->active_sps->chroma_format_idc)
+    {
+    case 0:
+    case 3:
+      currSlice->linfo_cbp_intra = linfo_cbp_intra_other;
+      currSlice->linfo_cbp_inter = linfo_cbp_inter_other;
+      break;
+    default:
+      currSlice->linfo_cbp_intra = linfo_cbp_intra_normal;
+      currSlice->linfo_cbp_inter = linfo_cbp_inter_normal;
+      break;
+    }
+    decode_slice(currSlice, current_header);
+    ++(p_Vid->current_slice_nr);
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Convert file read buffer to source picture structure
+ * \param imgX
+ *    Pointer to image plane (row pointers, written as imgX[j][i])
+ * \param buf
+ *    Raw samples in row order, least significant byte of a sample first
+ * \param size_x
+ *    horizontal image size in pixel
+ * \param size_y
+ *    vertical image size in pixel
+ * \param symbol_size_in_bytes
+ *    number of bytes used per pel; error() is called if it exceeds sizeof(imgpel)
+ ************************************************************************
+ */
+void buffer2img (imgpel** imgX, unsigned char* buf, int size_x, int size_y, int symbol_size_in_bytes)
+{
+  int i,j;
+
+  uint16 tmp16, ui16;
+  unsigned long ui32;
+
+  if (symbol_size_in_bytes> sizeof(imgpel))
+  {
+    error ("Source picture has higher bit depth than imgpel data type. \nPlease recompile with larger data type for imgpel.", 500);
+  }
+
+  if (( sizeof(char) == sizeof (imgpel)) && ( sizeof(char) == symbol_size_in_bytes))
+  {
+    // imgpel == pixel_in_file == 1 byte -> one copy of size_x*size_y bytes starting at imgX[0][0] (relies on contiguous rows)
+    memcpy(&imgX[0][0], buf, size_x * size_y);
+  }
+  else
+  {
+    // sizeof (imgpel) > sizeof(char)
+    if (testEndian())
+    {
+      // big endian host: the sample bytes have to be put into host order
+      switch (symbol_size_in_bytes)
+      {
+      case 1:
+        {
+          for(j = 0; j < size_y; ++j)
+            for(i = 0; i < size_x; ++i)
+            {
+              imgX[j][i]= buf[i+j*size_x];
+            }
+          break;
+        }
+      case 2:
+        {
+          for(j=0;j<size_y;++j)
+            for(i=0;i<size_x;++i)
+            {
+              memcpy(&tmp16, buf+((i+j*size_x)*2), 2);
+              ui16  = (uint16) ((tmp16 >> 8) | ((tmp16&0xFF)<<8));
+              imgX[j][i] = (imgpel) ui16;
+            }
+          break;
+        }
+      case 4:   // assembled byte by byte so the value does not depend on sizeof(unsigned long)
+        {
+          for(j=0;j<size_y;++j)
+            for(i=0;i<size_x;++i)
+            {
+              const unsigned char *src = buf+((i+j*size_x)*4);
+              ui32  = ((unsigned long) src[0]) | ((unsigned long) src[1] << 8) | ((unsigned long) src[2] << 16) | ((unsigned long) src[3] << 24);
+              imgX[j][i] = (imgpel) ui32;
+            }
+          break;   // without this, valid 32 bit input fell through into the error branch below
+        }
+      default:
+        {
+           error ("reading only from formats of 8, 16 or 32 bit allowed on big endian architecture", 500);
+           break;
+        }
+      }
+    }
+    else
+    {
+      // little endian host: buffer byte order already matches the host
+      if (symbol_size_in_bytes == 1)
+      {
+        for (j=0; j < size_y; ++j)
+        {
+          for (i=0; i < size_x; ++i)
+          {
+            imgX[j][i]=*(buf++);
+          }
+        }
+      }
+      else
+      {
+        for (j=0; j < size_y; ++j)
+        {
+          int jpos = j*size_x;
+          for (i=0; i < size_x; ++i)
+          {
+            imgX[j][i]=0;   // clear first: the copy below may fill only part of the imgpel
+            memcpy(&(imgX[j][i]), buf +((i+jpos)*symbol_size_in_bytes), symbol_size_in_bytes);
+          }
+        }
+      }
+
+    }
+  }
+}
+
+void reorder_lists(Slice *currSlice)
+{
+  // Applies the slice's reference list reordering commands to listX[0] (every slice
+  // except I/SI) and listX[1] (B slices), checks the last active entry of each list
+  // and finally releases the reordering command buffers.
+  VideoParameters *p_Vid = currSlice->p_Vid;
+
+  if (currSlice->slice_type != I_SLICE && currSlice->slice_type != SI_SLICE)
+  {
+    if (currSlice->ref_pic_list_reordering_flag_l0)
+      reorder_ref_pic_list(p_Vid, p_Vid->listX[0], &p_Vid->listXsize[0],
+                           currSlice->num_ref_idx_l0_active - 1,
+                           currSlice->reordering_of_pic_nums_idc_l0,
+                           currSlice->abs_diff_pic_num_minus1_l0,
+                           currSlice->long_term_pic_idx_l0);
+    if (p_Vid->listX[0][currSlice->num_ref_idx_l0_active-1] == p_Vid->no_reference_picture)
+    {
+      // goes through error() unless the stream is flagged non-conforming, then only a message is printed
+      if (!p_Vid->non_conforming_stream)
+        error("RefPicList0[ num_ref_idx_l0_active_minus1 ] is equal to 'no reference picture', invalid bitstream",500);
+      else
+        printf("RefPicList0[ num_ref_idx_l0_active_minus1 ] is equal to 'no reference picture'\n");
+    }
+    p_Vid->listXsize[0] = (char) currSlice->num_ref_idx_l0_active;   // list size is defined by the slice header
+  }
+
+  if (currSlice->slice_type == B_SLICE)
+  {
+    if (currSlice->ref_pic_list_reordering_flag_l1)
+      reorder_ref_pic_list(p_Vid, p_Vid->listX[1], &p_Vid->listXsize[1],
+                           currSlice->num_ref_idx_l1_active - 1,
+                           currSlice->reordering_of_pic_nums_idc_l1,
+                           currSlice->abs_diff_pic_num_minus1_l1,
+                           currSlice->long_term_pic_idx_l1);
+    if (p_Vid->listX[1][currSlice->num_ref_idx_l1_active-1] == p_Vid->no_reference_picture)
+    {
+      if (!p_Vid->non_conforming_stream)
+        error("RefPicList1[ num_ref_idx_l1_active_minus1 ] is equal to 'no reference picture', invalid bitstream",500);
+      else
+        printf("RefPicList1[ num_ref_idx_l1_active_minus1 ] is equal to 'no reference picture'\n");
+    }
+    p_Vid->listXsize[1] = (char) currSlice->num_ref_idx_l1_active;   // list size is defined by the slice header
+  }
+
+  free_ref_pic_list_reordering_buffer(currSlice);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Fill the ref_pic_num tables of the picture being decoded for the current slice
+ * \param p_Vid
+ *    decoder state; listX / listXsize are read, the tables in dec_picture are written
+ ************************************************************************
+ */
+void set_ref_pic_num(VideoParameters *p_Vid)
+{
+  StorablePicture *pic = p_Vid->dec_picture;
+  int slice_nr = p_Vid->current_slice_nr;
+  int list, idx;
+
+  // Entries are twice a POC value of the list entry; ref_pic_num adds 1 for a
+  // BOTTOM_FIELD entry, bottom_ref_pic_num always adds 1.
+  for (idx = 0; idx < p_Vid->listXsize[LIST_0]; ++idx)
+  {
+    pic->ref_pic_num        [slice_nr][LIST_0][idx] = p_Vid->listX[LIST_0][idx]->poc * 2 + (p_Vid->listX[LIST_0][idx]->structure == BOTTOM_FIELD);
+    pic->frm_ref_pic_num    [slice_nr][LIST_0][idx] = p_Vid->listX[LIST_0][idx]->frame_poc * 2;
+    pic->top_ref_pic_num    [slice_nr][LIST_0][idx] = p_Vid->listX[LIST_0][idx]->top_poc * 2;
+    pic->bottom_ref_pic_num [slice_nr][LIST_0][idx] = p_Vid->listX[LIST_0][idx]->bottom_poc * 2 + 1;
+  }
+
+  for (idx = 0; idx < p_Vid->listXsize[LIST_1]; ++idx)
+  {
+    pic->ref_pic_num        [slice_nr][LIST_1][idx] = p_Vid->listX[LIST_1][idx]->poc * 2 + (p_Vid->listX[LIST_1][idx]->structure == BOTTOM_FIELD);
+    pic->frm_ref_pic_num    [slice_nr][LIST_1][idx] = p_Vid->listX[LIST_1][idx]->frame_poc * 2;
+    pic->top_ref_pic_num    [slice_nr][LIST_1][idx] = p_Vid->listX[LIST_1][idx]->top_poc * 2;
+    pic->bottom_ref_pic_num [slice_nr][LIST_1][idx] = p_Vid->listX[LIST_1][idx]->bottom_poc * 2 + 1;
+  }
+
+  // lists 2..5 are recorded only for FRAME pictures when frame_mbs_only_flag is clear
+  if (p_Vid->active_sps->frame_mbs_only_flag || p_Vid->structure != FRAME)
+    return;
+
+  for (list = 2; list < 6; ++list)
+  {
+    for (idx = 0; idx < p_Vid->listXsize[list]; ++idx)
+    {
+      pic->ref_pic_num        [slice_nr][list][idx] = p_Vid->listX[list][idx]->poc * 2 + (p_Vid->listX[list][idx]->structure == BOTTOM_FIELD);
+      pic->frm_ref_pic_num    [slice_nr][list][idx] = p_Vid->listX[list][idx]->frame_poc * 2;
+      pic->top_ref_pic_num    [slice_nr][list][idx] = p_Vid->listX[list][idx]->top_poc * 2;
+      pic->bottom_ref_pic_num [slice_nr][list][idx] = p_Vid->listX[list][idx]->bottom_poc * 2 + 1;
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Reads NAL units until a slice is set up for decoding. Returns SOP when the slice starts a new picture, SOS otherwise, EOS when there is no NAL unit buffer or read_next_nalu() returns 0.
+ ************************************************************************
+ */
+// Parameter sets and SEI messages met on the way are processed in place; the remaining NAL unit types are skipped.
+// benski> time_code is user-passed data (handed on to init_picture)
+static int read_new_slice(Slice *currSlice, uint64_t time_code)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  InputParameters *p_Inp = currSlice->p_Inp;
+    
+  NALU_t *nalu = p_Vid->nalu;
+	
+  int current_header = 0;
+  Bitstream *currStream;
+
+  int slice_id_a, slice_id_b, slice_id_c;
+  int redundant_pic_cnt_b, redundant_pic_cnt_c; // parsed below, but not read again inside this function
+
+  if (!nalu)
+	  return EOS; 
+
+	for (;;)
+  {
+    if (0 == read_next_nalu(p_Vid, nalu)) // a return value of 0 is treated as end of stream
+      return EOS;
+
+process_nalu: // re-entry point for a NAL unit that was already read by the data partition code below
+
+    switch (nalu->nal_unit_type)
+    {
+      case NALU_TYPE_SLICE:
+      case NALU_TYPE_IDR:
+
+        // decoding starts at an IDR slice, or at any slice once p_Vid->recovery_point is set
+        if (p_Vid->recovery_point || nalu->nal_unit_type == NALU_TYPE_IDR)
+        {
+          if (p_Vid->recovery_point_found == 0)
+          {
+            if (nalu->nal_unit_type != NALU_TYPE_IDR)
+            {
+              printf("Warning: Decoding does not start with an IDR picture.\n");
+              p_Vid->non_conforming_stream = 1;
+            }
+            else
+              p_Vid->non_conforming_stream = 0;
+          }
+          p_Vid->recovery_point_found = 1;
+        }
+
+        if (p_Vid->recovery_point_found == 0) // no start point yet: this slice is skipped
+          break;
+
+        p_Vid->idr_flag = (nalu->nal_unit_type == NALU_TYPE_IDR);
+        p_Vid->nal_reference_idc = nalu->nal_reference_idc;
+
+        currSlice->dp_mode = PAR_DP_1;
+        currSlice->max_part_nr = 1;
+        currStream = currSlice->partArr[0].bitstream;
+        currStream->frame_bitoffset = currStream->read_len = 0;
+        //memcpy (currStream->streamBuffer, &nalu->buf[1], nalu->len-1);
+				currStream->streamBuffer = &nalu->buf[1]; // points into the NAL unit buffer past its first byte instead of copying
+        currStream->code_len = currStream->bitstream_length = RBSPtoSODB(currStream->streamBuffer, nalu->len-1);
+
+        // Some syntax of the Slice Header depends on the parameter set, which depends on
+        // the parameter set ID of the SLice header.  Hence, read the pic_parameter_set_id
+        // of the slice header first, then setup the active parameter sets, and then read
+        // the rest of the slice header
+        FirstPartOfSliceHeader(currSlice);
+        UseParameterSet (currSlice, currSlice->pic_parameter_set_id);
+        RestOfSliceHeader (currSlice);
+
+        fmo_init (p_Vid);
+				currSlice->active_sps = p_Vid->active_sps;
+        currSlice->active_pps = p_Vid->active_pps;
+
+        assign_quant_params (currSlice);
+
+        // if primary slice is replaced with redundant slice, set the correct image type
+        if(p_Vid->redundant_pic_cnt && p_Vid->Is_primary_correct==0 && p_Vid->Is_redundant_correct)
+        {
+          p_Vid->dec_picture->slice_type = p_Vid->type;
+        }
+
+        if(is_new_picture(p_Vid->dec_picture, currSlice, p_Vid->old_slice))
+        {
+          init_picture(p_Vid, currSlice, p_Inp, time_code);
+
+          current_header = SOP;
+          //check zero_byte if it is also the first NAL unit in the access unit
+          CheckZeroByteVCL(p_Vid, nalu);
+        }
+        else
+          current_header = SOS;
+
+        if (currSlice->slice_type == B_SLICE) // B slices get a freshly allocated co-located structure; an existing one is freed first
+        {
+          if( IS_INDEPENDENT(p_Vid) )
+          {
+            int nplane = 0;
+            for( nplane=0; nplane<MAX_PLANE; nplane++ )
+            {
+              if( NULL != currSlice->Co_located_JV[nplane] )
+              {
+                free_colocated(p_Vid, currSlice->Co_located_JV[nplane]);        
+                currSlice->Co_located_JV[nplane] = NULL;
+              }
+              currSlice->Co_located_JV[nplane] = alloc_colocated (p_Vid, p_Vid->width, p_Vid->height, p_Vid->active_sps->mb_adaptive_frame_field_flag);      
+            }
+          }
+          else
+          {
+            if (NULL != currSlice->p_colocated)
+            {
+              free_colocated(p_Vid, currSlice->p_colocated);
+              currSlice->p_colocated = NULL;
+            }
+            currSlice->p_colocated = alloc_colocated (p_Vid, p_Vid->width, p_Vid->height,p_Vid->active_sps->mb_adaptive_frame_field_flag);
+          }
+        }
+
+ 
+        init_lists(currSlice);
+        reorder_lists (currSlice);
+
+        if (p_Vid->structure==FRAME)
+        {
+          init_mbaff_lists(p_Vid);
+        }
+
+        // From here on, p_Vid->active_sps, p_Vid->active_pps and the slice header are valid
+        if (currSlice->mb_aff_frame_flag)
+          p_Vid->current_mb_nr = currSlice->start_mb_nr << 1; // start_mb_nr is doubled when mb_aff_frame_flag is set
+        else
+          p_Vid->current_mb_nr = currSlice->start_mb_nr;
+
+        if (p_Vid->active_pps->entropy_coding_mode_flag)
+        {
+          int ByteStartPosition = currStream->frame_bitoffset/8; // current bit offset rounded up to the next whole byte
+          if (currStream->frame_bitoffset%8 != 0)
+          {
+            ++ByteStartPosition;
+          }
+          arideco_start_decoding (&currSlice->partArr[0].de_cabac, currStream->streamBuffer, ByteStartPosition, &currStream->read_len);
+        }
+        // printf ("read_new_slice: returning %s\n", current_header == SOP?"SOP":"SOS");
+        p_Vid->recovery_point = 0;
+        return current_header;
+        break;
+      case NALU_TYPE_DPA:
+        // read DP_A; B and C are marked as missing until they are actually found below
+        currSlice->dpB_NotPresent =1; 
+        currSlice->dpC_NotPresent =1; 
+
+        p_Vid->idr_flag          = FALSE;
+        p_Vid->nal_reference_idc = nalu->nal_reference_idc;
+        currSlice->dp_mode     = PAR_DP_3;
+        currSlice->max_part_nr = 3;
+        currStream             = currSlice->partArr[0].bitstream;
+        currStream->frame_bitoffset = currStream->read_len = 0;
+        //memcpy (currStream->streamBuffer, &nalu->buf[1], nalu->len-1);
+				currStream->streamBuffer = &nalu->buf[1];
+        currStream->code_len = currStream->bitstream_length = RBSPtoSODB(currStream->streamBuffer, nalu->len-1);
+
+        FirstPartOfSliceHeader(currSlice);
+        UseParameterSet (currSlice, currSlice->pic_parameter_set_id);
+        RestOfSliceHeader (currSlice);
+
+        fmo_init (p_Vid);
+
+        if(is_new_picture(p_Vid->dec_picture, currSlice, p_Vid->old_slice))
+        {
+          init_picture(p_Vid, currSlice, p_Inp, time_code);
+          current_header = SOP;
+          CheckZeroByteVCL(p_Vid, nalu);
+        }
+        else
+          current_header = SOS;
+
+        init_lists(currSlice);
+        reorder_lists (currSlice);
+
+        if (p_Vid->structure==FRAME)
+        {
+          init_mbaff_lists(p_Vid);
+        }
+
+        // From here on, p_Vid->active_sps, p_Vid->active_pps and the slice header are valid
+        if (currSlice->mb_aff_frame_flag)
+          p_Vid->current_mb_nr = currSlice->start_mb_nr << 1;
+        else
+          p_Vid->current_mb_nr = currSlice->start_mb_nr;
+
+        // Now I need to read the slice ID, which depends on the value of
+        // redundant_pic_cnt_present_flag
+
+        slice_id_a  = ue_v("NALU: DP_A slice_id", currStream);
+
+        if (p_Vid->active_pps->entropy_coding_mode_flag)
+          error ("received data partition with CABAC, this is not allowed", 500);
+
+        // continue with reading next DP; if nothing more can be read the slice is returned with DP_A only
+        if (0 == read_next_nalu(p_Vid, nalu))
+          return current_header;
+        
+        if ( NALU_TYPE_DPB == nalu->nal_unit_type)
+        {
+          // we got a DPB
+          currStream             = currSlice->partArr[1].bitstream;
+          currStream->frame_bitoffset = currStream->read_len = 0;
+
+          //memcpy (currStream->streamBuffer, &nalu->buf[1], nalu->len-1);
+					currStream->streamBuffer = &nalu->buf[1];
+          currStream->code_len = currStream->bitstream_length = RBSPtoSODB(currStream->streamBuffer, nalu->len-1);
+
+          slice_id_b  = ue_v("NALU: DP_B slice_id", currStream);
+
+          currSlice->dpB_NotPresent = 0; 
+
+          if ((slice_id_b != slice_id_a) || (nalu->lost_packets))
+          {
+            printf ("Waning: got a data partition B which does not match DP_A (DP loss!)\n");
+            currSlice->dpB_NotPresent =1; // a mismatching DP_B also marks DP_C as missing; no further NAL unit is read on this path
+            currSlice->dpC_NotPresent =1; 
+          }
+          else
+          {
+            if (p_Vid->active_pps->redundant_pic_cnt_present_flag)
+              redundant_pic_cnt_b = ue_v("NALU: DP_B redudant_pic_cnt", currStream);
+            else
+              redundant_pic_cnt_b = 0;
+
+            // we're finished with DP_B, so let's continue with next DP
+            if (0 == read_next_nalu(p_Vid, nalu))
+              return current_header;
+          }
+        }
+        else
+        {
+          currSlice->dpB_NotPresent =1; 
+        }
+
+        // check if we got DP_C
+        if ( NALU_TYPE_DPC == nalu->nal_unit_type)
+        {
+          currStream             = currSlice->partArr[2].bitstream;
+          currStream->frame_bitoffset = currStream->read_len = 0;
+
+          //memcpy (currStream->streamBuffer, &nalu->buf[1], nalu->len-1);
+					currStream->streamBuffer = &nalu->buf[1];
+          currStream->code_len = currStream->bitstream_length = RBSPtoSODB(currStream->streamBuffer, nalu->len-1);
+
+          currSlice->dpC_NotPresent = 0;
+
+          slice_id_c  = ue_v("NALU: DP_C slice_id", currStream);
+          if ((slice_id_c != slice_id_a)|| (nalu->lost_packets))
+          {
+            printf ("Warning: got a data partition C which does not match DP_A(DP loss!)\n");
+            //currSlice->dpB_NotPresent =1;
+            currSlice->dpC_NotPresent =1;
+          }
+
+          if (p_Vid->active_pps->redundant_pic_cnt_present_flag)
+            redundant_pic_cnt_c = ue_v("NALU:SLICE_C redudand_pic_cnt", currStream);
+          else
+            redundant_pic_cnt_c = 0;
+        }
+        else
+        {
+          currSlice->dpC_NotPresent =1;
+        }
+
+        // check if we read anything else than the expected partitions
+        if ((nalu->nal_unit_type != NALU_TYPE_DPB) && (nalu->nal_unit_type != NALU_TYPE_DPC))
+        {
+          // we have a NALU that we can't process here, so restart processing
+          // NOTE(review): on this path the slice set up from DP_A is not returned to the caller - confirm this is intended
+          goto process_nalu;
+          // yes, "goto" should not be used, but it's really the best way here before we restructure the decoding loop
+          // (which should be taken care of anyway)
+        }
+
+        return current_header;
+
+        break;
+      case NALU_TYPE_DPB:
+        printf ("found data partition B without matching DP A, discarding\n");
+        break;
+      case NALU_TYPE_DPC:
+        printf ("found data partition C without matching DP A, discarding\n");
+        break;
+      case NALU_TYPE_SEI:
+        //printf ("read_new_slice: Found NALU_TYPE_SEI, len %d\n", nalu->len);
+        InterpretSEIMessage(nalu->buf,nalu->len,p_Vid);
+        break;
+      case NALU_TYPE_PPS:
+        ProcessPPS(p_Vid, nalu);
+        break;
+      case NALU_TYPE_SPS:
+        ProcessSPS(p_Vid, nalu);
+        break;
+      case NALU_TYPE_AUD:
+//        printf ("read_new_slice: Found 'Access Unit Delimiter' NAL unit, len %d, ignored\n", nalu->len);
+        break;
+      case NALU_TYPE_EOSEQ:
+//        printf ("read_new_slice: Found 'End of Sequence' NAL unit, len %d, ignored\n", nalu->len);
+        break;
+      case NALU_TYPE_EOSTREAM:
+//        printf ("read_new_slice: Found 'End of Stream' NAL unit, len %d, ignored\n", nalu->len);
+        break;
+      case NALU_TYPE_FILL:
+        printf ("read_new_slice: Found NALU_TYPE_FILL, len %d\n", (int) nalu->len);
+        printf ("Skipping these filling bits, proceeding w/ next NALU\n");
+        break;
+      default:
+        printf ("Found NALU type %d, len %d undefined, ignore NALU, moving on\n", (int) nalu->nal_unit_type, (int) nalu->len);
+        break;
+    }
+  }
+        
+  return  current_header; // not reached: the loop above is only left through return statements
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    finish decoding of a picture, conceal errors and store it
+ *    into the DPB; *dec_picture is set to NULL afterwards (nothing is done if it already is NULL)
+ ************************************************************************
+ */
+void exit_picture(VideoParameters *p_Vid, StorablePicture **dec_picture)
+{
+  InputParameters *p_Inp = p_Vid->p_Inp; // not used further in this function
+  int ercStartMB; // only written below, never read
+  int ercSegment;
+  frame recfr;
+  unsigned int i;
+  int structure, frame_poc, slice_type, refpic, qp, pic_num, chroma_format_idc, is_idr; // only structure, slice_type and refpic are read after being saved
+
+  int nplane;
+
+  // return if the last picture has already been finished
+  if (*dec_picture==NULL)
+  {
+    return;
+  }
+
+  recfr.p_Vid = p_Vid;
+  recfr.yptr = (*dec_picture)->imgY->base_address;
+  if ((*dec_picture)->chroma_format_idc != YUV400) // uptr / vptr stay unset for YUV400 pictures
+  {
+    recfr.uptr = (*dec_picture)->imgUV[0]->base_address;
+    recfr.vptr = (*dec_picture)->imgUV[1]->base_address;
+  }
+
+  //! this is always true at the beginning of a picture
+  ercStartMB = 0;
+  ercSegment = 0;
+
+  //! mark the start of the first segment (segmentation and concealment are skipped when mb_aff_frame_flag is set)
+  if (!(*dec_picture)->mb_aff_frame_flag)
+  {
+    ercStartSegment(0, ercSegment, 0 , p_Vid->erc_errorVar);
+    //! generate the segments according to the macroblock map: a new segment starts wherever ei_flag changes
+    for(i = 1; i<(*dec_picture)->PicSizeInMbs; ++i)
+    {
+      if(p_Vid->mb_data[i].ei_flag != p_Vid->mb_data[i-1].ei_flag)
+      {
+        ercStopSegment(i-1, ercSegment, 0, p_Vid->erc_errorVar); //! stop current segment
+
+        //! mark current segment as lost or OK
+        if(p_Vid->mb_data[i-1].ei_flag)
+          ercMarkCurrSegmentLost((*dec_picture)->size_x, p_Vid->erc_errorVar);
+        else
+          ercMarkCurrSegmentOK((*dec_picture)->size_x, p_Vid->erc_errorVar);
+
+        ++ercSegment;  //! next segment
+        ercStartSegment(i, ercSegment, 0 , p_Vid->erc_errorVar); //! start new segment
+        ercStartMB = i;//! save start MB for this segment
+      }
+    }
+    //! mark end of the last segment
+    ercStopSegment((*dec_picture)->PicSizeInMbs-1, ercSegment, 0, p_Vid->erc_errorVar);
+    if(p_Vid->mb_data[i-1].ei_flag) // i-1 is the index the loop above stopped at
+      ercMarkCurrSegmentLost((*dec_picture)->size_x, p_Vid->erc_errorVar);
+    else
+      ercMarkCurrSegmentOK((*dec_picture)->size_x, p_Vid->erc_errorVar);
+
+    //! call the right error concealment function depending on the frame type.
+    p_Vid->erc_mvperMB /= (*dec_picture)->PicSizeInMbs; // accumulated sum becomes a per-macroblock average
+
+    p_Vid->erc_img = p_Vid;
+    if((*dec_picture)->slice_type == I_SLICE || (*dec_picture)->slice_type == SI_SLICE) // I-frame
+      ercConcealIntraFrame(p_Vid, &recfr, (*dec_picture)->size_x, (*dec_picture)->size_y, p_Vid->erc_errorVar);
+    else
+      ercConcealInterFrame(&recfr, p_Vid->erc_object_list, (*dec_picture)->size_x, (*dec_picture)->size_y, p_Vid->erc_errorVar, (*dec_picture)->chroma_format_idc);
+  }
+
+  //deblocking for frame or field
+  if( IS_INDEPENDENT(p_Vid) )
+  {
+    int colour_plane_id = p_Vid->colour_plane_id; // saved here and restored after the per-plane loop
+    for( nplane=0; nplane<MAX_PLANE; ++nplane )
+    {
+      change_plane_JV( p_Vid, nplane );
+      DeblockPicture( p_Vid, *dec_picture );
+    }
+    p_Vid->colour_plane_id = colour_plane_id;
+    make_frame_picture_JV(p_Vid);
+  }
+  else
+  {
+    DeblockPicture( p_Vid, *dec_picture );
+  }
+
+  if ((*dec_picture)->mb_aff_frame_flag)
+    MbAffPostProc(p_Vid);
+
+  if (p_Vid->structure == FRAME)         // buffer mgt. for frame mode
+    frame_postprocessing(p_Vid);
+  else
+    field_postprocessing(p_Vid);   // reset all interlaced variables
+
+  // copy the fields out of the picture before store_picture_in_dpb() takes it and *dec_picture is cleared
+  structure  = (*dec_picture)->structure;
+  slice_type = (*dec_picture)->slice_type;
+  frame_poc  = (*dec_picture)->frame_poc;
+  refpic     = (*dec_picture)->used_for_reference;
+  qp         = (*dec_picture)->qp;
+  pic_num    = (*dec_picture)->pic_num;
+  is_idr     = (*dec_picture)->idr_flag;
+
+  chroma_format_idc = (*dec_picture)->chroma_format_idc;
+
+  store_picture_in_dpb(p_Vid, *dec_picture);
+  *dec_picture=NULL;
+
+  if (p_Vid->last_has_mmco_5)
+  {
+    p_Vid->pre_frame_num = 0;
+  }
+
+  // p_Vid->number is advanced once per FRAME or BOTTOM_FIELD picture that is I, SI, P or used for reference
+  if ((structure==FRAME)||structure==BOTTOM_FIELD)
+  {
+    if(slice_type == I_SLICE || slice_type == SI_SLICE || slice_type == P_SLICE || refpic)   // I or P pictures
+      ++(p_Vid->number);
+
+  }
+
+  p_Vid->current_mb_nr = -4712;   // impossible value for debugging, StW
+  p_Vid->current_slice_nr = 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    write the encoding mode and motion vectors of current
+ *    MB to the buffer of the error concealment module.
+ ************************************************************************
+ */
+
+void ercWriteMBMODEandMV(Macroblock *currMB)
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  int i, ii, jj, currMBNum = p_Vid->current_mb_nr;
+  StorablePicture *dec_picture = p_Vid->dec_picture;
+  int mbx = 4*xPosMB(currMBNum, dec_picture->size_x), mby = 4*yPosMB(currMBNum, dec_picture->size_x);
+  objectBuffer_t *currRegion, *pRegion;
+  // each macroblock owns four consecutive entries of erc_object_list, one per loop step below
+  currRegion = p_Vid->erc_object_list + (currMBNum<<2);
+  // every entry receives a region mode, a motion vector in mv[0]/mv[1] and a reference index in mv[2]
+  if(p_Vid->type != B_SLICE) //non-B frame
+  {
+    for (i=0; i<4; ++i)
+    {
+      pRegion             = currRegion + i;
+      pRegion->regionMode = (currMB->mb_type  ==I16MB  ? REGMODE_INTRA      :
+                             currMB->b8mode[i]==IBLOCK ? REGMODE_INTRA_8x8  :
+                             currMB->b8mode[i]==0      ? REGMODE_INTER_COPY :
+                             currMB->b8mode[i]==1      ? REGMODE_INTER_PRED : REGMODE_INTER_PRED_8x8);
+      if (currMB->b8mode[i]==0 || currMB->b8mode[i]==IBLOCK)  // INTRA OR COPY
+      {
+        pRegion->mv[0]    = 0;
+        pRegion->mv[1]    = 0;
+        pRegion->mv[2]    = 0;
+      }
+      else
+      {
+				PicMotion *motion00;
+				PicMotion *motion01;
+        ii              = mbx + (i & 0x01)*2;// + BLOCK_SIZE;   column offset 0 or 2
+        jj              = mby + (i & ~1  );  // row offset 0 or 2
+				motion00 = &dec_picture->motion.motion[LIST_0][jj][ii];
+        if (currMB->b8mode[i]>=5 && currMB->b8mode[i]<=7)  // SMALL BLOCKS
+        {
+					motion01 = &dec_picture->motion.motion[LIST_0][jj+1][ii];
+          pRegion->mv[0]  = (motion00[0].mv[0] + motion00[1].mv[0] + motion01[0].mv[0] + motion01[1].mv[0] + 2)/4;   // average of the 2x2 neighbouring entries
+          pRegion->mv[1]  = (motion00[0].mv[1] + motion00[1].mv[1] + motion01[0].mv[1] + motion01[1].mv[1] + 2)/4;
+        }
+        else // 16x16, 16x8, 8x16, 8x8
+        {
+          pRegion->mv[0]  = motion00->mv[0];
+          pRegion->mv[1]  = motion00->mv[1];
+        }
+        p_Vid->erc_mvperMB      += abs(pRegion->mv[0]) + abs(pRegion->mv[1]);   // running sum of vector magnitudes
+        pRegion->mv[2]    = motion00->ref_idx;
+      }
+    }
+  }
+  else  //B-frame
+  {
+    for (i=0; i<4; ++i)
+    {
+      ii                  = mbx + (i%2)*2;// + BLOCK_SIZE;   column offset 0 or 2
+      jj                  = mby + (i/2)*2;   // row offset 0 or 2
+      pRegion             = currRegion + i;
+      pRegion->regionMode = (currMB->mb_type  ==I16MB  ? REGMODE_INTRA      :
+                             currMB->b8mode[i]==IBLOCK ? REGMODE_INTRA_8x8  : REGMODE_INTER_PRED_8x8);
+      if (currMB->mb_type==I16MB || currMB->b8mode[i]==IBLOCK)  // INTRA
+      {
+        pRegion->mv[0]    = 0;
+        pRegion->mv[1]    = 0;
+        pRegion->mv[2]    = 0;
+      }
+      else
+      {
+				PicMotion *motion0= & dec_picture->motion.motion[LIST_0][jj][ii];
+				PicMotion *motion1;
+				// a negative LIST_0 reference index switches both rows to the LIST_1 motion data
+				if (motion0->ref_idx<0)
+				{
+					motion0 = & dec_picture->motion.motion[LIST_1][jj][ii];
+					motion1 = & dec_picture->motion.motion[LIST_1][jj+1][ii];
+				}
+				else
+				{
+						motion1 = & dec_picture->motion.motion[LIST_0][jj+1][ii];
+				}
+//        int idx = (currMB->b8mode[i]==0 && currMB->b8pdir[i]==2 ? LIST_0 : currMB->b8pdir[i]==1 ? LIST_1 : LIST_0);
+//        int idx = currMB->b8pdir[i]==0 ? LIST_0 : LIST_1;
+        // B pictures always store the average of the 2x2 neighbouring entries
+        pRegion->mv[0]    = (motion0[0].mv[0] + motion0[1].mv[0] + motion1[0].mv[0] + motion1[1].mv[0] + 2)/4;
+        pRegion->mv[1]    = (motion0[0].mv[1] + motion0[1].mv[1] + motion1[0].mv[1] + motion1[1].mv[1] + 2)/4;
+        p_Vid->erc_mvperMB      += iabs(pRegion->mv[0]) + iabs(pRegion->mv[1]);
+
+        pRegion->mv[2]  = (motion0[0].ref_idx);
+/*
+        if (currMB->b8pdir[i]==0 || (currMB->b8pdir[i]==2 && currMB->b8mode[i]!=0)) // forward or bidirect
+        {
+          pRegion->mv[2]  = (dec_picture->motion.ref_idx[LIST_0][jj][ii]);
+          ///???? is it right, not only "p_Vid->fw_refFrArr[jj][ii-4]"
+        }
+        else
+        {
+          pRegion->mv[2]  = (dec_picture->motion.ref_idx[LIST_1][jj][ii]);
+//          pRegion->mv[2]  = 0;
+        }
+        */
+      }
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    set the fields of an OldSliceParams record to their default values
+ *    (flags cleared, numeric fields at INT_MAX / UINT_MAX)
+ ************************************************************************
+ */
+void init_old_slice(OldSliceParams *p_old_slice)
+{
+  OldSliceParams *const prev = p_old_slice;
+  /* flags start out cleared */
+  prev->idr_flag       = FALSE;
+  prev->field_pic_flag = 0;
+
+  /* identifying numbers take the largest int value */
+  prev->nal_ref_idc = INT_MAX;
+  prev->frame_num   = INT_MAX;
+  prev->pps_id      = INT_MAX;
+
+  /* picture order count fields */
+  prev->delta_pic_order_cnt[1]    = INT_MAX;
+  prev->delta_pic_order_cnt[0]    = INT_MAX;
+  prev->delta_pic_oder_cnt_bottom = INT_MAX;
+  prev->pic_oder_cnt_lsb          = UINT_MAX;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    free the co-located data of a B slice, then store the slice parameters
+ *    that are needed for checking of "first VCL NAL unit of a picture"
+ ************************************************************************
+ */
+static void exit_slice(Slice *currSlice, OldSliceParams *p_old_slice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  int plane;
+
+  if (B_SLICE == currSlice->slice_type)
+  {
+    if( IS_INDEPENDENT(p_Vid) )
+    {
+      // independent mode: one co-located structure per colour plane
+      for( plane = 0; plane < MAX_PLANE; ++plane )
+      {
+        free_colocated(p_Vid, currSlice->Co_located_JV[plane]);
+        currSlice->Co_located_JV[plane] = NULL;
+      }
+    }
+    else
+    {
+      free_colocated(p_Vid, currSlice->p_colocated);
+      currSlice->p_colocated = NULL;
+    }
+  }
+
+  // values that are stored for every slice
+  p_old_slice->pps_id         = currSlice->pic_parameter_set_id;
+  p_old_slice->frame_num      = p_Vid->frame_num;
+  p_old_slice->nal_ref_idc    = p_Vid->nal_reference_idc;
+  p_old_slice->field_pic_flag = p_Vid->field_pic_flag;
+  p_old_slice->idr_flag       = (byte) p_Vid->idr_flag;
+  if (p_Vid->field_pic_flag)   // field parity is stored for field pictures only
+  {
+    p_old_slice->bottom_field_flag = p_Vid->bottom_field_flag;
+  }
+  if (p_Vid->idr_flag)         // idr_pic_id is stored for IDR pictures only
+  {
+    p_old_slice->idr_pic_id = p_Vid->idr_pic_id;
+  }
+
+  // picture order count values, selected by the type in the active SPS
+  switch (p_Vid->active_sps->pic_order_cnt_type)
+  {
+  case 0:
+    p_old_slice->pic_oder_cnt_lsb          = p_Vid->pic_order_cnt_lsb;
+    p_old_slice->delta_pic_oder_cnt_bottom = p_Vid->delta_pic_order_cnt_bottom;
+    break;
+  case 1:
+    p_old_slice->delta_pic_order_cnt[0] = p_Vid->delta_pic_order_cnt[0];
+    p_old_slice->delta_pic_order_cnt[1] = p_Vid->delta_pic_order_cnt[1];
+    break;
+  default:
+    break;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    compare the current slice with the parameters stored for the previous
+ *    slice; a non-zero result marks the "first VCL NAL unit of a picture"
+ ************************************************************************
+ */
+int is_new_picture(StorablePicture *dec_picture, Slice *currSlice, OldSliceParams *p_old_slice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  int differs = (NULL == dec_picture);   // a missing picture always counts as new
+
+  differs |= (currSlice->pic_parameter_set_id != p_old_slice->pps_id);
+  differs |= (p_Vid->frame_num                != p_old_slice->frame_num);
+  differs |= (p_Vid->field_pic_flag           != p_old_slice->field_pic_flag);
+  differs |= (p_Vid->idr_flag                 != p_old_slice->idr_flag);
+  // differing nal_ref_idc values only count when one of them is zero
+  if (p_Vid->nal_reference_idc != p_old_slice->nal_ref_idc)
+  {
+    differs |= (0 == p_old_slice->nal_ref_idc) || (0 == p_Vid->nal_reference_idc);
+  }
+  // field parity is compared only when both slices are field slices
+  if (p_old_slice->field_pic_flag && p_Vid->field_pic_flag)
+  {
+    differs |= (p_Vid->bottom_field_flag != p_old_slice->bottom_field_flag);
+  }
+  // idr_pic_id is compared only when both slices have idr_flag set
+  if (p_old_slice->idr_flag && p_Vid->idr_flag)
+  {
+    differs |= (p_Vid->idr_pic_id != p_old_slice->idr_pic_id);
+  }
+
+  switch (p_Vid->active_sps->pic_order_cnt_type)
+  {
+  case 0:
+    differs |= (p_Vid->pic_order_cnt_lsb          != p_old_slice->pic_oder_cnt_lsb);
+    differs |= (p_Vid->delta_pic_order_cnt_bottom != p_old_slice->delta_pic_oder_cnt_bottom);
+    break;
+  case 1:
+    differs |= (p_Vid->delta_pic_order_cnt[0] != p_old_slice->delta_pic_order_cnt[0]);
+    differs |= (p_Vid->delta_pic_order_cnt[1] != p_old_slice->delta_pic_order_cnt[1]);
+    break;
+  default:
+    break;
+  }
+
+  return differs;
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Post-processing step after frame decoding (currently does nothing)
+ ************************************************************************
+ */
+void frame_postprocessing(VideoParameters *p_Vid)
+{  // empty body: p_Vid is not used and no work is done here
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Halve the picture counter p_Vid->number after field decoding
+ ************************************************************************
+ */
+void field_postprocessing(VideoParameters *p_Vid)
+{
+  p_Vid->number = p_Vid->number / 2;
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    copy the picture-level fields of StorablePicture *src into *dst
+ *    for 4:4:4 Independent mode (no sample data is copied here)
+ ************************************************************************
+ */
+void copy_dec_picture_JV( VideoParameters *p_Vid, StorablePicture *dst, StorablePicture *src )
+{
+  dst->top_poc              = src->top_poc;
+  dst->bottom_poc           = src->bottom_poc;
+  dst->frame_poc            = src->frame_poc;
+  dst->qp                   = src->qp;
+  dst->slice_qp_delta       = src->slice_qp_delta;
+  dst->chroma_qp_offset[0]  = src->chroma_qp_offset[0];
+  dst->chroma_qp_offset[1]  = src->chroma_qp_offset[1];
+
+  dst->poc                  = src->poc;
+
+  dst->slice_type           = src->slice_type;
+  dst->used_for_reference   = src->used_for_reference;
+  dst->idr_flag             = src->idr_flag;
+  dst->no_output_of_prior_pics_flag = src->no_output_of_prior_pics_flag;
+  dst->long_term_reference_flag = src->long_term_reference_flag;
+  dst->adaptive_ref_pic_buffering_flag = src->adaptive_ref_pic_buffering_flag;
+  // NOTE(review): plain assignment; if this member is a pointer, dst and src share it - confirm ownership
+  dst->dec_ref_pic_marking_buffer = src->dec_ref_pic_marking_buffer;
+
+  dst->mb_aff_frame_flag       = src->mb_aff_frame_flag;
+  dst->PicWidthInMbs        = src->PicWidthInMbs;
+  dst->pic_num              = src->pic_num;
+  dst->frame_num            = src->frame_num;
+  dst->recovery_frame       = src->recovery_frame;
+  dst->coded_frame          = src->coded_frame;
+
+  dst->chroma_format_idc    = src->chroma_format_idc;
+
+  dst->frame_mbs_only_flag  = src->frame_mbs_only_flag;
+  dst->frame_cropping_flag  = src->frame_cropping_flag;
+
+  dst->frame_cropping_rect_left_offset   = src->frame_cropping_rect_left_offset;
+  dst->frame_cropping_rect_right_offset  = src->frame_cropping_rect_right_offset;
+  dst->frame_cropping_rect_top_offset    = src->frame_cropping_rect_top_offset;
+  dst->frame_cropping_rect_bottom_offset = src->frame_cropping_rect_bottom_offset;
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  // store the necessary tone mapping sei into StorablePicture structure
+  dst->seiHasTone_mapping    = src->seiHasTone_mapping;
+  dst->tone_mapping_model_id = src->tone_mapping_model_id;
+  dst->tonemapped_bit_depth  = src->tonemapped_bit_depth;
+  if( src->tone_mapping_lut )
+  {
+    // one byte count, taken from the table's element type, sizes both the allocation and the copy
+    const size_t lut_bytes = sizeof(*dst->tone_mapping_lut) * ((size_t)1 << p_Vid->seiToneMapping->coded_data_bit_depth);
+    dst->tone_mapping_lut = malloc(lut_bytes);
+    if (NULL == dst->tone_mapping_lut)
+    {
+      no_mem_exit("copy_dec_picture_JV: tone_mapping_lut");
+    }
+    memcpy(dst->tone_mapping_lut, src->tone_mapping_lut, lut_bytes);
+  }
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    decodes one slice
+ ************************************************************************
+ */
+void decode_one_slice(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  Boolean end_of_slice = FALSE;
+  Macroblock *currMB = NULL;
+  p_Vid->cod_counter=-1;
+  // select the slice-specific function pointers used below (compute_colocated, read_one_macroblock)
+  setup_slice_methods(currSlice);
+
+  if( IS_INDEPENDENT(p_Vid) )
+  {
+    change_plane_JV( p_Vid, p_Vid->colour_plane_id );
+  }
+
+  set_ref_pic_num(p_Vid);
+  // co-located data is computed for B slices only; exit_slice() frees it again
+  if (currSlice->slice_type == B_SLICE)
+  {
+    currSlice->compute_colocated(currSlice, currSlice->p_colocated, p_Vid->listX);
+  }
+
+  //reset_ec_flags(p_Vid);
+
+  while (end_of_slice == FALSE) // loop over macroblocks
+  {
+
+#if TRACE
+  fprintf(p_trace,"\n*********** POC: %i (I/P) MB: %i Slice: %i Type %d **********\n", p_Vid->ThisPOC, p_Vid->current_mb_nr, p_Vid->current_slice_nr, currSlice->slice_type);
+#endif
+
+    // Initializes the current macroblock
+    start_macroblock(currSlice, &currMB);
+    // Get the syntax elements from the NAL
+    currSlice->read_one_macroblock(currMB);
+    decode_one_macroblock(currMB, p_Vid->dec_picture);
+    // field macroblock in an MBAFF frame: halve the active reference counts (presumably undoing a doubling done while reading - TODO confirm)
+    if(currSlice->mb_aff_frame_flag && p_Vid->dec_picture->motion.mb_field[p_Vid->current_mb_nr])
+    {
+      currSlice->num_ref_idx_l0_active >>= 1;
+      currSlice->num_ref_idx_l1_active >>= 1;
+    }
+    // hand mode and motion vectors of this macroblock to the error concealment module
+    ercWriteMBMODEandMV(currMB);
+    // second argument is non-zero for non-MBAFF slices and for odd macroblock numbers
+    end_of_slice = exit_macroblock(currSlice, (!currSlice->mb_aff_frame_flag||p_Vid->current_mb_nr%2));
+  }
+  // free B-slice co-located data and store the parameters used for new-picture detection
+  exit_slice(currSlice, p_Vid->old_slice);
+  //reset_ec_flags(p_Vid);
+}
diff --git a/Src/h264dec/ldecod/src/intra16x16_pred.c b/Src/h264dec/ldecod/src/intra16x16_pred.c
new file mode 100644
index 00000000..a2b89ca2
--- /dev/null
+++ b/Src/h264dec/ldecod/src/intra16x16_pred.c
@@ -0,0 +1,428 @@
+/*!
+ *************************************************************************************
+ * \file intra16x16_pred.c
+ *
+ * \brief
+ *    Functions for intra 16x16 prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Yuri Vatis
+ *      - Jan Muenster
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+#include "global.h"
+#include "intra16x16_pred.h"
+#include "mb_access.h"
+#include "image.h"
+#include <emmintrin.h>
+static void memset_16x16(h264_imgpel_macroblock_row_t *mb_pred, int pred)
+{
+	/* write pred to every sample of the 16 rows starting at mb_pred */
+	if (sse2_flag)
+	{
+		__m128i *row128 = (__m128i *)mb_pred;
+		const __m128i fill128 = _mm_set1_epi8(pred);
+		int row;
+		for (row = 0; row < 16; ++row)
+		{
+			_mm_store_si128(&row128[row], fill128);
+		}
+	}
+#ifdef _M_IX86
+	else
+	{
+		__m64 *half64 = (__m64 *)mb_pred;
+		const __m64 fill64 = _mm_set1_pi8(pred);
+		int n;
+		for (n = 0; n < 2*16; ++n)   /* two 8-byte stores per row */
+		{
+			half64[n] = fill64;
+		}
+	}
+#else
+	else
+	{
+	int x, y;
+	for (y = 0; y < MB_BLOCK_SIZE; ++y)
+	{
+		for (x = 0; x < MB_BLOCK_SIZE; ++x)
+		{
+			mb_pred[y][x]=(imgpel) pred;
+		}
+	}
+	}
+	#endif
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes 16x16 DC prediction and stores it in currSlice->mb_pred[pl]
+ *
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was successful            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra16x16_dc_pred(Macroblock *currMB, 
+                                     ColorPlane pl)
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  int s0 = 0, s1 = 0, s2 = 0;
+
+  int i;
+
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos up;          //!< pixel position p(0,-1)
+  PixelPos left[17];    //!< pixel positions p(-1, -1..15)
+
+  int up_avail, left_avail, left_up_avail;
+  // NOTE(review): left_up_avail is assigned below but never read in this function
+  s1=s2=0;
+  // left[0] is the top-left corner p(-1,-1); left[1..16] are the 16 pixels of the left column
+	p_Vid->getNeighbourNXLuma(currMB,  -1, &left[0]);
+	p_Vid->getNeighbourLeftLuma(currMB,  &left[1]);
+     p_Vid->getNeighbourNPLumaNB(currMB,  2-1, &left[2]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  3-1, &left[3]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  4-1, &left[4]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  5-1, &left[5]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  6-1, &left[6]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  7-1, &left[7]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  8-1, &left[8]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  9-1, &left[9]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  10-1, &left[10]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  11-1, &left[11]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  12-1, &left[12]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  13-1, &left[13]);
+		p_Vid->getNeighbourNPLumaNB(currMB,  14-1, &left[14]);
+	p_Vid->getNeighbourNPLumaNB(currMB,  15-1, &left[15]);
+	p_Vid->getNeighbourNPLumaNB(currMB,  16-1, &left[16]);
+
+  p_Vid->getNeighbourUpLuma(currMB, &up);
+  // with constrained intra prediction a neighbour only counts when intra_block[] is set for its macroblock
+  if (!p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    up_avail      = up.available;
+    left_avail    = left[1].available;
+    left_up_avail = left[0].available;
+  }
+  else
+  {
+    up_avail      = up.available ? p_Vid->intra_block[up.mb_addr] : 0;
+    for (i = 1, left_avail = 1; i < 17; ++i)
+      left_avail  &= left[i].available ? p_Vid->intra_block[left[i].mb_addr]: 0;
+    left_up_avail = left[0].available ? p_Vid->intra_block[left[0].mb_addr]: 0;
+  }
+  // s1: sum of the 16 pixels in the row above (hand-unrolled)
+	if (up_avail)
+	{
+		s1 += imgY[up.pos_y][up.pos_x+0];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+1];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+2];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+3];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+4];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+5];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+6];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+7];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+8];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+9];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+10];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+11];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+12];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+13];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+14];    // sum hor pix
+		s1 += imgY[up.pos_y][up.pos_x+15];    // sum hor pix
+	}
+  // s2: sum of the 16 pixels in the column to the left (hand-unrolled)
+	if (left_avail)
+	{
+		s2 += imgY[left[0 + 1].pos_y][left[0 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[1 + 1].pos_y][left[1 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[2 + 1].pos_y][left[2 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[3 + 1].pos_y][left[3 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[4 + 1].pos_y][left[4 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[5 + 1].pos_y][left[5 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[6 + 1].pos_y][left[6 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[7 + 1].pos_y][left[7 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[8 + 1].pos_y][left[8 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[9 + 1].pos_y][left[9 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[10 + 1].pos_y][left[10 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[11 + 1].pos_y][left[11 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[12 + 1].pos_y][left[12 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[13 + 1].pos_y][left[13 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[14 + 1].pos_y][left[14 + 1].pos_x];    // sum vert pix
+		s2 += imgY[left[15 + 1].pos_y][left[15 + 1].pos_x];    // sum vert pix
+	}
+  // s0: rounded mean of the usable neighbours, or the default DC value when none is usable
+  if (up_avail && left_avail)
+    s0 = (s1 + s2 + 16)>>5;       // no edge
+  else if (!up_avail && left_avail)
+    s0 = (s2 + 8)>>4;              // upper edge
+  else if (up_avail && !left_avail)
+    s0 = (s1 + 8)>>4;              // left edge
+  else
+    s0 = p_Vid->dc_pred_value_comp[pl];                            // top left corner, nothing to predict from
+
+	memset_16x16(currSlice->mb_pred[pl], s0);
+
+  return DECODING_OK;
+}
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes 16x16 vertical prediction: every row of currSlice->mb_pred[pl]
+ *    becomes a copy of the 16 pixels above the macroblock
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was successful            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra16x16_vert_pred(Macroblock *currMB, 
+                                       ColorPlane pl)
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  Slice *currSlice = currMB->p_Slice;
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+  imgpel *above_row;
+  PixelPos above;       //!< pixel position p(0,-1)
+  int above_ok;
+  int row = 0;
+
+  p_Vid->getNeighbourUpLuma(currMB, &above);
+
+  // constrained intra prediction additionally requires intra_block[] to be set for the neighbour
+  above_ok = above.available;
+  if (p_Vid->active_pps->constrained_intra_pred_flag && above_ok)
+  {
+    above_ok = p_Vid->intra_block[above.mb_addr];
+  }
+
+  if (!above_ok)
+  {
+    error ("invalid 16x16 intra pred Mode VERT_PRED_16",500);
+  }
+
+  // TODO; take advantage of imgY's stride
+  above_row = &(plane[above.pos_y][above.pos_x]);
+  while (row < MB_BLOCK_SIZE)
+  {
+    memcpy(&currSlice->mb_pred[pl][row][0], above_row, MB_BLOCK_SIZE * sizeof(imgpel));
+    ++row;
+  }
+
+  return DECODING_OK;
+}
+
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes 16x16 horizontal prediction: every row of currSlice->mb_pred[pl]
+ *    is filled with the pixel to the left of that row
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was successful            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra16x16_hor_pred(Macroblock *currMB, 
+                                      ColorPlane pl)
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  Slice *currSlice = currMB->p_Slice;
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+  PixelPos left[17];    //!< pixel positions p(-1, -1..15)
+  imgpel edge;
+  int left_avail, left_up_avail;
+  int k, x, y;
+
+  // look up the corner pixel (k = 0) and the 16 pixels of the left column
+  for (k = 0; k < 17; ++k)
+  {
+    p_Vid->getNeighbourNXLuma(currMB, k-1, &left[k]);
+  }
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    left_avail = 1;
+    for (k = 1; k < 17; ++k)
+      left_avail  &= left[k].available ? p_Vid->intra_block[left[k].mb_addr]: 0;
+    left_up_avail = left[0].available ? p_Vid->intra_block[left[0].mb_addr]: 0;
+  }
+  else
+  {
+    left_avail    = left[1].available;
+    left_up_avail = left[0].available;
+  }
+
+  if (!left_avail)
+    error ("invalid 16x16 intra pred Mode HOR_PRED_16",500);
+
+  // store predicted 16x16 block, one left-neighbour value per row
+  for (y = 0; y < MB_BLOCK_SIZE; ++y)
+  {
+    edge = plane[left[y+1].pos_y][left[y+1].pos_x];
+    for (x = 0; x < MB_BLOCK_SIZE; ++x)
+      currSlice->mb_pred[pl][y][x]= edge;
+  }
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    SSE2 helper: fills 16 rows of 16 samples with the plane prediction
+ *    ((i - 7) * ib + (j - 7) * ic + iaa + 16) >> 5, saturated to 0..255
+ *
+ * \param dest         destination rows (written with aligned 16-byte stores)
+ * \param iaa, ib, ic  plane parameters computed by the caller
+ ***********************************************************************
+ */
+static void planeset(h264_imgpel_macroblock_row_t *dest, int iaa, int ib, int ic)
+{
+	int j;
+	__m128i i0_7 = _mm_setr_epi16(-7,-6,-5,-4,-3,-2,-1, 0);   // (i - 7) for columns 0..7
+	__m128i i8_15 = _mm_setr_epi16(1,2,3,4,5,6,7,8);   // (i - 7) for columns 8..15
+	__m128i xmm_ib = _mm_set1_epi16(ib);
+	int j7ic = iaa + -7 * ic + 16;   // row term for j = 0; ic is added after every row
+	i0_7 = _mm_mullo_epi16(i0_7, xmm_ib);   // now holds (i - 7) * ib in 16-bit lanes
+	i8_15 = _mm_mullo_epi16(i8_15, xmm_ib);
+  for (j = 0;j < MB_BLOCK_SIZE; ++j)
+  {
+		__m128i xmm_j7ic = _mm_set1_epi16(j7ic);
+		__m128i xmm_lo = _mm_add_epi16(i0_7, xmm_j7ic);
+		__m128i xmm_hi = _mm_add_epi16(i8_15, xmm_j7ic);
+		__m128i xmm_store;
+		xmm_lo = _mm_srai_epi16(xmm_lo, 5);   // arithmetic shift, same as >> 5 in the scalar version
+		xmm_hi = _mm_srai_epi16(xmm_hi, 5);
+		xmm_store = _mm_packus_epi16(xmm_lo, xmm_hi);   // pack to bytes with unsigned saturation
+		_mm_store_si128((__m128i *)dest[j], xmm_store);   // aligned store: dest[j] must be 16-byte aligned
+		j7ic += ic;
+  }// store plane prediction
+}
+
+static inline int intra16x16_plane_pred(Macroblock *currMB, 
+                                        ColorPlane pl)
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  // makes 16x16 plane prediction for plane pl and stores it in currSlice->mb_pred[pl]; returns DECODING_OK
+  int i,j;
+
+  int ih = 0, iv = 0;
+  int ib,ic,iaa;
+
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+  imgpel *mpr_line;
+  int max_imgpel_value = p_Vid->max_pel_value_comp[pl];
+
+  PixelPos up;          //!< pixel position p(0,-1)
+  PixelPos left[17];    //!< pixel positions p(-1, -1..15)
+
+  int up_avail, left_avail, left_up_avail;
+
+	p_Vid->getNeighbourNXLuma(currMB,  -1, &left[0]);
+	p_Vid->getNeighbourLeftLuma(currMB, &left[1]);
+  for (i=2;i<17; ++i)
+  {
+    p_Vid->getNeighbourNPLumaNB(currMB,  i-1, &left[i]);
+  }
+  p_Vid->getNeighbourUpLuma(currMB, &up);
+  // with constrained intra prediction a neighbour only counts when intra_block[] is set for its macroblock
+  if (!p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    up_avail      = up.available;
+    left_avail    = left[1].available;
+    left_up_avail = left[0].available;
+  }
+  else
+  {
+    up_avail      = up.available ? p_Vid->intra_block[up.mb_addr] : 0;
+    for (i = 1, left_avail = 1; i < 17; ++i)
+      left_avail  &= left[i].available ? p_Vid->intra_block[left[i].mb_addr]: 0;
+    left_up_avail = left[0].available ? p_Vid->intra_block[left[0].mb_addr]: 0;
+  }
+  // plane mode needs the row above, the left column and the top-left corner pixel
+  if (!up_avail || !left_up_avail  || !left_avail)
+    error ("invalid 16x16 intra pred Mode PLANE_16",500);
+  // ih / iv: weighted differences along the row above / the left column; mpr_line points at column 7 of the row above
+  mpr_line = &imgY[up.pos_y][up.pos_x+7];
+  for (i = 1; i < 8; ++i)
+  {
+    ih += i*(mpr_line[i] - mpr_line[-i]);
+    iv += i*(imgY[left[8+i].pos_y][left[8+i].pos_x] - imgY[left[8-i].pos_y][left[8-i].pos_x]);
+  }
+  // the outermost term of both sums uses the corner pixel left[0]
+  ih += 8*(mpr_line[8] - imgY[left[0].pos_y][left[0].pos_x]);
+  iv += 8*(imgY[left[16].pos_y][left[16].pos_x] - imgY[left[0].pos_y][left[0].pos_x]);
+  // ib / ic: horizontal / vertical step of the plane, multiplied by (i - 7) / (j - 7) below
+  ib=(5 * ih + 32)>>6;
+  ic=(5 * iv + 32)>>6;
+  // iaa: 16 * (last pixel of the row above + last pixel of the left column)
+  iaa=16 * (mpr_line[8] + imgY[left[16].pos_y][left[16].pos_x]);
+	if (sse2_flag)   // planeset() saturates to 0..255 and does not use max_imgpel_value
+	{
+		planeset(currSlice->mb_pred[pl], iaa, ib, ic);
+	}
+	else
+	{
+	// TODO: MMX
+  for (j = 0;j < MB_BLOCK_SIZE; ++j)
+  {
+		int j7ic = iaa + (j - 7) * ic + 16;
+    for (i = 0;i < MB_BLOCK_SIZE; ++i)
+    {
+      currSlice->mb_pred[pl][j][i] = (imgpel) iClip1(max_imgpel_value, (((i - 7) * ib + j7ic) >> 5));
+    }
+  }// store plane prediction
+	}
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes 16x16 intra prediction blocks for the requested mode
+ *
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was successful            \n
+ *    SEARCH_SYNC   search next sync element as errors while decoding occurred
+ ***********************************************************************
+ */
+// TODO: replace with ippiPredictIntra_16x16_H264_8u_C1IR ?
+int intrapred16x16(Macroblock *currMB,  //!< Current Macroblock
+                   ColorPlane pl,       //!< Current colorplane (for 4:4:4)                         
+                   int predmode)        //!< prediction mode
+{
+  int status = SEARCH_SYNC;              // result for an unknown mode
+  switch (predmode)
+  {
+  case DC_PRED_16:                         // DC prediction
+    status = intra16x16_dc_pred(currMB, pl);
+    break;
+  case VERT_PRED_16:                       // vertical prediction from block above
+    status = intra16x16_vert_pred(currMB, pl);
+    break;
+  case HOR_PRED_16:                        // horizontal prediction from left block
+    status = intra16x16_hor_pred(currMB, pl);
+    break;
+  case PLANE_16:                           // 16 bit integer plane prediction
+    status = intra16x16_plane_pred(currMB, pl);
+    break;
+  default:                                 // indication of fault in bitstream
+    printf("illegal 16x16 intra prediction mode input: %d\n",predmode);
+    break;
+  }
+  return status;
+}
+
diff --git a/Src/h264dec/ldecod/src/intra4x4_pred.c b/Src/h264dec/ldecod/src/intra4x4_pred.c
new file mode 100644
index 00000000..21f16ed6
--- /dev/null
+++ b/Src/h264dec/ldecod/src/intra4x4_pred.c
@@ -0,0 +1,854 @@
+/*!
+ *************************************************************************************
+ * \file intra4x4_pred.c
+ *
+ * \brief
+ *    Functions for intra 4x4 prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+#include "global.h"
+#include "intra4x4_pred.h"
+#include "mb_access.h"
+#include "image.h"
+
+// Notation for comments regarding prediction and predictors.
+// The pels of the 4x4 block are labelled a..p. The predictor pels above
+// are labelled A..H, from the left I..L, and from above left X, as follows:
+//
+//  X A B C D E F G H
+//  I a b c d
+//  J e f g h
+//  K i j k l
+//  L m n o p
+//
+
+// Predictor array index definitions; each macro expands to an element of an array named PredPel that must be in scope at the point of use
+#define P_X (PredPel[0])
+#define P_A (PredPel[1])
+#define P_B (PredPel[2])
+#define P_C (PredPel[3])
+#define P_D (PredPel[4])
+#define P_E (PredPel[5])
+#define P_F (PredPel[6])
+#define P_G (PredPel[7])
+#define P_H (PredPel[8])
+#define P_I (PredPel[9])
+#define P_J (PredPel[10])
+#define P_K (PredPel[11])
+#define P_L (PredPel[12])
+
+static void memset_4x4(h264_imgpel_macroblock_row_t *mb_pred, int offset_x, int pred)
+{
+#ifdef _M_IX86
+	// Sets four rows starting at column offset_x to pred using one 32-bit store per row (benski: hand-written because the compiler's code for the generic loop was poor)
+	int dword_pred = pred * 0x01010101;   // replicates pred into all four bytes, assuming 0 <= pred <= 255
+	mb_pred = (h264_imgpel_macroblock_row_t *)&mb_pred[0][offset_x];   // re-base so that mb_pred[k] points at column offset_x of row k
+	*(int *)mb_pred[0] = dword_pred;
+	*(int *)mb_pred[1] = dword_pred;
+	*(int *)mb_pred[2] = dword_pred;
+	*(int *)mb_pred[3] = dword_pred;
+#else
+	int ii, jj;   // portable path: BLOCK_SIZE x BLOCK_SIZE element-wise stores
+	for (jj = 0; jj < BLOCK_SIZE; jj++)
+	{
+		for (ii = 0; ii < BLOCK_SIZE; ii++)
+		{
+			mb_pred[jj][offset_x+ii]=(imgpel) pred;
+		}
+	}
+#endif
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes 4x4 DC prediction and stores it in currSlice->mb_pred[pl]
+ *
+ * \param currMB
+ *    current MB structure
+ * \param pl
+ *    color plane
+ * \param ioff
+ *    pixel offset X within MB
+ * \param joff
+ *    pixel offset Y within MB
+ * \return
+ *    DECODING_OK   decoding of intra prediction mode was successful            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_dc_pred(Macroblock *currMB, 
+                                   ColorPlane pl,               
+                                   int ioff,
+                                   int joff)
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;  
+
+  int i;
+  int s0 = 0;  
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos pix_a[4], pix_b;
+
+  int block_available_up;
+  int block_available_left;  
+  // pix_a[0..3]: the four pixels to the left of the block; pix_b: the pixel above its first column
+  p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +0 , &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +1 , &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +2 , &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +3 , &pix_a[3]);
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff -1 , &pix_b);
+  // with constrained intra prediction a neighbour only counts when intra_block[] is set for its macroblock
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0, block_available_left=1; i<4;++i)
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available;
+    block_available_up       = pix_b.available;
+  }
+	// TODO:  ippiPredictIntra_4x4_H264_8u_C1IR
+  // form predictor pels
+  if (block_available_up)
+  {
+    s0 += imgY[pix_b.pos_y][pix_b.pos_x + 0];
+    s0 += imgY[pix_b.pos_y][pix_b.pos_x + 1];
+    s0 += imgY[pix_b.pos_y][pix_b.pos_x + 2];
+    s0 += imgY[pix_b.pos_y][pix_b.pos_x + 3];
+  }
+
+  if (block_available_left)
+  {
+    s0 += imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    s0 += imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    s0 += imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    s0 += imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+  }
+  // s0 now holds the sum of 8, 4 or 0 neighbour pixels; turn it into a rounded mean
+  if (block_available_up && block_available_left)
+  {
+    // no edge
+    s0 = (s0 + 4)>>3;
+  }
+  else if (!block_available_up && block_available_left)
+  {
+    // upper edge
+    s0 = (s0 + 2)>>2;
+  }
+  else if (block_available_up && !block_available_left)
+  {
+    // left edge
+    s0 = (s0 + 2)>>2;
+  }
+  else //if (!block_available_up && !block_available_left)
+  {
+    // top left corner, nothing to predict from
+    s0 = p_Vid->dc_pred_value_comp[pl];
+  }
+
+	memset_4x4(&currSlice->mb_pred[pl][joff], ioff, s0);
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes 4x4 vertical prediction and stores it in currSlice->mb_pred[pl]
+ *
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was successful            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_vert_pred(Macroblock *currMB,    //!< current macroblock
+                                     ColorPlane pl,         //!< current image plane
+                                     int ioff,              //!< pixel offset X within MB
+                                     int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+  imgpel *src;
+  PixelPos pix_b;
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff, joff - 1 ,  &pix_b);
+#ifdef H264_WARNINGS
+	{   // availability is only checked (and only warned about) when H264_WARNINGS is defined
+		  int block_available_up;
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    block_available_up = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_up = pix_b.available;
+  }
+	
+  if (!block_available_up)
+    printf ("warning: Intra_4x4_Vertical prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+	}
+#endif
+	src = &(imgY[pix_b.pos_y][pix_b.pos_x]);   // first of the BLOCK_SIZE pixels above the block
+  /* store predicted 4x4 block */
+    memcpy(&(currSlice->mb_pred[pl][joff][ioff]), src, BLOCK_SIZE * sizeof(imgpel));   // same source row copied into four rows
+		memcpy(&(currSlice->mb_pred[pl][joff+1][ioff]), src, BLOCK_SIZE * sizeof(imgpel));
+		memcpy(&(currSlice->mb_pred[pl][joff+2][ioff]), src, BLOCK_SIZE * sizeof(imgpel));
+		memcpy(&(currSlice->mb_pred[pl][joff+3][ioff]), src, BLOCK_SIZE * sizeof(imgpel));
+
+  return DECODING_OK;
+}
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 4x4 horizontal prediction. For each of the four rows, the
+ *    pel reported by getNeighbourXPLumaNB() for (ioff - 1, joff + row)
+ *    is replicated across that row of the predicted block in
+ *    currSlice->mb_pred[pl].
+ *
+ * \param currMB
+ *    current MB structure
+ * \param pl
+ *    color plane (0 reads imgY, otherwise imgUV[pl - 1])
+ * \param ioff
+ *    pixel offset X within MB
+ * \param joff
+ *    pixel offset Y within MB
+ *
+ * \return
+ *    DECODING_OK   always; an unavailable left neighbour is only
+ *                  reported (when H264_WARNINGS is defined), not rejected
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_hor_pred(Macroblock *currMB,
+                                    ColorPlane pl,
+                                    int ioff,
+                                    int joff)
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+  PixelPos left[4];
+  int row;
+
+  /* locate the left neighbour of every row of the block */
+  for (row = 0; row < 4; ++row)
+    p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + row, &left[row]);
+
+#ifdef H264_WARNINGS
+  {
+    int left_usable;
+
+    if (!p_Vid->active_pps->constrained_intra_pred_flag)
+    {
+      left_usable = left[0].available;
+    }
+    else
+    {
+      // constrained intra prediction: all four neighbours must be intra coded
+      int k;
+      left_usable = 1;
+      for (k = 0; k < 4; ++k)
+        left_usable &= left[k].available ? p_Vid->intra_block[left[k].mb_addr] : 0;
+    }
+
+    if (!left_usable)
+      printf ("warning: Intra_4x4_Horizontal prediction mode not allowed at mb %d\n",(int) p_Vid->current_mb_nr);
+  }
+#endif
+
+  for (row = 0; row < BLOCK_SIZE; ++row)
+  {
+    // TODO: write using imgY's stride
+    const imgpel value = plane[left[row].pos_y][left[row].pos_x];
+    imgpel *dst = &currSlice->mb_pred[pl][joff + row][ioff];
+
+    /* store one predicted row of the 4x4 block */
+    dst[0] = value;
+    dst[1] = value;
+    dst[2] = value;
+    dst[3] = value;
+  }
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 4x4 diagonal down-right prediction. Builds the predicted
+ *    block in currSlice->mb_pred[pl] from the four upper pels (A..D),
+ *    the four left pels (I..L) and the upper-left pel (X); every pel on
+ *    the same down-right diagonal (constant x - y) gets the same value.
+ *
+ * \param currMB
+ *    current macroblock
+ * \param pl
+ *    current image plane (0 reads imgY, otherwise imgUV[pl - 1])
+ * \param ioff
+ *    pixel offset X within MB
+ * \param joff
+ *    pixel offset Y within MB
+ *
+ * \return
+ *    DECODING_OK   always; unavailable neighbours are only reported
+ *                  (when H264_WARNINGS is defined), not rejected
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_diag_down_right_pred(Macroblock *currMB,    //!< current macroblock
+                                                ColorPlane pl,         //!< current image plane
+                                                int ioff,              //!< pixel offset X within MB
+                                                int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  imgpel PredPel[13];  // array of predictor pels (presumably what the P_* macros index)
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos left[4];
+  PixelPos up, up_left;
+
+  const imgpel *top;
+  imgpel diag[7];      // one value per diagonal, indexed by (x - y + 3)
+  int k, x, y;
+
+  for (k = 0; k < 4; ++k)
+    p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + k, &left[k]);
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff, joff - 1, &up);
+  p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1, &up_left);
+
+#ifdef H264_WARNINGS
+  {
+    int left_usable, up_usable, up_left_usable;
+
+    if (!p_Vid->active_pps->constrained_intra_pred_flag)
+    {
+      left_usable    = left[0].available;
+      up_usable      = up.available;
+      up_left_usable = up_left.available;
+    }
+    else
+    {
+      // constrained intra prediction: neighbours must also be intra coded
+      left_usable = 1;
+      for (k = 0; k < 4; ++k)
+        left_usable &= left[k].available ? p_Vid->intra_block[left[k].mb_addr] : 0;
+      up_usable      = up.available ? p_Vid->intra_block[up.mb_addr] : 0;
+      up_left_usable = up_left.available ? p_Vid->intra_block[up_left.mb_addr] : 0;
+    }
+
+    if (!(up_usable && left_usable && up_left_usable))
+      printf ("warning: Intra_4x4_Diagonal_Down_Right prediction mode not allowed at mb %d\n",(int) p_Vid->current_mb_nr);
+  }
+#endif
+
+  // form predictor pels
+  top = &plane[up.pos_y][up.pos_x];
+  P_A = top[0];
+  P_B = top[1];
+  P_C = top[2];
+  P_D = top[3];
+
+  P_I = plane[left[0].pos_y][left[0].pos_x];
+  P_J = plane[left[1].pos_y][left[1].pos_x];
+  P_K = plane[left[2].pos_y][left[2].pos_x];
+  P_L = plane[left[3].pos_y][left[3].pos_x];
+
+  P_X = plane[up_left.pos_y][up_left.pos_x];
+
+  // three-tap filtered values, from the bottom-left diagonal to the top-right one
+  diag[0] = (imgpel) ((P_L + 2*P_K + P_J + 2) >> 2);
+  diag[1] = (imgpel) ((P_K + 2*P_J + P_I + 2) >> 2);
+  diag[2] = (imgpel) ((P_J + 2*P_I + P_X + 2) >> 2);
+  diag[3] = (imgpel) ((P_I + 2*P_X + P_A + 2) >> 2);
+  diag[4] = (imgpel) ((P_X + 2*P_A + P_B + 2) >> 2);
+  diag[5] = (imgpel) ((P_A + 2*P_B + P_C + 2) >> 2);
+  diag[6] = (imgpel) ((P_B + 2*P_C + P_D + 2) >> 2);
+
+  for (y = 0; y < 4; ++y)
+  {
+    imgpel *dst = &currSlice->mb_pred[pl][joff + y][ioff];
+    for (x = 0; x < 4; ++x)
+      dst[x] = diag[3 + x - y];
+  }
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 4x4 diagonal down-left prediction. Builds the predicted
+ *    block in currSlice->mb_pred[pl] from the four upper pels (A..D)
+ *    and the four upper-right pels (E..H); every pel on the same
+ *    down-left diagonal (constant x + y) gets the same value. When the
+ *    upper-right block is not usable, E..H are replaced by D.
+ *
+ * \param currMB
+ *    current macroblock
+ * \param pl
+ *    current image plane (0 reads imgY, otherwise imgUV[pl - 1])
+ * \param ioff
+ *    pixel offset X within MB
+ * \param joff
+ *    pixel offset Y within MB
+ *
+ * \return
+ *    DECODING_OK   always; an unavailable upper neighbour is only
+ *                  reported (when H264_WARNINGS is defined), not rejected
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_diag_down_left_pred(Macroblock *currMB,    //!< current macroblock
+                                               ColorPlane pl,         //!< current image plane
+                                               int ioff,              //!< pixel offset X within MB
+                                               int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  imgpel PredPel[13];  // array of predictor pels (presumably what the P_* macros index)
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos up, up_right;
+  int up_right_usable;
+
+  const imgpel *top;
+  imgpel diag[7];      // one value per anti-diagonal, indexed by (x + y)
+  int x, y;
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff, joff - 1, &up);
+  p_Vid->getNeighbourLuma(currMB, ioff + 4, joff - 1, &up_right);
+
+  // the upper-right block is treated as unavailable for ioff == 4 with joff == 4 or 12
+  up_right.available = up_right.available && ((ioff != 4) || ((joff != 4) && (joff != 12)));
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+    up_right_usable = up_right.available ? p_Vid->intra_block[up_right.mb_addr] : 0;
+  else
+    up_right_usable = up_right.available;
+
+#ifdef H264_WARNINGS
+  {
+    // with constrained intra prediction the neighbour must also be intra coded
+    int up_usable = up.available;
+
+    if (up_usable && p_Vid->active_pps->constrained_intra_pred_flag)
+      up_usable = p_Vid->intra_block[up.mb_addr];
+
+    if (!up_usable)
+      printf ("warning: Intra_4x4_Diagonal_Down_Left prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+  }
+#endif
+
+  // form predictor pels
+  top = &plane[up.pos_y][up.pos_x];
+  P_A = top[0];
+  P_B = top[1];
+  P_C = top[2];
+  P_D = top[3];
+
+  if (up_right_usable)
+  {
+    const imgpel *top_right = &plane[up_right.pos_y][up_right.pos_x];
+    P_E = top_right[0];
+    P_F = top_right[1];
+    P_G = top_right[2];
+    P_H = top_right[3];
+  }
+  else
+  {
+    // no usable upper-right pels: extend the last upper pel
+    P_E = P_F = P_G = P_H = P_D;
+  }
+
+  diag[0] = (imgpel) ((P_A + P_C + 2*(P_B) + 2) >> 2);
+  diag[1] = (imgpel) ((P_B + P_D + 2*(P_C) + 2) >> 2);
+  diag[2] = (imgpel) ((P_C + P_E + 2*(P_D) + 2) >> 2);
+  diag[3] = (imgpel) ((P_D + P_F + 2*(P_E) + 2) >> 2);
+  diag[4] = (imgpel) ((P_E + P_G + 2*(P_F) + 2) >> 2);
+  diag[5] = (imgpel) ((P_F + P_H + 2*(P_G) + 2) >> 2);
+  diag[6] = (imgpel) ((P_G + 3*(P_H) + 2) >> 2);
+
+  for (y = 0; y < 4; ++y)
+  {
+    imgpel *dst = &currSlice->mb_pred[pl][joff + y][ioff];
+    for (x = 0; x < 4; ++x)
+      dst[x] = diag[x + y];
+  }
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 4x4 vertical-right prediction. Builds the predicted block in
+ *    currSlice->mb_pred[pl] from the four upper pels (A..D), the left
+ *    pels (I..K) and the upper-left pel (X). Rows 0 and 2 use two-tap
+ *    averages, rows 1 and 3 use three-tap filtered values; rows 2 and 3
+ *    are rows 0 and 1 shifted right by one with a new left-edge value.
+ *
+ * \param currMB
+ *    current macroblock
+ * \param pl
+ *    current image plane (0 reads imgY, otherwise imgUV[pl - 1])
+ * \param ioff
+ *    pixel offset X within MB
+ * \param joff
+ *    pixel offset Y within MB
+ *
+ * \return
+ *    DECODING_OK   always; unavailable neighbours are only reported
+ *                  (when H264_WARNINGS is defined), not rejected
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_vert_right_pred(Macroblock *currMB,    //!< current macroblock
+                                           ColorPlane pl,         //!< current image plane
+                                           int ioff,              //!< pixel offset X within MB
+                                           int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  imgpel PredPel[13];  // array of predictor pels (presumably what the P_* macros index)
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos left[4];
+  PixelPos up, up_left;
+
+  const imgpel *top;
+  imgpel half[4];      // two-tap averages X/A, A/B, B/C, C/D
+  imgpel filt[4];      // three-tap values centred on X, A, B, C
+  imgpel edge2, edge3; // left-edge values of rows 2 and 3
+  imgpel *r0, *r1, *r2, *r3;
+  int k;
+
+  for (k = 0; k < 4; ++k)
+    p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + k, &left[k]);
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff, joff - 1, &up);
+  p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1, &up_left);
+
+#ifdef H264_WARNINGS
+  {
+    int left_usable, up_usable, up_left_usable;
+
+    if (!p_Vid->active_pps->constrained_intra_pred_flag)
+    {
+      left_usable    = left[0].available;
+      up_usable      = up.available;
+      up_left_usable = up_left.available;
+    }
+    else
+    {
+      // constrained intra prediction: neighbours must also be intra coded
+      left_usable = 1;
+      for (k = 0; k < 4; ++k)
+        left_usable &= left[k].available ? p_Vid->intra_block[left[k].mb_addr] : 0;
+      up_usable      = up.available ? p_Vid->intra_block[up.mb_addr] : 0;
+      up_left_usable = up_left.available ? p_Vid->intra_block[up_left.mb_addr] : 0;
+    }
+
+    if (!(up_usable && left_usable && up_left_usable))
+      printf ("warning: Intra_4x4_Vertical_Right prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+  }
+#endif
+
+  // form predictor pels
+  top = &plane[up.pos_y][up.pos_x];
+  P_A = top[0];
+  P_B = top[1];
+  P_C = top[2];
+  P_D = top[3];
+
+  P_I = plane[left[0].pos_y][left[0].pos_x];
+  P_J = plane[left[1].pos_y][left[1].pos_x];
+  P_K = plane[left[2].pos_y][left[2].pos_x];
+  P_L = plane[left[3].pos_y][left[3].pos_x];
+
+  P_X = plane[up_left.pos_y][up_left.pos_x];
+
+  half[0] = (imgpel) ((P_X + P_A + 1) >> 1);
+  half[1] = (imgpel) ((P_A + P_B + 1) >> 1);
+  half[2] = (imgpel) ((P_B + P_C + 1) >> 1);
+  half[3] = (imgpel) ((P_C + P_D + 1) >> 1);
+
+  filt[0] = (imgpel) ((P_I + 2*P_X + P_A + 2) >> 2);
+  filt[1] = (imgpel) ((P_X + 2*P_A + P_B + 2) >> 2);
+  filt[2] = (imgpel) ((P_A + 2*P_B + P_C + 2) >> 2);
+  filt[3] = (imgpel) ((P_B + 2*P_C + P_D + 2) >> 2);
+
+  edge2 = (imgpel) ((P_X + 2*P_I + P_J + 2) >> 2);
+  edge3 = (imgpel) ((P_I + 2*P_J + P_K + 2) >> 2);
+
+  r0 = &currSlice->mb_pred[pl][joff + 0][ioff];
+  r1 = &currSlice->mb_pred[pl][joff + 1][ioff];
+  r2 = &currSlice->mb_pred[pl][joff + 2][ioff];
+  r3 = &currSlice->mb_pred[pl][joff + 3][ioff];
+
+  for (k = 0; k < 4; ++k)
+  {
+    r0[k] = half[k];
+    r1[k] = filt[k];
+  }
+
+  // rows 2 and 3 repeat rows 0 and 1 shifted right by one pel
+  r2[0] = edge2;
+  r3[0] = edge3;
+  for (k = 1; k < 4; ++k)
+  {
+    r2[k] = half[k - 1];
+    r3[k] = filt[k - 1];
+  }
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes and returns 4x4 vertical left prediction mode
+ *
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was sucessfull            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_vert_left_pred(Macroblock *currMB,    //!< current macroblock
+                                          ColorPlane pl,         //!< current image plane
+                                          int ioff,              //!< pixel offset X within MB
+                                          int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+  imgpel PredPel[13];  // array of predictor pels  
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos pix_b, pix_c;
+
+  int block_available_up;
+  int block_available_up_right;
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff -1 ,  &pix_b);
+  p_Vid->getNeighbourLuma(currMB, ioff +4 , joff -1 ,  &pix_c);
+
+  pix_c.available = pix_c.available && !((ioff==4) && ((joff==4)||(joff==12)));
+  
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+  }
+
+
+  if (!block_available_up)
+    printf ("warning: Intra_4x4_Vertical_Left prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels
+  P_A = imgY[pix_b.pos_y][pix_b.pos_x + 0];
+  P_B = imgY[pix_b.pos_y][pix_b.pos_x + 1];
+  P_C = imgY[pix_b.pos_y][pix_b.pos_x + 2];
+  P_D = imgY[pix_b.pos_y][pix_b.pos_x + 3];
+
+  if (block_available_up_right)
+  {
+    P_E = imgY[pix_c.pos_y][pix_c.pos_x + 0];
+    P_F = imgY[pix_c.pos_y][pix_c.pos_x + 1];
+    P_G = imgY[pix_c.pos_y][pix_c.pos_x + 2];
+    P_H = imgY[pix_c.pos_y][pix_c.pos_x + 3];
+  }
+  else
+  {
+    P_E = P_F = P_G = P_H = P_D;
+  }
+
+  currSlice->mb_pred[pl][joff+0][ioff+0] = (imgpel) ((P_A + P_B + 1) >> 1);
+  currSlice->mb_pred[pl][joff+0][ioff+1] =
+  currSlice->mb_pred[pl][joff+2][ioff+0] = (imgpel) ((P_B + P_C + 1) >> 1);
+  currSlice->mb_pred[pl][joff+0][ioff+2] =
+  currSlice->mb_pred[pl][joff+2][ioff+1] = (imgpel) ((P_C + P_D + 1) >> 1);
+  currSlice->mb_pred[pl][joff+0][ioff+3] =
+  currSlice->mb_pred[pl][joff+2][ioff+2] = (imgpel) ((P_D + P_E + 1) >> 1);
+  currSlice->mb_pred[pl][joff+2][ioff+3] = (imgpel) ((P_E + P_F + 1) >> 1);
+  currSlice->mb_pred[pl][joff+1][ioff+0] = (imgpel) ((P_A + 2*P_B + P_C + 2) >> 2);
+  currSlice->mb_pred[pl][joff+1][ioff+1] =
+  currSlice->mb_pred[pl][joff+3][ioff+0] = (imgpel) ((P_B + 2*P_C + P_D + 2) >> 2);
+  currSlice->mb_pred[pl][joff+1][ioff+2] =
+  currSlice->mb_pred[pl][joff+3][ioff+1] = (imgpel) ((P_C + 2*P_D + P_E + 2) >> 2);
+  currSlice->mb_pred[pl][joff+1][ioff+3] =
+  currSlice->mb_pred[pl][joff+3][ioff+2] = (imgpel) ((P_D + 2*P_E + P_F + 2) >> 2);
+  currSlice->mb_pred[pl][joff+3][ioff+3] = (imgpel) ((P_E + 2*P_F + P_G + 2) >> 2);
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 4x4 horizontal-up prediction. Builds the predicted block in
+ *    currSlice->mb_pred[pl] from the four left pels (I..L). The value
+ *    of a pel depends only on x + 2*y; positions with x + 2*y >= 6 are
+ *    filled with L.
+ *
+ * \param currMB
+ *    current macroblock
+ * \param pl
+ *    current image plane (0 reads imgY, otherwise imgUV[pl - 1])
+ * \param ioff
+ *    pixel offset X within MB
+ * \param joff
+ *    pixel offset Y within MB
+ *
+ * \return
+ *    DECODING_OK   always; an unavailable left neighbour is only
+ *                  reported (when H264_WARNINGS is defined), not rejected
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_hor_up_pred(Macroblock *currMB,    //!< current macroblock
+                                       ColorPlane pl,         //!< current image plane
+                                       int ioff,              //!< pixel offset X within MB
+                                       int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  imgpel PredPel[13];  // array of predictor pels (presumably what the P_* macros index)
+  imgpel **plane = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos left[4];
+
+  imgpel seq[10];      // predicted values indexed by (x + 2*y)
+  int k, x, y;
+
+  for (k = 0; k < 4; ++k)
+    p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + k, &left[k]);
+
+#ifdef H264_WARNINGS
+  {
+    int left_usable;
+
+    if (!p_Vid->active_pps->constrained_intra_pred_flag)
+    {
+      left_usable = left[0].available;
+    }
+    else
+    {
+      // constrained intra prediction: all four neighbours must be intra coded
+      left_usable = 1;
+      for (k = 0; k < 4; ++k)
+        left_usable &= left[k].available ? p_Vid->intra_block[left[k].mb_addr] : 0;
+    }
+
+    if (!left_usable)
+      printf ("warning: Intra_4x4_Horizontal_Up prediction mode not allowed at mb %d\n",(int) p_Vid->current_mb_nr);
+  }
+#endif
+
+  // form predictor pels
+  P_I = plane[left[0].pos_y][left[0].pos_x];
+  P_J = plane[left[1].pos_y][left[1].pos_x];
+  P_K = plane[left[2].pos_y][left[2].pos_x];
+  P_L = plane[left[3].pos_y][left[3].pos_x];
+
+  // alternating two-tap and three-tap values walking down the left edge
+  seq[0] = (imgpel) ((P_I + P_J + 1) >> 1);
+  seq[1] = (imgpel) ((P_I + 2*P_J + P_K + 2) >> 2);
+  seq[2] = (imgpel) ((P_J + P_K + 1) >> 1);
+  seq[3] = (imgpel) ((P_J + 2*P_K + P_L + 2) >> 2);
+  seq[4] = (imgpel) ((P_K + P_L + 1) >> 1);
+  seq[5] = (imgpel) ((P_K + 2*P_L + P_L + 2) >> 2);
+
+  // past the last left pel everything is L
+  for (k = 6; k < 10; ++k)
+    seq[k] = (imgpel) P_L;
+
+  for (y = 0; y < 4; ++y)
+  {
+    imgpel *dst = &currSlice->mb_pred[pl][joff + y][ioff];
+    for (x = 0; x < 4; ++x)
+      dst[x] = seq[x + 2*y];
+  }
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    makes and returns 4x4 horizontal down prediction mode
+ *
+ * \return
+ *    DECODING_OK   decoding of intraprediction mode was sucessfull            \n
+ *
+ ***********************************************************************
+ */
+static inline int intra4x4_hor_down_pred(Macroblock *currMB,    //!< current macroblock
+                                         ColorPlane pl,         //!< current image plane
+                                         int ioff,              //!< pixel offset X within MB
+                                         int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+  int i;
+  imgpel PredPel[13];  // array of predictor pels
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img;
+
+  PixelPos pix_a[4];
+  PixelPos pix_b, pix_d;
+
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +0 , &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +1 , &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +2 , &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff -1 , joff +3 , &pix_a[3]);
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff -1 ,  &pix_b);
+  p_Vid->getNeighbourLuma(currMB, ioff -1 , joff -1 ,  &pix_d);
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0, block_available_left=1; i<4;++i)
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available;
+    block_available_up       = pix_b.available;
+    block_available_up_left  = pix_d.available;
+  }
+
+  if ((!block_available_up)||(!block_available_left)||(!block_available_up_left))
+    printf ("warning: Intra_4x4_Horizontal_Down prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels
+  P_A = imgY[pix_b.pos_y][pix_b.pos_x + 0];
+  P_B = imgY[pix_b.pos_y][pix_b.pos_x + 1];
+  P_C = imgY[pix_b.pos_y][pix_b.pos_x + 2];
+  P_D = imgY[pix_b.pos_y][pix_b.pos_x + 3];
+
+  P_I = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+  P_J = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+  P_K = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+  P_L = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+
+  P_X = imgY[pix_d.pos_y][pix_d.pos_x];
+
+  currSlice->mb_pred[pl][joff+0][ioff+0] =
+  currSlice->mb_pred[pl][joff+1][ioff+2] = (imgpel) ((P_X + P_I + 1) >> 1);
+  currSlice->mb_pred[pl][joff+0][ioff+1] =
+  currSlice->mb_pred[pl][joff+1][ioff+3] = (imgpel) ((P_I + 2*P_X + P_A + 2) >> 2);
+  currSlice->mb_pred[pl][joff+0][ioff+2] = (imgpel) ((P_X + 2*P_A + P_B + 2) >> 2);
+  currSlice->mb_pred[pl][joff+0][ioff+3] = (imgpel) ((P_A + 2*P_B + P_C + 2) >> 2);
+  currSlice->mb_pred[pl][joff+1][ioff+0] =
+  currSlice->mb_pred[pl][joff+2][ioff+2] = (imgpel) ((P_I + P_J + 1) >> 1);
+  currSlice->mb_pred[pl][joff+1][ioff+1] =
+  currSlice->mb_pred[pl][joff+2][ioff+3] = (imgpel) ((P_X + 2*P_I + P_J + 2) >> 2);
+  currSlice->mb_pred[pl][joff+2][ioff+0] =
+  currSlice->mb_pred[pl][joff+3][ioff+2] = (imgpel) ((P_J + P_K + 1) >> 1);
+  currSlice->mb_pred[pl][joff+2][ioff+1] =
+  currSlice->mb_pred[pl][joff+3][ioff+3] = (imgpel) ((P_I + 2*P_J + P_K + 2) >> 2);
+  currSlice->mb_pred[pl][joff+3][ioff+0] = (imgpel) ((P_K + P_L + 1) >> 1);
+  currSlice->mb_pred[pl][joff+3][ioff+1] = (imgpel) ((P_J + 2*P_K + P_L + 2) >> 2);
+
+  return DECODING_OK;
+}
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Dispatches to the intra 4x4 predictor selected by
+ *    p_Vid->ipredmode[img_block_y][img_block_x] and records that mode
+ *    in currMB->ipmode_DPCM.
+ *
+ * \return
+ *    DECODING_OK   decoding of the intra prediction mode was successful \n
+ *    SEARCH_SYNC   the stored prediction mode is not a known intra 4x4
+ *                  mode; an error message is printed
+ ***********************************************************************
+ */
+int intrapred(Macroblock *currMB,    //!< current macroblock
+              ColorPlane pl,         //!< current image plane
+              int ioff,              //!< pixel offset X within MB
+              int joff,              //!< pixel offset Y within MB
+              int img_block_x,       //!< location of block X, multiples of 4
+              int img_block_y)       //!< location of block Y, multiples of 4
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  byte predmode = p_Vid->ipredmode[img_block_y][img_block_x];
+  int status;
+
+  // remembered for residual DPCM
+  currMB->ipmode_DPCM = predmode;
+
+  switch (predmode)
+  {
+  case DC_PRED:
+    status = intra4x4_dc_pred(currMB, pl, ioff, joff);
+    break;
+  case VERT_PRED:
+    status = intra4x4_vert_pred(currMB, pl, ioff, joff);
+    break;
+  case HOR_PRED:
+    status = intra4x4_hor_pred(currMB, pl, ioff, joff);
+    break;
+  case DIAG_DOWN_RIGHT_PRED:
+    status = intra4x4_diag_down_right_pred(currMB, pl, ioff, joff);
+    break;
+  case DIAG_DOWN_LEFT_PRED:
+    status = intra4x4_diag_down_left_pred(currMB, pl, ioff, joff);
+    break;
+  case VERT_RIGHT_PRED:
+    status = intra4x4_vert_right_pred(currMB, pl, ioff, joff);
+    break;
+  case VERT_LEFT_PRED:
+    status = intra4x4_vert_left_pred(currMB, pl, ioff, joff);
+    break;
+  case HOR_UP_PRED:
+    status = intra4x4_hor_up_pred(currMB, pl, ioff, joff);
+    break;
+  case HOR_DOWN_PRED:
+    status = intra4x4_hor_down_pred(currMB, pl, ioff, joff);
+    break;
+  default:
+    printf("Error: illegal intra_4x4 prediction mode: %d\n", (int) predmode);
+    status = SEARCH_SYNC;
+    break;
+  }
+
+  return status;
+}
diff --git a/Src/h264dec/ldecod/src/intra8x8_pred.c b/Src/h264dec/ldecod/src/intra8x8_pred.c
new file mode 100644
index 00000000..217e0ea8
--- /dev/null
+++ b/Src/h264dec/ldecod/src/intra8x8_pred.c
@@ -0,0 +1,1928 @@
+/*!
+ *************************************************************************************
+ * \file intra8x8_pred.c
+ *
+ * \brief
+ *    Functions for intra 8x8 prediction
+ *
+ * \author
+ *      Main contributors (see contributors.h for copyright, 
+ *                         address and affiliation details)
+ *      - Yuri Vatis
+ *      - Jan Muenster
+ *      - Alexis Michael Tourapis  <alexismt@ieee.org>
+ *
+ *************************************************************************************
+ */
+#include "global.h"
+#include "intra8x8_pred.h"
+#include "mb_access.h"
+#include "image.h"
+
+// Notation for comments regarding prediction and predictors.
+// The pels of the 8x8 block are labeled a1..h8. The predictor pels are numbered
+// as in PredPel[]: 0 above left, 1..16 above and above right, 17..24 left:
+//
+//  0   1   2   3   4   5  6  7  8  9  10  11  12  13   14  15  16
+//  17  a1  b1  c1  d1  e1 f1 g1 h1
+//  18  a2  b2  c2  d2  e2 f2 g2 h2
+//  19  a3  b3  c3  d3  e3 f3 g3 h3
+//  20  a4  b4  c4  d4  e4 f4 g4 h4
+//  21  a5  b5  c5  d5  e5 f5 g5 h5
+//  22  a6  b6  c6  d6  e6 f6 g6 h6
+//  23  a7  b7  c7  d7  e7 f7 g7 h7
+//  24  a8  b8  c8  d8  e8 f8 g8 h8
+
+
+static void memset_8x8(h264_imgpel_macroblock_row_t *mb_pred, int offset_x, int pred)
+{  // Fill the 8x8 block whose top-left pel is mb_pred[0][offset_x] with the value pred.
+#ifdef _M_IX86
+	// MMX path: pred is broadcast to all eight bytes of a 64-bit word and one word is stored per row of the block.
+	// NOTE(review): no _mm_empty() follows the stores - confirm that MMX state is cleared elsewhere.
+	h264_imgpel_macroblock_row_t *block = (h264_imgpel_macroblock_row_t *)&mb_pred[0][offset_x];
+	const __m64 fill = _mm_set1_pi8(pred);
+	*(__m64 *)block[0] = fill;
+	*(__m64 *)block[1] = fill;
+	*(__m64 *)block[2] = fill;
+	*(__m64 *)block[3] = fill;
+	*(__m64 *)block[4] = fill;
+	*(__m64 *)block[5] = fill;
+	*(__m64 *)block[6] = fill;
+	*(__m64 *)block[7] = fill;
+#else
+	int row, col;
+	for (row = 0; row < BLOCK_SIZE_8x8; row++)
+	{
+		imgpel *line = &mb_pred[row][offset_x];
+		for (col = 0; col < BLOCK_SIZE_8x8; col++)
+			line[col] = (imgpel) pred;
+	}
+#endif
+}
+
+static void memset_8x8_row(h264_imgpel_macroblock_row_t *mb_pred, int offset_x, const imgpel row[8])
+{  // Copy the eight pels in row[] into each of the eight lines of the 8x8 block at mb_pred[0][offset_x].
+#ifdef _M_IX86
+	// MMX path: one 64-bit load of row[], then one 64-bit store per line (row[] is only read, so the cast keeps const).
+	const __m64 mmx_pred = *(const __m64 *)row;
+	mb_pred = (h264_imgpel_macroblock_row_t *)&mb_pred[0][offset_x];
+	*(__m64 *)mb_pred[0] = mmx_pred;
+	*(__m64 *)mb_pred[1] = mmx_pred;
+	*(__m64 *)mb_pred[2] = mmx_pred;
+	*(__m64 *)mb_pred[3] = mmx_pred;
+	*(__m64 *)mb_pred[4] = mmx_pred;
+	*(__m64 *)mb_pred[5] = mmx_pred;
+	*(__m64 *)mb_pred[6] = mmx_pred;
+	*(__m64 *)mb_pred[7] = mmx_pred;
+#else
+	int jj;
+	for (jj = 0; jj < BLOCK_SIZE_8x8; jj++)
+	{
+			memcpy(&mb_pred[jj][offset_x], row, 8 * sizeof(imgpel)); // eight pels, whatever the size of imgpel
+	}
+#endif
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Reference sample smoothing for Intra8x8 prediction
+ *
+ *    PredPel[0] is the above-left sample, PredPel[1..16] the row above and
+ *    PredPel[17..24] the column to the left. Every sample on an available side
+ *    is replaced in place by (a + 2*b + c + 2) >> 2, where b is the sample and
+ *    a, c are its neighbours along the chain 24..17, 0, 1..16. Where a chain
+ *    neighbour is missing or unavailable, b itself is used in its place.
+ *    All inputs come from a snapshot taken on entry, so the order in which
+ *    the samples are rewritten does not matter.
+ *
+ * \param PredPel        the 25 reference samples, filtered in place
+ * \param block_up_left  non-zero if the above-left sample may be used
+ * \param block_up       non-zero if the row above may be used
+ * \param block_left     non-zero if the left column may be used
+ *************************************************************************************
+ */
+static __forceinline void LowPassForIntra8x8Pred(imgpel *PredPel, int block_up_left, int block_up, int block_left)
+{
+  imgpel src[25];   // unfiltered copy of PredPel[]
+  int k;
+
+  memcpy(src, PredPel, sizeof(src));
+
+  // Above-left corner: left as it is unless the corner and at least one of
+  // the two sides are available.
+  if (block_up_left)
+  {
+    if (block_up && block_left)
+    {
+      PredPel[0] = (imgpel) ((src[17] + (src[0]<<1) + src[1] + 2)>>2);
+    }
+    else if (block_up)
+    {
+      PredPel[0] = (imgpel) ((src[0] + (src[0]<<1) + src[1] + 2)>>2);
+    }
+    else if (block_left)
+    {
+      PredPel[0] = (imgpel) ((src[0] + (src[0]<<1) + src[17] + 2)>>2);
+    }
+  }
+
+  // Row above, samples 1..16.
+  if (block_up)
+  {
+    // sample 1 leans on the corner only when the corner is available
+    const imgpel prev = block_up_left ? src[0] : src[1];
+
+    PredPel[1] = (imgpel) ((prev + (src[1]<<1) + src[2] + 2)>>2);
+    for (k = 2; k < 16; k++)
+    {
+      PredPel[k] = (imgpel) ((src[k-1] + (src[k]<<1) + src[k+1] + 2)>>2);
+    }
+    // sample 16 ends the row, so it is counted in place of a right neighbour
+    PredPel[16] = (imgpel) ((src[16] + (src[16]<<1) + src[15] + 2)>>2);
+  }
+
+  // Left column, samples 17..24.
+  if (block_left)
+  {
+    // sample 17 leans on the corner only when the corner is available
+    const imgpel prev = block_up_left ? src[0] : src[17];
+
+    PredPel[17] = (imgpel) ((prev + (src[17]<<1) + src[18] + 2)>>2);
+    for (k = 18; k < 24; k++)
+    {
+      PredPel[k] = (imgpel) ((src[k-1] + (src[k]<<1) + src[k+1] + 2)>>2);
+    }
+    // sample 24 ends the column, so it is counted in place of a lower neighbour
+    PredPel[24] = (imgpel) ((src[23] + (src[24]<<1) + src[24] + 2) >> 2);
+  }
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Reference sample smoothing for Intra8x8 prediction, top row only
+ *    (the "Hor" variant)
+ *
+ *    Variant of LowPassForIntra8x8Pred that rewrites only the above-left
+ *    sample PredPel[0] and the row above, PredPel[1..16]. The left column is
+ *    not filtered; PredPel[17] is read (never written) for the corner, and
+ *    only when block_up_left and block_left are both non-zero.
+ *
+ * \param PredPel        reference samples, entries 0..16 filtered in place
+ * \param block_up_left  non-zero if the above-left sample may be used
+ * \param block_up       non-zero if the row above may be used
+ * \param block_left     non-zero if the left column may be used
+ *************************************************************************************
+ */
+static __forceinline void LowPassForIntra8x8PredHor(imgpel *PredPel, int block_up_left, int block_up, int block_left)
+{
+  imgpel src[17];   // unfiltered copy of PredPel[0..16]
+  int k;
+
+  memcpy(src, PredPel, sizeof(src));
+
+  // Above-left corner: untouched unless the corner and at least one side are available.
+  if (block_up_left)
+  {
+    if (block_left)
+    {
+      const imgpel left0 = PredPel[17];
+
+      if (block_up)
+        PredPel[0] = (imgpel) ((left0 + (src[0]<<1) + src[1] + 2)>>2);
+      else
+        PredPel[0] = (imgpel) ((src[0] + (src[0]<<1) + left0 + 2)>>2);
+    }
+    else if (block_up)
+    {
+      PredPel[0] = (imgpel) ((src[0] + (src[0]<<1) + src[1] + 2)>>2);
+    }
+  }
+
+  // Row above, samples 1..16.
+  if (block_up)
+  {
+    // sample 1 leans on the corner only when the corner is available
+    const imgpel prev = block_up_left ? src[0] : src[1];
+
+    PredPel[1] = (imgpel) ((prev + (src[1]<<1) + src[2] + 2)>>2);
+    for (k = 2; k < 16; k++)
+    {
+      PredPel[k] = (imgpel) ((src[k-1] + (src[k]<<1) + src[k+1] + 2)>>2);
+    }
+    // sample 16 ends the row, so it is counted in place of a right neighbour
+    PredPel[16] = (imgpel) ((src[16] + (src[16]<<1) + src[15] + 2)>>2);
+  }
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Reference sample smoothing for Intra8x8 prediction, left column only
+ *    (the "Ver" variant)
+ *
+ *    Variant of LowPassForIntra8x8Pred that rewrites only the above-left
+ *    sample PredPel[0] and the left column, PredPel[17..24]. The row above is
+ *    not filtered; of it only PredPel[1] is read, as an input to the corner.
+ *
+ * \param PredPel        reference samples, entries 0 and 17..24 filtered in place
+ * \param block_up_left  non-zero if the above-left sample may be used
+ * \param block_up       non-zero if the row above may be used
+ * \param block_left     non-zero if the left column may be used
+ *************************************************************************************
+ */
+static __forceinline void LowPassForIntra8x8PredVer(imgpel *PredPel, int block_up_left, int block_up, int block_left)
+{
+  imgpel src[25];   // unfiltered copy; only entries 0, 1 and 17..24 are filled in
+  int k;
+
+  src[0] = PredPel[0];
+  src[1] = PredPel[1];
+  for (k = 17; k < 25; k++)
+  {
+    src[k] = PredPel[k];
+  }
+
+  // Above-left corner: left as it is unless the corner and at least one of
+  // the two sides are available.
+  if (block_up_left)
+  {
+    if (block_up && block_left)
+      PredPel[0] = (imgpel) ((src[17] + (src[0]<<1) + src[1] + 2)>>2);
+    else if (block_up)
+      PredPel[0] = (imgpel) ((src[0] + (src[0]<<1) + src[1] + 2)>>2);
+    else if (block_left)
+      PredPel[0] = (imgpel) ((src[0] + (src[0]<<1) + src[17] + 2)>>2);
+  }
+
+  // Left column, samples 17..24.
+  if (block_left)
+  {
+    // sample 17 leans on the corner only when the corner is available,
+    // otherwise it is counted in place of the corner
+    const imgpel prev = block_up_left ? src[0] : src[17];
+
+    PredPel[17] = (imgpel) ((prev + (src[17]<<1) + src[18] + 2)>>2);
+    for (k = 18; k < 24; k++)
+    {
+      PredPel[k] = (imgpel) ((src[k-1] + (src[k]<<1) + src[k+1] + 2)>>2);
+    }
+    // sample 24 ends the column, so it is counted in place of a lower neighbour
+    PredPel[24] = (imgpel) ((src[23] + (src[24]<<1) + src[24] + 2) >> 2);
+  }
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 8x8 DC prediction: fills the 8x8 block at (ioff, joff) of
+ *    currSlice->mb_pred[pl] with one value derived from the filtered neighbours
+ * \return
+ *    DECODING_OK   always                                                     \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_dc_pred(Macroblock *currMB,    //!< current macroblock
+                                   ColorPlane pl,         //!< current image plane
+                                   int ioff,              //!< pixel offset X within MB
+                                   int joff)              //!< pixel offset Y within MB
+{
+  int s0 = 0;          // the DC value that is written to the whole block
+  imgpel PredPel[25];  // array of predictor pels
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  StorablePicture *dec_picture = p_Vid->dec_picture;
+  imgpel **imgY = (pl) ? dec_picture->imgUV[pl - 1]->img : dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+
+  PixelPos pix_a[8];             // left neighbours, one entry per row of the block
+  PixelPos pix_b, pix_c, pix_d;  // neighbours above, above right and above left
+
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+  
+  imgpel *pred_pels;
+
+	if (ioff == 0) // ioff is either 0 or 8 (see the else branch); each case uses its own set of lookup functions
+	{
+			p_Vid->getNeighbourNPLumaNB(currMB, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourNPLumaNB(currMB, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourNPLumaNB(currMB, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourNPLumaNB(currMB, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourNPLumaNB(currMB, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourNPLumaNB(currMB, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourNPLumaNB(currMB, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourNPLumaNB(currMB,  joff + 7,  &pix_a[7]);
+
+	p_Vid->getNeighbour0XLuma(currMB, joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLumaNB(currMB, 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourNXLuma(currMB, joff - 1,  &pix_d);
+	}
+	else
+	{ // ioff == 8
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourPPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff - 1, joff - 1,  &pix_d);
+	}
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8); // above right is never used for the block at ioff == 8, joff == 8
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag) // a neighbour then counts only if intra_block[] is set for its macroblock
+  {
+		int i;
+    for (i=0, block_available_left=1; i<8;i++) // left side: all eight rows have to qualify
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available; // only the first row is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+
+	// form predictor pels: 1..8 above, 9..16 above right, 17..24 left, 0 above left
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  { // no above-right block: repeat the last pel of the row above
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+
+  if (block_available_left)
+  { // every row takes its own pos_y but the pos_x of pix_a[0]
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[0].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[0].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[0].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[0].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[0].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[0].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[0].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left); // smooths PredPel[] in place
+  
+  if (block_available_up && block_available_left)
+  {
+    // no edge: rounded mean of the 8 pels above and the 8 pels to the left
+    s0 = (PredPel[1] + PredPel[2] + PredPel[3] + PredPel[4] + PredPel[5] + PredPel[6] + PredPel[7] + PredPel[8] + PredPel[17] + PredPel[18] + PredPel[19] + PredPel[20] + PredPel[21] + PredPel[22] + PredPel[23] + PredPel[24] + 8) >> 4;
+  }
+  else if (!block_available_up && block_available_left)
+  {
+    // upper edge: rounded mean of the 8 pels to the left
+    s0 = (PredPel[17] + PredPel[18] + PredPel[19] + PredPel[20] + PredPel[21] + PredPel[22] + PredPel[23] + PredPel[24] + 4) >> 3;
+  }
+  else if (block_available_up && !block_available_left)
+  {
+    // left edge: rounded mean of the 8 pels above
+    s0 = (PredPel[1] + PredPel[2] + PredPel[3] + PredPel[4] + PredPel[5] + PredPel[6] + PredPel[7] + PredPel[8] + 4) >> 3;
+  }
+  else //if (!block_available_up && !block_available_left)
+  {
+    // top left corner, nothing to predict from
+    s0 = p_Vid->dc_pred_value_comp[pl];
+  }
+
+	memset_8x8(&currSlice->mb_pred[pl][joff], ioff, s0); // replicate s0 over the whole 8x8 block
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 8x8 vertical prediction: copies the filtered row above the block
+ *    into all eight rows of the block at (ioff, joff) of currSlice->mb_pred[pl]
+ * \return
+ *    DECODING_OK   always; a missing upper neighbour is only reported via printf \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_vert_pred(Macroblock *currMB,    //!< current macroblock
+                                     ColorPlane pl,         //!< current image plane
+                                     int ioff,              //!< pixel offset X within MB
+                                     int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+  int i;
+  imgpel PredPel[25] = {0};  // array of predictor pels; zeroed because entries 17..24 are never loaded here, yet LowPassForIntra8x8PredHor may read PredPel[17]
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+
+  PixelPos pix_a[8];             // left neighbours; only .available and .mb_addr are read below
+  PixelPos pix_b, pix_c, pix_d;  // neighbours above, above right and above left
+
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+
+  
+  imgpel *pred_pels;
+
+    p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+		p_Vid->getNeighbourXPLumaNB_NoPos(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+
+  p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1, &pix_b);
+  p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1, &pix_c);
+  p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1, &pix_d);
+
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8); // above right is never used for the block at ioff == 8, joff == 8
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag) // a neighbour then counts only if intra_block[] is set for its macroblock
+  {
+    for (i=0, block_available_left=1; i<8;i++) // left side: all eight rows have to qualify
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available; // only the first row is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+
+  if (!block_available_up) // reported only; prediction continues with the default DC value below
+    printf ("warning: Intra_8x8_Vertical prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels: 1..8 above, 9..16 above right, 0 above left
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = *(pred_pels ++);
+    PredPel[2] = *(pred_pels ++);
+    PredPel[3] = *(pred_pels ++);
+    PredPel[4] = *(pred_pels ++);
+    PredPel[5] = *(pred_pels ++);
+    PredPel[6] = *(pred_pels ++);
+    PredPel[7] = *(pred_pels ++);
+    PredPel[8] = *pred_pels;
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = *(pred_pels ++);
+    PredPel[10] = *(pred_pels ++);
+    PredPel[11] = *(pred_pels ++);
+    PredPel[12] = *(pred_pels ++);
+    PredPel[13] = *(pred_pels ++);
+    PredPel[14] = *(pred_pels ++);
+    PredPel[15] = *(pred_pels ++);
+    PredPel[16] = *pred_pels;
+  }
+  else
+  { // no above-right block: repeat the last pel of the row above
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  LowPassForIntra8x8PredHor(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left); // smooths PredPel[0..16] in place
+  
+	memset_8x8_row(&currSlice->mb_pred[pl][joff], ioff, &PredPel[1]); // PredPel[1..8] becomes every row of the block
+
+	return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 8x8 horizontal prediction: each row of the block at (ioff, joff) of
+ *    currSlice->mb_pred[pl] is filled with the filtered pel to its left
+ * \return
+ *    DECODING_OK   always; a missing left neighbour is only reported via printf \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_hor_pred(Macroblock *currMB,    //!< current macroblock
+                                    ColorPlane pl,         //!< current image plane
+                                    int ioff,              //!< pixel offset X within MB
+                                    int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+
+  int i,j;
+  imgpel PredPel[25] = {0};  // array of predictor pels; zeroed because entries 1..16 are never loaded here, yet LowPassForIntra8x8PredVer reads PredPel[1]
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+
+  PixelPos pix_a[8];             // left neighbours, one entry per row of the block
+  PixelPos pix_b, pix_c, pix_d;  // neighbours above, above right and above left
+
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;  // computed below but not used afterwards in this function
+  int jpos;  
+
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8); // above right is never used for the block at ioff == 8, joff == 8
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag) // a neighbour then counts only if intra_block[] is set for its macroblock
+  {
+    for (i=0, block_available_left=1; i<8;i++) // left side: all eight rows have to qualify
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available; // only the first row is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+
+  if (!block_available_left) // reported only; prediction continues with the default DC value below
+    printf ("warning: Intra_8x8_Horizontal prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels: 17..24 left, 0 above left
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  LowPassForIntra8x8PredVer(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left); // smooths PredPel[0] and PredPel[17..24] in place
+
+  for (j=0; j < BLOCK_SIZE_8x8; j++) // row j of the block is filled with PredPel[17 + j]
+  {
+    jpos = j + joff;
+    currSlice->mb_pred[pl][jpos][ioff]  =
+      currSlice->mb_pred[pl][jpos][ioff+1]  =
+      currSlice->mb_pred[pl][jpos][ioff+2]  =
+      currSlice->mb_pred[pl][jpos][ioff+3]  =
+      currSlice->mb_pred[pl][jpos][ioff+4]  =
+      currSlice->mb_pred[pl][jpos][ioff+5]  =
+      currSlice->mb_pred[pl][jpos][ioff+6]  =
+      currSlice->mb_pred[pl][jpos][ioff+7]  = (imgpel) (&PredPel[17])[j];
+  }
+ 
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Intra 8x8 diagonal down right prediction for the block at (ioff, joff) of
+ *    currSlice->mb_pred[pl]; pels on one down-right diagonal share one value
+ * \return
+ *    DECODING_OK   always; missing neighbours are only reported via printf    \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_diag_down_right_pred(Macroblock *currMB,    //!< current macroblock
+                                                ColorPlane pl,         //!< current image plane
+                                                int ioff,              //!< pixel offset X within MB
+                                                int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+
+  int i;
+  imgpel PredPel[25];  // array of predictor pels
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+
+  PixelPos pix_a[8];             // left neighbours, one entry per row of the block
+  PixelPos pix_b, pix_c, pix_d;  // neighbours above, above right and above left
+h264_imgpel_macroblock_row_t *pred; // first row of the block inside mb_pred[pl]
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+
+  imgpel *pred_pels;
+
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8); // above right is never used for the block at ioff == 8, joff == 8
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag) // a neighbour then counts only if intra_block[] is set for its macroblock
+  {
+    for (i=0, block_available_left=1; i<8;i++) // left side: all eight rows have to qualify
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available; // only the first row is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+
+  if ((!block_available_up)||(!block_available_left)||(!block_available_up_left)) // reported only; prediction continues with default values
+    printf ("warning: Intra_8x8_Diagonal_Down_Right prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels: 1..8 above, 9..16 above right, 17..24 left, 0 above left
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  { // no above-right block: repeat the last pel of the row above
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left); // smooths PredPel[] in place
+
+  // Mode DIAG_DOWN_RIGHT_PRED: 15 diagonals, bottom-left pel first, top-right pel last; each value is a 3-tap (a + 2*b + c + 2) >> 2 of PredPel[]
+	pred = &currSlice->mb_pred[pl][joff];
+  pred[0+7][ioff+0] = (imgpel) ((PredPel[24] + PredPel[22] + 2*(PredPel[23]) + 2) >> 2); // diagonals below the main one use the left column
+  pred[0+6][ioff+0] =
+    pred[0+7][ioff+1] = (imgpel) ((PredPel[23] + PredPel[21] + 2*(PredPel[22]) + 2) >> 2);
+  pred[0+5][ioff+0] =
+    pred[0+6][ioff+1] =
+    pred[0+7][ioff+2] = (imgpel) ((PredPel[22] + PredPel[20] + 2*(PredPel[21]) + 2) >> 2);
+  pred[0+4][ioff+0] =
+    pred[0+5][ioff+1] =
+    pred[0+6][ioff+2] =
+    pred[0+7][ioff+3] = (imgpel) ((PredPel[21] + PredPel[19] + 2*(PredPel[20]) + 2) >> 2);
+  pred[0+3][ioff+0] =
+    pred[0+4][ioff+1] =
+    pred[0+5][ioff+2] =
+    pred[0+6][ioff+3] =
+    pred[0+7][ioff+4] = (imgpel) ((PredPel[20] + PredPel[18] + 2*(PredPel[19]) + 2) >> 2);
+  pred[0+2][ioff+0] =
+    pred[0+3][ioff+1] =
+    pred[0+4][ioff+2] =
+    pred[0+5][ioff+3] =
+    pred[0+6][ioff+4] =
+    pred[0+7][ioff+5] = (imgpel) ((PredPel[19] + PredPel[17] + 2*(PredPel[18]) + 2) >> 2);
+  pred[0+1][ioff+0] =
+    pred[0+2][ioff+1] =
+    pred[0+3][ioff+2] =
+    pred[0+4][ioff+3] =
+    pred[0+5][ioff+4] =
+    pred[0+6][ioff+5] =
+    pred[0+7][ioff+6] = (imgpel) ((PredPel[18] + PredPel[0] + 2*(PredPel[17]) + 2) >> 2);
+  pred[0+0][ioff+0] =
+    pred[0+1][ioff+1] =
+    pred[0+2][ioff+2] =
+    pred[0+3][ioff+3] =
+    pred[0+4][ioff+4] =
+    pred[0+5][ioff+5] =
+    pred[0+6][ioff+6] =
+    pred[0+7][ioff+7] = (imgpel) ((PredPel[17] + PredPel[1] + 2*(PredPel[0]) + 2) >> 2); // main diagonal: centred on the above-left pel
+  pred[0+0][ioff+1] =
+    pred[0+1][ioff+2] =
+    pred[0+2][ioff+3] =
+    pred[0+3][ioff+4] =
+    pred[0+4][ioff+5] =
+    pred[0+5][ioff+6] =
+    pred[0+6][ioff+7] = (imgpel) ((PredPel[0] + PredPel[2] + 2*(PredPel[1]) + 2) >> 2); // diagonals above the main one use the row above
+  pred[0+0][ioff+2] =
+    pred[0+1][ioff+3] =
+    pred[0+2][ioff+4] =
+    pred[0+3][ioff+5] =
+    pred[0+4][ioff+6] =
+    pred[0+5][ioff+7] = (imgpel) ((PredPel[1] + PredPel[3] + 2*(PredPel[2]) + 2) >> 2);
+  pred[0+0][ioff+3] =
+    pred[0+1][ioff+4] =
+    pred[0+2][ioff+5] =
+    pred[0+3][ioff+6] =
+    pred[0+4][ioff+7] = (imgpel) ((PredPel[2] + PredPel[4] + 2*(PredPel[3]) + 2) >> 2);
+  pred[0+0][ioff+4] =
+    pred[0+1][ioff+5] =
+    pred[0+2][ioff+6] =
+    pred[0+3][ioff+7] = (imgpel) ((PredPel[3] + PredPel[5] + 2*(PredPel[4]) + 2) >> 2);
+  pred[0+0][ioff+5] =
+    pred[0+1][ioff+6] =
+    pred[0+2][ioff+7] = (imgpel) ((PredPel[4] + PredPel[6] + 2*(PredPel[5]) + 2) >> 2);
+  pred[0+0][ioff+6] =
+    pred[0+1][ioff+7] = (imgpel) ((PredPel[5] + PredPel[7] + 2*(PredPel[6]) + 2) >> 2);
+  pred[0+0][ioff+7] = (imgpel) ((PredPel[6] + PredPel[8] + 2*(PredPel[7]) + 2) >> 2);
+ 
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Builds the 8x8 diagonal down left intra prediction for one block and
+ *    stores it in currSlice->mb_pred[pl], rows joff..joff+7, columns ioff..ioff+7
+ * \return
+ *    DECODING_OK   always returned; a missing upper neighbour only prints a warning   \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_diag_down_left_pred(Macroblock *currMB,    //!< current macroblock
+                                               ColorPlane pl,         //!< current image plane
+                                               int ioff,              //!< pixel offset X within MB
+                                               int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  h264_imgpel_macroblock_row_t *pred;  // prediction buffer rows, starting at row joff
+  int i;
+  imgpel PredPel[25];  // array of predictor pels: [0] up-left, [1..8] up, [9..16] up-right, [17..24] left
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+  // positions of the neighbouring samples: pix_a = left column, pix_b = up, pix_c = up-right, pix_d = up-left
+  PixelPos pix_a[8];
+  PixelPos pix_b, pix_c, pix_d;
+  // availability of each neighbour group, after the optional constrained-intra check below
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+
+  // cursor into one row of the picture plane selected by imgY
+  imgpel *pred_pels;
+  // left neighbours: x = ioff - 1, y = joff .. joff + 7
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+  // up (x = ioff), up-right (x = ioff + 8) and up-left (x = ioff - 1) neighbours, all on row joff - 1
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+  // up-right is never used for the block at offset (8,8); NOTE(review): presumably not decoded yet at that point -- confirm
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8);
+  // with constrained intra prediction a neighbour also needs a non-zero intra_block[] entry for its macroblock
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0, block_available_left=1; i<8;i++)  // left counts only if all eight rows pass
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available;  // only the first left pixel is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+  // the table below reads the upper samples; if they are missing only a warning is printed and decoding continues
+  if (!block_available_up)
+    printf ("warning: Intra_8x8_Diagonal_Down_Left prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-right row; when unavailable the last up sample PredPel[8] is repeated
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  {
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+  // left column, one sample per row; dc_pred_value_comp[pl] when unavailable
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-left corner sample
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // NOTE(review): LowPassForIntra8x8Pred is defined outside this block; presumably it filters PredPel in place -- confirm
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left);
+
+  // Mode DIAG_DOWN_LEFT_PRED: cells with equal x + y share one 3-tap value centred on PredPel[x + y + 2]
+	pred = &currSlice->mb_pred[pl][joff];
+  pred[0+0][ioff+0] = (imgpel) ((PredPel[1] + PredPel[3] + 2*(PredPel[2]) + 2) >> 2);
+  pred[0+1][ioff+0] =
+    pred[0+0][ioff+1] = (imgpel) ((PredPel[2] + PredPel[4] + 2*(PredPel[3]) + 2) >> 2);
+  pred[0+2][ioff+0] =
+    pred[0+1][ioff+1] =
+    pred[0+0][ioff+2] = (imgpel) ((PredPel[3] + PredPel[5] + 2*(PredPel[4]) + 2) >> 2);
+  pred[0+3][ioff+0] =
+    pred[0+2][ioff+1] =
+    pred[0+1][ioff+2] =
+    pred[0+0][ioff+3] = (imgpel) ((PredPel[4] + PredPel[6] + 2*(PredPel[5]) + 2) >> 2);
+  pred[0+4][ioff+0] =
+    pred[0+3][ioff+1] =
+    pred[0+2][ioff+2] =
+    pred[0+1][ioff+3] =
+    pred[0+0][ioff+4] = (imgpel) ((PredPel[5] + PredPel[7] + 2*(PredPel[6]) + 2) >> 2);
+  pred[0+5][ioff+0] =
+    pred[0+4][ioff+1] =
+    pred[0+3][ioff+2] =
+    pred[0+2][ioff+3] =
+    pred[0+1][ioff+4] =
+    pred[0+0][ioff+5] = (imgpel) ((PredPel[6] + PredPel[8] + 2*(PredPel[7]) + 2) >> 2);
+  pred[0+6][ioff+0] =
+    pred[0+5][ioff+1] =
+    pred[0+4][ioff+2] =
+    pred[0+3][ioff+3] =
+    pred[0+2][ioff+4] =
+    pred[0+1][ioff+5] =
+    pred[0+0][ioff+6] = (imgpel) ((PredPel[7] + PredPel[9] + 2*(PredPel[8]) + 2) >> 2);
+  pred[0+7][ioff+0] =
+    pred[0+6][ioff+1] =
+    pred[0+5][ioff+2] =
+    pred[0+4][ioff+3] =
+    pred[0+3][ioff+4] =
+    pred[0+2][ioff+5] =
+    pred[0+1][ioff+6] =
+    pred[0+0][ioff+7] = (imgpel) ((PredPel[8] + PredPel[10] + 2*(PredPel[9]) + 2) >> 2);
+  pred[0+7][ioff+1] =
+    pred[0+6][ioff+2] =
+    pred[0+5][ioff+3] =
+    pred[0+4][ioff+4] =
+    pred[0+3][ioff+5] =
+    pred[0+2][ioff+6] =
+    pred[0+1][ioff+7] = (imgpel) ((PredPel[9] + PredPel[11] + 2*(PredPel[10]) + 2) >> 2);
+  pred[0+7][ioff+2] =
+    pred[0+6][ioff+3] =
+    pred[0+5][ioff+4] =
+    pred[0+4][ioff+5] =
+    pred[0+3][ioff+6] =
+    pred[0+2][ioff+7] = (imgpel) ((PredPel[10] + PredPel[12] + 2*(PredPel[11]) + 2) >> 2);
+  pred[0+7][ioff+3] =
+    pred[0+6][ioff+4] =
+    pred[0+5][ioff+5] =
+    pred[0+4][ioff+6] =
+    pred[0+3][ioff+7] = (imgpel) ((PredPel[11] + PredPel[13] + 2*(PredPel[12]) + 2) >> 2);
+  pred[0+7][ioff+4] =
+    pred[0+6][ioff+5] =
+    pred[0+5][ioff+6] =
+    pred[0+4][ioff+7] = (imgpel) ((PredPel[12] + PredPel[14] + 2*(PredPel[13]) + 2) >> 2);
+  pred[0+7][ioff+5] =
+    pred[0+6][ioff+6] =
+    pred[0+5][ioff+7] = (imgpel) ((PredPel[13] + PredPel[15] + 2*(PredPel[14]) + 2) >> 2);
+  pred[0+7][ioff+6] =
+    pred[0+6][ioff+7] = (imgpel) ((PredPel[14] + PredPel[16] + 2*(PredPel[15]) + 2) >> 2);
+  pred[0+7][ioff+7] = (imgpel) ((PredPel[15] + 3*(PredPel[16]) + 2) >> 2);  // bottom-right cell: PredPel[16], the last up-right sample, is weighted 3x
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Builds the 8x8 vertical right intra prediction for one block and
+ *    stores it in currSlice->mb_pred[pl], rows joff..joff+7, columns ioff..ioff+7
+ * \return
+ *    DECODING_OK   always returned; missing neighbours only print a warning   \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_vert_right_pred(Macroblock *currMB,    //!< current macroblock
+                                           ColorPlane pl,         //!< current image plane
+                                           int ioff,              //!< pixel offset X within MB
+                                           int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  h264_imgpel_macroblock_row_t *pred;  // prediction buffer rows, starting at row joff
+  int i;
+  imgpel PredPel[25];  // array of predictor pels: [0] up-left, [1..8] up, [9..16] up-right, [17..24] left
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+  // positions of the neighbouring samples: pix_a = left column, pix_b = up, pix_c = up-right, pix_d = up-left
+  PixelPos pix_a[8];
+  PixelPos pix_b, pix_c, pix_d;
+  // availability of each neighbour group, after the optional constrained-intra check below
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+  imgpel *pred_pels;  // cursor into one row of the picture plane selected by imgY
+  // left neighbours: x = ioff - 1, y = joff .. joff + 7
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+  // up (x = ioff), up-right (x = ioff + 8) and up-left (x = ioff - 1) neighbours, all on row joff - 1
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+  // up-right is never used for the block at offset (8,8); NOTE(review): presumably not decoded yet at that point -- confirm
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8);
+  // with constrained intra prediction a neighbour also needs a non-zero intra_block[] entry for its macroblock
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0, block_available_left=1; i<8;i++)  // left counts only if all eight rows pass
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available;  // only the first left pixel is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+  // the table below reads up, left and up-left samples; if any are missing only a warning is printed and decoding continues
+  if ((!block_available_up)||(!block_available_left)||(!block_available_up_left))
+    printf ("warning: Intra_8x8_Vertical_Right prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-right row; when unavailable the last up sample PredPel[8] is repeated
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  {
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+  // left column, one sample per row; dc_pred_value_comp[pl] when unavailable
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-left corner sample
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // NOTE(review): LowPassForIntra8x8Pred is defined outside this block; presumably it filters PredPel in place -- confirm
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left);
+pred = &currSlice->mb_pred[pl][joff];  // cells two rows down and one column right of each other share a value
+  pred[0+0][ioff+0] =  // row 0 and its shifted copies: 2-tap average of PredPel[x] and PredPel[x + 1]
+    pred[0+2][ioff+1] =
+    pred[0+4][ioff+2] =
+    pred[0+6][ioff+3] = (imgpel) ((PredPel[0] + PredPel[1] + 1) >> 1);
+  pred[0+0][ioff+1] =
+    pred[0+2][ioff+2] =
+    pred[0+4][ioff+3] =
+    pred[0+6][ioff+4] = (imgpel) ((PredPel[1] + PredPel[2] + 1) >> 1);
+  pred[0+0][ioff+2] =
+    pred[0+2][ioff+3] =
+    pred[0+4][ioff+4] =
+    pred[0+6][ioff+5] = (imgpel) ((PredPel[2] + PredPel[3] + 1) >> 1);
+  pred[0+0][ioff+3] =
+    pred[0+2][ioff+4] =
+    pred[0+4][ioff+5] =
+    pred[0+6][ioff+6] = (imgpel) ((PredPel[3] + PredPel[4] + 1) >> 1);
+  pred[0+0][ioff+4] =
+    pred[0+2][ioff+5] =
+    pred[0+4][ioff+6] =
+    pred[0+6][ioff+7] = (imgpel) ((PredPel[4] + PredPel[5] + 1) >> 1);
+  pred[0+0][ioff+5] =
+    pred[0+2][ioff+6] =
+    pred[0+4][ioff+7] = (imgpel) ((PredPel[5] + PredPel[6] + 1) >> 1);
+  pred[0+0][ioff+6] =
+    pred[0+2][ioff+7] = (imgpel) ((PredPel[6] + PredPel[7] + 1) >> 1);
+  pred[0+0][ioff+7] = (imgpel) ((PredPel[7] + PredPel[8] + 1) >> 1);
+  pred[0+1][ioff+0] =  // row 1 and its shifted copies: 3-tap value centred on PredPel[x]; x = 0 also uses left sample PredPel[17]
+    pred[0+3][ioff+1] =
+    pred[0+5][ioff+2] =
+    pred[0+7][ioff+3] = (imgpel) ((PredPel[17] + PredPel[1] + 2*PredPel[0] + 2) >> 2);
+  pred[0+1][ioff+1] =
+    pred[0+3][ioff+2] =
+    pred[0+5][ioff+3] =
+    pred[0+7][ioff+4] = (imgpel) ((PredPel[0] + PredPel[2] + 2*PredPel[1] + 2) >> 2);
+  pred[0+1][ioff+2] =
+    pred[0+3][ioff+3] =
+    pred[0+5][ioff+4] =
+    pred[0+7][ioff+5] = (imgpel) ((PredPel[1] + PredPel[3] + 2*PredPel[2] + 2) >> 2);
+  pred[0+1][ioff+3] =
+    pred[0+3][ioff+4] =
+    pred[0+5][ioff+5] =
+    pred[0+7][ioff+6] = (imgpel) ((PredPel[2] + PredPel[4] + 2*PredPel[3] + 2) >> 2);
+  pred[0+1][ioff+4] =
+    pred[0+3][ioff+5] =
+    pred[0+5][ioff+6] =
+    pred[0+7][ioff+7] = (imgpel) ((PredPel[3] + PredPel[5] + 2*PredPel[4] + 2) >> 2);
+  pred[0+1][ioff+5] =
+    pred[0+3][ioff+6] =
+    pred[0+5][ioff+7] = (imgpel) ((PredPel[4] + PredPel[6] + 2*PredPel[5] + 2) >> 2);
+  pred[0+1][ioff+6] =
+    pred[0+3][ioff+7] = (imgpel) ((PredPel[5] + PredPel[7] + 2*PredPel[6] + 2) >> 2);
+  pred[0+1][ioff+7] = (imgpel) ((PredPel[6] + PredPel[8] + 2*PredPel[7] + 2) >> 2);
+  pred[0+2][ioff+0] =  // remaining lower-left cells: 3-tap values taken down the left column PredPel[17..23]
+    pred[0+4][ioff+1] =
+    pred[0+6][ioff+2] = (imgpel) ((PredPel[18] + PredPel[0] + 2*PredPel[17] + 2) >> 2);
+  pred[0+3][ioff+0] =
+    pred[0+5][ioff+1] =
+    pred[0+7][ioff+2] = (imgpel) ((PredPel[19] + PredPel[17] + 2*PredPel[18] + 2) >> 2);
+  pred[0+4][ioff+0] =
+    pred[0+6][ioff+1] = (imgpel) ((PredPel[20] + PredPel[18] + 2*PredPel[19] + 2) >> 2);
+  pred[0+5][ioff+0] =
+    pred[0+7][ioff+1] = (imgpel) ((PredPel[21] + PredPel[19] + 2*PredPel[20] + 2) >> 2);
+  pred[0+6][ioff+0] = (imgpel) ((PredPel[22] + PredPel[20] + 2*PredPel[21] + 2) >> 2);
+  pred[0+7][ioff+0] = (imgpel) ((PredPel[23] + PredPel[21] + 2*PredPel[22] + 2) >> 2);
+
+  return DECODING_OK;
+}
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Builds the 8x8 vertical left intra prediction for one block and
+ *    stores it in currSlice->mb_pred[pl], rows joff..joff+7, columns ioff..ioff+7
+ * \return
+ *    DECODING_OK   always returned; a missing upper neighbour only prints a warning   \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_vert_left_pred(Macroblock *currMB,    //!< current macroblock
+                                          ColorPlane pl,         //!< current image plane
+                                          int ioff,              //!< pixel offset X within MB
+                                          int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  int i;
+  imgpel PredPel[25];  // array of predictor pels: [0] up-left, [1..8] up, [9..16] up-right, [17..24] left
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+  // positions of the neighbouring samples: pix_a = left column, pix_b = up, pix_c = up-right, pix_d = up-left
+  PixelPos pix_a[8];
+  PixelPos pix_b, pix_c, pix_d;
+  h264_imgpel_macroblock_row_t *pred;  // prediction buffer rows, starting at row joff
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+
+  imgpel *pred_pels;  // cursor into one row of the picture plane selected by imgY
+  // left neighbours: x = ioff - 1, y = joff .. joff + 7
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+  // up (x = ioff), up-right (x = ioff + 8) and up-left (x = ioff - 1) neighbours, all on row joff - 1
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+  // up-right is never used for the block at offset (8,8); NOTE(review): presumably not decoded yet at that point -- confirm
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8);
+  // with constrained intra prediction a neighbour also needs a non-zero intra_block[] entry for its macroblock
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0, block_available_left=1; i<8;i++)  // left counts only if all eight rows pass
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available;  // only the first left pixel is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+  // the table below reads the upper samples; if they are missing only a warning is printed and decoding continues
+  if (!block_available_up)
+    printf ("warning: Intra_8x8_Vertical_Left prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-right row; when unavailable the last up sample PredPel[8] is repeated
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  {
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+  // left column, one sample per row; dc_pred_value_comp[pl] when unavailable
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-left corner sample
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // NOTE(review): LowPassForIntra8x8Pred is defined outside this block; presumably it filters PredPel in place -- confirm
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left);
+  pred = &currSlice->mb_pred[pl][joff];  // cells two rows down and one column left of each other share a value
+  pred[0+0][ioff+0] = (imgpel) ((PredPel[1] + PredPel[2] + 1) >> 1);  // even rows: 2-tap average of PredPel[x + y/2 + 1] and its right neighbour
+  pred[0+0][ioff+1] =
+    pred[0+2][ioff+0] = (imgpel) ((PredPel[2] + PredPel[3] + 1) >> 1);
+  pred[0+0][ioff+2] =
+    pred[0+2][ioff+1] =
+    pred[0+4][ioff+0] = (imgpel) ((PredPel[3] + PredPel[4] + 1) >> 1);
+  pred[0+0][ioff+3] =
+    pred[0+2][ioff+2] =
+    pred[0+4][ioff+1] =
+    pred[0+6][ioff+0] = (imgpel) ((PredPel[4] + PredPel[5] + 1) >> 1);
+  pred[0+0][ioff+4] =
+    pred[0+2][ioff+3] =
+    pred[0+4][ioff+2] =
+    pred[0+6][ioff+1] = (imgpel) ((PredPel[5] + PredPel[6] + 1) >> 1);
+  pred[0+0][ioff+5] =
+    pred[0+2][ioff+4] =
+    pred[0+4][ioff+3] =
+    pred[0+6][ioff+2] = (imgpel) ((PredPel[6] + PredPel[7] + 1) >> 1);
+  pred[0+0][ioff+6] =
+    pred[0+2][ioff+5] =
+    pred[0+4][ioff+4] =
+    pred[0+6][ioff+3] = (imgpel) ((PredPel[7] + PredPel[8] + 1) >> 1);
+  pred[0+0][ioff+7] =
+    pred[0+2][ioff+6] =
+    pred[0+4][ioff+5] =
+    pred[0+6][ioff+4] = (imgpel) ((PredPel[8] + PredPel[9] + 1) >> 1);
+  pred[0+2][ioff+7] =
+    pred[0+4][ioff+6] =
+    pred[0+6][ioff+5] = (imgpel) ((PredPel[9] + PredPel[10] + 1) >> 1);
+  pred[0+4][ioff+7] =
+    pred[0+6][ioff+6] = (imgpel) ((PredPel[10] + PredPel[11] + 1) >> 1);
+  pred[0+6][ioff+7] = (imgpel) ((PredPel[11] + PredPel[12] + 1) >> 1);
+  pred[0+1][ioff+0] = (imgpel) ((PredPel[1] + PredPel[3] + 2*PredPel[2] + 2) >> 2);  // odd rows: 3-tap value centred on PredPel[x + (y >> 1) + 2]
+  pred[0+1][ioff+1] =
+    pred[0+3][ioff+0] = (imgpel) ((PredPel[2] + PredPel[4] + 2*PredPel[3] + 2) >> 2);
+  pred[0+1][ioff+2] =
+    pred[0+3][ioff+1] =
+    pred[0+5][ioff+0] = (imgpel) ((PredPel[3] + PredPel[5] + 2*PredPel[4] + 2) >> 2);
+  pred[0+1][ioff+3] =
+    pred[0+3][ioff+2] =
+    pred[0+5][ioff+1] =
+    pred[0+7][ioff+0] = (imgpel) ((PredPel[4] + PredPel[6] + 2*PredPel[5] + 2) >> 2);
+  pred[0+1][ioff+4] =
+    pred[0+3][ioff+3] =
+    pred[0+5][ioff+2] =
+    pred[0+7][ioff+1] = (imgpel) ((PredPel[5] + PredPel[7] + 2*PredPel[6] + 2) >> 2);
+  pred[0+1][ioff+5] =
+    pred[0+3][ioff+4] =
+    pred[0+5][ioff+3] =
+    pred[0+7][ioff+2] = (imgpel) ((PredPel[6] + PredPel[8] + 2*PredPel[7] + 2) >> 2);
+  pred[0+1][ioff+6] =
+    pred[0+3][ioff+5] =
+    pred[0+5][ioff+4] =
+    pred[0+7][ioff+3] = (imgpel) ((PredPel[7] + PredPel[9] + 2*PredPel[8] + 2) >> 2);
+  pred[0+1][ioff+7] =
+    pred[0+3][ioff+6] =
+    pred[0+5][ioff+5] =
+    pred[0+7][ioff+4] = (imgpel) ((PredPel[8] + PredPel[10] + 2*PredPel[9] + 2) >> 2);
+  pred[0+3][ioff+7] =
+    pred[0+5][ioff+6] =
+    pred[0+7][ioff+5] = (imgpel) ((PredPel[9] + PredPel[11] + 2*PredPel[10] + 2) >> 2);
+  pred[0+5][ioff+7] =
+    pred[0+7][ioff+6] = (imgpel) ((PredPel[10] + PredPel[12] + 2*PredPel[11] + 2) >> 2);
+  pred[0+7][ioff+7] = (imgpel) ((PredPel[11] + PredPel[13] + 2*PredPel[12] + 2) >> 2);
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Builds the 8x8 horizontal up intra prediction for one block and
+ *    stores it in currSlice->mb_pred[pl], rows joff..joff+7, columns ioff..ioff+7
+ * \return
+ *    DECODING_OK   always returned; a missing left neighbour only prints a warning   \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_hor_up_pred(Macroblock *currMB,    //!< current macroblock
+                                       ColorPlane pl,         //!< current image plane
+                                       int ioff,              //!< pixel offset X within MB
+                                       int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+  
+  int i;
+  imgpel PredPel[25];  // array of predictor pels: [0] up-left, [1..8] up, [9..16] up-right, [17..24] left
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // For MB level frame/field coding tools -- set default to imgY
+h264_imgpel_macroblock_row_t *pred;  // prediction buffer rows, starting at row joff
+  PixelPos pix_a[8];  // left column neighbours, one per row
+  PixelPos pix_b, pix_c, pix_d;  // up, up-right and up-left neighbours
+  // availability of each neighbour group, after the optional constrained-intra check below
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+  
+  imgpel *pred_pels;  // cursor into one row of the picture plane selected by imgY
+  
+  // left neighbours: x = ioff - 1, y = joff .. joff + 7
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+  // up (x = ioff), up-right (x = ioff + 8) and up-left (x = ioff - 1) neighbours, all on row joff - 1
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+  // up-right is never used for the block at offset (8,8); NOTE(review): presumably not decoded yet at that point -- confirm
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8);
+  // with constrained intra prediction a neighbour also needs a non-zero intra_block[] entry for its macroblock
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    for (i=0, block_available_left=1; i<8;i++)  // left counts only if all eight rows pass
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available;  // only the first left pixel is consulted here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+  // the table below reads the left samples; if they are missing only a warning is printed and decoding continues
+  if (!block_available_left)
+    printf ("warning: Intra_8x8_Horizontal_Up prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-right row; when unavailable the last up sample PredPel[8] is repeated
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  {
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+  // left column, one sample per row; dc_pred_value_comp[pl] when unavailable
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // up-left corner sample
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+  // NOTE(review): LowPassForIntra8x8Pred is defined outside this block; presumably it filters PredPel in place -- confirm
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left);
+pred = &currSlice->mb_pred[pl][joff];  // cells with equal x + 2*y share a value; only left samples PredPel[17..24] are read
+  pred[0+0][ioff+0] = (imgpel) ((PredPel[17] + PredPel[18] + 1) >> 1);  // even x + 2*y up to 12: 2-tap average of two adjacent left samples
+  pred[0+1][ioff+0] =
+    pred[0+0][ioff+2] = (imgpel) ((PredPel[18] + PredPel[19] + 1) >> 1);
+  pred[0+2][ioff+0] =
+    pred[0+1][ioff+2] =
+    pred[0+0][ioff+4] = (imgpel) ((PredPel[19] + PredPel[20] + 1) >> 1);
+  pred[0+3][ioff+0] =
+    pred[0+2][ioff+2] =
+    pred[0+1][ioff+4] =
+    pred[0+0][ioff+6] = (imgpel) ((PredPel[20] + PredPel[21] + 1) >> 1);
+  pred[0+4][ioff+0] =
+    pred[0+3][ioff+2] =
+    pred[0+2][ioff+4] =
+    pred[0+1][ioff+6] = (imgpel) ((PredPel[21] + PredPel[22] + 1) >> 1);
+  pred[0+5][ioff+0] =
+    pred[0+4][ioff+2] =
+    pred[0+3][ioff+4] =
+    pred[0+2][ioff+6] = (imgpel) ((PredPel[22] + PredPel[23] + 1) >> 1);
+  pred[0+6][ioff+0] =
+    pred[0+5][ioff+2] =
+    pred[0+4][ioff+4] =
+    pred[0+3][ioff+6] = (imgpel) ((PredPel[23] + PredPel[24] + 1) >> 1);
+  pred[0+4][ioff+6] =  // x + 2*y above 13: plain copy of the last left sample PredPel[24]
+    pred[0+4][ioff+7] =
+    pred[0+5][ioff+4] =
+    pred[0+5][ioff+5] =
+    pred[0+5][ioff+6] =
+    pred[0+5][ioff+7] =
+    pred[0+6][ioff+2] =
+    pred[0+6][ioff+3] =
+    pred[0+6][ioff+4] =
+    pred[0+6][ioff+5] =
+    pred[0+6][ioff+6] =
+    pred[0+6][ioff+7] =
+    pred[0+7][ioff+0] =
+    pred[0+7][ioff+1] =
+    pred[0+7][ioff+2] =
+    pred[0+7][ioff+3] =
+    pred[0+7][ioff+4] =
+    pred[0+7][ioff+5] =
+    pred[0+7][ioff+6] =
+    pred[0+7][ioff+7] = (imgpel) PredPel[24];
+  pred[0+6][ioff+1] =  // x + 2*y equal to 13: PredPel[24] weighted 3x
+    pred[0+5][ioff+3] =
+    pred[0+4][ioff+5] =
+    pred[0+3][ioff+7] = (imgpel) ((PredPel[23] + 3*PredPel[24] + 2) >> 2);
+  pred[0+5][ioff+1] =  // odd x + 2*y below 13: 3-tap value over three adjacent left samples
+    pred[0+4][ioff+3] =
+    pred[0+3][ioff+5] =
+    pred[0+2][ioff+7] = (imgpel) ((PredPel[24] + PredPel[22] + 2*PredPel[23] + 2) >> 2);
+  pred[0+4][ioff+1] =
+    pred[0+3][ioff+3] =
+    pred[0+2][ioff+5] =
+    pred[0+1][ioff+7] = (imgpel) ((PredPel[23] + PredPel[21] + 2*PredPel[22] + 2) >> 2);
+  pred[0+3][ioff+1] =
+    pred[0+2][ioff+3] =
+    pred[0+1][ioff+5] =
+    pred[0+0][ioff+7] = (imgpel) ((PredPel[22] + PredPel[20] + 2*PredPel[21] + 2) >> 2);
+  pred[0+2][ioff+1] =
+    pred[0+1][ioff+3] =
+    pred[0+0][ioff+5] = (imgpel) ((PredPel[21] + PredPel[19] + 2*PredPel[20] + 2) >> 2);
+  pred[0+1][ioff+1] =
+    pred[0+0][ioff+3] = (imgpel) ((PredPel[20] + PredPel[18] + 2*PredPel[19] + 2) >> 2);
+  pred[0+0][ioff+1] = (imgpel) ((PredPel[19] + PredPel[17] + 2*PredPel[18] + 2) >> 2);
+
+  return DECODING_OK;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Builds the 8x8 horizontal-down intra prediction for the block at
+ *    (ioff, joff) and stores it in currSlice->mb_pred[pl].
+ * \return
+ *    DECODING_OK   always; unavailable neighbours only print a warning      \n
+ *
+ ***********************************************************************
+ */
+static inline int intra8x8_hor_down_pred(Macroblock *currMB,    //!< current macroblock
+                                         ColorPlane pl,         //!< current image plane
+                                         int ioff,              //!< pixel offset X within MB
+                                         int joff)              //!< pixel offset Y within MB
+{
+  Slice *currSlice = currMB->p_Slice;
+  VideoParameters *p_Vid = currMB->p_Vid;
+
+  int i;
+  imgpel PredPel[25];  // predictor pels: [0] up-left, [1..8] up, [9..16] up-right, [17..24] left
+  imgpel **imgY = (pl) ? p_Vid->dec_picture->imgUV[pl - 1]->img : p_Vid->dec_picture->imgY->img; // plane 0 reads imgY, other planes read imgUV[pl - 1]
+
+  PixelPos pix_a[8];              // left neighbours, one per row of the block
+  PixelPos pix_b, pix_c, pix_d;   // up, up-right and up-left neighbours
+	h264_imgpel_macroblock_row_t *pred;
+  int block_available_up;
+  int block_available_left;
+  int block_available_up_left;
+  int block_available_up_right;
+  
+  imgpel *pred_pels;
+
+  // Locate the neighbouring pels: eight to the left, then up, up-right and up-left.
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 0,  &pix_a[0]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 1,  &pix_a[1]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 2,  &pix_a[2]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 3,  &pix_a[3]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 4,  &pix_a[4]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 5,  &pix_a[5]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 6,  &pix_a[6]);
+	p_Vid->getNeighbourXPLumaNB(currMB, ioff - 1, joff + 7,  &pix_a[7]);
+
+	p_Vid->getNeighbourPXLumaNB(currMB, ioff    , joff - 1,  &pix_b);
+	p_Vid->getNeighbourPXLuma(currMB, ioff + 8, joff - 1,  &pix_c);
+	p_Vid->getNeighbourLuma(currMB, ioff - 1, joff - 1,  &pix_d);
+  pix_c.available = pix_c.available &&!(ioff == 8 && joff == 8); // up-right is never used for the block at (8,8)
+
+  if (p_Vid->active_pps->constrained_intra_pred_flag)
+  {
+    // constrained intra prediction: a neighbour only counts when intra_block[] is set for its macroblock
+    for (i=0, block_available_left=1; i<8;i++)
+      block_available_left  &= pix_a[i].available ? p_Vid->intra_block[pix_a[i].mb_addr]: 0;
+    block_available_up       = pix_b.available ? p_Vid->intra_block [pix_b.mb_addr] : 0;
+    block_available_up_right = pix_c.available ? p_Vid->intra_block [pix_c.mb_addr] : 0;
+    block_available_up_left  = pix_d.available ? p_Vid->intra_block [pix_d.mb_addr] : 0;
+  }
+  else
+  {
+    block_available_left     = pix_a[0].available; // only the first left row is checked here
+    block_available_up       = pix_b.available;
+    block_available_up_right = pix_c.available;
+    block_available_up_left  = pix_d.available;
+  }
+
+  // the mode needs up, left and up-left; prediction still continues with fallback values after the warning
+  if ((!block_available_up)||(!block_available_left)||(!block_available_up_left))
+    printf ("warning: Intra_8x8_Horizontal_Down prediction mode not allowed at mb %d\n", (int) p_Vid->current_mb_nr);
+
+  // form predictor pels; unavailable neighbours fall back to dc_pred_value_comp[pl]
+  if (block_available_up)
+  {
+    pred_pels = &imgY[pix_b.pos_y][pix_b.pos_x];
+    PredPel[1] = pred_pels[0];
+    PredPel[2] = pred_pels[1];
+    PredPel[3] = pred_pels[2];
+    PredPel[4] = pred_pels[3];
+    PredPel[5] = pred_pels[4];
+    PredPel[6] = pred_pels[5];
+    PredPel[7] = pred_pels[6];
+    PredPel[8] = pred_pels[7];
+  }
+  else
+  {
+    PredPel[1] = PredPel[2] = PredPel[3] = PredPel[4] = PredPel[5] = PredPel[6] = PredPel[7] = PredPel[8] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_right)
+  {
+    pred_pels = &imgY[pix_c.pos_y][pix_c.pos_x];
+    PredPel[9] = pred_pels[0];
+    PredPel[10] = pred_pels[1];
+    PredPel[11] = pred_pels[2];
+    PredPel[12] = pred_pels[3];
+    PredPel[13] = pred_pels[4];
+    PredPel[14] = pred_pels[5];
+    PredPel[15] = pred_pels[6];
+    PredPel[16] = pred_pels[7];
+
+  }
+  else
+  {
+    // missing up-right pels repeat the last up pel
+    PredPel[9] = PredPel[10] = PredPel[11] = PredPel[12] = PredPel[13] = PredPel[14] = PredPel[15] = PredPel[16] = PredPel[8];
+  }
+
+  if (block_available_left)
+  {
+    PredPel[17] = imgY[pix_a[0].pos_y][pix_a[0].pos_x];
+    PredPel[18] = imgY[pix_a[1].pos_y][pix_a[1].pos_x];
+    PredPel[19] = imgY[pix_a[2].pos_y][pix_a[2].pos_x];
+    PredPel[20] = imgY[pix_a[3].pos_y][pix_a[3].pos_x];
+    PredPel[21] = imgY[pix_a[4].pos_y][pix_a[4].pos_x];
+    PredPel[22] = imgY[pix_a[5].pos_y][pix_a[5].pos_x];
+    PredPel[23] = imgY[pix_a[6].pos_y][pix_a[6].pos_x];
+    PredPel[24] = imgY[pix_a[7].pos_y][pix_a[7].pos_x];
+  }
+  else
+  {
+    PredPel[17] = PredPel[18] = PredPel[19] = PredPel[20] = PredPel[21] = PredPel[22] = PredPel[23] = PredPel[24] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  if (block_available_up_left)
+  {
+    PredPel[0] = imgY[pix_d.pos_y][pix_d.pos_x];
+  }
+  else
+  {
+    PredPel[0] = (imgpel) p_Vid->dc_pred_value_comp[pl];
+  }
+
+  LowPassForIntra8x8Pred(&(PredPel[0]), block_available_up_left, block_available_up, block_available_left); // presumably filters PredPel in place; helper is defined elsewhere
+pred = &currSlice->mb_pred[pl][joff]; // pred[r] is row joff + r of the prediction buffer
+  pred[0][ioff] =
+    pred[0+1][ioff+2] =
+    pred[0+2][ioff+4] =
+    pred[0+3][ioff+6] = (imgpel) ((PredPel[17] + PredPel[0] + 1) >> 1);
+  pred[0+1][ioff] =
+    pred[0+2][ioff+2] =
+    pred[0+3][ioff+4] =
+    pred[0+4][ioff+6] = (imgpel) ((PredPel[18] + PredPel[17] + 1) >> 1);
+  pred[0+2][ioff] =
+    pred[0+3][ioff+2] =
+    pred[0+4][ioff+4] =
+    pred[0+5][ioff+6] = (imgpel) ((PredPel[19] + PredPel[18] + 1) >> 1);
+  pred[0+3][ioff] =
+    pred[0+4][ioff+2] =
+    pred[0+5][ioff+4] =
+    pred[0+6][ioff+6] = (imgpel) ((PredPel[20] + PredPel[19] + 1) >> 1);
+  pred[0+4][ioff] =
+    pred[0+5][ioff+2] =
+    pred[0+6][ioff+4] =
+    pred[0+7][ioff+6] = (imgpel) ((PredPel[21] + PredPel[20] + 1) >> 1);
+  pred[0+5][ioff] =
+    pred[0+6][ioff+2] =
+    pred[0+7][ioff+4] = (imgpel) ((PredPel[22] + PredPel[21] + 1) >> 1);
+  pred[0+6][ioff] =
+    pred[0+7][ioff+2] = (imgpel) ((PredPel[23] + PredPel[22] + 1) >> 1);
+  pred[0+7][ioff] = (imgpel) ((PredPel[24] + PredPel[23] + 1) >> 1);
+  pred[0][ioff+1] =
+    pred[0+1][ioff+3] =
+    pred[0+2][ioff+5] =
+    pred[0+3][ioff+7] = (imgpel) ((PredPel[17] + PredPel[1] + 2*PredPel[0] + 2) >> 2);
+  pred[0+1][ioff+1] =
+    pred[0+2][ioff+3] =
+    pred[0+3][ioff+5] =
+    pred[0+4][ioff+7] = (imgpel) ((PredPel[0] + PredPel[18] + 2*PredPel[17] + 2) >> 2);
+  pred[0+2][ioff+1] =
+    pred[0+3][ioff+3] =
+    pred[0+4][ioff+5] =
+    pred[0+5][ioff+7] = (imgpel) ((PredPel[17] + PredPel[19] + 2*PredPel[18] + 2) >> 2);
+  pred[0+3][ioff+1] =
+    pred[0+4][ioff+3] =
+    pred[0+5][ioff+5] =
+    pred[0+6][ioff+7] = (imgpel) ((PredPel[18] + PredPel[20] + 2*PredPel[19] + 2) >> 2);
+  pred[0+4][ioff+1] =
+    pred[0+5][ioff+3] =
+    pred[0+6][ioff+5] =
+    pred[0+7][ioff+7] = (imgpel) ((PredPel[19] + PredPel[21] + 2*PredPel[20] + 2) >> 2);
+  pred[0+5][ioff+1] =
+    pred[0+6][ioff+3] =
+    pred[0+7][ioff+5] = (imgpel) ((PredPel[20] + PredPel[22] + 2*PredPel[21] + 2) >> 2);
+  pred[0+6][ioff+1] =
+    pred[0+7][ioff+3] = (imgpel) ((PredPel[21] + PredPel[23] + 2*PredPel[22] + 2) >> 2);
+  pred[0+7][ioff+1] = (imgpel) ((PredPel[22] + PredPel[24] + 2*PredPel[23] + 2) >> 2);
+  pred[0][ioff+2] =
+    pred[0+1][ioff+4] =
+    pred[0+2][ioff+6] = (imgpel) ((PredPel[0] + PredPel[2] + 2*PredPel[1] + 2) >> 2);
+  pred[0][ioff+3] =
+    pred[0+1][ioff+5] =
+    pred[0+2][ioff+7] = (imgpel) ((PredPel[1] + PredPel[3] + 2*PredPel[2] + 2) >> 2);
+  pred[0][ioff+4] =
+    pred[0+1][ioff+6] = (imgpel) ((PredPel[2] + PredPel[4] + 2*PredPel[3] + 2) >> 2);
+  pred[0][ioff+5] =
+    pred[0+1][ioff+7] = (imgpel) ((PredPel[3] + PredPel[5] + 2*PredPel[4] + 2) >> 2);
+  pred[0][ioff+6] = (imgpel) ((PredPel[4] + PredPel[6] + 2*PredPel[5] + 2) >> 2);
+  pred[0][ioff+7] = (imgpel) ((PredPel[5] + PredPel[7] + 2*PredPel[6] + 2) >> 2);
+
+  return DECODING_OK;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Dispatch intra 8x8 prediction for one 8x8 block. The prediction
+ *    mode is looked up in p_Vid->ipredmode at the 4x4-block position of
+ *    the block and is also stored in currMB->ipmode_DPCM (used for
+ *    residual DPCM).
+ *
+ * \return
+ *    The value returned by the selected mode's predictor, or SEARCH_SYNC
+ *    (after printing an error) when the stored mode is not one of the
+ *    nine intra 8x8 modes.
+ ************************************************************************
+ */
+int intrapred8x8(Macroblock *currMB,    //!< Current Macroblock
+                 ColorPlane pl,         //!< Current color plane
+                 int ioff,              //!< pixel offset X within MB
+                 int joff)              //!< pixel offset Y within MB
+{
+  VideoParameters *p_Vid = currMB->p_Vid;
+  const int blk_col = currMB->block_x + (ioff >> 2);
+  const int blk_row = currMB->block_y + (joff >> 2);
+  const byte mode = p_Vid->ipredmode[blk_row][blk_col];
+  int status;
+
+  currMB->ipmode_DPCM = mode;  // remembered for residual DPCM
+
+  switch (mode)
+  {
+  case DC_PRED:
+    status = intra8x8_dc_pred(currMB, pl, ioff, joff);
+    break;
+  case VERT_PRED:
+    status = intra8x8_vert_pred(currMB, pl, ioff, joff);
+    break;
+  case HOR_PRED:
+    status = intra8x8_hor_pred(currMB, pl, ioff, joff);
+    break;
+  case DIAG_DOWN_RIGHT_PRED:
+    status = intra8x8_diag_down_right_pred(currMB, pl, ioff, joff);
+    break;
+  case DIAG_DOWN_LEFT_PRED:
+    status = intra8x8_diag_down_left_pred(currMB, pl, ioff, joff);
+    break;
+  case VERT_RIGHT_PRED:
+    status = intra8x8_vert_right_pred(currMB, pl, ioff, joff);
+    break;
+  case VERT_LEFT_PRED:
+    status = intra8x8_vert_left_pred(currMB, pl, ioff, joff);
+    break;
+  case HOR_UP_PRED:
+    status = intra8x8_hor_up_pred(currMB, pl, ioff, joff);
+    break;
+  case HOR_DOWN_PRED:
+    status = intra8x8_hor_down_pred(currMB, pl, ioff, joff);
+    break;
+  default:
+    printf("Error: illegal intra_8x8 prediction mode: %d\n", (int) mode);
+    status = SEARCH_SYNC;
+    break;
+  }
+
+  return status;
+}
+
+
diff --git a/Src/h264dec/ldecod/src/intra_chroma_pred.c b/Src/h264dec/ldecod/src/intra_chroma_pred.c
new file mode 100644
index 00000000..5c45ec6b
--- /dev/null
+++ b/Src/h264dec/ldecod/src/intra_chroma_pred.c
@@ -0,0 +1,357 @@
+/*!
+*************************************************************************************
+* \file intra_chroma_pred.c
+*
+* \brief
+*    Functions for intra chroma prediction
+*
+* \author
+*      Main contributors (see contributors.h for copyright, 
+*                         address and affiliation details)
+*      - Alexis Michael Tourapis  <alexismt@ieee.org>
+*
+*************************************************************************************
+*/
+#include "global.h"
+#include "block.h"
+#include "mb_access.h"
+#include "image.h"
+
+/* DC value for a chroma 4x4 block that is predicted from one side only.
+ * The four pels above (starting at column blk_x) are used when the up row is
+ * available and either `direction` is non-zero or the left column is not
+ * available; otherwise the four left pels left[blk_y .. blk_y + 3] are used
+ * when available. *pred is left untouched when neither source can be used. */
+static void intra_chroma_DC_single(imgpel **curr_img, int up_avail, int left_avail, PixelPos up, PixelPos left[17], int blk_x, int blk_y, int *pred, int direction )
+{
+	if (up_avail && (direction || !left_avail))
+	{
+		const imgpel *above = &curr_img[up.pos_y][up.pos_x + blk_x];
+		*pred = (above[0] + above[1] + above[2] + above[3] + 2) >> 2;
+		return;
+	}
+
+	if (left_avail)
+	{
+		int k;
+		int sum = 0;
+
+		for (k = 0; k < 4; ++k)
+			sum += curr_img[left[blk_y + k].pos_y][left[blk_y + k].pos_x];
+
+		*pred = (sum + 2) >> 2;
+	}
+}
+
+
+/* DC value for a chroma 4x4 block that may be predicted from both sides:
+ * the rounded mean of the four up and four left pels when both are available,
+ * the rounded mean of the four pels of the one available side otherwise.
+ * *pred is left untouched when neither side is available. */
+static void intra_chroma_DC_all(imgpel **curr_img, int up_avail, int left_avail, PixelPos up, PixelPos left[17], int blk_x, int blk_y, int *pred )
+{
+	int sum_up = 0;
+	int sum_left = 0;
+	int k;
+
+	if (up_avail)
+	{
+		const imgpel *above = &curr_img[up.pos_y][up.pos_x + blk_x];
+		sum_up = above[0] + above[1] + above[2] + above[3];
+	}
+
+	if (left_avail)
+	{
+		for (k = 0; k < 4; ++k)
+			sum_left += curr_img[left[blk_y + k].pos_y][left[blk_y + k].pos_x];
+	}
+
+	if (up_avail)
+	{
+		if (left_avail)
+			*pred = (sum_up + sum_left + 4) >> 3;
+		else
+			*pred = (sum_up + 2) >> 2;
+	}
+	else if (left_avail)
+	{
+		*pred = (sum_left + 2) >> 2;
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Chroma Intra prediction. Note that many operations can be moved
+*    outside since they are repeated for both components for no reason.
+************************************************************************
+*/
+
+/* Fill the 4x4 block whose top-left pel is mb_pred[0][offset_x] with the
+ * value pred. */
+static void memset_4x4(h264_imgpel_macroblock_row_t *mb_pred, int offset_x, int pred)
+{
+#ifdef _M_IX86
+	// benski> the compiler generated poor code for the generic loop, so the
+	// x86 build writes each row with a single 32-bit store instead.
+	// The replication multiply is done in unsigned arithmetic: as a signed
+	// int it overflows as soon as pred >= 128.
+	// NOTE(review): this path relies on imgpel being one byte wide and on the
+	// rows being accessible through an unsigned int pointer — confirm against
+	// the imgpel typedef.
+	const unsigned int dword_pred = (unsigned int)pred * 0x01010101u;
+	mb_pred = (h264_imgpel_macroblock_row_t *)&mb_pred[0][offset_x];
+	*(unsigned int *)mb_pred[0] = dword_pred;
+	*(unsigned int *)mb_pred[1] = dword_pred;
+	*(unsigned int *)mb_pred[2] = dword_pred;
+	*(unsigned int *)mb_pred[3] = dword_pred;
+#else
+	int ii, jj;
+	for (jj = 0; jj < BLOCK_SIZE; jj++)
+	{
+		for (ii = 0; ii < BLOCK_SIZE; ii++)
+		{
+			mb_pred[jj][offset_x+ii]=(imgpel) pred;
+		}
+	}
+#endif
+}
+
+/* DC chroma prediction. Each 4x4 sub-block gets its own DC value: block_pos
+ * classifies the sub-block, which selects the helper and the left-column half
+ * (left_avail[0] or left_avail[1]) that feed it. A sub-block for which the
+ * helper finds no usable neighbour keeps p_Vid->dc_pred_value_comp[1].
+ * Note that unlike what is stated in many presentations and papers, this mode
+ * does not operate the same way as I_16x16 DC prediction. */
+static void chroma_dc_pred8(VideoParameters *p_Vid, int yuv, imgpel **imgUV, int up_avail, int left_avail[2], PixelPos up, PixelPos left[17], h264_imgpel_macroblock_row_t *mb_pred)
+{
+	static const byte block_pos[3][4][4]= //[yuv][b8][b4]
+	{
+		{ {0, 1, 2, 3},{0, 0, 0, 0},{0, 0, 0, 0},{0, 0, 0, 0}},
+		{ {0, 1, 2, 3},{2, 3, 2, 3},{0, 0, 0, 0},{0, 0, 0, 0}},
+		{ {0, 1, 2, 3},{1, 1, 3, 3},{2, 3, 2, 3},{3, 3, 3, 3}}
+	};
+
+	int b8, b4;
+
+	for (b8 = 0; b8 < (p_Vid->num_uv_blocks); b8++)
+	{
+		for (b4 = 0; b4 < 4; b4++)
+		{
+			const int blk_y = subblk_offset_y[yuv][b8][b4];
+			const int blk_x = subblk_offset_x[yuv][b8][b4];
+			const int pos = block_pos[yuv][b8][b4];   // 0 top-left, 1 top-right, 2 bottom-left, 3 bottom-right
+			const int half = pos >> 1;                // positions 0/1 use left_avail[0], 2/3 use left_avail[1]
+			int dc = p_Vid->dc_pred_value_comp[1];
+
+			// The helpers index left[] with blk_y + 1: the callers pass a
+			// left[] array whose rows are shifted by one entry.
+			if (pos == 0 || pos == 3)
+			{
+				intra_chroma_DC_all(imgUV, up_avail, left_avail[half], up, left, blk_x, blk_y + 1, &dc);
+			}
+			else
+			{
+				// top-right prefers the up row, bottom-left prefers the left column
+				intra_chroma_DC_single(imgUV, up_avail, left_avail[half], up, left, blk_x, blk_y + 1, &dc, pos == 1);
+			}
+
+			memset_4x4(mb_pred+blk_y, blk_x, dc);
+		}
+	}
+}
+
+/* Horizontal chroma prediction: row j of the prediction block is filled with
+ * the pel at left[j + 1]. The 8- and 16-pel wide cases are kept as separate
+ * loops with a fixed width; any width other than 8 is treated as 16. */
+static void chroma_pred_horiz8(int cr_MB_x, int cr_MB_y, PixelPos left[17], imgpel **imgUV, h264_imgpel_macroblock_row_t *mb_pred)
+{
+	int row, col;
+
+	if (cr_MB_x != 8)
+	{
+		assert(cr_MB_x == 16);
+		for (row = 0; row < cr_MB_y; ++row)
+		{
+			const PixelPos *src = &left[row + 1];
+			const imgpel value = imgUV[src->pos_y][src->pos_x];
+
+			for (col = 0; col < 16; ++col)
+				mb_pred[row][col] = value;
+		}
+		return;
+	}
+
+	for (row = 0; row < cr_MB_y; ++row)
+	{
+		const PixelPos *src = &left[row + 1];
+		const imgpel value = imgUV[src->pos_y][src->pos_x];
+
+		for (col = 0; col < 8; ++col)
+			mb_pred[row][col] = value;
+	}
+}
+
+/* Vertical chroma prediction: the row of pels starting at `up` is copied into
+ * each of the cr_MB_y prediction rows. 8 pels are copied per row when
+ * cr_MB_x is 8, 16 pels otherwise. */
+static void chroma_pred_vert8(int cr_MB_x, int cr_MB_y, PixelPos up, imgpel **imgUV,  h264_imgpel_macroblock_row_t *mb_pred)
+{
+	const imgpel *above = &(imgUV[up.pos_y][up.pos_x]);
+	int row;
+
+	if (cr_MB_x != 8)
+	{
+		assert(cr_MB_x == 16);
+		for (row = 0; row < cr_MB_y; ++row)
+			memcpy(mb_pred[row], above, 16 * sizeof(imgpel));
+		return;
+	}
+
+	for (row = 0; row < cr_MB_y; ++row)
+		memcpy(mb_pred[row], above, 8 * sizeof(imgpel));
+}
+
+/* Plane chroma prediction. A horizontal gradient is accumulated from the row
+ * starting at `up` and a vertical gradient from the left[] column (left[0] is
+ * used as the corner pel); both are scaled into per-pel slopes, and every
+ * predicted pel is the clipped value of a plane through the last up pel and
+ * the last left pel. cr_MB_x2 / cr_MB_y2 are half the block dimensions. */
+static void chroma_pred_plane8(int cr_MB_x, int cr_MB_y, int cr_MB_x2, int cr_MB_y2, PixelPos up, PixelPos left[17], int max_imgpel_value, imgpel **imgUV,  h264_imgpel_macroblock_row_t *mb_pred)
+{
+	const imgpel *above = &imgUV[up.pos_y][up.pos_x];
+	const int corner = imgUV[left[0].pos_y][left[0].pos_x];
+	const int last_left = imgUV[left[cr_MB_y].pos_y][left[cr_MB_y].pos_x];
+	const int last_up = above[cr_MB_x - 1];
+	int grad_h, grad_v;
+	int slope_x, slope_y;
+	int base;
+	int k, x, y;
+
+	// horizontal gradient: outermost pair first, then the inner pairs weighted by distance
+	grad_h = cr_MB_x2 * (last_up - corner);
+	for (k = 1; k < cr_MB_x2; ++k)
+		grad_h += k * (above[cr_MB_x2 + k - 1] - above[cr_MB_x2 - 1 - k]);
+
+	// vertical gradient, same scheme along the left column
+	grad_v = cr_MB_y2 * (last_left - corner);
+	for (k = 1; k < cr_MB_y2; ++k)
+		grad_v += k * (imgUV[left[cr_MB_y2 + k].pos_y][left[cr_MB_y2 + k].pos_x] -
+		imgUV[left[cr_MB_y2 - k].pos_y][left[cr_MB_y2 - k].pos_x]);
+
+	// scale factor and shift depend on whether the dimension is 8 or not
+	if (cr_MB_x == 8)
+		slope_x = (17 * grad_h + 2 * cr_MB_x) >> 5;
+	else
+		slope_x = (5 * grad_h + 2 * cr_MB_x) >> 6;
+
+	if (cr_MB_y == 8)
+		slope_y = (17 * grad_v + 2 * cr_MB_y) >> 5;
+	else
+		slope_y = (5 * grad_v + 2 * cr_MB_y) >> 6;
+
+	base = 16 * (last_left + last_up);
+
+	for (y = 0; y < cr_MB_y; ++y)
+	{
+		for (x = 0; x < cr_MB_x; ++x)
+		{
+			mb_pred[y][x] = (imgpel) iClip1(max_imgpel_value, ((base + (x - cr_MB_x2 + 1) * slope_x + (y - cr_MB_y2 + 1) * slope_y + 16) >> 5));
+		}
+	}
+}
+
+// TODO: benski> replace with PredictIntraChroma8x8_H264 ?
+/*!
+************************************************************************
+* \brief
+*    Chroma intra prediction for one chroma plane of the current
+*    macroblock, using the mode in currMB->c_ipred_mode. The prediction
+*    is written to currSlice->mb_pred[uv + 1].
+*
+*    When the neighbours required by the vertical, horizontal or plane
+*    mode are not available, error() is called and no prediction is
+*    generated for that mode.
+*
+* \param currMB
+*    current macroblock
+* \param uv
+*    chroma plane index, used to select dec_picture->imgUV[uv]
+************************************************************************
+*/
+void intrapred_chroma(Macroblock *currMB, int uv)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	imgpel **imgUV = dec_picture->imgUV[uv]->img;
+	h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[uv + 1];
+
+	int cr_MB_x = p_Vid->mb_cr_size_x;
+	int cr_MB_y = p_Vid->mb_cr_size_y;
+
+	PixelPos up;        //!< pixel position  p(0,-1)
+	int up_avail;
+
+	if (currMB->c_ipred_mode == VERT_PRED_8)
+	{
+		// Fast path: vertical prediction only needs the row above, so the
+		// left-column lookups are skipped.
+		p_Vid->getNeighbourUp(currMB, p_Vid->mb_size[IS_CHROMA], &up);
+
+		if (!p_Vid->active_pps->constrained_intra_pred_flag)
+			up_avail = up.available;
+		else
+			up_avail = up.available ? p_Vid->intra_block[up.mb_addr] : 0;
+
+		// Never predict from an unavailable row: `up` does not describe a
+		// usable position in that case.
+		if (!up_avail)
+			error("unexpected VERT_PRED_8 chroma intra prediction mode",-1);
+		else
+			chroma_pred_vert8(cr_MB_x, cr_MB_y, up, imgUV, mb_pred);
+	}
+	else
+	{
+		int i;
+		int max_imgpel_value = p_Vid->max_pel_value_comp[uv + 1];
+		int yuv = dec_picture->chroma_format_idc - 1;
+
+		PixelPos left[17];  //!< pixel positions p(-1, -1..16)
+		int left_avail[2], left_up_avail;
+
+		int cr_MB_y2 = (cr_MB_y >> 1);
+		int cr_MB_x2 = (cr_MB_x >> 1);
+		// rows 9..16 of left[] are only looked up for 16-row chroma blocks
+		int left_rows = (cr_MB_y == 16) ? 16 : 8;
+
+		// left[0] is the up-left corner, left[k] for k >= 1 is row k - 1
+		p_Vid->getNeighbourNX(currMB, -1, p_Vid->mb_size[IS_CHROMA], &left[0]);
+		p_Vid->getNeighbourLeft(currMB, p_Vid->mb_size[IS_CHROMA], &left[1]);
+		for (i = 2; i <= left_rows; ++i)
+			p_Vid->getNeighbourNPChromaNB(currMB, i - 1, p_Vid->mb_size[IS_CHROMA], &left[i]);
+
+		p_Vid->getNeighbourUp(currMB, p_Vid->mb_size[IS_CHROMA], &up);
+
+		if (!p_Vid->active_pps->constrained_intra_pred_flag)
+		{
+			up_avail      = up.available;
+			left_avail[0] = left_avail[1] = left[1].available;
+			left_up_avail = left[0].available;
+		}
+		else
+		{
+			// constrained intra prediction: a neighbour only counts when
+			// intra_block[] is set for its macroblock; the upper and lower
+			// halves of the left column are tracked separately
+			up_avail = up.available ? p_Vid->intra_block[up.mb_addr] : 0;
+			for (i=0, left_avail[0] = 1; i < cr_MB_y2;++i)
+				left_avail[0]  &= left[i + 1].available ? p_Vid->intra_block[left[i + 1].mb_addr]: 0;
+
+			for (i = cr_MB_y2, left_avail[1] = 1; i<cr_MB_y;++i)
+				left_avail[1]  &= left[i + 1].available ? p_Vid->intra_block[left[i + 1].mb_addr]: 0;
+
+			left_up_avail = left[0].available ? p_Vid->intra_block[left[0].mb_addr]: 0;
+		}
+
+		switch (currMB->c_ipred_mode)
+		{
+		case DC_PRED_8:
+			chroma_dc_pred8(p_Vid, yuv, imgUV, up_avail, left_avail, up, left, mb_pred);
+			break;
+		case HOR_PRED_8:
+			// Horizontal prediction needs the whole left column
+			if (!left_avail[0] || !left_avail[1])
+				error("unexpected HOR_PRED_8 chroma intra prediction mode",-1);
+			else
+				chroma_pred_horiz8(cr_MB_x, cr_MB_y, left, imgUV, mb_pred);
+			break;
+		case PLANE_8:
+			// plane prediction needs up, left and the up-left corner
+			if (!left_up_avail || !left_avail[0] || !left_avail[1] || !up_avail)
+				error("unexpected PLANE_8 chroma intra prediction mode",-1);
+			else
+				chroma_pred_plane8(cr_MB_x, cr_MB_y, cr_MB_x2, cr_MB_y2, up, left, max_imgpel_value, imgUV, mb_pred);
+			break;
+		default:
+			error("illegal chroma intra prediction mode", 600);
+			break;
+		}
+	}
+}
+
+
+
+
diff --git a/Src/h264dec/ldecod/src/ldecod.c b/Src/h264dec/ldecod/src/ldecod.c
new file mode 100644
index 00000000..39919e1e
--- /dev/null
+++ b/Src/h264dec/ldecod/src/ldecod.c
@@ -0,0 +1,639 @@
+
+/*!
+ ***********************************************************************
+ *  \mainpage
+ *     This is the H.264/AVC decoder reference software. For detailed documentation
+ *     see the comments in each file.
+ *
+ *     The JM software web site is located at:
+ *     http://iphome.hhi.de/suehring/tml
+ *
+ *     For bug reporting and known issues see:
+ *     https://ipbt.hhi.de
+ *
+ *  \author
+ *     The main contributors are listed in contributors.h
+ *
+ *  \version
+ *     JM 16.1 (FRExt)
+ *
+ *  \note
+ *     tags are used for document system "doxygen"
+ *     available at http://www.doxygen.org
+ */
+/*!
+ *  \file
+ *     ldecod.c
+ *  \brief
+ *     H.264/AVC reference decoder project main()
+ *  \author
+ *     Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Inge Lille-Langøy       <inge.lille-langoy@telenor.com>
+ *     - Rickard Sjoberg         <rickard.sjoberg@era.ericsson.se>
+ *     - Stephan Wenger          <stewe@cs.tu-berlin.de>
+ *     - Jani Lainema            <jani.lainema@nokia.com>
+ *     - Sebastian Purreiter     <sebastian.purreiter@mch.siemens.de>
+ *     - Byeong-Moon Jeon        <jeonbm@lge.com>
+ *     - Gabi Blaettermann
+ *     - Ye-Kui Wang             <wyk@ieee.org>
+ *     - Valeri George           <george@hhi.de>
+ *     - Karsten Suehring        <suehring@hhi.de>
+ *
+ ***********************************************************************
+ */
+
+#include "contributors.h"
+
+#include <sys/stat.h>
+
+#include "global.h"
+#include "image.h"
+#include "memalloc.h"
+#include "mc_prediction.h"
+#include "mbuffer.h"
+#include "leaky_bucket.h"
+#include "fmo.h"
+#include "output.h"
+#include "cabac.h"
+#include "parset.h"
+#include "sei.h"
+#include "erc_api.h"
+#include "quant.h"
+#include "block.h"
+#include "nalu.h"
+#include "meminput.h"
+#define LOGFILE     "log.dec"
+#define DATADECFILE "dataDec.txt"
+#define TRACEFILE   "trace_dec.txt"
+
+// Decoder definition. This should be the only global variable in the entire
+// software. Global variables should be avoided.
+char errortext[ET_SIZE]; //!< buffer for error message for exit with error()
+
+#ifdef TRACE
+FILE *p_trace=0;
+int bitcounter=0;
+#endif
+
+// Prototypes of functions defined in this file (not static: dec_api.c declares init, malloc_slice and free_slice as well)
+void init        (VideoParameters *p_Vid);
+void malloc_slice(InputParameters *p_Inp, VideoParameters *p_Vid);
+void free_slice  (Slice *currSlice);
+
+void init_frext(VideoParameters *p_Vid);
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Error handling procedure. Raises a Windows structured exception
+ *    whose exception code is the supplied code and whose single argument
+ *    is the address of the error message. Nothing is printed and the
+ *    process is not terminated here.
+ * \param text
+ *    Error message
+ * \param code
+ *    Exception code handed to RaiseException()
+ ************************************************************************
+ */
+void error(char *text, int code)
+{
+  // RaiseException() copies nNumberOfArguments ULONG_PTR values out of the
+  // array it is given, so the message pointer has to be stored in such an
+  // array. Passing the string itself as the array would deliver the first
+  // bytes of the text to the handler instead of its address.
+  ULONG_PTR arguments[1];
+
+  arguments[0] = (ULONG_PTR) text;
+  RaiseException(code, 0, 1, arguments);
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Attach the input parameters to the video parameters and set their
+ *    defaults (plus the leaky-bucket defaults when _LEAKYBUCKET_ is
+ *    defined)
+ ***********************************************************************
+ */
+/*static */void Configure(VideoParameters *p_Vid, InputParameters *p_Inp)
+{
+  p_Inp->intra_profile_deblocking = 0;
+  p_Vid->p_Inp = p_Inp;
+
+#ifdef _LEAKYBUCKET_
+  // file where Leaky Bucket parameters (computed by encoder) are stored
+  strcpy(p_Inp->LeakyBucketParamFile,"leakybucketparam.cfg");
+  p_Inp->F_decoder=73000;           //! Decoder initial delay
+  p_Inp->B_decoder=104000;          //! Decoder buffer size
+  p_Inp->R_decoder=500000;          //! Decoder rate
+#endif
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Allocate a zero-initialized VideoParameters structure together with
+ *    its old_slice and p_Dpb members (and seiToneMapping when
+ *    ENABLE_OUTPUT_TONEMAPPING is set). no_mem_exit() is called when an
+ *    allocation fails.
+ * \par  Output:
+ *    *p_Vid receives the new structure
+ ***********************************************************************
+ */
+static void alloc_img( VideoParameters **p_Vid)
+{
+  VideoParameters *vid;
+
+  *p_Vid = (VideoParameters *) calloc(1, sizeof(VideoParameters));
+  if (*p_Vid == NULL)
+    no_mem_exit("alloc_img: p_Vid");
+  vid = *p_Vid;
+
+  vid->old_slice = (OldSliceParams *) calloc(1, sizeof(OldSliceParams));
+  if (vid->old_slice == NULL)
+    no_mem_exit("alloc_img: p_Vid->old_slice");
+
+  vid->p_Dpb = (DecodedPictureBuffer*)calloc(1, sizeof(DecodedPictureBuffer));
+  if (vid->p_Dpb == NULL)
+    no_mem_exit("alloc_img: p_Vid->p_Dpb");
+
+  vid->p_Dpb->init_done = 0;
+  vid->global_init_done = 0;
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  vid->seiToneMapping = (ToneMappingSEI*)calloc(1, sizeof(ToneMappingSEI));
+  if (vid->seiToneMapping == NULL)
+    no_mem_exit("alloc_img: (*p_Vid)->seiToneMapping");
+#endif
+}
+
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Allocate a zero-initialized InputParameters structure;
+ *    no_mem_exit() is called when the allocation fails
+ * \par  Output:
+ *    *p_Inp receives the new structure
+ ***********************************************************************
+ */
+static void alloc_params( InputParameters **p_Inp )
+{
+  *p_Inp = (InputParameters *) calloc(1, sizeof(InputParameters));
+  if (*p_Inp == NULL)
+    no_mem_exit("alloc_params: p_Inp");
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Allocate the decoder structure together with its video and input
+ *    parameter structures. When TRACE is defined the trace file pointer
+ *    and bit counter are reset as well.
+ * \return
+ *    The new decoder, or NULL when the decoder structure itself could
+ *    not be allocated
+ ***********************************************************************
+ */
+DecoderParams *alloc_decoder()
+{
+  DecoderParams *dec = (DecoderParams *) calloc(1, sizeof(DecoderParams));
+
+  if (dec == NULL)
+    return NULL;
+
+  alloc_img(&dec->p_Vid);
+  alloc_params(&dec->p_Inp);
+#ifdef TRACE
+  p_trace = 0;
+  bitcounter = 0;
+#endif
+
+  return dec;
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Release a VideoParameters structure: free_mem_input() is called
+ *    first, then the seiToneMapping (when ENABLE_OUTPUT_TONEMAPPING is
+ *    set), p_Dpb and old_slice members are freed, and finally the
+ *    structure itself. A NULL pointer is ignored.
+ * \par  Input:
+ *    Image Parameters VideoParameters *p_Vid
+ ***********************************************************************
+ */
+void free_img( VideoParameters *p_Vid)
+{
+  if (p_Vid == NULL)
+    return;
+
+  free_mem_input(p_Vid);
+
+  // free(NULL) is a no-op, so the members are released unconditionally
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  free (p_Vid->seiToneMapping);
+  p_Vid->seiToneMapping = NULL;
+#endif
+
+  free (p_Vid->p_Dpb);
+  p_Vid->p_Dpb = NULL;
+
+  free (p_Vid->old_slice);
+  p_Vid->old_slice = NULL;
+
+  free (p_Vid);
+}
+/*!
+ ***********************************************************************
+ * \brief
+ *    main function for TML decoder (stand-alone driver, compiled out by the #if 0 below)
+ ***********************************************************************
+ */
+#if 0
+int main(int argc, char **argv)
+{  
+  DecoderParams  *p_Dec = alloc_decoder();
+  if (!p_Dec)
+	  return 1;
+
+  Configure(p_Dec->p_Vid, p_Dec->p_Inp, argc, argv); // NOTE(review): Configure() as defined in this file takes (p_Vid, p_Inp) only
+
+  initBitsFile(p_Dec->p_Vid, p_Dec->p_Inp->FileFormat);
+
+  p_Dec->p_Vid->bitsfile->OpenBitsFile(p_Dec->p_Vid, p_Dec->p_Inp->infile);
+  
+  // Allocate Slice data struct
+  malloc_slice(p_Dec->p_Inp, p_Dec->p_Vid);
+  init_old_slice(p_Dec->p_Vid->old_slice);
+
+  init(p_Dec->p_Vid);
+ 
+  init_out_buffer(p_Dec->p_Vid);  
+
+  // decode until the end of the stream is reported
+  while (decode_one_frame(p_Dec->p_Vid) != EOS)
+    ;
+
+  // tear-down
+  free_slice(p_Dec->p_Vid->currentSlice);
+  FmoFinit(p_Dec->p_Vid);
+
+  free_global_buffers(p_Dec->p_Vid);
+  flush_dpb(p_Dec->p_Vid);
+
+#if (PAIR_FIELDS_IN_OUTPUT)
+  flush_pending_output(p_Dec->p_Vid, p_Dec->p_Vid->p_out);
+#endif
+
+  p_Dec->p_Vid->bitsfile->CloseBitsFile(p_Dec->p_Vid);
+
+  close(p_Dec->p_Vid->p_out);
+
+  if (p_Dec->p_Vid->p_ref != -1)
+    close(p_Dec->p_Vid->p_ref);
+
+#if TRACE
+  fclose(p_trace);
+#endif
+
+  ercClose(p_Dec->p_Vid, p_Dec->p_Vid->erc_errorVar);
+
+  CleanUpPPS(p_Dec->p_Vid);
+  free_dpb(p_Dec->p_Vid);
+  uninit_out_buffer(p_Dec->p_Vid);
+
+  free (p_Dec->p_Inp);
+  free_img (p_Dec->p_Vid);
+  free(p_Dec);
+
+  return 0;
+}
+#endif
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Initialize the decoder state held in p_Vid: frame-size and
+ *    recovery-point tracking, picture and slice-group map pointers,
+ *    reference flags, access-unit counters and the pending-output state
+ *    (plus the tone-mapping SEI when ENABLE_OUTPUT_TONEMAPPING is set)
+ ***********************************************************************
+ */
+void init(VideoParameters *p_Vid)  //!< image parameters
+{
+  int i;
+
+  p_Vid->oldFrameSizeInMbs = -1;
+
+  p_Vid->recovery_point = 0;
+  p_Vid->recovery_point_found = 0;
+  p_Vid->recovery_poc = 0x7fffffff; /* set to a max value */
+
+  p_Vid->number = 0;
+  p_Vid->type = I_SLICE;
+
+  p_Vid->dec_ref_pic_marking_buffer = NULL;
+
+  p_Vid->dec_picture = NULL;
+  // reference flag initialization: entries 0..16 all start out set
+  for(i=0;i<17;++i)
+  {
+    p_Vid->ref_flag[i] = 1;
+  }
+
+  p_Vid->MbToSliceGroupMap = NULL;
+  p_Vid->MapUnitToSliceGroupMap = NULL;
+
+  p_Vid->LastAccessUnitExists  = 0;
+  p_Vid->NALUCount = 0;
+
+
+  p_Vid->out_buffer = NULL;
+  p_Vid->pending_output = NULL;
+  p_Vid->pending_output_state = FRAME;
+  p_Vid->recovery_flag = 0;
+
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  init_tone_mapping_sei(p_Vid->seiToneMapping);
+#endif
+
+}
+
+/*!
+ ***********************************************************************
+ * \brief
+ *    Derive the bit-depth and chroma-format dependent variables of
+ *    p_Vid from bitdepth_luma, bitdepth_chroma and
+ *    active_sps->chroma_format_idc
+ ***********************************************************************
+ */
+void init_frext(VideoParameters *p_Vid)  //!< image parameters
+{
+  // luma: QP scale, DC default, maximum pel value and macroblock size
+  p_Vid->bitdepth_luma_qp_scale   = 6 * (p_Vid->bitdepth_luma - 8);
+  p_Vid->dc_pred_value_comp[0]    = 1<<(p_Vid->bitdepth_luma - 1);
+  p_Vid->max_pel_value_comp[0]    = (1<<p_Vid->bitdepth_luma) - 1;
+  p_Vid->mb_size[IS_LUMA][0] = p_Vid->mb_size[IS_LUMA][1] = MB_BLOCK_SIZE;
+
+  if (p_Vid->active_sps->chroma_format_idc == YUV400)
+  {
+    // no chroma planes: every chroma-dependent size and count is zero
+    p_Vid->bitdepth_chroma_qp_scale = 0;
+    p_Vid->max_pel_value_comp[1] = 0;
+    p_Vid->max_pel_value_comp[2] = 0;
+    p_Vid->num_blk8x8_uv = 0;
+    p_Vid->num_uv_blocks = 0;
+    p_Vid->num_cdc_coeff = 0;
+    p_Vid->mb_size[IS_CHROMA][0] = p_Vid->mb_size[2][0] = p_Vid->mb_cr_size_x  = 0;
+    p_Vid->mb_size[IS_CHROMA][1] = p_Vid->mb_size[2][1] = p_Vid->mb_cr_size_y  = 0;
+
+    p_Vid->subpel_x      = 0;
+    p_Vid->subpel_y      = 0;
+    p_Vid->shiftpel_x    = 0;
+    p_Vid->shiftpel_y    = 0;
+  }
+  else
+  {
+    // chroma macroblock is 8 wide for 4:2:0 and 4:2:2, 16 high for 4:2:2 and 4:4:4
+    const int cr_width  = (p_Vid->active_sps->chroma_format_idc==YUV420 || p_Vid->active_sps->chroma_format_idc==YUV422)?  8 : 16;
+    const int cr_height = (p_Vid->active_sps->chroma_format_idc==YUV444 || p_Vid->active_sps->chroma_format_idc==YUV422)? 16 :  8;
+
+    p_Vid->bitdepth_chroma_qp_scale = 6 * (p_Vid->bitdepth_chroma - 8);
+
+    p_Vid->dc_pred_value_comp[1]    = (1 << (p_Vid->bitdepth_chroma - 1));
+    p_Vid->dc_pred_value_comp[2]    = p_Vid->dc_pred_value_comp[1];
+
+    p_Vid->max_pel_value_comp[1] = (1 << p_Vid->bitdepth_chroma) - 1;
+    p_Vid->max_pel_value_comp[2] = (1 << p_Vid->bitdepth_chroma) - 1;
+
+    p_Vid->num_blk8x8_uv = (1 << p_Vid->active_sps->chroma_format_idc) & (~(0x1));
+    p_Vid->num_uv_blocks = (p_Vid->num_blk8x8_uv >> 1);
+    p_Vid->num_cdc_coeff = (p_Vid->num_blk8x8_uv << 1);
+
+    p_Vid->mb_size[IS_CHROMA][0] = p_Vid->mb_size[2][0] = p_Vid->mb_cr_size_x  = cr_width;
+    p_Vid->mb_size[IS_CHROMA][1] = p_Vid->mb_size[2][1] = p_Vid->mb_cr_size_y  = cr_height;
+
+    // sub-pel mask and shift per direction: 7 / 3 for an 8-pel dimension, 3 / 2 otherwise
+    if (cr_width == 8)
+    {
+      p_Vid->subpel_x   = 7;
+      p_Vid->shiftpel_x = 3;
+    }
+    else
+    {
+      p_Vid->subpel_x   = 3;
+      p_Vid->shiftpel_x = 2;
+    }
+
+    if (cr_height == 8)
+    {
+      p_Vid->subpel_y   = 7;
+      p_Vid->shiftpel_y = 3;
+    }
+    else
+    {
+      p_Vid->subpel_y   = 3;
+      p_Vid->shiftpel_y = 2;
+    }
+  }
+
+  // macroblock sizes in 4x4-block units
+  p_Vid->mb_size_blk[0][0] = p_Vid->mb_size_blk[0][1] = p_Vid->mb_size[0][0] >> 2;
+  p_Vid->mb_size_blk[1][0] = p_Vid->mb_size_blk[2][0] = p_Vid->mb_size[1][0] >> 2;
+  p_Vid->mb_size_blk[1][1] = p_Vid->mb_size_blk[2][1] = p_Vid->mb_size[1][1] >> 2;
+
+  // shift amounts derived from the macroblock sizes
+  p_Vid->mb_size_shift[0][0] = p_Vid->mb_size_shift[0][1] = CeilLog2_sf (p_Vid->mb_size[0][0]);
+  p_Vid->mb_size_shift[1][0] = p_Vid->mb_size_shift[2][0] = CeilLog2_sf (p_Vid->mb_size[1][0]);
+  p_Vid->mb_size_shift[1][1] = p_Vid->mb_size_shift[2][1] = CeilLog2_sf (p_Vid->mb_size[1][1]);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocates an array of n zero-initialized DataPartition entries and
+ *    gives each entry its own zero-initialized Bitstream.  The
+ *    bitstream's streamBuffer is left unset (0); no buffer is allocated
+ *    here.  Release the result with FreePartition().
+ *
+ * \par Input:
+ *    n: number of partitions in the array
+ * \par return
+ *    pointer to the first DataPartition of the array
+ *
+ * \note
+ *    Allocation failures are reported through error() with code 100.
+ ************************************************************************
+ */
+
+DataPartition *AllocPartition(int n)
+{
+  DataPartition *parts;
+  int idx = 0;
+
+  parts = (DataPartition *) calloc(n, sizeof(DataPartition));
+  if (parts == NULL)
+  {
+    snprintf(errortext, ET_SIZE, "AllocPartition: Memory allocation for Data Partition failed");
+    error(errortext, 100);
+  }
+
+  // attach one Bitstream to every partition
+  while (idx < n)
+  {
+    Bitstream *bits = (Bitstream *) calloc(1, sizeof(Bitstream));
+
+    parts[idx].bitstream = bits;
+    if (bits == NULL)
+    {
+      snprintf(errortext, ET_SIZE, "AllocPartition: Memory allocation for Bitstream failed");
+      error(errortext, 100);
+    }
+    parts[idx].bitstream->streamBuffer = 0;
+    ++idx;
+  }
+
+  return parts;
+}
+
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Releases a partition array created by AllocPartition(): the
+ *    Bitstream of each of the n entries, then the array itself.
+ *
+ * \par Input:
+ *    dp: first element of the partition array (must not be NULL and
+ *        must carry a Bitstream; both are asserted)
+ *    n:  number of entries in the array
+ *
+ * \par return
+ *    None
+ *
+ * \note
+ *    n must be the same as for the corresponding call of AllocPartition
+ ************************************************************************
+ */
+
+
+void FreePartition (DataPartition *dp, int n)
+{
+  DataPartition *cur = dp;
+  int left = n;
+
+  assert (dp != NULL);
+  assert (dp->bitstream != NULL);
+
+  // Only the Bitstream structs are released; streamBuffer is not freed here.
+  while (left > 0)
+  {
+    free (cur->bitstream);
+    ++cur;
+    --left;
+  }
+
+  free (dp);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocates the slice structure along with its dependent
+ *    data structures (CABAC context models and a 3-entry partition
+ *    array) and stores it in p_Vid->currentSlice.
+ *
+ * \par Input:
+ *    p_Inp: input parameters (not used by this function)
+ *    p_Vid: video parameters; receives the new slice in currentSlice
+ *
+ * \note
+ *    An allocation failure is reported through error() with code 100.
+ ************************************************************************
+ */
+void malloc_slice(InputParameters *p_Inp, VideoParameters *p_Vid)
+{
+  Slice *currSlice;
+
+  // 32-byte aligned allocation; must be released with _aligned_free()
+  currSlice = (Slice *) _aligned_malloc(sizeof(Slice), 32);
+  p_Vid->currentSlice = currSlice;
+  if (currSlice == NULL)
+  {
+    error("Memory allocation for Slice datastruct failed",100);
+  }
+  memset(currSlice, 0, sizeof(Slice));
+
+  //! It is not known here whether CABAC will be used, hence the context
+  //! models are always created.
+  currSlice->mot_ctx = create_contexts_MotionInfo();
+  currSlice->tex_ctx = create_contexts_TextureInfo();
+
+  currSlice->max_part_nr = 3;  //! assume data partitioning (worst case) for the following mallocs()
+  currSlice->partArr = AllocPartition(currSlice->max_part_nr);
+  currSlice->p_colocated = NULL;
+
+  currSlice->coeff_ctr = -1;
+  currSlice->pos       =  0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Memory frees of the Slice structure and of its dependent
+ *    data structures (partition array and context models).
+ *
+ * \par Input:
+ *    currSlice: slice created by malloc_slice(); NULL is accepted and
+ *               ignored
+ *
+ * \note
+ *    The caller's pointer is not modified; it must not be used after
+ *    this call.
+ ************************************************************************
+ */
+void free_slice(Slice *currSlice)
+{
+  if (currSlice == NULL)
+    return;
+
+  // malloc_slice() always allocates 3 partitions, so 3 are released here
+  FreePartition (currSlice->partArr, 3);
+
+  // delete all context models
+  delete_contexts_MotionInfo(currSlice->mot_ctx);
+  delete_contexts_TextureInfo(currSlice->tex_ctx);
+
+  // the slice came from _aligned_malloc(), hence _aligned_free()
+  _aligned_free(currSlice);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Dynamic memory allocation of frame size related global buffers
+ *    buffers are defined in global.h, allocated memory must be freed in
+ *    void free_global_buffers()
+ *
+ *    If the buffers were already allocated (global_init_done set) they
+ *    are released first, so the function can be called again after a
+ *    frame size change.
+ *
+ *  \par Input:
+ *    p_Vid: video parameters; FrameSizeInMbs, FrameHeightInMbs and
+ *           PicWidthInMbs must already be set
+ *
+ *  \par Output:
+ *     Sum of the byte counts reported by get_mem2D()/get_mem2Dint().
+ *     Buffers obtained directly with calloc()/_aligned_malloc() are not
+ *     included in that sum.
+ *
+ *  \note
+ *     Every allocation failure ends in no_mem_exit().
+ ***********************************************************************
+ */
+int init_global_buffers(VideoParameters *p_Vid)
+{
+  int memory_size=0;
+  int i;
+
+  if (p_Vid->global_init_done)
+  {
+    free_global_buffers(p_Vid);
+  }
+
+  // allocate memory in structure p_Vid
+  if( IS_INDEPENDENT(p_Vid) )
+  {
+    // independent plane mode: one macroblock array per plane
+    for( i=0; i<MAX_PLANE; ++i )
+    {
+      if(((p_Vid->mb_data_JV[i]) = (Macroblock *) calloc(p_Vid->FrameSizeInMbs, sizeof(Macroblock))) == NULL)
+        no_mem_exit("init_global_buffers: p_Vid->mb_data");
+    }
+    p_Vid->mb_data = NULL;
+  }
+  else
+  {
+    if(((p_Vid->mb_data) = (Macroblock *) calloc(p_Vid->FrameSizeInMbs, sizeof(Macroblock))) == NULL)
+      no_mem_exit("init_global_buffers: p_Vid->mb_data");
+  }
+
+  if(((p_Vid->intra_block) = (int*)calloc(p_Vid->FrameSizeInMbs, sizeof(int))) == NULL)
+    no_mem_exit("init_global_buffers: p_Vid->intra_block");
+
+  //! Helper array to access macroblock positions. We add 1 to also consider last MB.
+  p_Vid->PicPos = (h264_pic_position *)calloc(p_Vid->FrameSizeInMbs + 1, sizeof(h264_pic_position));
+  if (p_Vid->PicPos == NULL)
+    no_mem_exit("init_global_buffers: p_Vid->PicPos");
+
+  // entry i holds (column, row) of macroblock i in raster order
+  for (i = 0; i < (int) p_Vid->FrameSizeInMbs + 1;++i)
+  {
+    p_Vid->PicPos[i][0] = (i % p_Vid->PicWidthInMbs);
+    p_Vid->PicPos[i][1] = (i / p_Vid->PicWidthInMbs);
+  }
+
+  memory_size += get_mem2D(&(p_Vid->ipredmode), 4*p_Vid->FrameHeightInMbs, 4*p_Vid->PicWidthInMbs);
+
+  // CAVLC mem: 32-byte aligned, zero-filled, released with _aligned_free()
+  p_Vid->nz_coeff = (h264_nz_coefficient *)_aligned_malloc(p_Vid->FrameSizeInMbs*sizeof(h264_nz_coefficient), 32);
+  if (p_Vid->nz_coeff == NULL)
+    no_mem_exit("init_global_buffers: p_Vid->nz_coeff");
+  memset(p_Vid->nz_coeff, 0, p_Vid->FrameSizeInMbs*sizeof(h264_nz_coefficient));
+
+  memory_size += get_mem2Dint(&(p_Vid->siblock), p_Vid->FrameHeightInMbs, p_Vid->PicWidthInMbs);
+
+  init_qp_process(p_Vid);
+
+  p_Vid->global_init_done = 1;
+
+  // remember the size these buffers were allocated for
+  p_Vid->oldFrameSizeInMbs = p_Vid->FrameSizeInMbs;
+
+  return (memory_size);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Free allocated memory of frame size related global buffers
+ *    buffers are defined in global.h, allocated memory is allocated in
+ *    int init_global_buffers()
+ *
+ *    Each released pointer is reset to NULL so that a stale value can
+ *    neither be dereferenced nor freed a second time.
+ *
+ * \par Input:
+ *    p_Vid: video parameters owning the buffers
+ *
+ * \par Output:
+ *    none
+ *
+ ************************************************************************
+ */
+void free_global_buffers(VideoParameters *p_Vid)
+{
+  // CAVLC free mem (allocated with _aligned_malloc)
+  _aligned_free(p_Vid->nz_coeff);
+  p_Vid->nz_coeff = NULL;
+
+  free_mem2Dint(p_Vid->siblock);
+  p_Vid->siblock = NULL;
+
+  // free mem, allocated for structure p_Vid
+  // NOTE(review): init_global_buffers() allocates mb_data_JV[] in independent
+  // plane mode, but nothing here releases those arrays - confirm where they
+  // are freed, otherwise they leak.
+  free(p_Vid->mb_data);   // free(NULL) is a no-op, no guard needed
+  p_Vid->mb_data = NULL;
+
+  free(p_Vid->PicPos);
+  p_Vid->PicPos = NULL;
+
+  free (p_Vid->intra_block);
+  p_Vid->intra_block = NULL;
+
+  free_mem2D(p_Vid->ipredmode);
+  p_Vid->ipredmode = NULL;
+
+  free_qp_matrices(p_Vid);
+
+  // allow init_global_buffers() to allocate again without freeing first
+  p_Vid->global_init_done = 0;
+
+}
diff --git a/Src/h264dec/ldecod/src/loopFilter.c b/Src/h264dec/ldecod/src/loopFilter.c
new file mode 100644
index 00000000..c1c19a2f
--- /dev/null
+++ b/Src/h264dec/ldecod/src/loopFilter.c
@@ -0,0 +1,1338 @@
+
+/*!
+*************************************************************************************
+* \file loopFilter.c
+*
+* \brief
+*    Filter to reduce blocking artifacts on a macroblock level.
+*    The filter strength is QP dependent.
+*
+* \author
+*    Contributors:
+*    - Peter List       Peter.List@t-systems.de:  Original code                                 (13-Aug-2001)
+*    - Jani Lainema     Jani.Lainema@nokia.com:   Some bug fixing, removal of recursiveness     (16-Aug-2001)
+*    - Peter List       Peter.List@t-systems.de:  inplace filtering and various simplifications (10-Jan-2002)
+*    - Anthony Joch     anthony@ubvideo.com:      Simplified switching between filters and
+*                                                 non-recursive default filter.                 (08-Jul-2002)
+*    - Cristina Gomila  cristina.gomila@thomson.net: Simplification of the chroma deblocking
+*                                                    from JVT-E089                              (21-Nov-2002)
+*    - Alexis Michael Tourapis atour@dolby.com:   Speed/Architecture improvements               (08-Feb-2007)
+*************************************************************************************
+*/
+
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+
+/*********************************************************************************************************/
+
+// NOTE: In principle, the alpha and beta tables are calculated with the formulas below
+//       Alpha( qp ) = 0.8 * (2^(qp/6)  -  1)
+//       Beta ( qp ) = 0.5 * qp  -  7
+
+// The tables actually used have been "hand optimized" though (by Anthony Joch). So, the
+// table values might be a little different to formula-generated values. Also, the first
+// few values of both tables are set to zero to force the filter off at low QPs
+/*
+ * Deblocking threshold tables, 52 entries each.
+ *
+ * In this file they are looked up with an index that has been clipped to
+ * 0..MAX_QP:
+ *   ALPHA_TABLE[indexA]            indexA = clip(QP + DFAlphaC0Offset)
+ *   BETA_TABLE [indexB]            indexB = clip(QP + DFBetaOffset)
+ *   CLIP_TAB   [indexA][strength]  clipping value for one edge strength
+ *
+ * The first 16 rows/entries are zero, which switches the filter off for
+ * small indices.
+ */
+static const byte ALPHA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,4,4,5,6,  7,8,9,10,12,13,15,17,  20,22,25,28,32,36,40,45,  50,56,63,71,80,90,101,113,  127,144,162,182,203,226,255,255} ;
+static const byte  BETA_TABLE[52]  = {0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,2,2,2,3,  3,3,3, 4, 4, 4, 6, 6,   7, 7, 8, 8, 9, 9,10,10,  11,11,12,12,13,13, 14, 14,   15, 15, 16, 16, 17, 17, 18, 18} ;
+static const byte CLIP_TAB[52][5]  = // [indexA][strength], strength column 0 is always 0
+{
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},{ 0, 0, 0, 0, 0},
+	{ 0, 0, 0, 0, 0},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 0, 1, 1},{ 0, 0, 1, 1, 1},{ 0, 0, 1, 1, 1},{ 0, 1, 1, 1, 1},
+	{ 0, 1, 1, 1, 1},{ 0, 1, 1, 1, 1},{ 0, 1, 1, 1, 1},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 1, 2, 2},{ 0, 1, 2, 3, 3},
+	{ 0, 1, 2, 3, 3},{ 0, 2, 2, 3, 3},{ 0, 2, 2, 4, 4},{ 0, 2, 3, 4, 4},{ 0, 2, 3, 4, 4},{ 0, 3, 3, 5, 5},{ 0, 3, 4, 6, 6},{ 0, 3, 4, 6, 6},
+	{ 0, 4, 5, 7, 7},{ 0, 4, 5, 8, 8},{ 0, 4, 6, 9, 9},{ 0, 5, 7,10,10},{ 0, 6, 8,11,11},{ 0, 6, 8,13,13},{ 0, 7,10,14,14},{ 0, 8,11,16,16},
+	{ 0, 9,12,18,18},{ 0,10,13,20,20},{ 0,11,15,23,23},{ 0,13,17,25,25}
+} ;
+/*
+ * chroma_edge maps a luma edge number (0..3) of a macroblock to the chroma
+ * edge position handed to the chroma edge filter, per filtering direction
+ * and chroma format.  A negative entry means "no chroma edge here": the
+ * deblocking code only filters chroma when the looked-up value is >= 0.
+ */
+static const char chroma_edge[2][4][4] = //[dir][edge][yuv_format]
+{ { {-4, 0, 0, 0},
+{-4,-4,-4, 4},
+{-4, 4, 4, 8},
+{-4,-4,-4, 12}},
+
+{ {-4, 0,  0,  0},
+{-4,-4,  4,  4},
+{-4, 4,  8,  8},
+{-4,-4, 12, 12}}};
+
+static const int pelnum_cr[2][4] =  {{0,8,16,16}, {0,8, 8,16}};  //[dir:0=vert, 1=hor.][yuv_format]; presumably chroma pels per edge - TODO confirm at the use site
+
+/* YUV420 & non-aff optimized functions */
+void EdgeLoopLuma_Vert_YUV420(VideoImage *image, const uint8_t Strength[4], Macroblock *MbQ, PixelPos pixMB, Macroblock *MbP);
+void EdgeLoopLuma_Horiz_YUV420(VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, PixelPos pixMB, Macroblock *MbP);
+void EdgeLoopLumaMBAff_Vert_YUV420(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, StorablePicture *p);
+void EdgeLoopChroma_Vert_YUV420(VideoImage *image, const uint8_t Strength[4], Macroblock *MbQ, int uv, PixelPos pixMB, Macroblock *MbP);
+void EdgeLoopChroma_Horiz_YUV420(VideoImage *image, const byte Strength[4], Macroblock *MbQ, int uv, PixelPos pixMB, Macroblock *MbP);
+void EdgeLoopChromaMBAff_Vert_YUV420(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, int uv, StorablePicture *p);
+void GetStrength_Vert_YUV420(uint8_t Strength[4], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p, PixelPos pixMB, Macroblock *MbP);
+void GetStrength_Vert_YUV420_All(uint8_t Strength[4][4], Macroblock *MbQ, int mvlimit, StorablePicture *p, int pos_x, int pos_y, Macroblock *MbP, int luma_transform_size_8x8_flag);
+void GetStrength_Horiz_YUV420(uint8_t Strength[16], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p, PixelPos pixMB, Macroblock *MbP);
+void GetStrength_Horiz_YUV420_All(uint8_t Strength[4][4], Macroblock *MbQ, int mvlimit, StorablePicture *p, int pos_x, int pos_y, Macroblock *MbP, int luma_transform_size_8x8_flag);
+void GetStrength_MBAff_Vert_YUV420(byte Strength[16], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p);
+void GetStrengthMBAff_Horiz_YUV420(byte Strength[16], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p);
+static void Deblock_YUV420(VideoParameters *p_Vid, StorablePicture *p, int MbQAddr);
+static void Deblock_YUV420_MBAFF(VideoParameters *p_Vid, StorablePicture *p, int MbQAddr);
+/* */
+void EdgeLoopChromaNormal_Vert(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, int uv, StorablePicture *p);
+void EdgeLoopLumaNormal_Vert(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, StorablePicture *p);
+void GetStrengthNormal_Vert(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p);
+void GetStrengthNormal_Horiz(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p);
+static void GetStrengthNormal (byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int dir,int edge, int mvlimit,StorablePicture *p);
+static void GetStrengthMBAff  (byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int dir,int edge, int mvlimit,StorablePicture *p);
+static void EdgeLoopLumaNormal(ColorPlane pl, struct video_image *image, const byte Strength[MB_BLOCK_SIZE],Macroblock *MbQ, int dir, int edge, StorablePicture *p);
+static void EdgeLoopLumaMBAff (ColorPlane pl, struct video_image *image, const byte Strength[MB_BLOCK_SIZE],Macroblock *MbQ, int dir, int edge, StorablePicture *p);
+static void EdgeLoopChromaNormal(struct video_image *image, const byte Strength[MB_BLOCK_SIZE],Macroblock *MbQ, int dir, int edge, int uv, StorablePicture *p);
+static void EdgeLoopChromaMBAff(struct video_image *image, const byte Strength[MB_BLOCK_SIZE],Macroblock *MbQ, int dir, int edge, int uv, StorablePicture *p);
+static void DeblockMb(VideoParameters *p_Vid, StorablePicture *p, int MbQAddr);
+static void EdgeLoopLumaMBAff_Horiz(ColorPlane pl, VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, StorablePicture *p);
+static void EdgeLoopLumaMBAff_Vert(ColorPlane pl, VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, StorablePicture *p);
+
+/*!
+*****************************************************************************************
+* \brief
+*    Filter all macroblocks of picture p in order of increasing macroblock address.
+*
+*    Three paths are available:
+*    - Deblock_YUV420       : YUV 4:2:0, no MBAFF, non-AFF neighbour lookup installed and
+*                             no mixed mode edges
+*    - Deblock_YUV420_MBAFF : YUV 4:2:0, MBAFF, AFF neighbour lookup installed
+*    - DeblockMb            : everything else; the strength / edge loop callbacks in
+*                             p_Vid are selected here before the loop starts
+*****************************************************************************************
+*/
+void DeblockPicture(VideoParameters *p_Vid, StorablePicture *p)
+{
+	const int is_yuv420 = (p_Vid->active_sps->chroma_format_idc==YUV420);
+	unsigned mb_addr = 0;
+
+	// specialised path: 4:2:0 frame/field picture without MBAFF
+	if (is_yuv420 && !p->mb_aff_frame_flag
+		&& p_Vid->getNeighbour == getNonAffNeighbour && !p_Vid->mixedModeEdgeFlag)
+	{
+		while (mb_addr < p->PicSizeInMbs)
+		{
+			Deblock_YUV420( p_Vid, p, mb_addr ) ;
+			++mb_addr;
+		}
+		return;
+	}
+
+	// specialised path: 4:2:0 MBAFF picture
+	if (is_yuv420 && p->mb_aff_frame_flag && p_Vid->getNeighbour == getAffNeighbour)
+	{
+		while (mb_addr < p->PicSizeInMbs)
+		{
+			Deblock_YUV420_MBAFF( p_Vid, p, mb_addr ) ;
+			++mb_addr;
+		}
+		return;
+	}
+
+	// generic path: choose the callbacks used by DeblockMb()
+	if (p->mb_aff_frame_flag == 1)
+	{
+		p_Vid->GetStrength    = GetStrengthMBAff;
+		p_Vid->EdgeLoopLuma   = EdgeLoopLumaMBAff;
+		p_Vid->EdgeLoopChroma = EdgeLoopChromaMBAff;
+	}
+	else
+	{
+		p_Vid->GetStrength    = GetStrengthNormal;
+		p_Vid->EdgeLoopLuma   = EdgeLoopLumaNormal;
+		p_Vid->EdgeLoopChroma = EdgeLoopChromaNormal;
+	}
+
+	while (mb_addr < p->PicSizeInMbs)
+	{
+		DeblockMb( p_Vid, p, mb_addr ) ;
+		++mb_addr;
+	}
+}
+
+
+/*!
+*****************************************************************************************
+* \brief
+*    Deblocking filter for one macroblock (generic path).
+*
+*    Filters the four vertical edges of macroblock MbQAddr first, then the four
+*    horizontal ones.  For each edge the boundary strengths are obtained through
+*    p_Vid->GetStrength and, if any of them is non-zero, luma and chroma are
+*    filtered through p_Vid->EdgeLoopLuma / p_Vid->EdgeLoopChroma.  These callbacks
+*    are installed by DeblockPicture().
+*
+*    p_Vid->DeblockCall is 1 while the macroblock is processed, 2 while the extra
+*    mixed mode edge is processed, and 0 when the function returns.
+*
+* \param p_Vid    decoder state (macroblock array, active SPS, callbacks)
+* \param p        picture whose planes are filtered in place
+* \param MbQAddr  address of the macroblock to filter
+*****************************************************************************************
+*/
+
+static void DeblockMb(VideoParameters *p_Vid, StorablePicture *p, int MbQAddr)
+{
+	int           EdgeCondition;
+	int           dir, edge;
+	__declspec(align(32)) byte          Strength[16]; // one strength value per pel along the current edge
+	short         mb_x, mb_y;
+
+	int           filterNon8x8LumaEdgesFlag[4] = {1,1,1,1}; // entries 1 and 3 are cleared below for 8x8 transform MBs
+	int           filterLeftMbEdgeFlag;
+	int           filterTopMbEdgeFlag;
+	int           fieldModeMbFlag;
+	int           mvlimit = 4; // handed to GetStrength; lowered to 2 for field mode MBs
+	int           i, StrengthSum;
+	Macroblock    *MbQ = &(p_Vid->mb_data[MbQAddr]) ; // current Mb
+	VideoImage *imgY   = p->imgY;
+	VideoImage *imgUV[2] = {p->imgUV[0], p->imgUV[1]};
+
+	int           edge_cr; // chroma edge position from chroma_edge[], negative = no chroma edge
+
+	// return, if filter is disabled
+	if (MbQ->DFDisableIdc==1) 
+	{
+		p_Vid->DeblockCall = 0;
+		return;
+	}
+	p_Vid->DeblockCall = 1;
+	get_mb_pos (p_Vid, MbQAddr, p_Vid->mb_size[IS_LUMA], &mb_x, &mb_y);
+
+	filterLeftMbEdgeFlag = (mb_x != 0); // no left neighbour in the first column
+	filterTopMbEdgeFlag  = (mb_y != 0); // no upper neighbour in the first row
+
+	if (MbQ->mb_type == I8MB)
+		assert(MbQ->luma_transform_size_8x8_flag);
+
+	filterNon8x8LumaEdgesFlag[1] =
+		filterNon8x8LumaEdgesFlag[3] = !(MbQ->luma_transform_size_8x8_flag); // skip luma edges 1 and 3 with the 8x8 transform
+
+	if (p->mb_aff_frame_flag && mb_y == MB_BLOCK_SIZE && MbQ->mb_field)
+		filterTopMbEdgeFlag = 0;
+
+	fieldModeMbFlag = (p->structure!=FRAME) || (p->mb_aff_frame_flag && MbQ->mb_field);
+	if (fieldModeMbFlag)
+		mvlimit = 2;
+
+	if (MbQ->DFDisableIdc==2)
+	{
+		// don't filter at slice boundaries
+		filterLeftMbEdgeFlag = MbQ->mb_avail_left;
+		// if this the bottom of a frame macroblock pair then always filter the top edge
+		filterTopMbEdgeFlag  = (p->mb_aff_frame_flag && !MbQ->mb_field && (MbQAddr & 0x01)) ? 1 : MbQ->mb_avail_up;
+	}
+
+	CheckAvailabilityOfNeighbors(MbQ);
+
+	for( dir = 0 ; dir < 2 ; ++dir )                                                      // filter first vertical edges, followed by horizontal 
+	{
+		EdgeCondition = (dir && filterTopMbEdgeFlag) || (!dir && filterLeftMbEdgeFlag); // can not filter beyond picture boundaries
+		for( edge=0; edge<4 ; ++edge )                                            // first 4 vertical strips of 16 pel
+		{                                                                               // then  4 horizontal
+			if( edge || EdgeCondition ) // edge 0 is the macroblock border and needs a usable neighbour
+			{
+				edge_cr = chroma_edge[dir][edge][p->chroma_format_idc];
+
+				p_Vid->GetStrength(Strength, MbQ, dir, edge << 2, mvlimit, p); // Strength for 4 blks in 1 stripe
+				StrengthSum = Strength[0];
+				for (i = 1; i < MB_BLOCK_SIZE && StrengthSum == 0 ; ++i) // stops at the first non-zero strength
+				{
+					StrengthSum += (int) Strength[i];
+				}
+
+				if( StrengthSum )                      // only if one of the 16 Strength bytes is != 0
+				{
+					if (filterNon8x8LumaEdgesFlag[edge])
+					{
+						p_Vid->EdgeLoopLuma( PLANE_Y, imgY, Strength, MbQ, dir, edge << 2, p) ;
+						if( p_Vid->active_sps->chroma_format_idc==YUV444 && !IS_INDEPENDENT(p_Vid) ) // 4:4:4: chroma planes go through the luma filter
+						{
+							p_Vid->EdgeLoopLuma(PLANE_U, imgUV[0], Strength, MbQ, dir, edge << 2, p);
+							p_Vid->EdgeLoopLuma(PLANE_V, imgUV[1], Strength, MbQ, dir, edge << 2, p);
+						}
+					}
+					if (p_Vid->active_sps->chroma_format_idc==YUV420 || p_Vid->active_sps->chroma_format_idc==YUV422)
+					{
+						if( (imgUV != NULL) && (edge_cr >= 0)) // NOTE(review): imgUV is a local array, so the NULL test is always true
+						{
+							p_Vid->EdgeLoopChroma( imgUV[0], Strength, MbQ, dir, edge_cr, 0, p);
+							p_Vid->EdgeLoopChroma( imgUV[1], Strength, MbQ, dir, edge_cr, 1, p);
+						}
+					}
+				}
+
+				if (dir && !edge && !MbQ->mb_field && p_Vid->mixedModeEdgeFlag) 
+				{
+					// this is the extra horizontal edge between a frame macroblock pair and a field above it
+					p_Vid->DeblockCall = 2;
+					p_Vid->GetStrength(Strength, MbQ, 1, MB_BLOCK_SIZE, mvlimit, p); // Strength for 4 blks in 1 stripe
+					//if( *((int*)Strength) )                      // only if one of the 4 Strength bytes is != 0
+					{            
+						if (filterNon8x8LumaEdgesFlag[edge]) // edge is 0 here and entry 0 is never cleared
+						{             
+							p_Vid->EdgeLoopLuma(PLANE_Y, imgY, Strength, MbQ, dir, MB_BLOCK_SIZE, p) ;
+							if( p_Vid->active_sps->chroma_format_idc==YUV444 && !IS_INDEPENDENT(p_Vid) )
+							{
+								p_Vid->EdgeLoopLuma(PLANE_U, imgUV[0], Strength, MbQ, dir, MB_BLOCK_SIZE, p) ;
+								p_Vid->EdgeLoopLuma(PLANE_V, imgUV[1], Strength, MbQ, dir, MB_BLOCK_SIZE, p) ;
+							}
+						}
+						if (p_Vid->active_sps->chroma_format_idc==YUV420 || p_Vid->active_sps->chroma_format_idc==YUV422) 
+						{
+							if( (imgUV != NULL) && (edge_cr >= 0)) // edge_cr still holds the value looked up for edge 0
+							{
+								p_Vid->EdgeLoopChroma( imgUV[0], Strength, MbQ, dir, MB_BLOCK_SIZE, 0, p) ;
+								p_Vid->EdgeLoopChroma( imgUV[1], Strength, MbQ, dir, MB_BLOCK_SIZE, 1, p) ;
+							}
+						}
+					}
+					p_Vid->DeblockCall = 1; // back to the regular edge state
+				}
+			}
+		}//end edge
+	}//end loop dir
+
+	p_Vid->DeblockCall = 0;
+}
+
+
+/*!
+*****************************************************************************************
+* \brief
+*    Deblocking filter for one macroblock, specialised for YUV 4:2:0 MBAFF pictures.
+*
+*    DeblockPicture() selects this function when the picture has mb_aff_frame_flag
+*    set, the chroma format is YUV420 and the AFF neighbour lookup is installed.
+*    It has the same structure as DeblockMb(), but the two directions are written
+*    out as separate passes and the filter functions are called directly instead of
+*    through the callbacks in p_Vid.
+*
+*    p_Vid->DeblockCall is 1 while the macroblock is processed, 2 while the extra
+*    mixed mode edge is processed, and 0 when the function returns.
+*
+* \param p_Vid    decoder state (macroblock array, PicPos table)
+* \param p        picture whose planes are filtered in place
+* \param MbQAddr  address of the macroblock to filter
+*****************************************************************************************
+*/
+static void Deblock_YUV420_MBAFF(VideoParameters *p_Vid, StorablePicture *p, int MbQAddr)
+{
+	int           EdgeCondition;
+	int           edge;
+	__declspec(align(32)) byte          Strength[16]; // one strength value per pel along the current edge
+	short         mb_x, mb_y;
+
+	int           filterNon8x8LumaEdgesFlag[4] = {1,1,1,1}; // entries 1 and 3 are cleared below for 8x8 transform MBs
+	int           filterLeftMbEdgeFlag;
+	int           filterTopMbEdgeFlag;
+	int           fieldModeMbFlag;
+	int           mvlimit = 4; // handed to the strength functions; lowered to 2 for field mode MBs
+	int           i, StrengthSum;
+	Macroblock    *MbQ = &(p_Vid->mb_data[MbQAddr]) ; // current Mb
+	VideoImage *imgY   = p->imgY;
+	VideoImage *imgUV[2] = {p->imgUV[0], p->imgUV[1]};
+
+	int           edge_cr; // chroma edge position from chroma_edge[], negative = no chroma edge
+
+	// return, if filter is disabled
+	if (MbQ->DFDisableIdc==1) 
+	{
+		p_Vid->DeblockCall = 0;
+		return;
+	}
+	p_Vid->DeblockCall = 1;
+	get_mb_block_pos_mbaff(p_Vid->PicPos, MbQAddr, &mb_x, &mb_y); // presumably a position in macroblock units (compare mb_y == 1 below) - TODO confirm
+
+	filterLeftMbEdgeFlag = (mb_x != 0);
+	filterTopMbEdgeFlag  = (mb_y != 0);
+
+	if (MbQ->mb_type == I8MB)
+		assert(MbQ->luma_transform_size_8x8_flag);
+
+	filterNon8x8LumaEdgesFlag[1] =
+		filterNon8x8LumaEdgesFlag[3] = !(MbQ->luma_transform_size_8x8_flag); // skip luma edges 1 and 3 with the 8x8 transform
+
+	if (1 && mb_y == 1 && MbQ->mb_field) // the constant 1 stands where DeblockMb() tests p->mb_aff_frame_flag
+		filterTopMbEdgeFlag = 0;
+
+	fieldModeMbFlag = (p->structure!=FRAME) || MbQ->mb_field;
+	if (fieldModeMbFlag)
+		mvlimit = 2;
+
+	if (MbQ->DFDisableIdc==2)
+	{
+		// don't filter at slice boundaries
+		filterLeftMbEdgeFlag = MbQ->mb_avail_left;
+		// if this the bottom of a frame macroblock pair then always filter the top edge
+		filterTopMbEdgeFlag  = (1 && !MbQ->mb_field && (MbQAddr & 0x01)) ? 1 : MbQ->mb_avail_up;
+	}
+
+	CheckAvailabilityOfNeighbors(MbQ);
+
+
+		EdgeCondition = filterLeftMbEdgeFlag; // can not filter beyond picture boundaries
+		for( edge=0; edge<4 ; ++edge )                                            // pass 1: the 4 vertical edges
+		{
+			if( edge || EdgeCondition ) // edge 0 is the macroblock border and needs a usable neighbour
+			{
+				edge_cr = chroma_edge[0][edge][YUV420];
+
+				GetStrength_MBAff_Vert_YUV420(Strength, MbQ, edge << 2, mvlimit, p); // Strength for 4 blks in 1 stripe
+				StrengthSum = Strength[0];
+				for (i = 1; i < MB_BLOCK_SIZE && StrengthSum == 0 ; ++i) // stops at the first non-zero strength
+				{
+					StrengthSum += (int) Strength[i];
+				}
+
+				if( StrengthSum )                      // only if one of the 16 Strength bytes is != 0
+				{
+					if (filterNon8x8LumaEdgesFlag[edge])
+					{
+						EdgeLoopLumaMBAff_Vert_YUV420(imgY, Strength, MbQ, edge << 2, p) ;
+					}
+					if( (imgUV != NULL) && (edge_cr >= 0)) // NOTE(review): imgUV is a local array, so the NULL test is always true
+					{
+						EdgeLoopChromaMBAff_Vert_YUV420( imgUV[0], Strength, MbQ, edge_cr, 0, p);
+						EdgeLoopChromaMBAff_Vert_YUV420( imgUV[1], Strength, MbQ, edge_cr, 1, p);
+					}
+				}
+			}
+		}//end edge
+
+			EdgeCondition = filterTopMbEdgeFlag; // can not filter beyond picture boundaries
+		for( edge=0; edge<4 ; ++edge )                                            // pass 2: the 4 horizontal edges
+		{
+			if( edge || EdgeCondition )
+			{
+				edge_cr = chroma_edge[1][edge][YUV420];
+
+				GetStrengthMBAff_Horiz_YUV420(Strength, MbQ, edge << 2, mvlimit, p); // Strength for 4 blks in 1 stripe
+				StrengthSum = Strength[0];
+				for (i = 1; i < MB_BLOCK_SIZE && StrengthSum == 0 ; ++i)
+				{
+					StrengthSum += (int) Strength[i];
+				}
+
+				if( StrengthSum )                      // only if one of the 16 Strength bytes is != 0
+				{
+					if (filterNon8x8LumaEdgesFlag[edge])
+					{
+						EdgeLoopLumaMBAff_Horiz( PLANE_Y, imgY, Strength, MbQ, edge << 2, p) ;
+					}
+					if( (imgUV != NULL) && (edge_cr >= 0))
+					{
+						EdgeLoopChromaMBAff( imgUV[0], Strength, MbQ, 1, edge_cr, 0, p); // dir argument 1 = horizontal
+						EdgeLoopChromaMBAff( imgUV[1], Strength, MbQ, 1, edge_cr, 1, p);
+					}
+				}
+						if (!edge && !MbQ->mb_field && p_Vid->mixedModeEdgeFlag) 
+				{
+					// this is the extra horizontal edge between a frame macroblock pair and a field above it
+					p_Vid->DeblockCall = 2;
+					GetStrengthMBAff(Strength, MbQ, 1, MB_BLOCK_SIZE, mvlimit, p); // Strength for 4 blks in 1 stripe
+					//if( *((int*)Strength) )                      // only if one of the 4 Strength bytes is != 0
+					{            
+						EdgeLoopLumaMBAff_Horiz(PLANE_Y, imgY, Strength, MbQ, MB_BLOCK_SIZE, p) ; // unlike DeblockMb(), luma and chroma are filtered unconditionally here
+
+						EdgeLoopChromaMBAff( imgUV[0], Strength, MbQ, 1, MB_BLOCK_SIZE, 0, p) ;
+						EdgeLoopChromaMBAff( imgUV[1], Strength, MbQ, 1, MB_BLOCK_SIZE, 1, p) ;
+
+					}
+					p_Vid->DeblockCall = 1; // back to the regular edge state
+				}
+			}
+		}//end edge
+	
+
+	p_Vid->DeblockCall = 0;
+}
+
+
+
+static void Deblock_YUV420(VideoParameters *p_Vid, StorablePicture *p, int MbQAddr)
+{
+	__declspec(align(32)) union 
+	{
+		uint32_t as32[4];
+		uint8_t as8[16];
+		uint8_t edge[4][4];
+	} strength;
+	uint8_t alphas[2], alphas_chroma[2][2];
+	uint8_t betas[2], betas_chroma[2][2];
+	__declspec(align(32)) union 
+	{
+		uint32_t as32[4];
+		uint8_t as8[16];
+	} thresholds;
+	short         mb_x, mb_y;
+	int           filterLeftMbEdgeFlag;
+	int           filterTopMbEdgeFlag;
+	int           mvlimit = 4;
+	Macroblock    *MbQ = &(p_Vid->mb_data[MbQAddr]) ; // current Mb
+	Macroblock *MbP=0;
+	VideoImage *imgY   = p->imgY;
+	imgpel *YQ, *UQ, *VQ;
+	VideoImage *imgUV[2] = {p->imgUV[0], p->imgUV[1]};
+	int QPQ = MbQ->qp;
+	int indexAQ = iClip3(0, MAX_QP, QPQ + MbQ->DFAlphaC0Offset);
+	int indexBQ = iClip3(0, MAX_QP, QPQ + MbQ->DFBetaOffset);
+	const byte *ClipTabQ = CLIP_TAB[indexAQ], *ClipTabQ_Chroma[2], *ClipTabP_Chroma[2];
+
+	// return, if filter is disabled
+	if (MbQ->DFDisableIdc==1) 
+	{
+		p_Vid->DeblockCall = 0;
+		return;
+	}
+
+	alphas[1]  = ALPHA_TABLE[indexAQ];
+	betas[1]  = BETA_TABLE [indexBQ];
+
+	indexAQ = iClip3(0, MAX_QP, MbQ->qpc[0] + MbQ->DFAlphaC0Offset);
+	alphas_chroma[0][1]  = ALPHA_TABLE[indexAQ];
+	ClipTabQ_Chroma[0] = CLIP_TAB   [indexAQ];
+	indexAQ = iClip3(0, MAX_QP, MbQ->qpc[1] + MbQ->DFAlphaC0Offset);
+	alphas_chroma[1][1]  = ALPHA_TABLE[indexAQ];
+	ClipTabQ_Chroma[1] = CLIP_TAB   [indexAQ];
+
+	indexBQ = iClip3(0, MAX_QP, MbQ->qpc[0] + MbQ->DFBetaOffset);
+	betas_chroma[0][1]  = BETA_TABLE[indexBQ];
+	indexBQ = iClip3(0, MAX_QP, MbQ->qpc[1] + MbQ->DFBetaOffset);
+	betas_chroma[1][1]  = BETA_TABLE[indexBQ];
+
+	p_Vid->DeblockCall = 1;
+	get_mb_block_pos_normal(p_Vid->PicPos, MbQAddr, &mb_x, &mb_y);
+
+	filterLeftMbEdgeFlag = (mb_x != 0);
+	filterTopMbEdgeFlag  = (mb_y != 0);
+	YQ = imgY->base_address + mb_y*16 * imgY->stride + mb_x*16;
+	UQ = imgUV[0]->base_address + mb_y * 8 * imgUV[0]->stride + mb_x * 8;
+	VQ = imgUV[1]->base_address + mb_y * 8 * imgUV[1]->stride + mb_x * 8;
+
+	if (p->structure!=FRAME)
+		mvlimit = 2;
+
+	if (MbQ->DFDisableIdc==2)
+	{
+		// don't filter at slice boundaries
+		filterLeftMbEdgeFlag = MbQ->mb_avail_left;
+		// if this the bottom of a frame macroblock pair then always filter the top edge
+		filterTopMbEdgeFlag  = MbQ->mb_avail_up;
+	}
+
+	//CheckAvailabilityOfNeighbors(MbQ);
+
+#pragma region vertical
+	if(filterLeftMbEdgeFlag)  // can not filter beyond picture boundaries
+	{
+		MbP = &(p_Vid->mb_data[MbQ->mb_addr_left]);
+	}
+	else
+		MbP=0;
+
+	GetStrength_Vert_YUV420_All(strength.edge, MbQ, mvlimit, p, mb_x*4, mb_y*4, MbP, MbQ->luma_transform_size_8x8_flag);
+
+	{
+		int i;
+		if (MbP)
+		{
+			int QP_Chroma0 = (MbP->qpc[0] + MbQ->qpc[0] + 1) >> 1;
+			int QP_Chroma1 = (MbP->qpc[1] + MbQ->qpc[1] + 1) >> 1;
+			int QP = (MbP->qp + QPQ + 1) >> 1;
+
+			int indexA = iClip3(0, MAX_QP, QP + MbQ->DFAlphaC0Offset);
+			int indexB = iClip3(0, MAX_QP, QP + MbQ->DFBetaOffset);
+			const byte *ClipTab = CLIP_TAB[indexA];
+
+			alphas[0]  = ALPHA_TABLE[indexA];
+			betas[0]  = BETA_TABLE [indexB];
+			for (i=0;i<4;i++)
+			{
+				thresholds.as8[i] = ClipTab[strength.as8[i]];
+			}
+
+			indexA = iClip3(0, MAX_QP, QP_Chroma0 + MbQ->DFAlphaC0Offset);
+			alphas_chroma[0][0] = ALPHA_TABLE[indexA];
+			ClipTabP_Chroma[0] = CLIP_TAB[indexA];
+			indexB = iClip3(0, MAX_QP, QP_Chroma0 + MbQ->DFBetaOffset);
+			betas_chroma[0][0] =BETA_TABLE[indexB];
+
+			indexA = iClip3(0, MAX_QP, QP_Chroma1 + MbQ->DFAlphaC0Offset);
+			alphas_chroma[1][0] = ALPHA_TABLE[indexA];
+			ClipTabP_Chroma[1] = CLIP_TAB[indexA];
+			indexB = iClip3(0, MAX_QP, QP_Chroma1 + MbQ->DFBetaOffset);
+			betas_chroma[1][0] = BETA_TABLE[indexB];
+		}
+
+		for (i=4;i<16;i++)
+		{
+			thresholds.as8[i] = ClipTabQ[strength.as8[i]];
+		}
+
+		ippiFilterDeblockingLuma_VerEdge_H264_8u_C1IR(YQ, imgY->stride, alphas, betas, thresholds.as8, strength.as8);
+
+		if (MbP)
+		{
+			for (i=0;i<4;i++)
+			{
+				thresholds.as8[i] = ClipTabP_Chroma[0][strength.as8[i]];
+			}
+		}
+		for (i=4;i<8;i++)
+		{
+			thresholds.as8[i] = ClipTabQ_Chroma[0][strength.as8[i+4]];
+		}		
+		ippiFilterDeblockingChroma_VerEdge_H264_8u_C1IR(UQ, imgUV[0]->stride, alphas_chroma[0], betas_chroma[0], thresholds.as8, strength.as8);
+
+		if (MbP)
+		{
+			for (i=0;i<4;i++)
+			{
+				thresholds.as8[i] = ClipTabP_Chroma[1][strength.as8[i]];
+			}
+		}
+		for (i=4;i<8;i++)
+		{
+			thresholds.as8[i] = ClipTabQ_Chroma[1][strength.as8[i+4]];
+		}		
+		ippiFilterDeblockingChroma_VerEdge_H264_8u_C1IR(VQ, imgUV[1]->stride, alphas_chroma[1], betas_chroma[1], thresholds.as8, strength.as8);
+
+	}
+
+#pragma endregion
+
+
+#pragma region horizontal
+	MbP = 0;
+	/* ---- horizontal ---- */
+	//	edge=0;
+	if(filterTopMbEdgeFlag) // can not filter beyond picture boundaries
+	{
+		MbP = &(p_Vid->mb_data[MbQ->mb_addr_up]);
+	}
+	else
+	{
+		MbP = 0;
+	}
+
+	GetStrength_Horiz_YUV420_All(strength.edge, MbQ, mvlimit, p, mb_x*4, mb_y*4, MbP, MbQ->luma_transform_size_8x8_flag);
+
+	{
+		int i;
+
+		if (MbP)
+		{
+			int QP_Chroma0 = (MbP->qpc[0] + MbQ->qpc[0] + 1) >> 1;
+			int QP_Chroma1 = (MbP->qpc[1] + MbQ->qpc[1] + 1) >> 1;
+			int QP = (MbP->qp + QPQ + 1) >> 1;
+
+			int indexA = iClip3(0, MAX_QP, QP + MbQ->DFAlphaC0Offset);
+			int indexB = iClip3(0, MAX_QP, QP + MbQ->DFBetaOffset);
+			const byte *ClipTab = CLIP_TAB[indexA];
+
+			alphas[0]  = ALPHA_TABLE[indexA];
+			betas[0]  = BETA_TABLE [indexB];
+			for (i=0;i<4;i++)
+			{
+				thresholds.as8[i] = ClipTab[strength.as8[i]];
+			}
+
+			indexA = iClip3(0, MAX_QP, QP_Chroma0 + MbQ->DFAlphaC0Offset);
+			alphas_chroma[0][0] = ALPHA_TABLE[indexA];
+			ClipTabP_Chroma[0] = CLIP_TAB[indexA];
+			indexB = iClip3(0, MAX_QP, QP_Chroma0 + MbQ->DFBetaOffset);
+			betas_chroma[0][0] =BETA_TABLE[indexB];
+
+			indexA = iClip3(0, MAX_QP, QP_Chroma1 + MbQ->DFAlphaC0Offset);
+			alphas_chroma[1][0] = ALPHA_TABLE[indexA];
+			ClipTabP_Chroma[1] = CLIP_TAB[indexA];
+			indexB = iClip3(0, MAX_QP, QP_Chroma1 + MbQ->DFBetaOffset);
+			betas_chroma[1][0] = BETA_TABLE[indexB];
+		}
+
+		for (i=4;i<16;i++)
+		{
+			thresholds.as8[i] = ClipTabQ[strength.as8[i]];
+		}
+
+		ippiFilterDeblockingLuma_HorEdge_H264_8u_C1IR(YQ, imgY->stride, alphas, betas, thresholds.as8, strength.as8);
+
+		if (MbP)
+		{
+			for (i=0;i<4;i++)
+			{
+				thresholds.as8[i] = ClipTabP_Chroma[0][strength.as8[i]];
+			}
+		}
+		for (i=4;i<8;i++)
+		{
+			thresholds.as8[i] = ClipTabQ_Chroma[0][strength.as8[i+4]];
+		}		
+		ippiFilterDeblockingChroma_HorEdge_H264_8u_C1IR(UQ, imgUV[0]->stride, alphas_chroma[0], betas_chroma[0], thresholds.as8, strength.as8);
+
+		if (MbP)
+		{
+			for (i=0;i<4;i++)
+			{
+				thresholds.as8[i] = ClipTabP_Chroma[1][strength.as8[i]];
+			}
+		}
+		for (i=4;i<8;i++)
+		{
+			thresholds.as8[i] = ClipTabQ_Chroma[1][strength.as8[i+4]];
+		}		
+		ippiFilterDeblockingChroma_HorEdge_H264_8u_C1IR(VQ, imgUV[1]->stride, alphas_chroma[1], betas_chroma[1], thresholds.as8, strength.as8);
+
+	}
+#pragma endregion
+
+	p_Vid->DeblockCall = 0;
+}
+
+
+#define ANY_INTRA (MbP->mb_type==I4MB||MbP->mb_type==I8MB||MbP->mb_type==I16MB||MbP->mb_type==IPCM||MbQ->mb_type==I4MB||MbQ->mb_type==I8MB||MbQ->mb_type==I16MB||MbQ->mb_type==IPCM) /* nonzero when MbP or MbQ (both resolved at the point of use) has mb_type I4MB, I8MB, I16MB or IPCM */
+
+/*!
+*********************************************************************************************
+* \brief
+*    Fills Strength[] (MB_BLOCK_SIZE values) for one edge of MbQ in Frame or Field coded
+*    pictures by forwarding to the per-direction routine selected by dir (0 = vertical edge).
+*/
+void GetStrengthNormal_Horiz(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p); // implemented outside this section
+void GetStrengthNormal_Vert(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p); // implemented outside this section
+
+static void GetStrengthNormal(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int dir, int edge, int mvlimit, StorablePicture *p)
+{
+	if (dir == 0) // dir == 0: vertical edge
+		GetStrengthNormal_Vert(Strength, MbQ, edge, mvlimit, p);
+	else // any other dir value: horizontal edge
+		GetStrengthNormal_Horiz(Strength, MbQ, edge, mvlimit, p);
+}
+
+/*!
+*********************************************************************************************
+* \brief
+*    MBAFF, horizontal edge: writes 16 boundary-strength values (0..4) into Strength[], one per
+*    luma column of MbQ, comparing MbQ with the neighbour found one row above row `edge`.
+*/
+static void GetStrengthMBAff_Horiz(byte Strength[16], Macroblock *MbQ, int dir, int edge, int mvlimit, StorablePicture *p)
+{
+	// dir == 1 is assumed by the caller; the dir parameter itself is never read in this function
+	short  blkP, blkQ, idx;
+	short  blk_x, blk_x2, blk_y, blk_y2 ;
+	h264_ref_t  ref_p0,ref_p1,ref_q0,ref_q1;
+	int    xQ, yQ;
+	short  mb_x, mb_y;
+	Macroblock *MbP;
+
+	PixelPos pixP;
+	int dir_m1 = 0; // never read below
+
+	PicMotionParams *motion = &p->motion;
+	PicMotion **motion0 = motion->motion[LIST_0]; // per-4x4-block motion data, indexed [blk_y][blk_x] below
+	PicMotion **motion1 = motion->motion[LIST_1];
+	yQ = (edge < MB_BLOCK_SIZE ? edge : 1); // an edge value >= MB_BLOCK_SIZE is mapped to row 1
+
+	for( idx = 0; idx < 16; ++idx ) // one strength value per luma column of MbQ
+	{
+		VideoParameters *p_Vid = MbQ->p_Vid;
+		xQ = idx;
+
+		p_Vid->getNeighbourLuma(MbQ, xQ , yQ - 1, &pixP); // pixP: the sample one row above (xQ, yQ)
+		blkQ = (short) ((yQ & 0xFFFC) + (xQ >> 2)); // 4x4 block number inside MbQ: 4*(yQ/4) + xQ/4
+		blkP = (short) ((pixP.y & 0xFFFC) + (pixP.x >> 2)); // same numbering, built from pixP.x / pixP.y
+
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]); // macroblock holding the neighbouring sample
+		p_Vid->mixedModeEdgeFlag = (byte) (MbQ->mb_field != MbP->mb_field);   // 1 when the two MBs differ in mb_field; stored in p_Vid
+
+		if ((p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE) ) // SP/SI slices only ever get 3 or 4
+		{
+			Strength[idx] = (edge == 0 && (((!p->mb_aff_frame_flag && (p->structure==FRAME)) ||
+				(p->mb_aff_frame_flag && !MbP->mb_field && !MbQ->mb_field)))) ? 4 : 3;
+		}
+		else
+		{
+			// Start with Strength=3, or Strength=4 on edge 0 when the frame/field test holds; this value stays if either MB is intra
+			Strength[idx] = (edge == 0 && (((!p->mb_aff_frame_flag && (p->structure==FRAME)) ||
+				(p->mb_aff_frame_flag && !MbP->mb_field && !MbQ->mb_field)))) ? 4 : 3;
+
+			if(  !(MbP->mb_type==I4MB || MbP->mb_type==I16MB || MbP->mb_type==I8MB || MbP->mb_type==IPCM)
+				&& !(MbQ->mb_type==I4MB || MbQ->mb_type==I16MB || MbQ->mb_type==I8MB || MbQ->mb_type==IPCM) ) // neither MB is intra
+			{
+				if( ((MbQ->cbp_blk[0] &  ((int64)1 << blkQ )) != 0) || ((MbP->cbp_blk[0] &  ((int64)1 << blkP)) != 0) ) // cbp_blk[0] bit set for either 4x4 block
+					Strength[idx] = 2 ;
+				else
+				{
+					// if no coefs, but vector difference >= 1 set Strength=1
+					// if this is a mixed mode edge then one set of reference pictures will be frame and the
+					// other will be field
+					if (p_Vid->mixedModeEdgeFlag)
+					{
+						(Strength[idx] = 1);
+					}
+					else
+					{
+						p_Vid->get_mb_block_pos (p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y); // position of MbQ; scaled by 4 below to address 4x4 blocks
+						blk_y  = (short) ((mb_y<<2) + (blkQ >> 2)); // picture-level 4x4 block row of the MbQ-side block
+						blk_x  = (short) ((mb_x<<2) + (blkQ  & 3)); // picture-level 4x4 block column of the MbQ-side block
+						blk_y2 = (short) (pixP.pos_y >> 2); // 4x4 block row of the neighbouring sample
+						blk_x2 = (short) (pixP.pos_x >> 2); // 4x4 block column of the neighbouring sample
+						{
+							PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1; // NOTE: motion_p* is read at MbQ's block and motion_q* at the neighbour's; every test below is symmetric in p/q
+							motion_p0=&motion0[blk_y ][blk_x ];
+							motion_q0=&motion0[blk_y2][blk_x2];
+							motion_p1=&motion1[blk_y ][blk_x ];
+							motion_q1=&motion1[blk_y2][blk_x2];
+
+							ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id; // a negative ref_idx is compared as UNDEFINED_REFERENCE
+							ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+							ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+							ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+							if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) ||
+								((ref_p0==ref_q1) && (ref_p1==ref_q0))) // both blocks use the same pair of reference ids, in either order
+							{
+								Strength[idx]=0;
+								// L0 and L1 reference pictures of p0 are different; q0 as well
+								if (ref_p0 != ref_p1)
+								{
+									// compare MV for the same reference picture
+									if (ref_p0==ref_q0)
+									{
+										Strength[idx] =  (byte) ( // 1 if any mv[0] difference >= 4 or any mv[1] difference >= mvlimit, else 0
+											(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+											(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+											(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+											(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+									}
+									else
+									{
+										Strength[idx] =  (byte) ( // same test with the lists crossed (list 0 against list 1)
+											(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+											(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+											(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+											(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+									}
+								}
+								else
+								{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+
+									Strength[idx] = (byte) ( // 1 only if BOTH the straight and the crossed pairing exceed a threshold
+										((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+										&&
+										((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+										(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+								}
+							}
+							else
+							{
+								Strength[idx] = 1; // reference ids differ between the two blocks
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+static void GetStrengthMBAff_Vert(byte Strength[16], Macroblock *MbQ, int dir, int edge, int mvlimit, StorablePicture *p)
+{
+	// dir == 0 (vertical edge, MBAFF): writes 16 strength values (0..4), one per luma row of MbQ, against the neighbour left of column `edge`; the dir parameter is never read
+	short  blkP, blkQ, idx;
+	short  blk_x, blk_x2, blk_y, blk_y2 ;
+	h264_ref_t  ref_p0,ref_p1,ref_q0,ref_q1;
+	int    xQ, yQ;
+	short  mb_x, mb_y;
+	Macroblock *MbP;
+
+	PixelPos pixP;
+	int dir_m1 = 1; // never read below
+
+	PicMotionParams *motion = &p->motion;
+	PicMotion **motion0 = motion->motion[LIST_0]; // per-4x4-block motion data, indexed [blk_y][blk_x] below
+	PicMotion **motion1 = motion->motion[LIST_1];
+	xQ = edge; // the column is fixed for the whole edge
+	for( idx = 0; idx < 16; ++idx ) // one strength value per luma row of MbQ
+	{
+		VideoParameters *p_Vid = MbQ->p_Vid;
+
+		yQ = idx;
+		p_Vid->getNeighbourLuma(MbQ, xQ - 1, yQ, &pixP); // pixP: the sample one column left of (xQ, yQ)
+		blkQ = (short) ((yQ & 0xFFFC) + (xQ >> 2)); // 4x4 block number inside MbQ: 4*(yQ/4) + xQ/4
+		blkP = (short) ((pixP.y & 0xFFFC) + (pixP.x >> 2)); // same numbering, built from pixP.x / pixP.y
+
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]); // macroblock holding the neighbouring sample
+		p_Vid->mixedModeEdgeFlag = (byte) (MbQ->mb_field != MbP->mb_field);   // 1 when the two MBs differ in mb_field; stored in p_Vid
+
+		if ((p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE) ) // SP/SI slices only ever get 3 or 4
+		{
+			Strength[idx] = (edge == 0 && (((!p->mb_aff_frame_flag && (p->structure==FRAME)) || // the first and third alternatives are complementary,
+				(p->mb_aff_frame_flag && !MbP->mb_field && !MbQ->mb_field)) || // so the whole expression reduces to
+				((p->mb_aff_frame_flag || (p->structure != FRAME))))) ? 4 : 3; // (edge == 0) ? 4 : 3
+		}
+		else
+		{
+			// Start with Strength=3, or Strength=4 on edge 0 (the condition reduces to edge == 0, as above); this value stays if either MB is intra
+			Strength[idx] = (edge == 0 && (((!p->mb_aff_frame_flag && (p->structure==FRAME)) ||
+				(p->mb_aff_frame_flag && !MbP->mb_field && !MbQ->mb_field)) ||
+				((p->mb_aff_frame_flag || (p->structure!=FRAME))))) ? 4 : 3;
+
+			if(  !(MbP->mb_type==I4MB || MbP->mb_type==I16MB || MbP->mb_type==I8MB || MbP->mb_type==IPCM)
+				&& !(MbQ->mb_type==I4MB || MbQ->mb_type==I16MB || MbQ->mb_type==I8MB || MbQ->mb_type==IPCM) ) // neither MB is intra
+			{
+				if( ((MbQ->cbp_blk[0] &  ((int64)1 << blkQ )) != 0) || ((MbP->cbp_blk[0] &  ((int64)1 << blkP)) != 0) ) // cbp_blk[0] bit set for either 4x4 block
+					Strength[idx] = 2 ;
+				else
+				{
+					// if no coefs, but vector difference >= 1 set Strength=1
+					// if this is a mixed mode edge then one set of reference pictures will be frame and the
+					// other will be field
+					if (p_Vid->mixedModeEdgeFlag)
+					{
+						(Strength[idx] = 1);
+					}
+					else
+					{
+						p_Vid->get_mb_block_pos (p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y); // position of MbQ; scaled by 4 below to address 4x4 blocks
+						blk_y  = (short) ((mb_y<<2) + (blkQ >> 2)); // picture-level 4x4 block row of the MbQ-side block
+						blk_x  = (short) ((mb_x<<2) + (blkQ  & 3)); // picture-level 4x4 block column of the MbQ-side block
+						blk_y2 = (short) (pixP.pos_y >> 2); // 4x4 block row of the neighbouring sample
+						blk_x2 = (short) (pixP.pos_x >> 2); // 4x4 block column of the neighbouring sample
+						{
+							PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1; // NOTE: motion_p* is read at MbQ's block and motion_q* at the neighbour's; every test below is symmetric in p/q
+							motion_p0=&motion0[blk_y ][blk_x ];
+							motion_q0=&motion0[blk_y2][blk_x2];
+							motion_p1=&motion1[blk_y ][blk_x ];
+							motion_q1=&motion1[blk_y2][blk_x2];
+
+							ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id; // a negative ref_idx is compared as UNDEFINED_REFERENCE
+							ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+							ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+							ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+
+							if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) ||
+								((ref_p0==ref_q1) && (ref_p1==ref_q0))) // both blocks use the same pair of reference ids, in either order
+							{
+								Strength[idx]=0;
+								// L0 and L1 reference pictures of p0 are different; q0 as well
+								if (ref_p0 != ref_p1)
+								{
+									// compare MV for the same reference picture
+									if (ref_p0==ref_q0)
+									{
+										Strength[idx] =  (byte) ( // 1 if any mv[0] difference >= 4 or any mv[1] difference >= mvlimit, else 0
+											(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+											(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+											(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+											(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+									}
+									else
+									{
+										Strength[idx] =  (byte) ( // same test with the lists crossed (list 0 against list 1)
+											(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+											(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+											(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+											(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+									}
+								}
+								else
+								{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+
+									Strength[idx] = (byte) ( // 1 only if BOTH the straight and the crossed pairing exceed a threshold
+										((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+										&&
+										((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+										(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+								}
+							}
+							else
+							{
+								Strength[idx] = 1; // reference ids differ between the two blocks
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+static void GetStrengthMBAff(byte Strength[16], Macroblock *MbQ, int dir, int edge, int mvlimit, StorablePicture *p) // MBAFF strength computation: forwards to the per-direction routine
+{
+	if (dir == 0) // dir == 0: vertical edge
+		GetStrengthMBAff_Vert(Strength, MbQ, dir, edge, mvlimit, p);
+	else // any other dir value: horizontal edge
+		GetStrengthMBAff_Horiz(Strength, MbQ, dir, edge, mvlimit, p);
+}
+
+/*!
+*****************************************************************************************
+* \brief
+*    Filters a 16 pel block edge of Frame or Field coded MBs by forwarding to the vertical
+*    routine (dir == 0) or to a horizontal routine (SSE2 version when sse2_flag is nonzero).
+*/
+
+
+static void EdgeLoopLumaNormal(ColorPlane pl, VideoImage *image, const byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, 
+															 int dir, int edge, StorablePicture *p)
+{
+	if (dir == 0) // vertical edge: a single implementation
+		EdgeLoopLumaNormal_Vert(pl, image, Strength, MbQ, edge, p);
+	else if (sse2_flag) // horizontal edge, sse2_flag global is nonzero
+		EdgeLoopLumaNormal_Horiz_sse2(pl, image, Strength, MbQ, edge, p);
+	else // horizontal edge, plain version
+		EdgeLoopLumaNormal_Horiz(pl, image, Strength, MbQ, edge, p);
+}
+
+/*!
+*****************************************************************************************
+* \brief
+*    MBAFF deblocking of one horizontal edge (dir == 1) of MbQ in plane `pl` of `image`:
+*    filters, in place, the samples above/below row `edge`, column by column, per Strength[].
+*/
+static void EdgeLoopLumaMBAff_Horiz(ColorPlane pl, VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, StorablePicture *p)
+{
+	// dir == 1
+	imgpel **Img = image->img;
+	int      width = image->stride; // distance between vertically adjacent samples, in imgpel units
+	int      pel, ap = 0, aq = 0, Strng ;
+	int      incP, incQ;
+	int      C0, tc0, dif;
+	imgpel   L0, R0;
+	int      Alpha = 0, Beta = 0 ;
+	const byte* ClipTab = NULL;
+	int      small_gap;
+	int      indexA, indexB;
+	int      PelNum = pl? pelnum_cr[1][p->chroma_format_idc] : MB_BLOCK_SIZE; // columns to process: MB_BLOCK_SIZE for plane 0, table lookup otherwise
+
+	int      QP;
+	int      xQ, yQ;
+
+	PixelPos pixP, pixQ;
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	int      bitdepth_scale = pl? p_Vid->bitdepth_scale[IS_CHROMA] : p_Vid->bitdepth_scale[IS_LUMA]; // multiplies the table values for Alpha, Beta and C0
+	int      max_imgpel_value = p_Vid->max_pel_value_comp[pl]; // upper clip bound for the normal filter
+
+	int AlphaC0Offset = MbQ->DFAlphaC0Offset;
+	int BetaOffset = MbQ->DFBetaOffset;
+	byte fieldModeFilteringFlag;
+
+	Macroblock *MbP;
+	imgpel   *SrcPtrP, *SrcPtrQ;
+
+	for( pel = 0 ; pel < PelNum ; ++pel ) // one column of the edge per iteration
+	{
+		xQ = pel ;
+		yQ = (edge < 16 ? edge : 1) ; // an edge value >= 16 is mapped to row 1
+		p_Vid->getNeighbourLuma(MbQ, xQ, yQ - 1, &pixP);     // pixP: the sample one row above (xQ, yQ)
+
+		if (pixP.available || (MbQ->DFDisableIdc== 0)) // NOTE(review): with DFDisableIdc == 0 this proceeds even when pixP is not available - confirm callers never pass such an edge
+		{
+			if( (Strng = Strength[pel]) != 0) // strength 0 means "leave this column untouched"
+			{
+				p_Vid->getNeighbourLuma(MbQ, xQ, yQ, &pixQ); // pixQ: the first sample on MbQ's side of the edge
+
+				MbP = &(p_Vid->mb_data[pixP.mb_addr]); // macroblock holding the P-side sample
+				fieldModeFilteringFlag = (byte) (MbQ->mb_field || MbP->mb_field);
+
+				incQ    = ((fieldModeFilteringFlag && !MbQ->mb_field) ? 2 * width : width); // step two rows on the side that is frame-coded while the other side is field-coded
+				incP    = ((fieldModeFilteringFlag && !MbP->mb_field) ? 2 * width : width);
+				SrcPtrQ = &(Img[pixQ.pos_y][pixQ.pos_x]);
+				SrcPtrP = &(Img[pixP.pos_y][pixP.pos_x]);
+
+				// Average QP of the two blocks (qpc[pl-1] for planes other than 0, qp for plane 0)
+				QP = pl? ((MbP->qpc[pl-1] + MbQ->qpc[pl-1] + 1) >> 1) : (MbP->qp + MbQ->qp + 1) >> 1;
+
+				indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+				indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+				Alpha   = ALPHA_TABLE[indexA] * bitdepth_scale;
+				Beta    = BETA_TABLE [indexB] * bitdepth_scale;
+				ClipTab = CLIP_TAB[indexA];
+
+
+				L0  = SrcPtrP[0] ; // sample nearest the edge on the P side
+				R0  = SrcPtrQ[0] ;      // sample nearest the edge on the Q side
+
+
+
+				if( abs( R0 - L0 ) < Alpha ) // filter only if the step across the edge is below Alpha
+				{          
+					imgpel L1  = SrcPtrP[-incP];
+					imgpel R1  = SrcPtrQ[ incQ];      
+					if ((abs( R0 - R1) < Beta )   && (abs(L0 - L1) < Beta )) // and each side's own step is below Beta
+					{
+						imgpel L2  = SrcPtrP[-incP*2];
+						imgpel R2  = SrcPtrQ[ incQ*2];
+						if(Strng == 4 )    // INTRA strong filtering
+						{
+							int RL0 = L0 + R0;
+							small_gap = (abs( R0 - L0 ) < ((Alpha >> 2) + 2));
+							aq  = ( abs( R0 - R2) < Beta ) & small_gap;               
+							ap  = ( abs( L0 - L2) < Beta ) & small_gap;
+
+							if (ap) // rewrite three samples on the P side
+							{
+								imgpel L3  = SrcPtrP[-incP*3];
+								SrcPtrP[-incP * 2] = (imgpel) ((((L3 + L2) << 1) + L2 + L1 + RL0 + 4) >> 3);
+								SrcPtrP[-incP    ] = (imgpel) (( L2 + L1 + L0 + R0 + 2) >> 2);
+								SrcPtrP[    0    ] = (imgpel) (( R1 + ((L1 + RL0) << 1) +  L2 + 4) >> 3);
+							}
+							else // otherwise only the sample nearest the edge
+							{
+								SrcPtrP[     0     ] = (imgpel) (((L1 << 1) + L0 + R1 + 2) >> 2) ;
+							}
+
+							if (aq) // rewrite three samples on the Q side
+							{
+								imgpel R3  = SrcPtrQ[ incQ*3];
+								SrcPtrQ[    0     ] = (imgpel) (( L1 + ((R1 + RL0) << 1) +  R2 + 4) >> 3);
+								SrcPtrQ[ incQ     ] = (imgpel) (( R2 + R0 + R1 + L0 + 2) >> 2);
+								SrcPtrQ[ incQ * 2 ] = (imgpel) ((((R3 + R2) << 1) + R2 + R1 + RL0 + 4) >> 3);
+							}
+							else // otherwise only the sample nearest the edge
+							{
+								SrcPtrQ[    0     ] = (imgpel) (((R1 << 1) + R0 + L1 + 2) >> 2);
+							}
+						}
+						else   // normal filtering
+						{              
+							int RL0 = (L0 + R0 + 1) >> 1;
+							aq  = (abs( R0 - R2) < Beta);
+							ap  = (abs( L0 - L2) < Beta);
+
+							C0  = ClipTab[ Strng ] * bitdepth_scale;
+							tc0  = (C0 + ap + aq) ; // clip range widens by one for each side that passes its Beta test
+							dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3) ;
+
+							if( ap ) // second P-side sample is adjusted by a value clipped to +/-C0
+								*(SrcPtrP - incP) += iClip3( -C0,  C0, ( L2 + RL0 - (L1 << 1)) >> 1 ) ;
+
+							*SrcPtrP  = (imgpel) iClip1 (max_imgpel_value, L0 + dif) ;
+							*SrcPtrQ  = (imgpel) iClip1 (max_imgpel_value, R0 - dif) ;
+
+							if( aq  ) // second Q-side sample is adjusted by a value clipped to +/-C0
+								*(SrcPtrQ + incQ) += iClip3( -C0,  C0, ( R2 + RL0 - (R1 << 1)) >> 1 ) ;
+						}            
+					}
+				}
+			}
+		}
+	}
+}
+
+static void EdgeLoopLumaMBAff_Vert(ColorPlane pl, VideoImage *image, const byte Strength[16], Macroblock *MbQ, int edge, StorablePicture *p)
+{
+	// dir == 0: MBAFF deblocking of one vertical edge of MbQ in plane pl; filters, in place, the samples left/right of column `edge`, row by row, per Strength[]
+	imgpel **Img = image->img;
+	int      width = image->stride; // never read in this function (neighbouring samples are at +/-1)
+	int      pel, ap = 0, aq = 0, Strng ;
+
+	int      C0, tc0, dif;
+	imgpel   L0, R0;
+	int      Alpha = 0, Beta = 0 ;
+	const byte* ClipTab = NULL;
+	int      small_gap;
+	int      indexA, indexB;
+	int      PelNum = pl? pelnum_cr[0][p->chroma_format_idc] : MB_BLOCK_SIZE; // rows to process: MB_BLOCK_SIZE for plane 0, table lookup otherwise
+
+	int      QP;
+	int      xQ, yQ;
+
+	PixelPos pixP, pixQ;
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	int      bitdepth_scale = pl? p_Vid->bitdepth_scale[IS_CHROMA] : p_Vid->bitdepth_scale[IS_LUMA]; // multiplies the table values for Alpha, Beta and C0
+	int      max_imgpel_value = p_Vid->max_pel_value_comp[pl]; // upper clip bound for the normal filter
+
+	int AlphaC0Offset = MbQ->DFAlphaC0Offset;
+	int BetaOffset = MbQ->DFBetaOffset;
+	byte fieldModeFilteringFlag;
+
+	Macroblock *MbP;
+	imgpel   *SrcPtrP, *SrcPtrQ;
+
+	for( pel = 0 ; pel < PelNum ; ++pel ) // one row of the edge per iteration
+	{
+		xQ = edge;
+		yQ = pel;
+		p_Vid->getNeighbourXPLuma(MbQ, xQ - 1, yQ, &pixP);     // pixP: the sample one column left of (xQ, yQ); NOTE(review): the XP variant's exact semantics are not visible here
+
+		if (pixP.available || (MbQ->DFDisableIdc== 0)) // NOTE(review): with DFDisableIdc == 0 this proceeds even when pixP is not available - confirm callers never pass such an edge
+		{
+			if( (Strng = Strength[pel]) != 0) // strength 0 means "leave this row untouched"
+			{
+				p_Vid->getNeighbourLuma(MbQ, xQ, yQ, &pixQ); // pixQ: the first sample on MbQ's side of the edge
+
+				MbP = &(p_Vid->mb_data[pixP.mb_addr]); // macroblock holding the P-side sample
+				fieldModeFilteringFlag = (byte) (MbQ->mb_field || MbP->mb_field); // assigned but never read afterwards in this function
+
+				SrcPtrQ = &(Img[pixQ.pos_y][pixQ.pos_x]);
+				SrcPtrP = &(Img[pixP.pos_y][pixP.pos_x]);
+
+				// Average QP of the two blocks (qpc[pl-1] for planes other than 0, qp for plane 0)
+				QP = pl? ((MbP->qpc[pl-1] + MbQ->qpc[pl-1] + 1) >> 1) : (MbP->qp + MbQ->qp + 1) >> 1;
+
+				indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+				indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+				Alpha   = ALPHA_TABLE[indexA] * bitdepth_scale;
+				Beta    = BETA_TABLE [indexB] * bitdepth_scale;
+				ClipTab = CLIP_TAB[indexA];
+
+
+				L0  = SrcPtrP[0] ; // sample nearest the edge on the P side
+				R0  = SrcPtrQ[0] ;      // sample nearest the edge on the Q side
+
+
+
+				if( abs( R0 - L0 ) < Alpha ) // filter only if the step across the edge is below Alpha
+				{          
+					imgpel L1  = SrcPtrP[-1];
+					imgpel R1  = SrcPtrQ[ 1];      
+					if ((abs( R0 - R1) < Beta )   && (abs(L0 - L1) < Beta )) // and each side's own step is below Beta
+					{
+						imgpel L2  = SrcPtrP[-2];
+						imgpel R2  = SrcPtrQ[ 2];
+						if(Strng == 4 )    // INTRA strong filtering
+						{
+							int RL0 = L0 + R0;
+							small_gap = (abs( R0 - L0 ) < ((Alpha >> 2) + 2));
+							aq  = ( abs( R0 - R2) < Beta ) & small_gap;               
+							ap  = ( abs( L0 - L2) < Beta ) & small_gap;
+
+							if (ap) // rewrite three samples on the P side
+							{
+								imgpel L3  = SrcPtrP[-3];
+								SrcPtrP[-2] = (imgpel) ((((L3 + L2) << 1) + L2 + L1 + RL0 + 4) >> 3);
+								SrcPtrP[-1    ] = (imgpel) (( L2 + L1 + L0 + R0 + 2) >> 2);
+								SrcPtrP[    0    ] = (imgpel) (( R1 + ((L1 + RL0) << 1) +  L2 + 4) >> 3);
+							}
+							else // otherwise only the sample nearest the edge
+							{
+								SrcPtrP[     0     ] = (imgpel) (((L1 << 1) + L0 + R1 + 2) >> 2) ;
+							}
+
+							if (aq) // rewrite three samples on the Q side
+							{
+								imgpel R3  = SrcPtrQ[ 3];
+								SrcPtrQ[    0     ] = (imgpel) (( L1 + ((R1 + RL0) << 1) +  R2 + 4) >> 3);
+								SrcPtrQ[ 1     ] = (imgpel) (( R2 + R0 + R1 + L0 + 2) >> 2);
+								SrcPtrQ[  2 ] = (imgpel) ((((R3 + R2) << 1) + R2 + R1 + RL0 + 4) >> 3);
+							}
+							else // otherwise only the sample nearest the edge
+							{
+								SrcPtrQ[    0     ] = (imgpel) (((R1 << 1) + R0 + L1 + 2) >> 2);
+							}
+						}
+						else   // normal filtering
+						{              
+							int RL0 = (L0 + R0 + 1) >> 1;
+							aq  = (abs( R0 - R2) < Beta);
+							ap  = (abs( L0 - L2) < Beta);
+
+							C0  = ClipTab[ Strng ] * bitdepth_scale;
+							tc0  = (C0 + ap + aq) ; // clip range widens by one for each side that passes its Beta test
+							dif = iClip3( -tc0, tc0, (((R0 - L0) << 2) + (L1 - R1) + 4) >> 3) ;
+
+							if( ap ) // second P-side sample is adjusted by a value clipped to +/-C0
+								*(SrcPtrP - 1) += iClip3( -C0,  C0, ( L2 + RL0 - (L1 << 1)) >> 1 ) ;
+
+							*SrcPtrP  = (imgpel) iClip1 (max_imgpel_value, L0 + dif) ;
+							*SrcPtrQ  = (imgpel) iClip1 (max_imgpel_value, R0 - dif) ;
+
+							if( aq  ) // second Q-side sample is adjusted by a value clipped to +/-C0
+								*(SrcPtrQ + 1) += iClip3( -C0,  C0, ( R2 + RL0 - (R1 << 1)) >> 1 ) ;
+						}            
+					}
+				}
+			}
+		}
+	}
+}
+
+static void EdgeLoopLumaMBAff(ColorPlane pl, VideoImage *image, const byte Strength[16], Macroblock *MbQ, int dir, int edge, StorablePicture *p) // MBAFF edge filter: forwards to the per-direction routine
+{
+	if (dir == 0) // dir == 0: vertical edge
+		EdgeLoopLumaMBAff_Vert(pl, image, Strength, MbQ, edge, p);
+	else // any other dir value: horizontal edge
+		EdgeLoopLumaMBAff_Horiz(pl, image, Strength, MbQ, edge, p);
+}
+
+/*!
+*****************************************************************************************
+* \brief
+*    Filters a chroma block edge for Frame or Field coded pictures by forwarding to the
+*    vertical-edge routine when dir == 0 and to the horizontal-edge routine otherwise.
+*/
+
+
+static void EdgeLoopChromaNormal(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int dir, int edge, int uv, StorablePicture *p)
+{ 
+	if (dir == 0) // dir == 0: vertical edge
+		EdgeLoopChromaNormal_Vert(image, Strength, MbQ, edge, uv, p);
+	else // any other dir value: horizontal edge
+		EdgeLoopChromaNormal_Horiz(image, Strength, MbQ, edge, uv, p);
+
+}
+/*!
+*****************************************************************************************
+* \brief
+*    Filters, in place, one chroma block edge of MbQ for MBAFF types. dir selects the edge
+*    orientation (0 = vertical, else horizontal); uv indexes qpc[] and selects the component.
+*/
+static void EdgeLoopChromaMBAff(VideoImage *image, const byte Strength[16], Macroblock *MbQ, int dir, int edge, int uv, StorablePicture *p)
+{
+	imgpel** Img = image->img;
+
+	int      pel, Strng ;
+	int      incP, incQ;
+	int      C0, tc0, dif;
+	imgpel   L0, R0;
+	int      Alpha = 0, Beta = 0;
+	const byte* ClipTab = NULL;
+	int      indexA, indexB;
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	int      PelNum = pelnum_cr[dir][p->chroma_format_idc]; // number of samples along the edge, by direction and chroma format
+	int      StrengthIdx;
+	int      QP;
+	int      xQ, yQ;
+	PixelPos pixP, pixQ;
+	int      dir_m1 = 1 - dir; // 1 for a vertical edge (step left to reach P), 0 for a horizontal one
+	int      bitdepth_scale = p_Vid->bitdepth_scale[IS_CHROMA]; // multiplies the table values for Alpha, Beta and C0
+	int      max_imgpel_value = p_Vid->max_pel_value_comp[uv + 1]; // upper clip bound for the normal filter
+
+	int      AlphaC0Offset = MbQ->DFAlphaC0Offset;
+	int      BetaOffset    = MbQ->DFBetaOffset;
+	byte fieldModeFilteringFlag;
+	Macroblock *MbP;
+	imgpel   *SrcPtrP, *SrcPtrQ;
+	int      width = image->stride; // distance between vertically adjacent samples, in imgpel units
+
+	for( pel = 0 ; pel < PelNum ; ++pel ) // one sample position along the edge per iteration
+	{
+		xQ = dir ? pel : edge;
+		yQ = dir ? (edge < 16? edge : 1) : pel; // for horizontal edges an edge value >= 16 is mapped to row 1
+		p_Vid->getNeighbour(MbQ, xQ, yQ, p_Vid->mb_size[IS_CHROMA], &pixQ); // Q-side sample
+		p_Vid->getNeighbour(MbQ, xQ - (dir_m1), yQ - dir, p_Vid->mb_size[IS_CHROMA], &pixP);    // P-side sample: one step left (vertical edge) or up (horizontal edge)
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]);    // NOTE(review): MbP->mb_field is read on the next line before pixP.available is tested - confirm mb_addr is always a valid index
+		StrengthIdx = (PelNum == 8) ? ((MbQ->mb_field && !MbP->mb_field) ? pel << 1 :((pel >> 1) << 2) + (pel & 0x01)) : pel; // with 8 samples, spread pel over the 16 Strength[] entries; otherwise use pel directly
+
+		if (pixP.available || (MbQ->DFDisableIdc == 0))
+		{
+			if( (Strng = Strength[StrengthIdx]) != 0) // strength 0 means "leave this position untouched"
+			{
+				fieldModeFilteringFlag = (byte) (MbQ->mb_field || MbP->mb_field);
+				incQ = dir ? ((fieldModeFilteringFlag && !MbQ->mb_field) ? 2 * width : width) : 1; // vertical edge: step 1; horizontal: one row, or two on the frame-coded side of a field/frame pair
+				incP = dir ? ((fieldModeFilteringFlag && !MbP->mb_field) ? 2 * width : width) : 1;
+				SrcPtrQ = &(Img[pixQ.pos_y][pixQ.pos_x]);
+				SrcPtrP = &(Img[pixP.pos_y][pixP.pos_x]);
+
+				// Average chroma QP (qpc[uv]) of the two blocks
+				QP = (MbP->qpc[uv] + MbQ->qpc[uv] + 1) >> 1;
+
+				indexA = iClip3(0, MAX_QP, QP + AlphaC0Offset);
+				indexB = iClip3(0, MAX_QP, QP + BetaOffset);
+
+				Alpha   = ALPHA_TABLE[indexA] * bitdepth_scale;
+				Beta    = BETA_TABLE [indexB] * bitdepth_scale;
+				ClipTab = CLIP_TAB[indexA];
+
+
+				L0  = SrcPtrP[0] ; // sample nearest the edge on the P side
+				R0  = SrcPtrQ[0] ;      // sample nearest the edge on the Q side
+
+
+				if( abs( R0 - L0 ) < Alpha ) // filter only if the step across the edge is below Alpha
+				{          
+					imgpel L1  = SrcPtrP[-incP];
+					imgpel R1  = SrcPtrQ[ incQ];      
+					// both side steps must be below Beta (an earlier form, (a - Beta) & (b - Beta) < 0, tested the sign bits instead)
+					if( ((abs( R0 - R1) - Beta < 0)  && (abs(L0 - L1) - Beta < 0 ))  )
+					{
+						if( Strng == 4 )    // INTRA strong filtering: only the two samples nearest the edge change
+						{
+							SrcPtrQ[0] = (imgpel) ( ((R1 << 1) + R0 + L1 + 2) >> 2 );
+							SrcPtrP[0] = (imgpel) ( ((L1 << 1) + L0 + R1 + 2) >> 2 );
+						}
+						else // normal filtering
+						{
+							C0  = ClipTab[ Strng ] * bitdepth_scale;
+							tc0  = (C0 + 1); // the clip range is always C0 + 1 here
+							dif = iClip3( -tc0, tc0, ( ((R0 - L0) << 2) + (L1 - R1) + 4) >> 3 );
+
+							SrcPtrP[0] = (imgpel) iClip1 ( max_imgpel_value, L0 + dif );
+							SrcPtrQ[0] = (imgpel) iClip1 ( max_imgpel_value, R0 - dif );
+						}
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/Src/h264dec/ldecod/src/macroblock.asm b/Src/h264dec/ldecod/src/macroblock.asm
new file mode 100644
index 00000000..0f96d2b5
--- /dev/null
+++ b/Src/h264dec/ldecod/src/macroblock.asm
@@ -0,0 +1,189 @@
+.686
+.XMM
+.model FLAT
+
+
+PUBLIC	_inv_level_coefficients
+_TEXT	SEGMENT
+_blocks$ = 8						; size = 4; read below as [esp+4]: base address of the 16-bit coefficients
+_InvLevelScale$ = 12					; size = 4; read below as [esp+8]: base address of 16 dword scale factors
+_qp_per$ = 16						; size = 4; read below as [esp+12]: left-shift count
+_inv_level_coefficients PROC
+; In place, for 4 groups of 16 signed 16-bit coefficients: each nonzero c becomes (((c * scale[k]) << qp_per) + 8) >> 4, stored back as 16 bits
+	mov	eax, DWORD PTR _blocks$[esp-4]		; eax = coefficient pointer
+	mov	ecx, DWORD PTR _qp_per$[esp-4]		; cl supplies the shift count for every shl below
+	mov	edx, DWORD PTR _InvLevelScale$[esp-4]	; edx = scale table; never advanced, so all 4 groups use the same 16 factors
+	push	esi					; esi and edi are saved here and restored before ret
+	push	edi
+	mov	edi, 4					; loop counter: 4 groups of 16 coefficients
+$LL10@inv_level_:
+
+; C source this loop corresponds to (listing line numbers 3870-3877); the j and i loops are fully unrolled below:
+; 3871 : 		h264_short_block_row_t *block = blocks[b];
+; 3872 : 		for (j = 0; j < 4; ++j)
+; 3873 : 		{
+; 3874 : 				if (block[j][0]) block[j][0]= rshift_rnd_sf((block[j][0] * InvLevelScale[j][0]) << qp_per, 4);
+
+	movsx esi, WORD PTR [eax+4-4]			; coefficient 0 (byte offset 0), sign-extended
+	test esi, esi
+	je	SHORT $LN4@inv_level_			; zero coefficients are left untouched
+	imul	esi, DWORD PTR [edx]			; * scale[0]
+	shl	esi, cl					; << qp_per
+	add	esi, 8					; rounding term for the shift by 4
+	sar	esi, 4					; arithmetic >> 4
+	mov	WORD PTR [eax+4-4], si			; store the low 16 bits back
+$LN4@inv_level_:
+
+; 3875 : 				if (block[j][1]) block[j][1]= rshift_rnd_sf((block[j][1] * InvLevelScale[j][1]) << qp_per, 4);
+
+	movsx esi, WORD PTR [eax+4-2]			; coefficient 1, scale[1]
+	test esi, esi
+	je	SHORT $LN3@inv_level_
+	imul	esi, DWORD PTR [edx+4]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4-2], si
+$LN3@inv_level_:
+
+; 3876 : 				if (block[j][2]) block[j][2]= rshift_rnd_sf((block[j][2] * InvLevelScale[j][2]) << qp_per, 4);
+
+	movsx esi, WORD PTR [eax+4]			; coefficient 2, scale[2]
+	test esi, esi
+	je	SHORT $LN2@inv_level_
+	imul	esi, DWORD PTR [edx+8]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4], si
+$LN2@inv_level_:
+
+; 3877 : 				if (block[j][3]) block[j][3]= rshift_rnd_sf((block[j][3] * InvLevelScale[j][3]) << qp_per, 4);
+
+	movsx esi, WORD PTR [eax+4+2]			; coefficient 3, scale[3]
+	test esi, esi
+	je	SHORT $LN6@inv_level_
+	imul	esi, DWORD PTR [edx+12]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+2], si
+$LN6@inv_level_:
+	movsx esi, WORD PTR [eax+4+4]			; coefficient 4, scale[4] (the same pattern continues for the remaining coefficients)
+	test esi, esi
+	je	SHORT $LN27@inv_level_
+	imul	esi, DWORD PTR [edx+16]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+4], si
+$LN27@inv_level_:
+	movsx esi, WORD PTR [eax+4+6]			; coefficient 5, scale[5]
+	test esi, esi
+	je	SHORT $LN28@inv_level_
+	imul	esi, DWORD PTR [edx+20]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+6], si
+$LN28@inv_level_:
+	movsx esi, WORD PTR [eax+4+8]			; coefficient 6, scale[6]
+	test esi, esi
+	je	SHORT $LN29@inv_level_
+	imul	esi, DWORD PTR [edx+24]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+8], si
+$LN29@inv_level_:
+	movsx esi, WORD PTR [eax+4+10]			; coefficient 7, scale[7]
+	test esi, esi
+	je	SHORT $LN30@inv_level_
+	imul	esi, DWORD PTR [edx+28]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+10], si
+$LN30@inv_level_:
+	movsx esi, WORD PTR [eax+4+12]			; coefficient 8, scale[8]
+	test esi, esi
+	je	SHORT $LN32@inv_level_
+	imul	esi, DWORD PTR [edx+32]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+12], si
+$LN32@inv_level_:
+	movsx esi, WORD PTR [eax+4+14]			; coefficient 9, scale[9]
+	test esi, esi
+	je	SHORT $LN33@inv_level_
+	imul	esi, DWORD PTR [edx+36]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+14], si
+$LN33@inv_level_:
+	movsx esi, WORD PTR [eax+4+16]			; coefficient 10, scale[10]
+	test esi, esi
+	je	SHORT $LN34@inv_level_
+	imul	esi, DWORD PTR [edx+40]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+16], si
+$LN34@inv_level_:
+	movsx esi, WORD PTR [eax+4+18]			; coefficient 11, scale[11]
+	test esi, esi
+	je	SHORT $LN35@inv_level_
+	imul	esi, DWORD PTR [edx+44]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+18], si
+$LN35@inv_level_:
+	movsx esi, WORD PTR [eax+4+20]			; coefficient 12, scale[12]
+	test esi, esi
+	je	SHORT $LN37@inv_level_
+	imul	esi, DWORD PTR [edx+48]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+20], si
+$LN37@inv_level_:
+	movsx esi, WORD PTR [eax+4+22]			; coefficient 13, scale[13]
+	test esi, esi
+	je	SHORT $LN38@inv_level_
+	imul	esi, DWORD PTR [edx+52]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+22], si
+$LN38@inv_level_:
+	movsx esi, WORD PTR [eax+4+24]			; coefficient 14, scale[14]
+	test esi, esi
+	je	SHORT $LN39@inv_level_
+	imul	esi, DWORD PTR [edx+56]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+24], si
+$LN39@inv_level_:
+	movsx esi, WORD PTR [eax+4+26]			; coefficient 15 (byte offset 30), scale[15]
+	test esi, esi
+	je	SHORT $LN9@inv_level_
+	imul	esi, DWORD PTR [edx+60]
+	shl	esi, cl
+	add	esi, 8
+	sar	esi, 4
+	mov	WORD PTR [eax+4+26], si
+$LN9@inv_level_:
+	add	eax, 32					; 00000020H: advance to the next group of 16 coefficients
+	sub	edi, 1
+	jne	$LL10@inv_level_			; repeat until all 4 groups are done
+	pop	edi
+	pop	esi
+
+	ret	0					; arguments are not popped here (left to the caller)
+_inv_level_coefficients ENDP
+
+END
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod/src/macroblock.c b/Src/h264dec/ldecod/src/macroblock.c
new file mode 100644
index 00000000..10d083b1
--- /dev/null
+++ b/Src/h264dec/ldecod/src/macroblock.c
@@ -0,0 +1,6475 @@
+
+/*!
+***********************************************************************
+* \file macroblock.c
+*
+* \brief
+*     Decode a Macroblock
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Inge Lille-Langøy               <inge.lille-langoy@telenor.com>
+*    - Rickard Sjoberg                 <rickard.sjoberg@era.ericsson.se>
+*    - Jani Lainema                    <jani.lainema@nokia.com>
+*    - Sebastian Purreiter             <sebastian.purreiter@mch.siemens.de>
+*    - Thomas Wedi                     <wedi@tnt.uni-hannover.de>
+*    - Detlev Marpe                    <marpe@hhi.de>
+*    - Gabi Blaettermann
+*    - Ye-Kui Wang                     <wyk@ieee.org>
+*    - Lowell Winger                   <lwinger@lsil.com>
+*    - Alexis Michael Tourapis         <alexismt@ieee.org>
+***********************************************************************
+*/
+
+#include "contributors.h"
+
+#include <math.h>
+
+#include "block.h"
+#include "global.h"
+#include "mbuffer.h"
+#include "elements.h"
+#include "errorconcealment.h"
+#include "macroblock.h"
+#include "fmo.h"
+#include "cabac.h"
+#include "vlc.h"
+#include "image.h"
+#include "mb_access.h"
+#include "biaridecod.h"
+#include "transform8x8.h"
+#include "transform.h"
+#include "mc_prediction.h"
+#include "quant.h"
+#include "intra4x4_pred.h"
+#include "intra8x8_pred.h"
+#include "intra16x16_pred.h"
+#include "mv_prediction.h"
+#include "optim.h"
+#include "mb_prediction.h"
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#if TRACE // tracing build: the macros below record syntax element names and bit counts
+#define TRACE_STRING(s) strncpy(currSE.tracestring, s, TRACESTRING_SIZE) // expects a variable named currSE with a tracestring member at the point of use
+#define TRACE_DECBITS(i) dectracebitcnt(1) // NOTE(review): the argument i is ignored, 1 is always passed
+#define TRACE_PRINTF(s) sprintf(type, "%s", s); // expects a buffer named type at the point of use; the expansion already ends with ';'
+#define TRACE_STRING_P(s) strncpy(currSE->tracestring, s, TRACESTRING_SIZE) // same as TRACE_STRING, for a currSE that is a pointer
+#else // non-tracing build: every trace macro expands to nothing
+#define TRACE_STRING(s)
+#define TRACE_DECBITS(i)
+#define TRACE_PRINTF(s) 
+#define TRACE_STRING_P(s)
+#endif
+
+//! prototype of the trace bit counter used by TRACE_DECBITS (not defined in this part of the file); no look-up tables follow here
+void dectracebitcnt(int count);
+
+static void read_motion_info_from_NAL_p_slice  (Macroblock *currMB); // installed by setup_slice_methods() for P and SP slices
+static void read_motion_info_from_NAL_b_slice  (Macroblock *currMB); // installed by setup_slice_methods() for B slices
+static void read_ipred_modes                   (Macroblock *currMB);
+static void read_CBP_and_coeffs_from_NAL_CABAC (Macroblock *currMB); // installed by setup_slice_methods() when the entropy coding mode is CABAC
+static void read_CBP_and_coeffs_from_NAL_CAVLC (Macroblock *currMB); // installed by setup_slice_methods() when the entropy coding mode is CAVLC
+static void read_IPCM_coeffs_from_NAL          (Slice *currSlice, struct datapartition *dP);
+static void read_one_macroblock_i_slice        (Macroblock *currMB); // installed for I and SI slices
+static void read_one_macroblock_p_slice        (Macroblock *currMB); // installed for P and SP slices
+static void read_one_macroblock_b_slice        (Macroblock *currMB); // installed for B slices
+static int  decode_one_component_i_slice       (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture); // installed for I and SI slices
+static int  decode_one_component_p_slice       (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture); // installed for P slices
+static int  decode_one_component_b_slice       (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture); // installed for B slices
+static int  decode_one_component_sp_slice      (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, StorablePicture *dec_picture); // installed for SP slices
+
+/* ORs `mask`, shifted left by `position` bits, into the 64-bit word at `x`.
+ * 32-bit x86 builds do the 64-bit shift/OR with MMX intrinsics, all other
+ * builds use plain 64-bit integer arithmetic. */
+static inline void or_bits(int64 *x, int mask, int position)
+{
+#ifdef _M_IX86
+	__m64 *target = (__m64 *)x;
+	__m64 current = *target;
+	__m64 shifted = _mm_slli_si64(_mm_cvtsi32_si64(mask), position);
+	*target = _mm_or_si64(current, shifted);
+#else
+	int64 shifted = ((int64) mask << position);
+	*x = *x | shifted;
+#endif
+}
+
+/*!
+************************************************************************
+* \brief
+*    Set context for reference frames
+*
+* \param btype
+*    block type value
+* \return
+*    1 when btype is 4 or larger, 0 otherwise
+************************************************************************
+*/
+static inline int BType2CtxRef (int btype)
+{
+	if (btype < 4)
+		return 0;
+	return 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Read one reference picture index through the UVLC reader
+*
+* \param currSE
+*    syntax element; value2 receives list, value1 supplies the result
+* \param dP
+*    data partition handed to readSyntaxElement_UVLC
+* \param list
+*    reference list number
+* \return
+*    currSE->value1 converted to char
+************************************************************************
+*/
+static char readRefPictureIdx_VLC(SyntaxElement *currSE, DataPartition *dP, int list)
+{
+	char ref_idx;
+#if TRACE
+	char label[20];
+	sprintf( label, "ref_idx_l%d", list);
+	strncpy(currSE->tracestring, label, TRACESTRING_SIZE);
+#endif
+	currSE->value2 = list;
+	readSyntaxElement_UVLC(currSE, dP);
+	ref_idx = (char) currSE->value1;
+	return ref_idx;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Read one reference picture index as a single fixed-length bit
+*
+* \param currSE
+*    syntax element; value1 receives 1 minus the bit that was read
+* \param dP
+*    data partition whose bitstream is read
+* \param list
+*    reference list number (only used for the trace string)
+* \return
+*    currSE->value1 converted to char
+************************************************************************
+*/
+static char readRefPictureIdx_FLC(SyntaxElement *currSE, DataPartition *dP, int list)
+{
+	char ref_idx;
+#if TRACE
+	char label[20];
+	sprintf( label, "ref_idx_l%d", list);
+	strncpy(currSE->tracestring, label, TRACESTRING_SIZE);
+#endif
+	// one bit is read; the stored value is its complement
+	currSE->value1 = 1 - readSyntaxElement_FLC(dP->bitstream, 1);
+	ref_idx = (char) currSE->value1;
+	return ref_idx;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Placeholder reader for reference picture indices: reads nothing
+*    and always reports index 0
+************************************************************************
+*/
+static char readRefPictureIdx_Null(SyntaxElement *currSE, DataPartition *dP, int list)
+{
+	// the arguments only exist to match the readRefPictureIdx callback signature
+	(void) currSE;
+	(void) dP;
+	(void) list;
+	return 0;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Select the reader stored in currMB->readRefPictureIdx
+*
+*    The Null reader is the default.  With more than one active
+*    reference index the syntax element mapping is set to linfo_ue and,
+*    if reference indices are present, the FLC reader (exactly two
+*    active indices) or the VLC reader (more than two) is installed.
+************************************************************************
+*/
+static void prepareListforRefIdx ( Macroblock *currMB, SyntaxElement *currSE, int num_ref_idx_active, int refidx_present)
+{
+	currMB->readRefPictureIdx = readRefPictureIdx_Null;
+
+	if (num_ref_idx_active <= 1)
+		return;
+
+	currSE->mapping = linfo_ue;
+
+	if (!refidx_present)
+		return;
+
+	currMB->readRefPictureIdx = (num_ref_idx_active == 2) ? readRefPictureIdx_FLC : readRefPictureIdx_VLC;
+}
+
+#if defined(_DEBUG) || defined(_M_X64)
+/* C version of set_chroma_qp, compiled for debug and x64 builds; other builds
+ * only get the prototype below.
+ * For both chroma components: clip (qp + chroma_qp_offset) to
+ * [-bitdepth_chroma_qp_scale, 51], map non-negative results through
+ * QP_SCALE_CR, and store the result plus bitdepth_chroma_qp_scale in
+ * qp_scaled[1] / qp_scaled[2]. */
+void set_chroma_qp(Macroblock* currMB)
+{
+	// TODO: benski> we could use MMX for this if we could find a formula for QP_SCALE_CR
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int plane;
+	for (plane = 0; plane < 2; plane++)
+	{
+		currMB->qpc[plane] = iClip3 ( -p_Vid->bitdepth_chroma_qp_scale, 51, currMB->qp + dec_picture->chroma_qp_offset[plane] );
+		// negative values are kept as they are, everything else goes through the table
+		if (!(currMB->qpc[plane] < 0))
+			currMB->qpc[plane] = QP_SCALE_CR[currMB->qpc[plane]];
+		currMB->qp_scaled[plane + 1] = currMB->qpc[plane] + p_Vid->bitdepth_chroma_qp_scale;
+	}
+}
+#else
+void set_chroma_qp(Macroblock* currMB);
+#endif
+
+/*!
+************************************************************************
+* \brief
+*    Store a new luma QP in the macroblock and refresh everything
+*    derived from it: the scaled luma QP, the chroma QPs (through
+*    set_chroma_qp) and the lossless flag
+************************************************************************
+*/
+static inline void update_qp(Macroblock *currMB, int qp)
+{
+	VideoParameters *vid = currMB->p_Vid;
+
+	currMB->qp = qp;
+	currMB->qp_scaled[0] = qp + vid->bitdepth_luma_qp_scale;
+	set_chroma_qp(currMB);
+
+	// lossless only when the scaled luma QP is 0 and the stream enables it
+	if ((currMB->qp_scaled[0] == 0) && (vid->lossless_qpprime_flag == 1))
+		currMB->is_lossless = (Boolean) 1;
+	else
+		currMB->is_lossless = (Boolean) 0;
+}
+
+/* Reads mb_qp_delta with the UVLC reader (linfo_se mapping), range-checks it,
+ * and updates p_Vid->qp and the macroblock QPs.
+ * The incoming dP value is not used: the partition is looked up again from
+ * partMap[type]. */
+static void read_delta_quant_CAVLC(SyntaxElement *currSE, DataPartition *dP, Macroblock *currMB, const byte *partMap, int type)
+{
+	VideoParameters *vid = currMB->p_Vid;
+	Slice *slice = currMB->p_Slice;
+
+	dP = &(slice->partArr[partMap[type]]);
+
+	currSE->mapping = linfo_se;
+	readSyntaxElement_UVLC(currSE, dP);
+	currMB->delta_quant = (short) currSE->value1;
+
+	if (currMB->delta_quant < -(26 + vid->bitdepth_luma_qp_scale/2))
+		error ("mb_qp_delta is out of range", 500);
+	else if (currMB->delta_quant > (25 + vid->bitdepth_luma_qp_scale/2))
+		error ("mb_qp_delta is out of range", 500);
+
+	// wrap the running QP into its legal range
+	vid->qp = ((vid->qp + currMB->delta_quant + 52 + 2*vid->bitdepth_luma_qp_scale)%(52+vid->bitdepth_luma_qp_scale)) - vid->bitdepth_luma_qp_scale;
+	update_qp(currMB, vid->qp);
+}
+
+/* Reads mb_qp_delta with readDquant_CABAC, range-checks it, and updates
+ * p_Vid->qp and the macroblock QPs.
+ * currSE is not used, and the incoming dP value is replaced by the partition
+ * looked up from partMap[type]. */
+static inline void read_delta_quant_CABAC(SyntaxElement *currSE, DataPartition *dP, Macroblock *currMB, const byte *partMap, int type)
+{
+	VideoParameters *vid = currMB->p_Vid;
+	Slice *slice = currMB->p_Slice;
+
+	dP = &(slice->partArr[partMap[type]]);
+
+	currMB->delta_quant = readDquant_CABAC(slice, &dP->de_cabac);
+
+	if (currMB->delta_quant < -(26 + vid->bitdepth_luma_qp_scale/2))
+		error ("mb_qp_delta is out of range", 500);
+	else if (currMB->delta_quant > (25 + vid->bitdepth_luma_qp_scale/2))
+		error ("mb_qp_delta is out of range", 500);
+
+	// wrap the running QP into its legal range
+	vid->qp = ((vid->qp + currMB->delta_quant + 52 + 2*vid->bitdepth_luma_qp_scale)%(52+vid->bitdepth_luma_qp_scale)) - vid->bitdepth_luma_qp_scale;
+	update_qp(currMB, vid->qp);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Read the reference picture indices of one macroblock through the
+*    reader installed in currMB->readRefPictureIdx
+*
+*    The macroblock is walked in partitions of step_h0 x step_v0
+*    4x4 blocks.  For every partition that uses the given list (or is
+*    bi-predicted) and whose b8mode is non-zero, one index is read and
+*    written to all 4x4 blocks the partition covers.
+************************************************************************
+*/
+static void readMBRefPictureIdx(SyntaxElement *currSE, DataPartition *dP, Macroblock *currMB, PicMotion **motion, int list, int step_v0, int step_h0)
+{
+	int part_y, part_x, blk8, row, col;
+	char ref_idx;
+
+	for (part_y = 0; part_y < 4; part_y += step_v0)
+	{
+		currMB->subblock_y = part_y << 2;
+		for (part_x = 0; part_x < 4; part_x += step_h0)
+		{
+			currMB->subblock_x = part_x << 2;
+			blk8 = 2 * (part_y >> 1) + (part_x >> 1);
+
+			// skip partitions that carry no index for this list
+			if (currMB->b8pdir[blk8] != list && currMB->b8pdir[blk8] != BI_PRED)
+				continue;
+			if (currMB->b8mode[blk8] == 0)
+				continue;
+
+			ref_idx = currMB->readRefPictureIdx(currSE, dP, list);
+
+			for (row = part_y; row < part_y + step_v0; ++row)
+			{
+				for (col = part_x; col < part_x + step_h0; ++col)
+					motion[row][currMB->block_x + col].ref_idx = ref_idx;
+			}
+		}
+	}
+}
+
+/* CABAC reference index reader for a horizontal step of one 4x4 block:
+ * every column is visited on its own, rows advance by step_v0. */
+static void readMBRefPictureIdx_CABAC1(DataPartition *dP, Macroblock *currMB, PicMotion **motion, int list, int step_v0)
+{
+	int part_y, part_x, blk8, row;
+	char ref_idx;
+
+	for (part_y = 0; part_y < 4; part_y += step_v0)
+	{
+		currMB->subblock_y = part_y << 2;
+		for (part_x = 0; part_x < 4; ++part_x)
+		{
+			currMB->subblock_x = part_x << 2;
+			blk8 = 2 * (part_y >> 1) + (part_x >> 1);
+
+			// skip partitions that carry no index for this list
+			if (currMB->b8pdir[blk8] != list && currMB->b8pdir[blk8] != BI_PRED)
+				continue;
+			if (currMB->b8mode[blk8] == 0)
+				continue;
+
+			ref_idx = readRefFrame_CABAC(currMB, &dP->de_cabac, list, part_x<<2, part_y<<2);
+
+			for (row = part_y; row < part_y + step_v0; ++row)
+				motion[row][currMB->block_x + part_x].ref_idx = ref_idx;
+		}
+	}
+}
+
+/* CABAC reference index reader for a horizontal step of two 4x4 blocks:
+ * each row group is handled as a left half (columns 0-1, read with
+ * readRefFrame_CABAC0) and a right half (columns 2-3, read with
+ * readRefFrame_CABAC at x offset 8). */
+static void readMBRefPictureIdx_CABAC2(DataPartition *dP, Macroblock *currMB, PicMotion **motion, int list, int step_v0)
+{
+	int part_y, blk8, row;
+	char ref_idx;
+
+	for (part_y = 0; part_y < 4; part_y += step_v0)
+	{
+		currMB->subblock_y = part_y << 2;
+
+		// left half
+		currMB->subblock_x = 0;
+		blk8 = 2 * (part_y >> 1);
+
+		if (currMB->b8mode[blk8] != 0 && (currMB->b8pdir[blk8] == list || currMB->b8pdir[blk8] == BI_PRED))
+		{
+			ref_idx = readRefFrame_CABAC0(currMB, &dP->de_cabac, list, part_y<<2);
+
+			for (row = part_y; row < part_y + step_v0; ++row)
+			{
+				PicMotion *line = &motion[row][currMB->block_x];
+				line[0].ref_idx = ref_idx;
+				line[1].ref_idx = ref_idx;
+			}
+		}
+
+		// right half
+		currMB->subblock_x = 8;
+		blk8 = 2 * (part_y >> 1) + 1;
+
+		if (currMB->b8mode[blk8] != 0 && (currMB->b8pdir[blk8] == list || currMB->b8pdir[blk8] == BI_PRED))
+		{
+			ref_idx = readRefFrame_CABAC(currMB, &dP->de_cabac, list, 8, part_y<<2);
+
+			for (row = part_y; row < part_y + step_v0; ++row)
+			{
+				PicMotion *line = &motion[row][currMB->block_x];
+				line[2].ref_idx = ref_idx;
+				line[3].ref_idx = ref_idx;
+			}
+		}
+	}
+}
+
+
+/* CABAC reference index reader for a horizontal step of four 4x4 blocks:
+ * one index per row group, written to all four columns. */
+static void readMBRefPictureIdx_CABAC4(DataPartition *dP, Macroblock *currMB, PicMotion **motion, int list, int step_v0)
+{
+	int part_y, blk8, row, col;
+	char ref_idx;
+
+	for (part_y = 0; part_y < 4; part_y += step_v0)
+	{
+		currMB->subblock_y = part_y << 2;
+		currMB->subblock_x = 0;
+		blk8 = part_y & ~1;
+
+		// skip row groups that carry no index for this list
+		if (currMB->b8pdir[blk8] != list && currMB->b8pdir[blk8] != BI_PRED)
+			continue;
+		if (currMB->b8mode[blk8] == 0)
+			continue;
+
+		ref_idx = readRefFrame_CABAC0(currMB, &dP->de_cabac, list, part_y<<2);
+		for (row = part_y; row < part_y + step_v0; ++row)
+		{
+			for (col = 0; col < 4; ++col)
+				motion[row][currMB->block_x + col].ref_idx = ref_idx;
+		}
+	}
+}
+
+/* Dispatches to the CABAC reference index reader specialised for the given
+ * horizontal step (1, 2 or 4); any other step value does nothing. */
+static void readMBRefPictureIdx_CABAC(DataPartition *dP, Macroblock *currMB, PicMotion **motion, int list, int step_v0, int step_h0)
+{
+	if (step_h0 == 1)
+		readMBRefPictureIdx_CABAC1(dP, currMB, motion, list, step_v0);
+	else if (step_h0 == 2)
+		readMBRefPictureIdx_CABAC2(dP, currMB, motion, list, step_v0);
+	else if (step_h0 == 4)
+		readMBRefPictureIdx_CABAC4(dP, currMB, motion, list, step_v0);
+}
+
+/* Variant that reads nothing from the bitstream: every partition that uses
+ * the given list (or is bi-predicted) and has a non-zero b8mode gets
+ * reference index 0 in all 4x4 blocks it covers. */
+static void readMBRefPictureIdx_CABAC_NoReference(Macroblock *currMB, PicMotion **motion, int list, int step_v0, int step_h0)
+{
+	int part_y, part_x, blk8, row, col;
+
+	for (part_y = 0; part_y < 4; part_y += step_v0)
+	{
+		for (part_x = 0; part_x < 4; part_x += step_h0)
+		{
+			blk8 = 2 * (part_y >> 1) + (part_x >> 1);
+
+			if (currMB->b8pdir[blk8] != list && currMB->b8pdir[blk8] != BI_PRED)
+				continue;
+			if (currMB->b8mode[blk8] == 0)
+				continue;
+
+			for (row = part_y; row < part_y + step_v0; ++row)
+			{
+				for (col = part_x; col < part_x + step_h0; ++col)
+					motion[row][currMB->block_x + col].ref_idx=0;
+			}
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Function to read motion vector values (UVLC syntax elements)
+*
+*    The macroblock is walked in partitions of step_h0 x step_v0
+*    4x4 blocks.  For every partition that uses the given list (or is
+*    bi-predicted) and has a non-zero b8mode, each sub-partition gets a
+*    predictor from currMB->GetMVPredictor, its two mvd components are
+*    read with readSyntaxElement_UVLC, and the resulting mv and mvd are
+*    replicated over the 4x4 blocks the sub-partition covers.
+************************************************************************
+*/
+static void readMBMotionVectors(SyntaxElement *currSE, DataPartition *dP, Macroblock *currMB, int list, int step_h0, int step_v0)
+{
+	int i, j, k, i4, j4, ii, jj, kk, i0, j0;
+	short curr_mvd[2], curr_mv[2], pred_mv[2];
+	MotionVector  (*mvd)[4];
+	//MotionVector **mv;
+	int mv_mode, step_h, step_v;
+	char cur_ref_idx;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+	PixelPos block[4]; // neighbor blocks
+
+
+	for (j0=0; j0<4; j0+=step_v0)
+	{
+		for (i0=0; i0<4; i0+=step_h0)
+		{       
+			kk = 2 * (j0 >> 1) + (i0 >> 1); // index of the 8x8 block containing (i0, j0)
+			if ((currMB->b8pdir[kk]== list || currMB->b8pdir[kk]== BI_PRED) && (currMB->b8mode[kk] !=0))//has forward vector
+			{
+				PicMotion **list_motion = motion->motion[list];
+				cur_ref_idx = list_motion[currMB->block_y+j0][currMB->block_x+i0].ref_idx; // reference index already stored for this partition
+				mv_mode  = currMB->b8mode[kk];
+				step_h = BLOCK_STEP [mv_mode][0]; // sub-partition width in 4x4 blocks
+				step_v = BLOCK_STEP [mv_mode][1]; // sub-partition height in 4x4 blocks
+
+				for (j = j0; j < j0 + step_v0; j += step_v)
+				{
+					PicMotion **mv;
+					currMB->subblock_y = j << 2; // position used for context determination
+					j4 = currMB->block_y + j;
+					mv = &list_motion[j4]; // mv[jj] is picture row j4 + jj
+					mvd = &currMB->mvd [list][j]; // mvd[jj] is macroblock row j + jj
+					for (i = i0; i < i0 + step_h0; i += step_h)
+					{
+						currMB->subblock_x = i << 2; // position used for context determination
+						i4 = currMB->block_x + i;
+
+						get_neighbors(currMB, block, BLOCK_SIZE * i, BLOCK_SIZE * j, 4 * step_h);
+
+						// first make mv-prediction
+						currMB->GetMVPredictor (currMB, block, pred_mv, cur_ref_idx, list_motion, BLOCK_SIZE * i, BLOCK_SIZE * j, 4 * step_h, 4 * step_v);
+
+						for (k=0; k < 2; ++k)
+						{
+							currSE->value2   = (k << 1) + list; // identifies the component; only used for context determination
+							readSyntaxElement_UVLC(currSE, dP);
+							curr_mvd[k] = (short) currSE->value1;
+							curr_mv [k] = (short)(curr_mvd[k] + pred_mv[k]);  // compute motion vector 
+						}
+
+						// Init motion vectors
+						for(jj = 0; jj < step_v; ++jj)
+						{
+							for(ii = i4; ii < i4 + step_h; ++ii)
+							{
+								memcpy(&mv[jj][ii].mv, curr_mv,  sizeof(MotionVector)); // NOTE(review): assumes sizeof(MotionVector) equals sizeof(curr_mv) (two shorts) - confirm
+							}
+						}
+
+						// Init first line (mvd)
+						for(ii = i; ii < i + step_h; ++ii)
+						{
+							memcpy(mvd[0][ii], curr_mvd,  sizeof(MotionVector));
+						}              
+
+						// now copy all other lines
+						for(jj = 1; jj < step_v; ++jj)
+						{
+							memcpy(mvd[jj][i], mvd[0][i],  step_h * sizeof(MotionVector));
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+static void readMBMotionVectors_CABAC(DataPartition *dP, Macroblock *currMB, int list, int step_h0, int step_v0)
+{	/* Reads the motion vectors of one macroblock for the given list using
+	 * readMVD_CABAC.  The macroblock is walked in partitions of
+	 * step_h0 x step_v0 4x4 blocks; for every partition that uses the list
+	 * (or is bi-predicted) and has a non-zero b8mode, each sub-partition
+	 * gets a predictor from currMB->GetMVPredictor, its two mvd components
+	 * are read, and the resulting mv and mvd are replicated over the 4x4
+	 * blocks the sub-partition covers.
+	 */
+	int i, j, k, i4, j4, ii, jj, kk, i0, j0;
+	short curr_mvd[2], curr_mv[2], pred_mv[2];
+	MotionVector  (*mvd)[4];
+	//MotionVector **mv;
+	int mv_mode, step_h, step_v;
+	char cur_ref_idx;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+	PixelPos block[4]; // neighbor blocks
+
+	for (j0=0; j0<4; j0+=step_v0)
+	{
+		for (i0=0; i0<4; i0+=step_h0)
+		{       
+			kk = (j0 & ~1) + (i0 >> 1); // index of the 8x8 block containing (i0, j0)
+			if ((currMB->b8pdir[kk]== list || currMB->b8pdir[kk]== BI_PRED) && (currMB->b8mode[kk] !=0))//has forward vector
+			{
+				PicMotion **list_motion = motion->motion[list];
+				cur_ref_idx = list_motion[currMB->block_y+j0][currMB->block_x+i0].ref_idx; // reference index already stored for this partition
+				mv_mode  = currMB->b8mode[kk];
+				step_h = BLOCK_STEP [mv_mode][0]; // sub-partition width in 4x4 blocks
+				step_v = BLOCK_STEP [mv_mode][1]; // sub-partition height in 4x4 blocks
+
+				for (j = j0; j < j0 + step_v0; j += step_v)
+				{
+					PicMotion **mv;
+					int block_j = j << 2;
+					currMB->subblock_y = block_j; // position used for context determination
+					j4 = currMB->block_y + j;
+					mv = &list_motion[j4]; // mv[jj] is picture row j4 + jj
+					mvd = &currMB->mvd [list][j]; // mvd[jj] is macroblock row j + jj
+					for (i = i0; i < i0 + step_h0; i += step_h)
+					{
+						int block_i=i << 2;
+						currMB->subblock_x = block_i; // position used for context determination
+						i4 = currMB->block_x + i;
+
+						get_neighbors(currMB, block, block_i, block_j, 4 * step_h);
+
+						// first make mv-prediction
+						currMB->GetMVPredictor (currMB, block, pred_mv, cur_ref_idx, list_motion, block_i, block_j, 4 * step_h, 4 * step_v);
+
+						for (k=0; k < 2; ++k)
+						{
+							//currSE.value2   = (k << 1) + list; // identifies the component; only used for context determination
+							curr_mvd[k] = (short)readMVD_CABAC(currMB, &dP->de_cabac, k, list, block_i, block_j);
+							curr_mv [k] = (short)(curr_mvd[k] + pred_mv[k]);  // compute motion vector 
+						}
+
+						// Init motion vectors
+						for(jj = 0; jj < step_v; ++jj)
+						{
+							for(ii = i4; ii < i4 + step_h; ++ii)
+							{
+								*(int32_t *)(&mv[jj][ii].mv) = *(int32_t *)curr_mv; // both shorts copied with one 32-bit store; NOTE(review): assumes the mv member is 4 bytes - confirm
+							}
+						}
+
+						// Init first line (mvd)
+						for(ii = i; ii < i + step_h; ++ii)
+						{
+							*(int32_t *)(mvd[0][ii]) = *(int32_t *)curr_mvd; // same 32-bit copy for the mvd pair
+						}              
+
+						// now copy all other lines
+						for(jj = 1; jj < step_v; ++jj)
+						{
+							memcpy_amd(mvd[jj][i], mvd[0][i],  step_h * sizeof(MotionVector));
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    initializes the current macroblock
+*
+*    Selects the Macroblock entry for p_Vid->current_mb_nr, returns it
+*    through *currMB, derives its block/pixel coordinates, records the
+*    slice number, selects the MV predictor and CBP functions, and
+*    resets the per-macroblock syntax element state.
+************************************************************************
+*/
+void start_macroblock(Slice *currSlice, Macroblock **currMB)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int mb_nr = p_Vid->current_mb_nr;
+	Macroblock *mb = &p_Vid->mb_data[mb_nr];   // initialization code deleted, see below, StW  
+	*currMB = mb;
+
+	mb->p_Vid   = p_Vid;
+	mb->p_Slice = currSlice;
+	mb->mbAddrX = mb_nr;
+
+	//assert (mb_nr < (int) p_Vid->PicSizeInMbs);
+
+	/* Update coordinates of the current macroblock */
+	if (currSlice->mb_aff_frame_flag)
+	{
+		mb->mb_x =    (mb_nr) % ((2*p_Vid->width) / MB_BLOCK_SIZE);
+		mb->mb_y = 2*((mb_nr) / ((2*p_Vid->width) / MB_BLOCK_SIZE));
+
+		mb->mb_y += (mb->mb_x & 0x01); // odd mb_x selects the second row of the pair
+		mb->mb_x >>= 1;
+	}
+	else
+	{
+		mb->mb_x = p_Vid->PicPos[mb_nr][0];
+		mb->mb_y = p_Vid->PicPos[mb_nr][1];
+	}
+
+	/* Define vertical positions */
+	mb->block_y = mb->mb_y * BLOCK_SIZE;      /* luma block position */
+	mb->block_y_aff = mb->block_y;
+	mb->pix_y   = mb->mb_y * MB_BLOCK_SIZE;   /* luma macroblock position */
+	mb->pix_c_y = mb->mb_y * p_Vid->mb_cr_size_y; /* chroma macroblock position */
+
+	/* Define horizontal positions */
+	mb->block_x = mb->mb_x * BLOCK_SIZE;      /* luma block position */
+	mb->pix_x   = mb->mb_x * MB_BLOCK_SIZE;   /* luma pixel position */
+	mb->pix_c_x = mb->mb_x * p_Vid->mb_cr_size_x; /* chroma pixel position */
+
+	// Save the slice number of this macroblock. When the macroblock below
+	// is coded it will use this to decide if prediction for above is possible
+	mb->slice_nr = (short) p_Vid->current_slice_nr;
+
+	if (p_Vid->current_slice_nr >= MAX_NUM_SLICES)
+	{
+		error ("Maximum number of supported slices exceeded. \nPlease recompile with increased value for MAX_NUM_SLICES", 200);
+	}
+
+	dec_picture->slice_id[mb->mb_y][mb->mb_x] = (short) p_Vid->current_slice_nr;
+	dec_picture->max_slice_id = (short) imax(p_Vid->current_slice_nr, dec_picture->max_slice_id);
+
+	CheckAvailabilityOfNeighbors(mb);
+
+	// Select appropriate MV predictor function
+	init_motion_vector_prediction(*currMB, currSlice->mb_aff_frame_flag);
+
+	set_read_and_store_CBP(currMB, currSlice->active_sps->chroma_format_idc); // note: receives the Macroblock ** itself, not *currMB
+
+	// Reset syntax element entries in MB struct
+	update_qp(*currMB, p_Vid->qp);
+	mb->mb_type         = 0;
+	mb->delta_quant     = 0;
+	mb->cbp             = 0;    
+	mb->c_ipred_mode    = DC_PRED_8; //GB
+
+	if (currSlice->slice_type != I_SLICE) // mvd is left untouched for I slices
+	{
+		if (currSlice->slice_type != B_SLICE)
+			memzero64(mb->mvd);//, BLOCK_MULTIPLE * BLOCK_MULTIPLE * 2 * sizeof(short));
+		else
+			memzero128(mb->mvd);//, 2 * BLOCK_MULTIPLE * BLOCK_MULTIPLE * 2 * sizeof(short));
+	}
+
+	memzero24(mb->cbp_blk);  
+	memzero24(mb->cbp_bits);
+	memzero24(mb->cbp_bits_8x8);
+
+	// initialize currSlice->mb_rres8
+	memset(currSlice->mb_rres8, 0, sizeof(currSlice->mb_rres8));
+
+	// store filtering parameters for this MB
+	mb->DFDisableIdc    = currSlice->DFDisableIdc;
+	mb->DFAlphaC0Offset = currSlice->DFAlphaC0Offset;
+	mb->DFBetaOffset    = currSlice->DFBetaOffset;
+
+}
+
+/*!
+************************************************************************
+* \brief
+*    Finish the current macroblock: count it, advance
+*    p_Vid->current_mb_nr to the next macroblock number and decide
+*    whether the slice has ended
+*
+* \return
+*    TRUE when the slice (or picture) is complete, FALSE when more
+*    macroblocks follow in this slice
+************************************************************************
+*/
+Boolean exit_macroblock(Slice *currSlice, int eos_bit)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+
+	//! The picture-size test below resembles the original code, which tested
+	//! p_Vid->current_mb_nr == p_Vid->PicSizeInMbs.  Both is, of course, nonsense
+	//! In an error prone environment, one can only be sure to have a new
+	//! picture by checking the tr of the next slice header!
+
+	// printf ("exit_macroblock: FmoGetLastMBOfPicture %d, p_Vid->current_mb_nr %d\n", FmoGetLastMBOfPicture(), p_Vid->current_mb_nr);
+	++(p_Vid->num_dec_mb);
+
+	// every macroblock of the picture has been decoded
+	if (p_Vid->num_dec_mb == p_Vid->PicSizeInMbs)
+		return TRUE;
+
+	// ask for last mb in the slice  CAVLC
+	p_Vid->current_mb_nr = FmoGetNextMBNr (p_Vid, p_Vid->current_mb_nr);
+
+	if (p_Vid->current_mb_nr == -1)     // End of Slice group, MUST be end of slice
+	{
+		assert (currSlice->nal_startcode_follows (currSlice, eos_bit) == TRUE);
+		return TRUE;
+	}
+
+	if (currSlice->nal_startcode_follows(currSlice, eos_bit) == FALSE)
+		return FALSE;
+
+	if (currSlice->slice_type == I_SLICE || currSlice->slice_type == SI_SLICE)
+		return TRUE;
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CABAC)
+		return TRUE;
+
+	return (p_Vid->cod_counter <= 0) ? TRUE : FALSE;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Interpret the mb mode for P-Frames
+*
+*    Translates the raw value in currMB->mb_type into the decoder's
+*    macroblock type and fills b8mode / b8pdir (plus cbp and i16mode
+*    for the intra 16x16 and IPCM cases).
+*
+*    Raw values handled: 0..3 are kept, 4 and 5 become P8x8, 6 becomes
+*    I4MB, 7..30 become I16MB and 31 becomes IPCM.  Any other value is
+*    reported through error(), because it would otherwise index ICBPTAB
+*    out of bounds.
+************************************************************************
+*/
+static void interpret_mb_mode_P(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+
+	static const int ICBPTAB[6] = {0,16,32,15,31,47};
+	int         mbmode = currMB->mb_type;
+
+#define ZERO_P8x8     (mbmode==5)
+#define MODE_IS_P8x8  (mbmode==4 || mbmode==5)
+#define MODE_IS_I4x4  (mbmode==6)
+#define I16OFFSET     (mbmode-7)
+#define MODE_IS_IPCM  (mbmode==31)
+
+	if (mbmode < 0 || mbmode > 31)
+	{
+		error ("mb_type is out of range", 500);
+		mbmode = 0; // only matters if error() returns: continue with mode 0, which needs no table lookup
+	}
+
+	if(mbmode <4)
+	{
+		currMB->mb_type = mbmode;
+		memset(&currMB->b8mode[0],mbmode,4 * sizeof(char));
+		memset(&currMB->b8pdir[0], 0, 4 * sizeof(char));
+	}
+	else if(MODE_IS_P8x8)
+	{
+		currMB->mb_type = P8x8;
+		p_Vid->allrefzero = ZERO_P8x8;
+	}
+	else if(MODE_IS_I4x4)
+	{
+		currMB->mb_type = I4MB;
+		memset(&currMB->b8mode[0],IBLOCK, 4 * sizeof(char));
+		memset(&currMB->b8pdir[0],    -1, 4 * sizeof(char));
+	}
+	else if(MODE_IS_IPCM)
+	{
+		currMB->mb_type = IPCM;
+		currMB->cbp = -1;
+		currMB->i16mode = 0;
+
+		memset(&currMB->b8mode[0], 0, 4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1, 4 * sizeof(char));
+	}
+	else
+	{
+		// mbmode is 7..30 here, so (I16OFFSET)>>2 is 0..5
+		currMB->mb_type = I16MB;
+		currMB->cbp = ICBPTAB[(I16OFFSET)>>2];
+		currMB->i16mode = (I16OFFSET) & 0x03;
+		memset(&currMB->b8mode[0], 0, 4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1, 4 * sizeof(char));
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Interpret the mb mode for I-Frames
+*
+*    Translates the raw value in currMB->mb_type into the decoder's
+*    macroblock type and fills b8mode / b8pdir (plus cbp and i16mode
+*    for the intra 16x16 and IPCM cases).
+*
+*    Raw values handled: 0 becomes I4MB, 1..24 become I16MB and 25
+*    becomes IPCM.  Any other value is reported through error(),
+*    because it would otherwise index ICBPTAB out of bounds.
+************************************************************************
+*/
+static void interpret_mb_mode_I(Macroblock *currMB)
+{
+	static const int ICBPTAB[6] = {0,16,32,15,31,47};
+	int         mbmode   = currMB->mb_type;
+
+	if (mbmode < 0 || mbmode > 25)
+	{
+		error ("mb_type is out of range", 500);
+		mbmode = 0; // only matters if error() returns: continue with mode 0, which needs no table lookup
+	}
+
+	if (mbmode==0)
+	{
+		currMB->mb_type = I4MB;
+		memset(&currMB->b8mode[0],IBLOCK,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+	}
+	else if(mbmode==25)
+	{
+		currMB->mb_type=IPCM;
+		currMB->cbp= -1;
+		currMB->i16mode = 0;
+
+		memset(&currMB->b8mode[0],0,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+	}
+	else
+	{
+		// mbmode is 1..24 here, so (mbmode-1)>>2 is 0..5
+		currMB->mb_type = I16MB;
+		currMB->cbp= ICBPTAB[(mbmode-1)>>2];
+		currMB->i16mode = (mbmode-1) & 0x03;
+		memset(&currMB->b8mode[0], 0, 4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1, 4 * sizeof(char));
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Interpret the mb mode for B-Frames
+*
+*    Translates the raw value in currMB->mb_type into the decoder's
+*    macroblock type and fills b8mode / b8pdir (plus cbp and i16mode
+*    for the intra 16x16 and IPCM cases).
+*
+*    Raw values handled: 0 direct, 1..3 16x16, 4..21 16x8 (even) or
+*    8x16 (odd), 22 P8x8, 23 I4MB, 24..47 I16MB, 48 IPCM.  Any other
+*    value is reported through error(), because it would otherwise
+*    index the offset2pdir tables out of bounds.
+************************************************************************
+*/
+static void interpret_mb_mode_B(Macroblock *currMB)
+{
+	static const int offset2pdir16x16[12]   = {0, 0, 1, 2, 0,0,0,0,0,0,0,0};
+	static const int offset2pdir16x8[22][2] = {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,1},{0,0},{0,1},{0,0},{1,0},
+	{0,0},{0,2},{0,0},{1,2},{0,0},{2,0},{0,0},{2,1},{0,0},{2,2},{0,0}};
+	static const int offset2pdir8x16[22][2] = {{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},{1,1},{0,0},{0,1},{0,0},
+	{1,0},{0,0},{0,2},{0,0},{1,2},{0,0},{2,0},{0,0},{2,1},{0,0},{2,2}};
+
+	static const int ICBPTAB[6] = {0,16,32,15,31,47};
+
+	int i, mbmode;
+	int mbtype  = currMB->mb_type;
+
+	if (mbtype < 0 || mbtype > 48)
+	{
+		error ("mb_type is out of range", 500);
+		mbtype = 0; // only matters if error() returns: continue as type 0, which needs no table lookup
+	}
+
+	//--- set mbtype, b8type, and b8pdir ---
+	if (mbtype==0)       // direct
+	{
+		mbmode=0;
+		memset(&currMB->b8mode[0],0,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],2,4 * sizeof(char));
+	}
+	else if (mbtype==23) // intra4x4
+	{
+		mbmode=I4MB;
+		memset(&currMB->b8mode[0],IBLOCK,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+	}
+	else if ((mbtype>23) && (mbtype<48) ) // intra16x16
+	{
+		mbmode=I16MB;
+		memset(&currMB->b8mode[0],0,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+
+		currMB->cbp     = ICBPTAB[(mbtype-24)>>2];
+		currMB->i16mode = (mbtype-24) & 0x03;
+	}
+	else if (mbtype==22) // 8x8(+split)
+	{
+		mbmode=P8x8;       // b8mode and pdir is transmitted in additional codewords
+	}
+	else if (mbtype<4)   // 16x16
+	{
+		mbmode=1;
+		memset(&currMB->b8mode[0], 1,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],offset2pdir16x16[mbtype],4 * sizeof(char));
+	}
+	else if(mbtype==48)
+	{
+		mbmode=IPCM;
+		memset(&currMB->b8mode[0], 0,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+
+		currMB->cbp= -1;
+		currMB->i16mode = 0;
+	}
+
+	else if ((mbtype&0x01)==0) // 16x8 (mbtype is an even value in 4..20 here)
+	{
+		mbmode=2;
+		memset(&currMB->b8mode[0], 2,4 * sizeof(char));
+		for(i=0;i<4;++i)
+		{
+			// upper two 8x8 blocks use entry [0], lower two use entry [1]
+			currMB->b8pdir[i] = (char) offset2pdir16x8 [mbtype][i>>1];
+		}
+	}
+	else // 8x16 (mbtype is an odd value in 5..21 here)
+	{
+		mbmode=3;
+		memset(&currMB->b8mode[0], 3,4 * sizeof(char));
+		for(i=0;i<4; ++i)
+		{
+			// left 8x8 blocks use entry [0], right ones use entry [1]
+			currMB->b8pdir[i] = (char) offset2pdir8x16 [mbtype][i&0x01];
+		}
+	}
+	currMB->mb_type = mbmode;
+}
+/*!
+************************************************************************
+* \brief
+*    Interpret the mb mode for SI-Frames
+*
+*    Translates the raw value in currMB->mb_type into the decoder's
+*    macroblock type and fills b8mode / b8pdir (plus cbp and i16mode
+*    for the intra 16x16 and IPCM cases).
+*
+*    Raw values handled: 0 becomes SI4MB (and marks the macroblock in
+*    p_Vid->siblock), 1 becomes I4MB, 2..25 become I16MB and 26 becomes
+*    IPCM.  Any other value is reported through error(), because it
+*    would otherwise index ICBPTAB out of bounds.
+************************************************************************
+*/
+static void interpret_mb_mode_SI(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	static const int ICBPTAB[6] = {0,16,32,15,31,47};
+	int         mbmode   = currMB->mb_type;
+
+	if (mbmode < 0 || mbmode > 26)
+	{
+		error ("mb_type is out of range", 500);
+		mbmode = 0; // only matters if error() returns: continue with mode 0, which needs no table lookup
+	}
+
+	if (mbmode==0)
+	{
+		currMB->mb_type = SI4MB;
+		memset(&currMB->b8mode[0],IBLOCK,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+		p_Vid->siblock[currMB->mb_y][currMB->mb_x]=1;
+	}
+	else if (mbmode==1)
+	{
+		currMB->mb_type = I4MB;
+		memset(&currMB->b8mode[0],IBLOCK,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+	}
+	else if(mbmode==26)
+	{
+		currMB->mb_type=IPCM;
+		currMB->cbp= -1;
+		currMB->i16mode = 0;
+		memset(&currMB->b8mode[0],0,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+	}
+
+	else
+	{
+		// mbmode is 2..25 here, so (mbmode-2)>>2 is 0..5
+		currMB->mb_type = I16MB;
+		currMB->cbp= ICBPTAB[(mbmode-2)>>2];
+		currMB->i16mode = (mbmode-2) & 0x03;
+		memset(&currMB->b8mode[0],0,4 * sizeof(char));
+		memset(&currMB->b8pdir[0],-1,4 * sizeof(char));
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Install the per-slice function pointers: mb mode interpretation,
+*    motion info reading, macroblock reading and component decoding
+*    (by slice type), the colocated computation, and the CBP and
+*    coefficient reader (by entropy coding mode)
+************************************************************************
+*/
+void setup_slice_methods(Slice *currSlice)
+{
+	switch (currSlice->slice_type)
+	{
+	case P_SLICE: 
+	case SP_SLICE:
+		// P and SP share everything except the component decoder
+		currSlice->interpret_mb_mode         = interpret_mb_mode_P;
+		currSlice->read_motion_info_from_NAL = read_motion_info_from_NAL_p_slice;
+		currSlice->read_one_macroblock       = read_one_macroblock_p_slice;
+		if (currSlice->slice_type == SP_SLICE)
+			currSlice->decode_one_component  = decode_one_component_sp_slice;
+		else
+			currSlice->decode_one_component  = decode_one_component_p_slice;
+		break;
+	case B_SLICE:
+		currSlice->interpret_mb_mode         = interpret_mb_mode_B;
+		currSlice->read_motion_info_from_NAL = read_motion_info_from_NAL_b_slice;
+		currSlice->read_one_macroblock       = read_one_macroblock_b_slice;
+		currSlice->decode_one_component      = decode_one_component_b_slice;
+		break;
+	case I_SLICE: 
+	case SI_SLICE: 
+		// I and SI share everything except the mode interpretation
+		if (currSlice->slice_type == SI_SLICE)
+			currSlice->interpret_mb_mode     = interpret_mb_mode_SI;
+		else
+			currSlice->interpret_mb_mode     = interpret_mb_mode_I;
+		currSlice->read_motion_info_from_NAL = NULL;
+		currSlice->read_one_macroblock       = read_one_macroblock_i_slice;
+		currSlice->decode_one_component      = decode_one_component_i_slice;
+		break;
+	default:
+		printf("Unsupported slice type\n");
+		break;
+	}
+
+	if( IS_INDEPENDENT(currSlice->p_Vid) )
+		currSlice->compute_colocated  = compute_colocated_JV;
+	else if (currSlice->active_sps->frame_mbs_only_flag)
+		currSlice->compute_colocated  = compute_colocated;
+	else
+		currSlice->compute_colocated  = compute_colocated_frames_mbs;
+
+	if (currSlice->p_Vid->active_pps->entropy_coding_mode_flag == CABAC)
+		currSlice->read_CBP_and_coeffs_from_NAL = read_CBP_and_coeffs_from_NAL_CABAC;
+	else if (currSlice->p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		currSlice->read_CBP_and_coeffs_from_NAL = read_CBP_and_coeffs_from_NAL_CAVLC;
+	else
+		printf("Unsupported entropy coding mode\n");
+
+}
+
+/* Writes the value 2 into four consecutive bytes of p_Vid->ipredmode on each of
+ * BLOCK_SIZE rows, starting at [block_y][block_x].  Each row is written with a
+ * single 32-bit store and rows are stepped by PicWidthInMbs 32-bit words.
+ * NOTE(review): this relies on the ipredmode rows being contiguous in memory
+ * with exactly that pitch - confirm against the allocation. */
+void macroblock_set_dc_pred(VideoParameters *p_Vid, int block_x, int block_y)
+{
+	const int32_t four_dc_modes = 0x02020202;
+	const int words_per_row = p_Vid->PicWidthInMbs;
+	int32_t *row = (int32_t *)&p_Vid->ipredmode[block_y][block_x];
+	int remaining = BLOCK_SIZE;
+
+	while (remaining > 0)
+	{
+		*row = four_dc_modes;
+		row += words_per_row;
+		--remaining;
+	}
+}
+/*!
+************************************************************************
+* \brief
+*    init macroblock I and P frames
+************************************************************************
+*/
+#ifdef _M_IX86
+/* x86 (32-bit) variant: resets the prediction modes and the LIST_0 / LIST_1
+ * motion entries covered by the macroblock.  Each entry gets ref_pic_id set
+ * to UNDEFINED_REFERENCE and one 64-bit store placed at its .mv member.
+ * NOTE(review): the 64-bit store relies on the member following .mv being
+ * the one that must become -1 (the portable variant sets ref_idx = -1) -
+ * confirm against the PicMotion layout.
+ * NOTE(review): no _mm_empty() is issued here after the __m64 stores;
+ * presumably the MMX state is cleared elsewhere - confirm. */
+static void init_macroblock(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int first_col = currMB->block_x;
+	const int first_row = currMB->block_y;
+	PicMotionParams *motion = &p_Vid->dec_picture->motion;
+	/* low 32 bits = 0, high 32 bits = -1 */
+	const __m64 reset_qword = _mm_setr_pi32(0, -1);
+	const int list_ids[2] = { LIST_0, LIST_1 };
+	int list, row, col;
+
+	macroblock_set_dc_pred(p_Vid, first_col, first_row);
+
+	// reset vectors and pred. modes, first for LIST_0 and then for LIST_1
+	for (list = 0; list < 2; list++)
+	{
+		PicMotion **list_rows = motion->motion[list_ids[list]];
+
+		for (row = 0; row < BLOCK_SIZE; row++)
+		{
+			PicMotion *entry = &list_rows[first_row + row][first_col];
+
+			/* four entries per row, as in the hand-unrolled form */
+			for (col = 0; col < 4; col++)
+			{
+				entry[col].ref_pic_id = UNDEFINED_REFERENCE;
+				*(__m64 *)&entry[col].mv = reset_qword;
+			}
+		}
+	}
+}
+
+
+#else
+/* Portable variant: resets the prediction modes and the LIST_0 / LIST_1
+ * motion entries covered by the macroblock.  Every entry gets
+ * ref_pic_id = UNDEFINED_REFERENCE, a zeroed motion vector and
+ * ref_idx = -1. */
+static void init_macroblock(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int first_col = currMB->block_x;
+	const int first_row = currMB->block_y;
+	PicMotionParams *motion = &p_Vid->dec_picture->motion;
+	const int list_ids[2] = { LIST_0, LIST_1 };
+	int list, i, j;
+
+	macroblock_set_dc_pred(p_Vid, first_col, first_row);
+
+	// reset vectors and pred. modes, first for LIST_0 and then for LIST_1
+	for (list = 0; list < 2; list++)
+	{
+		PicMotion **list_rows = motion->motion[list_ids[list]];
+
+		for (j = 0; j < BLOCK_SIZE; j++)
+		{
+			PicMotion *entry = &list_rows[first_row + j][first_col];
+
+			/* four entries per row, as in the hand-unrolled form */
+			for (i = 0; i < 4; i++)
+			{
+				entry[i].ref_pic_id = UNDEFINED_REFERENCE;
+				memset(entry[i].mv, 0, sizeof(MotionVector));
+				entry[i].ref_idx = -1;
+			}
+		}
+	}
+}
+
+
+#endif
+/*!
+************************************************************************
+* \brief
+*    Sets mode for 8x8 block
+*
+*    Translates a sub-macroblock type code into the b8mode / b8pdir
+*    entries of 8x8 block i. The B tables are used when the slice type
+*    is B_SLICE, the P tables otherwise.
+*
+* \param currMB
+*    macroblock whose b8mode[i] and b8pdir[i] are written
+* \param value
+*    table index; the callers in this file pass values decoded from the
+*    bitstream
+* \param i
+*    index of the 8x8 block
+*
+* \note
+*    A value outside the selected table (possible with a damaged stream)
+*    is replaced by index 0 instead of being used to read past the end
+*    of the table.
+************************************************************************
+*/
+void SetB8Mode (Macroblock* currMB, int value, int i)
+{
+	static const char p_v2b8 [ 5] = {4, 5, 6, 7, IBLOCK};
+	static const char p_v2pd [ 5] = {0, 0, 0, 0, -1};
+	static const char b_v2b8 [14] = {0, 4, 4, 4, 5, 6, 5, 6, 5, 6, 7, 7, 7, IBLOCK};
+	static const char b_v2pd [14] = {2, 0, 1, 2, 0, 0, 1, 1, 2, 2, 0, 1, 2, -1};
+
+	const Slice *currSlice = currMB->p_Slice;
+	const char *mode_table;
+	const char *pdir_table;
+	int table_len;
+
+	if (currSlice->slice_type==B_SLICE)
+	{
+		mode_table = b_v2b8;
+		pdir_table = b_v2pd;
+		table_len  = (int)(sizeof(b_v2b8) / sizeof(b_v2b8[0]));
+	}
+	else
+	{
+		mode_table = p_v2b8;
+		pdir_table = p_v2pd;
+		table_len  = (int)(sizeof(p_v2b8) / sizeof(p_v2b8[0]));
+	}
+
+	// value originates from the bitstream: never index outside the table
+	if (value < 0 || value >= table_len)
+		value = 0;
+
+	currMB->b8mode[i] = mode_table[value];
+	currMB->b8pdir[i] = pdir_table[value];
+}
+
+
+/* Per-macroblock coefficient reset.  Under CAVLC the nz_coeff entry of the
+ * current macroblock is cleared with memzero48(); nothing else is written.
+ * Debug builds additionally check that the coefficient planes are zero. */
+void reset_coeffs(Slice *currSlice)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+
+#ifdef _DEBUG
+	/* trap on every nonzero value among the first 256 coefficients of each
+	 * of the three planes */
+	{
+		int plane;
+		for (plane = 0; plane < 3; plane++)
+		{
+			const short *coef = &currSlice->cof[plane][0][0];
+			const short *const coef_end = coef + 256;
+
+			for (; coef != coef_end; coef++)
+			{
+				if (*coef != 0)
+					DebugBreak();
+			}
+		}
+	}
+#endif
+
+	// benski> a full clear of currSlice->cof
+	//         (memset(currSlice->cof, 0, sizeof(currSlice->cof)))
+	//         is not believed to be necessary; the debug check above is there to be sure
+
+	// only CAVLC needs the per-macroblock nz_coeff entry cleared
+	if (p_Vid->active_pps->entropy_coding_mode_flag != CAVLC)
+		return;
+
+	memzero48(p_Vid->nz_coeff[p_Vid->current_mb_nr]);
+}
+
+/* Infers currMB->mb_field from a neighbour: the left macroblock when it is
+ * available, otherwise the upper one, otherwise FALSE. */
+void field_flag_inference(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+
+	if (currMB->mb_avail_left)
+	{
+		currMB->mb_field = p_Vid->mb_data[currMB->mb_addr_left].mb_field;
+	}
+	else if (currMB->mb_avail_up)
+	{
+		// check top macroblock pair
+		currMB->mb_field = p_Vid->mb_data[currMB->mb_addr_up].mb_field;
+	}
+	else
+	{
+		currMB->mb_field = FALSE;
+	}
+}
+
+
+/* Returns 1 when neighbour nb is unavailable, or when its LIST_0 entry has
+ * ref_idx 0 and a zero motion vector after the frame/field adjustment of
+ * the vertical component and the reference index; returns 0 otherwise. */
+static int skip_mb_neighbor_zero_motion(Macroblock *currMB, PicMotion **list0, const PixelPos *nb)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	short *nb_mv;
+	int nb_mv_y;
+	int nb_ref_idx;
+
+	if (!nb->available)
+		return 1;
+
+	nb_mv      = list0[nb->pos_y][nb->pos_x].mv;
+	nb_mv_y    = nb_mv[1];
+	nb_ref_idx = list0[nb->pos_y][nb->pos_x].ref_idx;
+
+	if (currMB->mb_field)
+	{
+		// current MB is field, neighbour is frame
+		if (!p_Vid->mb_data[nb->mb_addr].mb_field)
+		{
+			nb_mv_y    /= 2;
+			nb_ref_idx *= 2;
+		}
+	}
+	else if (p_Vid->mb_data[nb->mb_addr].mb_field)
+	{
+		// current MB is frame, neighbour is field
+		nb_mv_y    *= 2;
+		nb_ref_idx >>= 1;
+	}
+
+	return (nb_ref_idx == 0 && nb_mv[0] == 0 && nb_mv_y == 0) ? 1 : 0;
+}
+
+/* Fills the LIST_0 motion of a skipped macroblock: ref_idx 0 everywhere,
+ * with a zero vector when the left or the upper neighbour qualifies (see
+ * the helper above) and the vector from GetMVPredictor otherwise.  Also
+ * clears cbp and resets the coefficients. */
+static void skip_macroblock(Macroblock *currMB)
+{
+	short pred_mv[2];
+	PixelPos mb[4];    // neighbor blocks
+	int   use_zero_mv;
+	int   row, col;
+	int   first_col;
+	int   list_offset;
+	int   img_block_y   = currMB->block_y;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	StorablePicture *dec_picture;
+	PicMotionParams *motion;
+
+	// reference list selection: 0 for frame MBs, 2 / 4 for field MBs with even / odd mbAddrX
+	if (currSlice->mb_aff_frame_flag && currMB->mb_field)
+		list_offset = (currMB->mbAddrX & 0x01) ? 4 : 2;
+	else
+		list_offset = 0;
+
+	dec_picture = p_Vid->dec_picture;
+	motion = &dec_picture->motion;
+
+	get_neighbors0016(currMB, mb);
+
+	// mb[0] is tested as the left neighbour, mb[1] as the upper one
+	use_zero_mv = skip_mb_neighbor_zero_motion(currMB, motion->motion[LIST_0], &mb[1])
+		|| skip_mb_neighbor_zero_motion(currMB, motion->motion[LIST_0], &mb[0]);
+
+	currMB->cbp = 0;
+	reset_coeffs(currSlice);
+
+	if (!use_zero_mv)
+		currMB->GetMVPredictor (currMB, mb, pred_mv, 0, motion->motion[LIST_0], 0, 0, MB_BLOCK_SIZE, MB_BLOCK_SIZE);
+
+	first_col = currMB->block_x;
+	for (row = img_block_y; row < img_block_y + BLOCK_SIZE; ++row)
+	{
+		PicMotion *entry = &motion->motion[LIST_0][row][first_col];
+
+		for (col = 0; col < BLOCK_SIZE; ++col, ++entry)
+		{
+			if (use_zero_mv)
+				memset(&entry->mv, 0, sizeof(MotionVector));
+			else
+				memcpy(&entry->mv, pred_mv, sizeof(MotionVector));
+
+			entry->ref_idx = 0;
+			entry->ref_pic_id = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset][0];
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Conceal the IPCM samples of a macroblock: every sample of
+*    currSlice->ipcm is set to the DC prediction value of its colour
+*    component. The callers in this file use it when data partitioning
+*    is active and dpB_NotPresent is set.
+*
+*    Luma is plane 0; the two chroma planes are 1 and 2, matching the
+*    planes written by read_IPCM_coeffs_from_NAL(). Chroma is skipped
+*    for YUV400 and for independently coded colour planes.
+*
+* \param currMB
+*    macroblock whose slice IPCM buffer is filled
+************************************************************************
+*/
+static void concealIPCMcoeffs(Macroblock *currMB)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int i, j, k;
+
+	for(i=0;i<MB_BLOCK_SIZE;++i)
+	{
+		for(j=0;j<MB_BLOCK_SIZE;++j)
+		{
+			currSlice->ipcm[0][i][j] = p_Vid->dc_pred_value_comp[0];
+		}
+	}
+
+	if ((dec_picture->chroma_format_idc != YUV400) && !IS_INDEPENDENT(p_Vid))
+	{
+		// chroma lives in planes 1 and 2; starting at 0 would rewrite luma
+		// and leave plane 2 untouched
+		// NOTE(review): assumes dc_pred_value_comp has an entry for plane 2 - confirm
+		for (k = 1; k <= 2; ++k)
+		{
+			for(i=0;i<p_Vid->mb_cr_size_y;++i)
+			{
+				for(j=0;j<p_Vid->mb_cr_size_x;++j)
+				{
+					currSlice->ipcm[k][i][j] = p_Vid->dc_pred_value_comp[k];
+				}
+			}
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get the syntax elements of one macroblock from the NAL.
+*
+*    Reads, in this order: the optional mb_field_decoding_flag, mb_type,
+*    the optional transform_size_8x8_flag, and then either the intra
+*    prediction modes plus CBP/coefficients or the IPCM samples.
+*
+* \param currMB
+*    macroblock being decoded; its p_Slice and p_Vid members supply the
+*    slice and decoder state
+************************************************************************
+*/
+static void read_one_macroblock_i_slice(Macroblock *currMB)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+
+	SyntaxElement currSE;
+	int mb_nr = currMB->mbAddrX; 
+
+	DataPartition *dP;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+
+	// default field flag: FALSE for even mb_nr, otherwise copied from macroblock mb_nr-1
+	currMB->mb_field = ((mb_nr&0x01) == 0)? FALSE : p_Vid->mb_data[mb_nr-1].mb_field;
+
+	update_qp(currMB, p_Vid->qp);
+
+	//  read MB mode *****************************************************************
+	dP = &(currSlice->partArr[partMap[SE_MBTYPE]]);
+
+	// currSE is only used on the CAVLC path (readSyntaxElement_UVLC below)
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		currSE.mapping = linfo_ue;
+
+	// read MB aff
+	// the flag is only read for even mb_nr; odd mb_nr keeps the value inherited above
+	if (currSlice->mb_aff_frame_flag && (mb_nr&0x01)==0)
+	{
+		TRACE_STRING("mb_field_decoding_flag");
+		if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		{
+			currMB->mb_field = readSyntaxElement_FLC(dP->bitstream, 1);
+		}
+		else
+		{
+			currMB->mb_field = readFieldModeInfo_CABAC(currMB, &dP->de_cabac);
+		}
+	}
+
+	if(p_Vid->active_pps->entropy_coding_mode_flag  == CABAC)
+	{
+		CheckAvailabilityOfNeighborsCABAC(currMB);
+
+		//  read MB type
+		currMB->mb_type = readMB_typeInfo_CABAC(currMB, &dP->de_cabac);
+	}
+	else 
+	{ // CAVLC
+		//  read MB type
+		readSyntaxElement_UVLC(&currSE, dP);
+		currMB->mb_type = currSE.value1;
+	}
+
+
+
+	currMB->ei_flag = 0;
+
+	// record the field flag in the picture's per-macroblock array
+	motion->mb_field[mb_nr] = (byte) currMB->mb_field;
+
+	// for field macroblocks in MBAFF the block row is halved (odd mb_nr: after subtracting 4)
+	currMB->block_y_aff = ((currSlice->mb_aff_frame_flag) && (currMB->mb_field)) ? (mb_nr&0x01) ? (currMB->block_y - 4)>>1 : currMB->block_y >> 1 : currMB->block_y;
+
+	p_Vid->siblock[currMB->mb_y][currMB->mb_x] = 0;
+
+	// slice-type specific handler installed on the slice
+	currSlice->interpret_mb_mode(currMB);
+
+	//init NoMbPartLessThan8x8Flag
+	currMB->NoMbPartLessThan8x8Flag = TRUE;
+
+	//============= Transform Size Flag for INTRA MBs =============
+	//-------------------------------------------------------------
+	//transform size flag for INTRA_4x4 and INTRA_8x8 modes
+	if (currMB->mb_type == I4MB && p_Vid->Transform8x8Mode)
+	{
+		dP = &(currSlice->partArr[partMap[SE_HEADER]]);
+		TRACE_STRING("transform_size_8x8_flag");
+
+		// read CAVLC transform_size_8x8_flag
+		if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		{
+			currMB->luma_transform_size_8x8_flag = readSyntaxElement_FLC(dP->bitstream, 1);
+		}
+		else
+		{
+			currMB->luma_transform_size_8x8_flag = readMB_transform_size_flag_CABAC(currMB, &dP->de_cabac);
+		}
+
+		// a set flag turns the macroblock into I8MB: all four 8x8 modes become I8MB, directions -1
+		if (currMB->luma_transform_size_8x8_flag)
+		{      
+			currMB->mb_type = I8MB;
+			memset(&currMB->b8mode, I8MB, 4 * sizeof(char));
+			memset(&currMB->b8pdir, -1, 4 * sizeof(char));
+		}
+	}
+	else
+	{
+		currMB->luma_transform_size_8x8_flag = FALSE;
+	}
+
+	//--- init macroblock data ---
+	init_macroblock(currMB);
+
+	if(currMB->mb_type != IPCM)
+	{
+		// intra prediction modes for a macroblock 4x4 **********************************************
+		read_ipred_modes(currMB);
+
+		// read CBP and Coeffs  ***************************************************************
+		currSlice->read_CBP_and_coeffs_from_NAL (currMB);
+	}
+	else
+	{
+		//read pcm_alignment_zero_bit and pcm_byte[i]
+
+		// here dP is assigned with the same dP as SE_MBTYPE, because IPCM syntax is in the
+		// same category as MBTYPE
+		// NOTE(review): the code below selects partMap[SE_LUM_DC_INTRA], not SE_MBTYPE;
+		// whether both map to the same partition depends on assignSE2partition - confirm
+		if ( currSlice->dp_mode && currSlice->dpB_NotPresent )
+		{
+			// samples cannot be read: fill them with DC values instead
+			concealIPCMcoeffs(currMB);
+		}
+		else
+		{
+			dP = &(currSlice->partArr[partMap[SE_LUM_DC_INTRA]]);
+			read_IPCM_coeffs_from_NAL(currSlice, dP);
+		}
+	}
+
+	return;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get the syntax elements of one macroblock from the NAL, including
+*    skip handling (mb_skip_flag under CABAC, mb_skip_run under CAVLC).
+*
+*    Order of parsing: skip information, optional mb_field_decoding_flag,
+*    mb_type, sub-macroblock types and motion data for P8x8, optional
+*    transform_size_8x8_flag, then either skip reconstruction, prediction
+*    modes / motion data / coefficients, or IPCM samples.
+*
+* \param currMB
+*    macroblock being decoded; its p_Slice and p_Vid members supply the
+*    slice and decoder state
+************************************************************************
+*/
+static void read_one_macroblock_p_slice(Macroblock *currMB)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+
+	int i;
+
+	SyntaxElement currSE;
+	int mb_nr = currMB->mbAddrX; 
+
+	DataPartition *dP;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	Macroblock *topMB = NULL;
+	int  prevMbSkipped = 0;
+	int  check_bottom, read_bottom, read_top;  
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+
+	// MBAFF: for odd mb_nr remember the preceding macroblock (mb_nr-1) and whether its mb_type is 0
+	if (currSlice->mb_aff_frame_flag)
+	{
+		if (mb_nr&0x01)
+		{
+			topMB= &p_Vid->mb_data[mb_nr-1];
+			prevMbSkipped = (topMB->mb_type == 0);
+		}
+		else
+			prevMbSkipped = 0;
+	}
+
+	// default field flag: FALSE for even mb_nr, otherwise copied from macroblock mb_nr-1
+	currMB->mb_field = ((mb_nr&0x01) == 0)? FALSE : p_Vid->mb_data[mb_nr-1].mb_field;
+
+	update_qp(currMB, p_Vid->qp);
+
+	//  read MB mode *****************************************************************
+	dP = &(currSlice->partArr[partMap[SE_MBTYPE]]);
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)   
+		currSE.mapping = linfo_ue;
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CABAC)
+	{
+		int skip;
+		// read MB skip_flag
+		if (currSlice->mb_aff_frame_flag && ((mb_nr&0x01) == 0||prevMbSkipped))
+			field_flag_inference(currMB);
+
+		CheckAvailabilityOfNeighborsCABAC(currMB);
+		TRACE_STRING("mb_skip_flag");
+		skip = readMB_skip_flagInfo_CABAC(currMB, &dP->de_cabac);
+
+		// mb_type is 0 for a skipped macroblock and 1 as a placeholder until mb_type is read below
+		currMB->mb_type   = !skip;
+		currMB->skip_flag = skip;
+
+		currMB->ei_flag = 0;
+
+		// read MB AFF
+		if (currSlice->mb_aff_frame_flag)
+		{
+			check_bottom=read_bottom=read_top=0;
+			if ((mb_nr&0x01)==0)
+			{
+				check_bottom =  currMB->skip_flag;
+				read_top = !check_bottom;
+			}
+			else
+			{
+				// topMB is non-NULL here: it was set above for odd mb_nr with mb_aff_frame_flag
+				read_bottom = (topMB->skip_flag && (!currMB->skip_flag));
+			}
+
+			if (read_bottom || read_top)
+			{
+				TRACE_STRING("mb_field_decoding_flag");
+				currMB->mb_field = readFieldModeInfo_CABAC(currMB, &dP->de_cabac);
+			}
+			if (check_bottom)
+				check_next_mb_and_get_field_mode_CABAC(currSlice, dP);
+
+			// availability is re-evaluated after the field flag handling above
+			CheckAvailabilityOfNeighborsCABAC(currMB);
+		}
+
+		// read MB type
+		if (currMB->mb_type != 0 )
+		{
+			TRACE_STRING("mb_type");
+			currMB->mb_type = readMB_typeInfo_CABAC(currMB, &dP->de_cabac);
+			currMB->ei_flag = 0;
+		}
+	}
+	// VLC Non-Intra
+	else
+	{
+		// cod_counter == -1: no skip run pending, so read a new mb_skip_run
+		if(p_Vid->cod_counter == -1)
+		{
+			TRACE_STRING("mb_skip_run");
+			readSyntaxElement_UVLC(&currSE, dP);
+			p_Vid->cod_counter = currSE.value1;
+		}
+		if (p_Vid->cod_counter==0)
+		{
+			// read MB aff
+			if ((currSlice->mb_aff_frame_flag) && (((mb_nr&0x01)==0) || ((mb_nr&0x01) && prevMbSkipped)))
+			{
+				TRACE_STRING("mb_field_decoding_flag");
+				currMB->mb_field = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+			}
+
+			// read MB type
+			TRACE_STRING("mb_type");
+			readSyntaxElement_UVLC(&currSE, dP);
+			// P/SP slices: shift the decoded value by one, since mb_type 0 denotes a skipped macroblock here
+			if(currSlice->slice_type == P_SLICE || currSlice->slice_type == SP_SLICE)
+				++(currSE.value1);
+			currMB->mb_type = currSE.value1;
+			currMB->ei_flag = 0;
+			// counter drops to -1, so the next macroblock reads a new mb_skip_run
+			p_Vid->cod_counter--;
+			currMB->skip_flag = 0;
+		}
+		else
+		{
+			// inside a skip run: consume one skipped macroblock
+			p_Vid->cod_counter--;
+			currMB->mb_type = 0;
+			currMB->ei_flag = 0;
+			currMB->skip_flag = 1;
+
+			// read field flag of bottom block
+			if(currSlice->mb_aff_frame_flag)
+			{
+				if(p_Vid->cod_counter == 0 && ((mb_nr&0x01) == 0))
+				{
+					TRACE_STRING("mb_field_decoding_flag (of coded bottom mb)");
+					currMB->mb_field = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+					// peek only: the bit offset is moved back so the flag is read again later
+					dP->bitstream->frame_bitoffset--;
+					TRACE_DECBITS(1);
+				}
+				else if (p_Vid->cod_counter > 0 && ((mb_nr & 0x01) == 0))
+				{
+					// check left macroblock pair first
+					if (mb_is_available(mb_nr - 2, currMB) && ((mb_nr % (p_Vid->PicWidthInMbs * 2))!=0))
+					{
+						currMB->mb_field = p_Vid->mb_data[mb_nr-2].mb_field;
+					}
+					else
+					{
+						// check top macroblock pair
+						if (mb_is_available(mb_nr - 2*p_Vid->PicWidthInMbs, currMB))
+						{
+							currMB->mb_field = p_Vid->mb_data[mb_nr-2*p_Vid->PicWidthInMbs].mb_field;
+						}
+						else
+							currMB->mb_field = FALSE;
+					}
+				}
+			}
+		}
+	}
+
+	// record the field flag in the picture's per-macroblock array
+	motion->mb_field[mb_nr] = (byte) currMB->mb_field;
+
+	// for field macroblocks in MBAFF the block row is halved (odd mb_nr: after subtracting 4)
+	currMB->block_y_aff = ((currSlice->mb_aff_frame_flag) && (currMB->mb_field)) ? (mb_nr&0x01) ? (currMB->block_y - 4)>>1 : currMB->block_y >> 1 : currMB->block_y;
+
+	p_Vid->siblock[currMB->mb_y][currMB->mb_x] = 0;
+
+	// slice-type specific handler installed on the slice
+	currSlice->interpret_mb_mode(currMB);
+
+	// field macroblocks in MBAFF: the active reference counts of the slice are doubled
+	// NOTE(review): no matching restore is visible in this function; presumably the
+	// caller undoes it after the macroblock - confirm
+	if(currSlice->mb_aff_frame_flag)
+	{
+		if(currMB->mb_field)
+		{
+			currSlice->num_ref_idx_l0_active <<=1;
+			currSlice->num_ref_idx_l1_active <<=1;
+		}
+	}
+
+	//init NoMbPartLessThan8x8Flag
+	currMB->NoMbPartLessThan8x8Flag = (IS_DIRECT(currMB) && !(p_Vid->active_sps->direct_8x8_inference_flag))? FALSE: TRUE;
+
+	//====== READ 8x8 SUB-PARTITION MODES (modes of 8x8 blocks) and Intra VBST block modes ======
+	if (currMB->mb_type == P8x8)
+	{
+		dP = &(currSlice->partArr[partMap[SE_MBTYPE]]);
+
+		// one sub_mb_type per 8x8 block, translated by SetB8Mode()
+		if (p_Vid->active_pps->entropy_coding_mode_flag ==CAVLC)
+		{
+			currSE.mapping = linfo_ue;
+			for (i = 0; i < 4; ++i)
+			{
+				TRACE_STRING("sub_mb_type");
+				readSyntaxElement_UVLC(&currSE, dP);
+				SetB8Mode (currMB, currSE.value1, i);
+
+				//set NoMbPartLessThan8x8Flag for P8x8 mode
+				currMB->NoMbPartLessThan8x8Flag &= (currMB->b8mode[i]==0 && p_Vid->active_sps->direct_8x8_inference_flag) ||
+					(currMB->b8mode[i]==4);
+			}
+		}
+		else
+		{
+			for (i = 0; i < 4; ++i)
+			{
+				int value = readB8_typeInfo_CABAC(currSlice, &dP->de_cabac);
+				SetB8Mode (currMB, value, i);
+
+				//set NoMbPartLessThan8x8Flag for P8x8 mode
+				currMB->NoMbPartLessThan8x8Flag &= (currMB->b8mode[i]==0 && p_Vid->active_sps->direct_8x8_inference_flag) ||
+					(currMB->b8mode[i]==4);
+			}
+		}
+
+		//--- init macroblock data ---
+		init_macroblock       (currMB);
+		currSlice->read_motion_info_from_NAL (currMB);
+	}
+
+	//============= Transform Size Flag for INTRA MBs =============
+	//-------------------------------------------------------------
+	//transform size flag for INTRA_4x4 and INTRA_8x8 modes
+	if (currMB->mb_type == I4MB && p_Vid->Transform8x8Mode)
+	{
+		dP = &(currSlice->partArr[partMap[SE_HEADER]]);
+		TRACE_STRING("transform_size_8x8_flag");
+
+		// read CAVLC transform_size_8x8_flag
+		if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		{
+			currMB->luma_transform_size_8x8_flag = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+		}
+		else
+		{
+			currMB->luma_transform_size_8x8_flag = readMB_transform_size_flag_CABAC(currMB,  &dP->de_cabac);
+		}
+
+		// a set flag turns the macroblock into I8MB: all four 8x8 modes become I8MB, directions -1
+		if (currMB->luma_transform_size_8x8_flag)
+		{      
+			currMB->mb_type = I8MB;
+			memset(&currMB->b8mode, I8MB, 4 * sizeof(char));
+			memset(&currMB->b8pdir, -1, 4 * sizeof(char));
+		}
+	}
+	else
+	{
+		currMB->luma_transform_size_8x8_flag = FALSE;
+	}
+
+	// with constrained intra prediction, non-intra macroblocks are marked 0 in intra_block
+	if(p_Vid->active_pps->constrained_intra_pred_flag)
+	{
+		if( !IS_INTRA(currMB) )
+		{
+			p_Vid->intra_block[mb_nr] = 0;
+		}
+	}
+
+	//--- init macroblock data ---
+	// (P8x8 macroblocks were already initialised before their motion data was read)
+	if (currMB->mb_type != P8x8)
+		init_macroblock(currMB);
+
+	if (IS_SKIP (currMB)) //keep last macroblock
+	{
+		skip_macroblock(currMB);
+	}
+	else if(currMB->mb_type != IPCM)
+	{
+		// intra prediction modes for a macroblock 4x4 **********************************************
+		if (IS_INTRA(currMB))
+			read_ipred_modes(currMB);
+
+		// read inter frame vector data *********************************************************
+		if (IS_INTERMV (currMB) && (currMB->mb_type != P8x8))
+		{
+			currSlice->read_motion_info_from_NAL (currMB);
+		}
+		// read CBP and Coeffs  ***************************************************************
+		currSlice->read_CBP_and_coeffs_from_NAL (currMB);
+	}
+	else
+	{
+		//read pcm_alignment_zero_bit and pcm_byte[i]
+
+		// here dP is assigned with the same dP as SE_MBTYPE, because IPCM syntax is in the
+		// same category as MBTYPE
+		// NOTE(review): the code below selects partMap[SE_LUM_DC_INTRA], not SE_MBTYPE;
+		// whether both map to the same partition depends on assignSE2partition - confirm
+		if ( currSlice->dp_mode && currSlice->dpB_NotPresent )
+		{
+			// samples cannot be read: fill them with DC values instead
+			concealIPCMcoeffs(currMB);
+		}
+		else
+		{
+			dP = &(currSlice->partArr[partMap[SE_LUM_DC_INTRA]]);
+			read_IPCM_coeffs_from_NAL(currSlice, dP);
+		}
+	}
+
+	return;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get the syntax elements of one macroblock from the NAL, including
+*    skip handling (mb_skip_flag under CABAC, mb_skip_run under CAVLC)
+*    and the direct-mode case.
+*
+*    Order of parsing: skip information, optional mb_field_decoding_flag,
+*    mb_type, sub-macroblock types and motion data for P8x8, optional
+*    transform_size_8x8_flag, then one of: direct handling, skip
+*    reconstruction, prediction modes / motion data / coefficients, or
+*    IPCM samples.
+*
+* \param currMB
+*    macroblock being decoded; its p_Slice and p_Vid members supply the
+*    slice and decoder state
+************************************************************************
+*/
+static void read_one_macroblock_b_slice(Macroblock *currMB)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int i;
+
+	SyntaxElement currSE;
+	int mb_nr = currMB->mbAddrX; 
+
+	DataPartition *dP;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	Macroblock *topMB = NULL;
+	int  prevMbSkipped = 0;
+	int  check_bottom, read_bottom, read_top;  
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+
+	// MBAFF: for odd mb_nr remember the preceding macroblock (mb_nr-1) and its skip_flag
+	if (currSlice->mb_aff_frame_flag)
+	{
+		if (mb_nr&0x01)
+		{
+			topMB= &p_Vid->mb_data[mb_nr-1];
+			prevMbSkipped = topMB->skip_flag;
+		}
+		else
+			prevMbSkipped = 0;
+	}
+
+	// default field flag: FALSE for even mb_nr, otherwise copied from macroblock mb_nr-1
+	currMB->mb_field = ((mb_nr&0x01) == 0)? FALSE : p_Vid->mb_data[mb_nr-1].mb_field;
+
+	update_qp(currMB, p_Vid->qp);
+
+	//  read MB mode *****************************************************************
+	dP = &(currSlice->partArr[partMap[SE_MBTYPE]]);
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)   
+		currSE.mapping = linfo_ue;
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CABAC)
+	{
+		// read MB skip_flag
+		int skip;
+		if (currSlice->mb_aff_frame_flag && ((mb_nr&0x01) == 0||prevMbSkipped))
+			field_flag_inference(currMB);
+
+		CheckAvailabilityOfNeighborsCABAC(currMB);
+		TRACE_STRING("mb_skip_flag");
+		skip = readMB_skip_flagInfo_CABAC(currMB, &dP->de_cabac);
+
+		// mb_type is 0 for a skipped macroblock and 1 as a placeholder until mb_type is read below
+		currMB->mb_type   = !skip;
+		currMB->skip_flag = skip;
+
+		currMB->cbp = !skip;
+
+		currMB->ei_flag = 0;
+
+		// a skipped macroblock sets cod_counter to 0, which enables the
+		// "IS_DIRECT && cod_counter >= 0" branch further down
+		if (skip)
+			p_Vid->cod_counter=0;
+
+		// read MB AFF
+		if (currSlice->mb_aff_frame_flag)
+		{
+			check_bottom=read_bottom=read_top=0;
+			if ((mb_nr&0x01)==0)
+			{
+				check_bottom =  currMB->skip_flag;
+				read_top = !check_bottom;
+			}
+			else
+			{
+				// topMB is non-NULL here: it was set above for odd mb_nr with mb_aff_frame_flag
+				read_bottom = (topMB->skip_flag && (!currMB->skip_flag));
+			}
+
+			if (read_bottom || read_top)
+			{
+				TRACE_STRING("mb_field_decoding_flag");
+				currMB->mb_field = readFieldModeInfo_CABAC(currMB, &dP->de_cabac);
+			}
+			if (check_bottom)
+				check_next_mb_and_get_field_mode_CABAC(currSlice,dP);
+
+			// availability is re-evaluated after the field flag handling above
+			CheckAvailabilityOfNeighborsCABAC(currMB);
+		}
+
+		// read MB type
+		if (currMB->mb_type != 0 )
+		{
+			TRACE_STRING("mb_type");
+			currMB->mb_type = readMB_typeInfo_CABAC(currMB, &dP->de_cabac);
+			currMB->ei_flag = 0;
+		}
+	}
+	// VLC Non-Intra
+	else
+	{
+		// cod_counter == -1: no skip run pending, so read a new mb_skip_run
+		if(p_Vid->cod_counter == -1)
+		{
+			TRACE_STRING("mb_skip_run");
+			readSyntaxElement_UVLC(&currSE, dP);
+			p_Vid->cod_counter = currSE.value1;
+		}
+		if (p_Vid->cod_counter==0)
+		{
+			// read MB aff
+			if ((currSlice->mb_aff_frame_flag) && (((mb_nr&0x01)==0) || ((mb_nr&0x01) && prevMbSkipped)))
+			{
+				TRACE_STRING("mb_field_decoding_flag");
+				currMB->mb_field = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+			}
+
+			// read MB type
+			TRACE_STRING("mb_type");
+			readSyntaxElement_UVLC(&currSE, dP);
+			// NOTE(review): same P/SP adjustment as in read_one_macroblock_p_slice; whether it
+			// can trigger depends on which slice types this function is installed for - confirm
+			if(currSlice->slice_type == P_SLICE || currSlice->slice_type == SP_SLICE)
+				++(currSE.value1);
+			currMB->mb_type = currSE.value1;
+			currMB->ei_flag = 0;
+			// counter drops to -1, so the next macroblock reads a new mb_skip_run
+			p_Vid->cod_counter--;
+			currMB->skip_flag = 0;
+		}
+		else
+		{
+			// inside a skip run: consume one skipped macroblock
+			p_Vid->cod_counter--;
+			currMB->mb_type = 0;
+			currMB->ei_flag = 0;
+			currMB->skip_flag = 1;
+
+			// read field flag of bottom block
+			if(currSlice->mb_aff_frame_flag)
+			{
+				if(p_Vid->cod_counter == 0 && ((mb_nr&0x01) == 0))
+				{
+					TRACE_STRING("mb_field_decoding_flag (of coded bottom mb)");
+					currMB->mb_field = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+					// peek only: the bit offset is moved back so the flag is read again later
+					dP->bitstream->frame_bitoffset--;
+					TRACE_DECBITS(1);
+				}
+				else if (p_Vid->cod_counter > 0 && ((mb_nr & 0x01) == 0))
+				{
+					// check left macroblock pair first
+					if (mb_is_available(mb_nr - 2, currMB) && ((mb_nr % (p_Vid->PicWidthInMbs * 2))!=0))
+					{
+						currMB->mb_field = p_Vid->mb_data[mb_nr-2].mb_field;
+					}
+					else
+					{
+						// check top macroblock pair
+						if (mb_is_available(mb_nr - 2*p_Vid->PicWidthInMbs, currMB))
+						{
+							currMB->mb_field = p_Vid->mb_data[mb_nr-2*p_Vid->PicWidthInMbs].mb_field;
+						}
+						else
+							currMB->mb_field = FALSE;
+					}
+				}
+			}
+		}
+	}
+
+	// record the field flag in the picture's per-macroblock array
+	motion->mb_field[mb_nr] = (byte) currMB->mb_field;
+
+	// for field macroblocks in MBAFF the block row is halved (odd mb_nr: after subtracting 4)
+	currMB->block_y_aff = ((currSlice->mb_aff_frame_flag) && (currMB->mb_field)) ? (mb_nr&0x01) ? (currMB->block_y - 4)>>1 : currMB->block_y >> 1 : currMB->block_y;
+
+	p_Vid->siblock[currMB->mb_y][currMB->mb_x] = 0;
+
+	// slice-type specific handler installed on the slice
+	currSlice->interpret_mb_mode(currMB);
+
+	// field macroblocks in MBAFF: the active reference counts of the slice are doubled
+	// NOTE(review): no matching restore is visible in this function; presumably the
+	// caller undoes it after the macroblock - confirm
+	if(currSlice->mb_aff_frame_flag)
+	{
+		if(currMB->mb_field)
+		{
+			currSlice->num_ref_idx_l0_active <<=1;
+			currSlice->num_ref_idx_l1_active <<=1;
+		}
+	}
+
+	//init NoMbPartLessThan8x8Flag
+	currMB->NoMbPartLessThan8x8Flag = (IS_DIRECT(currMB) && !(p_Vid->active_sps->direct_8x8_inference_flag))? FALSE: TRUE;
+
+	//====== READ 8x8 SUB-PARTITION MODES (modes of 8x8 blocks) and Intra VBST block modes ======
+	if (currMB->mb_type == P8x8)
+	{
+		dP = &(currSlice->partArr[partMap[SE_MBTYPE]]);
+
+		// one sub_mb_type per 8x8 block, translated by SetB8Mode()
+		if (p_Vid->active_pps->entropy_coding_mode_flag ==CAVLC)
+		{
+			currSE.mapping = linfo_ue;
+			for (i = 0; i < 4; ++i)
+			{
+				TRACE_STRING("sub_mb_type");
+				readSyntaxElement_UVLC(&currSE, dP);
+				SetB8Mode (currMB, currSE.value1, i);
+
+				//set NoMbPartLessThan8x8Flag for P8x8 mode
+				currMB->NoMbPartLessThan8x8Flag &= (currMB->b8mode[i]==0 && p_Vid->active_sps->direct_8x8_inference_flag) ||
+					(currMB->b8mode[i]==4);
+			}
+		}
+		else
+		{
+			for (i = 0; i < 4; ++i)
+			{
+				int value = readB8_typeInfo_CABAC(currSlice,  &dP->de_cabac);
+				SetB8Mode (currMB, value, i);
+
+				//set NoMbPartLessThan8x8Flag for P8x8 mode
+				currMB->NoMbPartLessThan8x8Flag &= (currMB->b8mode[i]==0 && p_Vid->active_sps->direct_8x8_inference_flag) ||
+					(currMB->b8mode[i]==4);
+			}
+		}
+
+		//--- init macroblock data ---
+		init_macroblock       (currMB);
+		currSlice->read_motion_info_from_NAL (currMB);
+	}
+
+	//============= Transform Size Flag for INTRA MBs =============
+	//-------------------------------------------------------------
+	//transform size flag for INTRA_4x4 and INTRA_8x8 modes
+	if (currMB->mb_type == I4MB && p_Vid->Transform8x8Mode)
+	{
+		dP = &(currSlice->partArr[partMap[SE_HEADER]]);
+		TRACE_STRING("transform_size_8x8_flag");
+
+		// read CAVLC transform_size_8x8_flag
+		if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		{
+			currMB->luma_transform_size_8x8_flag = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+		}
+		else
+		{
+			currMB->luma_transform_size_8x8_flag = readMB_transform_size_flag_CABAC(currMB, &dP->de_cabac);
+		}
+
+
+		// a set flag turns the macroblock into I8MB: all four 8x8 modes become I8MB, directions -1
+		if (currMB->luma_transform_size_8x8_flag)
+		{      
+			currMB->mb_type = I8MB;
+			memset(&currMB->b8mode, I8MB, 4 * sizeof(char));
+			memset(&currMB->b8pdir, -1, 4 * sizeof(char));
+		}
+	}
+	else
+	{
+		currMB->luma_transform_size_8x8_flag = FALSE;
+	}
+
+	// with constrained intra prediction, non-intra macroblocks are marked 0 in intra_block
+	if(p_Vid->active_pps->constrained_intra_pred_flag) // inter frame
+	{
+		if( !IS_INTRA(currMB) )
+		{
+			p_Vid->intra_block[mb_nr] = 0;
+		}
+	}
+
+	//--- init macroblock data ---
+	// (P8x8 macroblocks were already initialised before their motion data was read)
+	if (currMB->mb_type != P8x8)
+		init_macroblock(currMB);
+
+	// direct macroblock while cod_counter is non-negative: no coefficients are read;
+	// under CABAC the counter is put back to -1
+	if (IS_DIRECT (currMB) && p_Vid->cod_counter >= 0)
+	{
+		currMB->cbp = 0;
+		reset_coeffs(currSlice);
+
+		if (p_Vid->active_pps->entropy_coding_mode_flag ==CABAC)
+			p_Vid->cod_counter=-1;
+	}
+	else if (IS_SKIP (currMB)) //keep last macroblock
+	{
+		skip_macroblock(currMB);
+	}
+	else if(currMB->mb_type != IPCM)
+	{
+		// intra prediction modes for a macroblock 4x4 **********************************************
+		if (IS_INTRA(currMB))
+			read_ipred_modes(currMB);
+
+		// read inter frame vector data *********************************************************
+		if (IS_INTERMV (currMB) && (currMB->mb_type != P8x8))
+		{
+			currSlice->read_motion_info_from_NAL (currMB);
+		}
+		// read CBP and Coeffs  ***************************************************************
+		currSlice->read_CBP_and_coeffs_from_NAL (currMB);
+	}
+	else
+	{
+		//read pcm_alignment_zero_bit and pcm_byte[i]
+
+		// here dP is assigned with the same dP as SE_MBTYPE, because IPCM syntax is in the
+		// same category as MBTYPE
+		// NOTE(review): the code below selects partMap[SE_LUM_DC_INTRA], not SE_MBTYPE;
+		// whether both map to the same partition depends on assignSE2partition - confirm
+		if ( currSlice->dp_mode && currSlice->dpB_NotPresent )
+		{
+			// samples cannot be read: fill them with DC values instead
+			concealIPCMcoeffs(currMB);
+		}
+		else
+		{
+			dP = &(currSlice->partArr[partMap[SE_LUM_DC_INTRA]]);
+			read_IPCM_coeffs_from_NAL(currSlice, dP);
+		}
+	}
+
+	return;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Restart the arithmetic decoder of every data partition of the slice
+*    at that partition's current read position (used after an IPCM
+*    macroblock has been decoded with CABAC; IPCM CABAC 28/11/2003).
+*
+*    One partition is restarted for PAR_DP_1 and three for PAR_DP_3; any
+*    other partition mode prints a message and terminates the process.
+*
+* \author
+*    Dong Wang <Dong.Wang@bristol.ac.uk>
+************************************************************************
+*/
+static void init_decoding_engine_IPCM(Slice *currSlice)
+{   
+	int partition_count = 0;
+	int part_idx;
+
+	if (currSlice->dp_mode == PAR_DP_1)
+		partition_count = 1;
+	else if (currSlice->dp_mode == PAR_DP_3)
+		partition_count = 3;
+
+	if (partition_count == 0)
+	{
+		printf("Partition Mode is not supported\n");
+		exit(1);
+	}
+
+	for (part_idx = 0; part_idx < partition_count; ++part_idx)
+	{
+		DataPartition *part = &currSlice->partArr[part_idx];
+		Bitstream *stream = part->bitstream;
+		/* start position is captured by value; the decoder also receives
+		   the address of read_len */
+		int start_byte = stream->read_len;
+
+		arideco_start_decoding (&part->de_cabac, stream->streamBuffer, start_byte, &stream->read_len);
+	}
+}
+
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Read IPCM pcm_alignment_zero_bit and pcm_byte[i] from stream to currSlice->ipcm
+*    (for IPCM CABAC and IPCM CAVLC)
+*
+*    CABAC: the samples are fetched by readIPCM_CABAC() and the decoding
+*    engine is restarted afterwards.
+*    CAVLC: the stream is first advanced to the next byte boundary, then
+*    16x16 luma samples and, unless the picture is YUV400 or uses
+*    independent colour planes, two chroma planes are read with fixed
+*    length codes of bitdepth_luma / bitdepth_chroma bits.
+*
+* \author
+*    Dong Wang <Dong.Wang@bristol.ac.uk>
+************************************************************************
+*/
+
+/* Reads rows x cols fixed-length samples of 'bits' bits each from 'stream'
+ * into plane 'plane' of currSlice->ipcm, row by row. */
+static void ipcm_read_plane_flc(Slice *currSlice, Bitstream *stream, int plane, int rows, int cols, int bits)
+{
+	int r, c;
+
+	for (r = 0; r < rows; ++r)
+	{
+		for (c = 0; c < cols; ++c)
+		{
+			currSlice->ipcm[plane][r][c] = readSyntaxElement_FLC(stream, bits);
+		}
+	}
+}
+
+static void read_IPCM_coeffs_from_NAL(Slice *currSlice, struct datapartition *dP)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int misaligned_bits;
+
+	//For CABAC, we don't need to read bits to let stream byte aligned
+	//  because we have variable for integer bytes position
+	if(p_Vid->active_pps->entropy_coding_mode_flag  == CABAC)
+	{
+		readIPCM_CABAC(currSlice, dP);
+		init_decoding_engine_IPCM(currSlice);
+		return;
+	}
+
+	//read bits to let stream byte aligned
+	misaligned_bits = (dP->bitstream->frame_bitoffset) & 0x07;
+	if (misaligned_bits != 0)
+	{
+		TRACE_STRING("pcm_alignment_zero_bit");
+		readSyntaxElement_FLC(dP->bitstream, (8 - misaligned_bits));
+	}
+
+	//read luma and chroma IPCM coefficients
+	TRACE_STRING("pcm_sample_luma");
+	ipcm_read_plane_flc(currSlice, dP->bitstream, 0, MB_BLOCK_SIZE, MB_BLOCK_SIZE, p_Vid->bitdepth_luma);
+
+	if ((dec_picture->chroma_format_idc != YUV400) && !IS_INDEPENDENT(p_Vid))
+	{
+		TRACE_STRING("pcm_sample_chroma (u)");
+		ipcm_read_plane_flc(currSlice, dP->bitstream, 1, p_Vid->mb_cr_size_y, p_Vid->mb_cr_size_x, p_Vid->bitdepth_chroma);
+
+		TRACE_STRING("pcm_sample_chroma (v)");
+		ipcm_read_plane_flc(currSlice, dP->bitstream, 2, p_Vid->mb_cr_size_y, p_Vid->mb_cr_size_x, p_Vid->bitdepth_chroma);
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Read and decode the intra prediction modes of the four 4x4 blocks
+*    that make up 8x8 block b8, and store them in p_Vid->ipredmode.
+*
+*    For every 4x4 block the coded value is read from the bitstream
+*    (CAVLC or CABAC), the most probable mode is derived from the left
+*    and upper neighbours, and the two are combined into the final mode.
+************************************************************************
+*/
+
+
+static void __forceinline read_ipred_iblock(VideoParameters *p_Vid, Macroblock *currMB, Slice *currSlice, DataPartition *dP, int b8)
+{
+	const int first_bx = (b8 & 1) << 1;   // 4x4 column of the 8x8 block inside the MB
+	const int first_by = (b8 & 2);        // 4x4 row of the 8x8 block inside the MB
+	SyntaxElement currSE;
+	PixelPos left_block, top_block;
+	int sub;
+
+	for (sub = 0; sub < 4; ++sub)  // the 2x2 sub-blocks, row by row
+	{
+		const int bx = first_bx + (sub & 1);
+		const int by = first_by + (sub >> 1);
+		int coded_mode;
+		int predicted_mode;
+		int up_mode = -1;
+		int left_mode = -1;
+		int left_is_si = 0;
+		int top_is_si = 0;
+
+		// value as coded in the stream
+		if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+		{
+			readSyntaxElement_Intra4x4PredictionMode(&currSE, dP->bitstream);
+			coded_mode = currSE.value1;
+		}
+		else
+		{
+			coded_mode = readIntraPredMode_CABAC(currSlice, &dP->de_cabac);
+		}
+
+		p_Vid->getNeighbourXPLumaNB(currMB, (bx<<2) - 1, (by<<2),      &left_block);
+		p_Vid->getNeighbourPXLumaNB(currMB, (bx<<2),     (by<<2) - 1,  &top_block );
+
+		// with constrained intra prediction only intra-coded neighbours count
+		if (p_Vid->active_pps->constrained_intra_pred_flag)
+		{
+			if (left_block.available)
+				left_block.available = p_Vid->intra_block[left_block.mb_addr];
+			if (top_block.available)
+				top_block.available = p_Vid->intra_block[top_block.mb_addr];
+		}
+
+		// !! KS: not sure if the following is still correct...
+		// neighbours flagged in p_Vid->siblock are excluded (need support for MBINTLC1)
+		if (currMB->mb_type == I4MB && currSlice->slice_type == SI_SLICE)
+		{
+			if (left_block.available && p_Vid->siblock [left_block.mb_addr / p_Vid->PicWidthInMbs][left_block.mb_addr % p_Vid->PicWidthInMbs])
+				left_is_si = 1;
+
+			if (top_block.available && p_Vid->siblock [top_block.mb_addr / p_Vid->PicWidthInMbs][top_block.mb_addr % p_Vid->PicWidthInMbs])
+				top_is_si = 1;
+		}
+
+		if (top_block.available && (top_is_si == 0))
+			up_mode = p_Vid->ipredmode[top_block.pos_y>>2 ][top_block.pos_x>>2 ];
+		if (left_block.available && (left_is_si == 0))
+			left_mode = p_Vid->ipredmode[left_block.pos_y>>2][left_block.pos_x>>2];
+
+		// most probable mode: DC when a neighbour is missing, otherwise the smaller mode
+		if (up_mode < 0 || left_mode < 0)
+			predicted_mode = DC_PRED;
+		else if (up_mode < left_mode)
+			predicted_mode = up_mode;
+		else
+			predicted_mode = left_mode;
+
+		// -1 selects the predicted mode; other values skip over the predicted mode
+		p_Vid->ipredmode[currMB->block_y + by][currMB->block_x + bx] =
+			(coded_mode == -1) ? predicted_mode : coded_mode + (coded_mode >= predicted_mode);
+	}
+}
+
+/* Read and decode the single intra prediction mode of 8x8 block b8 and
+ * replicate it into the four 4x4 entries of p_Vid->ipredmode it covers. */
+static void __forceinline read_ipred_i8mb(VideoParameters *p_Vid, Macroblock *currMB, Slice *currSlice, DataPartition *dP, int b8)
+{
+	const int by = (b8 & 2);              // 4x4 row of the block inside the MB
+	const int bx = ((b8 & 1) << 1);       // 4x4 column of the block inside the MB
+	const int bj = currMB->block_y + by;  // row in p_Vid->ipredmode
+	const int bi = currMB->block_x + bx;  // column in p_Vid->ipredmode
+	SyntaxElement currSE;
+	PixelPos left_block, top_block;
+	int coded_mode;
+	int predicted_mode;
+	int final_mode;
+	int up_mode = -1;
+	int left_mode = -1;
+	int left_is_si = 0;
+	int top_is_si = 0;
+	int r, c;
+
+	// value as coded in the stream
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+	{
+		readSyntaxElement_Intra4x4PredictionMode(&currSE, dP->bitstream);
+		coded_mode = currSE.value1;
+	}
+	else
+	{
+		coded_mode = readIntraPredMode_CABAC(currSlice, &dP->de_cabac);
+	}
+
+	p_Vid->getNeighbourXPLumaNB(currMB, (bx<<2) - 1, (by<<2),      &left_block);
+	p_Vid->getNeighbourPXLumaNB(currMB, (bx<<2),     (by<<2) - 1,  &top_block );
+
+	// with constrained intra prediction only intra-coded neighbours count
+	if (p_Vid->active_pps->constrained_intra_pred_flag)
+	{
+		if (left_block.available)
+			left_block.available = p_Vid->intra_block[left_block.mb_addr];
+		if (top_block.available)
+			top_block.available = p_Vid->intra_block[top_block.mb_addr];
+	}
+
+	// !! KS: not sure if the following is still correct...
+	// neighbours flagged in p_Vid->siblock are excluded (need support for MBINTLC1)
+	if (currMB->mb_type == I4MB && currSlice->slice_type == SI_SLICE)
+	{
+		if (left_block.available && p_Vid->siblock [left_block.mb_addr / p_Vid->PicWidthInMbs][left_block.mb_addr % p_Vid->PicWidthInMbs])
+			left_is_si = 1;
+
+		if (top_block.available && p_Vid->siblock [top_block.mb_addr / p_Vid->PicWidthInMbs][top_block.mb_addr % p_Vid->PicWidthInMbs])
+			top_is_si = 1;
+	}
+
+	if (top_block.available && (top_is_si == 0))
+		up_mode = p_Vid->ipredmode[top_block.pos_y>>2 ][top_block.pos_x>>2 ];
+	if (left_block.available && (left_is_si == 0))
+		left_mode = p_Vid->ipredmode[left_block.pos_y>>2][left_block.pos_x>>2];
+
+	// most probable mode: DC when a neighbour is missing, otherwise the smaller mode
+	if (up_mode < 0 || left_mode < 0)
+		predicted_mode = DC_PRED;
+	else if (up_mode < left_mode)
+		predicted_mode = up_mode;
+	else
+		predicted_mode = left_mode;
+
+	// -1 selects the predicted mode; other values skip over the predicted mode
+	if (coded_mode == -1)
+		final_mode = predicted_mode;
+	else
+		final_mode = coded_mode + (coded_mode >= predicted_mode);
+
+	// the 8x8 block spans a 2x2 group of 4x4 entries
+	for (r = 0; r < 2; ++r)
+	{
+		for (c = 0; c < 2; ++c)
+		{
+			p_Vid->ipredmode[bj + r][bi + c] = final_mode;
+		}
+	}
+}
+
+/* Read the luma intra prediction modes of every IBLOCK / I8MB 8x8 block of
+ * the macroblock and, when the chroma format calls for it, the chroma intra
+ * prediction mode (stored in currMB->c_ipred_mode and range-checked). */
+static void read_ipred_modes(Macroblock *currMB)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	DataPartition *dP = &(currSlice->partArr[partMap[SE_INTRAPREDMODE]]);
+	SyntaxElement currSE;
+	char need_chroma_mode = IS_INTRA(currMB);
+	int blk;
+
+	for (blk = 0; blk < 4; ++blk)  // the four 8x8 blocks
+	{
+		const int mode = currMB->b8mode[blk];
+
+		if (mode == IBLOCK)
+		{
+			need_chroma_mode = 1;
+			read_ipred_iblock(p_Vid, currMB, currSlice, dP, blk);
+		}
+		else if (mode == I8MB)
+		{
+			need_chroma_mode = 1;
+			read_ipred_i8mb(p_Vid, currMB, currSlice, dP, blk);
+		}
+	}
+
+	// the chroma mode is only coded for intra content in formats other than 4:0:0 and 4:4:4
+	if (!need_chroma_mode || (dec_picture->chroma_format_idc == YUV400) || (dec_picture->chroma_format_idc == YUV444))
+	{
+		return;
+	}
+
+	TRACE_STRING("intra_chroma_pred_mode");
+	dP = &(currSlice->partArr[partMap[SE_INTRAPREDMODE]]);
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+	{
+		currSE.mapping = linfo_ue;
+		readSyntaxElement_UVLC(&currSE, dP);
+		currMB->c_ipred_mode = (char) currSE.value1;
+	}
+	else
+	{
+		currMB->c_ipred_mode = readCIPredMode_CABAC(currMB, &dP->de_cabac);
+	}
+
+	if (currMB->c_ipred_mode < DC_PRED_8 || currMB->c_ipred_mode > PLANE_8)
+	{
+		error("illegal chroma intra pred mode!\n", 600);
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Get current block spatial neighbors
+*
+*    Fills block[0..3] with the left, upper, upper-right and upper-left
+*    neighbours of the block at (mb_x, mb_y) with width blockshape_x.
+*    pos_x / pos_y of every available neighbour are converted to 4x4
+*    block units (>> 2). Unavailable neighbours are left untouched, the
+*    same way get_neighbors0016() treats them, so position fields that
+*    the getNeighbour callbacks may not have written are never read.
+*
+*    For the block positions tested below the upper-right neighbour is
+*    forced unavailable (presumably because it is not decoded yet at that
+*    point of the scan -- confirm against the standard). An unavailable
+*    upper-right neighbour is replaced by a copy of the upper-left one.
+************************************************************************
+*/
+void get_neighbors(Macroblock *currMB,       // <--  current Macroblock
+									 PixelPos   *block,     // <--> neighbor blocks
+									 int         mb_x,         // <--  block x position
+									 int         mb_y,         // <--  block y position
+									 int         blockshape_x  // <--  block width
+									 )
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int i;
+	p_Vid->getNeighbourXPLumaNB(currMB, mb_x - 1,            mb_y    ,  &block[0]); // left
+	p_Vid->getNeighbourPXLumaNB(currMB, mb_x,                mb_y - 1,  &block[1]); // up
+	p_Vid->getNeighbourPXLuma(currMB, mb_x + blockshape_x, mb_y - 1,  &block[2]); // upper right
+	p_Vid->getNeighbourLuma(currMB, mb_x - 1,            mb_y - 1,  &block[3]); // upper left
+	for (i = 0; i < 4; i++)
+	{
+		// only available neighbours carry a meaningful position
+		if (block[i].available)
+		{
+			block[i].pos_x >>= 2;
+			block[i].pos_y >>= 2;
+		}
+	}
+
+	if (mb_y > 0)
+	{
+		if (mb_x < 8)  // first column of 8x8 blocks
+		{
+			if (mb_y == 8 )
+			{
+				if (blockshape_x == MB_BLOCK_SIZE)      
+					block[2].available  = 0;
+			}
+			else if (mb_x+blockshape_x == 8)
+			{
+				block[2].available = 0;
+			}
+		}
+		else if (mb_x + blockshape_x == MB_BLOCK_SIZE)
+		{
+			block[2].available = 0;
+		}
+	}
+
+	// fall back to the upper-left neighbour when upper-right cannot be used
+	if (!block[2].available)
+	{
+		block[2] = block[3];
+	}
+}
+
+/* Neighbour lookup for the case mb_x == 0, mb_y == 0 and blockshape_x == 16.
+ * Fills block[0..3] (left, up, upper right, upper left), converts the
+ * position of each available neighbour to 4x4 block units and substitutes
+ * the upper-left neighbour when the upper-right one is unavailable. */
+void get_neighbors0016(Macroblock *currMB,       // <--  current Macroblock
+											 PixelPos   *block   // <--> neighbor blocks
+											 )
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos *nb;
+
+	p_Vid->getNeighbourLeftLuma(currMB, &block[0]);              // left
+	p_Vid->getNeighbourPXLumaNB(currMB, 0,  -1,  &block[1]);     // up
+	p_Vid->getNeighbourPXLuma(currMB, 16, -1,  &block[2]);       // upper right
+	p_Vid->getNeighbourLuma(currMB, -1, -1,  &block[3]);         // upper left
+
+	for (nb = block; nb != block + 4; ++nb)
+	{
+		if (!nb->available)
+			continue;
+
+		nb->pos_x >>= 2;
+		nb->pos_y >>= 2;
+	}
+
+	if (!block[2].available)
+		block[2] = block[3];
+}
+
+/*!
+************************************************************************
+* \brief
+*    Read motion info for a macroblock of a P slice: the reference
+*    indices of both lists, the LIST_0 motion vectors, and finally the
+*    LIST_0 ref_pic_id values kept for deblocking decisions.
+************************************************************************
+*/
+static void read_motion_info_from_NAL_p_slice(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	DataPartition *dP = NULL;
+	h264_ref_t *pic_num;
+
+	const int partmode = (currMB->mb_type == P8x8) ? 4 : currMB->mb_type;
+	const int step_h0  = BLOCK_STEP [partmode][0];
+	const int step_v0  = BLOCK_STEP [partmode][1];
+	int list_offset = 0;
+	int first_row;
+	int list, row, col;
+
+	// field macroblocks of an MBAFF frame use offset 2, or 4 for odd MB addresses
+	if ((currSlice->mb_aff_frame_flag) && (currMB->mb_field))
+	{
+		list_offset = (currMB->mbAddrX & 0x01) ? 4 : 2;
+	}
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC)
+	{
+		SyntaxElement currSE;
+
+		//=====  READ REFERENCE PICTURE INDICES (LIST_0, then LIST_1) =====
+		dP = &(currSlice->partArr[partMap[SE_REFFRAME]]);
+		for (list = LIST_0; list <= LIST_1; ++list)
+		{
+			const int num_ref = (list == LIST_0) ? currSlice->num_ref_idx_l0_active : currSlice->num_ref_idx_l1_active;
+
+			prepareListforRefIdx (currMB, &currSE, num_ref, (currMB->mb_type != P8x8) || (!p_Vid->allrefzero));
+			readMBRefPictureIdx  (&currSE, dP, currMB, &motion->motion[list][currMB->block_y], list, step_v0, step_h0);
+		}
+
+		//=====  READ MOTION VECTORS (LIST_0 only) =====
+		dP = &(currSlice->partArr[partMap[SE_MVD]]);
+
+		currSE.mapping = linfo_se;
+		readMBMotionVectors (&currSE, dP, currMB, LIST_0, step_h0, step_v0);
+	}
+	else
+	{
+		// indices are only parsed unless this is a P8x8 macroblock with allrefzero set
+		const int refs_coded = (currMB->mb_type != P8x8) || (!p_Vid->allrefzero);
+
+		//=====  READ REFERENCE PICTURE INDICES (LIST_0, then LIST_1) =====
+		dP = &(currSlice->partArr[partMap[SE_REFFRAME]]);
+		for (list = LIST_0; list <= LIST_1; ++list)
+		{
+			const int num_ref = (list == LIST_0) ? currSlice->num_ref_idx_l0_active : currSlice->num_ref_idx_l1_active;
+
+			if (refs_coded && num_ref > 1)
+			{
+				// more than one active reference picture: the index is in the stream
+				readMBRefPictureIdx_CABAC(dP, currMB, &motion->motion[list][currMB->block_y], list, step_v0, step_h0);
+			}
+			else
+			{
+				readMBRefPictureIdx_CABAC_NoReference(currMB, &motion->motion[list][currMB->block_y], list, step_v0, step_h0);
+			}
+		}
+
+		//=====  READ MOTION VECTORS (LIST_0 only) =====
+		dP = &(currSlice->partArr[partMap[SE_MVD]]);
+
+		readMBMotionVectors_CABAC(dP, currMB, LIST_0, step_h0, step_v0);
+	}
+
+	// record reference picture Ids for deblocking decisions
+	pic_num = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset];
+	first_row = currMB->block_y;
+	for (row = first_row; row < first_row + 4; ++row)
+	{
+		PicMotion *ref = &motion->motion[LIST_0][row][currMB->block_x];
+
+		for (col = 0; col < 4; ++col)
+		{
+			ref[col].ref_pic_id = (ref[col].ref_idx >= 0)?pic_num[(short)ref[col].ref_idx]:UNDEFINED_REFERENCE;
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Read motion info for a macroblock of a B slice.
+*
+*    Steps, in order:
+*     1. If mb_type is P8x8, every 8x8 sub-block whose b8mode is 0 gets
+*        direct-mode motion, spatial or temporal depending on
+*        currSlice->direct_spatial_mv_pred_flag.
+*     2. Reference indices and motion vectors of LIST_0 and LIST_1 are
+*        read from the bitstream (CAVLC or CABAC).
+*     3. ref_pic_id is filled for the 4x4 grid of the macroblock in both
+*        lists.
+************************************************************************
+*/
+static void read_motion_info_from_NAL_b_slice (Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	int i,j,k;
+	int mb_nr = currMB->mbAddrX;
+	DataPartition *dP = NULL;
+	const byte *partMap      = assignSE2partition[currSlice->dp_mode];
+	int partmode       = ((currMB->mb_type == P8x8) ? 4 : currMB->mb_type);
+	int step_h0        = BLOCK_STEP [partmode][0];
+	int step_v0        = BLOCK_STEP [partmode][1];
+
+	int i0, j0, j6;
+
+	int j4, i4, ii;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PicMotionParams *motion = &dec_picture->motion;
+	MotionParams *colocated;
+
+	int mv_scale = 0;
+
+	// 0 for frame macroblocks; for field macroblocks of an MBAFF frame 2, or 4 when mb_nr is odd
+	int list_offset = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field))? (mb_nr&0x01) ? 4 : 2 : 0;
+
+
+	// pick the colocated motion data that matches the macroblock: bottom / top for
+	// field macroblocks of an MBAFF frame (odd / even mb_nr), frame otherwise
+	if ((currSlice->mb_aff_frame_flag) && (currMB->mb_field))
+	{
+		if(mb_nr&0x01)
+		{
+			colocated = &currSlice->p_colocated->bottom;
+		}
+		else
+		{
+			colocated = &currSlice->p_colocated->top;
+		}
+	}
+	else
+	{
+		colocated = &currSlice->p_colocated->frame;
+	}
+
+	// direct-mode motion is only derived here for P8x8 macroblocks
+	if (currMB->mb_type == P8x8)
+	{
+		if (currSlice->direct_spatial_mv_pred_flag)
+		{
+			//===== spatial direct =====
+			char  l0_rFrame, l1_rFrame;
+			short pmvl0[2]={0,0}, pmvl1[2]={0,0};
+
+			// fills the predicted MVs and reference indices shared by all direct sub-blocks
+			prepare_direct_params(currMB, dec_picture, pmvl0, pmvl1, &l0_rFrame, &l1_rFrame);
+
+			for (k = 0; k < 4; ++k)
+			{        
+				if (currMB->b8mode[k] == 0)
+				{
+					i = currMB->block_x + 2 * (k & 0x01);
+					for(j = 2 * (k >> 1); j < 2 * (k >> 1)+2;++j)
+					{
+						j6 = currMB->block_y_aff + j;   // row used to index the colocated data
+						j4 = currMB->block_y     + j;   // row used to index the current picture
+						for(i4 = i; i4 < i + 2; ++i4)
+						{
+							if (l0_rFrame >= 0)
+							{
+								// reference 0 with a non-moving colocated block and a short-term
+								// first LIST_1 picture: zero motion instead of the predicted vector
+								if  (!l0_rFrame  && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+								{
+									motion->motion[LIST_0][j4][i4].mv[0] = 0;
+									motion->motion[LIST_0][j4][i4].mv[1] = 0;
+									motion->motion[LIST_0][j4][i4].ref_idx = 0;
+								}
+								else
+								{
+									motion->motion[LIST_0][j4][i4].mv[0] = pmvl0[0];
+									motion->motion[LIST_0][j4][i4].mv[1] = pmvl0[1];
+									motion->motion[LIST_0][j4][i4].ref_idx = l0_rFrame;
+								}
+							}
+							else
+							{
+								motion->motion[LIST_0][j4][i4].mv[0] = 0;
+								motion->motion[LIST_0][j4][i4].mv[1] = 0;
+								motion->motion[LIST_0][j4][i4].ref_idx = -1;
+							}
+
+							if (l1_rFrame >= 0)
+							{
+								if  (l1_rFrame==0 && ((!colocated->moving_block[j6][i4])&& (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+								{
+									motion->motion[LIST_1][j4][i4].mv[0] = 0;
+									motion->motion[LIST_1][j4][i4].mv[1] = 0;
+									motion->motion[LIST_1][j4][i4].ref_idx = 0;
+								}
+								else
+								{
+									motion->motion[LIST_1][j4][i4].mv[0] = pmvl1[0];
+									motion->motion[LIST_1][j4][i4].mv[1] = pmvl1[1];
+									motion->motion[LIST_1][j4][i4].ref_idx = l1_rFrame;
+								}
+							}
+							else
+							{
+								motion->motion[LIST_1][j4][i4].mv[0] = 0;
+								motion->motion[LIST_1][j4][i4].mv[1] = 0;
+								motion->motion[LIST_1][j4][i4].ref_idx = -1;
+							}
+
+							// neither list has a reference: both indices become 0 (MVs were zeroed above)
+							if (l0_rFrame <0 && l1_rFrame <0)
+							{
+								motion->motion[LIST_0][j4][i4].ref_idx = 0;
+								motion->motion[LIST_1][j4][i4].ref_idx = 0;
+							}
+						}
+					}
+				}
+			}
+		}
+		else
+		{
+			//===== temporal direct =====
+			for (k = 0; k < 4; ++k) // Scan all blocks
+			{
+				if (currMB->b8mode[k] == 0)
+				{
+					for(j0 = 2 * (k >> 1); j0 < 2 * (k >> 1) + 2; j0 += step_v0)
+					{
+						for(i0 = currMB->block_x + 2*(k & 0x01); i0 < currMB->block_x + 2 * (k & 0x01)+2; i0 += step_h0)
+						{
+							// use the colocated LIST_0 reference unless it is -1, then fall back to LIST_1
+							int refList = colocated->motion[LIST_0 ][currMB->block_y_aff + j0][i0].ref_idx== -1 ? LIST_1 : LIST_0;
+							int ref_idx = colocated->motion[refList][currMB->block_y_aff + j0][i0].ref_idx;
+							int mapped_idx = -1, iref;
+
+							if (ref_idx == -1)
+							{
+								// colocated block has no reference in either list: zero MVs, ref_idx 0 in both lists
+								for (j4 = currMB->block_y + j0; j4 < currMB->block_y + j0 + step_v0; ++j4)
+								{
+									int h;
+									for (h=0;h<step_h0;h++)
+									{
+										PicMotion *m0 = &motion->motion[LIST_0][j4][i0+h];
+										PicMotion *m1 = &motion->motion[LIST_1][j4][i0+h];
+										m0->ref_idx = 0;
+										m1->ref_idx = 0;
+										memset(&m0->mv, 0, sizeof(MotionVector));
+										memset(&m1->mv, 0, sizeof(MotionVector));
+									}
+								}
+							}
+							else
+							{
+								// search the current LIST_0 for the picture the colocated block refers to
+								for (iref = 0; iref < imin(currSlice->num_ref_idx_l0_active, p_Vid->listXsize[LIST_0 + list_offset]); ++iref)
+								{
+									int curr_mb_field = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field));
+
+									if(p_Vid->structure==0 && curr_mb_field==0)
+									{
+										// If the current MB is a frame MB and the colocated is from a field picture,
+										// then the colocated->ref_pic_id may have been generated from the wrong value of
+										// frame_poc if it references it's complementary field, so test both POC values
+										if(p_Vid->listX[0][iref]->top_poc * 2    == colocated->motion[refList][currMB->block_y_aff + j0][i0].ref_pic_id
+											|| p_Vid->listX[0][iref]->bottom_poc * 2 == colocated->motion[refList][currMB->block_y_aff + j0][i0].ref_pic_id)
+										{
+											mapped_idx=iref;
+											break;
+										}
+										else //! invalid index. Default to zero even though this case should not happen
+											mapped_idx=INVALIDINDEX;
+										continue;
+									}
+									if (dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset][iref]==colocated->motion[refList][currMB->block_y_aff + j0][i0].ref_pic_id)
+									{
+										mapped_idx=iref;
+										break;
+									}
+									else //! invalid index. Default to zero even though this case should not happen
+										mapped_idx=INVALIDINDEX;
+								}
+
+								// NOTE(review): if the search loop above runs zero times, mapped_idx keeps its
+								// initial value -1; unless INVALIDINDEX is -1 (its value is not visible here)
+								// that case passes this check and -1 is used as an array index below -- confirm.
+								if (INVALIDINDEX == mapped_idx)
+								{
+									error("temporal direct error: colocated block has ref that is unavailable",-1111);
+								}
+
+								for (j = j0; j < j0 + step_v0; ++j)
+								{
+									j4 = currMB->block_y + j;
+									j6 = currMB->block_y_aff + j;
+
+									for (i4 = i0; i4 < i0 + step_h0; ++i4)
+									{
+										mv_scale = currSlice->mvscale[LIST_0 + list_offset][mapped_idx];
+
+										motion->motion[LIST_0][j4][i4].ref_idx = (char) mapped_idx;
+										motion->motion[LIST_1][j4][i4].ref_idx = 0;
+
+										// no scaling: LIST_0 copies the colocated vector, LIST_1 is zero
+										if (mv_scale == 9999 || p_Vid->listX[LIST_0+list_offset][mapped_idx]->is_long_term)
+										{
+											for (ii=0; ii < 2; ++ii)
+											{
+												motion->motion[LIST_0][j4][i4].mv[ii] = colocated->motion[refList][j6][i4].mv[ii];
+												motion->motion[LIST_1][j4][i4].mv[ii] = 0;
+											}
+										}
+										else
+										{
+											// LIST_0 = colocated vector scaled by mv_scale / 256 (rounded),
+											// LIST_1 = LIST_0 vector minus the colocated vector
+											for (ii=0; ii < 2; ++ii)
+											{
+												motion->motion[LIST_0][j4][i4].mv[ii] = (short) ((mv_scale * colocated->motion[refList][j6][i4].mv[ii] + 128 ) >> 8);
+												motion->motion[LIST_1][j4][i4].mv[ii] = (short) (motion->motion[LIST_0][j4][i4].mv[ii] - colocated->motion[refList][j6][i4].mv[ii]);
+											}
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+
+
+
+	if (p_Vid->active_pps->entropy_coding_mode_flag == CAVLC) 
+	{
+		SyntaxElement currSE;
+		//=====  READ REFERENCE PICTURE INDICES =====
+		dP = &(currSlice->partArr[partMap[SE_REFFRAME]]);
+		//  For LIST_0, if multiple ref. pictures, read LIST_0 reference picture indices for the MB ***********
+		prepareListforRefIdx (currMB, &currSE,  currSlice->num_ref_idx_l0_active, TRUE);
+		readMBRefPictureIdx  (&currSE, dP, currMB, &motion->motion[LIST_0][currMB->block_y], LIST_0, step_v0, step_h0);
+
+		//  For LIST_1, if multiple ref. pictures, read LIST_1 reference picture indices for the MB ***********
+		prepareListforRefIdx (currMB, &currSE, currSlice->num_ref_idx_l1_active, TRUE);
+		readMBRefPictureIdx  (&currSE, dP, currMB, &motion->motion[LIST_1][currMB->block_y], LIST_1, step_v0, step_h0);
+
+		//=====  READ MOTION VECTORS =====
+		dP = &(currSlice->partArr[partMap[SE_MVD]]);
+
+		currSE.mapping = linfo_se;
+		// LIST_0 Motion vectors
+		readMBMotionVectors (&currSE, dP, currMB, LIST_0, step_h0, step_v0);
+		// LIST_1 Motion vectors
+		readMBMotionVectors (&currSE, dP, currMB, LIST_1, step_h0, step_v0);
+	}
+	else                                                  
+	{
+		//=====  READ REFERENCE PICTURE INDICES =====
+		dP = &(currSlice->partArr[partMap[SE_REFFRAME]]);
+		if (currSlice->num_ref_idx_l0_active>1)
+		{
+			//  For LIST_0, if multiple ref. pictures, read LIST_0 reference picture indices for the MB ***********
+			readMBRefPictureIdx_CABAC(dP, currMB, &motion->motion[LIST_0][currMB->block_y], LIST_0, step_v0, step_h0);
+		}
+		else
+		{
+			readMBRefPictureIdx_CABAC_NoReference(currMB, &motion->motion[LIST_0][currMB->block_y], LIST_0, step_v0, step_h0);
+		}
+
+		if (currSlice->num_ref_idx_l1_active > 1)
+		{
+			//  For LIST_1, if multiple ref. pictures, read LIST_1 reference picture indices for the MB ***********
+			readMBRefPictureIdx_CABAC(dP, currMB, &motion->motion[LIST_1][currMB->block_y], LIST_1, step_v0, step_h0);
+		}
+		else
+		{
+			readMBRefPictureIdx_CABAC_NoReference(currMB, &motion->motion[LIST_1][currMB->block_y], LIST_1, step_v0, step_h0);
+		}
+
+		//=====  READ MOTION VECTORS =====
+		dP = &(currSlice->partArr[partMap[SE_MVD]]);
+
+		// LIST_0 Motion vectors
+		readMBMotionVectors_CABAC(dP, currMB, LIST_0, step_h0, step_v0);
+		// LIST_1 Motion vectors
+		readMBMotionVectors_CABAC(dP, currMB, LIST_1, step_h0, step_v0);
+	}
+
+
+
+	// record reference picture Ids for deblocking decisions
+
+	for (k = LIST_0; k <= LIST_1; ++k)
+	{
+		const h264_ref_t *rec_pic_num = dec_picture->ref_pic_num[p_Vid->current_slice_nr][k+list_offset];
+		PicMotion **list_motion = &motion->motion[k][currMB->block_y];
+		for(j4 = 0; j4 < 4 ;++j4)
+		{
+			// the four 4x4 entries of this row; a negative ref_idx maps to UNDEFINED_REFERENCE
+			PicMotion *m = &list_motion[j4][currMB->block_x];
+			m[0].ref_pic_id = (m[0].ref_idx>=0)?rec_pic_num[(short)m[0].ref_idx]:UNDEFINED_REFERENCE;
+			m[1].ref_pic_id = (m[1].ref_idx>=0)?rec_pic_num[(short)m[1].ref_idx]:UNDEFINED_REFERENCE;
+			m[2].ref_pic_id = (m[2].ref_idx>=0)?rec_pic_num[(short)m[2].ref_idx]:UNDEFINED_REFERENCE;
+			m[3].ref_pic_id = (m[3].ref_idx>=0)?rec_pic_num[(short)m[3].ref_idx]:UNDEFINED_REFERENCE;
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get the Prediction from the Neighboring Blocks for Number of 
+*    Nonzero Coefficients
+*
+*    Variant that reads plane 1 of p_Vid->nz_coeff and locates the left
+*    and upper neighbours with getNeighbourLuma().
+************************************************************************
+*/
+static int predict_nnz_cb(Macroblock *currMB, int i,int j)
+{
+	static const int x_off[2] = { -1,  0 };   // left neighbour, then top neighbour
+	static const int y_off[2] = {  0, -1 };
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos nb;
+	int total = 0;
+	int seen  = 0;
+	int n;
+
+	for (n = 0; n < 2; ++n)
+	{
+		p_Vid->getNeighbourLuma(currMB, i + x_off[n], j + y_off[n], &nb);
+
+		// in the constrained-intra / PAR_DP_3 case a non-intra neighbour is dropped
+		// from the sum but still counted
+		if (IS_INTRA(currMB) && nb.available && p_Vid->active_pps->constrained_intra_pred_flag && (p_Vid->currentSlice->dp_mode==PAR_DP_3))
+		{
+			nb.available &= p_Vid->intra_block[nb.mb_addr];
+			if (!nb.available)
+				++seen;
+		}
+
+		if (nb.available)
+		{
+			total += p_Vid->nz_coeff [nb.mb_addr ][1][nb.y>>2][nb.x>>2];
+			++seen;
+		}
+	}
+
+	// two counted neighbours: rounded average; otherwise the plain sum
+	return (seen == 2) ? ((total + 1) >> 1) : total;
+}
+
+
+/* Predict the number of nonzero coefficients from the left and upper
+ * neighbours, reading plane 2 of p_Vid->nz_coeff. Neighbours are located
+ * with getNeighbourLuma(). */
+static int predict_nnz_cr(Macroblock *currMB, int i,int j)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos nb;
+	int sum  = 0;
+	int hits = 0;
+	int side;
+
+	for (side = 0; side < 2; ++side)
+	{
+		if (side == 0)
+			p_Vid->getNeighbourLuma(currMB, i - 1, j, &nb);   // left block
+		else
+			p_Vid->getNeighbourLuma(currMB, i, j - 1, &nb);   // top block
+
+		// in the constrained-intra / PAR_DP_3 case a non-intra neighbour is dropped
+		// from the sum but still counted
+		if (IS_INTRA(currMB) && nb.available && p_Vid->active_pps->constrained_intra_pred_flag && (p_Vid->currentSlice->dp_mode==PAR_DP_3))
+		{
+			nb.available &= p_Vid->intra_block[nb.mb_addr];
+			if (!nb.available)
+				++hits;
+		}
+
+		if (!nb.available)
+			continue;
+
+		sum += p_Vid->nz_coeff [nb.mb_addr ][2][nb.y>>2][nb.x>>2];
+		++hits;
+	}
+
+	// two counted neighbours: rounded average
+	if (hits == 2)
+		sum = (sum + 1) >> 1;
+
+	return sum;
+}
+
+
+/* Predict the number of nonzero coefficients from the left and upper
+ * neighbours, reading plane 0 of p_Vid->nz_coeff. No constrained-intra
+ * handling is applied in this variant. */
+static int predict_nnz_luma(Macroblock *currMB, int i,int j)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos nb;
+	int sum  = 0;
+	int hits = 0;
+
+	// left block
+	p_Vid->getNeighbourXPLuma(currMB, i - 1, j, &nb);
+	if (nb.available)
+	{
+		sum = p_Vid->nz_coeff [nb.mb_addr ][0][nb.y>>2][nb.x>>2];
+		hits = 1;
+	}
+
+	// top block
+	p_Vid->getNeighbourPXLuma(currMB, i, j - 1, &nb);
+	if (nb.available)
+	{
+		sum += p_Vid->nz_coeff [nb.mb_addr ][0][nb.y>>2][nb.x>>2];
+		++hits;
+	}
+
+	// both neighbours contributed: rounded average; otherwise the plain sum
+	return (hits == 2) ? ((sum + 1) >> 1) : sum;
+}
+
+
+/* Predict the number of nonzero coefficients from the left and upper
+ * neighbours, reading plane 0 of p_Vid->nz_coeff. With constrained intra
+ * prediction and dp_mode PAR_DP_3 a neighbour that is not intra coded is
+ * left out of the sum but still counted. */
+static int predict_nnz_luma_intra(Macroblock *currMB, int i,int j)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos nb;
+	int sum  = 0;
+	int hits = 0;
+	int side;
+
+	for (side = 0; side < 2; ++side)
+	{
+		if (side == 0)
+			p_Vid->getNeighbourXPLuma(currMB, i - 1, j, &nb);   // left block
+		else
+			p_Vid->getNeighbourPXLuma(currMB, i, j - 1, &nb);   // top block
+
+		if (nb.available && p_Vid->active_pps->constrained_intra_pred_flag && (p_Vid->currentSlice->dp_mode==PAR_DP_3))
+		{
+			nb.available &= p_Vid->intra_block[nb.mb_addr];
+			if (!nb.available)
+				++hits;
+		}
+
+		if (nb.available)
+		{
+			sum += p_Vid->nz_coeff [nb.mb_addr ][0][nb.y>>2][nb.x>>2];
+			++hits;
+		}
+	}
+
+	// two counted neighbours: rounded average
+	if (hits == 2)
+		sum = (sum + 1) >> 1;
+
+	return sum;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Get the Prediction from the Neighboring Blocks for Number of 
+*    Nonzero Coefficients
+*
+*    Chroma Blocks, variant without constrained-intra handling.
+*    Returns 0 when the chroma format is YUV444.
+************************************************************************
+*/
+static int predict_nnz_chroma_inter(Macroblock *currMB, int i,int j)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PixelPos nb;
+	int sum  = 0;
+	int hits = 0;
+
+	if (dec_picture->chroma_format_idc == YUV444)
+		return sum;
+
+	//YUV420 and YUV422
+	// left block
+	p_Vid->getNeighbour(currMB, ((i&0x01)<<2) - 1, j, p_Vid->mb_size[IS_CHROMA], &nb);
+	if (nb.available)
+	{
+		sum = p_Vid->nz_coeff [nb.mb_addr ][1][nb.y>>2][2 * (i>>1) + (nb.x>>2)];
+		hits = 1;
+	}
+
+	// top block
+	p_Vid->getNeighbour(currMB, ((i&0x01)<<2), j - 1, p_Vid->mb_size[IS_CHROMA], &nb);
+	if (nb.available)
+	{
+		sum += p_Vid->nz_coeff [nb.mb_addr ][1][nb.y>>2][2 * (i>>1) + (nb.x>>2)];
+		++hits;
+	}
+
+	// both neighbours contributed: rounded average; otherwise the plain sum
+	return (hits == 2) ? ((sum + 1) >> 1) : sum;
+}
+
+
+/* Chroma prediction of the number of nonzero coefficients from the left and
+ * upper neighbours. With constrained intra prediction and dp_mode PAR_DP_3 a
+ * neighbour that is not intra coded is left out of the sum but still counted.
+ * Returns 0 when the chroma format is YUV444. */
+static int predict_nnz_chroma_intra(Macroblock *currMB, int i,int j)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	PixelPos nb;
+	int sum  = 0;
+	int hits = 0;
+	int side;
+
+	if (dec_picture->chroma_format_idc == YUV444)
+		return sum;
+
+	//YUV420 and YUV422
+	for (side = 0; side < 2; ++side)
+	{
+		if (side == 0)
+			p_Vid->getNeighbour(currMB, ((i&0x01)<<2) - 1, j, p_Vid->mb_size[IS_CHROMA], &nb);   // left block
+		else
+			p_Vid->getNeighbour(currMB, ((i&0x01)<<2), j - 1, p_Vid->mb_size[IS_CHROMA], &nb);   // top block
+
+		if (nb.available && p_Vid->active_pps->constrained_intra_pred_flag && (p_Vid->currentSlice->dp_mode==PAR_DP_3))
+		{
+			nb.available &= p_Vid->intra_block[nb.mb_addr];
+			if (!nb.available)
+				++hits;
+		}
+
+		if (nb.available)
+		{
+			sum += p_Vid->nz_coeff [nb.mb_addr ][1][nb.y>>2][2 * (i>>1) + (nb.x>>2)];
+			++hits;
+		}
+	}
+
+	// two counted neighbours: rounded average
+	if (hits == 2)
+		sum = (sum + 1) >> 1;
+
+	return sum;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Reads coeff of an 4x4 block (CAVLC)
+*
+*    Luma variant: the coefficient-count table is selected from the
+*    neighbour prediction returned by predict_nnz_luma_intra() (intra
+*    macroblocks) or predict_nnz_luma() (all others).
+*
+* \param currMB
+*    current macroblock
+* \param i
+*    4x4 block column, used to index p_Vid->nz_coeff; reused as the
+*    coefficient cursor once the run decoding starts
+* \param j
+*    4x4 block row, used to index p_Vid->nz_coeff
+* \param levarr
+*    output: coefficient levels (cleared first, then filled)
+* \param runarr
+*    output: zero runs preceding each coefficient (cleared first, then filled)
+* \param number_coefficients
+*    output: number of coefficients signalled for the block
+*
+* \author
+*    Karl Lillevold <karll@real.com>
+*    contributions by James Au <james@ubvideo.com>
+************************************************************************
+*/
+static void readCoeff4x4_CAVLC_Luma (Macroblock *currMB, 
+																		 int i, int j, int levarr[16], int runarr[16],
+																		 int *number_coefficients)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int mb_nr = currMB->mbAddrX;
+	SyntaxElement currSE;
+	DataPartition *dP;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	Bitstream *currStream;
+
+	int k, code, vlcnum;
+	int numcoeff = 0, numtrailingones, numcoeff_vlc;
+	int level_two_or_higher;
+	int numones, totzeros, abslevel;
+	int zerosleft;
+	int nnz;
+	// thresholds on |level| above which the level VLC table index is advanced
+	static const int incVlc[] = {0,3,6,12,24,48,32768};    // maximum vlc = 6
+
+	// cleared before the neighbour prediction runs; overwritten with the real count below
+	p_Vid->nz_coeff[mb_nr][0][j][i] = 0; 
+
+	// pick the data partition and the neighbour-based prediction (block position in pixels)
+	if (IS_INTRA (currMB))
+	{
+		dP = &(currSlice->partArr[partMap[SE_LUM_AC_INTRA]]);
+		nnz = predict_nnz_luma_intra(currMB, i<<2, j<<2);
+	}
+	else
+	{
+		dP = &(currSlice->partArr[partMap[SE_LUM_AC_INTER]]);
+		nnz = predict_nnz_luma(currMB, i<<2, j<<2);
+	}
+
+	// map the prediction onto one of four coefficient-count tables
+	if (nnz < 2)
+	{
+		numcoeff_vlc = 0;
+	}
+	else if (nnz < 4)
+	{
+		numcoeff_vlc = 1;
+	}
+	else if (nnz < 8)
+	{
+		numcoeff_vlc = 2;
+	}
+	else //
+	{
+		numcoeff_vlc = 3;
+	}
+
+	currStream = dP->bitstream;
+	readSyntaxElement_NumCoeffTrailingOnes(&currSE, currStream, numcoeff_vlc);
+
+	numcoeff        =  currSE.value1;
+	numtrailingones =  currSE.value2;
+
+	p_Vid->nz_coeff[mb_nr][0][j][i] = (byte) numcoeff;
+
+	// clear both output arrays (presumably 64 bytes each, i.e. 16 ints -- confirm memzero64)
+	memzero64(levarr);
+	memzero64(runarr);
+
+	// NOTE(review): numones is updated below but never read in this function
+	numones = numtrailingones;
+	*number_coefficients = numcoeff;
+
+	if (numcoeff)
+	{
+		if (numtrailingones)
+		{ 
+			// one sign bit per trailing one; they fill the last numtrailingones slots of levarr
+			code = readSyntaxElement_FLC(currStream, numtrailingones);
+
+			for (k=0;k<numtrailingones;k++)
+			{
+#ifdef _M_IX86
+				// same mapping as the portable branch: bit k set -> -1, clear -> +1
+				levarr[k+numcoeff-numtrailingones] = ((_bittest((const long *)&code, k)<<1) ^ 0xFFFFFFFF) + 2;
+#else
+				levarr[k+numcoeff-numtrailingones] = (code>>k)&1 ? -1:1;
+#endif
+			}
+		}
+
+		// decode levels
+		level_two_or_higher = (numcoeff > 3 && numtrailingones == 3)? 0 : 1;
+		vlcnum = (numcoeff > 10 && numtrailingones < 3) ? 1 : 0;
+
+		// remaining levels are read from the highest index down to 0
+		for (k = numcoeff - 1 - numtrailingones; k >= 0; k--)
+		{
+			int level;
+			if (vlcnum == 0)
+				level=readSyntaxElement_Level_VLC0(currStream);
+			else
+				level=readSyntaxElement_Level_VLCN(vlcnum, currStream);
+
+			// applied to the first level read only: its magnitude is increased by one
+			if (level_two_or_higher)
+			{
+				level += (level > 0) ? 1 : -1;
+				level_two_or_higher = 0;
+			}
+
+			levarr[k] = level;
+			abslevel = iabs(levarr[k]);
+			if (abslevel  == 1)
+				++numones;
+
+			// update VLC table
+			if (abslevel  > incVlc[vlcnum])
+				++vlcnum;
+
+			// a first level with magnitude above 3 jumps straight to table 2
+			if (k == numcoeff - 1 - numtrailingones && abslevel >3)
+				vlcnum = 2;      
+		}
+
+		if (numcoeff < 16)
+		{
+			// decode total run
+			vlcnum = numcoeff - 1;
+			totzeros = readSyntaxElement_TotalZeros(currStream, vlcnum);
+		}
+		else
+		{
+			// all 16 positions hold a coefficient, so no total-zeros element is read
+			totzeros = 0;
+		}
+
+		// decode run before each coefficient
+		zerosleft = totzeros;
+		i = numcoeff - 1;   // parameter i now walks the coefficients from last to first
+
+		if (zerosleft > 0 && i > 0)
+		{
+			do
+			{
+				// select VLC for runbefore
+				vlcnum = imin(zerosleft - 1, RUNBEFORE_NUM_M1);
+
+				runarr[i] = readSyntaxElement_Run(currStream, vlcnum);
+
+				zerosleft -= runarr[i];
+				i --;
+			} while (zerosleft != 0 && i != 0);
+		}
+		// whatever zeros remain are assigned to the coefficient the cursor stopped at
+		runarr[i] = zerosleft;    
+	} // if numcoeff
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Reads the AC coefficients of one 4x4 chroma block (CAVLC)
+*
+* \param currMB
+*    macroblock being decoded (supplies slice, picture state, MB address)
+* \param i, j
+*    block position; index p_Vid->nz_coeff[mb][0][j][i].  The nnz
+*    predictor is called with i and (j-4)<<2.  Note that i is reused as
+*    a scratch index once the runs are decoded.
+* \param levarr
+*    receives the decoded levels (cleared first)
+* \param runarr
+*    receives the zero runs (cleared first)
+* \param number_coefficients
+*    receives the coefficient count read from the bitstream
+************************************************************************
+*/
+static void readCoeff4x4_CAVLC_ChromaAC(Macroblock *currMB, 
+																				int i, int j, int levarr[16], int runarr[16],
+																				int *number_coefficients)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int mb_nr = currMB->mbAddrX;
+	SyntaxElement currSE;
+	DataPartition *dP;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	Bitstream *currStream;
+
+	int k, code, vlcnum;
+	int numcoeff = 0, numtrailingones, numcoeff_vlc;
+	int level_two_or_higher;
+	int numones, totzeros, abslevel;
+	int zerosleft, ntr;
+	int nnz;
+	// thresholds above which the level VLC table index is incremented;
+	// the last entry can never be exceeded, so vlcnum stays <= 6
+	static const int incVlc[] = {0,3,6,12,24,48,32768};    // maximum vlc = 6
+
+	// NOTE(review): label reads "ChrDC" although this is the AC reader -- kept unchanged
+	TRACE_PRINTF("ChrDC");
+	p_Vid->nz_coeff[mb_nr][0][j][i] = 0; 
+
+	// pick the data partition and the nnz predictor by intra / inter
+	if (IS_INTRA (currMB))
+	{
+		dP = &(currSlice->partArr[partMap[SE_CHR_AC_INTRA]]);
+		nnz = predict_nnz_chroma_intra(currMB, i, ((j-4)<<2));
+	}
+	else
+	{
+		dP = &(currSlice->partArr[partMap[SE_CHR_AC_INTER]]);
+		nnz = predict_nnz_chroma_inter(currMB, i, ((j-4)<<2));
+	}
+	currStream = dP->bitstream;  
+
+	// the predicted count selects one of four coeff_token VLC tables
+	if (nnz < 2)
+	{
+		numcoeff_vlc = 0;
+	}
+	else if (nnz < 4)
+	{
+		numcoeff_vlc = 1;
+	}
+	else if (nnz < 8)
+	{
+		numcoeff_vlc = 2;
+	}
+	else //
+	{
+		numcoeff_vlc = 3;
+	}
+
+	readSyntaxElement_NumCoeffTrailingOnes(&currSE, currStream, numcoeff_vlc);
+
+	numcoeff        =  currSE.value1;
+	numtrailingones =  currSE.value2;
+
+	// remember the count for this block position
+	p_Vid->nz_coeff[mb_nr][0][j][i] = (byte) numcoeff;
+
+	memzero64(levarr);
+	memzero64(runarr);
+
+	// numones is maintained below but never read in this function
+	numones = numtrailingones;
+	*number_coefficients = numcoeff;
+
+	if (numcoeff)
+	{
+		if (numtrailingones)
+		{      
+			// one sign bit per trailing one, read as a single fixed-length code
+			code = readSyntaxElement_FLC (currStream, numtrailingones);
+
+			// highest coefficient index takes the highest bit of code
+			ntr = numtrailingones;
+			for (k = numcoeff - 1; k > numcoeff - 1 - numtrailingones; k--)
+			{
+				ntr --;
+				levarr[k] = (code>>ntr)&1 ? -1 : 1;
+			}
+		}
+
+		// decode levels
+		// the first non-trailing level gets its magnitude raised by one,
+		// except when numcoeff > 3 and all three trailing ones were used
+		level_two_or_higher = (numcoeff > 3 && numtrailingones == 3)? 0 : 1;
+		vlcnum = (numcoeff > 10 && numtrailingones < 3) ? 1 : 0;
+
+		for (k = numcoeff - 1 - numtrailingones; k >= 0; k--)
+		{
+			// declared before the trace statement so the block stays valid C89
+			int level;
+
+#if TRACE
+			// the label is passed as a literal (no `type` variable exists here);
+			// it matches the string given to TRACE_PRINTF above
+			snprintf(currSE.tracestring,
+				TRACESTRING_SIZE, "%s lev (%d,%d) k=%d vlc=%d ", "ChrDC", i, j, k, vlcnum);
+#endif
+
+			if (vlcnum == 0)
+				level=readSyntaxElement_Level_VLC0(currStream);
+			else
+				level=readSyntaxElement_Level_VLCN(vlcnum, currStream);
+
+			if (level_two_or_higher)
+			{
+				level += (level > 0) ? 1 : -1;
+				level_two_or_higher = 0;
+			}
+
+			levarr[k] = level;
+			abslevel = iabs(levarr[k]);
+			if (abslevel  == 1)
+				++numones;
+
+			// update VLC table
+			if (abslevel  > incVlc[vlcnum])
+				++vlcnum;
+
+			// a first level with magnitude above 3 forces table 2
+			if (k == numcoeff - 1 - numtrailingones && abslevel >3)
+				vlcnum = 2;      
+		}
+
+		// total_zeros is only read when fewer than 15 coefficients are present
+		if (numcoeff < 15)
+		{
+			// decode total run
+			vlcnum = numcoeff - 1;
+			totzeros = readSyntaxElement_TotalZeros(currStream, vlcnum);
+		}
+		else
+		{
+			totzeros = 0;
+		}
+
+		// decode run before each coefficient
+		zerosleft = totzeros;
+		i = numcoeff - 1;
+
+		if (zerosleft > 0 && i > 0)
+		{
+			do
+			{
+				// select VLC for runbefore
+				vlcnum = imin(zerosleft - 1, RUNBEFORE_NUM_M1);
+
+				runarr[i] = readSyntaxElement_Run(currStream, vlcnum);
+
+				zerosleft -= runarr[i];
+				i --;
+			} while (zerosleft != 0 && i != 0);
+		}
+		// whatever zeros are left belong to the coefficient the loop stopped at
+		runarr[i] = zerosleft;    
+	} // if numcoeff
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reads the chroma DC coefficients of a macroblock (CAVLC)
+*
+*    Unlike the luma / chroma AC readers, the coefficient count is not
+*    written back to nz_coeff; the entry at [mb][0][j][i] is only reset.
+*
+* \param currMB
+*    macroblock being decoded (supplies slice, picture state, MB address)
+* \param i, j
+*    index the nz_coeff entry that is reset; i is reused as a scratch
+*    index once the runs are decoded
+* \param levarr
+*    receives the decoded levels (cleared first)
+* \param runarr
+*    receives the zero runs (cleared first)
+* \param number_coefficients
+*    receives the coefficient count read from the bitstream
+************************************************************************
+*/
+static void readCoeff4x4_CAVLC_ChromaDC(Macroblock *currMB, int i, int j, int levarr[16], int runarr[16], int *number_coefficients)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int mb_nr = currMB->mbAddrX;
+	SyntaxElement currSE;
+	DataPartition *dP;
+	Bitstream *currStream;
+
+	int k, code, vlcnum;
+	int numcoeff = 0, numtrailingones;
+	int level_two_or_higher;
+	int numones, totzeros, abslevel;
+	int zerosleft, ntr;
+	int max_coeff_num;
+	// thresholds above which the level VLC table index is incremented;
+	// the last entry can never be exceeded, so vlcnum stays <= 6
+	static const int incVlc[] = {0,3,6,12,24,48,32768};    // maximum vlc = 6
+
+	// block capacity comes from the picture state rather than a constant
+	max_coeff_num = p_Vid->num_cdc_coeff;
+	TRACE_PRINTF("ChrDC");
+	p_Vid->nz_coeff[mb_nr][0][j][i] = 0; 
+	// pick the data partition by intra / inter
+	if (IS_INTRA (currMB))
+		dP = &(currSlice->partArr[assignSE2partition[currSlice->dp_mode][SE_CHR_DC_INTRA]]);
+	else
+		dP = &(currSlice->partArr[assignSE2partition[currSlice->dp_mode][SE_CHR_DC_INTER]]);
+	currStream = dP->bitstream;  
+
+	// chroma DC uses its own coeff_token reader; no nnz prediction is involved
+	readSyntaxElement_NumCoeffTrailingOnesChromaDC(p_Vid, &currSE, currStream);
+
+	numcoeff        =  currSE.value1;
+	numtrailingones =  currSE.value2;
+
+	memzero64(levarr);
+	memzero64(runarr);
+
+	// numones is maintained below but never read in this function
+	numones = numtrailingones;
+	*number_coefficients = numcoeff;
+
+	if (numcoeff)
+	{
+		if (numtrailingones)
+		{      
+			// one sign bit per trailing one, read as a single fixed-length code
+			code = readSyntaxElement_FLC (currStream, numtrailingones);
+
+			// highest coefficient index takes the highest bit of code
+			ntr = numtrailingones;
+			for (k = numcoeff - 1; k > numcoeff - 1 - numtrailingones; k--)
+			{
+				ntr --;
+				levarr[k] = (code>>ntr)&1 ? -1 : 1;
+			}
+		}
+
+		// decode levels
+		// the first non-trailing level gets its magnitude raised by one,
+		// except when numcoeff > 3 and all three trailing ones were used
+		level_two_or_higher = (numcoeff > 3 && numtrailingones == 3)? 0 : 1;
+		vlcnum = (numcoeff > 10 && numtrailingones < 3) ? 1 : 0;
+
+		for (k = numcoeff - 1 - numtrailingones; k >= 0; k--)
+		{
+			int level;
+			if (vlcnum == 0)
+				level=readSyntaxElement_Level_VLC0(currStream);
+			else
+				level=readSyntaxElement_Level_VLCN(vlcnum, currStream);
+
+			if (level_two_or_higher)
+			{
+				level += (level > 0) ? 1 : -1;
+				level_two_or_higher = 0;
+			}
+
+			levarr[k] = level;
+			abslevel = iabs(levarr[k]);
+			if (abslevel  == 1)
+				++numones;
+
+			// update VLC table
+			if (abslevel  > incVlc[vlcnum])
+				++vlcnum;
+
+			// a first level with magnitude above 3 forces table 2
+			if (k == numcoeff - 1 - numtrailingones && abslevel >3)
+				vlcnum = 2;      
+		}
+
+		// total_zeros is only read when the block is not full
+		if (numcoeff < max_coeff_num)
+		{
+			// decode total run
+			vlcnum = numcoeff - 1;
+			totzeros = readSyntaxElement_TotalZerosChromaDC(p_Vid, currStream, vlcnum);
+		}
+		else
+		{
+			totzeros = 0;
+		}
+
+		// decode run before each coefficient
+		zerosleft = totzeros;
+		i = numcoeff - 1;
+
+		if (zerosleft > 0 && i > 0)
+		{
+			do
+			{
+				// select VLC for runbefore
+				vlcnum = imin(zerosleft - 1, RUNBEFORE_NUM_M1);
+
+				runarr[i] = readSyntaxElement_Run(currStream, vlcnum);
+
+				zerosleft -= runarr[i];
+				i --;
+			} while (zerosleft != 0 && i != 0);
+		}
+		// whatever zeros are left belong to the coefficient the loop stopped at
+		runarr[i] = zerosleft;    
+	} // if numcoeff
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reads the coefficients of one 4x4 block of the given type (CAVLC)
+*
+*    LUMA, CHROMA_DC and CHROMA_AC are forwarded to their dedicated
+*    readers.  The remaining types (16x16 intra DC/AC and the CB / CR
+*    block types) are parsed here, writing the coefficient count to
+*    nz_coeff plane 0, 1 or 2 according to the type.
+*
+* \param currMB
+*    macroblock being decoded (supplies slice, picture state, MB address)
+* \param block_type
+*    one of the block type constants handled by the switch below
+* \param i, j
+*    column / row of the block; index nz_coeff[mb][plane][j][i] and are
+*    passed (shifted left by 2) to the nnz predictor.  Note that i is
+*    reused as a scratch index once the runs are decoded.
+* \param levarr
+*    receives the decoded levels (cleared first)
+* \param runarr
+*    receives the zero runs (cleared first)
+* \param number_coefficients
+*    receives the coefficient count read from the bitstream
+************************************************************************
+*/
+static void readCoeff4x4_CAVLC(Macroblock *currMB, int block_type, int i, int j, int levarr[16], int runarr[16], int *number_coefficients)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int mb_nr = currMB->mbAddrX;
+	SyntaxElement currSE;
+	DataPartition *dP;
+	Bitstream *currStream;
+
+	int k, code, vlcnum;
+	int numcoeff = 0, numtrailingones, numcoeff_vlc;
+	int level_two_or_higher;
+	int numones, totzeros, abslevel;
+	int zerosleft, ntr, dptype = 0;
+	// initialised so the value is defined even if error() returns from the
+	// default case below; every valid case overwrites it
+	int max_coeff_num = 16, nnz;
+	// thresholds above which the level VLC table index is incremented;
+	// the last entry can never be exceeded, so vlcnum stays <= 6
+	static const int incVlc[] = {0,3,6,12,24,48,32768};    // maximum vlc = 6
+
+	// per type: block capacity, syntax element class (selects the data
+	// partition) and the nz_coeff plane that is reset
+	switch (block_type)
+	{
+	case LUMA:
+		readCoeff4x4_CAVLC_Luma(currMB, i, j, levarr, runarr, number_coefficients);
+		return;
+	case LUMA_INTRA16x16DC:
+		max_coeff_num = 16;
+		TRACE_PRINTF("Lum16DC");
+		dptype = SE_LUM_DC_INTRA;
+		p_Vid->nz_coeff[mb_nr][0][j][i] = 0; 
+		break;
+	case LUMA_INTRA16x16AC:
+		max_coeff_num = 15;
+		TRACE_PRINTF("Lum16AC");
+		dptype = SE_LUM_AC_INTRA;
+		p_Vid->nz_coeff[mb_nr][0][j][i] = 0; 
+		break;
+	case CB:
+		max_coeff_num = 16;
+		TRACE_PRINTF("Luma_add1");
+		dptype = (IS_INTRA (currMB)) ? SE_LUM_AC_INTRA : SE_LUM_AC_INTER;
+		p_Vid->nz_coeff[mb_nr][1][j][i] = 0; 
+		break;
+	case CB_INTRA16x16DC:
+		max_coeff_num = 16;
+		TRACE_PRINTF("Luma_add1_16DC");
+		dptype = SE_LUM_DC_INTRA;
+		p_Vid->nz_coeff[mb_nr][1][j][i] = 0; 
+		break;
+	case CB_INTRA16x16AC:
+		max_coeff_num = 15;
+		TRACE_PRINTF("Luma_add1_16AC");
+		dptype = SE_LUM_AC_INTRA;
+		p_Vid->nz_coeff[mb_nr][1][j][i] = 0; 
+		break;
+	case CR:
+		max_coeff_num = 16;
+		TRACE_PRINTF("Luma_add2");
+		dptype = (IS_INTRA (currMB)) ? SE_LUM_AC_INTRA : SE_LUM_AC_INTER;
+		p_Vid->nz_coeff[mb_nr][2][j][i] = 0; 
+		break;
+	case CR_INTRA16x16DC:
+		max_coeff_num = 16;
+		TRACE_PRINTF("Luma_add2_16DC");
+		dptype = SE_LUM_DC_INTRA;
+		p_Vid->nz_coeff[mb_nr][2][j][i] = 0; 
+		break;
+	case CR_INTRA16x16AC:
+		max_coeff_num = 15;
+		// NOTE(review): label says "add1" although this is the CR case -- kept unchanged
+		TRACE_PRINTF("Luma_add1_16AC");
+		dptype = SE_LUM_AC_INTRA;
+		p_Vid->nz_coeff[mb_nr][2][j][i] = 0; 
+		break;        
+	case CHROMA_DC:
+		readCoeff4x4_CAVLC_ChromaDC(currMB, i, j, levarr, runarr, number_coefficients);
+		return;
+	case CHROMA_AC:
+		readCoeff4x4_CAVLC_ChromaAC(currMB, i, j, levarr, runarr, number_coefficients);
+		return;
+	default:
+		error ("readCoeff4x4_CAVLC: invalid block type", 600);
+		p_Vid->nz_coeff[mb_nr][0][j][i] = 0; 
+		break;
+	}
+
+	dP = &(currSlice->partArr[assignSE2partition[currSlice->dp_mode][dptype]]);
+	currStream = dP->bitstream;  
+
+	// predict the coefficient count from the plane the block belongs to
+	if(block_type==LUMA_INTRA16x16DC || block_type==LUMA_INTRA16x16AC)
+	{
+		nnz = predict_nnz_luma_intra(currMB, i<<2, j<<2);
+	}
+	else if (block_type==CB || block_type==CB_INTRA16x16DC || block_type==CB_INTRA16x16AC)
+	{   
+		nnz = predict_nnz_cb(currMB, i<<2, j<<2);
+	}
+	else
+	{ 
+		nnz = predict_nnz_cr(currMB, i<<2, j<<2);
+	}
+
+	// the predicted count selects one of four coeff_token VLC tables
+	if (nnz < 2)
+	{
+		numcoeff_vlc = 0;
+	}
+	else if (nnz < 4)
+	{
+		numcoeff_vlc = 1;
+	}
+	else if (nnz < 8)
+	{
+		numcoeff_vlc = 2;
+	}
+	else //
+	{
+		numcoeff_vlc = 3;
+	}
+
+	readSyntaxElement_NumCoeffTrailingOnes(&currSE, currStream, numcoeff_vlc);
+
+	numcoeff        =  currSE.value1;
+	numtrailingones =  currSE.value2;
+
+	// remember the count in the plane matching the block type
+	if(block_type==LUMA_INTRA16x16DC || block_type==LUMA_INTRA16x16AC)
+		p_Vid->nz_coeff[mb_nr][0][j][i] = (byte) numcoeff;
+	else if (block_type==CB || block_type==CB_INTRA16x16DC || block_type==CB_INTRA16x16AC)
+		p_Vid->nz_coeff[mb_nr][1][j][i] = (byte) numcoeff;
+	else
+		p_Vid->nz_coeff[mb_nr][2][j][i] = (byte) numcoeff;        
+
+
+	memzero64(levarr);
+	memzero64(runarr);
+
+	// numones is maintained below but never read in this function
+	numones = numtrailingones;
+	*number_coefficients = numcoeff;
+
+	if (numcoeff)
+	{
+		if (numtrailingones)
+		{      
+			// one sign bit per trailing one, read as a single fixed-length code
+			code = readSyntaxElement_FLC(currStream, numtrailingones);
+
+			// highest coefficient index takes the highest bit of code
+			ntr = numtrailingones;
+			for (k = numcoeff - 1; k > numcoeff - 1 - numtrailingones; k--)
+			{
+				ntr --;
+				levarr[k] = (code>>ntr)&1 ? -1 : 1;
+			}
+		}
+
+		// decode levels
+		// the first non-trailing level gets its magnitude raised by one,
+		// except when numcoeff > 3 and all three trailing ones were used
+		level_two_or_higher = (numcoeff > 3 && numtrailingones == 3)? 0 : 1;
+		vlcnum = (numcoeff > 10 && numtrailingones < 3) ? 1 : 0;
+
+		for (k = numcoeff - 1 - numtrailingones; k >= 0; k--)
+		{
+			int level;
+			if (vlcnum == 0)
+				level=readSyntaxElement_Level_VLC0(currStream);
+			else
+				level=readSyntaxElement_Level_VLCN(vlcnum, currStream);
+
+			if (level_two_or_higher)
+			{
+				level += (level > 0) ? 1 : -1;
+				level_two_or_higher = 0;
+			}
+
+			levarr[k] = level;
+			abslevel = iabs(levarr[k]);
+			if (abslevel  == 1)
+				++numones;
+
+			// update VLC table
+			if (abslevel  > incVlc[vlcnum])
+				++vlcnum;
+
+			// a first level with magnitude above 3 forces table 2
+			if (k == numcoeff - 1 - numtrailingones && abslevel >3)
+				vlcnum = 2;      
+		}
+
+		// total_zeros is only read when the block is not full
+		if (numcoeff < max_coeff_num)
+		{
+			// decode total run
+			vlcnum = numcoeff - 1;
+			totzeros = readSyntaxElement_TotalZeros(currStream, vlcnum);
+		}
+		else
+		{
+			totzeros = 0;
+		}
+
+		// decode run before each coefficient
+		zerosleft = totzeros;
+		i = numcoeff - 1;
+
+		if (zerosleft > 0 && i > 0)
+		{
+			do
+			{
+				// select VLC for runbefore
+				vlcnum = imin(zerosleft - 1, RUNBEFORE_NUM_M1);
+
+				runarr[i] = readSyntaxElement_Run(currStream, vlcnum);
+
+				zerosleft -= runarr[i];
+				i --;
+			} while (zerosleft != 0 && i != 0);
+		}
+		// whatever zeros are left belong to the coefficient the loop stopped at
+		runarr[i] = zerosleft;    
+	} // if numcoeff
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reads the AC coefficients (run/level pairs) of the four 4x4 blocks
+*    of one 8x8 sub-macroblock of an Intra16x16 macroblock (CABAC).
+*
+*    Scanning starts at position 1 of the scan table, i.e. the first
+*    scan position is skipped.  cbp_blk is accepted for signature parity
+*    with readCompCoeff4x4SMB_CABAC but is not touched here.
+*
+* \param currMB   macroblock being decoded; subblock_x/_y are updated
+* \param context  CABAC block context passed through to the reader
+* \param blocks   four consecutive 4x4 coefficient blocks to fill
+* \param block_y  vertical offset of the sub-macroblock
+* \param block_x  horizontal offset of the sub-macroblock
+* \param cbp_blk  unused
+************************************************************************
+*/
+static void readCompCoeff4x4SMB_I16MB_CABAC(Macroblock *currMB, int context, h264_short_block_t *blocks, int block_y, int block_x, int64 *cbp_blk)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	const byte *se_to_partition = assignSE2partition[currSlice->dp_mode];
+	// intra and inter coded luminance coefficients use different syntax element classes
+	const int se_type = (currMB->is_intra_block ? SE_LUM_AC_INTRA : SE_LUM_AC_INTER);
+	DecodingEnvironment *cabac = &currSlice->partArr[se_to_partition[se_type]].de_cabac;
+	// frame scan unless the picture or the macroblock is field coded
+	const byte *scan_table = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN_1D : FIELD_SCAN_1D;
+	int dy, dx;
+
+	for (dy = 0; dy < BLOCK_SIZE_8x8; dy += BLOCK_SIZE)
+	{
+		currMB->subblock_y = block_y + dy; // position for coeff_count ctx
+
+		for (dx = 0; dx < BLOCK_SIZE_8x8; dx += BLOCK_SIZE)
+		{
+			int16_t *coeffs = (int16_t *)(*blocks++);
+			const byte *scan_pos = scan_table + 1;
+			int remaining = 16;
+
+			currMB->subblock_x = block_x + dx; // position for coeff_count ctx
+
+			// at most 16 reads; a zero level terminates the block
+			while (remaining-- > 0)
+			{
+				RunLevel rl = readRunLevel_CABAC(currMB, cabac, context);
+
+				if (rl.level == 0)
+					break;
+
+				scan_pos += rl.run;
+				coeffs[*scan_pos++] = rl.level;
+			}
+		}
+	}
+}
+
+/*
+* Reads one 4x4 block: the first run/level pair through de_cabac_dc, then up
+* to 16 further pairs through de_cabac_ac, storing each level at the scan
+* position reached.  Returns 1 if at least one level was stored, 0 if the
+* very first level read was zero (nothing is written in that case).
+*/
+static int readDcAcRunLevels4x4_CABAC(Macroblock *currMB, DecodingEnvironment *de_cabac_dc, DecodingEnvironment *de_cabac_ac, int context, const byte *scan, int16_t *coeffs)
+{
+	int remaining;
+	RunLevel rl = readRunLevel_CABAC(currMB, de_cabac_dc, context);
+
+	if (rl.level == 0)
+		return 0;
+
+	scan += rl.run;
+	coeffs[*scan++] = rl.level;
+
+	for (remaining = 16; remaining > 0; --remaining)
+	{
+		rl = readRunLevel_CABAC(currMB, de_cabac_ac, context);
+		if (rl.level == 0)
+			break;
+
+		scan += rl.run;
+		coeffs[*scan++] = rl.level;
+	}
+	return 1;
+}
+
+/*
+* Reads the four 4x4 blocks of one 8x8 sub-macroblock (CABAC), in the order
+* top-left, top-right, bottom-left, bottom-right.  For every block that holds
+* at least one coefficient the matching bit of *cbp_blk is set: bits 0, 1, 4
+* and 5, shifted left by block_y + (block_x >> 2).
+*
+* On 32-bit x86 the 64-bit flag word is updated through a 32-bit pointer to
+* its start; all bits written here fit in the low 16 bits.
+*/
+#ifdef _M_IX86
+static void readCompCoeff4x4SMB_CABAC(Macroblock *currMB, int context, h264_short_block_t *blocks, int block_y, int block_x, int64_t *cbp_blk64)
+#else
+static void readCompCoeff4x4SMB_CABAC(Macroblock *currMB, int context, h264_short_block_t *blocks, int block_y, int block_x, int64_t *cbp_blk)
+#endif
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	// frame scan unless the picture or the macroblock is field coded
+	const byte *scan_table = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN_1D : FIELD_SCAN_1D;
+	const int bit_base = block_y + (block_x >> 2);
+	DecodingEnvironment *dc_env, *ac_env;
+#ifdef _M_IX86
+	int32_t *cbp_blk = (int32_t *)cbp_blk64; 
+#endif
+
+	// intra and inter coded luminance coefficients use different syntax element classes
+	if (currMB->is_intra_block)
+	{
+		dc_env = &currSlice->partArr[partMap[SE_LUM_DC_INTRA]].de_cabac;
+		ac_env = &currSlice->partArr[partMap[SE_LUM_AC_INTRA]].de_cabac;
+	}
+	else
+	{
+		dc_env = &currSlice->partArr[partMap[SE_LUM_DC_INTER]].de_cabac;
+		ac_env = &currSlice->partArr[partMap[SE_LUM_AC_INTER]].de_cabac;
+	}
+
+	// top-left
+	currMB->subblock_y = block_y; // position for coeff_count ctx
+	currMB->subblock_x = block_x; // position for coeff_count ctx
+	if (readDcAcRunLevels4x4_CABAC(currMB, dc_env, ac_env, context, scan_table, (int16_t *)(blocks[0])))
+		*cbp_blk |= 1 << bit_base;
+
+	// top-right
+	currMB->subblock_x += 4; // position for coeff_count ctx
+	if (readDcAcRunLevels4x4_CABAC(currMB, dc_env, ac_env, context, scan_table, (int16_t *)(blocks[1])))
+		*cbp_blk |= 2 << bit_base;
+
+	// bottom-left
+	currMB->subblock_y += 4; // position for coeff_count ctx
+	currMB->subblock_x = block_x; // position for coeff_count ctx
+	if (readDcAcRunLevels4x4_CABAC(currMB, dc_env, ac_env, context, scan_table, (int16_t *)(blocks[2])))
+		*cbp_blk |= 16 << bit_base;
+
+	// bottom-right
+	currMB->subblock_x += 4; // position for coeff_count ctx
+	if (readDcAcRunLevels4x4_CABAC(currMB, dc_env, ac_env, context, scan_table, (int16_t *)(blocks[3])))
+		*cbp_blk |= 32 << bit_base;
+}
+
+// NOTE(review): _M_IX64 is not a macro used anywhere else in this file (the
+// x64 path in H264_Init tests _M_X64) -- confirm which builds are meant to
+// take the C version below
+#if defined(_DEBUG) || defined(_M_IX64)
+/*
+* Dequantises four consecutive 4x4 blocks in place: every non-zero
+* coefficient is multiplied by the scale factor for its position, shifted
+* left by qp_per and passed through rshift_rnd_sf(..., 4).
+*/
+static void inv_level_coefficients(h264_short_block_t *blocks, const int (*InvLevelScale)[4], int qp_per)
+{
+	int blk, row, col;
+
+	for (blk = 0; blk < 4; ++blk)
+	{
+		h264_short_block_row_t *rows = blocks[blk];
+
+		for (row = 0; row < 4; ++row)
+		{
+			for (col = 0; col < 4; ++col)
+			{
+				// zero coefficients stay zero; skip the arithmetic
+				if (rows[row][col])
+					rows[row][col] = rshift_rnd_sf((rows[row][col] * InvLevelScale[row][col]) << qp_per, 4);
+			}
+		}
+	}
+}
+#else
+// defined outside this translation unit in the remaining configurations
+void inv_level_coefficients(h264_short_block_t *blocks, const int (*InvLevelScale)[4], int qp_per);
+#endif
+
+/*
+* Dequantises the AC coefficients of four consecutive 4x4 blocks in place.
+* Works like inv_level_coefficients except that position [0][0] of each
+* block is left untouched.
+*/
+static void inv_level_coefficients_AC(h264_short_block_t *blocks, const int (*InvLevelScale)[4], int qp_per)
+{
+	int blk, row, col;
+
+	for (blk = 0; blk < 4; ++blk)
+	{
+		h264_short_block_row_t *rows = blocks[blk];
+
+		for (row = 0; row < 4; ++row)
+		{
+			// the first row starts at column 1 so that [0][0] is skipped
+			for (col = (row == 0) ? 1 : 0; col < 4; ++col)
+			{
+				if (rows[row][col])
+					rows[row][col] = rshift_rnd_sf((rows[row][col] * InvLevelScale[row][col]) << qp_per, 4);
+			}
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reads the coefficients (run/level) of all coded 4x4 blocks of one
+*    colour plane of a macroblock from the NAL (CABAC mode).
+*
+*    Each of the four 8x8 quadrants is processed only if its bit in cbp
+*    is set.  Intra16x16 macroblocks use the AC-only reader and AC-only
+*    dequantiser; all others use the full ones.  Dequantisation is
+*    skipped for lossless macroblocks.
+*
+* \param currMB            macroblock being decoded
+* \param pl                colour plane to read
+* \param intra             stored into currMB->is_intra_block
+* \param InvLevelScale4x4  4x4 scale factors for dequantisation
+* \param qp_per            left shift applied during dequantisation
+* \param cbp               coded block pattern; bits 0..3 select quadrants
+************************************************************************
+*/
+static void readCompCoeff4x4MB_CABAC(Macroblock *currMB, ColorPlane pl, int intra, int (*InvLevelScale4x4)[4], int qp_per, int cbp)
+{
+	// offsets of the four 8x8 quadrants, in cbp bit order
+	static const int quad_y[4] = { 0, 0, 8, 8 };
+	static const int quad_x[4] = { 0, 8, 0, 8 };
+
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	h264_short_block_t *blocks = currSlice->cof4[pl];
+	int64 *cbp_blk = &currMB->cbp_blk[pl];
+	const int is_i16 = IS_I16MB (currMB)? 1 : 0; 
+	int dequantize;
+	int context;
+	int q;
+
+	currMB->is_intra_block = intra;  	
+	dequantize = (currMB->is_lossless == FALSE);
+
+	// the CABAC block context depends on the plane and on the MB type
+	if( pl == PLANE_Y || IS_INDEPENDENT(p_Vid) )
+		context = (is_i16 ? LUMA_16AC: LUMA_4x4);
+	else if (pl == PLANE_U)
+		context = (is_i16 ? CB_16AC: CB_4x4);
+	else
+		context = (is_i16 ? CR_16AC: CR_4x4);  
+
+	for (q = 0; q < 4; ++q)
+	{
+		h264_short_block_t *quad;
+
+		if (!(cbp & (1 << q)))  // no coefficients in this quadrant
+			continue;
+
+		quad = &blocks[4 * q];
+
+		if (is_i16)
+		{
+			readCompCoeff4x4SMB_I16MB_CABAC(currMB, context, quad, quad_y[q], quad_x[q], cbp_blk);
+			if (dequantize)
+				inv_level_coefficients_AC(quad, InvLevelScale4x4, qp_per);
+		}
+		else
+		{
+			readCompCoeff4x4SMB_CABAC(currMB, context, quad, quad_y[q], quad_x[q], cbp_blk);
+			if (dequantize)
+				inv_level_coefficients(quad, InvLevelScale4x4, qp_per);
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get coefficients (run/level) of one 8x8 block
+*    from the NAL (CABAC Mode), lossless variant: levels are stored
+*    as read, without dequantisation.
+*
+* \param currMB  macroblock being decoded; is_intra_block, subblock_x/_y
+*                and cbp_blk[pl] are updated
+* \param pl      colour plane to read
+* \param b8      index (0..3) of the 8x8 block inside the macroblock
+************************************************************************
+*/
+static void readCompCoeff8x8_CABAC_Lossless(Macroblock *currMB, ColorPlane pl, int b8)
+{
+	if (currMB->cbp & (1<<b8))  // are there any coefficients in the current block
+	{
+		VideoParameters *p_Vid = currMB->p_Vid;
+		Slice *currSlice = currMB->p_Slice;
+		const byte *partMap = assignSE2partition[currSlice->dp_mode];
+		DecodingEnvironment *cabac_dc, *cabac_ac;
+		short *tcoeffs;
+		RunLevel rl;
+		int scan;
+		int k;
+		int context;
+
+		int cbp_mask = (int64) 51 << (4 * b8 - 2 * (b8 & 0x01)); // corresponds to 110011, as if all four 4x4 blocks contain coeff, shifted to block position            
+		int64 *cur_cbp = &currMB->cbp_blk[pl];
+
+		// select scan type
+		const byte *pos_scan8x8 = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN8x8_1D : FIELD_SCAN8x8_1D;
+
+		currMB->is_intra_block = IS_INTRA(currMB);
+
+		// === set offset in current macroblock ===
+		tcoeffs = (short *)(currSlice->mb_rres8[pl][b8]);
+
+		currMB->subblock_x = (b8&0x01) << 3; // position for coeff_count ctx
+		currMB->subblock_y = (b8 >> 1) << 3; // position for coeff_count ctx
+
+		if (pl==PLANE_Y || IS_INDEPENDENT(p_Vid))  
+			context = LUMA_8x8;
+		else if (pl==PLANE_U)
+			context = CB_8x8;
+		else
+			context = CR_8x8;  
+
+		/*
+		* make distinction between INTRA and INTER coded
+		* luminance coefficients; the partitions do not change while
+		* the block is read, so they are looked up once
+		*/
+		if (currMB->is_intra_block == 1)
+		{
+			cabac_dc = &(currSlice->partArr[partMap[SE_LUM_DC_INTRA]].de_cabac);
+			cabac_ac = &(currSlice->partArr[partMap[SE_LUM_AC_INTRA]].de_cabac);
+		}
+		else
+		{
+			cabac_dc = &(currSlice->partArr[partMap[SE_LUM_DC_INTER]].de_cabac);
+			cabac_ac = &(currSlice->partArr[partMap[SE_LUM_AC_INTER]].de_cabac);
+		}
+
+		for(k=0; (k < 65);++k)
+		{
+			//============ read =============
+			// the first pair comes from the DC partition, all later ones from AC
+			rl = readRunLevel_CABAC(currMB, (k==0) ? cabac_dc : cabac_ac, context);
+
+			//============ decode =============
+			if (rl.level == 0)    /* leave if level == 0 */
+				break;
+
+			pos_scan8x8 += rl.run;
+
+			scan = *pos_scan8x8++;
+
+			*cur_cbp |= cbp_mask;
+
+			tcoeffs[scan] = rl.level;
+		}
+	}
+}
+
+
+/*
+* Reads and dequantises one 8x8 block of an intra macroblock (CABAC).
+* Does nothing when bit b8 of currMB->cbp is clear.  The first run/level
+* pair is read from the intra DC partition; if its level is non-zero the
+* block's bits are set in cbp_blk[pl] and up to 64 further pairs are read
+* from the intra AC partition.
+*/
+static void readCompCoeff8x8_CABAC_Intra(Macroblock *currMB, ColorPlane pl, int b8)
+{
+	VideoParameters *p_Vid;
+	Slice *currSlice;
+	const byte *se_to_partition;
+	const byte *scan_ptr;
+	const int *dequant;
+	DecodingEnvironment *decoder;
+	int64 *coded_bits;
+	short *coeffs;
+	RunLevel rl;
+	int scaling_plane, shift_per, rem_idx;
+	int coded_mask, ctx, pos, remaining;
+
+	if (!(currMB->cbp & (1<<b8)))  // no coefficients in the current block
+		return;
+
+	p_Vid = currMB->p_Vid;
+	currSlice = currMB->p_Slice;
+	se_to_partition = assignSE2partition[currSlice->dp_mode];
+	scaling_plane = IS_INDEPENDENT(p_Vid) ? p_Vid->colour_plane_id : pl;
+
+	// 110011 pattern, as if all four 4x4 blocks contain coefficients, shifted to the block position
+	coded_mask = (int64) 51 << (4 * b8 - 2 * (b8 & 0x01));
+	coded_bits = &currMB->cbp_blk[pl];
+
+	// frame scan unless the picture or the macroblock is field coded
+	scan_ptr = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN8x8_1D : FIELD_SCAN8x8_1D;
+
+	shift_per = p_Vid->qp_per_matrix[ currMB->qp_scaled[pl] ];
+	rem_idx = p_Vid->qp_rem_matrix[ currMB->qp_scaled[pl] ];
+	dequant = currSlice->InvLevelScale8x8_Intra[scaling_plane][rem_idx];
+
+	currMB->is_intra_block = 1;
+
+	// destination coefficients of this 8x8 block
+	coeffs = (short *)(currSlice->mb_rres8[pl][b8]);
+
+	currMB->subblock_x = (b8&0x01) << 3; // position for coeff_count ctx
+	currMB->subblock_y = (b8 >> 1) << 3; // position for coeff_count ctx
+
+	ctx = (pl==PLANE_Y || IS_INDEPENDENT(p_Vid)) ? LUMA_8x8 : ((pl==PLANE_U) ? CB_8x8 : CR_8x8);
+
+	// first pair: DC partition
+	decoder = &(currSlice->partArr[se_to_partition[SE_LUM_DC_INTRA]].de_cabac);
+	rl = readRunLevel_CABAC(currMB, decoder, ctx);
+	if (rl.level == 0)    /* empty block */
+		return;
+
+	*coded_bits |= coded_mask; 
+
+	scan_ptr += rl.run;
+	pos = *scan_ptr++;
+	coeffs[pos] = rshift_rnd_sf((rl.level * dequant[pos]) << shift_per, 6); // dequantization
+
+	// remaining pairs: AC partition, at most 64 reads
+	decoder = &(currSlice->partArr[se_to_partition[SE_LUM_AC_INTRA]].de_cabac);
+	for (remaining = 64; remaining != 0; --remaining)
+	{
+		rl = readRunLevel_CABAC(currMB, decoder, ctx);
+		if (rl.level == 0)    /* end of block */
+			break;
+
+		scan_ptr += rl.run;
+		pos = *scan_ptr++;
+		coeffs[pos] = rshift_rnd_sf((rl.level * dequant[pos]) << shift_per, 6); // dequantization
+	}
+}
+
+
+
+/*
+* Reads and dequantises one 8x8 block of an inter macroblock (CABAC).
+* Does nothing when bit b8 of currMB->cbp is clear.  The first run/level
+* pair is read from the inter DC partition; if its level is non-zero the
+* block's bits are set in cbp_blk[pl] and up to 64 further pairs are read
+* from the inter AC partition.
+*/
+static void readCompCoeff8x8_CABAC_Inter(Macroblock *currMB, ColorPlane pl, int b8)
+{
+	VideoParameters *p_Vid;
+	Slice *currSlice;
+	const byte *se_to_partition;
+	const byte *scan_ptr;
+	const int *dequant;
+	DecodingEnvironment *decoder;
+	int64 *coded_bits;
+	short *coeffs;
+	RunLevel rl;
+	int scaling_plane, shift_per, rem_idx;
+	int coded_mask, ctx, pos, remaining;
+
+	if (!(currMB->cbp & (1<<b8)))  // no coefficients in the current block
+		return;
+
+	p_Vid = currMB->p_Vid;
+	currSlice = currMB->p_Slice;
+	se_to_partition = assignSE2partition[currSlice->dp_mode];
+	scaling_plane = IS_INDEPENDENT(p_Vid) ? p_Vid->colour_plane_id : pl;
+
+	// 110011 pattern, as if all four 4x4 blocks contain coefficients, shifted to the block position
+	coded_mask = (int64) 51 << (4 * b8 - 2 * (b8 & 0x01));
+	coded_bits = &currMB->cbp_blk[pl];
+
+	// frame scan unless the picture or the macroblock is field coded
+	scan_ptr = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN8x8_1D : FIELD_SCAN8x8_1D;
+
+	shift_per = p_Vid->qp_per_matrix[ currMB->qp_scaled[pl] ];
+	rem_idx = p_Vid->qp_rem_matrix[ currMB->qp_scaled[pl] ];
+	dequant = currSlice->InvLevelScale8x8_Inter[scaling_plane][rem_idx];
+
+	currMB->is_intra_block = 0;
+
+	// destination coefficients of this 8x8 block
+	coeffs = (short *)(currSlice->mb_rres8[pl][b8]);
+
+	currMB->subblock_x = (b8&0x01) << 3; // position for coeff_count ctx
+	currMB->subblock_y = (b8 >> 1) << 3; // position for coeff_count ctx
+
+	ctx = (pl==PLANE_Y || IS_INDEPENDENT(p_Vid)) ? LUMA_8x8 : ((pl==PLANE_U) ? CB_8x8 : CR_8x8);
+
+	// first pair: DC partition
+	decoder = &(currSlice->partArr[se_to_partition[SE_LUM_DC_INTER]].de_cabac);
+	rl = readRunLevel_CABAC(currMB, decoder, ctx);
+	if (rl.level == 0)    /* empty block */
+		return;
+
+	*coded_bits |= coded_mask; 
+
+	scan_ptr += rl.run;
+	pos = *scan_ptr++;
+	coeffs[pos] = rshift_rnd_sf((rl.level * dequant[pos]) << shift_per, 6); // dequantization
+
+	// remaining pairs: AC partition, at most 64 reads
+	decoder = &(currSlice->partArr[se_to_partition[SE_LUM_AC_INTER]].de_cabac);
+	for (remaining = 64; remaining != 0; --remaining)
+	{
+		rl = readRunLevel_CABAC(currMB, decoder, ctx);
+		if (rl.level == 0)    /* end of block */
+			break;
+
+		scan_ptr += rl.run;
+		pos = *scan_ptr++;
+		coeffs[pos] = rshift_rnd_sf((rl.level * dequant[pos]) << shift_per, 6); // dequantization
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Read the coefficients (run/level) of the four 8x8 blocks of one
+*    colour plane of a macroblock from the NAL (CABAC mode)
+*
+* \param currMB
+*    macroblock being decoded
+* \param pl
+*    colour plane whose coefficients are read
+*
+* \note
+*    The per-block reader is chosen once for the whole macroblock: the
+*    lossless reader when currMB->is_lossless is set, otherwise the
+*    intra or the inter reader depending on IS_INTRA(currMB). Blocks
+*    are visited in the order b8 = 0, 1, 2, 3.
+************************************************************************
+*/
+static void readCompCoeff8x8MB_CABAC(Macroblock *currMB, ColorPlane pl)
+{
+	int b8;
+
+	if (currMB->is_lossless != FALSE)
+	{
+		// lossless macroblock
+		for (b8 = 0; b8 < 4; ++b8)
+			readCompCoeff8x8_CABAC_Lossless(currMB, pl, b8);
+	}
+	else if (IS_INTRA(currMB))
+	{
+		//======= 8x8 transform size & CABAC, intra ========
+		for (b8 = 0; b8 < 4; ++b8)
+			readCompCoeff8x8_CABAC_Intra(currMB, pl, b8);
+	}
+	else
+	{
+		//======= 8x8 transform size & CABAC, inter ========
+		for (b8 = 0; b8 < 4; ++b8)
+			readCompCoeff8x8_CABAC_Inter(currMB, pl, b8);
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get coefficients (run/level) of the 4x4 blocks of one colour plane
+*    of a macroblock from the NAL (CAVLC Mode)
+*
+* \param currMB
+*    macroblock being decoded
+* \param pl
+*    colour plane the coefficients belong to; selects the CAVLC context,
+*    the cbp_blk word that is updated and the cof4 plane written to
+* \param InvLevelScale4x4
+*    dequantization table, indexed [j0][i0]; only read when the
+*    macroblock is not lossless
+* \param qp_per
+*    left shift applied to level * scale before rounding; only used
+*    when the macroblock is not lossless
+* \param cbp
+*    coded block pattern; bit b8 set means 8x8 block b8 has coefficients
+* \param nzcoeff
+*    per-4x4-block table; the four entries of every 8x8 block whose cbp
+*    bit is clear are set to 0
+************************************************************************
+*/
+static void readCompCoeff4x4MB_CAVLC (Macroblock *currMB, ColorPlane pl, int (*InvLevelScale4x4)[4], int qp_per, int cbp, h264_4x4_byte nzcoeff)
+{
+	int block_y, block_x, b8;
+	int i, j, k;
+	int i0, j0;
+	__declspec(align(32)) int levarr[16], runarr[16];
+	int numcoeff;
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	// frame macroblocks use the zig-zag scan, field macroblocks the field scan
+	const byte (*pos_scan4x4)[2] = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN : FIELD_SCAN;
+	const byte *pos_scan_4x4 = pos_scan4x4[0];
+	// for Intra16x16 the first scan position is skipped (AC coefficients only)
+	int start_scan = IS_I16MB(currMB) ? 1 : 0;
+	int64 *cur_cbp = &currMB->cbp_blk[pl];
+	int coef_ctr, cur_context; 
+
+	memzero64(levarr);
+	memzero64(runarr);
+
+	// select the CAVLC block context from macroblock type and colour plane
+	if (IS_I16MB(currMB))
+	{
+		if (pl == PLANE_Y)
+			cur_context = LUMA_INTRA16x16AC;
+		else if (pl == PLANE_U)
+			cur_context = CB_INTRA16x16AC;
+		else
+			cur_context = CR_INTRA16x16AC;
+	}
+	else
+	{
+		if (pl == PLANE_Y)
+			cur_context = LUMA;
+		else if (pl == PLANE_U)
+			cur_context = CB;
+		else
+			cur_context = CR;
+	}
+
+	if (currMB->is_lossless == FALSE)
+	{
+		for (block_y = 0; block_y < 4; block_y += 2) /* all modes */
+		{
+			for (block_x = 0; block_x < 4; block_x += 2)
+			{
+				b8 = (block_y + (block_x >> 1));
+
+				if (cbp & (1 << b8))  // test if the block contains any coefficients
+				{
+					// in this branch i and j are sample offsets (0, 4, 8, 12)
+					for (j=block_y << 2; j < (block_y + 2) << 2; j += BLOCK_SIZE)
+					{
+						for (i=block_x << 2; i < (block_x + 2) << 2; i += BLOCK_SIZE)
+						{
+							readCoeff4x4_CAVLC(currMB, cur_context, i >> 2, j >> 2, levarr, runarr, &numcoeff);
+							pos_scan_4x4 = pos_scan4x4[start_scan];
+
+							for (k = 0; k < numcoeff; ++k)
+							{
+								if (levarr[k] != 0)
+								{
+									// skip the run of zeros, then fetch the (x, y) pair of this coefficient
+									pos_scan_4x4 += (runarr[k] << 1);
+
+									i0 = *pos_scan_4x4++;
+									j0 = *pos_scan_4x4++;
+
+									// j is 4 * block row, so j + (i >> 2) is the 4x4 block number
+									*cur_cbp |= (int64) 1 << (j + (i >> 2));
+
+									// inverse quant for 4x4 transform only
+									currSlice->cof4[pl][cof4_pos_to_subblock[j>>2][i>>2]][j0][i0]= rshift_rnd_sf((levarr[k] * InvLevelScale4x4[j0][i0])<<qp_per, 4);
+								}
+							}
+						}
+					}
+				}
+				else
+				{
+					// no coefficients in this 8x8 block: clear its four nzcoeff entries
+					for (j=0; j < 2; j++)
+					{
+						for (i=0;i<2;i++)
+						{
+							nzcoeff[block_y+j][block_x+i]=0;
+						}
+					}
+				}
+			}
+		}
+	}
+	else
+	{   
+		// lossless macroblock: levels are stored as read, without dequantization
+		for (block_y=0; block_y < 4; block_y += 2) /* all modes */
+		{
+			for (block_x=0; block_x < 4; block_x += 2)
+			{
+				b8 = 2*(block_y>>1) + (block_x>>1);
+
+				if (cbp & (1<<b8))  /* are there any coeff in current block at all */
+				{
+					// in this branch i and j are 4x4 block indices (0..3)
+					for (j=block_y; j < block_y+2; ++j)
+					{
+						for (i=block_x; i < block_x+2; ++i)
+						{
+							readCoeff4x4_CAVLC(currMB, cur_context, i, j, levarr, runarr, &numcoeff);
+
+							coef_ctr = start_scan - 1;
+
+							for (k = 0; k < numcoeff; ++k)
+							{
+								if (levarr[k] != 0)
+								{
+									coef_ctr += runarr[k]+1;
+
+									i0=pos_scan4x4[coef_ctr][0];
+									j0=pos_scan4x4[coef_ctr][1];
+
+									*cur_cbp |= (int64) 1 << ((j<<2) + i);
+									// i and j are block indices here, so they select the
+									// sub-block directly; shifting them right by 2 would
+									// send every block's coefficients to sub-block [0][0]
+									currSlice->cof4[pl][cof4_pos_to_subblock[j][i]][j0][i0]= levarr[k];
+								}
+							}
+						}
+					}
+				}
+				else
+				{
+					// no coefficients in this 8x8 block: clear its four nzcoeff entries
+					for (j=0; j < 2; j++)
+					{
+						for (i=0;i<2;i++)
+						{
+							nzcoeff[block_y+j][block_x+i]=0;
+						}
+					}
+				}
+			}
+		}
+	}  
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Get coefficients (run/level) of the 8x8 blocks of one colour plane
+*    of a macroblock from the NAL (CAVLC Mode)
+*
+*    Each 8x8 block is read as four interleaved 4x4 CAVLC blocks: the
+*    coefficient with scan counter c of sub-block s is placed at 8x8
+*    scan position 4 * c + s.
+*
+* \param currMB
+*    macroblock being decoded
+* \param pl
+*    colour plane the coefficients belong to
+* \param InvLevelScale8x8
+*    dequantization table indexed by 8x8 scan position; only read when
+*    the macroblock is not lossless
+* \param qp_per
+*    left shift applied to level * scale before rounding; only used
+*    when the macroblock is not lossless
+* \param cbp
+*    coded block pattern; bit b8 set means 8x8 block b8 has coefficients
+* \param nzcoeff
+*    per-4x4-block table; the entries of every 8x8 block whose cbp bit
+*    is clear are zeroed
+************************************************************************
+*/
+static void readCompCoeff8x8MB_CAVLC (Macroblock *currMB, ColorPlane pl, const int *InvLevelScale8x8, int qp_per, int cbp, h264_4x4_byte nzcoeff)
+{
+	__declspec(align(32)) int levarr[16] = {0}, runarr[16] = {0};
+	int numcoeff;
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int frame_scan = (p_Vid->structure == FRAME) && (!currMB->mb_field);
+	const byte *pos_scan8x8 = frame_scan ? SNGL_SCAN8x8_1D : FIELD_SCAN8x8_1D;
+	const int is_i16 = IS_I16MB(currMB);
+	// scan counter value before the first coefficient (start_scan - 1)
+	const int first_ctr = (is_i16 ? 1 : 0) - 1;
+	const int lossless = (currMB->is_lossless != FALSE);
+	int64 *cur_cbp = &currMB->cbp_blk[pl];
+	int cur_context;
+	int b8;
+
+	// CAVLC block context from colour plane and macroblock type
+	if (pl == PLANE_Y)
+		cur_context = is_i16 ? LUMA_INTRA16x16AC : LUMA;
+	else if (pl == PLANE_U)
+		cur_context = is_i16 ? CB_INTRA16x16AC : CB;
+	else
+		cur_context = is_i16 ? CR_INTRA16x16AC : CR;
+
+	for (b8 = 0; b8 < 4; ++b8) /* all modes */
+	{
+		// top-left 4x4 block (in block units) of this 8x8 block
+		const int block_y = (b8 >> 1) << 1;
+		const int block_x = (b8 & 1) << 1;
+		short *coefficients = (short *)(currSlice->mb_rres8[pl][b8]);
+		int sub;
+
+		if (!(cbp & (1 << b8)))
+		{
+			// no coefficients in this 8x8 block: clear both rows of its nzcoeff entries
+			memset(&nzcoeff[block_y][block_x], 0, 2 * sizeof(byte));
+			memset(&nzcoeff[block_y + 1][block_x], 0, 2 * sizeof(byte));
+			continue;
+		}
+
+		// sub = 2 * row + column of the 4x4 block inside the 8x8 block
+		for (sub = 0; sub < 4; ++sub)
+		{
+			int coef_ctr = first_ctr;
+			int k;
+
+			readCoeff4x4_CAVLC(currMB, cur_context, block_x + (sub & 1), block_y + (sub >> 1), levarr, runarr, &numcoeff);
+
+			for (k = 0; k < numcoeff; ++k)
+			{
+				int scan;
+
+				if (levarr[k] == 0)
+					continue;
+
+				coef_ctr += runarr[k] + 1;
+
+				// do same as CABAC for deblocking: any coeff in the 8x8 marks all the 4x4s
+				// as containing coefficients
+				*cur_cbp |= 51 << ((block_y << 2) + block_x);
+
+				scan = pos_scan8x8[(coef_ctr << 2) + sub];
+
+				if (lossless)
+				{
+					// lossless macroblock: store the level as read
+					coefficients[scan] = levarr[k];
+				}
+				else
+				{
+					coefficients[scan] = rshift_rnd_sf((levarr[k] * InvLevelScale8x8[scan])<<qp_per, 6); // dequantization
+				}
+			}
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Data partitioning: propagate the dpl_flag of the left and upper
+*    neighbouring macroblocks into the current macroblock.
+*
+*    Applies to inter macroblocks, and to intra macroblocks when the
+*    active PPS does not set constrained_intra_pred_flag. For every
+*    available neighbour its dpl_flag is OR-ed into currMB->dpl_flag.
+*
+* \param currMB
+*    macroblock whose dpl_flag is updated
+************************************************************************
+*/
+static void check_dp_neighbors (Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	PixelPos nb_left, nb_up;
+
+	// nothing to do unless the MB is inter, or intra without constrained intra prediction
+	if (!IS_INTER (currMB) && !(IS_INTRA (currMB) && !(p_Vid->active_pps->constrained_intra_pred_flag)))
+		return;
+
+	p_Vid->getNeighbourLeft(currMB, p_Vid->mb_size[1], &nb_left);
+	p_Vid->getNeighbourUp(currMB, p_Vid->mb_size[1], &nb_up);
+
+	if (nb_left.available)
+		currMB->dpl_flag |= p_Vid->mb_data[nb_left.mb_addr].dpl_flag;
+
+	if (nb_up.available)
+		currMB->dpl_flag |= p_Vid->mb_data[nb_up.mb_addr].dpl_flag;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Get coded block pattern and coefficients (run/level)
+*    from the NAL (CABAC mode)
+*
+*    Order of operations:
+*      - for non-Intra16x16 macroblocks: read the CBP, optionally the
+*        transform_size_8x8_flag, and (if CBP != 0) the delta QP
+*      - for Intra16x16 macroblocks: read the delta QP and the luma DC
+*        coefficients
+*      - with data partitioning, clear the CBP when dpl_flag ends up set
+*      - read the luma coefficients (4x4 or 8x8 transform)
+*      - 4:4:4 non-independent: read both chroma planes like luma
+*      - 4:2:0 / 4:2:2: read chroma DC (cbp > 15) and chroma AC (cbp > 31)
+*
+* \param currMB
+*    macroblock being decoded; cbp, cbp_blk, dpl_flag, ei_flag,
+*    luma_transform_size_8x8_flag and the slice coefficient buffers
+*    are updated
+************************************************************************
+*/
+static void read_CBP_and_coeffs_from_NAL_CABAC(Macroblock *currMB)
+{
+	int i,j,k;
+	int cbp;
+	SyntaxElement currSE;
+	DataPartition *dP = NULL;
+	Slice *currSlice = currMB->p_Slice;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	int coef_ctr, i0, j0, b8;
+	int ll;
+	RunLevel rl;
+
+	int qp_per, qp_rem;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int intra = IS_INTRA (currMB);
+	int smb = ((p_Vid->type==SP_SLICE) && !intra) || (p_Vid->type == SI_SLICE && currMB->mb_type == SI4MB);
+
+	int uv; 
+	int qp_per_uv[2];
+	int qp_rem_uv[2];
+
+	int temp[4];
+
+	int b4;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int yuv = dec_picture->chroma_format_idc - 1;
+	int m6[4];
+
+	int need_transform_size_flag;
+
+	int (*InvLevelScale4x4)[4] = NULL;
+
+	// select scan type
+	const byte *pos_scan4x4_1d = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN_1D : FIELD_SCAN_1D;
+	const byte *pos_scan4x4_dc = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN_DC : FIELD_SCAN_DC;
+	const byte *pos_scan_4x4;
+
+	// QPI
+	//init constants for every chroma qp offset
+	if (dec_picture->chroma_format_idc != YUV400)
+	{
+		for (i=0; i<2; ++i)
+		{
+			qp_per_uv[i] = p_Vid->qp_per_matrix[ currMB->qp_scaled[i + 1] ];
+			qp_rem_uv[i] = p_Vid->qp_rem_matrix[ currMB->qp_scaled[i + 1] ];
+		}
+	}
+
+	// read CBP if not new intra mode
+	if (!IS_I16MB (currMB))
+	{
+		//=====   C B P   =====
+		//---------------------
+		int type =  (currMB->mb_type == I4MB || currMB->mb_type == SI4MB || currMB->mb_type == I8MB) 
+			? SE_CBP_INTRA
+			: SE_CBP_INTER;
+
+		dP = &(currSlice->partArr[partMap[type]]);
+
+		currMB->cbp = cbp = readCBP_CABAC(currMB, &(dP->de_cabac));
+
+		TRACE_STRING("coded_block_pattern");
+
+
+		//============= Transform size flag for INTER MBs =============
+		//-------------------------------------------------------------
+		need_transform_size_flag = (((currMB->mb_type >= 1 && currMB->mb_type <= 3)||
+			(IS_DIRECT(currMB) && p_Vid->active_sps->direct_8x8_inference_flag) ||
+			(currMB->NoMbPartLessThan8x8Flag))
+			&& currMB->mb_type != I8MB && currMB->mb_type != I4MB
+			&& (currMB->cbp&15)
+			&& p_Vid->Transform8x8Mode);
+
+		if (need_transform_size_flag)
+		{
+			dP = &(currSlice->partArr[partMap[SE_HEADER]]);
+			TRACE_STRING("transform_size_8x8_flag");
+
+			// read CABAC transform_size_8x8_flag
+			currMB->luma_transform_size_8x8_flag = readMB_transform_size_flag_CABAC(currMB, &(dP->de_cabac));
+		}
+
+		//=====   DQUANT   =====
+		//----------------------
+		// Delta quant only if nonzero coeffs
+		if (cbp !=0)
+		{
+			read_delta_quant_CABAC(&currSE, dP, currMB, partMap, (!intra) ? SE_DELTA_QUANT_INTER : SE_DELTA_QUANT_INTRA);
+
+			if (currSlice->dp_mode)
+			{
+				if (!intra && currSlice->dpC_NotPresent ) 
+					currMB->dpl_flag = 1;
+
+				if( intra && currSlice->dpB_NotPresent )
+				{
+					currMB->ei_flag = 1;
+					currMB->dpl_flag = 1;
+				}
+
+				// check for prediction from neighbours
+				check_dp_neighbors (currMB);
+				if (currMB->dpl_flag)
+				{
+					// partition data unavailable: treat the MB as having no coefficients
+					cbp = 0; 
+					currMB->cbp = cbp;
+				}
+			}
+		}
+	}
+	else
+	{
+		// Intra16x16: the CBP is taken from the macroblock as already set
+		cbp = currMB->cbp;
+	}
+
+	if (IS_I16MB (currMB)) // read DC coeffs for new intra modes
+	{
+		read_delta_quant_CABAC(&currSE, dP, currMB, partMap, SE_DELTA_QUANT_INTRA);
+
+		macroblock_set_dc_pred(p_Vid, currMB->block_x, currMB->block_y);
+
+		if (currSlice->dp_mode)
+		{  
+			if (currSlice->dpB_NotPresent)
+			{
+				currMB->ei_flag  = 1;
+				currMB->dpl_flag = 1;
+			}
+			check_dp_neighbors (currMB);
+			if (currMB->dpl_flag)
+			{
+				currMB->cbp = cbp = 0; 
+			}
+		}
+
+		if (!currMB->dpl_flag)
+		{
+			pos_scan_4x4 = pos_scan4x4_dc;
+
+			dP = &(currSlice->partArr[partMap[SE_LUM_DC_INTRA]]);
+
+			currMB->is_intra_block = 1;
+
+			for(k = 0; k < 17 ; k++)
+			{
+				rl = readRunLevel_CABAC(currMB, &(dP->de_cabac), LUMA_16DC);
+
+				if (rl.level != 0)    /* leave if level == 0 */
+				{
+					pos_scan_4x4 += rl.run;
+					currSlice->cof4[0][*pos_scan_4x4++][0][0] = rl.level;// add new intra DC coeff
+				}
+				else
+					break;
+			}
+
+			if(currMB->is_lossless == FALSE)
+				itrans_2(currMB, (ColorPlane) p_Vid->colour_plane_id);// transform new intra DC
+		}
+	}
+
+	update_qp(currMB, p_Vid->qp);
+
+	qp_per = p_Vid->qp_per_matrix[ currMB->qp_scaled[p_Vid->colour_plane_id] ];
+	qp_rem = p_Vid->qp_rem_matrix[ currMB->qp_scaled[p_Vid->colour_plane_id] ];
+
+	//init quant parameters for chroma 
+	if (dec_picture->chroma_format_idc != YUV400)
+	{
+		for(i=0; i < 2; ++i)
+		{
+			qp_per_uv[i] = p_Vid->qp_per_matrix[ currMB->qp_scaled[i + 1] ];
+			qp_rem_uv[i] = p_Vid->qp_rem_matrix[ currMB->qp_scaled[i + 1] ];
+		}
+	}
+
+	InvLevelScale4x4 = intra? currSlice->InvLevelScale4x4_Intra[p_Vid->colour_plane_id][qp_rem] : currSlice->InvLevelScale4x4_Inter[p_Vid->colour_plane_id][qp_rem];
+
+	// luma coefficients
+	//======= Other Modes & CABAC ========
+	//------------------------------------          
+	if (cbp)
+	{
+		if(currMB->luma_transform_size_8x8_flag) 
+		{
+			//======= 8x8 transform size & CABAC ========
+			readCompCoeff8x8MB_CABAC (currMB, PLANE_Y); 
+		}
+		else
+		{
+			readCompCoeff4x4MB_CABAC (currMB, PLANE_Y, intra, InvLevelScale4x4, qp_per, cbp);        
+		}
+	}
+
+	if ( p_Vid->active_sps->chroma_format_idc==YUV444 && !IS_INDEPENDENT(p_Vid) ) 
+	{
+		// 4:4:4 with jointly coded planes: Cb and Cr are coded like luma
+		for (uv = 0; uv < 2; ++uv )
+		{
+			/*----------------------16x16DC Luma_Add----------------------*/
+			if (IS_I16MB (currMB)) // read DC coeffs for new intra modes       
+			{
+				macroblock_set_dc_pred(p_Vid, currMB->block_x, currMB->block_y);
+
+				{              
+					int context;
+					dP = &(currSlice->partArr[partMap[SE_LUM_DC_INTRA]]);
+
+					if( IS_INDEPENDENT(p_Vid) )
+						context = LUMA_16DC; 
+					else
+						context = (uv==0) ? CB_16DC : CR_16DC;
+
+					currMB->is_intra_block = 1;
+
+					coef_ctr = -1;
+
+					for(k=0;k<17;++k)
+					{
+						rl = readRunLevel_CABAC(currMB, &dP->de_cabac, context);
+
+						if (rl.level != 0)                     // leave if level == 0
+						{
+							coef_ctr += rl.run + 1;
+							currSlice->cof4[uv + 1][pos_scan4x4_1d[coef_ctr]][0][0] = rl.level;
+						} 
+						else
+							break;
+					} //k loop
+				}
+
+				if(currMB->is_lossless == FALSE)
+				{
+					itrans_2(currMB, (ColorPlane) (uv + 1)); // transform new intra DC
+				}
+			} //IS_I16MB
+
+			update_qp(currMB, p_Vid->qp);
+
+			qp_per = p_Vid->qp_per_matrix[ (p_Vid->qp + p_Vid->bitdepth_luma_qp_scale) ];
+			qp_rem = p_Vid->qp_rem_matrix[ (p_Vid->qp + p_Vid->bitdepth_luma_qp_scale) ];
+
+			//init constants for every chroma qp offset
+			qp_per_uv[uv] = p_Vid->qp_per_matrix[ (currMB->qpc[uv] + p_Vid->bitdepth_chroma_qp_scale) ];
+			qp_rem_uv[uv] = p_Vid->qp_rem_matrix[ (currMB->qpc[uv] + p_Vid->bitdepth_chroma_qp_scale) ];
+
+			InvLevelScale4x4 = intra? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+
+			if (cbp)
+			{
+				if(currMB->luma_transform_size_8x8_flag) 
+				{
+					//======= 8x8 transform size & CABAC ========
+					readCompCoeff8x8MB_CABAC(currMB, (ColorPlane) (PLANE_U + uv)); 
+				}
+				else //4x4
+				{        
+					readCompCoeff4x4MB_CABAC(currMB, (ColorPlane) (PLANE_U + uv), intra, InvLevelScale4x4,  qp_per_uv[uv], cbp);
+				}
+			}
+		} 
+	} //444
+	else  if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444))
+	{
+		//========================== CHROMA DC ============================
+		//-----------------------------------------------------------------
+		// chroma DC coeff
+		if(cbp>15)
+		{
+			if (dec_picture->chroma_format_idc == YUV420)
+			{    
+				// ll = 0 handles Cb, ll = 2 handles Cr
+				for (ll=0;ll<3;ll+=2)
+				{
+					uv = ll>>1;          
+
+					InvLevelScale4x4 = intra ? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+					//===================== CHROMA DC YUV420 ======================
+					memzero16(&currSlice->cofu[0]);
+					coef_ctr=-1;
+
+					{
+						int type = (intra ? SE_CHR_DC_INTRA : SE_CHR_DC_INTER);
+
+						currMB->is_intra_block =  intra;
+						currMB->is_v_block     = ll;
+
+						dP = &(currSlice->partArr[partMap[type]]);
+
+						for(k = 0; k < (p_Vid->num_cdc_coeff + 1);++k)
+						{
+							rl = readRunLevel_CABAC(currMB, &(dP->de_cabac), CHROMA_DC);
+
+							if (rl.level != 0)
+							{
+								currMB->cbp_blk[0] |= 0xf0000 << (ll<<1) ;
+								coef_ctr += rl.run + 1;
+
+								// Bug: currSlice->cofu has only 4 entries, hence coef_ctr MUST be <4 (which is
+								// caught by the assert().  If it is bigger than 4, it starts patching the
+								// p_Vid->predmode pointer, which leads to bugs later on.
+								//
+								// This assert() should be left in the code, because it captures a very likely
+								// bug early when testing in error prone environments (or when testing NAL
+								// functionality).
+								//
+								// The index is additionally masked with 3 so that the store stays
+								// inside cofu even when the assert() is compiled out.
+								assert (coef_ctr < p_Vid->num_cdc_coeff);
+								currSlice->cofu[coef_ctr&3]=rl.level;
+							}
+							else
+								break;
+						}
+					}
+
+					if (smb || (currMB->is_lossless == TRUE)) // check to see if MB type is SPred or SIntra4x4
+					{
+						currSlice->cof4[uv + 1][0][0][0] = currSlice->cofu[0];
+						currSlice->cof4[uv + 1][1][0][0] = currSlice->cofu[1];
+						currSlice->cof4[uv + 1][2][0][0] = currSlice->cofu[2];
+						currSlice->cof4[uv + 1][3][0][0] = currSlice->cofu[3];
+					}
+					else
+					{
+						ihadamard2x2(currSlice->cofu, temp);
+
+						currSlice->cof4[uv + 1][0][0][0] = (((temp[0] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+						currSlice->cof4[uv + 1][1][0][0] = (((temp[1] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+						currSlice->cof4[uv + 1][2][0][0] = (((temp[2] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+						currSlice->cof4[uv + 1][3][0][0] = (((temp[3] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+					}          
+				}
+			}
+			else if (dec_picture->chroma_format_idc == YUV422)
+			{
+				// ll = 0 handles Cb, ll = 2 handles Cr
+				for (ll=0;ll<3;ll+=2)
+				{
+					h264_short_block_t *imgcof;
+					int (*InvLevelScaleDC)[4] = NULL;
+					int m3[2][4] = {{0,0,0,0},{0,0,0,0}};
+					int m4[2][4] = {{0,0,0,0},{0,0,0,0}};
+					// number of DC coefficients m3 can hold
+					const int m3_entries = (int)(sizeof(m3) / sizeof(m3[0][0]));
+					int qp_per_uv_dc;
+					int qp_rem_uv_dc;
+
+					uv = ll>>1;
+
+					imgcof = currSlice->cof4[uv + 1];
+					qp_per_uv_dc = p_Vid->qp_per_matrix[ (currMB->qpc[uv] + 3 + p_Vid->bitdepth_chroma_qp_scale) ];       //for YUV422 only
+					qp_rem_uv_dc = p_Vid->qp_rem_matrix[ (currMB->qpc[uv] + 3 + p_Vid->bitdepth_chroma_qp_scale) ];       //for YUV422 only
+					if (intra)
+						InvLevelScaleDC = currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv_dc];
+					else 
+						InvLevelScaleDC = currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv_dc];
+
+					//===================== CHROMA DC YUV422 ======================
+					coef_ctr=-1;
+					for(k=0;k<9;++k)
+					{
+						int type = (intra ? SE_CHR_DC_INTRA : SE_CHR_DC_INTER);
+						currMB->is_intra_block =  intra;
+						currMB->is_v_block     = ll;
+
+						dP = &(currSlice->partArr[partMap[type]]);
+
+						rl = readRunLevel_CABAC(currMB, &dP->de_cabac, CHROMA_DC_2x4);
+
+						if (rl.level != 0)
+						{
+							currMB->cbp_blk[0] |= ((int64)0xff0000) << (ll<<2) ;
+							coef_ctr += rl.run + 1;
+							assert (coef_ctr < p_Vid->num_cdc_coeff);
+							// assert() is compiled out under NDEBUG; without this check an
+							// out-of-range position from a damaged stream would index past
+							// the scan table and write outside m3 on the stack
+							if (coef_ctr >= m3_entries)
+								break;
+							i0=SCAN_YUV422[coef_ctr][0];
+							j0=SCAN_YUV422[coef_ctr][1];
+
+							m3[i0][j0]=rl.level;
+						}
+						else
+							break;
+					}
+
+					// inverse CHROMA DC YUV422 transform
+					// horizontal
+					if(currMB->is_lossless == FALSE)
+					{
+						m4[0][0] = m3[0][0] + m3[1][0];
+						m4[0][1] = m3[0][1] + m3[1][1];
+						m4[0][2] = m3[0][2] + m3[1][2];
+						m4[0][3] = m3[0][3] + m3[1][3];
+
+						m4[1][0] = m3[0][0] - m3[1][0];
+						m4[1][1] = m3[0][1] - m3[1][1];
+						m4[1][2] = m3[0][2] - m3[1][2];
+						m4[1][3] = m3[0][3] - m3[1][3];
+
+						// vertical
+						for (i = 0; i < 2; ++i)
+						{
+							m6[0] = m4[i][0] + m4[i][2];
+							m6[1] = m4[i][0] - m4[i][2];
+							m6[2] = m4[i][1] - m4[i][3];
+							m6[3] = m4[i][1] + m4[i][3];
+
+							imgcof[cof4_pos_to_subblock[0][i]][0][0] = m6[0] + m6[3];
+							imgcof[cof4_pos_to_subblock[1][i]][0][0] = m6[1] + m6[2];
+							imgcof[cof4_pos_to_subblock[2][i]][0][0] = m6[1] - m6[2];
+							imgcof[cof4_pos_to_subblock[3][i]][0][0]= m6[0] - m6[3];
+						}//for (i=0;i<2;++i)
+					}
+					else
+					{
+						// lossless: DC levels are copied without the inverse transform
+						for(j=0;j<4;++j)
+						{
+							for(i=0;i<2;++i)                
+							{
+								currSlice->cof4[uv + 1][cof4_pos_to_subblock[j][i]][0][0] = m3[i][j];
+							}
+						}
+					}
+
+					// scale the DC value of every chroma 4x4 block
+					for(j = 0;j < p_Vid->mb_cr_size_y; j += BLOCK_SIZE)
+					{
+						for(i=0;i < p_Vid->mb_cr_size_x;i+=BLOCK_SIZE)
+						{
+							imgcof[cof4_pos_to_subblock[j>>2][i>>2]][0][0] = rshift_rnd_sf((imgcof[cof4_pos_to_subblock[j>>2][i>>2]][0][0] * InvLevelScaleDC[0][0]) << qp_per_uv_dc, 6);
+						}
+					}
+				}//for (ll=0;ll<3;ll+=2)
+			}//else if (dec_picture->chroma_format_idc == YUV422)
+		}
+
+		//========================== CHROMA AC ============================
+		//-----------------------------------------------------------------
+		// chroma AC coeff are only present when cbp > 31
+		if (cbp > 31)
+		{
+			int type;
+			currMB->is_intra_block =  intra;
+			type = (intra ? SE_CHR_AC_INTRA : SE_CHR_AC_INTER);
+
+			dP = &(currSlice->partArr[partMap[type]]);
+
+
+			if(currMB->is_lossless == FALSE)
+			{ 
+				for (b8=0; b8 < p_Vid->num_blk8x8_uv; ++b8)
+				{
+					// the first num_uv_blocks 8x8 blocks belong to Cb, the rest to Cr
+					currMB->is_v_block = uv = (b8 > ((p_Vid->num_uv_blocks) - 1 ));
+					InvLevelScale4x4 = intra ? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+
+					for (b4 = 0; b4 < 4; ++b4)
+					{
+						int *scale = &InvLevelScale4x4[0][0];
+						i = cofuv_blk_x[yuv][b8][b4];
+						j = cofuv_blk_y[yuv][b8][b4];
+
+						currMB->subblock_y = subblk_offset_y[yuv][b8][b4];
+						currMB->subblock_x = subblk_offset_x[yuv][b8][b4];
+
+						// AC coefficients start at scan position 1 (position 0 is the DC)
+						pos_scan_4x4 = &pos_scan4x4_1d[1];
+						for(k = 0; k < 16;++k)
+						{
+							rl = readRunLevel_CABAC(currMB, &(dP->de_cabac), CHROMA_AC);
+
+							if (rl.level != 0)
+							{
+								byte position;
+								currMB->cbp_blk[0] |= ((int64)1) << cbp_blk_chroma[b8][b4];
+								pos_scan_4x4 += rl.run;
+								position = *pos_scan_4x4++;
+
+								((int16_t *)currSlice->cof4[uv + 1][cof4_pos_to_subblock[j][i]])[position] = rshift_rnd_sf((rl.level * scale[position])<<qp_per_uv[uv], 4);
+							}
+							else
+								break;
+						} //for(k=0;(k<16)&&(level!=0);++k)
+					} //for (b4=0; b4 < 4; b4++)
+				} //for (b8=0; b8 < p_Vid->num_blk8x8_uv; b8++)
+			}
+			else
+			{
+				// lossless: levels are stored without dequantization
+				for (b8=0; b8 < p_Vid->num_blk8x8_uv; ++b8)
+				{
+					currMB->is_v_block = uv = (b8 > ((p_Vid->num_uv_blocks) - 1 ));
+
+					for (b4=0; b4 < 4; ++b4)
+					{
+						i = cofuv_blk_x[yuv][b8][b4];
+						j = cofuv_blk_y[yuv][b8][b4];
+
+						pos_scan_4x4 = &pos_scan4x4_1d[1];
+
+						currMB->subblock_y = subblk_offset_y[yuv][b8][b4];
+						currMB->subblock_x = subblk_offset_x[yuv][b8][b4];
+
+						for(k=0;k<16;++k)
+						{
+							rl = readRunLevel_CABAC(currMB, &dP->de_cabac, CHROMA_AC);
+
+							if (rl.level != 0)
+							{
+								currMB->cbp_blk[0] |= ((int64)1) << cbp_blk_chroma[b8][b4];
+								pos_scan_4x4 += rl.run;
+
+								((int16_t *)currSlice->cof4[uv + 1][cof4_pos_to_subblock[j][i]])[*pos_scan_4x4++] = rl.level;
+							}
+							else
+								break;
+						} 
+					} //for (b4=0; b4 < 4; b4++)
+				} //for (b8=0; b8 < p_Vid->num_blk8x8_uv; b8++)
+			}
+		} //if (cbp > 31)
+	} //4:2:0 / 4:2:2 chroma
+}
+
+/*!
+************************************************************************
+* \brief
+*    Get coded block pattern and coefficients (run/level)
+*    from the NAL
+************************************************************************
+*/
+static void read_CBP_and_coeffs_from_NAL_CAVLC(Macroblock *currMB)
+{
+	int i,j,k;
+	int level;
+	int mb_nr = currMB->mbAddrX;
+	int cbp;
+	SyntaxElement currSE;
+	DataPartition *dP = NULL;
+	Slice *currSlice = currMB->p_Slice;
+	const byte *partMap = assignSE2partition[currSlice->dp_mode];
+	int coef_ctr, i0, j0, b8;
+	int ll;
+	__declspec(align(32)) int levarr[16], runarr[16];
+	int numcoeff;
+
+	int qp_per, qp_rem;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int smb = ((p_Vid->type==SP_SLICE) && IS_INTER (currMB)) || (p_Vid->type == SI_SLICE && currMB->mb_type == SI4MB);
+
+	int uv; 
+	int qp_per_uv[2];
+	int qp_rem_uv[2];
+
+	int intra = IS_INTRA (currMB);
+	int temp[4];
+
+	int b4;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int yuv = dec_picture->chroma_format_idc - 1;
+	int m6[4];
+
+	int need_transform_size_flag;
+
+	int (*InvLevelScale4x4)[4] = NULL;
+	const int *InvLevelScale8x8 = NULL;
+	// select scan type
+	const byte (*pos_scan4x4)[2] = ((p_Vid->structure == FRAME) && (!currMB->mb_field)) ? SNGL_SCAN : FIELD_SCAN;
+	const byte *pos_scan_4x4 = pos_scan4x4[0];
+
+	// QPI
+	//init constants for every chroma qp offset
+	if (dec_picture->chroma_format_idc != YUV400)
+	{
+		for (i=0; i<2; ++i)
+		{
+			qp_per_uv[i] = p_Vid->qp_per_matrix[ currMB->qp_scaled[i + 1] ];
+			qp_rem_uv[i] = p_Vid->qp_rem_matrix[ currMB->qp_scaled[i + 1] ];
+		}
+	}
+
+	// read CBP if not new intra mode
+	if (!IS_I16MB (currMB))
+	{
+		//=====   C B P   =====
+		//---------------------
+		int type = (currMB->mb_type == I4MB || currMB->mb_type == SI4MB || currMB->mb_type == I8MB) 
+			? SE_CBP_INTRA
+			: SE_CBP_INTER;
+
+		dP = &(currSlice->partArr[partMap[type]]);
+
+		currSE.mapping = (currMB->mb_type == I4MB || currMB->mb_type == SI4MB || currMB->mb_type == I8MB)
+			? currSlice->linfo_cbp_intra
+			: currSlice->linfo_cbp_inter;
+
+		TRACE_STRING("coded_block_pattern");
+		readSyntaxElement_UVLC(&currSE, dP);
+		currMB->cbp = cbp = currSE.value1;
+
+
+		//============= Transform size flag for INTER MBs =============
+		//-------------------------------------------------------------
+		need_transform_size_flag = (((currMB->mb_type >= 1 && currMB->mb_type <= 3)||
+			(IS_DIRECT(currMB) && p_Vid->active_sps->direct_8x8_inference_flag) ||
+			(currMB->NoMbPartLessThan8x8Flag))
+			&& currMB->mb_type != I8MB && currMB->mb_type != I4MB
+			&& (currMB->cbp&15)
+			&& p_Vid->Transform8x8Mode);
+
+		if (need_transform_size_flag)
+		{
+			dP = &(currSlice->partArr[partMap[SE_HEADER]]);
+			TRACE_STRING("transform_size_8x8_flag");
+
+			// read CAVLC transform_size_8x8_flag
+			currMB->luma_transform_size_8x8_flag = (Boolean) readSyntaxElement_FLC(dP->bitstream, 1);
+		}
+
+		//=====   DQUANT   =====
+		//----------------------
+		// Delta quant only if nonzero coeffs
+		if (cbp !=0)
+		{
+			read_delta_quant_CAVLC(&currSE, dP, currMB, partMap, (IS_INTER (currMB)) ? SE_DELTA_QUANT_INTER : SE_DELTA_QUANT_INTRA);
+
+			if (currSlice->dp_mode)
+			{
+				if (IS_INTER (currMB) && currSlice->dpC_NotPresent ) 
+					currMB->dpl_flag = 1;
+
+				if( intra && currSlice->dpB_NotPresent )
+				{
+					currMB->ei_flag = 1;
+					currMB->dpl_flag = 1;
+				}
+
+				// check for prediction from neighbours
+				check_dp_neighbors (currMB);
+				if (currMB->dpl_flag)
+				{
+					cbp = 0; 
+					currMB->cbp = cbp;
+				}
+			}
+		}
+	}
+	else
+	{
+		cbp = currMB->cbp;
+	}
+
+	if (IS_I16MB (currMB)) // read DC coeffs for new intra modes
+	{
+		read_delta_quant_CAVLC(&currSE, dP, currMB, partMap, SE_DELTA_QUANT_INTRA);
+
+		macroblock_set_dc_pred(p_Vid, currMB->block_x, currMB->block_y);
+
+		if (currSlice->dp_mode)
+		{  
+			if (currSlice->dpB_NotPresent)
+			{
+				currMB->ei_flag  = 1;
+				currMB->dpl_flag = 1;
+			}
+			check_dp_neighbors (currMB);
+			if (currMB->dpl_flag)
+			{
+				currMB->cbp = cbp = 0; 
+			}
+		}
+
+		if (!currMB->dpl_flag)
+		{
+			pos_scan_4x4 = pos_scan4x4[0];
+
+			readCoeff4x4_CAVLC(currMB, LUMA_INTRA16x16DC, 0, 0, levarr, runarr, &numcoeff);
+
+			for(k = 0; k < numcoeff; ++k)
+			{
+				if (levarr[k] != 0)                     // leave if level == 0
+				{
+					pos_scan_4x4 += 2 * runarr[k];
+
+					i0 = (*pos_scan_4x4++);
+					j0 = (*pos_scan_4x4++);
+
+					currSlice->cof4[0][cof4_pos_to_subblock[j0][i0]][0][0] = levarr[k];// add new intra DC coeff
+				}
+			}
+
+
+			if(currMB->is_lossless == FALSE)
+				itrans_2(currMB, (ColorPlane) p_Vid->colour_plane_id);// transform new intra DC
+		}
+	}
+
+	update_qp(currMB, p_Vid->qp);
+
+	qp_per = p_Vid->qp_per_matrix[ currMB->qp_scaled[p_Vid->colour_plane_id] ];
+	qp_rem = p_Vid->qp_rem_matrix[ currMB->qp_scaled[p_Vid->colour_plane_id] ];
+
+	//init quant parameters for chroma 
+	if (dec_picture->chroma_format_idc != YUV400)
+	{
+		for(i=0; i < 2; ++i)
+		{
+			qp_per_uv[i] = p_Vid->qp_per_matrix[ currMB->qp_scaled[i + 1] ];
+			qp_rem_uv[i] = p_Vid->qp_rem_matrix[ currMB->qp_scaled[i + 1] ];
+		}
+	}
+
+	InvLevelScale4x4 = intra? currSlice->InvLevelScale4x4_Intra[p_Vid->colour_plane_id][qp_rem] : currSlice->InvLevelScale4x4_Inter[p_Vid->colour_plane_id][qp_rem];
+	InvLevelScale8x8 = intra? currSlice->InvLevelScale8x8_Intra[p_Vid->colour_plane_id][qp_rem] : currSlice->InvLevelScale8x8_Inter[p_Vid->colour_plane_id][qp_rem];
+
+	// luma coefficients
+	if (cbp)
+	{
+		if (!currMB->luma_transform_size_8x8_flag) // 4x4 transform
+		{
+			readCompCoeff4x4MB_CAVLC(currMB, PLANE_Y, InvLevelScale4x4, qp_per, cbp, p_Vid->nz_coeff[mb_nr][PLANE_Y]);
+		}
+		else // 8x8 transform
+		{
+			readCompCoeff8x8MB_CAVLC(currMB, PLANE_Y, InvLevelScale8x8, qp_per, cbp, p_Vid->nz_coeff[mb_nr][PLANE_Y]);
+		}
+	}
+	else
+	{
+		memset(&p_Vid->nz_coeff[mb_nr][0][0][0], 0, BLOCK_SIZE * BLOCK_SIZE * sizeof(byte));
+	}
+
+	if ( p_Vid->active_sps->chroma_format_idc==YUV444 && !IS_INDEPENDENT(p_Vid) ) 
+	{
+		for (uv = 0; uv < 2; ++uv )
+		{
+			/*----------------------16x16DC Luma_Add----------------------*/
+			if (IS_I16MB (currMB)) // read DC coeffs for new intra modes       
+			{
+				macroblock_set_dc_pred(p_Vid, currMB->block_x, currMB->block_y);
+
+				if (uv == 0)
+					readCoeff4x4_CAVLC(currMB, CB_INTRA16x16DC, 0, 0, levarr, runarr, &numcoeff);
+				else
+					readCoeff4x4_CAVLC(currMB, CR_INTRA16x16DC, 0, 0, levarr, runarr, &numcoeff);
+
+				coef_ctr=-1;
+				level = 1;                            // just to get inside the loop
+
+				for(k = 0; k < numcoeff; ++k)
+				{
+					if (levarr[k] != 0)                     // leave if level == 0
+					{
+						coef_ctr += runarr[k] + 1;
+
+						i0 = pos_scan4x4[coef_ctr][0];
+						j0 = pos_scan4x4[coef_ctr][1];
+						currSlice->cof4[uv + 1][cof4_pos_to_subblock[j0][i0]][0][0] = levarr[k];// add new intra DC coeff
+					} //if leavarr[k]
+				} //k loop
+
+				if(currMB->is_lossless == FALSE)
+				{
+					itrans_2(currMB, (ColorPlane) (uv + 1)); // transform new intra DC
+				}
+			} //IS_I16MB
+
+			update_qp(currMB, p_Vid->qp);
+
+			qp_per = p_Vid->qp_per_matrix[ (p_Vid->qp + p_Vid->bitdepth_luma_qp_scale) ];
+			qp_rem = p_Vid->qp_rem_matrix[ (p_Vid->qp + p_Vid->bitdepth_luma_qp_scale) ];
+
+			//init constants for every chroma qp offset
+			qp_per_uv[uv] = p_Vid->qp_per_matrix[ (currMB->qpc[uv] + p_Vid->bitdepth_chroma_qp_scale) ];
+			qp_rem_uv[uv] = p_Vid->qp_rem_matrix[ (currMB->qpc[uv] + p_Vid->bitdepth_chroma_qp_scale) ];
+
+			InvLevelScale4x4 = intra? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+			InvLevelScale8x8 = intra? currSlice->InvLevelScale8x8_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale8x8_Inter[uv + 1][qp_rem_uv[uv]];
+
+			if (!currMB->luma_transform_size_8x8_flag) // 4x4 transform
+			{
+				readCompCoeff4x4MB_CAVLC(currMB, (ColorPlane) (PLANE_U + uv), InvLevelScale4x4, qp_per_uv[uv], cbp, p_Vid->nz_coeff[mb_nr][PLANE_U + uv]);
+			}
+			else // 8x8 transform
+			{
+				readCompCoeff8x8MB_CAVLC(currMB, (ColorPlane) (PLANE_U + uv), InvLevelScale8x8, qp_per_uv[uv], cbp, p_Vid->nz_coeff[mb_nr][PLANE_U + uv]);
+			}   
+		} 
+	} //444
+	else  if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444))
+	{
+		//========================== CHROMA DC ============================
+		//-----------------------------------------------------------------
+		// chroma DC coeff
+		if(cbp>15)
+		{
+			if (dec_picture->chroma_format_idc == YUV420)
+			{    
+				for (ll=0;ll<3;ll+=2)
+				{
+					uv = ll>>1;          
+
+					InvLevelScale4x4 = intra ? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+					//===================== CHROMA DC YUV420 ======================
+					memset(&currSlice->cofu[0], 0, 4 *sizeof(int));
+					coef_ctr=-1;
+
+					readCoeff4x4_CAVLC(currMB, CHROMA_DC, 0, 0, levarr, runarr, &numcoeff);
+
+					for(k = 0; k < numcoeff; ++k)
+					{
+						if (levarr[k] != 0)
+						{
+							currMB->cbp_blk[0] |= 0xf0000 << (ll<<1) ;
+							coef_ctr += runarr[k] + 1;
+							currSlice->cofu[coef_ctr]=levarr[k];
+						}
+					}
+
+					if (smb || (currMB->is_lossless == TRUE)) // check to see if MB type is SPred or SIntra4x4
+					{
+						currSlice->cof4[uv + 1][0][0][0] = currSlice->cofu[0];
+						currSlice->cof4[uv + 1][1][0][0] = currSlice->cofu[1];
+						currSlice->cof4[uv + 1][2][0][0] = currSlice->cofu[2];
+						currSlice->cof4[uv + 1][3][0][0] = currSlice->cofu[3];
+					}
+					else
+					{
+						ihadamard2x2(currSlice->cofu, temp);
+
+						currSlice->cof4[uv + 1][0][0][0] = (((temp[0] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+						currSlice->cof4[uv + 1][1][0][0] = (((temp[1] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+						currSlice->cof4[uv + 1][2][0][0] = (((temp[2] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+						currSlice->cof4[uv + 1][3][0][0] = (((temp[3] * InvLevelScale4x4[0][0])<<qp_per_uv[uv])>>5);
+					}          
+				}
+			}
+			else if (dec_picture->chroma_format_idc == YUV422)
+			{
+				for (ll=0;ll<3;ll+=2)
+				{
+					int (*InvLevelScale4x4)[4] = NULL;
+					uv = ll>>1;
+					{
+						h264_short_block_t *imgcof = currSlice->cof4[uv + 1];
+						int m3[2][4] = {{0,0,0,0},{0,0,0,0}};
+						int m4[2][4] = {{0,0,0,0},{0,0,0,0}};
+						int qp_per_uv_dc = p_Vid->qp_per_matrix[ (currMB->qpc[uv] + 3 + p_Vid->bitdepth_chroma_qp_scale) ];       //for YUV422 only
+						int qp_rem_uv_dc = p_Vid->qp_rem_matrix[ (currMB->qpc[uv] + 3 + p_Vid->bitdepth_chroma_qp_scale) ];       //for YUV422 only
+						if (intra)
+							InvLevelScale4x4 = currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv_dc];
+						else 
+							InvLevelScale4x4 = currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv_dc];
+
+
+						//===================== CHROMA DC YUV422 ======================
+						readCoeff4x4_CAVLC(currMB, CHROMA_DC, 0, 0, levarr, runarr, &numcoeff);
+						coef_ctr=-1;
+						level=1;
+						for(k = 0; k < numcoeff; ++k)
+						{
+							if (levarr[k] != 0)
+							{
+								currMB->cbp_blk[0] |= ((int64)0xff0000) << (ll<<2);
+								coef_ctr += runarr[k]+1;
+								i0 = SCAN_YUV422[coef_ctr][0];
+								j0 = SCAN_YUV422[coef_ctr][1];
+
+								m3[i0][j0]=levarr[k];
+							}
+						}
+
+						// inverse CHROMA DC YUV422 transform
+						// horizontal
+						if(currMB->is_lossless == FALSE)
+						{
+							m4[0][0] = m3[0][0] + m3[1][0];
+							m4[0][1] = m3[0][1] + m3[1][1];
+							m4[0][2] = m3[0][2] + m3[1][2];
+							m4[0][3] = m3[0][3] + m3[1][3];
+
+							m4[1][0] = m3[0][0] - m3[1][0];
+							m4[1][1] = m3[0][1] - m3[1][1];
+							m4[1][2] = m3[0][2] - m3[1][2];
+							m4[1][3] = m3[0][3] - m3[1][3];
+
+							for (i = 0; i < 2; ++i)
+							{
+								m6[0] = m4[i][0] + m4[i][2];
+								m6[1] = m4[i][0] - m4[i][2];
+								m6[2] = m4[i][1] - m4[i][3];
+								m6[3] = m4[i][1] + m4[i][3];
+
+								imgcof[cof4_pos_to_subblock[0][i]][0][0] = m6[0] + m6[3];
+								imgcof[cof4_pos_to_subblock[1][i]][0][0] = m6[1] + m6[2];
+								imgcof[cof4_pos_to_subblock[2][i]][0][0] = m6[1] - m6[2];
+								imgcof[cof4_pos_to_subblock[3][i]][0][0] = m6[0] - m6[3];
+							}//for (i=0;i<2;++i)
+						}
+						else
+						{
+							currSlice->cof4[uv + 1][0][0][0] = m3[0][0];
+							currSlice->cof4[uv + 1][1][0][0] = m3[1][0];
+							currSlice->cof4[uv + 1][2][0][0] = m3[0][1];
+							currSlice->cof4[uv + 1][3][0][0] = m3[1][1];
+							currSlice->cof4[uv + 1][8][0][0] = m3[0][2];
+							currSlice->cof4[uv + 1][9][0][0] = m3[1][2];
+							currSlice->cof4[uv + 1][10][0][0] = m3[0][3];
+							currSlice->cof4[uv + 1][11][0][0] = m3[1][3];
+						}
+
+						for(j = 0;j < 16; j += BLOCK_SIZE)
+						{
+							for(i=0;i < 8;i+=BLOCK_SIZE)
+							{
+								imgcof[cof4_pos_to_subblock[j>>2][i>>2]][0][0] = rshift_rnd_sf((imgcof[cof4_pos_to_subblock[j>>2][i>>2]][0][0] * InvLevelScale4x4[0][0]) << qp_per_uv_dc, 6);
+							}
+						}
+					}
+				}//for (ll=0;ll<3;ll+=2)
+			}//else if (dec_picture->chroma_format_idc == YUV422)
+		}
+
+		//========================== CHROMA AC ============================
+		//-----------------------------------------------------------------
+		// chroma AC coeff, all zero from start_scan
+		if (cbp<=31)
+		{
+			memset(&p_Vid->nz_coeff [mb_nr ][1][0][0], 0, 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(byte));
+		}
+		else
+		{
+			if(currMB->is_lossless == FALSE)
+			{
+				for (b8=0; b8 < p_Vid->num_blk8x8_uv; ++b8)
+				{
+					currMB->is_v_block = uv = (b8 > ((p_Vid->num_uv_blocks) - 1 ));
+					InvLevelScale4x4 = intra ? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+
+					for (b4=0; b4 < 4; ++b4)
+					{
+						i = cofuv_blk_x[yuv][b8][b4];
+						j = cofuv_blk_y[yuv][b8][b4];
+
+						readCoeff4x4_CAVLC(currMB, CHROMA_AC, i + 2*uv, j + 4, levarr, runarr, &numcoeff);
+						coef_ctr = 0;
+
+						for(k = 0; k < numcoeff;++k)
+						{
+							if (levarr[k] != 0)
+							{
+								currMB->cbp_blk[0] |= ((int64)1) << cbp_blk_chroma[b8][b4];
+								coef_ctr += runarr[k] + 1;
+
+								i0=pos_scan4x4[coef_ctr][0];
+								j0=pos_scan4x4[coef_ctr][1];
+
+								currSlice->cof4[uv + 1][cof4_pos_to_subblock[j][i]][j0][i0] = rshift_rnd_sf((levarr[k] * InvLevelScale4x4[j0][i0])<<qp_per_uv[uv], 4);
+							}
+						}
+					}
+				}        
+			}
+			else
+			{
+				int type;
+				currMB->is_intra_block =  IS_INTRA(currMB);
+				type =  (currMB->is_intra_block ? SE_CHR_AC_INTRA : SE_CHR_AC_INTER);
+
+				dP = &(currSlice->partArr[partMap[type]]);
+				currSE.mapping = linfo_levrun_inter;
+
+				if(currMB->is_lossless == FALSE)
+				{          
+					for (b8=0; b8 < p_Vid->num_blk8x8_uv; ++b8)
+					{
+						currMB->is_v_block = uv = (b8 > ((p_Vid->num_uv_blocks) - 1 ));
+						InvLevelScale4x4 = intra ? currSlice->InvLevelScale4x4_Intra[uv + 1][qp_rem_uv[uv]] : currSlice->InvLevelScale4x4_Inter[uv + 1][qp_rem_uv[uv]];
+
+						for (b4 = 0; b4 < 4; ++b4)
+						{
+							i = cofuv_blk_x[yuv][b8][b4];
+							j = cofuv_blk_y[yuv][b8][b4];
+
+							currMB->subblock_y = subblk_offset_y[yuv][b8][b4];
+							currMB->subblock_x = subblk_offset_x[yuv][b8][b4];
+
+							pos_scan_4x4 = pos_scan4x4[1];
+
+							for(k = 0; k < 16;k++)
+							{
+								readSyntaxElement_UVLC(&currSE, dP);
+								level = currSE.value1;
+
+								if (level != 0)
+								{
+									currMB->cbp_blk[0] |= ((int64)1) << cbp_blk_chroma[b8][b4];
+									pos_scan_4x4 += (currSE.value2 << 1);
+
+									i0 = *pos_scan_4x4++;
+									j0 = *pos_scan_4x4++;
+
+									currSlice->cof4[uv + 1][cof4_pos_to_subblock[j][i]][j0][i0] = rshift_rnd_sf((level * InvLevelScale4x4[j0][i0])<<qp_per_uv[uv], 4);
+								}
+								else
+									break;
+							} //for(k=0;(k<16)&&(level!=0);++k)
+						}
+					}
+				}
+				else
+				{
+					for (b8=0; b8 < p_Vid->num_blk8x8_uv; ++b8)
+					{
+						currMB->is_v_block = uv = (b8 > ((p_Vid->num_uv_blocks) - 1 ));
+
+						for (b4=0; b4 < 4; ++b4)
+						{
+							i = cofuv_blk_x[yuv][b8][b4];
+							j = cofuv_blk_y[yuv][b8][b4];
+
+							pos_scan_4x4 = pos_scan4x4[1];
+
+							currMB->subblock_y = subblk_offset_y[yuv][b8][b4];
+							currMB->subblock_x = subblk_offset_x[yuv][b8][b4];
+
+							for(k=0;k<16;++k)
+							{
+								readSyntaxElement_UVLC(&currSE, dP);
+								level = currSE.value1;
+
+								if (level != 0)
+								{
+									currMB->cbp_blk[0] |= ((int64)1) << cbp_blk_chroma[b8][b4];
+									pos_scan_4x4 += (currSE.value2 << 1);
+
+									i0 = *pos_scan_4x4++;
+									j0 = *pos_scan_4x4++;
+
+									currSlice->cof4[uv + 1][cof4_pos_to_subblock[j][i]][j0][i0] = level;
+								}
+								else
+									break;
+							} 
+						}
+					} 
+				} //for (b4=0; b4 < 4; b4++)
+			} //for (b8=0; b8 < p_Vid->num_blk8x8_uv; b8++)
+		} //if (dec_picture->chroma_format_idc != YUV400)
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Run the intra predictor that matches the macroblock type for one
+*    colour plane of a macroblock in an I slice.
+*
+* \return
+*    always 1
+************************************************************************
+*/
+
+static int decode_one_component_i_slice(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	// used by residual DPCM
+	currMB->ipmode_DPCM = NO_INTRA_PMODE;
+
+	// IPCM is tested first, ahead of the IS_I16MB check
+	if (currMB->mb_type == IPCM)
+	{
+		mb_pred_ipcm(currMB);
+		return 1;
+	}
+
+	if (IS_I16MB(currMB)) // prediction for INTRA_MB_16x16
+	{
+		mb_pred_intra16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == I4MB)
+	{
+		mb_pred_intra4x4(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == I8MB)
+	{
+		mb_pred_intra8x8(currMB, curr_plane, image, dec_picture);
+	}
+
+	// no predictor is run for any other macroblock type
+	return 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Run the predictor that matches the macroblock type for one colour
+*    plane of a macroblock in a P slice. Intra types are tested first,
+*    then skip and the inter partitionings; every remaining type goes
+*    to the 8x8 inter predictor.
+*
+* \return
+*    always 1
+************************************************************************
+*/
+static int decode_one_component_p_slice(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	// used by residual DPCM
+	currMB->ipmode_DPCM = NO_INTRA_PMODE;
+
+	// ---- intra macroblock types (IPCM is tested ahead of IS_I16MB) ----
+	if (currMB->mb_type == IPCM)
+	{
+		mb_pred_ipcm(currMB);
+		return 1;
+	}
+
+	if (IS_I16MB(currMB)) // prediction for INTRA_MB_16x16
+	{
+		mb_pred_intra16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == I4MB)
+	{
+		mb_pred_intra4x4(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == I8MB)
+	{
+		mb_pred_intra8x8(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	// ---- skip and inter macroblock types ----
+	if (currMB->mb_type == PSKIP)
+	{
+		mb_pred_skip(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == P16x16)
+	{
+		mb_pred_p_inter16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == P16x8)
+	{
+		mb_pred_p_inter16x8(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == P8x16)
+	{
+		mb_pred_p_inter8x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	// everything else is handled by the 8x8 inter predictor
+	mb_pred_p_inter8x8(currMB, curr_plane, image, dec_picture);
+	return 1;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Run the predictor that matches the macroblock type for one colour
+*    plane of a macroblock in an SP slice. Same dispatch as for P
+*    slices in this file, except that PSKIP goes to mb_pred_sp_skip().
+*
+* \return
+*    always 1
+************************************************************************
+*/
+static int decode_one_component_sp_slice(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	// used by residual DPCM
+	currMB->ipmode_DPCM = NO_INTRA_PMODE;
+
+	// ---- intra macroblock types (IPCM is tested ahead of IS_I16MB) ----
+	if (currMB->mb_type == IPCM)
+	{
+		mb_pred_ipcm(currMB);
+		return 1;
+	}
+
+	if (IS_I16MB(currMB)) // prediction for INTRA_MB_16x16
+	{
+		mb_pred_intra16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == I4MB)
+	{
+		mb_pred_intra4x4(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == I8MB)
+	{
+		mb_pred_intra8x8(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	// ---- skip and inter macroblock types ----
+	if (currMB->mb_type == PSKIP)
+	{
+		mb_pred_sp_skip(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == P16x16)
+	{
+		mb_pred_p_inter16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == P16x8)
+	{
+		mb_pred_p_inter16x8(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	if (currMB->mb_type == P8x16)
+	{
+		mb_pred_p_inter8x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	// everything else is handled by the 8x8 inter predictor
+	mb_pred_p_inter8x8(currMB, curr_plane, image, dec_picture);
+	return 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Set chroma_vector_adjustment on every picture of the reference
+*    lists used by this macroblock and update p_Vid->max_mb_vmv_r.
+*
+* \param currMB
+*    macroblock being decoded
+* \param list_offset
+*    written with 2 (even mbAddrX) or 4 (odd mbAddrX) only for a field
+*    macroblock of an MBAFF slice; otherwise left as the caller set it
+************************************************************************
+*/
+static void set_chroma_vector(Macroblock *currMB, int *list_offset)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int list, idx;
+
+	if (currSlice->mb_aff_frame_flag && currMB->mb_field)
+	{
+		// field macroblock inside an MBAFF slice: work on the offset lists
+		const int odd_mb = (currMB->mbAddrX & 0x01);
+
+		*list_offset = odd_mb ? 4 : 2;
+
+		for (list = LIST_0 + *list_offset; list <= (LIST_1 + *list_offset); list++)
+		{
+			for (idx = 0; idx < p_Vid->listXsize[list]; idx++)
+			{
+				int adjustment = 0;
+
+				if (odd_mb == 0 && p_Vid->listX[list][idx]->structure == BOTTOM_FIELD)
+					adjustment = -2;
+				else if (odd_mb == 1 && p_Vid->listX[list][idx]->structure == TOP_FIELD)
+					adjustment = 2;
+
+				p_Vid->listX[list][idx]->chroma_vector_adjustment = adjustment;
+			}
+		}
+	}
+	else
+	{
+		// value applied when a reference's structure differs from p_Vid->structure;
+		// stays 0 for non-field structures and for frame macroblocks of MBAFF slices
+		int mismatch_adjustment = 0;
+
+		if (!currSlice->mb_aff_frame_flag)
+		{
+			if (p_Vid->structure == TOP_FIELD)
+				mismatch_adjustment = -2;
+			else if (p_Vid->structure == BOTTOM_FIELD)
+				mismatch_adjustment = 2;
+		}
+
+		for (list = LIST_0; list <= (LIST_1); list++)
+		{
+			for (idx = 0; idx < p_Vid->listXsize[list]; idx++)
+			{
+				// the reference's structure is only inspected when an adjustment can apply
+				if (mismatch_adjustment != 0 && p_Vid->structure != p_Vid->listX[list][idx]->structure)
+					p_Vid->listX[list][idx]->chroma_vector_adjustment = mismatch_adjustment;
+				else
+					p_Vid->listX[list][idx]->chroma_vector_adjustment = 0;
+			}
+		}
+	}
+
+	// half the vertical range for field pictures and for MBAFF field macroblocks
+	p_Vid->max_mb_vmv_r = (p_Vid->structure != FRAME || (currSlice->mb_aff_frame_flag && currMB->mb_field)) ? p_Vid->max_vmv_r >> 1 : p_Vid->max_vmv_r;
+}
+
+// Spatial direct prediction for one macroblock: fills in motion vectors, reference indices and ref_pic_id for both lists per 4x4 block, runs motion compensation, then either copies the prediction to the picture or calls iTransform().
+static void mb_pred_b_dspatial(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	char l0_rFrame = -1, l1_rFrame = -1; // reference indices for list 0 / list 1, passed by address to prepare_direct_params(); negative means "no reference" in the tests below. NOTE(review): the ">= 0" / "== -1" tests rely on plain char being signed -- confirm for every target compiler
+	PicMotionParams *motion = &dec_picture->motion;
+	MotionVector pmvl0={0,0}, pmvl1={0,0}; // motion vectors for list 0 / list 1; presumably filled in by prepare_direct_params() -- confirm
+	int k;
+	int block8x8;
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int curr_mb_field = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field)); // non-zero only for a field macroblock of an MBAFF slice
+	// co-located motion data: the frame variant by default, replaced below for field macroblocks
+	MotionParams *colocated = &currSlice->p_colocated->frame;
+	int list_offset = 0; // may be changed by set_chroma_vector()
+	int pred_dir = 0; // as assigned below: 0 when only list 0 has a reference, 1 when only list 1 has one, 2 otherwise
+	// despite its name, has_zero_partitions becomes TRUE whenever at least one list has a reference index >= 0; it stays FALSE only when both are negative
+	Boolean has_zero_partitions = FALSE;
+	h264_ref_t *ref_pic_num_l0, *ref_pic_num_l1;
+	// sets chroma_vector_adjustment on the reference lists and, for MBAFF field macroblocks, list_offset
+	set_chroma_vector(currMB, &list_offset);
+	// field macroblock: odd mbAddrX uses the bottom co-located data, even mbAddrX the top
+	if (currMB->mb_field)
+	{
+		if(currMB->mbAddrX & 0x01)
+		{
+			colocated = &currSlice->p_colocated->bottom;
+		}
+		else
+		{
+			colocated = &currSlice->p_colocated->top;
+		}
+	}
+
+	prepare_direct_params(currMB, dec_picture, pmvl0, pmvl1, &l0_rFrame, &l1_rFrame);
+	// tables (selected by current slice number and list) used to translate a reference index into ref_pic_id
+	ref_pic_num_l0 = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset];
+	ref_pic_num_l1 = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_1 + list_offset];
+
+	if (p_Vid->active_sps->direct_8x8_inference_flag)
+	{
+		if (l0_rFrame >=0 && l1_rFrame >=0) // both lists have a reference: prediction from both lists (pred_dir 2)
+		{
+			PicMotion **motion0 = &motion->motion[LIST_0][currMB->block_y];
+			PicMotion **motion1 = &motion->motion[LIST_1][currMB->block_y];
+			int block_x = currMB->block_x;
+			has_zero_partitions = TRUE;
+			pred_dir = 2;
+			if (p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)
+			{ // long term: every 4x4 block gets pmvl0/pmvl1 and l0_rFrame/l1_rFrame unconditionally (no co-located check); unrolled per 8x8 block
+				//--- 8x8 block at 4x4 offset (0,0): rows 0-1, columns block_x+0..1
+				memcpy(motion0[0][block_x + 0].mv, pmvl0, sizeof(MotionVector));
+				motion0[0][block_x + 0].ref_idx    = l0_rFrame;
+				memcpy(motion1[0][block_x + 0].mv, pmvl1, sizeof(MotionVector));
+				motion1[0][block_x + 0].ref_idx    = l1_rFrame;
+				motion0[0][block_x + 0].ref_pic_id = ref_pic_num_l0[(short)motion0[0][block_x + 0].ref_idx];
+				motion1[0][block_x + 0].ref_pic_id = ref_pic_num_l1[(short)motion1[0][block_x + 0].ref_idx];
+				memcpy(motion0[0][block_x + 1].mv, pmvl0, sizeof(MotionVector));
+				motion0[0][block_x + 1].ref_idx    = l0_rFrame;
+				memcpy(motion1[0][block_x + 1].mv, pmvl1, sizeof(MotionVector));
+				motion1[0][block_x + 1].ref_idx    = l1_rFrame;
+				motion0[0][block_x + 1].ref_pic_id = ref_pic_num_l0[(short)motion0[0][block_x + 1].ref_idx];
+				motion1[0][block_x + 1].ref_pic_id = ref_pic_num_l1[(short)motion1[0][block_x + 1].ref_idx];
+				memcpy(motion0[1][block_x + 0].mv, pmvl0, sizeof(MotionVector));
+				motion0[1][block_x + 0].ref_idx    = l0_rFrame;
+				memcpy(motion1[1][block_x + 0].mv, pmvl1, sizeof(MotionVector));
+				motion1[1][block_x + 0].ref_idx    = l1_rFrame;
+				motion0[1][block_x + 0].ref_pic_id = ref_pic_num_l0[(short)motion0[1][block_x + 0].ref_idx];
+				motion1[1][block_x + 0].ref_pic_id = ref_pic_num_l1[(short)motion1[1][block_x + 0].ref_idx];
+				memcpy(motion0[1][block_x + 1].mv, pmvl0, sizeof(MotionVector));
+				motion0[1][block_x + 1].ref_idx    = l0_rFrame;
+				memcpy(motion1[1][block_x + 1].mv, pmvl1, sizeof(MotionVector));
+				motion1[1][block_x + 1].ref_idx    = l1_rFrame;
+				motion0[1][block_x + 1].ref_pic_id = ref_pic_num_l0[(short)motion0[1][block_x + 1].ref_idx];
+				motion1[1][block_x + 1].ref_pic_id = ref_pic_num_l1[(short)motion1[1][block_x + 1].ref_idx];
+				perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, 0, 0, list_offset, curr_mb_field);
+				//--- 8x8 block at 4x4 offset (2,0): rows 0-1, columns block_x+2..3
+				memcpy(motion0[0][block_x + 2].mv, pmvl0, sizeof(MotionVector));
+				motion0[0][block_x + 2].ref_idx    = l0_rFrame;
+				memcpy(motion1[0][block_x + 2].mv, pmvl1, sizeof(MotionVector));
+				motion1[0][block_x + 2].ref_idx    = l1_rFrame;
+				motion0[0][block_x + 2].ref_pic_id = ref_pic_num_l0[(short)motion0[0][block_x + 2].ref_idx];
+				motion1[0][block_x + 2].ref_pic_id = ref_pic_num_l1[(short)motion1[0][block_x + 2].ref_idx];
+				memcpy(motion0[0][block_x + 3].mv, pmvl0, sizeof(MotionVector));
+				motion0[0][block_x + 3].ref_idx    = l0_rFrame;
+				memcpy(motion1[0][block_x + 3].mv, pmvl1, sizeof(MotionVector));
+				motion1[0][block_x + 3].ref_idx    = l1_rFrame;
+				motion0[0][block_x + 3].ref_pic_id = ref_pic_num_l0[(short)motion0[0][block_x + 3].ref_idx];
+				motion1[0][block_x + 3].ref_pic_id = ref_pic_num_l1[(short)motion1[0][block_x + 3].ref_idx];
+				memcpy(motion0[1][block_x + 2].mv, pmvl0, sizeof(MotionVector));
+				motion0[1][block_x + 2].ref_idx    = l0_rFrame;
+				memcpy(motion1[1][block_x + 2].mv, pmvl1, sizeof(MotionVector));
+				motion1[1][block_x + 2].ref_idx    = l1_rFrame;
+				motion0[1][block_x + 2].ref_pic_id = ref_pic_num_l0[(short)motion0[1][block_x + 2].ref_idx];
+				motion1[1][block_x + 2].ref_pic_id = ref_pic_num_l1[(short)motion1[1][block_x + 2].ref_idx];
+				memcpy(motion0[1][block_x + 3].mv, pmvl0, sizeof(MotionVector));
+				motion0[1][block_x + 3].ref_idx    = l0_rFrame;
+				memcpy(motion1[1][block_x + 3].mv, pmvl1, sizeof(MotionVector));
+				motion1[1][block_x + 3].ref_idx    = l1_rFrame;
+				motion0[1][block_x + 3].ref_pic_id = ref_pic_num_l0[(short)motion0[1][block_x + 3].ref_idx];
+				motion1[1][block_x + 3].ref_pic_id = ref_pic_num_l1[(short)motion1[1][block_x + 3].ref_idx];
+				perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, 2, 0, list_offset, curr_mb_field);
+				//--- 8x8 block at 4x4 offset (0,2): rows 2-3, columns block_x+0..1
+				memcpy(motion0[2][block_x + 0].mv, pmvl0, sizeof(MotionVector));
+				motion0[2][block_x + 0].ref_idx    = l0_rFrame;
+				memcpy(motion1[2][block_x + 0].mv, pmvl1, sizeof(MotionVector));
+				motion1[2][block_x + 0].ref_idx    = l1_rFrame;
+				motion0[2][block_x + 0].ref_pic_id = ref_pic_num_l0[(short)motion0[2][block_x + 0].ref_idx];
+				motion1[2][block_x + 0].ref_pic_id = ref_pic_num_l1[(short)motion1[2][block_x + 0].ref_idx];
+				memcpy(motion0[2][block_x + 1].mv, pmvl0, sizeof(MotionVector));
+				motion0[2][block_x + 1].ref_idx    = l0_rFrame;
+				memcpy(motion1[2][block_x + 1].mv, pmvl1, sizeof(MotionVector));
+				motion1[2][block_x + 1].ref_idx    = l1_rFrame;
+				motion0[2][block_x + 1].ref_pic_id = ref_pic_num_l0[(short)motion0[2][block_x + 1].ref_idx];
+				motion1[2][block_x + 1].ref_pic_id = ref_pic_num_l1[(short)motion1[2][block_x + 1].ref_idx];
+				memcpy(motion0[3][block_x + 0].mv, pmvl0, sizeof(MotionVector));
+				motion0[3][block_x + 0].ref_idx    = l0_rFrame;
+				memcpy(motion1[3][block_x + 0].mv, pmvl1, sizeof(MotionVector));
+				motion1[3][block_x + 0].ref_idx    = l1_rFrame;
+				motion0[3][block_x + 0].ref_pic_id = ref_pic_num_l0[(short)motion0[3][block_x + 0].ref_idx];
+				motion1[3][block_x + 0].ref_pic_id = ref_pic_num_l1[(short)motion1[3][block_x + 0].ref_idx];
+				memcpy(motion0[3][block_x + 1].mv, pmvl0, sizeof(MotionVector));
+				motion0[3][block_x + 1].ref_idx    = l0_rFrame;
+				memcpy(motion1[3][block_x + 1].mv, pmvl1, sizeof(MotionVector));
+				motion1[3][block_x + 1].ref_idx    = l1_rFrame;
+				motion0[3][block_x + 1].ref_pic_id = ref_pic_num_l0[(short)motion0[3][block_x + 1].ref_idx];
+				motion1[3][block_x + 1].ref_pic_id = ref_pic_num_l1[(short)motion1[3][block_x + 1].ref_idx];
+				perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, 0, 2, list_offset, curr_mb_field);
+				//--- 8x8 block at 4x4 offset (2,2): rows 2-3, columns block_x+2..3
+				memcpy(motion0[2][block_x + 2].mv, pmvl0, sizeof(MotionVector));
+				motion0[2][block_x + 2].ref_idx    = l0_rFrame;
+				memcpy(motion1[2][block_x + 2].mv, pmvl1, sizeof(MotionVector));
+				motion1[2][block_x + 2].ref_idx    = l1_rFrame;
+				motion0[2][block_x + 2].ref_pic_id = ref_pic_num_l0[(short)motion0[2][block_x + 2].ref_idx];
+				motion1[2][block_x + 2].ref_pic_id = ref_pic_num_l1[(short)motion1[2][block_x + 2].ref_idx];
+				memcpy(motion0[2][block_x + 3].mv, pmvl0, sizeof(MotionVector));
+				motion0[2][block_x + 3].ref_idx    = l0_rFrame;
+				memcpy(motion1[2][block_x + 3].mv, pmvl1, sizeof(MotionVector));
+				motion1[2][block_x + 3].ref_idx    = l1_rFrame;
+				motion0[2][block_x + 3].ref_pic_id = ref_pic_num_l0[(short)motion0[2][block_x + 3].ref_idx];
+				motion1[2][block_x + 3].ref_pic_id = ref_pic_num_l1[(short)motion1[2][block_x + 3].ref_idx];
+				memcpy(motion0[3][block_x + 2].mv, pmvl0, sizeof(MotionVector));
+				motion0[3][block_x + 2].ref_idx    = l0_rFrame;
+				memcpy(motion1[3][block_x + 2].mv, pmvl1, sizeof(MotionVector));
+				motion1[3][block_x + 2].ref_idx    = l1_rFrame;
+				motion0[3][block_x + 2].ref_pic_id = ref_pic_num_l0[(short)motion0[3][block_x + 2].ref_idx];
+				motion1[3][block_x + 2].ref_pic_id = ref_pic_num_l1[(short)motion1[3][block_x + 2].ref_idx];
+				memcpy(motion0[3][block_x + 3].mv, pmvl0, sizeof(MotionVector));
+				motion0[3][block_x + 3].ref_idx    = l0_rFrame;
+				memcpy(motion1[3][block_x + 3].mv, pmvl1, sizeof(MotionVector));
+				motion1[3][block_x + 3].ref_idx    = l1_rFrame;
+				motion0[3][block_x + 3].ref_pic_id = ref_pic_num_l0[(short)motion0[3][block_x + 3].ref_idx];
+				motion1[3][block_x + 3].ref_pic_id = ref_pic_num_l1[(short)motion1[3][block_x + 3].ref_idx];
+				perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, 2, 2, list_offset, curr_mb_field);
+			}
+			else
+			{ // not long term: a list gets zero motion where its reference index is 0 and the co-located moving_block entry is 0
+				const byte **colocated_moving_block = &colocated->moving_block[currMB->block_y_aff];
+				for (block8x8 = 0; block8x8 < 4; block8x8++)
+				{
+					int k_start = (block8x8 << 2);
+					for (k = k_start; k < k_start + BLOCK_MULTIPLE; k ++)
+					{
+						int i  =  (decode_block_scan[k] & 3); // 4x4 column inside the macroblock
+						int j  = ((decode_block_scan[k] >> 2) & 3); // 4x4 row inside the macroblock
+						int i4  = currMB->block_x + i;
+
+						//===== DIRECT PREDICTION =====
+						if (!l0_rFrame  && !colocated_moving_block[j][i4])
+						{
+							motion0[j][i4].mv[0] = 0;
+							motion0[j][i4].mv[1] = 0;
+							motion0[j][i4].ref_idx    = 0;
+						}
+						else
+						{
+							motion0[j][i4].mv[0] = pmvl0[0];
+							motion0[j][i4].mv[1] = pmvl0[1];
+							motion0[j][i4].ref_idx    = l0_rFrame;
+						}
+
+						if  (l1_rFrame == 0 && !colocated_moving_block[j][i4])
+						{
+							motion1[j][i4].mv[0] = 0;
+							motion1[j][i4].mv[1] = 0;
+							motion1[j][i4].ref_idx    = 0;
+						}
+						else
+						{
+							motion1[j][i4].mv[0] = pmvl1[0];
+							motion1[j][i4].mv[1] = pmvl1[1];
+							motion1[j][i4].ref_idx    = l1_rFrame;
+						}
+						// look up the picture ids for the reference indices chosen above
+						motion0[j][i4].ref_pic_id = ref_pic_num_l0[(short)motion0[j][i4].ref_idx];
+						motion1[j][i4].ref_pic_id = ref_pic_num_l1[(short)motion1[j][i4].ref_idx];
+					}
+					// motion-compensate the 8x8 block located at the first 4x4 block of this group
+					perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, (decode_block_scan[k_start] & 3), ((decode_block_scan[k_start] >> 2) & 3), list_offset, curr_mb_field);
+				}
+			}
+		}
+		else // at least one list has no reference (negative index)
+		{
+			for (block8x8 = 0; block8x8 < 4; block8x8++)
+			{
+				int k_start = (block8x8 << 2);
+				for (k = k_start; k < k_start + BLOCK_MULTIPLE; k ++)
+				{
+					int i  =  (decode_block_scan[k] & 3);
+					int j  = ((decode_block_scan[k] >> 2) & 3);
+					int i4  = currMB->block_x + i;
+					int j4  = currMB->block_y + j;
+					int j6  = currMB->block_y_aff + j; // row used to index the co-located moving_block map
+
+					//printf("%d %d\n", i, j);
+
+					//===== DIRECT PREDICTION =====
+
+					if (l0_rFrame >=0)
+					{
+						if (!l0_rFrame  && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+						{
+							has_zero_partitions = TRUE;
+							motion->motion[LIST_0][j4][i4].mv[0] = 0;
+							motion->motion[LIST_0][j4][i4].mv[1] = 0;
+							motion->motion[LIST_0][j4][i4].ref_idx    = 0;
+						}
+						else
+						{
+							has_zero_partitions = TRUE;
+							motion->motion[LIST_0][j4][i4].mv[0] = pmvl0[0];
+							motion->motion[LIST_0][j4][i4].mv[1] = pmvl0[1];
+							motion->motion[LIST_0][j4][i4].ref_idx    = l0_rFrame;
+						}
+					}
+					else
+					{        
+						motion->motion[LIST_0][j4][i4].mv[0] = 0;
+						motion->motion[LIST_0][j4][i4].mv[1] = 0;
+						motion->motion[LIST_0][j4][i4].ref_idx    = -1;
+					}
+					// same decision for list 1
+					if (l1_rFrame >=0)
+					{
+						if  (l1_rFrame == 0 && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+						{
+							has_zero_partitions = TRUE;
+							motion->motion[LIST_1][j4][i4].mv[0] = 0;
+							motion->motion[LIST_1][j4][i4].mv[1] = 0;
+							motion->motion[LIST_1][j4][i4].ref_idx    = 0;
+						}
+						else
+						{
+							has_zero_partitions = TRUE;
+							motion->motion[LIST_1][j4][i4].mv[0] = pmvl1[0];
+							motion->motion[LIST_1][j4][i4].mv[1] = pmvl1[1];
+							motion->motion[LIST_1][j4][i4].ref_idx    = l1_rFrame;
+						}
+					}
+					else
+					{
+						motion->motion[LIST_1][j4][i4].mv[0] = 0;
+						motion->motion[LIST_1][j4][i4].mv[1] = 0;
+						motion->motion[LIST_1][j4][i4].ref_idx    = -1;
+					}
+					// pick the prediction direction from which lists have a reference
+					if (l1_rFrame == -1) 
+						pred_dir = 0;
+					else if (l0_rFrame == -1) 
+						pred_dir = 1;
+					else
+						pred_dir = 2;
+					// neither list has a reference: fall back to reference index 0 in both lists, predicting from both
+					if (l0_rFrame < 0 && l1_rFrame < 0)
+					{
+						motion->motion[LIST_0][j4][i4].ref_idx = 0;
+						motion->motion[LIST_1][j4][i4].ref_idx = 0;
+						pred_dir = 2;
+					}
+
+					motion->motion[LIST_0][j4][i4].ref_pic_id = ref_pic_num_l0[(short)motion->motion[LIST_0][j4][i4].ref_idx];
+					motion->motion[LIST_1][j4][i4].ref_pic_id = ref_pic_num_l1[(short)motion->motion[LIST_1][j4][i4].ref_idx];
+				}
+				// skipped when both indices are negative; that case is handled by perform_mc16x16() at the end
+				if (has_zero_partitions == TRUE)
+				{
+					int i =  (decode_block_scan[k_start] & 3);
+					int j = ((decode_block_scan[k_start] >> 2) & 3);
+
+					perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, curr_mb_field);
+				}
+			}
+		}
+	}
+	else // direct_8x8_inference_flag is 0: same per-4x4 derivation, but motion compensation runs per BLOCK_SIZE x BLOCK_SIZE block
+	{
+		for (block8x8 = 0; block8x8 < 4; block8x8++)
+		{
+			int k_start = (block8x8 << 2);
+			int k_end = k_start + BLOCK_MULTIPLE;
+
+			for (k = k_start; k < k_end; k ++)
+			{
+				int i  =  (decode_block_scan[k] & 3);
+				int j  = ((decode_block_scan[k] >> 2) & 3);
+				int i4  = currMB->block_x + i;
+				int j4  = currMB->block_y + j;
+				int j6  = currMB->block_y_aff + j; // row used to index the co-located moving_block map
+
+				//===== DIRECT PREDICTION =====
+
+				if (l0_rFrame >=0)
+				{
+					if (!l0_rFrame  && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+					{
+						has_zero_partitions = TRUE;
+						motion->motion[LIST_0][j4][i4].mv[0] = 0;
+						motion->motion[LIST_0][j4][i4].mv[1] = 0;
+						motion->motion[LIST_0][j4][i4].ref_idx    = 0;
+					}
+					else
+					{
+						has_zero_partitions = TRUE;
+						motion->motion[LIST_0][j4][i4].mv[0] = pmvl0[0];
+						motion->motion[LIST_0][j4][i4].mv[1] = pmvl0[1];
+						motion->motion[LIST_0][j4][i4].ref_idx    = l0_rFrame;
+					}
+				}
+				else
+				{        
+					motion->motion[LIST_0][j4][i4].mv[0] = 0;
+					motion->motion[LIST_0][j4][i4].mv[1] = 0;
+					motion->motion[LIST_0][j4][i4].ref_idx    = -1;
+				}
+				// same decision for list 1
+				if (l1_rFrame >=0)
+				{
+					if  (l1_rFrame == 0 && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+					{
+						has_zero_partitions = TRUE;
+						motion->motion[LIST_1][j4][i4].mv[0] = 0;
+						motion->motion[LIST_1][j4][i4].mv[1] = 0;
+						motion->motion[LIST_1][j4][i4].ref_idx    = 0;
+					}
+					else
+					{
+						has_zero_partitions = TRUE;
+						motion->motion[LIST_1][j4][i4].mv[0] = pmvl1[0];
+						motion->motion[LIST_1][j4][i4].mv[1] = pmvl1[1];
+						motion->motion[LIST_1][j4][i4].ref_idx    = l1_rFrame;
+					}
+				}
+				else
+				{
+					motion->motion[LIST_1][j4][i4].mv[0] = 0;
+					motion->motion[LIST_1][j4][i4].mv[1] = 0;
+					motion->motion[LIST_1][j4][i4].ref_idx    = -1;
+				}
+				// neither list has a reference: fall back to reference index 0 in both lists
+				if (l0_rFrame < 0 && l1_rFrame < 0)
+				{
+					motion->motion[LIST_0][j4][i4].ref_idx = 0;
+					motion->motion[LIST_1][j4][i4].ref_idx = 0;
+				}
+				// prediction direction: 2 when both or neither list has a reference, otherwise the single list that has one
+				if (l1_rFrame == -1) 
+				{
+					if (l0_rFrame == -1) 
+						pred_dir = 2;
+					else
+						pred_dir = 0;
+				}
+				else if (l0_rFrame == -1) 
+				{
+					pred_dir = 1;
+				}
+				else                                               
+					pred_dir = 2;
+
+				motion->motion[LIST_0][j4][i4].ref_pic_id = ref_pic_num_l0[(short)motion->motion[LIST_0][j4][i4].ref_idx];
+				motion->motion[LIST_1][j4][i4].ref_pic_id = ref_pic_num_l1[(short)motion->motion[LIST_1][j4][i4].ref_idx];
+			}
+			// skipped when both indices are negative; that case is handled by perform_mc16x16() at the end
+			if (has_zero_partitions == TRUE)
+			{
+				for (k = k_start; k < k_end; k ++)
+				{        
+					int i =  (decode_block_scan[k] & 3);
+					int j = ((decode_block_scan[k] >> 2) & 3);
+
+					perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, BLOCK_SIZE, BLOCK_SIZE, curr_mb_field);           
+				}
+			}
+		}
+	}
+	// no list had a reference (both indices negative): a single 16x16 motion compensation with the fallback settings written above
+	if (has_zero_partitions == FALSE)
+	{
+		perform_mc16x16(currMB, curr_plane, dec_picture, pred_dir, list_offset, curr_mb_field);           
+	}  
+	// cbp == 0: copy the prediction straight into the picture; otherwise call iTransform()
+	if (currMB->cbp == 0)
+	{
+		opt_copy_image_data_16x16_stride(image, currMB->pix_x, currMB->pix_y, currSlice->mb_pred[curr_plane]);
+		// the chroma copies always use mb_pred[1] / mb_pred[2] and dec_picture->imgUV, independent of curr_plane
+		if (dec_picture->chroma_format_idc == YUV420)
+		{
+			copy_image_data_8x8_stride(dec_picture->imgUV[0], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[1]);
+			copy_image_data_8x8_stride(dec_picture->imgUV[1], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[2]);
+		}
+		else if (dec_picture->chroma_format_idc == YUV422)
+		{
+			copy_image_data_stride(dec_picture->imgUV[0], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[1], 8, 16);
+			copy_image_data_stride(dec_picture->imgUV[1], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[2], 8, 16);
+		}
+	}
+	else
+		iTransform(currMB, curr_plane, 0); 
+}
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Build the prediction for one colour component of a macroblock
+*    that belongs to a B slice.
+*
+*    The macroblock type selects the predictor: IPCM, intra 16x16,
+*    intra 4x4, intra 8x8, inter 16x16 / 16x8 / 8x16, direct/skip
+*    (temporal or spatial, chosen by the slice's
+*    direct_spatial_mv_pred_flag) and, for every remaining type, the
+*    8x8 B inter predictor.
+*
+* \return
+*    always 1
+************************************************************************
+*/
+
+static int decode_one_component_b_slice(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	// reset the residual DPCM mode before any predictor runs
+	currMB->ipmode_DPCM = NO_INTRA_PMODE;
+
+	// the tests below are evaluated in the same order as the original else-if chain
+	if (currMB->mb_type == IPCM)
+	{
+		mb_pred_ipcm(currMB);
+		return 1;
+	}
+	if (IS_I16MB (currMB))
+	{
+		mb_pred_intra16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+	if (currMB->mb_type == I4MB)
+	{
+		mb_pred_intra4x4(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+	if (currMB->mb_type == I8MB)
+	{
+		mb_pred_intra8x8(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+	if (currMB->mb_type == P16x16)
+	{
+		mb_pred_p_inter16x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+	if (currMB->mb_type == P16x8)
+	{
+		mb_pred_p_inter16x8(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+	if (currMB->mb_type == P8x16)
+	{
+		mb_pred_p_inter8x16(currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+	if (currMB->mb_type == BSKIP_DIRECT)
+	{
+		// direct / skip: the slice header flag picks temporal or spatial direct prediction
+		if (currMB->p_Slice->direct_spatial_mv_pred_flag == 0)
+			mb_pred_b_dtemporal (currMB, curr_plane, image, dec_picture);
+		else
+			mb_pred_b_dspatial (currMB, curr_plane, image, dec_picture);
+		return 1;
+	}
+
+	// every other macroblock type goes through the 8x8 B inter predictor
+	mb_pred_b_inter8x8 (currMB, curr_plane, image, dec_picture);
+	return 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Decode one macroblock through the slice's decode_one_component
+*    callback.
+*
+*    The luma plane is always decoded. The two imgUV planes are decoded
+*    through the same callback only when the active SPS reports YUV444
+*    and IS_INDEPENDENT(p_Vid) is false.
+*
+* \return
+*    always 0
+************************************************************************
+*/
+
+int decode_one_macroblock(Macroblock *currMB, StorablePicture *dec_picture)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *slice = currMB->p_Slice;
+	int joint_chroma;
+
+	// luma decoding
+	slice->decode_one_component(currMB, PLANE_Y, dec_picture->imgY, dec_picture);
+
+	// evaluated after the luma call, exactly as in the original condition
+	joint_chroma = (p_Vid->active_sps->chroma_format_idc==YUV444)&&(!IS_INDEPENDENT(p_Vid));
+	if (!joint_chroma)
+		return 0;
+
+	slice->decode_one_component(currMB, PLANE_U, dec_picture->imgUV[0], dec_picture);
+	slice->decode_one_component(currMB, PLANE_V, dec_picture->imgUV[1], dec_picture);
+	return 0;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Switch the decoder state to another colour plane
+*    (4:4:4 Independent mode).
+*
+*    Records nplane as the current colour_plane_id and redirects
+*    mb_data, dec_picture and the current slice's p_colocated to the
+*    per-plane (_JV) entries with index nplane.
+************************************************************************
+*/
+void change_plane_JV( VideoParameters *p_Vid, int nplane )
+{
+	Slice *slice = p_Vid->currentSlice;
+
+	// the four assignments are independent of each other
+	slice->p_colocated     = slice->Co_located_JV[nplane];
+	p_Vid->dec_picture     = p_Vid->dec_picture_JV[nplane];
+	p_Vid->mb_data         = p_Vid->mb_data_JV[nplane];
+	p_Vid->colour_plane_id = nplane;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Assemble one frame picture from the separately decoded planes
+*    (4:4:4 Independent mode).
+*
+*    dec_picture_JV[0] becomes p_Vid->dec_picture. The storable
+*    parameters of every plane are copied into its JVmotion[] slots,
+*    then the imgY rows of planes 1 and 2 are copied into imgUV[0] and
+*    imgUV[1], and those two plane pictures are released with
+*    free_storable_picture().
+************************************************************************
+*/
+void make_frame_picture_JV(VideoParameters *p_Vid)
+{
+	int plane;
+	int src_plane;
+
+	p_Vid->dec_picture = p_Vid->dec_picture_JV[0];
+
+	// Copy Storable Params
+	plane = 0;
+	while (plane < MAX_PLANE)
+	{
+		copy_storable_param_JV( p_Vid, &p_Vid->dec_picture->JVmotion[plane], &p_Vid->dec_picture_JV[plane]->motion );
+		plane++;
+	}
+
+	// This could be done with pointers and seems not necessary
+	// (source planes 1 and 2 feed imgUV[0] and imgUV[1])
+	for (src_plane = 1; src_plane <= 2; src_plane++)
+	{
+		int row;
+
+		for (row = 0; row < p_Vid->height; row++)
+		{
+			// one full row of samples, in bytes
+			const int row_bytes = sizeof(imgpel) * p_Vid->width;
+			memcpy( p_Vid->dec_picture->imgUV[src_plane - 1]->img[row], p_Vid->dec_picture_JV[src_plane]->imgY->img[row], row_bytes );
+		}
+		// the plane picture is no longer needed once its rows are copied
+		free_storable_picture(p_Vid, p_Vid->dec_picture_JV[src_plane]);
+	}
+}
+
+
diff --git a/Src/h264dec/ldecod/src/mb_access.c b/Src/h264dec/ldecod/src/mb_access.c
new file mode 100644
index 00000000..70f3aee2
--- /dev/null
+++ b/Src/h264dec/ldecod/src/mb_access.c
@@ -0,0 +1,3388 @@
+
+/*!
+*************************************************************************************
+* \file mb_access.c
+*
+* \brief
+*    Functions for macroblock neighborhoods
+*
+*  \author
+*      Main contributors (see contributors.h for copyright, address and affiliation details)
+*      - Karsten Sühring          <suehring@hhi.de>
+*************************************************************************************
+*/
+
+#include "global.h"
+#include "mbuffer.h"
+#include "mb_access.h"
+
+/*!
+************************************************************************
+* \brief
+*    Tell whether the macroblock at address mbAddr may be used as a
+*    neighbour of currMB.
+*
+* \param mbAddr
+*    macroblock address to test
+* \param currMB
+*    current macroblock
+* \return
+*    FALSE if mbAddr lies outside [0, PicSizeInMbs - 1] of the picture
+*    being decoded. Otherwise TRUE when p_Vid->DeblockCall is set, or
+*    when the addressed macroblock has the same slice_nr as currMB;
+*    FALSE in the remaining case.
+************************************************************************
+*/
+Boolean mb_is_available(int mbAddr, const Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	// the picture size is only read when mbAddr is non-negative
+	const int in_picture = (mbAddr >= 0) && (mbAddr <= ((int)p_Vid->dec_picture->PicSizeInMbs - 1));
+
+	if (!in_picture)
+		return FALSE;
+
+	// the slice check is skipped entirely while DeblockCall is set
+	if (p_Vid->DeblockCall)
+		return TRUE;
+
+	// the original author notes that the slice number comparison also covers
+	// "has this macroblock been decoded yet"
+	return (p_Vid->mb_data[mbAddr].slice_nr == currMB->slice_nr) ? TRUE : FALSE;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Compute the addresses and availability flags of the left, upper,
+*    upper-right and upper-left neighbours of currMB, and set the
+*    mb_left / mb_up pointers (NULL when the neighbour is unavailable).
+*
+*    When the picture's mb_aff_frame_flag is set, addresses are derived
+*    from the macroblock pair index (mbAddrX >> 1) and doubled;
+*    otherwise they are derived from mbAddrX directly. A neighbour is
+*    additionally rejected through the PicPos x coordinate when that
+*    coordinate is 0: the current unit's x for left / upper-left, the
+*    next unit's x for upper-right.
+************************************************************************
+*/
+void CheckAvailabilityOfNeighbors(Macroblock *currMB)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int mb_nr = currMB->mbAddrX;
+	const int mbaff = (p_Vid->dec_picture->mb_aff_frame_flag != 0);
+	// unit: macroblock pair index or macroblock index; scale converts it back to an address
+	const int unit  = mbaff ? (mb_nr >> 1) : mb_nr;
+	const int scale = mbaff ? 2 : 1;
+
+	currMB->mb_addr_left        = scale * (unit - 1);
+	currMB->mb_addr_up          = scale * (unit - p_Vid->dec_picture->PicWidthInMbs);
+	currMB->mb_addr_upper_right = scale * (unit - p_Vid->dec_picture->PicWidthInMbs + 1);
+	currMB->mb_addr_upper_left  = scale * (unit - p_Vid->dec_picture->PicWidthInMbs - 1);
+
+	// PicPos is only consulted when mb_is_available() already accepted the address
+	currMB->mb_avail_left        = (Boolean) (mb_is_available(currMB->mb_addr_left, currMB) && ((p_Vid->PicPos[unit    ][0])!=0));
+	currMB->mb_avail_up          = (Boolean) (mb_is_available(currMB->mb_addr_up, currMB));
+	currMB->mb_avail_upper_right = (Boolean) (mb_is_available(currMB->mb_addr_upper_right, currMB) && ((p_Vid->PicPos[unit + 1][0])!=0));
+	currMB->mb_avail_upper_left  = (Boolean) (mb_is_available(currMB->mb_addr_upper_left, currMB) && ((p_Vid->PicPos[unit    ][0])!=0));
+
+	// neighbour pointers stay NULL for unavailable neighbours
+	currMB->mb_left = currMB->mb_avail_left ? &(p_Vid->mb_data[currMB->mb_addr_left]) : NULL;
+	currMB->mb_up   = currMB->mb_avail_up   ? &(p_Vid->mb_data[currMB->mb_addr_up])   : NULL;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Look up the x and y macroblock coordinates of a macroblock address
+*    in the PicPos table.
+* \param PicPos
+*    position table, indexed by macroblock address
+* \param mb_addr
+*    macroblock address
+* \param x
+*    receives entry [0] of the table row
+* \param y
+*    receives entry [1] of the table row
+************************************************************************
+*/
+void get_mb_block_pos_normal (const h264_pic_position *PicPos, int mb_addr, short *x, short *y)
+{
+	const int *entry = &PicPos[ mb_addr ][0];
+
+	*x = (short) entry[0];
+	*y = (short) entry[1];
+}
+
+/*!
+************************************************************************
+* \brief
+*    Look up the x and y macroblock coordinates of a macroblock address
+*    for mbaff type slices.
+*
+*    The table is indexed by mb_addr >> 1; x is taken as is, y is the
+*    table value doubled plus the low bit of mb_addr.
+************************************************************************
+*/
+void get_mb_block_pos_mbaff (const h264_pic_position *PicPos, int mb_addr, short *x, short *y)
+{
+	const int *pair_entry = &PicPos[mb_addr>>1][0];
+	const int  low_bit    = mb_addr & 0x01;
+
+	*x = (short) pair_entry[0];
+	*y = (short) ((pair_entry[1] << 1) + low_bit);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Return the x and y sample coordinates of a macroblock address:
+*    the macroblock coordinates delivered by p_Vid->get_mb_block_pos,
+*    multiplied by mb_size[0] and mb_size[1] respectively.
+************************************************************************
+*/
+void get_mb_pos (VideoParameters *p_Vid, int mb_addr, const int mb_size[2], short *x, short *y)
+{
+	// macroblock coordinates first ...
+	p_Vid->get_mb_block_pos(p_Vid->PicPos, mb_addr, x, y);
+
+	// ... then scale each one in place by the macroblock dimension
+	*x = (short) (mb_size[0] * (*x));
+	*y = (short) (mb_size[1] * (*y));
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    get neighbouring positions for non-aff coding
+*
+*    (xN, yN) is an offset relative to the current macroblock. Depending
+*    on where it falls, the upper-left, upper, upper-right, left or
+*    current macroblock is selected. Offsets on or below row maxH, and
+*    offsets at or beyond column maxW with yN >= 0, are reported as
+*    unavailable; in that case pix->mb_addr is NOT written.
+* \param currMB
+*   current macroblock
+* \param xN
+*    input x position
+* \param yN
+*    input y position
+* \param mb_size
+*    Macroblock size in pixel (according to luma or chroma MB access)
+* \param pix
+*    returns position information: available, mb_addr and, when the
+*    neighbour is available (or DeblockCall is set and pix->mb_addr is
+*    non-zero), x, y, pos_x and pos_y
+************************************************************************
+*/
+void getNonAffNeighbour(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int mb_w = mb_size[0];
+	const int mb_h = mb_size[1];
+	// classify the offset: -1 = before the macroblock, 0 = inside, 1 = at or beyond its extent
+	const int col = (xN < 0) ? -1 : ((xN < mb_w) ? 0 : 1);
+	const int row = (yN < 0) ? -1 : ((yN < mb_h) ? 0 : 1);
+	const int *mb_pos;
+
+	if (row > 0 || (row == 0 && col > 0))
+	{
+		// nothing can be selected there; mb_addr keeps its previous value
+		pix->available = FALSE;
+	}
+	else if (row < 0)
+	{
+		// row of macroblocks above the current one
+		if (col < 0)
+		{
+			pix->mb_addr   = currMB->mb_addr_upper_left;
+			pix->available = currMB->mb_avail_upper_left;
+		}
+		else if (col == 0)
+		{
+			pix->mb_addr   = currMB->mb_addr_up;
+			pix->available = currMB->mb_avail_up;
+		}
+		else
+		{
+			pix->mb_addr   = currMB->mb_addr_upper_right;
+			pix->available = currMB->mb_avail_upper_right;
+		}
+	}
+	else if (col < 0)
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+
+	// positions are filled in for available neighbours, and during a
+	// deblocking call whenever pix->mb_addr is non-zero
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (xN & (mb_w - 1));
+	pix->y     = (short) (yN & (mb_h - 1));
+	pix->pos_x = (short) (pix->x + mb_pos[0] * mb_w);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * mb_h);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Variant of getNonAffNeighbour that never selects a macroblock from
+*    the row above (no yN < 0 branch) and that does not compute
+*    pos_x / pos_y.
+*
+*    Selects the left macroblock (xN < 0) or the current one
+*    (0 <= xN < maxW) when yN < maxH; everything else is unavailable and
+*    leaves pix->mb_addr untouched. For an available neighbour only the
+*    in-macroblock offsets pix->x / pix->y are written.
+************************************************************************
+*/
+void getNonAffNeighbourXP_NoPos(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix)
+{
+	int maxW = mb_size[0], maxH = mb_size[1];
+
+	if (xN < 0)
+	{
+		if (yN < maxH)
+		{
+			pix->mb_addr  = currMB->mb_addr_left;
+			pix->available = currMB->mb_avail_left;
+		}
+		else
+		{
+			pix->available = FALSE;
+		}
+	}
+	else if (xN < maxW)
+	{
+		if (yN < maxH)
+		{
+			pix->mb_addr   = currMB->mbAddrX;
+			pix->available = TRUE;
+		}
+		else
+		{
+			pix->available = FALSE;
+		}
+	}
+	else
+	{
+		pix->available = FALSE;
+	}
+
+	if (pix->available)
+	{
+		// no picture position is needed here, so the PicPos table is not consulted
+		pix->x     = (short) (xN & (maxW - 1));
+		pix->y     = (short) (yN & (maxH - 1));
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Variant of getNonAffNeighbour that never selects a macroblock to
+*    the left (no xN < 0 branch) and that does not compute
+*    pos_x / pos_y.
+*
+*    Selects the upper macroblock (xN < maxW, yN < 0), the current one
+*    (xN < maxW, 0 <= yN < maxH) or the upper-right one (xN >= maxW,
+*    yN < 0); everything else is unavailable and leaves pix->mb_addr
+*    untouched. For an available neighbour only the in-macroblock
+*    offsets pix->x / pix->y are written.
+************************************************************************
+*/
+void getNonAffNeighbourPX_NoPos(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix)
+{
+	int maxW = mb_size[0], maxH = mb_size[1];
+
+	if (xN < maxW)
+	{
+		if (yN<0)
+		{
+			pix->mb_addr   = currMB->mb_addr_up;
+			pix->available = currMB->mb_avail_up;
+		}
+		else if (yN < maxH)
+		{
+			pix->mb_addr   = currMB->mbAddrX;
+			pix->available = TRUE;
+		}
+		else
+		{
+			pix->available = FALSE;
+		}
+	}
+	else if (yN < 0)
+	{
+		pix->mb_addr   = currMB->mb_addr_upper_right;
+		pix->available = currMB->mb_avail_upper_right;
+	}
+	else
+	{
+		pix->available = FALSE;
+	}
+
+	if (pix->available)
+	{
+		// no picture position is needed here, so the PicPos table is not consulted
+		pix->x     = (short) (xN & (maxW - 1));
+		pix->y     = (short) (yN & (maxH - 1));
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    getNonAffNeighbour with the macroblock size fixed to 16x16.
+*
+*    Selects the upper-left, upper, upper-right, left or current
+*    macroblock from the offset (xN, yN). Offsets with yN >= 16, and
+*    offsets with xN >= 16 and yN >= 0, are unavailable and leave
+*    pix->mb_addr untouched. x, y, pos_x and pos_y are written when the
+*    neighbour is available, or when DeblockCall is set and
+*    pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbourLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	// classify the offset: -1 = before the macroblock, 0 = inside, 1 = at or beyond its extent
+	const int col = (xN < 0) ? -1 : ((xN < 16) ? 0 : 1);
+	const int row = (yN < 0) ? -1 : ((yN < 16) ? 0 : 1);
+	const int *mb_pos;
+
+	if (row > 0 || (row == 0 && col > 0))
+	{
+		// nothing can be selected there; mb_addr keeps its previous value
+		pix->available = FALSE;
+	}
+	else if (row < 0)
+	{
+		// row of macroblocks above the current one
+		if (col < 0)
+		{
+			pix->mb_addr   = currMB->mb_addr_upper_left;
+			pix->available = currMB->mb_avail_upper_left;
+		}
+		else if (col == 0)
+		{
+			pix->mb_addr   = currMB->mb_addr_up;
+			pix->available = currMB->mb_avail_up;
+		}
+		else
+		{
+			pix->mb_addr   = currMB->mb_addr_upper_right;
+			pix->available = currMB->mb_avail_upper_right;
+		}
+	}
+	else if (col < 0)
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (xN & 15);
+	pix->y     = (short) (yN & 15);
+	pix->pos_x = (short) (pix->x + mb_pos[0] * 16);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * 16);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup for callers that pass yN >= 0: there is no
+*    branch for the row above.
+*
+*    Selects the left macroblock (xN < 0) or the current one
+*    (0 <= xN < 16) when yN < 16; everything else is unavailable and
+*    leaves pix->mb_addr untouched. x, y, pos_x and pos_y are written
+*    when the neighbour is available, or when DeblockCall is set and
+*    pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbourXPLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix) // yN >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	if (yN >= 16)
+	{
+		// below the macroblock
+		pix->available = FALSE;
+	}
+	else if (xN < 0)
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+	else if (xN < 16)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		// right of the macroblock
+		pix->available = FALSE;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (xN & 15);
+	pix->y     = (short) (yN & 15);
+	pix->pos_x = (short) (pix->x + mb_pos[0] * 16);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * 16);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup without upper-bound checks on xN / yN and
+*    without the deblocking path (asserts that DeblockCall is clear).
+*
+*    xN < 0 selects the left macroblock, anything else the current one.
+*    For an available neighbour only pos_x / pos_y are written; yN is
+*    added to pos_y unmasked.
+************************************************************************
+*/
+void getNonAffNeighbourXPLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix) // yN >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	assert(!p_Vid->DeblockCall);
+
+	if (xN >= 0)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->pos_x = (short) ((xN & 15) + mb_pos[0] * 16);
+	pix->pos_y = (short) (yN + mb_pos[1] * 16);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 lookup for offsets the caller guarantees to be non-negative:
+*    always selects the current macroblock, marks it available and
+*    writes pos_x / pos_y. Asserts that DeblockCall is clear.
+************************************************************************
+*/
+void getNonAffNeighbourPPLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix) // yN >= 0, xN >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	assert(!p_Vid->DeblockCall);
+
+	pix->available = TRUE;
+	pix->mb_addr   = currMB->mbAddrX;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->pos_x = (short) ((xN & 15) + mb_pos[0] * 16);
+	pix->pos_y = (short) (yN + mb_pos[1] * 16);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup without upper-bound checks and without
+*    picture positions (asserts that DeblockCall is clear).
+*
+*    xN < 0 selects the left macroblock, anything else the current one.
+*    For an available neighbour only pix->x (xN & 15) and pix->y (yN,
+*    unmasked) are written.
+************************************************************************
+*/
+void getNonAffNeighbourXPLumaNB_NoPos(const Macroblock *currMB, int xN, int yN, PixelPos *pix) // yN >= 0
+{
+	assert(!currMB->p_Vid->DeblockCall);
+
+	if (xN >= 0)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+
+	if (!pix->available)
+		return;
+
+	pix->x = (short) (xN & 15);
+	pix->y = (short) (yN);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 lookup of the left neighbour for xN = -1 and a yN inside the
+*    macroblock (no bounds checks are performed).
+*
+*    Writes pos_x, y and pos_y when the left macroblock is available, or
+*    when DeblockCall is set and the left address is non-zero. pix->x is
+*    deliberately not written (the assignment is disabled in the
+*    original code as well).
+************************************************************************
+*/
+void getNonAffNeighbourNPLumaNB(const Macroblock *currMB, int yN, PixelPos *pix) // xN = -1, yN >= 0 && yN < 16
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	pix->available = currMB->mb_avail_left;
+	pix->mb_addr   = currMB->mb_addr_left;
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	// (-1 & 15) is the column offset of xN = -1 inside the left macroblock
+	pix->pos_x = (short) ((-1 & 15) + mb_pos[0] * 16);
+	pix->y     = (short) (yN);
+	pix->pos_y = (short) (yN + mb_pos[1] * 16);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup for callers that pass xN >= 0: there is no
+*    branch for the column to the left.
+*
+*    Selects the upper macroblock (xN < 16, yN < 0), the upper-right one
+*    (xN >= 16, yN < 0) or the current one (xN < 16, 0 <= yN < 16);
+*    everything else is unavailable and leaves pix->mb_addr untouched.
+*    x, y, pos_x and pos_y are written when the neighbour is available,
+*    or when DeblockCall is set and pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbourPXLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix) // xN is >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	if (yN < 0)
+	{
+		// row above: upper or upper-right macroblock
+		if (xN < 16)
+		{
+			pix->mb_addr   = currMB->mb_addr_up;
+			pix->available = currMB->mb_avail_up;
+		}
+		else
+		{
+			pix->mb_addr   = currMB->mb_addr_upper_right;
+			pix->available = currMB->mb_avail_upper_right;
+		}
+	}
+	else if (xN < 16 && yN < 16)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->available = FALSE;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (xN & 15);
+	pix->y     = (short) (yN & 15);
+	pix->pos_x = (short) (pix->x + mb_pos[0] * 16);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * 16);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup without upper-bound checks and without the
+*    deblocking path (asserts that DeblockCall is clear).
+*
+*    yN < 0 selects the upper macroblock, anything else the current one.
+*    For an available neighbour only pos_x / pos_y are written; xN is
+*    added to pos_x unmasked.
+************************************************************************
+*/
+void getNonAffNeighbourPXLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix) // xN is >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	assert(!p_Vid->DeblockCall);
+
+	if (yN >= 0)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_up;
+		pix->available = currMB->mb_avail_up;
+	}
+
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->pos_x = (short) (xN + mb_pos[0] * 16);
+	pix->pos_y = (short) ((yN & 15) + mb_pos[1] * 16);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 vertical neighbour lookup without bounds checks and without
+*    picture positions (asserts that DeblockCall is clear).
+*
+*    yN < 0 selects the upper macroblock, anything else the current one.
+*    For an available neighbour only pix->y (yN & 15) is written.
+************************************************************************
+*/
+void getNonAffNeighbourPXLumaNB_NoPos(const Macroblock *currMB, int yN, PixelPos *pix) // xN is >= 0
+{
+	assert(!currMB->p_Vid->DeblockCall);
+
+	if (yN >= 0)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_up;
+		pix->available = currMB->mb_avail_up;
+	}
+
+	if (!pix->available)
+		return;
+
+	pix->y = (short) (yN & 15);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 lookup of the left neighbour for the fixed offset
+*    xN = -1, yN = 0 (asserts that DeblockCall is clear).
+*
+*    When the left macroblock is available, writes x, y, pos_x and
+*    pos_y; otherwise only mb_addr and available are written.
+************************************************************************
+*/
+void getNonAffNeighbourN0Luma(const Macroblock *currMB, PixelPos *pix) // xN = -1, yN = 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	assert(p_Vid->DeblockCall == 0);
+
+	pix->available = currMB->mb_avail_left;
+	pix->mb_addr   = currMB->mb_addr_left;
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (-1 & 15);
+	pix->y     = 0;
+	pix->pos_x = (short) (pix->x + mb_pos[0] * 16);
+	pix->pos_y = (short) (mb_pos[1] * 16);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Lookup of the left neighbour for the fixed offset xN = -1, yN = 0
+*    with a caller-supplied macroblock size. Asserts that maxH is
+*    non-zero and that DeblockCall is clear.
+*
+*    When the left macroblock is available, writes x, y, pos_x and
+*    pos_y; otherwise only mb_addr and available are written.
+************************************************************************
+*/
+void getNonAffNeighbourN0(const Macroblock *currMB, const int mb_size[2], PixelPos *pix) // xN = -1, yN = 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int maxW = mb_size[0];
+	const int maxH = mb_size[1];
+	const int *mb_pos;
+
+	assert(maxH != 0);
+	assert(p_Vid->DeblockCall == 0);
+
+	pix->available = currMB->mb_avail_left;
+	pix->mb_addr   = currMB->mb_addr_left;
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (-1 & (maxW - 1));
+	pix->y     = 0;
+	pix->pos_x = (short) (pix->x + mb_pos[0] * maxW);
+	pix->pos_y = (short) (mb_pos[1] * maxH);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Lookup of the upper neighbour for the fixed offset xN = 0, yN = -1
+*    with a caller-supplied macroblock size. Asserts that maxW is
+*    non-zero.
+*
+*    When the upper macroblock is available, writes x, y, pos_x and
+*    pos_y; otherwise only mb_addr and available are written.
+************************************************************************
+*/
+void getNonAffNeighbour0N(const Macroblock *currMB, const int mb_size[2], PixelPos *pix) // xN = 0, yN = -1
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int maxW = mb_size[0];
+	const int maxH = mb_size[1];
+	const int *mb_pos;
+
+	assert(maxW != 0);
+
+	pix->available = currMB->mb_avail_up;
+	pix->mb_addr   = currMB->mb_addr_up;
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = 0;
+	pix->y     = (short) (-1 & (maxH - 1));
+	pix->pos_x = (short) (mb_pos[0] * maxW);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * maxH);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 lookup of the upper neighbour for the fixed offset
+*    xN = 0, yN = -1.
+*
+*    When the upper macroblock is available, writes x, y, pos_x and
+*    pos_y; otherwise only mb_addr and available are written.
+************************************************************************
+*/
+void getNonAffNeighbour0NLuma(const Macroblock *currMB, PixelPos *pix) // xN = 0, yN = -1
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	pix->available = currMB->mb_avail_up;
+	pix->mb_addr   = currMB->mb_addr_up;
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = 0;
+	pix->y     = (short) (-1 & (16 - 1));
+	pix->pos_x = (short) (mb_pos[0] * 16);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * 16);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Neighbour lookup for xN = -1 and an arbitrary yN, with a
+*    caller-supplied macroblock size.
+*
+*    yN < 0 selects the upper-left macroblock, 0 <= yN < maxH the left
+*    one; yN >= maxH is unavailable and leaves pix->mb_addr untouched.
+*    x, y, pos_x and pos_y are written when the neighbour is available,
+*    or when DeblockCall is set and pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbourNX(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix) // xN = -1, yN full range
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int mb_w = mb_size[0];
+	const int mb_h = mb_size[1];
+	const int *mb_pos;
+
+	if (yN < 0)
+	{
+		pix->mb_addr   = currMB->mb_addr_upper_left;
+		pix->available = currMB->mb_avail_upper_left;
+	}
+	else if (yN >= mb_h)
+	{
+		pix->available = FALSE;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (-1 & (mb_w - 1));
+	pix->y     = (short) (yN & (mb_h - 1));
+	pix->pos_x = (short) (pix->x + mb_pos[0] * mb_w);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * mb_h);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup for xN = -1 and an arbitrary yN.
+*
+*    yN < 0 selects the upper-left macroblock, 0 <= yN < 16 the left
+*    one; yN >= 16 is unavailable and leaves pix->mb_addr untouched.
+*    x, y, pos_x and pos_y are written when the neighbour is available,
+*    or when DeblockCall is set and pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbourNXLuma(const Macroblock *currMB, int yN, PixelPos *pix) // xN = -1, yN full range
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	if (yN >= 16)
+	{
+		pix->available = FALSE;
+	}
+	else if (yN >= 0)
+	{
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_upper_left;
+		pix->available = currMB->mb_avail_upper_left;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = (short) (-1 & (16 - 1));
+	pix->y     = (short) (yN & (16 - 1));
+	pix->pos_x = (short) (pix->x + mb_pos[0] * 16);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * 16);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Lookup of the left neighbour for xN < 0 and yN >= 0, with a
+*    caller-supplied macroblock size.
+*
+*    yN >= maxH is unavailable (pix->mb_addr untouched). Otherwise the
+*    left macroblock is selected and, when it is available, pos_x /
+*    pos_y are written (yN is added unmasked). pix->x and pix->y are
+*    never written.
+************************************************************************
+*/
+void getNonAffNeighbourNP(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix) // xN < 0, yN >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int mb_w = mb_size[0];
+	const int mb_h = mb_size[1];
+	const int *mb_pos;
+
+	if (yN >= mb_h)
+	{
+		pix->available = FALSE;
+		return;
+	}
+
+	pix->mb_addr   = currMB->mb_addr_left;
+	pix->available = currMB->mb_avail_left;
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->pos_x = (short) ((-1 & (mb_w - 1)) + mb_pos[0] * mb_w);
+	pix->pos_y = (short) (yN + mb_pos[1] * mb_h);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Lookup of the left neighbour for xN < 0 and yN >= 0 without any
+*    bounds check on yN, with a caller-supplied macroblock size.
+*
+*    Always selects the left macroblock; when it is available, pos_x /
+*    pos_y are written (yN is added unmasked). pix->x and pix->y are
+*    never written.
+************************************************************************
+*/
+void getNonAffNeighbourNPChromaNB(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix) // xN < 0, yN >= 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int mb_w = mb_size[0];
+	const int mb_h = mb_size[1];
+	const int *mb_pos;
+
+	pix->available = currMB->mb_avail_left;
+	pix->mb_addr   = currMB->mb_addr_left;
+	if (!pix->available)
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->pos_x = (short) ((-1 & (mb_w - 1)) + mb_pos[0] * mb_w);
+	pix->pos_y = (short) (yN + mb_pos[1] * mb_h);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Neighbour lookup for xN = 0 and an arbitrary yN, with a
+*    caller-supplied macroblock size.
+*
+*    With maxW > 0: yN < 0 selects the upper macroblock, 0 <= yN < maxH
+*    the current one. With maxW <= 0 (column 0 is then not inside the
+*    macroblock): yN < 0 selects the upper-right macroblock. Every other
+*    case is unavailable and leaves pix->mb_addr untouched.
+*    x, y, pos_x and pos_y are written when the neighbour is available,
+*    or when DeblockCall is set and pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbour0X(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix) // xN is guaranteed to be zero
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int mb_w = mb_size[0];
+	const int mb_h = mb_size[1];
+	const int *mb_pos;
+
+	if (yN < 0)
+	{
+		// row above: upper or upper-right macroblock
+		if (mb_w > 0)
+		{
+			pix->mb_addr   = currMB->mb_addr_up;
+			pix->available = currMB->mb_avail_up;
+		}
+		else
+		{
+			pix->mb_addr   = currMB->mb_addr_upper_right;
+			pix->available = currMB->mb_avail_upper_right;
+		}
+	}
+	else if (mb_w > 0 && yN < mb_h)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->available = FALSE;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = 0;
+	pix->y     = (short) (yN & (mb_h - 1));
+	pix->pos_x = (short) (mb_pos[0] * mb_w);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * mb_h);
+}
+
+/*!
+************************************************************************
+* \brief
+*    16x16 neighbour lookup for xN = 0 and an arbitrary yN.
+*
+*    yN < 0 selects the upper macroblock, 0 <= yN < 16 the current one;
+*    yN >= 16 is unavailable and leaves pix->mb_addr untouched.
+*    x, y, pos_x and pos_y are written when the neighbour is available,
+*    or when DeblockCall is set and pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbour0XLuma(const Macroblock *currMB, int yN, PixelPos *pix) // xN is guaranteed to be zero
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int *mb_pos;
+
+	if (yN >= 16)
+	{
+		pix->available = FALSE;
+	}
+	else if (yN >= 0)
+	{
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+	}
+	else
+	{
+		pix->mb_addr   = currMB->mb_addr_up;
+		pix->available = currMB->mb_avail_up;
+	}
+
+	if (!pix->available && !(p_Vid->DeblockCall && pix->mb_addr))
+		return;
+
+	mb_pos = &p_Vid->PicPos[ pix->mb_addr ][0];
+	pix->x     = 0;
+	pix->y     = (short) (yN & 15);
+	pix->pos_x = (short) (mb_pos[0] * 16);
+	pix->pos_y = (short) (pix->y + mb_pos[1] * 16);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Neighbour lookup for yN = 0 and an arbitrary xN, with a
+*    caller-supplied macroblock size.
+*
+*    With maxH > 0: xN < 0 selects the left macroblock, 0 <= xN < maxW
+*    the current one. Every other case (including maxH <= 0) is
+*    unavailable and leaves pix->mb_addr untouched.
+*    x, y, pos_x and pos_y are written when the neighbour is available,
+*    or when DeblockCall is set and pix->mb_addr is non-zero.
+************************************************************************
+*/
+void getNonAffNeighbourX0(const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix) // xN is full range, yN is 0
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int maxW = mb_size[0], maxH = mb_size[1];
+
+	if (xN < 0)
+	{
+		// row 0 only lies inside the macroblock when its height is positive
+		if (0 < maxH)
+		{
+			pix->mb_addr  = currMB->mb_addr_left;
+			pix->available = currMB->mb_avail_left;
+		}
+		else
+		{
+			pix->available = FALSE;
+		}
+	}
+	else if (xN < maxW)
+	{
+		if (0 < maxH)
+		{
+			pix->mb_addr   = currMB->mbAddrX;
+			pix->available = TRUE;
+		}
+		else
+		{
+			pix->available = FALSE;
+		}
+	}
+	else
+	{
+		pix->available = FALSE;
+	}
+
+	// p_Vid is dereferenced right here, so no separate NULL test on it is meaningful
+	if (pix->available || (p_Vid->DeblockCall && pix->mb_addr))
+	{
+		const int *CurPos = &p_Vid->PicPos[ pix->mb_addr ][0];
+		pix->x     = (short) (xN & (maxW - 1));
+		pix->pos_x = (short) (pix->x + *(CurPos++) * maxW);
+		pix->y     = 0;
+		pix->pos_y = (short) (*CurPos * maxH);
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    get neighboring positions for aff coding
+* \param currMB
+*   current macroblock
+* \param xN
+*    input x position
+* \param yN
+*    input y position
+* \param mb_size
+*    Macroblock size in pixel (according to luma or chroma MB access)
+* \param pix
+*    returns position informations
+************************************************************************
+*/
+void getAffNeighbour(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int maxW, maxH;
+	int yM = -1;
+
+	maxW = mb_size[0];
+	maxH = mb_size[1];
+
+	// initialize to "not available"
+	pix->available = FALSE;
+
+	if(yN > (maxH - 1))
+	{
+		return;
+	}
+	if (xN > (maxW - 1) && yN >= 0 && yN < maxH)
+	{
+		return;
+	}
+
+	if (xN < 0)
+	{
+		if (yN < 0)
+		{
+			if(!currMB->mb_field)
+			{
+				// frame
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top
+					pix->mb_addr   = currMB->mb_addr_upper_left  + 1;
+					pix->available = currMB->mb_avail_upper_left;
+					yM = yN;
+				}
+				else
+				{
+					// bottom
+					pix->mb_addr   = currMB->mb_addr_left;
+					pix->available = currMB->mb_avail_left;
+					if (currMB->mb_avail_left)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+						{
+							yM = yN;
+						}
+						else
+						{
+							(pix->mb_addr)++;
+							yM = (yN + maxH) >> 1;
+						}
+					}
+				}
+			}
+			else
+			{
+				// field
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top
+					pix->mb_addr   = currMB->mb_addr_upper_left;
+					pix->available = currMB->mb_avail_upper_left;
+					if (currMB->mb_avail_upper_left)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_upper_left].mb_field)
+						{
+							(pix->mb_addr)++;
+							yM = 2 * yN;
+						}
+						else
+						{
+							yM = yN;
+						}
+					}
+				}
+				else
+				{
+					// bottom
+					pix->mb_addr   = currMB->mb_addr_upper_left+1;
+					pix->available = currMB->mb_avail_upper_left;
+					yM = yN;
+				}
+			}
+		}
+		else
+		{ // xN < 0 && yN >= 0
+			if (yN <maxH)
+			{
+				if (!currMB->mb_field)
+				{
+					// frame
+					if ((currMB->mbAddrX & 0x01) == 0)
+					{
+						// top
+						pix->mb_addr   = currMB->mb_addr_left;
+						pix->available = currMB->mb_avail_left;
+						if (currMB->mb_avail_left)
+						{
+							if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+							{
+								yM = yN;
+							}
+							else
+							{
+								(pix->mb_addr)+= ((yN & 0x01) != 0);
+								yM = yN >> 1;
+							}
+						}
+					}
+					else
+					{
+						// bottom
+						pix->mb_addr   = currMB->mb_addr_left;
+						pix->available = currMB->mb_avail_left;
+						if (currMB->mb_avail_left)
+						{
+							if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+							{
+								(pix->mb_addr)++;
+								yM = yN;
+							}
+							else
+							{
+								(pix->mb_addr)+= ((yN & 0x01) != 0);
+								yM = (yN + maxH) >> 1;
+							}
+						}
+					}
+				}
+				else
+				{
+					// field
+					if ((currMB->mbAddrX & 0x01) == 0)
+					{
+						// top
+						pix->mb_addr  = currMB->mb_addr_left;
+						pix->available = currMB->mb_avail_left;
+						if (currMB->mb_avail_left)
+						{
+							if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+							{
+								if (yN < (maxH >> 1))
+								{
+									yM = yN << 1;
+								}
+								else
+								{
+									(pix->mb_addr)++;
+									yM = (yN << 1 ) - maxH;
+								}
+							}
+							else
+							{
+								yM = yN;
+							}
+						}
+					}
+					else
+					{
+						// bottom
+						pix->mb_addr  = currMB->mb_addr_left;
+						pix->available = currMB->mb_avail_left;
+						if (currMB->mb_avail_left)
+						{
+							if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+							{
+								if (yN < (maxH >> 1))
+								{
+									yM = (yN << 1) + 1;
+								}
+								else
+								{
+									(pix->mb_addr)++;
+									yM = (yN << 1 ) + 1 - maxH;
+								}
+							}
+							else
+							{
+								(pix->mb_addr)++;
+								yM = yN;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	else
+	{ // xN >= 0
+		if (xN >= 0 && xN < maxW)
+		{
+			if (yN<0)
+			{
+				if (!currMB->mb_field)
+				{
+					//frame
+					if ((currMB->mbAddrX & 0x01) == 0)
+					{
+						//top
+						pix->mb_addr  = currMB->mb_addr_up;
+						// for the deblocker if the current MB is a frame and the one above is a field
+						// then the neighbor is the top MB of the pair
+						if (currMB->mb_avail_up)
+						{
+							if (!(p_Vid->DeblockCall == 1 && (p_Vid->mb_data[currMB->mb_addr_up]).mb_field))
+								pix->mb_addr  += 1;
+						}
+
+						pix->available = currMB->mb_avail_up;
+						yM = yN;
+					}
+					else
+					{
+						// bottom
+						pix->mb_addr   = currMB->mbAddrX - 1;
+						pix->available = TRUE;
+						yM = yN;
+					}
+				}
+				else
+				{
+					// field
+					if ((currMB->mbAddrX & 0x01) == 0)
+					{
+						// top
+						pix->mb_addr   = currMB->mb_addr_up;
+						pix->available = currMB->mb_avail_up;
+						if (currMB->mb_avail_up)
+						{
+							if(!p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+							{
+								(pix->mb_addr)++;
+								yM = 2* yN;
+							}
+							else
+							{
+								yM = yN;
+							}
+						}
+					}
+					else
+					{
+						// bottom
+						pix->mb_addr   = currMB->mb_addr_up + 1;
+						pix->available = currMB->mb_avail_up;
+						yM = yN;
+					}
+				}
+			}
+			else
+			{
+				// yN >=0
+				// for the deblocker if this is the extra edge then do this special stuff
+				if (yN == 0 && p_Vid->DeblockCall == 2)
+				{
+					pix->mb_addr  = currMB->mb_addr_up + 1;
+					pix->available = TRUE;
+					yM = yN - 1;
+				}
+
+				else if ((yN <maxH))
+				{
+					pix->mb_addr   = currMB->mbAddrX;
+					pix->available = TRUE;
+					yM = yN;
+				}
+			}
+		}
+		else
+		{ // xN >= maxW
+			if(yN < 0)
+			{
+				if (!currMB->mb_field)
+				{
+					// frame
+					if ((currMB->mbAddrX & 0x01) == 0)
+					{
+						// top
+						pix->mb_addr  = currMB->mb_addr_upper_right + 1;
+						pix->available = currMB->mb_avail_upper_right;
+						yM = yN;
+					}
+					else
+					{
+						// bottom
+						pix->available = FALSE;
+					}
+				}
+				else
+				{
+					// field
+					if ((currMB->mbAddrX & 0x01) == 0)
+					{
+						// top
+						pix->mb_addr   = currMB->mb_addr_upper_right;
+						pix->available = currMB->mb_avail_upper_right;
+						if (currMB->mb_avail_upper_right)
+						{
+							if(!p_Vid->mb_data[currMB->mb_addr_upper_right].mb_field)
+							{
+								(pix->mb_addr)++;
+								yM = 2* yN;
+							}
+							else
+							{
+								yM = yN;
+							}
+						}
+					}
+					else
+					{
+						// bottom
+						pix->mb_addr   = currMB->mb_addr_upper_right + 1;
+						pix->available = currMB->mb_avail_upper_right;
+						yM = yN;
+					}
+				}
+			}
+		}
+	}
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) (xN & (maxW - 1));
+		pix->y = (short) (yM & (maxH - 1));
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = pix->pos_x + pix->x;
+		pix->pos_y = pix->pos_y + pix->y;
+	}
+}
+
+// MBAFF neighbour lookup for the sample column directly left of the current
+// macroblock (x is fixed at -1) at row yN, for a block of
+// mb_size[0] x mb_size[1] samples.
+//
+//   currMB  - macroblock whose neighbour is requested
+//   yN      - row relative to the top of currMB; rows >= mb_size[1] report
+//             "not available", negative rows select the upper-left corner case
+//   mb_size - { width, height } of the block; both must be positive (the
+//             `& (size - 1)` wrap below presumably expects powers of two -
+//             TODO confirm against callers)
+//   pix     - result: available, mb_addr, x, y, pos_x, pos_y
+//
+// Because x is a constant, only the "x < 0" half of the general case analysis
+// can ever execute; the unreachable "x >= 0" branches have been removed.
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourNX(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int maxW = mb_size[0];
+	const int maxH = mb_size[1];
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	// initialize to "not available"
+	pix->available = FALSE;
+
+	// rows below the block are never available
+	if (yN > (maxH - 1))
+	{
+		return;
+	}
+
+	if (yN < 0)
+	{
+		// upper-left corner
+		if (!curIsField && curIsBottom)
+		{
+			// frame MB, bottom of its pair: taken from the left pair
+			pix->mb_addr   = currMB->mb_addr_left;
+			pix->available = currMB->mb_avail_left;
+			if (currMB->mb_avail_left)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+				{
+					(pix->mb_addr)++;
+					yM = (yN + maxH) >> 1;
+				}
+				else
+				{
+					yM = yN;
+				}
+			}
+		}
+		else if (curIsField && !curIsBottom)
+		{
+			// field MB, top of its pair
+			pix->mb_addr   = currMB->mb_addr_upper_left;
+			pix->available = currMB->mb_avail_upper_left;
+			if (currMB->mb_avail_upper_left)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_upper_left].mb_field)
+				{
+					yM = yN;
+				}
+				else
+				{
+					(pix->mb_addr)++;
+					yM = 2 * yN;
+				}
+			}
+		}
+		else
+		{
+			// frame/top and field/bottom both use the second MB of the upper-left pair
+			pix->mb_addr   = currMB->mb_addr_upper_left + 1;
+			pix->available = currMB->mb_avail_upper_left;
+			yM = yN;
+		}
+	}
+	else
+	{
+		// left column, 0 <= yN < maxH
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+		if (currMB->mb_avail_left)
+		{
+			const int leftIsField = (p_Vid->mb_data[currMB->mb_addr_left].mb_field != 0);
+
+			if (!curIsField)
+			{
+				if (!leftIsField)
+				{
+					// frame next to frame: same row, same position in the pair
+					pix->mb_addr += curIsBottom;
+					yM = yN;
+				}
+				else
+				{
+					// frame next to field: row parity selects the MB of the left pair
+					pix->mb_addr += (yN & 0x01);
+					yM = (curIsBottom ? (yN + maxH) : yN) >> 1;
+				}
+			}
+			else if (!leftIsField)
+			{
+				// field next to frame: rows are doubled, bottom MB uses the odd rows
+				yM = (yN << 1) + curIsBottom;
+				if (yN >= (maxH >> 1))
+				{
+					(pix->mb_addr)++;
+					yM -= maxH;
+				}
+			}
+			else
+			{
+				// field next to field: same row, same position in the pair
+				pix->mb_addr += curIsBottom;
+				yM = yN;
+			}
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) (maxW - 1);	// -1 & (maxW - 1)
+		pix->y = (short) (yM & (maxH - 1));
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = pix->pos_x + pix->x;
+		pix->pos_y = pix->pos_y + pix->y;
+	}
+}
+
+// Luma convenience wrapper: forwards to getAffNeighbourNX with a fixed
+// 16x16 block size.
+void getAffNeighbourNXLuma(const Macroblock *currMB, int yN, PixelPos *pix)
+{
+	static const int luma_mb_size[2] = { 16, 16 };
+
+	getAffNeighbourNX(currMB, yN, luma_mb_size, pix);
+}
+// MBAFF neighbour lookup for the single sample at (x = -1, y = 0), i.e. the
+// sample left of the first row of the current macroblock, for a block of
+// mb_size[0] x mb_size[1] samples.
+//
+//   currMB  - macroblock whose neighbour is requested
+//   mb_size - { width, height } of the block; the height is assumed to be at
+//             least 2 and both are presumably powers of two (the
+//             `& (size - 1)` wrap below relies on it) - TODO confirm
+//   pix     - result: available, mb_addr, x, y, pos_x, pos_y
+//
+// Both coordinates are constants, so only the "left column" branch of the
+// general case analysis can ever execute; every other branch was unreachable
+// and has been removed.
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourN0(const Macroblock *currMB, const int mb_size[2], PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int maxW = mb_size[0];
+	const int maxH = mb_size[1];
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	pix->mb_addr   = currMB->mb_addr_left;
+	pix->available = currMB->mb_avail_left;
+
+	if (currMB->mb_avail_left)
+	{
+		const int leftIsField = (p_Vid->mb_data[currMB->mb_addr_left].mb_field != 0);
+
+		if (!curIsField)
+		{
+			if (!leftIsField)
+			{
+				// frame next to frame: row 0 of the MB at the same position in the pair
+				pix->mb_addr += curIsBottom;
+				yM = 0;
+			}
+			else
+			{
+				// frame next to field: row 0 is even, so the first MB of the left pair is used
+				yM = curIsBottom ? (maxH >> 1) : 0;
+			}
+		}
+		else if (!leftIsField)
+		{
+			// field next to frame: top field starts on row 0, bottom field on row 1
+			yM = curIsBottom;
+		}
+		else
+		{
+			// field next to field: row 0 of the MB at the same position in the pair
+			pix->mb_addr += curIsBottom;
+			yM = 0;
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) (maxW - 1);	// -1 & (maxW - 1)
+		pix->y = (short) (yM & (maxH - 1));
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = pix->pos_x + pix->x;
+		pix->pos_y = pix->pos_y + pix->y;
+	}
+}
+
+
+// General MBAFF luma neighbour lookup for the sample at (xN, yN) relative to
+// the top-left corner of the 16x16 macroblock currMB.
+//
+//   currMB - macroblock whose neighbour is requested
+//   xN, yN - sample coordinates relative to currMB; negative values address
+//            the left / upper neighbours, xN > 15 the upper-right one
+//   pix    - result: available, mb_addr, x, y, pos_x, pos_y
+//
+// Samples below the macroblock (yN > 15) and samples to its right that are
+// not above it (xN > 15, yN >= 0) are reported as not available.
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	// start out as "not available"
+	pix->available = FALSE;
+
+	if (yN > 15 || (xN > 15 && yN >= 0))
+	{
+		return;
+	}
+
+	if (xN < 0 && yN >= 0)
+	{
+		// ---- left column ----
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+		if (currMB->mb_avail_left)
+		{
+			const int leftIsField = (p_Vid->mb_data[currMB->mb_addr_left].mb_field != 0);
+
+			if (!curIsField)
+			{
+				if (!leftIsField)
+				{
+					// frame next to frame
+					pix->mb_addr += curIsBottom;
+					yM = yN;
+				}
+				else
+				{
+					// frame next to field: row parity selects the MB of the left pair
+					pix->mb_addr += (yN & 0x01);
+					yM = (curIsBottom ? (yN + 16) : yN) >> 1;
+				}
+			}
+			else if (!leftIsField)
+			{
+				// field next to frame: rows are doubled, bottom MB uses the odd rows
+				yM = (yN << 1) + curIsBottom;
+				if (yN >= 8)
+				{
+					(pix->mb_addr)++;
+					yM -= 16;
+				}
+			}
+			else
+			{
+				// field next to field
+				pix->mb_addr += curIsBottom;
+				yM = yN;
+			}
+		}
+	}
+	else if (xN < 0)
+	{
+		// ---- upper-left corner (yN < 0) ----
+		if (!curIsField && curIsBottom)
+		{
+			// frame MB, bottom of its pair: taken from the left pair
+			pix->mb_addr   = currMB->mb_addr_left;
+			pix->available = currMB->mb_avail_left;
+			if (currMB->mb_avail_left)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+				{
+					(pix->mb_addr)++;
+					yM = (yN + 16) >> 1;
+				}
+				else
+				{
+					yM = yN;
+				}
+			}
+		}
+		else if (curIsField && !curIsBottom)
+		{
+			// field MB, top of its pair
+			pix->mb_addr   = currMB->mb_addr_upper_left;
+			pix->available = currMB->mb_avail_upper_left;
+			if (currMB->mb_avail_upper_left)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_upper_left].mb_field)
+				{
+					yM = yN;
+				}
+				else
+				{
+					(pix->mb_addr)++;
+					yM = 2 * yN;
+				}
+			}
+		}
+		else
+		{
+			// frame/top and field/bottom
+			pix->mb_addr   = currMB->mb_addr_upper_left + 1;
+			pix->available = currMB->mb_avail_upper_left;
+			yM = yN;
+		}
+	}
+	else if (xN < 16 && yN < 0)
+	{
+		// ---- row(s) above ----
+		if (!curIsField && curIsBottom)
+		{
+			// frame MB, bottom of its pair: the MB right before it in the same pair
+			pix->mb_addr   = currMB->mbAddrX - 1;
+			pix->available = TRUE;
+			yM = yN;
+		}
+		else if (!curIsField)
+		{
+			// frame MB, top of its pair
+			pix->mb_addr = currMB->mb_addr_up;
+			// for the deblocker (DeblockCall == 1) with a field pair above, the
+			// neighbour stays the first MB of that pair; otherwise the second
+			if (currMB->mb_avail_up)
+			{
+				if (p_Vid->DeblockCall != 1 || !(p_Vid->mb_data[currMB->mb_addr_up]).mb_field)
+					pix->mb_addr += 1;
+			}
+			pix->available = currMB->mb_avail_up;
+			yM = yN;
+		}
+		else if (!curIsBottom)
+		{
+			// field MB, top of its pair
+			pix->mb_addr   = currMB->mb_addr_up;
+			pix->available = currMB->mb_avail_up;
+			if (currMB->mb_avail_up)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+				{
+					yM = yN;
+				}
+				else
+				{
+					(pix->mb_addr)++;
+					yM = 2 * yN;
+				}
+			}
+		}
+		else
+		{
+			// field MB, bottom of its pair
+			pix->mb_addr   = currMB->mb_addr_up + 1;
+			pix->available = currMB->mb_avail_up;
+			yM = yN;
+		}
+	}
+	else if (xN < 16)
+	{
+		// ---- inside the current MB (0 <= yN <= 15) ----
+		pix->available = TRUE;
+		if (yN == 0 && p_Vid->DeblockCall == 2)
+		{
+			// deblocker "extra edge": redirect to the pair above
+			pix->mb_addr = currMB->mb_addr_up + 1;
+			yM = yN - 1;
+		}
+		else
+		{
+			pix->mb_addr = currMB->mbAddrX;
+			yM = yN;
+		}
+	}
+	else
+	{
+		// ---- upper-right corner (xN >= 16, yN < 0) ----
+		if (!curIsField && curIsBottom)
+		{
+			// frame MB, bottom of its pair: never available
+			pix->available = FALSE;
+		}
+		else if (curIsField && !curIsBottom)
+		{
+			// field MB, top of its pair
+			pix->mb_addr   = currMB->mb_addr_upper_right;
+			pix->available = currMB->mb_avail_upper_right;
+			if (currMB->mb_avail_upper_right)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_upper_right].mb_field)
+				{
+					yM = yN;
+				}
+				else
+				{
+					(pix->mb_addr)++;
+					yM = 2 * yN;
+				}
+			}
+		}
+		else
+		{
+			// frame/top and field/bottom
+			pix->mb_addr   = currMB->mb_addr_upper_right + 1;
+			pix->available = currMB->mb_avail_upper_right;
+			yM = yN;
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) (xN & 15);
+		pix->y = (short) (yM & 15);
+		get_mb_block_pos_mbaff(p_Vid->PicPos, pix->mb_addr, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = 16*pix->pos_x + pix->x;
+		pix->pos_y = 16*pix->pos_y + pix->y;
+	}
+}
+
+
+// MBAFF luma neighbour lookup for callers that guarantee 0 <= xN < 16 and
+// yN < 16 (no bounds checks are performed here).
+//
+//   currMB - macroblock whose neighbour is requested
+//   xN     - column inside the macroblock, stored to pix->x unchanged
+//   yN     - row; negative values address the pair above
+//   pix    - result: available, mb_addr, x, y, pos_x, pos_y
+//
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourPXLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	// start out as "not available"
+	pix->available = FALSE;
+
+	if (yN >= 0)
+	{
+		pix->available = TRUE;
+		if (yN == 0 && p_Vid->DeblockCall == 2)
+		{
+			// deblocker "extra edge": redirect to the pair above
+			pix->mb_addr = currMB->mb_addr_up + 1;
+			yM = yN - 1;
+		}
+		else
+		{
+			// sample lies inside the current MB
+			pix->mb_addr = currMB->mbAddrX;
+			yM = yN;
+		}
+	}
+	else if (!curIsField && curIsBottom)
+	{
+		// frame MB, bottom of its pair: the MB right before it in the same pair
+		pix->mb_addr   = currMB->mbAddrX - 1;
+		pix->available = TRUE;
+		yM = yN;
+	}
+	else if (!curIsField)
+	{
+		// frame MB, top of its pair
+		pix->mb_addr = currMB->mb_addr_up;
+		// for the deblocker (DeblockCall == 1) with a field pair above, the
+		// neighbour stays the first MB of that pair; otherwise the second
+		if (currMB->mb_avail_up)
+		{
+			if (p_Vid->DeblockCall != 1 || !(p_Vid->mb_data[currMB->mb_addr_up]).mb_field)
+				pix->mb_addr += 1;
+		}
+		pix->available = currMB->mb_avail_up;
+		yM = yN;
+	}
+	else
+	{
+		// field MB
+		pix->available = currMB->mb_avail_up;
+		if (curIsBottom)
+		{
+			pix->mb_addr = currMB->mb_addr_up + 1;
+			yM = yN;
+		}
+		else
+		{
+			pix->mb_addr = currMB->mb_addr_up;
+			if (currMB->mb_avail_up)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+				{
+					yM = yN;
+				}
+				else
+				{
+					(pix->mb_addr)++;
+					yM = 2 * yN;
+				}
+			}
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) (xN);
+		pix->y = (short) (yM & 15);
+		get_mb_block_pos_mbaff(p_Vid->PicPos, pix->mb_addr, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = 16*pix->pos_x + pix->x;
+		pix->pos_y = 16*pix->pos_y + pix->y;
+	}
+}
+
+// Reduced MBAFF luma neighbour lookup: determines only pix->available,
+// pix->mb_addr and pix->y. pix->x and the pos_x/pos_y fields are not written.
+//
+// Intended (per the original note) for 0 <= xN < 16, yN < 16 and
+// DeblockCall == 0; p_Vid->DeblockCall is never consulted here.
+//
+//   currMB - macroblock whose neighbour is requested
+//   yN     - row; negative values address the pair above
+//   pix    - result: available, mb_addr and (only when available) y
+void getAffNeighbourPXLumaNB_NoPos(const Macroblock *currMB, int yN, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	// start out as "not available"
+	pix->available = FALSE;
+
+	if (yN >= 0)
+	{
+		// sample lies inside the current MB
+		pix->mb_addr   = currMB->mbAddrX;
+		pix->available = TRUE;
+		yM = yN;
+	}
+	else if (!curIsField && curIsBottom)
+	{
+		// frame MB, bottom of its pair: the MB right before it in the same pair
+		pix->mb_addr   = currMB->mbAddrX - 1;
+		pix->available = TRUE;
+		yM = yN;
+	}
+	else if (!curIsField)
+	{
+		// frame MB, top of its pair: second MB of the pair above when that pair exists
+		pix->mb_addr = currMB->mb_addr_up;
+		if (currMB->mb_avail_up)
+		{
+			pix->mb_addr += 1;
+		}
+		pix->available = currMB->mb_avail_up;
+		yM = yN;
+	}
+	else
+	{
+		// field MB
+		pix->available = currMB->mb_avail_up;
+		if (curIsBottom)
+		{
+			pix->mb_addr = currMB->mb_addr_up + 1;
+			yM = yN;
+		}
+		else
+		{
+			pix->mb_addr = currMB->mb_addr_up;
+			if (currMB->mb_avail_up)
+			{
+				if (p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+				{
+					yM = yN;
+				}
+				else
+				{
+					(pix->mb_addr)++;
+					yM = 2 * yN;
+				}
+			}
+		}
+	}
+
+	if (pix->available)
+	{
+		pix->y = (short) (yM & 15);
+	}
+}
+
+
+// MBAFF luma neighbour lookup for callers that guarantee yN >= 0 (the
+// "above" cases are not handled here).
+//
+//   currMB - macroblock whose neighbour is requested
+//   xN     - column relative to currMB; negative values address the left pair,
+//            values > 15 are reported as not available
+//   yN     - row relative to currMB; values > 15 are reported as not available
+//   pix    - result: available, mb_addr, x, y, pos_x, pos_y
+//
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourXPLuma(const Macroblock *currMB, int xN, int yN, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	// start out as "not available"
+	pix->available = FALSE;
+
+	if (yN > 15 || xN > 15)
+	{
+		return;
+	}
+
+	if (xN < 0)
+	{
+		// ---- left column ----
+		pix->mb_addr   = currMB->mb_addr_left;
+		pix->available = currMB->mb_avail_left;
+		if (currMB->mb_avail_left)
+		{
+			const int leftIsField = (p_Vid->mb_data[currMB->mb_addr_left].mb_field != 0);
+
+			if (!curIsField)
+			{
+				if (!leftIsField)
+				{
+					// frame next to frame
+					pix->mb_addr += curIsBottom;
+					yM = yN;
+				}
+				else
+				{
+					// frame next to field: row parity selects the MB of the left pair
+					pix->mb_addr += (yN & 0x01);
+					yM = (curIsBottom ? (yN + 16) : yN) >> 1;
+				}
+			}
+			else if (!leftIsField)
+			{
+				// field next to frame: rows are doubled, bottom MB uses the odd rows
+				yM = (yN << 1) + curIsBottom;
+				if (yN >= 8)
+				{
+					(pix->mb_addr)++;
+					yM -= 16;
+				}
+			}
+			else
+			{
+				// field next to field
+				pix->mb_addr += curIsBottom;
+				yM = yN;
+			}
+		}
+	}
+	else
+	{
+		// ---- 0 <= xN <= 15 ----
+		pix->available = TRUE;
+		if (yN == 0 && p_Vid->DeblockCall == 2)
+		{
+			// deblocker "extra edge": redirect to the pair above
+			pix->mb_addr = currMB->mb_addr_up + 1;
+			yM = yN - 1;
+		}
+		else
+		{
+			// sample lies inside the current MB
+			pix->mb_addr = currMB->mbAddrX;
+			yM = yN;
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) (xN & 15);
+		pix->y = (short) (yM & 15);
+		get_mb_block_pos_mbaff(p_Vid->PicPos, pix->mb_addr, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = 16*pix->pos_x + pix->x;
+		pix->pos_y = 16*pix->pos_y + pix->y;
+	}
+}
+
+
+// Luma lookup for a sample the caller already knows to be inside the current
+// macroblock (xN >= 0, yN >= 0; no bounds checks): always reports currMB
+// itself as available and fills in the position fields.
+//
+//   currMB - macroblock containing the sample
+//   xN, yN - sample coordinates; only their low four bits are used
+//   pix    - result: available, mb_addr, x, y, pos_x, pos_y
+void getAffNeighbourPPLumaNB(const Macroblock *currMB, int xN, int yN, PixelPos *pix)
+{ 
+	pix->available = TRUE;
+	pix->mb_addr   = currMB->mbAddrX;
+	pix->x = (short) (xN & 0x0F);
+	pix->y = (short) (yN & 0x0F);
+
+	// block position scaled by the 16-sample macroblock size, plus the in-MB offset
+	get_mb_block_pos_mbaff(currMB->p_Vid->PicPos, pix->mb_addr, &(pix->pos_x), &(pix->pos_y));
+	pix->pos_x = 16*pix->pos_x + pix->x;
+	pix->pos_y = 16*pix->pos_y + pix->y;
+}
+
+// MBAFF luma neighbour lookup for the sample column directly left of the
+// current macroblock (x = -1), for callers that guarantee yN >= 0.
+//
+//   currMB - macroblock whose neighbour is requested
+//   yN     - row relative to currMB; values > 15 are reported as not available
+//   pix    - result: available, mb_addr, x (always 15), y, pos_x, pos_y
+//
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourNPLuma(const Macroblock *currMB, int yN, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	const int curIsField  = (currMB->mb_field != 0);
+	const int curIsBottom = (currMB->mbAddrX & 0x01);	// odd address: bottom MB of the pair
+	int yM = -1;
+
+	// start out as "not available"
+	pix->available = FALSE;
+
+	if (yN > 15)
+	{
+		return;
+	}
+
+	pix->mb_addr   = currMB->mb_addr_left;
+	pix->available = currMB->mb_avail_left;
+	if (currMB->mb_avail_left)
+	{
+		const int leftIsField = (p_Vid->mb_data[currMB->mb_addr_left].mb_field != 0);
+
+		if (!curIsField)
+		{
+			if (!leftIsField)
+			{
+				// frame next to frame
+				pix->mb_addr += curIsBottom;
+				yM = yN;
+			}
+			else
+			{
+				// frame next to field: row parity selects the MB of the left pair
+				pix->mb_addr += (yN & 0x01);
+				yM = (curIsBottom ? (yN + 16) : yN) >> 1;
+			}
+		}
+		else if (!leftIsField)
+		{
+			// field next to frame: rows are doubled, bottom MB uses the odd rows
+			yM = (yN << 1) + curIsBottom;
+			if (yN >= 8)
+			{
+				(pix->mb_addr)++;
+				yM -= 16;
+			}
+		}
+		else
+		{
+			// field next to field
+			pix->mb_addr += curIsBottom;
+			yM = yN;
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) 15;	// -1 & 15
+		pix->y = (short) (yM & 15);
+		get_mb_block_pos_mbaff(p_Vid->PicPos, pix->mb_addr, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = 16*pix->pos_x + pix->x;
+		pix->pos_y = 16*pix->pos_y + pix->y;
+	}
+}
+
+// MBAFF luma neighbour lookup for the single sample at (x = -1, y = 0), i.e.
+// the sample left of the first row of the current 16x16 macroblock.
+//
+//   currMB - macroblock whose neighbour is requested
+//   pix    - result: available, mb_addr, x (always 15), y, pos_x, pos_y
+//
+// When p_Vid->DeblockCall is non-zero the position fields are filled in even
+// if the neighbour is not available.
+void getAffNeighbourN0Luma(const Macroblock *currMB, PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int yM = -1;
+
+	// every case starts from the left pair
+	pix->mb_addr   = currMB->mb_addr_left;
+	pix->available = currMB->mb_avail_left;
+
+	if (currMB->mb_avail_left)
+	{
+		if ((currMB->mbAddrX & 0x01) == 0)
+		{
+			// top MB of the pair (frame or field): first row of the first left MB
+			yM = 0;
+		}
+		else if (!currMB->mb_field)
+		{
+			// frame MB, bottom of its pair
+			if (p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+			{
+				yM = 8;
+			}
+			else
+			{
+				(pix->mb_addr)++;
+				yM = 0;
+			}
+		}
+		else
+		{
+			// field MB, bottom of its pair
+			if (p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+			{
+				(pix->mb_addr)++;
+				yM = 0;
+			}
+			else
+			{
+				yM = 1;
+			}
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		pix->x = (short) 15;	// -1 & 15
+		pix->y = (short) (yM & 15);
+		get_mb_block_pos_mbaff(p_Vid->PicPos, pix->mb_addr, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = 16*pix->pos_x + pix->x;
+		pix->pos_y = 16*pix->pos_y + pix->y;
+	}
+}
+/*!
+************************************************************************
+* \brief
+*    MBAFF neighbour lookup for a sample at horizontal offset xN in row 0.
+*    Every place the sibling routines use yN carries a literal 0 here, so
+*    this looks like the general derivation with yN fixed to 0
+*    (NOTE(review): confirm against the generic getAffNeighbour).
+* \param currMB
+*    current macroblock
+* \param xN
+*    horizontal offset relative to the current macroblock; negative values
+*    select the left macroblock pair, values above mb_size[0] - 1 leave
+*    pix->available at FALSE
+* \param mb_size
+*    block width [0] and height [1]; both are used as "& (size - 1)" masks
+*    below, so presumably powers of two (TODO confirm with callers)
+* \param pix
+*    output: available, mb_addr, x, y, pos_x, pos_y
+************************************************************************
+*/
+void getAffNeighbourX0(const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int maxW, maxH;
+	int yM = -1; // row inside the neighbouring MB; keeps -1 unless a case below sets it
+
+	maxW = mb_size[0];
+	maxH = mb_size[1];
+
+	// initialize to "not available"
+	pix->available = FALSE;
+
+	if(0 > (maxH - 1)) // row 0 does not exist when the height is below 1
+	{
+		return;
+	}
+	if (xN > (maxW - 1) && 0 < maxH) // right of the block: stays "not available"
+	{
+		return;
+	}
+
+	if (xN < 0)
+	{
+		if (0 <maxH) // always true here because of the early return above
+		{
+			if (!currMB->mb_field)
+			{
+				// current MB is frame coded
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top MB of the pair (even address)
+					pix->mb_addr   = currMB->mb_addr_left;
+					pix->available = currMB->mb_avail_left;
+					if (currMB->mb_avail_left)
+					{
+						yM = 0;
+					}
+				}
+				else
+				{
+					// bottom MB of the pair (odd address)
+					pix->mb_addr   = currMB->mb_addr_left;
+					pix->available = currMB->mb_avail_left;
+					if (currMB->mb_avail_left)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+						{
+							(pix->mb_addr)++; // left pair is frame coded: use its second MB
+							yM = 0;
+						}
+						else
+						{
+							yM = (0 + maxH) >> 1; // left pair is field coded: row maxH/2 of its first MB
+						}
+					}
+				}
+			}
+			else
+			{
+				// current MB is field coded
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top MB of the pair (even address)
+					pix->mb_addr  = currMB->mb_addr_left;
+					pix->available = currMB->mb_avail_left;
+					if (currMB->mb_avail_left)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+						{
+							if (0 < (maxH >> 1)) // true whenever maxH >= 2
+							{
+								yM = 0;
+							}
+							else
+							{
+								(pix->mb_addr)++;
+								yM = (0) - maxH;
+							}
+						}
+						else
+						{
+							yM = 0;
+						}
+					}
+				}
+				else
+				{
+					// bottom MB of the pair (odd address)
+					pix->mb_addr  = currMB->mb_addr_left;
+					pix->available = currMB->mb_avail_left;
+					if (currMB->mb_avail_left)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_left].mb_field)
+						{
+							if (0 < (maxH >> 1)) // true whenever maxH >= 2
+							{
+								yM = 1;
+							}
+							else
+							{
+								(pix->mb_addr)++;
+								yM = 1 - maxH;
+							}
+						}
+						else
+						{
+							(pix->mb_addr)++; // left pair is field coded too: use its second MB
+							yM = 0;
+						}
+					}
+				}
+			}
+		}
+
+	}
+	else
+	{ // xN >= 0
+		if (xN >= 0 && xN < maxW) // xN >= 0 is already implied by this else branch
+		{
+			// row 0 is inside the current MB
+			// for the deblocker if this is the extra edge then do this special stuff
+			if (p_Vid->DeblockCall == 2)
+			{
+				pix->mb_addr  = currMB->mb_addr_up + 1;
+				pix->available = TRUE;
+				yM = 0 - 1; // masked to row maxH - 1 below
+			}
+
+			else if (0 <maxH)
+			{
+				pix->mb_addr   = currMB->mbAddrX; // the sample belongs to the current MB itself
+				pix->available = TRUE;
+				yM = 0;
+			}
+
+		}
+	}
+	if (pix->available || p_Vid->DeblockCall) // coordinates are also filled for an unavailable neighbour when DeblockCall is non-zero
+	{
+		pix->x = (short) (xN & (maxW - 1)); // wrap the offsets into the neighbouring block
+		pix->y = (short) (yM & (maxH - 1));
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_x = pix->pos_x + pix->x; // add the in-block offset to what get_mb_pos returned
+		pix->pos_y = pix->pos_y + pix->y;
+	}
+}
+/*!
+************************************************************************
+* \brief
+*    MBAFF neighbour lookup for a sample in column 0 (xN == 0) at vertical
+*    offset yN. Negative yN selects the macroblock pair above; yN inside
+*    the block selects the current macroblock itself.
+* \param currMB
+*    current macroblock
+* \param yN
+*    vertical offset relative to the current macroblock; values above
+*    mb_size[1] - 1 leave pix->available at FALSE
+* \param mb_size
+*    block width [0] and height [1]; the height is used as an
+*    "& (size - 1)" mask below, so presumably a power of two
+*    (TODO confirm with callers)
+* \param pix
+*    output: available, mb_addr, x (always 0), y, pos_x, pos_y
+************************************************************************
+*/
+void getAffNeighbour0X(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix) // xN == 0, yN full range
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int maxW, maxH;
+	int yM = -1; // row inside the neighbouring MB; keeps -1 unless a case below sets it
+
+	maxW = mb_size[0];
+	maxH = mb_size[1];
+
+	// initialize to "not available"
+	pix->available = FALSE;
+
+	if(yN > (maxH - 1)) // below the block: stays "not available"
+	{
+		return;
+	}
+	if (0 > (maxW - 1) && yN >= 0 && yN < maxH) // can only trigger when the width is below 1
+	{
+		return;
+	}
+
+	if (0 < maxW) // column 0 lies inside the block width
+	{
+		if (yN<0)
+		{
+			if (!currMB->mb_field)
+			{
+				// current MB is frame coded
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top MB of the pair (even address)
+					pix->mb_addr  = currMB->mb_addr_up;
+					// for the deblocker if the current MB is a frame and the one above is a field
+					// then the neighbor is the top MB of the pair
+					if (currMB->mb_avail_up)
+					{
+						if (!(p_Vid->DeblockCall == 1 && (p_Vid->mb_data[currMB->mb_addr_up]).mb_field))
+							pix->mb_addr  += 1; // otherwise use the second MB of the pair above
+					}
+
+					pix->available = currMB->mb_avail_up;
+					yM = yN;
+				}
+				else
+				{
+					// bottom MB of the pair: the neighbour is the MB at mbAddrX - 1
+					pix->mb_addr   = currMB->mbAddrX - 1;
+					pix->available = TRUE;
+					yM = yN;
+				}
+			}
+			else
+			{
+				// current MB is field coded
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top MB of the pair (even address)
+					pix->mb_addr   = currMB->mb_addr_up;
+					pix->available = currMB->mb_avail_up;
+					if (currMB->mb_avail_up)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+						{
+							(pix->mb_addr)++; // pair above is frame coded: use its second MB
+							yM = 2* yN; // and double the row offset
+						}
+						else
+						{
+							yM = yN;
+						}
+					}
+				}
+				else
+				{
+					// bottom MB of the pair: second MB of the pair above
+					pix->mb_addr   = currMB->mb_addr_up + 1;
+					pix->available = currMB->mb_avail_up;
+					yM = yN;
+				}
+			}
+		}
+		else
+		{
+			// yN >=0
+			// for the deblocker if this is the extra edge then do this special stuff
+			if (yN == 0 && p_Vid->DeblockCall == 2)
+			{
+				pix->mb_addr  = currMB->mb_addr_up + 1;
+				pix->available = TRUE;
+				yM = yN - 1; // -1, masked to row maxH - 1 below
+			}
+
+			else if ((yN >= 0) && (yN <maxH))
+			{
+				pix->mb_addr   = currMB->mbAddrX; // the sample belongs to the current MB itself
+				pix->available = TRUE;
+				yM = yN;
+			}
+		}
+	}
+	else
+	{ // labelled "xN >= maxW"; with xN fixed at 0 this is only reached when maxW <= 0
+		if(yN < 0)
+		{
+			if (!currMB->mb_field)
+			{
+				// current MB is frame coded
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top MB of the pair: second MB of the upper-right pair
+					pix->mb_addr  = currMB->mb_addr_upper_right + 1;
+					pix->available = currMB->mb_avail_upper_right;
+					yM = yN;
+				}
+				else
+				{
+					// bottom MB of the pair: no neighbour in this direction
+					pix->available = FALSE;
+				}
+			}
+			else
+			{
+				// current MB is field coded
+				if ((currMB->mbAddrX & 0x01) == 0)
+				{
+					// top MB of the pair (even address)
+					pix->mb_addr   = currMB->mb_addr_upper_right;
+					pix->available = currMB->mb_avail_upper_right;
+					if (currMB->mb_avail_upper_right)
+					{
+						if(!p_Vid->mb_data[currMB->mb_addr_upper_right].mb_field)
+						{
+							(pix->mb_addr)++; // upper-right pair is frame coded: use its second MB
+							yM = 2* yN;
+						}
+						else
+						{
+							yM = yN;
+						}
+					}
+				}
+				else
+				{
+					// bottom MB of the pair: second MB of the upper-right pair
+					pix->mb_addr   = currMB->mb_addr_upper_right + 1;
+					pix->available = currMB->mb_avail_upper_right;
+					yM = yN;
+				}
+			}
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall) // coordinates are also filled for an unavailable neighbour when DeblockCall is non-zero
+	{
+		pix->x = 0;
+		pix->y = (short) (yM & (maxH - 1)); // wrap the row into the neighbouring block
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_y = pix->pos_y + pix->y; // x is 0, so pos_x is left as returned by get_mb_pos
+	}
+}
+/*!
+************************************************************************
+* \brief
+*    Luma variant of the xN == 0 MBAFF neighbour lookup with the block
+*    size fixed at 16x16. Negative yN selects the macroblock pair above;
+*    yN in 0..15 selects the current macroblock itself.
+* \param currMB
+*    current macroblock
+* \param yN
+*    vertical offset relative to the current macroblock; values above 15
+*    leave pix->available at FALSE
+* \param pix
+*    output: available, mb_addr, x (always 0), y, pos_x, pos_y
+************************************************************************
+*/
+void getAffNeighbour0XLuma(const Macroblock *currMB, int yN, PixelPos *pix) // xN == 0, yN full range
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int maxW, maxH;
+	int yM = -1; // row inside the neighbouring MB; keeps -1 unless a case below sets it
+
+	maxW = 16; // assigned but never read in this function
+	maxH = 16;
+
+	// initialize to "not available"
+	pix->available = FALSE;
+
+	if(yN > 15) // below the macroblock: stays "not available"
+	{
+		return;
+	}
+
+	if (yN<0)
+	{
+		if (!currMB->mb_field)
+		{
+			// current MB is frame coded
+			if ((currMB->mbAddrX & 0x01) == 0)
+			{
+				// top MB of the pair (even address)
+				pix->mb_addr  = currMB->mb_addr_up;
+				// for the deblocker if the current MB is a frame and the one above is a field
+				// then the neighbor is the top MB of the pair
+				if (currMB->mb_avail_up)
+				{
+					if (!(p_Vid->DeblockCall == 1 && (p_Vid->mb_data[currMB->mb_addr_up]).mb_field))
+						pix->mb_addr  += 1; // otherwise use the second MB of the pair above
+				}
+
+				pix->available = currMB->mb_avail_up;
+				yM = yN;
+			}
+			else
+			{
+				// bottom MB of the pair: the neighbour is the MB at mbAddrX - 1
+				pix->mb_addr   = currMB->mbAddrX - 1;
+				pix->available = TRUE;
+				yM = yN;
+			}
+		}
+		else
+		{
+			// current MB is field coded
+			if ((currMB->mbAddrX & 0x01) == 0)
+			{
+				// top MB of the pair (even address)
+				pix->mb_addr   = currMB->mb_addr_up;
+				pix->available = currMB->mb_avail_up;
+				if (currMB->mb_avail_up)
+				{
+					if(!p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+					{
+						(pix->mb_addr)++; // pair above is frame coded: use its second MB
+						yM = 2* yN; // and double the row offset
+					}
+					else
+					{
+						yM = yN;
+					}
+				}
+			}
+			else
+			{
+				// bottom MB of the pair: second MB of the pair above
+				pix->mb_addr   = currMB->mb_addr_up + 1;
+				pix->available = currMB->mb_avail_up;
+				yM = yN;
+			}
+		}
+	}
+	else
+	{
+		// yN >=0
+		// for the deblocker if this is the extra edge then do this special stuff
+		if (yN == 0 && p_Vid->DeblockCall == 2)
+		{
+			pix->mb_addr  = currMB->mb_addr_up + 1;
+			pix->available = TRUE;
+			yM = yN - 1; // -1, masked to row 15 below
+		}
+
+		else if ((yN >= 0) && (yN <maxH))
+		{
+			pix->mb_addr   = currMB->mbAddrX; // the sample belongs to the current MB itself
+			pix->available = TRUE;
+			yM = yN;
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall) // coordinates are also filled for an unavailable neighbour when DeblockCall is non-zero
+	{
+		const int mb_size[2] = {16,16}; // get_mb_pos takes the block size as an array
+		pix->x = 0;
+		pix->y = (short) (yM & (maxH - 1)); // wrap the row into 0..15
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_y = pix->pos_y + pix->y; // x is 0, so pos_x is left as returned by get_mb_pos
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    MBAFF neighbour lookup for the fixed location xN == 0, yN == -1
+*    (the sample directly above the current block's first column).
+* \param currMB
+*    current macroblock
+* \param mb_size
+*    block width [0] and height [1]; the height is used as an
+*    "& (size - 1)" mask below, so presumably a power of two
+*    (TODO confirm with callers)
+* \param pix
+*    output: available, mb_addr, x (always 0), y, pos_x, pos_y
+************************************************************************
+*/
+void getAffNeighbour0N(const Macroblock *currMB, const int mb_size[2], PixelPos *pix) // xN == 0, yN = -1
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int maxW, maxH;
+	int yM = -1; // row offset into the neighbour; only the field/top case below changes it (to -2)
+
+	maxW = mb_size[0];
+	maxH = mb_size[1];
+
+	// initialize to "not available"
+	pix->available = FALSE;
+
+	if (0 < maxW) // column 0 lies inside the block width
+	{
+		if (!currMB->mb_field)
+		{
+			// current MB is frame coded
+			if ((currMB->mbAddrX & 0x01) == 0)
+			{
+				// top MB of the pair (even address)
+				pix->mb_addr  = currMB->mb_addr_up;
+				// for the deblocker if the current MB is a frame and the one above is a field
+				// then the neighbor is the top MB of the pair
+				if (currMB->mb_avail_up)
+				{
+					if (!(p_Vid->DeblockCall == 1 && (p_Vid->mb_data[currMB->mb_addr_up]).mb_field))
+						pix->mb_addr  += 1; // otherwise use the second MB of the pair above
+				}
+
+				pix->available = currMB->mb_avail_up;
+				yM = -1;
+			}
+			else
+			{
+				// bottom MB of the pair: the neighbour is the MB at mbAddrX - 1
+				pix->mb_addr   = currMB->mbAddrX - 1;
+				pix->available = TRUE;
+				yM = -1;
+			}
+		}
+		else
+		{
+			// current MB is field coded
+			if ((currMB->mbAddrX & 0x01) == 0)
+			{
+				// top MB of the pair (even address)
+				pix->mb_addr   = currMB->mb_addr_up;
+				pix->available = currMB->mb_avail_up;
+				if (currMB->mb_avail_up)
+				{
+					if(!p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+					{
+						(pix->mb_addr)++; // pair above is frame coded: use its second MB
+						yM = -2; // 2 * yN with yN == -1
+					}
+					else
+					{
+						yM = -1;
+					}
+				}
+			}
+			else
+			{
+				// bottom MB of the pair: second MB of the pair above
+				pix->mb_addr   = currMB->mb_addr_up + 1;
+				pix->available = currMB->mb_avail_up;
+				yM = -1;
+			}
+		}
+	}
+	else
+	{ // labelled "xN >= maxW"; with xN fixed at 0 this is only reached when maxW <= 0
+		if (!currMB->mb_field)
+		{
+			// current MB is frame coded
+			if ((currMB->mbAddrX & 0x01) == 0)
+			{
+				// top MB of the pair: second MB of the upper-right pair
+				pix->mb_addr  = currMB->mb_addr_upper_right + 1;
+				pix->available = currMB->mb_avail_upper_right;
+				yM = -1;
+			}
+			else
+			{
+				// bottom MB of the pair: no neighbour in this direction
+				pix->available = FALSE;
+			}
+		}
+		else
+		{
+			// current MB is field coded
+			if ((currMB->mbAddrX & 0x01) == 0)
+			{
+				// top MB of the pair (even address)
+				pix->mb_addr   = currMB->mb_addr_upper_right;
+				pix->available = currMB->mb_avail_upper_right;
+				if (currMB->mb_avail_upper_right)
+				{
+					if(!p_Vid->mb_data[currMB->mb_addr_upper_right].mb_field)
+					{
+						(pix->mb_addr)++; // upper-right pair is frame coded: use its second MB
+						yM = -2;
+					}
+					else
+					{
+						yM = -1;
+					}
+				}
+			}
+			else
+			{
+				// bottom MB of the pair: second MB of the upper-right pair
+				pix->mb_addr   = currMB->mb_addr_upper_right + 1;
+				pix->available = currMB->mb_avail_upper_right;
+				yM = -1;
+			}
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall) // coordinates are also filled for an unavailable neighbour when DeblockCall is non-zero
+	{
+		pix->x = 0;
+		pix->y = (short) (yM & (maxH - 1)); // wrap the negative row into the neighbouring block
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		pix->pos_y = pix->pos_y + pix->y; // x is 0, so pos_x is left as returned by get_mb_pos
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    MBAFF luma neighbour lookup for the fixed location xN == 0, yN == -1
+*    with the block size fixed at 16x16.
+* \param currMB
+*    current macroblock
+* \param pix
+*    receives availability, macroblock address and sample position
+************************************************************************
+*/
+void getAffNeighbour0NLuma(const Macroblock *currMB, PixelPos *pix) // xN == 0, yN = -1
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int row = -1; // only a field MB below a frame pair uses -2
+
+	if ((currMB->mbAddrX & 0x01) != 0)
+	{
+		// second (bottom) MB of the pair
+		if (currMB->mb_field)
+		{
+			// field MB: second MB of the pair above
+			pix->mb_addr   = currMB->mb_addr_up + 1;
+			pix->available = currMB->mb_avail_up;
+		}
+		else
+		{
+			// frame MB: the other MB of the same pair, always present
+			pix->mb_addr   = currMB->mbAddrX - 1;
+			pix->available = TRUE;
+		}
+	}
+	else
+	{
+		// first (top) MB of the pair: look into the pair above
+		pix->mb_addr   = currMB->mb_addr_up;
+		pix->available = currMB->mb_avail_up;
+
+		if (currMB->mb_avail_up)
+		{
+			if (currMB->mb_field)
+			{
+				if (!p_Vid->mb_data[currMB->mb_addr_up].mb_field)
+				{
+					// field MB below a frame pair: second MB, two rows up
+					(pix->mb_addr)++;
+					row = -2;
+				}
+			}
+			else if (!(p_Vid->DeblockCall == 1 && (p_Vid->mb_data[currMB->mb_addr_up]).mb_field))
+			{
+				// frame MB: second MB of the pair above, except for the deblocker
+				// (DeblockCall == 1) when that pair is field coded
+				pix->mb_addr += 1;
+			}
+		}
+	}
+
+	if (pix->available || p_Vid->DeblockCall)
+	{
+		const int mb_size[2] = {16,16};
+
+		pix->x = 0;
+		pix->y = (short) (row & 15);
+		get_mb_pos(p_Vid, pix->mb_addr, mb_size, &(pix->pos_x), &(pix->pos_y));
+		// x is 0, so only the vertical position needs the in-block offset
+		pix->pos_y = pix->y + pix->pos_y;
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Locate a neighbouring sample through p_Vid->getNeighbour and, when it
+*    is available, convert the result to 4x4-block units.
+* \param currMB
+*    current macroblock
+* \param block_x
+*    x offset passed through to getNeighbour
+* \param block_y
+*    y offset passed through to getNeighbour
+* \param mb_size
+*    macroblock size in pixels (according to luma or chroma MB access)
+* \param pix
+*    receives the position information
+************************************************************************
+*/
+void get4x4Neighbour(const Macroblock *currMB, int block_x, int block_y, const int mb_size[2], PixelPos *pix)
+{
+	currMB->p_Vid->getNeighbour(currMB, block_x, block_y, mb_size, pix);
+
+	// nothing to rescale when the neighbour does not exist
+	if (!pix->available)
+	{
+		return;
+	}
+
+	// samples -> 4x4 blocks: in-block offset first, then the second coordinate pair
+	pix->x >>= 2;
+	pix->y >>= 2;
+
+	pix->pos_x >>= 2;
+	pix->pos_y >>= 2;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Luma variant of get4x4Neighbour: locate a neighbouring sample through
+*    p_Vid->getNeighbourLuma and, when it is available, convert the result
+*    to 4x4-block units.
+************************************************************************
+*/
+void get4x4NeighbourLuma(const Macroblock *currMB, int block_x, int block_y, PixelPos *pix)
+{
+	currMB->p_Vid->getNeighbourLuma(currMB, block_x, block_y, pix);
+
+	// nothing to rescale when the neighbour does not exist
+	if (!pix->available)
+	{
+		return;
+	}
+
+	// samples -> 4x4 blocks
+	pix->x >>= 2;
+	pix->y >>= 2;
+
+	pix->pos_x >>= 2;
+	pix->pos_y >>= 2;
+}
diff --git a/Src/h264dec/ldecod/src/mb_prediction.c b/Src/h264dec/ldecod/src/mb_prediction.c
new file mode 100644
index 00000000..799236a8
--- /dev/null
+++ b/Src/h264dec/ldecod/src/mb_prediction.c
@@ -0,0 +1,979 @@
+/*!
+*************************************************************************************
+* \file mb_prediction.c
+*
+* \brief
+*    Macroblock prediction functions
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Alexis Michael Tourapis         <alexismt@ieee.org>
+*************************************************************************************
+*/
+
+#include "contributors.h"
+
+#include "block.h"
+#include "global.h"
+#include "mbuffer.h"
+#include "elements.h"
+#include "errorconcealment.h"
+#include "macroblock.h"
+#include "fmo.h"
+#include "cabac.h"
+#include "vlc.h"
+#include "image.h"
+#include "mb_access.h"
+#include "biaridecod.h"
+#include "transform8x8.h"
+#include "transform.h"
+#include "mc_prediction.h"
+#include "quant.h"
+#include "intra4x4_pred.h"
+#include "intra8x8_pred.h"
+#include "intra16x16_pred.h"
+#include "mv_prediction.h"
+#include "mb_prediction.h"
+#include "optim.h"
+
+
+/*!
+************************************************************************
+* \brief
+*    Reconstruct an Intra4x4 macroblock: for each of the sixteen 4x4
+*    blocks, predict, add the residual and copy the result into the
+*    picture; afterwards decode chroma unless the format is 4:0:0 or 4:4:4.
+* \return
+*    SEARCH_SYNC as soon as intrapred() reports it, otherwise 1
+************************************************************************
+*/
+int mb_pred_intra4x4(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	// sample offsets of the sixteen 4x4 blocks inside the macroblock, in processing order
+	static const int blk_off_x[16] = { 0, 4, 0, 4,  8, 12,  8, 12, 0, 4,  0,  4,  8, 12,  8, 12 };
+	static const int blk_off_y[16] = { 0, 0, 4, 4,  0,  0,  4,  4, 8, 8, 12, 12,  8,  8, 12, 12 };
+
+	Slice *currSlice = currMB->p_Slice;
+	const int yuv = dec_picture->chroma_format_idc - 1;
+	const int block_x = currMB->block_x;
+	const int block_y = currMB->block_y;
+	int k;
+
+	// benski> prediction might reference other parts of the image reconstructed during this block,
+	// so each 4x4 block is copied out before the next one is predicted (no single 16x16 copy)
+	if (currMB->is_lossless == FALSE)
+	{
+		const h264_short_block_t *blocks = currSlice->cof4[curr_plane];
+		const h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[curr_plane];
+		h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[curr_plane];
+
+		for (k = 0; k < 16; k++)
+		{
+			const int ioff = blk_off_x[k];
+			const int joff = blk_off_y[k];
+			const int bx = block_x + (ioff >> 2);
+			const int by = block_y + (joff >> 2);
+
+			if (intrapred(currMB, curr_plane, ioff, joff, bx, by) == SEARCH_SYNC)
+			{
+				return SEARCH_SYNC;
+			}
+			opt_itrans4x4(blocks[k], mb_pred, mb_rec, ioff, joff);
+			copy_image_data_4x4_stride(image, bx << 2, by << 2, currSlice->mb_rec[curr_plane], ioff, joff);
+		}
+	}
+	else
+	{ // lossless
+		for (k = 0; k < 16; k++)
+		{
+			const int ioff = blk_off_x[k];
+			const int joff = blk_off_y[k];
+			const int bx = block_x + (ioff >> 2);
+			const int by = block_y + (joff >> 2);
+
+			if (intrapred(currMB, curr_plane, ioff, joff, bx, by) == SEARCH_SYNC)
+			{
+				return SEARCH_SYNC;
+			}
+			Inv_Residual_trans_4x4(currMB, curr_plane, ioff, joff);
+			copy_image_data_4x4_stride(image, bx << 2, by << 2, currSlice->mb_rec[curr_plane], ioff, joff);
+		}
+	}
+
+	// chroma decoding *******************************************************
+	if (!((dec_picture->chroma_format_idc == YUV400) || (dec_picture->chroma_format_idc == YUV444)))
+	{
+		intra_cr_decoding(currMB, yuv);
+	}
+
+	return 1;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Reconstruct an Intra16x16 macroblock: run the 16x16 prediction with
+*    the macroblock's i16mode, apply the 4x4 inverse transforms, then
+*    decode chroma unless the format is 4:0:0 or 4:4:4.
+* \return
+*    always 1
+************************************************************************
+*/
+int mb_pred_intra16x16(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	const int yuv = dec_picture->chroma_format_idc - 1;
+
+	intrapred16x16(currMB, curr_plane, currMB->i16mode);
+
+	// remember the prediction mode for residual DPCM
+	currMB->ipmode_DPCM = (char) currMB->i16mode;
+
+	// 4x4 inverse transforms for the whole macroblock
+	iMBtrans4x4(currMB, curr_plane, 0);
+
+	// chroma decoding
+	if (!((dec_picture->chroma_format_idc == YUV400) || (dec_picture->chroma_format_idc == YUV444)))
+	{
+		intra_cr_decoding(currMB, yuv);
+	}
+
+	return 1;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reconstruct an Intra8x8 macroblock: for each of the four 8x8 blocks,
+*    predict, add the residual and copy the result into the picture;
+*    afterwards decode chroma unless the format is 4:0:0 or 4:4:4.
+* \return
+*    always 1
+************************************************************************
+*/
+int mb_pred_intra8x8(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	Slice *currSlice = currMB->p_Slice;
+	const int yuv = dec_picture->chroma_format_idc - 1;
+	int b8;
+
+	if (!currMB->is_lossless)
+	{
+		h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[curr_plane];
+		h264_imgpel_macroblock_row_t *mb_pred = currSlice->mb_pred[curr_plane];
+		h264_short_8x8block_t *mb_rres8 = currSlice->mb_rres8[curr_plane];
+
+		for (b8 = 0; b8 < 4; b8++)
+		{
+			const int ioff = (b8 & 1) << 3;   // 0, 8, 0, 8
+			const int joff = (b8 >> 1) << 3;  // 0, 0, 8, 8
+
+			intrapred8x8(currMB, curr_plane, ioff, joff);
+			// the row offset is applied to the buffers, the column offset is passed on
+			opt_itrans8x8(mb_rec + joff, mb_pred + joff, mb_rres8[b8], ioff);
+			copy_image_data_8x8_stride2(image, currMB->pix_x + ioff, currMB->pix_y + joff, currSlice->mb_rec[curr_plane], ioff, joff);
+		}
+	}
+	else
+	{ // lossless
+		for (b8 = 0; b8 < 4; b8++)
+		{
+			const int ioff = (b8 & 1) << 3;   // 0, 8, 0, 8
+			const int joff = (b8 >> 1) << 3;  // 0, 0, 8, 8
+
+			intrapred8x8(currMB, curr_plane, ioff, joff);
+			Inv_Residual_trans_8x8(currMB, curr_plane, ioff, joff);
+			copy_image_data_8x8_stride2(image, currMB->pix_x + ioff, currMB->pix_y + joff, currSlice->mb_rec[curr_plane], ioff, joff);
+		}
+	}
+
+	// chroma decoding
+	if (!((dec_picture->chroma_format_idc == YUV400) || (dec_picture->chroma_format_idc == YUV444)))
+	{
+		intra_cr_decoding(currMB, yuv);
+	}
+
+	return 1;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Set chroma_vector_adjustment on every picture of the reference lists
+*    used by this macroblock, pick the list offset for field macroblocks
+*    of an MBAFF frame, and update p_Vid->max_mb_vmv_r.
+* \param currMB
+*    current macroblock
+* \param list_offset
+*    written only for a field macroblock in an MBAFF frame (2 for the
+*    first MB of a pair, 4 for the second); otherwise left untouched
+************************************************************************
+*/
+static void set_chroma_vector(Macroblock *currMB, int *list_offset)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int list, idx;
+
+	if (currSlice->mb_aff_frame_flag)
+	{
+		const int mb_nr = (currMB->mbAddrX & 0x01);
+
+		if (!currMB->mb_field)
+		{
+			// frame macroblock: no adjustment on the frame lists
+			for (list = LIST_0; list <= (LIST_1); list++)
+			{
+				for (idx = 0; idx < p_Vid->listXsize[list]; idx++)
+				{
+					p_Vid->listX[list][idx]->chroma_vector_adjustment = 0;
+				}
+			}
+		}
+		else
+		{
+			// field macroblock: find out the correct list offsets
+			*list_offset = mb_nr ? 4 : 2;
+
+			for (list = LIST_0 + *list_offset; list <= (LIST_1 + *list_offset); list++)
+			{
+				for (idx = 0; idx < p_Vid->listXsize[list]; idx++)
+				{
+					int adjustment = 0;
+
+					if (mb_nr == 0 && p_Vid->listX[list][idx]->structure == BOTTOM_FIELD)
+					{
+						adjustment = -2;
+					}
+					else if (mb_nr == 1 && p_Vid->listX[list][idx]->structure == TOP_FIELD)
+					{
+						adjustment = 2;
+					}
+					p_Vid->listX[list][idx]->chroma_vector_adjustment = adjustment;
+				}
+			}
+		}
+	}
+	else
+	{
+		// adjustment applied to references whose structure differs from the current picture's
+		int mismatch_adjustment = 0;
+
+		if (p_Vid->structure == TOP_FIELD)
+		{
+			mismatch_adjustment = -2;
+		}
+		else if (p_Vid->structure == BOTTOM_FIELD)
+		{
+			mismatch_adjustment = 2;
+		}
+
+		for (list = LIST_0; list <= (LIST_1); list++)
+		{
+			for (idx = 0; idx < p_Vid->listXsize[list]; idx++)
+			{
+				if (p_Vid->structure != p_Vid->listX[list][idx]->structure)
+				{
+					p_Vid->listX[list][idx]->chroma_vector_adjustment = mismatch_adjustment;
+				}
+				else
+				{
+					p_Vid->listX[list][idx]->chroma_vector_adjustment = 0;
+				}
+			}
+		}
+	}
+
+	// field pictures and MBAFF field macroblocks use half of max_vmv_r
+	if (p_Vid->structure != FRAME || (currSlice->mb_aff_frame_flag && currMB->mb_field))
+	{
+		p_Vid->max_mb_vmv_r = p_Vid->max_vmv_r >> 1;
+	}
+	else
+	{
+		p_Vid->max_mb_vmv_r = p_Vid->max_vmv_r;
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Reconstruct a skipped macroblock: 16x16 motion compensation from
+*    LIST_0, then copy the prediction buffers straight into the picture
+*    (luma always, chroma for 4:2:0 and 4:2:2).
+************************************************************************
+*/
+void mb_pred_skip(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	Slice *currSlice = currMB->p_Slice;
+	const int curr_mb_field = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field));
+	int list_offset = 0;
+	int uv;
+
+	set_chroma_vector(currMB, &list_offset);
+	perform_mc16x16(currMB, curr_plane, dec_picture, LIST_0, list_offset, curr_mb_field);
+
+	// the prediction is the final picture content
+	opt_copy_image_data_16x16_stride(image, currMB->pix_x, currMB->pix_y, currSlice->mb_pred[curr_plane]);
+
+	// chroma plane uv is predicted into mb_pred[uv + 1]
+	if (dec_picture->chroma_format_idc == YUV420)
+	{
+		for (uv = 0; uv < 2; uv++)
+		{
+			copy_image_data_8x8_stride(dec_picture->imgUV[uv], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[uv + 1]);
+		}
+	}
+	else if (dec_picture->chroma_format_idc == YUV422)
+	{
+		for (uv = 0; uv < 2; uv++)
+		{
+			copy_image_data_stride(dec_picture->imgUV[uv], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[uv + 1], 8, 16);
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Skip-style prediction variant that, unlike mb_pred_skip(), finishes
+*    through iTransform() with its last argument forced to 1 (the flag the
+*    other P predictors call smb).
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict and reconstruct
+* \param image
+*    not used by this function
+* \param dec_picture
+*    picture being decoded
+************************************************************************
+*/
+void mb_pred_sp_skip(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	Slice *slice = currMB->p_Slice;
+	int list_offset = 0;
+	/* evaluated before set_chroma_vector() is called, as in the other predictors */
+	int field_mb = (slice->mb_aff_frame_flag && currMB->mb_field);
+
+	set_chroma_vector(currMB, &list_offset);
+
+	/* single 16x16 prediction from LIST_0 */
+	perform_mc16x16(currMB, curr_plane, dec_picture, LIST_0, list_offset, field_mb);
+	iTransform(currMB, curr_plane, 1);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Inter prediction for a macroblock split into four 8x8
+*    sub-macroblocks. Each sub-macroblock is motion compensated according
+*    to its own b8mode / b8pdir entry, then iTransform() finishes the
+*    macroblock.
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict and reconstruct
+* \param image
+*    not used by this function
+* \param dec_picture
+*    picture being decoded
+************************************************************************
+*/
+void mb_pred_p_inter8x8(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	int block8x8;   // needed for ABT
+
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int smb = p_Vid->type == SP_SLICE && IS_INTER(currMB);
+	int curr_mb_field = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field));
+
+	int list_offset = 0;
+
+	set_chroma_vector(currMB, &list_offset);
+
+	for (block8x8=0; block8x8<4; block8x8++)
+	{
+		int mv_mode  = currMB->b8mode[block8x8];
+		int pred_dir = currMB->b8pdir[block8x8];
+		int k_start  = (block8x8 << 2);   // first decode_block_scan entry of this 8x8
+		int i, j, k;
+
+		if (mv_mode == SMB8x8)
+		{
+			// one 8x8 block; j is 0 for the upper pair of 8x8 blocks and 2 for the lower pair
+			i = (decode_block_scan[k_start] & 3);
+			j = block8x8 & ~1;
+			perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, curr_mb_field);
+		}
+		else if (mv_mode == SMB4x4)
+		{
+			// four 4x4 blocks, visited in decode_block_scan order
+			for (k = k_start; k < k_start + 4; k++)
+			{
+				i =  (decode_block_scan[k] & 3);
+				j = ((decode_block_scan[k] >> 2) & 3);
+				perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, BLOCK_SIZE, BLOCK_SIZE, curr_mb_field);
+			}
+		}
+		else
+		{
+			// two partitions: SMB8x4 uses scan entries k_start and k_start+2,
+			// every other mode uses k_start and k_start+1
+			int k_inc = (mv_mode == SMB8x4) ? 2 : 1;
+			int k_end = k_start + k_inc + 1;
+
+			int block_size_x = (mv_mode == SMB8x4) ? SMB_BLOCK_SIZE : BLOCK_SIZE;
+			int block_size_y = (mv_mode == SMB4x8) ? SMB_BLOCK_SIZE : BLOCK_SIZE;
+
+			for (k = k_start; k < k_end; k += k_inc)
+			{
+				i =  (decode_block_scan[k] & 3);
+				j = ((decode_block_scan[k] >> 2) & 3);
+				perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, block_size_x, block_size_y, curr_mb_field);
+			}
+		}
+	}
+
+	iTransform(currMB, curr_plane, smb);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Inter prediction for a macroblock coded as one 16x16 partition:
+*    a single perform_mc16x16() call using b8pdir[0], followed by
+*    iTransform().
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict and reconstruct
+* \param image
+*    not used by this function
+* \param dec_picture
+*    picture being decoded
+************************************************************************
+*/
+void mb_pred_p_inter16x16(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	Slice *slice = currMB->p_Slice;
+	int in_sp_slice = (currMB->p_Vid->type == SP_SLICE);
+	int field_mb = (slice->mb_aff_frame_flag && currMB->mb_field);
+	int list_offset = 0;
+
+	set_chroma_vector(currMB, &list_offset);
+
+	perform_mc16x16(currMB, curr_plane, dec_picture, currMB->b8pdir[0], list_offset, field_mb);
+
+	/* the SP-slice flag is forwarded as iTransform()'s last argument */
+	iTransform(currMB, curr_plane, in_sp_slice);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Inter prediction for a macroblock coded as two 16x8 partitions.
+*    The upper half uses b8pdir[0] at block offset (0,0), the lower half
+*    uses b8pdir[2] at block offset (0,2); iTransform() follows.
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict and reconstruct
+* \param image
+*    not used by this function
+* \param dec_picture
+*    picture being decoded
+************************************************************************
+*/
+void mb_pred_p_inter16x8(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	Slice *slice = currMB->p_Slice;
+	int in_sp_slice = (currMB->p_Vid->type == SP_SLICE);
+	int field_mb = (slice->mb_aff_frame_flag && currMB->mb_field);
+	int list_offset = 0;
+
+	set_chroma_vector(currMB, &list_offset);
+
+	/* upper partition, then lower partition */
+	perform_mc16x8(currMB, curr_plane, dec_picture, currMB->b8pdir[0], 0, 0, list_offset, field_mb);
+	perform_mc16x8(currMB, curr_plane, dec_picture, currMB->b8pdir[2], 0, 2, list_offset, field_mb);
+
+	iTransform(currMB, curr_plane, in_sp_slice);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Inter prediction for a macroblock coded as two 8x16 partitions.
+*    The left half uses b8pdir[0] at block offset (0,0), the right half
+*    uses b8pdir[1] at block offset (2,0); iTransform() follows.
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict and reconstruct
+* \param image
+*    not used by this function
+* \param dec_picture
+*    picture being decoded
+************************************************************************
+*/
+void mb_pred_p_inter8x16(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	Slice *slice = currMB->p_Slice;
+	int in_sp_slice = (currMB->p_Vid->type == SP_SLICE);
+	int field_mb = (slice->mb_aff_frame_flag && currMB->mb_field);
+	int list_offset = 0;
+
+	set_chroma_vector(currMB, &list_offset);
+
+	/* left partition, then right partition */
+	perform_mc8x16(currMB, curr_plane, dec_picture, currMB->b8pdir[0], 0, 0, list_offset, field_mb);
+	perform_mc8x16(currMB, curr_plane, dec_picture, currMB->b8pdir[1], 2, 0, list_offset, field_mb);
+
+	iTransform(currMB, curr_plane, in_sp_slice);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Prediction for a B macroblock in temporal direct mode. For every
+*    4x4 block the LIST_0/LIST_1 motion vectors and reference indices are
+*    derived from the co-located motion data and stored in
+*    dec_picture->motion; motion compensation is then run per 8x8 block
+*    (direct_8x8_inference_flag set) or per 4x4 block. Finally the
+*    prediction is copied to the picture when cbp == 0, otherwise
+*    iTransform() is called.
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict
+* \param image
+*    destination plane for the luma copy in the cbp == 0 case
+* \param dec_picture
+*    picture being decoded; supplies the motion store, ref_pic_num table
+*    and chroma planes
+************************************************************************
+*/
+void mb_pred_b_dtemporal(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	short ref_idx;
+	int refList;
+
+	PicMotionParams *motion = &dec_picture->motion;
+	int k;
+	int block8x8;   // needed for ABT
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int curr_mb_field = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field));
+
+	// frame co-located data by default; replaced below for field macroblocks
+	MotionParams *colocated = &currSlice->p_colocated->frame;
+	int list_offset = 0;
+
+	set_chroma_vector(currMB, &list_offset);
+
+	// field macroblock: odd mbAddrX selects the bottom co-located data, even the top
+	if (currMB->mb_field)
+	{
+		if(currMB->mbAddrX & 0x01)
+		{
+			colocated = &currSlice->p_colocated->bottom;
+		}
+		else
+		{
+			colocated = &currSlice->p_colocated->top;
+		}
+	}
+
+	for (block8x8=0; block8x8<4; block8x8++)
+	{      
+		int pred_dir = currMB->b8pdir[block8x8];
+
+		int k_start = (block8x8 << 2);
+		int k_end = k_start;
+
+		// k_end bounds the motion-compensation loop only:
+		// one pass per 8x8 with 8x8 inference, otherwise one pass per 4x4
+		if (p_Vid->active_sps->direct_8x8_inference_flag)
+		{
+			k_end ++;
+		}
+		else
+		{
+			k_end += BLOCK_MULTIPLE;
+		}
+
+		// motion derivation always visits all BLOCK_MULTIPLE 4x4 blocks of this 8x8
+		for (k = k_start; k < k_start + BLOCK_MULTIPLE; k ++)
+		{
+
+			int i =  (decode_block_scan[k] & 3);
+			int j = ((decode_block_scan[k] >> 2) & 3);
+			int i4   = currMB->block_x + i;
+			int j4   = currMB->block_y + j;
+			int j6   = currMB->block_y_aff + j;   // row used to index the co-located data
+			assert (pred_dir<=2);
+
+			// take the co-located LIST_0 reference if it has one, otherwise LIST_1
+			refList = (colocated->motion[LIST_0][j6][i4].ref_idx== -1 ? LIST_1 : LIST_0);
+			ref_idx =  colocated->motion[refList][j6][i4].ref_idx;
+
+			if(ref_idx==-1) // co-located is intra mode
+			{
+				memset( &motion->motion[LIST_0][j4][i4].mv, 0,  sizeof(MotionVector));
+				memset( &motion->motion[LIST_1][j4][i4].mv, 0,  sizeof(MotionVector));
+
+				motion->motion[LIST_0][j4][i4].ref_idx = 0;
+				motion->motion[LIST_1][j4][i4].ref_idx = 0;
+			}
+			else // co-located skip or inter mode
+			{
+				int mapped_idx=0;
+				int iref;
+
+				// search the active LIST_0 entries for the picture the co-located block referenced
+				for (iref=0;iref<imin(currSlice->num_ref_idx_l0_active,p_Vid->listXsize[LIST_0 + list_offset]);iref++)
+				{
+					if(p_Vid->structure==0 && curr_mb_field==0)
+					{
+						// If the current MB is a frame MB and the colocated is from a field picture,
+						// then the colocated->ref_pic_id may have been generated from the wrong value of
+						// frame_poc if it references it's complementary field, so test both POC values
+						if(p_Vid->listX[0][iref]->top_poc*2 == colocated->motion[refList][j6][i4].ref_pic_id || p_Vid->listX[0][iref]->bottom_poc*2 == colocated->motion[refList][j6][i4].ref_pic_id)
+						{
+							mapped_idx=iref;
+							break;
+						}
+						else //! no match for this entry: mark the index invalid (should not survive the loop)
+							mapped_idx=INVALIDINDEX;
+						continue;   // frame-MB case handled above; skip the ref_pic_num comparison
+					}
+
+					if (dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset][iref]==colocated->motion[refList][j6][i4].ref_pic_id)
+					{
+						mapped_idx=iref;
+						break;
+					}
+					else //! no match for this entry: mark the index invalid (should not survive the loop)
+					{
+						mapped_idx=INVALIDINDEX;
+					}
+				}
+				// NOTE(review): if the loop above runs zero times mapped_idx stays 0 and is treated as valid
+				if (INVALIDINDEX == mapped_idx)
+				{
+					// NOTE(review): if error() returns, mv/ref_idx for this block keep their previous
+					// values and are still used for the ref_pic_id lookup below - confirm error() does not return
+					error("temporal direct error: colocated block has ref that is unavailable",-1111);
+				}
+				else
+				{
+					int mv_scale = currSlice->mvscale[LIST_0 + list_offset][mapped_idx];
+
+					//! In such case, an array is needed for each different reference.
+					// 9999 is tested as a special mvscale value; it and long-term references
+					// copy the co-located vector unscaled and zero the LIST_1 vector
+					if (mv_scale == 9999 || p_Vid->listX[LIST_0+list_offset][mapped_idx]->is_long_term)
+					{
+						memcpy(&motion->motion[LIST_0][j4][i4].mv, &colocated->motion[refList][j6][i4].mv, sizeof(MotionVector));
+						memset(&motion->motion[LIST_1][j4][i4].mv, 0, sizeof(MotionVector));
+					}
+					else
+					{
+						// LIST_0 vector = co-located vector * mv_scale / 256, rounded
+						motion->motion[LIST_0][j4][i4].mv[0]= (short) ((mv_scale * colocated->motion[refList][j6][i4].mv[0] + 128 ) >> 8);
+						motion->motion[LIST_0][j4][i4].mv[1]= (short) ((mv_scale * colocated->motion[refList][j6][i4].mv[1] + 128 ) >> 8);
+
+						// LIST_1 vector = LIST_0 vector - co-located vector
+						motion->motion[LIST_1][j4][i4].mv[0]= (short) (motion->motion[LIST_0][j4][i4].mv[0] - colocated->motion[refList][j6][i4].mv[0]);
+						motion->motion[LIST_1][j4][i4].mv[1]= (short) (motion->motion[LIST_0][j4][i4].mv[1] - colocated->motion[refList][j6][i4].mv[1]);
+					}
+
+					motion->motion[LIST_0][j4][i4].ref_idx = (char) mapped_idx; //p_Vid->listX[1][0]->ref_idx[refList][j4][i4];
+					motion->motion[LIST_1][j4][i4].ref_idx = 0;
+				}
+			}
+			// store reference picture ID determined by direct mode
+			motion->motion[LIST_0][j4][i4].ref_pic_id = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset][(short)motion->motion[LIST_0][j4][i4].ref_idx];
+			motion->motion[LIST_1][j4][i4].ref_pic_id = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_1 + list_offset][(short)motion->motion[LIST_1][j4][i4].ref_idx];
+		}
+		// motion compensation with the motion data stored above
+		for (k = k_start; k < k_end; k ++)
+		{
+			int i =  (decode_block_scan[k] & 3);
+			int j = ((decode_block_scan[k] >> 2) & 3);
+			if (p_Vid->active_sps->direct_8x8_inference_flag)
+				perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, curr_mb_field);
+			else
+				perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, 4, 4, curr_mb_field);
+		}
+	}
+
+	// nothing coded for this macroblock: the prediction is copied as-is
+	if (currMB->cbp == 0)
+	{
+		opt_copy_image_data_16x16_stride(image, currMB->pix_x, currMB->pix_y, currSlice->mb_pred[curr_plane]);
+
+		// chroma is copied here only for formats other than YUV400 and YUV444
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444)) 
+		{
+			copy_image_data_stride(dec_picture->imgUV[0], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[1], p_Vid->mb_size[IS_CHROMA][0], p_Vid->mb_size[IS_CHROMA][1]);
+			copy_image_data_stride(dec_picture->imgUV[1], currMB->pix_c_x, currMB->pix_c_y, currSlice->mb_pred[2], p_Vid->mb_size[IS_CHROMA][0], p_Vid->mb_size[IS_CHROMA][1]);
+		}
+	}
+	else
+		iTransform(currMB, curr_plane, 0); 
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Inter prediction for a B macroblock split into four 8x8
+*    sub-macroblocks. Sub-macroblocks with an explicit mode (SMB8x8,
+*    SMB4x4, or the two-partition modes) are motion compensated directly.
+*    Sub-macroblocks in BSKIP_DIRECT mode first have their motion derived,
+*    by spatial direct prediction when direct_spatial_mv_pred_flag is set
+*    and by temporal direct prediction otherwise, and are then motion
+*    compensated. iTransform() finishes the macroblock.
+*
+* \param currMB
+*    macroblock being reconstructed
+* \param curr_plane
+*    colour plane to predict and reconstruct
+* \param image
+*    not used by this function
+* \param dec_picture
+*    picture being decoded; supplies the motion store and ref_pic_num table
+************************************************************************
+*/
+void mb_pred_b_inter8x8(Macroblock *currMB, ColorPlane curr_plane, VideoImage *image, StorablePicture *dec_picture)
+{
+	short ref_idx;
+	int refList;
+
+	// filled by prepare_direct_params(); -1 is handled below as "no reference for this list"
+	char l0_rFrame = -1, l1_rFrame = -1;
+	PicMotionParams *motion = &dec_picture->motion;
+	short pmvl0[2]={0,0}, pmvl1[2]={0,0};
+	int block_size_x, block_size_y;
+	int k;
+	int block8x8;   // needed for ABT
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	int curr_mb_field = ((currSlice->mb_aff_frame_flag)&&(currMB->mb_field));
+
+	// frame co-located data by default; replaced below for field macroblocks
+	MotionParams *colocated = &currSlice->p_colocated->frame;
+	int list_offset = 0;
+
+	set_chroma_vector(currMB, &list_offset);
+
+	// field macroblock: odd mbAddrX selects the bottom co-located data, even the top
+	if (currMB->mb_field)
+	{
+		if(currMB->mbAddrX & 0x01)
+		{
+			colocated = &currSlice->p_colocated->bottom;
+		}
+		else
+		{
+			colocated = &currSlice->p_colocated->top;
+		}
+	}
+
+	// prepare direct modes
+	// (only needed for spatial direct, and only when at least one b8mode entry is zero)
+	if (currSlice->direct_spatial_mv_pred_flag && (!(currMB->b8mode[0] && currMB->b8mode[1] && currMB->b8mode[2] && currMB->b8mode[3])))
+		prepare_direct_params(currMB, dec_picture, pmvl0, pmvl1, &l0_rFrame, &l1_rFrame);
+
+	for (block8x8=0; block8x8<4; block8x8++)
+	{
+		int mv_mode  = currMB->b8mode[block8x8];
+		int pred_dir = currMB->b8pdir[block8x8];
+
+		if ( mv_mode == SMB8x8)
+		{
+			// one 8x8 block
+			int i =  (decode_block_scan[block8x8*4] & 3);
+			int j = ((decode_block_scan[block8x8*4] >> 2) & 3);
+			perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, curr_mb_field);
+		}
+		else if ( mv_mode == SMB4x4)
+		{
+			// four 4x4 blocks in decode_block_scan order
+			int k_start = (block8x8 << 2);
+
+			for (k = k_start; k < k_start + 4; k ++)
+			{
+				int i =  (decode_block_scan[k] & 3);
+				int j = ((decode_block_scan[k] >> 2) & 3);
+				perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, BLOCK_SIZE, BLOCK_SIZE, curr_mb_field);
+			}        
+		}
+		else if ( mv_mode != BSKIP_DIRECT)
+		{
+			// two partitions: SMB8x4 uses scan entries k_start and k_start+2,
+			// every other mode reaching this branch uses k_start and k_start+1
+			int k_start = (block8x8 << 2);
+			int k_inc = (mv_mode == SMB8x4) ? 2 : 1;
+			int k_end = (k_start + k_inc + 1);
+
+			block_size_x = ( mv_mode == SMB8x4) ? SMB_BLOCK_SIZE : BLOCK_SIZE;
+			block_size_y = ( mv_mode == SMB4x8) ? SMB_BLOCK_SIZE : BLOCK_SIZE;
+
+			for (k = k_start; k < k_end; k += k_inc)
+			{
+				int i =  (decode_block_scan[k] & 3);
+				int j = ((decode_block_scan[k] >> 2) & 3);
+				perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, block_size_x, block_size_y, curr_mb_field);
+			}        
+		}
+		else
+		{
+			// BSKIP_DIRECT sub-macroblock: derive motion first, then predict
+			int k_start = (block8x8 << 2);
+
+			// Prepare mvs (needed for deblocking and mv prediction
+			if (currSlice->direct_spatial_mv_pred_flag)
+			{
+				h264_ref_t *ref_pic_num_l0 = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset];
+				h264_ref_t *ref_pic_num_l1 = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_1 + list_offset];
+
+				for (k = k_start; k < k_start + BLOCK_MULTIPLE; k ++)
+				{
+					int i  =  (decode_block_scan[k] & 3);
+					int j  = ((decode_block_scan[k] >> 2) & 3);
+					int i4  = currMB->block_x + i;
+					int j4  = currMB->block_y + j;
+					int j6  = currMB->block_y_aff + j;   // row used to index the co-located data
+
+					assert (pred_dir<=2);
+					//===== DIRECT PREDICTION =====
+
+					// LIST_0: zero vector when the reference is index 0, the co-located block is
+					// not flagged as moving and the first LIST_1 picture is not long-term;
+					// otherwise the predicted vector pmvl0
+					if (l0_rFrame >=0)
+					{
+						if (!l0_rFrame  && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+						{
+							motion->motion[LIST_0][j4][i4].mv[0] = 0;
+							motion->motion[LIST_0][j4][i4].mv[1] = 0;
+							motion->motion[LIST_0][j4][i4].ref_idx = 0;
+						}
+						else
+						{
+							motion->motion[LIST_0][j4][i4].mv[0] = pmvl0[0];
+							motion->motion[LIST_0][j4][i4].mv[1] = pmvl0[1];
+							motion->motion[LIST_0][j4][i4].ref_idx = l0_rFrame;
+						}
+					}
+					else
+					{
+						motion->motion[LIST_0][j4][i4].ref_idx = -1;
+						motion->motion[LIST_0][j4][i4].mv[0] = 0;
+						motion->motion[LIST_0][j4][i4].mv[1] = 0;
+					}
+
+					// LIST_1: same rule, using l1_rFrame and pmvl1
+					if (l1_rFrame >=0)
+					{
+						if  (l1_rFrame==0 && ((!colocated->moving_block[j6][i4]) && (!p_Vid->listX[LIST_1 + list_offset][0]->is_long_term)))
+						{
+							motion->motion[LIST_1][j4][i4].mv[0] = 0;
+							motion->motion[LIST_1][j4][i4].mv[1] = 0;
+							motion->motion[LIST_1][j4][i4].ref_idx = l1_rFrame;
+						}
+						else
+						{
+							motion->motion[LIST_1][j4][i4].mv[0] = pmvl1[0];
+							motion->motion[LIST_1][j4][i4].mv[1] = pmvl1[1];
+							motion->motion[LIST_1][j4][i4].ref_idx = l1_rFrame;
+						}
+					}
+					else
+					{
+						motion->motion[LIST_1][j4][i4].mv[0] = 0;
+						motion->motion[LIST_1][j4][i4].mv[1] = 0;
+						motion->motion[LIST_1][j4][i4].ref_idx = -1;
+					}
+
+					// neither list has a reference: fall back to index 0 in both lists
+					if (l0_rFrame < 0 && l1_rFrame < 0)
+					{
+						motion->motion[LIST_0][j4][i4].ref_idx = 0;
+						motion->motion[LIST_1][j4][i4].ref_idx = 0;
+					}
+
+					// prediction direction follows from which lists ended up with a reference
+					// NOTE(review): ref_idx is assigned in the two branches below but never read afterwards
+					if      (motion->motion[LIST_1][j4][i4].ref_idx==-1) 
+					{
+						pred_dir = 0;
+						ref_idx  = (motion->motion[LIST_0][j4][i4].ref_idx != -1) ? motion->motion[LIST_0][j4][i4].ref_idx : 0;
+					}
+					else if (motion->motion[LIST_0][j4][i4].ref_idx==-1) 
+					{
+						pred_dir = 1;
+						ref_idx  = (motion->motion[LIST_1][j4][i4].ref_idx != -1) ? motion->motion[LIST_1][j4][i4].ref_idx : 0;
+					}
+					else                                               
+						pred_dir = 2;
+
+					// NOTE(review): when a list's ref_idx is -1 these lookups index ref_pic_num_lX[-1] -
+					// confirm the table layout makes that read harmless
+					motion->motion[LIST_0][j4][i4].ref_pic_id = ref_pic_num_l0[(short)motion->motion[LIST_0][j4][i4].ref_idx];
+					motion->motion[LIST_1][j4][i4].ref_pic_id = ref_pic_num_l1[(short)motion->motion[LIST_1][j4][i4].ref_idx];
+				}
+			}
+			else
+			{
+				// temporal direct: same derivation as in mb_pred_b_dtemporal()
+				for (k = k_start; k < k_start + BLOCK_MULTIPLE; k ++)
+				{
+					int i =  (decode_block_scan[k] & 3);
+					int j = ((decode_block_scan[k] >> 2) & 3);
+					int i4   = currMB->block_x + i;
+					int j4   = currMB->block_y + j;
+					int j6   = currMB->block_y_aff + j;
+
+					assert (pred_dir<=2);
+
+					// take the co-located LIST_0 reference if it has one, otherwise LIST_1
+					refList = (colocated->motion[LIST_0][j6][i4].ref_idx== -1 ? LIST_1 : LIST_0);
+					ref_idx =  colocated->motion[refList][j6][i4].ref_idx;
+
+					if(ref_idx==-1) // co-located is intra mode
+					{
+						memset( &motion->motion[LIST_0][j4][i4].mv, 0, sizeof(MotionVector));
+						memset( &motion->motion[LIST_1][j4][i4].mv, 0, sizeof(MotionVector));
+
+						motion->motion[LIST_0][j4][i4].ref_idx = 0;
+						motion->motion[LIST_1][j4][i4].ref_idx = 0;
+					}
+					else // co-located skip or inter mode
+					{
+						int mapped_idx=0;
+						int iref;
+
+						// search the active LIST_0 entries for the picture the co-located block referenced
+						for (iref=0;iref<imin(currSlice->num_ref_idx_l0_active,p_Vid->listXsize[LIST_0 + list_offset]);iref++)
+						{
+							if(p_Vid->structure==0 && curr_mb_field==0)
+							{
+								// If the current MB is a frame MB and the colocated is from a field picture,
+								// then the colocated->ref_pic_id may have been generated from the wrong value of
+								// frame_poc if it references it's complementary field, so test both POC values
+								if(p_Vid->listX[0][iref]->top_poc*2 == colocated->motion[refList][j6][i4].ref_pic_id || p_Vid->listX[0][iref]->bottom_poc*2 == colocated->motion[refList][j6][i4].ref_pic_id)
+								{
+									mapped_idx=iref;
+									break;
+								}
+								else //! no match for this entry: mark the index invalid (should not survive the loop)
+									mapped_idx=INVALIDINDEX;
+								continue;   // frame-MB case handled above; skip the ref_pic_num comparison
+							}
+
+							if (dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset][iref]==colocated->motion[refList][j6][i4].ref_pic_id)
+							{
+								mapped_idx=iref;
+								break;
+							}
+							else //! no match for this entry: mark the index invalid (should not survive the loop)
+							{
+								mapped_idx=INVALIDINDEX;
+							}
+						}
+						if (INVALIDINDEX == mapped_idx)
+						{
+							// NOTE(review): if error() returns, mv/ref_idx for this block keep their previous
+							// values and are still used for the ref_pic_id lookup below - confirm error() does not return
+							error("temporal direct error: colocated block has ref that is unavailable",-1111);
+						}
+						else
+						{
+							int mv_scale = currSlice->mvscale[LIST_0 + list_offset][mapped_idx];
+
+							//! In such case, an array is needed for each different reference.
+							// 9999 is tested as a special mvscale value; it and long-term references
+							// copy the co-located vector unscaled and zero the LIST_1 vector
+							if (mv_scale == 9999 || p_Vid->listX[LIST_0+list_offset][mapped_idx]->is_long_term)
+							{
+								memcpy(&motion->motion[LIST_0][j4][i4].mv, &colocated->motion[refList][j6][i4].mv, sizeof(MotionVector));
+								memset(&motion->motion[LIST_1][j4][i4].mv, 0, sizeof(MotionVector));
+							}
+							else
+							{
+								// LIST_0 vector = co-located vector * mv_scale / 256, rounded
+								motion->motion[LIST_0][j4][i4].mv[0]= (short) ((mv_scale * colocated->motion[refList][j6][i4].mv[0] + 128 ) >> 8);
+								motion->motion[LIST_0][j4][i4].mv[1]= (short) ((mv_scale * colocated->motion[refList][j6][i4].mv[1] + 128 ) >> 8);
+
+								// LIST_1 vector = LIST_0 vector - co-located vector
+								motion->motion[LIST_1][j4][i4].mv[0]= (short) (motion->motion[LIST_0][j4][i4].mv[0] - colocated->motion[refList][j6][i4].mv[0]);
+								motion->motion[LIST_1][j4][i4].mv[1]= (short) (motion->motion[LIST_0][j4][i4].mv[1] - colocated->motion[refList][j6][i4].mv[1]);
+							}
+
+							motion->motion[LIST_0][j4][i4].ref_idx = (char) mapped_idx; //p_Vid->listX[1][0]->ref_idx[refList][j4][i4];
+							motion->motion[LIST_1][j4][i4].ref_idx = 0;
+						}
+					}
+					// store reference picture ID determined by direct mode
+					motion->motion[LIST_0][j4][i4].ref_pic_id = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_0 + list_offset][(short)motion->motion[LIST_0][j4][i4].ref_idx];
+					motion->motion[LIST_1][j4][i4].ref_pic_id = dec_picture->ref_pic_num[p_Vid->current_slice_nr][LIST_1 + list_offset][(short)motion->motion[LIST_1][j4][i4].ref_idx];
+				}
+			}
+
+			// motion compensation for the direct sub-macroblock: one 8x8 block with
+			// 8x8 inference, otherwise four 4x4 blocks. In the spatial case pred_dir is
+			// the value left by the last 4x4 block of the derivation loop above.
+			if (p_Vid->active_sps->direct_8x8_inference_flag)
+			{
+				int i =  (decode_block_scan[k_start] & 3);
+				int j = ((decode_block_scan[k_start] >> 2) & 3);
+				perform_mc8x8(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, curr_mb_field);           
+			}
+			else
+			{
+				for (k = k_start; k < k_start+BLOCK_MULTIPLE; k ++)
+				{
+					int i =  (decode_block_scan[k] & 3);
+					int j = ((decode_block_scan[k] >> 2) & 3);
+					perform_mc(currMB, curr_plane, dec_picture, pred_dir, i, j, list_offset, BLOCK_SIZE, BLOCK_SIZE, curr_mb_field);           
+				}
+			}
+		}
+	}
+
+	iTransform(currMB, curr_plane, 0); 
+}
+
+void set_chroma_qp(Macroblock* currMB);
+
+/*!
+************************************************************************
+* \brief
+*    Store a new QP in the macroblock and refresh what depends on it:
+*    the scaled luma QP, the chroma QPs (via set_chroma_qp()) and the
+*    is_lossless flag.
+*
+* \param currMB
+*    macroblock to update
+* \param qp
+*    new QP value
+************************************************************************
+*/
+static inline void update_qp(Macroblock *currMB, int qp)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+
+	currMB->qp = qp;
+	currMB->qp_scaled[0] = qp + p_Vid->bitdepth_luma_qp_scale;
+	set_chroma_qp(currMB);
+
+	/* lossless only when the scaled luma QP is zero and lossless_qpprime_flag is 1 */
+	currMB->is_lossless = (Boolean) ((currMB->qp_scaled[0] == 0) && (p_Vid->lossless_qpprime_flag == 1));
+}
+
+/*!
+************************************************************************
+* \brief
+*    Copy IPCM coefficients to decoded picture buffer and set parameters for this MB
+*    (for IPCM CABAC and IPCM CAVLC  28/11/2003)
+*
+* \author
+*    Dong Wang <Dong.Wang@bristol.ac.uk>
+*
+* \param currMB
+*    IPCM macroblock whose samples are written to p_Vid->dec_picture
+************************************************************************
+*/
+void mb_pred_ipcm(Macroblock *currMB)
+{
+	Slice *currSlice = currMB->p_Slice;
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	int plane, row, col;
+
+	// IPCM samples live in currSlice->ipcm, which is set in function read_IPCM_coeffs_from_NAL()
+
+	// luma: 16x16 samples from ipcm[0]
+	for (row = 0; row < MB_BLOCK_SIZE; ++row)
+	{
+		for (col = 0; col < MB_BLOCK_SIZE; ++col)
+		{
+			dec_picture->imgY->img[currMB->pix_y + row][currMB->pix_x + col] = (imgpel) currSlice->ipcm[0][row][col];
+		}
+	}
+
+	// chroma: ipcm[1] and ipcm[2] go to imgUV[0] and imgUV[1]
+	if ((dec_picture->chroma_format_idc != YUV400) && !IS_INDEPENDENT(p_Vid))
+	{
+		for (plane = 1; plane <= 2; ++plane)
+		{
+			for (row = 0; row < p_Vid->mb_cr_size_y; ++row)
+			{
+				for (col = 0; col < p_Vid->mb_cr_size_x; ++col)
+				{
+					dec_picture->imgUV[plane - 1]->img[currMB->pix_c_y + row][currMB->pix_c_x + col] = (imgpel) currSlice->ipcm[plane][row][col];
+				}
+			}
+		}
+	}
+
+	// for deblocking filter
+	update_qp(currMB, 0);
+
+	// for CAVLC: every byte of this macroblock's nz_coeff entry becomes 16.
+	// These parameters are to be used in CAVLC decoding of neighbour blocks
+	memset(&p_Vid->nz_coeff[currMB->mbAddrX][0][0][0], 16, sizeof(h264_nz_coefficient));
+
+	// for CABAC decoding of MB skip flag
+	currMB->skip_flag = 0;
+
+	// for deblocking filter CABAC
+	currMB->cbp_blk[0] = 0xFFFF;
+
+	// for CABAC decoding of Dquant
+	currSlice->last_dquant = 0;
+}
+
diff --git a/Src/h264dec/ldecod/src/mbuffer.c b/Src/h264dec/ldecod/src/mbuffer.c
new file mode 100644
index 00000000..9784d1ef
--- /dev/null
+++ b/Src/h264dec/ldecod/src/mbuffer.c
@@ -0,0 +1,4409 @@
+
+/*!
+ ***********************************************************************
+ *  \file
+ *      mbuffer.c
+ *
+ *  \brief
+ *      Frame buffer functions
+ *
+ *  \author
+ *      Main contributors (see contributors.h for copyright, address and affiliation details)
+ *      - Karsten Sühring                 <suehring@hhi.de>
+ *      - Alexis Tourapis                 <alexismt@ieee.org>
+ *      - Jill Boyce                      <jill.boyce@thomson.net>
+ *      - Saurav K Bandyopadhyay          <saurav@ieee.org>
+ *      - Zhenyu Wu                       <Zhenyu.Wu@thomson.net>
+ *      - Purvin Pandit                   <Purvin.Pandit@thomson.net>
+ *
+ ***********************************************************************
+ */
+
+#include <limits.h>
+
+#include "global.h"
+#include "erc_api.h"
+#include "header.h"
+#include "image.h"
+#include "mbuffer.h"
+#include "memalloc.h"
+#include "output.h"
+
+
+
+/* Prototypes for the file-local (static) DPB helpers, so they can be called before their definitions. */
+static void insert_picture_in_dpb    (VideoParameters *p_Vid, FrameStore* fs, StorablePicture* p);
+static void output_one_frame_from_dpb(VideoParameters *p_Vid);
+static void get_smallest_poc         (DecodedPictureBuffer *p_Dpb, int *poc,int * pos);
+static void gen_field_ref_ids        (StorablePicture *p);
+static int  remove_unused_frame_from_dpb (VideoParameters *p_Vid, DecodedPictureBuffer *p_Dpb);
+static int  is_used_for_reference    (FrameStore* fs);
+static int  is_short_term_reference  (FrameStore* fs);
+static int  is_long_term_reference   (FrameStore* fs);
+
+/* Number of StorablePicture* slots allocated for each p_Vid->listX[] list in init_dpb()
+   (the allocation there is annotated "+1 for reordering"). */
+#define MAX_LIST_SIZE 33
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Print out list of pictures in DPB. Used for debug purposes.
+ *
+ *    Compiles to an empty function unless DUMP_DPB is non-zero. One line
+ *    is printed per used frame store: frame_num, the POC values of the
+ *    parts marked in is_used (bit 0 = top, bit 1 = bottom, 3 = frame),
+ *    then the reference, long-term, output and non-existing markers.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer to print
+ ************************************************************************
+ */
+static void dump_dpb(DecodedPictureBuffer *p_Dpb)
+{
+#if DUMP_DPB
+  unsigned i;
+
+  for (i=0; i<p_Dpb->used_size;i++)
+  {
+    FrameStore *fs = p_Dpb->fs[i];
+
+    printf("(");
+    printf("fn=%d  ", fs->frame_num);
+    if (fs->is_used & 1)
+    {
+      /* prefer the separate field picture, fall back to the frame's field POC */
+      if (fs->top_field)
+        printf("T: poc=%d  ", fs->top_field->poc);
+      else
+        printf("T: poc=%d  ", fs->frame->top_poc);
+    }
+    if (fs->is_used & 2)
+    {
+      if (fs->bottom_field)
+        printf("B: poc=%d  ", fs->bottom_field->poc);
+      else
+        printf("B: poc=%d  ", fs->frame->bottom_poc);
+    }
+    if (fs->is_used == 3)
+      printf("F: poc=%d  ", fs->frame->poc);
+    printf("G: poc=%d)  ", fs->poc);
+    if (fs->is_reference) printf ("ref (%d) ", fs->is_reference);
+    /* print the long-term marking itself (the original repeated is_reference here) */
+    if (fs->is_long_term) printf ("lt_ref (%d) ", fs->is_long_term);
+    if (fs->is_output) printf ("out  ");
+    if (fs->is_used == 3)
+    {
+      if (fs->frame->non_existing) printf ("ne  ");
+    }
+    printf ("\n");
+  }
+#endif
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Returns the size of the dpb depending on level and picture size
+ *
+ *    A per-level budget selected by level_idc is divided by the size of
+ *    one picture (macroblock count * 384) and capped at 16. When the VUI
+ *    carries bitstream restrictions, max_dec_frame_buffering (but at
+ *    least 1) is returned instead; error() is called first if that value
+ *    exceeds the level-derived size.
+ *
+ * \param active_sps
+ *    sequence parameter set supplying level, picture size and VUI data
+ *
+ * \return
+ *    DPB size in frames
+ ************************************************************************
+ */
+static int getDpbSize(seq_parameter_set_rbsp_t *active_sps)
+{
+  int picture_cost = (active_sps->pic_width_in_mbs_minus1 + 1) * (active_sps->pic_height_in_map_units_minus1 + 1) * (active_sps->frame_mbs_only_flag?1:2) * 384;
+  int level_budget = 0;
+  int frames;
+
+  /* budgets presumably follow the MaxDPB column of the H.264 level table - TODO confirm */
+  switch (active_sps->level_idc)
+  {
+  case 9:
+  case 10:
+    level_budget = 152064;
+    break;
+  case 11:
+    /* level_idc 11 gets the smaller budget for non-FRExt profiles with constrained_set3_flag set */
+    level_budget = (!IS_FREXT_PROFILE(active_sps->profile_idc) && (active_sps->constrained_set3_flag == 1)) ? 152064 : 345600;
+    break;
+  case 12:
+  case 13:
+  case 20:
+    level_budget = 912384;
+    break;
+  case 21:
+    level_budget = 1824768;
+    break;
+  case 22:
+  case 30:
+    level_budget = 3110400;
+    break;
+  case 31:
+    level_budget = 6912000;
+    break;
+  case 32:
+    level_budget = 7864320;
+    break;
+  case 40:
+  case 41:
+    level_budget = 12582912;
+    break;
+  case 42:
+    level_budget = 13369344;
+    break;
+  case 50:
+    level_budget = 42393600;
+    break;
+  case 51:
+    level_budget = 70778880;
+    break;
+  default:
+    error ("undefined level", 500);
+    break;
+  }
+
+  frames = level_budget / picture_cost;
+  frames = imin( frames, 16);
+
+  if (active_sps->vui_parameters_present_flag && active_sps->vui_seq_parameters.bitstream_restriction_flag)
+  {
+    if ((int)active_sps->vui_seq_parameters.max_dec_frame_buffering > frames)
+    {
+      error ("max_dec_frame_buffering larger than MaxDpbSize", 500);
+    }
+    frames = imax (1, active_sps->vui_seq_parameters.max_dec_frame_buffering);
+  }
+
+  return frames;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Check the number of frames marked "used for reference" (short-term
+ *    plus long-term) and call error() if it exceeds the allowed maximum,
+ *    which is num_ref_frames but never less than 1.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer whose reference counters are checked
+ ************************************************************************
+ */
+void check_num_ref(DecodedPictureBuffer *p_Dpb)
+{
+  /* within the limit: nothing to report */
+  if ((int)(p_Dpb->ltref_frames_in_buffer +  p_Dpb->ref_frames_in_buffer ) <= (imax(1, p_Dpb->num_ref_frames)))
+    return;
+
+  error ("Max. number of reference frames exceeded. Invalid stream.", 500);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate memory for decoded picture buffer and initialize with sane values.
+ *
+ *    An already initialised DPB is released first. The buffer is sized
+ *    with getDpbSize() from the active SPS, the frame stores and the six
+ *    listX[] reference lists are allocated, a dummy "no reference"
+ *    picture is created, and the output picture pool is initialised.
+ *
+ * \param p_Vid
+ *    decoder state owning the DPB (p_Vid->p_Dpb) and the active SPS
+ ************************************************************************
+ */
+void init_dpb(VideoParameters *p_Vid)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+  size_t num_output_pictures;
+  unsigned slot, list;
+
+  /* drop whatever a previous call set up */
+  if (p_Dpb->init_done)
+    free_dpb(p_Vid);
+
+  p_Dpb->p_Vid = p_Vid;
+  p_Dpb->size  = getDpbSize(active_sps);
+  p_Dpb->num_ref_frames = active_sps->num_ref_frames;
+
+  if (p_Dpb->size < active_sps->num_ref_frames)
+    error ("DPB size at specified level is smaller than the specified number of reference frames. This is not allowed.\n", 1000);
+
+  p_Dpb->used_size = 0;
+  p_Dpb->last_picture = NULL;
+  p_Dpb->ref_frames_in_buffer = 0;
+  p_Dpb->ltref_frames_in_buffer = 0;
+
+  /* three parallel pointer tables, each with p_Dpb->size entries */
+  p_Dpb->fs = calloc(p_Dpb->size, sizeof (FrameStore*));
+  if (p_Dpb->fs == NULL)
+    no_mem_exit("init_dpb: dpb->fs");
+
+  p_Dpb->fs_ref = calloc(p_Dpb->size, sizeof (FrameStore*));
+  if (p_Dpb->fs_ref == NULL)
+    no_mem_exit("init_dpb: dpb->fs_ref");
+
+  p_Dpb->fs_ltref = calloc(p_Dpb->size, sizeof (FrameStore*));
+  if (p_Dpb->fs_ltref == NULL)
+    no_mem_exit("init_dpb: dpb->fs_ltref");
+
+  /* only fs[] owns frame stores; the two reference tables start empty */
+  for (slot = 0; slot < p_Dpb->size; ++slot)
+  {
+    p_Dpb->fs[slot]       = alloc_frame_store();
+    p_Dpb->fs_ref[slot]   = NULL;
+    p_Dpb->fs_ltref[slot] = NULL;
+  }
+
+  for (list = 0; list < 6; ++list)
+  {
+    p_Vid->listX[list] = calloc(MAX_LIST_SIZE, sizeof (StorablePicture*)); // +1 for reordering
+    if (p_Vid->listX[list] == NULL)
+      no_mem_exit("init_dpb: p_Vid->listX[i]");
+  }
+
+  /* allocate a dummy storable picture whose frame/field links point back at itself */
+  p_Vid->no_reference_picture = alloc_storable_picture (p_Vid, FRAME, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+  p_Vid->no_reference_picture->top_field    = p_Vid->no_reference_picture;
+  p_Vid->no_reference_picture->bottom_field = p_Vid->no_reference_picture;
+  p_Vid->no_reference_picture->frame        = p_Vid->no_reference_picture;
+
+  /* every reference list starts empty */
+  for (list = 0; list < 6; ++list)
+  {
+    for (slot = 0; slot < MAX_LIST_SIZE; ++slot)
+      p_Vid->listX[list][slot] = NULL;
+    p_Vid->listXsize[list] = 0;
+  }
+
+  p_Dpb->last_output_poc = INT_MIN;
+  p_Vid->last_has_mmco_5 = 0;
+  p_Dpb->init_done = 1;
+
+  /* NOTE(review): max_dec_frame_buffering is read here without testing the VUI presence flags
+     that getDpbSize() checks - confirm the field is zero when the VUI is absent */
+  num_output_pictures = getDpbSize(active_sps) + active_sps->vui_seq_parameters.max_dec_frame_buffering;
+  out_storable_pictures_init(p_Vid, num_output_pictures);
+}
+/*!
+ ************************************************************************
+ * \brief
+ *    Free memory for decoded picture buffer.
+ *
+ *    Releases the frame stores, the three frame-store pointer tables,
+ *    the six listX[] reference lists and the dummy "no reference"
+ *    picture. Every released pointer is reset to NULL so that calling
+ *    this function again without an intervening init_dpb() does not free
+ *    the same memory twice.
+ *
+ * \param p_Vid
+ *    decoder state owning the DPB (p_Vid->p_Dpb)
+ ************************************************************************
+ */
+void free_dpb(VideoParameters *p_Vid)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned i;
+  if (p_Dpb->fs)
+  {
+    for (i=0; i<p_Dpb->size; i++)
+    {
+      free_frame_store(p_Vid, p_Dpb->fs[i]);
+    }
+    free (p_Dpb->fs);
+    p_Dpb->fs=NULL;
+  }
+  /* fs_ref / fs_ltref only alias frame stores owned by fs[]; free the tables themselves */
+  if (p_Dpb->fs_ref)
+  {
+    free (p_Dpb->fs_ref);
+    p_Dpb->fs_ref = NULL;
+  }
+  if (p_Dpb->fs_ltref)
+  {
+    free (p_Dpb->fs_ltref);
+    p_Dpb->fs_ltref = NULL;
+  }
+  p_Dpb->last_output_poc = INT_MIN;
+
+  for (i=0; i<6; i++)
+    if (p_Vid->listX[i])
+    {
+      free (p_Vid->listX[i]);
+      p_Vid->listX[i] = NULL;
+    }
+
+  p_Dpb->init_done = 0;
+
+  if (p_Vid->no_reference_picture)
+  {
+    free_storable_picture(p_Vid, p_Vid->no_reference_picture);
+    p_Vid->no_reference_picture = NULL;
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate one decoded picture buffer frame store and initialize it
+ *    with sane values: all status flags cleared, no pictures attached.
+ *
+ * \return
+ *    the allocated FrameStore structure (no_mem_exit() is called when the
+ *    allocation fails)
+ ************************************************************************
+ */
+FrameStore* alloc_frame_store(void)
+{
+  FrameStore *store = calloc (1, sizeof(FrameStore));
+
+  if (store == NULL)
+    no_mem_exit("alloc_frame_store: f");
+
+  // no picture is attached yet
+  store->frame        = NULL;
+  store->top_field    = NULL;
+  store->bottom_field = NULL;
+
+  // status flags: unused, not a reference, not yet output
+  store->is_used           = 0;
+  store->is_reference      = 0;
+  store->is_long_term      = 0;
+  store->is_orig_reference = 0;
+  store->is_output         = 0;
+
+  return store;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Free frame store memory: the attached frame, top field and bottom
+ *    field pictures (whichever are present) and the store itself.
+ *
+ * \param p_Vid
+ *    video parameters, passed through to free_storable_picture()
+ * \param f
+ *    FrameStore to be freed; NULL is accepted and ignored
+ *
+ ************************************************************************
+ */
+void free_frame_store(VideoParameters *p_Vid, FrameStore* f)
+{
+  if (f == NULL)
+    return;
+
+  if (f->frame != NULL)
+  {
+    free_storable_picture(p_Vid, f->frame);
+    f->frame = NULL;
+  }
+
+  if (f->top_field != NULL)
+  {
+    free_storable_picture(p_Vid, f->top_field);
+    f->top_field = NULL;
+  }
+
+  if (f->bottom_field != NULL)
+  {
+    free_storable_picture(p_Vid, f->bottom_field);
+    f->bottom_field = NULL;
+  }
+
+  free(f);
+}
+
+void free_pic_motion(VideoParameters *p_Vid, PicMotionParams *motion, int size_x, int size_y);
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    mark FrameStore unused for reference
+ *
+ *    Clears used_for_reference on every picture held by the frame store
+ *    (according to the is_used bits: 1 = top field, 2 = bottom field,
+ *    3 = both fields and the frame), clears fs->is_reference and then
+ *    calls free_pic_motion() on the motion data of each attached picture.
+ *
+ * \param p_Vid
+ *    video parameters, passed through to free_pic_motion()
+ * \param fs
+ *    frame store to unmark; must not be NULL
+ *
+ ************************************************************************
+ */
+static void unmark_for_reference(VideoParameters *p_Vid, FrameStore* fs)
+{
+
+  if (fs->is_used & 1)
+  {
+    if (fs->top_field)
+    {
+      fs->top_field->used_for_reference = 0;
+    }
+  }
+  if (fs->is_used & 2)
+  {
+    if (fs->bottom_field)
+    {
+      fs->bottom_field->used_for_reference = 0;
+    }
+  }
+  if (fs->is_used == 3)
+  {
+    if (fs->top_field && fs->bottom_field)
+    {
+      fs->top_field->used_for_reference = 0;
+      fs->bottom_field->used_for_reference = 0;
+    }
+    // guard the frame pointer the same way the fields are guarded; the
+    // motion cleanup below already treats a missing frame as possible
+    if (fs->frame)
+    {
+      fs->frame->used_for_reference = 0;
+    }
+  }
+
+  fs->is_reference = 0;
+
+  if(fs->frame)
+  {
+    free_pic_motion(p_Vid, &fs->frame->motion, fs->frame->size_x, fs->frame->size_y);
+  }
+
+  if (fs->top_field)
+  {
+    free_pic_motion(p_Vid, &fs->top_field->motion, fs->top_field->size_x, fs->top_field->size_y);
+  }
+
+  if (fs->bottom_field)
+  {
+    free_pic_motion(p_Vid, &fs->bottom_field->motion, fs->bottom_field->size_x, fs->bottom_field->size_y);
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    mark FrameStore unused for reference and reset long term flags
+ *
+ *    Clears used_for_reference and is_long_term on every picture held by
+ *    the frame store (according to the is_used bits: 1 = top field,
+ *    2 = bottom field, 3 = both fields and the frame) and finally clears
+ *    fs->is_reference and fs->is_long_term.
+ *
+ * \param fs
+ *    frame store to unmark; must not be NULL
+ *
+ ************************************************************************
+ */
+static void unmark_for_long_term_reference(FrameStore* fs)
+{
+
+  if (fs->is_used & 1)
+  {
+    if (fs->top_field)
+    {
+      fs->top_field->used_for_reference = 0;
+      fs->top_field->is_long_term = 0;
+    }
+  }
+  if (fs->is_used & 2)
+  {
+    if (fs->bottom_field)
+    {
+      fs->bottom_field->used_for_reference = 0;
+      fs->bottom_field->is_long_term = 0;
+    }
+  }
+  if (fs->is_used == 3)
+  {
+    if (fs->top_field && fs->bottom_field)
+    {
+      fs->top_field->used_for_reference = 0;
+      fs->top_field->is_long_term = 0;
+      fs->bottom_field->used_for_reference = 0;
+      fs->bottom_field->is_long_term = 0;
+    }
+    // guard the frame pointer the same way the fields are guarded
+    if (fs->frame)
+    {
+      fs->frame->used_for_reference = 0;
+      fs->frame->is_long_term = 0;
+    }
+  }
+
+  fs->is_reference = 0;
+  fs->is_long_term = 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders stored pictures by pic_num, descending
+ *
+ * \param arg1
+ *    pointer to the first array element (a StorablePicture pointer)
+ * \param arg2
+ *    pointer to the second array element (a StorablePicture pointer)
+ * \return
+ *    1 if the first pic_num is smaller, -1 if it is larger, 0 if equal
+ ************************************************************************
+ */
+static inline int compare_pic_by_pic_num_desc( const void *arg1, const void *arg2 )
+{
+  const StorablePicture *lhs = *(StorablePicture *const *)arg1;
+  const StorablePicture *rhs = *(StorablePicture *const *)arg2;
+  const int a = lhs->pic_num;
+  const int b = rhs->pic_num;
+
+  // inverted three-way compare so that the larger value sorts first
+  return (a < b) - (a > b);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders stored pictures by long_term_pic_num,
+ *    ascending
+ *
+ * \param arg1
+ *    pointer to the first array element (a StorablePicture pointer)
+ * \param arg2
+ *    pointer to the second array element (a StorablePicture pointer)
+ * \return
+ *    -1 if the first long_term_pic_num is smaller, 1 if it is larger,
+ *    0 if equal
+ ************************************************************************
+ */
+static inline int compare_pic_by_lt_pic_num_asc( const void *arg1, const void *arg2 )
+{
+  const StorablePicture *lhs = *(StorablePicture *const *)arg1;
+  const StorablePicture *rhs = *(StorablePicture *const *)arg2;
+  const int a = lhs->long_term_pic_num;
+  const int b = rhs->long_term_pic_num;
+
+  // plain three-way compare: the smaller value sorts first
+  return (a > b) - (a < b);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders frame stores by frame_num_wrap, descending
+ *
+ * \param arg1
+ *    pointer to the first array element (a FrameStore pointer)
+ * \param arg2
+ *    pointer to the second array element (a FrameStore pointer)
+ * \return
+ *    1 if the first frame_num_wrap is smaller, -1 if it is larger,
+ *    0 if equal
+ ************************************************************************
+ */
+static inline int compare_fs_by_frame_num_desc( const void *arg1, const void *arg2 )
+{
+  const FrameStore *lhs = *(FrameStore *const *)arg1;
+  const FrameStore *rhs = *(FrameStore *const *)arg2;
+  const int a = lhs->frame_num_wrap;
+  const int b = rhs->frame_num_wrap;
+
+  // inverted three-way compare so that the larger value sorts first
+  return (a < b) - (a > b);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders frame stores by long_term_frame_idx,
+ *    ascending
+ *
+ * \param arg1
+ *    pointer to the first array element (a FrameStore pointer)
+ * \param arg2
+ *    pointer to the second array element (a FrameStore pointer)
+ * \return
+ *    -1 if the first long_term_frame_idx is smaller, 1 if it is larger,
+ *    0 if equal
+ ************************************************************************
+ */
+static inline int compare_fs_by_lt_pic_idx_asc( const void *arg1, const void *arg2 )
+{
+  const FrameStore *lhs = *(FrameStore *const *)arg1;
+  const FrameStore *rhs = *(FrameStore *const *)arg2;
+  const int a = lhs->long_term_frame_idx;
+  const int b = rhs->long_term_frame_idx;
+
+  // plain three-way compare: the smaller value sorts first
+  return (a > b) - (a < b);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders stored pictures by poc, ascending
+ *
+ * \param arg1
+ *    pointer to the first array element (a StorablePicture pointer)
+ * \param arg2
+ *    pointer to the second array element (a StorablePicture pointer)
+ * \return
+ *    -1 if the first poc is smaller, 1 if it is larger, 0 if equal
+ ************************************************************************
+ */
+static inline int compare_pic_by_poc_asc( const void *arg1, const void *arg2 )
+{
+  const StorablePicture *lhs = *(StorablePicture *const *)arg1;
+  const StorablePicture *rhs = *(StorablePicture *const *)arg2;
+  const int a = lhs->poc;
+  const int b = rhs->poc;
+
+  // plain three-way compare: the smaller value sorts first
+  return (a > b) - (a < b);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders stored pictures by poc, descending
+ *
+ * \param arg1
+ *    pointer to the first array element (a StorablePicture pointer)
+ * \param arg2
+ *    pointer to the second array element (a StorablePicture pointer)
+ * \return
+ *    1 if the first poc is smaller, -1 if it is larger, 0 if equal
+ ************************************************************************
+ */
+static inline int compare_pic_by_poc_desc( const void *arg1, const void *arg2 )
+{
+  const StorablePicture *lhs = *(StorablePicture *const *)arg1;
+  const StorablePicture *rhs = *(StorablePicture *const *)arg2;
+  const int a = lhs->poc;
+  const int b = rhs->poc;
+
+  // inverted three-way compare so that the larger value sorts first
+  return (a < b) - (a > b);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders frame stores by poc, ascending
+ *
+ * \param arg1
+ *    pointer to the first array element (a FrameStore pointer)
+ * \param arg2
+ *    pointer to the second array element (a FrameStore pointer)
+ * \return
+ *    -1 if the first poc is smaller, 1 if it is larger, 0 if equal
+ ************************************************************************
+ */
+static inline int compare_fs_by_poc_asc( const void *arg1, const void *arg2 )
+{
+  const FrameStore *lhs = *(FrameStore *const *)arg1;
+  const FrameStore *rhs = *(FrameStore *const *)arg2;
+  const int a = lhs->poc;
+  const int b = rhs->poc;
+
+  // plain three-way compare: the smaller value sorts first
+  return (a > b) - (a < b);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    qsort comparator: orders frame stores by poc, descending
+ *
+ * \param arg1
+ *    pointer to the first array element (a FrameStore pointer)
+ * \param arg2
+ *    pointer to the second array element (a FrameStore pointer)
+ * \return
+ *    1 if the first poc is smaller, -1 if it is larger, 0 if equal
+ ************************************************************************
+ */
+static inline int compare_fs_by_poc_desc( const void *arg1, const void *arg2 )
+{
+  const FrameStore *lhs = *(FrameStore *const *)arg1;
+  const FrameStore *rhs = *(FrameStore *const *)arg2;
+  const int a = lhs->poc;
+  const int b = rhs->poc;
+
+  // inverted three-way compare so that the larger value sorts first
+  return (a < b) - (a > b);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    tells whether a picture is a short term reference picture, i.e.
+ *    used_for_reference is set and is_long_term is not
+ *
+ * \param s
+ *    picture to test; must not be NULL
+ * \return
+ *    1 for a short term reference picture, 0 otherwise
+ ************************************************************************
+ */
+int is_short_ref(StorablePicture *s)
+{
+  if (!s->used_for_reference)
+    return 0;
+
+  return s->is_long_term ? 0 : 1;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    tells whether a picture is a long term reference picture, i.e.
+ *    both used_for_reference and is_long_term are set
+ *
+ * \param s
+ *    picture to test; must not be NULL
+ * \return
+ *    1 for a long term reference picture, 0 otherwise
+ ************************************************************************
+ */
+int is_long_ref(StorablePicture *s)
+{
+  if (!s->used_for_reference)
+    return 0;
+
+  return s->is_long_term ? 1 : 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Scans fs_list from *pos onwards for the next frame store whose field
+ *    of the requested parity is in use and accepted by is_ref; when one is
+ *    found the field is appended to list. *pos always ends up one past
+ *    the last frame store that was examined.
+ *
+ * \param fs_list
+ *    frame stores to scan
+ * \param list_idx
+ *    number of entries in fs_list
+ * \param pos
+ *    in/out scan position inside fs_list
+ * \param parity
+ *    1 selects the top field, 2 selects the bottom field (is_used bits)
+ * \param is_ref
+ *    predicate deciding whether a field qualifies
+ * \param list
+ *    output picture list
+ * \param list_size
+ *    in/out number of entries in list
+ ************************************************************************
+ */
+static void append_next_ref_field(FrameStore **fs_list, int list_idx, int *pos, int parity, int (*is_ref)(StorablePicture *s), StorablePicture **list, char *list_size)
+{
+  while (*pos < list_idx)
+  {
+    FrameStore *fs = fs_list[*pos];
+
+    (*pos)++;
+
+    if (fs->is_used & parity)
+    {
+      StorablePicture *field = (parity == 1) ? fs->top_field : fs->bottom_field;
+
+      if (is_ref(field))
+      {
+        list[(short) *list_size] = field;
+        (*list_size)++;
+        return;
+      }
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Generates an alternating field list from a given FrameStore list.
+ *    Fields are taken in turns, starting with the parity of the current
+ *    field (top first for TOP_FIELD, bottom first for BOTTOM_FIELD); once
+ *    one parity is exhausted the remaining fields of the other parity
+ *    follow. Nothing is done for any other structure value.
+ *
+ * \param currStructure
+ *    structure of the current picture
+ * \param fs_list
+ *    ordered frame stores to pick the fields from
+ * \param list_idx
+ *    number of entries in fs_list
+ * \param list
+ *    output picture list; new entries are appended at *list_size
+ * \param list_size
+ *    in/out number of entries in list
+ * \param long_term
+ *    non-zero selects long term reference fields (is_long_ref), zero
+ *    selects short term reference fields (is_short_ref)
+ ************************************************************************
+ */
+static void gen_pic_list_from_frame_list(PictureStructure currStructure, FrameStore **fs_list, int list_idx, StorablePicture **list, char *list_size, int long_term)
+{
+  int (*wanted)(StorablePicture *s) = long_term ? is_long_ref : is_short_ref;
+  int same_parity, opposite_parity;
+  int same_pos = 0;
+  int opposite_pos = 0;
+
+  if (currStructure == TOP_FIELD)
+  {
+    same_parity     = 1;
+    opposite_parity = 2;
+  }
+  else if (currStructure == BOTTOM_FIELD)
+  {
+    same_parity     = 2;
+    opposite_parity = 1;
+  }
+  else
+  {
+    return;
+  }
+
+  while ((same_pos < list_idx) || (opposite_pos < list_idx))
+  {
+    append_next_ref_field(fs_list, list_idx, &same_pos, same_parity, wanted, list, list_size);
+    append_next_ref_field(fs_list, list_idx, &opposite_pos, opposite_parity, wanted, list, list_size);
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Initialize p_Vid->listX[0] and list 1 depending on current slice type
+ *
+ *    Steps, in order:
+ *    - refresh frame_num_wrap / pic_num of the entries in p_Dpb->fs_ref
+ *      and long_term_pic_num of the entries in p_Dpb->fs_ltref (frame or
+ *      field variant, selected by currSlice->structure)
+ *    - I/SI slices: set both list sizes to 0 and return immediately
+ *    - P/SP slices: build list 0 (frame case: short term pictures by
+ *      descending pic_num followed by long term pictures by ascending
+ *      long_term_pic_num; field case: via gen_pic_list_from_frame_list);
+ *      list 1 is empty
+ *    - B slices: build list 0 and list 1 from POC order and append the
+ *      long term pictures to both
+ *    - if both lists have the same size (> 1) and identical content, swap
+ *      the first two entries of list 1
+ *    - clamp the list sizes to num_ref_idx_l0_active / num_ref_idx_l1_active
+ *      and point all remaining slots at p_Vid->no_reference_picture
+ *
+ * \param currSlice
+ *    current slice; supplies p_Vid, structure, slice_type and the active
+ *    reference index counts
+ *
+ ************************************************************************
+ */
+void init_lists(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+  int add_top = 0, add_bottom = 0;
+  unsigned i;
+  int j;
+  int MaxFrameNum = 1 << (active_sps->log2_max_frame_num_minus4 + 4);
+  int diff;
+
+  int list0idx = 0;
+  int list0idx_1 = 0;
+  int listltidx = 0;
+
+  FrameStore **fs_list0;
+  FrameStore **fs_list1;
+  FrameStore **fs_listlt;
+
+  StorablePicture *tmp_s;
+
+  // step 1: refresh the picture numbers used for ordering and reordering
+  if (currSlice->structure == FRAME)
+  {
+    for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ref[i]->is_used==3)
+      {
+        if ((p_Dpb->fs_ref[i]->frame->used_for_reference)&&(!p_Dpb->fs_ref[i]->frame->is_long_term))
+        {
+          // frame numbers above the current one are wrapped to negative values
+          if( p_Dpb->fs_ref[i]->frame_num > p_Vid->frame_num )
+          {
+            p_Dpb->fs_ref[i]->frame_num_wrap = p_Dpb->fs_ref[i]->frame_num - MaxFrameNum;
+          }
+          else
+          {
+            p_Dpb->fs_ref[i]->frame_num_wrap = p_Dpb->fs_ref[i]->frame_num;
+          }
+          p_Dpb->fs_ref[i]->frame->pic_num = p_Dpb->fs_ref[i]->frame_num_wrap;
+        }
+      }
+    }
+    // update long_term_pic_num
+    for (i = 0; i < p_Dpb->ltref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ltref[i]->is_used==3)
+      {
+        if (p_Dpb->fs_ltref[i]->frame->is_long_term)
+        {
+          p_Dpb->fs_ltref[i]->frame->long_term_pic_num = p_Dpb->fs_ltref[i]->frame->long_term_frame_idx;
+        }
+      }
+    }
+  }
+  else
+  {
+    // field decoding: the field with the parity of the current picture
+    // gets the odd number (2 * n + 1), the other field the even one
+    if (currSlice->structure == TOP_FIELD)
+    {
+      add_top    = 1;
+      add_bottom = 0;
+    }
+    else
+    {
+      add_top    = 0;
+      add_bottom = 1;
+    }
+
+    for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ref[i]->is_reference)
+      {
+        if( p_Dpb->fs_ref[i]->frame_num > p_Vid->frame_num )
+        {
+          p_Dpb->fs_ref[i]->frame_num_wrap = p_Dpb->fs_ref[i]->frame_num - MaxFrameNum;
+        }
+        else
+        {
+          p_Dpb->fs_ref[i]->frame_num_wrap = p_Dpb->fs_ref[i]->frame_num;
+        }
+        if (p_Dpb->fs_ref[i]->is_reference & 1)
+        {
+          p_Dpb->fs_ref[i]->top_field->pic_num = (2 * p_Dpb->fs_ref[i]->frame_num_wrap) + add_top;
+        }
+        if (p_Dpb->fs_ref[i]->is_reference & 2)
+        {
+          p_Dpb->fs_ref[i]->bottom_field->pic_num = (2 * p_Dpb->fs_ref[i]->frame_num_wrap) + add_bottom;
+        }
+      }
+    }
+    // update long_term_pic_num
+    for (i=0; i<p_Dpb->ltref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ltref[i]->is_long_term & 1)
+      {
+        p_Dpb->fs_ltref[i]->top_field->long_term_pic_num = 2 * p_Dpb->fs_ltref[i]->top_field->long_term_frame_idx + add_top;
+      }
+      if (p_Dpb->fs_ltref[i]->is_long_term & 2)
+      {
+        p_Dpb->fs_ltref[i]->bottom_field->long_term_pic_num = 2 * p_Dpb->fs_ltref[i]->bottom_field->long_term_frame_idx + add_bottom;
+      }
+    }
+  }
+
+  // intra slices use no reference lists; note that this early return also
+  // skips the fill with no_reference_picture at the end of the function
+  if ((currSlice->slice_type == I_SLICE)||(currSlice->slice_type == SI_SLICE))
+  {
+    p_Vid->listXsize[0] = 0;
+    p_Vid->listXsize[1] = 0;
+    return;
+  }
+
+  if ((currSlice->slice_type == P_SLICE)||(currSlice->slice_type == SP_SLICE))
+  {
+    // collect the short term reference frames (pic_num was set above)
+    if (currSlice->structure == FRAME)
+    {
+      for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ref[i]->is_used==3)
+        {
+          if ((p_Dpb->fs_ref[i]->frame->used_for_reference)&&(!p_Dpb->fs_ref[i]->frame->is_long_term))
+          {
+            p_Vid->listX[0][list0idx++] = p_Dpb->fs_ref[i]->frame;
+          }
+        }
+      }
+      // order list 0 by PicNum
+      qsort((void *)p_Vid->listX[0], list0idx, sizeof(StorablePicture*), compare_pic_by_pic_num_desc);
+      p_Vid->listXsize[0] = (char) list0idx;
+//      printf("listX[0] (PicNum): "); for (i=0; i<list0idx; i++){printf ("%d  ", p_Vid->listX[0][i]->pic_num);} printf("\n");
+
+      // long term handling
+      for (i=0; i<p_Dpb->ltref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ltref[i]->is_used==3)
+        {
+          if (p_Dpb->fs_ltref[i]->frame->is_long_term)
+          {
+            p_Vid->listX[0][list0idx++]=p_Dpb->fs_ltref[i]->frame;
+          }
+        }
+      }
+      // sort only the long term tail; listXsize[0] still holds the short term count
+      qsort((void *)&p_Vid->listX[0][(short) p_Vid->listXsize[0]], list0idx - p_Vid->listXsize[0], sizeof(StorablePicture*), compare_pic_by_lt_pic_num_asc);
+      p_Vid->listXsize[0] = (char) list0idx;
+    }
+    else
+    {
+      // field decoding: order the frame stores first, then interleave
+      // their fields by parity
+      fs_list0 = calloc(p_Dpb->size, sizeof (FrameStore*));
+      if (NULL==fs_list0)
+         no_mem_exit("init_lists: fs_list0");
+      fs_listlt = calloc(p_Dpb->size, sizeof (FrameStore*));
+      if (NULL==fs_listlt)
+         no_mem_exit("init_lists: fs_listlt");
+
+      for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ref[i]->is_reference)
+        {
+          fs_list0[list0idx++] = p_Dpb->fs_ref[i];
+        }
+      }
+
+      qsort((void *)fs_list0, list0idx, sizeof(FrameStore*), compare_fs_by_frame_num_desc);
+
+//      printf("fs_list0 (FrameNum): "); for (i=0; i<list0idx; i++){printf ("%d  ", fs_list0[i]->frame_num_wrap);} printf("\n");
+
+      p_Vid->listXsize[0] = 0;
+      gen_pic_list_from_frame_list(currSlice->structure, fs_list0, list0idx, p_Vid->listX[0], &p_Vid->listXsize[0], 0);
+
+//      printf("p_Vid->listX[0] (PicNum): "); for (i=0; i<p_Vid->listXsize[0]; i++){printf ("%d  ", p_Vid->listX[0][i]->pic_num);} printf("\n");
+
+      // long term handling
+      for (i=0; i<p_Dpb->ltref_frames_in_buffer; i++)
+      {
+        fs_listlt[listltidx++]=p_Dpb->fs_ltref[i];
+      }
+
+      qsort((void *)fs_listlt, listltidx, sizeof(FrameStore*), compare_fs_by_lt_pic_idx_asc);
+
+      // appended after the short term fields (listXsize[0] is not reset)
+      gen_pic_list_from_frame_list(currSlice->structure, fs_listlt, listltidx, p_Vid->listX[0], &p_Vid->listXsize[0], 1);
+
+      free(fs_list0);
+      free(fs_listlt);
+    }
+    p_Vid->listXsize[1] = 0;
+  }
+  else
+  {
+    // B-Slice
+    if (currSlice->structure == FRAME)
+    {
+      // list 0, first part: short term frames with poc <= current poc,
+      // ordered by descending poc
+      for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ref[i]->is_used==3)
+        {
+          if ((p_Dpb->fs_ref[i]->frame->used_for_reference)&&(!p_Dpb->fs_ref[i]->frame->is_long_term))
+          {
+            if (p_Vid->framepoc >= p_Dpb->fs_ref[i]->frame->poc) //!KS use >= for error concealment
+//            if (p_Vid->framepoc > p_Dpb->fs_ref[i]->frame->poc)
+            {
+              p_Vid->listX[0][list0idx++] = p_Dpb->fs_ref[i]->frame;
+            }
+          }
+        }
+      }
+      qsort((void *)p_Vid->listX[0], list0idx, sizeof(StorablePicture*), compare_pic_by_poc_desc);
+      list0idx_1 = list0idx;
+      // list 0, second part: short term frames with poc > current poc,
+      // ordered by ascending poc
+      for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ref[i]->is_used==3)
+        {
+          if ((p_Dpb->fs_ref[i]->frame->used_for_reference)&&(!p_Dpb->fs_ref[i]->frame->is_long_term))
+          {
+            if (p_Vid->framepoc < p_Dpb->fs_ref[i]->frame->poc)
+            {
+              p_Vid->listX[0][list0idx++] = p_Dpb->fs_ref[i]->frame;
+            }
+          }
+        }
+      }
+      qsort((void *)&p_Vid->listX[0][list0idx_1], list0idx-list0idx_1, sizeof(StorablePicture*), compare_pic_by_poc_asc);
+
+      // list 1 holds the same two parts in swapped order
+      for (j=0; j<list0idx_1; j++)
+      {
+        p_Vid->listX[1][list0idx-list0idx_1+j]=p_Vid->listX[0][j];
+      }
+      for (j=list0idx_1; j<list0idx; j++)
+      {
+        p_Vid->listX[1][j-list0idx_1]=p_Vid->listX[0][j];
+      }
+
+      p_Vid->listXsize[0] = p_Vid->listXsize[1] = (char) list0idx;
+
+//      printf("p_Vid->listX[0] currPoc=%d (Poc): ", p_Vid->framepoc); for (i=0; i<p_Vid->listXsize[0]; i++){printf ("%d  ", p_Vid->listX[0][i]->poc);} printf("\n");
+//      printf("p_Vid->listX[1] currPoc=%d (Poc): ", p_Vid->framepoc); for (i=0; i<p_Vid->listXsize[1]; i++){printf ("%d  ", p_Vid->listX[1][i]->poc);} printf("\n");
+
+      // long term handling
+      for (i=0; i<p_Dpb->ltref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ltref[i]->is_used==3)
+        {
+          if (p_Dpb->fs_ltref[i]->frame->is_long_term)
+          {
+            p_Vid->listX[0][list0idx]  =p_Dpb->fs_ltref[i]->frame;
+            p_Vid->listX[1][list0idx++]=p_Dpb->fs_ltref[i]->frame;
+          }
+        }
+      }
+      // both lists hold the same number of short term entries at this
+      // point, so listXsize[0] is used as the tail offset for list 1 too
+      qsort((void *)&p_Vid->listX[0][(short) p_Vid->listXsize[0]], list0idx-p_Vid->listXsize[0], sizeof(StorablePicture*), compare_pic_by_lt_pic_num_asc);
+      qsort((void *)&p_Vid->listX[1][(short) p_Vid->listXsize[0]], list0idx-p_Vid->listXsize[0], sizeof(StorablePicture*), compare_pic_by_lt_pic_num_asc);
+      p_Vid->listXsize[0] = p_Vid->listXsize[1] = (char) list0idx;
+    }
+    else
+    {
+      fs_list0 = calloc(p_Dpb->size, sizeof (FrameStore*));
+      if (NULL==fs_list0)
+         no_mem_exit("init_lists: fs_list0");
+      fs_list1 = calloc(p_Dpb->size, sizeof (FrameStore*));
+      if (NULL==fs_list1)
+         no_mem_exit("init_lists: fs_list1");
+      fs_listlt = calloc(p_Dpb->size, sizeof (FrameStore*));
+      if (NULL==fs_listlt)
+         no_mem_exit("init_lists: fs_listlt");
+
+      // NOTE(review): the value 1 stored in listXsize[1] here is never
+      // read; both sizes are set to 0 again before the lists are generated
+      p_Vid->listXsize[0] = 0;
+      p_Vid->listXsize[1] = 1;
+
+      // frame stores with poc <= current poc, ordered by descending poc
+      for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ref[i]->is_used)
+        {
+          if (p_Vid->ThisPOC >= p_Dpb->fs_ref[i]->poc)
+          {
+            fs_list0[list0idx++] = p_Dpb->fs_ref[i];
+          }
+        }
+      }
+      qsort((void *)fs_list0, list0idx, sizeof(FrameStore*), compare_fs_by_poc_desc);
+      list0idx_1 = list0idx;
+      // frame stores with poc > current poc, ordered by ascending poc
+      for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+      {
+        if (p_Dpb->fs_ref[i]->is_used)
+        {
+          if (p_Vid->ThisPOC < p_Dpb->fs_ref[i]->poc)
+          {
+            fs_list0[list0idx++] = p_Dpb->fs_ref[i];
+          }
+        }
+      }
+      qsort((void *)&fs_list0[list0idx_1], list0idx-list0idx_1, sizeof(FrameStore*), compare_fs_by_poc_asc);
+
+      // fs_list1 holds the same two parts in swapped order
+      for (j=0; j<list0idx_1; j++)
+      {
+        fs_list1[list0idx-list0idx_1+j]=fs_list0[j];
+      }
+      for (j=list0idx_1; j<list0idx; j++)
+      {
+        fs_list1[j-list0idx_1]=fs_list0[j];
+      }
+
+//      printf("fs_list0 currPoc=%d (Poc): ", p_Vid->ThisPOC); for (i=0; i<list0idx; i++){printf ("%d  ", fs_list0[i]->poc);} printf("\n");
+//      printf("fs_list1 currPoc=%d (Poc): ", p_Vid->ThisPOC); for (i=0; i<list0idx; i++){printf ("%d  ", fs_list1[i]->poc);} printf("\n");
+
+      p_Vid->listXsize[0] = 0;
+      p_Vid->listXsize[1] = 0;
+      gen_pic_list_from_frame_list(currSlice->structure, fs_list0, list0idx, p_Vid->listX[0], &p_Vid->listXsize[0], 0);
+      gen_pic_list_from_frame_list(currSlice->structure, fs_list1, list0idx, p_Vid->listX[1], &p_Vid->listXsize[1], 0);
+
+//      printf("p_Vid->listX[0] currPoc=%d (Poc): ", p_Vid->framepoc); for (i=0; i<p_Vid->listXsize[0]; i++){printf ("%d  ", p_Vid->listX[0][i]->poc);} printf("\n");
+//      printf("p_Vid->listX[1] currPoc=%d (Poc): ", p_Vid->framepoc); for (i=0; i<p_Vid->listXsize[1]; i++){printf ("%d  ", p_Vid->listX[1][i]->poc);} printf("\n");
+
+      // long term handling
+      for (i=0; i<p_Dpb->ltref_frames_in_buffer; i++)
+      {
+        fs_listlt[listltidx++]=p_Dpb->fs_ltref[i];
+      }
+
+      qsort((void *)fs_listlt, listltidx, sizeof(FrameStore*), compare_fs_by_lt_pic_idx_asc);
+
+      gen_pic_list_from_frame_list(currSlice->structure, fs_listlt, listltidx, p_Vid->listX[0], &p_Vid->listXsize[0], 1);
+      gen_pic_list_from_frame_list(currSlice->structure, fs_listlt, listltidx, p_Vid->listX[1], &p_Vid->listXsize[1], 1);
+
+      free(fs_list0);
+      free(fs_list1);
+      free(fs_listlt);
+    }
+  }
+
+  if ((p_Vid->listXsize[0] == p_Vid->listXsize[1]) && (p_Vid->listXsize[0] > 1))
+  {
+    // check if lists are identical, if yes swap first two elements of p_Vid->listX[1]
+    diff=0;
+    for (j = 0; j< p_Vid->listXsize[0]; j++)
+    {
+      if (p_Vid->listX[0][j]!=p_Vid->listX[1][j])
+        diff=1;
+    }
+    if (!diff)
+    {
+      tmp_s = p_Vid->listX[1][0];
+      p_Vid->listX[1][0]=p_Vid->listX[1][1];
+      p_Vid->listX[1][1]=tmp_s;
+    }
+  }
+  // set max size
+  p_Vid->listXsize[0] = (char) imin (p_Vid->listXsize[0], currSlice->num_ref_idx_l0_active);
+  p_Vid->listXsize[1] = (char) imin (p_Vid->listXsize[1], currSlice->num_ref_idx_l1_active);
+
+  // point the unused list entries at the dummy picture
+  // p_Vid->no_reference_picture (they are not set to NULL)
+  for (i=p_Vid->listXsize[0]; i< (MAX_LIST_SIZE) ; i++)
+  {
+      p_Vid->listX[0][i] = p_Vid->no_reference_picture;
+
+  }
+  for (i=p_Vid->listXsize[1]; i< (MAX_LIST_SIZE) ; i++)
+  {
+      p_Vid->listX[1][i] = p_Vid->no_reference_picture;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Initialize listX[2..5] from lists 0 and 1
+ *    listX[2]: list0 for current_field==top
+ *    listX[3]: list1 for current_field==top
+ *    listX[4]: list0 for current_field==bottom
+ *    listX[5]: list1 for current_field==bottom
+ *
+ *    All four field lists are first filled with
+ *    p_Vid->no_reference_picture. Every frame of list 0 / list 1 then
+ *    contributes its two fields, same parity first, so the field lists
+ *    end up twice as long as the frame list they are derived from.
+ *
+ * \param p_Vid
+ *    video parameters holding listX and listXsize
+ ************************************************************************
+ */
+void init_mbaff_lists(VideoParameters *p_Vid)
+{
+  unsigned slot;
+  int list, src, k;
+
+  // start from empty field lists
+  for (list = 2; list < 6; list++)
+  {
+    p_Vid->listXsize[list] = 0;
+    for (slot = 0; slot < MAX_LIST_SIZE; slot++)
+    {
+      p_Vid->listX[list][slot] = p_Vid->no_reference_picture;
+    }
+  }
+
+  // src == 0 derives listX[2] / listX[4], src == 1 derives listX[3] / listX[5]
+  for (src = 0; src < 2; src++)
+  {
+    for (k = 0; k < p_Vid->listXsize[src]; k++)
+    {
+      StorablePicture *frame = p_Vid->listX[src][k];
+
+      p_Vid->listX[src + 2][2*k    ] = frame->top_field;
+      p_Vid->listX[src + 2][2*k + 1] = frame->bottom_field;
+      p_Vid->listX[src + 4][2*k    ] = frame->bottom_field;
+      p_Vid->listX[src + 4][2*k + 1] = frame->top_field;
+    }
+    p_Vid->listXsize[src + 4] = p_Vid->listXsize[src] * 2;
+    p_Vid->listXsize[src + 2] = p_Vid->listXsize[src + 4];
+  }
+}
+
+ /*!
+ ************************************************************************
+ * \brief
+ *    Returns short term pic with given picNum
+ *
+ *    Searches p_Dpb->fs_ref. For frame decoding only complete reference
+ *    frames (is_reference == 3) are considered; otherwise the top and
+ *    bottom reference fields of every frame store are checked, top first.
+ *
+ * \param p_Vid
+ *    video parameters (structure, p_Dpb, no_reference_picture)
+ * \param picNum
+ *    pic_num to look for
+ * \return
+ *    the first matching picture that is not long term, or
+ *    p_Vid->no_reference_picture when there is none
+ ************************************************************************
+ */
+static StorablePicture*  get_short_term_pic(VideoParameters *p_Vid, int picNum)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned n;
+
+  if (p_Vid->structure==FRAME)
+  {
+    for (n = 0; n < p_Dpb->ref_frames_in_buffer; n++)
+    {
+      FrameStore *fs = p_Dpb->fs_ref[n];
+
+      if (fs->is_reference != 3)
+        continue;
+      if ((!fs->frame->is_long_term) && (fs->frame->pic_num == picNum))
+        return fs->frame;
+    }
+  }
+  else
+  {
+    for (n = 0; n < p_Dpb->ref_frames_in_buffer; n++)
+    {
+      FrameStore *fs = p_Dpb->fs_ref[n];
+
+      if ((fs->is_reference & 1) && (!fs->top_field->is_long_term) && (fs->top_field->pic_num == picNum))
+        return fs->top_field;
+      if ((fs->is_reference & 2) && (!fs->bottom_field->is_long_term) && (fs->bottom_field->pic_num == picNum))
+        return fs->bottom_field;
+    }
+  }
+
+  return p_Vid->no_reference_picture;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Returns long term pic with given LongtermPicNum
+ *
+ *    Searches p_Dpb->fs_ltref. For frame decoding only complete reference
+ *    frames (is_reference == 3) are considered; otherwise the top and
+ *    bottom reference fields of every frame store are checked, top first.
+ *
+ * \param p_Vid
+ *    video parameters (structure, p_Dpb)
+ * \param LongtermPicNum
+ *    long_term_pic_num to look for
+ * \return
+ *    the first matching long term picture, or NULL when there is none
+ *    (unlike get_short_term_pic, which returns the dummy picture)
+ ************************************************************************
+ */
+static StorablePicture*  get_long_term_pic(VideoParameters *p_Vid, int LongtermPicNum)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned n;
+
+  if (p_Vid->structure==FRAME)
+  {
+    for (n = 0; n < p_Dpb->ltref_frames_in_buffer; n++)
+    {
+      FrameStore *fs = p_Dpb->fs_ltref[n];
+
+      if (fs->is_reference != 3)
+        continue;
+      if ((fs->frame->is_long_term) && (fs->frame->long_term_pic_num == LongtermPicNum))
+        return fs->frame;
+    }
+  }
+  else
+  {
+    for (n = 0; n < p_Dpb->ltref_frames_in_buffer; n++)
+    {
+      FrameStore *fs = p_Dpb->fs_ltref[n];
+
+      if ((fs->is_reference & 1) && (fs->top_field->is_long_term) && (fs->top_field->long_term_pic_num == LongtermPicNum))
+        return fs->top_field;
+      if ((fs->is_reference & 2) && (fs->bottom_field->is_long_term) && (fs->bottom_field->long_term_pic_num == LongtermPicNum))
+        return fs->bottom_field;
+    }
+  }
+
+  return NULL;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Reordering process for short-term reference pictures
+ *
+ *    Inserts the short term picture with number picNumLX at position
+ *    *refIdxLX, shifting the following entries one slot towards the end,
+ *    and then removes the later duplicate of that picture by compacting
+ *    the tail of the list.
+ *
+ * \param p_Vid
+ *    video parameters, used to look the picture up
+ * \param RefPicListX
+ *    list to modify; index num_ref_idx_lX_active_minus1 + 1 is written as
+ *    scratch space, so the list must provide that extra slot
+ * \param num_ref_idx_lX_active_minus1
+ *    number of active entries in the list minus one
+ * \param picNumLX
+ *    pic_num of the picture to move
+ * \param refIdxLX
+ *    in/out insert position; incremented by one
+ ************************************************************************
+ */
+static void reorder_short_term(VideoParameters *p_Vid, StorablePicture **RefPicListX, int num_ref_idx_lX_active_minus1, int picNumLX, int *refIdxLX)
+{
+  const int last_slot = num_ref_idx_lX_active_minus1 + 1;
+  StorablePicture *moved = get_short_term_pic(p_Vid, picNumLX);
+  int from, to;
+
+  // open a gap at the insert position
+  for (from = last_slot; from > *refIdxLX; from--)
+  {
+    RefPicListX[from] = RefPicListX[from - 1];
+  }
+
+  RefPicListX[*refIdxLX] = moved;
+  (*refIdxLX)++;
+
+  // compact the tail: NULL entries and the old copy of the moved picture
+  // are dropped, everything else keeps its relative order
+  to = *refIdxLX;
+  for (from = *refIdxLX; from <= last_slot; from++)
+  {
+    StorablePicture *entry = RefPicListX[from];
+
+    if (entry == NULL)
+      continue;
+    if ((!entry->is_long_term) && (entry->pic_num == picNumLX))
+      continue;
+
+    RefPicListX[to++] = entry;
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Reordering process for long-term reference pictures
+ *
+ *    Inserts the long term picture with number LongTermPicNum at position
+ *    *refIdxLX, shifting the following entries one slot towards the end,
+ *    and then removes the later duplicate of that picture by compacting
+ *    the tail of the list.
+ *
+ * \param p_Vid
+ *    video parameters, used to look the picture up
+ * \param RefPicListX
+ *    list to modify; index num_ref_idx_lX_active_minus1 + 1 is written as
+ *    scratch space, so the list must provide that extra slot
+ * \param num_ref_idx_lX_active_minus1
+ *    number of active entries in the list minus one
+ * \param LongTermPicNum
+ *    long_term_pic_num of the picture to move
+ * \param refIdxLX
+ *    in/out insert position; incremented by one
+ ************************************************************************
+ */
+static void reorder_long_term(VideoParameters *p_Vid, StorablePicture **RefPicListX, int num_ref_idx_lX_active_minus1, int LongTermPicNum, int *refIdxLX)
+{
+  const int last_slot = num_ref_idx_lX_active_minus1 + 1;
+  StorablePicture *moved = get_long_term_pic(p_Vid, LongTermPicNum);
+  int from, to;
+
+  // open a gap at the insert position
+  for (from = last_slot; from > *refIdxLX; from--)
+  {
+    RefPicListX[from] = RefPicListX[from - 1];
+  }
+
+  // moved may be NULL when no such long term picture exists
+  RefPicListX[*refIdxLX] = moved;
+  (*refIdxLX)++;
+
+  // compact the tail: NULL entries and the old copy of the moved picture
+  // are dropped, everything else keeps its relative order
+  to = *refIdxLX;
+  for (from = *refIdxLX; from <= last_slot; from++)
+  {
+    StorablePicture *entry = RefPicListX[from];
+
+    if (entry == NULL)
+      continue;
+    if ((entry->is_long_term) && (entry->long_term_pic_num == LongTermPicNum))
+      continue;
+
+    RefPicListX[to++] = entry;
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Reordering process for reference picture lists
+ *
+ *    Walks the reordering commands until the terminating value 3 is
+ *    found. Commands 0 and 1 subtract / add abs_diff_pic_num_minus1 + 1
+ *    from / to the running picture number prediction (with wrap around at
+ *    maxPicNum) and move the resulting short term picture; the remaining
+ *    commands move the long term picture given by long_term_pic_idx.
+ *
+ * \param p_Vid
+ *    video parameters (structure, MaxFrameNum, frame_num)
+ * \param list
+ *    reference picture list to reorder
+ * \param list_size
+ *    out: set to num_ref_idx_lX_active_minus1 + 1
+ * \param num_ref_idx_lX_active_minus1
+ *    number of active entries in the list minus one
+ * \param reordering_of_pic_nums_idc
+ *    command array, terminated by the value 3
+ * \param abs_diff_pic_num_minus1
+ *    per command picture number difference (commands 0 and 1)
+ * \param long_term_pic_idx
+ *    per command long term picture number (long term commands)
+ ************************************************************************
+ */
+void reorder_ref_pic_list(VideoParameters *p_Vid, StorablePicture **list, char *list_size, int num_ref_idx_lX_active_minus1, int *reordering_of_pic_nums_idc, int *abs_diff_pic_num_minus1, int *long_term_pic_idx)
+{
+  int cmd;
+  int maxPicNum, currPicNum;
+  int predicted, unwrapped, target, step;
+  int next_slot = 0;
+
+  if (p_Vid->structure!=FRAME)
+  {
+    maxPicNum  = 2 * p_Vid->MaxFrameNum;
+    currPicNum = 2 * p_Vid->frame_num + 1;
+  }
+  else
+  {
+    maxPicNum  = p_Vid->MaxFrameNum;
+    currPicNum = p_Vid->frame_num;
+  }
+
+  predicted = currPicNum;
+
+  for (cmd = 0; reordering_of_pic_nums_idc[cmd] != 3; cmd++)
+  {
+    const int idc = reordering_of_pic_nums_idc[cmd];
+
+    if (idc > 3)
+      error ("Invalid remapping_of_pic_nums_idc command", 500);
+
+    if (idc >= 2)
+    {
+      // long term command
+      reorder_long_term(p_Vid, list, num_ref_idx_lX_active_minus1, long_term_pic_idx[cmd], &next_slot);
+      continue;
+    }
+
+    // short term command: update the prediction, wrapping at maxPicNum
+    step = abs_diff_pic_num_minus1[cmd] + 1;
+    if (idc == 0)
+    {
+      unwrapped = predicted - step;
+      if (unwrapped < 0)
+        unwrapped += maxPicNum;
+    }
+    else
+    {
+      unwrapped = predicted + step;
+      if (unwrapped >= maxPicNum)
+        unwrapped -= maxPicNum;
+    }
+    predicted = unwrapped;
+
+    // numbers above the current picture number refer to the previous cycle
+    target = unwrapped;
+    if (unwrapped > currPicNum)
+      target -= maxPicNum;
+
+    reorder_short_term(p_Vid, list, num_ref_idx_lX_active_minus1, target, &next_slot);
+  }
+
+  // that's a definition
+  *list_size = (char)(num_ref_idx_lX_active_minus1 + 1);
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Rebuild the list of frame stores holding short-term references.
+ *
+ *    Every one of the first used_size entries of fs that is accepted by
+ *    is_short_term_reference() is copied, in order, into fs_ref.
+ *    ref_frames_in_buffer receives the number of copied entries and the
+ *    remaining fs_ref slots below size are set to NULL.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer to update
+ ************************************************************************
+ */
+void update_ref_list(DecodedPictureBuffer *p_Dpb)
+{
+  unsigned src;
+  unsigned count = 0;
+
+  for (src = 0; src < p_Dpb->used_size; ++src)
+  {
+    if (!is_short_term_reference(p_Dpb->fs[src]))
+      continue;
+
+    p_Dpb->fs_ref[count] = p_Dpb->fs[src];
+    ++count;
+  }
+
+  p_Dpb->ref_frames_in_buffer = count;
+
+  // clear the unused tail
+  for (; count < p_Dpb->size; ++count)
+    p_Dpb->fs_ref[count] = NULL;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Rebuild the list of frame stores holding long-term references.
+ *
+ *    Every one of the first used_size entries of fs that is accepted by
+ *    is_long_term_reference() is copied, in order, into fs_ltref.
+ *    ltref_frames_in_buffer receives the number of copied entries and
+ *    the remaining fs_ltref slots below size are set to NULL.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer to update
+ ************************************************************************
+ */
+void update_ltref_list(DecodedPictureBuffer *p_Dpb)
+{
+  unsigned src;
+  unsigned count = 0;
+
+  for (src = 0; src < p_Dpb->used_size; ++src)
+  {
+    if (!is_long_term_reference(p_Dpb->fs[src]))
+      continue;
+
+    p_Dpb->fs_ltref[count] = p_Dpb->fs[src];
+    ++count;
+  }
+
+  p_Dpb->ltref_frames_in_buffer = count;
+
+  // clear the unused tail
+  for (; count < p_Dpb->size; ++count)
+    p_Dpb->fs_ltref[count] = NULL;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Perform memory management for IDR pictures.
+ *
+ *    With no_output_of_prior_pics_flag set, every used frame store is
+ *    freed and replaced by a fresh one, the reference lists are cleared
+ *    and used_size is reset; otherwise the DPB is flushed. Afterwards
+ *    the reference lists are rebuilt, last_output_poc is reset and the
+ *    long-term state of the picture is derived from
+ *    long_term_reference_flag.
+ *
+ * \param p_Vid
+ *    decoder context owning the DPB
+ * \param p
+ *    the IDR picture (asserted via idr_flag)
+ ************************************************************************
+ */
+static void idr_memory_management(VideoParameters *p_Vid, StorablePicture* p)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned k;
+
+  assert (p->idr_flag);
+
+  if (!p->no_output_of_prior_pics_flag)
+  {
+    flush_dpb(p_Vid);
+  }
+  else
+  {
+    // discard all stored pictures without output
+    for (k = 0; k < p_Dpb->used_size; ++k)
+    {
+      free_frame_store(p_Vid, p_Dpb->fs[k]);
+      p_Dpb->fs[k] = alloc_frame_store();
+    }
+
+    // drop every reference list entry
+    for (k = 0; k < p_Dpb->ref_frames_in_buffer; ++k)
+      p_Dpb->fs_ref[k] = NULL;
+
+    for (k = 0; k < p_Dpb->ltref_frames_in_buffer; ++k)
+      p_Dpb->fs_ltref[k] = NULL;
+
+    p_Dpb->used_size = 0;
+  }
+
+  p_Dpb->last_picture = NULL;
+
+  update_ref_list(p_Dpb);
+  update_ltref_list(p_Dpb);
+  p_Dpb->last_output_poc = INT_MIN;
+
+  if (!p->long_term_reference_flag)
+  {
+    p_Dpb->max_long_term_pic_idx = -1;
+    p->is_long_term              = 0;
+  }
+  else
+  {
+    p_Dpb->max_long_term_pic_idx = 0;
+    p->is_long_term              = 1;
+    p->long_term_frame_idx       = 0;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Perform sliding window decoded reference picture marking process.
+ *
+ *    When ref_frames_in_buffer equals num_ref_frames minus
+ *    ltref_frames_in_buffer, the first frame store in fs that is a
+ *    reference and not long-term is unmarked and the short-term list is
+ *    rebuilt. The picture itself is always marked as not long-term.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer
+ * \param p
+ *    current (non-IDR) picture
+ ************************************************************************
+ */
+static void sliding_window_memory_management(DecodedPictureBuffer *p_Dpb, StorablePicture* p)
+{
+  unsigned idx;
+
+  assert (!p->idr_flag);
+
+  // short-term window is full: release the first short-term entry
+  if (p_Dpb->ref_frames_in_buffer==p_Dpb->num_ref_frames - p_Dpb->ltref_frames_in_buffer)
+  {
+    for (idx = 0; idx < p_Dpb->used_size; ++idx)
+    {
+      if (!p_Dpb->fs[idx]->is_reference || p_Dpb->fs[idx]->is_long_term)
+        continue;
+
+      unmark_for_reference(p_Dpb->p_Vid, p_Dpb->fs[idx]);
+      update_ref_list(p_Dpb);
+      break;
+    }
+  }
+
+  p->is_long_term = 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Calculate picNumX.
+ *
+ * \param p
+ *    current picture; structure and frame_num are read
+ * \param difference_of_pic_nums_minus1
+ *    offset from the current picture number, minus one
+ * \return
+ *    current picture number (frame_num for frames, 2 * frame_num + 1
+ *    otherwise) minus (difference_of_pic_nums_minus1 + 1)
+ ************************************************************************
+ */
+static int get_pic_num_x (StorablePicture *p, int difference_of_pic_nums_minus1)
+{
+  const int cur_pic_num = (p->structure == FRAME) ? p->frame_num
+                                                  : 2 * p->frame_num + 1;
+
+  return cur_pic_num - (difference_of_pic_nums_minus1 + 1);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Adaptive Memory Management: Mark short term picture unused.
+ *
+ *    The target picture number is obtained from get_pic_num_x(). For a
+ *    frame picture the first fully short-term frame store with that
+ *    pic_num is unmarked; for a field picture the first matching
+ *    short-term top or bottom field is unmarked (and the frame view as
+ *    well when both fields are stored). Processing stops at the first
+ *    match.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer; fs_ref is searched
+ * \param p
+ *    current picture
+ * \param difference_of_pic_nums_minus1
+ *    offset used to derive the target picture number
+ ************************************************************************
+ */
+static void mm_unmark_short_term_for_reference(DecodedPictureBuffer *p_Dpb, StorablePicture *p, int difference_of_pic_nums_minus1)
+{
+  const int target_pic_num = get_pic_num_x(p, difference_of_pic_nums_minus1);
+  unsigned n;
+
+  for (n = 0; n < p_Dpb->ref_frames_in_buffer; ++n)
+  {
+    FrameStore *store = p_Dpb->fs_ref[n];
+
+    if (p->structure == FRAME)
+    {
+      if ((store->is_reference == 3) && (store->is_long_term == 0) && (store->frame->pic_num == target_pic_num))
+      {
+        unmark_for_reference(p_Dpb->p_Vid, store);
+        return;
+      }
+      continue;
+    }
+
+    // field picture: try the top field first, then the bottom field
+    if ((store->is_reference & 1) && !(store->is_long_term & 1) && (store->top_field->pic_num == target_pic_num))
+    {
+      store->top_field->used_for_reference = 0;
+      store->is_reference &= 2;
+      if (store->is_used == 3)
+        store->frame->used_for_reference = 0;
+      return;
+    }
+
+    if ((store->is_reference & 2) && !(store->is_long_term & 2) && (store->bottom_field->pic_num == target_pic_num))
+    {
+      store->bottom_field->used_for_reference = 0;
+      store->is_reference &= 1;
+      if (store->is_used == 3)
+        store->frame->used_for_reference = 0;
+      return;
+    }
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Adaptive Memory Management: Mark long term picture unused.
+ *
+ *    For a frame picture every fully long-term frame store whose frame
+ *    has the given long_term_pic_num is unmarked (the scan continues
+ *    after a match). For a field picture the first matching long-term
+ *    top or bottom field is unmarked, together with the frame view when
+ *    both fields are stored, and the function returns.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer; fs_ltref is searched
+ * \param p
+ *    current picture; only its structure is read
+ * \param long_term_pic_num
+ *    long-term picture number to unmark
+ ************************************************************************
+ */
+static void mm_unmark_long_term_for_reference(DecodedPictureBuffer *p_Dpb, StorablePicture *p, int long_term_pic_num)
+{
+  unsigned n;
+
+  for (n = 0; n < p_Dpb->ltref_frames_in_buffer; ++n)
+  {
+    FrameStore *store = p_Dpb->fs_ltref[n];
+
+    if (p->structure == FRAME)
+    {
+      if ((store->is_reference == 3) && (store->is_long_term == 3) && (store->frame->long_term_pic_num == long_term_pic_num))
+      {
+        unmark_for_long_term_reference(store);
+      }
+      continue;
+    }
+
+    // field picture: try the top field first, then the bottom field
+    if ((store->is_reference & 1) && (store->is_long_term & 1) && (store->top_field->long_term_pic_num == long_term_pic_num))
+    {
+      store->top_field->used_for_reference = 0;
+      store->top_field->is_long_term = 0;
+      store->is_reference &= 2;
+      store->is_long_term &= 2;
+      if (store->is_used == 3)
+      {
+        store->frame->used_for_reference = 0;
+        store->frame->is_long_term = 0;
+      }
+      return;
+    }
+
+    if ((store->is_reference & 2) && (store->is_long_term & 2) && (store->bottom_field->long_term_pic_num == long_term_pic_num))
+    {
+      store->bottom_field->used_for_reference = 0;
+      store->bottom_field->is_long_term = 0;
+      store->is_reference &= 1;
+      store->is_long_term &= 1;
+      if (store->is_used == 3)
+      {
+        store->frame->used_for_reference = 0;
+        store->frame->is_long_term = 0;
+      }
+      return;
+    }
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Mark a long-term reference frame or complementary field pair unused
+ *    for reference.
+ *
+ *    Every entry of fs_ltref whose long_term_frame_idx equals the given
+ *    index is passed to unmark_for_long_term_reference().
+ *
+ * \param p_Dpb
+ *    decoded picture buffer
+ * \param long_term_frame_idx
+ *    long-term frame index to release
+ ************************************************************************
+ */
+static void unmark_long_term_frame_for_reference_by_frame_idx(DecodedPictureBuffer *p_Dpb, int long_term_frame_idx)
+{
+  unsigned n = 0;
+
+  while (n < p_Dpb->ltref_frames_in_buffer)
+  {
+    if (p_Dpb->fs_ltref[n]->long_term_frame_idx == long_term_frame_idx)
+    {
+      unmark_for_long_term_reference(p_Dpb->fs_ltref[n]);
+    }
+    ++n;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Mark a long-term reference field unused for reference only if it's
+ *    not the complementary field of the picture indicated by picNumX.
+ *
+ *    For every fs_ltref entry with the given long_term_frame_idx:
+ *    - it is unmarked when both fields are long-term (is_long_term == 3)
+ *      or when only the field of the same parity as \a structure is
+ *      long-term;
+ *    - otherwise, with mark_current set, it is unmarked unless it is
+ *      last_picture and last_picture->frame_num equals curr_frame_num;
+ *    - otherwise it is unmarked when its frame_num differs from
+ *      curr_pic_num / 2.
+ *
+ * \param p_Vid
+ *    decoder context; p_Dpb and MaxFrameNum are read
+ * \param structure
+ *    TOP_FIELD or BOTTOM_FIELD (asserted not to be FRAME)
+ * \param long_term_frame_idx
+ *    long-term frame index being reassigned
+ * \param mark_current
+ *    selects the last_picture based test instead of the curr_pic_num test
+ * \param curr_frame_num
+ *    value compared against last_picture->frame_num
+ * \param curr_pic_num
+ *    picture number; negative values are wrapped by 2 * MaxFrameNum
+ ************************************************************************
+ */
+static void unmark_long_term_field_for_reference_by_frame_idx(VideoParameters *p_Vid, PictureStructure structure, int long_term_frame_idx, int mark_current, unsigned curr_frame_num, int curr_pic_num)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned n;
+  int same_parity_only;   // is_long_term value meaning "only this parity is long-term"
+
+  assert(structure!=FRAME);
+  if (curr_pic_num<0)
+    curr_pic_num+=(2*p_Vid->MaxFrameNum);
+
+  if (structure == TOP_FIELD)
+    same_parity_only = 1;
+  else if (structure == BOTTOM_FIELD)
+    same_parity_only = 2;
+  else
+    return;   // nothing is done for any other structure
+
+  for (n = 0; n < p_Dpb->ltref_frames_in_buffer; ++n)
+  {
+    FrameStore *store = p_Dpb->fs_ltref[n];
+    int release;
+
+    if (store->long_term_frame_idx != long_term_frame_idx)
+      continue;
+
+    if ((store->is_long_term == 3) || (store->is_long_term == same_parity_only))
+    {
+      release = 1;
+    }
+    else if (mark_current)
+    {
+      // keep the entry only if it is the frame store of the current picture
+      release = (!p_Dpb->last_picture)
+             || (p_Dpb->last_picture != store)
+             || (p_Dpb->last_picture->frame_num != curr_frame_num);
+    }
+    else
+    {
+      release = ((store->frame_num) != (unsigned)(curr_pic_num/2));
+    }
+
+    if (release)
+      unmark_for_long_term_reference(store);
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    mark a picture as long-term reference
+ *
+ *    Searches fs_ref for the not yet long-term picture whose pic_num
+ *    equals picNumX and tags it, and the owning frame store, with
+ *    long_term_frame_idx. Only the first match is processed; a warning
+ *    is printed when no entry matches.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer; fs_ref is searched
+ * \param p
+ *    current picture; its structure selects frame or field handling
+ * \param long_term_frame_idx
+ *    long-term frame index to assign
+ * \param picNumX
+ *    picture number of the picture to convert
+ ************************************************************************
+ */
+static void mark_pic_long_term(DecodedPictureBuffer *p_Dpb, StorablePicture* p, int long_term_frame_idx, int picNumX)
+{
+  unsigned i;
+  int add_top, add_bottom; // offsets added to 2*long_term_frame_idx for top / bottom fields
+
+  if (p->structure == FRAME)
+  {
+    for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ref[i]->is_reference == 3) // both fields are marked as reference
+      {
+        if ((!p_Dpb->fs_ref[i]->frame->is_long_term)&&(p_Dpb->fs_ref[i]->frame->pic_num == picNumX))
+        {
+          p_Dpb->fs_ref[i]->long_term_frame_idx = p_Dpb->fs_ref[i]->frame->long_term_frame_idx
+                                             = long_term_frame_idx;
+          p_Dpb->fs_ref[i]->frame->long_term_pic_num = long_term_frame_idx; // frame: pic num equals the frame idx
+          p_Dpb->fs_ref[i]->frame->is_long_term = 1;
+
+          if (p_Dpb->fs_ref[i]->top_field && p_Dpb->fs_ref[i]->bottom_field) // keep the field views in sync
+          {
+            p_Dpb->fs_ref[i]->top_field->long_term_frame_idx = p_Dpb->fs_ref[i]->bottom_field->long_term_frame_idx
+                                                          = long_term_frame_idx;
+            p_Dpb->fs_ref[i]->top_field->long_term_pic_num = long_term_frame_idx;
+            p_Dpb->fs_ref[i]->bottom_field->long_term_pic_num = long_term_frame_idx;
+
+            p_Dpb->fs_ref[i]->top_field->is_long_term = p_Dpb->fs_ref[i]->bottom_field->is_long_term
+                                                   = 1;
+
+          }
+          p_Dpb->fs_ref[i]->is_long_term = 3; // both parities are long-term now
+          return;
+        }
+      }
+    }
+    printf ("Warning: reference frame for long term marking not found\n");
+  }
+  else
+  {
+    if (p->structure == TOP_FIELD) // the field with the parity of the current picture gets the +1
+    {
+      add_top    = 1;
+      add_bottom = 0;
+    }
+    else
+    {
+      add_top    = 0;
+      add_bottom = 1;
+    }
+    for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ref[i]->is_reference & 1) // top field is a reference
+      {
+        if ((!p_Dpb->fs_ref[i]->top_field->is_long_term)&&(p_Dpb->fs_ref[i]->top_field->pic_num == picNumX))
+        {
+          if ((p_Dpb->fs_ref[i]->is_long_term) && (p_Dpb->fs_ref[i]->long_term_frame_idx != long_term_frame_idx))
+          {
+              printf ("Warning: assigning long_term_frame_idx different from other field\n");
+          }
+
+          p_Dpb->fs_ref[i]->long_term_frame_idx = p_Dpb->fs_ref[i]->top_field->long_term_frame_idx
+                                             = long_term_frame_idx;
+          p_Dpb->fs_ref[i]->top_field->long_term_pic_num = 2 * long_term_frame_idx + add_top;
+          p_Dpb->fs_ref[i]->top_field->is_long_term = 1;
+          p_Dpb->fs_ref[i]->is_long_term |= 1;
+          if (p_Dpb->fs_ref[i]->is_long_term == 3) // both fields long-term: update the frame view too
+          {
+            p_Dpb->fs_ref[i]->frame->is_long_term = 1;
+            p_Dpb->fs_ref[i]->frame->long_term_frame_idx = p_Dpb->fs_ref[i]->frame->long_term_pic_num = long_term_frame_idx;
+          }
+          return;
+        }
+      }
+      if (p_Dpb->fs_ref[i]->is_reference & 2) // bottom field is a reference
+      {
+        if ((!p_Dpb->fs_ref[i]->bottom_field->is_long_term)&&(p_Dpb->fs_ref[i]->bottom_field->pic_num == picNumX))
+        {
+          if ((p_Dpb->fs_ref[i]->is_long_term) && (p_Dpb->fs_ref[i]->long_term_frame_idx != long_term_frame_idx))
+          {
+              printf ("Warning: assigning long_term_frame_idx different from other field\n");
+          }
+
+          p_Dpb->fs_ref[i]->long_term_frame_idx = p_Dpb->fs_ref[i]->bottom_field->long_term_frame_idx
+                                             = long_term_frame_idx;
+          p_Dpb->fs_ref[i]->bottom_field->long_term_pic_num = 2 * long_term_frame_idx + add_bottom;
+          p_Dpb->fs_ref[i]->bottom_field->is_long_term = 1;
+          p_Dpb->fs_ref[i]->is_long_term |= 2;
+          if (p_Dpb->fs_ref[i]->is_long_term == 3) // both fields long-term: update the frame view too
+          {
+            p_Dpb->fs_ref[i]->frame->is_long_term = 1;
+            p_Dpb->fs_ref[i]->frame->long_term_frame_idx = p_Dpb->fs_ref[i]->frame->long_term_pic_num = long_term_frame_idx;
+          }
+          return;
+        }
+      }
+    }
+    printf ("Warning: reference field for long term marking not found\n");
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Assign a long term frame index to a short term picture.
+ *
+ *    Long-term pictures that already use long_term_frame_idx are
+ *    released first (frame or field variant depending on the structure
+ *    of \a p), then the picture identified by picNumX is marked through
+ *    mark_pic_long_term(). For field pictures the parity of the target
+ *    is looked up in fs_ref; error() is called when it is not found.
+ *
+ * \param p_Vid
+ *    decoder context owning the DPB
+ * \param p
+ *    current picture
+ * \param difference_of_pic_nums_minus1
+ *    offset used to derive picNumX
+ * \param long_term_frame_idx
+ *    long-term frame index to assign
+ ************************************************************************
+ */
+static void mm_assign_long_term_frame_idx(VideoParameters *p_Vid, StorablePicture* p, int difference_of_pic_nums_minus1, int long_term_frame_idx)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  const int picNumX = get_pic_num_x(p, difference_of_pic_nums_minus1);
+
+  // release frames/fields that currently hold the same long_term_frame_idx
+  if (p->structure != FRAME)
+  {
+    PictureStructure parity = FRAME;   // FRAME doubles as "not found"
+    unsigned n;
+
+    for (n = 0; (n < p_Dpb->ref_frames_in_buffer) && (parity == FRAME); ++n)
+    {
+      FrameStore *store = p_Dpb->fs_ref[n];
+
+      if ((store->is_reference & 1) && (store->top_field->pic_num == picNumX))
+        parity = TOP_FIELD;
+      else if ((store->is_reference & 2) && (store->bottom_field->pic_num == picNumX))
+        parity = BOTTOM_FIELD;
+    }
+
+    if (parity == FRAME)
+    {
+      error ("field for long term marking not found",200);
+    }
+
+    unmark_long_term_field_for_reference_by_frame_idx(p_Vid, parity, long_term_frame_idx, 0, 0, picNumX);
+  }
+  else
+  {
+    unmark_long_term_frame_for_reference_by_frame_idx(p_Dpb, long_term_frame_idx);
+  }
+
+  mark_pic_long_term(p_Dpb, p, long_term_frame_idx, picNumX);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Set new max long_term_frame_idx.
+ *
+ *    Stores max_long_term_frame_idx_plus1 - 1 in max_long_term_pic_idx
+ *    and unmarks every fs_ltref entry whose long_term_frame_idx exceeds
+ *    the new maximum.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer
+ * \param max_long_term_frame_idx_plus1
+ *    new maximum long-term frame index, plus one
+ ************************************************************************
+ */
+void mm_update_max_long_term_frame_idx(DecodedPictureBuffer *p_Dpb, int max_long_term_frame_idx_plus1)
+{
+  unsigned n = 0;
+
+  p_Dpb->max_long_term_pic_idx = max_long_term_frame_idx_plus1 - 1;
+
+  // release entries that no longer fit below the new maximum
+  while (n < p_Dpb->ltref_frames_in_buffer)
+  {
+    if (p_Dpb->fs_ltref[n]->long_term_frame_idx > p_Dpb->max_long_term_pic_idx)
+      unmark_for_long_term_reference(p_Dpb->fs_ltref[n]);
+    ++n;
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Mark all long term reference pictures unused for reference.
+ *
+ *    Implemented by setting the maximum long-term frame index to -1
+ *    through mm_update_max_long_term_frame_idx(p_Dpb, 0).
+ *
+ * \param p_Dpb
+ *    decoded picture buffer
+ ************************************************************************
+ */
+static void mm_unmark_all_long_term_for_reference (DecodedPictureBuffer *p_Dpb)
+{
+  const int no_long_term_idx_plus1 = 0;   // yields a maximum index of -1
+
+  mm_update_max_long_term_frame_idx(p_Dpb, no_long_term_idx_plus1);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Mark all short term reference pictures unused for reference.
+ *
+ *    Every fs_ref entry is passed to unmark_for_reference(), then the
+ *    short-term list is rebuilt.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer
+ ************************************************************************
+ */
+static void mm_unmark_all_short_term_for_reference (DecodedPictureBuffer *p_Dpb)
+{
+  unsigned int n = 0;
+
+  while (n < p_Dpb->ref_frames_in_buffer)
+  {
+    unmark_for_reference(p_Dpb->p_Vid, p_Dpb->fs_ref[n]);
+    ++n;
+  }
+
+  update_ref_list(p_Dpb);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Mark the current picture used for long term reference.
+ *
+ *    Long-term pictures that already use long_term_frame_idx are
+ *    released first (frame or field variant depending on the structure
+ *    of \a p), then the picture itself is flagged long-term with the
+ *    given index.
+ *
+ * \param p_Vid
+ *    decoder context
+ * \param p_Dpb
+ *    decoded picture buffer
+ * \param p
+ *    current picture
+ * \param long_term_frame_idx
+ *    long-term frame index to assign
+ ************************************************************************
+ */
+static void mm_mark_current_picture_long_term(VideoParameters *p_Vid, DecodedPictureBuffer *p_Dpb, StorablePicture *p, int long_term_frame_idx)
+{
+  // make room: drop long term pictures with the same long_term_frame_idx
+  if (p->structure != FRAME)
+  {
+    unmark_long_term_field_for_reference_by_frame_idx(p_Vid, p->structure, long_term_frame_idx, 1, p->pic_num, 0);
+  }
+  else
+  {
+    unmark_long_term_frame_for_reference_by_frame_idx(p_Dpb, long_term_frame_idx);
+  }
+
+  p->long_term_frame_idx = long_term_frame_idx;
+  p->is_long_term = 1;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Perform Adaptive memory control decoded reference picture marking process
+ *
+ *    Consumes the linked list p->dec_ref_pic_marking_buffer: each node is
+ *    dispatched on its memory_management_control_operation, unlinked and
+ *    freed. If operation 5 was seen, the picture and POC numbering of the
+ *    current picture is reset and the DPB is flushed.
+ *
+ * \param p_Vid
+ *    decoder context; last_has_mmco_5 and the POC fields are written
+ * \param p
+ *    current non-IDR picture with adaptive_ref_pic_buffering_flag set
+ ************************************************************************
+ */
+static void adaptive_memory_management(VideoParameters *p_Vid, StorablePicture* p)
+{
+  DecRefPicMarking_t *tmp_drpm; // node currently being processed
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+
+  p_Vid->last_has_mmco_5 = 0;
+
+  assert (!p->idr_flag);
+  assert (p->adaptive_ref_pic_buffering_flag);
+
+  while (p->dec_ref_pic_marking_buffer) // the list is emptied as it is walked
+  {
+    tmp_drpm = p->dec_ref_pic_marking_buffer;
+    switch (tmp_drpm->memory_management_control_operation)
+    {
+      case 0: // end marker, must be the last node
+        if (tmp_drpm->Next != NULL)
+        {
+          error ("memory_management_control_operation = 0 not last operation in buffer", 500);
+        }
+        break;
+      case 1: // unmark one short-term picture
+        mm_unmark_short_term_for_reference(p_Dpb, p, tmp_drpm->difference_of_pic_nums_minus1);
+        update_ref_list(p_Dpb);
+        break;
+      case 2: // unmark one long-term picture
+        mm_unmark_long_term_for_reference(p_Dpb, p, tmp_drpm->long_term_pic_num);
+        update_ltref_list(p_Dpb);
+        break;
+      case 3: // turn a short-term picture into a long-term one
+        mm_assign_long_term_frame_idx(p_Vid, p, tmp_drpm->difference_of_pic_nums_minus1, tmp_drpm->long_term_frame_idx);
+        update_ref_list(p_Dpb);
+        update_ltref_list(p_Dpb);
+        break;
+      case 4: // set a new maximum long-term frame index
+        mm_update_max_long_term_frame_idx (p_Dpb, tmp_drpm->max_long_term_frame_idx_plus1);
+        update_ltref_list(p_Dpb);
+        break;
+      case 5: // unmark all reference pictures
+        mm_unmark_all_short_term_for_reference(p_Dpb);
+        mm_unmark_all_long_term_for_reference(p_Dpb);
+       p_Vid->last_has_mmco_5 = 1;
+        break;
+      case 6: // mark the current picture as long-term
+        mm_mark_current_picture_long_term(p_Vid, p_Dpb, p, tmp_drpm->long_term_frame_idx);
+        check_num_ref(p_Dpb);
+        break;
+      default:
+        error ("invalid memory_management_control_operation in buffer", 500);
+    }
+    p->dec_ref_pic_marking_buffer = tmp_drpm->Next; // unlink before releasing the node
+    free (tmp_drpm);
+  }
+  if ( p_Vid->last_has_mmco_5 )
+  {
+    p->pic_num = p->frame_num = 0; // numbering restarts at this picture
+
+    switch (p->structure)
+    {
+    case TOP_FIELD:
+      {
+        p->poc = p->top_poc = p_Vid->toppoc =0;
+        break;
+      }
+    case BOTTOM_FIELD:
+      {
+        p->poc = p->bottom_poc = p_Vid->bottompoc = 0;
+        break;
+      }
+    case FRAME:
+      {
+        p->top_poc    -= p->poc; // make both field POCs relative to the old frame POC
+        p->bottom_poc -= p->poc;
+
+        p_Vid->toppoc = p->top_poc;
+        p_Vid->bottompoc = p->bottom_poc;
+
+        p->poc = imin (p->top_poc, p->bottom_poc);
+        p_Vid->framepoc = p->poc;
+        break;
+      }
+    }
+    p_Vid->ThisPOC = p->poc;
+    flush_dpb(p_Vid);
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Store a picture in DPB. This includes checking for space in DPB and
+ *    flushing frames.
+ *    If we received a frame, we need to check for a new store, if we
+ *    got a field, check if it's the second field of an already allocated
+ *    store.
+ *
+ *    Order of processing: reference marking (IDR or adaptive), pairing
+ *    of a second field with last_picture, sliding window marking,
+ *    making room in the DPB, and finally insertion at index used_size.
+ *    A non-reference picture may be output directly instead of stored.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param p
+ *    Picture to be stored
+ *
+ ************************************************************************
+ */
+
+void store_picture_in_dpb(VideoParameters *p_Vid, StorablePicture* p)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned i;
+  int poc, pos; // outputs of get_smallest_poc()
+  // picture error concealment
+  
+  // diagnostics
+  //printf ("Storing (%s) non-ref pic with frame_num #%d\n", (p->type == FRAME)?"FRAME":(p->type == TOP_FIELD)?"TOP_FIELD":"BOTTOM_FIELD", p->pic_num);
+  // if frame, check for new store,
+  assert (p!=NULL);
+
+  p_Vid->last_has_mmco_5=0;
+  p_Vid->last_pic_bottom_field = (p->structure == BOTTOM_FIELD);
+
+  if (p->idr_flag)
+  {
+    idr_memory_management(p_Vid, p);
+  // picture error concealment: clear the first 100 ints of pocs_in_dpb
+    memset(p_Vid->pocs_in_dpb, 0, sizeof(int)*100);
+  }
+  else
+  {
+    // adaptive memory management
+    if (p->used_for_reference && (p->adaptive_ref_pic_buffering_flag))
+      adaptive_memory_management(p_Vid, p);
+  }
+
+  if ((p->structure==TOP_FIELD)||(p->structure==BOTTOM_FIELD))
+  {
+    // check for frame store with same pic_number
+    if (p_Dpb->last_picture)
+    {
+      if ((int)p_Dpb->last_picture->frame_num == p->pic_num)
+      {
+        if (((p->structure==TOP_FIELD)&&(p_Dpb->last_picture->is_used==2))||((p->structure==BOTTOM_FIELD)&&(p_Dpb->last_picture->is_used==1))) // last_picture holds only the opposite parity
+        {
+          if ((p->used_for_reference && (p_Dpb->last_picture->is_orig_reference!=0))||
+              (!p->used_for_reference && (p_Dpb->last_picture->is_orig_reference==0))) // both fields agree on being reference or not
+          {
+            insert_picture_in_dpb(p_Vid, p_Dpb->last_picture, p);            
+            update_ref_list(p_Dpb);
+            update_ltref_list(p_Dpb);
+            dump_dpb(p_Dpb);
+            p_Dpb->last_picture = NULL;
+            return; // second field stored, no new frame store needed
+          }
+        }
+      }
+    }
+  }
+
+  // this is a frame or a field which has no stored complementary field
+
+  // sliding window, if necessary
+  if ((!p->idr_flag)&&(p->used_for_reference && (!p->adaptive_ref_pic_buffering_flag)))
+  {
+    sliding_window_memory_management(p_Dpb, p);
+  }
+
+  // first try to remove unused frames
+  if (p_Dpb->used_size==p_Dpb->size)
+  {
+    remove_unused_frame_from_dpb(p_Vid, p_Dpb);
+  }
+
+  // then output frames until one can be removed
+  while (p_Dpb->used_size == p_Dpb->size)
+  {
+    // non-reference frames may be output directly
+    if (!p->used_for_reference)
+    {
+      get_smallest_poc(p_Dpb, &poc, &pos);
+      if ((-1==pos) || (p->poc < poc)) // nothing pending for output, or p precedes everything pending
+      {
+        direct_output(p_Vid, p);
+        return;
+      }
+    }
+    // flush a frame
+    output_one_frame_from_dpb(p_Vid);
+  }
+
+  // check for duplicate frame number in short term reference buffer
+  // NOTE(review): the handling below is commented out, so this loop currently has no effect
+  if ((p->used_for_reference)&&(!p->is_long_term))
+  {
+    for (i=0; i<p_Dpb->ref_frames_in_buffer; i++)
+    {
+      if (p_Dpb->fs_ref[i]->frame_num == p->frame_num)
+      {
+        //error("duplicate frame_num in short-term reference picture buffer", 500);
+				//printf("warning");
+				//return;
+      }
+    }
+
+  }
+  // store at end of buffer
+  insert_picture_in_dpb(p_Vid, p_Dpb->fs[p_Dpb->used_size],p);
+
+  // picture error concealment
+  if (p->idr_flag)
+  {
+      p_Vid->earlier_missing_poc = 0;
+  }
+
+  if (p->structure != FRAME)
+  {
+    p_Dpb->last_picture = p_Dpb->fs[p_Dpb->used_size]; // remembered so the second field can be paired with it
+  }
+  else
+  {
+    p_Dpb->last_picture = NULL;
+  }
+
+  p_Dpb->used_size++;
+
+  update_ref_list(p_Dpb);
+  update_ltref_list(p_Dpb);
+
+  check_num_ref(p_Dpb);
+
+  dump_dpb(p_Dpb);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Insert the picture into the given frame store.
+ *
+ *    A frame fills the whole store and its field views are generated
+ *    with dpb_split_field(). A field is OR-ed into the store's flags;
+ *    once both fields are present the frame view is generated with
+ *    dpb_combine_field() and inherits the time_code of \a p, otherwise
+ *    the store takes the POC of the field and gen_field_ref_ids() is
+ *    called for it. frame_num, recovery_frame and is_output of the
+ *    store are always copied from \a p.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param fs
+ *    FrameStore into which the picture will be inserted
+ * \param p
+ *    StorablePicture to be inserted
+ *
+ ************************************************************************
+ */
+static void insert_picture_in_dpb(VideoParameters *p_Vid, FrameStore* fs, StorablePicture* p)
+{
+  assert (p!=NULL);
+  assert (fs!=NULL);
+  switch (p->structure)
+  {
+  case FRAME:
+    fs->frame = p;
+    fs->is_used = 3;
+    if (p->used_for_reference)
+    {
+      fs->is_reference = 3;
+      fs->is_orig_reference = 3;
+      if (p->is_long_term)
+      {
+        fs->is_long_term = 3;
+        fs->long_term_frame_idx = p->long_term_frame_idx;
+      }
+    }
+    // generate field views
+    dpb_split_field(p_Vid, fs);
+    break;
+  case TOP_FIELD:
+    fs->top_field = p;
+    fs->is_used |= 1;
+    if (p->used_for_reference)
+    {
+      fs->is_reference |= 1;
+      fs->is_orig_reference |= 1;
+      if (p->is_long_term)
+      {
+        fs->is_long_term |= 1;
+        fs->long_term_frame_idx = p->long_term_frame_idx;
+      }
+    }
+    if (fs->is_used == 3)
+    {
+      // both fields present: generate frame view
+      dpb_combine_field(p_Vid, fs);
+      fs->frame->time_code = p->time_code;
+    }
+    else
+    {
+      // first field of the store
+      fs->poc = p->poc;
+      gen_field_ref_ids(p);
+    }
+    break;
+  case BOTTOM_FIELD:
+    fs->bottom_field = p;
+    fs->is_used |= 2;
+    if (p->used_for_reference)
+    {
+      fs->is_reference |= 2;
+      fs->is_orig_reference |= 2;
+      if (p->is_long_term)
+      {
+        fs->is_long_term |= 2;
+        fs->long_term_frame_idx = p->long_term_frame_idx;
+      }
+    }
+    if (fs->is_used == 3)
+    {
+      // both fields present: generate frame view
+      dpb_combine_field(p_Vid, fs);
+      fs->frame->time_code = p->time_code;
+    }
+    else
+    {
+      // first field of the store
+      fs->poc = p->poc;
+      gen_field_ref_ids(p);
+    }
+    break;
+  default:
+    // any other structure: only the common fields below are updated
+    break;
+  }
+  fs->frame_num = p->pic_num;
+  fs->recovery_frame = p->recovery_frame;
+
+  fs->is_output = p->is_output;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Check if one of the frames/fields in frame store is used for
+ *    reference.
+ *
+ * \param fs
+ *    frame store to inspect
+ * \return
+ *    1 if the store's is_reference flag is set or any stored frame /
+ *    field has used_for_reference set, 0 otherwise
+ ************************************************************************
+ */
+static int is_used_for_reference(FrameStore* fs)
+{
+  if (fs->is_reference)
+    return 1;
+
+  // complete frame
+  if ((fs->is_used == 3) && fs->frame->used_for_reference)
+    return 1;
+
+  // top field
+  if ((fs->is_used & 1) && fs->top_field && fs->top_field->used_for_reference)
+    return 1;
+
+  // bottom field
+  if ((fs->is_used & 2) && fs->bottom_field && fs->bottom_field->used_for_reference)
+    return 1;
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Check if one of the frames/fields in frame store is used for
+ *    short-term reference.
+ *
+ * \param fs
+ *    frame store to inspect
+ * \return
+ *    1 if the stored frame or one of the stored fields has
+ *    used_for_reference set and is_long_term clear, 0 otherwise
+ ************************************************************************
+ */
+static int is_short_term_reference(FrameStore* fs)
+{
+  // complete frame
+  if ((fs->is_used == 3) && fs->frame->used_for_reference && !fs->frame->is_long_term)
+    return 1;
+
+  // top field
+  if ((fs->is_used & 1) && fs->top_field
+      && fs->top_field->used_for_reference && !fs->top_field->is_long_term)
+    return 1;
+
+  // bottom field
+  if ((fs->is_used & 2) && fs->bottom_field
+      && fs->bottom_field->used_for_reference && !fs->bottom_field->is_long_term)
+    return 1;
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Check if one of the frames/fields in frame store is used for
+ *    long-term reference.
+ *
+ * \param fs
+ *    frame store to inspect
+ * \return
+ *    1 if the stored frame or one of the stored fields has both
+ *    used_for_reference and is_long_term set, 0 otherwise
+ ************************************************************************
+ */
+static int is_long_term_reference(FrameStore* fs)
+{
+  // complete frame
+  if ((fs->is_used == 3) && fs->frame->used_for_reference && fs->frame->is_long_term)
+    return 1;
+
+  // top field
+  if ((fs->is_used & 1) && fs->top_field
+      && fs->top_field->used_for_reference && fs->top_field->is_long_term)
+    return 1;
+
+  // bottom field
+  if ((fs->is_used & 2) && fs->bottom_field
+      && fs->bottom_field->used_for_reference && fs->bottom_field->is_long_term)
+    return 1;
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Remove one frame from the DPB.
+ *
+ *    The pictures held by the frame store at \a pos are released
+ *    according to its is_used value, its usage flags are cleared, and
+ *    the emptied store is moved behind the remaining used entries
+ *    before used_size is decremented.
+ *
+ * \param p_Vid
+ *    decoder context owning the DPB
+ * \param pos
+ *    index of the frame store in p_Dpb->fs
+ ************************************************************************
+ */
+static void remove_frame_from_dpb(VideoParameters *p_Vid, int pos)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  FrameStore* fs = p_Dpb->fs[pos];
+  unsigned slot;
+
+  if (fs->is_used == 3)
+  {
+    // frame plus both field views
+    free_storable_picture(p_Vid, fs->frame);
+    free_storable_picture(p_Vid, fs->top_field);
+    free_storable_picture(p_Vid, fs->bottom_field);
+    fs->frame=NULL;
+    fs->top_field=NULL;
+    fs->bottom_field=NULL;
+  }
+  else if (fs->is_used == 2)
+  {
+    free_storable_picture(p_Vid, fs->bottom_field);
+    fs->bottom_field=NULL;
+  }
+  else if (fs->is_used == 1)
+  {
+    free_storable_picture(p_Vid, fs->top_field);
+    fs->top_field=NULL;
+  }
+  else if (fs->is_used != 0)
+  {
+    error("invalid frame store type",500);
+  }
+
+  fs->is_used = 0;
+  fs->is_long_term = 0;
+  fs->is_reference = 0;
+  fs->is_orig_reference = 0;
+
+  // close the gap, then park the empty frame store behind the used ones
+  for (slot = pos; slot < p_Dpb->used_size-1; ++slot)
+  {
+    p_Dpb->fs[slot] = p_Dpb->fs[slot+1];
+  }
+  p_Dpb->used_size--;
+  p_Dpb->fs[p_Dpb->used_size] = fs;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Locate the not-yet-output frame store with the smallest POC.
+ *
+ * \param p_Dpb
+ *    decoded picture buffer to search
+ * \param poc
+ *    receives the smallest POC found (INT_MAX if none qualifies)
+ * \param pos
+ *    receives the index of that frame store (-1 if none qualifies)
+ ************************************************************************
+ */
+static void get_smallest_poc(DecodedPictureBuffer *p_Dpb, int *poc,int * pos)
+{
+  unsigned idx;
+
+  if (p_Dpb->used_size<1)
+  {
+    error("Cannot determine smallest POC, DPB empty.",150);
+  }
+
+  *pos = -1;
+  *poc = INT_MAX;
+  for (idx = 0; idx < p_Dpb->used_size; ++idx)
+  {
+    const FrameStore *cand = p_Dpb->fs[idx];
+
+    // skip stores that are already output or do not improve on the best POC
+    if (cand->is_output || *poc <= cand->poc)
+      continue;
+
+    *poc = cand->poc;
+    *pos = idx;
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Drop the first frame store that has been output and is no longer
+ *    used for reference.
+ *
+ * \param p_Vid
+ *    video parameters, forwarded to remove_frame_from_dpb
+ * \param p_Dpb
+ *    decoded picture buffer that is scanned
+ * \return
+ *    1 if a frame store was removed, 0 if none qualified
+ ************************************************************************
+ */
+static int remove_unused_frame_from_dpb(VideoParameters *p_Vid, DecodedPictureBuffer *p_Dpb)
+{
+  unsigned idx = 0;
+
+  while (idx < p_Dpb->used_size)
+  {
+    FrameStore *cand = p_Dpb->fs[idx];
+
+    // already output and not referenced any more: safe to discard
+    if (cand->is_output && !is_used_for_reference(cand))
+    {
+      remove_frame_from_dpb(p_Vid, idx);
+      return 1;
+    }
+    ++idx;
+  }
+  return 0;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Write out the pending frame store with the smallest POC.
+ *
+ *    The store is handed to write_stored_frame, last_output_poc is
+ *    updated, and the store is removed from the DPB unless it is still
+ *    used for reference.
+ *
+ * \param p_Vid
+ *    video parameters; the DPB is taken from p_Vid->p_Dpb
+ ************************************************************************
+ */
+static void output_one_frame_from_dpb(VideoParameters *p_Vid)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  int out_poc, out_pos;
+
+  // an empty buffer has nothing to output
+  if (p_Dpb->used_size<1)
+  {
+    error("Cannot output frame, DPB empty.",150);
+  }
+
+  // pick the next picture in output order
+  get_smallest_poc(p_Dpb, &out_poc, &out_pos);
+
+  if (out_pos == -1)
+  {
+    error("no frames for output available", 150);
+  }
+
+  write_stored_frame(p_Vid, p_Dpb->fs[out_pos]);
+
+  // Ascending output order is not enforced here: a POC that does not
+  // exceed last_output_poc is accepted without any diagnostic.
+  p_Dpb->last_output_poc = out_poc;
+
+  // a store that is still referenced stays in the buffer
+  if (is_used_for_reference(p_Dpb->fs[out_pos]))
+    return;
+
+  // otherwise free it and move the empty store to the end of the buffer
+  remove_frame_from_dpb(p_Vid, out_pos);
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Empty the decoded picture buffer, outputting everything it holds.
+ *
+ *    All used stores are unmarked as reference, stores that were already
+ *    output are dropped, the remaining ones are output one at a time
+ *    until used_size reaches zero, and last_output_poc is reset.
+ *
+ * \param p_Vid
+ *    video parameters; the DPB is taken from p_Vid->p_Dpb
+ ************************************************************************
+ */
+void flush_dpb(VideoParameters *p_Vid)
+{
+  DecodedPictureBuffer *p_Dpb = p_Vid->p_Dpb;
+  unsigned idx;
+
+  // no picture is needed for reference any more
+  for (idx = 0; idx < p_Dpb->used_size; ++idx)
+  {
+    unmark_for_reference (p_Vid, p_Dpb->fs[idx]);
+  }
+
+  // discard every store that has already been output
+  for (;;)
+  {
+    if (!remove_unused_frame_from_dpb(p_Vid, p_Dpb))
+      break;
+  }
+
+  // what is left still has to be output, smallest POC first
+  while (p_Dpb->used_size != 0)
+  {
+    output_one_frame_from_dpb(p_Vid);
+  }
+
+  p_Dpb->last_output_poc = INT_MIN;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Fill in ref_id for every motion entry of a picture and flag all
+ *    entries as field coded.
+ *
+ *    For each list, ref_id is looked up in ref_pic_num using the slice id
+ *    of the enclosing 4x4 group of entries and the entry's ref_idx; a
+ *    negative ref_idx yields ref_id 0.
+ *
+ * \param p
+ *    picture whose motion information is updated in place
+ ************************************************************************
+ */
+static void gen_field_ref_ids(StorablePicture *p)
+{
+  int col, row;
+
+  for (col = 0; col < p->size_x/4; col++)
+  {
+    for (row = 0; row < p->size_y/4; row++)
+    {
+      PicMotion *fwd = &p->motion.motion[LIST_0][row][col];
+      PicMotion *bwd = &p->motion.motion[LIST_1][row][col];
+      int idx0 = fwd->ref_idx;
+      int idx1 = bwd->ref_idx;
+
+      //! association with id already known for fields.
+      if (idx0 >= 0)
+        fwd->ref_id = p->ref_pic_num[p->slice_id[row>>2][col>>2]][LIST_0][idx0];
+      else
+        fwd->ref_id = 0;
+
+      if (idx1 >= 0)
+        bwd->ref_id = p->ref_pic_num[p->slice_id[row>>2][col>>2]][LIST_1][idx1];
+      else
+        bwd->ref_id = 0;
+
+      p->motion.field_frame[row][col] = 1;
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Split the frame picture of a frame store into field data and
+ *    resolve the reference ids of its motion entries.
+ *
+ *    Steps performed:
+ *    - fs->poc is set from the frame.
+ *    - Unless frame_mbs_only_flag is set, a top and a bottom field
+ *      picture are allocated, even frame rows are copied to the top
+ *      field and odd rows to the bottom field (luma and both chroma
+ *      planes), POC / reference / long-term information is copied and
+ *      the three pictures are linked to each other. Otherwise the
+ *      frame's field pointers are set to NULL.
+ *    - ref_id of every frame motion entry is derived from ref_idx.
+ *    - Unless frame_mbs_only_flag is set, field motion data is derived
+ *      from the frame motion data; otherwise the frame's field_frame
+ *      map is cleared.
+ *
+ * \param p_Vid
+ *    video parameters, passed to alloc_storable_picture
+ * \param fs
+ *    frame store whose frame member is split; fs->frame is dereferenced
+ *    unconditionally
+ ************************************************************************
+ */
+void dpb_split_field(VideoParameters *p_Vid, FrameStore *fs)
+{
+  int i, j, ii, jj, jj4;
+  int idiv,jdiv;
+  int currentmb;
+  int dummylist0, dummylist1;
+  // twice the frame width measured in 16-pixel units; used below to build currentmb
+  int twosz16 = 2 * (fs->frame->size_x >> 4);
+  StorablePicture *fs_top, *fs_btm; 
+  StorablePicture *frame = fs->frame;
+
+
+  fs->poc = frame->poc;
+
+  if (!frame->frame_mbs_only_flag)
+  {
+    // NOTE(review): the full frame height is passed for both fields; presumably
+    // alloc_storable_picture adjusts for the field structure - confirm
+    fs_top = fs->top_field    = alloc_storable_picture(p_Vid, TOP_FIELD,    frame->size_x, frame->size_y, frame->size_x_cr, frame->size_y_cr);
+    fs_btm = fs->bottom_field = alloc_storable_picture(p_Vid, BOTTOM_FIELD, frame->size_x, frame->size_y, frame->size_x_cr, frame->size_y_cr);
+
+    // top field luma: even frame rows
+    for (i = 0; i < (frame->size_y>>1); i++)
+    {
+      memcpy(fs_top->imgY->img[i], frame->imgY->img[i*2], frame->size_x*sizeof(imgpel));
+    }
+
+    // top field chroma: even frame rows of both chroma planes
+    for (i = 0; i< (frame->size_y_cr>>1); i++)
+    {
+      memcpy(fs_top->imgUV[0]->img[i], frame->imgUV[0]->img[i*2], frame->size_x_cr*sizeof(imgpel));
+      memcpy(fs_top->imgUV[1]->img[i], frame->imgUV[1]->img[i*2], frame->size_x_cr*sizeof(imgpel));
+    }
+
+    // bottom field luma: odd frame rows
+    for (i = 0; i < (frame->size_y>>1); i++)
+    {
+      memcpy(fs_btm->imgY->img[i], frame->imgY->img[i*2 + 1], frame->size_x*sizeof(imgpel));
+    }
+
+    // bottom field chroma: odd frame rows of both chroma planes
+    for (i = 0; i < (frame->size_y_cr>>1); i++)
+    {
+      memcpy(fs_btm->imgUV[0]->img[i], frame->imgUV[0]->img[i*2 + 1], frame->size_x_cr*sizeof(imgpel));
+      memcpy(fs_btm->imgUV[1]->img[i], frame->imgUV[1]->img[i*2 + 1], frame->size_x_cr*sizeof(imgpel));
+    }
+
+    // each field takes its own POC from the frame; both fields also carry
+    // the frame POC and the POC of the opposite field
+    fs_top->poc = frame->top_poc;
+    fs_btm->poc = frame->bottom_poc;
+
+    fs_top->frame_poc =  frame->frame_poc;
+
+    fs_top->bottom_poc = fs_btm->bottom_poc =  frame->bottom_poc;
+    fs_top->top_poc    = fs_btm->top_poc    =  frame->top_poc;
+    fs_btm->frame_poc  = frame->frame_poc;
+
+    // reference marking of the frame is inherited by both fields
+    fs_top->used_for_reference = fs_btm->used_for_reference
+                                      = frame->used_for_reference;
+    fs_top->is_long_term = fs_btm->is_long_term
+                                = frame->is_long_term;
+    fs->long_term_frame_idx = fs_top->long_term_frame_idx
+                            = fs_btm->long_term_frame_idx
+                            = frame->long_term_frame_idx;
+
+    fs_top->coded_frame = fs_btm->coded_frame = 1;
+    fs_top->mb_aff_frame_flag = fs_btm->mb_aff_frame_flag
+                                  = frame->mb_aff_frame_flag;
+
+    // cross-link frame, top field and bottom field pictures
+    frame->top_field    = fs_top;
+    frame->bottom_field = fs_btm;
+
+    fs_top->bottom_field = fs_btm;
+    fs_top->frame        = frame;
+    fs_btm->top_field = fs_top;
+    fs_btm->frame     = frame;
+
+    fs_top->chroma_format_idc = fs_btm->chroma_format_idc = frame->chroma_format_idc;
+
+    //store reference picture index
+    // The field tables are filled from the frame's list rows starting at
+    // 2 + LIST_0 (top) and 4 + LIST_0 (bottom). A single copy of 66 entries
+    // is used per field; judging by the commented-out 33-entry copies this
+    // presumably covers the LIST_0 and LIST_1 rows together - TODO confirm
+    // against the declaration of ref_pic_num.
+    for (j=0; j<=frame->max_slice_id; j++)
+    {
+      memcpy(&fs_top->ref_pic_num[j][LIST_0][0], &frame->ref_pic_num[j][2 + LIST_0][0], 66 * sizeof(h264_ref_t));
+      //memcpy(&fs_top->ref_pic_num[j][LIST_1][0], &frame->ref_pic_num[j][2 + LIST_1][0], 33 * sizeof(int64));            
+      memcpy(&fs_btm->ref_pic_num[j][LIST_0][0], &frame->ref_pic_num[j][4 + LIST_0][0], 66 * sizeof(h264_ref_t));
+      //memcpy(&fs_btm->ref_pic_num[j][LIST_1][0], &frame->ref_pic_num[j][4 + LIST_1][0], 33 * sizeof(int64));
+    }
+  }
+  else
+  {
+    // frame-only content: no field pictures exist
+    fs_top=NULL;
+    fs_btm=NULL;
+    frame->top_field=NULL;
+    frame->bottom_field=NULL;
+  }
+
+  // Resolve ref_id for every frame motion entry.
+  if (!frame->mb_aff_frame_flag)
+  {
+		// TODO: benski> this part of the function is kinda slow and the compiler 
+		// isn't too good at optimizing it
+		// can probably optimize by using CMOV eax, -1 in the inner loop
+		// The loops walk the motion array in 4x4 groups of entries so the slice id
+		// and the two ref_pic_num rows are looked up once per group.
+		// NOTE(review): `motion` is not referenced below, and this `ii, jj`
+		// pair shadows the function-level variables of the same name.
+		int ii, jj;
+		PicMotionParams *motion = &frame->motion;
+		for (j = 0; (j < frame->size_y >> 2) ; j+=4)
+		{
+			const short *slicej = frame->slice_id[j>>2];
+			for (jj = 0;jj<4;jj++)
+			{
+				PicMotion *ref0=frame->motion.motion[LIST_0][j+jj];
+				PicMotion *ref1=frame->motion.motion[LIST_1][j+jj];
+				for (i = 0 ; i < (frame->size_x >> 2) ; i+=4)
+				{
+					short slice = slicej[i>>2];
+					const h264_ref_t *ref_pic_num0 = frame->ref_pic_num[slice][LIST_0];
+					const h264_ref_t *ref_pic_num1 = frame->ref_pic_num[slice][LIST_1];
+					for (ii=0;ii<4;ii++)
+					{
+						// a negative ref_idx yields ref_id -1
+						dummylist0 = ref0[i+ii].ref_idx;
+						ref0[i+ii].ref_id = (dummylist0>=0)? ref_pic_num0[dummylist0] : -1;
+						dummylist1 = ref1[i+ii].ref_idx;
+						ref1[i+ii].ref_id = (dummylist1>=0)? ref_pic_num1[dummylist1] : -1;
+					}
+				}
+			}
+		}
+	}
+  else
+  {
+    // mb_aff_frame_flag set: the lookup depends on mb_field of the
+    // macroblock that contains the entry
+    for (j = 0; (j < frame->size_y >> 2) ; j++)
+    {
+      jdiv = j >> 2;
+      for (i = 0 ; i < (frame->size_x >> 2) ; i++)
+      {
+        idiv = (i >> 2);
+        // index into mb_field; presumably the macroblock address in
+        // macroblock-pair order - TODO confirm
+        currentmb = twosz16*(jdiv >> 1)+ (idiv)*2 + (jdiv & 0x01);
+
+        if (frame->motion.mb_field[currentmb])
+        {
+          // list_offset is 0 for even and 1 for odd currentmb and selects
+          // list rows 2/3 or 4/5 of the reference tables
+          int list_offset = currentmb&1;
+          dummylist0 = frame->motion.motion[LIST_0][j][i].ref_idx;
+          dummylist1 = frame->motion.motion[LIST_1][j][i].ref_idx;
+          //! association with id already known for fields.
+          frame->motion.field_references[2*list_offset  ][j][i]= (dummylist0>=0)? frame->ref_pic_num[frame->slice_id[jdiv][idiv]][LIST_0 + list_offset*2 + 2][dummylist0] : 0;
+          frame->motion.field_references[2*list_offset+1][j][i]= (dummylist1>=0)? frame->ref_pic_num[frame->slice_id[jdiv][idiv]][LIST_1 + list_offset*2 + 2][dummylist1] : 0;
+          //! need to make association with frames
+          frame->motion.motion[LIST_0][j][i].ref_id = (dummylist0>=0)? frame->frm_ref_pic_num[frame->slice_id[jdiv][idiv]][LIST_0 + list_offset*2 + 2][dummylist0] : 0;
+          frame->motion.motion[LIST_1][j][i].ref_id = (dummylist1>=0)? frame->frm_ref_pic_num[frame->slice_id[jdiv][idiv]][LIST_1 + list_offset*2 + 2][dummylist1] : 0;
+
+        }
+        else
+        {
+          // mb_field not set: plain LIST_0 / LIST_1 lookup, -1 for a negative ref_idx
+          dummylist0 = frame->motion.motion[LIST_0][j][i].ref_idx;
+          dummylist1 = frame->motion.motion[LIST_1][j][i].ref_idx;
+          frame->motion.motion[LIST_0][j][i].ref_id = (dummylist0>=0)? frame->ref_pic_num[frame->slice_id[jdiv][idiv]][LIST_0][dummylist0] : -1;
+          frame->motion.motion[LIST_1][j][i].ref_id = (dummylist1>=0)? frame->ref_pic_num[frame->slice_id[jdiv][idiv]][LIST_1][dummylist1] : -1;
+        }
+      }
+    }
+  }
+
+  // Derive the motion data of the two field pictures (only when field
+  // pictures were allocated above).
+    if (!frame->frame_mbs_only_flag)
+  {
+    if (frame->mb_aff_frame_flag)
+  {
+    PicMotionParams *frm_motion = &frame->motion;
+    PicMotionParams *top_motion = &fs_top->motion;
+    PicMotionParams *btm_motion = &fs_btm->motion;
+    for (j=0 ; j< (frame->size_y >> 3); j++)
+    {
+      // field row j reads frame row jj for the top field and jj + 4 for the
+      // bottom field: within each group of 8 frame rows the first 4 feed
+      // the top field and the last 4 the bottom field
+      jj = (j >> 2)*8 + (j & 0x03);
+      jj4 = jj + 4;
+      jdiv = (j >> 1);
+      for (i=0 ; i < (frame->size_x>>2); i++)
+      {
+        idiv = (i >> 2);
+
+        currentmb = twosz16*(jdiv >> 1)+ (idiv)*2 + (jdiv & 0x01);
+        // Assign field mvs attached to MB-Frame buffer to the proper buffer
+        if (frm_motion->mb_field[currentmb])
+        {
+          btm_motion->field_frame[j][i]  = top_motion->field_frame[j][i]=1;
+          frm_motion->field_frame[2*j][i] = frm_motion->field_frame[2*j+1][i]=1;
+
+          // bottom field: vectors and ref_idx from frame row jj4, ref_id from
+          // field_references[2] / [3]
+          btm_motion->motion[LIST_0][j][i].mv[0] = frm_motion->motion[LIST_0][jj4][i].mv[0];
+          btm_motion->motion[LIST_0][j][i].mv[1] = frm_motion->motion[LIST_0][jj4][i].mv[1];
+          btm_motion->motion[LIST_1][j][i].mv[0] = frm_motion->motion[LIST_1][jj4][i].mv[0];
+          btm_motion->motion[LIST_1][j][i].mv[1] = frm_motion->motion[LIST_1][jj4][i].mv[1];
+          btm_motion->motion[LIST_0][j][i].ref_idx = frm_motion->motion[LIST_0][jj4][i].ref_idx;
+          btm_motion->motion[LIST_1][j][i].ref_idx = frm_motion->motion[LIST_1][jj4][i].ref_idx;
+          btm_motion->motion[LIST_0][j][i].ref_id = frm_motion->field_references[2][jj4][i];
+          btm_motion->motion[LIST_1][j][i].ref_id = frm_motion->field_references[3][jj4][i];
+
+
+          // top field: vectors and ref_idx from frame row jj, ref_id from
+          // field_references[0] / [1]
+          top_motion->motion[LIST_0][j][i].mv[0] = frm_motion->motion[LIST_0][jj][i].mv[0];
+          top_motion->motion[LIST_0][j][i].mv[1] = frm_motion->motion[LIST_0][jj][i].mv[1];
+          top_motion->motion[LIST_1][j][i].mv[0] = frm_motion->motion[LIST_1][jj][i].mv[0];
+          top_motion->motion[LIST_1][j][i].mv[1] = frm_motion->motion[LIST_1][jj][i].mv[1];
+          top_motion->motion[LIST_0][j][i].ref_idx = frm_motion->motion[LIST_0][jj][i].ref_idx;
+          top_motion->motion[LIST_1][j][i].ref_idx = frm_motion->motion[LIST_1][jj][i].ref_idx;
+          top_motion->motion[LIST_0][j][i].ref_id = frm_motion->field_references[0][jj][i];
+          top_motion->motion[LIST_1][j][i].ref_id = frm_motion->field_references[1][jj][i];
+        }
+      }
+    }
+  }
+
+  //! Generate field MVs from Frame MVs
+    for (j=0 ; j < (frame->size_y >> 3) ; j++)
+    {
+      // RSD() is defined elsewhere in the project; it maps the field
+      // position to the frame motion entry that is sampled
+      jj = 2* RSD(j);
+      jdiv = (j >> 1);
+      for (i=0 ; i < (frame->size_x >> 2) ; i++)
+      {
+        ii = RSD(i);
+        idiv = (i >> 2);
+
+        currentmb = twosz16 * (jdiv >> 1)+ (idiv)*2 + (jdiv & 0x01);
+
+        // handled here: every entry when mb_aff_frame_flag is clear, otherwise
+        // only entries whose macroblock has mb_field clear
+        if (!frame->mb_aff_frame_flag  || !frame->motion.mb_field[currentmb])
+        {
+          frame->motion.field_frame[2*j+1][i] = frame->motion.field_frame[2*j][i]=0;
+
+          fs_top->motion.field_frame[j][i] = fs_btm->motion.field_frame[j][i] = 0;
+
+          // both fields receive the same vectors, taken from frame entry [jj][ii]
+          fs_top->motion.motion[LIST_0][j][i].mv[0] = fs_btm->motion.motion[LIST_0][j][i].mv[0] = frame->motion.motion[LIST_0][jj][ii].mv[0];
+          fs_top->motion.motion[LIST_0][j][i].mv[1] = fs_btm->motion.motion[LIST_0][j][i].mv[1] = frame->motion.motion[LIST_0][jj][ii].mv[1];
+          fs_top->motion.motion[LIST_1][j][i].mv[0] = fs_btm->motion.motion[LIST_1][j][i].mv[0] = frame->motion.motion[LIST_1][jj][ii].mv[0];
+          fs_top->motion.motion[LIST_1][j][i].mv[1] = fs_btm->motion.motion[LIST_1][j][i].mv[1] = frame->motion.motion[LIST_1][jj][ii].mv[1];
+
+          // Scaling of references is done here since it will not affect spatial direct (2*0 =0)
+          // when ref_idx is -1 only ref_idx is written; ref_id of the field
+          // entries is left untouched in that case
+          if (frame->motion.motion[LIST_0][jj][ii].ref_idx == -1)
+            fs_top->motion.motion[LIST_0][j][i].ref_idx = fs_btm->motion.motion[LIST_0][j][i].ref_idx = - 1;
+          else
+          {
+            dummylist0=fs_top->motion.motion[LIST_0][j][i].ref_idx = fs_btm->motion.motion[LIST_0][j][i].ref_idx = frame->motion.motion[LIST_0][jj][ii].ref_idx;
+            fs_top->motion.motion[LIST_0][j][i].ref_id = (dummylist0>=0)? frame->top_ref_pic_num[frame->slice_id[jj>>2][ii>>2]][LIST_0][dummylist0] : 0;
+            fs_btm->motion.motion[LIST_0][j][i].ref_id = (dummylist0>=0)? frame->bottom_ref_pic_num[frame->slice_id[jj>>2][ii>>2]][LIST_0][dummylist0] : 0;
+          }
+
+          if (frame->motion.motion[LIST_1][jj][ii].ref_idx == -1)
+            fs_top->motion.motion[LIST_1][j][i].ref_idx = fs_btm->motion.motion[LIST_1][j][i].ref_idx = - 1;
+          else
+          {
+            dummylist1=fs_top->motion.motion[LIST_1][j][i].ref_idx = fs_btm->motion.motion[LIST_1][j][i].ref_idx = frame->motion.motion[LIST_1][jj][ii].ref_idx;
+
+            fs_top->motion.motion[LIST_1][j][i].ref_id = (dummylist1>=0)? frame->top_ref_pic_num[frame->slice_id[jj>>2][ii>>2]][LIST_1][dummylist1] : 0;
+            fs_btm->motion.motion[LIST_1][j][i].ref_id = (dummylist1>=0)? frame->bottom_ref_pic_num[frame->slice_id[jj>>2][ii>>2]][LIST_1][dummylist1] : 0;
+          }
+        }
+        else
+        {
+          // entries of field macroblocks were filled by the loop above; only
+          // the frame's field_frame map is refreshed from mb_field
+          frame->motion.field_frame[2*j+1][i] = frame->motion.field_frame[2*j][i]= frame->motion.mb_field[currentmb];
+        }
+      }
+    }
+  }
+  else
+  {
+    // frame-only content: clear one byte per 4x4 block, starting at
+    // field_frame[0][0]; assumes the rows of field_frame are stored
+    // contiguously - TODO confirm against its allocation
+    memset( &(frame->motion.field_frame[0][0]), 0, (frame->size_y * frame->size_x >> 4) * sizeof(byte));
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Build fs->frame from the top and bottom field pictures of a frame
+ *    store: sample data, POC values, reference flags and cropping
+ *    information only (no motion data).
+ *
+ * \param p_Vid
+ *    video parameters, passed to alloc_storable_picture
+ * \param fs
+ *    frame store; top_field and bottom_field are read, frame is
+ *    allocated and assigned
+ ************************************************************************
+ */
+void dpb_combine_field_yuv(VideoParameters *p_Vid, FrameStore *fs)
+{
+  StorablePicture *top = fs->top_field;
+  StorablePicture *btm = fs->bottom_field;
+  StorablePicture *frm;
+  int row, plane;
+
+  // the frame is twice as tall as one field
+  fs->frame = alloc_storable_picture(p_Vid, FRAME, top->size_x, top->size_y*2, top->size_x_cr, top->size_y_cr*2);
+  frm = fs->frame;
+
+  // luma: top field rows go to even frame rows, bottom field rows to odd ones
+  for (row = 0; row < top->size_y; row++)
+  {
+    memcpy(frm->imgY->img[row*2],     top->imgY->img[row], top->size_x * sizeof(imgpel));
+    memcpy(frm->imgY->img[row*2 + 1], btm->imgY->img[row], btm->size_x * sizeof(imgpel));
+  }
+
+  // chroma: same interleaving for both planes
+  for (plane = 0; plane < 2; plane++)
+  {
+    for (row = 0; row < top->size_y_cr; row++)
+    {
+      memcpy(frm->imgUV[plane]->img[row*2],     top->imgUV[plane]->img[row], top->size_x_cr*sizeof(imgpel));
+      memcpy(frm->imgUV[plane]->img[row*2 + 1], btm->imgUV[plane]->img[row], btm->size_x_cr*sizeof(imgpel));
+    }
+  }
+
+  // the frame POC is the smaller of the two field POCs
+  fs->poc = frm->poc = frm->frame_poc = imin (top->poc, btm->poc);
+
+  btm->frame_poc = top->frame_poc = frm->poc;
+
+  btm->top_poc = frm->top_poc = top->poc;
+  top->bottom_poc = frm->bottom_poc = btm->poc;
+
+  // the frame is a (long-term) reference only if both fields are
+  frm->used_for_reference = (top->used_for_reference && btm->used_for_reference );
+  frm->is_long_term = (top->is_long_term && btm->is_long_term );
+
+  if (frm->is_long_term)
+    frm->long_term_frame_idx = fs->long_term_frame_idx;
+
+  frm->top_field    = top;
+  frm->bottom_field = btm;
+
+  // this frame was assembled from fields, not coded as a frame
+  frm->coded_frame = 0;
+
+  // chroma format and cropping are taken from the top field
+  frm->chroma_format_idc = top->chroma_format_idc;
+  frm->frame_cropping_flag = top->frame_cropping_flag;
+  if (frm->frame_cropping_flag)
+  {
+    frm->frame_cropping_rect_top_offset = top->frame_cropping_rect_top_offset;
+    frm->frame_cropping_rect_bottom_offset = top->frame_cropping_rect_bottom_offset;
+    frm->frame_cropping_rect_left_offset = top->frame_cropping_rect_left_offset;
+    frm->frame_cropping_rect_right_offset = top->frame_cropping_rect_right_offset;
+  }
+
+  top->frame = btm->frame = frm;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Build fs->frame from the top and bottom field pictures of a frame
+ *    store, including reference tables and motion data.
+ *
+ *    Sample data is combined by dpb_combine_field_yuv. Afterwards the
+ *    frame's ref_pic_num tables are derived from the fields and the
+ *    field motion entries are copied into the frame motion array
+ *    (top field rows to jj, bottom field rows to jj + 4 within each
+ *    group of 8 frame rows).
+ *
+ * \param p_Vid
+ *    video parameters, forwarded to dpb_combine_field_yuv
+ * \param fs
+ *    frame store holding both fields; fs->frame is created
+ ************************************************************************
+ */
+void dpb_combine_field(VideoParameters *p_Vid, FrameStore *fs)
+{
+  StorablePicture *top, *btm, *frm;
+  int col, row, list, slice;
+  int top_row, btm_row;
+  int idx0, idx1;
+
+  dpb_combine_field_yuv(p_Vid, fs);
+
+  top = fs->top_field;
+  btm = fs->bottom_field;
+  frm = fs->frame;
+
+  // frame reference table: every second field entry, rounded down to an
+  // even value, taking the smaller of the top and bottom field values
+  for (slice = 0; slice <= (imax(top->max_slice_id, btm->max_slice_id)); slice++)
+  {
+    for (list = LIST_0; list <= LIST_1; list++)
+    {
+      for (col = 0; col < 16; col++)
+      {
+        frm->ref_pic_num[slice][list][col] = (h264_ref_t) i64min ((top->ref_pic_num[slice][list][2*col]/2)*2, (btm->ref_pic_num[slice][list][2*col]/2)*2);
+      }
+    }
+  }
+
+  //! Use inference flag to remap mvs/references
+
+  //! Generate Frame parameters from field information.
+  for (row = 0; row < (top->size_y >> 2); row++)
+  {
+    top_row = 8*(row >> 2) + (row & 0x03);
+    btm_row = top_row + 4;
+    for (col = 0; col < (top->size_x >> 2); col++)
+    {
+      PicMotion *top_l0 = &top->motion.motion[LIST_0][row][col];
+      PicMotion *top_l1 = &top->motion.motion[LIST_1][row][col];
+      PicMotion *btm_l0 = &btm->motion.motion[LIST_0][row][col];
+      PicMotion *btm_l1 = &btm->motion.motion[LIST_1][row][col];
+      PicMotion *frm_top_l0 = &frm->motion.motion[LIST_0][top_row][col];
+      PicMotion *frm_top_l1 = &frm->motion.motion[LIST_1][top_row][col];
+      PicMotion *frm_btm_l0 = &frm->motion.motion[LIST_0][btm_row][col];
+      PicMotion *frm_btm_l1 = &frm->motion.motion[LIST_1][btm_row][col];
+
+      frm->motion.field_frame[top_row][col] = frm->motion.field_frame[btm_row][col] = 1;
+
+      // ---- top field -> frame row top_row ----
+      frm_top_l0->mv[0] = top_l0->mv[0];
+      frm_top_l0->mv[1] = top_l0->mv[1];
+      frm_top_l1->mv[0] = top_l1->mv[0];
+      frm_top_l1->mv[1] = top_l1->mv[1];
+
+      idx0 = frm_top_l0->ref_idx = top_l0->ref_idx;
+      idx1 = frm_top_l1->ref_idx = top_l1->ref_idx;
+
+      //! association with id already known for fields.
+      top_l0->ref_id = (idx0>=0)? top->ref_pic_num[top->slice_id[row>>2][col>>2]][LIST_0][idx0] : 0;
+      top_l1->ref_id = (idx1>=0)? top->ref_pic_num[top->slice_id[row>>2][col>>2]][LIST_1][idx1] : 0;
+
+      //! need to make association with frames
+      frm_top_l0->ref_id = (idx0>=0)? top->frm_ref_pic_num[top->slice_id[row>>2][col>>2]][LIST_0][idx0] : 0;
+      frm_top_l1->ref_id = (idx1>=0)? top->frm_ref_pic_num[top->slice_id[row>>2][col>>2]][LIST_1][idx1] : 0;
+
+      // ---- bottom field -> frame row btm_row ----
+      frm_btm_l0->mv[0] = btm_l0->mv[0];
+      frm_btm_l0->mv[1] = btm_l0->mv[1];
+      frm_btm_l1->mv[0] = btm_l1->mv[0];
+      frm_btm_l1->mv[1] = btm_l1->mv[1];
+
+      idx0 = frm_btm_l0->ref_idx = btm_l0->ref_idx;
+      idx1 = frm_btm_l1->ref_idx = btm_l1->ref_idx;
+
+      btm_l0->ref_id = (idx0>=0)? btm->ref_pic_num[btm->slice_id[row>>2][col>>2]][LIST_0][idx0] : 0;
+      btm_l1->ref_id = (idx1>=0)? btm->ref_pic_num[btm->slice_id[row>>2][col>>2]][LIST_1][idx1] : 0;
+
+      //! need to make association with frames
+      // note: unlike the top field rows, a negative ref_idx maps to -1 here
+      frm_btm_l0->ref_id = (idx0>=0)? btm->frm_ref_pic_num[btm->slice_id[row>>2][col>>2]][LIST_0][idx0] : -1;
+      frm_btm_l1->ref_id = (idx1>=0)? btm->frm_ref_pic_num[btm->slice_id[row>>2][col>>2]][LIST_1][idx1] : -1;
+
+      top->motion.field_frame[row][col] = 1;
+      btm->motion.field_frame[row][col] = 1;
+    }
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate the buffers that hold reference picture list reordering
+ *    commands for a slice.
+ *
+ *    List 0 buffers are allocated unless the picture type is I_SLICE or
+ *    SI_SLICE; list 1 buffers are allocated only for B_SLICE. Buffers
+ *    that are not needed are set to NULL. Each buffer is zero-filled and
+ *    has num_ref_idx_lX_active + 1 int entries.
+ *
+ * \param currSlice
+ *    slice whose reordering buffers are set up
+ ************************************************************************
+ */
+void alloc_ref_pic_list_reordering_buffer(Slice *currSlice)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  int size = currSlice->num_ref_idx_l0_active + 1;
+
+  // ---- list 0 ----
+  if (p_Vid->type==I_SLICE || p_Vid->type==SI_SLICE)
+  {
+    currSlice->reordering_of_pic_nums_idc_l0 = NULL;
+    currSlice->abs_diff_pic_num_minus1_l0 = NULL;
+    currSlice->long_term_pic_idx_l0 = NULL;
+  }
+  else
+  {
+    currSlice->reordering_of_pic_nums_idc_l0 = calloc(size,sizeof(int));
+    if (currSlice->reordering_of_pic_nums_idc_l0 == NULL)
+      no_mem_exit("alloc_ref_pic_list_reordering_buffer: reordering_of_pic_nums_idc_l0");
+
+    currSlice->abs_diff_pic_num_minus1_l0 = calloc(size,sizeof(int));
+    if (currSlice->abs_diff_pic_num_minus1_l0 == NULL)
+      no_mem_exit("alloc_ref_pic_list_reordering_buffer: abs_diff_pic_num_minus1_l0");
+
+    currSlice->long_term_pic_idx_l0 = calloc(size,sizeof(int));
+    if (currSlice->long_term_pic_idx_l0 == NULL)
+      no_mem_exit("alloc_ref_pic_list_reordering_buffer: long_term_pic_idx_l0");
+  }
+
+  // ---- list 1 ----
+  size = currSlice->num_ref_idx_l1_active+1;
+
+  if (p_Vid->type!=B_SLICE)
+  {
+    currSlice->reordering_of_pic_nums_idc_l1 = NULL;
+    currSlice->abs_diff_pic_num_minus1_l1 = NULL;
+    currSlice->long_term_pic_idx_l1 = NULL;
+  }
+  else
+  {
+    currSlice->reordering_of_pic_nums_idc_l1 = calloc(size,sizeof(int));
+    if (currSlice->reordering_of_pic_nums_idc_l1 == NULL)
+      no_mem_exit("alloc_ref_pic_list_reordering_buffer: reordering_of_pic_nums_idc_l1");
+
+    currSlice->abs_diff_pic_num_minus1_l1 = calloc(size,sizeof(int));
+    if (currSlice->abs_diff_pic_num_minus1_l1 == NULL)
+      no_mem_exit("alloc_ref_pic_list_reordering_buffer: abs_diff_pic_num_minus1_l1");
+
+    currSlice->long_term_pic_idx_l1 = calloc(size,sizeof(int));
+    if (currSlice->long_term_pic_idx_l1 == NULL)
+      no_mem_exit("alloc_ref_pic_list_reordering_buffer: long_term_pic_idx_l1");
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Release the reference picture list reordering buffers of a slice
+ *    and reset the pointers to NULL.
+ *
+ * \param currSlice
+ *    slice whose reordering buffers are released; buffers that were
+ *    never allocated (NULL) are accepted
+ ************************************************************************
+ */
+void free_ref_pic_list_reordering_buffer(Slice *currSlice)
+{
+  // free() accepts NULL, so unallocated buffers need no special casing
+
+  // ---- list 0 ----
+  free(currSlice->reordering_of_pic_nums_idc_l0);
+  currSlice->reordering_of_pic_nums_idc_l0 = NULL;
+
+  free(currSlice->abs_diff_pic_num_minus1_l0);
+  currSlice->abs_diff_pic_num_minus1_l0 = NULL;
+
+  free(currSlice->long_term_pic_idx_l0);
+  currSlice->long_term_pic_idx_l0 = NULL;
+
+  // ---- list 1 ----
+  free(currSlice->reordering_of_pic_nums_idc_l1);
+  currSlice->reordering_of_pic_nums_idc_l1 = NULL;
+
+  free(currSlice->abs_diff_pic_num_minus1_l1);
+  currSlice->abs_diff_pic_num_minus1_l1 = NULL;
+
+  free(currSlice->long_term_pic_idx_l1);
+  currSlice->long_term_pic_idx_l1 = NULL;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Fill a gap in frame_num with placeholder pictures.
+ *
+ *    For every frame number between pre_frame_num + 1 and the current
+ *    frame_num (modulo MaxFrameNum) a picture marked non_existing,
+ *    is_output and used_for_reference is allocated and stored in the
+ *    DPB. delta_pic_order_cnt is zeroed while the gap is filled and
+ *    restored afterwards, as is frame_num.
+ *
+ * \param p_Vid
+ *    video parameters of the decoder
+ *
+ ************************************************************************
+ */
+// TODO: benski> pass in timecode
+void fill_frame_num_gap(VideoParameters *p_Vid)
+{
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+  int saved_delta0 = p_Vid->delta_pic_order_cnt[0];
+  int saved_delta1 = p_Vid->delta_pic_order_cnt[1];
+  int wanted_frame_num;
+  int gap_frame_num;
+
+  p_Vid->delta_pic_order_cnt[0] = p_Vid->delta_pic_order_cnt[1] = 0;
+
+  gap_frame_num = (p_Vid->pre_frame_num + 1) % p_Vid->MaxFrameNum;
+  wanted_frame_num = p_Vid->frame_num;
+
+  for (; gap_frame_num != wanted_frame_num; gap_frame_num = (gap_frame_num + 1) % p_Vid->MaxFrameNum)
+  {
+    StorablePicture *filler = alloc_storable_picture (p_Vid, FRAME, p_Vid->width, p_Vid->height, p_Vid->width_cr, p_Vid->height_cr);
+
+    // placeholder: counts as an already-output short-term reference
+    filler->coded_frame = 1;
+    filler->pic_num = gap_frame_num;
+    filler->frame_num = gap_frame_num;
+    filler->non_existing = 1;
+    filler->is_output = 1;
+    filler->used_for_reference = 1;
+
+    filler->adaptive_ref_pic_buffering_flag = 0;
+
+    // decode_poc works on p_Vid->frame_num, so it has to be set first
+    p_Vid->frame_num = gap_frame_num;
+    if (active_sps->pic_order_cnt_type!=0)
+    {
+      decode_poc(p_Vid);
+    }
+    filler->top_poc = p_Vid->toppoc;
+    filler->bottom_poc = p_Vid->bottompoc;
+    filler->frame_poc = p_Vid->framepoc;
+    filler->poc = p_Vid->framepoc;
+
+    store_picture_in_dpb(p_Vid, filler);
+
+    p_Vid->pre_frame_num = gap_frame_num;
+  }
+
+  // restore the state that was overridden while filling the gap
+  p_Vid->delta_pic_order_cnt[0] = saved_delta0;
+  p_Vid->delta_pic_order_cnt[1] = saved_delta1;
+  p_Vid->frame_num = wanted_frame_num;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Set up the motion buffers of one colocated motion structure.
+ *
+ *    Both per-list motion arrays are taken from the motion cache when
+ *    the cache dimensions match; any array the cache did not supply is
+ *    allocated with get_mem2DPicMotion. The moving_block map is always
+ *    allocated, with both dimensions rounded up to an even value.
+ *
+ * \param p_Vid
+ *    video parameters owning the motion cache
+ * \param ftype
+ *    motion structure to fill in
+ * \param size_y
+ *    number of rows of motion entries
+ * \param size_x
+ *    number of columns of motion entries
+ ************************************************************************
+ */
+#define ROUNDUP2(size) (((size)+1) & ~1)
+void alloc_motion_params(VideoParameters *p_Vid, MotionParams *ftype, int size_y, int size_x)
+{
+  int list;
+
+  for (list = 0; list < 2; list++)
+    ftype->motion[list] = 0;
+
+  // first choice: recycle buffers of the right size from the cache
+  if (motion_cache_dimensions_match(&p_Vid->motion_cache, size_x, size_y))
+  {
+    for (list = 0; list < 2; list++)
+      ftype->motion[list] = motion_cache_get(&p_Vid->motion_cache);
+  }
+
+  // fall back to a fresh allocation for whatever is still missing
+  for (list = 0; list < 2; list++)
+  {
+    if (!ftype->motion[list])
+      get_mem2DPicMotion(&(ftype->motion[list]), size_y, size_x);
+  }
+
+  get_mem2D      (&(ftype->moving_block) , ROUNDUP2(size_y), ROUNDUP2(size_x));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Create a zero-initialised co-located structure together with its
+ *    motion buffers.
+ *
+ * \param p_Vid
+ *    video parameters, forwarded to alloc_motion_params
+ * \param size_x
+ *    horizontal luma size
+ * \param size_y
+ *    vertical luma size
+ * \param mb_adaptive_frame_field_flag
+ *    flag that indicates macroblock adaptive frame/field coding; when
+ *    set, half-height top and bottom motion buffers are allocated too
+ *
+ * \return
+ *    the allocated ColocatedParams structure
+ ************************************************************************
+ */
+ColocatedParams* alloc_colocated(VideoParameters *p_Vid, int size_x, int size_y, int mb_adaptive_frame_field_flag)
+{
+  ColocatedParams *coloc = calloc(1, sizeof(ColocatedParams));
+
+  if (coloc == NULL)
+    no_mem_exit("alloc_colocated: s");
+
+  coloc->size_x = size_x;
+  coloc->size_y = size_y;
+
+  // frame motion: one entry per BLOCK_SIZE x BLOCK_SIZE luma block
+  alloc_motion_params(p_Vid, &coloc->frame, size_y / BLOCK_SIZE, size_x / BLOCK_SIZE);
+
+  // field motion: same width, half the number of rows
+  if (mb_adaptive_frame_field_flag)
+  {
+    alloc_motion_params(p_Vid, &coloc->top   , size_y / (BLOCK_SIZE * 2), size_x / BLOCK_SIZE);
+    alloc_motion_params(p_Vid, &coloc->bottom, size_y / (BLOCK_SIZE * 2), size_x / BLOCK_SIZE);
+  }
+
+  coloc->mb_adaptive_frame_field_flag = mb_adaptive_frame_field_flag;
+
+  return coloc;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Free co-located memory.
+ *
+ *    The frame motion buffers are handed back to the motion cache when
+ *    their dimensions match the cache, otherwise they are freed. The
+ *    dimensions are taken from the structure itself (size_x / size_y as
+ *    recorded by alloc_colocated), i.e. the values the buffers were
+ *    allocated with, rather than from the current picture size in p_Vid,
+ *    so a buffer of another size is never added to the cache.
+ *    The top and bottom motion buffers (present only when
+ *    mb_adaptive_frame_field_flag is set) are always freed directly.
+ *
+ * \param p_Vid
+ *    video parameters owning the motion cache
+ * \param p
+ *    Structure to be freed; NULL is accepted and ignored
+ *
+ ************************************************************************
+ */
+void free_colocated(VideoParameters *p_Vid, ColocatedParams* p)
+{
+  if (!p)
+    return;
+
+  // same block-unit dimensions that alloc_colocated passed to alloc_motion_params
+  if (motion_cache_dimensions_match(&p_Vid->motion_cache, p->size_x / BLOCK_SIZE, p->size_y / BLOCK_SIZE))
+  {
+    motion_cache_add(&p_Vid->motion_cache,p->frame.motion[0]);
+    motion_cache_add(&p_Vid->motion_cache,p->frame.motion[1]);
+  }
+  else
+  {
+    free_mem2DPicMotion(p->frame.motion[0]);
+    free_mem2DPicMotion(p->frame.motion[1]);
+  }
+
+  if (p->frame.moving_block)
+  {
+    free_mem2D (p->frame.moving_block);
+    p->frame.moving_block=NULL;
+  }
+
+  if (p->mb_adaptive_frame_field_flag)
+  {
+    // field buffers are half height and are never returned to the cache
+    free_mem2DPicMotion(p->top.motion[0]);
+    free_mem2DPicMotion(p->top.motion[1]);
+
+    if (p->top.moving_block)
+    {
+      free_mem2D (p->top.moving_block);
+      p->top.moving_block=NULL;
+    }
+
+    free_mem2DPicMotion(p->bottom.motion[0]);
+    free_mem2DPicMotion(p->bottom.motion[1]);
+
+    if (p->bottom.moving_block)
+    {
+      free_mem2D (p->bottom.moving_block);
+      p->bottom.moving_block=NULL;
+    }
+  }
+
+  free(p);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Compute co-located motion info
+ *
+ *    Reads the motion data of the first LIST_1 reference picture and
+ *    prepares p->frame for direct prediction:
+ *      - direct_spatial_mv_pred_flag == 1: fills p->frame.moving_block
+ *        (and, without direct_8x8_inference_flag, also copies the
+ *        motion data into p->frame.motion)
+ *      - otherwise: copies the motion data into p->frame.motion; with
+ *        direct_8x8_inference_flag every entry is then replaced by the
+ *        entry at its RSD()-remapped position
+ *    When direct_spatial_mv_pred_flag == 0, currSlice->mvscale[0][] is
+ *    computed for every picture in list 0.
+ *
+ * \param currSlice
+ *    current slice (supplies p_Vid and the direct mode flag; receives mvscale)
+ * \param p
+ *    co-located parameters to fill
+ * \param listX
+ *    reference picture lists; listX[LIST_1][0] is the co-located picture
+ *
+ ************************************************************************
+ */
+void compute_colocated (Slice *currSlice, ColocatedParams* p, StorablePicture **listX[6])
+{
+  StorablePicture *col_pic = listX[LIST_1][0];
+  const int cols4 = (col_pic->size_x >> 2);   // picture width in 4-sample units
+  const int rows4 = (col_pic->size_y >> 2);   // picture height in 4-sample units
+  MotionParams *dst = &p->frame;
+  PicMotionParams *src = &col_pic->motion;
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+  int x, y, list;
+
+  p->is_long_term = col_pic->is_long_term;
+
+  if (active_sps->direct_8x8_inference_flag)
+  {
+    if (currSlice->direct_spatial_mv_pred_flag == 1)
+    {
+      // Spatial direct
+      if (p->is_long_term)
+      {
+        // Long-term co-located picture: every block is flagged as moving
+        for (y = 0; y < rows4; y++)
+        {
+          for (x = 0; x < cols4; x++)
+          {
+            dst->moving_block[y][x] = (byte) 1;
+          }
+        }
+      }
+      else
+      {
+        PicMotion **col_l0 = src->motion[LIST_0];
+        PicMotion **col_l1 = src->motion[LIST_1];
+
+        // One decision per 2x2 group, taken from the RSD()-remapped entry
+        for (y = 0; y < rows4; y += 2)
+        {
+          byte *upper = dst->moving_block[y];
+          byte *lower = dst->moving_block[y + 1];
+          int ry = RSD(y);
+          PicMotion *l0_row = col_l0[ry];
+          PicMotion *l1_row = col_l1[ry];
+
+          for (x = 0; x < cols4; x += 2)
+          {
+            int rx = RSD(x);
+            PicMotion *c0 = &l0_row[rx];
+            byte moving = 1;
+
+            if (c0->ref_idx == 0)
+            {
+              // list 0 points at reference 0: static when both mv components are in [-1, 1]
+              if (((iabs(c0->mv[0]) >> 1) == 0) && ((iabs(c0->mv[1]) >> 1) == 0))
+                moving = 0;
+            }
+            else if (c0->ref_idx == -1)
+            {
+              // list 0 unused: apply the same test to list 1
+              PicMotion *c1 = &l1_row[rx];
+
+              if ((c1->ref_idx == 0) && ((iabs(c1->mv[0]) >> 1) == 0) && ((iabs(c1->mv[1]) >> 1) == 0))
+                moving = 0;
+            }
+
+            upper[x]     = moving;
+            upper[x + 1] = moving;
+            lower[x]     = moving;
+            lower[x + 1] = moving;
+          }
+        }
+      }
+    }
+    else
+    {
+      // Copy mv / ref_idx / ref_id of both lists from the co-located picture
+      for (list = LIST_0; list <= LIST_1; list++)
+      {
+        for (y = 0; y < rows4; y++)
+        {
+          for (x = 0; x < cols4; x++)
+          {
+            PicMotion *from = &src->motion[list][y][x];
+            PicMotion *to = &dst->motion[list][y][x];
+
+            memcpy(&to->mv, &from->mv, sizeof(MotionVector));
+            to->ref_idx = from->ref_idx;
+            to->ref_pic_id = from->ref_id;
+          }
+        }
+      }
+
+      // temporal direct: replace every entry by the one at its RSD()-remapped position
+      for (y = 0; y < rows4; y++)
+      {
+        int ry = RSD(y);
+
+        for (x = 0; x < cols4; x++)
+        {
+          int rx = RSD(x);
+
+          dst->motion[LIST_0][y][x] = dst->motion[LIST_0][ry][rx];
+          dst->motion[LIST_1][y][x] = dst->motion[LIST_1][ry][rx];
+        }
+      }
+    }
+  }
+  else
+  {
+    // No 8x8 inference: plain per-block copy of both lists
+    for (y = 0; y < rows4; y++)
+    {
+      for (x = 0; x < cols4; x++)
+      {
+        PicMotion *from0 = &src->motion[LIST_0][y][x];
+        PicMotion *to0 = &dst->motion[LIST_0][y][x];
+        PicMotion *from1 = &src->motion[LIST_1][y][x];
+        PicMotion *to1 = &dst->motion[LIST_1][y][x];
+
+        memcpy(&to0->mv, &from0->mv, sizeof(MotionVector));
+        to0->ref_idx = from0->ref_idx;
+        to0->ref_pic_id = from0->ref_id;
+
+        memcpy(&to1->mv, &from1->mv, sizeof(MotionVector));
+        to1->ref_idx = from1->ref_idx;
+        to1->ref_pic_id = from1->ref_id;
+      }
+    }
+
+    if (currSlice->direct_spatial_mv_pred_flag == 1)
+    {
+      if (p->is_long_term)
+      {
+        for (y = 0; y < rows4; y++)
+        {
+          for (x = 0; x < cols4; x++)
+          {
+            dst->moving_block[y][x] = 1;
+          }
+        }
+      }
+      else
+      {
+        for (y = 0; y < rows4; y++)
+        {
+          for (x = 0; x < cols4; x++)
+          {
+            PicMotion *m0 = &dst->motion[LIST_0][y][x];
+            PicMotion *m1 = &dst->motion[LIST_1][y][x];
+            // "still" when the list that is in use references index 0 with both mv components in [-1, 1]
+            int still =
+                 ((m0->ref_idx == 0)
+              &&  ((iabs(m0->mv[0]) >> 1) == 0)
+              &&  ((iabs(m0->mv[1]) >> 1) == 0))
+              || ((m0->ref_idx == -1)
+              &&  (m1->ref_idx == 0)
+              &&  ((iabs(m1->mv[0]) >> 1) == 0)
+              &&  ((iabs(m1->mv[1]) >> 1) == 0));
+
+            dst->moving_block[y][x] = !still;
+          }
+        }
+      }
+    }
+  }
+
+  if (currSlice->direct_spatial_mv_pred_flag == 0)
+  {
+    // Scale factors for list 0 only (this variant never visits the field lists)
+    for (x = 0; x < p_Vid->listXsize[0]; x++)
+    {
+      int prescale, iTRb, iTRp;
+
+      iTRb = iClip3( -128, 127, p_Vid->dec_picture->poc - listX[LIST_0][x]->poc );
+      iTRp = iClip3( -128, 127, listX[LIST_1][0]->poc - listX[LIST_0][x]->poc );
+
+      if (iTRp == 0)
+      {
+        // No poc distance to divide by: store the 9999 marker instead
+        currSlice->mvscale[0][x] = 9999;
+      }
+      else
+      {
+        prescale = ( 16384 + iabs( iTRp / 2 ) ) / iTRp;
+        currSlice->mvscale[0][x] = iClip3( -1024, 1023, ( iTRb * prescale + 32 ) >> 6 );
+      }
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Compute co-located motion info (for interlace support)
+ *
+ *    Fills p->frame (and, when currSlice->mb_aff_frame_flag is set,
+ *    p->top and p->bottom) from the motion data of the co-located
+ *    picture(s) taken from listX, sets p->is_long_term and, for
+ *    direct_spatial_mv_pred_flag == 1, the moving_block maps. When
+ *    direct_spatial_mv_pred_flag == 0 it also rescales the vertical
+ *    mv component and fills currSlice->mvscale.
+ *
+ *    The function makes several passes over the same p->frame arrays;
+ *    later passes overwrite or reuse what earlier passes stored, so the
+ *    order of the sections below matters.
+ *
+ * \param currSlice
+ *    current slice (supplies p_Vid, mb_aff_frame_flag and
+ *    direct_spatial_mv_pred_flag; receives mvscale)
+ * \param p
+ *    co-located parameters to fill
+ * \param listX
+ *    reference picture lists; [LIST_1][0] is the co-located picture,
+ *    [LIST_1 + 2][0] and [LIST_1 + 4][0] are used when mb_aff_frame_flag is set
+ *
+ ************************************************************************
+ */
+void compute_colocated_frames_mbs (Slice *currSlice, ColocatedParams* p, StorablePicture **listX[6])
+{
+  StorablePicture *fs = listX[LIST_1 ][0]; // co-located picture: first entry of LIST_1
+  StorablePicture *fs_top = fs, *fs_bottom = fs;
+  int i,j, ii, jj, jdiv;
+  int fs_size_x4 = (fs->size_x >> 2); // sizes are taken from the initial fs, before any field substitution below
+  int fs_size_y4 = (fs->size_y >> 2);
+  MotionParams *p_motion = &p->frame;
+  PicMotionParams *p_frm_motion = &fs->motion;
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+
+  if (currSlice->mb_aff_frame_flag)
+  {
+    fs_top = listX[LIST_1 + 2][0]; // MBAFF: top/bottom co-located pictures come from lists LIST_1 + 2 and LIST_1 + 4
+    fs_bottom = listX[LIST_1 + 4][0];
+  }
+  else
+  {
+    if (p_Vid->field_pic_flag)
+    {
+      if ((p_Vid->structure != fs->structure) && (fs->coded_frame)) // structures differ and the co-located picture is a coded frame: use its matching field
+      {
+        if (p_Vid->structure==TOP_FIELD)
+        {
+          fs_top = fs_bottom = fs = listX[LIST_1 ][0]->top_field;
+        }
+        else
+        {
+          fs_top = fs_bottom = fs = listX[LIST_1 ][0]->bottom_field;
+        }
+      }
+      p_frm_motion = &fs->motion; // follow fs in case it was replaced above
+    }
+  }  
+
+  if (!active_sps->frame_mbs_only_flag || active_sps->direct_8x8_inference_flag)
+  {
+    if (!currSlice->mb_aff_frame_flag)
+    { 
+      int k;
+
+      for (k = LIST_0; k<=LIST_1; k++) // non-MBAFF: straight copy of both lists into p->frame
+      {
+        for (j = 0; j < (fs->size_y>>2); j++)
+        {
+					for (i=0;i<fs_size_x4;i++)
+					{
+						
+						memcpy(&p_motion->motion[k][j][i].mv, &p_frm_motion->motion[k][j][i].mv,  sizeof(MotionVector));
+						p_motion->motion[k][j][i].ref_idx=p_frm_motion->motion[k][j][i].ref_idx;
+						p_motion->motion[k][j][i].ref_pic_id=p_frm_motion->motion[k][j][i].ref_id;
+					}
+        }
+      }
+      p->is_long_term = fs->is_long_term;
+    }
+    else
+    {
+      for (j=0 ; j < (fs->size_y>>2); j++)
+      {
+        jdiv = (j>>1); // row used to index the fs_top / fs_bottom motion arrays
+        jj = jdiv + ((j>>3)<<2); // row used for the ref_id lookup in p_frm_motion (+4 for the bottom case)
+        for (i=0 ; i < fs_size_x4 ; i++)
+        {
+          if (p_frm_motion->field_frame[j][i])
+          {
+            //! Assign frame buffers for field MBs
+            //! Check whether we should use top or bottom field mvs.
+            //! Depending on the assigned poc values.
+
+            if (iabs(p_Vid->dec_picture->poc - fs_bottom->poc)> iabs(p_Vid->dec_picture->poc -fs_top->poc) ) // top is strictly closer in poc; a tie selects bottom
+            {
+							 p_motion->motion[LIST_0][j][i].mv[0]      = fs_top->motion.motion[LIST_0][jdiv][i].mv[0];
+							 p_motion->motion[LIST_0][j][i].mv[1]      = fs_top->motion.motion[LIST_0][jdiv][i].mv[1];
+							 p_motion->motion[LIST_1][j][i].mv[0]      = fs_top->motion.motion[LIST_1][jdiv][i].mv[0];
+							 p_motion->motion[LIST_1][j][i].mv[1]      = fs_top->motion.motion[LIST_1][jdiv][i].mv[1];
+							 p_motion->motion[LIST_0][j][i].ref_idx      = fs_top->motion.motion[LIST_0][jdiv][i].ref_idx;
+							 p_motion->motion[LIST_1][j][i].ref_idx      = fs_top->motion.motion[LIST_1][jdiv][i].ref_idx;
+              //p_motion->mv[LIST_0][j][i][0]      = fs_top->motion.motion[LIST_0][jdiv][i].mv[0];
+              //p_motion->mv[LIST_0][j][i][1]      = fs_top->motion.motion[LIST_0][jdiv][i].mv[1] ;
+              //p_motion->mv[LIST_1][j][i][0]      = fs_top->motion.motion[LIST_1][jdiv][i].mv[0];
+              //p_motion->mv[LIST_1][j][i][1]      = fs_top->motion.motion[LIST_1][jdiv][i].mv[1] ;
+              //p_motion->ref_idx[LIST_0][j][i]    = fs_top->motion.motion[LIST_0][jdiv][i].ref_idx;
+              //p_motion->ref_idx[LIST_1][j][i]    = fs_top->motion.motion[LIST_1][jdiv][i].ref_idx;
+              p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion[LIST_0][jj][i].ref_id;
+              p_motion->motion[LIST_1][j][i].ref_pic_id = p_frm_motion->motion[LIST_1][jj][i].ref_id;
+
+              p->is_long_term             = fs_top->is_long_term;
+            }
+            else
+            {
+							p_motion->motion[LIST_0][j][i].mv[0]      = fs_bottom->motion.motion[LIST_0][jdiv][i].mv[0];
+							p_motion->motion[LIST_0][j][i].mv[1]      = fs_bottom->motion.motion[LIST_0][jdiv][i].mv[1];
+							p_motion->motion[LIST_1][j][i].mv[0]      = fs_bottom->motion.motion[LIST_1][jdiv][i].mv[0];
+							p_motion->motion[LIST_1][j][i].mv[1]      = fs_bottom->motion.motion[LIST_1][jdiv][i].mv[1];
+							p_motion->motion[LIST_0][j][i].ref_idx      = fs_bottom->motion.motion[LIST_0][jdiv][i].ref_idx;
+							p_motion->motion[LIST_1][j][i].ref_idx      = fs_bottom->motion.motion[LIST_1][jdiv][i].ref_idx;
+							//p_motion->mv[LIST_0][j][i][0]      = fs_bottom->motion.motion[LIST_0][jdiv][i].mv[0];
+              //p_motion->mv[LIST_0][j][i][1]      = fs_bottom->motion.motion[LIST_0][jdiv][i].mv[1] ;
+              //p_motion->mv[LIST_1][j][i][0]      = fs_bottom->motion.motion[LIST_1][jdiv][i].mv[0];
+              //p_motion->mv[LIST_1][j][i][1]      = fs_bottom->motion.motion[LIST_1][jdiv][i].mv[1] ;
+              //p_motion->ref_idx[LIST_0][j][i]    = fs_bottom->motion.motion[LIST_0][jdiv][i].ref_idx;
+              //p_motion->ref_idx[LIST_1][j][i]    = fs_bottom->motion.motion[LIST_1][jdiv][i].ref_idx;
+              p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion[LIST_0][jj + 4][i].ref_id;
+              p_motion->motion[LIST_1][j][i].ref_pic_id = p_frm_motion->motion[LIST_1][jj + 4][i].ref_id;
+
+              p->is_long_term             = fs_bottom->is_long_term;
+            }
+          }
+          else
+          {
+						p_motion->motion[LIST_0][j][i].mv[0]      = p_frm_motion->motion[LIST_0][j][i].mv[0];
+            p_motion->motion[LIST_0][j][i].mv[1]      = p_frm_motion->motion[LIST_0][j][i].mv[1] ;
+            p_motion->motion[LIST_1][j][i].mv[0]      = p_frm_motion->motion[LIST_1][j][i].mv[0];
+            p_motion->motion[LIST_1][j][i].mv[1]      = p_frm_motion->motion[LIST_1][j][i].mv[1] ;
+            p_motion->motion[LIST_0][j][i].ref_idx    = p_frm_motion->motion[LIST_0][j][i].ref_idx;
+            p_motion->motion[LIST_1][j][i].ref_idx    = p_frm_motion->motion[LIST_1][j][i].ref_idx;
+            //p_motion->mv[LIST_0][j][i][0]      = p_frm_motion->motion[LIST_0][j][i].mv[0];
+            //p_motion->mv[LIST_0][j][i][1]      = p_frm_motion->motion[LIST_0][j][i].mv[1] ;
+            //p_motion->mv[LIST_1][j][i][0]      = p_frm_motion->motion[LIST_1][j][i].mv[0];
+            //p_motion->mv[LIST_1][j][i][1]      = p_frm_motion->motion[LIST_1][j][i].mv[1] ;
+            //p_motion->ref_idx[LIST_0][j][i]    = p_frm_motion->motion[LIST_0][j][i].ref_idx;
+            //p_motion->ref_idx[LIST_1][j][i]    = p_frm_motion->motion[LIST_1][j][i].ref_idx;
+            p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion[LIST_0][j][i].ref_id;
+            p_motion->motion[LIST_1][j][i].ref_pic_id = p_frm_motion->motion[LIST_1][j][i].ref_id;
+
+            p->is_long_term             = fs->is_long_term;
+          }
+        }
+      }
+    }
+  }
+
+  //! Generate field MVs from Frame MVs
+  if (p_Vid->structure || currSlice->mb_aff_frame_flag)
+  {
+    for (j = 0; j < fs->size_y >> 3; j++) // only the first size_y / 8 rows are written in this pass
+    {
+      jj = RSD(j);
+      for (i = 0 ; i < fs->size_x >> 2; i++)
+      {
+        ii = RSD(i);
+        //! Do nothing if macroblock as field coded in MB-AFF
+        if (!currSlice->mb_aff_frame_flag )
+        {
+          p_motion->motion[LIST_0][j][i].mv[0] = p_frm_motion->motion[LIST_0][jj][ii].mv[0];
+          p_motion->motion[LIST_0][j][i].mv[1] = p_frm_motion->motion[LIST_0][jj][ii].mv[1];
+          p_motion->motion[LIST_1][j][i].mv[0] = p_frm_motion->motion[LIST_1][jj][ii].mv[0];
+          p_motion->motion[LIST_1][j][i].mv[1] = p_frm_motion->motion[LIST_1][jj][ii].mv[1];
+
+          // Scaling of references is done here since it will not affect spatial direct (2*0 =0)
+
+          if (p_frm_motion->motion[LIST_0][jj][ii].ref_idx == -1)
+          {
+            p_motion->motion[LIST_0][j][i].ref_idx = -1;
+            p_motion->motion[LIST_0][j][i].ref_pic_id = UNDEFINED_REFERENCE; // TODO: UNDEFINED_REFERENCE ?
+          }
+          else
+          {
+            p_motion->motion[LIST_0][j][i].ref_idx = p_frm_motion->motion[LIST_0][jj][ii].ref_idx ;
+            p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion [LIST_0][jj][ii].ref_id;
+          }
+
+          if (p_frm_motion->motion[LIST_1][jj][ii].ref_idx == -1)
+          {
+            p_motion->motion[LIST_1][j][i].ref_idx = -1;
+            p_motion->motion[LIST_1][j][i].ref_pic_id = UNDEFINED_REFERENCE; // TODO: UNDEFINED_REFERENCE ?
+          }
+          else
+          {
+            p_motion->motion[LIST_1][j][i].ref_idx = p_frm_motion->motion[LIST_1][jj][ii].ref_idx;
+            p_motion->motion[LIST_1][j][i].ref_pic_id = p_frm_motion->motion [LIST_1][jj][ii].ref_id;
+          }
+
+          p->is_long_term = fs->is_long_term;
+
+          if (currSlice->direct_spatial_mv_pred_flag == 1)
+          {
+            p_motion->moving_block[j][i] = // 0 only when the block counts as static per the tests below
+              !((!p->is_long_term
+              && ((p_motion->motion[LIST_0][j][i].ref_idx == 0)
+              &&  (iabs(p_motion->motion[LIST_0][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p_motion->motion[LIST_0][j][i].mv[1])>>1 == 0)))
+              || ((p_motion->motion[LIST_0][j][i].ref_idx == -1)
+              &&  (p_motion->motion[LIST_1][j][i].ref_idx == 0)
+              &&  (iabs(p_motion->motion[LIST_1][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p_motion->motion[LIST_1][j][i].mv[1])>>1 == 0)));
+          }
+        }
+        else
+        {
+          p->bottom.motion[LIST_0][j][i].mv[0] = fs_bottom->motion.motion[LIST_0][jj][ii].mv[0]; // MBAFF: fill p->bottom from fs_bottom ...
+          p->bottom.motion[LIST_0][j][i].mv[1] = fs_bottom->motion.motion[LIST_0][jj][ii].mv[1];
+          p->bottom.motion[LIST_1][j][i].mv[0] = fs_bottom->motion.motion[LIST_1][jj][ii].mv[0];
+          p->bottom.motion[LIST_1][j][i].mv[1] = fs_bottom->motion.motion[LIST_1][jj][ii].mv[1];
+          p->bottom.motion[LIST_0][j][i].ref_idx = fs_bottom->motion.motion[LIST_0][jj][ii].ref_idx;
+          p->bottom.motion[LIST_1][j][i].ref_idx = fs_bottom->motion.motion[LIST_1][jj][ii].ref_idx;
+          p->bottom.motion[LIST_0][j][i].ref_pic_id = fs_bottom->motion.motion[LIST_0][jj][ii].ref_id;
+          p->bottom.motion[LIST_1][j][i].ref_pic_id = fs_bottom->motion.motion[LIST_1][jj][ii].ref_id;
+
+          if (currSlice->direct_spatial_mv_pred_flag == 1)
+          {
+            p->bottom.moving_block[j][i] =
+              !((!fs_bottom->is_long_term
+              && ((p->bottom.motion[LIST_0][j][i].ref_idx == 0)
+              &&  (iabs(p->bottom.motion[LIST_0][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->bottom.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+              || ((p->bottom.motion[LIST_0][j][i].ref_idx == -1)
+              &&  (p->bottom.motion[LIST_1][j][i].ref_idx == 0)
+              &&  (iabs(p->bottom.motion[LIST_1][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->bottom.motion[LIST_1][j][i].mv[1])>>1 == 0)));
+          }
+
+          p->top.motion[LIST_0][j][i].mv[0] = fs_top->motion.motion[LIST_0][jj][ii].mv[0]; // ... and p->top from fs_top
+          p->top.motion[LIST_0][j][i].mv[1] = fs_top->motion.motion[LIST_0][jj][ii].mv[1];
+          p->top.motion[LIST_1][j][i].mv[0] = fs_top->motion.motion[LIST_1][jj][ii].mv[0];
+          p->top.motion[LIST_1][j][i].mv[1] = fs_top->motion.motion[LIST_1][jj][ii].mv[1];
+          p->top.motion[LIST_0][j][i].ref_idx = fs_top->motion.motion[LIST_0][jj][ii].ref_idx;
+          p->top.motion[LIST_1][j][i].ref_idx = fs_top->motion.motion[LIST_1][jj][ii].ref_idx;
+          p->top.motion[LIST_0][j][i].ref_pic_id = fs_top->motion.motion[LIST_0][jj][ii].ref_id;
+          p->top.motion[LIST_1][j][i].ref_pic_id = fs_top->motion.motion[LIST_1][jj][ii].ref_id;
+
+          if (currSlice->direct_spatial_mv_pred_flag == 1)
+          {
+            p->top.moving_block[j][i] =
+              !((!fs_top->is_long_term
+              && ((p->top.motion[LIST_0][j][i].ref_idx == 0)
+              &&  (iabs(p->top.motion[LIST_0][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->top.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+              || ((p->top.motion[LIST_0][j][i].ref_idx == -1)
+              &&  (p->top.motion[LIST_1][j][i].ref_idx == 0)
+              &&  (iabs(p->top.motion[LIST_1][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->top.motion[LIST_1][j][i].mv[1])>>1 == 0)));
+          }
+
+          if ((currSlice->direct_spatial_mv_pred_flag == 0 ) && !p_frm_motion->field_frame[2*j][i]) // temporal direct and field_frame is 0 at row 2*j: halve the vertical mv
+          {
+            p->top.motion[LIST_0][j][i].mv[1] /= 2;
+            p->top.motion[LIST_1][j][i].mv[1] /= 2;
+            p->bottom.motion[LIST_0][j][i].mv[1] /= 2;
+            p->bottom.motion[LIST_1][j][i].mv[1] /= 2;
+          }
+        }
+      }
+    }
+  }
+
+  //if (!active_sps->frame_mbs_only_flag || active_sps->direct_8x8_inference_flag)
+  if (!active_sps->frame_mbs_only_flag)
+  {
+    //! Use inference flag to remap mvs/references
+    //! Frame with field co-located
+    if (!p_Vid->structure)
+    {
+      for (j=0 ; j < fs_size_y4; j++)
+      {
+        jdiv = (j >> 1);
+        jj   = jdiv + ((j >> 3) << 2);
+        for (i = 0 ; i < fs_size_x4; i++)
+        {
+          if (p_frm_motion->field_frame[j][i])
+          {
+            if (iabs(p_Vid->dec_picture->poc - fs->bottom_field->poc) > iabs(p_Vid->dec_picture->poc - fs->top_field->poc)) // top field strictly closer in poc; a tie selects bottom
+            {
+              p_motion->motion[LIST_0][j][i].mv[0] = fs->top_field->motion.motion[LIST_0][jdiv][i].mv[0];
+              p_motion->motion[LIST_0][j][i].mv[1] = fs->top_field->motion.motion[LIST_0][jdiv][i].mv[1] ;
+              p_motion->motion[LIST_1][j][i].mv[0] = fs->top_field->motion.motion[LIST_1][jdiv][i].mv[0];
+              p_motion->motion[LIST_1][j][i].mv[1] = fs->top_field->motion.motion[LIST_1][jdiv][i].mv[1] ;
+
+              p_motion->motion[LIST_0][j][i].ref_idx = fs->top_field->motion.motion[LIST_0][jdiv][i].ref_idx;
+              p_motion->motion[LIST_1][j][i].ref_idx    = fs->top_field->motion.motion[LIST_1][jdiv][i].ref_idx;
+              p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion[LIST_0][jj][i].ref_id;
+              p_motion->motion[LIST_1][j][i].ref_pic_id = p_frm_motion->motion[LIST_1][jj][i].ref_id;
+              p->is_long_term             = fs->top_field->is_long_term;
+            }
+            else
+            {
+              p_motion->motion[LIST_0][j][i].mv[0] = fs->bottom_field->motion.motion[LIST_0][jdiv][i].mv[0];
+              p_motion->motion[LIST_0][j][i].mv[1] = fs->bottom_field->motion.motion[LIST_0][jdiv][i].mv[1] ;
+              p_motion->motion[LIST_1][j][i].mv[0] = fs->bottom_field->motion.motion[LIST_1][jdiv][i].mv[0];
+              p_motion->motion[LIST_1][j][i].mv[1] = fs->bottom_field->motion.motion[LIST_1][jdiv][i].mv[1] ;
+
+              p_motion->motion[LIST_0][j][i].ref_idx  = fs->bottom_field->motion.motion[LIST_0][jdiv][i].ref_idx;
+              p_motion->motion[LIST_1][j][i].ref_idx  = fs->bottom_field->motion.motion[LIST_1][jdiv][i].ref_idx;
+              p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion[LIST_0][jj + 4][i].ref_id;
+              p_motion->motion[LIST_1][j][i].ref_pic_id = p_frm_motion->motion[LIST_1][jj + 4][i].ref_id;
+              p->is_long_term             = fs->bottom_field->is_long_term;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  p->is_long_term = fs->is_long_term; // overrides any per-block is_long_term value assigned in the passes above
+
+  if (!active_sps->frame_mbs_only_flag || active_sps->direct_8x8_inference_flag)
+  {
+    if (currSlice->direct_spatial_mv_pred_flag == 1)
+    {
+      for (j=0 ; j < (fs->size_y>>2); j++)
+      {
+        jj = RSD(j);
+        for (i=0 ; i < (fs->size_x>>2); i++)
+        {
+          ii = RSD(i);
+
+					p_motion->motion[LIST_0][j][i] = p_motion->motion[LIST_0][jj][ii]; // in-place: take the entry at the RSD()-remapped position
+					p_motion->motion[LIST_1][j][i] = p_motion->motion[LIST_1][jj][ii];
+          //p_motion->motion[LIST_0][j][i].mv[0]=p_motion->motion[LIST_0][jj][ii].mv[0];
+          //p_motion->motion[LIST_0][j][i].mv[1]=p_motion->motion[LIST_0][jj][ii].mv[1];
+          //p_motion->motion[LIST_1][j][i].mv[0]=p_motion->motion[LIST_1][jj][ii].mv[0];
+          //p_motion->motion[LIST_1][j][i].mv[1]=p_motion->motion[LIST_1][jj][ii].mv[1];
+
+          //p_motion->motion[LIST_0][j][i].ref_idx=p_motion->motion[LIST_0][jj][ii].ref_idx;
+          //p_motion->motion[LIST_1][j][i].ref_idx=p_motion->motion[LIST_1][jj][ii].ref_idx;
+          //p_motion->motion[LIST_0][j][i].ref_pic_id = p_motion->motion[LIST_0][jj][ii].ref_pic_id;
+          //p_motion->motion[LIST_1][j][i].ref_pic_id = p_motion->motion[LIST_1][jj][ii].ref_pic_id;
+
+          p_motion->moving_block[j][i]= (byte) (
+            !((!p->is_long_term
+            && ((p_motion->motion[LIST_0][j][i].ref_idx == 0)
+            &&  (iabs(p_motion->motion[LIST_0][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p_motion->motion[LIST_0][j][i].mv[1])>>1 == 0)))
+            || ((p_motion->motion[LIST_0][j][i].ref_idx == -1)
+            &&  (p_motion->motion[LIST_1][j][i].ref_idx == 0)
+            &&  (iabs(p_motion->motion[LIST_1][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p_motion->motion[LIST_1][j][i].mv[1])>>1 == 0))));
+        }
+      }
+    }
+    else
+    {
+      for (j=0 ; j < (fs->size_y>>2); j++)
+      {
+        jj = RSD(j);
+        for (i=0 ; i < (fs->size_x>>2); i++)
+        {
+          ii = RSD(i);
+					          p_motion->motion[LIST_0][j][i]=p_motion->motion[LIST_0][jj][ii]; // same in-place remap, without the moving_block update
+										p_motion->motion[LIST_1][j][i]=p_motion->motion[LIST_1][jj][ii];
+
+          //p_motion->mv[LIST_0][j][i][0]=p_motion->mv[LIST_0][jj][ii][0];
+          //p_motion->mv[LIST_0][j][i][1]=p_motion->mv[LIST_0][jj][ii][1];
+          //p_motion->mv[LIST_1][j][i][0]=p_motion->mv[LIST_1][jj][ii][0];
+          //p_motion->mv[LIST_1][j][i][1]=p_motion->mv[LIST_1][jj][ii][1];
+
+          //p_motion->ref_idx[LIST_0][j][i]=p_motion->ref_idx[LIST_0][jj][ii];
+          //p_motion->ref_idx[LIST_1][j][i]=p_motion->ref_idx[LIST_1][jj][ii];
+          //p_motion->ref_pic_id[LIST_0][j][i] = p_motion->ref_pic_id[LIST_0][jj][ii];
+          //p_motion->ref_pic_id[LIST_1][j][i] = p_motion->ref_pic_id[LIST_1][jj][ii];
+        }
+      }
+    }
+  }
+  else
+  {
+		for (j=0 ; j < fs_size_y4; j++) // frame_mbs_only without 8x8 inference: plain per-block copy of both lists
+		{
+			for (i=0 ; i < fs_size_x4; i++)
+			{
+				memcpy(&p_motion->motion[LIST_0][j][i].mv, &p_frm_motion->motion[LIST_0][j][i].mv, sizeof(MotionVector));
+				p_motion->motion[LIST_0][j][i].ref_idx=    p_frm_motion->motion[LIST_0][j][i].ref_idx;
+				p_motion->motion[LIST_0][j][i].ref_pic_id = p_frm_motion->motion [LIST_0][j][i].ref_id;
+				memcpy(&p_motion->motion[LIST_1][j][i].mv, &p_frm_motion->motion[LIST_1][j][i].mv, sizeof(MotionVector));
+				p_motion->motion[LIST_1][j][i].ref_idx=    p_frm_motion->motion[LIST_1][j][i].ref_idx;
+				p_motion->motion[LIST_1][j][i].ref_pic_id= p_frm_motion->motion [LIST_1][j][i].ref_id;
+			}
+		}
+
+    if (currSlice->direct_spatial_mv_pred_flag == 1)
+    {
+      for (j=0 ; j < fs_size_y4; j++)
+      {
+        for (i=0 ; i < fs_size_x4; i++)
+        {
+          p_motion->moving_block[j][i]=
+            !((!p->is_long_term
+            && ((p_motion->motion[LIST_0][j][i].ref_idx == 0)
+            &&  (iabs(p_motion->motion[LIST_0][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p_motion->motion[LIST_0][j][i].mv[1])>>1 == 0)))
+            || ((p_motion->motion[LIST_0][j][i].ref_idx == -1)
+            &&  (p_motion->motion[LIST_1][j][i].ref_idx == 0)
+            &&  (iabs(p_motion->motion[LIST_1][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p_motion->motion[LIST_1][j][i].mv[1])>>1 == 0)));
+        }
+      }
+    }
+  }
+
+  if (currSlice->direct_spatial_mv_pred_flag == 0)
+  {    
+    if (currSlice->mb_aff_frame_flag || !p_Vid->structure)
+    {
+      for (j=0 ; j < fs_size_y4; j++)
+      {
+        for (i=0 ; i < fs_size_x4 ; i++)
+        {
+          if (p_frm_motion->field_frame[j][i])
+          {
+            p_motion->motion[LIST_0][j][i].mv[1] *= 2; // field_frame set: double the vertical mv component
+            p_motion->motion[LIST_1][j][i].mv[1] *= 2;
+          }
+        }
+      }
+    }
+    else  if (p_Vid->structure)
+    {
+      for (j=0 ; j < fs_size_y4; j++)
+      {
+        for (i=0 ; i < fs_size_x4 ; i++)
+        {
+          if (!p_frm_motion->field_frame[j][i])
+          {
+            p_motion->motion[LIST_0][j][i].mv[1] /= 2; // field_frame clear: halve the vertical mv component
+            p_motion->motion[LIST_1][j][i].mv[1] /= 2;
+          }
+        }
+      }
+    }
+
+    for (j=0; j<2 + (currSlice->mb_aff_frame_flag * 4);j+=2) // j = 0 only, or j = 0, 2, 4 when mb_aff_frame_flag is 1
+    {
+      for (i=0; i<p_Vid->listXsize[j];i++)
+      {
+        int prescale, iTRb, iTRp;
+
+        if (j==0)
+        {
+          iTRb = iClip3( -128, 127, p_Vid->dec_picture->poc - listX[LIST_0 + j][i]->poc );
+        }
+        else if (j == 2)
+        {
+          iTRb = iClip3( -128, 127, p_Vid->dec_picture->top_poc - listX[LIST_0 + j][i]->poc );
+        }
+        else
+        {
+          iTRb = iClip3( -128, 127, p_Vid->dec_picture->bottom_poc - listX[LIST_0 + j][i]->poc );
+        }
+
+        iTRp = iClip3( -128, 127,  listX[LIST_1 + j][0]->poc - listX[LIST_0 + j][i]->poc);
+
+        if (iTRp!=0)
+        {
+          prescale = ( 16384 + iabs( iTRp / 2 ) ) / iTRp;
+          currSlice->mvscale[j][i] = iClip3( -1024, 1023, ( iTRb * prescale + 32 ) >> 6 ) ;
+        }
+        else
+        {
+          currSlice->mvscale[j][i] = 9999; // iTRp == 0, so no division; NOTE(review): 9999 looks like a "no scaling" marker - confirm with the code that reads mvscale
+        }
+      }
+    }
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Compute co-located motion info
+ *    for 4:4:4 Independent mode
+ *
+ ************************************************************************
+ */
+
+void compute_colocated_JV(Slice *currSlice, ColocatedParams* p, StorablePicture **listX[6])
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+  StorablePicture *fs, *fs_top, *fs_bottom;
+  int i,j, ii, jj, jdiv;
+  int np = p_Vid->colour_plane_id;
+
+  fs_top = fs_bottom = fs = listX[LIST_1 ][0];
+
+  if (currSlice->mb_aff_frame_flag)
+  {
+    fs_top= listX[LIST_1 + 2][0];
+    fs_bottom= listX[LIST_1 + 4][0];
+  }
+  else
+  {
+    if (p_Vid->field_pic_flag)
+    {
+      if ((p_Vid->structure != fs->structure) && (fs->coded_frame))
+      {
+        if (p_Vid->structure==TOP_FIELD)
+        {
+          fs_top=fs_bottom=fs = listX[LIST_1 ][0]->top_field;
+        }
+        else
+        {
+          fs_top=fs_bottom=fs = listX[LIST_1 ][0]->bottom_field;
+        }
+      }
+    }
+  }
+
+  if (!active_sps->frame_mbs_only_flag || active_sps->direct_8x8_inference_flag)
+  {
+    for (j=0 ; j<fs->size_y/4 ; j++)
+    {      
+      jdiv = j/2;
+      jj = j/2 + 4 * (j/8);
+      for (i=0 ; i<fs->size_x/4 ; i++)
+      {
+
+        if (currSlice->mb_aff_frame_flag && fs->motion.field_frame[j][i])
+        {
+          //! Assign frame buffers for field MBs
+          //! Check whether we should use top or bottom field mvs.
+          //! Depending on the assigned poc values.
+
+          if (iabs(p_Vid->dec_picture->poc - fs_bottom->poc)> iabs(p_Vid->dec_picture->poc -fs_top->poc) )
+          {
+            p->frame.motion[LIST_0][j][i].mv[0]      = fs_top->JVmotion[np].motion[LIST_0][jdiv][i].mv[0];
+            p->frame.motion[LIST_0][j][i].mv[1]      = fs_top->JVmotion[np].motion[LIST_0][jdiv][i].mv[1] ;
+            p->frame.motion[LIST_1][j][i].mv[0]      = fs_top->JVmotion[np].motion[LIST_1][jdiv][i].mv[0];
+            p->frame.motion[LIST_1][j][i].mv[1]      = fs_top->JVmotion[np].motion[LIST_1][jdiv][i].mv[1] ;
+            p->frame.motion[LIST_0][j][i].ref_idx    = fs_top->JVmotion[np].motion[LIST_0][jdiv][i].ref_idx;
+            p->frame.motion[LIST_1][j][i].ref_idx    = fs_top->JVmotion[np].motion[LIST_1][jdiv][i].ref_idx;
+            p->frame.motion[LIST_0][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_0][jj][i].ref_id;
+            p->frame.motion[LIST_1][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_1][jj][i].ref_id;
+
+            p->is_long_term             = fs_top->is_long_term;
+          }
+          else
+          {
+            p->frame.motion[LIST_0][j][i].mv[0]      = fs_bottom->JVmotion[np].motion[LIST_0][jdiv][i].mv[0];
+            p->frame.motion[LIST_0][j][i].mv[1]      = fs_bottom->JVmotion[np].motion[LIST_0][jdiv][i].mv[1] ;
+            p->frame.motion[LIST_1][j][i].mv[0]      = fs_bottom->JVmotion[np].motion[LIST_1][jdiv][i].mv[0];
+            p->frame.motion[LIST_1][j][i].mv[1]      = fs_bottom->JVmotion[np].motion[LIST_1][jdiv][i].mv[1] ;
+            p->frame.motion[LIST_0][j][i].ref_idx    = fs_bottom->JVmotion[np].motion[LIST_0][jdiv][i].ref_idx;
+            p->frame.motion[LIST_1][j][i].ref_idx    = fs_bottom->JVmotion[np].motion[LIST_1][jdiv][i].ref_idx;
+            p->frame.motion[LIST_0][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_0][jj + 4][i].ref_id;
+            p->frame.motion[LIST_1][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_1][jj + 4][i].ref_id;
+
+            p->is_long_term             = fs_bottom->is_long_term;
+          }
+        }
+        else
+        {
+          p->frame.motion[LIST_0][j][i].mv[0]      = fs->JVmotion[np].motion[LIST_0][j][i].mv[0];
+          p->frame.motion[LIST_0][j][i].mv[1]      = fs->JVmotion[np].motion[LIST_0][j][i].mv[1] ;
+          p->frame.motion[LIST_1][j][i].mv[0]      = fs->JVmotion[np].motion[LIST_1][j][i].mv[0];
+          p->frame.motion[LIST_1][j][i].mv[1]      = fs->JVmotion[np].motion[LIST_1][j][i].mv[1] ;
+          p->frame.motion[LIST_0][j][i].ref_idx    = fs->JVmotion[np].motion[LIST_0][j][i].ref_idx;
+          p->frame.motion[LIST_1][j][i].ref_idx    = fs->JVmotion[np].motion[LIST_1][j][i].ref_idx;
+          p->frame.motion[LIST_0][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_0][j][i].ref_id;
+          p->frame.motion[LIST_1][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_1][j][i].ref_id;
+
+          p->is_long_term             = fs->is_long_term;
+        }
+      }
+    }
+  }
+
+
+  //! Generate field MVs from Frame MVs
+  if (p_Vid->structure || currSlice->mb_aff_frame_flag)
+  {
+    for (j=0 ; j<fs->size_y/8 ; j++)
+    {
+      jj = RSD(j);
+      for (i=0 ; i<fs->size_x/4 ; i++)
+      {
+        ii = RSD(i);
+        //! Do nothing if macroblock as field coded in MB-AFF
+        if (!currSlice->mb_aff_frame_flag )
+        {
+          p->frame.motion[LIST_0][j][i].mv[0] = fs->JVmotion[np].motion[LIST_0][jj][ii].mv[0];
+          p->frame.motion[LIST_0][j][i].mv[1] = fs->JVmotion[np].motion[LIST_0][jj][ii].mv[1];
+          p->frame.motion[LIST_1][j][i].mv[0] = fs->JVmotion[np].motion[LIST_1][jj][ii].mv[0];
+          p->frame.motion[LIST_1][j][i].mv[1] = fs->JVmotion[np].motion[LIST_1][jj][ii].mv[1];
+
+          // Scaling of references is done here since it will not affect spatial direct (2*0 =0)
+
+          if (fs->JVmotion[np].motion[LIST_0][jj][ii].ref_idx == -1)
+          {
+            p->frame.motion[LIST_0][j][i].ref_idx = -1;
+            p->frame.motion[LIST_0][j][i].ref_pic_id = UNDEFINED_REFERENCE; // TODO: UNDEFINED_REFERENCE ?
+          }
+          else
+          {
+            p->frame.motion[LIST_0][j][i].ref_idx = fs->JVmotion[np].motion[LIST_0][jj][ii].ref_idx;
+            p->frame.motion[LIST_0][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_0][jj][ii].ref_id;
+          }
+
+          if (fs->JVmotion[np].motion[LIST_1][jj][ii].ref_idx == -1)
+          {
+            p->frame.motion[LIST_1][j][i].ref_idx = -1;
+            p->frame.motion[LIST_1][j][i].ref_pic_id = UNDEFINED_REFERENCE; // TODO: UNDEFINED_REFERENCE ?
+          }
+          else
+          {
+            p->frame.motion[LIST_1][j][i].ref_idx = fs->JVmotion[np].motion[LIST_1][jj][ii].ref_idx;
+            p->frame.motion[LIST_1][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_1][jj][ii].ref_id;
+          }
+
+          p->is_long_term = fs->is_long_term;
+
+          if (currSlice->direct_spatial_mv_pred_flag == 1)
+          {
+            p->frame.moving_block[j][i] =
+              !((!p->is_long_term
+              && ((p->frame.motion[LIST_0][j][i].ref_idx == 0)
+              &&  (iabs(p->frame.motion[LIST_0][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->frame.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+              || ((p->frame.motion[LIST_0][j][i].ref_idx == -1)
+              &&  (p->frame.motion[LIST_1][j][i].ref_idx == 0)
+              &&  (iabs(p->frame.motion[LIST_1][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->frame.motion[LIST_1][j][i].mv[1])>>1 == 0)));
+          }
+        }
+        else
+        {
+          p->bottom.motion[LIST_0][j][i].mv[0] = fs_bottom->JVmotion[np].motion[LIST_0][jj][ii].mv[0];
+          p->bottom.motion[LIST_0][j][i].mv[1] = fs_bottom->JVmotion[np].motion[LIST_0][jj][ii].mv[1];
+          p->bottom.motion[LIST_1][j][i].mv[0] = fs_bottom->JVmotion[np].motion[LIST_1][jj][ii].mv[0];
+          p->bottom.motion[LIST_1][j][i].mv[1] = fs_bottom->JVmotion[np].motion[LIST_1][jj][ii].mv[1];
+          p->bottom.motion[LIST_0][j][i].ref_idx = fs_bottom->JVmotion[np].motion[LIST_0][jj][ii].ref_idx;
+          p->bottom.motion[LIST_1][j][i].ref_idx = fs_bottom->JVmotion[np].motion[LIST_1][jj][ii].ref_idx;
+          p->bottom.motion[LIST_0][j][i].ref_pic_id = fs_bottom->JVmotion[np].motion[LIST_0][jj][ii].ref_id;
+          p->bottom.motion[LIST_1][j][i].ref_pic_id = fs_bottom->JVmotion[np].motion[LIST_1][jj][ii].ref_id;
+
+          if (currSlice->direct_spatial_mv_pred_flag == 1)
+          {
+            p->bottom.moving_block[j][i] =
+              !((!fs_bottom->is_long_term
+              && ((p->bottom.motion[LIST_0][j][i].ref_idx == 0)
+              &&  (iabs(p->bottom.motion[LIST_0][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->bottom.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+              || ((p->bottom.motion[LIST_0][j][i].ref_idx == -1)
+              &&  (p->bottom.motion[LIST_1][j][i].ref_idx == 0)
+              &&  (iabs(p->bottom.motion[LIST_1][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->bottom.motion[LIST_1][j][i].mv[1])>>1 == 0)));
+          }
+
+          p->top.motion[LIST_0][j][i].mv[0] = fs_top->JVmotion[np].motion[LIST_0][jj][ii].mv[0];
+          p->top.motion[LIST_0][j][i].mv[1] = fs_top->JVmotion[np].motion[LIST_0][jj][ii].mv[1];
+          p->top.motion[LIST_1][j][i].mv[0] = fs_top->JVmotion[np].motion[LIST_1][jj][ii].mv[0];
+          p->top.motion[LIST_1][j][i].mv[1] = fs_top->JVmotion[np].motion[LIST_1][jj][ii].mv[1];
+          p->top.motion[LIST_0][j][i].ref_idx = fs_top->JVmotion[np].motion[LIST_0][jj][ii].ref_idx;
+          p->top.motion[LIST_1][j][i].ref_idx = fs_top->JVmotion[np].motion[LIST_1][jj][ii].ref_idx;
+          p->top.motion[LIST_0][j][i].ref_pic_id = fs_top->JVmotion[np].motion[LIST_0][jj][ii].ref_id;
+          p->top.motion[LIST_1][j][i].ref_pic_id = fs_top->JVmotion[np].motion[LIST_1][jj][ii].ref_id;
+
+          if (currSlice->direct_spatial_mv_pred_flag == 1)
+          {
+            p->top.moving_block[j][i] =
+              !((!fs_top->is_long_term
+              && ((p->top.motion[LIST_0][j][i].ref_idx == 0)
+              &&  (iabs(p->top.motion[LIST_0][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->top.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+              || ((p->top.motion[LIST_0][j][i].ref_idx == -1)
+              &&  (p->top.motion[LIST_1][j][i].ref_idx == 0)
+              &&  (iabs(p->top.motion[LIST_1][j][i].mv[0])>>1 == 0)
+              &&  (iabs(p->top.motion[LIST_1][j][i].mv[1])>>1 == 0)));
+          }
+
+          if ((currSlice->direct_spatial_mv_pred_flag == 0 ) && !fs->motion.field_frame[2*j][i])
+          {
+            p->top.motion[LIST_0][j][i].mv[1] /= 2;
+            p->top.motion[LIST_1][j][i].mv[1] /= 2;
+            p->bottom.motion[LIST_0][j][i].mv[1] /= 2;
+            p->bottom.motion[LIST_1][j][i].mv[1] /= 2;
+          }
+
+        }
+      }
+    }
+  }
+
+
+  if (!active_sps->frame_mbs_only_flag || active_sps->direct_8x8_inference_flag)
+  {
+    //! Use inference flag to remap mvs/references
+    //! Frame with field co-located
+
+    if (!p_Vid->structure)
+    {
+      for (j=0 ; j<fs->size_y/4 ; j++)
+      {
+        jdiv = j/2;
+        jj = j/2 + 4*(j/8);
+        for (i=0 ; i<fs->size_x/4 ; i++)
+        {
+
+          if (fs->motion.field_frame[j][i])
+          {
+            if (iabs(p_Vid->dec_picture->poc - fs->bottom_field->poc) > iabs(p_Vid->dec_picture->poc - fs->top_field->poc))
+            {
+              p->frame.motion[LIST_0][j][i].mv[0] = fs->top_field->JVmotion[np].motion[LIST_0][jdiv][i].mv[0];
+              p->frame.motion[LIST_0][j][i].mv[1] = fs->top_field->JVmotion[np].motion[LIST_0][jdiv][i].mv[1] ;
+              p->frame.motion[LIST_1][j][i].mv[0] = fs->top_field->JVmotion[np].motion[LIST_1][jdiv][i].mv[0];
+              p->frame.motion[LIST_1][j][i].mv[1] = fs->top_field->JVmotion[np].motion[LIST_1][jdiv][i].mv[1] ;
+
+              p->frame.motion[LIST_0][j][i].ref_idx  = fs->top_field->JVmotion[np].motion[LIST_0][jdiv][i].ref_idx;
+              p->frame.motion[LIST_1][j][i].ref_idx  = fs->top_field->JVmotion[np].motion[LIST_1][jdiv][i].ref_idx;
+              p->frame.motion[LIST_0][j][i].ref_pic_id   = fs->JVmotion[np].motion[LIST_0][jj][i].ref_id;
+              p->frame.motion[LIST_1][j][i].ref_pic_id   = fs->JVmotion[np].motion[LIST_1][jj][i].ref_id;
+              p->is_long_term               = fs->top_field->is_long_term;
+            }
+            else
+            {
+              p->frame.motion[LIST_0][j][i].mv[0] = fs->bottom_field->JVmotion[np].motion[LIST_0][jdiv][i].mv[0];
+              p->frame.motion[LIST_0][j][i].mv[1] = fs->bottom_field->JVmotion[np].motion[LIST_0][jdiv][i].mv[1] ;
+              p->frame.motion[LIST_1][j][i].mv[0] = fs->bottom_field->JVmotion[np].motion[LIST_1][jdiv][i].mv[0];
+              p->frame.motion[LIST_1][j][i].mv[1] = fs->bottom_field->JVmotion[np].motion[LIST_1][jdiv][i].mv[1] ;
+
+              p->frame.motion[LIST_0][j][i].ref_idx  = fs->bottom_field->JVmotion[np].motion[LIST_0][jdiv][i].ref_idx;
+              p->frame.motion[LIST_1][j][i].ref_idx  = fs->bottom_field->JVmotion[np].motion[LIST_1][jdiv][i].ref_idx;
+              p->frame.motion[LIST_0][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_0][jj + 4][i].ref_id;
+              p->frame.motion[LIST_1][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_1][jj + 4][i].ref_id;
+              p->is_long_term             = fs->bottom_field->is_long_term;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  p->is_long_term = fs->is_long_term;
+
+  if (!active_sps->frame_mbs_only_flag || active_sps->direct_8x8_inference_flag)
+  {
+    for (j=0 ; j<fs->size_y/4 ; j++)
+    {
+      jj = RSD(j);
+      for (i=0 ; i<fs->size_x/4 ; i++)
+      {
+        ii = RSD(i);
+
+				p->frame.motion[LIST_0][j][i] = p->frame.motion[LIST_0][jj][ii];
+				p->frame.motion[LIST_1][j][i] = p->frame.motion[LIST_1][jj][ii];
+        //p->frame.mv[LIST_0][j][i][0] = p->frame.mv[LIST_0][jj][ii][0];
+        //p->frame.mv[LIST_0][j][i][1] = p->frame.mv[LIST_0][jj][ii][1];
+        //p->frame.mv[LIST_1][j][i][0] = p->frame.mv[LIST_1][jj][ii][0];
+        //p->frame.mv[LIST_1][j][i][1] = p->frame.mv[LIST_1][jj][ii][1];
+
+        //p->frame.ref_idx[LIST_0][j][i]=p->frame.ref_idx[LIST_0][jj][ii];
+        //p->frame.ref_idx[LIST_1][j][i]=p->frame.ref_idx[LIST_1][jj][ii];
+        //p->frame.ref_pic_id[LIST_0][j][i] = p->frame.ref_pic_id[LIST_0][jj][ii];
+        //p->frame.ref_pic_id[LIST_1][j][i] = p->frame.ref_pic_id[LIST_1][jj][ii];
+
+        if (currSlice->direct_spatial_mv_pred_flag == 1)
+        {
+          p->frame.moving_block[j][i]= (byte) (
+            !((!p->is_long_term
+            && ((p->frame.motion[LIST_0][j][i].ref_idx == 0)
+            &&  (iabs(p->frame.motion[LIST_0][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p->frame.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+            || ((p->frame.motion[LIST_0][j][i].ref_idx == -1)
+            &&  (p->frame.motion[LIST_1][j][i].ref_idx == 0)
+            &&  (iabs(p->frame.motion[LIST_1][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p->frame.motion[LIST_1][j][i].mv[1])>>1 == 0))));
+        }
+      }
+    }
+  }
+  else
+  {
+    for (j=0 ; j<fs->size_y/4 ; j++)
+    {
+      jj = RSD(j);
+      for (i=0 ; i<fs->size_x/4 ; i++)
+      {
+        ii = RSD(i);
+        //! Use inference flag to remap mvs/references
+        p->frame.motion[LIST_0][j][i].mv[0] = fs->JVmotion[np].motion[LIST_0][j][i].mv[0];
+        p->frame.motion[LIST_0][j][i].mv[1] = fs->JVmotion[np].motion[LIST_0][j][i].mv[1];
+        p->frame.motion[LIST_1][j][i].mv[0] = fs->JVmotion[np].motion[LIST_1][j][i].mv[0];
+        p->frame.motion[LIST_1][j][i].mv[1] = fs->JVmotion[np].motion[LIST_1][j][i].mv[1];
+
+        p->frame.motion[LIST_0][j][i].ref_idx = fs->JVmotion[np].motion[LIST_0][j][i].ref_idx;
+        p->frame.motion[LIST_1][j][i].ref_idx = fs->JVmotion[np].motion[LIST_1][j][i].ref_idx;
+        p->frame.motion[LIST_0][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_0][j][i].ref_id;
+        p->frame.motion[LIST_1][j][i].ref_pic_id = fs->JVmotion[np].motion[LIST_1][j][i].ref_id;
+
+        if (currSlice->direct_spatial_mv_pred_flag == 1)
+        {
+          p->frame.moving_block[j][i]= (byte) (
+            !((!p->is_long_term
+            && ((p->frame.motion[LIST_0][j][i].ref_idx == 0)
+            &&  (iabs(p->frame.motion[LIST_0][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p->frame.motion[LIST_0][j][i].mv[1])>>1 == 0)))
+            || ((p->frame.motion[LIST_0][j][i].ref_idx == -1)
+            &&  (p->frame.motion[LIST_1][j][i].ref_idx == 0)
+            &&  (iabs(p->frame.motion[LIST_1][j][i].mv[0])>>1 == 0)
+            &&  (iabs(p->frame.motion[LIST_1][j][i].mv[1])>>1 == 0))));
+        }
+      }
+    }
+  }
+
+
+  if (currSlice->direct_spatial_mv_pred_flag == 0)
+  {
+    for (j=0 ; j<fs->size_y/4 ; j++)
+    {
+      for (i=0 ; i<fs->size_x/4 ; i++)
+      {
+        if ((!currSlice->mb_aff_frame_flag &&!p_Vid->structure && fs->motion.field_frame[j][i]) || (currSlice->mb_aff_frame_flag && fs->motion.field_frame[j][i]))
+        {
+          p->frame.motion[LIST_0][j][i].mv[1] *= 2;
+          p->frame.motion[LIST_1][j][i].mv[1] *= 2;
+        }
+        else  if (p_Vid->structure && !fs->motion.field_frame[j][i])
+        {
+          p->frame.motion[LIST_0][j][i].mv[1] /= 2;
+          p->frame.motion[LIST_1][j][i].mv[1] /= 2;
+        }
+
+      }
+    }
+
+    for (j=0; j<2 + (currSlice->mb_aff_frame_flag * 4);j+=2)
+    {
+      for (i=0; i<p_Vid->listXsize[j];i++)
+      {
+        int prescale, iTRb, iTRp;
+
+        if (j==0)
+        {
+          iTRb = iClip3( -128, 127, p_Vid->dec_picture->poc - listX[LIST_0 + j][i]->poc );
+        }
+        else if (j == 2)
+        {
+          iTRb = iClip3( -128, 127, p_Vid->dec_picture->top_poc - listX[LIST_0 + j][i]->poc );
+        }
+        else
+        {
+          iTRb = iClip3( -128, 127, p_Vid->dec_picture->bottom_poc - listX[LIST_0 + j][i]->poc );
+        }
+
+        iTRp = iClip3( -128, 127,  listX[LIST_1 + j][0]->poc - listX[LIST_0 + j][i]->poc);
+
+        if (iTRp!=0)
+        {
+          prescale = ( 16384 + iabs( iTRp / 2 ) ) / iTRp;
+          currSlice->mvscale[j][i] = iClip3( -1024, 1023, ( iTRb * prescale + 32 ) >> 6 ) ;
+        }
+        else
+        {
+          currSlice->mvscale[j][i] = 9999;
+        }
+      }
+    }
+  }
+}
+
+void copy_storable_param_JV( VideoParameters *p_Vid, PicMotionParams *JVplane, PicMotionParams *motion )
+{
+  // Number of BLOCK_SIZE x BLOCK_SIZE blocks in the picture; one PicMotion entry is copied per block.
+  int md_size = (p_Vid->height / BLOCK_SIZE) * (p_Vid->width / BLOCK_SIZE);
+
+  // Save the per-block motion data of `motion` into the JV plane buffer.
+  memcpy(JVplane->motion, motion->motion, md_size*sizeof(PicMotion));
+  // field_references is optional in the source; when present, 4*md_size entries are copied.
+  if (motion->field_references)
+    memcpy(JVplane->field_references, motion->field_references, 4*md_size*sizeof(h264_ref_t));
+}
diff --git a/Src/h264dec/ldecod/src/mc_prediction.c b/Src/h264dec/ldecod/src/mc_prediction.c
new file mode 100644
index 00000000..8af3aae8
--- /dev/null
+++ b/Src/h264dec/ldecod/src/mc_prediction.c
@@ -0,0 +1,2420 @@
+
+/*!
+*************************************************************************************
+* \file mc_prediction.c
+*
+* \brief
+*    Functions for motion compensated prediction
+*
+* \author
+*      Main contributors (see contributors.h for copyright, 
+*                         address and affiliation details)
+*      - Alexis Michael Tourapis  <alexismt@ieee.org>
+*
+*************************************************************************************
+*/
+#include "global.h"
+#include "block.h"
+#include "mc_prediction.h"
+#include "mbuffer.h"
+#include "mb_access.h"
+#include "macroblock.h"
+#include "memalloc.h"
+#include "optim.h"
+#include <emmintrin.h>
+
+static const int COEF[6] = { 1, -5, 20, 20, -5, 1 }; // six symmetric filter taps summing to 32; NOTE(review): presumably the sub-pel interpolation kernel - its use is outside this excerpt, confirm
+/*!
+************************************************************************
+* \brief
+*    Single-list prediction: copies a ver_block_size x hor_block_size
+*    block of samples from `block` into mb_pred at row joff, column ioff.
+************************************************************************
+*/
+static inline void mc_prediction(h264_imgpel_macroblock_t mb_pred,
+	int joff,
+	int ver_block_size,
+	int hor_block_size,
+	int ioff,
+	const h264_imgpel_macroblock_t block)
+{
+	h264_imgpel_macroblock_row_t *dst_rows = (h264_imgpel_macroblock_row_t *)(mb_pred[joff]);
+	int row;
+
+	if (hor_block_size == MB_BLOCK_SIZE)
+	{
+		// Full-width block: every row is transferred by a single memcpy.
+		memcpy(&(mb_pred[joff][ioff]), &(block[0][0]), hor_block_size * ver_block_size * sizeof(imgpel));
+		return;
+	}
+
+	// Narrower block: copy hor_block_size samples per row, starting at column ioff.
+	for (row = 0; row < ver_block_size; row++)
+		memcpy(&dst_rows[row][ioff], &(block[row][0]), hor_block_size * sizeof(imgpel));
+}
+
+/*!
+************************************************************************
+* \brief
+*    Single-list weighted prediction, applied in place to mb_pred: every
+*    sample s becomes iClip1(255, shift(wp_scale * s) + wp_offset).
+*    weight_denom > 0 selects rshift_rnd_pos; otherwise rshift_rnd_nonpos
+*    is used with -weight_denom. With H264_IPP the IPP primitive is called.
+************************************************************************
+*/
+static inline void weighted_mc_prediction(h264_imgpel_macroblock_row_t *mb_pred,
+	int ver_block_size,
+	int hor_block_size,
+	int wp_scale,
+	int wp_offset,
+	int weight_denom)
+{
+#ifdef H264_IPP
+	IppiSize roi = {hor_block_size, ver_block_size};
+	ippiUniDirWeightBlock_H264_8u_C1IR(mb_pred[0], sizeof(mb_pred[0]), weight_denom, wp_scale, wp_offset, roi);
+#else
+	int x, y;
+
+	if (weight_denom > 0)
+	{
+		for (y = 0; y < ver_block_size; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < hor_block_size; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_pos((wp_scale * line[x]), weight_denom)  + wp_offset ));
+		}
+	}
+	else
+	{
+		const int shift = -weight_denom; // non-positive denominator: its negation is the shift count
+		for (y = 0; y < ver_block_size; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < hor_block_size; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos((wp_scale * line[x]), shift)  + wp_offset ));
+		}
+	}
+#endif
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    SSE2 single-list weighted prediction for a 16x16 block, in place.
+*    Each row of 16 samples is widened to 16-bit lanes, multiplied by
+*    wp_scale, shifted by weight_denom, offset by wp_offset and packed
+*    back to bytes with unsigned saturation.
+************************************************************************
+*/
+void weighted_mc_prediction16x16_sse2(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i scale = _mm_set1_epi16(wp_scale);
+	const __m128i offset = _mm_set1_epi16(wp_offset);
+	int row;
+
+	if (weight_denom > 0)
+	{
+		// Rounding right shift per lane: (x + (1 << (a-1))) >> a
+		const __m128i shift = _mm_cvtsi32_si128(weight_denom);
+		const __m128i rounding = _mm_set1_epi16((1<<(weight_denom-1)));
+
+		for (row = 0; row < 16; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			const __m128i pixels = _mm_load_si128(line);
+			__m128i lo = _mm_mullo_epi16(_mm_unpacklo_epi8(pixels, zero), scale);
+			__m128i hi = _mm_mullo_epi16(_mm_unpackhi_epi8(pixels, zero), scale);
+
+			lo = _mm_sra_epi16(_mm_add_epi16(lo, rounding), shift);
+			hi = _mm_sra_epi16(_mm_add_epi16(hi, rounding), shift);
+			lo = _mm_add_epi16(lo, offset);
+			hi = _mm_add_epi16(hi, offset);
+
+			// Saturating pack of both halves back to 16 unsigned bytes.
+			_mm_store_si128(line, _mm_packus_epi16(lo, hi));
+		}
+	}
+	else
+	{
+		// weight_denom <= 0: left shift per lane by -weight_denom, no rounding term.
+		const __m128i shift = _mm_cvtsi32_si128(-weight_denom);
+
+		for (row = 0; row < 16; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			const __m128i pixels = _mm_load_si128(line);
+			__m128i lo = _mm_mullo_epi16(_mm_unpacklo_epi8(pixels, zero), scale);
+			__m128i hi = _mm_mullo_epi16(_mm_unpackhi_epi8(pixels, zero), scale);
+
+			lo = _mm_add_epi16(_mm_sll_epi16(lo, shift), offset);
+			hi = _mm_add_epi16(_mm_sll_epi16(hi, shift), offset);
+
+			_mm_store_si128(line, _mm_packus_epi16(lo, hi));
+		}
+	}
+}
+
+#ifdef H264_IPP /* built only with H264_IPP: 16x16 single-list weighted prediction via IPP, in place on mb_pred */
+void weighted_mc_prediction16x16_ipp(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom) {
+	const IppiSize block = {16, 16};
+	const int row_bytes = (int) sizeof(mb_pred[0]); // size of one mb_pred row, passed as the second argument
+	ippiUniDirWeightBlock_H264_8u_C1IR(mb_pred[0], row_bytes, weight_denom, wp_scale, wp_offset, block);
+}
+#endif
+
+/* Plain C single-list weighted prediction for a 16x16 block, applied in place to mb_pred. */
+void weighted_mc_prediction16x16_c(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom)
+{
+	int x, y;
+
+	if (weight_denom > 0)
+	{
+		// Positive denominator: rshift_rnd_pos by weight_denom, then add the offset and clip.
+		for (y = 0; y < 16; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < 16; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_pos((wp_scale * line[x]), weight_denom)  + wp_offset ));
+		}
+	}
+	else
+	{
+		// Otherwise rshift_rnd_nonpos is applied with the negated denominator.
+		const int shift = -weight_denom;
+		for (y = 0; y < 16; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < 16; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos((wp_scale * line[x]), shift)  + wp_offset ));
+		}
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    SSE2 single-list weighted prediction for a 16x8 block (16 samples
+*    wide, 8 rows), in place. Each row is widened to 16-bit lanes,
+*    multiplied by wp_scale, shifted by weight_denom, offset by wp_offset
+*    and packed back to bytes with unsigned saturation. Rows are accessed
+*    with _mm_load_si128 / _mm_store_si128 (aligned 128-bit accesses).
+************************************************************************
+*/
+void weighted_mc_prediction16x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i scale = _mm_set1_epi16(wp_scale);
+	const __m128i offset = _mm_set1_epi16(wp_offset);
+	int row;
+
+	if (weight_denom > 0)
+	{
+		// Rounding right shift per lane: (x + (1 << (a-1))) >> a
+		const __m128i shift = _mm_cvtsi32_si128(weight_denom);
+		const __m128i rounding = _mm_set1_epi16((1<<(weight_denom-1)));
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			const __m128i pixels = _mm_load_si128(line);
+			__m128i lo = _mm_mullo_epi16(_mm_unpacklo_epi8(pixels, zero), scale);
+			__m128i hi = _mm_mullo_epi16(_mm_unpackhi_epi8(pixels, zero), scale);
+
+			lo = _mm_sra_epi16(_mm_add_epi16(lo, rounding), shift);
+			hi = _mm_sra_epi16(_mm_add_epi16(hi, rounding), shift);
+			lo = _mm_add_epi16(lo, offset);
+			hi = _mm_add_epi16(hi, offset);
+
+			// Saturating pack of both halves back to 16 unsigned bytes.
+			_mm_store_si128(line, _mm_packus_epi16(lo, hi));
+		}
+	}
+	else
+	{
+		// weight_denom <= 0: left shift per lane by -weight_denom, no rounding term.
+		const __m128i shift = _mm_cvtsi32_si128(-weight_denom);
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			const __m128i pixels = _mm_load_si128(line);
+			__m128i lo = _mm_mullo_epi16(_mm_unpacklo_epi8(pixels, zero), scale);
+			__m128i hi = _mm_mullo_epi16(_mm_unpackhi_epi8(pixels, zero), scale);
+
+			lo = _mm_add_epi16(_mm_sll_epi16(lo, shift), offset);
+			hi = _mm_add_epi16(_mm_sll_epi16(hi, shift), offset);
+
+			_mm_store_si128(line, _mm_packus_epi16(lo, hi));
+		}
+	}
+}
+
+#ifdef H264_IPP /* built only with H264_IPP: 16x8 single-list weighted prediction via IPP, in place on mb_pred */
+void weighted_mc_prediction16x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom) {
+	const IppiSize block = {16, 8};
+	const int row_bytes = (int) sizeof(mb_pred[0]); // size of one mb_pred row, passed as the second argument
+	ippiUniDirWeightBlock_H264_8u_C1IR(mb_pred[0], row_bytes, weight_denom, wp_scale, wp_offset, block);
+}
+#endif
+
+/* Plain C single-list weighted prediction for a 16x8 block (16 wide, 8 rows), in place on mb_pred. */
+void weighted_mc_prediction16x8_c(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom)
+{
+	int x, y;
+
+	if (weight_denom > 0)
+	{
+		// Positive denominator: rshift_rnd_pos by weight_denom, then add the offset and clip.
+		for (y = 0; y < 8; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < 16; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_pos((wp_scale * line[x]), weight_denom)  + wp_offset ));
+		}
+	}
+	else
+	{
+		// Otherwise rshift_rnd_nonpos is applied with the negated denominator.
+		const int shift = -weight_denom;
+		for (y = 0; y < 8; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < 16; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos((wp_scale * line[x]), shift)  + wp_offset ));
+		}
+	}
+}
+
+
+
+
+#define LOAD_LINE_EPI16(reg, ptr) { reg = _mm_loadl_epi64((__m128i *)(ptr));	reg = _mm_unpacklo_epi8(reg, xmm_zero); } /* loads 8 bytes from ptr into reg and interleaves them with xmm_zero, giving eight 16-bit lanes; requires a variable named xmm_zero in the caller's scope (the callers visible here set it to all zeros) */
+/*!
+************************************************************************
+* \brief
+*    SSE2 single-list weighted prediction for an 8x8 block, in place.
+*    Eight samples per row are loaded, widened to 16-bit lanes, scaled,
+*    shifted, offset and stored back as saturated unsigned bytes.
+************************************************************************
+*/
+void weighted_mc_prediction8x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i scale = _mm_set1_epi16(wp_scale);
+	const __m128i offset = _mm_set1_epi16(wp_offset);
+	int row;
+
+	if (weight_denom > 0)
+	{
+		// Rounding right shift per lane: (x + (1 << (a-1))) >> a
+		const __m128i shift = _mm_cvtsi32_si128(weight_denom);
+		const __m128i rounding = _mm_set1_epi16((1<<(weight_denom-1)));
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			__m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64(line), zero);
+			v = _mm_mullo_epi16(v, scale);
+			v = _mm_sra_epi16(_mm_add_epi16(v, rounding), shift);
+			v = _mm_add_epi16(v, offset);
+			// Only the low 8 packed bytes are written back.
+			_mm_storel_epi64(line, _mm_packus_epi16(v, v));
+		}
+	}
+	else
+	{
+		// weight_denom <= 0: left shift per lane by -weight_denom, no rounding term.
+		const __m128i shift = _mm_cvtsi32_si128(-weight_denom);
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			__m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64(line), zero);
+			v = _mm_mullo_epi16(v, scale);
+			v = _mm_add_epi16(_mm_sll_epi16(v, shift), offset);
+			_mm_storel_epi64(line, _mm_packus_epi16(v, v));
+		}
+	}
+}
+
+#ifdef H264_IPP /* built only with H264_IPP: 8x8 single-list weighted prediction via IPP, in place on mb_pred */
+void weighted_mc_prediction8x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom) {
+	const IppiSize block = {8, 8};
+	const int row_bytes = (int) sizeof(mb_pred[0]); // size of one mb_pred row, passed as the second argument
+	ippiUniDirWeightBlock_H264_8u_C1IR(mb_pred[0], row_bytes, weight_denom, wp_scale, wp_offset, block);
+}
+#endif
+
+/* Plain C single-list weighted prediction for an 8x8 block, applied in place to mb_pred. */
+void weighted_mc_prediction8x8_c(h264_imgpel_macroblock_row_t *mb_pred, int wp_scale, int wp_offset, int weight_denom)
+{
+	int x, y;
+
+	if (weight_denom > 0)
+	{
+		// Positive denominator: rshift_rnd_pos by weight_denom, then add the offset and clip.
+		for (y = 0; y < 8; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < 8; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_pos((wp_scale * line[x]), weight_denom)  + wp_offset ));
+		}
+	}
+	else
+	{
+		// Otherwise rshift_rnd_nonpos is applied with the negated denominator.
+		const int shift = -weight_denom;
+		for (y = 0; y < 8; y++)
+		{
+			imgpel *line = mb_pred[y];
+			for (x = 0; x < 8; x++)
+				line[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos((wp_scale * line[x]), shift)  + wp_offset ));
+		}
+	}
+}
+
+
+
+/*!
+************************************************************************
+* \brief
+*    Block bi-prediction: combines block_l0 with mb_pred, in place, as
+*    mb_pred[y][x] = rshift_rnd_sf(block_l0[y][x] + mb_pred[y][x], 1)
+*    over ver_block_size rows and hor_block_size columns.
+*    mb_pred  : one prediction on entry, the combined result on exit
+*    block_l0 : the other prediction (read only)
+*    With H264_IPP defined, ippiInterpolateBlock_H264_8u_P2P1R is called instead.
+************************************************************************
+*/
+static inline void bi_prediction(h264_imgpel_macroblock_row_t *mb_pred,
+	const h264_imgpel_macroblock_t block_l0,
+	int ver_block_size,
+	int hor_block_size)
+{
+#ifdef H264_IPP
+	ippiInterpolateBlock_H264_8u_P2P1R(block_l0[0], mb_pred[0], mb_pred[0], hor_block_size, ver_block_size, sizeof(mb_pred[0]));
+#else
+	int x, y;
+
+	// mb_pred supplies the second operand of each sum and receives the result.
+	for (y = 0; y < ver_block_size; y++)
+	{
+		const imgpel *src = block_l0[y];
+		imgpel *dst = mb_pred[y];
+
+		for (x = 0; x < hor_block_size; x++)
+			dst[x] = (imgpel) rshift_rnd_sf((src[x] + dst[x]), 1);
+	}
+#endif
+}
+
+/* MMX bi-prediction for a 4x4 block: mb_pred = (block_l0 + mb_pred + 1) >> 1 per sample, in place. */
+static void bi_prediction4x4_mmx(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0)
+{
+	const __m64 zero = _mm_setzero_si64();
+	const __m64 one = _mm_set1_pi16(1);
+	__m64 src, dst;
+	int row;
+
+	for (row = 0; row < 4; row++)
+	{
+		// Each row is read as one 32-bit word (4 bytes) and widened to 16-bit lanes.
+		src = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*(int *)(&block_l0[row])), zero);
+		dst = _mm_unpacklo_pi8(_mm_cvtsi32_si64(*(int *)(&mb_pred[row])), zero);
+
+		// (a + b + 1) >> 1 per lane, then pack to bytes with unsigned saturation.
+		src = _mm_srai_pi16(_mm_add_pi16(_mm_add_pi16(src, dst), one), 1);
+		*(int *)(&mb_pred[row]) = _mm_cvtsi64_si32(_mm_packs_pu16(src, src));
+	}
+	// NOTE(review): no _mm_empty() is issued here; presumably the caller clears MMX state - confirm.
+}
+
+/* SSE2 bi-prediction for an 8x8 block: mb_pred = (block_l0 + mb_pred + 1) >> 1 per sample, in place. */
+void bi_prediction8x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i one = _mm_set1_epi16(1);
+	int row;
+
+	for (row = 0; row < 8; row++)
+	{
+		__m128i *line = (__m128i *)mb_pred[row];
+		// Load 8 bytes from each source and zero-extend them to 16-bit lanes.
+		const __m128i src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)block_l0[row]), zero);
+		const __m128i dst = _mm_unpacklo_epi8(_mm_loadl_epi64(line), zero);
+		const __m128i avg = _mm_srai_epi16(_mm_add_epi16(_mm_add_epi16(src, dst), one), 1);
+
+		_mm_storel_epi64(line, _mm_packus_epi16(avg, avg));
+	}
+}
+
+#ifdef H264_IPP /* built only with H264_IPP: 8x8 bi-prediction via IPP; mb_pred is both second source and destination */
+void bi_prediction8x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0) {
+	const int row_bytes = (int) sizeof(mb_pred[0]); // size of one mb_pred row, passed as the last argument
+	ippiInterpolateBlock_H264_8u_P2P1R(block_l0[0], mb_pred[0], mb_pred[0], 8, 8, row_bytes);
+}
+#endif
+/*!
+************************************************************************
+* \brief
+*    Weighted bi-prediction, in place on mb_pred: each sample becomes
+*    iClip1(255, shift(wp_scale_l0 * block_l0 + wp_scale_l1 * mb_pred) + wp_offset)
+*    where shift is rshift_rnd_pos by weight_denom when weight_denom > 0
+*    and rshift_rnd_nonpos by -weight_denom otherwise.
+*    With H264_IPP defined, ippiWeightedAverage_H264_8u_C1IR is called instead.
+************************************************************************
+*/
+static inline void weighted_bi_prediction(h264_imgpel_macroblock_row_t *mb_pred,
+	const h264_imgpel_macroblock_t block_l0,
+	int ver_block_size,  int hor_block_size,
+	int wp_scale_l0, int wp_scale_l1,
+	int wp_offset, int weight_denom)
+{
+#ifdef H264_IPP
+	IppiSize roi = {hor_block_size, ver_block_size};
+	ippiWeightedAverage_H264_8u_C1IR(block_l0[0], mb_pred[0], sizeof(mb_pred[0]), wp_scale_l0, wp_scale_l1, weight_denom, wp_offset, roi);
+#else
+	int x, y;
+
+	if (weight_denom > 0)
+	{
+		for (y = 0; y < ver_block_size; y++)
+		{
+			const imgpel *src = block_l0[y];
+			imgpel *dst = mb_pred[y];
+			for (x = 0; x < hor_block_size; x++)
+				dst[x] = (imgpel) iClip1(255, (rshift_rnd_pos((wp_scale_l0 * src[x] + wp_scale_l1 * dst[x]), weight_denom) + wp_offset));
+		}
+	}
+	else
+	{
+		const int shift = -weight_denom; // non-positive denominator: its negation is the shift count
+		for (y = 0; y < ver_block_size; y++)
+		{
+			const imgpel *src = block_l0[y];
+			imgpel *dst = mb_pred[y];
+			for (x = 0; x < hor_block_size; x++)
+				dst[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos((wp_scale_l0 * src[x] + wp_scale_l1 * dst[x]), shift) + wp_offset));
+		}
+	}
+#endif
+}
+
+/*!
+************************************************************************
+* \brief
+*    SSE2 weighted bi-prediction for an 8x8 block, in place on mb_pred.
+*    Per row, 8 samples of block_l0 and of mb_pred are widened to 16-bit
+*    lanes, multiplied by wp_scale_l0 / wp_scale_l1 and summed; the sum is
+*    shifted by weight_denom, offset by wp_offset and packed back to bytes
+*    with unsigned saturation.
+************************************************************************
+*/
+void weighted_bi_prediction8x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i scale_l0 = _mm_set1_epi16(wp_scale_l0);
+	const __m128i scale_l1 = _mm_set1_epi16(wp_scale_l1);
+	const __m128i offset = _mm_set1_epi16(wp_offset);
+	int row;
+
+	if (weight_denom > 0)
+	{
+		// Rounding right shift per lane: (x + (1 << (a-1))) >> a
+		const __m128i shift = _mm_cvtsi32_si128(weight_denom);
+		const __m128i rounding = _mm_set1_epi16((1<<(weight_denom-1)));
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			const __m128i l0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)block_l0[row]), zero);
+			const __m128i l1 = _mm_unpacklo_epi8(_mm_loadl_epi64(line), zero);
+			__m128i sum = _mm_add_epi16(_mm_mullo_epi16(l0, scale_l0), _mm_mullo_epi16(l1, scale_l1));
+
+			sum = _mm_sra_epi16(_mm_add_epi16(sum, rounding), shift);
+			sum = _mm_add_epi16(sum, offset);
+
+			// Only the low 8 packed bytes are written back.
+			_mm_storel_epi64(line, _mm_packus_epi16(sum, sum));
+		}
+	}
+	else
+	{
+		// weight_denom <= 0: left shift per lane by -weight_denom, no rounding term.
+		const __m128i shift = _mm_cvtsi32_si128(-weight_denom);
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *line = (__m128i *)mb_pred[row];
+			const __m128i l0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)block_l0[row]), zero);
+			const __m128i l1 = _mm_unpacklo_epi8(_mm_loadl_epi64(line), zero);
+			__m128i sum = _mm_add_epi16(_mm_mullo_epi16(l0, scale_l0), _mm_mullo_epi16(l1, scale_l1));
+
+			sum = _mm_add_epi16(_mm_sll_epi16(sum, shift), offset);
+			_mm_storel_epi64(line, _mm_packus_epi16(sum, sum));
+		}
+	}
+}
+
+#ifdef H264_IPP
+/* 8x8 weighted bi-prediction delegated to IPP; mb_pred is source 2 and destination. */
+void weighted_bi_prediction8x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	/* width, height of the blended area */
+	IppiSize area_8x8 = {8, 8};
+
+	/* one row of mb_pred is the byte step between consecutive lines */
+	ippiWeightedAverage_H264_8u_C1IR(block_l0[0], mb_pred[0], sizeof(mb_pred[0]),
+		wp_scale_l0, wp_scale_l1, weight_denom, wp_offset, area_8x8);
+}
+#endif
+
+/*
+ * Plain C weighted bi-prediction of an 8x8 block.
+ * mb_pred supplies the second input and receives the clipped (0..255) result.
+ */
+void weighted_bi_prediction8x8_c(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0,  int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	int x, y;
+
+	if (weight_denom <= 0)
+	{
+		/* non-positive denominator: rshift_rnd_nonpos() gets the magnitude */
+		const int shift = -weight_denom;
+
+		for (y = 0; y < 8; y++)
+		{
+			const imgpel *src0 = block_l0[y];
+			imgpel *dst = mb_pred[y];
+
+			for (x = 0; x < 8; x++)
+			{
+				const int weighted = wp_scale_l0 * src0[x] + wp_scale_l1 * dst[x];
+				dst[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos(weighted, shift) + wp_offset));
+			}
+		}
+		return;
+	}
+
+	for (y = 0; y < 8; y++)
+	{
+		const imgpel *src0 = block_l0[y];
+		imgpel *dst = mb_pred[y];
+
+		for (x = 0; x < 8; x++)
+		{
+			const int weighted = wp_scale_l0 * src0[x] + wp_scale_l1 * dst[x];
+			dst[x] = (imgpel) iClip1(255, (rshift_rnd_pos(weighted, weight_denom) + wp_offset));
+		}
+	}
+}
+
+#if defined(_DEBUG) || !defined(_M_IX86)
+/*
+ * C fallback for 4x4 weighted bi-prediction (used in debug builds and on
+ * targets other than 32-bit x86; otherwise an external implementation with
+ * the prototype in the #else branch is linked in).
+ *
+ * mb_pred supplies the second input and receives the clipped (0..255) result:
+ *   clip(shift(wp_scale_l0 * block_l0 + wp_scale_l1 * mb_pred) + wp_offset)
+ *
+ * The weights and the offset are plain int, exactly like the external
+ * prototype below and like every other block size in this file.  They were
+ * previously uint16_t, which turned any negative weight or offset into a
+ * large positive number and saturated the prediction.
+ */
+static inline void weighted_bi_prediction4x4(h264_imgpel_macroblock_row_t *mb_pred,
+	const h264_imgpel_macroblock_t block_l0,
+	int wp_scale_l0,
+	int wp_scale_l1,
+	int wp_offset,
+	int weight_denom)
+{
+#ifdef H264_IPP
+	/* one row of mb_pred is the byte step between consecutive lines */
+	IppiSize roi = {4, 4};
+	ippiWeightedAverage_H264_8u_C1IR(block_l0[0], mb_pred[0], sizeof(mb_pred[0]), wp_scale_l0, wp_scale_l1, weight_denom, wp_offset, roi);
+#else
+	int ii, jj;
+
+	if (weight_denom > 0)
+	{
+		for(jj = 0; jj < 4; jj++)
+		{
+			const imgpel *b0  = block_l0[jj];
+			imgpel *row = mb_pred[jj];
+			const imgpel *b1  = row; /* second input aliases the destination row */
+
+			for(ii=0;ii<4;ii++)
+				row[ii] = (imgpel) iClip1(255, (rshift_rnd_pos((wp_scale_l0 * b0[ii] + wp_scale_l1 * b1[ii]), weight_denom) + wp_offset));
+		}
+	}
+	else
+	{
+		/* non-positive denominator: rshift_rnd_nonpos() gets the magnitude */
+		weight_denom = -weight_denom;
+		for(jj = 0; jj < 4; jj++)
+		{
+			const imgpel *b0  = block_l0[jj];
+			imgpel *row = mb_pred[jj];
+			const imgpel *b1  = row;
+
+			for(ii=0;ii<4;ii++)
+				row[ii] = (imgpel) iClip1(255, (rshift_rnd_nonpos((wp_scale_l0 * b0[ii] + wp_scale_l1 * b1[ii]), weight_denom) + wp_offset));
+		}
+	}
+#endif
+}
+#else
+extern void weighted_bi_prediction4x4(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom);
+#endif
+
+/*
+ * SSE2 weighted bi-prediction of a 16x16 block, one 16-sample row per
+ * iteration.  mb_pred is read as the second input and overwritten.
+ * Rows are accessed with aligned 128-bit loads/stores, so both buffers must
+ * provide 16-byte aligned rows.  Arithmetic runs in 16-bit lanes and the
+ * final pack saturates to 0..255.
+ * NOTE(review): this matches the scalar version only while the weighted sum
+ * fits a signed 16-bit lane - confirm the weight range guaranteed by callers.
+ */
+void weighted_bi_prediction16x16_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i weight0 = _mm_set1_epi16(wp_scale_l0);
+	const __m128i weight1 = _mm_set1_epi16(wp_scale_l1);
+	const __m128i offset = _mm_set1_epi16(wp_offset);
+	int row;
+
+	if (weight_denom > 0)
+	{
+		/* (x + (1 << (a-1))) >> a, then add the offset */
+		const __m128i shift = _mm_cvtsi32_si128(weight_denom);
+		const __m128i rounding = _mm_set1_epi16((1<<(weight_denom-1)));
+
+		for (row = 0; row < 16; row++)
+		{
+			__m128i *dst = (__m128i *)mb_pred[row];
+			const __m128i pix0 = _mm_load_si128((__m128i *)block_l0[row]);
+			const __m128i pix1 = _mm_load_si128(dst);
+			__m128i lo, hi;
+
+			/* widen bytes to 16 bit and form w0*p0 + w1*p1 for both halves */
+			lo = _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpacklo_epi8(pix1, zero), weight1));
+			hi = _mm_add_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpackhi_epi8(pix1, zero), weight1));
+
+			lo = _mm_sra_epi16(_mm_add_epi16(lo, rounding), shift);
+			hi = _mm_sra_epi16(_mm_add_epi16(hi, rounding), shift);
+			lo = _mm_add_epi16(lo, offset);
+			hi = _mm_add_epi16(hi, offset);
+
+			/* saturate back to unsigned bytes */
+			_mm_store_si128(dst, _mm_packus_epi16(lo, hi));
+		}
+	}
+	else
+	{
+		/* (x << a) with a = -weight_denom, then add the offset */
+		const __m128i shift = _mm_cvtsi32_si128(-weight_denom);
+
+		for (row = 0; row < 16; row++)
+		{
+			__m128i *dst = (__m128i *)mb_pred[row];
+			const __m128i pix0 = _mm_load_si128((__m128i *)block_l0[row]);
+			const __m128i pix1 = _mm_load_si128(dst);
+			__m128i lo, hi;
+
+			lo = _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpacklo_epi8(pix1, zero), weight1));
+			hi = _mm_add_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpackhi_epi8(pix1, zero), weight1));
+
+			lo = _mm_sll_epi16(lo, shift);
+			hi = _mm_sll_epi16(hi, shift);
+			lo = _mm_add_epi16(lo, offset);
+			hi = _mm_add_epi16(hi, offset);
+
+			_mm_store_si128(dst, _mm_packus_epi16(lo, hi));
+		}
+	}
+}
+
+#ifdef H264_IPP
+/* 16x16 weighted bi-prediction delegated to IPP; mb_pred is source 2 and destination. */
+void weighted_bi_prediction16x16_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	/* width, height of the blended area */
+	IppiSize area_16x16 = {16, 16};
+
+	/* one row of mb_pred is the byte step between consecutive lines */
+	ippiWeightedAverage_H264_8u_C1IR(block_l0[0], mb_pred[0], sizeof(mb_pred[0]),
+		wp_scale_l0, wp_scale_l1, weight_denom, wp_offset, area_16x16);
+}
+#endif
+
+/*
+ * Plain C weighted bi-prediction of a 16x16 block.
+ * mb_pred supplies the second input and receives the clipped (0..255) result.
+ */
+void weighted_bi_prediction16x16_c(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	int x, y;
+
+	if (weight_denom <= 0)
+	{
+		/* non-positive denominator: rshift_rnd_nonpos() gets the magnitude */
+		const int shift = -weight_denom;
+
+		for (y = 0; y < 16; y++)
+		{
+			const imgpel *src0 = block_l0[y];
+			imgpel *dst = mb_pred[y];
+
+			for (x = 0; x < 16; x++)
+			{
+				const int weighted = wp_scale_l0 * src0[x] + wp_scale_l1 * dst[x];
+				dst[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos(weighted, shift) + wp_offset));
+			}
+		}
+		return;
+	}
+
+	for (y = 0; y < 16; y++)
+	{
+		const imgpel *src0 = block_l0[y];
+		imgpel *dst = mb_pred[y];
+
+		for (x = 0; x < 16; x++)
+		{
+			const int weighted = wp_scale_l0 * src0[x] + wp_scale_l1 * dst[x];
+			dst[x] = (imgpel) iClip1(255, (rshift_rnd_pos(weighted, weight_denom) + wp_offset));
+		}
+	}
+}
+
+/* 16x8 */
+/*
+ * SSE2 weighted bi-prediction of a block 16 samples wide and 8 rows high.
+ * mb_pred is read as the second input and overwritten.  Rows are accessed
+ * with aligned 128-bit loads/stores, so both buffers must provide 16-byte
+ * aligned rows.  Arithmetic runs in 16-bit lanes; the pack saturates to 0..255.
+ * NOTE(review): this matches the scalar version only while the weighted sum
+ * fits a signed 16-bit lane - confirm the weight range guaranteed by callers.
+ */
+void weighted_bi_prediction16x8_sse2(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i weight0 = _mm_set1_epi16(wp_scale_l0);
+	const __m128i weight1 = _mm_set1_epi16(wp_scale_l1);
+	const __m128i offset = _mm_set1_epi16(wp_offset);
+	int row;
+
+	if (weight_denom > 0)
+	{
+		/* (x + (1 << (a-1))) >> a, then add the offset */
+		const __m128i shift = _mm_cvtsi32_si128(weight_denom);
+		const __m128i rounding = _mm_set1_epi16((1<<(weight_denom-1)));
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *dst = (__m128i *)mb_pred[row];
+			const __m128i pix0 = _mm_load_si128((__m128i *)block_l0[row]);
+			const __m128i pix1 = _mm_load_si128(dst);
+			__m128i lo, hi;
+
+			/* widen bytes to 16 bit and form w0*p0 + w1*p1 for both halves */
+			lo = _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpacklo_epi8(pix1, zero), weight1));
+			hi = _mm_add_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpackhi_epi8(pix1, zero), weight1));
+
+			lo = _mm_sra_epi16(_mm_add_epi16(lo, rounding), shift);
+			hi = _mm_sra_epi16(_mm_add_epi16(hi, rounding), shift);
+			lo = _mm_add_epi16(lo, offset);
+			hi = _mm_add_epi16(hi, offset);
+
+			/* saturate back to unsigned bytes */
+			_mm_store_si128(dst, _mm_packus_epi16(lo, hi));
+		}
+	}
+	else
+	{
+		/* (x << a) with a = -weight_denom, then add the offset */
+		const __m128i shift = _mm_cvtsi32_si128(-weight_denom);
+
+		for (row = 0; row < 8; row++)
+		{
+			__m128i *dst = (__m128i *)mb_pred[row];
+			const __m128i pix0 = _mm_load_si128((__m128i *)block_l0[row]);
+			const __m128i pix1 = _mm_load_si128(dst);
+			__m128i lo, hi;
+
+			lo = _mm_add_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpacklo_epi8(pix1, zero), weight1));
+			hi = _mm_add_epi16(_mm_mullo_epi16(_mm_unpackhi_epi8(pix0, zero), weight0),
+				_mm_mullo_epi16(_mm_unpackhi_epi8(pix1, zero), weight1));
+
+			lo = _mm_sll_epi16(lo, shift);
+			hi = _mm_sll_epi16(hi, shift);
+			lo = _mm_add_epi16(lo, offset);
+			hi = _mm_add_epi16(hi, offset);
+
+			_mm_store_si128(dst, _mm_packus_epi16(lo, hi));
+		}
+	}
+}
+
+#ifdef H264_IPP
+/* 16 wide x 8 high weighted bi-prediction delegated to IPP; mb_pred is source 2 and destination. */
+void weighted_bi_prediction16x8_ipp(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	/* width, height of the blended area */
+	IppiSize area_16x8 = {16, 8};
+
+	/* one row of mb_pred is the byte step between consecutive lines */
+	ippiWeightedAverage_H264_8u_C1IR(block_l0[0], mb_pred[0], sizeof(mb_pred[0]),
+		wp_scale_l0, wp_scale_l1, weight_denom, wp_offset, area_16x8);
+}
+#endif
+
+/*
+ * Plain C weighted bi-prediction of a block 16 samples wide and 8 rows high.
+ * mb_pred supplies the second input and receives the clipped (0..255) result.
+ */
+void weighted_bi_prediction16x8_c(h264_imgpel_macroblock_row_t *mb_pred, const h264_imgpel_macroblock_t block_l0, int wp_scale_l0, int wp_scale_l1, int wp_offset, int weight_denom)
+{
+	int x, y;
+
+	if (weight_denom <= 0)
+	{
+		/* non-positive denominator: rshift_rnd_nonpos() gets the magnitude */
+		const int shift = -weight_denom;
+
+		for (y = 0; y < 8; y++)
+		{
+			const imgpel *src0 = block_l0[y];
+			imgpel *dst = mb_pred[y];
+
+			for (x = 0; x < 16; x++)
+			{
+				const int weighted = wp_scale_l0 * src0[x] + wp_scale_l1 * dst[x];
+				dst[x] = (imgpel) iClip1(255, (rshift_rnd_nonpos(weighted, shift) + wp_offset));
+			}
+		}
+		return;
+	}
+
+	for (y = 0; y < 8; y++)
+	{
+		const imgpel *src0 = block_l0[y];
+		imgpel *dst = mb_pred[y];
+
+		for (x = 0; x < 16; x++)
+		{
+			const int weighted = wp_scale_l0 * src0[x] + wp_scale_l1 * dst[x];
+			dst[x] = (imgpel) iClip1(255, (rshift_rnd_pos(weighted, weight_denom) + wp_offset));
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    No reference picture mc: fills the hor_block_size x ver_block_size
+*    area of block with the constant sample med_imgpel_value.
+*    Debug builds additionally print a notice on every call.
+************************************************************************
+*/ 
+static void get_data_no_ref(h264_imgpel_macroblock_row_t *block, int ver_block_size, int hor_block_size, imgpel med_imgpel_value)
+{
+	int row, col;
+#ifdef _DEBUG
+	printf("list[ref_frame] is equal to 'no reference picture' before RAP\n");
+#endif
+
+	/* flood the block, one row at a time, with the supplied value */
+	for (row = 0; row < ver_block_size; row++)
+	{
+		imgpel *dst = block[row];
+
+		for (col = 0; col < hor_block_size; col++)
+		{
+			dst[col] = med_imgpel_value;
+		}
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Interpolation of 1/4 subpixel
+*
+*    Fetches a hor_block_size x ver_block_size prediction block for colour
+*    plane pl from curr_ref into block.  When curr_ref is the decoder's
+*    "no reference picture" placeholder and the current frame POC is still
+*    below the recovery POC, the block is filled with the plane's DC value
+*    instead.  Otherwise the work is handed to the IPP luma interpolator.
+************************************************************************
+*/ 
+void get_block_luma(Macroblock *currMB, ColorPlane pl, StorablePicture *curr_ref, int x_pos, int y_pos, const short *motion_vector, int hor_block_size, int ver_block_size, h264_imgpel_macroblock_row_t *block)
+{  
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture;
+	VideoImage *src_plane;
+	IppVCInterpolateBlock_8u ipp_args;
+
+	if (curr_ref == p_Vid->no_reference_picture && p_Vid->framepoc < p_Vid->recovery_poc)
+	{
+		get_data_no_ref(block, ver_block_size, hor_block_size, (imgpel) p_Vid->dc_pred_value_comp[pl]);
+		return;
+	}
+
+	dec_picture = p_Vid->dec_picture;
+
+	/* choose the plane of the reference picture to read from */
+	src_plane = curr_ref->imgY;
+	if (IS_INDEPENDENT(p_Vid))
+	{
+		/* independently coded planes: colour_plane_id selects the source, any other id keeps imgY */
+		if (p_Vid->colour_plane_id == 1)
+			src_plane = curr_ref->imgUV[0];
+		else if (p_Vid->colour_plane_id == 2)
+			src_plane = curr_ref->imgUV[1];
+	}
+	else if (pl!=PLANE_Y)
+	{
+		src_plane = curr_ref->imgUV[pl-1]; 
+	}
+
+	/* source and destination buffers */
+	ipp_args.pSrc[0] = src_plane->base_address;
+	ipp_args.srcStep = src_plane->stride;
+	ipp_args.pDst[0] = block[0];
+	ipp_args.dstStep = sizeof(block[0]);
+
+	/* frame geometry; the height is halved when the current macroblock is flagged in mb_field */
+	ipp_args.sizeFrame.width = dec_picture->size_x;
+	ipp_args.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y >> 1): dec_picture->size_y;
+
+	/* block geometry: x_pos / y_pos are scaled by 4 before being passed on */
+	ipp_args.sizeBlock.width = hor_block_size;
+	ipp_args.sizeBlock.height = ver_block_size;
+	ipp_args.pointBlockPos.x = x_pos << 2;
+	ipp_args.pointBlockPos.y = y_pos << 2;
+	ipp_args.pointVector.x = motion_vector[0];
+	ipp_args.pointVector.y = motion_vector[1];
+
+	ippiInterpolateLumaBlock_H264_8u_P1R(&ipp_args);
+}
+
+/* Copies rows lines of width samples from src (advancing by src_stride per line) into block. */
+static inline void copy_chroma_rows(h264_imgpel_macroblock_t block, const imgpel *src, ptrdiff_t src_stride, int rows, int width)
+{
+	int y, x;
+
+	for (y = 0; y < rows; y++)
+	{
+		imgpel *dst = block[y];
+
+		for (x = 0; x < width; x++)
+		{
+			dst[x] = src[x];
+		}
+		src += src_stride;
+	}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Chroma (0,0): straight copy of a hor_block_size x ver_block_size
+*    area starting at (x_pos, y_pos) of image into block.
+************************************************************************
+*/ 
+static void get_chroma_00(h264_imgpel_macroblock_t block, const VideoImage *image, int ver_block_size, int hor_block_size, int x_pos, int y_pos)
+{
+	const ptrdiff_t src_stride = image->stride; // in case the compiler doesn't optimize this
+	const imgpel *src = image->base_address + src_stride * y_pos + x_pos;
+
+	/* common widths are passed as constants so the copy loop can be unrolled */
+	switch(hor_block_size)
+	{
+	case 16:
+		copy_chroma_rows(block, src, src_stride, ver_block_size, 16);
+		break;
+	case 8:
+		copy_chroma_rows(block, src, src_stride, ver_block_size, 8);
+		break;
+	case 4:
+		copy_chroma_rows(block, src, src_stride, ver_block_size, 4);
+		break;
+	case 2:
+		copy_chroma_rows(block, src, src_stride, ver_block_size, 2);
+		break;
+	default: //degenerate case
+		copy_chroma_rows(block, src, src_stride, ver_block_size, hor_block_size);
+		break;
+	}
+}
+
+/*
+ * Fetches the motion-compensated prediction of both chroma planes.
+ * block0 / block1 receive the first and second chroma plane; when the IPP
+ * interpolator is used the result is written at offset (ioff, joff) inside
+ * them.  When curr_ref is the "no reference picture" placeholder and the
+ * frame POC is below the recovery POC, both blocks are filled from their
+ * origin with the DC values of components 1 and 2 instead.
+ */
+static void get_block_chroma(Macroblock *currMB, StorablePicture *curr_ref, int x_pos, int y_pos, const short *motion_vector, int hor_block_size, int ver_block_size, h264_imgpel_macroblock_row_t *block0, h264_imgpel_macroblock_row_t *block1, int ioff, int joff)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	StorablePicture *dec_picture;
+	IppVCInterpolateBlock_8u ipp_args;
+	int is_444, is_420;
+
+	if (curr_ref == p_Vid->no_reference_picture && p_Vid->framepoc < p_Vid->recovery_poc)
+	{
+		get_data_no_ref(block0, ver_block_size, hor_block_size, (imgpel) p_Vid->dc_pred_value_comp[1]);
+		get_data_no_ref(block1, ver_block_size, hor_block_size, (imgpel) p_Vid->dc_pred_value_comp[2]);
+		return;
+	}
+
+	dec_picture = p_Vid->dec_picture;
+	is_444 = (dec_picture->chroma_format_idc == YUV444);
+	is_420 = (dec_picture->chroma_format_idc == YUV420);
+
+	/* both planes share the step taken from the first chroma plane */
+	ipp_args.pSrc[0] = curr_ref->imgUV[0]->base_address;
+	ipp_args.pSrc[1] = curr_ref->imgUV[1]->base_address;
+	ipp_args.srcStep = curr_ref->imgUV[0]->stride;
+	ipp_args.pDst[0] = &block0[joff][ioff];
+	ipp_args.pDst[1] = &block1[joff][ioff];
+	ipp_args.dstStep = sizeof(block0[0]);
+
+	/* frame geometry; the height is halved when the current macroblock is flagged in mb_field */
+	ipp_args.sizeFrame.width = dec_picture->size_x_cr;
+	ipp_args.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y_cr >> 1): dec_picture->size_y_cr;
+	ipp_args.sizeBlock.width = hor_block_size;
+	ipp_args.sizeBlock.height = ver_block_size;
+
+	/* horizontal: 4:4:4 doubles the vector, every other format doubles the position */
+	ipp_args.pointBlockPos.x = is_444 ? x_pos : (x_pos << 1);
+	ipp_args.pointVector.x = is_444 ? (motion_vector[0] << 1) : motion_vector[0];
+
+	/* vertical: 4:2:0 doubles the position, every other format doubles the vector */
+	ipp_args.pointVector.y = is_420 ? motion_vector[1] : (motion_vector[1] << 1);
+	ipp_args.pointBlockPos.y = is_420 ? (y_pos << 1) : y_pos;
+
+	ippiInterpolateChromaBlock_H264_8u_P2R(&ipp_args);
+}
+
+
+/*
+ * Reconstructs both chroma planes of an intra macroblock.
+ * For each plane the chroma intra prediction is generated first, then:
+ *   - SI4MB macroblocks run itrans_sp_cr() and reconstruct four 4x4 blocks,
+ *   - otherwise, if any of the cbp bits above bit 3 is set, the coefficient
+ *     blocks are inverse transformed onto the prediction,
+ *   - otherwise the prediction itself is copied into the picture.
+ * yuv is compared against YUV420-1 to pick the single 8x8 copy path; every
+ * other value walks the sub-block offset tables.
+ */
+void intra_cr_decoding(Macroblock *currMB, int yuv)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;
+	Slice *currSlice = currMB->p_Slice;
+	StorablePicture *dec_picture = p_Vid->dec_picture;
+	const int is_420 = (yuv == YUV420-1);
+	int uv;
+	// TODO: fix 4x4 lossless
+
+	for(uv = 0; uv < 2; uv++)
+	{
+		const int pl = uv + 1;
+		const h264_short_block_t *blocks = currSlice->cof4[pl];
+		const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+		h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+		int b8, b4, k;
+
+		intrapred_chroma(currMB, uv);
+
+		if (currMB->mb_type == SI4MB)
+		{
+			itrans_sp_cr(currMB, uv);
+
+			/* four 4x4 blocks in raster order, each copied out right after its transform */
+			for (k = 0; k < 4; k++)
+			{
+				const int ioff = (k & 1) << 2;
+				const int joff = (k >> 1) << 2;
+
+				opt_itrans4x4(blocks[k], mb_pred, mb_rec, ioff, joff);
+				copy_image_data_4x4_stride(dec_picture->imgUV[uv], currMB->pix_c_x + ioff, currMB->pix_c_y + joff, mb_rec, ioff, joff);
+			}
+		}
+		else if (currMB->cbp >> 4)
+		{
+			if (is_420)
+			{
+				/* transform all four 4x4 blocks, then copy the whole 8x8 area once */
+				for (k = 0; k < 4; k++)
+				{
+					opt_itrans4x4(blocks[k], mb_pred, mb_rec, (k & 1) << 2, (k >> 1) << 2);
+				}
+				copy_image_data_8x8_stride(dec_picture->imgUV[uv], currMB->pix_c_x, currMB->pix_c_y, mb_rec);
+			}
+			else
+			{
+				for (b8 = 0; b8 < (p_Vid->num_uv_blocks); b8++)
+				{
+					for (b4 = 0; b4 < 4; b4++)
+					{
+						const int joff = subblk_offset_y[yuv][b8][b4];
+						const int ioff = subblk_offset_x[yuv][b8][b4];
+
+						opt_itrans4x4(blocks[cof4_pos_to_subblock[joff>>2][ioff>>2]], mb_pred, mb_rec, ioff, joff);
+
+						copy_image_data_4x4_stride(dec_picture->imgUV[uv], currMB->pix_c_x + ioff, currMB->pix_c_y + joff, mb_rec, ioff, joff);
+					}
+				}
+			}
+		}
+		else
+		{
+			/* no chroma residual: the prediction is the reconstruction */
+			if (is_420)
+			{
+				copy_image_data_8x8_stride(dec_picture->imgUV[uv], currMB->pix_c_x, currMB->pix_c_y, mb_pred);
+			}
+			else
+			{
+				for (b8 = 0; b8 < (p_Vid->num_uv_blocks); b8++)
+				{
+					for (b4 = 0; b4 < 4; b4++)
+					{
+						const int joff = subblk_offset_y[yuv][b8][b4];
+						const int ioff = subblk_offset_x[yuv][b8][b4];
+
+						copy_image_data_4x4_stride(dec_picture->imgUV[uv], currMB->pix_c_x + ioff, currMB->pix_c_y + joff, mb_pred,  ioff, joff);
+					}
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Reference index contributed by one neighbour (nb) for reference list `list`.
+ * Returns -1 when the neighbour is unavailable.  In MBAFF slices the stored
+ * index is rescaled between frame and field numbering:
+ *   - current MB is field: a frame neighbour with a non-negative index is doubled,
+ *   - current MB is frame: a field neighbour (or a negative index) is halved.
+ */
+static char direct_neighbor_ref_idx(Macroblock *currMB, PicMotionParams *motion, int list, const PixelPos *nb)
+{
+	int ref_idx;
+
+	if (!nb->available)
+		return (char) -1;
+
+	ref_idx = motion->motion[list][nb->pos_y][nb->pos_x].ref_idx;
+
+	if (currMB->p_Slice->mb_aff_frame_flag)
+	{
+		const int field_or_invalid = currMB->p_Vid->mb_data[nb->mb_addr].mb_field || ref_idx < 0;
+
+		if (currMB->mb_field)
+		{
+			if (!field_or_invalid)
+				ref_idx = ref_idx * 2;
+		}
+		else
+		{
+			if (field_or_invalid)
+				ref_idx = ref_idx >> 1;
+		}
+	}
+
+	return (char) ref_idx;
+}
+
+/* Smaller of two reference indices when both are non-negative, otherwise the larger one. */
+static char direct_merge_ref_idx(char a, char b)
+{
+	return (char) ((a >= 0 && b >= 0) ? imin(a,b) : imax(a,b));
+}
+
+/*
+ * Derives the list-0 and list-1 reference indices for direct prediction from
+ * the left, upper and upper-right neighbours returned by get_neighbors0016(),
+ * stores them in *l0_rFrame / *l1_rFrame, and, for each list whose index is
+ * non-negative, asks the macroblock's GetMVPredictor for the 16x16 motion
+ * vector predictor (written to pmvl0 / pmvl1).
+ */
+void prepare_direct_params(Macroblock *currMB, StorablePicture *dec_picture, short pmvl0[2], short pmvl1[2],char *l0_rFrame, char *l1_rFrame)
+{
+	PicMotionParams *motion = &dec_picture->motion;
+	char l0_left, l0_up, l0_upright;
+	char l1_left, l1_up, l1_upright;
+
+	PixelPos mb[4];
+
+	get_neighbors0016(currMB, mb);
+
+	/* mb[0] = left, mb[1] = up, mb[2] = up-right as consumed here */
+	l0_left    = direct_neighbor_ref_idx(currMB, motion, LIST_0, &mb[0]);
+	l0_up      = direct_neighbor_ref_idx(currMB, motion, LIST_0, &mb[1]);
+	l0_upright = direct_neighbor_ref_idx(currMB, motion, LIST_0, &mb[2]);
+
+	l1_left    = direct_neighbor_ref_idx(currMB, motion, LIST_1, &mb[0]);
+	l1_up      = direct_neighbor_ref_idx(currMB, motion, LIST_1, &mb[1]);
+	l1_upright = direct_neighbor_ref_idx(currMB, motion, LIST_1, &mb[2]);
+
+	*l0_rFrame = direct_merge_ref_idx(direct_merge_ref_idx(l0_left, l0_up), l0_upright);
+	*l1_rFrame = direct_merge_ref_idx(direct_merge_ref_idx(l1_left, l1_up), l1_upright);
+
+	if (*l0_rFrame >=0)
+	{
+		currMB->GetMVPredictor (currMB, mb, pmvl0, *l0_rFrame, motion->motion[LIST_0], 0, 0, 16, 16);
+	}
+
+	if (*l1_rFrame >=0)
+	{
+		currMB->GetMVPredictor (currMB, mb, pmvl1, *l1_rFrame, motion->motion[LIST_1], 0, 0, 16, 16);
+	}
+}
+
+/*
+ * Debug-only sanity check of a motion vector.
+ * In _DEBUG builds a warning is written to stderr when mv_x lies outside
+ * [-8192, 8191] or mv_y lies outside [-max_mb_vmv_r, max_mb_vmv_r - 1];
+ * decoding continues either way.  In other builds this does nothing.
+ */
+static void check_motion_vector_range(VideoParameters *p_Vid, short mv_x, short mv_y)
+{
+#ifdef _DEBUG
+	/* horizontal component */
+	if (mv_x < -8192 || mv_x > 8191)
+	{
+		fprintf(stderr,"WARNING! Horizontal motion vector %d is out of allowed range {-8192, 8191} in picture %d, macroblock %d\n", mv_x, p_Vid->number, p_Vid->current_mb_nr);
+		//error("invalid stream: too big horizontal motion vector", 500);
+	}
+
+	/* vertical component, limit taken from the video parameters */
+	if (mv_y < (-p_Vid->max_mb_vmv_r) || mv_y > (p_Vid->max_mb_vmv_r - 1))
+	{
+		fprintf(stderr,"WARNING! Vertical motion vector %d is out of allowed range {%d, %d} in picture %d, macroblock %d\n", mv_y, (-p_Vid->max_mb_vmv_r), (p_Vid->max_mb_vmv_r - 1), p_Vid->number, p_Vid->current_mb_nr);
+		//error("invalid stream: too big vertical motion vector", 500);
+	}
+#endif
+}
+
+void perform_mc(Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset, int block_size_x, int block_size_y, int curr_mb_field)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;  
+	seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+	Slice *currSlice = currMB->p_Slice;
+
+	static const int mv_mul = 16; // 4 * 4
+
+	int i4   = currMB->block_x + i;
+	int j4   = currMB->block_y + j;
+	int ioff = (i << 2);
+	int joff = (j << 2);         
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+
+
+		get_block_luma(currMB, pl, list, i4, currMB->block_y_aff + j, mv_array, block_size_x, block_size_y, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]); 
+
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1;
+			}
+
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+
+			weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], block_size_y, block_size_x, alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+
+			int ioff_cr = ioff >> 1;
+			int joff_cr = (p_Vid->mb_cr_size_y == MB_BLOCK_SIZE) ? joff : joff >> 1;
+			int block_size_x_cr = block_size_x >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? block_size_y : block_size_y >> 1;
+
+			short mv_cr[2] = {mv_array[0], mv_array[1] +  + ((active_sps->chroma_format_idc == YUV420)? list->chroma_vector_adjustment : 0) };
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff + j, mv_cr, block_size_x_cr, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			for(uv=0;uv<2;uv++)
+			{
+				if (currSlice->apply_weights)
+				{
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv + 1][joff_cr][ioff_cr], block_size_y_cr, block_size_x_cr, alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+	}
+	else
+	{
+		//===== BI-PREDICTION =====
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+
+		short l0_refframe = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l0_ref_idx  = l0_refframe;
+		short l1_refframe = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		short l1_ref_idx  = l1_refframe;
+
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+
+		get_block_luma(currMB, pl, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, l0_mv_array, block_size_x, block_size_y, tmp_block_l0[0]);  
+		get_block_luma(currMB, pl, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, l1_mv_array, block_size_x, block_size_y, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]);  
+
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			weighted_bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], block_size_y, block_size_x, alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], block_size_y, block_size_x); 
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+
+			int ioff_cr = ioff >> 1;
+			int joff_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? joff : joff >> 1;
+			int block_size_x_cr = block_size_x >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? block_size_y : block_size_y >> 1;
+
+			int vec1_y_cr = currMB->block_y_aff + j + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0);
+			int vec2_y_cr = currMB->block_y_aff + j + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0);
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1] +  ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0) };
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1] +  ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0) };
+
+			get_block_chroma(currMB, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, mv_cr1, block_size_x_cr, block_size_y_cr, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+			get_block_chroma(currMB, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, mv_cr2, block_size_x_cr, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			for(uv=0;uv<2;uv++)
+			{
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					weighted_bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], block_size_y_cr, block_size_x_cr, alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], block_size_y_cr, block_size_x_cr);
+				}
+			}
+		}      
+	}
+}
+
+
+
+void perform_mc8x16(Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset,  int curr_mb_field)
+{ // Motion-compensated prediction for one 8-wide, 16-high block; pred_dir 0/1 selects a single list, 2 means bi-prediction.
+	VideoParameters *p_Vid = currMB->p_Vid;  
+	seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+	const int block_size_x=8;
+	const int block_size_y=16;
+	Slice *currSlice = currMB->p_Slice;
+	// NOTE(review): mv_mul is never referenced in this function.
+	static const int mv_mul = 16; // 4 * 4
+	// i/j are block offsets: added to the macroblock block position for motion lookup, multiplied by 4 to index mb_pred.
+	int i4   = currMB->block_x + i;
+	int j4   = currMB->block_y + j;
+	int ioff = (i << 2);
+	int joff = (j << 2);         
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+		// Fetch the reference block for plane pl, passing mb_pred[pl][joff][ioff] as the destination.
+		get_block_luma(currMB, pl, list, i4, currMB->block_y_aff + j, mv_array, block_size_x, block_size_y, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]); 
+		// Optional weighting of the block just fetched (only when the slice has apply_weights set).
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1; // field macroblock and the PPS weighting flags match the slice type: look up weights with ref_idx / 2
+			}
+			// Weight and offset of component 0 for this list and (possibly halved) reference index.
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+
+			weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], block_size_y, block_size_x, alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+			// Chroma offsets/sizes: horizontal always halved; vertical halved unless mb_cr_size_y equals MB_BLOCK_SIZE.
+			int ioff_cr = ioff >> 1;
+			int joff_cr = (p_Vid->mb_cr_size_y == MB_BLOCK_SIZE) ? joff : joff >> 1;
+			int block_size_x_cr = block_size_x >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? block_size_y : block_size_y >> 1;
+			// Chroma MV copy: vertical component gets list->chroma_vector_adjustment when the SPS format is YUV420 (the doubled '+' is a harmless unary plus).
+			short mv_cr[2] = {mv_array[0], mv_array[1] +  + ((active_sps->chroma_format_idc == YUV420)? list->chroma_vector_adjustment : 0) };
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff + j, mv_cr, block_size_x_cr, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			for(uv=0;uv<2;uv++)
+			{
+				if (currSlice->apply_weights)
+				{
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv + 1][joff_cr][ioff_cr], block_size_y_cr, block_size_x_cr, alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+	}
+	else
+	{
+		//===== BI-PREDICTION =====
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+		// *_refframe select the reference picture; *_ref_idx are separate copies that may be halved for weight lookup.
+		short l0_refframe = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l0_ref_idx  = l0_refframe;
+		short l1_refframe = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		short l1_ref_idx  = l1_refframe;
+
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+		// LIST_0 prediction goes to scratch block tmp_block_l0[0]; LIST_1 prediction goes to mb_pred.
+		get_block_luma(currMB, pl, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, l0_mv_array, block_size_x, block_size_y, tmp_block_l0[0]);  
+		get_block_luma(currMB, pl, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, l1_mv_array, block_size_x, block_size_y, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]);  
+
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+			// Component 0 weights for both lists; the offset is the rounded average of the two lists' offsets.
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			weighted_bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], block_size_y, block_size_x, alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], block_size_y, block_size_x); 
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+
+			int ioff_cr = ioff >> 1;
+			int joff_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? joff : joff >> 1;
+			int block_size_x_cr = block_size_x >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? block_size_y : block_size_y >> 1;
+			// NOTE(review): vec1_y_cr and vec2_y_cr are computed but not used below.
+			int vec1_y_cr = currMB->block_y_aff + j + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0);
+			int vec2_y_cr = currMB->block_y_aff + j + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0);
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1] +  ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0) };
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1] +  ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0) };
+			// LIST_0 chroma goes to the two scratch blocks at offset (0, 0); LIST_1 chroma goes to mb_pred[1]/mb_pred[2] at (ioff_cr, joff_cr).
+			get_block_chroma(currMB, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, mv_cr1, block_size_x_cr, block_size_y_cr, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+			get_block_chroma(currMB, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, mv_cr2, block_size_x_cr, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			for(uv=0;uv<2;uv++)
+			{
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+					// l0_ref_idx/l1_ref_idx keep any halving done in the luma weighting branch above.
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					weighted_bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], block_size_y_cr, block_size_x_cr, alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], block_size_y_cr, block_size_x_cr);
+				}
+			}
+		}      
+	}
+}
+
+void perform_mc16x8(Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset, int curr_mb_field)
+{ // Motion-compensated prediction for one 16-wide, 8-high block; pred_dir 0/1 selects a single list, 2 means bi-prediction.
+	VideoParameters *p_Vid = currMB->p_Vid;  
+	seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+	Slice *currSlice = currMB->p_Slice;
+	// i/j are block offsets: added to the macroblock block position for motion lookup, multiplied by 4 to index mb_pred.
+	int i4   = currMB->block_x + i;
+	int j4   = currMB->block_y + j;
+	int ioff = (i << 2);
+	int joff = (j << 2);
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+		// Fetch the reference block (width 16, height 8) for plane pl, passing mb_pred[pl][joff][ioff] as the destination.
+		get_block_luma(currMB, pl, list, i4, currMB->block_y_aff + j, mv_array, 16, 8, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]); 
+		// Optional weighting of the block just fetched (only when the slice has apply_weights set).
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1; // field macroblock and the PPS weighting flags match the slice type: look up weights with ref_idx / 2
+			}
+			// Weight and offset of component 0 for this list and (possibly halved) reference index.
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+			// Size-specific helper: no block dimensions are passed, unlike weighted_mc_prediction.
+			opt_weighted_mc_prediction16x8((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+			// Chroma offsets/sizes: horizontal always halved; vertical halved unless mb_cr_size_y equals MB_BLOCK_SIZE.
+			int ioff_cr = ioff >> 1;
+			int joff_cr = (p_Vid->mb_cr_size_y == MB_BLOCK_SIZE) ? joff : joff >> 1;
+			int block_size_x_cr = 16 >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? 8 : 8 >> 1;
+			// Chroma MV copy: vertical component gets list->chroma_vector_adjustment when the SPS format is YUV420 (the doubled '+' is a harmless unary plus).
+			short mv_cr[2] = {mv_array[0], mv_array[1] +  + ((active_sps->chroma_format_idc == YUV420)? list->chroma_vector_adjustment : 0) };
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff + j, mv_cr, block_size_x_cr, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			for(uv=0;uv<2;uv++)
+			{
+				if (currSlice->apply_weights)
+				{
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv + 1][joff_cr][ioff_cr], block_size_y_cr, block_size_x_cr, alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+	}
+	else
+	{
+		//===== BI-PREDICTION =====
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+		// *_refframe select the reference picture; *_ref_idx are separate copies that may be halved for weight lookup.
+		short l0_refframe = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l0_ref_idx  = l0_refframe;
+		short l1_refframe = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		short l1_ref_idx  = l1_refframe;
+
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+		// LIST_0 prediction goes to scratch block tmp_block_l0[0]; LIST_1 prediction goes to mb_pred.
+		get_block_luma(currMB, pl, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, l0_mv_array, 16, 8, tmp_block_l0[0]);  
+		get_block_luma(currMB, pl, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, l1_mv_array, 16, 8, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]);  
+
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+			// Component 0 weights for both lists; the offset is the rounded average of the two lists' offsets.
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			opt_weighted_bi_prediction16x8((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], 8, 16); // size arguments are height (8) then width (16)
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+
+			int ioff_cr = ioff >> 1;
+			int joff_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? joff : joff >> 1;
+			int block_size_x_cr = 16 >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? 8 : 8 >> 1;
+			// NOTE(review): vec1_y_cr and vec2_y_cr are computed but not used below.
+			int vec1_y_cr = currMB->block_y_aff + j + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0);
+			int vec2_y_cr = currMB->block_y_aff + j + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0);
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1] +  ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0) };
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1] +  ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0) };
+			// LIST_0 chroma goes to the two scratch blocks at offset (0, 0); LIST_1 chroma goes to mb_pred[1]/mb_pred[2] at (ioff_cr, joff_cr).
+			get_block_chroma(currMB, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, mv_cr1, block_size_x_cr, block_size_y_cr, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+			get_block_chroma(currMB, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, mv_cr2, block_size_x_cr, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			for(uv=0;uv<2;uv++)
+			{
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+					// l0_ref_idx/l1_ref_idx keep any halving done in the luma weighting branch above.
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					weighted_bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], block_size_y_cr, block_size_x_cr, alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], block_size_y_cr, block_size_x_cr);
+				}
+			}
+		}      
+	}
+}
+
+
+static void __forceinline perform_mc8x8_YUV420(Macroblock *currMB, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset, int curr_mb_field)
+{ // 8x8 motion-compensated prediction with fixed PLANE_Y luma and 4x4 chroma blocks; pred_dir 0/1 selects a single list, 2 means bi-prediction.
+	VideoParameters *p_Vid = currMB->p_Vid;  
+
+	Slice *currSlice = currMB->p_Slice;
+	// i/j are block offsets: added to the macroblock block position for motion lookup, multiplied by 4 to index mb_pred.
+	int i4   = currMB->block_x + i;
+	int j4   = currMB->block_y + j;
+	int ioff = (i << 2);
+	int joff = (j << 2);         
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+		// Fetch the 8x8 luma reference block, passing mb_pred[0][joff][ioff] as the destination.
+		get_block_luma(currMB, PLANE_Y, list, i4, currMB->block_y_aff + j, mv_array, 8, 8, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[0][joff][ioff]); 
+		// Optional weighting of the block just fetched (only when the slice has apply_weights set).
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1; // field macroblock and the PPS weighting flags match the slice type: look up weights with ref_idx / 2
+			}
+			// Weight and offset of component 0 for this list and (possibly halved) reference index.
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+
+			opt_weighted_mc_prediction8x8((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[0][joff][ioff], alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+		// Chroma is processed unconditionally here (no chroma_format_idc test in this function).
+		{ 
+			int uv;
+			// Both chroma offsets are half the luma offsets.
+			int ioff_cr = ioff >> 1;
+			int joff_cr = joff >> 1;
+			// The reference picture's chroma_vector_adjustment is always added to the vertical MV component.
+			short mv_cr[2] = {mv_array[0], mv_array[1] + list->chroma_vector_adjustment };
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff + j, mv_cr, 4, 4, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+
+			for(uv=0;uv<2;uv++)
+			{
+
+				if (currSlice->apply_weights)
+				{
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv + 1][joff_cr][ioff_cr], 4, 4, alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+	}
+	else
+	{
+		//===== BI-PREDICTION =====
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+
+		short l0_ref_idx = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l1_ref_idx = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		// Reference pictures are selected with the unmodified indices; l0_ref_idx/l1_ref_idx may be halved later for weight lookup.
+		StorablePicture *ref_image0 = p_Vid->listX[LIST_0 + list_offset][l0_ref_idx];
+		StorablePicture *ref_image1 = p_Vid->listX[LIST_1 + list_offset][l1_ref_idx];
+
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+		// Luma: generic get_block_luma path before recovery_poc or when IS_INDEPENDENT(p_Vid); otherwise the IPP interpolation path.
+		if (p_Vid->framepoc < p_Vid->recovery_poc || IS_INDEPENDENT(p_Vid))
+		{
+			get_block_luma(currMB, PLANE_Y, ref_image0, i4, currMB->block_y_aff + j, l0_mv_array, 8, 8, tmp_block_l0[0]);  
+			get_block_luma(currMB, PLANE_Y, ref_image1, i4, currMB->block_y_aff + j, l1_mv_array, 8, 8, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[0][joff][ioff]);  
+		}
+		else
+		{
+			IppVCInterpolateBlock_8u block_data;
+			// First call: LIST_0 reference, destination is the scratch block tmp_block_l0[0].
+			block_data.pSrc[0] = ref_image0->imgY->base_address;
+			block_data.srcStep = ref_image0->imgY->stride;
+			block_data.pDst[0] = (Ipp8u *)(tmp_block_l0[0]);
+			block_data.dstStep = sizeof(tmp_block_l0[0][0]);
+			block_data.sizeFrame.width = dec_picture->size_x;
+			block_data.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y >> 1): dec_picture->size_y; // half height when this MB's mb_field entry is set
+			block_data.sizeBlock.width = 8;
+			block_data.sizeBlock.height = 8;
+			block_data.pointBlockPos.x = i4 << 2;
+			block_data.pointBlockPos.y = (currMB->block_y_aff + j) << 2;
+			block_data.pointVector.x = l0_mv_array[0];
+			block_data.pointVector.y = l0_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+			block_data.pSrc[0] = ref_image1->imgY->base_address; // second call: LIST_1 reference; only source, destination and vector change
+			block_data.srcStep = ref_image1->imgY->stride;
+			block_data.pDst[0] = &currSlice->mb_pred[0][joff][ioff];
+			block_data.pointVector.x = l1_mv_array[0];
+			block_data.pointVector.y = l1_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+		}
+		// Combine the two luma predictions: weighted when apply_weights is set, plain bi_prediction otherwise.
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+			// Component 0 weights for both lists; the offset is the rounded average of the two lists' offsets.
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			opt_weighted_bi_prediction8x8((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[0][joff][ioff], tmp_block_l0[0], alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[0][joff][ioff], tmp_block_l0[0], 8, 8); 
+		}
+
+		{
+			int uv;
+
+			int ioff_cr = ioff >> 1;
+			int joff_cr = joff >> 1;
+			// Chroma MVs: each list's vertical component gets its reference picture's chroma_vector_adjustment.
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1] +  ref_image0->chroma_vector_adjustment};
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1] +  ref_image1->chroma_vector_adjustment};
+			// NOTE(review): unlike the luma test above, this condition does not include IS_INDEPENDENT(p_Vid) - confirm this is intended.
+			if (p_Vid->framepoc < p_Vid->recovery_poc)
+			{
+				get_block_chroma(currMB, ref_image0, i4, currMB->block_y_aff + j, mv_cr1, 4, 4, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+				get_block_chroma(currMB, ref_image1, i4, currMB->block_y_aff + j, mv_cr2, 4, 4, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+			}
+			else
+			{
+				IppVCInterpolateBlock_8u block_data;
+				block_data.pSrc[0] = ref_image0->imgUV[0]->base_address; // first call: LIST_0 chroma planes into the two scratch blocks
+				block_data.pSrc[1] = ref_image0->imgUV[1]->base_address;
+				block_data.srcStep = ref_image0->imgUV[0]->stride;
+				block_data.pDst[0] = (Ipp8u *)(tmp_block_l0[0]);
+				block_data.pDst[1] = (Ipp8u *)(tmp_block_l0[1]);
+				block_data.dstStep = sizeof(tmp_block_l0[0][0]);
+				block_data.sizeFrame.width = dec_picture->size_x_cr;
+				block_data.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y_cr >> 1): dec_picture->size_y_cr; // half height when this MB's mb_field entry is set
+				block_data.sizeBlock.width = 4;
+				block_data.sizeBlock.height = 4;
+				block_data.pointBlockPos.x = i4<<1;
+				block_data.pointVector.x = mv_cr1[0]; 
+				block_data.pointVector.y = mv_cr1[1];
+				block_data.pointBlockPos.y = (currMB->block_y_aff + j)<<1;
+				ippiInterpolateChromaBlock_H264_8u_P2R(&block_data);
+				block_data.pSrc[0] = ref_image1->imgUV[0]->base_address; // second call: LIST_1 chroma planes into mb_pred[1]/mb_pred[2]
+				block_data.pSrc[1] = ref_image1->imgUV[1]->base_address;
+				block_data.srcStep = ref_image1->imgUV[0]->stride;
+				block_data.pDst[0] = &currSlice->mb_pred[1][joff_cr][ioff_cr];
+				block_data.pDst[1] = &currSlice->mb_pred[2][joff_cr][ioff_cr];
+				block_data.pointVector.x = mv_cr2[0]; 
+				block_data.pointVector.y = mv_cr2[1];
+				ippiInterpolateChromaBlock_H264_8u_P2R(&block_data);
+			}
+			// Combine the two chroma predictions per plane using the fixed-size 4x4 helpers.
+			for(uv=0;uv<2;uv++)
+			{
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+					// l0_ref_idx/l1_ref_idx keep any halving done in the luma weighting branch above.
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					weighted_bi_prediction4x4((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction4x4_mmx((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv]);
+				}
+			}
+		}
+	}
+}
+
+void perform_mc8x8(Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int i, int j, int list_offset, int curr_mb_field)
+{
+	if (dec_picture->chroma_format_idc == YUV420)
+	{
+		perform_mc8x8_YUV420(currMB, dec_picture, pred_dir, i, j, list_offset, curr_mb_field);
+	}
+	else
+	{
+	VideoParameters *p_Vid = currMB->p_Vid;  
+	seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+	Slice *currSlice = currMB->p_Slice;
+
+	static const int mv_mul = 16; // 4 * 4
+
+	int i4   = currMB->block_x + i;
+	int j4   = currMB->block_y + j;
+	int ioff = (i << 2);
+	int joff = (j << 2);         
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+
+		get_block_luma(currMB, pl, list, i4, currMB->block_y_aff + j, mv_array, 8, 8, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]); 
+
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1;
+			}
+
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+
+			opt_weighted_mc_prediction8x8((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+
+		if ((dec_picture->chroma_format_idc != YUV400) && (dec_picture->chroma_format_idc != YUV444) ) 
+		{ // YUV420 or YUV422
+			int uv;
+
+			int ioff_cr = ioff >> 1;
+			int joff_cr = (p_Vid->mb_cr_size_y == MB_BLOCK_SIZE) ? joff : joff >> 1;
+			int block_size_y_cr = p_Vid->mb_cr_size_y == MB_BLOCK_SIZE ? 8 : 4;
+
+			short mv_cr[2] = {mv_array[0], mv_array[1] };
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff + j, mv_cr, 4, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+
+			for(uv=0;uv<2;uv++)
+			{
+
+				if (currSlice->apply_weights)
+				{
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					weighted_mc_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv + 1][joff_cr][ioff_cr], block_size_y_cr, 4, alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+	}
+	else
+	{
+		//===== BI-PREDICTION =====
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+
+		short l0_refframe = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l0_ref_idx  = l0_refframe;
+		short l1_refframe = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		short l1_ref_idx  = l1_refframe;
+		
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+
+		if (p_Vid->framepoc < p_Vid->recovery_poc || IS_INDEPENDENT(p_Vid) || pl!=PLANE_Y)
+		{
+			get_block_luma(currMB, pl, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, l0_mv_array, 8, 8, tmp_block_l0[0]);  
+			get_block_luma(currMB, pl, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, l1_mv_array, 8, 8, (h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff]);  
+		}
+		else
+		{
+			VideoImage *cur_imgY = p_Vid->listX[LIST_0 + list_offset][l0_refframe]->imgY;
+			IppVCInterpolateBlock_8u block_data;
+
+			block_data.pSrc[0] = cur_imgY->base_address;
+			block_data.srcStep = cur_imgY->stride;
+			block_data.pDst[0] = (Ipp8u *)(tmp_block_l0[0]);
+			block_data.dstStep = sizeof(tmp_block_l0[0][0]);
+			block_data.sizeFrame.width = dec_picture->size_x;
+			block_data.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y >> 1): dec_picture->size_y;
+			block_data.sizeBlock.width = 8;
+			block_data.sizeBlock.height = 8;
+			block_data.pointBlockPos.x = i4 << 2;
+			block_data.pointBlockPos.y = (currMB->block_y_aff + j) << 2;
+			block_data.pointVector.x = l0_mv_array[0];
+			block_data.pointVector.y = l0_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+			cur_imgY = p_Vid->listX[LIST_1 + list_offset][l1_refframe]->imgY;
+			block_data.pSrc[0] = cur_imgY->base_address;
+			block_data.srcStep = cur_imgY->stride;
+			block_data.pDst[0] = &currSlice->mb_pred[pl][joff][ioff];
+			block_data.pointVector.x = l1_mv_array[0];
+			block_data.pointVector.y = l1_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+		}
+
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			opt_weighted_bi_prediction8x8((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[pl][joff][ioff], tmp_block_l0[0], 8, 8); 
+		}
+
+		if (dec_picture->chroma_format_idc == YUV422)
+		{
+			int uv;
+			int ioff_cr = ioff >> 1;
+			int joff_cr = joff;
+
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1]};
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1]};
+
+			get_block_chroma(currMB, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff + j, mv_cr1, 4, 8, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+			get_block_chroma(currMB, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff + j, mv_cr2, 4, 8, currSlice->mb_pred[1], currSlice->mb_pred[2], ioff_cr, joff_cr);
+
+			for(uv=0;uv<2;uv++)
+			{
+
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					weighted_bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], 8, 4, alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction((h264_imgpel_macroblock_row_t *)&currSlice->mb_pred[uv+1][joff_cr][ioff_cr], tmp_block_l0[uv], 8, 4);
+				}
+			}
+		}
+	}
+	}
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Builds the inter prediction of a whole 16x16 macroblock: the 16x16
+ *    luma block goes to currSlice->mb_pred[0] and the two 8x8 chroma
+ *    blocks go to currSlice->mb_pred[1] and currSlice->mb_pred[2].
+ *    perform_mc16x16() calls this when chroma_format_idc is YUV420.
+ *
+ * \param currMB
+ *    macroblock being predicted; supplies p_Vid, p_Slice and block position
+ * \param dec_picture
+ *    picture whose motion field (mv / ref_idx) is read
+ * \param pred_dir
+ *    0 or 1 selects single-list prediction from that list,
+ *    2 selects bi-prediction from both lists (asserted to be <= 2)
+ * \param list_offset
+ *    added to the list index when indexing p_Vid->listX
+ * \param curr_mb_field
+ *    when non-zero and the weighting mode test below matches, the
+ *    reference indices used for the weight lookup are halved
+ ************************************************************************
+ */
+static void __forceinline perform_mc16x16_YUV420(Macroblock *currMB, StorablePicture *dec_picture, int pred_dir, int list_offset, int curr_mb_field)
+{
+	VideoParameters *p_Vid = currMB->p_Vid;  
+	// NOTE(review): active_sps and mv_mul are not referenced anywhere in this function.
+	seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+	Slice *currSlice = currMB->p_Slice;
+
+	static const int mv_mul = 16; // 4 * 4
+
+	// Position used to index the motion field.
+	int i4   = currMB->block_x;
+	int j4   = currMB->block_y;
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		// ref_idx selects the reference picture; ref_idx_wp selects the weights and may be halved below.
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+
+		// Luma: 16x16 block fetched straight into the prediction buffer.
+		get_block_luma(currMB, PLANE_Y, list, i4, currMB->block_y_aff, mv_array, 16, 16, currSlice->mb_pred[0]); 
+
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			// Field macroblock with weighted_pred_flag (P/SP slice) or weighted_bipred_idc==1 (B slice): halve the weight index.
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1;
+			}
+
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+
+			// Weighting is applied in place on the luma prediction.
+			opt_weighted_mc_prediction16x16(currSlice->mb_pred[0], alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+		{
+			int uv;
+			// Chroma vector: vertical component is offset by the reference picture's chroma_vector_adjustment.
+			short mv_cr[2] = {mv_array[0], mv_array[1] +  list->chroma_vector_adjustment };
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff, mv_cr, 8, 8, currSlice->mb_pred[1], currSlice->mb_pred[2], 0, 0);
+
+			// Without weights the fetched chroma is already the final prediction.
+			for(uv=0;uv<2;uv++)
+			{
+				if (currSlice->apply_weights)
+				{
+					// Component index uv+1 selects the chroma weights (0 is luma).
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					opt_weighted_mc_prediction8x8(currSlice->mb_pred[uv + 1], alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+		}
+	else
+	{
+		//===== BI-PREDICTION =====
+		// Scratch macroblocks for the list-0 prediction; list-1 is written directly into mb_pred.
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+
+		// *_refframe index the reference lists; *_ref_idx index the weight tables and may be halved below.
+		short l0_refframe = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l0_ref_idx  = l0_refframe;
+		short l1_refframe = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		short l1_ref_idx  = l1_refframe;
+
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+
+		// Before the recovery point, or when IS_INDEPENDENT holds, use get_block_luma; otherwise call IPP directly.
+		if (p_Vid->framepoc < p_Vid->recovery_poc || IS_INDEPENDENT(p_Vid))
+		{
+			get_block_luma(currMB, PLANE_Y, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff, l0_mv_array, 16, 16, tmp_block_l0[0]);  
+			get_block_luma(currMB, PLANE_Y, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff, l1_mv_array, 16, 16, currSlice->mb_pred[0]);
+		}
+		else
+		{
+			VideoImage *cur_imgY = p_Vid->listX[LIST_0 + list_offset][l0_refframe]->imgY;
+			IppVCInterpolateBlock_8u block_data;
+
+			// First interpolation: list-0 reference into the scratch block.
+			block_data.pSrc[0] = cur_imgY->base_address;
+			block_data.srcStep = cur_imgY->stride;
+			block_data.pDst[0] = (Ipp8u *)(tmp_block_l0[0]);
+			// Destination step is the size of one row of the macroblock buffer.
+			block_data.dstStep = sizeof(tmp_block_l0[0][0]);
+			block_data.sizeFrame.width = dec_picture->size_x;
+			// A field macroblock sees only half of the picture rows.
+			block_data.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y >> 1): dec_picture->size_y;
+			block_data.sizeBlock.width = 16;
+			block_data.sizeBlock.height = 16;
+			// Block coordinates are multiplied by 4 (presumably 4x4-block units to pixels - TODO confirm).
+			block_data.pointBlockPos.x = i4 << 2;
+			block_data.pointBlockPos.y = currMB->block_y_aff<< 2;
+			block_data.pointVector.x = l0_mv_array[0];
+			block_data.pointVector.y = l0_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+			// Second interpolation reuses block_data; only source, destination and vector change.
+			cur_imgY = p_Vid->listX[LIST_1 + list_offset][l1_refframe]->imgY;
+			block_data.pSrc[0] = cur_imgY->base_address;
+			block_data.srcStep = cur_imgY->stride;
+			block_data.pDst[0] = (Ipp8u *)(currSlice->mb_pred[0]);
+			block_data.pointVector.x = l1_mv_array[0];
+			block_data.pointVector.y = l1_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+		}
+
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			// list_offset is applied to the weight tables only when weighted_bipred_idc is 2.
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			// Combined offset: rounded average of the two per-list offsets.
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			// Bi-prediction passes the luma log2 denominator plus one.
+			opt_weighted_bi_prediction16x16(currSlice->mb_pred[0], tmp_block_l0[0], alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			// No explicit weights: combine the two predictions with bi_prediction().
+			bi_prediction(currSlice->mb_pred[0], tmp_block_l0[0], 16, 16); 
+		}
+		
+		{ 
+			int uv;
+
+			// Per-list chroma vectors, each adjusted by its own reference picture's chroma_vector_adjustment.
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1] + p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment };
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1] + p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment };
+
+			// The luma scratch blocks are reused for list-0 chroma; list-1 chroma goes to mb_pred[1] and mb_pred[2].
+			get_block_chroma(currMB, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff , mv_cr1, 8, 8, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+			get_block_chroma(currMB, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff , mv_cr2, 8, 8, currSlice->mb_pred[1], currSlice->mb_pred[2], 0, 0);
+
+			for(uv=0;uv<2;uv++)
+			{
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+					// l0_ref_idx / l1_ref_idx keep any halving applied in the luma weighting above.
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					opt_weighted_bi_prediction8x8(currSlice->mb_pred[uv+1], tmp_block_l0[uv], alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction(currSlice->mb_pred[uv+1], tmp_block_l0[uv], 8, 8);
+				}
+			}
+		}      
+		
+	}
+}
+
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Builds the 16x16 inter prediction of the current macroblock.
+ *    YUV420 pictures are forwarded to perform_mc16x16_YUV420(); for all
+ *    other chroma formats the luma-style prediction for plane pl is
+ *    written to currSlice->mb_pred[pl], and chroma (mb_pred[1] and
+ *    mb_pred[2]) is predicted here only when the format is YUV422.
+ *
+ * \param currMB
+ *    macroblock being predicted; supplies p_Vid, p_Slice and block position
+ * \param pl
+ *    colour plane passed to get_block_luma and used to index mb_pred
+ *    (not forwarded on the YUV420 path)
+ * \param dec_picture
+ *    picture whose motion field (mv / ref_idx) is read
+ * \param pred_dir
+ *    0 or 1 selects single-list prediction from that list,
+ *    2 selects bi-prediction from both lists (asserted to be <= 2)
+ * \param list_offset
+ *    added to the list index when indexing p_Vid->listX
+ * \param curr_mb_field
+ *    when non-zero and the weighting mode test below matches, the
+ *    reference indices used for the weight lookup are halved
+ ************************************************************************
+ */
+void perform_mc16x16(Macroblock *currMB, ColorPlane pl, StorablePicture *dec_picture, int pred_dir, int list_offset, int curr_mb_field)
+{
+	if (dec_picture->chroma_format_idc == YUV420)
+	{
+		// 4:2:0 has its own routine, which always works on PLANE_Y.
+		perform_mc16x16_YUV420(currMB, dec_picture, pred_dir, list_offset, curr_mb_field);
+	}
+	else
+	{
+	VideoParameters *p_Vid = currMB->p_Vid;  
+	seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+	Slice *currSlice = currMB->p_Slice;
+
+	// NOTE(review): mv_mul is not referenced anywhere in this function.
+	static const int mv_mul = 16; // 4 * 4
+
+	// Position used to index the motion field.
+	int i4   = currMB->block_x;
+	int j4   = currMB->block_y;
+
+	assert (pred_dir<=2);
+
+	if (pred_dir != 2)
+	{
+		//===== Single List Prediction =====
+		short       ref_idx = dec_picture->motion.motion[pred_dir][j4][i4].ref_idx;
+		// ref_idx selects the reference picture; ref_idx_wp selects the weights and may be halved below.
+		short       ref_idx_wp = ref_idx;
+		short      *mv_array = dec_picture->motion.motion[pred_dir][j4][i4].mv;
+		StorablePicture *list = p_Vid->listX[list_offset + pred_dir][ref_idx];
+
+		check_motion_vector_range(p_Vid, mv_array[0], mv_array[1]);
+
+		// 16x16 block of plane pl fetched straight into the prediction buffer.
+		get_block_luma(currMB, pl, list, i4, currMB->block_y_aff, mv_array, 16, 16, currSlice->mb_pred[pl]); 
+
+		if (currSlice->apply_weights)
+		{
+			int alpha_l0, wp_offset;
+			// Field macroblock with weighted_pred_flag (P/SP slice) or weighted_bipred_idc==1 (B slice): halve the weight index.
+			if (curr_mb_field && ((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))))
+			{
+				ref_idx_wp >>=1;
+			}
+
+			// Component 0 weights are used regardless of pl.
+			alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][0];
+			wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][0];
+
+			opt_weighted_mc_prediction16x16(currSlice->mb_pred[pl], alpha_l0, wp_offset, currSlice->luma_log2_weight_denom);
+		}
+
+		// Chroma is predicted here for 4:2:2 only: 8 wide, 16 high, no vertical vector adjustment.
+		if (dec_picture->chroma_format_idc == YUV422)
+		{
+			int uv;
+			short mv_cr[2] = {mv_array[0], mv_array[1]};
+			get_block_chroma(currMB, list, i4, currMB->block_y_aff, mv_cr, 8, 16, currSlice->mb_pred[1], currSlice->mb_pred[2], 0, 0);
+
+			// Without weights the fetched chroma is already the final prediction.
+			for(uv=0;uv<2;uv++)
+			{
+				if (currSlice->apply_weights)
+				{
+					// Component index uv+1 selects the chroma weights (0 is luma).
+					int alpha_l0  = currSlice->wp_weight[pred_dir][ref_idx_wp][uv + 1];
+					int wp_offset = currSlice->wp_offset[pred_dir][ref_idx_wp][uv + 1];
+
+					weighted_mc_prediction(currSlice->mb_pred[uv + 1], 16, 8, alpha_l0, wp_offset, currSlice->chroma_log2_weight_denom);
+				}
+			}
+		}
+	}
+	else
+	{
+		//===== BI-PREDICTION =====
+		// Scratch macroblocks for the list-0 prediction; list-1 is written directly into mb_pred.
+		__declspec(align(32)) h264_imgpel_macroblock_t tmp_block_l0[2];
+		short *l0_mv_array = dec_picture->motion.motion[LIST_0][j4][i4].mv;
+		short *l1_mv_array = dec_picture->motion.motion[LIST_1][j4][i4].mv;
+
+		// *_refframe index the reference lists; *_ref_idx index the weight tables and may be halved below.
+		short l0_refframe = dec_picture->motion.motion[LIST_0][j4][i4].ref_idx;
+		short l0_ref_idx  = l0_refframe;
+		short l1_refframe = dec_picture->motion.motion[LIST_1][j4][i4].ref_idx;
+		short l1_ref_idx  = l1_refframe;
+
+		check_motion_vector_range(p_Vid, l0_mv_array[0], l0_mv_array[1]);
+		check_motion_vector_range(p_Vid, l1_mv_array[0], l1_mv_array[1]);
+
+		// The direct IPP path below reads imgY only, so it is taken just for PLANE_Y after the recovery point
+		// when IS_INDEPENDENT does not hold; every other case goes through get_block_luma.
+		if (p_Vid->framepoc < p_Vid->recovery_poc || IS_INDEPENDENT(p_Vid) || pl!=PLANE_Y)
+		{
+			get_block_luma(currMB, pl, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff, l0_mv_array, 16, 16, tmp_block_l0[0]);  
+			get_block_luma(currMB, pl, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff, l1_mv_array, 16, 16, currSlice->mb_pred[pl]);
+		}
+		else
+		{
+			VideoImage *cur_imgY = p_Vid->listX[LIST_0 + list_offset][l0_refframe]->imgY;
+			IppVCInterpolateBlock_8u block_data;
+
+			// First interpolation: list-0 reference into the scratch block.
+			block_data.pSrc[0] = cur_imgY->base_address;
+			block_data.srcStep = cur_imgY->stride;
+			block_data.pDst[0] = (Ipp8u *)(tmp_block_l0[0]);
+			// Destination step is the size of one row of the macroblock buffer.
+			block_data.dstStep = sizeof(tmp_block_l0[0][0]);
+			block_data.sizeFrame.width = dec_picture->size_x;
+			// A field macroblock sees only half of the picture rows.
+			block_data.sizeFrame.height = (dec_picture->motion.mb_field[currMB->mbAddrX]) ? (dec_picture->size_y >> 1): dec_picture->size_y;
+			block_data.sizeBlock.width = 16;
+			block_data.sizeBlock.height = 16;
+			// Block coordinates are multiplied by 4 (presumably 4x4-block units to pixels - TODO confirm).
+			block_data.pointBlockPos.x = i4 << 2;
+			block_data.pointBlockPos.y = currMB->block_y_aff<< 2;
+			block_data.pointVector.x = l0_mv_array[0];
+			block_data.pointVector.y = l0_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+			// Second interpolation reuses block_data; only source, destination and vector change.
+			cur_imgY = p_Vid->listX[LIST_1 + list_offset][l1_refframe]->imgY;
+			block_data.pSrc[0] = cur_imgY->base_address;
+			block_data.srcStep = cur_imgY->stride;
+			block_data.pDst[0] = (Ipp8u *)(currSlice->mb_pred[pl]);
+			block_data.pointVector.x = l1_mv_array[0];
+			block_data.pointVector.y = l1_mv_array[1];
+			ippiInterpolateLumaBlock_H264_8u_P1R(&block_data);
+		}
+
+		if(currSlice->apply_weights)
+		{
+			int alpha_l0, alpha_l1, wp_offset;
+			// list_offset is applied to the weight tables only when weighted_bipred_idc is 2.
+			int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+			// This code existed in the original. Seems pointless but copying it here for reference and in case temporal direct breaks.
+			// if (mv_mode==0 && currSlice->direct_spatial_mv_pred_flag==0 ) l1_ref_idx=0;    
+			if (((p_Vid->active_pps->weighted_pred_flag&&(p_Vid->type==P_SLICE|| p_Vid->type == SP_SLICE))||
+				(p_Vid->active_pps->weighted_bipred_idc==1 && (p_Vid->type==B_SLICE))) && curr_mb_field)
+			{
+				l0_ref_idx >>=1;
+				l1_ref_idx >>=1;
+			}
+
+			alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][0];
+			// Combined offset: rounded average of the two per-list offsets.
+			wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][0] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][0] + 1) >>1);
+
+			// Bi-prediction passes the luma log2 denominator plus one.
+			opt_weighted_bi_prediction16x16(currSlice->mb_pred[pl], tmp_block_l0[0], alpha_l0, alpha_l1, wp_offset, (currSlice->luma_log2_weight_denom + 1));
+		}
+		else
+		{ 
+			// No explicit weights: combine the two predictions with bi_prediction().
+			bi_prediction(currSlice->mb_pred[pl], tmp_block_l0[0], 16, 16); 
+		}
+
+		if (dec_picture->chroma_format_idc == YUV422) 
+		{ // YUV422
+			int uv;
+
+			// Chroma block height comes from the video parameters.
+			int block_size_y_cr = p_Vid->mb_cr_size_y;
+
+			// NOTE(review): the adjustment is added only when active_sps->chroma_format_idc == 1, yet this branch
+			// runs only for YUV422 pictures; if the SPS and picture formats always agree the term is always 0 - confirm.
+			short mv_cr1[2] = {l0_mv_array[0], l0_mv_array[1] + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_0 + list_offset][l0_refframe]->chroma_vector_adjustment : 0) };
+			short mv_cr2[2] = {l1_mv_array[0], l1_mv_array[1] + ((active_sps->chroma_format_idc == 1)? p_Vid->listX[LIST_1 + list_offset][l1_refframe]->chroma_vector_adjustment : 0) };
+
+			// The luma scratch blocks are reused for list-0 chroma; list-1 chroma goes to mb_pred[1] and mb_pred[2].
+			get_block_chroma(currMB, p_Vid->listX[LIST_0 + list_offset][l0_refframe], i4, currMB->block_y_aff , mv_cr1, 8, block_size_y_cr, tmp_block_l0[0], tmp_block_l0[1], 0, 0);
+			get_block_chroma(currMB, p_Vid->listX[LIST_1 + list_offset][l1_refframe], i4, currMB->block_y_aff , mv_cr2, 8, block_size_y_cr, currSlice->mb_pred[1], currSlice->mb_pred[2], 0, 0);
+
+
+			for(uv=0;uv<2;uv++)
+			{
+				if(currSlice->apply_weights)
+				{
+					int wt_list_offset = (p_Vid->active_pps->weighted_bipred_idc==2)? list_offset : 0;
+
+					// l0_ref_idx / l1_ref_idx keep any halving applied in the luma weighting above.
+					int alpha_l0  =   currSlice->wbp_weight[LIST_0 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int alpha_l1  =   currSlice->wbp_weight[LIST_1 + wt_list_offset][l0_ref_idx][l1_ref_idx][uv + 1];
+					int wp_offset = ((currSlice->wp_offset [LIST_0 + wt_list_offset][l0_ref_idx][uv + 1] + currSlice->wp_offset[LIST_1 + wt_list_offset][l1_ref_idx][uv + 1] + 1) >>1);
+
+					weighted_bi_prediction(currSlice->mb_pred[uv+1], tmp_block_l0[uv], block_size_y_cr, 8, alpha_l0, alpha_l1, wp_offset, (currSlice->chroma_log2_weight_denom + 1));
+				}
+				else
+				{
+					bi_prediction(currSlice->mb_pred[uv+1], tmp_block_l0[uv], block_size_y_cr, 8);
+				}
+			}
+		}      
+	}
+	}
+}
+
diff --git a/Src/h264dec/ldecod/src/meminput.c b/Src/h264dec/ldecod/src/meminput.c
new file mode 100644
index 00000000..54465340
--- /dev/null
+++ b/Src/h264dec/ldecod/src/meminput.c
@@ -0,0 +1,134 @@
+#include "global.h"
+#include "meminput.h"
+
+/*!
+************************************************************************
+* \brief
+*    Allocates a zero-initialised memory_input_t and stores it in
+*    p_Vid->mem_input. Reports error code 100 through error() when the
+*    allocation fails.
+************************************************************************
+*/
+void malloc_mem_input(VideoParameters *p_Vid)
+{
+  memory_input_t *input = (memory_input_t *) calloc(1, sizeof(memory_input_t));
+
+  // The pointer is stored even on failure, exactly like a plain assignment.
+  p_Vid->mem_input = input;
+  if (input != NULL)
+    return;
+
+  snprintf(errortext, ET_SIZE, "Memory allocation for memory input failed");
+  error(errortext,100);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Releases the memory input owned by p_Vid and clears the pointer.
+************************************************************************
+*/
+void free_mem_input(VideoParameters *p_Vid)
+{
+  memory_input_t *input = p_Vid->mem_input;
+
+  // Detach first so p_Vid never holds a dangling pointer.
+  p_Vid->mem_input = NULL;
+  free(input);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Returns the byte at the current read position of the user buffer
+*    and advances the read position by one. No bounds check is made here.
+************************************************************************
+*/
+static inline uint8_t getfbyte(memory_input_t *mem_input)
+{
+   uint8_t value = mem_input->user_buffer[mem_input->user_buffer_read];
+
+   mem_input->user_buffer_read++;
+   return value;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Tests whether a start code begins at Buf. A start code is
+ *    zeros_in_startcode bytes of 0x00 followed by one 0x01 byte.
+ *
+ *  \return
+ *     1 if the start code is present                   \n
+ *     0 otherwise
+ *
+ *  \param Buf
+ *     pointer to the byte-stream position to test; zeros_in_startcode + 1
+ *     bytes may be read from it
+ *  \param zeros_in_startcode
+ *     number of 0x00 bytes expected before the 0x01 byte
+ ************************************************************************
+ */
+static inline int FindStartCode (unsigned char *Buf, int zeros_in_startcode)
+{
+  const unsigned char *cursor = Buf;
+  int remaining;
+
+  // Every leading byte has to be zero.
+  for (remaining = zeros_in_startcode; remaining > 0; remaining--)
+  {
+    if (*cursor != 0)
+      return 0;
+    cursor++;
+  }
+
+  // The byte right after the zero run must be 0x01.
+  return (*cursor == 1) ? 1 : 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Copies the pending user buffer (one NAL unit, without start code)
+ *    into nalu->buf, sets nalu->len and decodes the one-byte NAL header
+ *    into forbidden_bit, nal_reference_idc and nal_unit_type.
+ *    The user buffer is consumed by the call (its pointer is cleared)
+ *    whether or not the NAL unit is accepted.
+ *
+ * \return
+ *     1 if a NAL unit was delivered in nalu
+ *     0 if there is no pending buffer, or the NAL unit is dropped
+ *       (disposable while skip_b_frames is set, or non-IDR while resetting)
+ *    -1 if the buffer plus zero padding does not fit into nalu->buf
+ *
+ * \note
+ *    Other fields of nalu are left untouched here.
+ ************************************************************************
+ */
+int GetMemoryNALU (VideoParameters *p_Vid, NALU_t *nalu)
+{
+	// Bytes cleared after the payload; assumes memzero16 writes 16 bytes - TODO confirm against its definition.
+	enum { NALU_ZERO_PADDING = 16 };
+	memory_input_t *mem_input = p_Vid->mem_input;
+
+	if (!mem_input->user_buffer)
+		return 0;
+
+	// Never write past the NALU buffer: the payload and its padding must fit in max_size bytes.
+	// A negative size converts to a huge value and is rejected as well.
+	if ((size_t)mem_input->user_buffer_size + NALU_ZERO_PADDING > (size_t)nalu->max_size)
+	{
+		mem_input->user_buffer = 0; // consume the buffer so it is not offered again
+		return -1;
+	}
+
+	nalu->len = mem_input->user_buffer_size;
+	memcpy(nalu->buf, mem_input->user_buffer, nalu->len);
+	memzero16(nalu->buf+nalu->len); // add some extra 0's to the end
+
+	// NAL header byte: forbidden_zero_bit(1) | nal_ref_idc(2) | nal_unit_type(5)
+	nalu->forbidden_bit     = (*(nalu->buf) >> 7) & 1;
+	nalu->nal_reference_idc = (NalRefIdc) ((*(nalu->buf) >> 5) & 3);
+	nalu->nal_unit_type     = (NaluType) ((*(nalu->buf)) & 0x1f);
+	nalu->lost_packets = 0;
+	mem_input->user_buffer = 0;
+
+	// Non-reference NAL units are dropped while B-frame skipping is requested.
+	if (mem_input->skip_b_frames && nalu->nal_reference_idc == NALU_PRIORITY_DISPOSABLE)
+		return 0;
+
+	// While resetting, everything up to the next IDR is dropped.
+	if (mem_input->resetting && nalu->nal_unit_type != NALU_TYPE_IDR)
+		return 0;
+
+	mem_input->resetting = 0;
+
+	return 1;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Placeholder for opening the input. The memory input needs no
+ *    set-up here, so this function does nothing.
+ * \param p_Vid
+ *    unused
+ * \param fn
+ *    unused
+ * \return
+ *    none
+ ************************************************************************
+ */
+void OpenMemory(VideoParameters *p_Vid, const char *fn)
+{
+  (void)p_Vid;
+  (void)fn;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Placeholder for closing the input. Nothing is released here, so
+ *    this function does nothing.
+ * \param p_Vid
+ *    unused
+ ************************************************************************
+ */
+void CloseMemory(VideoParameters *p_Vid)
+{
+    (void)p_Vid;
+}
+
diff --git a/Src/h264dec/ldecod/src/nal.c b/Src/h264dec/ldecod/src/nal.c
new file mode 100644
index 00000000..73c39474
--- /dev/null
+++ b/Src/h264dec/ldecod/src/nal.c
@@ -0,0 +1,123 @@
+
+/*!
+ ************************************************************************
+ * \file  nal.c
+ *
+ * \brief
+ *    Converts Encapsulated Byte Sequence Packets (EBSP) to Raw Byte
+ *    Sequence Packets (RBSP), and then onto String Of Data Bits (SODB)
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Shankar L. Regunathan <shanre@microsoft.com>
+ *    - Tobias Oelbaum <oelbaum@drehvial.de>
+************************************************************************
+ */
+
+#include "contributors.h"
+#include "global.h"
+
+ /*!
+ ************************************************************************
+ * \brief
+ *    Converts RBSP to string of data bits by locating the last byte that
+ *    contains a set bit (the trailing stop bit). The stop bit itself is
+ *    kept in the buffer.
+ * \param streamBuffer
+ *          pointer to buffer containing data
+ *  \param last_byte_pos
+ *          number of bytes in the buffer (one past the last data byte)
+ * \return last_byte_pos
+ *          updated byte count with trailing all-zero bytes removed, or
+ *          0 if the buffer is empty or contains no set bit at all
+ *
+************************************************************************/
+
+int RBSPtoSODB(byte *streamBuffer, int last_byte_pos)
+{
+  // A byte holds a set bit exactly when it is non-zero, so stepping back over
+  // zero bytes finds the same position as a bit-by-bit search for the trailing 1.
+  // The bound check comes first so streamBuffer[-1] is never read.
+  while (last_byte_pos > 0 && streamBuffer[last_byte_pos - 1] == 0)
+    last_byte_pos--;
+
+  if (last_byte_pos <= 0)
+  {
+    printf(" Panic: All zero data sequence in RBSP \n");
+    return 0;
+  }
+
+  return(last_byte_pos);
+
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Converts Encapsulated Byte Sequence Packets to RBSP in place by
+*    dropping each emulation prevention byte (0x03 after two 0x00 bytes).
+*    Byte 0 (the NAL header) is never touched.
+* \param streamBuffer
+*    pointer to data stream; rewritten in place
+* \param end_bytepos
+*    size of data stream
+* \return
+*    new size of the data stream, or -1 when an emulation prevention
+*    byte is followed by a value greater than 0x03. Conversion stops
+*    early, returning the size so far, when 0x000000, 0x000001 or
+*    0x000002 is met.
+************************************************************************/
+
+// TODO: benski> optimize using BitScanReverse
+int EBSPtoRBSP(byte *streamBuffer, int end_bytepos)
+{
+  const int payload_start = 1;   // skip the NAL header byte
+  int src, dst;
+  int zero_run = 0;              // consecutive 0x00 bytes copied so far
+
+  if(end_bytepos < payload_start)
+    return end_bytepos;
+
+  dst = payload_start;
+  src = payload_start;
+  while(src < end_bytepos)
+  {
+    byte value;
+
+    if(zero_run == ZEROBYTES_SHORTSTARTCODE)
+    {
+      //in NAL unit, 0x000000, 0x000001 or 0x000002 shall not occur at any byte-aligned position
+      if(streamBuffer[src] < 0x03)
+        return dst;
+
+      if(streamBuffer[src] == 0x03)
+      {
+        //a trailing 0x000003 (cabac_zero_word) ends the unit; the 0x03 is discarded
+        if(src == end_bytepos-1)
+          return dst;
+        //the byte following 0x000003 must not exceed 0x03
+        if(streamBuffer[src+1] > 0x03)
+          return -1;
+
+        //skip the emulation prevention byte
+        src++;
+        zero_run = 0;
+      }
+    }
+
+    value = streamBuffer[src];
+    streamBuffer[dst] = value;
+    zero_run = (value == 0x00) ? zero_run + 1 : 0;
+    dst++;
+    src++;
+  }
+
+  return dst;
+}
diff --git a/Src/h264dec/ldecod/src/nalu.c b/Src/h264dec/ldecod/src/nalu.c
new file mode 100644
index 00000000..e70617e3
--- /dev/null
+++ b/Src/h264dec/ldecod/src/nalu.c
@@ -0,0 +1,162 @@
+
+/*!
+ ************************************************************************
+ * \file  nalu.c
+ *
+ * \brief
+ *    Decoder NALU support functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Stephan Wenger   <stewe@cs.tu-berlin.de>
+ ************************************************************************
+ */
+
+#include "global.h"
+#include "nalu.h"
+#include "memalloc.h"
+#include "meminput.h"
+
+/*!
+*************************************************************************************
+* \brief
+*    Initialize bitstream reading structure
+*
+* \param
+*    p_Vid: Imageparameter information
+* \param
+*    filemode: 
+*
+*************************************************************************************
+*/
+void OpenMemory(VideoParameters *p_Vid, const char *fn);
+void CloseMemory(VideoParameters *p_Vid);
+int GetMemoryNALU (VideoParameters *p_Vid, NALU_t *nalu);
+
+void initBitsFile (VideoParameters *p_Vid)
+{
+		NALU_t *nalu_buffer;
+
+		// Memory-input state first, then the NALU buffer used for reading.
+		malloc_mem_input(p_Vid);
+		nalu_buffer = AllocNALU(MAX_CODED_FRAME_SIZE);
+		p_Vid->nalu = nalu_buffer;
+}
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Converts the payload of a NALU to RBSP in place and stores the
+ *    resulting length in nalu->len
+ *
+ * \param
+ *    nalu: NAL unit whose buf/len are converted
+ *
+ * \return
+ *    value of nalu->len after the conversion (the result of EBSPtoRBSP)
+ *************************************************************************************
+ */
+
+static int NALUtoRBSP (NALU_t *nalu)
+{
+  int rbsp_len;
+
+  assert (nalu != NULL);
+
+  rbsp_len = EBSPtoRBSP (nalu->buf, nalu->len);
+  nalu->len = rbsp_len;
+
+  return nalu->len ;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Read the next NAL unit (with error handling)
+*
+* \param p_Vid
+*    decoder state holding the memory input
+* \param nalu
+*    NAL unit structure that receives the data
+*
+* \return
+*    0 if GetMemoryNALU delivered nothing, otherwise nalu->len after the
+*    EBSP to RBSP conversion. Failures are reported through error() with
+*    code 601 (read error), 602 (bad emulation prevention) or
+*    603 (forbidden_bit set).
+************************************************************************
+*/
+int read_next_nalu(VideoParameters *p_Vid, NALU_t *nalu)
+{
+  int ret;
+
+  ret = GetMemoryNALU(p_Vid, nalu);
+
+  if (ret < 0)
+  {
+    error ("Error while getting the next NALU, exit\n", 601);
+  }
+  if (ret == 0)
+  {
+    return 0;
+  }
+
+  //In some cases, zero_byte shall be present. If current NALU is a VCL NALU, we can't tell
+  //whether it is the first VCL NALU at this point, so only non-VCL NAL unit is checked here.
+  CheckZeroByteNonVCL(p_Vid, nalu);
+
+  // Strip emulation prevention bytes; a negative result marks a malformed unit.
+  ret = NALUtoRBSP(nalu);
+
+  if (ret < 0)
+    error ("Invalid startcode emulation prevention found.", 602);
+
+
+  // Got a NALU
+  if (nalu->forbidden_bit)
+  {
+    error ("Found NALU with forbidden_bit set, bit error?", 603);
+  }
+
+  return nalu->len;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Access-unit bookkeeping for a non-VCL NAL unit: updates
+ *    p_Vid->NALUCount and p_Vid->LastAccessUnitExists and works out
+ *    whether a zero_byte was required. A missing zero_byte is tolerated.
+ ************************************************************************
+ */
+void CheckZeroByteNonVCL(VideoParameters *p_Vid, NALU_t *nalu)
+{
+  const int type = nalu->nal_unit_type;
+  int zero_byte_required;
+  int may_start_access_unit;
+
+  //VCL NAL units (types 1..5) are not handled by this function
+  if(type >= 1 && type <= 5)
+    return;
+
+  //SPS and PPS always require a zero_byte
+  zero_byte_required = (type == NALU_TYPE_SPS || type == NALU_TYPE_PPS);
+
+  //NAL unit types that may begin a new access unit, according to 7.4.1.2.3
+  may_start_access_unit = (type == NALU_TYPE_AUD || type == NALU_TYPE_SPS ||
+                           type == NALU_TYPE_PPS || type == NALU_TYPE_SEI ||
+                           (type >= 13 && type <= 18));
+
+  if(may_start_access_unit && p_Vid->LastAccessUnitExists)
+  {
+    //deliver the last access unit to decoder and restart the count
+    p_Vid->LastAccessUnitExists = 0;
+    p_Vid->NALUCount = 0;
+  }
+
+  p_Vid->NALUCount++;
+
+  //the first NAL unit in an access unit requires a zero_byte as well
+  if(p_Vid->NALUCount == 1)
+    zero_byte_required = 1;
+
+  if(zero_byte_required && nalu->startcodeprefix_len == 3)
+  {
+    //zero_byte is missing; not a very serious problem, so nothing is reported and we do not exit
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Access-unit bookkeeping for a VCL NAL unit: updates
+ *    p_Vid->NALUCount, sets p_Vid->LastAccessUnitExists and works out
+ *    whether a zero_byte was required. A missing zero_byte is tolerated.
+ ************************************************************************
+ */
+void CheckZeroByteVCL(VideoParameters *p_Vid, NALU_t *nalu)
+{
+  const int type = nalu->nal_unit_type;
+  int zero_byte_required;
+
+  //only VCL NAL units (types 1..5) are handled by this function
+  if(type < 1 || type > 5)
+    return;
+
+  //a VCL NAL unit that follows a finished access unit restarts the count
+  if(p_Vid->LastAccessUnitExists)
+    p_Vid->NALUCount = 0;
+
+  p_Vid->NALUCount++;
+
+  //the first NAL unit of the new access unit requires a zero_byte
+  zero_byte_required = (p_Vid->NALUCount == 1);
+  p_Vid->LastAccessUnitExists = 1;
+
+  if(zero_byte_required && nalu->startcodeprefix_len == 3)
+  {
+    //zero_byte is missing; not a very serious problem, so nothing is reported and we do not exit
+  }
+}
diff --git a/Src/h264dec/ldecod/src/nalucommon.c b/Src/h264dec/ldecod/src/nalucommon.c
new file mode 100644
index 00000000..fe900722
--- /dev/null
+++ b/Src/h264dec/ldecod/src/nalucommon.c
@@ -0,0 +1,73 @@
+
+/*!
+ ************************************************************************
+ * \file  nalucommon.c
+ *
+ * \brief
+ *    Common NALU support functions
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Stephan Wenger   <stewe@cs.tu-berlin.de>
+ ************************************************************************
+ */
+
+#include "global.h"
+#include "nalu.h"
+#include "memalloc.h"
+#include <bfc/platform/types.h>
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Allocates a zeroed NALU structure together with a zero-filled,
+ *    32-byte aligned data buffer
+ *
+ * \param buffersize
+ *     size of NALU buffer in bytes; also stored in max_size
+ *
+ * \return
+ *    pointer to a NALU, or 0 if either allocation fails
+ *************************************************************************************
+ */
+NALU_t *AllocNALU(int buffersize)
+{
+  NALU_t *nalu = (NALU_t*)calloc (1, sizeof (NALU_t));
+
+  if (nalu == NULL)
+    return 0;
+
+  nalu->max_size = buffersize;
+  nalu->buf = (uint8_t *)_aligned_malloc(buffersize, 32);
+
+  if (nalu->buf != NULL)
+  {
+    memset(nalu->buf, 0, buffersize);
+    return nalu;
+  }
+
+  // Buffer allocation failed: give the header back as well.
+  free (nalu);
+  return 0;
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief
+ *    Frees a NALU together with its data buffer
+ *
+ * \param n
+ *    NALU to be freed; NULL is accepted and ignored
+ *
+ *************************************************************************************
+ */
+void FreeNALU(NALU_t *n)
+{
+  if (n == NULL)
+    return;
+
+  // The buffer is released with the aligned-allocation counterpart.
+  if (n->buf != NULL)
+  {
+    _aligned_free(n->buf);
+    n->buf=NULL;
+  }
+
+  free (n);
+}
diff --git a/Src/h264dec/ldecod/src/output.c b/Src/h264dec/ldecod/src/output.c
new file mode 100644
index 00000000..74576af0
--- /dev/null
+++ b/Src/h264dec/ldecod/src/output.c
@@ -0,0 +1,599 @@
+
+/*!
+ ************************************************************************
+ * \file output.c
+ *
+ * \brief
+ *    Output an image and Trace support
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Karsten Suehring               <suehring@hhi.de>
+ ************************************************************************
+ */
+
+#include "contributors.h"
+
+#include "global.h"
+#include "mbuffer.h"
+#include "image.h"
+#include "memalloc.h"
+#include "sei.h"
+
+static void write_out_picture(VideoParameters *p_Vid, StorablePicture *p); // forward declaration; defined further down in this file
+
+
+#if (PAIR_FIELDS_IN_OUTPUT)
+
+void clear_picture(VideoParameters *p_Vid, StorablePicture *p); // forward declaration; the definition sits outside this #if block
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Emit the buffered field picture, if any, and release the pending sample planes
+ * \param p_Vid
+ *    decoder state holding pending_output and pending_output_state
+ ************************************************************************
+ */
+void flush_pending_output(VideoParameters *p_Vid)
+{
+  if (p_Vid->pending_output_state != FRAME) // a field is buffered: hand it to the output path first
+  {
+    write_out_picture(p_Vid, p_Vid->pending_output);
+  }
+
+  if (p_Vid->pending_output->imgY)
+  {
+    free_mem2Dpel (p_Vid->pending_output->imgY);
+    p_Vid->pending_output->imgY=NULL;
+  }
+  if (p_Vid->pending_output->imgUV)
+  {
+    free_mem3Dpel (p_Vid->pending_output->imgUV);
+    p_Vid->pending_output->imgUV=NULL;
+  }
+
+  p_Vid->pending_output_state = FRAME; // FRAME doubles as the "nothing pending" marker (see write_picture)
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Writes out a storable picture
+ *    If the picture is a field, the output buffers the picture and tries
+ *    to pair it with the next field.
+ * \param p
+ *    Picture to be written
+ * \param real_structure
+ *    FRAME to output p as is, otherwise the field parity (TOP_FIELD or bottom) that p contributes
+ ************************************************************************
+ */
+void write_picture(VideoParameters *p_Vid, StorablePicture *p, int real_structure)
+{
+   int i, add;
+
+  if (real_structure==FRAME) // frames bypass pairing: flush whatever is pending, then emit
+  {
+    flush_pending_output(p_Vid);
+    write_out_picture(p_Vid, p);
+    return;
+  }
+  if (real_structure == p_Vid->pending_output_state) // same parity already buffered: flush it, then retry
+  {
+    flush_pending_output(p_Vid);
+    write_picture(p_Vid, p, real_structure);
+    return;
+  }
+
+  if (p_Vid->pending_output_state == FRAME) // nothing buffered yet: start a new pending frame from this field
+  {
+    p_Vid->pending_output->size_x = p->size_x;
+    p_Vid->pending_output->size_y = p->size_y;
+    p_Vid->pending_output->size_x_cr = p->size_x_cr;
+    p_Vid->pending_output->size_y_cr = p->size_y_cr;
+    p_Vid->pending_output->chroma_format_idc = p->chroma_format_idc;
+
+    p_Vid->pending_output->frame_mbs_only_flag = p->frame_mbs_only_flag;
+    p_Vid->pending_output->frame_cropping_flag = p->frame_cropping_flag;
+    if (p_Vid->pending_output->frame_cropping_flag)
+    {
+      p_Vid->pending_output->frame_cropping_rect_left_offset = p->frame_cropping_rect_left_offset;
+      p_Vid->pending_output->frame_cropping_rect_right_offset = p->frame_cropping_rect_right_offset;
+      p_Vid->pending_output->frame_cropping_rect_top_offset = p->frame_cropping_rect_top_offset;
+      p_Vid->pending_output->frame_cropping_rect_bottom_offset = p->frame_cropping_rect_bottom_offset;
+    }
+
+    get_mem2Dpel (&(p_Vid->pending_output->imgY), p_Vid->pending_output->size_y, p_Vid->pending_output->size_x);
+    get_mem3Dpel (&(p_Vid->pending_output->imgUV), 2, p_Vid->pending_output->size_y_cr, p_Vid->pending_output->size_x_cr);
+
+    clear_picture(p_Vid, p_Vid->pending_output); // NOTE(review): clear_picture writes through imgY->img[][] while this function indexes imgY[] directly - confirm this #if branch still builds
+
+    // copy first field
+    if (real_structure == TOP_FIELD)
+    {
+      add = 0;
+    }
+    else
+    {
+      add = 1;
+    }
+
+    for (i=0; i<p_Vid->pending_output->size_y; i+=2) // every second row, starting at row 'add'
+    {
+      memcpy(p_Vid->pending_output->imgY[(i+add)], p->imgY[(i+add)], p->size_x * sizeof(imgpel));
+    }
+    for (i=0; i<p_Vid->pending_output->size_y_cr; i+=2)
+    {
+      memcpy(p_Vid->pending_output->imgUV[0][(i+add)], p->imgUV[0][(i+add)], p->size_x_cr * sizeof(imgpel));
+      memcpy(p_Vid->pending_output->imgUV[1][(i+add)], p->imgUV[1][(i+add)], p->size_x_cr * sizeof(imgpel));
+    }
+    p_Vid->pending_output_state = real_structure;
+  }
+  else // a field of the other parity is buffered: try to complete the pair
+  {
+    if (  (p_Vid->pending_output->size_x!=p->size_x) || (p_Vid->pending_output->size_y!= p->size_y)
+       || (p_Vid->pending_output->frame_mbs_only_flag != p->frame_mbs_only_flag)
+       || (p_Vid->pending_output->frame_cropping_flag != p->frame_cropping_flag)
+       || ( p_Vid->pending_output->frame_cropping_flag &&
+            (  (p_Vid->pending_output->frame_cropping_rect_left_offset   != p->frame_cropping_rect_left_offset)
+             ||(p_Vid->pending_output->frame_cropping_rect_right_offset  != p->frame_cropping_rect_right_offset)
+             ||(p_Vid->pending_output->frame_cropping_rect_top_offset    != p->frame_cropping_rect_top_offset)
+             ||(p_Vid->pending_output->frame_cropping_rect_bottom_offset != p->frame_cropping_rect_bottom_offset)
+            )
+          )
+       )
+    { // geometry or cropping differs: the fields cannot be paired, so flush and start over with p
+      flush_pending_output(p_Vid);
+      write_picture (p_Vid, p, real_structure);
+      return;
+    }
+    // copy second field
+    if (real_structure == TOP_FIELD)
+    {
+      add = 0;
+    }
+    else
+    {
+      add = 1;
+    }
+
+    for (i=0; i<p_Vid->pending_output->size_y; i+=2)
+    {
+      memcpy(p_Vid->pending_output->imgY[(i+add)], p->imgY[(i+add)], p->size_x * sizeof(imgpel));
+    }
+    for (i=0; i<p_Vid->pending_output->size_y_cr; i+=2)
+    {
+      memcpy(p_Vid->pending_output->imgUV[0][(i+add)], p->imgUV[0][(i+add)], p->size_x_cr * sizeof(imgpel));
+      memcpy(p_Vid->pending_output->imgUV[1][(i+add)], p->imgUV[1][(i+add)], p->size_x_cr * sizeof(imgpel));
+    }
+
+		p_Vid->pending_output->time_code = p->time_code;
+    flush_pending_output(p_Vid);
+  }
+}
+
+#else
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Writes out a storable picture without doing any output modifications
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param p
+ *    Picture to be written
+ * \param real_structure
+ *    real picture structure; ignored by this variant
+ * \note
+ *    Compiled when PAIR_FIELDS_IN_OUTPUT is off; forwards to write_out_picture()
+ ************************************************************************
+ */
+static void write_picture(VideoParameters *p_Vid, StorablePicture *p, int real_structure)
+{
+  write_out_picture(p_Vid, p);
+}
+
+
+#endif
+
+/*!
+************************************************************************
+* \brief
+*    Hands a storable picture to out_storable_picture_add()
+*
+* \param p_Vid
+*      image decoding parameters for current picture
+* \param p
+*    Picture to be output; a null pointer is ignored
+* \note
+*    The raw-file writer under "#if 0" is compiled out
+************************************************************************
+*/
+static void write_out_picture(VideoParameters *p_Vid, StorablePicture *p)
+{
+#if 0 // legacy raw-file writer; excluded from the build
+  InputParameters *p_Inp = p_Vid->p_Inp;
+
+  static const int SubWidthC  [4]= { 1, 2, 2, 1};
+  static const int SubHeightC [4]= { 1, 2, 1, 1};
+
+  int crop_left, crop_right, crop_top, crop_bottom;
+  int symbol_size_in_bytes = (p_Vid->pic_unit_bitsize_on_disk >> 3);
+  Boolean rgb_output = (Boolean) (p_Vid->active_sps->vui_seq_parameters.matrix_coefficients==0);
+  unsigned char *buf;
+
+  int ret;
+
+  if (p->non_existing)
+    return;
+
+	printf("*** Outputting poc %d, frame_num %d, frame_poc %d, pic_num %d\n", p->poc, p->frame_num, p->frame_poc, p->pic_num);
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  // note: this tone-mapping is working for RGB format only. Sharp
+  if (p->seiHasTone_mapping && rgb_output)
+  {
+    //printf("output frame %d with tone model id %d\n",  p->frame_num, p->tone_mapping_model_id);
+    symbol_size_in_bytes = (p->tonemapped_bit_depth>8)? 2 : 1;
+    tone_map(p->imgY, p->tone_mapping_lut, p->size_x, p->size_y);
+    tone_map(p->imgUV[0], p->tone_mapping_lut, p->size_x_cr, p->size_y_cr);
+    tone_map(p->imgUV[1], p->tone_mapping_lut, p->size_x_cr, p->size_y_cr);
+  }
+#endif
+
+  if (p->frame_cropping_flag)
+  {
+    crop_left   = SubWidthC[p->chroma_format_idc] * p->frame_cropping_rect_left_offset;
+    crop_right  = SubWidthC[p->chroma_format_idc] * p->frame_cropping_rect_right_offset;
+    crop_top    = SubHeightC[p->chroma_format_idc]*( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_top_offset;
+    crop_bottom = SubHeightC[p->chroma_format_idc]*( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_bottom_offset;
+  }
+  else
+  {
+    crop_left = crop_right = crop_top = crop_bottom = 0;
+  }
+
+  //printf ("write frame size: %dx%d\n", p->size_x-crop_left-crop_right,p->size_y-crop_top-crop_bottom );
+  initOutput(p_Vid, symbol_size_in_bytes);
+
+  // KS: this buffer should actually be allocated only once, but this is still much faster than the previous version
+  buf = malloc (p->size_x*p->size_y*symbol_size_in_bytes);
+  if (NULL==buf)
+  {
+    no_mem_exit("write_out_picture: buf");
+  }
+
+  if(rgb_output)
+  {
+    crop_left   = p->frame_cropping_rect_left_offset;
+    crop_right  = p->frame_cropping_rect_right_offset;
+    crop_top    = ( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_top_offset;
+    crop_bottom = ( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_bottom_offset;
+
+    p_Vid->img2buf (p->imgUV[1], buf, p->size_x_cr, p->size_y_cr, symbol_size_in_bytes, crop_left, crop_right, crop_top, crop_bottom);
+    ret = write(p_out, buf, (p->size_y_cr-crop_bottom-crop_top)*(p->size_x_cr-crop_right-crop_left)*symbol_size_in_bytes);
+    if (ret != ((p->size_y_cr-crop_bottom-crop_top)*(p->size_x_cr-crop_right-crop_left)*symbol_size_in_bytes))
+    {
+      error ("write_out_picture: error writing to RGB file", 500);
+    }
+
+    if (p->frame_cropping_flag)
+    {
+      crop_left   = SubWidthC[p->chroma_format_idc] * p->frame_cropping_rect_left_offset;
+      crop_right  = SubWidthC[p->chroma_format_idc] * p->frame_cropping_rect_right_offset;
+      crop_top    = SubHeightC[p->chroma_format_idc]*( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_top_offset;
+      crop_bottom = SubHeightC[p->chroma_format_idc]*( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_bottom_offset;
+    }
+    else
+    {
+      crop_left = crop_right = crop_top = crop_bottom = 0;
+    }
+  }
+	// write Y
+  p_Vid->img2buf (p->imgY, buf, p->size_x, p->size_y, symbol_size_in_bytes, crop_left, crop_right, crop_top, crop_bottom);
+  ret = write(p_out, buf, (p->size_y-crop_bottom-crop_top)*(p->size_x-crop_right-crop_left)*symbol_size_in_bytes);
+  if (ret != ((p->size_y-crop_bottom-crop_top)*(p->size_x-crop_right-crop_left)*symbol_size_in_bytes))
+  {
+    error ("write_out_picture: error writing to YUV file", 500);
+  }
+
+  if (p->chroma_format_idc!=YUV400)
+  {
+    crop_left   = p->frame_cropping_rect_left_offset;
+    crop_right  = p->frame_cropping_rect_right_offset;
+    crop_top    = ( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_top_offset;
+    crop_bottom = ( 2 - p->frame_mbs_only_flag ) * p->frame_cropping_rect_bottom_offset;
+
+    p_Vid->img2buf (p->imgUV[0], buf, p->size_x_cr, p->size_y_cr, symbol_size_in_bytes, crop_left, crop_right, crop_top, crop_bottom);
+    ret = write(p_out, buf, (p->size_y_cr-crop_bottom-crop_top)*(p->size_x_cr-crop_right-crop_left)* symbol_size_in_bytes);
+    if (ret != ((p->size_y_cr-crop_bottom-crop_top)*(p->size_x_cr-crop_right-crop_left)* symbol_size_in_bytes))
+    {
+      error ("write_out_picture: error writing to YUV file", 500);
+    }
+    if (!rgb_output)
+    {
+      p_Vid->img2buf (p->imgUV[1], buf, p->size_x_cr, p->size_y_cr, symbol_size_in_bytes, crop_left, crop_right, crop_top, crop_bottom);
+      ret = write(p_out, buf, (p->size_y_cr-crop_bottom-crop_top)*(p->size_x_cr-crop_right-crop_left)*symbol_size_in_bytes);
+      if (ret != ((p->size_y_cr-crop_bottom-crop_top)*(p->size_x_cr-crop_right-crop_left)*symbol_size_in_bytes))
+      {
+        error ("write_out_picture: error writing to YUV file", 500);
+      }
+    }
+  }
+  else
+  {
+    if (p_Inp->write_uv)
+    {
+      int i,j;
+      imgpel cr_val = (imgpel) (1<<(p_Vid->bitdepth_luma - 1));
+
+      get_mem3Dpel (&(p->imgUV), 1, p->size_y/2, p->size_x/2);
+      for (j=0; j<p->size_y/2; j++)
+        for (i=0; i<p->size_x/2; i++)
+          p->imgUV[0][j][i]=cr_val;
+
+      // fake out U=V=128 to make a YUV 4:2:0 stream
+      p_Vid->img2buf (p->imgUV[0], buf, p->size_x/2, p->size_y/2, symbol_size_in_bytes, crop_left/2, crop_right/2, crop_top/2, crop_bottom/2);
+
+      ret = write(p_out, buf, symbol_size_in_bytes * (p->size_y-crop_bottom-crop_top)/2 * (p->size_x-crop_right-crop_left)/2 );
+      if (ret != (symbol_size_in_bytes * (p->size_y-crop_bottom-crop_top)/2 * (p->size_x-crop_right-crop_left)/2))
+      {
+        error ("write_out_picture: error writing to YUV file", 500);
+      }
+      ret = write(p_out, buf, symbol_size_in_bytes * (p->size_y-crop_bottom-crop_top)/2 * (p->size_x-crop_right-crop_left)/2 );
+      if (ret != (symbol_size_in_bytes * (p->size_y-crop_bottom-crop_top)/2 * (p->size_x-crop_right-crop_left)/2))
+      {
+        error ("write_out_picture: error writing to YUV file", 500);
+      }
+
+      free_mem3Dpel(p->imgUV);
+      p->imgUV=NULL;
+    }
+  }
+
+  free(buf);
+#endif
+	if (p)
+	{
+		p->retain_count++; // take a temporary reference so p stays alive while it is being queued
+		out_storable_picture_add(p_Vid, p);
+		free_storable_picture(p_Vid, p); // release the reference we added above (out_storable_picture will add its own)
+	}
+//  fsync(p_out);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Create the frame store used for direct output (plus the pending field buffer when fields are paired)
+ ************************************************************************
+ */
+void init_out_buffer(VideoParameters *p_Vid)
+{
+  p_Vid->out_buffer = alloc_frame_store();
+#if (PAIR_FIELDS_IN_OUTPUT)
+  p_Vid->pending_output = calloc(1, sizeof(StorablePicture));
+  if (p_Vid->pending_output == NULL)
+    no_mem_exit("init_out_buffer");
+  p_Vid->pending_output->imgY  = NULL;
+  p_Vid->pending_output->imgUV = NULL;
+#endif
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Release the direct-output frame store and, with PAIR_FIELDS_IN_OUTPUT, the pending field buffer
+ ************************************************************************
+ */
+void uninit_out_buffer(VideoParameters *p_Vid)
+{
+  free_frame_store(p_Vid, p_Vid->out_buffer);
+  p_Vid->out_buffer=NULL;
+#if (PAIR_FIELDS_IN_OUTPUT)
+  flush_pending_output(p_Vid); // emits a still-buffered field and frees the pending sample planes
+  free (p_Vid->pending_output); // NOTE(review): pending_output is left dangling here, unlike out_buffer above
+#endif
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Fill the Y, U and V planes of p with p_Vid->dc_pred_value_comp[0], [1] and [2]
+ ************************************************************************
+ */
+void clear_picture(VideoParameters *p_Vid, StorablePicture *p)
+{
+  int row, col, plane;
+
+  // luma plane
+  for (row = 0; row < p->size_y; ++row)
+  {
+    for (col = 0; col < p->size_x; ++col)
+      p->imgY->img[row][col] = (imgpel) p_Vid->dc_pred_value_comp[0];
+  }
+  // chroma planes: imgUV[0] takes component 1, imgUV[1] takes component 2
+  for (plane = 0; plane < 2; ++plane)
+  {
+    for (row = 0; row < p->size_y_cr; ++row)
+    {
+      for (col = 0; col < p->size_x_cr; ++col)
+        p->imgUV[plane]->img[row][col] = (imgpel) p_Vid->dc_pred_value_comp[plane + 1];
+    }
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Write out not paired direct output fields. A second empty field is generated
+ *    and combined into the frame buffer.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param fs
+ *    FrameStore that contains a single field
+ * \note
+ *    On return fs->is_used is 3, whether or not a field was written
+ ************************************************************************
+ */
+static void write_unpaired_field(VideoParameters *p_Vid, FrameStore* fs)
+{
+  StorablePicture *p;
+  assert (fs->is_used<3); // callers must not pass a store that already holds both fields
+
+  if(fs->is_used & 0x01)
+  {
+    // we have a top field
+    // construct an empty bottom field
+    p = fs->top_field;
+    fs->bottom_field = alloc_storable_picture(p_Vid, BOTTOM_FIELD, p->size_x, 2*p->size_y, p->size_x_cr, 2*p->size_y_cr);
+    fs->bottom_field->chroma_format_idc = p->chroma_format_idc; // NOTE(review): unlike the branch below, cropping info is not copied to the synthesized field - confirm intended
+    clear_picture(p_Vid, fs->bottom_field);
+    dpb_combine_field_yuv(p_Vid, fs);
+    write_picture (p_Vid, fs->frame, TOP_FIELD);
+  }
+
+  if(fs->is_used & 0x02)
+  {
+    // we have a bottom field
+    // construct an empty top field
+    p = fs->bottom_field;
+    fs->top_field = alloc_storable_picture(p_Vid, TOP_FIELD, p->size_x, 2*p->size_y, p->size_x_cr, 2*p->size_y_cr);
+    fs->top_field->chroma_format_idc = p->chroma_format_idc;
+    clear_picture(p_Vid, fs->top_field);
+    fs ->top_field->frame_cropping_flag = fs->bottom_field->frame_cropping_flag; // mirror the real field's cropping onto the synthesized one
+    if(fs ->top_field->frame_cropping_flag)
+    {
+      fs ->top_field->frame_cropping_rect_top_offset = fs->bottom_field->frame_cropping_rect_top_offset;
+      fs ->top_field->frame_cropping_rect_bottom_offset = fs->bottom_field->frame_cropping_rect_bottom_offset;
+      fs ->top_field->frame_cropping_rect_left_offset = fs->bottom_field->frame_cropping_rect_left_offset;
+      fs ->top_field->frame_cropping_rect_right_offset = fs->bottom_field->frame_cropping_rect_right_offset;
+    }
+    dpb_combine_field_yuv(p_Vid, fs);
+    write_picture (p_Vid, fs->frame, BOTTOM_FIELD);
+  }
+
+  fs->is_used = 3; // the store now counts as holding both fields
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Write out unpaired fields from output buffer.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \note
+ *    Afterwards the output buffer holds no pictures and its is_used is 0
+ ************************************************************************
+ */
+static void flush_direct_output(VideoParameters *p_Vid)
+{
+  FrameStore *direct = p_Vid->out_buffer;
+  write_unpaired_field(p_Vid, direct);
+  free_storable_picture(p_Vid, direct->frame);
+  direct->frame = NULL;
+  free_storable_picture(p_Vid, direct->top_field);
+  direct->top_field = NULL;
+  free_storable_picture(p_Vid, direct->bottom_field);
+  direct->bottom_field = NULL;
+  direct->is_used = 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Write a frame (from FrameStore)
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param fs
+ *    FrameStore containing the frame
+ * \note
+ *    fs->is_output is set to 1 even when the frame itself is withheld
+ ************************************************************************
+ */
+void write_stored_frame( VideoParameters *p_Vid, FrameStore *fs)
+{
+  // a field still waiting in the direct-output buffer has to go out first
+  flush_direct_output(p_Vid);
+
+  if (fs->is_used >= 3)
+  {
+    // complete frame: a recovery frame latches recovery_flag, which permits output below
+    if (fs->recovery_frame)
+      p_Vid->recovery_flag = 1;
+    if (p_Vid->recovery_flag || !p_Vid->non_conforming_stream)
+      write_picture(p_Vid, fs->frame, FRAME);
+  }
+  else
+  {
+    write_unpaired_field(p_Vid, fs);
+  }
+  fs->is_output = 1;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Directly output a picture without storing it in the DPB. Fields
+ *    are parked in p_Vid->out_buffer until the opposite field arrives.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param p
+ *    Picture for output; this function takes over the caller's reference
+ * \note
+ *    A frame is released right after output; fields are released once paired or flushed
+ ************************************************************************
+ */
+void direct_output(VideoParameters *p_Vid, StorablePicture *p)
+{
+  // (the unused local p_Inp was removed; nothing in this function reads the input parameters)
+  if (p->structure==FRAME)
+  {
+    // we have a frame (or complementary field pair)
+    // so output it directly
+    flush_direct_output(p_Vid);
+    write_picture (p_Vid, p, FRAME);
+    free_storable_picture(p_Vid, p);
+    return;
+  }
+
+  if (p->structure == TOP_FIELD)
+  {
+    if (p_Vid->out_buffer->is_used &1) // a top field is already parked: push it out unpaired
+      flush_direct_output(p_Vid);
+    p_Vid->out_buffer->top_field = p;
+    p_Vid->out_buffer->is_used |= 1;
+  }
+
+  if (p->structure == BOTTOM_FIELD)
+  {
+    if (p_Vid->out_buffer->is_used &2) // a bottom field is already parked: push it out unpaired
+      flush_direct_output(p_Vid);
+    p_Vid->out_buffer->bottom_field = p;
+    p_Vid->out_buffer->is_used |= 2;
+  }
+
+  if (p_Vid->out_buffer->is_used == 3)
+  {
+    // we have both fields, so output them
+    dpb_combine_field_yuv(p_Vid, p_Vid->out_buffer);
+    p_Vid->out_buffer->frame->time_code = p->time_code; // the combined frame carries the time code of the field that completed it
+    write_picture (p_Vid, p_Vid->out_buffer->frame, FRAME);
+
+    free_storable_picture(p_Vid, p_Vid->out_buffer->frame);
+    p_Vid->out_buffer->frame = NULL;
+    free_storable_picture(p_Vid, p_Vid->out_buffer->top_field);
+    p_Vid->out_buffer->top_field = NULL;
+    free_storable_picture(p_Vid, p_Vid->out_buffer->bottom_field);
+    p_Vid->out_buffer->bottom_field = NULL;
+    p_Vid->out_buffer->is_used = 0;
+  }
+}
+
diff --git a/Src/h264dec/ldecod/src/parset.c b/Src/h264dec/ldecod/src/parset.c
new file mode 100644
index 00000000..1b49cbd8
--- /dev/null
+++ b/Src/h264dec/ldecod/src/parset.c
@@ -0,0 +1,779 @@
+
+/*!
+ ************************************************************************
+ *  \file
+ *     parset.c
+ *  \brief
+ *     Parameter Sets
+ *  \author
+ *     Main contributors (see contributors.h for copyright, address and affiliation details)
+ *     - Stephan Wenger          <stewe@cs.tu-berlin.de>
+ *
+ ***********************************************************************
+ */
+
+#include "global.h"
+#include "image.h"
+#include "parsetcommon.h"
+#include "parset.h"
+#include "nalu.h"
+#include "memalloc.h"
+#include "fmo.h"
+#include "cabac.h"
+#include "vlc.h"
+#include "mbuffer.h"
+#include "erc_api.h"
+
+#if TRACE
+#define SYMTRACESTRING(s) strncpy(sym->tracestring,s,TRACESTRING_SIZE) // needs a variable named 'sym' at the expansion site; strncpy leaves the buffer unterminated when s fills it
+#else
+#define SYMTRACESTRING(s) // tracing disabled: expands to nothing
+#endif
+
+
+extern void init_frext(VideoParameters *p_Vid); // declared here, presumably defined in another source file - not visible in this part of the file
+
+// Reads one scaling list of sizeOfScalingList (16 or 64) entries from s; entry j lands at index ZZ_SCAN[j] / ZZ_SCAN8[j].
+void Scaling_List(int *scalingList, int sizeOfScalingList, Boolean *UseDefaultScalingMatrix, Bitstream *s)
+{
+  int j, scanj;
+  int delta_scale;
+  int lastScale = 8;   // value that keeps being repeated once nextScale has dropped to 0
+  int nextScale = 8;   // 0 means: stop reading deltas
+
+  for(j=0; j<sizeOfScalingList; j++)
+  {
+    scanj = (sizeOfScalingList==16) ? ZZ_SCAN[j]:ZZ_SCAN8[j];
+
+    if(nextScale!=0)
+    {
+      delta_scale = se_v (   "   : delta_sl   "                           , s);
+      // mask instead of "% 256": same result for legal deltas, but a corrupt delta can no longer yield a negative scale (or signed overflow)
+      nextScale = (int) (((unsigned) lastScale + (unsigned) delta_scale) & 0xFFu);
+      *UseDefaultScalingMatrix = (Boolean) (scanj==0 && nextScale==0);
+    }
+
+    scalingList[scanj] = (nextScale==0) ? lastScale:nextScale;
+    lastScale = scalingList[scanj];
+  }
+}
+// Parses the SPS RBSP carried by p->bitstream into *sps. sps->Valid becomes TRUE only when the whole
+// set was read; an unknown profile_idc or an out-of-range POC cycle length makes the function return early.
+static void InterpretSPS (VideoParameters *p_Vid, DataPartition *p, seq_parameter_set_rbsp_t *sps)
+{
+  unsigned i;
+  unsigned n_ScalingList;
+  Bitstream *s;
+
+  assert (p != NULL);
+  assert (p->bitstream != NULL);
+  assert (p->bitstream->streamBuffer != 0);
+  assert (sps != NULL);
+  s = p->bitstream; // fetched only after the pointer checks above
+
+  sps->profile_idc                            = u_v  (8, "SPS: profile_idc"                           , s);
+
+  if ((sps->profile_idc!=BASELINE       ) &&
+      (sps->profile_idc!=MAIN           ) &&
+      (sps->profile_idc!=EXTENDED       ) &&
+      (sps->profile_idc!=FREXT_HP       ) &&
+      (sps->profile_idc!=FREXT_Hi10P    ) &&
+      (sps->profile_idc!=FREXT_Hi422    ) &&
+      (sps->profile_idc!=FREXT_Hi444    ) &&
+      (sps->profile_idc!=FREXT_CAVLC444 ))
+  {
+    printf("Invalid Profile IDC (%d) encountered. \n", sps->profile_idc);
+    return;
+  }
+
+  sps->constrained_set0_flag                  = u_1  (   "SPS: constrained_set0_flag"                 , s);
+  sps->constrained_set1_flag                  = u_1  (   "SPS: constrained_set1_flag"                 , s);
+  sps->constrained_set2_flag                  = u_1  (   "SPS: constrained_set2_flag"                 , s);
+  sps->constrained_set3_flag                  = u_1  (   "SPS: constrained_set3_flag"                 , s);
+  (void)                                        u_v  (4, "SPS: reserved_zero_4bits"                   , s); // skipped, not asserted: stream content must never abort the decoder
+
+  sps->level_idc                              = u_v  (8, "SPS: level_idc"                             , s);
+
+  sps->seq_parameter_set_id                   = ue_v ("SPS: seq_parameter_set_id"                     , s);
+
+  // Fidelity Range Extensions stuff
+  sps->chroma_format_idc = 1;
+  sps->bit_depth_luma_minus8   = 0;
+  sps->bit_depth_chroma_minus8 = 0;
+  p_Vid->lossless_qpprime_flag   = 0;
+  sps->separate_colour_plane_flag = 0;
+
+  if (IS_FREXT_PROFILE(sps->profile_idc))
+  {
+    sps->chroma_format_idc                      = ue_v ("SPS: chroma_format_idc"                       , s);
+
+    if(sps->chroma_format_idc == YUV444)
+    {
+      sps->separate_colour_plane_flag           = u_1  ("SPS: separate_colour_plane_flag"              , s);
+    }
+
+    sps->bit_depth_luma_minus8                  = ue_v ("SPS: bit_depth_luma_minus8"                   , s);
+    sps->bit_depth_chroma_minus8                = ue_v ("SPS: bit_depth_chroma_minus8"                 , s);
+    p_Vid->lossless_qpprime_flag                  = u_1  ("SPS: lossless_qpprime_y_zero_flag"            , s);
+
+    sps->seq_scaling_matrix_present_flag        = u_1  (   "SPS: seq_scaling_matrix_present_flag"       , s);
+
+    if(sps->seq_scaling_matrix_present_flag)
+    {
+      n_ScalingList = (sps->chroma_format_idc != YUV444) ? 8 : 12;
+      for(i=0; i<n_ScalingList; i++)
+      {
+        sps->seq_scaling_list_present_flag[i]   = u_1  (   "SPS: seq_scaling_list_present_flag"         , s);
+        if(sps->seq_scaling_list_present_flag[i])
+        {
+          if(i<6)
+            Scaling_List(sps->ScalingList4x4[i], 16, &sps->UseDefaultScalingMatrix4x4Flag[i], s);
+          else
+            Scaling_List(sps->ScalingList8x8[i-6], 64, &sps->UseDefaultScalingMatrix8x8Flag[i-6], s);
+        }
+      }
+    }
+  }
+
+  sps->log2_max_frame_num_minus4              = ue_v ("SPS: log2_max_frame_num_minus4"                , s);
+  sps->pic_order_cnt_type                     = ue_v ("SPS: pic_order_cnt_type"                       , s);
+
+  if (sps->pic_order_cnt_type == 0)
+    sps->log2_max_pic_order_cnt_lsb_minus4 = ue_v ("SPS: log2_max_pic_order_cnt_lsb_minus4"           , s);
+  else if (sps->pic_order_cnt_type == 1)
+  {
+    sps->delta_pic_order_always_zero_flag      = u_1  ("SPS: delta_pic_order_always_zero_flag"       , s);
+    sps->offset_for_non_ref_pic                = se_v ("SPS: offset_for_non_ref_pic"                 , s);
+    sps->offset_for_top_to_bottom_field        = se_v ("SPS: offset_for_top_to_bottom_field"         , s);
+    sps->num_ref_frames_in_pic_order_cnt_cycle = ue_v ("SPS: num_ref_frames_in_pic_order_cnt_cycle"  , s);
+    if ((unsigned) sps->num_ref_frames_in_pic_order_cnt_cycle > 255) // H.264 allows 0..255; this bounds the offset_for_ref_frame[] writes below
+    {
+      printf("Invalid num_ref_frames_in_pic_order_cnt_cycle (%u) encountered. \n", (unsigned) sps->num_ref_frames_in_pic_order_cnt_cycle);
+      return;
+    }
+    for(i=0; i<sps->num_ref_frames_in_pic_order_cnt_cycle; i++)
+      sps->offset_for_ref_frame[i]               = se_v ("SPS: offset_for_ref_frame[i]"              , s);
+  }
+  sps->num_ref_frames                        = ue_v ("SPS: num_ref_frames"                         , s);
+  sps->gaps_in_frame_num_value_allowed_flag  = u_1  ("SPS: gaps_in_frame_num_value_allowed_flag"   , s);
+  sps->pic_width_in_mbs_minus1               = ue_v ("SPS: pic_width_in_mbs_minus1"                , s);
+  sps->pic_height_in_map_units_minus1        = ue_v ("SPS: pic_height_in_map_units_minus1"         , s);
+  sps->frame_mbs_only_flag                   = u_1  ("SPS: frame_mbs_only_flag"                    , s);
+  if (!sps->frame_mbs_only_flag)
+  {
+    sps->mb_adaptive_frame_field_flag        = u_1  ("SPS: mb_adaptive_frame_field_flag"           , s);
+  }
+  sps->direct_8x8_inference_flag             = u_1  ("SPS: direct_8x8_inference_flag"              , s);
+  sps->frame_cropping_flag                   = u_1  ("SPS: frame_cropping_flag"                    , s);
+
+  if (sps->frame_cropping_flag)
+  {
+    sps->frame_cropping_rect_left_offset      = ue_v ("SPS: frame_cropping_rect_left_offset"           , s);
+    sps->frame_cropping_rect_right_offset     = ue_v ("SPS: frame_cropping_rect_right_offset"          , s);
+    sps->frame_cropping_rect_top_offset       = ue_v ("SPS: frame_cropping_rect_top_offset"            , s);
+    sps->frame_cropping_rect_bottom_offset    = ue_v ("SPS: frame_cropping_rect_bottom_offset"         , s);
+  }
+  sps->vui_parameters_present_flag           = (Boolean) u_1  ("SPS: vui_parameters_present_flag"      , s);
+
+  InitVUI(sps);
+  ReadVUI(p, sps);
+
+  sps->Valid = TRUE;
+}
+
+// Sets the default for matrix_coefficients; ReadVUI() overwrites it only when the stream carries a colour description.
+void InitVUI(seq_parameter_set_rbsp_t *sps)
+{
+  sps->vui_seq_parameters.matrix_coefficients = 2; // presumably the "unspecified" code of H.264 Table E-5 - TODO confirm
+}
+
+// Reads the VUI syntax elements into sps->vui_seq_parameters when sps->vui_parameters_present_flag is set; always returns 0.
+int ReadVUI(DataPartition *p, seq_parameter_set_rbsp_t *sps)
+{
+  Bitstream *s = p->bitstream;
+  if (sps->vui_parameters_present_flag)
+  {
+    sps->vui_seq_parameters.aspect_ratio_info_present_flag = u_1  ("VUI: aspect_ratio_info_present_flag"   , s);
+    if (sps->vui_seq_parameters.aspect_ratio_info_present_flag)
+    {
+      sps->vui_seq_parameters.aspect_ratio_idc             = u_v  ( 8, "VUI: aspect_ratio_idc"              , s);
+      if (255==sps->vui_seq_parameters.aspect_ratio_idc) // only the value 255 is followed by an explicit sar_width/sar_height pair
+      {
+        sps->vui_seq_parameters.sar_width                  = (unsigned short) u_v  (16, "VUI: sar_width"                     , s);
+        sps->vui_seq_parameters.sar_height                 = (unsigned short) u_v  (16, "VUI: sar_height"                    , s);
+      }
+  } // closes the aspect_ratio_info_present_flag block (the indentation is misleading)
+
+    sps->vui_seq_parameters.overscan_info_present_flag     = u_1  ("VUI: overscan_info_present_flag"        , s);
+    if (sps->vui_seq_parameters.overscan_info_present_flag)
+    {
+      sps->vui_seq_parameters.overscan_appropriate_flag    = u_1  ("VUI: overscan_appropriate_flag"         , s);
+    }
+
+    sps->vui_seq_parameters.video_signal_type_present_flag = u_1  ("VUI: video_signal_type_present_flag"    , s);
+    if (sps->vui_seq_parameters.video_signal_type_present_flag)
+    {
+      sps->vui_seq_parameters.video_format                    = u_v  ( 3,"VUI: video_format"                      , s);
+      sps->vui_seq_parameters.video_full_range_flag           = u_1  (   "VUI: video_full_range_flag"             , s);
+      sps->vui_seq_parameters.colour_description_present_flag = u_1  (   "VUI: color_description_present_flag"    , s);
+      if(sps->vui_seq_parameters.colour_description_present_flag)
+      {
+        sps->vui_seq_parameters.colour_primaries              = u_v  ( 8,"VUI: colour_primaries"                  , s);
+        sps->vui_seq_parameters.transfer_characteristics      = u_v  ( 8,"VUI: transfer_characteristics"          , s);
+        sps->vui_seq_parameters.matrix_coefficients           = u_v  ( 8,"VUI: matrix_coefficients"               , s);
+      }
+    }
+    sps->vui_seq_parameters.chroma_location_info_present_flag = u_1  (   "VUI: chroma_loc_info_present_flag"      , s);
+    if(sps->vui_seq_parameters.chroma_location_info_present_flag)
+    {
+      sps->vui_seq_parameters.chroma_sample_loc_type_top_field     = ue_v  ( "VUI: chroma_sample_loc_type_top_field"    , s);
+      sps->vui_seq_parameters.chroma_sample_loc_type_bottom_field  = ue_v  ( "VUI: chroma_sample_loc_type_bottom_field" , s);
+    }
+    sps->vui_seq_parameters.timing_info_present_flag          = u_1  ("VUI: timing_info_present_flag"           , s);
+    if (sps->vui_seq_parameters.timing_info_present_flag)
+    {
+      sps->vui_seq_parameters.num_units_in_tick               = u_v  (32,"VUI: num_units_in_tick"               , s);
+      sps->vui_seq_parameters.time_scale                      = u_v  (32,"VUI: time_scale"                      , s);
+      sps->vui_seq_parameters.fixed_frame_rate_flag           = u_1  (   "VUI: fixed_frame_rate_flag"           , s);
+    }
+    sps->vui_seq_parameters.nal_hrd_parameters_present_flag   = u_1  ("VUI: nal_hrd_parameters_present_flag"    , s);
+    if (sps->vui_seq_parameters.nal_hrd_parameters_present_flag)
+    {
+      ReadHRDParameters(p, &(sps->vui_seq_parameters.nal_hrd_parameters)); // return value is not checked
+    }
+    sps->vui_seq_parameters.vcl_hrd_parameters_present_flag   = u_1  ("VUI: vcl_hrd_parameters_present_flag"    , s);
+    if (sps->vui_seq_parameters.vcl_hrd_parameters_present_flag)
+    {
+      ReadHRDParameters(p, &(sps->vui_seq_parameters.vcl_hrd_parameters)); // return value is not checked
+    }
+    if (sps->vui_seq_parameters.nal_hrd_parameters_present_flag || sps->vui_seq_parameters.vcl_hrd_parameters_present_flag)
+    {
+      sps->vui_seq_parameters.low_delay_hrd_flag             =  u_1  ("VUI: low_delay_hrd_flag"                 , s);
+    }
+    sps->vui_seq_parameters.pic_struct_present_flag          =  u_1  ("VUI: pic_struct_present_flag   "         , s);
+    sps->vui_seq_parameters.bitstream_restriction_flag       =  u_1  ("VUI: bitstream_restriction_flag"         , s);
+    if (sps->vui_seq_parameters.bitstream_restriction_flag)
+    {
+      sps->vui_seq_parameters.motion_vectors_over_pic_boundaries_flag =  u_1  ("VUI: motion_vectors_over_pic_boundaries_flag", s);
+      sps->vui_seq_parameters.max_bytes_per_pic_denom                 =  ue_v ("VUI: max_bytes_per_pic_denom"                , s);
+      sps->vui_seq_parameters.max_bits_per_mb_denom                   =  ue_v ("VUI: max_bits_per_mb_denom"                  , s);
+      sps->vui_seq_parameters.log2_max_mv_length_horizontal           =  ue_v ("VUI: log2_max_mv_length_horizontal"          , s);
+      sps->vui_seq_parameters.log2_max_mv_length_vertical             =  ue_v ("VUI: log2_max_mv_length_vertical"            , s);
+      sps->vui_seq_parameters.num_reorder_frames                      =  ue_v ("VUI: num_reorder_frames"                     , s);
+      sps->vui_seq_parameters.max_dec_frame_buffering                 =  ue_v ("VUI: max_dec_frame_buffering"                , s);
+    }
+  }
+
+  return 0; // no failure path: the function reports success unconditionally
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse one hrd_parameters() structure from the bitstream of the
+ *    given data partition into *hrd.
+ *
+ * \param p
+ *    data partition whose bitstream supplies the syntax elements
+ * \param hrd
+ *    destination structure
+ *
+ * \return
+ *    0 on success. A cpb_cnt_minus1 that does not fit the per-CPB arrays
+ *    of hrd_parameters_t is reported through error(); should error()
+ *    return, the count is reset to 0 and -1 is returned.
+ ************************************************************************
+ */
+int ReadHRDParameters(DataPartition *p, hrd_parameters_t *hrd)
+{
+  Bitstream *s = p->bitstream;
+  // capacity of the per-CPB arrays, taken from the structure itself
+  // (assumes bit_rate_value_minus1, cpb_size_value_minus1 and cbr_flag are
+  // arrays of the same length -- TODO confirm against hrd_parameters_t)
+  const unsigned int max_cpb_cnt = (unsigned int) (sizeof (hrd->bit_rate_value_minus1) / sizeof (hrd->bit_rate_value_minus1[0]));
+  unsigned int SchedSelIdx;
+
+  hrd->cpb_cnt_minus1                                      = ue_v (   "VUI: cpb_cnt_minus1"                       , s);
+
+  // cpb_cnt_minus1 bounds the loop below, which writes hrd's arrays;
+  // an unchecked value from a corrupt stream would write past their end
+  if ((unsigned int) hrd->cpb_cnt_minus1 >= max_cpb_cnt)
+  {
+    hrd->cpb_cnt_minus1 = 0;
+    error ("ReadHRDParameters: cpb_cnt_minus1 out of range", -1012);
+    return -1;
+  }
+
+  hrd->bit_rate_scale                                      = u_v  ( 4,"VUI: bit_rate_scale"                       , s);
+  hrd->cpb_size_scale                                      = u_v  ( 4,"VUI: cpb_size_scale"                       , s);
+
+  for( SchedSelIdx = 0; SchedSelIdx <= hrd->cpb_cnt_minus1; SchedSelIdx++ )
+  {
+    hrd->bit_rate_value_minus1[ SchedSelIdx ]             = ue_v  ( "VUI: bit_rate_value_minus1"                  , s);
+    hrd->cpb_size_value_minus1[ SchedSelIdx ]             = ue_v  ( "VUI: cpb_size_value_minus1"                  , s);
+    hrd->cbr_flag[ SchedSelIdx ]                          = u_1   ( "VUI: cbr_flag"                               , s);
+  }
+
+  hrd->initial_cpb_removal_delay_length_minus1            = u_v  ( 5,"VUI: initial_cpb_removal_delay_length_minus1" , s);
+  hrd->cpb_removal_delay_length_minus1                    = u_v  ( 5,"VUI: cpb_removal_delay_length_minus1"         , s);
+  hrd->dpb_output_delay_length_minus1                     = u_v  ( 5,"VUI: dpb_output_delay_length_minus1"          , s);
+  hrd->time_offset_length                                 = u_v  ( 5,"VUI: time_offset_length"          , s);
+
+  return 0;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse a picture parameter set from the bitstream of partition p
+ *    into *pps and mark it valid.
+ *
+ * \param p_Vid
+ *    decoder state; consulted only for the chroma_format_idc of the
+ *    referenced SPS when a scaling matrix is present
+ * \param p
+ *    data partition holding the PPS payload
+ * \param pps
+ *    destination; slice_group_id is allocated here for map type 6
+ *
+ * \note
+ *    Values later used as array indices or loop bounds are checked
+ *    against the arrays they address and reported through error().
+ *    Should error() return, parsing stops and pps->Valid is left
+ *    untouched.
+ ************************************************************************
+ */
+static void InterpretPPS (VideoParameters *p_Vid, DataPartition *p, pic_parameter_set_rbsp_t *pps)
+{
+  unsigned i;
+  unsigned n_ScalingList;
+  unsigned n_MapUnits;
+  unsigned n_SPS;
+  unsigned n_RunLength;
+  unsigned n_TopLeft;
+  unsigned n_BottomRight;
+  int chroma_format_idc;
+  int NumberBitsPerSliceGroupId;
+  Bitstream *s;
+
+  // check the arguments before the first dereference
+  assert (p != NULL);
+  assert (p->bitstream != NULL);
+  assert (p->bitstream->streamBuffer != 0);
+  assert (pps != NULL);
+
+  s = p->bitstream;
+
+  // capacities taken from the structures themselves
+  // (assumes these members are arrays, not pointers -- TODO confirm)
+  n_SPS         = (unsigned) (sizeof (p_Vid->SeqParSet)       / sizeof (p_Vid->SeqParSet[0]));
+  n_RunLength   = (unsigned) (sizeof (pps->run_length_minus1) / sizeof (pps->run_length_minus1[0]));
+  n_TopLeft     = (unsigned) (sizeof (pps->top_left)          / sizeof (pps->top_left[0]));
+  n_BottomRight = (unsigned) (sizeof (pps->bottom_right)      / sizeof (pps->bottom_right[0]));
+
+  pps->pic_parameter_set_id                  = ue_v ("PPS: pic_parameter_set_id"                   , s);
+  pps->seq_parameter_set_id                  = ue_v ("PPS: seq_parameter_set_id"                   , s);
+
+  // both ids are used as table indices (PicParSet[] / SeqParSet[])
+  if ((unsigned) pps->pic_parameter_set_id >= (unsigned) MAXPPS)
+  {
+    error ("InterpretPPS: pic_parameter_set_id out of range", -1013);
+    return;
+  }
+  if ((unsigned) pps->seq_parameter_set_id >= n_SPS)
+  {
+    error ("InterpretPPS: seq_parameter_set_id out of range", -1013);
+    return;
+  }
+
+  pps->entropy_coding_mode_flag              = u_1  ("PPS: entropy_coding_mode_flag"               , s);
+
+  //! Note: as per JVT-F078 the following bit is unconditional.  If F078 is not accepted, then
+  //! one has to fetch the correct SPS to check whether the bit is present (hopefully there is
+  //! no consistency problem :-(
+  //! The current encoder code handles this in the same way.  When you change this, don't forget
+  //! the encoder!  StW, 12/8/02
+  pps->bottom_field_pic_order_in_frame_present_flag                = u_1  ("PPS: bottom_field_pic_order_in_frame_present_flag"                 , s);
+
+  pps->num_slice_groups_minus1               = ue_v ("PPS: num_slice_groups_minus1"                , s);
+
+  // FMO stuff begins here
+  if (pps->num_slice_groups_minus1 > 0)
+  {
+    // the group count bounds the per-group loops below
+    if ((unsigned) pps->num_slice_groups_minus1 >= n_RunLength)
+    {
+      error ("InterpretPPS: num_slice_groups_minus1 out of range", -1013);
+      return;
+    }
+
+    pps->slice_group_map_type               = ue_v ("PPS: slice_group_map_type"                , s);
+    if (pps->slice_group_map_type == 0)
+    {
+      for (i=0; i<=pps->num_slice_groups_minus1; i++)
+        pps->run_length_minus1 [i]                  = ue_v ("PPS: run_length_minus1 [i]"              , s);
+    }
+    else if (pps->slice_group_map_type == 2)
+    {
+      if ((unsigned) pps->num_slice_groups_minus1 > n_TopLeft ||
+          (unsigned) pps->num_slice_groups_minus1 > n_BottomRight)
+      {
+        error ("InterpretPPS: num_slice_groups_minus1 out of range", -1013);
+        return;
+      }
+      for (i=0; i<pps->num_slice_groups_minus1; i++)
+      {
+        //! JVT-F078: avoid reference of SPS by using ue(v) instead of u(v)
+        pps->top_left [i]                          = ue_v ("PPS: top_left [i]"                        , s);
+        pps->bottom_right [i]                      = ue_v ("PPS: bottom_right [i]"                    , s);
+      }
+    }
+    else if (pps->slice_group_map_type == 3 ||
+             pps->slice_group_map_type == 4 ||
+             pps->slice_group_map_type == 5)
+    {
+      pps->slice_group_change_direction_flag     = u_1  ("PPS: slice_group_change_direction_flag"      , s);
+      pps->slice_group_change_rate_minus1        = ue_v ("PPS: slice_group_change_rate_minus1"         , s);
+    }
+    else if (pps->slice_group_map_type == 6)
+    {
+      // bits per slice_group_id entry: 3 for 5..8 groups, 2 for 3..4, else 1
+      if (pps->num_slice_groups_minus1+1 >4)
+        NumberBitsPerSliceGroupId = 3;
+      else if (pps->num_slice_groups_minus1+1 > 2)
+        NumberBitsPerSliceGroupId = 2;
+      else
+        NumberBitsPerSliceGroupId = 1;
+      pps->pic_size_in_map_units_minus1      = ue_v ("PPS: pic_size_in_map_units_minus1"               , s);
+
+      // "+1" must not wrap to 0, or the map would be empty while the
+      // fill loop below still runs over the full range
+      n_MapUnits = (unsigned) pps->pic_size_in_map_units_minus1 + 1;
+      if (n_MapUnits == 0)
+      {
+        error ("InterpretPPS: pic_size_in_map_units_minus1 out of range", -1013);
+        return;
+      }
+      if ((pps->slice_group_id = calloc (n_MapUnits, 1)) == NULL)
+        no_mem_exit ("InterpretPPS: slice_group_id");
+      for (i=0; i<n_MapUnits; i++)
+        pps->slice_group_id[i] = (byte) u_v (NumberBitsPerSliceGroupId, "slice_group_id[i]", s);
+    }
+  }
+
+  // End of FMO stuff
+
+  pps->num_ref_idx_l0_active_minus1          = ue_v ("PPS: num_ref_idx_l0_active_minus1"           , s);
+  pps->num_ref_idx_l1_active_minus1          = ue_v ("PPS: num_ref_idx_l1_active_minus1"           , s);
+  pps->weighted_pred_flag                    = u_1  ("PPS: weighted_pred_flag"                     , s);
+  pps->weighted_bipred_idc                   = u_v  ( 2, "PPS: weighted_bipred_idc"                , s);
+  pps->pic_init_qp_minus26                   = se_v ("PPS: pic_init_qp_minus26"                    , s);
+  pps->pic_init_qs_minus26                   = se_v ("PPS: pic_init_qs_minus26"                    , s);
+
+  pps->chroma_qp_index_offset                = se_v ("PPS: chroma_qp_index_offset"                 , s);
+
+  pps->deblocking_filter_control_present_flag = u_1 ("PPS: deblocking_filter_control_present_flag" , s);
+  pps->constrained_intra_pred_flag           = u_1  ("PPS: constrained_intra_pred_flag"            , s);
+  pps->redundant_pic_cnt_present_flag        = u_1  ("PPS: redundant_pic_cnt_present_flag"         , s);
+
+  if(more_rbsp_data(s->streamBuffer, s->frame_bitoffset,s->bitstream_length)) // more_data_in_rbsp()
+  {
+    //Fidelity Range Extensions Stuff
+    pps->transform_8x8_mode_flag           = u_1  ("PPS: transform_8x8_mode_flag"                , s);
+    pps->pic_scaling_matrix_present_flag   =  u_1  ("PPS: pic_scaling_matrix_present_flag"        , s);
+
+    if(pps->pic_scaling_matrix_present_flag)
+    {
+      // seq_parameter_set_id was range-checked above
+      chroma_format_idc = p_Vid->SeqParSet[pps->seq_parameter_set_id].chroma_format_idc;
+      // six 4x4 lists, plus two (or six for 4:4:4) 8x8 lists when 8x8 transform is on
+      n_ScalingList = 6 + ((chroma_format_idc != YUV444) ? 2 : 6) * pps->transform_8x8_mode_flag;
+      for(i=0; i<n_ScalingList; i++)
+      {
+        pps->pic_scaling_list_present_flag[i]= u_1  ("PPS: pic_scaling_list_present_flag"          , s);
+
+        if(pps->pic_scaling_list_present_flag[i])
+        {
+          if(i<6)
+            Scaling_List(pps->ScalingList4x4[i], 16, &pps->UseDefaultScalingMatrix4x4Flag[i], s);
+          else
+            Scaling_List(pps->ScalingList8x8[i-6], 64, &pps->UseDefaultScalingMatrix8x8Flag[i-6], s);
+        }
+      }
+    }
+    pps->second_chroma_qp_index_offset      = se_v ("PPS: second_chroma_qp_index_offset"          , s);
+  }
+  else
+  {
+    // without the extension fields the second offset mirrors the first
+    pps->second_chroma_qp_index_offset      = pps->chroma_qp_index_offset;
+  }
+
+  pps->Valid = TRUE;
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Placeholder for picture parameter set validation; at present it
+ *    only prints a notice and does not examine pps.
+ ************************************************************************
+ */
+void PPSConsistencyCheck (pic_parameter_set_rbsp_t *pps)
+{
+  // TODO: reject a pps whose seq_parameter_set_id is invalid
+  (void) pps;
+  printf ("Consistency checking a picture parset, to be implemented\n");
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Placeholder for sequence parameter set validation; at present it
+ *    only prints a notice and does not examine sps.
+ ************************************************************************
+ */
+void SPSConsistencyCheck (seq_parameter_set_rbsp_t *sps)
+{
+  (void) sps;
+  printf ("Consistency checking a sequence parset, to be implemented\n");
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Install a parsed PPS into table entry p_Vid->PicParSet[id].
+ *
+ *    Any slice_group_id map held by a valid entry being replaced is
+ *    freed. The map of the incoming pps is handed over to the table
+ *    entry and pps->slice_group_id is cleared, so the entry is its only
+ *    remaining reference.
+ ************************************************************************
+ */
+void MakePPSavailable (VideoParameters *p_Vid, int id, pic_parameter_set_rbsp_t *pps)
+{
+  pic_parameter_set_rbsp_t *slot = &p_Vid->PicParSet[id];
+
+  assert (pps->Valid == TRUE);
+
+  // release the map of the entry that is about to be overwritten
+  if (slot->Valid == TRUE && slot->slice_group_id != NULL)
+  {
+    free (slot->slice_group_id);
+  }
+
+  memcpy (slot, pps, sizeof (*pps));
+
+  // hand the map over to the table entry; the source gives it up
+  slot->slice_group_id = pps->slice_group_id;
+  pps->slice_group_id  = NULL;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Invalidate all MAXPPS entries of p_Vid->PicParSet, freeing the
+ *    slice_group_id map of every entry that was valid and had one.
+ ************************************************************************
+ */
+void CleanUpPPS(VideoParameters *p_Vid)
+{
+  pic_parameter_set_rbsp_t *entry = &p_Vid->PicParSet[0];
+  pic_parameter_set_rbsp_t *const last = entry + MAXPPS;
+
+  while (entry != last)
+  {
+    if (entry->Valid == TRUE && entry->slice_group_id != NULL)
+    {
+      free (entry->slice_group_id);
+    }
+    entry->Valid = FALSE;
+    ++entry;
+  }
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Copy a parsed, valid SPS into table entry p_Vid->SeqParSet[id].
+ ************************************************************************
+ */
+void MakeSPSavailable (VideoParameters *p_Vid, int id, seq_parameter_set_rbsp_t *sps)
+{
+  seq_parameter_set_rbsp_t *slot = &p_Vid->SeqParSet[id];
+
+  assert (sps->Valid == TRUE);
+  memcpy (slot, sps, sizeof (*sps));
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Unimplemented stub: accepts an SPS in a memory buffer and does
+ *    nothing with it.
+ ************************************************************************
+ */
+void ProcessSPS_Memory(VideoParameters *p_Vid, const void *buffer, size_t bufferlen)
+{
+  // intentionally empty; the arguments are not used
+  (void) p_Vid;
+  (void) buffer;
+  (void) bufferlen;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse the SPS carried by nalu and, if it parsed as valid, store it
+ *    in the SPS table and refresh the profile and chroma-array fields
+ *    of p_Vid.
+ *
+ *    The bitstream reads the NAL payload in place (nalu->buf + 1), so
+ *    RBSPtoSODB operates on the NAL unit's own buffer. If the new SPS
+ *    carries the id of the active SPS but differs from it, the picture
+ *    being decoded (if any) is closed and the active SPS is cleared.
+ ************************************************************************
+ */
+void ProcessSPS(VideoParameters *p_Vid, NALU_t *nalu)
+{
+  DataPartition *dp = AllocPartition(1);
+  seq_parameter_set_rbsp_t *sps = AllocSPS();
+  Bitstream *bits = dp->bitstream;
+
+  // parse straight out of the NAL unit, skipping its first byte
+  bits->streamBuffer     = &nalu->buf[1];
+  bits->bitstream_length = RBSPtoSODB (bits->streamBuffer, nalu->len-1);
+  bits->code_len         = bits->bitstream_length;
+  bits->frame_bitoffset  = 0;
+  bits->read_len         = bits->frame_bitoffset;
+
+  InterpretSPS (p_Vid, dp, sps);
+
+  if (sps->Valid)
+  {
+    // a changed SPS under the active id invalidates the active one
+    if (p_Vid->active_sps
+        && sps->seq_parameter_set_id == p_Vid->active_sps->seq_parameter_set_id
+        && !sps_is_equal(sps, p_Vid->active_sps))
+    {
+      if (p_Vid->dec_picture)
+      {
+        // this may only happen on slice loss
+        exit_picture(p_Vid, &p_Vid->dec_picture);
+      }
+      p_Vid->active_sps = NULL;
+    }
+
+    MakeSPSavailable (p_Vid, sps->seq_parameter_set_id, sps);
+
+    p_Vid->profile_idc = sps->profile_idc;
+    p_Vid->separate_colour_plane_flag = sps->separate_colour_plane_flag;
+
+    // with separate colour planes the chroma array type is 0
+    if (p_Vid->separate_colour_plane_flag)
+      p_Vid->ChromaArrayType = 0;
+    else
+      p_Vid->ChromaArrayType = sps->chroma_format_idc;
+  }
+
+  FreePartition (dp, 1);
+  FreeSPS (sps);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Parse the PPS carried by nalu and store it in the PPS table.
+ *
+ *    The bitstream reads the NAL payload in place (nalu->buf + 1). If
+ *    the new PPS carries the id of the active PPS but differs from it,
+ *    the picture being decoded (if any) is closed and the active PPS is
+ *    cleared before the table entry is replaced.
+ ************************************************************************
+ */
+void ProcessPPS (VideoParameters *p_Vid, NALU_t *nalu)
+{
+  DataPartition *dp = AllocPartition(1);
+  pic_parameter_set_rbsp_t *pps = AllocPPS();
+  Bitstream *bits = dp->bitstream;
+
+  // parse straight out of the NAL unit, skipping its first byte
+  bits->streamBuffer     = &nalu->buf[1];
+  bits->bitstream_length = RBSPtoSODB (bits->streamBuffer, nalu->len-1);
+  bits->code_len         = bits->bitstream_length;
+  bits->frame_bitoffset  = 0;
+  bits->read_len         = bits->frame_bitoffset;
+
+  InterpretPPS (p_Vid, dp, pps);
+
+  // a changed PPS under the active id invalidates the active one
+  if (p_Vid->active_pps
+      && pps->pic_parameter_set_id == p_Vid->active_pps->pic_parameter_set_id
+      && !pps_is_equal(pps, p_Vid->active_pps))
+  {
+    if (p_Vid->dec_picture)
+    {
+      // this may only happen on slice loss
+      exit_picture(p_Vid, &p_Vid->dec_picture);
+    }
+    p_Vid->active_pps = NULL;
+  }
+
+  MakePPSavailable (p_Vid, pps->pic_parameter_set_id, pps);
+  FreePartition (dp, 1);
+  FreePPS (pps);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Recompute, for each of the three components, the largest sample
+ *    value allowed by its bit depth (2^bit_depth - 1) and its square.
+ *
+ ************************************************************************
+ */
+static void updateMaxValue(FrameFormat *format)
+{
+  int comp;
+
+  for (comp = 0; comp < 3; ++comp)
+  {
+    format->max_value[comp]    = (1 << format->bit_depth[comp]) - 1;
+    format->max_value_sq[comp] = format->max_value[comp] * format->max_value[comp];
+  }
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Fill the output frame format from the current picture geometry in
+ *    p_Vid and the cropping / chroma format fields of sps.
+ *
+ *    Cropping offsets from the SPS are scaled by per-chroma-format
+ *    factors (and doubled vertically when frame_mbs_only_flag is 0)
+ *    before being subtracted from the full width and height.
+ *
+ ************************************************************************
+ */
+static void reset_format_info(seq_parameter_set_rbsp_t *sps, VideoParameters *p_Vid, FrameFormat *output)
+{
+  // cropping unit factors, indexed by chroma_format_idc
+  static const int crop_unit_x[4] = { 1, 2, 2, 1};
+  static const int crop_unit_y[4] = { 1, 2, 1, 1};
+
+  int crop_left   = 0;
+  int crop_right  = 0;
+  int crop_top    = 0;
+  int crop_bottom = 0;
+
+  // cropping for luma
+  if (sps->frame_cropping_flag)
+  {
+    crop_left   = crop_unit_x[sps->chroma_format_idc] * sps->frame_cropping_rect_left_offset;
+    crop_right  = crop_unit_x[sps->chroma_format_idc] * sps->frame_cropping_rect_right_offset;
+    crop_top    = crop_unit_y[sps->chroma_format_idc] * ( 2 - sps->frame_mbs_only_flag ) *  sps->frame_cropping_rect_top_offset;
+    crop_bottom = crop_unit_y[sps->chroma_format_idc] * ( 2 - sps->frame_mbs_only_flag ) *  sps->frame_cropping_rect_bottom_offset;
+  }
+
+  output->width_crop  = p_Vid->width - crop_left - crop_right;
+  output->height_crop = p_Vid->height - crop_top - crop_bottom;
+
+  // full (uncropped) luma and chroma dimensions
+  output->width     = p_Vid->width;
+  output->height    = p_Vid->height;
+  output->width_cr  = p_Vid->width_cr;
+  output->height_cr = p_Vid->height_cr;
+
+  // output size (excluding padding)
+  output->size_cmp[0] = output->width * output->height;
+  output->size_cmp[1] = output->width_cr * output->height_cr;
+  output->size_cmp[2] = output->size_cmp[1];
+  output->size        = output->size_cmp[0] + output->size_cmp[1] + output->size_cmp[2];
+
+  // dimensions in macroblocks
+  output->mb_width    = output->width  / MB_BLOCK_SIZE;
+  output->mb_height   = output->height / MB_BLOCK_SIZE;
+
+  // both chroma components share the chroma bit depth
+  output->bit_depth[0] = p_Vid->bitdepth_luma;
+  output->bit_depth[1] = p_Vid->bitdepth_chroma;
+  output->bit_depth[2] = p_Vid->bitdepth_chroma;
+
+  output->yuv_format  = (ColorFormat) sps->chroma_format_idc;
+
+  updateMaxValue(output);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Make sps the active sequence parameter set.
+ *
+ *    When sps differs from the currently active one, the picture being
+ *    decoded (if any) is closed, the geometry, bit depth and motion
+ *    vector range in p_Vid are re-derived from sps, and the global
+ *    buffers, DPB, error concealment and caches are re-initialised.
+ *    The output format description is refreshed on every call.
+ *
+ ************************************************************************
+ */
+void activate_sps (VideoParameters *p_Vid, seq_parameter_set_rbsp_t *sps)
+{
+  InputParameters *p_Inp = p_Vid->p_Inp;
+
+  if (p_Vid->active_sps != sps)
+  {
+    int vmv_range_pel;
+
+    if (p_Vid->dec_picture)
+    {
+      // this may only happen on slice loss
+      exit_picture(p_Vid, &p_Vid->dec_picture);
+    }
+    p_Vid->active_sps = sps;
+
+    // chroma values stay at zero unless the format has chroma
+    p_Vid->bitdepth_chroma = 0;
+    p_Vid->width_cr        = 0;
+    p_Vid->height_cr       = 0;
+
+    // maximum vertical motion vector range, selected by level
+    if (sps->level_idc <= 10)
+      vmv_range_pel = 64;
+    else if (sps->level_idc <= 20)
+      vmv_range_pel = 128;
+    else if (sps->level_idc <= 30)
+      vmv_range_pel = 256;
+    else
+      vmv_range_pel = 512;
+    p_Vid->max_vmv_r = vmv_range_pel * 4; // stored in luma quarter pixel units
+
+    // Fidelity Range Extensions stuff (part 1)
+    p_Vid->bitdepth_luma       = (short) (sps->bit_depth_luma_minus8 + 8);
+    p_Vid->bitdepth_scale[0]   = 1 << sps->bit_depth_luma_minus8;
+    if (sps->chroma_format_idc != YUV400)
+    {
+      p_Vid->bitdepth_chroma   = (short) (sps->bit_depth_chroma_minus8 + 8);
+      p_Vid->bitdepth_scale[1] = 1 << sps->bit_depth_chroma_minus8;
+    }
+
+    // picture geometry in macroblocks
+    p_Vid->MaxFrameNum         = 1<<(sps->log2_max_frame_num_minus4+4);
+    p_Vid->PicWidthInMbs       = (sps->pic_width_in_mbs_minus1 +1);
+    p_Vid->PicHeightInMapUnits = (sps->pic_height_in_map_units_minus1 +1);
+    p_Vid->FrameHeightInMbs    = ( 2 - sps->frame_mbs_only_flag ) * p_Vid->PicHeightInMapUnits;
+    p_Vid->FrameSizeInMbs      = p_Vid->PicWidthInMbs * p_Vid->FrameHeightInMbs;
+
+    p_Vid->yuv_format=sps->chroma_format_idc;
+
+    // picture geometry in luma samples
+    p_Vid->width  = p_Vid->PicWidthInMbs * MB_BLOCK_SIZE;
+    p_Vid->height = p_Vid->FrameHeightInMbs * MB_BLOCK_SIZE;
+
+    // chroma plane size follows the subsampling of the format
+    switch (sps->chroma_format_idc)
+    {
+    case YUV420:
+      p_Vid->width_cr  = (p_Vid->width  >> 1);
+      p_Vid->height_cr = (p_Vid->height >> 1);
+      break;
+    case YUV422:
+      p_Vid->width_cr  = (p_Vid->width >> 1);
+      p_Vid->height_cr = p_Vid->height;
+      break;
+    case YUV444:
+      p_Vid->width_cr  = p_Vid->width;
+      p_Vid->height_cr = p_Vid->height;
+      break;
+    default:
+      // no chroma planes: sizes stay at the zero set above
+      break;
+    }
+
+    init_frext(p_Vid);
+    init_global_buffers(p_Vid);
+
+    if (!p_Vid->no_output_of_prior_pics_flag)
+    {
+      flush_dpb(p_Vid);
+    }
+    init_dpb(p_Vid);
+
+    ercInit(p_Vid, p_Vid->width, p_Vid->height, 1);
+    image_cache_set_dimensions(&p_Vid->image_cache[0], p_Vid->width, p_Vid->height);
+    image_cache_set_dimensions(&p_Vid->image_cache[1], p_Vid->width_cr, p_Vid->height_cr);
+    motion_cache_set_dimensions(&p_Vid->motion_cache, p_Vid->width / BLOCK_SIZE, p_Vid->height / BLOCK_SIZE);
+  }
+
+  reset_format_info(sps, p_Vid, &p_Inp->output);
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Make pps the active picture parameter set. Nothing happens when it
+ *    already is; otherwise the picture being decoded (if any) is closed
+ *    first and the 8x8 transform mode is taken from the new PPS.
+ ************************************************************************
+ */
+void activate_pps(VideoParameters *p_Vid, pic_parameter_set_rbsp_t *pps)
+{
+  if (p_Vid->active_pps == pps)
+    return;
+
+  if (p_Vid->dec_picture)
+  {
+    // this may only happen on slice loss
+    exit_picture(p_Vid, &p_Vid->dec_picture);
+  }
+
+  p_Vid->active_pps = pps;
+
+  // Fidelity Range Extensions stuff (part 2)
+  p_Vid->Transform8x8Mode = pps->transform_8x8_mode_flag;
+}
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Activate the picture parameter set with id PicParsetId, and the
+ *    sequence parameter set it references, for currSlice.
+ *
+ * \param currSlice
+ *    slice being set up; its nal_startcode_follows callback is chosen
+ *    from the entropy coding mode of the PPS
+ * \param PicParsetId
+ *    index into p_Vid->PicParSet
+ *
+ * \note
+ *    Both table indices are range-checked and reported through error()
+ *    before they are used; should error() return, the function returns
+ *    without activating anything. Parameter sets that are in range but
+ *    not marked valid are still only warned about, as before.
+ ************************************************************************
+ */
+void UseParameterSet (Slice *currSlice, int PicParsetId)
+{
+  VideoParameters *p_Vid = currSlice->p_Vid;
+  // capacity of the SPS table, taken from the structure itself
+  // (assumes SeqParSet is an array member -- TODO confirm)
+  const unsigned int n_sps = (unsigned int) (sizeof (p_Vid->SeqParSet) / sizeof (p_Vid->SeqParSet[0]));
+  pic_parameter_set_rbsp_t *pps;
+  seq_parameter_set_rbsp_t *sps;
+
+  // PicParsetId is used as a table index below
+  if (PicParsetId < 0 || PicParsetId >= MAXPPS)
+  {
+    error ("UseParameterSet: pic_parameter_set_id out of range", -1014);
+    return;
+  }
+  pps = &p_Vid->PicParSet[PicParsetId];
+
+  if (pps->Valid != TRUE)
+    printf ("Trying to use an invalid (uninitialized) Picture Parameter Set with ID %d, expect the unexpected...\n", PicParsetId);
+
+  // so is the SPS id stored in the PPS
+  if ((unsigned int) pps->seq_parameter_set_id >= n_sps)
+  {
+    error ("UseParameterSet: seq_parameter_set_id out of range", -1014);
+    return;
+  }
+  sps = &p_Vid->SeqParSet[pps->seq_parameter_set_id];
+
+  if (sps->Valid != TRUE)
+    printf ("PicParset %d references an invalid (uninitialized) Sequence Parameter Set with ID %d, expect the unexpected...\n", PicParsetId, (int) pps->seq_parameter_set_id);
+
+  // In theory, and with a well-designed software, the lines above
+  // are everything necessary.  In practice, we need to patch many values
+  // in p_Vid-> (but no more in p_Inp-> -- these have been taken care of)
+
+  // Sequence Parameter Set Stuff first
+
+  // only pic_order_cnt_type values 0..2 are accepted
+  if ((int) sps->pic_order_cnt_type < 0 || sps->pic_order_cnt_type > 2)  // != 1
+  {
+    printf ("invalid sps->pic_order_cnt_type = %d\n", (int) sps->pic_order_cnt_type);
+    error ("pic_order_cnt_type != 1", -1000);
+  }
+
+  if (sps->pic_order_cnt_type == 1)
+  {
+    if(sps->num_ref_frames_in_pic_order_cnt_cycle >= MAXnum_ref_frames_in_pic_order_cnt_cycle)
+    {
+      error("num_ref_frames_in_pic_order_cnt_cycle too large",-1011);
+    }
+  }
+
+  activate_sps(p_Vid, sps);
+  activate_pps(p_Vid, pps);
+
+  // currSlice->dp_mode is set by read_new_slice (NALU first byte available there)
+  if (pps->entropy_coding_mode_flag == CAVLC)
+  {
+    currSlice->nal_startcode_follows = uvlc_startcode_follows;
+  }
+  else
+  {
+    currSlice->nal_startcode_follows = cabac_startcode_follows;
+  }
+}
+
diff --git a/Src/h264dec/ldecod/src/prediction.asm b/Src/h264dec/ldecod/src/prediction.asm
new file mode 100644
index 00000000..33d3d499
--- /dev/null
+++ b/Src/h264dec/ldecod/src/prediction.asm
@@ -0,0 +1,1626 @@
+; Assembler setup: Pentium Pro class instruction set with XMM/MMX
+; instructions enabled, flat memory model.
+.686
+.XMM
+.model FLAT
+
+; Hard-coded byte offsets, named member@Structure.
+; NOTE(review): these presumably mirror the layout of the matching C
+; structures and must be updated by hand when those change -- confirm
+; against the C headers.
+copy_image_data_16x16_stride@OptimizedFunctions = 32
+dec_picture@VideoParameters = 698192
+p_Slice@MacroBlock = 0
+plane_images@StorablePicture = 158512
+mb_rec@Slice = 1696
+mb_pred@Slice = 928
+cof@Slice = 2464
+
+CONST SEGMENT
+align 16
+; eight 16-bit words of 32 (020H); itrans4x4 loads the first four as the
+; rounding term added before its final ">> 6"
+const32	DW 020H, 020H, 020H, 020H, 020H, 020H, 020H, 020H
+CONST ENDS
+
+;
+; _weighted_bi_prediction4x4
+;
+; Weighted bi-prediction of one 4x4 block of 8-bit samples, in place.
+; Arguments are read from the stack relative to esp at entry:
+;   [esp+4]  mb_pred       rows at 16-byte stride; read as the second
+;                          prediction and overwritten with the result
+;   [esp+8]  block_l0      first prediction, rows at 16-byte stride
+;   [esp+12] wp_scale_l0   weight applied to block_l0 (low 16 bits used)
+;   [esp+16] wp_scale_l1   weight applied to mb_pred  (low 16 bits used)
+;   [esp+20] wp_offset     offset added after the shift (low 16 bits used)
+;   [esp+24] weight_denom  shift amount
+;
+; For every sample, in 16-bit arithmetic:
+;   weight_denom >  0:
+;     ((l0*scale_l0 + pred*scale_l1 + (1 << (denom-1))) >> denom) + offset
+;   weight_denom <= 0:
+;     ((l0*scale_l0 + pred*scale_l1) << -denom) + offset
+; The result is saturated to 0..255 by packuswb.
+;
+; Uses eax, ecx, edx and mm0-mm7. No emms is executed here.
+; NOTE(review): presumably the caller is responsible for emms -- confirm.
+;
+; Fix: the third row used to be stored to 0[eax] in both paths, which
+; overwrote row 0 and left row 2 unweighted; it is now stored to 32[eax].
+;
+
+PUBLIC _weighted_bi_prediction4x4
+_TEXT	SEGMENT
+mb_pred = 4
+block_l0 = 8
+wp_scale_l0 = 12
+wp_scale_l1 = 16
+wp_offset = 20
+weight_denom = 24
+_weighted_bi_prediction4x4 PROC				; COMDAT
+  mov eax, DWORD PTR weight_denom[esp]
+  pxor mm0, mm0                                 ; mm0 = 0, used to widen bytes to words
+  pshufw mm1, MMWORD PTR wp_scale_l0[esp], 0    ; mm1 = wp_scale_l0 in all four words
+  test eax, eax
+  pshufw mm2, MMWORD PTR wp_scale_l1[esp], 0    ; mm2 = wp_scale_l1 in all four words
+  pshufw mm3, MMWORD PTR wp_offset[esp], 0      ; mm3 = wp_offset in all four words
+  jle	BI_PRED4x4@LEFT_SHIFT                     ; weight_denom <= 0: shift left instead
+
+  ; ---- weight_denom > 0: round and shift right ----
+  movd mm4, eax                 ; mm4 = shift count
+  lea	ecx, DWORD PTR [eax-1]
+  mov	edx, 1
+  shl	edx, cl                   ; edx = 1 << (weight_denom - 1)
+  movd mm5, edx
+  mov eax, mb_pred[esp]
+  mov edx, block_l0[esp]
+  pshufw mm5, mm5, 0            ; mm5 = rounding term in all four words
+
+  ; row 0 (the next mb_pred row is loaded early, before the store)
+  movd mm6, DWORD PTR 0[edx] ; block_l0
+  movd mm7, DWORD PTR 0[eax] ; mb_pred
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  movd mm7, DWORD PTR 16[eax] ; mb_pred
+  paddw mm6, mm5
+  psraw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 0[eax], mm6
+
+  ; row 1
+  movd mm6, DWORD PTR 16[edx] ; block_l0
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  movd mm7, DWORD PTR 32[eax] ; mb_pred
+  paddw mm6, mm5
+  psraw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 16[eax], mm6
+
+  ; row 2
+  movd mm6, DWORD PTR 32[edx] ; block_l0
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  movd mm7, DWORD PTR 48[eax] ; mb_pred
+  paddw mm6, mm5
+  psraw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 32[eax], mm6 ; row 2 result (was written to row 0)
+
+  ; row 3
+  movd mm6, DWORD PTR 48[edx] ; block_l0
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  paddw mm6, mm5
+  psraw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 48[eax], mm6
+  ret 0
+
+  ; ---- weight_denom <= 0: shift left by -weight_denom, no rounding ----
+BI_PRED4x4@LEFT_SHIFT:
+  neg eax
+  movd mm4, eax                 ; mm4 = left shift count
+  mov eax, mb_pred[esp]
+  mov edx, block_l0[esp]
+
+  ; row 0
+  movd mm6, DWORD PTR 0[edx] ; block_l0
+  movd mm7, DWORD PTR 0[eax] ; mb_pred
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  movd mm7, DWORD PTR 16[eax] ; mb_pred
+  psllw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 0[eax], mm6
+
+  ; row 1
+  movd mm6, DWORD PTR 16[edx] ; block_l0
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  movd mm7, DWORD PTR 32[eax] ; mb_pred
+  psllw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 16[eax], mm6
+
+  ; row 2
+  movd mm6, DWORD PTR 32[edx] ; block_l0
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  movd mm7, DWORD PTR 48[eax] ; mb_pred
+  psllw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 32[eax], mm6 ; row 2 result (was written to row 0)
+
+  ; row 3
+  movd mm6, DWORD PTR 48[edx] ; block_l0
+  punpcklbw mm6, mm0
+  punpcklbw mm7, mm0
+  pmullw	mm6, mm1
+  pmullw	mm7, mm2
+  paddw mm6, mm7
+  psllw	mm6, mm4
+  paddw mm6, mm3
+  packuswb mm6, mm6
+  movd DWORD PTR 48[eax], mm6
+  ret 0
+_weighted_bi_prediction4x4 ENDP
+_TEXT	ENDS
+
+; _itrans4x4_mmx
+;
+; Inverse 4x4 transform of one block of 16-bit coefficients, added to the
+; prediction and written out as 8-bit samples.
+; Arguments are read from the stack relative to esp at entry:
+;   [esp+4]  tblock   4x4 16-bit coefficients, 32 contiguous bytes
+;   [esp+8]  mb_pred  prediction samples, rows at 16-byte stride
+;   [esp+12] mb_rec   output samples, rows at 16-byte stride
+;   [esp+16] pos_x    horizontal position of the block, in samples
+;   [esp+20] pos_y    vertical position of the block, in rows
+;
+; The prologue forms the byte offset pos_y*16 + pos_x, adds it to mb_pred
+; and mb_rec, and falls through into _itrans4x4_mmx_direct.
+;
+; Uses eax, ecx, edx and mm0-mm7. No emms is executed here.
+; NOTE(review): presumably the caller is responsible for emms -- confirm.
+PUBLIC	_itrans4x4_mmx
+_TEXT	SEGMENT
+_tblock$ = 4						; size = 4
+_mb_pred$ = 8						; size = 4
+_mb_rec$ = 12						; size = 4
+_pos_x$ = 16						; size = 4
+_pos_y$ = 20						; size = 4
+_itrans4x4_mmx PROC					; COMDAT
+
+	mov	edx, DWORD PTR _pos_y$[esp]
+	shl	edx, 4
+	add	edx, DWORD PTR _pos_x$[esp]
+	mov eax, DWORD PTR _tblock$[esp]
+	mov	ecx, DWORD PTR _mb_pred$[esp]
+	add	ecx, edx
+	add edx, DWORD PTR _mb_rec$[esp]
+; _itrans4x4_mmx_direct: register-argument entry point nested inside
+; _itrans4x4_mmx. Expects
+;   eax = coefficient block
+;   ecx = first prediction sample of the block
+;   edx = first output sample of the block
+; and reads no stack arguments.
+_itrans4x4_mmx_direct PROC					; COMDAT
+			; load 4x4 matrix
+			movq mm0, MMWORD PTR 0[eax]
+			movq mm1, MMWORD PTR 8[eax]
+			movq mm2, MMWORD PTR 16[eax]
+			movq mm3, MMWORD PTR 24[eax]
+
+			; rotate 4x4 matrix
+			movq mm4, mm0 ; p0 = mm4 (copy)
+			punpcklwd mm0, mm2 ; r0 = mm0
+			punpckhwd mm4, mm2 ; r2 = mm4
+			movq mm5, mm1 ; p1 = mm5 (copy)
+			punpcklwd mm1, mm3 ; r1 = mm1
+			punpckhwd mm5, mm3 ; r3 = mm5
+			movq mm6, mm0 ; r0 = mm6 (copy)
+			punpcklwd mm0, mm1 ; t0 = mm0
+			punpckhwd mm6, mm1 ; t1 = mm6
+			movq mm1, mm4 ; r2 = mm1 (copy)
+			punpcklwd mm1, mm5 ; t2 = mm1
+			punpckhwd mm4, mm5 ; t3 = mm4
+
+			; first 1-D pass: p0 = t0+t2, p1 = t0-t2,
+			; p2 = (t1>>1)-t3, p3 = t1+(t3>>1)
+			movq mm2, mm0 ; mm2 = t0 (copy)
+			paddw mm0, mm1 ; mm0 = p0
+			psubw mm2, mm1 ; mm2 = p1, mm1 available
+			movq mm5, mm6 ; mm5 = t1 (copy)
+			psraw mm5, 1 ; mm5 = (t1 >> 1)
+			psubw mm5, mm4 ; mm5 = p2
+			psraw mm4, 1 ; mm4 = (t3 >> 1)
+			paddw mm6, mm4 ; mm6 = p3
+
+			; r0 = p0+p3, r1 = p1+p2, r2 = p1-p2, r3 = p0-p3
+			movq mm3, mm0 ; mm3 = p0 (copy)
+			paddw mm0, mm6 ; mm0 = r0
+			movq mm1, mm2 ; mm1 = p1 (copy)
+			paddw mm1, mm5 ; mm1 = r1
+			psubw mm2, mm5 ; mm2 = r2, mm5 available
+			psubw mm3, mm6 ; mm3 = r3
+
+			; rotate 4x4 matrix to set up for vertical
+			movq mm4, mm0 ; r0 = mm4 (copy)
+			punpcklwd mm0, mm2 ; p0 = mm0
+			punpckhwd mm4, mm2 ; p2 = mm4
+			movq mm5, mm1 ; r1 = mm5 (copy)
+			punpcklwd mm1, mm3 ; p1 = mm1
+			punpckhwd mm5, mm3 ; p3 = mm5
+			movq mm6, mm0 ; p0 = mm6 (copy)
+			punpcklwd mm0, mm1 ; t0 = mm0
+			punpckhwd mm6, mm1 ; t1 = mm6
+			movq mm1, mm4 ; p2 = mm1 (copy)
+			punpcklwd mm1, mm5 ; t2 = mm1
+			punpckhwd mm4, mm5 ; t3 = mm4
+
+			; second 1-D pass, same arithmetic as the first
+			movq mm2, mm0 ; mm2 = t0 (copy)
+			paddw mm0, mm1 ; mm0 = p0
+			psubw mm2, mm1 ; mm2 = p1, mm1 available
+			movq mm5, mm6 ; mm5 = t1 (copy)
+			psraw mm5, 1 ; mm5 = (t1 >> 1)
+			psubw mm5, mm4 ; mm5 = p2
+			psraw mm4, 1 ; mm4 = (t3 >> 1)
+			paddw mm6, mm4 ; mm6 = p3
+			movq mm3, mm0 ; mm3 = p0 (copy)
+			paddw mm0, mm6 ; mm0 = r0
+			movq mm1, mm2 ; mm1 = p1 (copy)
+			paddw mm1, mm5 ; mm1 = r1
+			psubw mm2, mm5 ; mm2 = r2, mm5 available
+			psubw mm3, mm6 ; mm3 = r3
+
+
+; --- 4x4 iDCT done, now time to combine with mpr --- 
+
+			; mm7 = four words of 32 (first half of const32)
+			movq	mm7, MMWORD PTR const32
+
+			paddw mm0, mm7 ; rres + 32
+			psraw mm0, 6 ; (rres + 32) >> 6
+			paddw mm1, mm7 ; rres + 32
+			psraw mm1, 6 ; (rres + 32) >> 6
+			paddw mm2, mm7 ; rres + 32
+			psraw mm2, 6 ; (rres + 32) >> 6
+			paddw mm3, mm7 ; rres + 32
+			psraw mm3, 6 ; (rres + 32) >> 6
+
+			; mm7 = 0 from here on: zero source for unpack and pack
+			pxor mm7, mm7
+
+			; convert mpr from unsigned char to short
+			movd mm4, DWORD PTR 0[ecx]
+			movd mm5, DWORD PTR 16[ecx]
+			movd mm6, DWORD PTR 32[ecx]
+			punpcklbw mm4, mm7
+			punpcklbw mm5, mm7
+			punpcklbw mm6, mm7
+			paddsw mm4, mm0 ; pred_row + rres_row
+			movd mm0, DWORD PTR 48[ecx] ; reuse mm0 for mpr[3]
+			paddsw mm5, mm1 ; pred_row + rres_row
+			punpcklbw mm0, mm7
+			paddsw mm6, mm2 ; pred_row + rres_row			
+			paddsw mm0, mm3 ; pred_row + rres_row
+			; results in mm4, mm5, mm6, mm0
+			
+			; move back to 8 bit
+			; (packuswb saturates each word to 0..255)
+			packuswb mm4, mm7
+			packuswb mm5, mm7
+			packuswb mm6, mm7
+			packuswb mm0, mm7
+			; store four samples per row, rows 16 bytes apart
+			movd DWORD PTR 0[edx], mm4
+			movd DWORD PTR 16[edx], mm5
+			movd DWORD PTR 32[edx], mm6
+			movd DWORD PTR 48[edx], mm0
+	ret	0
+
+_itrans4x4_mmx_direct ENDP
+_itrans4x4_mmx ENDP
+_TEXT	ENDS
+
+EXTRN	_itrans_sp:PROC
+EXTRN	_Inv_Residual_trans_4x4:PROC
+PUBLIC	_iMBtrans4x4
+EXTRN	_opt:BYTE
+_TEXT	SEGMENT
+_currSlice$ = -4					; size = 4
+_mb_rec$166704 = 8					; size = 4
+_currMB$ = 8						; size = 4
+_curr_img$ = 12						; size = 4
+_pl$ = 8 ; second parameter
+_smb$ = 16						; size = 4
+_iMBtrans4x4 PROC
+	push	ecx
+	push	ebx
+	push	ebp
+	push	esi
+STACKOFFSET = 16
+; 408  :   VideoImage *curr_img = pl ? dec_picture->imgUV[pl - 1]: dec_picture->imgY;
+
+	mov	esi, DWORD PTR _pl$[esp+STACKOFFSET]
+	push	edi
+STACKOFFSET = STACKOFFSET + 4
+	mov	edi, DWORD PTR _currMB$[esp+16]
+	mov	ebp, DWORD PTR [edi+p_Slice@MacroBlock] ; ebp: currMB->p_Slice
+	mov	eax, DWORD PTR [edi+4]
+	mov	eax, DWORD PTR [eax+dec_picture@VideoParameters] ; eax: p_Vid->dec_picture;
+	mov	DWORD PTR _currSlice$[esp+20], ebp
+	mov	ecx, DWORD PTR [eax+esi*4+plane_images@StorablePicture]
+	mov	DWORD PTR _curr_img$[esp+16], ecx
+	
+	cmp	DWORD PTR _smb$[esp+16], 0 ; if (smb)
+; 413  : 	{
+; 414  : 		h264_short_block_t *blocks = currSlice->cof4[pl];
+; 415  : 		const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+; 416  : 
+; 417  : 		itrans_sp(blocks[0], mb_pred, currMB, pl, 0, 0);
+
+	je	$LN4@iMBtrans4x
+	push	0
+	push	0
+	mov	eax, esi
+	shl	eax, 9
+	lea	ebx, DWORD PTR [eax+ebp+cof@Slice]
+	mov	ecx, esi
+	shl	ecx, 8
+	lea	ebp, DWORD PTR [ecx+ebp+mb_pred@Slice]
+	push	esi
+	push	ebp
+	push	ebx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 418  : 		itrans_sp(blocks[1], mb_pred, currMB, pl, 4, 0);
+
+	push	0
+	push	4
+	push	esi
+	lea	edx, DWORD PTR [ebx+32]
+	push	ebp
+	push	edx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 419  : 		itrans_sp(blocks[2], mb_pred, currMB, pl, 0, 4);
+
+	push	4
+	push	0
+	push	esi
+	lea	eax, DWORD PTR [ebx+64]
+	push	ebp
+	push	eax
+	mov	eax, edi
+	call	_itrans_sp
+
+; 420  : 		itrans_sp(blocks[3], mb_pred, currMB, pl, 4, 4);
+
+	push	4
+	push	4
+	push	esi
+	lea	ecx, DWORD PTR [ebx+96]
+	push	ebp
+	push	ecx
+	mov	eax, edi
+	call	_itrans_sp
+	add	esp, 80					; 00000050H
+
+; 421  : 		itrans_sp(blocks[4], mb_pred, currMB, pl, 8, 0);
+
+	push	0
+	push	8
+	push	esi
+	lea	edx, DWORD PTR [ebx+128]
+	push	ebp
+	push	edx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 422  : 		itrans_sp(blocks[5], mb_pred, currMB, pl, 12, 0);
+
+	push	0
+	push	12					; 0000000cH
+	push	esi
+	lea	eax, DWORD PTR [ebx+160]
+	push	ebp
+	push	eax
+	mov	eax, edi
+	call	_itrans_sp
+
+; 423  : 		itrans_sp(blocks[6], mb_pred, currMB, pl, 8, 4);
+
+	push	4
+	push	8
+	push	esi
+	lea	ecx, DWORD PTR [ebx+192]
+	push	ebp
+	push	ecx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 424  : 		itrans_sp(blocks[7], mb_pred, currMB, pl, 12, 4);
+
+	push	4
+	push	12					; 0000000cH
+	push	esi
+	lea	edx, DWORD PTR [ebx+224]
+	push	ebp
+	push	edx
+	mov	eax, edi
+	call	_itrans_sp
+	add	esp, 80					; 00000050H
+
+; 425  : 		itrans_sp(blocks[8], mb_pred, currMB, pl, 0, 8);
+
+	push	8
+	push	0
+	push	esi
+	lea	eax, DWORD PTR [ebx+256]
+	push	ebp
+	push	eax
+	mov	eax, edi
+	call	_itrans_sp
+
+; 426  : 		itrans_sp(blocks[9], mb_pred, currMB, pl, 4, 8);
+
+	push	8
+	push	4
+	push	esi
+	push	ebp
+	lea	ecx, DWORD PTR [ebx+288]
+	push	ecx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 427  : 		itrans_sp(blocks[10], mb_pred, currMB, pl, 0, 12);
+
+	push	12					; 0000000cH
+	push	0
+	push	esi
+	lea	edx, DWORD PTR [ebx+320]
+	push	ebp
+	push	edx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 428  : 		itrans_sp(blocks[11], mb_pred, currMB, pl, 4, 12);
+
+	push	12					; 0000000cH
+	push	4
+	push	esi
+	lea	eax, DWORD PTR [ebx+352]
+	push	ebp
+	push	eax
+	mov	eax, edi
+	call	_itrans_sp
+	add	esp, 80					; 00000050H
+
+; 429  : 		itrans_sp(blocks[12], mb_pred, currMB, pl, 8, 8);
+
+	push	8
+	push	8
+	push	esi
+	lea	ecx, DWORD PTR [ebx+384]
+	push	ebp
+	push	ecx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 430  : 		itrans_sp(blocks[13], mb_pred, currMB, pl, 12, 8);
+
+	push	8
+	push	12					; 0000000cH
+	push	esi
+	lea	edx, DWORD PTR [ebx+416]
+	push	ebp
+	push	edx
+	mov	eax, edi
+	call	_itrans_sp
+
+; 431  : 		itrans_sp(blocks[14], mb_pred, currMB, pl, 8, 12);
+
+	push	12					; 0000000cH
+	push	8
+	push	esi
+	lea	eax, DWORD PTR [ebx+448]
+	push	ebp
+	push	eax
+	mov	eax, edi
+	call	_itrans_sp
+
+; 432  : 		itrans_sp(blocks[15], mb_pred, currMB, pl, 12, 12);
+
+	push	12					; 0000000cH
+	push	12					; 0000000cH
+	push	esi
+	add	ebx, 480				; 000001e0H
+	push	ebp
+	push	ebx
+	mov	eax, edi
+	call	_itrans_sp
+		mov	ebp, DWORD PTR _currSlice$[esp+100]
+	add	esp, 80					; 00000050H
+	jmp	COPY_16x16
+	
+$LN4@iMBtrans4x:
+
+; 433  : 	}
+; 434  : 	else if (currMB->is_lossless)
+
+	cmp	DWORD PTR [edi+84], 0
+	je	$LN2@iMBtrans4x
+
+	push	0
+	push	0
+
+; 435  : 	{
+; 436  : 		Inv_Residual_trans_4x4(currMB, pl, 0, 0);
+
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 437  : 		Inv_Residual_trans_4x4(currMB, pl, 4, 0);
+
+	push	0
+	push	4
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 438  : 		Inv_Residual_trans_4x4(currMB, pl, 0, 4);
+
+	push	4
+	push	0
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 439  : 		Inv_Residual_trans_4x4(currMB, pl, 4, 4);
+
+	push	4
+	push	4
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+	add	esp, 64					; 00000040H
+
+; 440  : 		Inv_Residual_trans_4x4(currMB, pl, 8, 0);
+
+	push	0
+	push	8
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 441  : 		Inv_Residual_trans_4x4(currMB, pl, 12, 0);
+
+	push	0
+	push	12					; 0000000cH
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 442  : 		Inv_Residual_trans_4x4(currMB, pl, 8, 4);
+
+	push	4
+	push	8
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 443  : 		Inv_Residual_trans_4x4(currMB, pl, 12, 4);
+
+	push	4
+	push	12					; 0000000cH
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+	add	esp, 64					; 00000040H
+
+; 444  : 		Inv_Residual_trans_4x4(currMB, pl, 0, 8);
+
+	push	8
+	push	0
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 445  : 		Inv_Residual_trans_4x4(currMB, pl, 4, 8);
+
+	push	8
+	push	4
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 446  : 		Inv_Residual_trans_4x4(currMB, pl, 0, 12);
+
+	push	12					; 0000000cH
+	push	0
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 447  : 		Inv_Residual_trans_4x4(currMB, pl, 4, 12);
+
+	push	12					; 0000000cH
+	push	4
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+	add	esp, 64					; 00000040H
+
+; 448  : 		Inv_Residual_trans_4x4(currMB, pl, 8, 8);
+
+	push	8
+	push	8
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 449  : 		Inv_Residual_trans_4x4(currMB, pl, 12, 8);
+
+	push	8
+	push	12					; 0000000cH
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 450  : 		Inv_Residual_trans_4x4(currMB, pl, 8, 12);
+
+	push	12					; 0000000cH
+	push	8
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+
+; 451  : 		Inv_Residual_trans_4x4(currMB, pl, 12, 12);
+
+	push	12					; 0000000cH
+	push	12					; 0000000cH
+	push	esi
+	push	edi
+	call	_Inv_Residual_trans_4x4
+	add	esp, 64					; 00000040H
+
+; 452  : 	}
+; 453  : 	else
+
+	jmp	COPY_16x16
+$LN2@iMBtrans4x:
+
+; 454  : 	{
+; 455  : 			const h264_short_block_t *blocks = currSlice->cof4[pl];
+; 456  : 			const h264_imgpel_macroblock_row_t *mb_pred=currSlice->mb_pred[pl];
+
+	mov	edx, esi
+	mov	ecx, esi
+	shl	edx, 8
+	shl	ecx, 9
+	lea	eax, DWORD PTR [edx+ebp]
+	lea	ebx, DWORD PTR [ecx+ebp+cof@Slice]
+
+; 457  : 			h264_imgpel_macroblock_row_t *mb_rec = currSlice->mb_rec[pl];
+	
+	; put things in registers that itrans4x4_mmx_direct wants
+	lea edx, [eax + mb_rec@Slice]; mb_rec
+	lea ecx, [eax + mb_pred@Slice] ; mb_pred
+	mov eax, ebx ; blocks
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[0], mb_pred, mb_rec, 0, 0);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+32]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[1], mb_pred, mb_rec, 4, 0);
+
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+128]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[4], mb_pred, mb_rec, 8, 0);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+160]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[5], mb_pred, mb_rec, 12, 0);
+
+	; second row
+	lea edx, [edx+52]
+	lea ecx, [ecx+52]
+	lea eax, [ebx+64]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[2], mb_pred, mb_rec, 0, 4);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+96]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[3], mb_pred, mb_rec, 4, 4);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+192]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[6], mb_pred, mb_rec, 8, 4);
+
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+224]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[7], mb_pred, mb_rec, 12, 4);
+
+	; third row
+	lea edx, [edx+52]
+	lea ecx, [ecx+52]
+	lea eax, [ebx+256]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[8], mb_pred, mb_rec, 0, 8);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+288]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[9], mb_pred, mb_rec, 4, 8);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+384]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[12], mb_pred, mb_rec, 8, 8);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+416]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[13], mb_pred, mb_rec, 12, 8);
+	
+	; fourth row
+	lea edx, [edx+52]
+	lea ecx, [ecx+52]
+	lea eax, [ebx+320]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[10], mb_pred, mb_rec, 0, 12);
+
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+352]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[11], mb_pred, mb_rec, 4, 12);
+	
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+448]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[14], mb_pred, mb_rec, 8, 12);
+
+	lea edx, [edx+4]
+	lea ecx, [ecx+4]
+	lea eax, [ebx+480]
+	call	_itrans4x4_mmx_direct ; opt_itrans4x4(blocks[15], mb_pred, mb_rec, 12, 12);
+COPY_16x16:
+
+; construct picture from 4x4 blocks
+; opt_copy_image_data_16x16_stride(curr_img, currMB->pix_x, currMB->pix_y, currSlice->mb_rec[pl]);
+
+	mov	eax, DWORD PTR [edi+40]
+	mov	ecx, DWORD PTR [edi+36]
+	shl	esi, 8
+	lea	edx, DWORD PTR [esi+ebp+mb_rec@Slice]
+	push	edx
+	mov	edx, DWORD PTR _curr_img$[esp+20]
+	push	eax
+	push	ecx
+	push	edx
+	call	DWORD PTR _opt+copy_image_data_16x16_stride@OptimizedFunctions
+	add	esp, 16					; 00000010H
+	pop	edi
+	pop	esi
+	pop	ebp
+	pop	ebx
+	pop	ecx
+	ret	0
+_iMBtrans4x4 ENDP
+_TEXT	ENDS
+
+_TEXT	SEGMENT 
+       ALIGN     2
+	PUBLIC _itrans8x8_sse2
+_itrans8x8_sse2	PROC NEAR
+; parameter 1(mb_rec): 8 + ebp   - output rows; 8 bytes are stored at offsets 0,16,...,112 from mb_rec+pos_x
+; parameter 2(mb_pred): 12 + ebp - prediction rows; 8 bytes are loaded from offsets 0,16,...,112 from mb_pred+pos_x
+; parameter 3(block): 16 + ebp   - 8 rows of 8 16-bit coefficients, read with movdqa at offsets 0..112 (so block must be 16-byte aligned)
+; parameter 4(pos_x): 20 + ebp   - byte offset added to both mb_rec and mb_pred before any access
+        push      ebp
+        mov       ebp, esp
+        and       esp, -16                                      ; align esp to 16 bytes: the movdqa spills to [esp+N] below require it
+        sub       esp, 176                                      ; scratch area: [esp] keeps const32, [esp+16]..[esp+144] are used for spilled rows
+        mov edx, DWORD PTR [ebp+20] ; edx: pos_x
+        mov ecx, DWORD PTR [ebp+8] ; ecx: mb_rec
+        add ecx, edx
+        add edx, DWORD PTR [ebp+12] ; edx: mb_pred
+        mov eax, DWORD PTR [ebp+16] ; eax: block
+
+;;; 		__m128i a0, a1, a2, a3;
+;;; 		__m128i p0, p1, p2, p3, p4, p5 ,p6, p7;
+;;; 		__m128i b0, b1, b2, b3, b4, b5, b6, b7;
+;;; 		__m128i r0, r1, r2, r3, r4, r5, r6, r7;
+;;; 		__m128i const32, zero;
+;;; 		__declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
+;;; 		__m128i pred0, pred1;
+;;; 
+;;; 		const32 = _mm_load_si128((const __m128i *)c32);
+
+        movdqa    xmm0, XMMWORD PTR const32
+
+;;; 		zero = _mm_setzero_si128();
+;;; 
+;;; 				// Horizontal
+;;; 			b0 = _mm_load_si128((__m128i *)(block[0])); 
+
+        movdqa    xmm4, XMMWORD PTR [eax]
+
+;;; 			b1 = _mm_load_si128((__m128i *)(block[1])); 
+
+        movdqa    xmm7, XMMWORD PTR [eax+16]
+
+;;; 			b2 = _mm_load_si128((__m128i *)(block[2])); 
+
+        movdqa    xmm5, XMMWORD PTR [eax+32]
+
+;;; 			b3 = _mm_load_si128((__m128i *)(block[3])); 
+
+        movdqa    xmm3, XMMWORD PTR [eax+48]
+
+;;; 			b4 = _mm_load_si128((__m128i *)(block[4])); 
+
+        movdqa    xmm6, XMMWORD PTR [eax+64]
+
+;;; 			b5 = _mm_load_si128((__m128i *)(block[5])); 
+;;; 			b6 = _mm_load_si128((__m128i *)(block[6])); 
+
+        movdqa    xmm1, XMMWORD PTR [eax+96]
+
+;;; 			b7 = _mm_load_si128((__m128i *)(block[7])); 
+
+        movdqa    xmm2, XMMWORD PTR [eax+112]
+        movdqa    XMMWORD PTR [esp], xmm0
+        movdqa    xmm0, XMMWORD PTR [eax+80]
+        movdqa    XMMWORD PTR [esp+16], xmm2
+
+;;; 
+;;; 			/* rotate 8x8 (ugh) */
+;;; 			r0 = _mm_unpacklo_epi16(b0, b2); 
+
+        movdqa    xmm2, xmm4
+        punpcklwd xmm2, xmm5
+
+;;; 			r1 = _mm_unpacklo_epi16(b1, b3); 
+;;; 			r2 = _mm_unpackhi_epi16(b0, b2); 
+
+        punpckhwd xmm4, xmm5
+
+;;; 			r3 = _mm_unpackhi_epi16(b1, b3); 
+;;; 			r4 = _mm_unpacklo_epi16(b4, b6); 
+;;; 			r5 = _mm_unpacklo_epi16(b5, b7); 
+
+        movdqa    xmm5, xmm0
+        movdqa    XMMWORD PTR [esp+32], xmm2
+        movdqa    xmm2, xmm7
+        punpcklwd xmm2, xmm3
+        punpckhwd xmm7, xmm3
+        movdqa    xmm3, xmm6
+        punpcklwd xmm3, xmm1
+        movdqa    XMMWORD PTR [esp+48], xmm3
+        movdqa    xmm3, XMMWORD PTR [esp+16]
+        punpcklwd xmm5, xmm3
+
+;;; 			r6 = _mm_unpackhi_epi16(b4, b6); 
+
+        punpckhwd xmm6, xmm1
+;;; 			r7 = _mm_unpackhi_epi16(b5, b7); 
+
+        punpckhwd xmm0, xmm3
+
+;;; 
+;;; 			b0 = _mm_unpacklo_epi16(r0, r1); 
+
+        movdqa    xmm3, XMMWORD PTR [esp+32]
+        movdqa    xmm1, xmm3
+        punpcklwd xmm1, xmm2
+
+;;; 			b1 = _mm_unpackhi_epi16(r0, r1); 
+
+        punpckhwd xmm3, xmm2
+
+;;; 			b2 = _mm_unpacklo_epi16(r2, r3); 
+
+        movdqa    xmm2, xmm4
+        punpcklwd xmm2, xmm7
+
+;;; 			b3 = _mm_unpackhi_epi16(r2, r3); 
+
+        punpckhwd xmm4, xmm7
+        movdqa    XMMWORD PTR [esp+64], xmm4
+
+;;; 			b4 = _mm_unpacklo_epi16(r4, r5); 
+
+        movdqa    xmm4, XMMWORD PTR [esp+48]
+        movdqa    xmm7, xmm4
+        punpcklwd xmm7, xmm5
+
+;;; 			b5 = _mm_unpackhi_epi16(r4, r5); 
+
+        punpckhwd xmm4, xmm5
+
+;;; 			b6 = _mm_unpacklo_epi16(r6, r7); 
+
+        movdqa    xmm5, xmm6
+        punpcklwd xmm5, xmm0
+
+;;; 			b7 = _mm_unpackhi_epi16(r6, r7); 
+
+        punpckhwd xmm6, xmm0
+
+;;; 
+;;; 			p0 = _mm_unpacklo_epi64(b0, b4);
+
+        movdqa    xmm0, xmm1
+        punpcklqdq xmm0, xmm7
+
+;;; 			p1 = _mm_unpackhi_epi64(b0, b4);
+
+        punpckhqdq xmm1, xmm7
+        movdqa    XMMWORD PTR [esp+16], xmm1
+
+;;; 			p2 = _mm_unpacklo_epi64(b1, b5);
+
+        movdqa    xmm1, xmm3
+        punpcklqdq xmm1, xmm4
+
+;;; 			p3 = _mm_unpackhi_epi64(b1, b5);
+;;; 			p4 = _mm_unpacklo_epi64(b2, b6);
+;;; 			p5 = _mm_unpackhi_epi64(b2, b6);
+;;; 			p6 = _mm_unpacklo_epi64(b3, b7);
+;;; 			p7 = _mm_unpackhi_epi64(b3, b7);
+;;; 
+;;; 			/* perform approx DCT */
+;;; 						a0 = _mm_add_epi16(p0, p4); // p0 + p4
+;;; 			a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+;;; 			r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+
+        movdqa    xmm7, xmm1
+        psraw     xmm7, 1
+        punpckhqdq xmm3, xmm4
+        movdqa    XMMWORD PTR [esp+32], xmm3
+        movdqa    xmm3, xmm2
+        punpcklqdq xmm3, xmm5
+        punpckhqdq xmm2, xmm5
+        movdqa    xmm5, XMMWORD PTR [esp+64]
+        movdqa    xmm4, xmm5
+        punpcklqdq xmm4, xmm6
+        punpckhqdq xmm5, xmm6
+        movdqa    xmm6, xmm0
+        paddw     xmm6, xmm3
+        psubw     xmm0, xmm3
+
+;;; 			a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+
+        movdqa    xmm3, xmm4
+
+;;; 			r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+
+        psraw     xmm4, 1
+        psubw     xmm3, xmm7
+
+;;; 			a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+        paddw     xmm1, xmm4
+
+;;; 
+;;; 			b0 =  _mm_add_epi16(a0, a3); // a0 + a3;
+
+        movdqa    xmm4, xmm6
+
+;;; 			b2 =  _mm_sub_epi16(a1, a2);  // a1 - a2;
+
+        movdqa    xmm7, xmm0
+        paddw     xmm4, xmm1
+        psubw     xmm7, xmm3
+        movdqa    XMMWORD PTR [esp+48], xmm7
+
+;;; 			b4 =  _mm_add_epi16(a1, a2);    // a1 + a2;
+
+        paddw     xmm0, xmm3
+        movdqa    XMMWORD PTR [esp+80], xmm0
+
+;;; 			b6 =  _mm_sub_epi16(a0, a3);  // a0 - a3;
+;;; 
+;;; 			//-p3 + p5 - p7 - (p7 >> 1);
+;;; 			r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+;;; 			a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+
+        movdqa    xmm0, XMMWORD PTR [esp+32]
+        psubw     xmm6, xmm1
+        movdqa    xmm1, xmm5
+        psraw     xmm1, 1
+        movdqa    xmm3, xmm2
+
+;;; 			a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+;;; 			a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+;;; 
+;;; 			//p1 + p7 - p3 - (p3 >> 1);
+;;; 			r0 =  _mm_srai_epi16(p3, 1); // (p3 >> 1)
+
+        movdqa    xmm7, xmm0
+        movdqa    XMMWORD PTR [esp+96], xmm6
+
+;;; 			a1 = _mm_add_epi16(p1, p7); // p1 + p7
+
+        movdqa    xmm6, XMMWORD PTR [esp+16]
+        psubw     xmm3, xmm0
+        psubw     xmm3, xmm5
+        psraw     xmm7, 1
+        psubw     xmm3, xmm1
+        movdqa    xmm1, xmm6
+        paddw     xmm1, xmm5
+
+;;; 			a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+
+        psubw     xmm1, xmm0
+
+;;; 			a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+        psubw     xmm1, xmm7
+
+;;; 
+;;; 			// -p1 + p7 + p5 + (p5 >> 1);
+;;; 			r0 =  _mm_srai_epi16(p5, 1); // (p5 >> 1)
+
+        movdqa    xmm7, xmm2
+        psraw     xmm7, 1
+
+;;; 			a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+
+        psubw     xmm5, xmm6
+
+;;; 			a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+
+        paddw     xmm5, xmm2
+
+;;; 			a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+        paddw     xmm5, xmm7
+
+;;; 
+;;; 			// p3 + p5 + p1 + (p1 >> 1);
+;;; 			a3 = _mm_add_epi16(p3, p5); // p3+p5
+
+        paddw     xmm0, xmm2
+
+;;; 			a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+;;; 			p1 = _mm_srai_epi16(p1, 1); // p1 >> 1
+;;; 			a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
+;;; 
+;;; 			r0 = _mm_srai_epi16(a3, 2); // a3>>2
+;;; 			b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
+;;; 			r0 = _mm_srai_epi16(a2, 2); // a2>>2
+;;; 			b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
+;;; 			a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+;;; 			b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+;;; 			a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+;;; 			b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
+;;; 
+;;; 			p0 = _mm_add_epi16(b0, b7); // b0 + b7;
+;;; 			p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+;;; 			p2 = _mm_add_epi16(b4, b3); // b4 + b3;
+;;; 			p3 = _mm_add_epi16(b6, b1); // b6 + b1;
+
+        movdqa    xmm2, XMMWORD PTR [esp+96]
+        paddw     xmm0, xmm6
+        psraw     xmm6, 1
+        paddw     xmm0, xmm6
+        movdqa    xmm7, xmm0
+        movdqa    xmm6, xmm5
+        psraw     xmm7, 2
+        paddw     xmm7, xmm3
+        psraw     xmm6, 2
+        paddw     xmm6, xmm1
+        psraw     xmm1, 2
+        psubw     xmm5, xmm1
+        movdqa    xmm1, xmm4
+        psraw     xmm3, 2
+        psubw     xmm0, xmm3
+        movdqa    xmm3, XMMWORD PTR [esp+80]
+        movdqa    XMMWORD PTR [esp+32], xmm0
+
+;;; 			p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+;;; 			p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+;;; 			p6 = _mm_add_epi16(b2, b5); // b2 + b5;
+;;; 			p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+        psubw     xmm4, XMMWORD PTR [esp+32]
+        paddw     xmm1, xmm0
+        movdqa    XMMWORD PTR [esp+112], xmm1
+        movdqa    xmm1, XMMWORD PTR [esp+48]
+        movdqa    xmm0, xmm1
+        psubw     xmm0, xmm5
+        movdqa    XMMWORD PTR [esp+16], xmm0
+        movdqa    xmm0, xmm3
+        paddw     xmm0, xmm6
+        psubw     xmm3, xmm6
+        movdqa    XMMWORD PTR [esp+128], xmm0
+
+;;; 
+;;; 						/* rotate 8x8 (ugh) */
+;;; 			r0 = _mm_unpacklo_epi16(p0, p2); 
+
+        movdqa    xmm6, XMMWORD PTR [esp+128]
+        movdqa    xmm0, xmm2
+        paddw     xmm0, xmm7
+        psubw     xmm2, xmm7
+        paddw     xmm1, xmm5
+        movdqa    xmm5, XMMWORD PTR [esp+112]
+        movdqa    XMMWORD PTR [esp+144], xmm4
+        movdqa    xmm4, xmm5
+        punpcklwd xmm4, xmm6
+
+;;; 			r1 = _mm_unpacklo_epi16(p1, p3); 
+;;; 			r2 = _mm_unpackhi_epi16(p0, p2); 
+
+        punpckhwd xmm5, xmm6
+
+;;; 			r3 = _mm_unpackhi_epi16(p1, p3); 
+;;; 			r4 = _mm_unpacklo_epi16(p4, p6); 
+;;; 			r5 = _mm_unpacklo_epi16(p5, p7); 
+
+        movdqa    xmm6, xmm3
+        movdqa    XMMWORD PTR [esp+64], xmm4
+        movdqa    xmm4, XMMWORD PTR [esp+16]
+        movdqa    xmm7, xmm4
+        punpcklwd xmm7, xmm0
+        punpckhwd xmm4, xmm0
+        movdqa    xmm0, xmm2
+        punpcklwd xmm0, xmm1
+        movdqa    XMMWORD PTR [esp+128], xmm0
+        movdqa    xmm0, XMMWORD PTR [esp+144]
+        punpcklwd xmm6, xmm0
+
+;;; 			r6 = _mm_unpackhi_epi16(p4, p6); 
+
+        punpckhwd xmm2, xmm1
+
+;;; 			r7 = _mm_unpackhi_epi16(p5, p7); 
+;;; 
+;;; 			b0 = _mm_unpacklo_epi16(r0, r1); 
+
+        movdqa    xmm1, XMMWORD PTR [esp+64]
+        punpckhwd xmm3, xmm0
+        movdqa    xmm0, xmm1
+        punpcklwd xmm0, xmm7
+
+;;; 			b1 = _mm_unpackhi_epi16(r0, r1); 
+
+        punpckhwd xmm1, xmm7
+
+;;; 			b2 = _mm_unpacklo_epi16(r2, r3); 
+
+        movdqa    xmm7, xmm5
+        punpcklwd xmm7, xmm4
+
+;;; 			b3 = _mm_unpackhi_epi16(r2, r3); 
+
+        punpckhwd xmm5, xmm4
+        movdqa    XMMWORD PTR [esp+112], xmm5
+
+;;; 			b4 = _mm_unpacklo_epi16(r4, r5); 
+
+        movdqa    xmm5, XMMWORD PTR [esp+128]
+        movdqa    xmm4, xmm5
+        punpcklwd xmm4, xmm6
+
+;;; 			b5 = _mm_unpackhi_epi16(r4, r5); 
+
+        punpckhwd xmm5, xmm6
+
+;;; 			b6 = _mm_unpacklo_epi16(r6, r7); 
+
+        movdqa    xmm6, xmm2
+        punpcklwd xmm6, xmm3
+
+;;; 			b7 = _mm_unpackhi_epi16(r6, r7); 
+
+        punpckhwd xmm2, xmm3
+
+;;; 
+;;; 			p0 = _mm_unpacklo_epi64(b0, b4);
+
+        movdqa    xmm3, xmm0
+        punpcklqdq xmm3, xmm4
+
+;;; 			p1 = _mm_unpackhi_epi64(b0, b4);
+
+        punpckhqdq xmm0, xmm4
+        movdqa    XMMWORD PTR [esp+144], xmm0
+
+;;; 			p2 = _mm_unpacklo_epi64(b1, b5);
+;;; 			p3 = _mm_unpackhi_epi64(b1, b5);
+;;; 			p4 = _mm_unpacklo_epi64(b2, b6);
+;;; 			p5 = _mm_unpackhi_epi64(b2, b6);
+;;; 			p6 = _mm_unpacklo_epi64(b3, b7);
+
+        movdqa    xmm0, XMMWORD PTR [esp+112]
+        movdqa    xmm4, xmm1
+        punpcklqdq xmm4, xmm5
+        punpckhqdq xmm1, xmm5
+        movdqa    XMMWORD PTR [esp+64], xmm1
+        movdqa    xmm1, xmm7
+        movdqa    xmm5, xmm0
+        punpcklqdq xmm1, xmm6
+        punpckhqdq xmm7, xmm6
+
+;;; 			p7 = _mm_unpackhi_epi64(b3, b7);
+;;; 
+;;; 
+;;; 		/*  Vertical  */
+;;; 
+;;; 			a0 = _mm_add_epi16(p0, p4); // p0 + p4
+;;; 			a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+;;; 			r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+
+        movdqa    xmm6, xmm4
+        psraw     xmm6, 1
+        punpcklqdq xmm5, xmm2
+        punpckhqdq xmm0, xmm2
+        movdqa    xmm2, xmm3
+        paddw     xmm2, xmm1
+        psubw     xmm3, xmm1
+
+;;; 			a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+
+        movdqa    xmm1, xmm5
+
+;;; 			r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+
+        psraw     xmm5, 1
+        psubw     xmm1, xmm6
+
+;;; 			a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+        paddw     xmm4, xmm5
+
+;;; 
+;;; 			b0 =  _mm_add_epi16(a0, a3); // a0 + a3;
+
+        movdqa    xmm5, xmm2
+
+;;; 			b2 =  _mm_sub_epi16(a1, a2);  // a1 - a2;
+
+        movdqa    xmm6, xmm3
+        paddw     xmm5, xmm4
+        psubw     xmm6, xmm1
+        movdqa    XMMWORD PTR [esp+128], xmm6
+
+;;; 			b4 =  _mm_add_epi16(a1, a2);    // a1 + a2;
+;;; 			b6 =  _mm_sub_epi16(a0, a3);  // a0 - a3;
+;;; 
+;;; 			//-p3 + p5 - p7 - (p7 >> 1);
+;;; 			r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+;;; 			a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+
+        movdqa    xmm6, XMMWORD PTR [esp+64]
+        paddw     xmm3, xmm1
+        movdqa    XMMWORD PTR [esp+80], xmm3
+        psubw     xmm2, xmm4
+        movdqa    xmm1, xmm0
+        psraw     xmm1, 1
+        movdqa    xmm3, xmm7
+        movdqa    XMMWORD PTR [esp+96], xmm2
+        psubw     xmm3, xmm6
+
+;;; 			a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+
+        psubw     xmm3, xmm0
+
+;;; 			a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+;;; 
+;;; 			//p1 + p7 - p3 - (p3 >> 1);
+;;; 			r0 =  _mm_srai_epi16(p3, 1); // (p3 >> 1)
+
+        movdqa    xmm2, xmm6
+        psraw     xmm2, 1
+        psubw     xmm3, xmm1
+
+;;; 			a1 = _mm_add_epi16(p1, p7); // p1 + p7
+
+        movdqa    xmm1, XMMWORD PTR [esp+144]
+        movdqa    xmm4, xmm1
+        paddw     xmm4, xmm0
+
+;;; 			a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+
+        psubw     xmm4, xmm6
+
+;;; 			a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+        psubw     xmm4, xmm2
+
+;;; 
+;;; 			// -p1 + p7 + p5 + (p5 >> 1);
+;;; 			r0 =  _mm_srai_epi16(p5, 1); // (p5 >> 1)
+
+        movdqa    xmm2, xmm7
+        psraw     xmm2, 1
+
+;;; 			a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+
+        psubw     xmm0, xmm1
+
+;;; 			a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+
+        paddw     xmm0, xmm7
+
+;;; 			a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+        paddw     xmm0, xmm2
+
+;;; 
+;;; 			// p3 + p5 + p1 + (p1 >> 1);
+;;; 			r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
+
+        movdqa    xmm2, xmm1
+        psraw     xmm2, 1
+
+;;; 			a3 = _mm_add_epi16(p3, p5); // p3+p5
+
+        paddw     xmm6, xmm7
+
+;;; 			a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+;;; 			a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
+;;; 
+;;; 			r0 = _mm_srai_epi16(a3, 2); // a3>>2
+;;; 			b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);
+;;; 			r0 = _mm_srai_epi16(a2, 2); // a2>>2
+;;; 			b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);
+;;; 			a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+;;; 			b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+;;; 			a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+;;; 			b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);
+;;; 
+;;; 			r0 = _mm_add_epi16(b0, b7); // b0 + b7;
+;;; 			r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+
+        movdqa    xmm7, XMMWORD PTR [esp+128]
+        paddw     xmm6, xmm1
+        paddw     xmm6, xmm2
+        movdqa    xmm1, xmm6
+        psraw     xmm1, 2
+        movdqa    xmm2, xmm0
+        paddw     xmm1, xmm3
+        psraw     xmm2, 2
+        paddw     xmm2, xmm4
+        psraw     xmm4, 2
+        psubw     xmm0, xmm4
+        psraw     xmm3, 2
+        psubw     xmm6, xmm3
+        movdqa    XMMWORD PTR [esp+64], xmm6
+        movdqa    xmm3, xmm5
+
+;;; 			r2 = _mm_add_epi16(b4, b3); // b4 + b3;
+;;; 			r3 = _mm_add_epi16(b6, b1); // b6 + b1;
+;;; 			r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+;;; 			r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+;;; 			r6 = _mm_add_epi16(b2, b5); // b2 + b5;
+;;; 			r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+        psubw     xmm5, XMMWORD PTR [esp+64]
+        paddw     xmm3, xmm6
+        movdqa    XMMWORD PTR [esp+144], xmm3
+        movdqa    xmm3, xmm7
+        psubw     xmm3, xmm0
+        movdqa    XMMWORD PTR [esp+48], xmm3
+        movdqa    xmm3, XMMWORD PTR [esp+80]
+        movdqa    xmm4, xmm3
+        paddw     xmm4, xmm2
+        psubw     xmm3, xmm2
+
+;;; 
+;;; 
+;;; 			// add in prediction values
+;;; 			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
+;;; 			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
+;;; 			// (x + 32) >> 6
+;;; 			r0 = _mm_adds_epi16(r0, const32);
+
+        movdqa    xmm2, XMMWORD PTR const32
+        movdqa    XMMWORD PTR [esp+16], xmm4
+        movdqa    xmm4, XMMWORD PTR [esp+96]
+        movdqa    xmm6, xmm4
+        paddw     xmm6, xmm1
+        psubw     xmm4, xmm1
+
+;;; 			r0 = _mm_srai_epi16(r0, 6);
+;;; 			r1 = _mm_adds_epi16(r1, const32);
+
+        movdqa    xmm1, XMMWORD PTR [esp+48]
+        paddw     xmm7, xmm0
+        movdqa    xmm0, XMMWORD PTR [esp+144]
+        movdqa    XMMWORD PTR [esp+128], xmm7
+
+;;; 			r1 = _mm_srai_epi16(r1, 6);
+;;; 			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+;;; 			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+        movq      xmm7, QWORD PTR [edx+16]
+        movdqa    XMMWORD PTR [esp+32], xmm5
+        paddsw    xmm0, xmm2
+        psraw     xmm0, 6
+        paddsw    xmm1, xmm2
+        pxor      xmm2, xmm2
+        punpcklbw xmm7, xmm2
+        movq      xmm5, QWORD PTR [edx] 
+        punpcklbw xmm5, xmm2
+        psraw     xmm1, 6
+
+;;; 			pred0 = _mm_adds_epi16(pred0, r0);
+;;; 			pred1 = _mm_adds_epi16(pred1, r1);
+
+        paddsw    xmm7, xmm1
+        paddsw    xmm5, xmm0
+
+;;; 
+;;; 			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+        packuswb  xmm5, xmm7
+
+;;; 
+;;; 			// store
+;;; 			_mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
+
+        movdqa    xmm0, XMMWORD PTR [esp+32]
+        movdqa    xmm2, XMMWORD PTR [esp+128]
+        movq      QWORD PTR [ecx], xmm5 
+
+;;; 			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; 			pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq    xmm5, 8
+
+;;; 			_mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
+
+        movq      QWORD PTR [ecx+16], xmm5
+
+;;; 
+;;; 			/* --- */
+;;; 
+;;; 			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
+
+        movq      xmm1, QWORD PTR [edx+32]
+
+;;; 			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
+;;; 			// (x + 32) >> 6
+;;; 			r2 = _mm_adds_epi16(r2, const32);
+
+        movdqa    xmm5, XMMWORD PTR [esp]
+        movdqa    XMMWORD PTR [esp+32], xmm0                    ; NOTE(review): xmm0 was loaded from [esp+32] above and not modified since, so this store looks redundant - confirm before removing
+
+;;; 			r2 = _mm_srai_epi16(r2, 6);
+;;; 			r3 = _mm_adds_epi16(r3, const32);
+
+        paddsw    xmm6, xmm5
+
+;;; 			r3 = _mm_srai_epi16(r3, 6);
+
+        psraw     xmm6, 6
+
+;;; 			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+        pxor      xmm7, xmm7
+        punpcklbw xmm1, xmm7
+        movdqa    xmm0, XMMWORD PTR [esp+16]
+        paddsw    xmm0, xmm5
+        psraw     xmm0, 6
+
+;;; 			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+;;; 			pred0 = _mm_adds_epi16(pred0, r2);
+
+        paddsw    xmm1, xmm0
+
+;;; 			pred1 = _mm_adds_epi16(pred1, r3);
+;;; 
+;;; 			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+;;; 
+;;; 			// store
+;;; 			_mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
+
+        movdqa    xmm0, XMMWORD PTR [esp+32]
+        movq      xmm5, QWORD PTR [edx+48]
+        punpcklbw xmm5, xmm7
+        paddsw    xmm5, xmm6
+        packuswb  xmm1, xmm5
+        movq      QWORD PTR [ecx+32], xmm1
+
+;;; 			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; 			pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq    xmm1, 8
+
+;;; 			_mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
+
+        movq      QWORD PTR [ecx+48], xmm1
+
+;;; 
+;;; 			/* --- */
+;;; 
+;;; 			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
+
+        movq      xmm7, QWORD PTR [edx+64]
+
+;;; 			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
+
+        movq      xmm6, QWORD PTR [edx+80]
+
+;;; 			// (x + 32) >> 6
+;;; 			r4 = _mm_adds_epi16(r4, const32);
+;;; 			r4 = _mm_srai_epi16(r4, 6);
+;;; 			r5 = _mm_adds_epi16(r5, const32);
+;;; 			r5 = _mm_srai_epi16(r5, 6);
+;;; 			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+        pxor      xmm5, xmm5
+        punpcklbw xmm7, xmm5
+
+;;; 			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+        punpcklbw xmm6, xmm5
+        movdqa    xmm1, XMMWORD PTR [esp]
+        paddsw    xmm4, xmm1
+        psraw     xmm4, 6
+        paddsw    xmm3, xmm1
+        psraw     xmm3, 6
+
+;;; 			pred0 = _mm_adds_epi16(pred0, r4);
+
+        paddsw    xmm7, xmm4
+
+;;; 			pred1 = _mm_adds_epi16(pred1, r5);
+
+        paddsw    xmm6, xmm3
+
+;;; 
+;;; 			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+        packuswb  xmm7, xmm6
+
+;;; 
+;;; 			// store
+;;; 			_mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
+
+        movq      QWORD PTR [ecx+64], xmm7
+
+;;; 			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; 			pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq    xmm7, 8
+
+;;; 			_mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
+
+        movq      QWORD PTR [ecx+80], xmm7
+
+
+;;; 
+;;; 			/* --- */
+;;; 
+;;; 			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
+
+        movq      xmm5, QWORD PTR [edx+96]
+
+;;; 			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
+
+        movq      xmm4, QWORD PTR [edx+112] 
+
+;;; 			// (x + 32) >> 6
+;;; 			r6 = _mm_adds_epi16(r6, const32);
+;;; 			r6 = _mm_srai_epi16(r6, 6);
+;;; 			r7 = _mm_adds_epi16(r7, const32);
+;;; 			r7 = _mm_srai_epi16(r7, 6);
+;;; 			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+
+        pxor      xmm3, xmm3
+        punpcklbw xmm5, xmm3
+
+;;; 			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+
+        punpcklbw xmm4, xmm3
+        movdqa    xmm1, XMMWORD PTR [esp]
+        paddsw    xmm2, xmm1
+        psraw     xmm2, 6
+        paddsw    xmm0, xmm1
+        psraw     xmm0, 6
+
+;;; 			pred0 = _mm_adds_epi16(pred0, r6);
+
+        paddsw    xmm5, xmm2
+
+;;; 			pred1 = _mm_adds_epi16(pred1, r7);
+
+        paddsw    xmm4, xmm0
+
+;;; 
+;;; 			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+        packuswb  xmm5, xmm4
+
+;;; 
+;;; 			// store
+;;; 			_mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
+
+        movq      QWORD PTR [ecx+96], xmm5
+
+;;; 			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+;;; 			pred0 = _mm_srli_si128(pred0, 8);
+
+        psrldq    xmm5, 8
+
+;;; 			_mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
+
+        movq      QWORD PTR [ecx+112], xmm5 
+        mov       esp, ebp                                      ; drop the aligned scratch area in one step
+        pop       ebp
+        ret                                                     ; plain ret: the four stack arguments are left for the caller to remove
+        ALIGN     2
+_itrans8x8_sse2 ENDP
+
+
+END
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod/src/quant.c b/Src/h264dec/ldecod/src/quant.c
new file mode 100644
index 00000000..2f01c34a
--- /dev/null
+++ b/Src/h264dec/ldecod/src/quant.c
@@ -0,0 +1,338 @@
+
+/*!
+***********************************************************************
+*  \file
+*      quant.c
+*
+*  \brief
+*      Quantization functions
+*
+*  \author
+*      Main contributors (see contributors.h for copyright, address and affiliation details)
+*
+***********************************************************************
+*/
+
+#include "contributors.h"
+
+#include "global.h"
+#include "memalloc.h"
+#include "block.h"
+#include "image.h"
+#include "mb_access.h"
+#include "transform.h"
+#include "quant.h"
+
+int quant_intra_default[16] = { // 4x4 default picked by assign_quant_params for the intra lists (0..2)
+	6,13,20,28,
+	13,20,28,32,
+	20,28,32,37,
+	28,32,37,42
+};
+
+int quant_inter_default[16] = { // 4x4 default picked by assign_quant_params for the inter lists (3..5)
+	10,14,20,24,
+	14,20,24,27,
+	20,24,27,30,
+	24,27,30,34
+};
+
+int quant8_intra_default[64] = { // 8x8 default picked by assign_quant_params for the intra lists (6, 8, 10)
+	6,10,13,16,18,23,25,27,
+	10,11,16,18,23,25,27,29,
+	13,16,18,23,25,27,29,31,
+	16,18,23,25,27,29,31,33,
+	18,23,25,27,29,31,33,36,
+	23,25,27,29,31,33,36,38,
+	25,27,29,31,33,36,38,40,
+	27,29,31,33,36,38,40,42
+};
+
+int quant8_inter_default[64] = { // 8x8 default picked by assign_quant_params for the inter lists (7, 9, 11)
+	9,13,15,17,19,21,22,24,
+	13,13,17,19,21,22,24,25,
+	15,17,19,21,22,24,25,27,
+	17,19,21,22,24,25,27,28,
+	19,21,22,24,25,27,28,30,
+	21,22,24,25,27,28,30,32,
+	22,24,25,27,28,30,32,33,
+	24,25,27,28,30,32,33,35
+};
+
+int quant_org[16] = { // flat 4x4 matrix: used for lists 0..5 when neither SPS nor PPS signals a scaling matrix
+	16,16,16,16,
+	16,16,16,16,
+	16,16,16,16,
+	16,16,16,16
+};
+
+int quant8_org[64] = { // flat 8x8 matrix: used for lists 6..11 when neither SPS nor PPS signals a scaling matrix
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16,
+	16,16,16,16,16,16,16,16
+};
+
+static void CalculateQuant8x8Param(Slice *currslice);
+
+/*!
+***********************************************************************
+* \brief
+*    Build the QP lookup tables: qp_per_matrix[qp] = qp / 6 and qp_rem_matrix[qp] = qp % 6
+*    for qp = 0 .. MAX_QP + max(bitdepth_luma_qp_scale, bitdepth_chroma_qp_scale)
+* \param p_Vid
+*    Video parameters that own the two tables
+***********************************************************************
+*/
+void init_qp_process(VideoParameters *p_Vid)
+{
+	const int num_entries = MAX_QP + 1 + imax(p_Vid->bitdepth_luma_qp_scale, p_Vid->bitdepth_chroma_qp_scale);
+	int qp;
+
+	// Each table is allocated on first use only. An SPS change to a different bit depth
+	// would need them recreated; currently only an unchanged bit depth is supported.
+	if (p_Vid->qp_per_matrix == NULL && (p_Vid->qp_per_matrix = (int*)malloc(num_entries*sizeof(int))) == NULL)
+		no_mem_exit("init_qp_process: p_Vid->qp_per_matrix");
+	if (p_Vid->qp_rem_matrix == NULL && (p_Vid->qp_rem_matrix = (int*)malloc(num_entries*sizeof(int))) == NULL)
+		no_mem_exit("init_qp_process: p_Vid->qp_rem_matrix");
+
+	for (qp = 0; qp < num_entries; qp++)
+	{
+		p_Vid->qp_per_matrix[qp] = qp / 6;
+		p_Vid->qp_rem_matrix[qp] = qp % 6;
+	}
+}
+
+/*!
+* \brief
+*    Release both QP lookup tables of p_Vid and reset their pointers to NULL
+* \param p_Vid
+*    Video parameters owning qp_per_matrix and qp_rem_matrix
+*/
+void free_qp_matrices(VideoParameters *p_Vid)
+{
+	// free(NULL) is a no-op, so the tables need no individual NULL guards
+	free (p_Vid->qp_per_matrix);
+	free (p_Vid->qp_rem_matrix);
+	p_Vid->qp_per_matrix = NULL;
+	p_Vid->qp_rem_matrix = NULL;
+}
+
+/*!
+************************************************************************
+* \brief
+*    Point each currSlice->qmatrix[] entry (0..5: 4x4 lists, 6..11: 8x8 lists)
+*    at the flat, default or SPS/PPS-carried scaling list that applies, then
+*    recompute the inverse level-scale tables
+*    (the 8x8 ones only when the PPS sets transform_8x8_mode_flag)
+*
+* \param currSlice
+*    Slice to configure; the scaling list flags and lists are read from its
+*    active_sps and active_pps (they are not separate parameters)
+*
+************************************************************************
+*/
+void assign_quant_params(Slice *currSlice)
+{
+	seq_parameter_set_rbsp_t* sps = currSlice->active_sps;
+	pic_parameter_set_rbsp_t* pps = currSlice->active_pps;
+	int i;
+	int n_ScalingList;
+
+	if(!pps->pic_scaling_matrix_present_flag && !sps->seq_scaling_matrix_present_flag) // nothing signalled: all 12 lists are flat
+	{
+		for(i=0; i<12; i++)
+			currSlice->qmatrix[i] = (i < 6) ? quant_org : quant8_org;
+	}
+	else
+	{
+		n_ScalingList = (sps->chroma_format_idc != YUV444) ? 8 : 12; // only 4:4:4 processes lists 8..11
+		if(sps->seq_scaling_matrix_present_flag) // check sps first
+		{
+			for(i=0; i<n_ScalingList; i++)
+			{
+				if(i<6) // 4x4 lists
+				{
+					if(!sps->seq_scaling_list_present_flag[i]) // fall-back rule A
+					{
+						if(i==0)
+							currSlice->qmatrix[i] = quant_intra_default;
+						else if(i==3)
+							currSlice->qmatrix[i] = quant_inter_default;
+						else
+							currSlice->qmatrix[i] = currSlice->qmatrix[i-1]; // inherit the list chosen for the previous index
+					}
+					else
+					{
+						if(sps->UseDefaultScalingMatrix4x4Flag[i])
+							currSlice->qmatrix[i] = (i<3) ? quant_intra_default : quant_inter_default;
+						else
+							currSlice->qmatrix[i] = sps->ScalingList4x4[i];
+					}
+				}
+				else // 8x8 lists
+				{
+					if(!sps->seq_scaling_list_present_flag[i]) // fall-back rule A
+					{
+						if(i==6)
+							currSlice->qmatrix[i] = quant8_intra_default;
+						else if(i==7)
+							currSlice->qmatrix[i] = quant8_inter_default;
+						else
+							currSlice->qmatrix[i] = currSlice->qmatrix[i-2]; // inherit from the list two indices back
+					}
+					else
+					{
+						if(sps->UseDefaultScalingMatrix8x8Flag[i-6])
+							currSlice->qmatrix[i] = (i==6 || i==8 || i==10) ? quant8_intra_default:quant8_inter_default;
+						else
+							currSlice->qmatrix[i] = sps->ScalingList8x8[i-6];
+					}
+				}
+			}
+		}
+
+		if(pps->pic_scaling_matrix_present_flag) // then check pps
+		{
+			for(i=0; i<n_ScalingList; i++)
+			{
+				if(i<6) // 4x4 lists
+				{
+					if(!pps->pic_scaling_list_present_flag[i]) // fall-back rule B
+					{
+						if (i==0)
+						{
+							if(!sps->seq_scaling_matrix_present_flag) // otherwise the list set by the SPS pass stays
+								currSlice->qmatrix[i] = quant_intra_default;
+						}
+						else if (i==3)
+						{
+							if(!sps->seq_scaling_matrix_present_flag) // otherwise the list set by the SPS pass stays
+								currSlice->qmatrix[i] = quant_inter_default;
+						}
+						else
+							currSlice->qmatrix[i] = currSlice->qmatrix[i-1]; // inherit the list chosen for the previous index
+					}
+					else
+					{
+						if(pps->UseDefaultScalingMatrix4x4Flag[i])
+							currSlice->qmatrix[i] = (i<3) ? quant_intra_default:quant_inter_default;
+						else
+							currSlice->qmatrix[i] = pps->ScalingList4x4[i];
+					}
+				}
+				else // 8x8 lists
+				{
+					if(!pps->pic_scaling_list_present_flag[i]) // fall-back rule B
+					{
+						if (i==6)
+						{
+							if(!sps->seq_scaling_matrix_present_flag) // otherwise the list set by the SPS pass stays
+								currSlice->qmatrix[i] = quant8_intra_default;
+						}
+						else if(i==7)
+						{
+							if(!sps->seq_scaling_matrix_present_flag) // otherwise the list set by the SPS pass stays
+								currSlice->qmatrix[i] = quant8_inter_default;
+						}
+						else  
+							currSlice->qmatrix[i] = currSlice->qmatrix[i-2]; // inherit from the list two indices back
+					}
+					else
+					{
+						if(pps->UseDefaultScalingMatrix8x8Flag[i-6])
+							currSlice->qmatrix[i] = (i==6 || i==8 || i==10) ? quant8_intra_default:quant8_inter_default;
+						else
+							currSlice->qmatrix[i] = pps->ScalingList8x8[i-6];
+					}
+				}
+			}
+		}
+	}
+
+	CalculateQuant4x4Param(currSlice);
+	if(pps->transform_8x8_mode_flag) // 8x8 tables are only rebuilt when the PPS sets this flag
+		CalculateQuant8x8Param(currSlice);
+}
+
+/*!
+************************************************************************
+* \brief
+*    Fill the slice's 4x4 inverse level-scale tables. For each scale index
+*    s in 0..5 and each position (row,col) the entry is
+*    dequant_coef[s][row][col] * qmatrix[list][4*row+col], with lists 0..2
+*    feeding InvLevelScale4x4_Intra[0..2] and lists 3..5 feeding InvLevelScale4x4_Inter[0..2]
+*
+* \param currSlice
+*    Slice whose qmatrix[0..5] already point at scaling lists of 16 entries
+************************************************************************
+*/
+void CalculateQuant4x4Param(Slice *currSlice)
+{
+	int scale, row, col, comp;
+
+	for (scale = 0; scale < 6; scale++)
+		for (row = 0; row < 4; row++)
+			for (col = 0; col < 4; col++)
+			{
+				const int pos = 4 * row + col;   // raster index into the 16-entry list
+
+				for (comp = 0; comp < 3; comp++)
+				{
+					currSlice->InvLevelScale4x4_Intra[comp][scale][row][col] = dequant_coef[scale][row][col] * currSlice->qmatrix[comp][pos];
+					currSlice->InvLevelScale4x4_Inter[comp][scale][row][col] = dequant_coef[scale][row][col] * currSlice->qmatrix[comp + 3][pos];
+				}
+			}
+}
+
+/*!
+************************************************************************
+* \brief
+*    Fill the slice's 8x8 inverse level-scale tables: entry [s][pos] is
+*    dequant_coef8[s][pos] * qmatrix[list][pos] for s in 0..5, pos in 0..63.
+*    Lists 6/7 feed InvLevelScale8x8_Intra[0]/_Inter[0]; when the active SPS
+*    of p_Vid is YUV444, lists 8/9 and 10/11 feed components 1 and 2 as well
+*
+*    NOTE(review): the YUV444 test reads p_Vid->active_sps, while
+*    assign_quant_params() sizes the list set from currSlice->active_sps;
+*    presumably both refer to the same SPS - confirm
+*
+* \param currSlice
+*    Slice whose qmatrix[6..] already point at scaling lists of 64 entries
+************************************************************************
+*/
+static void CalculateQuant8x8Param(Slice *currSlice)
+{
+	VideoParameters *p_Vid = currSlice->p_Vid;
+	int scale, pos, comp;
+
+	// Component 0 is always computed.
+	for (scale = 0; scale < 6; scale++)
+	{
+		for (pos = 0; pos < 64; pos++)
+		{
+			currSlice->InvLevelScale8x8_Intra[0][scale][pos] = dequant_coef8[scale][pos] * currSlice->qmatrix[6][pos];
+			currSlice->InvLevelScale8x8_Inter[0][scale][pos] = dequant_coef8[scale][pos] * currSlice->qmatrix[7][pos];
+		}
+	}
+
+	// Components 1 and 2 are only filled for YUV444 (lists 8/9 and 10/11).
+	if( p_Vid->active_sps->chroma_format_idc != YUV444 )
+		return;
+
+	for (scale = 0; scale < 6; scale++)
+	{
+		for (pos = 0; pos < 64; pos++)
+		{
+			for (comp = 1; comp < 3; comp++)
+			{
+				currSlice->InvLevelScale8x8_Intra[comp][scale][pos] = dequant_coef8[scale][pos] * currSlice->qmatrix[6 + 2*comp][pos];
+				currSlice->InvLevelScale8x8_Inter[comp][scale][pos] = dequant_coef8[scale][pos] * currSlice->qmatrix[7 + 2*comp][pos];
+			}
+		}
+	}
+}
diff --git a/Src/h264dec/ldecod/src/sei.c b/Src/h264dec/ldecod/src/sei.c
new file mode 100644
index 00000000..faa4f8ec
--- /dev/null
+++ b/Src/h264dec/ldecod/src/sei.c
@@ -0,0 +1,2132 @@
+/*!
+ ************************************************************************
+ * \file  sei.c
+ *
+ * \brief
+ *    Functions to implement SEI messages
+ *
+ * \author
+ *    Main contributors (see contributors.h for copyright, address and affiliation details)
+ *    - Dong Tian        <tian@cs.tut.fi>
+ *    - Karsten Suehring <suehring@hhi.de>
+ ************************************************************************
+ */
+
+#include "contributors.h"
+
+#include <math.h>
+#include "global.h"
+#include "memalloc.h"
+#include "sei.h"
+#include "vlc.h"
+#include "header.h"
+#include "mbuffer.h"
+#include "parset.h"
+
+
+// #define PRINT_BUFFERING_PERIOD_INFO    // uncomment to print buffering period SEI info
+// #define PRINT_PCITURE_TIMING_INFO      // uncomment to print picture timing SEI info
+// #define WRITE_MAP_IMAGE                // uncomment to write spare picture map
+// #define PRINT_SUBSEQUENCE_INFO         // uncomment to print sub-sequence SEI info
+// #define PRINT_SUBSEQUENCE_LAYER_CHAR   // uncomment to print sub-sequence layer characteristics SEI info
+// #define PRINT_SUBSEQUENCE_CHAR         // uncomment to print sub-sequence characteristics SEI info
+// #define PRINT_SCENE_INFORMATION        // uncomment to print scene information SEI info
+// #define PRINT_PAN_SCAN_RECT            // uncomment to print pan-scan rectangle SEI info
+// #define PRINT_RECOVERY_POINT            // uncomment to print random access point SEI info
+// #define PRINT_FILLER_PAYLOAD_INFO      // uncomment to print filler payload SEI info
+// #define PRINT_DEC_REF_PIC_MARKING      // uncomment to print decoded picture buffer management repetition SEI info
+// #define PRINT_RESERVED_INFO            // uncomment to print reserved SEI info
+// #define PRINT_USER_DATA_UNREGISTERED_INFO          // uncomment to print unregistered user data SEI info
+// #define PRINT_USER_DATA_REGISTERED_ITU_T_T35_INFO  // uncomment to print ITU-T T.35 user data SEI info
+// #define PRINT_FULL_FRAME_FREEZE_INFO               // uncomment to print full-frame freeze SEI info
+// #define PRINT_FULL_FRAME_FREEZE_RELEASE_INFO       // uncomment to print full-frame freeze release SEI info
+// #define PRINT_FULL_FRAME_SNAPSHOT_INFO             // uncomment to print full-frame snapshot SEI info
+// #define PRINT_PROGRESSIVE_REFINEMENT_START_INFO    // uncomment to print Progressive refinement segment start SEI info
+// #define PRINT_PROGRESSIVE_REFINEMENT_END_INFO      // uncomment to print Progressive refinement segment end SEI info
+// #define PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO    // uncomment to print Motion-constrained slice group set SEI info
+// #define PRINT_FILM_GRAIN_CHARACTERISTICS_INFO      // uncomment to print Film grain characteristics SEI info
+// #define PRINT_DEBLOCKING_FILTER_DISPLAY_PREFERENCE_INFO // uncomment to print deblocking filter display preference SEI info
+// #define PRINT_STEREO_VIDEO_INFO_INFO               // uncomment to print stereo video SEI info
+// #define PRINT_TONE_MAPPING                         // uncomment to print tone-mapping SEI info
+// #define PRINT_POST_FILTER_HINT_INFO                // uncomment to print post-filter hint SEI info
+/*! \brief Read one ff_byte-coded SEI value (each 0xFF adds 255, the first other byte adds itself
+ *         and ends the run), advancing *offset; returns -1 if the message ends before the run does */
+static int read_sei_ff_coded_value(const byte* msg, int size, int* offset)
+{
+  int value = 0;
+  while (*offset < size && msg[*offset] == 0xFF)
+  {
+    value += 255;
+    (*offset)++;
+  }
+  if (*offset >= size)
+    return -1;
+  return value + msg[(*offset)++];
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the SEI rbsp: hand the payload of every sei_message() to its
+ *     interpreter; stops quietly if a header or payload overruns the message
+ *  \param msg
+ *     a pointer that point to the sei message (parsing starts at msg[1])
+ *  \param size
+ *     the size of the sei message
+ *  \param p_Vid
+ *     the image pointer
+ ************************************************************************
+ */
+void InterpretSEIMessage(byte* msg, int size, VideoParameters *p_Vid)
+{
+  int payload_type;
+  int payload_size;
+  int offset = 1;
+
+  do
+  {
+    // sei_message(): type and size headers, then the payload itself
+    payload_type = read_sei_ff_coded_value(msg, size, &offset);
+    payload_size = (payload_type < 0) ? -1 : read_sei_ff_coded_value(msg, size, &offset);
+    if (payload_size < 0 || payload_size > size - offset)
+      return;   // truncated message: never read past msg[size-1]
+    switch ( payload_type )     // sei_payload( type, size );
+    {
+    case  SEI_BUFFERING_PERIOD:
+      interpret_buffering_period_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_PIC_TIMING:
+      interpret_picture_timing_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_PAN_SCAN_RECT:
+      interpret_pan_scan_rect_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_FILLER_PAYLOAD:
+      interpret_filler_payload_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_USER_DATA_REGISTERED_ITU_T_T35:
+      interpret_user_data_registered_itu_t_t35_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_USER_DATA_UNREGISTERED:
+      interpret_user_data_unregistered_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_RECOVERY_POINT:
+      interpret_recovery_point_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_DEC_REF_PIC_MARKING_REPETITION:
+      interpret_dec_ref_pic_marking_repetition_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_SPARE_PIC:
+      interpret_spare_pic( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_SCENE_INFO:
+      interpret_scene_information( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_SUB_SEQ_INFO:
+      interpret_subsequence_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_SUB_SEQ_LAYER_CHARACTERISTICS:
+      interpret_subsequence_layer_characteristics_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_SUB_SEQ_CHARACTERISTICS:
+      interpret_subsequence_characteristics_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_FULL_FRAME_FREEZE:
+      interpret_full_frame_freeze_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_FULL_FRAME_FREEZE_RELEASE:
+      interpret_full_frame_freeze_release_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_FULL_FRAME_SNAPSHOT:
+      interpret_full_frame_snapshot_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_PROGRESSIVE_REFINEMENT_SEGMENT_START:
+      interpret_progressive_refinement_start_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_PROGRESSIVE_REFINEMENT_SEGMENT_END:
+      interpret_progressive_refinement_end_info( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_MOTION_CONSTRAINED_SLICE_GROUP_SET:
+      interpret_motion_constrained_slice_group_set_info( msg+offset, payload_size, p_Vid );
+      break;    // must not fall through: the payload is not film grain data
+    case  SEI_FILM_GRAIN_CHARACTERISTICS:
+      interpret_film_grain_characteristics_info ( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_DEBLOCKING_FILTER_DISPLAY_PREFERENCE:
+      interpret_deblocking_filter_display_preference_info ( msg+offset, payload_size, p_Vid );
+      break;
+    case  SEI_STEREO_VIDEO_INFO:
+      interpret_stereo_video_info_info ( msg+offset, payload_size, p_Vid );
+      break;
+    case SEI_TONE_MAPPING:
+      interpret_tone_mapping( msg+offset, payload_size, p_Vid );
+      break;
+    case SEI_POST_FILTER_HINTS:
+      interpret_post_filter_hints_info ( msg+offset, payload_size, p_Vid );
+      break;    // must not fall through into the reserved handler
+    default:
+      interpret_reserved_info( msg+offset, payload_size, p_Vid );
+      break;
+    }
+    offset += payload_size;
+  } while( offset < size && msg[offset] != 0x80 );    // more_rbsp_data()  msg[offset] != 0x80
+  assert(offset < size && msg[offset] == 0x80);      // rbsp_trailing_bits(), otherwise ignored
+  assert( offset+1 == size );
+}
+
+
+/*!
+************************************************************************
+*  \brief
+*     Interpret the spare picture SEI message: decode one per-macroblock map
+*     per spare picture; the maps are only used when WRITE_MAP_IMAGE is defined
+*  \param payload
+*     a pointer that point to the sei payload
+*  \param size
+*     the size of the sei message
+*  \param p_Vid
+*     the image pointer; its width and height give the map dimensions
+************************************************************************
+*/
+void interpret_spare_pic( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int i,x,y;
+  Bitstream* buf;
+  int bit0, bit1, bitc, no_bit0;
+  int target_frame_num = 0;
+  int num_spare_pics;
+  int delta_spare_frame_num, CandidateSpareFrameNum, SpareFrameNum = 0;
+  int ref_area_indicator;
+
+  int m, n, left, right, top, bottom,directx, directy;
+  byte ***map;
+
+#ifdef WRITE_MAP_IMAGE
+  int symbol_size_in_bytes = p_Vid->pic_unit_bitsize_on_disk/8;
+  int  j, k, i0, j0, tmp, kk;
+  char filename[20] = "map_dec.yuv";
+  FILE *fp;
+  imgpel** Y;
+  static int old_pn=-1;
+  static int first = 1;
+
+  printf("Spare picture SEI message\n");
+#endif
+
+  assert( payload!=NULL);
+  assert( p_Vid!=NULL);
+
+  buf = malloc(sizeof(Bitstream));
+  if (buf == NULL)
+    return;   // out of memory: nothing decoded here is kept, so the message is simply skipped
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  target_frame_num = ue_v("SEI: target_frame_num", buf);
+
+#ifdef WRITE_MAP_IMAGE
+  printf( "target_frame_num is %d\n", target_frame_num );
+#endif
+
+  num_spare_pics = 1 + ue_v("SEI: num_spare_pics_minus1", buf);
+
+#ifdef WRITE_MAP_IMAGE
+  printf( "num_spare_pics is %d\n", num_spare_pics );
+#endif
+
+  get_mem3D(&map, num_spare_pics, p_Vid->height >> 4, p_Vid->width >> 4);
+
+  for (i=0; i<num_spare_pics; i++)
+  {
+    if (i==0)
+    {
+      CandidateSpareFrameNum = target_frame_num - 1;
+      if ( CandidateSpareFrameNum < 0 ) CandidateSpareFrameNum = MAX_FN - 1;
+    }
+    else
+      CandidateSpareFrameNum = SpareFrameNum;
+
+    delta_spare_frame_num = ue_v("SEI: delta_spare_frame_num", buf);
+
+    SpareFrameNum = CandidateSpareFrameNum - delta_spare_frame_num;
+    if( SpareFrameNum < 0 )
+      SpareFrameNum = MAX_FN + SpareFrameNum;
+
+    ref_area_indicator = ue_v("SEI: ref_area_indicator", buf);
+
+    switch ( ref_area_indicator )
+    {
+    case 0:   // The whole frame can serve as spare picture
+      for (y=0; y<p_Vid->height >> 4; y++)
+        for (x=0; x<p_Vid->width >> 4; x++)
+          map[i][y][x] = 0;
+      break;
+    case 1:   // The map is not compressed
+      for (y=0; y<p_Vid->height >> 4; y++)
+        for (x=0; x<p_Vid->width >> 4; x++)
+        {
+          map[i][y][x] = (byte) u_1("SEI: ref_mb_indicator", buf);
+        }
+      break;
+    case 2:   // The map is compressed
+              //!KS: could not check this function, description is unclear (as stated in Ed. Note)
+      bit0 = 0;
+      bit1 = 1;
+      bitc = bit0;
+      no_bit0 = -1;
+
+      // start at the centre macroblock and walk outwards, one map entry per step
+      x = ( (p_Vid->width >> 4) - 1 ) / 2;
+      y = ( (p_Vid->height >> 4) - 1 ) / 2;
+      left = right = x;
+      top = bottom = y;
+      directx = 0;
+      directy = 1;
+
+      for (m=0; m<p_Vid->height >> 4; m++)
+        for (n=0; n<p_Vid->width >> 4; n++)
+        {
+          // a new zero_run_length is read whenever the previous run is used up
+          if (no_bit0<0)
+          {
+            no_bit0 = ue_v("SEI: zero_run_length", buf);
+          }
+          if (no_bit0>0)
+            map[i][y][x] = (byte) bit0;
+          else
+            map[i][y][x] = (byte) bit1;
+          no_bit0--;
+
+          // go to the next mb:
+          if ( directx == -1 && directy == 0 )
+          {
+            if (x > left) x--;
+            else if (x == 0)
+            {
+              y = bottom + 1;
+              bottom++;
+              directx = 1;
+              directy = 0;
+            }
+            else if (x == left)
+            {
+              x--;
+              left--;
+              directx = 0;
+              directy = 1;
+            }
+          }
+          else if ( directx == 1 && directy == 0 )
+          {
+            if (x < right) x++;
+            else if (x == (p_Vid->width >> 4) - 1)
+            {
+              y = top - 1;
+              top--;
+              directx = -1;
+              directy = 0;
+            }
+            else if (x == right)
+            {
+              x++;
+              right++;
+              directx = 0;
+              directy = -1;
+            }
+          }
+          else if ( directx == 0 && directy == -1 )
+          {
+            if ( y > top) y--;
+            else if (y == 0)
+            {
+              x = left - 1;
+              left--;
+              directx = 0;
+              directy = 1;
+            }
+            else if (y == top)
+            {
+              y--;
+              top--;
+              directx = -1;
+              directy = 0;
+            }
+          }
+          else if ( directx == 0 && directy == 1 )
+          {
+            if (y < bottom) y++;
+            else if (y == (p_Vid->height >> 4) - 1)
+            {
+              x = right+1;
+              right++;
+              directx = 0;
+              directy = -1;
+            }
+            else if (y == bottom)
+            {
+              y++;
+              bottom++;
+              directx = 1;
+              directy = 0;
+            }
+          }
+
+        }
+      break;
+    default:  // malformed payload: drop this message (releasing map and buf) rather than exit() the host process
+      printf( "Wrong ref_area_indicator %d!\n", ref_area_indicator );
+      free_mem3D( map );
+      free(buf);
+      return;
+    }
+
+  } // end of num_spare_pics
+
+#ifdef WRITE_MAP_IMAGE
+  // begin to write map seq
+  if ( old_pn != p_Vid->number )
+  {
+    old_pn = p_Vid->number;
+    get_mem2Dpel(&Y, p_Vid->height, p_Vid->width);
+    if (first)
+    {
+      fp = fopen( filename, "wb" );
+      first = 0;
+    }
+    else
+      fp = fopen( filename, "ab" );
+    assert( fp != NULL );
+    for (kk=0; kk<num_spare_pics; kk++)
+    {
+      for (i=0; i < p_Vid->height >> 4; i++)
+        for (j=0; j < p_Vid->width >> 4; j++)
+        {
+          tmp=map[kk][i][j]==0? p_Vid->max_pel_value_comp[0] : 0;
+          for (i0=0; i0<16; i0++)
+            for (j0=0; j0<16; j0++)
+              Y[i*16+i0][j*16+j0]=tmp;
+        }
+
+      // write the map image (to the file opened above as fp)
+      for (i=0; i < p_Vid->height; i++)
+        for (j=0; j < p_Vid->width; j++)
+          fwrite(&(Y[i][j]), symbol_size_in_bytes, 1, fp);
+
+      for (k=0; k < 2; k++)
+        for (i=0; i < p_Vid->height>>1; i++)
+          for (j=0; j < p_Vid->width>>1; j++)
+            fwrite(&(p_Vid->dc_pred_value_comp[1]), symbol_size_in_bytes, 1, fp);
+    }
+    fclose( fp );
+    free_mem2Dpel( Y );
+  }
+  // end of writing map image
+#undef WRITE_MAP_IMAGE
+#endif
+
+  free_mem3D( map );
+
+  free(buf);
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Sub-sequence information SEI message; the parsed fields
+ *     are only reported when PRINT_SUBSEQUENCE_INFO is defined
+ *  \param payload
+ *     a pointer that point to the sei payload
+ *  \param size
+ *     the size of the sei message
+ *  \param p_Vid
+ *     the image pointer
+ ************************************************************************
+ */
+void interpret_subsequence_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  Bitstream* buf;
+  int sub_seq_layer_num, sub_seq_id, first_ref_pic_flag, leading_non_ref_pic_flag, last_pic_flag,
+      sub_seq_frame_num_flag, sub_seq_frame_num;
+
+  buf = malloc(sizeof(Bitstream));
+  if (buf == NULL)
+    return;   // out of memory: nothing parsed here is kept, so the message is simply skipped
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  sub_seq_layer_num        = ue_v("SEI: sub_seq_layer_num"       , buf);
+  sub_seq_id               = ue_v("SEI: sub_seq_id"              , buf);
+  first_ref_pic_flag       = u_1 ("SEI: first_ref_pic_flag"      , buf);
+  leading_non_ref_pic_flag = u_1 ("SEI: leading_non_ref_pic_flag", buf);
+  last_pic_flag            = u_1 ("SEI: last_pic_flag"           , buf);
+  sub_seq_frame_num_flag   = u_1 ("SEI: sub_seq_frame_num_flag"  , buf);
+  if (sub_seq_frame_num_flag)
+  {
+    sub_seq_frame_num        = ue_v("SEI: sub_seq_frame_num"       , buf);
+  }
+
+#ifdef PRINT_SUBSEQUENCE_INFO
+  printf("Sub-sequence information SEI message\n");
+  printf("sub_seq_layer_num        = %d\n", sub_seq_layer_num );
+  printf("sub_seq_id               = %d\n", sub_seq_id);
+  printf("first_ref_pic_flag       = %d\n", first_ref_pic_flag);
+  printf("leading_non_ref_pic_flag = %d\n", leading_non_ref_pic_flag);
+  printf("last_pic_flag            = %d\n", last_pic_flag);
+  printf("sub_seq_frame_num_flag   = %d\n", sub_seq_frame_num_flag);
+  if (sub_seq_frame_num_flag)
+  {
+    printf("sub_seq_frame_num        = %d\n", sub_seq_frame_num);
+  }
+#endif
+
+  free(buf);
+#undef PRINT_SUBSEQUENCE_INFO
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Sub-sequence layer characteristics SEI message
+ *  \param payload
+ *     a pointer that point to the sei payload
+ *  \param size
+ *     the size of the sei message
+ *  \param p_Vid
+ *     the image pointer
+ ************************************************************************
+ */
+void interpret_subsequence_layer_characteristics_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  Bitstream* buf;
+  long num_sub_layers, accurate_statistics_flag, average_bit_rate, average_frame_rate;
+  int i;
+
+  buf = malloc(sizeof(Bitstream));
+  if (buf == NULL)
+    return;   // out of memory: nothing parsed here is kept, so the message is simply skipped
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  num_sub_layers = 1 + ue_v("SEI: num_sub_layers_minus1", buf);
+#ifdef PRINT_SUBSEQUENCE_LAYER_CHAR
+  printf("Sub-sequence layer characteristics SEI message\n");
+  printf("num_sub_layers_minus1 = %ld\n", num_sub_layers - 1);   // long argument needs %ld
+#endif
+
+  for (i=0; i<num_sub_layers; i++)
+  {
+    accurate_statistics_flag = u_1(   "SEI: accurate_statistics_flag", buf);
+    average_bit_rate         = u_v(16,"SEI: average_bit_rate"        , buf);
+    average_frame_rate       = u_v(16,"SEI: average_frame_rate"      , buf);
+
+#ifdef PRINT_SUBSEQUENCE_LAYER_CHAR
+    printf("layer %d: accurate_statistics_flag = %ld \n", i, accurate_statistics_flag);
+    printf("layer %d: average_bit_rate         = %ld \n", i, average_bit_rate);
+    printf("layer %d: average_frame_rate       = %ld \n", i, average_frame_rate);
+#endif
+  }
+  free (buf);
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Sub-sequence characteristics SEI message; the parsed
+ *     fields are only reported when PRINT_SUBSEQUENCE_CHAR is defined
+ *  \param payload
+ *     a pointer that point to the sei payload
+ *  \param size
+ *     the size of the sei message
+ *  \param p_Vid
+ *     the image pointer
+ ************************************************************************
+ */
+void interpret_subsequence_characteristics_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  Bitstream* buf;
+  int i;
+  int sub_seq_layer_num, sub_seq_id, duration_flag, average_rate_flag, accurate_statistics_flag;
+  unsigned long sub_seq_duration, average_bit_rate, average_frame_rate;
+  int num_referenced_subseqs, ref_sub_seq_layer_num, ref_sub_seq_id, ref_sub_seq_direction;
+
+  buf = malloc(sizeof(Bitstream));
+  if (buf == NULL)
+    return;   // out of memory: nothing parsed here is kept, so the message is simply skipped
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  sub_seq_layer_num = ue_v("SEI: sub_seq_layer_num", buf);
+  sub_seq_id        = ue_v("SEI: sub_seq_id", buf);
+  duration_flag     = u_1 ("SEI: duration_flag", buf);
+
+#ifdef PRINT_SUBSEQUENCE_CHAR
+  printf("Sub-sequence characteristics SEI message\n");
+  printf("sub_seq_layer_num = %d\n", sub_seq_layer_num );
+  printf("sub_seq_id        = %d\n", sub_seq_id);
+  printf("duration_flag     = %d\n", duration_flag);
+#endif
+
+  if ( duration_flag )
+  {
+    sub_seq_duration = u_v (32, "SEI: sub_seq_duration", buf);   // label names the field actually read
+#ifdef PRINT_SUBSEQUENCE_CHAR
+    printf("sub_seq_duration = %lu\n", sub_seq_duration);
+#endif
+  }
+
+  average_rate_flag = u_1 ("SEI: average_rate_flag", buf);
+
+#ifdef PRINT_SUBSEQUENCE_CHAR
+  printf("average_rate_flag = %d\n", average_rate_flag);
+#endif
+
+  if ( average_rate_flag )
+  {
+    accurate_statistics_flag = u_1 (    "SEI: accurate_statistics_flag", buf);
+    average_bit_rate         = u_v (16, "SEI: average_bit_rate", buf);
+    average_frame_rate       = u_v (16, "SEI: average_frame_rate", buf);
+
+#ifdef PRINT_SUBSEQUENCE_CHAR
+    printf("accurate_statistics_flag = %d\n", accurate_statistics_flag);
+    printf("average_bit_rate         = %lu\n", average_bit_rate);
+    printf("average_frame_rate       = %lu\n", average_frame_rate);
+#endif
+  }
+
+  num_referenced_subseqs  = ue_v("SEI: num_referenced_subseqs", buf);
+
+#ifdef PRINT_SUBSEQUENCE_CHAR
+  printf("num_referenced_subseqs = %d\n", num_referenced_subseqs);
+#endif
+
+  for (i=0; i<num_referenced_subseqs; i++)
+  {
+    ref_sub_seq_layer_num  = ue_v("SEI: ref_sub_seq_layer_num", buf);
+    ref_sub_seq_id         = ue_v("SEI: ref_sub_seq_id", buf);
+    ref_sub_seq_direction  = u_1 ("SEI: ref_sub_seq_direction", buf);
+
+#ifdef PRINT_SUBSEQUENCE_CHAR
+    printf("ref_sub_seq_layer_num = %d\n", ref_sub_seq_layer_num);
+    printf("ref_sub_seq_id        = %d\n", ref_sub_seq_id);
+    printf("ref_sub_seq_direction = %d\n", ref_sub_seq_direction);
+#endif
+  }
+
+  free( buf );
+#undef PRINT_SUBSEQUENCE_CHAR
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Scene information SEI message; the parsed fields are
+ *     only reported when PRINT_SCENE_INFORMATION is defined
+ *  \param payload
+ *     a pointer that point to the sei payload
+ *  \param size
+ *     the size of the sei message
+ *  \param p_Vid
+ *     the image pointer
+ ************************************************************************
+ */
+void interpret_scene_information( byte* payload, int size, VideoParameters *p_Vid )
+{
+  Bitstream* buf;
+  int scene_id, scene_transition_type, second_scene_id;
+
+  buf = malloc(sizeof(Bitstream));
+  if (buf == NULL)
+    return;   // out of memory: nothing parsed here is kept, so the message is simply skipped
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  scene_id              = ue_v("SEI: scene_id"             , buf);
+  scene_transition_type = ue_v("SEI: scene_transition_type", buf);
+  if ( scene_transition_type > 3 )
+  {
+    second_scene_id     = ue_v("SEI: second_scene_id", buf);   // label names the field actually read
+  }
+
+#ifdef PRINT_SCENE_INFORMATION
+  printf("Scene information SEI message\n");
+  printf("scene_transition_type = %d\n", scene_transition_type);
+  printf("scene_id              = %d\n", scene_id);
+  if ( scene_transition_type > 3 )
+  {
+    printf("second_scene_id       = %d\n", second_scene_id);
+  }
+#endif
+  free( buf );
+#undef PRINT_SCENE_INFORMATION
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Filler payload SEI message: count its leading 0xFF
+ *     bytes. A well-formed payload consists of 0xFF bytes only, so the
+ *     count then equals size; the result is only reported when
+ *     PRINT_FILLER_PAYLOAD_INFO is defined
+ *  \param payload
+ *     a pointer that point to the sei payload
+ *  \param size
+ *     the size of the sei message
+ *  \param p_Vid
+ *     the image pointer
+ *
+ ************************************************************************
+ */
+void interpret_filler_payload_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int payload_cnt = 0;
+
+  // Count the leading 0xFF bytes. Stopping at the first other byte keeps a
+  // malformed payload from stalling the decoder: payload_cnt could otherwise
+  // never advance past that byte.
+  while (payload_cnt<size && payload[payload_cnt] == 0xFF)
+  {
+    payload_cnt++;
+  }
+
+#ifdef PRINT_FILLER_PAYLOAD_INFO
+  printf("Filler payload SEI message\n");
+  if (payload_cnt==size)
+  {
+    printf("read %d bytes of filler payload\n", payload_cnt);
+  }
+  else
+  {
+    printf("error reading filler payload: not all bytes are 0xFF (%d of %d)\n", payload_cnt, size);
+  }
+#endif
+
+#undef PRINT_FILLER_PAYLOAD_INFO
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the User data unregistered SEI message. Payload bytes are
+ *     read and, if PRINT_USER_DATA_UNREGISTERED_INFO is defined, printed.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     number of bytes in the payload; asserted to be at least 16
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_user_data_unregistered_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  byte cur;
+  int pos = 0;
+
+#ifdef PRINT_USER_DATA_UNREGISTERED_INFO
+  printf("User data unregistered SEI message\n");
+  printf("uuid_iso_11578 = 0x");
+#endif
+  assert (size>=16);
+  // first 16 bytes: dumped as hex under the uuid_iso_11578 label when printing is enabled
+  while (pos < 16)
+  {
+#ifdef PRINT_USER_DATA_UNREGISTERED_INFO
+    printf("%02x",payload[pos]);
+#endif
+    pos++;
+  }
+#ifdef PRINT_USER_DATA_UNREGISTERED_INFO
+  printf("\n");
+#endif
+
+  // remaining bytes: read one at a time and printed when printing is enabled
+  for ( ; pos < size; pos++)
+  {
+    cur = payload[pos];
+#ifdef PRINT_USER_DATA_UNREGISTERED_INFO
+    printf("Unreg data payload_byte = %d\n", cur);
+#endif
+  }
+#ifdef PRINT_USER_DATA_UNREGISTERED_INFO
+#undef PRINT_USER_DATA_UNREGISTERED_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the User data registered by ITU-T T.35 SEI message. Bytes
+ *     are read within size only and printed if the PRINT macro is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     number of bytes in the payload; nothing is read when it is below 1
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_user_data_registered_itu_t_t35_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int offset = 0;
+  byte itu_t_t35_country_code, itu_t_t35_country_code_extension_byte, payload_byte;
+
+  if (size < 1)
+    return; // empty payload: there is no country code byte to read
+  itu_t_t35_country_code = payload[offset++];
+#ifdef PRINT_USER_DATA_REGISTERED_ITU_T_T35_INFO
+  printf("User data registered by ITU-T T.35 SEI message\n");
+  printf(" itu_t_t35_country_code = %d \n", itu_t_t35_country_code);
+#endif
+  if(itu_t_t35_country_code == 0xFF && offset < size) // extension byte only if the payload holds one
+  {
+    itu_t_t35_country_code_extension_byte = payload[offset++];
+#ifdef PRINT_USER_DATA_REGISTERED_ITU_T_T35_INFO
+    printf(" ITU_T_T35_COUNTRY_CODE_EXTENSION_BYTE %d \n", itu_t_t35_country_code_extension_byte);
+#endif
+  }
+  while (offset < size)
+  {
+    payload_byte = payload[offset];
+    offset ++;
+#ifdef PRINT_USER_DATA_REGISTERED_ITU_T_T35_INFO
+    printf("itu_t_t35 payload_byte = %d\n", payload_byte);
+#endif
+  }
+#ifdef PRINT_USER_DATA_REGISTERED_ITU_T_T35_INFO
+#undef PRINT_USER_DATA_REGISTERED_ITU_T_T35_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Pan scan rectangle SEI message. The values are parsed
+ *     and, when PRINT_PAN_SCAN_RECT is defined, printed; none is stored.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_pan_scan_rect_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int pan_scan_rect_cancel_flag;
+  int pan_scan_cnt_minus1, i;
+  int pan_scan_rect_repetition_period;
+  int pan_scan_rect_id, pan_scan_rect_left_offset, pan_scan_rect_right_offset;
+  int pan_scan_rect_top_offset, pan_scan_rect_bottom_offset;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  pan_scan_rect_id = ue_v("SEI: pan_scan_rect_id", buf);
+  pan_scan_rect_cancel_flag = u_1("SEI: pan_scan_rect_cancel_flag", buf);
+  if (!pan_scan_rect_cancel_flag)
+  {
+    pan_scan_cnt_minus1 = ue_v("SEI: pan_scan_cnt_minus1", buf);
+    for (i = 0; i <= pan_scan_cnt_minus1; i++)
+    {
+      pan_scan_rect_left_offset   = se_v("SEI: pan_scan_rect_left_offset"  , buf);
+      pan_scan_rect_right_offset  = se_v("SEI: pan_scan_rect_right_offset" , buf);
+      pan_scan_rect_top_offset    = se_v("SEI: pan_scan_rect_top_offset"   , buf);
+      pan_scan_rect_bottom_offset = se_v("SEI: pan_scan_rect_bottom_offset", buf);
+#ifdef PRINT_PAN_SCAN_RECT
+      printf("Pan scan rectangle SEI message %d/%d\n", i, pan_scan_cnt_minus1);
+      printf("pan_scan_rect_id            = %d\n", pan_scan_rect_id);
+      printf("pan_scan_rect_left_offset   = %d\n", pan_scan_rect_left_offset);
+      printf("pan_scan_rect_right_offset  = %d\n", pan_scan_rect_right_offset);
+      printf("pan_scan_rect_top_offset    = %d\n", pan_scan_rect_top_offset);
+      printf("pan_scan_rect_bottom_offset = %d\n", pan_scan_rect_bottom_offset);
+#endif
+    }
+    pan_scan_rect_repetition_period = ue_v("SEI: pan_scan_rect_repetition_period", buf);
+  }
+
+  free (buf);
+#ifdef PRINT_PAN_SCAN_RECT
+#undef PRINT_PAN_SCAN_RECT
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Recovery point SEI message: sets p_Vid->recovery_point
+ *     and stores recovery_frame_cnt; the other fields are only parsed.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters; recovery_point and recovery_frame_cnt are written
+ ************************************************************************
+ */
+void interpret_recovery_point_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int recovery_frame_cnt, exact_match_flag, broken_link_flag, changing_slice_group_idc;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: p_Vid is left untouched instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  recovery_frame_cnt       = ue_v(    "SEI: recovery_frame_cnt"      , buf);
+  exact_match_flag         = u_1 (    "SEI: exact_match_flag"        , buf);
+  broken_link_flag         = u_1 (    "SEI: broken_link_flag"        , buf);
+  changing_slice_group_idc = u_v ( 2, "SEI: changing_slice_group_idc", buf);
+
+  // record in p_Vid that a recovery point was signalled, with its frame count
+  p_Vid->recovery_point = 1;
+  p_Vid->recovery_frame_cnt = recovery_frame_cnt;
+
+#ifdef PRINT_RECOVERY_POINT
+  printf("Recovery point SEI message\n");
+  printf("recovery_frame_cnt       = %d\n", recovery_frame_cnt);
+  printf("exact_match_flag         = %d\n", exact_match_flag);
+  printf("broken_link_flag         = %d\n", broken_link_flag);
+  printf("changing_slice_group_idc = %d\n", changing_slice_group_idc);
+#endif
+  free (buf);
+#ifdef PRINT_RECOVERY_POINT
+#undef PRINT_RECOVERY_POINT
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Decoded Picture Buffer Management Repetition SEI message.
+ *     Marking state in p_Vid is saved, overwritten while parsing, then restored.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters; active_sps is read, marking fields are restored on exit
+ ************************************************************************
+ */
+void interpret_dec_ref_pic_marking_repetition_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int original_idr_flag, original_frame_num;
+  int original_field_pic_flag, original_bottom_field_flag;
+  DecRefPicMarking_t *tmp_drpm;
+  DecRefPicMarking_t *old_drpm;
+  int old_idr_flag , old_no_output_of_prior_pics_flag, old_long_term_reference_flag , old_adaptive_ref_pic_buffering_flag;
+  Bitstream* buf;
+
+  if (NULL == p_Vid->active_sps)
+    return; // frame_mbs_only_flag is read below; nothing can be parsed without an active SPS
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  original_idr_flag     = u_1 (    "SEI: original_idr_flag"    , buf);
+  original_frame_num    = ue_v(    "SEI: original_frame_num"   , buf);
+
+  if ( !p_Vid->active_sps->frame_mbs_only_flag )
+  {
+    original_field_pic_flag = u_1 ( "SEI: original_field_pic_flag", buf);
+    if ( original_field_pic_flag )
+    {
+      original_bottom_field_flag = u_1 ( "SEI: original_bottom_field_flag", buf);
+    }
+  }
+
+#ifdef PRINT_DEC_REF_PIC_MARKING
+  printf("Decoded Picture Buffer Management Repetition SEI message\n");
+  printf("original_idr_flag       = %d\n", original_idr_flag);
+  printf("original_frame_num      = %d\n", original_frame_num);
+  if ( !p_Vid->active_sps->frame_mbs_only_flag ) // same condition under which the flags were parsed
+  {
+    printf("original_field_pic_flag = %d\n", original_field_pic_flag);
+    if ( original_field_pic_flag )
+    {
+      printf("original_bottom_field_flag = %d\n", original_bottom_field_flag);
+    }
+  }
+#endif
+
+  // we need to save everything that is probably overwritten in dec_ref_pic_marking()
+  old_drpm = p_Vid->dec_ref_pic_marking_buffer;
+  old_idr_flag = p_Vid->idr_flag;
+
+  old_no_output_of_prior_pics_flag = p_Vid->no_output_of_prior_pics_flag;
+  old_long_term_reference_flag = p_Vid->long_term_reference_flag;
+  old_adaptive_ref_pic_buffering_flag = p_Vid->adaptive_ref_pic_buffering_flag;
+
+  // set new initial values
+  p_Vid->idr_flag = original_idr_flag;
+  p_Vid->dec_ref_pic_marking_buffer = NULL;
+
+  dec_ref_pic_marking(p_Vid, buf);
+
+  // print out decoded values
+#ifdef PRINT_DEC_REF_PIC_MARKING
+  if (p_Vid->idr_flag)
+  {
+    printf("no_output_of_prior_pics_flag = %d\n", p_Vid->no_output_of_prior_pics_flag);
+    printf("long_term_reference_flag     = %d\n", p_Vid->long_term_reference_flag);
+  }
+  else
+  {
+    printf("adaptive_ref_pic_buffering_flag  = %d\n", p_Vid->adaptive_ref_pic_buffering_flag);
+    if (p_Vid->adaptive_ref_pic_buffering_flag)
+    {
+      tmp_drpm=p_Vid->dec_ref_pic_marking_buffer;
+      while (tmp_drpm != NULL)
+      {
+        printf("memory_management_control_operation  = %d\n", tmp_drpm->memory_management_control_operation);
+
+        if ((tmp_drpm->memory_management_control_operation==1)||(tmp_drpm->memory_management_control_operation==3))
+        {
+          printf("difference_of_pic_nums_minus1        = %d\n", tmp_drpm->difference_of_pic_nums_minus1);
+        }
+        if (tmp_drpm->memory_management_control_operation==2)
+        {
+          printf("long_term_pic_num                    = %d\n", tmp_drpm->long_term_pic_num);
+        }
+        if ((tmp_drpm->memory_management_control_operation==3)||(tmp_drpm->memory_management_control_operation==6))
+        {
+          printf("long_term_frame_idx                  = %d\n", tmp_drpm->long_term_frame_idx);
+        }
+        if (tmp_drpm->memory_management_control_operation==4)
+        {
+          printf("max_long_term_pic_idx_plus1          = %d\n", tmp_drpm->max_long_term_frame_idx_plus1);
+        }
+        tmp_drpm = tmp_drpm->Next;
+      }
+    }
+  }
+#endif
+
+  while (p_Vid->dec_ref_pic_marking_buffer)
+  {
+    tmp_drpm=p_Vid->dec_ref_pic_marking_buffer;
+
+    p_Vid->dec_ref_pic_marking_buffer=tmp_drpm->Next;
+    free (tmp_drpm);
+  }
+
+  // restore old values in p_Vid
+  p_Vid->dec_ref_pic_marking_buffer = old_drpm;
+  p_Vid->idr_flag = old_idr_flag;
+  p_Vid->no_output_of_prior_pics_flag = old_no_output_of_prior_pics_flag;
+  p_Vid->long_term_reference_flag = old_long_term_reference_flag;
+  p_Vid->adaptive_ref_pic_buffering_flag = old_adaptive_ref_pic_buffering_flag;
+
+  free (buf);
+#ifdef PRINT_DEC_REF_PIC_MARKING
+#undef PRINT_DEC_REF_PIC_MARKING
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Full-frame freeze SEI message. The repetition period is
+ *     parsed and printed only if PRINT_FULL_FRAME_FREEZE_INFO is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_full_frame_freeze_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int full_frame_freeze_repetition_period;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  full_frame_freeze_repetition_period  = ue_v(    "SEI: full_frame_freeze_repetition_period"   , buf);
+#ifdef PRINT_FULL_FRAME_FREEZE_INFO
+  printf("full_frame_freeze_repetition_period = %d\n", full_frame_freeze_repetition_period);
+#endif
+  free (buf);
+#ifdef PRINT_FULL_FRAME_FREEZE_INFO
+#undef PRINT_FULL_FRAME_FREEZE_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Full-frame freeze release SEI message. Nothing is parsed;
+ *     text is printed if PRINT_FULL_FRAME_FREEZE_RELEASE_INFO is defined.
+ *  \param payload
+ *     pointer to the SEI payload (not used by this function)
+ *  \param size
+ *     size of the SEI payload; only examined by the optional diagnostic
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_full_frame_freeze_release_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  // No payload field is read here. When printing is enabled, the message
+  // name is printed, followed by a complaint if size is non-zero (the
+  // diagnostic text states that the payload size should be zero).
+#ifdef PRINT_FULL_FRAME_FREEZE_RELEASE_INFO
+  printf("Full-frame freeze release SEI message\n");
+  if (size != 0)
+  {
+    printf("payload size of this message should be zero, but is %d bytes.\n", size);
+  }
+#undef PRINT_FULL_FRAME_FREEZE_RELEASE_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Full-frame snapshot SEI message. snapshot_id is parsed
+ *     and printed only if PRINT_FULL_FRAME_SNAPSHOT_INFO is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_full_frame_snapshot_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int snapshot_id;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  snapshot_id = ue_v("SEI: snapshot_id", buf);
+#ifdef PRINT_FULL_FRAME_SNAPSHOT_INFO
+  printf("Full-frame snapshot SEI message\n");
+  printf("snapshot_id = %d\n", snapshot_id);
+#endif
+  free (buf);
+#ifdef PRINT_FULL_FRAME_SNAPSHOT_INFO
+#undef PRINT_FULL_FRAME_SNAPSHOT_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Progressive refinement segment start SEI message. Both
+ *     fields are parsed and printed only if the PRINT macro is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_progressive_refinement_start_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int progressive_refinement_id, num_refinement_steps_minus1;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  progressive_refinement_id   = ue_v("SEI: progressive_refinement_id"  , buf);
+  num_refinement_steps_minus1 = ue_v("SEI: num_refinement_steps_minus1", buf);
+#ifdef PRINT_PROGRESSIVE_REFINEMENT_START_INFO
+  printf("Progressive refinement segment start SEI message\n");
+  printf("progressive_refinement_id   = %d\n", progressive_refinement_id);
+  printf("num_refinement_steps_minus1 = %d\n", num_refinement_steps_minus1);
+#endif
+  free (buf);
+#ifdef PRINT_PROGRESSIVE_REFINEMENT_START_INFO
+#undef PRINT_PROGRESSIVE_REFINEMENT_START_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Progressive refinement segment end SEI message. The id
+ *     is parsed and printed only if the PRINT macro is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_progressive_refinement_end_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int progressive_refinement_id;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  progressive_refinement_id   = ue_v("SEI: progressive_refinement_id"  , buf);
+#ifdef PRINT_PROGRESSIVE_REFINEMENT_END_INFO
+  printf("Progressive refinement segment end SEI message\n");
+  printf("progressive_refinement_id   = %d\n", progressive_refinement_id);
+#endif
+  free (buf);
+#ifdef PRINT_PROGRESSIVE_REFINEMENT_END_INFO
+#undef PRINT_PROGRESSIVE_REFINEMENT_END_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Motion-constrained slice group set SEI message. Values
+ *     are parsed and printed only if the PRINT macro is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_motion_constrained_slice_group_set_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int num_slice_groups_minus1, slice_group_id, exact_match_flag, pan_scan_rect_flag, pan_scan_rect_id;
+  int i;
+  int sliceGroupSize;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  num_slice_groups_minus1   = ue_v("SEI: num_slice_groups_minus1"  , buf);
+  sliceGroupSize = CeilLog2( num_slice_groups_minus1 + 1 ); // handed to u_v() for every slice_group_id
+#ifdef PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO
+  printf("Motion-constrained slice group set SEI message\n");
+  printf("num_slice_groups_minus1   = %d\n", num_slice_groups_minus1);
+#endif
+
+  for (i=0; i<=num_slice_groups_minus1;i++)
+  {
+    slice_group_id   = u_v (sliceGroupSize, "SEI: slice_group_id" , buf)    ;
+#ifdef PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO
+    printf("slice_group_id            = %d\n", slice_group_id);
+#endif
+  }
+
+  exact_match_flag   = u_1("SEI: exact_match_flag"  , buf);
+  pan_scan_rect_flag = u_1("SEI: pan_scan_rect_flag"  , buf);
+
+#ifdef PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO
+  printf("exact_match_flag         = %d\n", exact_match_flag);
+  printf("pan_scan_rect_flag       = %d\n", pan_scan_rect_flag);
+#endif
+
+  if (pan_scan_rect_flag)
+  {
+    pan_scan_rect_id = ue_v("SEI: pan_scan_rect_id"  , buf);
+#ifdef PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO
+    printf("pan_scan_rect_id         = %d\n", pan_scan_rect_id);
+#endif
+  }
+
+  free (buf);
+#ifdef PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO
+#undef PRINT_MOTION_CONST_SLICE_GROUP_SET_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the film grain characteristics SEI message. Values are parsed
+ *     and printed only if PRINT_FILM_GRAIN_CHARACTERISTICS_INFO is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_film_grain_characteristics_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int film_grain_characteristics_cancel_flag;
+  int model_id, separate_colour_description_present_flag;
+  int film_grain_bit_depth_luma_minus8, film_grain_bit_depth_chroma_minus8, film_grain_full_range_flag, film_grain_colour_primaries, film_grain_transfer_characteristics, film_grain_matrix_coefficients;
+  int blending_mode_id, log2_scale_factor, comp_model_present_flag[3];
+  int num_intensity_intervals_minus1, num_model_values_minus1;
+  int intensity_interval_lower_bound, intensity_interval_upper_bound;
+  int comp_model_value;
+  int film_grain_characteristics_repetition_period;
+
+  int c, i, j;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  film_grain_characteristics_cancel_flag = u_1("SEI: film_grain_characteristics_cancel_flag", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+  printf("film_grain_characteristics_cancel_flag = %d\n", film_grain_characteristics_cancel_flag);
+#endif
+  if(!film_grain_characteristics_cancel_flag)
+  {
+    model_id                                    = u_v(2, "SEI: model_id", buf);
+    separate_colour_description_present_flag    = u_1("SEI: separate_colour_description_present_flag", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+    printf("model_id = %d\n", model_id);
+    printf("separate_colour_description_present_flag = %d\n", separate_colour_description_present_flag);
+#endif
+    if (separate_colour_description_present_flag)
+    {
+      film_grain_bit_depth_luma_minus8          = u_v(3, "SEI: film_grain_bit_depth_luma_minus8", buf);
+      film_grain_bit_depth_chroma_minus8        = u_v(3, "SEI: film_grain_bit_depth_chroma_minus8", buf);
+      film_grain_full_range_flag                = u_v(1, "SEI: film_grain_full_range_flag", buf);
+      film_grain_colour_primaries               = u_v(8, "SEI: film_grain_colour_primaries", buf);
+      film_grain_transfer_characteristics       = u_v(8, "SEI: film_grain_transfer_characteristics", buf);
+      film_grain_matrix_coefficients            = u_v(8, "SEI: film_grain_matrix_coefficients", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+      printf("film_grain_bit_depth_luma_minus8 = %d\n", film_grain_bit_depth_luma_minus8);
+      printf("film_grain_bit_depth_chroma_minus8 = %d\n", film_grain_bit_depth_chroma_minus8);
+      printf("film_grain_full_range_flag = %d\n", film_grain_full_range_flag);
+      printf("film_grain_colour_primaries = %d\n", film_grain_colour_primaries);
+      printf("film_grain_transfer_characteristics = %d\n", film_grain_transfer_characteristics);
+      printf("film_grain_matrix_coefficients = %d\n", film_grain_matrix_coefficients);
+#endif
+    }
+    blending_mode_id                            = u_v(2, "SEI: blending_mode_id", buf);
+    log2_scale_factor                           = u_v(4, "SEI: log2_scale_factor", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+    printf("blending_mode_id = %d\n", blending_mode_id);
+    printf("log2_scale_factor = %d\n", log2_scale_factor);
+#endif
+    for (c = 0; c < 3; c ++) // all three flags are read before any model data
+    {
+      comp_model_present_flag[c]                = u_1("SEI: comp_model_present_flag", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+      printf("comp_model_present_flag = %d\n", comp_model_present_flag[c]);
+#endif
+    }
+    for (c = 0; c < 3; c ++) // model data follows only for entries whose flag is set
+      if (comp_model_present_flag[c])
+      {
+        num_intensity_intervals_minus1          = u_v(8, "SEI: num_intensity_intervals_minus1", buf);
+        num_model_values_minus1                 = u_v(3, "SEI: num_model_values_minus1", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+        printf("num_intensity_intervals_minus1 = %d\n", num_intensity_intervals_minus1);
+        printf("num_model_values_minus1 = %d\n", num_model_values_minus1);
+#endif
+        for (i = 0; i <= num_intensity_intervals_minus1; i ++)
+        {
+          intensity_interval_lower_bound        = u_v(8, "SEI: intensity_interval_lower_bound", buf);
+          intensity_interval_upper_bound        = u_v(8, "SEI: intensity_interval_upper_bound", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+          printf("intensity_interval_lower_bound = %d\n", intensity_interval_lower_bound);
+          printf("intensity_interval_upper_bound = %d\n", intensity_interval_upper_bound);
+#endif
+          for (j = 0; j <= num_model_values_minus1; j++)
+          {
+            comp_model_value                    = se_v("SEI: comp_model_value", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+            printf("comp_model_value = %d\n", comp_model_value);
+#endif
+          }
+        }
+      }
+    film_grain_characteristics_repetition_period = ue_v("SEI: film_grain_characteristics_repetition_period", buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+    printf("film_grain_characteristics_repetition_period = %d\n", film_grain_characteristics_repetition_period);
+#endif
+  }
+
+  free (buf);
+#ifdef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+#undef PRINT_FILM_GRAIN_CHARACTERISTICS_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the deblocking filter display preference SEI message. Values
+ *     are parsed and printed only if the PRINT macro is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_deblocking_filter_display_preference_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int deblocking_display_preference_cancel_flag;
+  int display_prior_to_deblocking_preferred_flag, dec_frame_buffering_constraint_flag, deblocking_display_preference_repetition_period;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  deblocking_display_preference_cancel_flag             = u_1("SEI: deblocking_display_preference_cancel_flag", buf);
+#ifdef PRINT_DEBLOCKING_FILTER_DISPLAY_PREFERENCE_INFO
+  printf("deblocking_display_preference_cancel_flag = %d\n", deblocking_display_preference_cancel_flag);
+#endif
+  if(!deblocking_display_preference_cancel_flag)
+  {
+    display_prior_to_deblocking_preferred_flag            = u_1("SEI: display_prior_to_deblocking_preferred_flag", buf);
+    dec_frame_buffering_constraint_flag                   = u_1("SEI: dec_frame_buffering_constraint_flag", buf);
+    deblocking_display_preference_repetition_period       = ue_v("SEI: deblocking_display_preference_repetition_period", buf);
+#ifdef PRINT_DEBLOCKING_FILTER_DISPLAY_PREFERENCE_INFO
+    printf("display_prior_to_deblocking_preferred_flag = %d\n", display_prior_to_deblocking_preferred_flag);
+    printf("dec_frame_buffering_constraint_flag = %d\n", dec_frame_buffering_constraint_flag);
+    printf("deblocking_display_preference_repetition_period = %d\n", deblocking_display_preference_repetition_period);
+#endif
+  }
+  free (buf);
+#ifdef PRINT_DEBLOCKING_FILTER_DISPLAY_PREFERENCE_INFO
+#undef PRINT_DEBLOCKING_FILTER_DISPLAY_PREFERENCE_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the stereo video info SEI message. The flags are parsed
+ *     and printed only if PRINT_STEREO_VIDEO_INFO_INFO is defined.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_stereo_video_info_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int field_views_flags;
+  int top_field_is_left_view_flag, current_frame_is_left_view_flag, next_frame_is_second_view_flag;
+  int left_view_self_contained_flag;
+  int right_view_self_contained_flag;
+  Bitstream* buf;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  field_views_flags = u_1("SEI: field_views_flags", buf);
+#ifdef PRINT_STEREO_VIDEO_INFO_INFO
+  printf("field_views_flags = %d\n", field_views_flags);
+#endif
+  if (field_views_flags)
+  {
+    top_field_is_left_view_flag         = u_1("SEI: top_field_is_left_view_flag", buf);
+#ifdef PRINT_STEREO_VIDEO_INFO_INFO
+    printf("top_field_is_left_view_flag = %d\n", top_field_is_left_view_flag);
+#endif
+  }
+  else
+  {
+    current_frame_is_left_view_flag     = u_1("SEI: current_frame_is_left_view_flag", buf);
+    next_frame_is_second_view_flag      = u_1("SEI: next_frame_is_second_view_flag", buf);
+#ifdef PRINT_STEREO_VIDEO_INFO_INFO
+    printf("current_frame_is_left_view_flag = %d\n", current_frame_is_left_view_flag);
+    printf("next_frame_is_second_view_flag = %d\n", next_frame_is_second_view_flag);
+#endif
+  }
+
+  left_view_self_contained_flag         = u_1("SEI: left_view_self_contained_flag", buf);
+  right_view_self_contained_flag        = u_1("SEI: right_view_self_contained_flag", buf);
+#ifdef PRINT_STEREO_VIDEO_INFO_INFO
+  printf("left_view_self_contained_flag = %d\n", left_view_self_contained_flag);
+  printf("right_view_self_contained_flag = %d\n", right_view_self_contained_flag);
+#endif
+  free (buf);
+#ifdef PRINT_STEREO_VIDEO_INFO_INFO
+#undef PRINT_STEREO_VIDEO_INFO_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Reserved SEI message. Each payload byte is read and,
+ *     if PRINT_RESERVED_INFO is defined, printed; nothing is stored.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     number of bytes in the payload
+ *  \param p_Vid
+ *     video parameters (not used by this function)
+ ************************************************************************
+ */
+void interpret_reserved_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  byte cur;
+  int pos;
+
+#ifdef PRINT_RESERVED_INFO
+  printf("Reserved SEI message\n");
+#endif
+
+  // walk every payload byte; each one is only read and, optionally, printed
+  for (pos = 0; pos < size; pos++)
+  {
+    cur = payload[pos];
+#ifdef PRINT_RESERVED_INFO
+    printf("reserved_sei_message_payload_byte = %d\n", cur);
+#endif
+  }
+#ifdef PRINT_RESERVED_INFO
+#undef PRINT_RESERVED_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Buffering period SEI message: activates the SPS named by
+ *     seq_parameter_set_id, then reads the initial CPB removal delays.
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     size of the SEI payload, used as the bitstream length
+ *  \param p_Vid
+ *     video parameters; SeqParSet is indexed and passed to activate_sps()
+ ************************************************************************
+ */
+void interpret_buffering_period_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  int seq_parameter_set_id, initial_cpb_removal_delay, initial_cpb_removal_delay_offset;
+  unsigned int k;
+  Bitstream* buf;
+  seq_parameter_set_rbsp_t *sps;
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // allocation failed: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  seq_parameter_set_id   = ue_v("SEI: seq_parameter_set_id"  , buf);
+  // H.264 limits this id to 0..31; reject other values before indexing p_Vid->SeqParSet
+  if (seq_parameter_set_id < 0 || seq_parameter_set_id > 31)
+  {
+    free (buf);
+    return;
+  }
+  sps = &p_Vid->SeqParSet[seq_parameter_set_id];
+  activate_sps(p_Vid, sps);
+
+#ifdef PRINT_BUFFERING_PERIOD_INFO
+  printf("Buffering period SEI message\n");
+  printf("seq_parameter_set_id   = %d\n", seq_parameter_set_id);
+#endif
+  // Note: NalHrdBpPresentFlag and CpbDpbDelaysPresentFlag can also be set "by some means not specified in this Recommendation | International Standard"
+  if (sps->vui_parameters_present_flag)
+  {
+    if (sps->vui_seq_parameters.nal_hrd_parameters_present_flag)
+    {
+      for (k=0; k<sps->vui_seq_parameters.nal_hrd_parameters.cpb_cnt_minus1+1; k++)
+      {
+        initial_cpb_removal_delay        = u_v(sps->vui_seq_parameters.nal_hrd_parameters.initial_cpb_removal_delay_length_minus1+1, "SEI: initial_cpb_removal_delay"        , buf);
+        initial_cpb_removal_delay_offset = u_v(sps->vui_seq_parameters.nal_hrd_parameters.initial_cpb_removal_delay_length_minus1+1, "SEI: initial_cpb_removal_delay_offset" , buf);
+#ifdef PRINT_BUFFERING_PERIOD_INFO
+        printf("nal initial_cpb_removal_delay[%d]        = %d\n", k, initial_cpb_removal_delay);
+        printf("nal initial_cpb_removal_delay_offset[%d] = %d\n", k, initial_cpb_removal_delay_offset);
+#endif
+      }
+    }
+
+    if (sps->vui_seq_parameters.vcl_hrd_parameters_present_flag)
+    {
+      for (k=0; k<sps->vui_seq_parameters.vcl_hrd_parameters.cpb_cnt_minus1+1; k++)
+      {
+        initial_cpb_removal_delay        = u_v(sps->vui_seq_parameters.vcl_hrd_parameters.initial_cpb_removal_delay_length_minus1+1, "SEI: initial_cpb_removal_delay"        , buf);
+        initial_cpb_removal_delay_offset = u_v(sps->vui_seq_parameters.vcl_hrd_parameters.initial_cpb_removal_delay_length_minus1+1, "SEI: initial_cpb_removal_delay_offset" , buf);
+#ifdef PRINT_BUFFERING_PERIOD_INFO
+        printf("vcl initial_cpb_removal_delay[%d]        = %d\n", k, initial_cpb_removal_delay);
+        printf("vcl initial_cpb_removal_delay_offset[%d] = %d\n", k, initial_cpb_removal_delay_offset);
+#endif
+      }
+    }
+  }
+
+  free (buf);
+#ifdef PRINT_BUFFERING_PERIOD_INFO
+#undef PRINT_BUFFERING_PERIOD_INFO
+#endif
+}
+
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the Picture timing SEI message
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     the size of the SEI payload
+ *  \param p_Vid
+ *     pointer to the VideoParameters structure (supplies the active SPS)
+ *
+ ************************************************************************
+ */
+void interpret_picture_timing_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  // Parses a picture-timing SEI payload against the active SPS; the values are only printed (when enabled), never stored.
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;
+
+  int cpb_removal_delay, dpb_output_delay, picture_structure_present_flag, picture_structure;
+  int clock_time_stamp_flag;
+  int ct_type, nuit_field_based_flag, counting_type, full_timestamp_flag, discontinuity_flag, cnt_dropped_flag, nframes;
+  int seconds_value, minutes_value, hours_value, seconds_flag, minutes_flag, hours_flag, time_offset;
+  int NumClockTs = 0;
+  int i;
+
+  int cpb_removal_len = 24;
+  int dpb_output_len  = 24;
+
+  Boolean CpbDpbDelaysPresentFlag;
+
+  Bitstream* buf;
+
+  if (NULL==active_sps)
+  {
+    fprintf (stderr, "Warning: no active SPS, timing SEI cannot be parsed\n");
+    return;
+  }
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL==buf)
+    return; // out of memory: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+#ifdef PRINT_PCITURE_TIMING_INFO
+  printf("Picture timing SEI message\n");
+#endif
+
+  // CpbDpbDelaysPresentFlag can also be set "by some means not specified in this Recommendation | International Standard"
+  CpbDpbDelaysPresentFlag =  (Boolean) (active_sps->vui_parameters_present_flag
+                              && (   (active_sps->vui_seq_parameters.nal_hrd_parameters_present_flag != 0)
+                                   ||(active_sps->vui_seq_parameters.vcl_hrd_parameters_present_flag != 0)));
+
+  if (CpbDpbDelaysPresentFlag )
+  {
+    // field widths come from the NAL HRD parameters when present, otherwise from the VCL HRD parameters
+    if (active_sps->vui_parameters_present_flag)
+    {
+      if (active_sps->vui_seq_parameters.nal_hrd_parameters_present_flag)
+      {
+        cpb_removal_len = active_sps->vui_seq_parameters.nal_hrd_parameters.cpb_removal_delay_length_minus1 + 1;
+        dpb_output_len  = active_sps->vui_seq_parameters.nal_hrd_parameters.dpb_output_delay_length_minus1  + 1;
+      }
+      else if (active_sps->vui_seq_parameters.vcl_hrd_parameters_present_flag)
+      {
+        cpb_removal_len = active_sps->vui_seq_parameters.vcl_hrd_parameters.cpb_removal_delay_length_minus1 + 1;
+        dpb_output_len  = active_sps->vui_seq_parameters.vcl_hrd_parameters.dpb_output_delay_length_minus1  + 1;
+      }
+    }
+
+    if ((active_sps->vui_seq_parameters.nal_hrd_parameters_present_flag)||
+      (active_sps->vui_seq_parameters.vcl_hrd_parameters_present_flag))
+    {
+      cpb_removal_delay = u_v(cpb_removal_len, "SEI: cpb_removal_delay" , buf);
+      dpb_output_delay  = u_v(dpb_output_len,  "SEI: dpb_output_delay"  , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+      printf("cpb_removal_delay = %d\n",cpb_removal_delay);
+      printf("dpb_output_delay  = %d\n",dpb_output_delay);
+#endif
+    }
+  }
+
+  if (!active_sps->vui_parameters_present_flag)
+  {
+    picture_structure_present_flag = 0;
+  }
+  else
+  {
+    picture_structure_present_flag  =  active_sps->vui_seq_parameters.pic_struct_present_flag;
+  }
+
+  if (picture_structure_present_flag)
+  {
+    picture_structure = u_v(4, "SEI: pic_struct" , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+    printf("picture_structure = %d\n",picture_structure);
+#endif
+    // pic_struct selects how many clock timestamps may follow
+    switch (picture_structure)
+    {
+    case 0:
+    case 1:
+    case 2:
+      NumClockTs = 1;
+      break;
+    case 3:
+    case 4:
+    case 7:
+      NumClockTs = 2;
+      break;
+    case 5:
+    case 6:
+    case 8:
+      NumClockTs = 3;
+      break;
+    default:
+      error("reserved picture_structure used (can't determine NumClockTs)", 500);
+    }
+    for (i=0; i<NumClockTs; i++)
+    {
+      clock_time_stamp_flag = u_1("SEI: clock_time_stamp_flag"  , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+      printf("clock_time_stamp_flag = %d\n",clock_time_stamp_flag);
+#endif
+      if (clock_time_stamp_flag)
+      {
+        ct_type               = u_v(2, "SEI: ct_type"               , buf);
+        nuit_field_based_flag = u_1(   "SEI: nuit_field_based_flag" , buf);
+        counting_type         = u_v(5, "SEI: counting_type"         , buf);
+        full_timestamp_flag   = u_1(   "SEI: full_timestamp_flag"   , buf);
+        discontinuity_flag    = u_1(   "SEI: discontinuity_flag"    , buf);
+        cnt_dropped_flag      = u_1(   "SEI: cnt_dropped_flag"      , buf);
+        nframes               = u_v(8, "SEI: nframes"               , buf);
+
+#ifdef PRINT_PCITURE_TIMING_INFO
+        printf("ct_type               = %d\n",ct_type);
+        printf("nuit_field_based_flag = %d\n",nuit_field_based_flag);
+        printf("full_timestamp_flag   = %d\n",full_timestamp_flag);
+        printf("discontinuity_flag    = %d\n",discontinuity_flag);
+        printf("cnt_dropped_flag      = %d\n",cnt_dropped_flag);
+        printf("nframes               = %d\n",nframes);
+#endif
+        if (full_timestamp_flag)
+        {
+          seconds_value         = u_v(6, "SEI: seconds_value"   , buf);
+          minutes_value         = u_v(6, "SEI: minutes_value"   , buf);
+          hours_value           = u_v(5, "SEI: hours_value"     , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+          printf("seconds_value = %d\n",seconds_value);
+          printf("minutes_value = %d\n",minutes_value);
+          printf("hours_value   = %d\n",hours_value);
+#endif
+        }
+        else
+        {
+          // partial timestamp: each field is preceded by its own presence flag
+          seconds_flag          = u_1(   "SEI: seconds_flag" , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+          printf("seconds_flag = %d\n",seconds_flag);
+#endif
+          if (seconds_flag)
+          {
+            seconds_value         = u_v(6, "SEI: seconds_value"   , buf);
+            minutes_flag          = u_1(   "SEI: minutes_flag" , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+            printf("seconds_value = %d\n",seconds_value);
+            printf("minutes_flag  = %d\n",minutes_flag);
+#endif
+            if(minutes_flag)
+            {
+              minutes_value         = u_v(6, "SEI: minutes_value"   , buf);
+              hours_flag            = u_1(   "SEI: hours_flag" , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+              printf("minutes_value = %d\n",minutes_value);
+              printf("hours_flag    = %d\n",hours_flag);
+#endif
+              if(hours_flag)
+              {
+                hours_value           = u_v(5, "SEI: hours_value"     , buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+                printf("hours_value   = %d\n",hours_value);
+#endif
+              }
+            }
+          }
+        }
+        {
+          // time_offset width: VCL HRD value first, then NAL HRD value, else 24 bits
+          int time_offset_length;
+          if (active_sps->vui_seq_parameters.vcl_hrd_parameters_present_flag)
+            time_offset_length = active_sps->vui_seq_parameters.vcl_hrd_parameters.time_offset_length;
+          else if (active_sps->vui_seq_parameters.nal_hrd_parameters_present_flag)
+            time_offset_length = active_sps->vui_seq_parameters.nal_hrd_parameters.time_offset_length;
+          else
+            time_offset_length = 24;
+          if (time_offset_length)
+            time_offset = i_v(time_offset_length, "SEI: time_offset"   , buf);
+          else
+            time_offset = 0;
+#ifdef PRINT_PCITURE_TIMING_INFO
+          printf("time_offset   = %d\n",time_offset);
+#endif
+        }
+      }
+    }
+  }
+  free (buf);
+#ifdef PRINT_PCITURE_TIMING_INFO
+#undef PRINT_PCITURE_TIMING_INFO
+#endif
+}
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the HDR tone-mapping SEI message (JVT-T060)
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     the size of the SEI payload
+ *  \param p_Vid
+ *     pointer to the VideoParameters structure
+ *
+ ************************************************************************
+ */
+typedef struct // scratch copy of one parsed tone-mapping SEI message (filled by interpret_tone_mapping)
+{
+  unsigned int  tone_map_id;                  // read with ue_v; the LUT is only built when this is 0
+  unsigned char tone_map_cancel_flag;         // when set, none of the fields below are parsed
+  unsigned int  tone_map_repetition_period;
+  unsigned char coded_data_bit_depth;         // 8-bit field; 1<<value gives max_coded_num in the parser
+  unsigned char sei_bit_depth;                // 8-bit field; 1<<value gives max_output_num in the parser
+  unsigned int  model_id;                     // selects which of the model groups below is parsed (0..3)
+  // variables for model 0
+  int  min_value;
+  int  max_value;
+  // variables for model 1
+  int  sigmoid_midpoint;
+  int  sigmoid_width;
+  // variables for model 2
+  int start_of_coded_interval[1<<MAX_SEI_BIT_DEPTH]; // the parser fills 1<<sei_bit_depth entries
+  // variables for model 3
+  int num_pivots;                             // 16-bit field
+  int coded_pivot_value[MAX_NUM_PIVOTS];      // the parser writes indices 0 .. num_pivots+1
+  int sei_pivot_value[MAX_NUM_PIVOTS];        // the parser writes indices 0 .. num_pivots+1
+} tone_mapping_struct_tmp;
+
+void interpret_tone_mapping( byte* payload, int size, VideoParameters *p_Vid )
+{
+  // Parses a tone-mapping SEI payload into a local scratch struct. When ENABLE_OUTPUT_TONEMAPPING is
+  // set and tone_map_id is 0, the parameters and a look-up table are stored in p_Vid->seiToneMapping.
+  tone_mapping_struct_tmp seiToneMappingTmp;
+  Bitstream* buf;
+  int i = 0, max_coded_num, max_output_num;
+
+  memset (&seiToneMappingTmp, 0, sizeof (tone_mapping_struct_tmp));
+
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // out of memory: drop this SEI message rather than dereference NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+
+  seiToneMappingTmp.tone_map_id = ue_v("SEI: tone_map_id", buf);
+  seiToneMappingTmp.tone_map_cancel_flag = (unsigned char) u_1("SEI: tone_map_cancel_flag", buf);
+
+#ifdef PRINT_TONE_MAPPING
+  printf("Tone-mapping SEI message\n");
+  printf("tone_map_id = %d\n", seiToneMappingTmp.tone_map_id);
+  if (seiToneMappingTmp.tone_map_id != 0)
+    printf("WARNING! Tone_map_id != 0, print the SEI message info only. The tone mapping is actually applied only when Tone_map_id==0\n\n");
+  printf("tone_map_cancel_flag = %d\n", seiToneMappingTmp.tone_map_cancel_flag);
+#endif
+
+  if (!seiToneMappingTmp.tone_map_cancel_flag)
+  {
+    seiToneMappingTmp.tone_map_repetition_period  = ue_v(  "SEI: tone_map_repetition_period", buf);
+    seiToneMappingTmp.coded_data_bit_depth        = (unsigned char)u_v (8,"SEI: coded_data_bit_depth"      , buf);
+    seiToneMappingTmp.sei_bit_depth               = (unsigned char)u_v (8,"SEI: sei_bit_depth"             , buf);
+    seiToneMappingTmp.model_id                    = ue_v(  "SEI: model_id"                  , buf);
+#ifdef PRINT_TONE_MAPPING
+    printf("tone_map_repetition_period = %d\n", seiToneMappingTmp.tone_map_repetition_period);
+    printf("coded_data_bit_depth = %d\n", seiToneMappingTmp.coded_data_bit_depth);
+    printf("sei_bit_depth = %d\n", seiToneMappingTmp.sei_bit_depth);
+    printf("model_id = %d\n", seiToneMappingTmp.model_id);
+#endif
+
+    // Both depths are raw 8-bit fields from the stream. Refuse values for which the shifts below
+    // would be undefined, or for which model 2 would overrun start_of_coded_interval[].
+    if (seiToneMappingTmp.coded_data_bit_depth > 30 || seiToneMappingTmp.sei_bit_depth > 30
+      || (seiToneMappingTmp.model_id == 2 && seiToneMappingTmp.sei_bit_depth > MAX_SEI_BIT_DEPTH))
+    {
+      free (buf);
+      return;
+    }
+    max_coded_num  = 1<<seiToneMappingTmp.coded_data_bit_depth;
+    max_output_num = 1<<seiToneMappingTmp.sei_bit_depth;
+
+    if (seiToneMappingTmp.model_id == 0)
+    { // linear mapping with clipping
+      seiToneMappingTmp.min_value   = u_v (32,"SEI: min_value", buf);
+      seiToneMappingTmp.max_value   = u_v (32,"SEI: max_value", buf);
+#ifdef PRINT_TONE_MAPPING
+      printf("min_value = %d, max_value = %d\n", seiToneMappingTmp.min_value, seiToneMappingTmp.max_value);
+#endif
+    }
+    else if (seiToneMappingTmp.model_id == 1)
+    { // sigmoidal mapping
+      seiToneMappingTmp.sigmoid_midpoint = u_v (32,"SEI: sigmoid_midpoint", buf);
+      seiToneMappingTmp.sigmoid_width    = u_v (32,"SEI: sigmoid_width", buf);
+#ifdef PRINT_TONE_MAPPING
+      printf("sigmoid_midpoint = %d, sigmoid_width = %d\n", seiToneMappingTmp.sigmoid_midpoint, seiToneMappingTmp.sigmoid_width);
+#endif
+    }
+    else if (seiToneMappingTmp.model_id == 2)
+    { // user defined table mapping: one interval start per output value, each a whole number of bytes
+      for (i=0; i<max_output_num; i++)
+      {
+        seiToneMappingTmp.start_of_coded_interval[i] = u_v((((seiToneMappingTmp.coded_data_bit_depth+7)>>3)<<3), "SEI: start_of_coded_interval"  , buf);
+      }
+    }
+    else if (seiToneMappingTmp.model_id == 3)
+    {  // piece-wise linear mapping
+      seiToneMappingTmp.num_pivots = u_v (16,"SEI: num_pivots", buf);
+#ifdef PRINT_TONE_MAPPING
+      printf("num_pivots = %d\n", seiToneMappingTmp.num_pivots);
+#endif
+      // Entries 0 .. num_pivots+1 of both pivot arrays are written below; refuse counts that do not fit.
+      if (seiToneMappingTmp.num_pivots + 1 >= (MAX_NUM_PIVOTS))
+      {
+        free (buf);
+        return;
+      }
+      seiToneMappingTmp.coded_pivot_value[0] = 0;
+      seiToneMappingTmp.sei_pivot_value[0] = 0;
+      seiToneMappingTmp.coded_pivot_value[seiToneMappingTmp.num_pivots+1] = max_coded_num-1;
+      seiToneMappingTmp.sei_pivot_value[seiToneMappingTmp.num_pivots+1] = max_output_num-1;
+      for (i=1; i < seiToneMappingTmp.num_pivots+1; i++)
+      {
+        seiToneMappingTmp.coded_pivot_value[i] = u_v( (((seiToneMappingTmp.coded_data_bit_depth+7)>>3)<<3), "SEI: coded_pivot_value", buf);
+        seiToneMappingTmp.sei_pivot_value[i] = u_v( (((seiToneMappingTmp.sei_bit_depth+7)>>3)<<3), "SEI: sei_pivot_value", buf);
+#ifdef PRINT_TONE_MAPPING
+        printf("coded_pivot_value[%d] = %d, sei_pivot_value[%d] = %d\n", i, seiToneMappingTmp.coded_pivot_value[i], i, seiToneMappingTmp.sei_pivot_value[i]);
+#endif
+      }
+    }
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+    // Currently, only when the map_id == 0, the tone-mapping is actually applied.
+    if (seiToneMappingTmp.tone_map_id== 0)
+    {
+      int j;
+      p_Vid->seiToneMapping->seiHasTone_mapping = TRUE;
+      p_Vid->seiToneMapping->tone_map_repetition_period = seiToneMappingTmp.tone_map_repetition_period;
+      p_Vid->seiToneMapping->coded_data_bit_depth = seiToneMappingTmp.coded_data_bit_depth;
+      p_Vid->seiToneMapping->sei_bit_depth = seiToneMappingTmp.sei_bit_depth;
+      p_Vid->seiToneMapping->model_id = seiToneMappingTmp.model_id;
+      p_Vid->seiToneMapping->count = 0;
+      // generate the look up table of tone mapping
+      // NOTE(review): lut[] is written at indices up to max_coded_num-1 (and, in cases 2 and 3, at
+      // stream-supplied interval/pivot values); its capacity is not visible here - confirm it suffices.
+      switch(seiToneMappingTmp.model_id)
+      {
+      case 0:            // linear mapping with clipping
+        // One pass over [0, max_coded_num): out-of-range min/max values can no longer move the index
+        // outside that range, while in-range values yield exactly the same table as before.
+        for (i=0; i<max_coded_num; i++)
+        {
+          if (i >= seiToneMappingTmp.max_value)
+            p_Vid->seiToneMapping->lut[i] = (imgpel) (max_output_num - 1);
+          else if (i <= seiToneMappingTmp.min_value)
+            p_Vid->seiToneMapping->lut[i] = 0;
+          else
+            p_Vid->seiToneMapping->lut[i] = (imgpel) ((i-seiToneMappingTmp.min_value) * (max_output_num-1)/(seiToneMappingTmp.max_value- seiToneMappingTmp.min_value));
+        }
+        break;
+      case 1: // sigmoid mapping
+        for (i=0; i < max_coded_num; i++)
+        {
+          double tmp = 1.0 + exp( -6*(double)(i-seiToneMappingTmp.sigmoid_midpoint)/seiToneMappingTmp.sigmoid_width);
+          p_Vid->seiToneMapping->lut[i] = (imgpel)( (double)(max_output_num-1)/ tmp + 0.5);
+        }
+        break;
+      case 2: // user defined table
+        if (0 < max_output_num-1)
+        {
+          for (j=0; j<max_output_num-1; j++)
+          {
+            for (i=seiToneMappingTmp.start_of_coded_interval[j]; i<seiToneMappingTmp.start_of_coded_interval[j+1]; i++)
+            {
+              p_Vid->seiToneMapping->lut[i] = (imgpel) j;
+            }
+          }
+          p_Vid->seiToneMapping->lut[i] = (imgpel) (max_output_num - 1);
+        }
+        break;
+      case 3: // piecewise linear mapping
+        for (j=0; j<seiToneMappingTmp.num_pivots+1; j++)
+        {
+          double slope = (double)(seiToneMappingTmp.sei_pivot_value[j+1] - seiToneMappingTmp.sei_pivot_value[j])/(seiToneMappingTmp.coded_pivot_value[j+1]-seiToneMappingTmp.coded_pivot_value[j]);
+          for (i=seiToneMappingTmp.coded_pivot_value[j]; i <= seiToneMappingTmp.coded_pivot_value[j+1]; i++)
+          {
+            p_Vid->seiToneMapping->lut[i] = (imgpel) (seiToneMappingTmp.sei_pivot_value[j] + (int)(( (i - seiToneMappingTmp.coded_pivot_value[j]) * slope)));
+          }
+        }
+        break;
+      default:
+        break;
+      } // end switch
+    }
+#endif
+  } // end !tone_map_cancel_flag
+  free (buf);
+}
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+// tone map using the look-up-table generated according to SEI tone mapping message
+void tone_map (imgpel** imgX, imgpel* lut, int size_x, int size_y)
+{
+  // Replaces every sample of the size_x by size_y plane imgX with its entry in lut.
+  int x, y;
+
+  for (y = 0; y < size_y; ++y)
+    for (x = 0; x < size_x; ++x)
+    {
+      imgpel *sample = &imgX[y][x];
+      *sample = (imgpel) lut[*sample];
+    }
+}
+
+void init_tone_mapping_sei(ToneMappingSEI *seiToneMapping)
+{
+  seiToneMapping->count = 0;                   // restart the repetition counter
+  seiToneMapping->seiHasTone_mapping = FALSE;  // no tone mapping is active after init
+}
+
+void update_tone_mapping_sei(ToneMappingSEI *seiToneMapping)
+{
+  // Advances the repetition bookkeeping by one call. A period of 0 clears the mapping at once, a
+  // period above 1 clears it once count reaches the period, and a period of 1 changes nothing.
+  int expired = 0;
+
+  if (seiToneMapping->tone_map_repetition_period > 1)
+  {
+    seiToneMapping->count++;
+    expired = (seiToneMapping->count >= seiToneMapping->tone_map_repetition_period);
+  }
+  else if (seiToneMapping->tone_map_repetition_period == 0)
+    expired = 1;
+  if (!expired)
+    return;
+  seiToneMapping->seiHasTone_mapping = FALSE;
+  seiToneMapping->count = 0;
+}
+#endif
+
+/*!
+ ************************************************************************
+ *  \brief
+ *     Interpret the post filter hints SEI message (JVT-U035)
+ *  \param payload
+ *     pointer to the SEI payload bytes
+ *  \param size
+ *     the size of the SEI payload
+ *  \param p_Vid
+ *     pointer to the VideoParameters structure
+ *    
+ ************************************************************************
+ */
+void interpret_post_filter_hints_info( byte* payload, int size, VideoParameters *p_Vid )
+{
+  Bitstream* buf;
+  unsigned int filter_hint_size_y, filter_hint_size_x, filter_hint_type, color_component, cx, cy, additional_extension_flag;
+  int ***filter_hint;
+  buf = malloc(sizeof(Bitstream));
+  if (NULL == buf)
+    return; // out of memory: skip this SEI message instead of dereferencing NULL
+  buf->bitstream_length = size;
+  buf->streamBuffer = payload;
+  buf->frame_bitoffset = 0;
+  // Parse the hint: dimensions and type first, then 3 planes of size_y x size_x signed values.
+  filter_hint_size_y = ue_v("SEI: filter_hint_size_y", buf); // interpret post-filter hint SEI here
+  filter_hint_size_x = ue_v("SEI: filter_hint_size_x", buf); // interpret post-filter hint SEI here
+  filter_hint_type   = u_v(2, "SEI: filter_hint_type", buf); // interpret post-filter hint SEI here
+  // The sizes come from the bitstream and drive an allocation; H.264 Annex D limits both to 1..15.
+  if (filter_hint_size_y < 1 || filter_hint_size_y > 15 || filter_hint_size_x < 1 || filter_hint_size_x > 15)
+  {
+    free( buf );
+    return;
+  }
+  get_mem3Dint (&filter_hint, 3, filter_hint_size_y, filter_hint_size_x);
+  for (color_component = 0; color_component < 3; color_component ++)
+    for (cy = 0; cy < filter_hint_size_y; cy ++)
+      for (cx = 0; cx < filter_hint_size_x; cx ++)
+        filter_hint[color_component][cy][cx] = se_v("SEI: filter_hint", buf); // interpret post-filter hint SEI here
+  additional_extension_flag = u_1("SEI: additional_extension_flag", buf); // interpret post-filter hint SEI here
+#ifdef PRINT_POST_FILTER_HINT_INFO
+  printf(" Post-filter hint SEI message\n");
+  printf(" post_filter_hint_size_y %d \n", filter_hint_size_y);
+  printf(" post_filter_hint_size_x %d \n", filter_hint_size_x);
+  printf(" post_filter_hint_type %d \n",   filter_hint_type);
+  for (color_component = 0; color_component < 3; color_component ++)
+    for (cy = 0; cy < filter_hint_size_y; cy ++)
+      for (cx = 0; cx < filter_hint_size_x; cx ++)
+        printf(" post_filter_hint[%d][%d][%d] %d \n", color_component, cy, cx, filter_hint[color_component][cy][cx]);
+  printf(" additional_extension_flag %d \n", additional_extension_flag);
+#undef PRINT_POST_FILTER_HINT_INFO
+#endif
+  free_mem3Dint (filter_hint);
+  free( buf );
+}
diff --git a/Src/h264dec/ldecod/src/storable_picture.c b/Src/h264dec/ldecod/src/storable_picture.c
new file mode 100644
index 00000000..c12b68b3
--- /dev/null
+++ b/Src/h264dec/ldecod/src/storable_picture.c
@@ -0,0 +1,287 @@
+#include "global.h"
+#include "mbuffer.h"
+#include "memalloc.h"
+
+static void alloc_pic_motion(VideoParameters *p_Vid, PicMotionParams *motion, int size_y, int size_x)
+{
+	// Allocates the motion storage of one picture; sizes are handed to the allocators unchanged.
+	// TODO: benski> re-use memory just like for image data
+
+	// field_references is only allocated when the active SPS has frame_mbs_only_flag cleared
+	if (p_Vid->active_sps->frame_mbs_only_flag)
+		motion->field_references = 0; // just in case
+	else
+		get_mem3Dref(&motion->field_references, 4, size_y, size_x);
+
+	// take recycled motion arrays from the cache when its dimensions match ...
+	if (motion_cache_dimensions_match(&p_Vid->motion_cache, size_x, size_y))
+	{
+		motion->motion[LIST_0] = motion_cache_get(&p_Vid->motion_cache);
+		motion->motion[LIST_1] = motion_cache_get(&p_Vid->motion_cache);
+	}
+	// ... and allocate whatever is still missing afterwards
+	if (NULL == motion->motion[LIST_0])
+		get_mem2DPicMotion(&motion->motion[LIST_0], size_y, size_x);
+	if (NULL == motion->motion[LIST_1])
+		get_mem2DPicMotion(&motion->motion[LIST_1], size_y, size_x);
+
+	// one zero-initialised byte per (size_y * size_x) entry
+	motion->mb_field = calloc (size_y * size_x, sizeof(byte));
+	if (!motion->mb_field)
+		no_mem_exit("alloc_storable_picture: motion->mb_field");
+
+	get_mem2D (&motion->field_frame, size_y, size_x);
+}
+
+void free_pic_motion(VideoParameters *p_Vid, PicMotionParams *motion, int size_x, int size_y)
+{
+	// Releases what alloc_pic_motion set up; sizes are divided by BLOCK_SIZE for the cache comparison.
+	if (motion->motion[LIST_0] != NULL)
+	{
+		const int blocks_x = size_x / BLOCK_SIZE;
+		const int blocks_y = size_y / BLOCK_SIZE;
+		if (!motion_cache_dimensions_match(&p_Vid->motion_cache, blocks_x, blocks_y))
+		{
+			free_mem2DPicMotion(motion->motion[LIST_0]);
+			free_mem2DPicMotion(motion->motion[LIST_1]);
+		}
+		else // matching dimensions: hand both arrays to the cache instead of freeing them
+		{
+			motion_cache_add(&p_Vid->motion_cache, motion->motion[LIST_0]);
+			motion_cache_add(&p_Vid->motion_cache, motion->motion[LIST_1]);
+		}
+		motion->motion[LIST_0] = NULL;
+		motion->motion[LIST_1] = NULL;
+	}
+	if (motion->field_references != 0)
+	{
+		free_mem3Dref(motion->field_references);
+		motion->field_references = 0;
+	}
+	if (motion->mb_field != NULL)
+	{
+		free(motion->mb_field);
+		motion->mb_field = NULL;
+	}
+	if (motion->field_frame != NULL)
+	{
+		free_mem2D (motion->field_frame);
+		motion->field_frame = NULL;
+	}
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Free picture memory.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param p
+ *    Picture to be freed
+ *
+ ************************************************************************
+ */
+static void internal_free_storable_picture(VideoParameters *p_Vid, StorablePicture* p)
+{
+	// Tears down a picture: motion data, image planes (handed to the image caches when the
+	// dimensions match), slice map, tone-mapping LUT and finally the structure itself.
+	int plane, uv;
+
+	if (NULL == p)
+		return;
+
+	free_pic_motion(p_Vid, &p->motion, p->size_x, p->size_y);
+
+	// JVmotion is released unconditionally (the IS_INDEPENDENT test below is disabled)
+	//if( IS_INDEPENDENT(p_Vid) )
+	for (plane = 0; plane < MAX_PLANE; ++plane)
+		free_pic_motion(p_Vid, &p->JVmotion[plane], p->size_x, p->size_y);
+
+	// luma goes through image_cache[0]
+	if (!image_cache_dimensions_match(&p_Vid->image_cache[0], p->size_x, p->size_y))
+		free_memImage(p->imgY);
+	else
+		image_cache_add(&p_Vid->image_cache[0], p->imgY);
+
+	// both chroma planes share image_cache[1]
+	for (uv = 0; uv < 2; ++uv)
+	{
+		if (!image_cache_dimensions_match(&p_Vid->image_cache[1], p->size_x_cr, p->size_y_cr))
+			free_memImage(p->imgUV[uv]);
+		else
+			image_cache_add(&p_Vid->image_cache[1], p->imgUV[uv]);
+	}
+
+	if (p->slice_id != NULL)
+	{
+		free_mem2Dshort(p->slice_id);
+		p->slice_id = NULL;
+	}
+
+	// the LUT is only released when seiHasTone_mapping is set
+	if (p->seiHasTone_mapping)
+		free(p->tone_mapping_lut);
+
+	// the structure came from _aligned_malloc, so it goes back through _aligned_free
+	_aligned_free(p);
+}
+
+void free_storable_picture(VideoParameters *p_Vid, StorablePicture* p)
+{
+	if (NULL == p)
+		return;
+	if (0 == --p->retain_count) // dropped the last reference
+		internal_free_storable_picture(p_Vid, p);
+}
+
+
+/*!
+ ************************************************************************
+ * \brief
+ *    Allocate memory for a stored picture.
+ *
+ * \param p_Vid
+ *      image decoding parameters for current picture
+ * \param structure
+ *    picture structure
+ * \param size_x
+ *    horizontal luma size
+ * \param size_y
+ *    vertical luma size
+ * \param size_x_cr
+ *    horizontal chroma size
+ * \param size_y_cr
+ *    vertical chroma size
+ *
+ * \return
+ *    the allocated StorablePicture structure
+ ************************************************************************
+ */
+#define ROUNDUP32(size) (((size)+31) & ~31)
+
+StorablePicture* alloc_storable_picture(VideoParameters *p_Vid, PictureStructure structure, int size_x, int size_y, int size_x_cr, int size_y_cr)
+{
+  seq_parameter_set_rbsp_t *active_sps = p_Vid->active_sps;  
+  // only chroma_format_idc of the active SPS is consulted below
+  StorablePicture *s;
+  int   nplane;
+  // The structure is allocated 32-byte aligned and zero-filled, so all of its members start out as 0.
+  //printf ("Allocating (%s) picture (x=%d, y=%d, x_cr=%d, y_cr=%d)\n", (type == FRAME)?"FRAME":(type == TOP_FIELD)?"TOP_FIELD":"BOTTOM_FIELD", size_x, size_y, size_x_cr, size_y_cr);
+  s = _aligned_malloc(sizeof(StorablePicture), 32);
+  if (NULL==s)
+    return 0; // the only point where 0 is returned
+	memset(s, 0, sizeof(StorablePicture));
+  // the creator holds the first reference; free_storable_picture() drops one per call
+	s->retain_count = 1;
+	s->time_code = (uint64_t)-666; // NOTE(review): looks like an "unset" sentinel - confirm
+  // any structure other than FRAME gets half the vertical size
+  if (structure!=FRAME)
+  {
+    size_y    /= 2;
+    size_y_cr /= 2;
+  }
+  // NOTE(review): 256 is presumably the 16x16 luma samples of one macroblock - confirm
+  s->PicSizeInMbs = (size_x*size_y)/256;
+  // luma: reuse a plane from image_cache[0] when its dimensions match, otherwise allocate one
+	if (image_cache_dimensions_match(&p_Vid->image_cache[0], size_x, size_y))
+		s->imgY = image_cache_get(&p_Vid->image_cache[0]);
+	if (!s->imgY)
+		s->imgY = get_memImage(size_x, size_y);
+  // chroma planes are skipped entirely for YUV400; imgUV[] then stays 0 from the memset
+	if (active_sps->chroma_format_idc != YUV400)
+	{
+		if (image_cache_dimensions_match(&p_Vid->image_cache[1], size_x_cr, size_y_cr))
+		{
+			s->imgUV[0] = image_cache_get(&p_Vid->image_cache[1]);
+			s->imgUV[1] = image_cache_get(&p_Vid->image_cache[1]);
+		}
+    // NOTE(review): the fallback below allocates with the luma height size_y, not size_y_cr - confirm this is intended
+		if (!s->imgUV[0])
+			s->imgUV[0] = get_memImage(size_x_cr, size_y);
+		if (!s->imgUV[1])
+			s->imgUV[1] = get_memImage(size_x_cr, size_y);
+	}
+  
+  get_mem2Dshort (&(s->slice_id), size_y / MB_BLOCK_SIZE, size_x / MB_BLOCK_SIZE);
+  // motion storage is sized in units of BLOCK_SIZE
+  alloc_pic_motion(p_Vid, &s->motion, size_y / BLOCK_SIZE, size_x / BLOCK_SIZE);
+  // per-plane motion storage is only allocated when IS_INDEPENDENT(p_Vid) holds
+  if( IS_INDEPENDENT(p_Vid) )
+  {
+    for( nplane=0; nplane<MAX_PLANE; nplane++ )
+    {
+      alloc_pic_motion(p_Vid, &s->JVmotion[nplane], size_y / BLOCK_SIZE, size_x / BLOCK_SIZE);
+    }
+  }
+
+  s->structure=structure;
+  // sizes are recorded after the field halving above, together with their "minus one" variants
+  s->size_x = size_x;
+  s->size_y = size_y;
+  s->size_x_cr = size_x_cr;
+  s->size_y_cr = size_y_cr;
+  s->size_x_m1 = size_x - 1;
+  s->size_y_m1 = size_y - 1;
+  s->size_x_cr_m1 = size_x_cr - 1;
+  s->size_y_cr_m1 = size_y_cr - 1;
+  // all three picture links start out pointing at p_Vid->no_reference_picture
+  s->top_field    = p_Vid->no_reference_picture;
+  s->bottom_field = p_Vid->no_reference_picture;
+  s->frame        = p_Vid->no_reference_picture;
+
+  return s;
+}
+
+void out_storable_picture_add(VideoParameters *img, StorablePicture *pic)
+{
+	// Appends pic to the output queue and retains it; a full queue first releases its oldest entry.
+	if (!img->out_pictures || img->size_out_pictures == 0 || !pic)
+		return; // no queue, no capacity or nothing to queue: pic is neither stored nor retained
+
+	if (img->size_out_pictures == img->num_out_pictures)
+	{
+		// named differently from the pic parameter so that it does not shadow it
+		StorablePicture *oldest = 0;
+		out_storable_picture_get(img, &oldest);
+		if (oldest)
+			free_storable_picture(img, oldest);
+	}
+	img->out_pictures[img->num_out_pictures++] = pic;
+	pic->retain_count++;
+}
+
+void out_storable_picture_get(VideoParameters *img, StorablePicture **pic)
+{
+	*pic = 0; // reported when the queue is missing or empty
+	if (!img->out_pictures || !img->num_out_pictures)
+		return;
+
+	*pic = img->out_pictures[0]; // first entry; the remaining ones shift down by one slot
+	--img->num_out_pictures;
+	memmove(img->out_pictures, img->out_pictures + 1, img->num_out_pictures * sizeof(*img->out_pictures));
+}
+
+void out_storable_pictures_init(VideoParameters *img, size_t count)
+{
+	img->out_pictures = (StorablePicture **)calloc(count, sizeof(StorablePicture *)); // calloc(nmemb, size)
+	img->size_out_pictures = img->out_pictures ? count : 0; // a failed allocation leaves capacity 0
+	img->num_out_pictures = 0;
+}
+
+void out_storable_pictures_destroy(VideoParameters *img)
+{
+	// Releases every queued picture, then the array; the loop also tests out_pictures because out_storable_picture_get() makes no progress without it.
+	while (img->out_pictures && img->num_out_pictures)
+	{
+		StorablePicture *pic=0;
+		out_storable_picture_get(img, &pic);
+		free_storable_picture(img, pic); // tolerates a NULL pic
+	}
+	free(img->out_pictures);
+	img->out_pictures = 0;
+	img->size_out_pictures = 0;
+	img->num_out_pictures = 0;
+}
+
diff --git a/Src/h264dec/ldecod/src/strength_horiz.c b/Src/h264dec/ldecod/src/strength_horiz.c
new file mode 100644
index 00000000..bd719deb
--- /dev/null
+++ b/Src/h264dec/ldecod/src/strength_horiz.c
@@ -0,0 +1,659 @@
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+
+void GetStrengthNormal_Horiz(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p)
+{
+	/*
+	 * dir == 1 ("Horiz") filter strengths for one edge of macroblock MbQ.
+	 *
+	 * Fills Strength[0..MB_BLOCK_SIZE-1]:
+	 *  - SP/SI slice, or MbQ or its neighbour MbP is I4MB/I8MB/I16MB/IPCM:
+	 *    every entry is 4 when edge == 0 and p->structure == FRAME, else 3.
+	 *  - otherwise, for each group of BLOCK_SIZE entries: 2 when the
+	 *    cbp_blk[0] bit of the block on either side is set, else 0 or 1 from
+	 *    the reference picture / motion vector comparison in the loop.
+	 *
+	 * mv[0] differences are tested against the constant 4, mv[1] differences
+	 * against mvlimit. The neighbour MbP is located with p_Vid->getNeighbour0X.
+	 */
+	PixelPos pixMB;
+	byte     StrValue;
+	Macroblock *MbP;
+
+	assert(NUM_SLICE_TYPES == 5); // the >= comparison on the next line relies on this
+	if (p->slice_type>=SP_SLICE) // stands in for (p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE)
+	{ 
+		// Set strength to either 3 or 4 regardless of pixel position
+		StrValue = (edge == 0 && p->structure==FRAME) ? 4 : 3;
+		memset(&Strength[0], (byte) StrValue, MB_BLOCK_SIZE * sizeof(byte));
+	}
+	else
+	{    
+		VideoParameters *p_Vid = MbQ->p_Vid;
+		int yQ = edge < 16 ? edge - 1: 0; // y handed to getNeighbour0X: one row before the edge (0 for edge >= 16)
+
+		p_Vid->getNeighbour0X(MbQ, yQ, p_Vid->mb_size[IS_LUMA], &pixMB);
+
+		MbP = &(p_Vid->mb_data[pixMB.mb_addr]);
+
+		if (!(MbP->mb_type==I4MB||MbP->mb_type==I8MB||MbP->mb_type==I16MB||MbP->mb_type==IPCM||MbQ->mb_type==I4MB||MbQ->mb_type==I8MB||MbQ->mb_type==I16MB||MbQ->mb_type==IPCM))
+		{
+			PicMotionParams *motion = &p->motion;
+			h264_ref_t    ref_p0,ref_p1,ref_q0,ref_q1;
+			int      blkP, blkQ, idx;
+			int      blk_x, blk_y ;
+			int posx;
+
+			PicMotion **motion0 = motion->motion[LIST_0];
+			PicMotion **motion1 = motion->motion[LIST_1];
+			short    mb_x, mb_y;
+			const int blk_y2 = pixMB.pos_y >> 2; // motion-array row of the neighbour block
+			int cbp_pq, cbp_p, cbp_q;
+
+			posx = pixMB.pos_x >> 2; // motion-array column of the neighbour block
+			blkP = (pixMB.y & 0xFFFC); // neighbour y rounded down to a multiple of 4: shift for MbP's cbp bits
+			blkQ = ((yQ+1) & 0xFFFC); // same for the row on MbQ's side of the edge
+
+			cbp_p = (int)MbQ->cbp_blk[0]; // note: despite the name this holds MbQ's bits (shifted by blkQ below)
+			cbp_q = (int)MbP->cbp_blk[0]; // ... and this holds MbP's bits (shifted by blkP below)
+			cbp_pq = (((cbp_p >> blkQ) & 0xF) | ((cbp_q >> blkP) & 0xF)); // one bit per 4x4 block along the edge
+			if (cbp_pq == 0xF) // all four blocks have a bit set on one side or the other
+			{
+				memset(Strength, 2, 16);
+				return;
+				//StrValue = 2;
+			}
+
+			p_Vid->get_mb_block_pos (p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y);
+			mb_x <<= 2; // macroblock position -> 4x4 block units
+			mb_y <<= 2;
+
+			blk_x  = mb_x + (blkQ  & 3);
+			blk_y  = mb_y + (blkQ >> 2);
+
+			for( idx = 0 ; idx < MB_BLOCK_SIZE ; idx += BLOCK_SIZE, posx++, blkP++, blkQ++, blk_x++, cbp_pq>>=1) // one pass per 4x4 block; bit 0 of cbp_pq is always the current block
+			{
+				if (cbp_pq & 1)
+					StrValue = 2;
+				else
+				{
+					PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1;
+
+					motion_p0=&motion0[blk_y ][blk_x ]; // entries at MbQ's block position
+					motion_q0=&motion0[blk_y2][posx]; // entries at the neighbour's block position
+					motion_p1=&motion1[blk_y ][blk_x ];
+					motion_q1=&motion1[blk_y2][posx];
+					// neither side has its cbp bit set for this block:
+					// strength is 0 when both sides use the same reference picture(s) and the
+					// motion vector differences stay below the thresholds, otherwise 1
+					ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id;
+					ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+					ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+					ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+					if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) || ((ref_p0==ref_q1) && (ref_p1==ref_q0)))
+					{
+						// L0 and L1 reference pictures of p0 are different; q0 as well
+						if (ref_p0 != ref_p1)
+						{
+							// compare MV for the same reference picture
+							if (ref_p0 == ref_q0)
+							{
+								if (ref_p0 == UNDEFINED_REFERENCE) // no list 0 reference on either side: list 1 vectors only
+								{
+									StrValue =  (byte) (
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+								}
+								else if (ref_p1 == UNDEFINED_REFERENCE) // no list 1 reference on either side: list 0 vectors only
+								{
+									StrValue =  (byte) (
+										(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit));
+								}
+								else
+								{
+									StrValue =  (byte) (
+										(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+								}
+							}
+							else // references match crosswise (p0 with q1, p1 with q0)
+							{
+								StrValue =  (byte) (
+									(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+									(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+									(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+									(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+							}
+						}
+						else
+						{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+							StrValue = (byte) (
+								((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+								(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+								&& // 1 only when the straight pairing AND the crossed pairing both differ
+								((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+								(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+						}
+					}
+					else
+					{
+						StrValue = 1;
+					}
+				}
+				memset(&Strength[idx], (byte) StrValue, BLOCK_SIZE * sizeof(byte));
+			}
+		}
+		else
+		{
+			// Start with Strength=3. or Strength=4 for Mb-edge
+			StrValue = (edge == 0 && p->structure==FRAME) ? 4 : 3;
+			memset(&Strength[0], (byte) StrValue, MB_BLOCK_SIZE * sizeof(byte));
+		}      
+	}
+}
+
+
+void GetStrength_Horiz_YUV420(byte Strength[4], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p, PixelPos pixMB, Macroblock *MbP)
+{
+	/*
+	 * dir == 1 ("Horiz") filter strengths for one edge of macroblock MbQ,
+	 * one value per 4x4 block (Strength[0..3]). The neighbour position pixMB
+	 * and the neighbour macroblock MbP are supplied by the caller.
+	 *
+	 *  - SP/SI slice, or MbP or MbQ is I4MB/I8MB/I16MB/IPCM: all four
+	 *    entries are 4 when edge == 0 and p->structure == FRAME, else 3.
+	 *  - otherwise per block: 2 when the cbp_blk[0] bit of the block on
+	 *    either side is set, else 0 or 1 from the reference picture /
+	 *    motion vector comparison in the loop.
+	 *
+	 * mv[0] differences are tested against the constant 4, mv[1] differences
+	 * against mvlimit.
+	 */
+	byte     StrValue;
+
+	assert(NUM_SLICE_TYPES == 5); // the >= comparison on the next line relies on this
+	if (p->slice_type>=SP_SLICE) // stands in for (p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE)
+	{ 
+		// Set strength to either 3 or 4 regardless of pixel position
+		StrValue = (edge == 0 && p->structure==FRAME) ? 4 : 3;
+		memset(&Strength[0], (byte) StrValue, 4 * sizeof(byte));
+	}
+	else
+	{    
+		VideoParameters *p_Vid = MbQ->p_Vid; // only read by the _DEBUG consistency check below
+		int yQ = edge < 16 ? edge - 1: 0;
+
+		if (!(MbP->mb_type==I4MB||MbP->mb_type==I8MB||MbP->mb_type==I16MB||MbP->mb_type==IPCM||MbQ->mb_type==I4MB||MbQ->mb_type==I8MB||MbQ->mb_type==I16MB||MbQ->mb_type==IPCM))
+		{
+			PicMotionParams *motion = &p->motion;
+			h264_ref_t    ref_p0,ref_p1,ref_q0,ref_q1;
+			int      blkP, blkQ, idx;
+			int posx;
+
+			PicMotion **motion0 = motion->motion[LIST_0];
+			PicMotion **motion1 = motion->motion[LIST_1];
+
+			const int blk_y2 = pixMB.pos_y >> 2; // motion-array row of the neighbour block
+			int cbp_pq, cbp_p, cbp_q;
+
+			blkP = (pixMB.y & 0xFFFC); // neighbour y rounded down to a multiple of 4: shift for MbP's cbp bits
+			blkQ = ((yQ+1) & 0xFFFC); // same for the row on MbQ's side of the edge
+
+			cbp_p = (int)MbQ->cbp_blk[0]; // note: despite the name this holds MbQ's bits (shifted by blkQ below)
+			cbp_q = (int)MbP->cbp_blk[0]; // ... and this holds MbP's bits (shifted by blkP below)
+			cbp_pq = (((cbp_p >> blkQ) & 0xF) | ((cbp_q >> blkP) & 0xF)); // one bit per 4x4 block along the edge
+			if (cbp_pq == 0xF) // all four blocks have a bit set on one side or the other
+			{
+				memset(Strength, 2, 4);
+				return;
+				//StrValue = 2;
+			}
+			posx = pixMB.pos_x >> 2; // motion-array column, shared by both sides of the edge
+#ifdef _DEBUG
+			{
+				short    mb_x, mb_y;
+				get_mb_block_pos_normal(p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y);
+				assert((mb_x << 2) == posx);
+				assert(((mb_y << 2) + (blkQ >> 2)) == (blk_y2+1)); // MbQ's block row is the row right after the neighbour's
+			}
+#endif
+			//blk_y  = mb_y + (blkQ >> 2);
+
+			for( idx = 0 ; idx < MB_BLOCK_SIZE ; idx += BLOCK_SIZE, posx++, cbp_pq>>=1) // one pass per 4x4 block; bit 0 of cbp_pq is always the current block
+			{
+				if (cbp_pq & 1)
+					StrValue = 2;
+				else
+				{
+					PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1;
+
+
+					motion_p0=&motion0[blk_y2+1][posx]; // entries at MbQ's block position
+					motion_q0=&motion0[blk_y2][posx]; // entries at the neighbour's block position
+					motion_p1=&motion1[blk_y2+1][posx];
+					motion_q1=&motion1[blk_y2][posx];
+
+					// neither side has its cbp bit set for this block:
+					// strength is 0 when both sides use the same reference picture(s) and the
+					// motion vector differences stay below the thresholds, otherwise 1
+					ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id;
+					ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+					ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+					ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+					if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) || ((ref_p0==ref_q1) && (ref_p1==ref_q0)))
+					{
+						// L0 and L1 reference pictures of p0 are different; q0 as well
+						if (ref_p0 != ref_p1)
+						{
+							// compare MV for the same reference picture
+							if (ref_p0 == ref_q0)
+							{
+								if (ref_p0 == UNDEFINED_REFERENCE) // no list 0 reference on either side: list 1 vectors only
+								{
+									StrValue =  (byte) (
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+								}
+								else if (ref_p1 == UNDEFINED_REFERENCE) // no list 1 reference on either side: list 0 vectors only
+								{
+									StrValue =  (byte) (
+										(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit));
+								}
+								else
+								{
+									StrValue =  (byte) (
+										(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+								}
+							}
+							else // references match crosswise (p0 with q1, p1 with q0)
+							{
+								StrValue =  (byte) (
+									(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+									(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+									(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+									(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+							}
+						}
+						else
+						{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+							StrValue = (byte) (
+								((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+								(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+								&& // 1 only when the straight pairing AND the crossed pairing both differ
+								((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+								(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+						}
+					}
+					else
+					{
+						StrValue = 1;
+					}
+				}
+				Strength[idx/4] = StrValue; // idx advances in BLOCK_SIZE steps; idx/4 selects the output slot
+				//memset(&Strength[idx/4], (byte) StrValue,  sizeof(byte));
+			}
+		}
+		else
+		{
+			// Start with Strength=3. or Strength=4 for Mb-edge
+			StrValue = (edge == 0 && p->structure==FRAME) ? 4 : 3;
+			memset(&Strength[0], (byte) StrValue, 4 * sizeof(byte));
+		}      
+	}
+}
+
+/*
+ * MBAFF variant of the dir == 1 ("Horiz") strength computation.
+ *
+ * For each of the 16 luma columns idx of macroblock MbQ, the neighbour of the
+ * sample at (idx, yQ - 1) is looked up with getAffNeighbourPXLumaNB() and a
+ * strength value is stored in Strength[idx]:
+ *   - 4 or 3 for SP/SI slices or when either macroblock is intra coded
+ *     (4 only when edge == 0 and neither macroblock is a field macroblock),
+ *   - 2 when the cbp_blk[0] bit of the 4x4 block on either side is set,
+ *   - 1 when MbQ and the neighbour differ in mb_field (mixed mode edge),
+ *   - otherwise 0 or 1 from the reference picture / motion vector comparison
+ *     (mv[0] differences against 4, mv[1] differences against mvlimit).
+ *
+ * Side effect: p_Vid->mixedModeEdgeFlag is overwritten on every iteration.
+ */
+void GetStrengthMBAff_Horiz_YUV420(byte Strength[16], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p)
+{
+	// dir == 1
+	VideoParameters *p_Vid = MbQ->p_Vid;
+	PicMotionParams *motion = &p->motion;
+	PicMotion **motion0 = motion->motion[LIST_0];
+	PicMotion **motion1 = motion->motion[LIST_1];
+	PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1;
+	h264_ref_t  ref_p0,ref_p1,ref_q0,ref_q1;
+	Macroblock *MbP;
+	PixelPos pixP;
+	short  blkP, blkQ, idx;
+	short  blk_x, blk_x2, blk_y, blk_y2;
+	short  mb_x, mb_y;
+	int    xQ;
+	const int yQ = (edge < MB_BLOCK_SIZE ? edge : 1);
+
+	for( idx = 0; idx < 16; ++idx )
+	{
+		xQ = idx;
+
+		getAffNeighbourPXLumaNB(MbQ, xQ , yQ - 1, &pixP);
+		blkQ = (short) ((yQ & 0xFFFC) + (xQ >> 2));
+		blkP = (short) ((pixP.y & 0xFFFC) + (pixP.x >> 2));
+
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+		p_Vid->mixedModeEdgeFlag = (byte) (MbQ->mb_field != MbP->mb_field);
+
+		// Start with Strength=3. or Strength=4 for Mb-edge
+		Strength[idx] = (edge == 0 && !MbP->mb_field && !MbQ->mb_field) ? 4 : 3;
+
+		// SP/SI slices keep the initial value for every column
+		if ((p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE))
+			continue;
+
+		// an intra macroblock on either side also keeps the initial value
+		if (MbP->mb_type==I4MB || MbP->mb_type==I16MB || MbP->mb_type==I8MB || MbP->mb_type==IPCM
+			|| MbQ->mb_type==I4MB || MbQ->mb_type==I16MB || MbQ->mb_type==I8MB || MbQ->mb_type==IPCM)
+			continue;
+
+		if( ((MbQ->cbp_blk[0] &  ((int64)1 << blkQ )) != 0) || ((MbP->cbp_blk[0] &  ((int64)1 << blkP)) != 0) )
+		{
+			Strength[idx] = 2;
+			continue;
+		}
+
+		// if no coefs, but vector difference >= 1 set Strength=1
+		// if this is a mixed mode edge then one set of reference pictures will be frame and the
+		// other will be field
+		if (p_Vid->mixedModeEdgeFlag)
+		{
+			Strength[idx] = 1;
+			continue;
+		}
+
+		get_mb_block_pos_mbaff(p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y);
+		blk_y  = (short) ((mb_y<<2) + (blkQ >> 2));
+		blk_x  = (short) ((mb_x<<2) + (blkQ  & 3));
+		blk_y2 = (short) (pixP.pos_y >> 2);
+		blk_x2 = (short) (pixP.pos_x >> 2);
+
+		motion_p0=&motion0[blk_y ][blk_x ];   // entries at MbQ's block position
+		motion_q0=&motion0[blk_y2][blk_x2];   // entries at the neighbour's block position
+		motion_p1=&motion1[blk_y ][blk_x ];
+		motion_q1=&motion1[blk_y2][blk_x2];
+
+		ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id;
+		ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+		ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+		ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+
+		// different sets of reference pictures on the two sides
+		if ( !(((ref_p0==ref_q0) && (ref_p1==ref_q1)) ||
+			((ref_p0==ref_q1) && (ref_p1==ref_q0))) )
+		{
+			Strength[idx] = 1;
+			continue;
+		}
+
+		// L0 and L1 reference pictures of p0 are different; q0 as well
+		if (ref_p0 != ref_p1)
+		{
+			// compare MV for the same reference picture
+			if (ref_p0==ref_q0)
+			{
+				Strength[idx] =  (byte) (
+					(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+					(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+					(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+					(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+			}
+			else
+			{
+				Strength[idx] =  (byte) (
+					(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+					(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+					(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+					(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+			}
+		}
+		else
+		{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+			// 1 only when the straight pairing AND the crossed pairing both differ
+			Strength[idx] = (byte) (
+				((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+				(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+				(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+				(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+				&&
+				((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+				(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+				(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+				(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+		}
+	}
+}
+
+/* Returns non-zero when two motion entries differ by at least 4 in mv[0] or at least mvlimit in mv[1]. */
+static __forceinline int MotionVectorsDiffer(const PicMotion *a, const PicMotion *b, int mvlimit)
+{
+	return (abs(a->mv[0] - b->mv[0]) >= 4) || (abs(a->mv[1] - b->mv[1]) >= mvlimit);
+}
+
+/*
+ * Strength (0 or 1) between two blocks that have no cbp bit set.
+ *
+ * motion0 / motion1 point at the list 0 / list 1 entries of one block; the
+ * entries of the other block are read motion_stride elements further on.
+ * Returns 1 when the two blocks do not use the same set of reference
+ * pictures, otherwise the result of the motion vector comparison.
+ */
+static __forceinline uint8_t GetMotionStrength(PicMotion *motion0, PicMotion *motion1, int motion_stride, int mvlimit)
+{
+	const PicMotion *near_l0 = motion0;
+	const PicMotion *far_l0  = motion0 + motion_stride;
+	const PicMotion *near_l1 = motion1;
+	const PicMotion *far_l1  = motion1 + motion_stride;
+
+	// an entry with a negative ref_idx counts as "no reference"
+	const h264_ref_t ref_p0 = (near_l0->ref_idx < 0) ? UNDEFINED_REFERENCE : near_l0->ref_pic_id;
+	const h264_ref_t ref_p1 = (near_l1->ref_idx < 0) ? UNDEFINED_REFERENCE : near_l1->ref_pic_id;
+	const h264_ref_t ref_q0 = (far_l0->ref_idx < 0) ? UNDEFINED_REFERENCE : far_l0->ref_pic_id;
+	const h264_ref_t ref_q1 = (far_l1->ref_idx < 0) ? UNDEFINED_REFERENCE : far_l1->ref_pic_id;
+
+	if (ref_p0 == ref_q0 && ref_p1 == ref_q1)
+	{
+		if (ref_p0 == ref_p1)
+		{
+			// both lists use the same picture on both sides: the vectors must
+			// differ in the straight pairing as well as in the crossed pairing
+			return (byte) (
+				(MotionVectorsDiffer(near_l0, far_l0, mvlimit) || MotionVectorsDiffer(near_l1, far_l1, mvlimit))
+				&&
+				(MotionVectorsDiffer(near_l0, far_l1, mvlimit) || MotionVectorsDiffer(near_l1, far_l0, mvlimit)));
+		}
+
+		// the two lists use different pictures: compare list against same list
+		if (ref_p0 == UNDEFINED_REFERENCE)
+			return (byte) MotionVectorsDiffer(near_l1, far_l1, mvlimit);
+
+		if (ref_p1 == UNDEFINED_REFERENCE)
+			return (byte) MotionVectorsDiffer(near_l0, far_l0, mvlimit);
+
+		return (byte) (MotionVectorsDiffer(near_l0, far_l0, mvlimit) || MotionVectorsDiffer(near_l1, far_l1, mvlimit));
+	}
+
+	if (ref_p0 == ref_q1 && ref_p1 == ref_q0)
+	{
+		// references match crosswise: compare list 0 against list 1 and vice versa
+		return (byte) (MotionVectorsDiffer(near_l0, far_l1, mvlimit) || MotionVectorsDiffer(near_l1, far_l0, mvlimit));
+	}
+
+	return 1;
+}
+
+
+/*
+ * Fills one 4-entry strength row. Bit b of coded_bits (b = 0..3) set means
+ * block column b gets strength 2; otherwise the value comes from
+ * GetMotionStrength() on the motion entries at column b.
+ */
+static __forceinline void GetEdgeRowStrength(uint8_t row[4], int coded_bits, PicMotion *motion0, PicMotion *motion1, int motion_stride, int mvlimit)
+{
+	int blk;
+
+	for (blk = 0; blk < 4; blk++)
+	{
+		if (coded_bits & (1 << blk))
+			row[blk] = 2;
+		else
+			row[blk] = GetMotionStrength(&motion0[blk], &motion1[blk], motion_stride, mvlimit);
+	}
+}
+
+/*
+ * Computes the dir == 1 ("Horiz") strengths of all four edges of macroblock
+ * MbQ in one call. Strength[e][b] is the value for 4x4 block column b on
+ * edge e; edge 0 is the boundary towards MbP.
+ *
+ *   MbP      neighbour across edge 0, or NULL; with NULL, Strength[0] is 0.
+ *   pos_x, pos_y
+ *            indices of MbQ's first block in p->motion (used directly as
+ *            array indices).
+ *   luma_transform_size_8x8_flag
+ *            when non-zero, edges 1 and 3 get strength 0.
+ *
+ * Rows are written with memset()/byte stores rather than through an int32_t
+ * pointer, so the function does not depend on the alignment of Strength or
+ * on type-punned access to a uint8_t array.
+ *
+ * NOTE(review): consecutive rows of the motion array are assumed to lie
+ * p->size_x >> 2 entries apart in memory (the row pointer is advanced by that
+ * stride) - confirm against the motion array allocation.
+ */
+void GetStrength_Horiz_YUV420_All(uint8_t Strength[4][4], Macroblock *MbQ, int mvlimit, StorablePicture *p, int pos_x, int pos_y, Macroblock *MbP, int luma_transform_size_8x8_flag)
+{
+	// dir == 1
+	// 26112 == 0x6600: mb_type values 9, 10, 13 and 14. Presumably the intra
+	// types (I4MB, I16MB, I8MB, IPCM) - confirm against the mb_type enum.
+	const int intra_type_mask = 26112;
+	const int mb_edge_strength = (p->structure==FRAME) ? 4 : 3;
+	const int inner_odd_strength = luma_transform_size_8x8_flag ? 0 : 3;
+
+	assert(NUM_SLICE_TYPES == 5); // the next line assumes this
+	if ((p->slice_type>=SP_SLICE) //(p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE) )
+		|| ((1 << MbQ->mb_type) & intra_type_mask))
+	{
+		// Set strength to either 3 or 4 regardless of pixel position
+		memset(Strength[0], MbP ? mb_edge_strength : 0, 4);
+		memset(Strength[1], inner_odd_strength, 4);
+		memset(Strength[2], 3, 4);
+		memset(Strength[3], inner_odd_strength, 4);
+	}
+	else
+	{
+		PicMotionParams *motion = &p->motion;
+		int motion_stride = p->size_x>>2;
+		// start one row above MbQ when there is a neighbour, so that
+		// [0] / [motion_stride] straddle edge 0
+		PicMotion *motion0 = &motion->motion[LIST_0][pos_y-!!MbP][pos_x];
+		PicMotion *motion1 = &motion->motion[LIST_1][pos_y-!!MbP][pos_x];
+
+		const int cbp_q = (int)MbQ->cbp_blk[0];
+		// bit n set when block n or block n+4 (the block below it) has its bit set
+		const int cbp_row_pairs = cbp_q | (cbp_q >> 4);
+
+		// edge 0
+		if (!MbP)
+		{
+			memset(Strength[0], 0, 4);
+		}
+		else
+		{
+			if ((1 << MbP->mb_type) & intra_type_mask)
+			{
+				memset(Strength[0], mb_edge_strength, 4);
+			}
+			else
+			{
+				// bottom block row of MbP (bits 12..15) against top block row of MbQ (bits 0..3)
+				const int cbp_p = (int)MbP->cbp_blk[0];
+				GetEdgeRowStrength(Strength[0], ((cbp_p >> 12) | cbp_q) & 0xF, motion0, motion1, motion_stride, mvlimit);
+			}
+			motion0 += motion_stride;
+			motion1 += motion_stride;
+		}
+
+		// edge 1
+		if (luma_transform_size_8x8_flag)
+			memset(Strength[1], 0, 4);
+		else
+			GetEdgeRowStrength(Strength[1], cbp_row_pairs & 0xF, motion0, motion1, motion_stride, mvlimit);
+
+		motion0 += motion_stride;
+		motion1 += motion_stride;
+
+		// edge 2
+		GetEdgeRowStrength(Strength[2], (cbp_row_pairs >> 4) & 0xF, motion0, motion1, motion_stride, mvlimit);
+
+		motion0 += motion_stride;
+		motion1 += motion_stride;
+
+		// edge 3
+		if (luma_transform_size_8x8_flag)
+			memset(Strength[3], 0, 4);
+		else
+			GetEdgeRowStrength(Strength[3], (cbp_row_pairs >> 8) & 0xF, motion0, motion1, motion_stride, mvlimit);
+	}
+}
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod/src/strength_vert.c b/Src/h264dec/ldecod/src/strength_vert.c
new file mode 100644
index 00000000..89e545d4
--- /dev/null
+++ b/Src/h264dec/ldecod/src/strength_vert.c
@@ -0,0 +1,594 @@
+#include "global.h"
+#include "image.h"
+#include "mb_access.h"
+#include "loopfilter.h"
+
+void GetStrengthNormal_Vert(byte Strength[MB_BLOCK_SIZE], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p)
+{
+	/*
+	 * dir == 0 ("Vert") filter strengths for one edge of macroblock MbQ.
+	 *
+	 * Fills Strength[0..MB_BLOCK_SIZE-1]:
+	 *  - SP/SI slice, or MbQ or its neighbour MbP is I4MB/I8MB/I16MB/IPCM:
+	 *    every entry is 4 when edge == 0, else 3.
+	 *  - otherwise, for each group of BLOCK_SIZE entries: 2 when the
+	 *    cbp_blk[0] bit of the block on either side is set, else 0 or 1 from
+	 *    the reference picture / motion vector comparison in the loop.
+	 *
+	 * mv[0] differences are tested against the constant 4, mv[1] differences
+	 * against mvlimit. The neighbour MbP is located with p_Vid->getNeighbourX0.
+	 */
+	PixelPos pixP, pixMB;
+	byte     StrValue;
+	Macroblock *MbP;
+
+	if ((p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE) )
+	{ 
+		// Set strength to either 3 or 4 regardless of pixel position
+		StrValue = (edge == 0) ? 4 : 3;
+		memset(&Strength[0], (byte) StrValue, MB_BLOCK_SIZE * sizeof(byte));
+	}
+	else
+	{    
+		VideoParameters *p_Vid = MbQ->p_Vid;
+		int xQ = edge - 1; // x handed to getNeighbourX0: one column before the edge
+		int yQ = 0;
+
+		p_Vid->getNeighbourX0(MbQ, xQ, p_Vid->mb_size[IS_LUMA], &pixMB);
+		pixP = pixMB;
+		MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+
+		if (!(MbP->mb_type==I4MB||MbP->mb_type==I8MB||MbP->mb_type==I16MB||MbP->mb_type==IPCM||MbQ->mb_type==I4MB||MbQ->mb_type==I8MB||MbQ->mb_type==I16MB||MbQ->mb_type==IPCM))
+		{
+			PicMotionParams *motion = &p->motion;
+			h264_ref_t    ref_p0,ref_p1,ref_q0,ref_q1;
+			int      blkP, blkQ, idx;
+			int      blk_x, blk_x2, blk_y, blk_y2 ;
+
+			PicMotion **motion0 = motion->motion[LIST_0];
+			PicMotion **motion1 = motion->motion[LIST_1];
+			short    mb_x, mb_y;
+
+			p_Vid->get_mb_block_pos (p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y);
+			mb_x <<= 2; // macroblock position -> 4x4 block units
+			mb_y <<= 2;
+
+			xQ ++; // back to the edge column itself (xQ == edge)
+
+			for( idx = 0 ; idx < MB_BLOCK_SIZE ; idx += BLOCK_SIZE ) // one pass per 4x4 block down the edge
+			{
+				yQ = idx;
+
+				blkQ = (yQ & 0xFFFC) + (xQ >> 2); // cbp_blk bit index of the block on MbQ's side
+				blkP = (idx & 0xFFFC) + (pixP.x >> 2); // cbp_blk bit index of the block on MbP's side
+
+				if( ((MbQ->cbp_blk[0] & ((int64)1 << blkQ )) != 0) || ((MbP->cbp_blk[0] & ((int64)1 << blkP)) != 0) )
+					StrValue = 2;
+				else
+				{
+					// neither side has its cbp bit set for this block:
+					// strength is 0 when both sides use the same reference picture(s) and the
+					// motion vector differences stay below the thresholds, otherwise 1
+					PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1;
+					blk_y  = mb_y + (blkQ >> 2);
+					blk_x  = mb_x + (blkQ  & 3);
+					blk_y2 = (pixMB.pos_y + idx) >> 2;
+					blk_x2 = pixMB.pos_x >> 2;
+
+					motion_p0=&motion0[blk_y ][blk_x ]; // entries at MbQ's block position
+					motion_q0=&motion0[blk_y2][blk_x2]; // entries at the neighbour's block position
+					motion_p1=&motion1[blk_y ][blk_x ];
+					motion_q1=&motion1[blk_y2][blk_x2];
+					ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id;
+					ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+					ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+					ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+					if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) || ((ref_p0==ref_q1) && (ref_p1==ref_q0)))
+					{
+						// L0 and L1 reference pictures of p0 are different; q0 as well
+						if (ref_p0 != ref_p1)
+						{
+							// compare MV for the same reference picture
+							if (ref_p0 == ref_q0)
+							{
+								if (ref_p0 == UNDEFINED_REFERENCE) // no list 0 reference on either side: list 1 vectors only
+								{
+									StrValue =  (byte) (
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+								}
+								else if (ref_p1 == UNDEFINED_REFERENCE) // no list 1 reference on either side: list 0 vectors only
+								{
+									StrValue =  (byte) (
+										(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit));
+								}
+								else
+								{
+									StrValue =  (byte) (
+										(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+								}
+							}
+							else // references match crosswise (p0 with q1, p1 with q0)
+							{
+								StrValue =  (byte) (
+									(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+									(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+									(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+									(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+							}
+						}
+						else
+						{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+							StrValue = (byte) (
+								((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+								(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+								&& // 1 only when the straight pairing AND the crossed pairing both differ
+								((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+								(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+						}
+					}
+					else
+					{
+						StrValue = 1;
+					}
+				}
+				memset(&Strength[idx], (byte) StrValue, BLOCK_SIZE * sizeof(byte));
+			}
+		}
+		else
+		{
+			// Start with Strength=3. or Strength=4 for Mb-edge
+			StrValue = (edge == 0) ? 4 : 3;
+			memset(&Strength[0], (byte) StrValue, MB_BLOCK_SIZE * sizeof(byte));
+		}      
+	}
+}
+
+/*
+ * dir == 0 ("Vert") filter strengths for one edge of macroblock MbQ, one
+ * value per 4x4 block (Strength[0..3]). The neighbour position pixMB and the
+ * neighbour macroblock MbP are supplied by the caller; edge is used directly
+ * as a 4x4 block column index inside MbQ.
+ *
+ *  - SP/SI slice, or MbP or MbQ is I4MB/I8MB/I16MB/IPCM: all four entries
+ *    are 4 when edge == 0, else 3.
+ *  - otherwise per block: 2 when the cbp_blk[0] bit of the block on either
+ *    side is set, else 0 or 1 from the reference picture / motion vector
+ *    comparison (mv[0] differences against 4, mv[1] against mvlimit).
+ *
+ * The coded-block bits are tested with a plain shift and mask instead of the
+ * _bittest intrinsic, which expects a pointer to long rather than to int.
+ */
+void GetStrength_Vert_YUV420(uint8_t Strength[4], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p, PixelPos pixMB, Macroblock *MbP)
+{
+	// dir == 0
+	uint8_t     StrValue;
+
+	if ((p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE)
+		|| MbP->mb_type==I4MB||MbP->mb_type==I8MB||MbP->mb_type==I16MB||MbP->mb_type==IPCM
+		|| MbQ->mb_type==I4MB||MbQ->mb_type==I8MB||MbQ->mb_type==I16MB||MbQ->mb_type==IPCM)
+	{
+		// Set strength to either 3 or 4 regardless of pixel position
+		StrValue = (edge == 0) ? 4 : 3;
+		memset(Strength, StrValue, 4 * sizeof(uint8_t));
+	}
+	else
+	{
+		VideoParameters *p_Vid = MbQ->p_Vid;
+		PicMotionParams *motion = &p->motion;
+		PicMotion **motion0 = motion->motion[LIST_0];
+		PicMotion **motion1 = motion->motion[LIST_1];
+		h264_ref_t    ref_p0,ref_p1,ref_q0,ref_q1;
+		int      blkP, blkQ, idx;
+		int      blk_x2, blk_y, blk_y2;
+		short    mb_x, mb_y;
+		// unsigned copies so that the bit tests below are plain logical shifts
+		const unsigned int cbp_p = (unsigned int)(int)MbP->cbp_blk[0];
+		const unsigned int cbp_q = (unsigned int)(int)MbQ->cbp_blk[0];
+
+		get_mb_block_pos_normal(p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y);
+		mb_x <<= 2;   // macroblock position -> 4x4 block units
+		mb_y <<= 2;
+
+		mb_x += edge; // block column on MbQ's side of the edge
+		blkQ = edge;
+		blkP = pixMB.x >> 2;
+		blk_x2 = pixMB.pos_x >> 2;
+
+		// one pass per 4x4 block down the edge; the cbp bit index moves one block row per pass
+		for( idx = 0 ; idx < BLOCK_SIZE ; idx++,blkQ+=BLOCK_SIZE, blkP+=BLOCK_SIZE)
+		{
+			if (((cbp_p >> blkP) & 1u) || ((cbp_q >> blkQ) & 1u))
+				StrValue = 2;
+			else
+			{
+				// neither side has its cbp bit set for this block:
+				// strength is 0 when both sides use the same reference picture(s) and the
+				// motion vector differences stay below the thresholds, otherwise 1
+				PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1;
+				blk_y  = mb_y + idx;
+				blk_y2 = (pixMB.pos_y >> 2) + idx;
+
+				motion_p0=&motion0[blk_y ][mb_x ];   // entries at MbQ's block position
+				motion_q0=&motion0[blk_y2][blk_x2];  // entries at the neighbour's block position
+				motion_p1=&motion1[blk_y ][mb_x ];
+				motion_q1=&motion1[blk_y2][blk_x2];
+				ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id;
+				ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+				ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+				ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+
+				if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) || ((ref_p0==ref_q1) && (ref_p1==ref_q0)))
+				{
+					// L0 and L1 reference pictures of p0 are different; q0 as well
+					if (ref_p0 != ref_p1)
+					{
+						// compare MV for the same reference picture
+						if (ref_p0 == ref_q0)
+						{
+							if (ref_p0 == UNDEFINED_REFERENCE)
+							{
+								// no list 0 reference on either side: list 1 vectors only
+								StrValue =  (byte) (
+									(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+									(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+							}
+							else if (ref_p1 == UNDEFINED_REFERENCE)
+							{
+								// no list 1 reference on either side: list 0 vectors only
+								StrValue =  (byte) (
+									(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+									(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit));
+							}
+							else
+							{
+								StrValue =  (byte) (
+									(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+									(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+									(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+									(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+							}
+						}
+						else
+						{
+							// references match crosswise (p0 with q1, p1 with q0)
+							StrValue =  (byte) (
+								(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+								(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+								(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+								(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+						}
+					}
+					else
+					{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+						// 1 only when the straight pairing AND the crossed pairing both differ
+						StrValue = (byte) (
+							((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+							(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+							(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+							(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+							&&
+							((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+							(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+							(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+							(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+					}
+				}
+				else
+				{
+					StrValue = 1;
+				}
+			}
+			Strength[idx] = StrValue;
+		}
+	}
+}
+
+// Computes Strength[0..15], one value (0..4) per row idx, for the vertical edge at column 'edge' of MbQ in picture p; assumes YUV420, MB Aff
+void GetStrength_MBAff_Vert_YUV420(byte Strength[16], Macroblock *MbQ, int edge, int mvlimit, StorablePicture *p)
+{
+	// dir == 0 (vertical edge). SP/SI pictures and intra MbQ: every row gets 4 on the macroblock edge (edge == 0), otherwise 3
+	if ((p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE) 
+		|| (MbQ->mb_type==I4MB || MbQ->mb_type==I16MB || MbQ->mb_type==I8MB || MbQ->mb_type==IPCM))
+	{
+		memset(Strength,(edge == 0) ? 4 : 3, 16); 
+	}
+	else
+	{
+		short  blkP, blkQ, idx;
+		short  blk_x, blk_x2, blk_y, blk_y2 ;
+		h264_ref_t  ref_p0,ref_p1,ref_q0,ref_q1;
+		int    xQ, yQ;
+		short  mb_x, mb_y;
+		Macroblock *MbP;
+
+		PixelPos pixP;
+
+		PicMotionParams *motion = &p->motion;
+		PicMotion **motion0 = motion->motion[LIST_0];
+		PicMotion **motion1 = motion->motion[LIST_1];
+		xQ = edge;
+		for( idx = 0; idx < 16; ++idx )
+		{
+			VideoParameters *p_Vid = MbQ->p_Vid;
+			// the helper is asked for position (xQ - 1, yQ), i.e. one column left of the edge on row idx; presumably it fills pixP with that neighbour
+			yQ = idx;
+			getAffNeighbourXPLuma(MbQ, xQ - 1, yQ, &pixP);
+			blkQ = (short) ((yQ & 0xC) + (xQ >> 2)); // blkQ changes once every 4 loop iterations
+			blkP = (short) ((pixP.y & 0xFFFC) + (pixP.x >> 2));
+
+			MbP = &(p_Vid->mb_data[pixP.mb_addr]);
+			p_Vid->mixedModeEdgeFlag = (byte) (MbQ->mb_field != MbP->mb_field); // set when the mb_field flags of MbQ and MbP differ; stored in p_Vid on every iteration
+
+			// Start with Strength=3. or Strength=4 for Mb-edge
+			Strength[idx] = (edge == 0) ? 4 : 3;
+
+			if(  !(MbP->mb_type==I4MB || MbP->mb_type==I16MB || MbP->mb_type==I8MB || MbP->mb_type==IPCM))
+			{
+				if( ((MbQ->cbp_blk[0] &  ((int64)1 << blkQ )) != 0) || ((MbP->cbp_blk[0] &  ((int64)1 << blkP)) != 0) )
+					Strength[idx] = 2 ;
+				else
+				{
+					// if no coefs, but vector difference >= 1 set Strength=1
+					// if this is a mixed mode edge then one set of reference pictures will be frame and the
+					// other will be field
+					if (p_Vid->mixedModeEdgeFlag)
+					{
+						(Strength[idx] = 1);
+					}
+					else
+					{
+						get_mb_block_pos_mbaff(p_Vid->PicPos, MbQ->mbAddrX, &mb_x, &mb_y);
+						blk_y  = (short) ((mb_y<<2) + (blkQ >> 2));
+						blk_x  = (short) ((mb_x<<2) + (blkQ  & 3));
+						blk_y2 = (short) (pixP.pos_y >> 2);
+						blk_x2 = (short) (pixP.pos_x >> 2);
+						{
+							PicMotion *motion_p0, *motion_q0, *motion_p1, *motion_q1;
+							motion_p0=&motion0[blk_y ][blk_x ];
+							motion_q0=&motion0[blk_y2][blk_x2];
+							motion_p1=&motion1[blk_y ][blk_x ];
+							motion_q1=&motion1[blk_y2][blk_x2];
+							// NOTE: motion_p* is read at MbQ's block (blk_y, blk_x) and motion_q* at the neighbour's (blk_y2, blk_x2); the tests below are symmetric in p and q
+							ref_p0 = motion_p0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p0->ref_pic_id;
+							ref_q0 = motion_q0->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q0->ref_pic_id;
+							ref_p1 = motion_p1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_p1->ref_pic_id;
+							ref_q1 = motion_q1->ref_idx < 0 ? UNDEFINED_REFERENCE : motion_q1->ref_pic_id;
+
+							if ( ((ref_p0==ref_q0) && (ref_p1==ref_q1)) ||
+								((ref_p0==ref_q1) && (ref_p1==ref_q0)))
+							{
+								Strength[idx]=0;
+								// L0 and L1 reference pictures of p0 are different; q0 as well
+								if (ref_p0 != ref_p1)
+								{
+									// compare MV for the same reference picture
+									if (ref_p0==ref_q0)
+									{
+										Strength[idx] =  (byte) (
+											(abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+											(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit) ||
+											(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+											(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit));
+									}
+									else
+									{
+										Strength[idx] =  (byte) (
+											(abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+											(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+											(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+											(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit));
+									}
+								}
+								else
+								{ // L0 and L1 reference pictures of p0 are the same; q0 as well
+									// strength 1 only if both the straight (p0-q0, p1-q1) and the crossed (p0-q1, p1-q0) pairing show a large difference
+									Strength[idx] = (byte) (
+										((abs( motion_p0->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q0->mv[1]) >= mvlimit ) ||
+										(abs( motion_p1->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q1->mv[1]) >= mvlimit))
+										&&
+										((abs( motion_p0->mv[0] - motion_q1->mv[0]) >= 4) ||
+										(abs( motion_p0->mv[1] - motion_q1->mv[1]) >= mvlimit) ||
+										(abs( motion_p1->mv[0] - motion_q0->mv[0]) >= 4) ||
+										(abs( motion_p1->mv[1] - motion_q0->mv[1]) >= mvlimit)));
+								}
+							}
+							else
+							{
+								Strength[idx] = 1;
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+/* Non-zero when two motion entries differ by >= 4 in mv[0] or by >= mvlimit in mv[1]. */
+static __forceinline int MotionVectorsApart(const PicMotion *a, const PicMotion *b, int mvlimit)
+{
+	return (abs( a->mv[0] - b->mv[0]) >= 4) || (abs( a->mv[1] - b->mv[1]) >= mvlimit);
+}
+
+/*
+ * Motion-based edge strength (0 or 1) for one pair of neighbouring blocks.
+ *
+ *   motion0 - LIST_0 entries: [0] is the block on one side of the edge (p),
+ *             [1] the block on the other side (q)
+ *   motion1 - LIST_1 entries, laid out the same way
+ *   mvlimit - threshold for the mv[1] component (mv[0] always uses 4)
+ *
+ * Yields 1 when p and q do not use the same set of reference pictures, or
+ * when their vectors towards matching references are too far apart.
+ */
+static __forceinline uint8_t GetMotionStrength(PicMotion *motion0, PicMotion *motion1, int mvlimit)
+{
+	PicMotion *const p_l0 = &motion0[0];
+	PicMotion *const p_l1 = &motion1[0];
+	PicMotion *const q_l0 = &motion0[1];
+	PicMotion *const q_l1 = &motion1[1];
+
+	/* an entry with a negative ref_idx counts as "no reference" */
+	const h264_ref_t pic_p0 = p_l0->ref_idx < 0 ? UNDEFINED_REFERENCE : p_l0->ref_pic_id;
+	const h264_ref_t pic_p1 = p_l1->ref_idx < 0 ? UNDEFINED_REFERENCE : p_l1->ref_pic_id;
+	const h264_ref_t pic_q0 = q_l0->ref_idx < 0 ? UNDEFINED_REFERENCE : q_l0->ref_pic_id;
+	const h264_ref_t pic_q1 = q_l1->ref_idx < 0 ? UNDEFINED_REFERENCE : q_l1->ref_pic_id;
+
+	if (pic_p0 == pic_q0 && pic_p1 == pic_q1)
+	{
+		if (pic_p0 == pic_p1)
+		{
+			/* both lists name the same picture on both sides: strong only if
+			   neither the straight nor the crossed pairing of vectors matches */
+			return (uint8_t) (
+				(MotionVectorsApart(p_l0, q_l0, mvlimit) || MotionVectorsApart(p_l1, q_l1, mvlimit))
+				&&
+				(MotionVectorsApart(p_l0, q_l1, mvlimit) || MotionVectorsApart(p_l1, q_l0, mvlimit)));
+		}
+		if (pic_p0 == UNDEFINED_REFERENCE)
+		{
+			/* LIST_0 has no reference: only the LIST_1 vectors matter */
+			return (uint8_t) MotionVectorsApart(p_l1, q_l1, mvlimit);
+		}
+		if (pic_p1 == UNDEFINED_REFERENCE)
+		{
+			/* LIST_1 has no reference: only the LIST_0 vectors matter */
+			return (uint8_t) MotionVectorsApart(p_l0, q_l0, mvlimit);
+		}
+		/* two distinct references: compare list by list */
+		return (uint8_t) (MotionVectorsApart(p_l0, q_l0, mvlimit) || MotionVectorsApart(p_l1, q_l1, mvlimit));
+	}
+
+	if (pic_p0 == pic_q1 && pic_p1 == pic_q0)
+	{
+		/* same references but swapped between the lists: compare crosswise */
+		return (uint8_t) (MotionVectorsApart(p_l0, q_l1, mvlimit) || MotionVectorsApart(p_l1, q_l0, mvlimit));
+	}
+
+	/* the two blocks use different reference pictures */
+	return 1;
+}
+
+void GetStrength_Vert_YUV420_All(uint8_t Strength[4][4], Macroblock *MbQ, int mvlimit, StorablePicture *p, int pos_x, int pos_y, Macroblock *MbP, int luma_transform_size_8x8_flag)
+{
+	// dir == 0 (vertical edges): fills Strength[e][r] for edge e (between 4x4 block columns e-1 and e of MbQ) and block row r; MbP is the left neighbour or NULL; pos_x/pos_y index MbQ's first entry in p->motion
+	if ((p->slice_type>=SP_SLICE) //(p->slice_type==SP_SLICE)||(p->slice_type==SI_SLICE) )
+		|| ((1 << MbQ->mb_type) & 26112)) // 26112 == (1<<9)|(1<<10)|(1<<13)|(1<<14): mb_type bitmask — presumably I4MB, I16MB, I8MB, IPCM; confirm against the enum
+	{ 
+		// Edge 0 gets 4 (0 when MbP is NULL), edge 2 gets 3, edges 1 and 3 get 3 unless luma_transform_size_8x8_flag is set (then 0); each store writes a whole 4-byte row
+		*(int32_t *)(Strength[0]) = MbP?0x04040404:0;
+		*(int32_t *)(Strength[1]) = luma_transform_size_8x8_flag?0:0x03030303;
+		*(int32_t *)(Strength[2]) = 0x03030303;
+		*(int32_t *)(Strength[3]) = luma_transform_size_8x8_flag?0:0x03030303;
+	}
+	else
+	{ 
+		PicMotionParams *motion = &p->motion;
+		int motion_stride = p->size_x >> 2; // step from one motion row to the next — assumes the rows are contiguous with size_x/4 entries each; TODO confirm
+		PicMotion *motion0 = &motion->motion[LIST_0][pos_y][pos_x];
+		PicMotion *motion1 = &motion->motion[LIST_1][pos_y][pos_x];
+		int cbp_q=(int)MbQ->cbp_blk[0];		// as used below: bit (4*row + col) is tested for the 4x4 block at (row, col)
+
+		// edge 0: boundary with MbP; tests MbP's block column 3 against MbQ's block column 0
+		if (!MbP)
+		{
+			*(int32_t *)(Strength[0]) = 0;
+		}
+		else if ((1 << MbP->mb_type) & 26112)
+		{
+			*(int32_t *)(Strength[0]) = 0x04040404;
+		}
+		else
+		{
+			int cbp_p = (int)MbP->cbp_blk[0];
+			if( ((cbp_q & (1 << 0 )) != 0) || ((cbp_p & (1 << (3))) != 0) )
+				Strength[0][0] = 2;
+			else
+				Strength[0][0] = GetMotionStrength(&motion0[0-1], &motion1[0-1], mvlimit); // index -1 is the entry left of pos_x; the helper compares it with the next entry
+
+			if( ((cbp_q & (1 << 4 )) != 0) || ((cbp_p & (1 << (4 + 3))) != 0) )
+				Strength[0][1] = 2;
+			else
+				Strength[0][1] = GetMotionStrength(&motion0[motion_stride-1], &motion1[motion_stride-1], mvlimit);
+
+			if( ((cbp_q & (1 << 8 )) != 0) || ((cbp_p & (1 << (8 + 3))) != 0) )
+				Strength[0][2] = 2;
+			else
+				Strength[0][2] = GetMotionStrength(&motion0[2*motion_stride-1], &motion1[2*motion_stride-1], mvlimit);
+
+			if( ((cbp_q & (1 << 12 )) != 0) || ((cbp_p & (1 << (12 + 3))) != 0) )
+				Strength[0][3] = 2;
+			else
+				Strength[0][3] = GetMotionStrength(&motion0[3*motion_stride-1], &motion1[3*motion_stride-1], mvlimit);
+		}
+
+		// edge 1: between block columns 0 and 1 (mask 3 = bits of columns 0 and 1); zeroed when luma_transform_size_8x8_flag is set
+		if (luma_transform_size_8x8_flag)
+		{
+			*(int32_t *)(Strength[1]) = 0;
+		}
+		else
+		{
+			if (cbp_q & (3 << 0))
+				Strength[1][0] = 2;
+			else
+				Strength[1][0] = GetMotionStrength(&motion0[0], &motion1[0], mvlimit);
+
+			if (cbp_q & (3 << 4))
+				Strength[1][1] = 2;
+			else
+				Strength[1][1] = GetMotionStrength(&motion0[1*motion_stride], &motion1[1*motion_stride], mvlimit);
+
+			if (cbp_q & (3 << 8))
+				Strength[1][2] = 2;
+			else
+				Strength[1][2] = GetMotionStrength(&motion0[2*motion_stride], &motion1[2*motion_stride], mvlimit);
+
+			if (cbp_q & (3 << 12))
+				Strength[1][3] = 2;
+			else
+				Strength[1][3] = GetMotionStrength(&motion0[3*motion_stride], &motion1[3*motion_stride], mvlimit);
+		}
+
+		// edge 2: between block columns 1 and 2 (mask 6 = bits of columns 1 and 2); always evaluated
+		if (cbp_q & (6 << 0))
+			Strength[2][0] = 2;
+		else
+			Strength[2][0] = GetMotionStrength(&motion0[1], &motion1[1], mvlimit);
+
+		if (cbp_q & (6 << 4))
+			Strength[2][1] = 2;
+		else
+			Strength[2][1] = GetMotionStrength(&motion0[motion_stride+1], &motion1[motion_stride+1], mvlimit);
+
+		if (cbp_q & (6 << 8))
+			Strength[2][2] = 2;
+		else
+			Strength[2][2] = GetMotionStrength(&motion0[2*motion_stride+1], &motion1[2*motion_stride+1], mvlimit);
+
+		if (cbp_q & (6 << 12))
+			Strength[2][3] = 2;
+		else
+			Strength[2][3] = GetMotionStrength(&motion0[3*motion_stride+1], &motion1[3*motion_stride+1], mvlimit);
+
+		// edge 3: between block columns 2 and 3 (mask 0xC = bits of columns 2 and 3); zeroed when luma_transform_size_8x8_flag is set
+		if (luma_transform_size_8x8_flag)
+		{
+			*(int32_t *)(Strength[3]) = 0;
+		}
+		else
+		{
+			if (cbp_q & (0xC << 0))
+				Strength[3][0] = 2;
+			else
+				Strength[3][0] = GetMotionStrength(&motion0[2], &motion1[2], mvlimit);
+
+			if (cbp_q & (0xC << 4))
+				Strength[3][1] = 2;
+			else
+				Strength[3][1] = GetMotionStrength(&motion0[motion_stride+2], &motion1[motion_stride+2], mvlimit);
+
+			if (cbp_q & (0xC << 8))
+				Strength[3][2] = 2;
+			else
+				Strength[3][2] = GetMotionStrength(&motion0[2*motion_stride+2], &motion1[2*motion_stride+2], mvlimit);
+
+			if (cbp_q & (0xC << 12))
+				Strength[3][3] = 2;
+			else
+				Strength[3][3] = GetMotionStrength(&motion0[3*motion_stride+2], &motion1[3*motion_stride+2], mvlimit);
+		}
+	}
+}
diff --git a/Src/h264dec/ldecod/src/transform8x8.c b/Src/h264dec/ldecod/src/transform8x8.c
new file mode 100644
index 00000000..1072a6d1
--- /dev/null
+++ b/Src/h264dec/ldecod/src/transform8x8.c
@@ -0,0 +1,696 @@
+
+/*!
+***************************************************************************
+* \file transform8x8.c
+*
+* \brief
+*    8x8 transform functions
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Yuri Vatis
+*    - Jan Muenster
+*
+* \date
+*    12. October 2003
+**************************************************************************
+*/
+
+#include "global.h"
+
+#include "image.h"
+#include "mb_access.h"
+#include "elements.h"
+#include "transform8x8.h"
+#include "transform.h"
+#include "quant.h"
+#include <emmintrin.h>
+
+static void inverse8x8_sse2(h264_short_8x8block_row_t *block)
+{
+		__m128i a0, a1, a2, a3;
+		__m128i p0, p1, p2, p3, p4, p5 ,p6, p7;  
+		__m128i b0, b1, b2, b3, b4, b5, b6, b7;
+		__m128i r0, r1, r2, r3, r4, r5, r6, r7;
+		// In-place 8x8 inverse integer transform of 'block' using 8 x int16 lanes; rows are accessed with aligned loads/stores, so each block row must be 16-byte aligned
+		// Horizontal pass: load the eight coefficient rows
+			b0 = _mm_load_si128((__m128i *)(block[0])); 
+			b1 = _mm_load_si128((__m128i *)(block[1])); 
+			b2 = _mm_load_si128((__m128i *)(block[2])); 
+			b3 = _mm_load_si128((__m128i *)(block[3])); 
+			b4 = _mm_load_si128((__m128i *)(block[4])); 
+			b5 = _mm_load_si128((__m128i *)(block[5])); 
+			b6 = _mm_load_si128((__m128i *)(block[6])); 
+			b7 = _mm_load_si128((__m128i *)(block[7])); 
+
+			/* transpose 8x8 so that pN holds coefficient N of every row (one row per lane) */
+			r0 = _mm_unpacklo_epi16(b0, b2); 
+			r1 = _mm_unpacklo_epi16(b1, b3); 
+			r2 = _mm_unpackhi_epi16(b0, b2); 
+			r3 = _mm_unpackhi_epi16(b1, b3); 
+			r4 = _mm_unpacklo_epi16(b4, b6); 
+			r5 = _mm_unpacklo_epi16(b5, b7); 
+			r6 = _mm_unpackhi_epi16(b4, b6); 
+			r7 = _mm_unpackhi_epi16(b5, b7); 
+
+			b0 = _mm_unpacklo_epi16(r0, r1); 
+			b1 = _mm_unpackhi_epi16(r0, r1); 
+			b2 = _mm_unpacklo_epi16(r2, r3); 
+			b3 = _mm_unpackhi_epi16(r2, r3); 
+			b4 = _mm_unpacklo_epi16(r4, r5); 
+			b5 = _mm_unpackhi_epi16(r4, r5); 
+			b6 = _mm_unpacklo_epi16(r6, r7); 
+			b7 = _mm_unpackhi_epi16(r6, r7); 
+
+			p0 = _mm_unpacklo_epi64(b0, b4);
+			p1 = _mm_unpackhi_epi64(b0, b4);
+			p2 = _mm_unpacklo_epi64(b1, b5);
+			p3 = _mm_unpackhi_epi64(b1, b5);
+			p4 = _mm_unpacklo_epi64(b2, b6);
+			p5 = _mm_unpackhi_epi64(b2, b6);
+			p6 = _mm_unpacklo_epi64(b3, b7);
+			p7 = _mm_unpackhi_epi64(b3, b7);
+
+			/* 1-D inverse transform of all eight rows at once (16-bit wrapping arithmetic) */
+						a0 = _mm_add_epi16(p0, p4); // p0 + p4
+			a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+			r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+			a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+			r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+			a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+			b0 =  _mm_add_epi16(a0, a3); // a0 + a3;
+			b2 =  _mm_sub_epi16(a1, a2);  // a1 - a2;
+			b4 =  _mm_add_epi16(a1, a2);    // a1 + a2;
+			b6 =  _mm_sub_epi16(a0, a3);  // a0 - a3;
+
+			//-p3 + p5 - p7 - (p7 >> 1);    
+			r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+			a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+			a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+			a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+
+			//p1 + p7 - p3 - (p3 >> 1);    
+			r0 =  _mm_srai_epi16(p3, 1); // (p3 >> 1)
+			a1 = _mm_add_epi16(p1, p7); // p1 + p7
+			a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+			a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+			// -p1 + p7 + p5 + (p5 >> 1);   
+			r0 =  _mm_srai_epi16(p5, 1); // (p5 >> 1)
+			a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+			a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+			a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+			// p3 + p5 + p1 + (p1 >> 1);
+			a3 = _mm_add_epi16(p3, p5); // p3+p5
+			a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+			p1 = _mm_srai_epi16(p1, 1); // p1 >> 1 (p1 is overwritten: its original value is not read again in this pass)
+			a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
+
+			r0 = _mm_srai_epi16(a3, 2); // a3>>2
+			b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);    
+			r0 = _mm_srai_epi16(a2, 2); // a2>>2
+			b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);    
+			a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+			b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+			a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+			b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);                
+
+			p0 = _mm_add_epi16(b0, b7); // b0 + b7;
+			p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+			p2 = _mm_add_epi16(b4, b3); // b4 + b3;
+			p3 = _mm_add_epi16(b6, b1); // b6 + b1;
+			p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+			p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+			p6 = _mm_add_epi16(b2, b5); // b2 + b5;
+			p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+						/* transpose back so that pN holds row N again */
+			r0 = _mm_unpacklo_epi16(p0, p2); 
+			r1 = _mm_unpacklo_epi16(p1, p3); 
+			r2 = _mm_unpackhi_epi16(p0, p2); 
+			r3 = _mm_unpackhi_epi16(p1, p3); 
+			r4 = _mm_unpacklo_epi16(p4, p6); 
+			r5 = _mm_unpacklo_epi16(p5, p7); 
+			r6 = _mm_unpackhi_epi16(p4, p6); 
+			r7 = _mm_unpackhi_epi16(p5, p7); 
+
+			b0 = _mm_unpacklo_epi16(r0, r1); 
+			b1 = _mm_unpackhi_epi16(r0, r1); 
+			b2 = _mm_unpacklo_epi16(r2, r3); 
+			b3 = _mm_unpackhi_epi16(r2, r3); 
+			b4 = _mm_unpacklo_epi16(r4, r5); 
+			b5 = _mm_unpackhi_epi16(r4, r5); 
+			b6 = _mm_unpacklo_epi16(r6, r7); 
+			b7 = _mm_unpackhi_epi16(r6, r7); 
+
+			p0 = _mm_unpacklo_epi64(b0, b4);
+			p1 = _mm_unpackhi_epi64(b0, b4);
+			p2 = _mm_unpacklo_epi64(b1, b5);
+			p3 = _mm_unpackhi_epi64(b1, b5);
+			p4 = _mm_unpacklo_epi64(b2, b6);
+			p5 = _mm_unpackhi_epi64(b2, b6);
+			p6 = _mm_unpacklo_epi64(b3, b7);
+			p7 = _mm_unpackhi_epi64(b3, b7);
+
+
+		/*  Vertical pass: same 1-D transform, now combining rows (all eight columns at once)  */
+
+			a0 = _mm_add_epi16(p0, p4); // p0 + p4
+			a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+			r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+			a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+			r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+			a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+			b0 =  _mm_add_epi16(a0, a3); // a0 + a3;
+			b2 =  _mm_sub_epi16(a1, a2);  // a1 - a2;
+			b4 =  _mm_add_epi16(a1, a2);    // a1 + a2;
+			b6 =  _mm_sub_epi16(a0, a3);  // a0 - a3;
+
+			//-p3 + p5 - p7 - (p7 >> 1);    
+			r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+			a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+			a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+			a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+
+			//p1 + p7 - p3 - (p3 >> 1);    
+			r0 =  _mm_srai_epi16(p3, 1); // (p3 >> 1)
+			a1 = _mm_add_epi16(p1, p7); // p1 + p7
+			a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+			a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+			// -p1 + p7 + p5 + (p5 >> 1);   
+			r0 =  _mm_srai_epi16(p5, 1); // (p5 >> 1)
+			a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+			a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+			a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+			// p3 + p5 + p1 + (p1 >> 1);
+			r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
+			a3 = _mm_add_epi16(p3, p5); // p3+p5
+			a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+			a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
+
+			r0 = _mm_srai_epi16(a3, 2); // a3>>2
+			b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);    
+			r0 = _mm_srai_epi16(a2, 2); // a2>>2
+			b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);    
+			a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+			b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+			a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+			b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);                
+
+			r0 = _mm_add_epi16(b0, b7); // b0 + b7;
+			_mm_store_si128((__m128i *)(block[0]), r0);
+			r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+			_mm_store_si128((__m128i *)(block[1]), r1);
+			r2 = _mm_add_epi16(b4, b3); // b4 + b3;
+			_mm_store_si128((__m128i *)(block[2]), r2);
+			r3 = _mm_add_epi16(b6, b1); // b6 + b1;
+			_mm_store_si128((__m128i *)(block[3]), r3);
+			r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+			_mm_store_si128((__m128i *)(block[4]), r4);
+			r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+			_mm_store_si128((__m128i *)(block[5]), r5);
+			r6 = _mm_add_epi16(b2, b5); // b2 + b5;
+			_mm_store_si128((__m128i *)(block[6]), r6);
+			r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+			_mm_store_si128((__m128i *)(block[7]), r7);
+}
+
+/*
+ * One 8-point pass of the inverse 8x8 integer transform.
+ *
+ * Reads eight coefficients from in[] and writes the eight results to
+ * out[]. All arithmetic is done in int.
+ */
+static void inverse8x8_pass(const int in[8], int out[8])
+{
+	/* even part: inputs 0, 4, 2, 6 */
+	const int even0 = in[0] + in[4];
+	const int even1 = in[0] - in[4];
+	const int even2 = in[6] - (in[2] >> 1);
+	const int even3 = in[2] + (in[6] >> 1);
+
+	/* odd part: inputs 1, 3, 5, 7 */
+	const int odd0 = -in[3] + in[5] - in[7] - (in[7] >> 1);
+	const int odd1 =  in[1] + in[7] - in[3] - (in[3] >> 1);
+	const int odd2 = -in[1] + in[7] + in[5] + (in[5] >> 1);
+	const int odd3 =  in[3] + in[5] + in[1] + (in[1] >> 1);
+
+	/* recombine each half */
+	const int s0 = even0 + even3;
+	const int s2 = even1 - even2;
+	const int s4 = even1 + even2;
+	const int s6 = even0 - even3;
+	const int s1 = odd0 + (odd3 >> 2);
+	const int s3 = odd1 + (odd2 >> 2);
+	const int s5 = odd2 - (odd1 >> 2);
+	const int s7 = odd3 - (odd0 >> 2);
+
+	/* final butterfly: outputs k and 7-k share one even/odd pair */
+	out[0] = s0 + s7;
+	out[7] = s0 - s7;
+	out[1] = s2 - s5;
+	out[6] = s2 + s5;
+	out[2] = s4 + s3;
+	out[5] = s4 - s3;
+	out[3] = s6 + s1;
+	out[4] = s6 - s1;
+}
+
+/*
+ * In-place inverse 8x8 integer transform of a coefficient block.
+ *   block - eight rows of eight coefficients, overwritten with the result
+ *
+ * The 1-D pass runs over every row first, then over every column; row
+ * results are written back to block (narrowed to its element type) before
+ * the column pass reads them. No rounding or final shift happens here.
+ */
+static void inverse8x8(h264_short_8x8block_row_t *block)
+{
+	int src[8], dst[8];
+	int n;
+
+	/* pass 1: each row */
+	for (n = 0; n < BLOCK_SIZE_8x8; n++)
+	{
+		src[0] = block[n][0];
+		src[1] = block[n][1];
+		src[2] = block[n][2];
+		src[3] = block[n][3];
+		src[4] = block[n][4];
+		src[5] = block[n][5];
+		src[6] = block[n][6];
+		src[7] = block[n][7];
+		inverse8x8_pass(src, dst);
+		block[n][0] = dst[0];
+		block[n][1] = dst[1];
+		block[n][2] = dst[2];
+		block[n][3] = dst[3];
+		block[n][4] = dst[4];
+		block[n][5] = dst[5];
+		block[n][6] = dst[6];
+		block[n][7] = dst[7];
+	}
+
+	/* pass 2: each column */
+	for (n = 0; n < BLOCK_SIZE_8x8; n++)
+	{
+		src[0] = block[0][n];
+		src[1] = block[1][n];
+		src[2] = block[2][n];
+		src[3] = block[3][n];
+		src[4] = block[4][n];
+		src[5] = block[5][n];
+		src[6] = block[6][n];
+		src[7] = block[7][n];
+		inverse8x8_pass(src, dst);
+		block[0][n] = dst[0];
+		block[1][n] = dst[1];
+		block[2][n] = dst[2];
+		block[3][n] = dst[3];
+		block[4][n] = dst[4];
+		block[5][n] = dst[5];
+		block[6][n] = dst[6];
+		block[7][n] = dst[7];
+	}
+}
+#if defined(_DEBUG) || defined(_M_IX64) // NOTE(review): MSVC defines _M_X64 for x64 targets, not _M_IX64, so as written only _DEBUG enables this function — confirm intent
+void itrans8x8_sse2(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
+{
+		__m128i a0, a1, a2, a3;
+		__m128i p0, p1, p2, p3, p4, p5 ,p6, p7;  
+		__m128i b0, b1, b2, b3, b4, b5, b6, b7;
+		__m128i r0, r1, r2, r3, r4, r5, r6, r7;
+		__m128i const32, zero;
+		__declspec(align(32)) static const int16_t c32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
+		__m128i pred0, pred1;
+		// Inverse 8x8 transform of 'block' (only read here), then rec = saturate_u8(pred + ((x + 32) >> 6)) written to mb_rec rows 0..7 at column pos_x; block rows are read with aligned 16-byte loads
+		const32 = _mm_load_si128((const __m128i *)c32);
+		zero = _mm_setzero_si128();
+
+				// Horizontal pass: load the eight coefficient rows
+			b0 = _mm_load_si128((__m128i *)(block[0])); 
+			b1 = _mm_load_si128((__m128i *)(block[1])); 
+			b2 = _mm_load_si128((__m128i *)(block[2])); 
+			b3 = _mm_load_si128((__m128i *)(block[3])); 
+			b4 = _mm_load_si128((__m128i *)(block[4])); 
+			b5 = _mm_load_si128((__m128i *)(block[5])); 
+			b6 = _mm_load_si128((__m128i *)(block[6])); 
+			b7 = _mm_load_si128((__m128i *)(block[7])); 
+
+			/* transpose 8x8 so that pN holds coefficient N of every row (one row per lane) */
+			r0 = _mm_unpacklo_epi16(b0, b2); 
+			r1 = _mm_unpacklo_epi16(b1, b3); 
+			r2 = _mm_unpackhi_epi16(b0, b2); 
+			r3 = _mm_unpackhi_epi16(b1, b3); 
+			r4 = _mm_unpacklo_epi16(b4, b6); 
+			r5 = _mm_unpacklo_epi16(b5, b7); 
+			r6 = _mm_unpackhi_epi16(b4, b6); 
+			r7 = _mm_unpackhi_epi16(b5, b7); 
+
+			b0 = _mm_unpacklo_epi16(r0, r1); 
+			b1 = _mm_unpackhi_epi16(r0, r1); 
+			b2 = _mm_unpacklo_epi16(r2, r3); 
+			b3 = _mm_unpackhi_epi16(r2, r3); 
+			b4 = _mm_unpacklo_epi16(r4, r5); 
+			b5 = _mm_unpackhi_epi16(r4, r5); 
+			b6 = _mm_unpacklo_epi16(r6, r7); 
+			b7 = _mm_unpackhi_epi16(r6, r7); 
+
+			p0 = _mm_unpacklo_epi64(b0, b4);
+			p1 = _mm_unpackhi_epi64(b0, b4);
+			p2 = _mm_unpacklo_epi64(b1, b5);
+			p3 = _mm_unpackhi_epi64(b1, b5);
+			p4 = _mm_unpacklo_epi64(b2, b6);
+			p5 = _mm_unpackhi_epi64(b2, b6);
+			p6 = _mm_unpacklo_epi64(b3, b7);
+			p7 = _mm_unpackhi_epi64(b3, b7);
+
+			/* 1-D inverse transform of all eight rows at once (16-bit wrapping arithmetic) */
+						a0 = _mm_add_epi16(p0, p4); // p0 + p4
+			a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+			r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+			a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+			r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+			a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+			b0 =  _mm_add_epi16(a0, a3); // a0 + a3;
+			b2 =  _mm_sub_epi16(a1, a2);  // a1 - a2;
+			b4 =  _mm_add_epi16(a1, a2);    // a1 + a2;
+			b6 =  _mm_sub_epi16(a0, a3);  // a0 - a3;
+
+			//-p3 + p5 - p7 - (p7 >> 1);    
+			r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+			a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+			a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+			a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+
+			//p1 + p7 - p3 - (p3 >> 1);    
+			r0 =  _mm_srai_epi16(p3, 1); // (p3 >> 1)
+			a1 = _mm_add_epi16(p1, p7); // p1 + p7
+			a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+			a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+			// -p1 + p7 + p5 + (p5 >> 1);   
+			r0 =  _mm_srai_epi16(p5, 1); // (p5 >> 1)
+			a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+			a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+			a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+			// p3 + p5 + p1 + (p1 >> 1);
+			a3 = _mm_add_epi16(p3, p5); // p3+p5
+			a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+			p1 = _mm_srai_epi16(p1, 1); // p1 >> 1 (p1 is overwritten: its original value is not read again in this pass)
+			a3 = _mm_add_epi16(a3, p1); //p3 + p5 + p1 + (p1 >> 1)
+
+			r0 = _mm_srai_epi16(a3, 2); // a3>>2
+			b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);    
+			r0 = _mm_srai_epi16(a2, 2); // a2>>2
+			b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);    
+			a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+			b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+			a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+			b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);                
+
+			p0 = _mm_add_epi16(b0, b7); // b0 + b7;
+			p1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+			p2 = _mm_add_epi16(b4, b3); // b4 + b3;
+			p3 = _mm_add_epi16(b6, b1); // b6 + b1;
+			p4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+			p5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+			p6 = _mm_add_epi16(b2, b5); // b2 + b5;
+			p7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+						/* transpose back so that pN holds row N again */
+			r0 = _mm_unpacklo_epi16(p0, p2); 
+			r1 = _mm_unpacklo_epi16(p1, p3); 
+			r2 = _mm_unpackhi_epi16(p0, p2); 
+			r3 = _mm_unpackhi_epi16(p1, p3); 
+			r4 = _mm_unpacklo_epi16(p4, p6); 
+			r5 = _mm_unpacklo_epi16(p5, p7); 
+			r6 = _mm_unpackhi_epi16(p4, p6); 
+			r7 = _mm_unpackhi_epi16(p5, p7); 
+
+			b0 = _mm_unpacklo_epi16(r0, r1); 
+			b1 = _mm_unpackhi_epi16(r0, r1); 
+			b2 = _mm_unpacklo_epi16(r2, r3); 
+			b3 = _mm_unpackhi_epi16(r2, r3); 
+			b4 = _mm_unpacklo_epi16(r4, r5); 
+			b5 = _mm_unpackhi_epi16(r4, r5); 
+			b6 = _mm_unpacklo_epi16(r6, r7); 
+			b7 = _mm_unpackhi_epi16(r6, r7); 
+
+			p0 = _mm_unpacklo_epi64(b0, b4);
+			p1 = _mm_unpackhi_epi64(b0, b4);
+			p2 = _mm_unpacklo_epi64(b1, b5);
+			p3 = _mm_unpackhi_epi64(b1, b5);
+			p4 = _mm_unpacklo_epi64(b2, b6);
+			p5 = _mm_unpackhi_epi64(b2, b6);
+			p6 = _mm_unpacklo_epi64(b3, b7);
+			p7 = _mm_unpackhi_epi64(b3, b7);
+
+
+		/*  Vertical pass: same 1-D transform, now combining rows (all eight columns at once)  */
+
+			a0 = _mm_add_epi16(p0, p4); // p0 + p4
+			a1 = _mm_sub_epi16(p0, p4); // p0 - p4
+			r0 = _mm_srai_epi16(p2, 1); // p2 >> 1
+			a2 = _mm_sub_epi16(p6, r0); // p6 - (p2 >> 1)
+			r0 = _mm_srai_epi16(p6, 1); // p6 >> 1
+			a3 = _mm_add_epi16(p2, r0); //p2 + (p6 >> 1)
+
+			b0 =  _mm_add_epi16(a0, a3); // a0 + a3;
+			b2 =  _mm_sub_epi16(a1, a2);  // a1 - a2;
+			b4 =  _mm_add_epi16(a1, a2);    // a1 + a2;
+			b6 =  _mm_sub_epi16(a0, a3);  // a0 - a3;
+
+			//-p3 + p5 - p7 - (p7 >> 1);    
+			r0 = _mm_srai_epi16(p7, 1); // p7 >> 1
+			a0 = _mm_sub_epi16(p5, p3); // p5 - p3
+			a0 = _mm_sub_epi16(a0, p7); // (-p3 + p5) - p7
+			a0 = _mm_sub_epi16(a0, r0); // (-p3 + p5 - p7) - (p7 >> 1)
+
+			//p1 + p7 - p3 - (p3 >> 1);    
+			r0 =  _mm_srai_epi16(p3, 1); // (p3 >> 1)
+			a1 = _mm_add_epi16(p1, p7); // p1 + p7
+			a1 = _mm_sub_epi16(a1, p3); // (p1 + p7) - p3
+			a1 = _mm_sub_epi16(a1, r0); // (p1 + p7 - p3) - (p3>>1)
+
+			// -p1 + p7 + p5 + (p5 >> 1);   
+			r0 =  _mm_srai_epi16(p5, 1); // (p5 >> 1)
+			a2 = _mm_sub_epi16(p7, p1); // p7 - p1
+			a2 = _mm_add_epi16(a2, p5); // -p1 + p7 + p5
+			a2 = _mm_add_epi16(a2, r0); // (-p1 + p7 + p5) + (p5 >> 1)
+
+			// p3 + p5 + p1 + (p1 >> 1);
+			r0 = _mm_srai_epi16(p1, 1); // p1 >> 1
+			a3 = _mm_add_epi16(p3, p5); // p3+p5
+			a3 = _mm_add_epi16(a3, p1); // p3 + p5 + p1
+			a3 = _mm_add_epi16(a3, r0); //p3 + p5 + p1 + (p1 >> 1)
+
+			r0 = _mm_srai_epi16(a3, 2); // a3>>2
+			b1 = _mm_add_epi16(a0, r0); //a0 + (a3>>2);    
+			r0 = _mm_srai_epi16(a2, 2); // a2>>2
+			b3 = _mm_add_epi16(a1, r0); // a1 + (a2>>2);    
+			a1 = _mm_srai_epi16(a1, 2); // all done with a1, so this is safe
+			b5 = _mm_sub_epi16(a2, a1); //a2 - (a1>>2);
+			a0 = _mm_srai_epi16(a0, 2); // all done with a0, so this is safe
+			b7 = _mm_sub_epi16(a3, a0); //a3 - (a0>>2);                
+
+			r0 = _mm_add_epi16(b0, b7); // b0 + b7;
+			r1 = _mm_sub_epi16(b2, b5); // b2 - b5;
+			r2 = _mm_add_epi16(b4, b3); // b4 + b3;
+			r3 = _mm_add_epi16(b6, b1); // b6 + b1;
+			r4 = _mm_sub_epi16(b6, b1); // b6 - b1;
+			r5 = _mm_sub_epi16(b4, b3); // b4 - b3;
+			r6 = _mm_add_epi16(b2, b5); // b2 + b5;
+			r7 = _mm_sub_epi16(b0, b7); // b0 - b7;
+
+			// rN now holds transformed row N; unlike inverse8x8_sse2 nothing is stored back into block
+			// add in prediction values, two rows per step (8 bytes are loaded from each mb_pred row)
+			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[0][pos_x]));
+			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[1][pos_x]));
+			// (x + 32) >> 6, with a saturating add
+			r0 = _mm_adds_epi16(r0, const32);
+			r0 = _mm_srai_epi16(r0, 6);
+			r1 = _mm_adds_epi16(r1, const32);
+			r1 = _mm_srai_epi16(r1, 6);
+			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+			pred0 = _mm_adds_epi16(pred0, r0);
+			pred1 = _mm_adds_epi16(pred1, r1);
+
+			pred0 = _mm_packus_epi16(pred0, pred1); // saturate to 0..255: low 8 bytes = first row, high 8 bytes = second row
+
+			// store
+			_mm_storel_epi64((__m128i *)(&mb_rec[0][pos_x]), pred0);
+			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+			pred0 = _mm_srli_si128(pred0, 8);
+			_mm_storel_epi64((__m128i *)(&mb_rec[1][pos_x]), pred0);
+
+			/* --- rows 2 and 3 --- */
+
+			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[2][pos_x]));
+			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[3][pos_x]));
+			// (x + 32) >> 6
+			r2 = _mm_adds_epi16(r2, const32);
+			r2 = _mm_srai_epi16(r2, 6);
+			r3 = _mm_adds_epi16(r3, const32);
+			r3 = _mm_srai_epi16(r3, 6);
+			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+			pred0 = _mm_adds_epi16(pred0, r2);
+			pred1 = _mm_adds_epi16(pred1, r3);
+
+			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+			// store
+			_mm_storel_epi64((__m128i *)(&mb_rec[2][pos_x]), pred0);
+			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+			pred0 = _mm_srli_si128(pred0, 8);
+			_mm_storel_epi64((__m128i *)(&mb_rec[3][pos_x]), pred0);
+
+			/* --- rows 4 and 5 --- */
+
+			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[4][pos_x]));
+			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[5][pos_x]));
+			// (x + 32) >> 6
+			r4 = _mm_adds_epi16(r4, const32);
+			r4 = _mm_srai_epi16(r4, 6);
+			r5 = _mm_adds_epi16(r5, const32);
+			r5 = _mm_srai_epi16(r5, 6);
+			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+			pred0 = _mm_adds_epi16(pred0, r4);
+			pred1 = _mm_adds_epi16(pred1, r5);
+
+			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+			// store
+			_mm_storel_epi64((__m128i *)(&mb_rec[4][pos_x]), pred0);
+			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+			pred0 = _mm_srli_si128(pred0, 8);
+			_mm_storel_epi64((__m128i *)(&mb_rec[5][pos_x]), pred0);
+
+			/* --- rows 6 and 7 --- */
+
+			pred0 = _mm_loadl_epi64((__m128i *)(&mb_pred[6][pos_x]));
+			pred1 = _mm_loadl_epi64((__m128i *)(&mb_pred[7][pos_x]));
+			// (x + 32) >> 6
+			r6 = _mm_adds_epi16(r6, const32);
+			r6 = _mm_srai_epi16(r6, 6);
+			r7 = _mm_adds_epi16(r7, const32);
+			r7 = _mm_srai_epi16(r7, 6);
+			pred0 = _mm_unpacklo_epi8(pred0, zero); // convert to short
+			pred1 = _mm_unpacklo_epi8(pred1, zero); // convert to short
+			pred0 = _mm_adds_epi16(pred0, r6);
+			pred1 = _mm_adds_epi16(pred1, r7);
+
+			pred0 = _mm_packus_epi16(pred0, pred1); // convert to unsigned char
+
+			// store
+			_mm_storel_epi64((__m128i *)&mb_rec[6][pos_x], pred0);
+			// TODO: if mb_pred was converted to 4 8x8 blocks, we could store more easily.
+			pred0 = _mm_srli_si128(pred0, 8);
+			_mm_storel_epi64((__m128i *)&mb_rec[7][pos_x], pred0);
+}
+
+#endif
+
+#ifdef _M_IX86
+// MMX reconstruction of one 8x8 block: rec = saturate_u8(pred + ((residual + 32) >> 6)) for rows 0..7 at byte column pos_x; residuals are 16-bit, all 8 pixels of a row are produced per iteration
+static void sample_reconstruct8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *mb_rres8, int pos_x)
+{
+			__asm
+		{
+			mov esi, 8 // loop 8 times
+			// eax -> output row, ebx -> prediction row, ecx -> residual row; every pointer advances 16 bytes per row
+			mov eax, mb_rec
+			add eax, pos_x
+			mov ebx, mb_pred
+			add ebx, pos_x
+			mov ecx, mb_rres8
+
+			// mm0 : constant value 32 in each 16-bit lane
+			mov edx, 0x00200020
+			movd mm0, edx
+			punpckldq	mm0, mm0
+			// mm7:  zero
+			pxor mm7, mm7
+
+loop8:
+			movq	mm1, MMWORD PTR 0[ecx] // residuals 0..3
+			movq	mm3, MMWORD PTR 8[ecx] // residuals 4..7
+			paddsw mm1, mm0 // rres + 32 (saturating, as itrans8x8_sse2 does)
+			paddsw mm3, mm0
+			psraw mm1, 6 // (rres + 32) >> 6
+			psraw mm3, 6
+			movq mm2, MMWORD PTR 0[ebx] // 8 prediction pixels
+			movq mm4, mm2
+			punpcklbw mm2, mm7			// pixels 0..3 from unsigned char to short
+			punpckhbw mm4, mm7			// pixels 4..7 from unsigned char to short
+			paddsw mm2, mm1 // pred_row + rres_row
+			paddsw mm4, mm3
+			packuswb mm2, mm4 // clip to 0..255 and pack all 8 pixels (previously the upper 4 were written as zero)
+			movq MMWORD PTR 0[eax], mm2
+			add eax, 16
+			add ebx, 16
+			add ecx, 16
+			sub	esi, 1
+			jne	loop8
+			emms
+		}
+}
+#endif
+
+// Portable C reconstruction (called by itrans8x8_c): rec = iClip1(max_imgpel_value, pred + rshift_rnd_sf(residual, DQ_BITS_8)) for the 8x8 block starting at column pos_x
+static void sample_reconstruct8x8(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *mb_rres8, int pos_x, int max_imgpel_value)
+{
+	int row, col;
+	for (row = 0; row < 8; ++row)
+	{
+		const imgpel *pred = mb_pred[row] + pos_x;
+		const short *resid = mb_rres8[row];
+		imgpel *dst = mb_rec[row] + pos_x;
+
+		for (col = 0; col < 8; ++col)
+			dst[col] = (imgpel) iClip1(max_imgpel_value, pred[col] + rshift_rnd_sf(resid[col], DQ_BITS_8)); 
+	}
+}
+/*!
+***********************************************************************
+* \brief
+*    Inverse 8x8 transformation (x86 MMX path): runs the scalar inverse8x8() on 'block', then builds mb_rec at column pos_x from mb_pred and the transformed block
+***********************************************************************
+*/ 
+#ifdef _M_IX86
+void itrans8x8_mmx(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
+{
+	inverse8x8((h264_short_8x8block_row_t *)block); // NOTE: casts away const — the caller's coefficient block is overwritten with the transform result
+	sample_reconstruct8x8_mmx(mb_rec, mb_pred, block, pos_x);
+}
+#endif
+
+void itrans8x8_c(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
+{	// Portable C path: inverse 8x8 transform of 'block', then reconstruction into mb_rec at column pos_x with a maximum sample value of 255
+	inverse8x8((h264_short_8x8block_row_t *)block); // NOTE: casts away const — the caller's coefficient block is overwritten with the transform result
+	sample_reconstruct8x8(mb_rec, mb_pred, block, pos_x, 255);
+}
+
+void itrans8x8_lossless(h264_imgpel_macroblock_row_t *mb_rec, const h264_imgpel_macroblock_row_t *mb_pred, const h264_short_8x8block_row_t *block, int pos_x)
+{
+	// Lossless path: no inverse transform; each sample of the 8x8 block at column pos_x is iClip1(255, residual + prediction)
+	int row, col;
+	for (row = 0; row < 8; ++row)
+	{
+		const imgpel *pred = mb_pred[row] + pos_x;
+		const short *resid = block[row];
+		imgpel *dst = mb_rec[row] + pos_x;
+		for (col = 0; col < 8; ++col)
+			dst[col] = (imgpel) iClip1(255, (resid[col] + (long)pred[col])); 
+	}
+}
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod/src/vlc.c b/Src/h264dec/ldecod/src/vlc.c
new file mode 100644
index 00000000..397a7d08
--- /dev/null
+++ b/Src/h264dec/ldecod/src/vlc.c
@@ -0,0 +1,1769 @@
+/*!
+************************************************************************
+* \file vlc.c
+*
+* \brief
+*    VLC support functions
+*
+* \author
+*    Main contributors (see contributors.h for copyright, address and affiliation details)
+*    - Inge Lille-Langøy               <inge.lille-langoy@telenor.com>
+*    - Detlev Marpe                    <marpe@hhi.de>
+*    - Gabi Blaettermann
+************************************************************************
+*/
+#include "contributors.h"
+
+#include "global.h"
+#include "vlc.h"
+#include "elements.h"
+#include "optim.h"
+#include <emmintrin.h>
+
+// A little trick to avoid those horrible #if TRACE all over the source code
+// With TRACE enabled, the label s is copied into symbol.tracestring, so the macro
+// can only be used where a SyntaxElement named `symbol` is in scope.
+// Without TRACE it expands to nothing.
+// NOTE(review): strncpy leaves the destination unterminated when s has
+// TRACESTRING_SIZE or more characters.
+#if TRACE
+#define SYMTRACESTRING(s) strncpy(symbol.tracestring,s,TRACESTRING_SIZE)
+#else
+#define SYMTRACESTRING(s) // do nothing
+#endif
+
+static int ShowBits (const uint8_t buffer[],int totbitoffset,int bitcount, int numbits);
+
+// Note that all NA values are filled with 0
+
+/*!
+*************************************************************************************
+* \brief
+*    ue_v, reads an ue(v) syntax element; readSyntaxElement_VLC advances the
+*    bitstream's frame_bitoffset past the codeword
+*
+* \param tracestring
+*    the string for the trace file (only used when TRACE is enabled)
+*
+* \param bitstream
+*    the stream to be read from
+*
+* \return
+*    the value of the coded syntax element, or 0 when no codeword could be read
+*
+*************************************************************************************
+*/
+int ue_v (const char *tracestring, Bitstream *bitstream)
+{
+	SyntaxElement symbol;
+
+	//assert (bitstream->streamBuffer != NULL);
+	symbol.mapping = linfo_ue;   // Mapping rule
+	// readSyntaxElement_VLC returns before calling the mapping when the codeword
+	// cannot be read; give value1 a defined value for that case instead of
+	// returning an uninitialized field.
+	symbol.value1 = 0;
+	SYMTRACESTRING(tracestring);
+	readSyntaxElement_VLC (&symbol, bitstream);
+	return symbol.value1;
+}
+
+
+/*!
+*************************************************************************************
+* \brief
+*    se_v, reads an se(v) syntax element; readSyntaxElement_VLC advances the
+*    bitstream's frame_bitoffset past the codeword
+*
+* \param tracestring
+*    the string for the trace file (only used when TRACE is enabled)
+*
+* \param bitstream
+*    the stream to be read from
+*
+* \return
+*    the signed value of the coded syntax element, or 0 when no codeword could
+*    be read
+*
+*************************************************************************************
+*/
+int se_v (const char *tracestring, Bitstream *bitstream)
+{
+	SyntaxElement symbol;
+
+	//assert (bitstream->streamBuffer != NULL);
+	symbol.mapping = linfo_se;   // Mapping rule: signed integer
+	// readSyntaxElement_VLC returns before calling the mapping when the codeword
+	// cannot be read; give value1 a defined value for that case instead of
+	// returning an uninitialized field.
+	symbol.value1 = 0;
+	SYMTRACESTRING(tracestring);
+	readSyntaxElement_VLC (&symbol, bitstream);
+	return symbol.value1;
+}
+
+
+/*!
+*************************************************************************************
+* \brief
+*    u_v, reads an u(v) syntax element by forwarding to readSyntaxElement_FLC
+*
+* \param LenInBits
+*    length of the syntax element
+*
+* \param tracestring
+*    the string for the trace file (not used by this implementation)
+*
+* \param bitstream
+*    the stream to be read from
+*
+* \return
+*    the value of the coded syntax element
+*
+*************************************************************************************
+*/
+int u_v (int LenInBits, const char*tracestring, Bitstream *bitstream)
+{
+	const int value = readSyntaxElement_FLC(bitstream, LenInBits);
+
+	return value;
+}
+
+/*!
+*************************************************************************************
+* \brief
+*    i_v, reads an i(v) syntax element: a LenInBits wide field read through
+*    readSyntaxElement_FLC whose top bit is extended as the sign
+*
+* \param LenInBits
+*    length of the syntax element
+*
+* \param tracestring
+*    the string for the trace file (not used by this implementation)
+*
+* \param bitstream
+*    the stream to be read from
+*
+* \return
+*    the value of the coded syntax element (can be negative)
+*
+*************************************************************************************
+*/
+int i_v (int LenInBits, const char*tracestring, Bitstream *bitstream)
+{
+	const int raw = readSyntaxElement_FLC (bitstream, LenInBits);
+	// isolate the top bit of the field; negating it yields a mask with every
+	// bit from that position upwards set
+	const int sign_bit = raw & (1 << (LenInBits - 1));
+
+	return -sign_bit | raw;
+}
+
+
+/*!
+*************************************************************************************
+* \brief
+*    u_1, reads an u(1) syntax element (a one bit u_v read)
+*
+* \param tracestring
+*    the string for the trace file
+*
+* \param bitstream
+*    the stream to be read from
+*
+* \return
+*    the value of the coded syntax element, converted to Boolean
+*
+*************************************************************************************
+*/
+Boolean u_1 (const char *tracestring, Bitstream *bitstream)
+{
+	const int bit = u_v (1, tracestring, bitstream);
+
+	return (Boolean) bit;
+}
+
+
+
+/*!
+************************************************************************
+* \brief
+*    mapping rule for ue(v) syntax elements
+* \par Input:
+*    length (total codeword bits) and info (the suffix bits)
+* \par Output:
+*    number in the code table: 2^(len/2) + info - 1, stored in *value1;
+*    dummy is not touched
+************************************************************************
+*/
+void linfo_ue(int len, int info, int *value1, int *dummy)
+{
+	//assert ((len >> 1) < 32);
+	// unsigned arithmetic, as in the mapping for se(v) below
+	const unsigned int prefix = (unsigned int) 1 << (len >> 1);
+	const unsigned int code_num = prefix + (unsigned int) (info) - 1;
+
+	*value1 = (int) code_num;
+}
+
+/*!
+************************************************************************
+* \brief
+*    mapping rule for se(v) syntax elements
+* \par Input:
+*    length and info
+* \par Output:
+*    signed value in *value1: with n = 2^(len/2) + info - 1 the magnitude
+*    is (n + 1) / 2, positive for odd n and negative for even n;
+*    dummy is not touched
+************************************************************************
+*/
+void linfo_se(int len,  int info, int *value1, int *dummy)
+{
+	//assert ((len >> 1) < 32);
+	const unsigned int code_num = ((unsigned int) 1 << (len >> 1)) + (unsigned int) info - 1;
+	const int magnitude = (int) ((code_num + 1) >> 1);
+
+	// lsb is the sign bit: clear means negative
+	*value1 = (code_num & 0x01) ? magnitude : -magnitude;
+}
+
+
+/*!
+************************************************************************
+* \par Input:
+*    length and info
+* \par Output:
+*    cbp (intra), looked up as NCBP[1][code number][0] where the code
+*    number comes from linfo_ue
+************************************************************************
+*/
+void linfo_cbp_intra_normal(int len,int info,int *cbp, int *dummy)
+{
+	int code_num;
+
+	linfo_ue(len, info, &code_num, dummy);
+	// NOTE(review): code_num is not range checked before indexing NCBP
+	*cbp = NCBP[1][code_num][0];
+}
+
+
+/*!
+************************************************************************
+* \par Input:
+*    length and info
+* \par Output:
+*    cbp (intra), looked up as NCBP[0][code number][0] where the code
+*    number comes from linfo_ue
+************************************************************************
+*/
+void linfo_cbp_intra_other(int len,int info,int *cbp, int *dummy)
+{
+	int code_num;
+
+	linfo_ue(len, info, &code_num, dummy);
+	// NOTE(review): code_num is not range checked before indexing NCBP
+	*cbp = NCBP[0][code_num][0];
+}
+
+/*!
+************************************************************************
+* \par Input:
+*    length and info
+* \par Output:
+*    cbp (inter), looked up as NCBP[1][code number][1] where the code
+*    number comes from linfo_ue
+************************************************************************
+*/
+void linfo_cbp_inter_normal(int len,int info,int *cbp, int *dummy)
+{
+	int code_num;
+
+	linfo_ue(len, info, &code_num, dummy);
+	// NOTE(review): code_num is not range checked before indexing NCBP
+	*cbp = NCBP[1][code_num][1];
+}
+
+/*!
+************************************************************************
+* \par Input:
+*    length and info
+* \par Output:
+*    cbp (inter), looked up as NCBP[0][code number][1] where the code
+*    number comes from linfo_ue
+************************************************************************
+*/
+void linfo_cbp_inter_other(int len,int info,int *cbp, int *dummy)
+{
+	int code_num;
+
+	linfo_ue(len, info, &code_num, dummy);
+	// NOTE(review): code_num is not range checked before indexing NCBP
+	*cbp = NCBP[0][code_num][1];
+}
+
+/*!
+************************************************************************
+* \par Input:
+*    length and info
+* \par Output:
+*    level, run. Codewords of up to 9 bits are looked up in NTAB1,
+*    longer ones are computed from LEVRUN1. The lsb of info is the sign
+*    of the level; a one bit codeword is EOB and yields level 0.
+************************************************************************
+*/
+void linfo_levrun_inter(int len, int info, int *level, int *irun)
+{
+	//assert (((len >> 1) - 5) < 32);
+	const int negative = ((info & 0x01) == 1);
+
+	if (len > 9)                          // too long for the array, compute directly
+	{
+		*irun  = (info & 0x1e) >> 1;
+		*level = LEVRUN1[*irun] + (info >> 5) + ( 1 << ((len >> 1) - 5));
+	}
+	else
+	{
+		const int row = imax(0,(len >> 1)-1);
+		const int col = info >> 1;
+
+		*level = NTAB1[row][col][0];
+		*irun  = NTAB1[row][col][1];
+	}
+
+	if (negative)
+		*level = -*level;                   // make sign
+
+	if (len == 1) // EOB
+		*level = 0;
+}
+
+
+/*!
+************************************************************************
+* \par Input:
+*    length and info
+* \par Output:
+*    level, run. Codewords of up to 5 bits are looked up in NTAB3,
+*    longer ones are computed from LEVRUN3. The lsb of info is the sign
+*    of the level; a one bit codeword is EOB and yields level 0.
+************************************************************************
+*/
+void linfo_levrun_c2x2(int len, int info, int *level, int *irun)
+{
+	const int negative = ((info & 0x01) == 1);
+
+	if (len > 5)                          // too long for the array, compute directly
+	{
+		*irun  = (info & 0x06) >> 1;
+		*level = LEVRUN3[*irun] + (info >> 3) + (1 << ((len >> 1) - 3));
+	}
+	else
+	{
+		const int row = imax(0, (len >> 1) - 1);
+		const int col = info >> 1;
+
+		*level = NTAB3[row][col][0];
+		*irun  = NTAB3[row][col][1];
+	}
+
+	if (negative)
+		*level = -*level;                   // make sign
+
+	if (len == 1) // EOB
+		*level = 0;
+}
+
+/*!
+************************************************************************
+* \brief
+*    read next UVLC codeword from the bitstream and map it to the
+*    corresponding syntax element through sym->mapping
+* \return
+*    1 on success (frame_bitoffset advanced by the codeword length, which
+*    is also stored in sym->len); -1 when GetVLCSymbol reports failure,
+*    in which case the mapping is not called
+************************************************************************
+*/
+int readSyntaxElement_VLC(SyntaxElement *sym, Bitstream *currStream)
+{
+	int info;
+
+	sym->len = GetVLCSymbol (currStream->streamBuffer,
+	                         currStream->frame_bitoffset,
+	                         &info,
+	                         currStream->bitstream_length);
+
+	if (sym->len != -1)
+	{
+		currStream->frame_bitoffset += sym->len;
+		sym->mapping(sym->len, info, &(sym->value1), &(sym->value2));
+		return 1;
+	}
+
+	return -1;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    read next UVLC codeword from the bitstream of the given data
+*    partition; see readSyntaxElement_VLC for the return value
+************************************************************************
+*/
+int readSyntaxElement_UVLC(SyntaxElement *sym, struct datapartition *dp)
+{
+	Bitstream *stream = dp->bitstream;
+
+	return readSyntaxElement_VLC(sym, stream);
+}
+
+/*!
+************************************************************************
+* \brief
+*    read next VLC codeword for 4x4 Intra Prediction Mode and
+*    map it to the corresponding Intra Prediction Direction
+* \return
+*    1 on success, -1 when GetVLCSymbol_IntraMode reports failure.
+*    On success sym->value1 is -1 for a one bit codeword, otherwise the
+*    info bits of the codeword.
+************************************************************************
+*/
+int readSyntaxElement_Intra4x4PredictionMode(SyntaxElement *sym, Bitstream *currStream)
+{
+	int info;
+
+	sym->len = GetVLCSymbol_IntraMode (currStream->streamBuffer,
+	                                   currStream->frame_bitoffset,
+	                                   &info,
+	                                   currStream->bitstream_length);
+	if (sym->len == -1)
+		return -1;
+
+	currStream->frame_bitoffset += sym->len;
+
+	if (sym->len == 1)
+		sym->value1 = -1;
+	else
+		sym->value1 = info;
+
+#if TRACE
+	tracebits2(sym->tracestring, sym->len, sym->value1);
+#endif
+
+	return 1;
+}
+
+/*
+ * Reads an intra prediction mode codeword at bit position totbitoffset.
+ * A set first bit is a one bit codeword (*info = 0, returns 1). Otherwise the
+ * three bits that follow are returned in *info and the length is 4.
+ * Returns -1 when the first bit is clear and the byte position is at or past
+ * bytecount.
+ * NOTE(review): the first byte is read before byteoffset is compared with
+ * bytecount, and the 4 bit path also reads the following byte.
+ */
+int GetVLCSymbol_IntraMode (const uint8_t buffer[],int totbitoffset,int *info, int bytecount)
+{
+	const int byteoffset = (totbitoffset >> 3);          // byte from start of buffer
+	const int bitoffset  = (7 - (totbitoffset & 0x07));  // bit from start of byte
+	const uint8_t *cur_byte = buffer + byteoffset;
+
+	// first bit set: shortest codeword
+	if (*cur_byte & (0x01 << bitoffset))
+	{
+		*info = 0;
+		return 1;
+	}
+
+	if (byteoffset >= bytecount)
+		return -1;
+
+	{
+		// 16 bit window; shift the first bit out of the top, keep the next three
+		int window = (cur_byte[0] << 8) + cur_byte[1];
+
+		window <<= 8 - bitoffset;
+		window &= 0xFFFF;
+		window >>= 13;
+
+		*info = window;
+	}
+	return 4;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    test if bit buffer contains only stop bit
+*
+* \param buffer
+*    buffer containing VLC-coded data bits
+* \param totbitoffset
+*    bit offset from start of partition
+* \param bytecount
+*    buffer length
+* \return
+*    non-zero if more bits are available: always before the last byte;
+*    inside the last byte when the current bit is 0, or when it is 1 and
+*    at least one of the bits below it is set. Zero when the current bit
+*    is 1 and all bits below it are 0 (only the stop bit is left).
+************************************************************************
+*/
+int more_rbsp_data (const uint8_t buffer[],int totbitoffset,int bytecount)
+{
+	long byteoffset = (totbitoffset >> 3);      // byte from start of buffer
+	// there is more until we're in the last byte
+	if (byteoffset < (bytecount - 1)) 
+		return TRUE;
+	else
+	{
+		int bitoffset   = (7 - (totbitoffset & 0x07));      // bit from start of byte
+		const uint8_t *cur_byte  = &(buffer[byteoffset]);
+		// read one bit (the stray "ctr_bit = ctr_bit = ..." self-assignment was removed)
+		int ctr_bit     = ((*cur_byte)>> (bitoffset--)) & 0x01;      // control bit for current bit position
+
+		//assert (byteoffset<bytecount);       
+
+		// a stop bit has to be one
+		if (ctr_bit==0) 
+			return TRUE;  
+		else
+		{
+			int cnt = 0;
+
+			// look for any set bit below the candidate stop bit
+			while (bitoffset>=0 && !cnt)
+			{
+				cnt |= ((*cur_byte)>> (bitoffset--)) & 0x01;   // set up control bit
+			}
+
+			return (cnt);
+		}
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    Check if there are symbols for the next MB: returns non-zero when
+*    more_rbsp_data reports no more data in the partition that carries
+*    SE_MBTYPE. The dummy argument is not used.
+************************************************************************
+*/
+int uvlc_startcode_follows(Slice *currSlice, int dummy)
+{
+	const byte part_idx = assignSE2partition[currSlice->dp_mode][SE_MBTYPE];
+	Bitstream *stream = currSlice->partArr[part_idx].bitstream;
+	const int more = more_rbsp_data(stream->streamBuffer,
+	                                stream->frame_bitoffset,
+	                                stream->bitstream_length);
+
+	return !more;
+}
+
+
+
+/*!
+************************************************************************
+* \brief
+*  read one exp-golomb VLC symbol
+*
+* \param buffer
+*    containing VLC-coded data bits
+* \param totbitoffset
+*    bit offset from start of partition
+* \param  info
+*    returns the value of the symbol (the bits following the leading 1)
+* \param bytecount
+*    buffer length
+* \return
+*    bits read (2 * leading zeros + 1), or -1 when the codeword does not
+*    fit into bytecount bytes
+************************************************************************
+*/
+int GetVLCSymbol (const uint8_t buffer[],int totbitoffset,int *info, int bytecount)
+{
+	long byteoffset = (totbitoffset >> 3);         // byte from start of buffer
+	int  bitoffset  = (7 - (totbitoffset & 0x07)); // bit from start of byte
+	int  bitcounter = 1;
+	int  len        = 0;
+	const uint8_t *cur_byte  = &(buffer[byteoffset]);
+	int  ctr_bit    = ((*cur_byte) >> (bitoffset)) & 0x01;  // control bit for current bit position
+
+	while (ctr_bit == 0)
+	{                 // find leading 1 bit
+		len++;
+		bitcounter++;
+		bitoffset--;
+		bitoffset &= 0x07;
+		if (bitoffset == 7)
+		{
+			cur_byte++;
+			byteoffset++;
+			// Stop at the end of the buffer instead of scanning on through
+			// whatever follows it. len is at least 1 here, so the length
+			// check below would have rejected this codeword as well.
+			if (byteoffset >= bytecount)
+				return -1;
+		}
+		ctr_bit    = ((*cur_byte) >> (bitoffset)) & 0x01;
+	}
+
+	if (byteoffset + ((len + 7) >> 3) > bytecount)
+		return -1;
+	else
+	{
+		// make infoword
+		int inf = 0;                          // shortest possible code is 1, then info is always 0    
+
+		while (len--)
+		{
+			bitoffset --;    
+			bitoffset &= 0x07;
+			cur_byte  += (bitoffset == 7);
+			bitcounter++;
+			inf <<= 1;    
+			inf |= ((*cur_byte) >> (bitoffset)) & 0x01;
+		}
+
+		*info = inf;
+		return bitcounter;           // number of bits consumed by this codeword
+	}
+}
+
+
+/*!
+************************************************************************
+* \brief
+*  Extracts the top numbits bits of a 16 bit window (Threshold based)
+*
+* \param inf
+*    bytes to extract numbits from with bitoffset already applied
+* \param numbits
+*    number of bits to read
+*
+************************************************************************
+*/
+
+static inline int ShowBitsThres16(int inf, int numbits)
+{
+	const int discard = (int)(sizeof(uint8_t) * 16) - numbits;
+
+	return inf >> discard;
+}
+
+// Extracts the top numbits bits of a 24 bit window (three bytes with the bit
+// offset already applied). No availability check is made here.
+static inline int ShowBitsThres(int inf, int numbits)
+{
+	const int discard = (int)(sizeof(uint8_t) * 24) - numbits;
+
+	return inf >> discard;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    code from bitstream (2d tables): linear search of a
+*    tabheight x tabwidth table, row by row, for the first entry whose
+*    non-zero length and code match the upcoming bits.
+* \return
+*    0 when found (sym->len set, frame_bitoffset advanced, *code set,
+*    sym->value1 = column, sym->value2 = row); -1 otherwise
+************************************************************************
+*/
+
+static int code_from_bitstream_2d(SyntaxElement *sym,
+																	Bitstream *currStream,
+																	const uint8_t *lentab,
+																	const uint8_t *codtab,
+																	int tabwidth,
+																	int tabheight,
+																	int *code)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	// Three bytes are always fetched (the maximum ShowBitsThres may look at);
+	// the original author notes the buffer is allocated large enough for that.
+	unsigned int inf = (buf[0] << 16) + (buf[1] << 8) + buf[2];
+	int row, col;
+	int idx = 0;                                  // flat index, advances with every probe
+
+	inf <<= (bitpos & 0x07);                      // apply the bit offset once
+	inf  &= 0xFFFFFF;                             // keep the 24 bit window
+
+	// this VLC decoding method is not optimized for speed
+	for (row = 0; row < tabheight; row++)
+	{
+		for (col = 0; col < tabwidth; col++, idx++)
+		{
+			const uint8_t bits = lentab[idx];
+
+			if (bits != 0 && ShowBitsThres(inf, bits) == codtab[idx])
+			{
+				sym->len = bits;
+				currStream->frame_bitoffset += bits;  // move bitstream pointer
+				*code = codtab[idx];
+				sym->value1 = col;
+				sym->value2 = row;
+				return 0;                             // found code and return
+			}
+		}
+	}
+	return -1;  // failed to find code
+}
+
+// Linear search of a single row of up to 16 entries; the search stops at the
+// first zero length. Returns the matching index (frame_bitoffset advanced by
+// that entry's length) or -1.
+static int code_from_bitstream_2d_16_1(Bitstream *currStream,
+																			 const uint8_t *lentab,
+																			 const uint8_t *codtab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	// Three bytes are always fetched (the maximum ShowBitsThres may look at);
+	// the original author notes the buffer is allocated large enough for that.
+	unsigned int inf = (buf[0] << 16) + (buf[1] << 8) + buf[2];
+	int idx;
+
+	inf <<= (bitpos & 0x07);                      // apply the bit offset once
+	inf  &= 0xFFFFFF;                             // keep the 24 bit window
+
+	// this VLC decoding method is not optimized for speed
+	for (idx = 0; idx < 16; idx++)
+	{
+		const uint8_t bits = lentab[idx];
+
+		if (bits == 0)
+			break;                                  // end of the populated entries
+
+		if (ShowBitsThres(inf, bits) == codtab[idx])
+		{
+			currStream->frame_bitoffset += bits;    // move bitstream pointer
+			return idx;                             // found code and return
+		}
+	}
+
+	return -1;  // failed to find code
+}
+
+// SSE2 search of a 16 entry table: the upcoming 16 bits are broadcast, masked
+// with masktab and compared against codtab eight entries at a time. Returns the
+// lowest matching index (frame_bitoffset advanced by lentab[index]) or -1.
+// codtab and masktab are read with aligned loads.
+int code_from_bitstream_2d_16_1_sse2(Bitstream *currStream, const uint16_t *lentab,	const uint16_t *codtab, const uint16_t *masktab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ulong(*(unsigned long *)buf);
+	__m128i needle;
+	int half;
+
+	window >>= 16-(bitpos & 0x07);
+	window  &= 0xFFFF;
+	needle = _mm_set1_epi16((uint16_t)window);
+
+	for (half = 0; half < 16; half += 8)
+	{
+		const __m128i codes  = _mm_load_si128((__m128i *)(codtab + half));
+		const __m128i masked = _mm_and_si128(_mm_load_si128((__m128i *)(masktab + half)), needle); // mask & inf
+		const int hits = _mm_movemask_epi8(_mm_cmpeq_epi16(masked, codes));                        // (mask & inf) == cod
+
+		if (hits)
+		{
+			unsigned long lane;
+
+			_BitScanForward(&lane, hits);
+			lane >>= 1;                                        // two mask bits per 16 bit lane
+
+			currStream->frame_bitoffset += lentab[lane + half]; // move bitstream pointer
+			return lane + half;                                // found code and return
+		}
+	}
+
+	return -1;
+}
+
+// Scalar counterpart of code_from_bitstream_2d_16_1_sse2: returns the first of
+// the 16 indices whose masked window equals the code (frame_bitoffset advanced
+// by lentab[index]) or -1.
+int code_from_bitstream_2d_16_1_c(Bitstream *currStream, const uint16_t *lentab,	const uint16_t *codtab, const uint16_t *masktab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ulong(*(unsigned long *)buf);
+	uint16_t inf;
+	int idx;
+
+	window >>= 16-(bitpos & 0x07);
+	window  &= 0xFFFF;
+	inf = (uint16_t)window;
+
+	// this VLC decoding method is not optimized for speed
+	for (idx = 0; idx != 16; ++idx)
+	{
+		if ((inf & masktab[idx]) != codtab[idx])
+			continue;
+
+		currStream->frame_bitoffset += lentab[idx]; // move bitstream pointer
+		return idx;                                 // found code and return
+	}
+
+	return -1;  // failed to find code
+}
+
+// SSE2 search of four 17 entry tables stored back to back. Groups of eight
+// entries are probed in a fixed order; entry 16 of the first table has no SIMD
+// group and is probed on its own. On a match frame_bitoffset is advanced,
+// sym->value2 is the table number, sym->value1 is table number + entry index,
+// and 0 is returned. Returns -1 when nothing matches.
+int code_from_bitstream_2d_17_4_sse2(SyntaxElement *sym, Bitstream *currStream, const uint16_t *lentab, const uint16_t *codtab, const uint16_t *masktab)
+{
+	/* one row per SIMD probe, in probe order:
+	   { offset into the flattened tables, value1 of the first lane, value2 } */
+	static const uint8_t groups[8][3] =
+	{
+		{  0,  0, 0 },   /* first table, entries 0-7   */
+		{ 17,  1, 1 },   /* second table, entries 0-7  */
+		{  8,  8, 0 },   /* first table, entries 8-15  */
+		{ 25,  9, 1 },   /* second table, entries 8-15 */
+		{ 34,  2, 2 },   /* third table, entries 0-7   */
+		{ 42, 10, 2 },   /* third table, entries 8-15  */
+		{ 51,  3, 3 },   /* fourth table, entries 0-7  */
+		{ 59, 11, 3 },   /* fourth table, entries 8-15 */
+	};
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ulong(*(unsigned long *)buf);
+	uint16_t inf;
+	__m128i needle;
+	int g;
+
+	window >>= 16-(bitpos & 0x07);
+	window  &= 0xFFFF;
+	inf = (uint16_t)window;
+	needle = _mm_set1_epi16(inf);
+
+	for (g = 0; g < 8; g++)
+	{
+		const int offset = groups[g][0];
+		__m128i codes, masked;
+		int hits;
+
+		/* extra one just for first table: probed right after its entries 8-15 */
+		if (g == 3 && (inf & masktab[16]) == codtab[16])
+		{
+			currStream->frame_bitoffset += lentab[16]; // move bitstream pointer
+			sym->value1 = 16;
+			sym->value2 = 0;
+			return 0;                 // found code and return
+		}
+
+		codes  = _mm_loadu_si128((__m128i *)(codtab + offset));
+		masked = _mm_and_si128(_mm_loadu_si128((__m128i *)(masktab + offset)), needle); // mask & inf
+		hits   = _mm_movemask_epi8(_mm_cmpeq_epi16(masked, codes));                     // (mask & inf) == cod
+		if (hits)
+		{
+			unsigned long lane;
+
+			_BitScanForward(&lane, hits);
+			lane >>= 1;                                          // two mask bits per 16 bit lane
+
+			currStream->frame_bitoffset += lentab[lane + offset]; // move bitstream pointer
+			sym->value1 = groups[g][1] + lane;
+			sym->value2 = groups[g][2];
+			return 0;                 // found code and return
+		}
+	}
+
+	return -1;  // failed to find code
+}
+
+
+// Scalar search of four 17 entry tables stored back to back, table by table.
+// On a match frame_bitoffset is advanced, sym->value2 is the table number,
+// sym->value1 is table number + entry index, and 0 is returned; -1 otherwise.
+int code_from_bitstream_2d_17_4_c(SyntaxElement *sym, Bitstream *currStream, const uint16_t *lentab, const uint16_t *codtab, const uint16_t *masktab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ulong(*(unsigned long *)buf);
+	uint16_t inf;
+	int tbl, col;
+
+	window >>= 16-(bitpos & 0x07);
+	window  &= 0xFFFF;
+	inf = (uint16_t)window;
+
+	// this VLC decoding method is not optimized for speed
+	for (tbl = 0; tbl < 4; tbl++)
+	{
+		for (col = 0; col < 17; col++)
+		{
+			const int idx = tbl * 17 + col;
+
+			if ((inf & masktab[idx]) != codtab[idx])
+				continue;
+
+			currStream->frame_bitoffset += lentab[idx]; // move bitstream pointer
+			sym->value1 = tbl + col;
+			sym->value2 = tbl;
+			return 0;                 // found code and return
+		}
+	}
+
+	return -1;  // failed to find code
+}
+
+// Scalar search of four 9 entry tables stored back to back, table by table.
+// On a match sym->len is set, frame_bitoffset is advanced, sym->value2 is the
+// table number, sym->value1 is table number + entry index, and 0 is returned;
+// -1 otherwise.
+static int code_from_bitstream_2d_9_4(SyntaxElement *sym,
+																			Bitstream *currStream,
+																			 const uint16_t *lentab,
+																			 const uint16_t *codtab,
+																			 const uint16_t *masktab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ulong(*(unsigned long *)buf);
+	uint16_t inf;
+	int tbl, col;
+
+	window >>= 16-(bitpos & 0x07);
+	window  &= 0xFFFF;
+	inf = (uint16_t)window;
+
+	// this VLC decoding method is not optimized for speed
+	for (tbl = 0; tbl < 4; tbl++)
+	{
+		for (col = 0; col < 9; col++)
+		{
+			const int idx = tbl * 9 + col;
+
+			if ((inf & masktab[idx]) != codtab[idx])
+				continue;
+
+			sym->len = lentab[idx];
+			currStream->frame_bitoffset += lentab[idx]; // move bitstream pointer
+			sym->value1 = tbl + col;
+			sym->value2 = tbl;
+			return 0;                 // found code and return
+		}
+	}
+	return -1;  // failed to find code
+}
+
+// Scalar search of a 16 entry byte table against the upcoming 8 bits.
+// For the first matching index i: frame_bitoffset is advanced by lentab[i],
+// sym->value2 = (2*i)/9, sym->value1 = value2 + ((2*i)%9)/2, and 0 is returned.
+// Returns -1 when nothing matches.
+int code_from_bitstream_2d_5_4_c(SyntaxElement *sym, Bitstream *currStream, const uint8_t *lentab, const uint8_t *codtab, const uint8_t *masktab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ushort(*(unsigned short *)buf);
+	uint8_t inf;
+	int idx;
+
+	window >>= 8-(bitpos & 0x07);
+	window  &= 0xFF;
+	inf = (uint8_t)window;
+
+	for (idx = 0; idx < 16; idx++)
+	{
+		const int twice = idx << 1;
+
+		if ((inf & masktab[idx]) != codtab[idx])
+			continue;
+
+		currStream->frame_bitoffset += lentab[idx]; // move bitstream pointer
+		sym->value2 = twice / 9;
+		sym->value1 = sym->value2 + ((twice % 9) >> 1);
+
+		return 0;                 // found code and return
+	}
+
+	return -1;  // failed to find code
+}
+
+
+// SSE2 search of a 16 entry byte table: the upcoming 8 bits are broadcast,
+// masked with masktab and compared against codtab in one step (aligned loads).
+// For the lowest matching index i: frame_bitoffset is advanced by lentab[i],
+// sym->value2 = (2*i)/9, sym->value1 = value2 + ((2*i)%9)/2, and 0 is returned.
+// Returns -1 when nothing matches.
+int code_from_bitstream_2d_5_4_sse2(SyntaxElement *sym, Bitstream *currStream, const uint8_t *lentab, const uint8_t *codtab, const uint8_t *masktab)
+{
+	const int bitpos = currStream->frame_bitoffset;
+	const uint8_t *buf = &currStream->streamBuffer[bitpos >> 3];
+	unsigned int window = _byteswap_ushort(*(unsigned short *)buf);
+	__m128i needle, codes, masked;
+	int hits;
+
+	window >>= 8-(bitpos & 0x07);
+	window  &= 0xFF;
+
+	needle = _mm_set1_epi8(window);
+	codes  = _mm_load_si128((__m128i *)codtab);
+	masked = _mm_and_si128(_mm_load_si128((__m128i *)masktab), needle); // mask & inf
+	hits   = _mm_movemask_epi8(_mm_cmpeq_epi8(masked, codes));          // (mask & inf) == cod
+
+	if (!hits)
+		return -1;
+
+	{
+		unsigned long lane;
+
+		_BitScanForward(&lane, hits);
+
+		currStream->frame_bitoffset += lentab[lane]; // move bitstream pointer
+		sym->value2 = (lane<<1)/9;
+		sym->value1 = sym->value2 + (((lane<<1)%9)>>1);
+	}
+	return 0;                 // found code and return
+}
+
+/*!
+************************************************************************
+* \brief
+*    read FLC codeword from UVLC-partition
+*
+* \param currStream
+*    the stream to be read from; frame_bitoffset is advanced by numbits
+* \param numbits
+*    number of bits to read. Only three bytes are fetched, so at most
+*    24 - (frame_bitoffset & 7) bits of the result carry stream data.
+* \return
+*    the numbits wide value, most significant bit first; 0 (and no
+*    change to frame_bitoffset) when numbits is zero or negative
+************************************************************************
+*/
+int readSyntaxElement_FLC(Bitstream *currStream, int numbits)
+{
+	int totbitoffset = currStream->frame_bitoffset;
+	int bitoffset  = /*7 - */(totbitoffset & 0x07); // bit from start of byte
+	int byteoffset = (totbitoffset >> 3); // byte from start of buffer
+	const uint8_t *ptr  = &(currStream->streamBuffer[byteoffset]);
+	uint32_t tmp;
+
+	// A zero width field has the value 0; without this guard the right shift
+	// below would be by 32, which is undefined for a 32 bit operand.
+	if (numbits <= 0)
+		return 0;
+
+	// Widen each byte before shifting: ptr[0] << 24 on a promoted int
+	// overflows for bytes >= 0x80.
+	tmp = ((uint32_t)ptr[0] << 24) | ((uint32_t)ptr[1] << 16) | ((uint32_t)ptr[2] <<  8);// | (ptr[3]);
+	tmp <<= bitoffset;
+	tmp >>= 32 - numbits;
+	currStream->frame_bitoffset += numbits;
+	return tmp;
+}
+
+
+
+/*!
+************************************************************************
+* \brief
+*    read NumCoeff/TrailingOnes codeword from UVLC-partition
+*
+* \param sym
+*    syntax element that receives the result; on the 6-bit path value1
+*    is the coefficient count and value2 the trailing-ones count
+* \param currStream
+*    bitstream to read from
+* \param vlcnum
+*    index of the table used to code coeff_token;
+*    vlcnum == 3 means (8 <= nC), which uses a 6-bit FLC
+*
+* \return
+*    always 0; the value returned by the table matcher is discarded
+************************************************************************
+*/
+int readSyntaxElement_NumCoeffTrailingOnes(SyntaxElement *sym, Bitstream *currStream, int vlcnum)
+{
+	// code length in bits of every table slot; 0 marks an unused slot
+	static const uint16_t lentab[3][4][17] =
+	{
+		{
+			{ 1, 6, 8, 9,10,11,13,13,13,14,14,15,15,16,16,16,16},
+			{ 2, 6, 8, 9,10,11,13,13,14,14,15,15,15,16,16,16, 0},
+			{ 3, 7, 8, 9,10,11,13,13,14,14,15,15,16,16,16, 0, 0},
+			{ 5, 6, 7, 8, 9,10,11,13,14,14,15,15,16,16, 0, 0, 0},
+		},
+		{
+			{ 2, 6, 6, 7, 8, 8, 9,11,11,12,12,12,13,13,13,14,14},
+			{ 2, 5, 6, 6, 7, 8, 9,11,11,12,12,13,13,14,14,14, 0},
+			{ 3, 6, 6, 7, 8, 9,11,11,12,12,13,13,13,14,14, 0, 0},
+			{ 4, 4, 5, 6, 6, 7, 9,11,11,12,13,13,13,14, 0, 0, 0},
+		},
+		{
+			{ 4, 6, 6, 6, 7, 7, 7, 7, 8, 8, 9, 9, 9,10,10,10,10},
+			{ 4, 5, 5, 5, 5, 6, 6, 7, 8, 8, 9, 9, 9,10,10,10, 0},
+			{ 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,10, 0, 0},
+			{ 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9,10,10,10, 0, 0, 0},
+		},
+	};
+#if 0 // save for reference: right-aligned codewords, rows still carrying their leading unused entries
+	static const uint32_t codtab[3][4][17] =
+	{
+		{
+			{ 1, 5, 7, 7, 7, 7,15,11, 8,15,11,15,11,15,11, 7,4},
+			{ 0, 1, 4, 6, 6, 6, 6,14,10,14,10,14,10, 1,14,10,6},
+			{ 0, 0, 1, 5, 5, 5, 5, 5,13, 9,13, 9,13, 9,13, 9,5},
+			{ 0, 0, 0, 3, 3, 4, 4, 4, 4, 4,12,12, 8,12, 8,12,8},
+		},
+		{
+			{ 3,11, 7, 7, 7, 4, 7,15,11,15,11, 8,15,11, 7, 9,7},
+			{ 0, 2, 7,10, 6, 6, 6, 6,14,10,14,10,14,10,11, 8,6},
+			{ 0, 0, 3, 9, 5, 5, 5, 5,13, 9,13, 9,13, 9, 6,10,5},
+			{ 0, 0, 0, 5, 4, 6, 8, 4, 4, 4,12, 8,12,12, 8, 1,4},
+		},
+		{
+			{15,15,11, 8,15,11, 9, 8,15,11,15,11, 8,13, 9, 5,1},
+			{ 0,14,15,12,10, 8,14,10,14,14,10,14,10, 7,12, 8,4},
+			{ 0, 0,13,14,11, 9,13, 9,13,10,13, 9,13, 9,11, 7,3},
+			{ 0, 0, 0,12,11,10, 9, 8,13,12,12,12, 8,12,10, 6,2},
+		},
+	};
+#endif
+	// codewords left-aligned in 16 bits; unused slots hold 0xFFFF
+	static const uint16_t codtab[3][4][17] =
+	{
+		{
+			{ 0x8000, 0x1400, 0x0700, 0x0380, 0x01C0, 0x00E0, 0x0078, 0x0058, 0x0040, 0x003C, 0x002C, 0x001E, 0x0016, 0x000F, 0x000B, 0x0007, 0x0004 },
+			{ 0x4000, 0x1000, 0x0600, 0x0300, 0x0180, 0x00C0, 0x0070, 0x0050, 0x0038, 0x0028, 0x001C, 0x0014, 0x0002, 0x000E, 0x000A, 0x0006, 0xFFFF },
+			{ 0x2000, 0x0A00, 0x0500, 0x0280, 0x0140, 0x00A0, 0x0068, 0x0048, 0x0034, 0x0024, 0x001A, 0x0012, 0x000D, 0x0009, 0x0005, 0xFFFF, 0xFFFF },
+			{ 0x1800, 0x0C00, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0060, 0x0030, 0x0020, 0x0018, 0x0010, 0x000C, 0x0008, 0xFFFF, 0xFFFF, 0xFFFF }
+		},
+		{
+			{ 0xC000, 0x2C00, 0x1C00, 0x0E00, 0x0700, 0x0400, 0x0380, 0x01E0, 0x0160, 0x00F0, 0x00B0, 0x0080, 0x0078, 0x0058, 0x0038, 0x0024, 0x001C },
+			{ 0x8000, 0x3800, 0x2800, 0x1800, 0x0C00, 0x0600, 0x0300, 0x01C0, 0x0140, 0x00E0, 0x00A0, 0x0070, 0x0050, 0x002C, 0x0020, 0x0018, 0xFFFF },
+			{ 0x6000, 0x2400, 0x1400, 0x0A00, 0x0500, 0x0280, 0x01A0, 0x0120, 0x00D0, 0x0090, 0x0068, 0x0048, 0x0030, 0x0028, 0x0014, 0xFFFF, 0xFFFF },
+			{ 0x5000, 0x4000, 0x3000, 0x2000, 0x1000, 0x0800, 0x0200, 0x0180, 0x0100, 0x00C0, 0x0060, 0x0040, 0x0008, 0x0010, 0xFFFF, 0xFFFF, 0xFFFF }
+		},
+		{
+			{ 0xF000, 0x3C00, 0x2C00, 0x2000, 0x1E00, 0x1600, 0x1200, 0x1000, 0x0F00, 0x0B00, 0x0780, 0x0580, 0x0400, 0x0340, 0x0240, 0x0140, 0x0040 },
+			{ 0xE000, 0x7800, 0x6000, 0x5000, 0x4000, 0x3800, 0x2800, 0x1C00, 0x0E00, 0x0A00, 0x0700, 0x0500, 0x0380, 0x0300, 0x0200, 0x0100, 0xFFFF },
+			{ 0xD000, 0x7000, 0x5800, 0x4800, 0x3400, 0x2400, 0x1A00, 0x1400, 0x0D00, 0x0900, 0x0680, 0x0480, 0x02C0, 0x01C0, 0x00C0, 0xFFFF, 0xFFFF },
+			{ 0xC000, 0xB000, 0xA000, 0x9000, 0x8000, 0x6800, 0x3000, 0x1800, 0x0C00, 0x0800, 0x0600, 0x0280, 0x0180, 0x0080, 0xFFFF, 0xFFFF, 0xFFFF }
+		}
+	};
+
+	// one mask per slot with the top lentab bits set; unused slots hold 0x0000
+	static const uint16_t masktab[3][4][17] =
+	{
+		{
+			{ 0x8000, 0xFC00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+			{ 0xC000, 0xFC00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000 },
+			{ 0xE000, 0xFE00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000 },
+			{ 0xF800, 0xFC00, 0xFE00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000 }
+		},
+		{
+			{ 0xC000, 0xFC00, 0xFC00, 0xFE00, 0xFF00, 0xFF00, 0xFF80, 0xFFE0, 0xFFE0, 0xFFF0, 0xFFF0, 0xFFF0, 0xFFF8, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC },
+			{ 0xC000, 0xF800, 0xFC00, 0xFC00, 0xFE00, 0xFF00, 0xFF80, 0xFFE0, 0xFFE0, 0xFFF0, 0xFFF0, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFC, 0x0000 },
+			{ 0xE000, 0xFC00, 0xFC00, 0xFE00, 0xFF00, 0xFF80, 0xFFE0, 0xFFE0, 0xFFF0, 0xFFF0, 0xFFF8, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0x0000, 0x0000 },
+			{ 0xF000, 0xF000, 0xF800, 0xFC00, 0xFC00, 0xFE00, 0xFF80, 0xFFE0, 0xFFE0, 0xFFF0, 0xFFF8, 0xFFF8, 0xFFF8, 0xFFFC, 0x0000, 0x0000, 0x0000 }
+		},
+		{
+			{ 0xF000, 0xFC00, 0xFC00, 0xFC00, 0xFE00, 0xFE00, 0xFE00, 0xFE00, 0xFF00, 0xFF00, 0xFF80, 0xFF80, 0xFF80, 0xFFC0, 0xFFC0, 0xFFC0, 0xFFC0 },
+			{ 0xF000, 0xF800, 0xF800, 0xF800, 0xF800, 0xFC00, 0xFC00, 0xFE00, 0xFF00, 0xFF00, 0xFF80, 0xFF80, 0xFF80, 0xFFC0, 0xFFC0, 0xFFC0, 0x0000 },
+			{ 0xF000, 0xF800, 0xF800, 0xF800, 0xFC00, 0xFC00, 0xFE00, 0xFE00, 0xFF00, 0xFF00, 0xFF80, 0xFF80, 0xFFC0, 0xFFC0, 0xFFC0, 0x0000, 0x0000 },
+			{ 0xF000, 0xF000, 0xF000, 0xF000, 0xF000, 0xF800, 0xFC00, 0xFE00, 0xFF00, 0xFF00, 0xFF80, 0xFFC0, 0xFFC0, 0xFFC0, 0x0000, 0x0000, 0x0000 }
+		}
+	};
+
+	if (vlcnum != 3)
+	{
+		// table-coded coeff_token: the matcher is handed sym and the stream; its result is not inspected
+		(void)opt_code_from_bitstream_2d_17_4(sym, currStream, lentab[vlcnum][0], codtab[vlcnum][0], masktab[vlcnum][0]);
+	}
+	else
+	{
+		// 6-bit FLC: upper four bits carry the coefficient count, lower two the trailing ones
+		const int bits_in_stream = (currStream->bitstream_length << 3) + 7;
+		const int flc = ShowBits(currStream->streamBuffer, currStream->frame_bitoffset, bits_in_stream, 6);
+		int num_coeff     = flc >> 2;
+		int trailing_ones = flc & 3;
+
+		currStream->frame_bitoffset += 6;
+
+		if (num_coeff == 0 && trailing_ones == 3)
+			trailing_ones = 0; // #c = 0, #t1 = 3  =>  #c = 0
+		else
+			num_coeff++;
+
+		sym->value2 = trailing_ones;
+		sym->value1 = num_coeff;
+	}
+
+	return 0;
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    read NumCoeff/TrailingOnes codeword from UVLC-partition ChromaDC
+*
+* \param p_Vid
+*    video parameters; active_sps->chroma_format_idc selects the table set
+* \param sym
+*    syntax element handed to the table matcher
+* \param currStream
+*    bitstream to read from
+*
+* \return
+*    0 for the three supported chroma formats; the value returned by
+*    the matcher itself is discarded
+************************************************************************
+*/
+int readSyntaxElement_NumCoeffTrailingOnesChromaDC(VideoParameters *p_Vid, SyntaxElement *sym,  Bitstream *currStream)
+{
+#if 0 // reference tables the packed tables below were derived from
+	static const uint8_t lentab[3][4][17] =
+	{
+		//YUV420
+		{{ 2, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 1, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 3, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+		//YUV422
+		{{ 1, 7, 7, 9, 9,10,11,12,13, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 2, 7, 7, 9,10,11,12,12, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 3, 7, 7, 9,10,11,12, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 0, 5, 6, 7, 7,10,11, 0, 0, 0, 0, 0, 0, 0, 0}},
+		//YUV444
+		{{ 1, 6, 8, 9,10,11,13,13,13,14,14,15,15,16,16,16,16},
+		{ 0, 2, 6, 8, 9,10,11,13,13,14,14,15,15,15,16,16,16},
+		{ 0, 0, 3, 7, 8, 9,10,11,13,13,14,14,15,15,16,16,16},
+		{ 0, 0, 0, 5, 6, 7, 8, 9,10,11,13,14,14,15,15,16,16}}
+	};
+	static const uint8_t codtab[3][4][17] =
+	{
+		//YUV420
+		{{ 1, 7, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 1, 6, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
+		//YUV422
+		{{ 1,15,14, 7, 6, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 1,13,12, 5, 6, 6, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 1,11,10, 4, 5, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0},
+		{ 0, 0, 0, 1, 1, 9, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0}},
+		//YUV444
+		{{ 1, 5, 7, 7, 7, 7,15,11, 8,15,11,15,11,15,11, 7, 4},
+		{ 0, 1, 4, 6, 6, 6, 6,14,10,14,10,14,10, 1,14,10, 6},
+		{ 0, 0, 1, 5, 5, 5, 5, 5,13, 9,13, 9,13, 9,13, 9, 5},
+		{ 0, 0, 0, 3, 3, 4, 4, 4, 4, 4,12,12, 8,12, 8,12, 8}}
+	};
+#endif
+
+	// YUV420: 16 packed 8-bit slots (length / left-aligned code / mask)
+	__declspec(align(32)) static const uint8_t lentab420[16] =
+	{ 2, 6, 6, 6, 6, 1, 6, 7, 8, 3, 7, 8, 0, 0, 6, 7 };
+	__declspec(align(32)) static const uint8_t codtab420[16] =
+	{ 0x40, 0x1C, 0x10, 0x0C, 0x08, 0x80, 0x18, 0x06, 0x03, 0x20, 0x04, 0x02, 0xFF, 0xFF, 0x14, 0x00 };
+	__declspec(align(32)) static const uint8_t masktab420[16] =
+	{ 0xC0, 0xFC, 0xFC, 0xFC, 0xFC, 0x80, 0xFC, 0xFE, 0xFF, 0xE0, 0xFE, 0xFF, 0x00, 0x00, 0xFC, 0xFE };
+
+	// YUV422: 4 rows of 9 slots, 16-bit entries
+	__declspec(align(32)) static const uint16_t lentab422[4][9] =
+	{
+		{ 1, 7, 7, 9, 9,10,11,12,13 },
+		{ 2, 7, 7, 9,10,11,12,12, 0 },
+		{ 3, 7, 7, 9,10,11,12, 0, 0 },
+		{ 5, 6, 7, 7,10,11, 0, 0, 0 }
+	};
+	__declspec(align(32)) static const uint16_t codtab422[4][9] =
+	{
+		{ 0x8000, 0x1E00, 0x1C00, 0x0380, 0x0300, 0x01C0, 0x00E0, 0x0070, 0x0038 },
+		{ 0x4000, 0x1A00, 0x1800, 0x0280, 0x0180, 0x00C0, 0x0060, 0x0050, 0xFFFF },
+		{ 0x2000, 0x1600, 0x1400, 0x0200, 0x0140, 0x00A0, 0x0040, 0xFFFF, 0xFFFF },
+		{ 0x0800, 0x0400, 0x1200, 0x1000, 0x0100, 0x0080, 0xFFFF, 0xFFFF, 0xFFFF }
+	};
+	__declspec(align(32)) static const uint16_t masktab422[4][9] =
+	{
+		{ 0x8000, 0xFE00, 0xFE00, 0xFF80, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0, 0xFFF8 },
+		{ 0xC000, 0xFE00, 0xFE00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0, 0xFFF0, 0x0000 },
+		{ 0xE000, 0xFE00, 0xFE00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF0, 0x0000, 0x0000 },
+		{ 0xF800, 0xFC00, 0xFE00, 0xFE00, 0xFFC0, 0xFFE0, 0x0000, 0x0000, 0x0000 }
+	};
+
+	// YUV444: 4 rows of 17 slots, 16-bit entries
+	__declspec(align(32)) static const uint16_t lentab444[4][17] =
+	{
+		{ 1, 6, 8, 9,10,11,13,13,13,14,14,15,15,16,16,16,16},
+		{ 2, 6, 8, 9,10,11,13,13,14,14,15,15,15,16,16,16, 0},
+		{ 3, 7, 8, 9,10,11,13,13,14,14,15,15,16,16,16, 0, 0},
+		{ 5, 6, 7, 8, 9,10,11,13,14,14,15,15,16,16, 0, 0, 0}
+	};
+	__declspec(align(32)) static const uint16_t codtab444[4][17] =
+	{
+		{ 0x8000, 0x1400, 0x0700, 0x0380, 0x01C0, 0x00E0, 0x0078, 0x0058, 0x0040, 0x003C, 0x002C, 0x001E, 0x0016, 0x000F, 0x000B, 0x0007, 0x0004 },
+		{ 0x4000, 0x1000, 0x0600, 0x0300, 0x0180, 0x00C0, 0x0070, 0x0050, 0x0038, 0x0028, 0x001C, 0x0014, 0x0002, 0x000E, 0x000A, 0x0006, 0xFFFF },
+		{ 0x2000, 0x0A00, 0x0500, 0x0280, 0x0140, 0x00A0, 0x0068, 0x0048, 0x0034, 0x0024, 0x001A, 0x0012, 0x000D, 0x0009, 0x0005, 0xFFFF, 0xFFFF },
+		{ 0x1800, 0x0C00, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0060, 0x0030, 0x0020, 0x0018, 0x0010, 0x000C, 0x0008, 0xFFFF, 0xFFFF, 0xFFFF }
+	};
+	__declspec(align(32)) static const uint16_t masktab444[4][17] =
+	{
+		{ 0x8000, 0xFC00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0xC000, 0xFC00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000 },
+		{ 0xE000, 0xFE00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000 },
+		{ 0xF800, 0xFC00, 0xFE00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0xFFF8, 0xFFFC, 0xFFFC, 0xFFFE, 0xFFFE, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000 }
+	};
+
+	// 0 = YUV420, 1 = YUV422, 2 = YUV444
+	const int chroma_idx = p_Vid->active_sps->chroma_format_idc - 1;
+
+	if (chroma_idx == 0)
+	{
+		(void)opt_code_from_bitstream_2d_5_4(sym, currStream, lentab420, codtab420, masktab420);
+	}
+	else if (chroma_idx == 1)
+	{
+		(void)code_from_bitstream_2d_9_4(sym, currStream,  lentab422[0], codtab422[0], masktab422[0]);
+	}
+	else if (chroma_idx == 2)
+	{
+		(void)opt_code_from_bitstream_2d_17_4(sym, currStream,  lentab444[0], codtab444[0], masktab444[0]);
+	}
+	else
+	{
+		// the compiler is told that no other chroma format reaches this function
+		__assume(0);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*!
+************************************************************************
+* \brief
+*    read Level VLC0 codeword from UVLC-partition
+*
+* \param currStream
+*    bitstream to read from; frame_bitoffset is advanced past the
+*    prefix and any escape bits that were consumed
+*
+* \return
+*    the signed level value
+*
+* \note
+*    The prefix scan is not bounded by the stream length: ShowBits does
+*    not consult the bit count it is given.
+************************************************************************
+*/
+int readSyntaxElement_Level_VLC0(Bitstream *currStream)
+{
+	int frame_bitoffset             = currStream->frame_bitoffset;
+	const int BitstreamLengthInBits = (currStream->bitstream_length << 3) + 7;
+	const byte *buf                 = currStream->streamBuffer;
+	int len = 1, sign = 0, level = 0;
+
+	// prefix: len counts every bit up to and including the first 1 bit;
+	// frame_bitoffset ends up just behind that bit
+	while (!ShowBits(buf, frame_bitoffset++, BitstreamLengthInBits, 1))
+		len++;
+
+	if (len < 15)
+	{
+		// short codes carry sign and magnitude in the prefix length alone
+		sign  = (len - 1) & 1;
+		level = ((len - 1) >> 1) + 1;
+	}
+	else if (len == 15)
+	{
+		// escape code: 4 more bits, lowest bit is the sign
+		const int code = ShowBits(buf, frame_bitoffset, BitstreamLengthInBits, 4);
+		frame_bitoffset += 4;
+		sign  = (code & 0x01);
+		level = ((code >> 1) & 0x07) + 8;
+	}
+	else
+	{
+		// escape code: (len - 4) more bits, lowest bit is the sign
+		const int addbit  = (len - 16);
+		const int numbits = len - 4;
+		const int offset  = (2048 << addbit) - 2032;
+		const int code    = ShowBits(buf, frame_bitoffset, BitstreamLengthInBits, numbits);
+		frame_bitoffset += numbits;
+		sign  = (code & 0x01);
+		level = (code >> 1) + offset;
+	}
+
+	currStream->frame_bitoffset = frame_bitoffset;
+	return (sign) ? -level : level;
+}
+
+/*!
+************************************************************************
+* \brief
+*    read Level VLC codeword from UVLC-partition
+*
+* \param vlc
+*    VLC table number; vlc - 1 is the suffix width in bits, so this
+*    function expects vlc >= 1
+* \param currStream
+*    bitstream to read from; frame_bitoffset is advanced past the
+*    prefix, the suffix or escape bits, and the sign bit
+*
+* \return
+*    the signed level value
+*
+* \note
+*    The prefix scan is not bounded by the stream length: ShowBits does
+*    not consult the bit count it is given.
+************************************************************************
+*/
+int readSyntaxElement_Level_VLCN(int vlc, Bitstream *currStream)
+{
+	const int BitstreamLengthInBits = (currStream->bitstream_length << 3) + 7;
+	const byte *buf                 = currStream->streamBuffer;
+	const int shift                 = vlc - 1;
+	int pos = currStream->frame_bitoffset; // read cursor, in bits
+	int len = 1;
+	int levabs, sign;
+
+	// read pre zeros: len counts every bit up to and including the first 1 bit
+	while (!ShowBits(buf, pos++, BitstreamLengthInBits, 1))
+		len++;
+
+	if (len < 16)
+	{
+		levabs = ((len - 1) << shift) + 1;
+
+		// read (vlc-1) bits -> suffix; skipped when the suffix is empty
+		if (shift)
+		{
+			levabs += ShowBits(buf, pos, BitstreamLengthInBits, shift);
+			pos    += shift;
+		}
+	}
+	else // escape
+	{
+		const int addbit = len - 5;
+		const int offset = (1 << addbit) + (15 << shift) - 2047;
+
+		levabs = ShowBits(buf, pos, BitstreamLengthInBits, addbit) + offset;
+		pos   += addbit;
+	}
+
+	// read 1 bit -> sign
+	sign = ShowBits(buf, pos, BitstreamLengthInBits, 1);
+	pos++;
+
+	currStream->frame_bitoffset = pos;
+	return (sign)? -levabs : levabs;
+}
+
+/*!
+************************************************************************
+* \brief
+*    read Total Zeros codeword from UVLC-partition
+*
+* \param currStream
+*    bitstream to read from
+* \param vlcnum
+*    row of the tables to match against
+*
+* \return
+*    the value returned by opt_code_from_bitstream_2d_16_1 for that row
+************************************************************************
+*/
+int readSyntaxElement_TotalZeros(Bitstream *currStream, int vlcnum)
+{
+	// code length in bits per slot; slots that are not listed default to 0
+	__declspec(align(32)) static const uint16_t lentab[TOTRUN_NUM][16] =
+	{
+		{ 1,3,3,4,4,5,5,6,6,7,7,8,8,9,9,9 },
+		{ 3,3,3,3,3,4,4,4,4,5,5,6,6,6,6 },
+		{ 4,3,3,3,4,4,3,3,4,5,5,6,5,6 },
+		{ 5,3,4,4,3,3,3,4,3,4,5,5,5 },
+		{ 4,4,4,3,3,3,3,3,4,5,4,5 },
+		{ 6,5,3,3,3,3,3,3,4,3,6 },
+		{ 6,5,3,3,3,2,3,4,3,6 },
+		{ 6,4,5,3,2,2,3,3,6 },
+		{ 6,6,4,2,2,3,2,5 },
+		{ 5,5,3,2,2,2,4 },
+		{ 4,4,3,3,1,3 },
+		{ 4,4,2,1,3 },
+		{ 3,3,1,2 },
+		{ 2,2,1 },
+		{ 1,1 }
+	};
+/*
+	Reference: the right-aligned codewords the packed table below was built from.
+
+	static const byte codtab[TOTRUN_NUM][16] =
+	{
+		{1,3,2,3,2,3,2,3,2,3,2,3,2,3,2,1},
+		{7,6,5,4,3,5,4,3,2,3,2,3,2,1,0},
+		{5,7,6,5,4,3,4,3,2,3,2,1,1,0},
+		{3,7,5,4,6,5,4,3,3,2,2,1,0},
+		{5,4,3,7,6,5,4,3,2,1,1,0},
+		{1,1,7,6,5,4,3,2,1,1,0},
+		{1,1,5,4,3,3,2,1,1,0},
+		{1,1,1,3,3,2,2,1,0},
+		{1,0,1,3,2,1,1,1,},
+		{1,0,1,3,2,1,1,},
+		{0,1,1,2,1,3},
+		{0,1,1,1,1},
+		{0,1,1,1},
+		{0,1,1},
+		{0,1},
+	};*/
+
+	// codewords left-aligned in 16 bits; unused slots hold 0xFFFF
+	__declspec(align(32)) static const uint16_t codtab[TOTRUN_NUM][16] =
+	{
+		{ 0x8000, 0x6000, 0x4000, 0x3000, 0x2000, 0x1800, 0x1000, 0x0C00, 0x0800, 0x0600, 0x0400, 0x0300, 0x0200, 0x0180, 0x0100, 0x0080 },
+		{ 0xE000, 0xC000, 0xA000, 0x8000, 0x6000, 0x5000, 0x4000, 0x3000, 0x2000, 0x1800, 0x1000, 0x0C00, 0x0800, 0x0400, 0x0000, 0xFFFF },
+		{ 0x5000, 0xE000, 0xC000, 0xA000, 0x4000, 0x3000, 0x8000, 0x6000, 0x2000, 0x1800, 0x1000, 0x0400, 0x0800, 0x0000, 0xFFFF, 0xFFFF },
+		{ 0x1800, 0xE000, 0x5000, 0x4000, 0xC000, 0xA000, 0x8000, 0x3000, 0x6000, 0x2000, 0x1000, 0x0800, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x5000, 0x4000, 0x3000, 0xE000, 0xC000, 0xA000, 0x8000, 0x6000, 0x2000, 0x0800, 0x1000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0400, 0x0800, 0xE000, 0xC000, 0xA000, 0x8000, 0x6000, 0x4000, 0x1000, 0x2000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0400, 0x0800, 0xA000, 0x8000, 0x6000, 0xC000, 0x4000, 0x1000, 0x2000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0400, 0x1000, 0x0800, 0x6000, 0xC000, 0x8000, 0x4000, 0x2000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0400, 0x0000, 0x1000, 0xC000, 0x8000, 0x2000, 0x4000, 0x0800, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0800, 0x0000, 0x2000, 0xC000, 0x8000, 0x4000, 0x1000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0000, 0x1000, 0x2000, 0x4000, 0x8000, 0x6000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0000, 0x1000, 0x4000, 0x8000, 0x2000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0000, 0x2000, 0x8000, 0x4000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0000, 0x4000, 0x8000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x0000, 0x8000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }
+	};
+
+	// one mask per slot with the top lentab bits set; unused slots hold 0x0000
+	__declspec(align(32)) static const uint16_t masktab[TOTRUN_NUM][16] =
+	{
+		{ 0x8000, 0xE000, 0xE000, 0xF000, 0xF000, 0xF800, 0xF800, 0xFC00, 0xFC00, 0xFE00, 0xFE00, 0xFF00, 0xFF00, 0xFF80, 0xFF80, 0xFF80 },
+		{ 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xF000, 0xF000, 0xF000, 0xF000, 0xF800, 0xF800, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0x0000 },
+		{ 0xF000, 0xE000, 0xE000, 0xE000, 0xF000, 0xF000, 0xE000, 0xE000, 0xF000, 0xF800, 0xF800, 0xFC00, 0xF800, 0xFC00, 0x0000, 0x0000 },
+		{ 0xF800, 0xE000, 0xF000, 0xF000, 0xE000, 0xE000, 0xE000, 0xF000, 0xE000, 0xF000, 0xF800, 0xF800, 0xF800, 0x0000, 0x0000, 0x0000 },
+		{ 0xF000, 0xF000, 0xF000, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xF000, 0xF800, 0xF000, 0xF800, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xFC00, 0xF800, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xF000, 0xE000, 0xFC00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xFC00, 0xF800, 0xE000, 0xE000, 0xE000, 0xC000, 0xE000, 0xF000, 0xE000, 0xFC00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xFC00, 0xF000, 0xF800, 0xE000, 0xC000, 0xC000, 0xE000, 0xE000, 0xFC00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xFC00, 0xFC00, 0xF000, 0xC000, 0xC000, 0xE000, 0xC000, 0xF800, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xF800, 0xF800, 0xE000, 0xC000, 0xC000, 0xC000, 0xF000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xF000, 0xF000, 0xE000, 0xE000, 0x8000, 0xE000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xF000, 0xF000, 0xC000, 0x8000, 0xE000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xE000, 0xE000, 0x8000, 0xC000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xC000, 0xC000, 0x8000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0x8000, 0x8000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
+	};
+
+	// hand the selected row of each table to the 16-entry matcher
+	const uint16_t *row_len  = lentab[vlcnum];
+	const uint16_t *row_cod  = codtab[vlcnum];
+	const uint16_t *row_mask = masktab[vlcnum];
+
+	return opt_code_from_bitstream_2d_16_1(currStream, row_len, row_cod, row_mask);
+}
+
+/*!
+************************************************************************
+* \brief
+*    read Total Zeros Chroma DC codeword from UVLC-partition
+*
+* \param p_Vid
+*    video parameters; active_sps->chroma_format_idc - 1 selects the
+*    table set (0 = YUV420, 1 = YUV422, 2 = YUV444)
+* \param currStream
+*    bitstream to read from
+* \param vlcnum
+*    row of the selected table set to match against
+*
+* \return
+*    the value returned by code_from_bitstream_2d_16_1 for that row
+************************************************************************
+*/
+int readSyntaxElement_TotalZerosChromaDC(VideoParameters *p_Vid, Bitstream *currStream, int vlcnum)
+{
+	// code length in bits per slot; rows and slots that are not listed default to 0
+	static const byte lentab[3][TOTRUN_NUM][16] =
+	{
+		//YUV420
+		{
+			{ 1,2,3,3 },
+			{ 1,2,2 },
+			{ 1,1 }
+		},
+		//YUV422
+		{
+			{ 1,3,3,4,4,4,5,5 },
+			{ 3,2,3,3,3,3,3 },
+			{ 3,3,2,2,3,3 },
+			{ 3,2,2,2,3 },
+			{ 2,2,2,2 },
+			{ 2,2,1 },
+			{ 1,1 }
+		},
+		//YUV444
+		{
+			{ 1,3,3,4,4,5,5,6,6,7,7,8,8,9,9,9 },
+			{ 3,3,3,3,3,4,4,4,4,5,5,6,6,6,6 },
+			{ 4,3,3,3,4,4,3,3,4,5,5,6,5,6 },
+			{ 5,3,4,4,3,3,3,4,3,4,5,5,5 },
+			{ 4,4,4,3,3,3,3,3,4,5,4,5 },
+			{ 6,5,3,3,3,3,3,3,4,3,6 },
+			{ 6,5,3,3,3,2,3,4,3,6 },
+			{ 6,4,5,3,2,2,3,3,6 },
+			{ 6,6,4,2,2,3,2,5 },
+			{ 5,5,3,2,2,2,4 },
+			{ 4,4,3,3,1,3 },
+			{ 4,4,2,1,3 },
+			{ 3,3,1,2 },
+			{ 2,2,1 },
+			{ 1,1 }
+		}
+	};
+
+	// right-aligned codewords, laid out like lentab
+	static const byte codtab[3][TOTRUN_NUM][16] =
+	{
+		//YUV420
+		{
+			{ 1,1,1,0 },
+			{ 1,1,0 },
+			{ 1,0 }
+		},
+		//YUV422
+		{
+			{ 1,2,3,2,3,1,1,0 },
+			{ 0,1,1,4,5,6,7 },
+			{ 0,1,1,2,6,7 },
+			{ 6,0,1,2,7 },
+			{ 0,1,2,3 },
+			{ 0,1,1 },
+			{ 0,1 }
+		},
+		//YUV444
+		{
+			{ 1,3,2,3,2,3,2,3,2,3,2,3,2,3,2,1 },
+			{ 7,6,5,4,3,5,4,3,2,3,2,3,2,1,0 },
+			{ 5,7,6,5,4,3,4,3,2,3,2,1,1,0 },
+			{ 3,7,5,4,6,5,4,3,3,2,2,1,0 },
+			{ 5,4,3,7,6,5,4,3,2,1,1,0 },
+			{ 1,1,7,6,5,4,3,2,1,1,0 },
+			{ 1,1,5,4,3,3,2,1,1,0 },
+			{ 1,1,1,3,3,2,2,1,0 },
+			{ 1,0,1,3,2,1,1,1 },
+			{ 1,0,1,3,2,1,1 },
+			{ 0,1,1,2,1,3 },
+			{ 0,1,1,1,1 },
+			{ 0,1,1,1 },
+			{ 0,1,1 },
+			{ 0,1 }
+		}
+	};
+
+	const int chroma_idx = p_Vid->active_sps->chroma_format_idc - 1;
+	const byte *row_len  = lentab[chroma_idx][vlcnum];
+	const byte *row_cod  = codtab[chroma_idx][vlcnum];
+
+	return code_from_bitstream_2d_16_1(currStream, row_len, row_cod);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*    read  Run codeword from UVLC-partition
+*
+* \param currStream
+*    bitstream to read from
+* \param vlcnum
+*    row of the tables to match against; only the first seven rows
+*    carry data
+*
+* \return
+*    the value returned by opt_code_from_bitstream_2d_16_1 for that row
+************************************************************************
+*/
+int readSyntaxElement_Run(Bitstream *currStream, int vlcnum)
+{
+	// code length in bits per slot; rows and slots that are not listed default to 0
+	__declspec(align(32)) static const uint16_t lentab[TOTRUN_NUM][16] =
+	{
+		{ 1,1 },
+		{ 1,2,2 },
+		{ 2,2,2,2 },
+		{ 2,2,2,3,3 },
+		{ 2,2,3,3,3,3 },
+		{ 2,3,3,3,3,3,3 },
+		{ 3,3,3,3,3,3,3,4,5,6,7,8,9,10,11 }
+	};
+/*
+	Reference: the right-aligned codewords the packed table below was built from.
+
+	static const byte codtab[TOTRUN_NUM][16] =
+	{
+		{1,0},
+		{1,1,0},
+		{3,2,1,0},
+		{3,2,1,1,0},
+		{3,2,3,2,1,0},
+		{3,0,1,3,2,5,4},
+		{7,6,5,4,3,2,1,1,1,1,1,1,1,1,1},
+	};*/
+
+	// codewords left-aligned in 16 bits; unused slots of the listed rows hold 0xFFFF
+	__declspec(align(32)) static const uint16_t codtab[TOTRUN_NUM][16] =
+	{
+		{ 0x8000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0x8000, 0x4000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0xC000, 0x8000, 0x4000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0xC000, 0x8000, 0x4000, 0x2000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0xC000, 0x8000, 0x6000, 0x4000, 0x2000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0xC000, 0x0000, 0x2000, 0x6000, 0x4000, 0xA000, 0x8000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF },
+		{ 0xE000, 0xC000, 0xA000, 0x8000, 0x6000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0xFFFF }
+	};
+
+	// one mask per slot with the top lentab bits set; unused slots hold 0x0000
+	__declspec(align(32)) static const uint16_t masktab[TOTRUN_NUM][16] =
+	{
+		{ 0x8000, 0x8000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0x8000, 0xC000, 0xC000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xC000, 0xC000, 0xC000, 0xC000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xC000, 0xC000, 0xC000, 0xE000, 0xE000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xC000, 0xC000, 0xE000, 0xE000, 0xE000, 0xE000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xC000, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
+		{ 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xE000, 0xF000, 0xF800, 0xFC00, 0xFE00, 0xFF00, 0xFF80, 0xFFC0, 0xFFE0, 0x0000 }
+	};
+
+	// hand the selected row of each table to the 16-entry matcher
+	const uint16_t *row_len  = lentab[vlcnum];
+	const uint16_t *row_cod  = codtab[vlcnum];
+	const uint16_t *row_mask = masktab[vlcnum];
+
+	return opt_code_from_bitstream_2d_16_1(currStream, row_len, row_cod, row_mask);
+}
+
+
+/*!
+************************************************************************
+* \brief
+*  Reads bits from the bitstream buffer
+*
+* \param buffer
+*    containing VLC-coded data bits
+* \param totbitoffset
+*    bit offset from start of partition
+* \param info
+*    returns value of the read bits (0 when numbits <= 0)
+* \param bitcount
+*    size of the bitstream; accepted for interface compatibility only,
+*    no bounds check is performed against it
+* \param numbits
+*    number of bits to read; all requested bits are available while
+*    (totbitoffset & 7) + numbits <= 32
+*
+* \return
+*    numbits, or 0 when numbits <= 0
+*
+* \note
+*    Four bytes starting at buffer[totbitoffset >> 3] are always loaded,
+*    so the buffer must be readable that far.
+************************************************************************
+*/
+
+int GetBits (const uint8_t buffer[],int totbitoffset,int *info, int bitcount,
+						 int numbits)
+{
+	const int bitoffset  = (totbitoffset & 0x07); // bit from start of byte
+	const int byteoffset = (totbitoffset >> 3);   // byte from start of buffer
+	const uint8_t *ptr   = &(buffer[byteoffset]);
+	uint32_t tmp;
+
+	(void)bitcount;
+
+	// a zero-width read would shift a 32-bit value by 32 bits, which is undefined
+	if (numbits <= 0)
+	{
+		*info = 0;
+		return 0;
+	}
+
+	// widen each byte first: shifting the promoted int left by 24 overflows for bytes >= 0x80
+	tmp = ((uint32_t)ptr[0] << 24) | ((uint32_t)ptr[1] << 16) | ((uint32_t)ptr[2] << 8) | (uint32_t)ptr[3];
+	tmp <<= bitoffset;     // discard the bits that precede totbitoffset in the first byte
+	tmp >>= 32 - numbits;  // keep the top numbits bits, right-aligned
+	*info = (int)tmp;
+	return numbits;
+}
+
+/*!
+************************************************************************
+* \brief
+*  Reads bits from the bitstream buffer
+*
+* \param buffer
+*    buffer containing VLC-coded data bits
+* \param totbitoffset
+*    bit offset from start of partition
+* \param bitcount
+*    size of the bitstream; accepted for interface compatibility only,
+*    no bounds check is performed against it
+* \param numbits
+*    number of bits to read; all requested bits are available while
+*    (totbitoffset & 7) + numbits <= 32
+*
+* \return
+*    the requested bits, right-aligned, or 0 when numbits <= 0
+*
+* \note
+*    Four bytes starting at buffer[totbitoffset >> 3] are always loaded,
+*    so the buffer must be readable that far.
+************************************************************************
+*/
+
+static int ShowBits (const uint8_t buffer[],int totbitoffset,int bitcount, int numbits)
+{
+	const int bitoffset  = (totbitoffset & 0x07); // bit from start of byte
+	const int byteoffset = (totbitoffset >> 3);   // byte from start of buffer
+	const uint8_t *ptr   = &(buffer[byteoffset]);
+	uint32_t tmp;
+
+	(void)bitcount;
+
+	// a zero-width read would shift a 32-bit value by 32 bits, which is undefined
+	if (numbits <= 0)
+		return 0;
+
+	// widen each byte first: shifting the promoted int left by 24 overflows for bytes >= 0x80
+	tmp = ((uint32_t)ptr[0] << 24) | ((uint32_t)ptr[1] << 16) | ((uint32_t)ptr[2] << 8) | (uint32_t)ptr[3];
+	tmp <<= bitoffset;     // discard the bits that precede totbitoffset in the first byte
+	tmp >>= 32 - numbits;  // keep the top numbits bits, right-aligned
+	return (int)tmp;
+}
+
diff --git a/Src/h264dec/ldecod_vc9.vcxproj b/Src/h264dec/ldecod_vc9.vcxproj
new file mode 100644
index 00000000..ad81b0c3
--- /dev/null
+++ b/Src/h264dec/ldecod_vc9.vcxproj
@@ -0,0 +1,487 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="Current" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release IPP|Win32">
+      <Configuration>Release IPP</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release IPP|x64">
+      <Configuration>Release IPP</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>17.0</VCProjectVersion>
+    <ProjectName>h264dec</ProjectName>
+    <ProjectGuid>{5499B067-CF32-4141-A757-E0A29866994A}</ProjectGuid>
+    <RootNamespace>ldecod</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0.19041.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v142</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>17.0.32505.173</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>$(PlatformShortName)_$(Configuration)\</OutDir>
+    <IntDir>$(PlatformShortName)_$(Configuration)\</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>$(PlatformShortName)_$(Configuration)\</OutDir>
+    <IntDir>$(PlatformShortName)_$(Configuration)\</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">
+    <OutDir>$(PlatformShortName)_$(Configuration)\</OutDir>
+    <IntDir>$(PlatformShortName)_$(Configuration)\</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">
+    <IncludePath>$(IncludePath)</IncludePath>
+    <LibraryPath>$(LibraryPath)</LibraryPath>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <VcpkgConfiguration>Debug</VcpkgConfiguration>
+    <VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <VcpkgTriplet>x64-windows-static-md</VcpkgTriplet> <!-- triplet architecture must match the x64 platform -->
+    <VcpkgConfiguration>Debug</VcpkgConfiguration>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <VcpkgTriplet>x86-windows-static-md</VcpkgTriplet>
+  </PropertyGroup>
+  <PropertyGroup Label="Vcpkg" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <VcpkgTriplet>x64-windows-static-md</VcpkgTriplet> <!-- triplet architecture must match the x64 platform -->
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <TypeLibraryName>.\ldecod/Debug_vc8/ldecod.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\external_dependencies\intel_ipp_6.1.1.035\ia32\include;ldecod/inc;lcommon/inc;../Wasabi;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_DEPRECATE;H264_IPP;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PrecompiledHeaderOutputFile>$(IntDir)$(TargetName).pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation>
+      <ObjectFileName>$(IntDir)</ObjectFileName>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <BrowseInformation>true</BrowseInformation>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <CompileAs>Default</CompileAs>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Midl>
+      <TypeLibraryName>.\ldecod/Debug_vc8/ldecod.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\external_dependencies\intel_ipp_6.1.1.035\ia32\include;ldecod/inc;lcommon/inc;../Wasabi;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_DEPRECATE;H264_IPP;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeader>NotUsing</PrecompiledHeader>
+      <PrecompiledHeaderOutputFile>$(IntDir)$(TargetName).pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation>
+      <ObjectFileName>$(IntDir)</ObjectFileName>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <BrowseInformation>true</BrowseInformation>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <CompileAs>Default</CompileAs>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <TypeLibraryName>.\ldecod\Release_vc8/ldecod.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <AdditionalIncludeDirectories>..\external_dependencies\intel_ipp_6.1.1.035\ia32\include;ldecod/inc;lcommon/inc;../Wasabi;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;H264_IPP;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <ExceptionHandling />
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions</EnableEnhancedInstructionSet>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>None</DebugInformationFormat>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Midl>
+      <TypeLibraryName>.\ldecod\Release_vc8/ldecod.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <OmitFramePointers>true</OmitFramePointers>
+      <AdditionalIncludeDirectories>..\external_dependencies\intel_ipp_6.1.1.035\ia32\include;ldecod/inc;lcommon/inc;../Wasabi;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;H264_IPP;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <ExceptionHandling>
+      </ExceptionHandling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <EnableEnhancedInstructionSet>StreamingSIMDExtensions</EnableEnhancedInstructionSet>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>None</DebugInformationFormat>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+      <MultiProcessorCompilation>true</MultiProcessorCompilation>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">
+    <Midl>
+      <TypeLibraryName>.\ldecod\Release_vc8/ldecod.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <AdditionalIncludeDirectories>..\external_dependencies\intel_ipp_6.1.1.035\ia32\include;ldecod/inc;lcommon/inc;../Wasabi;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;H264_IPP;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <ExceptionHandling />
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <DisableSpecificWarnings>4101;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">
+    <Midl>
+      <TypeLibraryName>.\ldecod\Release_vc8/ldecod.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <AdditionalIncludeDirectories>..\external_dependencies\intel_ipp_6.1.1.035\ia32\include;ldecod/inc;lcommon/inc;../Wasabi;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;H264_IPP;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <ExceptionHandling>
+      </ExceptionHandling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <BufferSecurityCheck>false</BufferSecurityCheck>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <DisableSpecificWarnings>4101;%(DisableSpecificWarnings)</DisableSpecificWarnings>
+      <ProgramDataBaseFileName>$(IntDir)$(TargetName).pdb</ProgramDataBaseFileName>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalLibraryDirectories>%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
+    </Lib>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="ldecod\src\biari.asm"><!-- %(FullPath) is quoted in every Command so source paths containing spaces still assemble -->
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ml /c /nologo /Fo"$(IntDir)biarix86.obj" /Zi "%(FullPath)"
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ml /c /nologo /Fo"$(IntDir)biarix86.obj" /Zi "%(FullPath)"
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)biarix86.obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)biarix86.obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">ml /c /nologo /Fo"$(IntDir)biarix86.obj" /Zi "%(FullPath)"
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">ml /c /nologo /Fo"$(IntDir)biarix86.obj" /Zi "%(FullPath)"
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">$(IntDir)biarix86.obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">$(IntDir)biarix86.obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ml /c /nologo /Fo"$(IntDir)biarix86.obj" /Zi "%(FullPath)"
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ml /c /nologo /Fo"$(IntDir)biarix86.obj" /Zi "%(FullPath)"
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)biarix86.obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)biarix86.obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="ldecod\src\macroblock.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">ml /c /nologo /Fo"$(IntDir)macroblockx86.obj" /Zi "%(FullPath)"
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">ml /c /nologo /Fo"$(IntDir)macroblockx86.obj" /Zi "%(FullPath)"
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">$(IntDir)macroblockx86.obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">$(IntDir)macroblockx86.obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="ldecod\src\prediction.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">ml /c /nologo /Fo"$(IntDir)predictionx86.obj" /Zi "%(FullPath)"
+</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">ml /c /nologo /Fo"$(IntDir)predictionx86.obj" /Zi "%(FullPath)"
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release IPP|Win32'">$(IntDir)predictionx86.obj;%(Outputs)</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release IPP|x64'">$(IntDir)predictionx86.obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="dec_api.c" />
+    <ClCompile Include="lcommon\src\memalloc.c" />
+    <ClCompile Include="lcommon\src\memcache.c" />
+    <ClCompile Include="lcommon\src\mv_prediction.c" />
+    <ClCompile Include="lcommon\src\parsetcommon.c" />
+    <ClCompile Include="lcommon\src\transform.c" />
+    <ClCompile Include="lcommon\src\win32.c" />
+    <ClCompile Include="ldecod\src\biaridecod.c" />
+    <ClCompile Include="ldecod\src\block.c" />
+    <ClCompile Include="ldecod\src\cabac.c" />
+    <ClCompile Include="ldecod\src\context_ini.c" />
+    <ClCompile Include="ldecod\src\erc_api.c" />
+    <ClCompile Include="ldecod\src\erc_do_i.c" />
+    <ClCompile Include="ldecod\src\erc_do_p.c" />
+    <ClCompile Include="ldecod\src\errorconcealment.c" />
+    <ClCompile Include="ldecod\src\filter_chroma_horiz.c" />
+    <ClCompile Include="ldecod\src\filter_chroma_vert.c" />
+    <ClCompile Include="ldecod\src\filter_luma_horiz.c" />
+    <ClCompile Include="ldecod\src\filter_luma_vert.c" />
+    <ClCompile Include="ldecod\src\fmo.c" />
+    <ClCompile Include="ldecod\src\header.c" />
+    <ClCompile Include="ldecod\src\image.c" />
+    <ClCompile Include="ldecod\src\intra16x16_pred.c" />
+    <ClCompile Include="ldecod\src\intra4x4_pred.c" />
+    <ClCompile Include="ldecod\src\intra8x8_pred.c" />
+    <ClCompile Include="ldecod\src\intra_chroma_pred.c" />
+    <ClCompile Include="ldecod\src\ldecod.c" />
+    <ClCompile Include="ldecod\src\loopFilter.c" />
+    <ClCompile Include="ldecod\src\macroblock.c" />
+    <ClCompile Include="ldecod\src\mbuffer.c" />
+    <ClCompile Include="ldecod\src\mb_access.c" />
+    <ClCompile Include="ldecod\src\mb_prediction.c" />
+    <ClCompile Include="ldecod\src\mc_prediction.c" />
+    <ClCompile Include="ldecod\src\meminput.c" />
+    <ClCompile Include="ldecod\src\nal.c" />
+    <ClCompile Include="ldecod\src\nalu.c" />
+    <ClCompile Include="ldecod\src\nalucommon.c" />
+    <ClCompile Include="ldecod\src\output.c" />
+    <ClCompile Include="ldecod\src\parset.c" />
+    <ClCompile Include="ldecod\src\quant.c" />
+    <ClCompile Include="ldecod\src\sei.c" />
+    <ClCompile Include="ldecod\src\storable_picture.c" />
+    <ClCompile Include="ldecod\src\strength_horiz.c" />
+    <ClCompile Include="ldecod\src\strength_vert.c" />
+    <ClCompile Include="ldecod\src\transform8x8.c" />
+    <ClCompile Include="ldecod\src\vlc.c" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="dec_api.h" />
+    <ClInclude Include="lcommon\inc\ctx_tables.h" />
+    <ClInclude Include="lcommon\inc\frame.h" />
+    <ClInclude Include="lcommon\inc\ifunctions.h" />
+    <ClInclude Include="lcommon\inc\mb_access.h" />
+    <ClInclude Include="lcommon\inc\memalloc.h" />
+    <ClInclude Include="lcommon\inc\memcache.h" />
+    <ClInclude Include="lcommon\inc\mv_prediction.h" />
+    <ClInclude Include="lcommon\inc\nalucommon.h" />
+    <ClInclude Include="lcommon\inc\transform.h" />
+    <ClInclude Include="lcommon\inc\types.h" />
+    <ClInclude Include="lcommon\inc\win32.h" />
+    <ClInclude Include="ldecod\inc\biaridecod.h" />
+    <ClInclude Include="ldecod\inc\block.h" />
+    <ClInclude Include="ldecod\inc\cabac.h" />
+    <ClInclude Include="ldecod\inc\context_ini.h" />
+    <ClInclude Include="ldecod\inc\contributors.h" />
+    <ClInclude Include="ldecod\inc\defines.h" />
+    <ClInclude Include="ldecod\inc\elements.h" />
+    <ClInclude Include="ldecod\inc\erc_api.h" />
+    <ClInclude Include="ldecod\inc\erc_do.h" />
+    <ClInclude Include="ldecod\inc\erc_globals.h" />
+    <ClInclude Include="ldecod\inc\errorconcealment.h" />
+    <ClInclude Include="ldecod\inc\fmo.h" />
+    <ClInclude Include="ldecod\inc\global.h" />
+    <ClInclude Include="ldecod\inc\header.h" />
+    <ClInclude Include="ldecod\inc\image.h" />
+    <ClInclude Include="ldecod\inc\intra16x16_pred.h" />
+    <ClInclude Include="ldecod\inc\intra4x4_pred.h" />
+    <ClInclude Include="ldecod\inc\intra8x8_pred.h" />
+    <ClInclude Include="ldecod\inc\leaky_bucket.h" />
+    <ClInclude Include="ldecod\inc\loopfilter.h" />
+    <ClInclude Include="ldecod\inc\macroblock.h" />
+    <ClInclude Include="ldecod\inc\mbuffer.h" />
+    <ClInclude Include="ldecod\inc\mc_prediction.h" />
+    <ClInclude Include="ldecod\inc\meminput.h" />
+    <ClInclude Include="ldecod\inc\nalu.h" />
+    <ClInclude Include="ldecod\inc\optim.h" />
+    <ClInclude Include="ldecod\inc\output.h" />
+    <ClInclude Include="ldecod\inc\parset.h" />
+    <ClInclude Include="ldecod\inc\parsetcommon.h" />
+    <ClInclude Include="ldecod\inc\quant.h" />
+    <ClInclude Include="ldecod\inc\sei.h" />
+    <ClInclude Include="ldecod\inc\transform8x8.h" />
+    <ClInclude Include="ldecod\inc\vlc.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
+\ No newline at end of file
diff --git a/Src/h264dec/ldecod_vc9.vcxproj.filters b/Src/h264dec/ldecod_vc9.vcxproj.filters
new file mode 100644
index 00000000..00bcc7d3
--- /dev/null
+++ b/Src/h264dec/ldecod_vc9.vcxproj.filters
@@ -0,0 +1,304 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <Filter Include="Source Files">
+      <UniqueIdentifier>{303af6ae-839c-47f8-9a67-adb97270c1cc}</UniqueIdentifier>
+      <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions>
+    </Filter>
+    <Filter Include="Source Files\deblocking filter">
+      <UniqueIdentifier>{6a99fb4f-1595-4387-a229-fb983e10ee1d}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Header Files">
+      <UniqueIdentifier>{3b640415-33d1-4075-9c43-c885e4ab8760}</UniqueIdentifier>
+      <Extensions>h;hpp;hxx;hm;inl</Extensions>
+    </Filter>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="ldecod\src\biaridecod.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\block.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\cabac.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\context_ini.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="dec_api.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\erc_api.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\erc_do_i.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\erc_do_p.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\errorconcealment.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\fmo.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\header.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\image.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\intra16x16_pred.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\intra4x4_pred.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\intra8x8_pred.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\intra_chroma_pred.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\ldecod.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\macroblock.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\mb_access.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\mb_prediction.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\mbuffer.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\mc_prediction.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lcommon\src\memalloc.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lcommon\src\memcache.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\meminput.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lcommon\src\mv_prediction.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\nal.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\nalu.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\nalucommon.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\output.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\parset.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lcommon\src\parsetcommon.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\quant.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\sei.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\storable_picture.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lcommon\src\transform.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\transform8x8.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\vlc.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="lcommon\src\win32.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\filter_chroma_horiz.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\filter_chroma_vert.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\filter_luma_horiz.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\filter_luma_vert.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\loopFilter.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\strength_horiz.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+    <ClCompile Include="ldecod\src\strength_vert.c">
+      <Filter>Source Files\deblocking filter</Filter>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="ldecod\inc\biaridecod.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\block.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\cabac.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\context_ini.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\contributors.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\ctx_tables.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="dec_api.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\defines.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\elements.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\erc_api.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\erc_do.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\erc_globals.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\errorconcealment.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\fmo.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\frame.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\global.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\header.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\ifunctions.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\image.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\intra16x16_pred.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\intra4x4_pred.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\intra8x8_pred.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\leaky_bucket.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\loopfilter.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\macroblock.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\mb_access.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\mbuffer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\mc_prediction.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\memalloc.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\memcache.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\meminput.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\mv_prediction.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\nalu.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\nalucommon.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\optim.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\output.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\parset.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\parsetcommon.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\quant.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\sei.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\transform.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\transform8x8.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\types.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="ldecod\inc\vlc.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="lcommon\inc\win32.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="ldecod\src\biari.asm">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="ldecod\src\macroblock.asm">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+    <CustomBuild Include="ldecod\src\prediction.asm">
+      <Filter>Source Files</Filter>
+    </CustomBuild>
+  </ItemGroup>
+</Project>
+\ No newline at end of file