1 files changed, 1230 insertions, 0 deletions
diff --git a/Src/h264dec/ldecod/inc/global.h b/Src/h264dec/ldecod/inc/global.h
new file mode 100644
index 00000000..6d2677e6
--- /dev/null
+++ b/Src/h264dec/ldecod/inc/global.h
@@ -0,0 +1,1230 @@
+
+/*!
+ ************************************************************************
+ *  \file
+ *     global.h
+ *  \brief
+ *     global definitions for H.264 decoder.
+ *  \author
+ *     Copyright (C) 1999  Telenor Satellite Services,Norway
+ *                         Ericsson Radio Systems, Sweden
+ *
+ *     Inge Lille-Langoy               <inge.lille-langoy@telenor.com>
+ *
+ *     Telenor Satellite Services
+ *     Keysers gt.13                       tel.:   +47 23 13 86 98
+ *     N-0130 Oslo,Norway                  fax.:   +47 22 77 79 80
+ *
+ *     Rickard Sjoberg                 <rickard.sjoberg@era.ericsson.se>
+ *
+ *     Ericsson Radio Systems
+ *     KI/ERA/T/VV
+ *     164 80 Stockholm, Sweden
+ *
+ ************************************************************************
+ */
+#ifndef _GLOBAL_H_
+#define _GLOBAL_H_
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/timeb.h>
+
+#include <bfc/platform/types.h>
+#include "win32.h"
+#include "defines.h"
+#include "ifunctions.h"
+#include "parsetcommon.h"
+#include "types.h"
+#include "frame.h"
+#include "nalucommon.h"
+#include "memcache.h"
+#include <mmintrin.h>
+#ifdef H264_IPP
+//#include "../tools/staticlib/ipp_px.h"
+#include "ippdefs.h"
+#include "ippcore.h"
+#include "ipps.h"
+#include "ippi.h"
+#include "ippvc.h"
+#endif
+/* benski> not the best place for this but it works for now */
+#ifdef _M_IX86
+// size must be a non-zero multiple of 32: each loop pass below stores 32 bytes
+#pragma warning(disable: 4799)
+static inline void memzero_cache32(void *dst, unsigned long i) // zero i bytes at dst with ordinary (cached) MMX stores
+{
+	// i is reduced by 32 per pass and tested after the stores, so even i == 0 writes 32 bytes; no EMMS is issued here
+	__asm {
+		pxor	mm0, mm0
+		mov edi, dst
+
+loopwrite:
+		// four 8-byte stores = one 32-byte chunk
+		movq 0[edi], mm0
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		lea edi, [edi+32]
+		sub i, 32
+		jg loopwrite // signed test on the remaining byte count
+
+	}
+}
+
+static inline void memzero_fast32(void *dst, unsigned long i) // zero i bytes at dst with non-temporal (movntq) MMX stores
+{
+	// same loop shape as memzero_cache32; neither SFENCE nor EMMS is issued here
+	__asm {
+		pxor	mm0, mm0
+		mov edi, dst
+
+loopwrite:
+		// four 8-byte streaming stores = one 32-byte chunk
+		movntq 0[edi], mm0
+		movntq 8[edi], mm0
+		movntq 16[edi], mm0
+		movntq 24[edi], mm0
+
+		lea edi, [edi+32]
+		sub i, 32
+		jg loopwrite // signed test on the remaining byte count
+
+	}
+}
+
+static inline void memzero64(void *dst) // zero the 64 bytes at dst with MMX stores; no EMMS is issued here
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0 // eight 8-byte stores cover offsets 0..63
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		movq 32[edi], mm0
+		movq 40[edi], mm0
+		movq 48[edi], mm0
+		movq 56[edi], mm0
+	}
+}
+
+static inline void memzero128(void *dst) // zero the 128 bytes at dst with MMX stores; no EMMS is issued here
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0 // sixteen 8-byte stores cover offsets 0..127
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		movq 32[edi], mm0
+		movq 40[edi], mm0
+		movq 48[edi], mm0
+		movq 56[edi], mm0
+		movq 64[edi], mm0
+		movq 72[edi], mm0
+		movq 80[edi], mm0
+		movq 88[edi], mm0
+		movq 96[edi], mm0
+		movq 104[edi], mm0
+		movq 112[edi], mm0
+		movq 120[edi], mm0
+	}
+}
+
+static inline void memzero24(void *dst) // zero the 24 bytes at dst with MMX stores; no EMMS is issued here
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0 // three 8-byte stores cover offsets 0..23
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+	}
+}
+
+static inline void memzero48(void *dst) // zero the 48 bytes at dst with MMX stores; no EMMS is issued here
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0 // six 8-byte stores cover offsets 0..47
+		movq 8[edi], mm0
+		movq 16[edi], mm0
+		movq 24[edi], mm0
+		movq 32[edi], mm0
+		movq 40[edi], mm0
+	}
+}
+
+static inline void memzero16(void *dst) // zero the 16 bytes at dst with MMX stores; no EMMS is issued here
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0 // two 8-byte stores cover offsets 0..15
+		movq 8[edi], mm0
+	}
+}
+
+static inline void memzero8(void *dst) // zero the 8 bytes at dst with one MMX store; no EMMS is issued here
+{
+	__asm {
+		pxor	mm0, mm0 // mm0 = 0
+		mov edi, dst
+
+		movq 0[edi], mm0 // single 8-byte store
+	}
+}
+
+static inline void memset_fast_end() // NOTE(review): presumably called once after a run of the MMX memzero helpers above - confirm at call sites
+{
+	_mm_empty(); // EMMS intrinsic: empties the MMX state that those helpers leave in use
+}
+
+// Very optimized memcpy() routine for all AMD Athlon and Duron family.
+// This code uses any of FOUR different basic copy methods, depending
+// on the transfer size.
+// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
+// "Streaming Store"), and also uses the software prefetchnta instructions,
+// be sure you're running on Athlon/Duron or other recent CPU before calling!
+
+#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
+// The smallest copy uses the X86 "movsd" instruction, in an optimized
+// form which is an "unrolled loop".
+
+#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
+// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
+// also using the "unrolled loop" optimization.   This code uses
+// the software prefetch instruction to get the data into the cache.
+
+#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
+// For larger blocks, which will spill beyond the cache, it's faster to
+// use the Streaming Store instruction MOVNTQ.   This write instruction
+// bypasses the cache and writes straight to main memory.  This code also
+// uses the software prefetch instruction to pre-read the data.
+// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
+
+#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch 
+#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
+// For the largest size blocks, a special technique called Block Prefetch
+// can be used to accelerate the read operations.   Block Prefetch reads
+// one address per cache line, for a series of cache lines, in a short loop.
+// This is faster than using software prefetch.  The technique is great for
+// getting maximum read bandwidth, especially in DDR memory systems.
+
+// Inline assembly syntax for use with Visual C++
+
+
+static void * memcpy_amd(void *dest, const void *src, size_t n) // copy n bytes from src to dest; the asm leaves dest in eax as the result
+{
+  __asm {
+
+	mov		ecx, [n]		// number of bytes to copy
+	mov		edi, [dest]		// destination
+	mov		esi, [src]		// source
+	mov		ebx, ecx		// keep a copy of count
+
+	cld
+	cmp		ecx, TINY_BLOCK_COPY
+	jb		$memcpy_ic_3	// tiny? skip mmx copy
+
+	cmp		ecx, 32*1024		// dont align between 32k-64k because
+	jbe		$memcpy_do_align	//  it appears to be slower
+	cmp		ecx, 64*1024
+	jbe		$memcpy_align_done
+$memcpy_do_align:
+	mov		ecx, 8			// a trick thats faster than rep movsb...
+	sub		ecx, edi		// align destination to qword
+	and		ecx, 111b		// get the low bits
+	sub		ebx, ecx		// update copy count
+	neg		ecx				// set up to jump into the array
+	add		ecx, offset $memcpy_align_done
+	jmp		ecx				// jump to array of movsbs
+
+align 4
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+
+$memcpy_align_done:			// destination is qword aligned here (alignment is skipped for 32k-64k copies)
+	mov		ecx, ebx		// number of bytes left to copy
+	shr		ecx, 6			// get 64-byte block count
+	jz		$memcpy_ic_2	// finish the last few bytes
+
+	cmp		ecx, IN_CACHE_COPY/64	// too big 4 cache? use uncached copy
+	jae		$memcpy_uc_test
+
+// This is small block copy that uses the MMX registers to copy 8 bytes
+// at a time.  It uses the "unrolled loop" optimization, and also uses
+// the software prefetch instruction to get the data into the cache.
+align 16
+$memcpy_ic_1:			// 64-byte block copies, in-cache copy
+
+	prefetchnta [esi + (200*64/34+192)]		// start reading ahead
+
+	movq	mm0, [esi+0]	// read 64 bits
+	movq	mm1, [esi+8]
+	movq	[edi+0], mm0	// write 64 bits
+	movq	[edi+8], mm1	//    note:  the normal movq writes the
+	movq	mm2, [esi+16]	//    data to cache; a cache line will be
+	movq	mm3, [esi+24]	//    allocated as needed, to store the data
+	movq	[edi+16], mm2
+	movq	[edi+24], mm3
+	movq	mm0, [esi+32]
+	movq	mm1, [esi+40]
+	movq	[edi+32], mm0
+	movq	[edi+40], mm1
+	movq	mm2, [esi+48]
+	movq	mm3, [esi+56]
+	movq	[edi+48], mm2
+	movq	[edi+56], mm3
+
+	add		esi, 64			// update source pointer
+	add		edi, 64			// update destination pointer
+	dec		ecx				// count down
+	jnz		$memcpy_ic_1	// last 64-byte block?
+
+$memcpy_ic_2:
+	mov		ecx, ebx		// has valid low 6 bits of the byte count
+$memcpy_ic_3:
+	shr		ecx, 2			// dword count
+	and		ecx, 1111b		// only look at the "remainder" bits
+	neg		ecx				// set up to jump into the array
+	add		ecx, offset $memcpy_last_few
+	jmp		ecx				// jump to array of movsds
+
+$memcpy_uc_test:
+	cmp		ecx, UNCACHED_COPY/64	// big enough? use block prefetch copy
+	jae		$memcpy_bp_1
+
+$memcpy_64_test:
+	or		ecx, ecx		// _tail end of block prefetch will jump here
+	jz		$memcpy_ic_2	// no more 64-byte blocks left
+
+// For larger blocks, which will spill beyond the cache, its faster to
+// use the Streaming Store instruction MOVNTQ.   This write instruction
+// bypasses the cache and writes straight to main memory.  This code also
+// uses the software prefetch instruction to pre-read the data.
+align 16
+$memcpy_uc_1:				// 64-byte blocks, uncached copy
+
+	prefetchnta [esi + (200*64/34+192)]		// start reading ahead
+
+	movq	mm0,[esi+0]		// read 64 bits
+	add		edi,64			// update destination pointer
+	movq	mm1,[esi+8]
+	add		esi,64			// update source pointer
+	movq	mm2,[esi-48]
+	movntq	[edi-64], mm0	// write 64 bits, bypassing the cache
+	movq	mm0,[esi-40]	//    note: movntq also prevents the CPU
+	movntq	[edi-56], mm1	//    from READING the destination address
+	movq	mm1,[esi-32]	//    into the cache, only to be over-written
+	movntq	[edi-48], mm2	//    so that also helps performance
+	movq	mm2,[esi-24]
+	movntq	[edi-40], mm0
+	movq	mm0,[esi-16]
+	movntq	[edi-32], mm1
+	movq	mm1,[esi-8]
+	movntq	[edi-24], mm2
+	movntq	[edi-16], mm0
+	dec		ecx
+	movntq	[edi-8], mm1
+	jnz		$memcpy_uc_1	// last 64-byte block?
+
+	jmp		$memcpy_ic_2		// almost done
+
+// For the largest size blocks, a special technique called Block Prefetch
+// can be used to accelerate the read operations.   Block Prefetch reads
+// one address per cache line, for a series of cache lines, in a short loop.
+// This is faster than using software prefetch, in this case.
+// The technique is great for getting maximum read bandwidth,
+// especially in DDR memory systems.
+$memcpy_bp_1:			// large blocks, block prefetch copy
+
+	cmp		ecx, CACHEBLOCK			// big enough to run another prefetch loop?
+	jl		$memcpy_64_test			// no, back to regular uncached copy
+
+	mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2X
+	add		esi, CACHEBLOCK * 64	// move to the top of the block
+align 16
+$memcpy_bp_2:
+	mov		edx, [esi-64]		// grab one address per cache line
+	mov		edx, [esi-128]		// grab one address per cache line
+	sub		esi, 128			// go reverse order
+	dec		eax					// count down the cache lines
+	jnz		$memcpy_bp_2		// keep grabbing more lines into cache
+
+	mov		eax, CACHEBLOCK		// now that its in cache, do the copy
+align 16
+$memcpy_bp_3:
+	movq	mm0, [esi   ]		// read 64 bits
+	movq	mm1, [esi+ 8]
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	mm4, [esi+32]
+	movq	mm5, [esi+40]
+	movq	mm6, [esi+48]
+	movq	mm7, [esi+56]
+	add		esi, 64				// update source pointer
+	movntq	[edi   ], mm0		// write 64 bits, bypassing cache
+	movntq	[edi+ 8], mm1		//    note: movntq also prevents the CPU
+	movntq	[edi+16], mm2		//    from READING the destination address 
+	movntq	[edi+24], mm3		//    into the cache, only to be over-written,
+	movntq	[edi+32], mm4		//    so that also helps performance
+	movntq	[edi+40], mm5
+	movntq	[edi+48], mm6
+	movntq	[edi+56], mm7
+	add		edi, 64				// update dest pointer
+
+	dec		eax					// count down
+
+	jnz		$memcpy_bp_3		// keep copying
+	sub		ecx, CACHEBLOCK		// update the 64-byte block count
+	jmp		$memcpy_bp_1		// keep processing chunks
+
+// The smallest copy uses the X86 "movsd" instruction, in an optimized
+// form which is an "unrolled loop".   Then it handles the last few bytes.
+align 4
+	movsd
+	movsd			// perform last 1-15 dword copies
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd			// perform last 1-7 dword copies
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+
+$memcpy_last_few:		// dword aligned from before movsds
+	mov		ecx, ebx	// has valid low 2 bits of the byte count
+	and		ecx, 11b	// the last few cows must come home
+	jz		$memcpy_final	// no more, lets leave
+	rep		movsb		// the last 1, 2, or 3 bytes
+
+$memcpy_final: 
+//	emms				// left disabled: this routine does not clear the MMX state itself
+	sfence				// flush the write buffer
+	mov		eax, [dest]	// ret value = destination pointer
+
+    }
+} // no C return statement: the function result is the value the asm left in eax
+
+#elif defined(_M_X64)
+// Zeroes exactly 24 bytes starting at dst (plain-C counterpart of the MMX memzero24).
+// The loop this replaces stepped j by 4 as if it were a byte offset but used it as an
+// int32_t index: it cleared only bytes 0..3 and 16..19 of the 24 and also wrote at
+// byte offsets 32, 48, 64 and 80, i.e. past the end of a 24-byte object.
+// memset comes from <string.h>, which this header already includes.
+static inline void memzero24(void *dst)
+{
+	memset(dst, 0, 24);
+}
+static inline void memset_fast_end() {} // no-op: the helper in this branch is plain C, so there is no MMX state to empty
+#else
+// Zeroes i bytes starting at dst; plain-C fallback used when neither _M_IX86 nor _M_X64 is defined.
+// The loop this replaces advanced j in bytes (j += 4) but indexed int32_t elements with
+// it, so it cleared one word in four and ran up to about four times past the end of
+// the buffer; it also compared the signed j against the unsigned i.
+// memset comes from <string.h>, which this header already includes.
+static inline void memzero_fast16(void *dst,  unsigned long i)
+{
+	memset(dst, 0, i);
+}
+// Zeroes exactly 24 bytes starting at dst (plain-C counterpart of the MMX memzero24).
+// The loop this replaces stepped j by 4 as if it were a byte offset but used it as an
+// int32_t index: it cleared only bytes 0..3 and 16..19 of the 24 and also wrote at
+// byte offsets 32, 48, 64 and 80, i.e. past the end of a 24-byte object.
+// memset comes from <string.h>, which this header already includes.
+static inline void memzero24(void *dst)
+{
+	memset(dst, 0, 24);
+}
+static inline void memset_fast_end() {} // no-op: the helpers in this branch are plain C, so there is no MMX state to empty
+#endif
+
+#define UNDEFINED_REFERENCE ((int)0x80000000)
+typedef int32_t h264_ref_t;
+
+#define ET_SIZE 300      //!< size of error text buffer
+extern char errortext[ET_SIZE]; //!< buffer for error message for exit with error()
+extern int sse2_flag, mmx_flag, sse_flag, sse3_flag, sse4_1_flag;
+/***********************************************************************
+ * T y p e    d e f i n i t i o n s    f o r    J M
+ ***********************************************************************
+ */
+
+typedef enum // colour component indices
+{
+  LumaComp = 0,
+  CrComp = 1, // note: Cr is numbered before Cb in this enum
+  CbComp = 2
+} Color_Component;
+
+/***********************************************************************
+ * D a t a    t y p e s   f o r  C A B A C
+ ***********************************************************************
+ */
+
+typedef struct pix_pos // position record; passed as "block" to the GetMVPredictor callback in Macroblock
+{
+  int   available;
+  int   mb_addr;
+  short x;
+  short y;
+  short pos_x;
+  short pos_y;
+} PixelPos; // NOTE(review): field meanings are not stated in this header - document once confirmed against the users
+
+//! struct to characterize the state of the arithmetic coding engine
+typedef struct // embedded in DataPartition as the de_cabac member
+{
+  unsigned int    Drange;
+  unsigned int    Dvalue;
+  int             DbitsLeft;
+  byte            *Dcodestrm;      // NOTE(review): presumably the coded byte buffer being decoded - confirm
+  int             *Dcodestrm_len;  // NOTE(review): presumably the current position/length within Dcodestrm - confirm
+} DecodingEnvironment;
+
+typedef DecodingEnvironment *DecodingEnvironmentPtr;
+
+typedef short MotionVector[2];
+
+//! definition of motion parameters
+typedef struct pic_motion // one motion record; field names match those documented on PicMotionParams2
+{
+	h264_ref_t ref_pic_id;
+	h264_ref_t ref_id;
+	MotionVector mv; // two shorts (see the MotionVector typedef)
+	char ref_idx;
+} PicMotion;
+
+// TODO: benski> might be more efficient to make a [list][subblock_y][subblock_x] array of these values instead of parallel arrays
+typedef struct motion_params
+{
+	PicMotion **motion[2]; // two 2-D PicMotion tables; NOTE(review): presumably one per reference list - confirm
+  byte **     moving_block; // 2-D byte table; NOTE(review): indexing and meaning not stated here - confirm
+} MotionParams;
+
+//! struct for context management
+typedef struct
+{
+  uint16_t state;         // index into state-table CP
+  unsigned char  MPS;           // Most Probable Symbol 0/1 CP
+  unsigned char dummy;          // for alignment
+} BiContextType;
+
+typedef BiContextType *BiContextTypePtr;
+
+
+/**********************************************************************
+ * C O N T E X T S   F O R   T M L   S Y N T A X   E L E M E N T S
+ **********************************************************************
+ */
+
+#define NUM_MB_TYPE_CTX  11
+#define NUM_B8_TYPE_CTX  9
+#define NUM_MV_RES_CTX   10
+#define NUM_REF_NO_CTX   6
+#define NUM_DELTA_QP_CTX 4
+#define NUM_MB_AFF_CTX 4
+#define NUM_TRANSFORM_SIZE_CTX 3
+
+// structures that will be declared somewhere else
+struct storable_picture;
+struct datapartition;
+struct syntaxelement;
+
+typedef struct // CABAC context models; referenced by the mot_ctx member of Slice
+{
+  BiContextType mb_type_contexts [3][NUM_MB_TYPE_CTX];
+  BiContextType b8_type_contexts [2][NUM_B8_TYPE_CTX];
+  BiContextType mv_res_contexts  [2][NUM_MV_RES_CTX];
+  BiContextType ref_no_contexts  [2][NUM_REF_NO_CTX];
+  BiContextType delta_qp_contexts[NUM_DELTA_QP_CTX];
+  BiContextType mb_aff_contexts  [NUM_MB_AFF_CTX];
+} MotionInfoContexts;
+
+#define NUM_IPR_CTX    2
+#define NUM_CIPR_CTX   4
+#define NUM_CBP_CTX    4
+#define NUM_BCBP_CTX   4
+#define NUM_MAP_CTX   15
+#define NUM_LAST_CTX  15
+#define NUM_ONE_CTX    5
+#define NUM_ABS_CTX    5
+
+
+typedef struct // CABAC context models; referenced by the tex_ctx member of Slice
+{
+  BiContextType  transform_size_contexts [NUM_TRANSFORM_SIZE_CTX];
+  BiContextType  ipr_contexts [NUM_IPR_CTX];
+  BiContextType  cipr_contexts[NUM_CIPR_CTX];
+  BiContextType  cbp_contexts [3][NUM_CBP_CTX];
+  BiContextType  bcbp_contexts[NUM_BLOCK_TYPES][NUM_BCBP_CTX];
+  BiContextType  map_contexts [2][NUM_BLOCK_TYPES][NUM_MAP_CTX+1]; // +1 for better alignment (16 entries per row)
+  BiContextType  last_contexts[2][NUM_BLOCK_TYPES][NUM_LAST_CTX+1]; // +1 for better alignment (16 entries per row)
+  BiContextType  one_contexts [NUM_BLOCK_TYPES][NUM_ONE_CTX];
+  BiContextType  abs_contexts [NUM_BLOCK_TYPES][NUM_ABS_CTX];
+} TextureInfoContexts;
+
+
+//*********************** end of data type definition for CABAC *******************
+
+/***********************************************************************
+ * N e w   D a t a    t y p e s   f o r    T M L
+ ***********************************************************************
+ */
+
+/*! Buffer structure for decoded reference picture marking commands */
+typedef struct DecRefPicMarking_s
+{
+  int memory_management_control_operation;
+  int difference_of_pic_nums_minus1;
+  int long_term_pic_num;
+  int long_term_frame_idx;
+  int max_long_term_frame_idx_plus1;
+  struct DecRefPicMarking_s *Next; //!< link to the next buffered command
+} DecRefPicMarking_t;
+
+
+//! definition of pic motion parameters
+typedef struct pic_motion_params2
+{
+  h264_ref_t    ref_pic_id;    //!< reference picture identifier [list][subblock_y][subblock_x]
+  h264_ref_t    ref_id;        //!< reference picture identifier [list][subblock_y][subblock_x]
+  short    mv[2];         //!< motion vector       [list][subblock_x][subblock_y][component] NOTE(review): x/y order differs from the neighbouring comments - confirm
+  char     ref_idx;       //!< reference picture   [list][subblock_y][subblock_x]
+  byte     mb_field;      //!< field macroblock indicator
+  byte     field_frame;   //!< indicates if co_located is field or frame.
+} PicMotionParams2;
+
+//! Macroblock
+typedef struct macroblock // per-macroblock state plus per-macroblock function pointers
+{
+  struct slice       *p_Slice;                    //!< pointer to the current slice
+  struct img_par     *p_Vid;                      //!< pointer to VideoParameters
+  struct inp_par     *p_Inp;
+  int                 mbAddrX;                    //!< current MB address
+  int mb_x;
+  int mb_y;
+  int block_x;
+  int block_y;
+  int block_y_aff;
+  int pix_x;
+  int pix_y;
+  int pix_c_x;
+  int pix_c_y;
+
+  int subblock_x;
+  int subblock_y;
+
+  int           qp;                    //!< QP luma
+  int           qpc[2];                //!< QP chroma
+  int           qp_scaled[MAX_PLANE];  //!< QP scaled for all comps.
+  Boolean       is_lossless;
+  Boolean       is_intra_block;
+  Boolean       is_v_block;
+
+  short         slice_nr;
+  short         delta_quant;          //!< for rate control
+
+  struct macroblock   *mb_up;   //!< pointer to neighboring MB (CABAC)
+  struct macroblock   *mb_left; //!< pointer to neighboring MB (CABAC)
+
+  // some storage of macroblock syntax elements for global access
+  int           mb_type;
+  short         mvd[2][BLOCK_MULTIPLE][BLOCK_MULTIPLE][2];      //!< indices correspond to [forw,backw][block_y][block_x][x,y]
+  int           cbp;
+  int64         cbp_blk     [3];
+  int64         cbp_bits    [3];
+  int64         cbp_bits_8x8[3];
+
+  int           i16mode;
+  char          b8mode[4];
+  char          b8pdir[4];
+  char          ei_flag;             //!< error indicator flag that enables concealment
+  char          dpl_flag;            //!< error indicator flag that signals a missing data partition
+  char          ipmode_DPCM;
+
+  short         DFDisableIdc;
+  short         DFAlphaC0Offset;
+  short         DFBetaOffset;
+
+  char          c_ipred_mode;       //!< chroma intra prediction mode
+  Boolean       mb_field;
+
+  int          skip_flag;
+
+  int mb_addr_left, mb_addr_up, mb_addr_upper_right, mb_addr_upper_left;
+  Boolean mb_avail_left, mb_avail_up, mb_avail_upper_right, mb_avail_upper_left;
+
+  Boolean       luma_transform_size_8x8_flag;
+  Boolean       NoMbPartLessThan8x8Flag;
+  // function pointers; this header only declares their signatures, they are assigned elsewhere
+  void (*itrans_8x8)(struct macroblock *currMB, ColorPlane pl, int ioff, int joff);
+
+  void (*GetMVPredictor) (struct macroblock *currMB, PixelPos *block, 
+    short pmv[2], short ref_frame, struct pic_motion **motion, int mb_x, int mb_y, int blockshape_x, int blockshape_y);
+
+  int (*read_and_store_CBP_block_bit) (struct macroblock *currMB, DecodingEnvironmentPtr  dep_dp, int type);
+  char (*readRefPictureIdx)   (struct syntaxelement *currSE, struct datapartition *dP, int list);
+
+} Macroblock;
+
+//! Syntaxelement
+typedef struct syntaxelement
+{
+  int           value1;                //!< numerical value of syntax element
+  int           value2;                //!< for blocked symbols, e.g. run/level
+  int           len;                   //!< length of code
+  //int           inf;                   //!< info part of CAVLC code
+  // the trace buffer below exists only when TRACE is non-zero, so sizeof(SyntaxElement) depends on TRACE
+#if TRACE
+  #define       TRACESTRING_SIZE 100           //!< size of trace string
+  char          tracestring[TRACESTRING_SIZE]; //!< trace string
+#endif
+
+  //! for mapping of CAVLC to syntaxElement
+  void  (*mapping)(int len, int info, int *value1, int *value2);
+} SyntaxElement;
+
+
+//! Bitstream
+typedef struct
+{
+  // CABAC Decoding
+  int           read_len;           //!< actual position in the codebuffer, CABAC only
+  int           code_len;           //!< overall codebuffer length, CABAC only
+  // CAVLC Decoding
+  int           frame_bitoffset;    //!< actual position in the codebuffer, bit-oriented, CAVLC only
+  int           bitstream_length;   //!< overall codebuffer length, byte-oriented, CAVLC only
+
+  byte          *streamBuffer;      //!< actual codebuffer for read bytes
+} Bitstream;
+
+
+/* === 4x4 block typedefs === */
+// 32 bit precision
+typedef int h264_int_block_row_t[BLOCK_SIZE];
+typedef h264_int_block_row_t h264_int_block_t[BLOCK_SIZE];
+// 16 bit precision
+typedef int16_t h264_short_block_row_t[BLOCK_SIZE];
+typedef h264_short_block_row_t h264_short_block_t[BLOCK_SIZE];
+// 8 bit precision: no 4x4 imgpel block typedef is defined in this group
+
+/* === 8x8 block typedefs === */
+// 32 bit precision
+typedef int h264_int_8x8block_row_t[BLOCK_SIZE_8x8];
+typedef h264_int_8x8block_row_t h264_int_8x8block_t[BLOCK_SIZE_8x8];
+// 16 bit precision
+typedef int16_t h264_short_8x8block_row_t[BLOCK_SIZE_8x8];
+typedef h264_short_8x8block_row_t h264_short_8x8block_t[BLOCK_SIZE_8x8];
+// 8 bit precision
+typedef imgpel h264_imgpel_8x8block_row_t[BLOCK_SIZE_8x8];
+typedef h264_imgpel_8x8block_row_t h264_imgpel_8x8block_t[BLOCK_SIZE_8x8];
+
+/* === 16x16 block typedefs === */
+// 32 bit precision
+typedef int h264_int_macroblock_row_t[MB_BLOCK_SIZE];
+typedef h264_int_macroblock_row_t h264_int_macroblock_t[MB_BLOCK_SIZE];
+// 16 bit precision
+typedef int16_t h264_short_macroblock_row_t[MB_BLOCK_SIZE];
+typedef h264_short_macroblock_row_t h264_short_macroblock_t[MB_BLOCK_SIZE];
+// 8 bit precision
+typedef imgpel h264_imgpel_macroblock_row_t[MB_BLOCK_SIZE];
+typedef h264_imgpel_macroblock_row_t h264_imgpel_macroblock_t[MB_BLOCK_SIZE];
+
+
+
+
+typedef int h264_pic_position[2];
+typedef byte h264_4x4_byte[BLOCK_SIZE][BLOCK_SIZE];
+typedef h264_4x4_byte h264_nz_coefficient[3];
+
+//! DataPartition
+typedef struct datapartition
+{
+
+  Bitstream           *bitstream;  //!< bitstream attached to this partition (pointer; storage is allocated elsewhere)
+  DecodingEnvironment de_cabac;    //!< arithmetic decoding engine state, held by value
+
+} DataPartition;
+
+//! Slice
+typedef struct slice
+{
+  struct img_par      *p_Vid;
+  struct inp_par      *p_Inp;
+  pic_parameter_set_rbsp_t *active_pps;
+  seq_parameter_set_rbsp_t *active_sps;
+
+  struct colocated_params *p_colocated;
+  struct colocated_params *Co_located_JV[MAX_PLANE];  //!< p_colocated to be used during 4:4:4 independent mode decoding
+
+  int                 mb_aff_frame_flag;
+  int                 direct_spatial_mv_pred_flag;       //!< Indicator for direct mode type (1 for Spatial, 0 for Temporal)
+  int                 num_ref_idx_l0_active;             //!< number of available list 0 references
+  int                 num_ref_idx_l1_active;             //!< number of available list 1 references
+
+  int                 qp;
+  int                 slice_qp_delta;
+  int                 qs;
+  int                 slice_qs_delta;
+  int                 slice_type;    //!< slice type
+  int                 model_number;  //!< cabac model number
+  PictureStructure    structure;     //!< Identify picture structure type
+  int                 start_mb_nr;   //!< MUST be set by NAL even in case of ei_flag == 1
+  int                 max_part_nr;
+  int                 dp_mode;       //!< data partitioning mode
+  int                 last_dquant;
+
+  //  int                 last_mb_nr;    //!< only valid when entropy coding == CABAC
+  DataPartition       *partArr;      //!< array of partitions
+  MotionInfoContexts  *mot_ctx;      //!< pointer to struct of context models for use in CABAC
+  TextureInfoContexts *tex_ctx;      //!< pointer to struct of context models for use in CABAC
+
+  int mvscale[6][MAX_REFERENCE_PICTURES];
+
+  int                 ref_pic_list_reordering_flag_l0;
+  int                 *reordering_of_pic_nums_idc_l0;
+  int                 *abs_diff_pic_num_minus1_l0;
+  int                 *long_term_pic_idx_l0;
+  int                 ref_pic_list_reordering_flag_l1;
+  int                 *reordering_of_pic_nums_idc_l1;
+  int                 *abs_diff_pic_num_minus1_l1;
+  int                 *long_term_pic_idx_l1;
+  
+
+  short               DFDisableIdc;     //!< Disable deblocking filter on slice
+  short               DFAlphaC0Offset;  //!< Alpha and C0 offset for filtering slice
+  short               DFBetaOffset;     //!< Beta offset for filtering slice
+
+  int                 pic_parameter_set_id;   //!<the ID of the picture parameter set the slice is referring to
+
+  int                 dpB_NotPresent;    //!< non-zero, if data partition B is lost
+  int                 dpC_NotPresent;    //!< non-zero, if data partition C is lost
+
+	// 32-byte aligned working buffers; the members of the anonymous union below share the same storage
+	__declspec(align(32)) h264_imgpel_macroblock_t mb_pred[MAX_PLANE];
+	__declspec(align(32)) h264_imgpel_macroblock_t mb_rec[MAX_PLANE];
+	__declspec(align(32)) union
+	{
+		__declspec(align(32)) h264_short_8x8block_t mb_rres8[MAX_PLANE][4];
+		__declspec(align(32)) h264_short_macroblock_t cof[MAX_PLANE];
+		__declspec(align(32)) h264_short_block_t cof4[MAX_PLANE][16]; // TODO: get this to work, one of these days
+		__declspec(align(32)) h264_short_macroblock_t ipcm[MAX_PLANE];
+	};
+
+  int cofu[16];
+
+  // Scaling matrix info
+  int  InvLevelScale4x4_Intra[3][6][4][4];
+  int  InvLevelScale4x4_Inter[3][6][4][4];
+  int  InvLevelScale8x8_Intra[3][6][64];
+  int  InvLevelScale8x8_Inter[3][6][64];
+
+  int  *qmatrix[12];
+
+  // Cabac
+	// TODO: we could optimize coefficient reading by storing the levels/runs instead of coefficients
+  int16_t  coeff[64]; // one more for EOB -- NOTE(review): the array has exactly 64 entries; confirm whether an extra EOB slot is really needed
+  int  coeff_ctr;
+  int  pos;
+
+  //weighted prediction
+  unsigned int apply_weights;
+  unsigned int luma_log2_weight_denom;
+  unsigned int chroma_log2_weight_denom;
+  int wp_weight[2][MAX_REFERENCE_PICTURES][3];  // weight in [list][index][component] order
+  int wp_offset[6][MAX_REFERENCE_PICTURES][3];  // offset in [list][index][component] order
+  int wbp_weight[6][MAX_REFERENCE_PICTURES][MAX_REFERENCE_PICTURES][3]; //weight in [list][fw_index][bw_index][component] order
+  int wp_round_luma;
+  int wp_round_chroma;
+
+  void (*read_CBP_and_coeffs_from_NAL) (Macroblock *currMB);
+  int  (*decode_one_component     ) (Macroblock *currMB, ColorPlane curr_plane, struct video_image *image, struct storable_picture *dec_picture);
+  int  (*readSlice                ) (struct img_par *, struct inp_par *);  
+  int  (*nal_startcode_follows    ) (struct slice*, int );
+  void (*read_motion_info_from_NAL) (Macroblock *currMB);
+  void (*read_one_macroblock      ) (Macroblock *currMB);
+  void (*interpret_mb_mode        ) (Macroblock *currMB);
+	void (*compute_colocated        ) (struct slice *currSlice, struct colocated_params *p, struct storable_picture **listX[6]);
+
+  void (*linfo_cbp_intra) (int len,int info,int *cbp, int *dummy);
+  void (*linfo_cbp_inter) (int len,int info,int *cbp, int *dummy);
+} Slice;
+
+//****************************** ~DM ***********************************
+
+// image parameters
+typedef struct img_par
+{
+  struct inp_par      *p_Inp;
+  pic_parameter_set_rbsp_t *active_pps;
+  seq_parameter_set_rbsp_t *active_sps;
+  seq_parameter_set_rbsp_t SeqParSet[MAXSPS];
+  pic_parameter_set_rbsp_t PicParSet[MAXPPS];
+
+  struct sei_params        *p_SEI;
+
+  struct old_slice_par *old_slice;
+  int number;                                 //!< frame number
+  unsigned int current_mb_nr; // bitstream order
+  unsigned int num_dec_mb;
+  short        current_slice_nr;
+  int *intra_block;
+  
+  int qp;                                     //!< quant for the current frame
+
+  int sp_switch;                              //!< 1 for switching sp, 0 for normal sp  
+  int type;                                   //!< image type INTER/INTRA
+  int width;
+  int height;
+  int width_cr;                               //!< width chroma  
+  int height_cr;                              //!< height chroma
+  int mb_x;
+  int mb_y;
+  int block_x;
+  int block_y;
+  int pix_c_x;
+  int pix_c_y;
+
+  int allrefzero;
+
+  byte **ipredmode;                  //!< prediction type [90][74]
+  h264_nz_coefficient *nz_coeff;
+  int **siblock;
+  int cod_counter;                   //!< Current count of number of skipped macroblocks in a row
+
+  int structure;                     //!< Identify picture structure type
+
+  Slice      *currentSlice;          //!< pointer to current Slice data struct
+  Macroblock *mb_data;               //!< array containing all MBs of a whole frame
+  Macroblock *mb_data_JV[MAX_PLANE]; //!< mb_data to be used for 4:4:4 independent mode
+  int colour_plane_id;               //!< colour_plane_id of the current coded slice
+  int ChromaArrayType;
+
+  // For MB level frame/field coding
+  int mb_aff_frame_flag;
+
+  // for signalling to the neighbour logic that this is a deblocker call
+  int DeblockCall;
+  byte mixedModeEdgeFlag;
+
+  // picture error concealment
+  // concealment_head points to first node in list, concealment_end points to
+  // last node in list. Initialize both to NULL, meaning no nodes in list yet
+  struct concealment_node *concealment_head;
+  struct concealment_node *concealment_end;
+
+  DecRefPicMarking_t *dec_ref_pic_marking_buffer;                    //!< stores the memory management control operations
+
+  int num_ref_idx_l0_active;             //!< number of forward reference
+  int num_ref_idx_l1_active;             //!< number of backward reference
+
+  int slice_group_change_cycle;
+
+  int redundant_pic_cnt;
+
+  unsigned int pre_frame_num;           //!< store the frame_num in the last decoded slice. For detecting gap in frame_num.
+  int non_conforming_stream;
+
+  // End JVT-D101
+  // POC200301: from unsigned int to int
+  int toppoc;      //poc for this top field // POC200301
+  int bottompoc;   //poc of bottom field of frame
+  int framepoc;    //poc of this frame // POC200301
+  unsigned int frame_num;   //frame_num for this frame
+  unsigned int field_pic_flag;
+  byte         bottom_field_flag;
+
+  //the following is for slice header syntax elements of poc
+  // for poc mode 0.
+  unsigned int pic_order_cnt_lsb;
+  int delta_pic_order_cnt_bottom;
+  // for poc mode 1.
+  int delta_pic_order_cnt[3];
+
+  // ////////////////////////
+  // for POC mode 0:
+  signed   int PrevPicOrderCntMsb;
+  unsigned int PrevPicOrderCntLsb;
+  signed   int PicOrderCntMsb;
+
+  // for POC mode 1:
+  unsigned int AbsFrameNum;
+  signed int ExpectedPicOrderCnt, PicOrderCntCycleCnt, FrameNumInPicOrderCntCycle;
+  unsigned int PreviousFrameNum, FrameNumOffset;
+  int ExpectedDeltaPerPicOrderCntCycle;
+  int PreviousPOC, ThisPOC;
+  int PreviousFrameNumOffset;
+  // /////////////////////////
+
+  int idr_flag;
+  int nal_reference_idc;                       //!< nal_reference_idc from NAL unit
+
+  int idr_pic_id;
+
+  int MaxFrameNum;
+
+  unsigned int PicWidthInMbs;
+  unsigned int PicHeightInMapUnits;
+  unsigned int FrameHeightInMbs;
+  unsigned int PicHeightInMbs;
+  unsigned int PicSizeInMbs;
+  unsigned int FrameSizeInMbs;
+  unsigned int oldFrameSizeInMbs;
+
+  int no_output_of_prior_pics_flag;
+  int long_term_reference_flag;
+  int adaptive_ref_pic_buffering_flag;
+
+  int last_has_mmco_5;
+  int last_pic_bottom_field;
+
+  // Fidelity Range Extensions Stuff
+  short bitdepth_luma;
+  short bitdepth_chroma;
+  int bitdepth_scale[2];
+  int bitdepth_luma_qp_scale;
+  int bitdepth_chroma_qp_scale;
+  unsigned int dc_pred_value_comp[MAX_PLANE]; //!< component value for DC prediction (depends on component pel bit depth)
+  int max_pel_value_comp[MAX_PLANE];       //!< max value that one picture element (pixel) can take (depends on pic_unit_bitdepth)
+  int Transform8x8Mode;
+  int profile_idc;
+  int yuv_format;
+  int lossless_qpprime_flag;
+  int num_blk8x8_uv;
+  int num_uv_blocks;
+  int num_cdc_coeff;
+  int mb_cr_size_x;
+  int mb_cr_size_y;
+  int mb_cr_size_x_blk;
+  int mb_cr_size_y_blk;
+  int mb_size[3][2];                         //!< component macroblock dimensions
+  int mb_size_blk[3][2];                     //!< component macroblock dimensions 
+  int mb_size_shift[3][2];
+  int subpel_x;
+  int subpel_y;
+  int shiftpel_x;
+  int shiftpel_y;
+
+  int max_vmv_r;                             //!< maximum vertical motion vector range in luma quarter frame pixel units for the current level_idc
+  int max_mb_vmv_r;                          //!< maximum vertical motion vector range in luma quarter pixel units for the current level_idc
+
+  // picture error concealment
+  int last_ref_pic_poc;
+  int ref_poc_gap;
+  int poc_gap;
+  int earlier_missing_poc;
+  unsigned int frame_to_conceal;
+  int IDR_concealment_flag;
+  int conceal_slice_type;
+
+  // random access point decoding
+  int recovery_point;
+  int recovery_point_found;
+  int recovery_frame_cnt;
+  int recovery_frame_num;
+  int recovery_poc;
+
+  int separate_colour_plane_flag;
+
+  int frame_number;
+  int init_bl_done;
+
+  // Redundant slices. Should be moved to another structure and allocated only if extended profile
+  unsigned int previous_frame_num; //!< frame number of previous slice
+  int ref_flag[17];                //!< 0: i-th previous frame is incorrect
+  //!< non-zero: i-th previous frame is correct
+  int Is_primary_correct;          //!< if primary frame is correct, 0: incorrect
+  int Is_redundant_correct;        //!< if redundant frame is correct, 0:incorrect
+  int redundant_slice_ref_idx;     //!< reference index of redundant slice
+
+  //FILE *p_log;                     //!< SNR file
+  int LastAccessUnitExists;
+  int NALUCount;
+
+  Boolean global_init_done;
+
+  int *qp_per_matrix;
+  int *qp_rem_matrix;
+
+  struct frame_store *last_out_fs;
+  int pocs_in_dpb[100];
+
+
+  struct storable_picture *dec_picture;
+  struct storable_picture *dec_picture_JV[MAX_PLANE];  //!< dec_picture to be used during 4:4:4 independent mode decoding
+  struct storable_picture *no_reference_picture; //!< dummy storable picture for recovery point
+  struct storable_picture **listX[6];
+
+  // Error parameters
+  struct object_buffer  *erc_object_list;
+  struct ercVariables_s *erc_errorVar;
+
+  int erc_mvperMB;
+  struct img_par *erc_img;
+  int ec_flag[SE_MAX_ELEMENTS];        //!< array to set errorconcealment
+
+	struct memory_input_struct *mem_input;
+
+  struct frame_store *out_buffer;
+
+  struct storable_picture *pending_output;
+  int    pending_output_state;
+  int    recovery_flag;
+
+  // dpb
+  struct decoded_picture_buffer *p_Dpb;
+
+  char listXsize[6];
+  // report
+  char cslice_type[9];  
+  // FMO
+  int *MbToSliceGroupMap;
+  int *MapUnitToSliceGroupMap;
+  int  NumberOfSliceGroups;    // the number of slice groups -1 (0 == scan order, 7 == maximum)
+
+#if (ENABLE_OUTPUT_TONEMAPPING)
+  struct tone_mapping_struct_s *seiToneMapping;
+#endif
+
+	// benski> buffer of storable pictures ready for output.
+	// might be able to optimize a tad by making a ringbuffer, but i doubt it matters
+	struct storable_picture **out_pictures;
+	size_t size_out_pictures;
+	size_t num_out_pictures;
+
+	ImageCache image_cache[2]; // [0] is luma [1] is chroma (shared for both planes)
+	MotionCache motion_cache;
+
+	h264_pic_position *PicPos; //! Helper array to access macroblock positions. 
+
+	NALU_t *nalu; // a cache so we don't re-alloc every time
+
+  void (*getNeighbour)        (const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+	void (*getNeighbourPX_NoPos)(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+	void (*getNeighbourXP_NoPos)(const Macroblock *currMB, int xN, int yN, const int mb_size[2], PixelPos *pix);
+	void (*getNeighbourLuma)    (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourPXLuma)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourXPLuma)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourLeftLuma)(const Macroblock *currMB, PixelPos *pix);
+	void (*getNeighbourNXLuma)  (const Macroblock *currMB, int yN, PixelPos *pix); // xN<0, yN full range
+	void (*getNeighbourLeft)    (const Macroblock *currMB, const int mb_size[2], PixelPos *pix); // xN<0, yN=0
+	void (*getNeighbourUp)      (const Macroblock *currMB, const int mb_size[2], PixelPos *pix); // xN=0, yN<0
+	void (*getNeighbourNX)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN full range
+	void (*getNeighbourNP)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN>=0
+	void (*getNeighbourNPChromaNB)(const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN<0, yN>=0
+	void (*getNeighbour0X)      (const Macroblock *currMB, int yN, const int mb_size[2], PixelPos *pix); // xN=0, yN full range
+	void (*getNeighbour0XLuma)      (const Macroblock *currMB, int yN, PixelPos *pix); // xN=0, yN full range
+	void (*getNeighbourX0)      (const Macroblock *currMB, int xN, const int mb_size[2], PixelPos *pix); // xN full range, yN = 0
+	void (*getNeighbourUpLuma)  (const Macroblock *currMB, PixelPos *pix); // xN=0, yN<0
+	void (*getNeighbourNPLumaNB)(const Macroblock *currMB, int yN, PixelPos *pix);
+	void (*getNeighbourPXLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourPXLumaNB_NoPos)(const Macroblock *currMB, int yN, PixelPos *pix);
+	void (*getNeighbourPPLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourXPLumaNB)  (const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+	void (*getNeighbourXPLumaNB_NoPos)(const Macroblock *currMB, int xN, int yN, PixelPos *pix);
+  void (*get_mb_block_pos) (const h264_pic_position *PicPos, int mb_addr, short *x, short *y);
+  void (*GetStrength)      (byte Strength[16], Macroblock *MbQ, int dir,int edge, int mvlimit, struct storable_picture *p);
+  void (*EdgeLoopLuma)     (ColorPlane pl, struct video_image *image, const byte Strength[16], Macroblock *MbQ, int dir, int edge, struct storable_picture *p);
+  void (*EdgeLoopChroma)   (struct video_image *image, byte Strength[16], Macroblock *MbQ, int dir, int edge, int uv, struct storable_picture *p);
+} VideoParameters;
+
+// Decoder input parameters (original note: "input parameters from configuration file").
+typedef struct inp_par
+{
+  int intra_profile_deblocking;               //!< Loop filter usage determined by flags and parameters in bitstream
+
+  // Output sequence format related variables
+  FrameFormat output;                   //!< output related information
+
+#ifdef _LEAKYBUCKET_
+  unsigned long R_decoder;                //!< Decoder Rate in HRD Model (these four members exist only when _LEAKYBUCKET_ is defined)
+  unsigned long B_decoder;                //!< Decoder Buffer size in HRD model
+  unsigned long F_decoder;                //!< Decoder Initial buffer fullness in HRD model
+  char LeakyBucketParamFile[FILE_NAME_SIZE];         //!< LeakyBucketParamFile: char buffer of FILE_NAME_SIZE (presumably a file name -- confirm)
+#endif
+
+  // picture error concealment (offsets of the members below depend on whether _LEAKYBUCKET_ is defined)
+  int ref_poc_gap;                            //!< NOTE(review): presumably the expected POC gap between reference pictures -- confirm; VideoParameters has a member of the same name
+  int poc_gap;                                //!< NOTE(review): presumably the expected POC gap between consecutive pictures -- confirm; VideoParameters has a member of the same name
+} InputParameters;
+
+typedef struct old_slice_par   // NOTE(review): presumably a snapshot of the previous slice's identifying fields, kept for comparison with the next slice -- confirm against users
+{
+   unsigned field_pic_flag;   //!< same name and type as VideoParameters.field_pic_flag
+   unsigned frame_num;        //!< same name and type as VideoParameters.frame_num
+   int      nal_ref_idc;      //!< NOTE(review): presumably mirrors VideoParameters.nal_reference_idc -- confirm
+   unsigned pic_oder_cnt_lsb; //!< "oder" is a misspelling of "order"; kept because the member name is part of the interface
+   int      delta_pic_oder_cnt_bottom; //!< same "oder" misspelling, kept for the same reason
+   int      delta_pic_order_cnt[2];    //!< two entries here, whereas VideoParameters.delta_pic_order_cnt is declared with 3
+   byte     bottom_field_flag;         //!< same name and type as VideoParameters.bottom_field_flag
+   byte     idr_flag;                  //!< stored as byte here; VideoParameters.idr_flag is an int
+   int      idr_pic_id;                //!< same name and type as VideoParameters.idr_pic_id
+   int      pps_id;   //!< NOTE(review): presumably the picture parameter set id of the slice -- confirm
+} OldSliceParams;
+
+typedef struct decoder_params   // Bundles the two decoder state pointers; ownership and lifetime of the pointees are not visible here
+{
+  InputParameters   *p_Inp;          //!< Input Parameters (pointer to an InputParameters instance)
+  VideoParameters   *p_Vid;          //!< Image Parameters (pointer to a VideoParameters instance)
+  
+} DecoderParams;
+
+#ifdef TRACE
+extern FILE *p_trace; //!< Trace file (declaration only, visible only when TRACE is defined; the definition lives elsewhere)
+extern int bitcounter; //!< NOTE(review): presumably a running bit count used by the trace output -- confirm
+#endif
+
+// Prototypes for functions whose definitions live outside this header
+
+extern void error(char *text, int code);  // NOTE(review): presumably reports text and terminates with code -- confirm; text is not const-qualified
+
+// dynamic mem allocation (both functions take the VideoParameters pointer they operate on)
+extern int  init_global_buffers(VideoParameters *p_Vid);  // NOTE(review): meaning of the int result is not visible here
+extern void free_global_buffers(VideoParameters *p_Vid);
+
+extern int RBSPtoSODB(byte *streamBuffer, int last_byte_pos);  // NOTE(review): buffer is non-const, so presumably converted in place -- confirm
+extern int EBSPtoRBSP(byte *streamBuffer, int end_bytepos);    // NOTE(review): same in-place assumption; confirm what the int result denotes
+
+void FreePartition (DataPartition *dp, int n);  // NOTE(review): n presumably matches the count given to AllocPartition -- confirm
+DataPartition *AllocPartition(int n);
+
+void tracebits(const char *trace_str,  int len,  int info,int value1);  // declared unconditionally, unlike p_trace/bitcounter which are guarded by TRACE
+void tracebits2(const char *trace_str, int len, int info);
+
+unsigned CeilLog2   ( unsigned uiVal);  // NOTE(review): presumably ceil(log2(uiVal)) -- confirm
+unsigned CeilLog2_sf( unsigned uiVal);  // NOTE(review): how the _sf variant differs is not visible here
+
+// For 4:4:4 independent mode
+extern void change_plane_JV( VideoParameters *p_Vid, int nplane );  // NOTE(review): nplane presumably indexes dec_picture_JV[MAX_PLANE] -- confirm
+extern void make_frame_picture_JV(VideoParameters *p_Vid);
+
+
+#endif
+
+