diff options
Diffstat (limited to 'Src/tataki/blending/blending.h')
-rw-r--r-- | Src/tataki/blending/blending.h | 542 |
1 files changed, 542 insertions, 0 deletions
diff --git a/Src/tataki/blending/blending.h b/Src/tataki/blending/blending.h new file mode 100644 index 00000000..238ff08f --- /dev/null +++ b/Src/tataki/blending/blending.h @@ -0,0 +1,542 @@ +#ifndef _BLENDING_H_ +#define _BLENDING_H_ + +//#include <bfc/common.h> +#include <bfc/platform/types.h> +#include <tataki/export.h> + +#ifdef _WIN64 +#define NO_MMX +#endif + +class TATAKIAPI Blenders +{ +public: + static void init(); + static unsigned int inline BLEND_ADJ1(unsigned int a, unsigned int b, int alpha); + static unsigned int inline BLEND_ADJ2(unsigned int a, unsigned int b); + static unsigned int inline BLEND_ADJ3(unsigned int a, unsigned int b, int alpha); + static unsigned int inline BLEND_MUL(unsigned int a, int v); + static unsigned int inline BLEND_AVG(unsigned int a, unsigned int b); + static unsigned int inline BLEND4(unsigned int *p1, unsigned int w, int xp, int yp); + +#ifndef NO_MMX + static int inline MMX_AVAILABLE() { return mmx_available; } + static unsigned int inline BLEND_ADJ1_MMX(unsigned int a, unsigned int b, int alpha); + static unsigned int inline BLEND_ADJ2_MMX(unsigned int a, unsigned int b); + static unsigned int inline BLEND_ADJ3_MMX(unsigned int a, unsigned int b, int alpha); + static unsigned int inline BLEND_MUL_MMX(unsigned int a, int v); + static unsigned int inline BLEND_AVG_MMX(unsigned int a, unsigned int b); + static unsigned int inline BLEND4_MMX(unsigned int *p1, unsigned int w, int xp, int yp); + static void inline BLEND_MMX_END() + { +#if defined(WIN32) && !defined(_WIN64) + if (mmx_available) __asm emms; +#endif +#ifdef LINUX +if (mmx_available) __asm__ volatile ( "emms" : : ); +#endif + + } +#endif + +//private: + static uint8_t alphatable[256][256]; +#ifndef NO_MMX + static int mmx_available; +#endif +}; + + +// NON MMX + +// average blend of a and b. +unsigned int inline Blenders::BLEND_AVG(unsigned int a, unsigned int b) +{ + return ((a >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))) + ((b >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))); +} + + +// multiplies 32 bit color A by scalar V (0-255) +unsigned int inline Blenders::BLEND_MUL(unsigned int a, int v) +{ + register int t; + t = Blenders::alphatable[a & 0xFF][v]; + t |= Blenders::alphatable[(a & 0xFF00) >> 8][v] << 8; + t |= Blenders::alphatable[(a & 0xFF0000) >> 16][v] << 16; + t |= Blenders::alphatable[(a & 0xFF000000) >> 24][v] << 24; + return t; +} + + +// V is scalar (0-255), (1.0-V)*b + V*a +unsigned int inline Blenders::BLEND_ADJ1(unsigned int a, unsigned int b, int v) +{ + register int t; + t = Blenders::alphatable[b & 0xFF][0xFF - v] + Blenders::alphatable[a & 0xFF][v]; + t |= (Blenders::alphatable[(b & 0xFF00) >> 8][0xFF - v] + Blenders::alphatable[(a & 0xFF00) >> 8][v]) << 8; + t |= (Blenders::alphatable[(b & 0xFF0000) >> 16][0xFF - v] + Blenders::alphatable[(a & 0xFF0000) >> 16][v]) << 16; + t |= (Blenders::alphatable[(b & 0xFF000000) >> 24][0xFF - v] + Blenders::alphatable[(a & 0xFF000000) >> 24][v]) << 24; + return t; +} + +// returns a*(1.0-Alpha(b)) + b +unsigned int inline Blenders::BLEND_ADJ2(unsigned int a, unsigned int b) +{ + register int t, z; + int v = 0xff - ((b >> 24) & 0xff); + t = Blenders::alphatable[a & 0xFF][v] + (b & 0xFF); + if (t > 0xFF) t = 0xff; + z = (Blenders::alphatable[(a & 0xFF00) >> 8][v] << 8) + (b & 0xFF00); + if (z > 0xFF00) z = 0xff00; + t |= z; + z = (Blenders::alphatable[(a & 0xFF0000) >> 16][v] << 16) + ((b & 0xFF0000)); + if (z > 0xFF0000) z = 0xff0000; + t |= z; + z = (Blenders::alphatable[(a & 0xFF000000) >> 24][v]) + ((b & 0xFF000000) >> 24); + if (z > 0xFF) z = 0xff; + return t | (z << 24); +} + +// returns a*(1-Alpha(b)*W) + b*W, clamped (W is scalar 0-0xff). +unsigned int inline Blenders::BLEND_ADJ3(unsigned int a, unsigned int b, int w) +{ + register int t, z; + int v = 0xff - Blenders::alphatable[(b >> 24) & 0xff][w]; + + t = Blenders::alphatable[a & 0xFF][v] + Blenders::alphatable[b & 0xFF][w]; + if (t > 0xFF) t = 0xFF; + z = Blenders::alphatable[(a & 0xFF00) >> 8][v] + Blenders::alphatable[(b & 0xFF00) >> 8][w]; + if (z > 0xFF) z = 0xFF; + t |= z << 8; + z = Blenders::alphatable[(a & 0xFF0000) >> 16][v] + Blenders::alphatable[(b & 0xFF0000) >> 16][w]; + if (z > 0xFF) z = 0xFF; + t |= z << 16; + z = Blenders::alphatable[(a & 0xFF000000) >> 24][v] + Blenders::alphatable[(b & 0xFF000000) >> 24][w]; + if (z > 0xFF) z = 0xFF; + return t | (z << 24); +} + +unsigned int __inline Blenders::BLEND4(unsigned int *p1, unsigned int w, int xp, int yp) +{ + register int t; + uint8_t a1, a2, a3, a4; + xp = (xp >> 8) & 0xff; + yp = (yp >> 8) & 0xff; + a1 = alphatable[255 - xp][255 - yp]; + a2 = alphatable[xp][255 - yp]; + a3 = alphatable[255 - xp][yp]; + a4 = alphatable[xp][yp]; + t = alphatable[p1[0] & 0xff][a1] + alphatable[p1[1] & 0xff][a2] + alphatable[p1[w] & 0xff][a3] + alphatable[p1[w + 1] & 0xff][a4]; + t |= (alphatable[(p1[0] >> 8) & 0xff][a1] + alphatable[(p1[1] >> 8) & 0xff][a2] + alphatable[(p1[w] >> 8) & 0xff][a3] + alphatable[(p1[w + 1] >> 8) & 0xff][a4]) << 8; + t |= (alphatable[(p1[0] >> 16) & 0xff][a1] + alphatable[(p1[1] >> 16) & 0xff][a2] + alphatable[(p1[w] >> 16) & 0xff][a3] + alphatable[(p1[w + 1] >> 16) & 0xff][a4]) << 16; + t |= (alphatable[(p1[0] >> 24) & 0xff][a1] + alphatable[(p1[1] >> 24) & 0xff][a2] + alphatable[(p1[w] >> 24) & 0xff][a3] + alphatable[(p1[w + 1] >> 24) & 0xff][a4]) << 24; + return t; +} + + + + +#ifndef NO_MMX + + +#ifdef WIN32 +#pragma warning( push, 1 ) +#pragma warning(disable: 4799) +#endif + +#ifdef WIN32 +#define MMX_CONST const +#else +#define MMX_CONST +#endif + +static unsigned int MMX_CONST Blenders__mmx_revn2[2] = {0x01000100, 0x01000100}; +static unsigned int MMX_CONST Blenders__mmx_zero[2]; +static unsigned int MMX_CONST Blenders__mmx_one[2] = {1, 0}; + +#undef MMX_CONST + +/// MMX + +// average blend of a and b. +unsigned int inline Blenders::BLEND_AVG_MMX(unsigned int a, unsigned int b) +{ + return ((a >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))) + ((b >> 1)&~((1 << 7) | (1 << 15) | (1 << 23))); +} + +// multiplies 32 bit color A by scalar V (0-255) +unsigned int inline Blenders::BLEND_MUL_MMX(unsigned int a, int v) +{ +#ifdef WIN32 + __asm + { + movd mm3, [v] // VVVVVVVV + + movd mm0, [a] + packuswb mm3, mm3 // 0000HHVV + + punpcklbw mm0, [Blenders__mmx_zero] + punpcklwd mm3, mm3 // HHVVHHVV + + punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV + + pmullw mm0, mm3 + + psrlw mm0, 8 + + packuswb mm0, mm0 + + movd eax, mm0 + } +#else + __asm__ volatile ( + "movd %0, %%mm3\n" + "movd %1, %%mm0\n" + "packuswb %%mm3, %%mm3\n" + "punpcklbw (Blenders__mmx_zero), %%mm0\n" + "punpcklwd %%mm3, %%mm3\n" + "punpckldq %%mm3, %%mm3\n" + "pmullw %%mm3, %%mm0\n" + "psrlw $8, %%mm0\n" + "packuswb %%mm0, %%mm0\n" + "movd %%mm0, %%eax\n" + : + : "m" (v), "m" (a) + : "%mm0", "%mm3" ); +#endif +} + + +// V is scalar (0-255), (1.0-V)*b + V*a +unsigned int inline Blenders::BLEND_ADJ1_MMX(unsigned int a, unsigned int b, int v) +{ +#ifdef WIN32 + __asm + { + movd mm3, [v] // VVVVVVVV + + movd mm0, [a] + packuswb mm3, mm3 // 0000HHVV + + movd mm1, [b] + paddusw mm3, [Blenders__mmx_one] + + movq mm4, [Blenders__mmx_revn2] + punpcklwd mm3, mm3 // HHVVHHVV + + punpcklbw mm0, [Blenders__mmx_zero] + punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV + + punpcklbw mm1, [Blenders__mmx_zero] + psubw mm4, mm3 + + pmullw mm0, mm3 + pmullw mm1, mm4 + + paddw mm0, mm1 + + psrlw mm0, 8 + + packuswb mm0, mm0 + + movd eax, mm0 + } +#else + __asm__ volatile ( + "movd %0, %%mm3\n" + "movd %1, %%mm0\n" + "packuswb %%mm3, %%mm3\n" + "movd %2, %%mm1\n" + "paddusw (Blenders__mmx_one), %%mm3\n" + "movq (Blenders__mmx_revn2), %%mm4\n" + "punpcklwd %%mm3, %%mm3\n" + "punpcklbw (Blenders__mmx_zero), %%mm0\n" + "punpckldq %%mm3, %%mm3\n" + "punpcklbw (Blenders__mmx_zero), %%mm1\n" + "psubw %%mm3, %%mm4\n" + "pmullw %%mm3, %%mm0\n" + "pmullw %%mm4, %%mm1\n" + "paddw %%mm1, %%mm0\n" + "psrlw $8, %%mm0\n" + "packuswb %%mm0, %%mm0\n" + "movd %%mm0, %%eax\n" + : + : "m" (v), "m" (a), "m" (b) + : "%mm0", "%mm1", "%mm3", "%mm4" ); +#endif +} + +// returns a*(1.0-Alpha(b)) + b +unsigned int inline Blenders::BLEND_ADJ2_MMX(unsigned int a, unsigned int b) +{ +#ifdef WIN32 + __asm + { + movd mm3, [b] // VVVVVVVV + movq mm4, [Blenders__mmx_revn2] + + movd mm0, [a] + psrld mm3, 24 + + movd mm1, [b] + paddusw mm3, [Blenders__mmx_one] + + punpcklwd mm3, mm3 // HHVVHHVV + punpcklbw mm0, [Blenders__mmx_zero] + + punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV + punpcklbw mm1, [Blenders__mmx_zero] + + psubw mm4, mm3 + + pmullw mm0, mm4 + // stall + + // stall + + // stall + + psrlw mm0, 8 + // stall + + paddw mm0, mm1 + // stall + + packuswb mm0, mm0 + // stall + + movd eax, mm0 + } +#else + __asm__ volatile ( + "movd %1, %%mm3\n" + "movq (Blenders__mmx_revn2), %%mm4\n" + "movd %0, %%mm0\n" + "psrld $24, %%mm3\n" + "movd %1, %%mm1\n" + "paddusw (Blenders__mmx_one), %%mm3\n" + "punpcklwd %%mm3, %%mm3\n" + "punpcklbw (Blenders__mmx_zero), %%mm0\n" + "punpckldq %%mm3, %%mm3\n" + "punpcklbw (Blenders__mmx_zero), %%mm1\n" + "psubw %%mm3, %%mm4\n" + "pmullw %%mm4, %%mm0\n" + "psrlw $8, %%mm0\n" + "paddw %%mm1, %%mm0\n" + "packuswb %%mm0, %%mm0\n" + "movd %%mm0, %%eax\n" + : + : "m" (a), "m" (b) + : "%esi", "%mm0", "%mm1", "%mm3", "%mm4" ); +#endif +} + +// returns a*(1-Alpha(b)*W) + b*W, clamped (W is scalar 0-0xff). +unsigned int inline Blenders::BLEND_ADJ3_MMX(unsigned int a, unsigned int b, int w) +{ +#ifdef WIN32 + __asm + { + movd mm3, [b] // VVVVVVVV + movd mm5, [w] + + movd mm0, [a] + psrld mm3, 24 + + movd mm1, [b] + paddusw mm3, [Blenders__mmx_one] + + movq mm4, [Blenders__mmx_revn2] + pmullw mm3, mm5 + + packuswb mm5, mm5 + punpcklbw mm0, [Blenders__mmx_zero] + + punpcklwd mm5, mm5 + punpcklbw mm1, [Blenders__mmx_zero] + + psrlw mm3, 8 + punpckldq mm5, mm5 + + paddusw mm3, [Blenders__mmx_one] + + punpcklwd mm3, mm3 // HHVVHHVV + + punpckldq mm3, mm3 // HHVVHHVV HHVVHHVV + + + psubw mm4, mm3 + + pmullw mm0, mm4 + pmullw mm1, mm5 + + paddusw mm0, mm1 + + psrlw mm0, 8 + + packuswb mm0, mm0 + + movd eax, mm0 + } +#else + __asm__ volatile ( + "movd %2, %%mm3\n" + "movd %0, %%mm5\n" + "movd %1, %%mm0\n" + "psrld $24, %%mm3\n" + "movd %2, %%mm1\n" + "paddusw (Blenders__mmx_one), %%mm3\n" + "movq (Blenders__mmx_revn2), %%mm4\n" + "pmullw %%mm5, %%mm3\n" + "packuswb %%mm5, %%mm5 \n" + "punpcklbw (Blenders__mmx_zero), %%mm0\n" + "punpcklwd %%mm5, %%mm5\n" + "punpcklbw (Blenders__mmx_zero), %%mm1\n" + "psrlw $8, %%mm3\n" + "punpckldq %%mm5, %%mm5\n" + "paddusw (Blenders__mmx_one), %%mm3\n" + "punpcklwd %%mm3, %%mm3\n" + "punpckldq %%mm3, %%mm3\n" + "psubw %%mm3, %%mm4\n" + "pmullw %%mm4, %%mm0\n" + "pmullw %%mm5, %%mm1\n" + "paddusw %%mm1, %%mm0\n" + "psrlw $8, %%mm0\n" + "packuswb %%mm0, %%mm0\n" + "movd %%mm0, %%eax\n" + + : + : "m" (w), "m" (a), "m" (b) + : "%mm0", "%mm1", "%mm4", "%mm3", "%mm5" ); +#endif +} + +// does bilinear filtering. p1 is upper left pixel, w is width of framebuffer +// xp and yp's low 16 bits are used for the subpixel positioning. +unsigned int inline Blenders::BLEND4_MMX(unsigned int *p1, unsigned int w, int xp, int yp) +{ +#ifdef WIN32 + __asm + { + movd mm6, xp + mov eax, p1 + + movd mm7, yp + mov esi, w + + movq mm4, Blenders__mmx_revn2 + psrlw mm6, 8 + + movq mm5, Blenders__mmx_revn2 + psrlw mm7, 8 + + movd mm0, [eax] + punpcklwd mm6, mm6 + + movd mm1, [eax + 4] + punpcklwd mm7, mm7 + + movd mm2, [eax + esi*4] + punpckldq mm6, mm6 + + movd mm3, [eax + esi*4 + 4] + punpckldq mm7, mm7 + + punpcklbw mm0, [Blenders__mmx_zero] + psubw mm4, mm6 + + punpcklbw mm1, [Blenders__mmx_zero] + pmullw mm0, mm4 + + punpcklbw mm2, [Blenders__mmx_zero] + pmullw mm1, mm6 + + punpcklbw mm3, [Blenders__mmx_zero] + psubw mm5, mm7 + + pmullw mm2, mm4 + pmullw mm3, mm6 + + paddw mm0, mm1 + // stall (mm0) + + psrlw mm0, 8 + // stall (waiting for mm3/mm2) + + paddw mm2, mm3 + pmullw mm0, mm5 + + psrlw mm2, 8 + // stall (mm2) + + pmullw mm2, mm7 + // stall + + // stall (mm2) + + paddw mm0, mm2 + // stall + + psrlw mm0, 8 + // stall + + packuswb mm0, mm0 + // stall + + movd eax, mm0 + } +#else + __asm__ volatile ( + "movd %2, %%mm6\n" + "mov %0, %%eax\n" + "movd %3, %%mm7\n" + "mov %1, %%esi\n" + "movq (Blenders__mmx_revn2), %%mm4\n" + "psrlw $8, %%mm6\n" + "movq (Blenders__mmx_revn2), %%mm5\n" + "psrlw $8, %%mm7\n" + "movd (%%eax), %%mm0\n" + "punpcklwd %%mm6,%%mm6\n" + "movd 4(%%eax), %%mm1\n" + "punpcklwd %%mm7,%%mm7\n" + "movd (%%eax,%%esi,4), %%mm2\n" + "punpckldq %%mm6,%%mm6\n" + "movd 4(%%eax,%%esi,4), %%mm3\n" + "punpckldq %%mm7,%%mm7\n" + "punpcklbw (Blenders__mmx_zero), %%mm0\n" + "psubw %%mm6, %%mm4\n" + "punpcklbw (Blenders__mmx_zero), %%mm1\n" + "pmullw %%mm4, %%mm0\n" + "punpcklbw (Blenders__mmx_zero), %%mm2\n" + "pmullw %%mm6, %%mm1\n" + "punpcklbw (Blenders__mmx_zero), %%mm3\n" + "psubw %%mm7, %%mm5\n" + "pmullw %%mm4, %%mm2\n" + "pmullw %%mm6, %%mm3\n" + "paddw %%mm1, %%mm0\n" + "psrlw $8, %%mm0\n" + "paddw %%mm3, %%mm2\n" + "pmullw %%mm5, %%mm0\n" + "psrlw $8, %%mm2\n" + "pmullw %%mm7, %%mm2\n" + "paddw %%mm2, %%mm0\n" + "psrlw $8, %%mm0\n" + "packuswb %%mm0, %%mm0\n" + "movd %%mm0, %%eax\n" + + : + : "m" (p1), "m" (w), "m" (xp), "m" (yp) + : "%mm0", "%mm1", "%mm4", "%mm3", "%mm5" ); + +#endif +} + +#ifdef WIN32 +#pragma warning( pop ) +#endif + +#endif // ndef NO_MMX + + +#endif |