blit_argb8_sse.h

Go to the documentation of this file.
00001 /*
00002 **  ClanLib SDK
00003 **  Copyright (c) 1997-2011 The ClanLib Team
00004 **
00005 **  This software is provided 'as-is', without any express or implied
00006 **  warranty.  In no event will the authors be held liable for any damages
00007 **  arising from the use of this software.
00008 **
00009 **  Permission is granted to anyone to use this software for any purpose,
00010 **  including commercial applications, and to alter it and redistribute it
00011 **  freely, subject to the following restrictions:
00012 **
00013 **  1. The origin of this software must not be misrepresented; you must not
00014 **     claim that you wrote the original software. If you use this software
00015 **     in a product, an acknowledgment in the product documentation would be
00016 **     appreciated but is not required.
00017 **  2. Altered source versions must be plainly marked as such, and must not be
00018 **     misrepresented as being the original software.
00019 **  3. This notice may not be removed or altered from any source distribution.
00020 **
00021 **  Note: Some of the libraries ClanLib may link to may have additional
00022 **  requirements or restrictions.
00023 **
00024 **  File Author(s):
00025 **
00026 **    Magnus Norddahl
00027 */
00028 
00031 
00032 #pragma once
00033 
00034 #include "api_swrender.h"
00035 #include <emmintrin.h>
00036 
/// \brief SSE2 helper routines for loading, blending and storing 32-bit pixels.
///
/// Pixels are widened so that every 8-bit channel occupies one 16-bit lane;
/// a __m128i therefore holds the channels of two pixels.  All members are
/// static and defined inline in this header.
class CL_BlitARGB8SSE
{
public:
    /// \brief Copies 64 bits (two pixels) from src to dest.
    static void copy_pixels(unsigned int *dest, const unsigned int *src);

    /// \brief Widens one pixel into 16-bit lanes 0-3 of xmm; lanes 4-7 become zero.
    static void load_pixel(__m128i &xmm, const unsigned int &pixel);

    /// \brief Widens the two pixels at pixels[0] and pixels[1] into the eight 16-bit lanes of xmm.
    static void load_pixels(__m128i &xmm, const unsigned int *pixels);

    /// \brief Widens p1 into lanes 0-3 and p2 into lanes 4-7 of xmm.
    static void load_pixels(__m128i &xmm, const unsigned int &p1, const unsigned int &p2);

    /// \brief Loads a weighted mix of four pixels into lanes 0-3 of xmm (weights derived from ifracx/ifracy, 0x80 = full weight).
    static void load_pixel_linear(__m128i &xmm, const unsigned int &p1, const unsigned int &p2, const unsigned int &p3, const unsigned int &p4, unsigned int ifracx, unsigned int ifracy);

    /// \brief Sets every 16-bit lane of xmm to 0x0100.
    static void set_one(__m128i &xmm);

    /// \brief Sets every 16-bit lane of xmm to 0x007f.
    static void set_half(__m128i &xmm);

    /// \brief Fills both halves of xmm with the same colour; lane order per half is blue, green, red, alpha.
    static void set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha);

    /// \brief Puts colour 1 in lanes 0-3 and colour 2 in lanes 4-7; lane order per half is blue, green, red, alpha.
    static void set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2);

    /// \brief Computes src = (src * primcolor) >> 8 on every 16-bit lane.
#ifdef _MSC_VER
    static void multiply_color(__m128i &src, __m128i &primcolor);
#else
    // gcc build: primcolor is taken by value instead of by reference.
    static void multiply_color(__m128i &src, __m128i primcolor);
#endif

    /// \brief dest = (src*alpha + dest*(one - alpha) + half) >> 8; src is overwritten with src*alpha.
    static void blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);

    /// \brief dest = ((dest*(one - alpha) + half) >> 8) + src.
    static void blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half);

    /// \brief Per-lane blend of color over dest, using each lane of src as that lane's coverage.
    static void blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color);

    /// \brief Packs xmm to bytes (in place) and writes the first pixel to pixel.
    static void store_pixel(unsigned int &pixel, __m128i &xmm);

    /// \brief Packs xmm to bytes (in place) and writes two pixels to pixels[0..1].
    static void store_pixels(unsigned int *pixels, __m128i &xmm);

    /// \brief Splits two registers of packed 32-bit pixels into one register per channel.
    static void pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1);

    /// \brief Recombines per-channel registers into two registers of packed 32-bit pixels.
    static void channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha);
};

/// \brief Copies 64 bits (two pixels) from src to dest.
///
/// Neither pointer needs to be 16-byte aligned; only the low 64 bits of an
/// SSE register are loaded and stored.
inline void CL_BlitARGB8SSE::copy_pixels(unsigned int *dest, const unsigned int *src)
{
    const __m128i two_pixels = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
    _mm_storel_epi64(reinterpret_cast<__m128i *>(dest), two_pixels);
}

/// \brief Widens one pixel into 16-bit lanes 0-3 of xmm.
///
/// Byte n of pixel ends up zero-extended in lane n; lanes 4-7 are zero.
inline void CL_BlitARGB8SSE::load_pixel(__m128i &xmm, const unsigned int &pixel)
{
    const __m128i packed = _mm_cvtsi32_si128(pixel);
    xmm = _mm_unpacklo_epi8(packed, _mm_setzero_si128());
}

/// \brief Widens the two pixels at pixels[0] and pixels[1] into xmm.
///
/// pixels[0] occupies lanes 0-3 and pixels[1] lanes 4-7, one byte per lane.
inline void CL_BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int *pixels)
{
    const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pixels));
    xmm = _mm_unpacklo_epi8(packed, _mm_setzero_si128());
}

/// \brief Widens two separately addressed pixels into xmm.
///
/// p1 occupies lanes 0-3 and p2 lanes 4-7.  Both pixels are only read, so
/// both are taken by const reference; this lets callers pass const pixel
/// data or temporaries for either argument.
inline void CL_BlitARGB8SSE::load_pixels(__m128i &xmm, const unsigned int &p1, const unsigned int &p2)
{
    const __m128i packed = _mm_set_epi32(0, 0, p2, p1);
    xmm = _mm_unpacklo_epi8(packed, _mm_setzero_si128());
}
00095 
00096 inline void CL_BlitARGB8SSE::load_pixel_linear(__m128i &xmm, const unsigned int &pixel1, const unsigned int &pixel2, const unsigned int &pixel3, const unsigned int &pixel4, unsigned int ifracx, unsigned int ifracy)
00097 {
00098         __m128i src0, src1, src2, src3;
00099         __m128i frac0, frac1, frac2, frac3;
00100         __m128i fracx, inv_fracx, fracy, inv_fracy;
00101         __m128i half = _mm_set1_epi16(64);
00102         fracx = _mm_set1_epi16(ifracx);
00103         fracy = _mm_set1_epi16(ifracy);
00104         inv_fracx = _mm_set1_epi16(0x80-ifracx);
00105         inv_fracy = _mm_set1_epi16(0x80-ifracy);
00106         frac0 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, inv_fracy), 7);
00107         frac1 = _mm_srli_epi16(_mm_mullo_epi16(fracx, inv_fracy), 7);
00108         frac2 = _mm_srli_epi16(_mm_mullo_epi16(inv_fracx, fracy), 7);
00109         frac3 = _mm_srli_epi16(_mm_mullo_epi16(fracx, fracy), 7);
00110         src0 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel1), _mm_setzero_si128()), frac0);
00111         src1 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel2), _mm_setzero_si128()), frac1);
00112         src2 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel3), _mm_setzero_si128()), frac2);
00113         src3 = _mm_mullo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(pixel4), _mm_setzero_si128()), frac3);
00114         xmm = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(_mm_add_epi16(src0, src1), src2), src3), half), 7);
00115 }
00116 
00117 inline void CL_BlitARGB8SSE::set_one(__m128i &xmm)
00118 {
00119         xmm =  _mm_set1_epi16(0x0100);
00120 }
00121 
00122 inline void CL_BlitARGB8SSE::set_half(__m128i &xmm)
00123 {
00124         xmm =  _mm_set1_epi16(0x007f);
00125 }
00126 
00127 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm, unsigned short red, unsigned short green, unsigned short blue, unsigned short alpha)
00128 {
00129         xmm = _mm_set_epi16(alpha, red, green, blue, alpha, red, green, blue);
00130 }
00131 
00132 inline void CL_BlitARGB8SSE::set_color(__m128i &xmm, unsigned short r1, unsigned short g1, unsigned short b1, unsigned short a1, unsigned short r2, unsigned short g2, unsigned short b2, unsigned short a2)
00133 {
00134         xmm = _mm_set_epi16(a2, r2, g2, b2, a1, r1, g1, b1);
00135 }
00136 
00137 #ifdef _MSC_VER
00138 inline void CL_BlitARGB8SSE::multiply_color(__m128i &src, __m128i &primcolor)
00139 {
00140         src = _mm_mullo_epi16(src, primcolor);
00141         src = _mm_srli_epi16(src, 8);
00142 }
00143 #else
00144         // For some reason "primcolor" cannot be a reference on gcc
00145 inline void CL_BlitARGB8SSE::multiply_color(__m128i &src, __m128i primcolor)
00146 {
00147         src = _mm_mullo_epi16(src, primcolor);
00148         src = _mm_srli_epi16(src, 8);
00149 }
00150 #endif
00151 
/*
 * Macro form of CL_BlitARGB8SSE::multiply_color:
 *   src = (src * primcolor) >> 8 on every 16-bit lane.
 * src must be a modifiable __m128i lvalue; primcolor is any __m128i
 * expression.  Arguments are parenthesised so that expression arguments
 * expand safely.  The expansion is a brace block, as before.
 */
#define cl_blitargb8sse_multiply_color(src, primcolor) \
{ \
    (src) = _mm_srli_epi16(_mm_mullo_epi16((src), (primcolor)), 8); \
}
00157 
00158 inline void CL_BlitARGB8SSE::blend_normal(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
00159 {
00160         __m128i src_alpha, invsrc_alpha;
00161 
00162         src_alpha = src;
00163         src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
00164         src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
00165 
00166         invsrc_alpha = _mm_sub_epi16(one, src_alpha);
00167 
00168         src = _mm_mullo_epi16(src, src_alpha);
00169         dest = _mm_mullo_epi16(dest, invsrc_alpha);
00170 
00171         dest = _mm_add_epi16(dest, src);
00172         dest = _mm_add_epi16(dest, half); // round up
00173         dest = _mm_srli_epi16(dest, 8);
00174 }
00175 
/*
 * Macro form of CL_BlitARGB8SSE::blend_normal.
 *
 * For each 64-bit half, 16-bit lane 3 of src (its alpha) is broadcast to all
 * four lanes of that half, then per lane:
 *   dest = (src * alpha + dest * (one - alpha) + half) >> 8
 * dest and src must be modifiable __m128i lvalues; src is left holding
 * src * alpha.
 *
 * The temporaries carry a cl_bn_ prefix so that a caller variable named
 * src_alpha or invsrc_alpha can be passed without being shadowed, and all
 * arguments are parenthesised.  The expansion is a brace block, as before.
 */
#define cl_blitargb8sse_blend_normal(dest, src, one, half) \
{ \
    __m128i cl_bn_alpha_ = _mm_shufflehi_epi16(_mm_shufflelo_epi16((src), 0xff), 0xff); \
    __m128i cl_bn_inv_alpha_ = _mm_sub_epi16((one), cl_bn_alpha_); \
\
    (src) = _mm_mullo_epi16((src), cl_bn_alpha_); \
    (dest) = _mm_mullo_epi16((dest), cl_bn_inv_alpha_); \
\
    (dest) = _mm_add_epi16((dest), (src)); \
    (dest) = _mm_add_epi16((dest), (half)); \
    (dest) = _mm_srli_epi16((dest), 8); \
}
00193 
00194 inline void CL_BlitARGB8SSE::blend_premultiplied(__m128i &dest, __m128i &src, __m128i &one, __m128i &half)
00195 {
00196         __m128i src_alpha, invsrc_alpha;
00197 
00198         src_alpha = src;
00199         src_alpha = _mm_shufflelo_epi16(src_alpha, 0xff);
00200         src_alpha = _mm_shufflehi_epi16(src_alpha, 0xff);
00201 
00202         invsrc_alpha = _mm_sub_epi16(one, src_alpha);
00203 
00204         dest = _mm_mullo_epi16(dest, invsrc_alpha);
00205         dest = _mm_add_epi16(dest, half); // round up
00206         dest = _mm_srli_epi16(dest, 8);
00207         dest = _mm_add_epi16(dest, src);
00208 }
00209 
00210 inline void CL_BlitARGB8SSE::blend_lcd(__m128i &dest, __m128i &src, __m128i &one, __m128i &half, __m128i &color)
00211 {
00212         __m128i invsrc;
00213         invsrc = _mm_sub_epi16(one, _mm_add_epi16(_mm_srli_epi16(src, 7), src));
00214 
00215         dest = _mm_add_epi16(_mm_mullo_epi16(src, color), _mm_mullo_epi16(dest, invsrc));
00216         dest = _mm_add_epi16(dest, half); // round up
00217         dest = _mm_srli_epi16(dest, 8);
00218 }
00219 
00220 inline void CL_BlitARGB8SSE::store_pixel(unsigned int &pixel, __m128i &xmm)
00221 {
00222         xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
00223         pixel = _mm_cvtsi128_si32(xmm);
00224 }
00225 
00226 inline void CL_BlitARGB8SSE::store_pixels(unsigned int *pixels, __m128i &xmm)
00227 {
00228         xmm = _mm_packus_epi16(xmm, _mm_setzero_si128());
00229         _mm_storel_epi64((__m128i *) pixels, xmm);
00230 }
00231 
00232 inline void CL_BlitARGB8SSE::pixels_to_channels(__m128i &red, __m128i &green, __m128i &blue, __m128i &alpha, const __m128i &src0, const __m128i &src1)
00233 {
00234         __m128i alpha_mask = _mm_set1_epi32(0xff000000);
00235         __m128i red_mask = _mm_set1_epi32(0x00ff0000);
00236         __m128i green_mask = _mm_set1_epi32(0x0000ff00);
00237         __m128i blue_mask = _mm_set1_epi32(0x000000ff);
00238 
00239         alpha = _mm_srli_si128(_mm_and_si128(alpha_mask, src0), 1);
00240         alpha = _mm_or_si128(alpha, _mm_srli_si128(_mm_and_si128(alpha_mask, src1), 3));
00241 
00242         red = _mm_and_si128(red_mask, src0);
00243         red = _mm_or_si128(red, _mm_srli_si128(_mm_and_si128(red_mask, src1), 2));
00244 
00245         green = _mm_slli_si128(_mm_and_si128(green_mask, src0), 1);
00246         green = _mm_or_si128(green, _mm_srli_si128(_mm_and_si128(green_mask, src1), 1));
00247 
00248         blue = _mm_slli_si128(_mm_and_si128(blue_mask, src0), 2);
00249         blue = _mm_or_si128(blue, _mm_and_si128(blue_mask, src1));
00250 }
00251 
00252 inline void CL_BlitARGB8SSE::channels_to_pixels(__m128i &dest0, __m128i &dest1, __m128i &red, __m128i &green, __m128i &blue, __m128i &alpha)
00253 {
00254         __m128i alpha_mask = _mm_set1_epi32(0xff000000);
00255         __m128i red_mask = _mm_set1_epi32(0x00ff0000);
00256         __m128i green_mask = _mm_set1_epi32(0x0000ff00);
00257         __m128i blue_mask = _mm_set1_epi32(0x000000ff);
00258 
00259         dest0 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 1));
00260         dest1 = _mm_and_si128(alpha_mask, _mm_slli_si128(alpha, 3));
00261 
00262         dest0 = _mm_or_si128(dest0, _mm_and_si128(red_mask, red));
00263         dest1 = _mm_or_si128(dest1, _mm_and_si128(red_mask, _mm_slli_si128(red, 2)));
00264 
00265         dest0 = _mm_or_si128(dest0, _mm_and_si128(green_mask, _mm_srli_si128(green, 1)));
00266         dest1 = _mm_or_si128(dest1, _mm_and_si128(green_mask, _mm_slli_si128(green, 1)));
00267 
00268         dest0 = _mm_or_si128(dest0, _mm_and_si128(blue_mask, _mm_srli_si128(blue, 2)));
00269         dest1 = _mm_or_si128(dest1, _mm_and_si128(blue_mask, blue));
00270 }
00271 
/* Compiler-specific spelling of "align this declaration to 16 bytes". */
#ifdef _MSC_VER
#define cl_blitargb8sse_align16 __declspec(align(16))
#else
#define cl_blitargb8sse_align16 __attribute__ ((aligned(16)))
#endif

/*
 * Fetches four texels: out0 = data[x + y * width] for four coordinate pairs.
 *
 * tx and ty are __m128i values holding four 32-bit coordinates each; the
 * integer coordinate is the lane arithmetically shifted right by 16.
 * The shifted values are stored and used as unsigned int, so the index
 * arithmetic is unsigned.
 *
 * Lane order: the texel for lane 0 of tx/ty is written to lane 3 of out0,
 * lane 1 to lane 2, and so on (_mm_set_epi32 takes its first argument as the
 * highest lane).  This matches the previous behaviour.
 *
 * No bounds checking is done.  Arguments are parenthesised so that
 * expression arguments (e.g. `w + pad` for width) index correctly, and the
 * temporaries carry a cl_sn_ prefix so they cannot shadow caller variables
 * named x or y.  The expansion is a brace block, as before.
 */
#define cl_blitargb8sse_sample_nearest(out0, tx, ty, data, width) \
{ \
    cl_blitargb8sse_align16 unsigned int cl_sn_x_[4], cl_sn_y_[4]; \
    _mm_store_si128((__m128i*) cl_sn_x_, _mm_srai_epi32((tx), 16)); \
    _mm_store_si128((__m128i*) cl_sn_y_, _mm_srai_epi32((ty), 16)); \
    (out0) = _mm_set_epi32( \
        (data)[cl_sn_x_[0]+cl_sn_y_[0]*(width)], \
        (data)[cl_sn_x_[1]+cl_sn_y_[1]*(width)], \
        (data)[cl_sn_x_[2]+cl_sn_y_[2]*(width)], \
        (data)[cl_sn_x_[3]+cl_sn_y_[3]*(width)]); \
}
00293 
// The Visual C++ 2008 compiler does not optimize CL_BlitARGB8SSE::texture_repeat
// properly when it is written as an inline function (possibly because of the
// branches or the loops), so it is implemented as a macro instead.
//
// Wraps every 32-bit lane of tx into [0, width) and every lane of ty into
// [0, height): width/height is added to a lane while it is negative, then
// subtracted while the lane is not less than width/height.
//
// tx and ty must be modifiable __m128i lvalues; width and height are __m128i
// values compared lane by lane with tx and ty.  Their lanes are expected to
// be positive: with a zero lane an out-of-range coordinate never changes, so
// the loop does not terminate.
//
// Arguments are parenthesised and the temporaries carry a cl_tr_ prefix so
// they cannot shadow a caller variable named compare_result.  The expansion
// is a brace block, as before.
#define cl_blitargb8sse_texture_repeat(tx, ty, width, height) \
{ \
    for (;;) \
    { \
        __m128i cl_tr_negative_ = _mm_cmplt_epi32((tx), _mm_setzero_si128()); \
        if (_mm_movemask_epi8(cl_tr_negative_) == 0) \
            break; \
        (tx) = _mm_add_epi32((tx), _mm_and_si128(cl_tr_negative_, (width))); \
    } \
    for (;;) \
    { \
        __m128i cl_tr_in_range_ = _mm_cmplt_epi32((tx), (width)); \
        if (_mm_movemask_epi8(cl_tr_in_range_) == 0xffff) \
            break; \
        (tx) = _mm_sub_epi32((tx), _mm_andnot_si128(cl_tr_in_range_, (width))); \
    } \
    for (;;) \
    { \
        __m128i cl_tr_negative_ = _mm_cmplt_epi32((ty), _mm_setzero_si128()); \
        if (_mm_movemask_epi8(cl_tr_negative_) == 0) \
            break; \
        (ty) = _mm_add_epi32((ty), _mm_and_si128(cl_tr_negative_, (height))); \
    } \
    for (;;) \
    { \
        __m128i cl_tr_in_range_ = _mm_cmplt_epi32((ty), (height)); \
        if (_mm_movemask_epi8(cl_tr_in_range_) == 0xffff) \
            break; \
        (ty) = _mm_sub_epi32((ty), _mm_andnot_si128(cl_tr_in_range_, (height))); \
    } \
}
00332