From 31ef94db0b99f592c2c799ba76994b8451eebd85 Mon Sep 17 00:00:00 2001 From: Sam Lantinga Date: Tue, 22 Feb 2011 21:44:36 -0800 Subject: [PATCH] Re-added the 3DNow! and AltiVec instruction support. --- VisualC/SDL/SDL_VS2005.vcproj | 4 +- VisualC/SDL/SDL_VS2008.vcproj | 4 +- VisualC/SDL/SDL_VS2010.vcxproj | 6 +- configure | 369 +++++++++++- configure.in | 103 ++++ include/SDL_config.h.in | 4 +- include/SDL_config_macosx.h | 3 + include/SDL_cpuinfo.h | 38 ++ src/cpuinfo/SDL_cpuinfo.c | 91 ++- src/video/SDL_blit.c | 34 ++ src/video/SDL_blit.h | 21 +- src/video/SDL_blit_A.c | 926 ++++++++++++++++++++++++++++- src/video/SDL_blit_N.c | 863 +++++++++++++++++++++++++++ test/automated/platform/platform.c | 2 + test/testplatform.c | 4 +- 15 files changed, 2413 insertions(+), 59 deletions(-) mode change 100644 => 100755 test/testplatform.c diff --git a/VisualC/SDL/SDL_VS2005.vcproj b/VisualC/SDL/SDL_VS2005.vcproj index 75031a04a..98686eb0d 100644 --- a/VisualC/SDL/SDL_VS2005.vcproj +++ b/VisualC/SDL/SDL_VS2005.vcproj @@ -52,7 +52,7 @@ Name="VCCLCompilerTool" Optimization="0" AdditionalIncludeDirectories="..\..\include" - PreprocessorDefinitions="_DEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__" + PreprocessorDefinitions="_DEBUG;_WINDOWS" RuntimeLibrary="2" BufferSecurityCheck="false" UsePrecompiledHeader="0" @@ -231,7 +231,7 @@ InlineFunctionExpansion="1" EnableIntrinsicFunctions="false" AdditionalIncludeDirectories="..\..\include" - PreprocessorDefinitions="NDEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__" + PreprocessorDefinitions="NDEBUG;_WINDOWS" StringPooling="true" RuntimeLibrary="2" BufferSecurityCheck="false" diff --git a/VisualC/SDL/SDL_VS2008.vcproj b/VisualC/SDL/SDL_VS2008.vcproj index ce85d20a1..5c495c001 100644 --- a/VisualC/SDL/SDL_VS2008.vcproj +++ b/VisualC/SDL/SDL_VS2008.vcproj @@ -52,7 +52,7 @@ Name="VCCLCompilerTool" Optimization="0" AdditionalIncludeDirectories="..\..\include" - PreprocessorDefinitions="_DEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__" + PreprocessorDefinitions="_DEBUG;_WINDOWS" RuntimeLibrary="3" BufferSecurityCheck="false" WarningLevel="3" @@ -223,7 +223,7 @@ InlineFunctionExpansion="1" EnableIntrinsicFunctions="false" AdditionalIncludeDirectories="..\..\include" - PreprocessorDefinitions="NDEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__" + PreprocessorDefinitions="NDEBUG;_WINDOWS" StringPooling="true" RuntimeLibrary="2" BufferSecurityCheck="false" diff --git a/VisualC/SDL/SDL_VS2010.vcxproj b/VisualC/SDL/SDL_VS2010.vcxproj index 2f2b4aa19..5da0fa57e 100644 --- a/VisualC/SDL/SDL_VS2010.vcxproj +++ b/VisualC/SDL/SDL_VS2010.vcxproj @@ -83,7 +83,7 @@ Disabled ..\..\include;%(AdditionalIncludeDirectories) - _DEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__;%(PreprocessorDefinitions) + _DEBUG;_WINDOWS;%(PreprocessorDefinitions) MultiThreadedDebugDLL false @@ -152,7 +152,7 @@ OnlyExplicitInline false ..\..\include;%(AdditionalIncludeDirectories) - NDEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__;%(PreprocessorDefinitions) + NDEBUG;_WINDOWS;%(PreprocessorDefinitions) true MultiThreadedDLL false @@ -446,4 +446,4 @@ - \ No newline at end of file + diff --git a/configure b/configure index 5f6143681..c594b1ec5 100755 --- a/configure +++ b/configure @@ -1514,8 +1514,10 @@ Optional Features: --enable-ssemath Allow GCC to use SSE floating point math [default=no] --enable-mmx use MMX assembly routines [default=yes] + --enable-3dnow use MMX assembly routines [default=yes] --enable-sse use SSE assembly routines [default=yes] --enable-sse2 use SSE2 assembly routines [default=no] + --enable-altivec use Altivec assembly routines [default=yes] --enable-oss support the OSS audio API [default=yes] --enable-alsa support the ALSA audio API [default=yes] --disable-alsatest Do not try to compile and run a test Alsa program @@ -3768,13 +3770,13 @@ if test "${lt_cv_nm_interface+set}" = set; then else lt_cv_nm_interface="BSD nm" echo "int some_variable = 0;" > conftest.$ac_ext - (eval echo "\"\$as_me:3771: $ac_compile\"" >&5) + (eval echo "\"\$as_me:3773: $ac_compile\"" >&5) (eval "$ac_compile" 2>conftest.err) cat conftest.err >&5 - (eval echo "\"\$as_me:3774: $NM \\\"conftest.$ac_objext\\\"\"" >&5) + (eval echo "\"\$as_me:3776: $NM \\\"conftest.$ac_objext\\\"\"" >&5) (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out) cat conftest.err >&5 - (eval echo "\"\$as_me:3777: output\"" >&5) + (eval echo "\"\$as_me:3779: output\"" >&5) cat conftest.out >&5 if $GREP 'External.*some_variable' conftest.out > /dev/null; then lt_cv_nm_interface="MS dumpbin" @@ -5001,7 +5003,7 @@ ia64-*-hpux*) ;; *-*-irix6*) # Find out which ABI we are using. - echo '#line 5004 "configure"' > conftest.$ac_ext + echo '#line 5006 "configure"' > conftest.$ac_ext if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5 (eval $ac_compile) 2>&5 ac_status=$? @@ -7162,11 +7164,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:7165: $lt_compile\"" >&5) + (eval echo "\"\$as_me:7167: $lt_compile\"" >&5) (eval "$lt_compile" 2>conftest.err) ac_status=$? cat conftest.err >&5 - echo "$as_me:7169: \$? = $ac_status" >&5 + echo "$as_me:7171: \$? = $ac_status" >&5 if (exit $ac_status) && test -s "$ac_outfile"; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings other than the usual output. @@ -7501,11 +7503,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:7504: $lt_compile\"" >&5) + (eval echo "\"\$as_me:7506: $lt_compile\"" >&5) (eval "$lt_compile" 2>conftest.err) ac_status=$? cat conftest.err >&5 - echo "$as_me:7508: \$? = $ac_status" >&5 + echo "$as_me:7510: \$? = $ac_status" >&5 if (exit $ac_status) && test -s "$ac_outfile"; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings other than the usual output. @@ -7606,11 +7608,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:7609: $lt_compile\"" >&5) + (eval echo "\"\$as_me:7611: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:7613: \$? = $ac_status" >&5 + echo "$as_me:7615: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -7661,11 +7663,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:7664: $lt_compile\"" >&5) + (eval echo "\"\$as_me:7666: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:7668: \$? = $ac_status" >&5 + echo "$as_me:7670: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -10419,7 +10421,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 10422 "configure" +#line 10424 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -10515,7 +10517,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 10518 "configure" +#line 10520 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -14197,11 +14199,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:14200: $lt_compile\"" >&5) + (eval echo "\"\$as_me:14202: $lt_compile\"" >&5) (eval "$lt_compile" 2>conftest.err) ac_status=$? cat conftest.err >&5 - echo "$as_me:14204: \$? = $ac_status" >&5 + echo "$as_me:14206: \$? = $ac_status" >&5 if (exit $ac_status) && test -s "$ac_outfile"; then # The compiler can only warn and ignore the option if not recognized # So say no if there are warnings other than the usual output. @@ -14296,11 +14298,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:14299: $lt_compile\"" >&5) + (eval echo "\"\$as_me:14301: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:14303: \$? = $ac_status" >&5 + echo "$as_me:14305: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -14348,11 +14350,11 @@ else -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \ -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \ -e 's:$: $lt_compiler_flag:'` - (eval echo "\"\$as_me:14351: $lt_compile\"" >&5) + (eval echo "\"\$as_me:14353: $lt_compile\"" >&5) (eval "$lt_compile" 2>out/conftest.err) ac_status=$? cat out/conftest.err >&5 - echo "$as_me:14355: \$? = $ac_status" >&5 + echo "$as_me:14357: \$? = $ac_status" >&5 if (exit $ac_status) && test -s out/conftest2.$ac_objext then # The compiler can only warn and ignore the option if not recognized @@ -20693,6 +20695,79 @@ echo "${ECHO_T}$have_gcc_mmx" >&6; } fi fi + # Check whether --enable-3dnow was given. +if test "${enable_3dnow+set}" = set; then + enableval=$enable_3dnow; +else + enable_3dnow=yes +fi + + if test x$enable_3dnow = xyes; then + save_CFLAGS="$CFLAGS" + have_gcc_3dnow=no + { echo "$as_me:$LINENO: checking for GCC -m3dnow option" >&5 +echo $ECHO_N "checking for GCC -m3dnow option... $ECHO_C" >&6; } + amd3dnow_CFLAGS="-m3dnow" + CFLAGS="$save_CFLAGS $amd3dnow_CFLAGS" + + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + + #include + #ifndef __3dNOW__ + #error Assembler CPP flag not enabled + #endif + +int +main () +{ + + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + + have_gcc_3dnow=yes + +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { echo "$as_me:$LINENO: result: $have_gcc_3dnow" >&5 +echo "${ECHO_T}$have_gcc_3dnow" >&6; } + CFLAGS="$save_CFLAGS" + + if test x$have_gcc_3dnow = xyes; then + EXTRA_CFLAGS="$EXTRA_CFLAGS $amd3dnow_CFLAGS" + fi + fi + # Check whether --enable-sse was given. if test "${enable_sse+set}" = set; then enableval=$enable_sse; @@ -20856,6 +20931,260 @@ echo "${ECHO_T}$have_gcc_sse2" >&6; } EXTRA_CFLAGS="$EXTRA_CFLAGS $sse2_CFLAGS" fi fi + + # Check whether --enable-altivec was given. +if test "${enable_altivec+set}" = set; then + enableval=$enable_altivec; +else + enable_altivec=yes +fi + + if test x$enable_altivec = xyes; then + save_CFLAGS="$CFLAGS" + have_gcc_altivec=no + have_altivec_h_hdr=no + altivec_CFLAGS="-maltivec" + CFLAGS="$save_CFLAGS $altivec_CFLAGS" + + { echo "$as_me:$LINENO: checking for Altivec with GCC altivec.h and -maltivec option" >&5 +echo $ECHO_N "checking for Altivec with GCC altivec.h and -maltivec option... $ECHO_C" >&6; } + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + + #include + vector unsigned int vzero() { + return vec_splat_u32(0); + } + +int +main () +{ + + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + + have_gcc_altivec=yes + have_altivec_h_hdr=yes + +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5 +echo "${ECHO_T}$have_gcc_altivec" >&6; } + + if test x$have_gcc_altivec = xno; then + { echo "$as_me:$LINENO: checking for Altivec with GCC -maltivec option" >&5 +echo $ECHO_N "checking for Altivec with GCC -maltivec option... $ECHO_C" >&6; } + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + + vector unsigned int vzero() { + return vec_splat_u32(0); + } + +int +main () +{ + + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + + have_gcc_altivec=yes + +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5 +echo "${ECHO_T}$have_gcc_altivec" >&6; } + fi + + if test x$have_gcc_altivec = xno; then + { echo "$as_me:$LINENO: checking for Altivec with GCC altivec.h and -faltivec option" >&5 +echo $ECHO_N "checking for Altivec with GCC altivec.h and -faltivec option... $ECHO_C" >&6; } + altivec_CFLAGS="-faltivec" + CFLAGS="$save_CFLAGS $altivec_CFLAGS" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + + #include + vector unsigned int vzero() { + return vec_splat_u32(0); + } + +int +main () +{ + + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + + have_gcc_altivec=yes + have_altivec_h_hdr=yes + +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5 +echo "${ECHO_T}$have_gcc_altivec" >&6; } + fi + + if test x$have_gcc_altivec = xno; then + { echo "$as_me:$LINENO: checking for Altivec with GCC -faltivec option" >&5 +echo $ECHO_N "checking for Altivec with GCC -faltivec option... $ECHO_C" >&6; } + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + + vector unsigned int vzero() { + return vec_splat_u32(0); + } + +int +main () +{ + + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + + have_gcc_altivec=yes + +else + echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5 +echo "${ECHO_T}$have_gcc_altivec" >&6; } + fi + CFLAGS="$save_CFLAGS" + + if test x$have_gcc_altivec = xyes; then + cat >>confdefs.h <<\_ACEOF +#define SDL_ALTIVEC_BLITTERS 1 +_ACEOF + + if test x$have_altivec_h_hdr = xyes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_ALTIVEC_H 1 +_ACEOF + + fi + EXTRA_CFLAGS="$EXTRA_CFLAGS $altivec_CFLAGS" + fi + fi fi CheckOSS() diff --git a/configure.in b/configure.in index 38bd40a28..49c4f41a2 100644 --- a/configure.in +++ b/configure.in @@ -501,6 +501,33 @@ AC_HELP_STRING([--enable-mmx], [use MMX assembly routines [[default=yes]]]), fi fi + AC_ARG_ENABLE(3dnow, +AC_HELP_STRING([--enable-3dnow], [use MMX assembly routines [[default=yes]]]), + , enable_3dnow=yes) + if test x$enable_3dnow = xyes; then + save_CFLAGS="$CFLAGS" + have_gcc_3dnow=no + AC_MSG_CHECKING(for GCC -m3dnow option) + amd3dnow_CFLAGS="-m3dnow" + CFLAGS="$save_CFLAGS $amd3dnow_CFLAGS" + + AC_TRY_COMPILE([ + #include + #ifndef __3dNOW__ + #error Assembler CPP flag not enabled + #endif + ],[ + ],[ + have_gcc_3dnow=yes + ]) + AC_MSG_RESULT($have_gcc_3dnow) + CFLAGS="$save_CFLAGS" + + if test x$have_gcc_3dnow = xyes; then + EXTRA_CFLAGS="$EXTRA_CFLAGS $amd3dnow_CFLAGS" + fi + fi + AC_ARG_ENABLE(sse, AC_HELP_STRING([--enable-sse], [use SSE assembly routines [[default=yes]]]), , enable_sse=yes) @@ -572,6 +599,82 @@ AC_HELP_STRING([--enable-sse2], [use SSE2 assembly routines [[default=no]]]), EXTRA_CFLAGS="$EXTRA_CFLAGS $sse2_CFLAGS" fi fi + + AC_ARG_ENABLE(altivec, +AC_HELP_STRING([--enable-altivec], [use Altivec assembly routines [[default=yes]]]), + , enable_altivec=yes) + if test x$enable_altivec = xyes; then + save_CFLAGS="$CFLAGS" + have_gcc_altivec=no + have_altivec_h_hdr=no + altivec_CFLAGS="-maltivec" + CFLAGS="$save_CFLAGS $altivec_CFLAGS" + + AC_MSG_CHECKING(for Altivec with GCC altivec.h and -maltivec option) + AC_TRY_COMPILE([ + #include + vector unsigned int vzero() { + return vec_splat_u32(0); + } + ],[ + ],[ + have_gcc_altivec=yes + have_altivec_h_hdr=yes + ]) + AC_MSG_RESULT($have_gcc_altivec) + + if test x$have_gcc_altivec = xno; then + AC_MSG_CHECKING(for Altivec with GCC -maltivec option) + AC_TRY_COMPILE([ + vector unsigned int vzero() { + return vec_splat_u32(0); + } + ],[ + ],[ + have_gcc_altivec=yes + ]) + AC_MSG_RESULT($have_gcc_altivec) + fi + + if test x$have_gcc_altivec = xno; then + AC_MSG_CHECKING(for Altivec with GCC altivec.h and -faltivec option) + altivec_CFLAGS="-faltivec" + CFLAGS="$save_CFLAGS $altivec_CFLAGS" + AC_TRY_COMPILE([ + #include + vector unsigned int vzero() { + return vec_splat_u32(0); + } + ],[ + ],[ + have_gcc_altivec=yes + have_altivec_h_hdr=yes + ]) + AC_MSG_RESULT($have_gcc_altivec) + fi + + if test x$have_gcc_altivec = xno; then + AC_MSG_CHECKING(for Altivec with GCC -faltivec option) + AC_TRY_COMPILE([ + vector unsigned int vzero() { + return vec_splat_u32(0); + } + ],[ + ],[ + have_gcc_altivec=yes + ]) + AC_MSG_RESULT($have_gcc_altivec) + fi + CFLAGS="$save_CFLAGS" + + if test x$have_gcc_altivec = xyes; then + AC_DEFINE(SDL_ALTIVEC_BLITTERS) + if test x$have_altivec_h_hdr = xyes; then + AC_DEFINE(HAVE_ALTIVEC_H) + fi + EXTRA_CFLAGS="$EXTRA_CFLAGS $altivec_CFLAGS" + fi + fi fi dnl See if the OSS audio interface is supported diff --git a/include/SDL_config.h.in b/include/SDL_config.h.in index c5e99daad..67d5f534f 100644 --- a/include/SDL_config.h.in +++ b/include/SDL_config.h.in @@ -34,7 +34,7 @@ /* Make sure that this isn't included by Visual C++ */ #ifdef _MSC_VER -#error You should copy include/SDL_config.h.default to include/SDL_config.h +#error You should run hg revert SDL_config.h #endif /* C language features */ @@ -82,6 +82,7 @@ #undef HAVE_MATH_H #undef HAVE_ICONV_H #undef HAVE_SIGNAL_H +#undef HAVE_ALTIVEC_H /* C library functions */ #undef HAVE_MALLOC @@ -302,5 +303,6 @@ /* Enable assembly routines */ #undef SDL_ASSEMBLY_ROUTINES +#undef SDL_ALTIVEC_BLITTERS #endif /* _SDL_config_h */ diff --git a/include/SDL_config_macosx.h b/include/SDL_config_macosx.h index c68ea7dab..83f9feee6 100644 --- a/include/SDL_config_macosx.h +++ b/include/SDL_config_macosx.h @@ -168,5 +168,8 @@ /* Enable assembly routines */ #define SDL_ASSEMBLY_ROUTINES 1 +#ifdef __ppc__ +#define SDL_ALTIVEC_BLITTERS 1 +#endif #endif /* _SDL_config_macosx_h */ diff --git a/include/SDL_cpuinfo.h b/include/SDL_cpuinfo.h index 8a8edb7e1..e93a0ef7d 100644 --- a/include/SDL_cpuinfo.h +++ b/include/SDL_cpuinfo.h @@ -31,6 +31,34 @@ #include "SDL_stdinc.h" +/* Need to do this here because intrin.h has C++ code in it */ +/* Visual Studio 2005 has a bug where intrin.h conflicts with winnt.h */ +#if defined(_MSC_VER) && (_MSC_VER >= 1500) && !defined(_WIN32_WCE) +#include +#define __MMX__ +#define __3dNOW__ +#define __SSE__ +#define __SSE2__ +#elif defined(__MINGW64_VERSION_MAJOR) +#include +#else +#ifdef __MMX__ +#include +#endif +#ifdef __3dNOW__ +#include +#endif +#ifdef __SSE__ +#include +#endif +#ifdef __SSE2__ +#include +#endif +#ifdef HAVE_ALTIVEC_H +#include +#endif +#endif + #include "begin_code.h" /* Set up for C function definitions, even when using C++ */ #ifdef __cplusplus @@ -64,11 +92,21 @@ extern DECLSPEC int SDLCALL SDL_GetCPUCacheLineSize(void); */ extern DECLSPEC SDL_bool SDLCALL SDL_HasRDTSC(void); +/** + * This function returns true if the CPU has AltiVec features. + */ +extern DECLSPEC SDL_bool SDLCALL SDL_HasAltiVec(void); + /** * This function returns true if the CPU has MMX features. */ extern DECLSPEC SDL_bool SDLCALL SDL_HasMMX(void); +/** + * This function returns true if the CPU has 3DNow! features. + */ +extern DECLSPEC SDL_bool SDLCALL SDL_Has3DNow(void); + /** * This function returns true if the CPU has SSE features. */ diff --git a/src/cpuinfo/SDL_cpuinfo.c b/src/cpuinfo/SDL_cpuinfo.c index 8a037f503..63c5c441d 100644 --- a/src/cpuinfo/SDL_cpuinfo.c +++ b/src/cpuinfo/SDL_cpuinfo.c @@ -32,18 +32,37 @@ #include #include #endif +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) +#include /* For AltiVec check */ +#elif SDL_ALTIVEC_BLITTERS && HAVE_SETJMP +#include +#include +#endif #ifdef __WIN32__ #include "../core/windows/SDL_windows.h" #endif #define CPU_HAS_RDTSC 0x00000001 -#define CPU_HAS_MMX 0x00000002 +#define CPU_HAS_ALTIVEC 0x00000002 +#define CPU_HAS_MMX 0x00000004 +#define CPU_HAS_3DNOW 0x00000008 #define CPU_HAS_SSE 0x00000010 #define CPU_HAS_SSE2 0x00000020 #define CPU_HAS_SSE3 0x00000040 -#define CPU_HAS_SSE41 0x00000080 -#define CPU_HAS_SSE42 0x00000100 - +#define CPU_HAS_SSE41 0x00000100 +#define CPU_HAS_SSE42 0x00000200 + +#if SDL_ALTIVEC_BLITTERS && HAVE_SETJMP && !__MACOSX__ +/* This is the brute force way of detecting instruction sets... + the idea is borrowed from the libmpeg2 library - thanks! + */ +static jmp_buf jmpbuf; +static void +illegal_instruction(int sig) +{ + longjmp(jmpbuf, 1); +} +#endif /* HAVE_SETJMP */ static __inline__ int CPU_haveCPUID(void) @@ -192,6 +211,29 @@ CPU_haveRDTSC(void) return 0; } +static __inline__ int +CPU_haveAltiVec(void) +{ + volatile int altivec = 0; +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__)) + int selectors[2] = { CTL_HW, HW_VECTORUNIT }; + int hasVectorUnit = 0; + size_t length = sizeof(hasVectorUnit); + int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0); + if (0 == error) + altivec = (hasVectorUnit != 0); +#elif SDL_ALTIVEC_BLITTERS && HAVE_SETJMP + void (*handler) (int sig); + handler = signal(SIGILL, illegal_instruction); + if (setjmp(jmpbuf) == 0) { + asm volatile ("mtspr 256, %0\n\t" "vand %%v0, %%v0, %%v0"::"r" (-1)); + altivec = 1; + } + signal(SIGILL, handler); +#endif + return altivec; +} + static __inline__ int CPU_haveMMX(void) { @@ -201,6 +243,21 @@ CPU_haveMMX(void) return 0; } +static __inline__ int +CPU_have3DNow(void) +{ + if (CPU_haveCPUID()) { + int a, b, c, d; + + cpuid(0x80000000, a, b, c, d); + if (a >= 0x80000001) { + cpuid(0x80000001, a, b, c, d); + return (d & 0x80000000); + } + } + return 0; +} + static __inline__ int CPU_haveSSE(void) { @@ -431,9 +488,15 @@ SDL_GetCPUFeatures(void) if (CPU_haveRDTSC()) { SDL_CPUFeatures |= CPU_HAS_RDTSC; } + if (CPU_haveAltiVec()) { + SDL_CPUFeatures |= CPU_HAS_ALTIVEC; + } if (CPU_haveMMX()) { SDL_CPUFeatures |= CPU_HAS_MMX; } + if (CPU_have3DNow()) { + SDL_CPUFeatures |= CPU_HAS_3DNOW; + } if (CPU_haveSSE()) { SDL_CPUFeatures |= CPU_HAS_SSE; } @@ -462,6 +525,15 @@ SDL_HasRDTSC(void) return SDL_FALSE; } +SDL_bool +SDL_HasAltiVec(void) +{ + if (SDL_GetCPUFeatures() & CPU_HAS_ALTIVEC) { + return SDL_TRUE; + } + return SDL_FALSE; +} + SDL_bool SDL_HasMMX(void) { @@ -471,6 +543,15 @@ SDL_HasMMX(void) return SDL_FALSE; } +SDL_bool +SDL_Has3DNow(void) +{ + if (SDL_GetCPUFeatures() & CPU_HAS_3DNOW) { + return SDL_TRUE; + } + return SDL_FALSE; +} + SDL_bool SDL_HasSSE(void) { @@ -528,7 +609,9 @@ main() printf("CPU name: %s\n", SDL_GetCPUName()); printf("CacheLine size: %d\n", SDL_GetCPUCacheLineSize()); printf("RDTSC: %d\n", SDL_HasRDTSC()); + printf("Altivec: %d\n", SDL_HasAltiVec()); printf("MMX: %d\n", SDL_HasMMX()); + printf("3DNow: %d\n", SDL_Has3DNow()); printf("SSE: %d\n", SDL_HasSSE()); printf("SSE2: %d\n", SDL_HasSSE2()); printf("SSE3: %d\n", SDL_HasSSE3()); diff --git a/src/video/SDL_blit.c b/src/video/SDL_blit.c index f1bcbfee5..fd90bffa8 100644 --- a/src/video/SDL_blit.c +++ b/src/video/SDL_blit.c @@ -100,6 +100,30 @@ SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect, return (okay ? 0 : -1); } +#ifdef __MACOSX__ +#include + +static SDL_bool +SDL_UseAltivecPrefetch() +{ + const char key[] = "hw.l3cachesize"; + u_int64_t result = 0; + size_t typeSize = sizeof(result); + + if (sysctlbyname(key, &result, &typeSize, NULL, 0) == 0 && result > 0) { + return SDL_TRUE; + } else { + return SDL_FALSE; + } +} +#else +static SDL_bool +SDL_UseAltivecPrefetch() +{ + /* Just guess G4 */ + return SDL_TRUE; +} +#endif /* __MACOSX__ */ static SDL_BlitFunc SDL_ChooseBlitFunc(Uint32 src_format, Uint32 dst_format, int flags, @@ -121,12 +145,22 @@ SDL_ChooseBlitFunc(Uint32 src_format, Uint32 dst_format, int flags, if (SDL_HasMMX()) { features |= SDL_CPU_MMX; } + if (SDL_Has3DNow()) { + features |= SDL_CPU_3DNOW; + } if (SDL_HasSSE()) { features |= SDL_CPU_SSE; } if (SDL_HasSSE2()) { features |= SDL_CPU_SSE2; } + if (SDL_HasAltiVec()) { + if (SDL_UseAltivecPrefetch()) { + features |= SDL_CPU_ALTIVEC_PREFETCH; + } else { + features |= SDL_CPU_ALTIVEC_NOPREFETCH; + } + } } } diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h index 7a283da56..c20097b95 100644 --- a/src/video/SDL_blit.h +++ b/src/video/SDL_blit.h @@ -24,24 +24,6 @@ #ifndef _SDL_blit_h #define _SDL_blit_h -#ifdef __MINGW32__ -#include <_mingw.h> -#endif - -#if defined(__MINGW32__) && defined(__MINGW64_VERSION_MAJOR) -#include -#else -#ifdef __MMX__ -#include -#endif -#ifdef __SSE__ -#include -#endif -#ifdef __SSE2__ -#include -#endif -#endif - #include "SDL_cpuinfo.h" #include "SDL_endian.h" #include "SDL_surface.h" @@ -62,8 +44,11 @@ /* SDL blit CPU flags */ #define SDL_CPU_ANY 0x00000000 #define SDL_CPU_MMX 0x00000001 +#define SDL_CPU_3DNOW 0x00000002 #define SDL_CPU_SSE 0x00000004 #define SDL_CPU_SSE2 0x00000008 +#define SDL_CPU_ALTIVEC_PREFETCH 0x00000010 +#define SDL_CPU_ALTIVEC_NOPREFETCH 0x00000020 typedef struct { diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 38fdf39ff..171f87864 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -419,6 +419,806 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info) #endif /* __MMX__ */ +#if SDL_ALTIVEC_BLITTERS +#if __MWERKS__ +#pragma altivec_model on +#endif +#if HAVE_ALTIVEC_H +#include +#endif +#include + +#if (defined(__MACOSX__) && (__GNUC__ < 4)) +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ + (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p ) +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \ + (vector unsigned short) ( a,b,c,d,e,f,g,h ) +#else +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ + (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p } +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \ + (vector unsigned short) { a,b,c,d,e,f,g,h } +#endif + +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F) +#define VECPRINT(msg, v) do { \ + vector unsigned int tmpvec = (vector unsigned int)(v); \ + unsigned int *vp = (unsigned int *)&tmpvec; \ + printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \ +} while (0) + +/* the permuation vector that takes the high bytes out of all the appropriate shorts + (vector unsigned char)( + 0x00, 0x10, 0x02, 0x12, + 0x04, 0x14, 0x06, 0x16, + 0x08, 0x18, 0x0A, 0x1A, + 0x0C, 0x1C, 0x0E, 0x1E ); +*/ +#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F))) +#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12))) +#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24())) +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \ + ? vec_lvsl(0, src) \ + : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) + + +#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \ + /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \ + vector unsigned short vtemp1 = vec_mule(vs, valpha); \ + /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \ + vector unsigned short vtemp2 = vec_mulo(vs, valpha); \ + /* valpha2 is 255-alpha */ \ + vector unsigned char valpha2 = vec_nor(valpha, valpha); \ + /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \ + vector unsigned short vtemp3 = vec_mule(vd, valpha2); \ + /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \ + vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \ + /* add source and dest */ \ + vtemp1 = vec_add(vtemp1, vtemp3); \ + vtemp2 = vec_add(vtemp2, vtemp4); \ + /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \ + vtemp1 = vec_add(vtemp1, v1_16); \ + vtemp3 = vec_sr(vtemp1, v8_16); \ + vtemp1 = vec_add(vtemp1, vtemp3); \ + /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \ + vtemp2 = vec_add(vtemp2, v1_16); \ + vtemp4 = vec_sr(vtemp2, v8_16); \ + vtemp2 = vec_add(vtemp2, vtemp4); \ + /* (>>8) and get ARGBARGBARGBARGB */ \ + vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \ +} while (0) + +/* Calculate the permute vector used for 32->32 swizzling */ +static vector unsigned char +calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt) +{ + /* + * We have to assume that the bits that aren't used by other + * colors is alpha, and it's one complete byte, since some formats + * leave alpha with a zero mask, but we should still swizzle the bits. + */ + /* ARGB */ + const static struct SDL_PixelFormat default_pixel_format = { + NULL, 0, 0, + 0, 0, 0, 0, + 16, 8, 0, 24, + 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 + }; + if (!srcfmt) { + srcfmt = &default_pixel_format; + } + if (!dstfmt) { + dstfmt = &default_pixel_format; + } + const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0C, 0x0C, 0x0C, + 0x0C); + vector unsigned char vswiz; + vector unsigned int srcvec; +#define RESHIFT(X) (3 - ((X) >> 3)) + Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); + Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift); + Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift); + Uint32 amask; + /* Use zero for alpha if either surface doesn't have alpha */ + if (dstfmt->Amask) { + amask = + ((srcfmt->Amask) ? RESHIFT(srcfmt-> + Ashift) : 0x10) << (dstfmt->Ashift); + } else { + amask = + 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ + 0xFFFFFFFF); + } +#undef RESHIFT + ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask); + vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0)); + return (vswiz); +} + +static void +Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint8 *src = (Uint8 *) info->src; + int srcskip = info->src_skip; + Uint8 *dst = (Uint8 *) info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + + vector unsigned char v0 = vec_splat_u8(0); + vector unsigned short v8_16 = vec_splat_u16(8); + vector unsigned short v1_16 = vec_splat_u16(1); + vector unsigned short v2_16 = vec_splat_u16(2); + vector unsigned short v3_16 = vec_splat_u16(3); + vector unsigned int v8_32 = vec_splat_u32(8); + vector unsigned int v16_32 = vec_add(v8_32, v8_32); + vector unsigned short v3f = + VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f, + 0x003f, 0x003f, 0x003f, 0x003f); + vector unsigned short vfc = + VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc, + 0x00fc, 0x00fc, 0x00fc, 0x00fc); + + /* + 0x10 - 0x1f is the alpha + 0x00 - 0x0e evens are the red + 0x01 - 0x0f odds are zero + */ + vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01, + 0x10, 0x02, 0x01, 0x01, + 0x10, 0x04, 0x01, 0x01, + 0x10, 0x06, 0x01, + 0x01); + vector unsigned char vredalpha2 = + (vector unsigned char) (vec_add((vector unsigned int) vredalpha1, + vec_sl(v8_32, v16_32)) + ); + /* + 0x00 - 0x0f is ARxx ARxx ARxx ARxx + 0x11 - 0x0f odds are blue + */ + vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11, + 0x04, 0x05, 0x06, 0x13, + 0x08, 0x09, 0x0a, 0x15, + 0x0c, 0x0d, 0x0e, 0x17); + vector unsigned char vblue2 = + (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32) + ); + /* + 0x00 - 0x0f is ARxB ARxB ARxB ARxB + 0x10 - 0x0e evens are green + */ + vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03, + 0x04, 0x05, 0x12, 0x07, + 0x08, 0x09, 0x14, 0x0b, + 0x0c, 0x0d, 0x16, 0x0f); + vector unsigned char vgreen2 = + (vector unsigned + char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32)) + ); + vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06, + 0x00, 0x0a, 0x00, 0x0e, + 0x00, 0x12, 0x00, 0x16, + 0x00, 0x1a, 0x00, 0x1e); + vector unsigned char mergePermute = VEC_MERGE_PERMUTE(); + vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); + vector unsigned char valphaPermute = + vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC)); + + vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + while (height--) { + int extrawidth; + vector unsigned char valigner; + vector unsigned char vsrc; + vector unsigned char voverflow; + int width = info->dst_w; + +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB, dR, dG, dB, sA; \ + DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \ + if(sA) { \ + unsigned short dstpixel = *((unsigned short *)dst); \ + dR = (dstpixel >> 8) & 0xf8; \ + dG = (dstpixel >> 3) & 0xfc; \ + dB = (dstpixel << 3) & 0xf8; \ + ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + *((unsigned short *)dst) = ( \ + ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \ + ); \ + } \ + src += 4; \ + dst += 2; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width); + extrawidth = (width % 8); + valigner = VEC_ALIGNER(src); + vsrc = (vector unsigned char) vec_ld(0, src); + width -= extrawidth; + while (width) { + vector unsigned char valpha; + vector unsigned char vsrc1, vsrc2; + vector unsigned char vdst1, vdst2; + vector unsigned short vR, vG, vB; + vector unsigned short vpixel, vrpixel, vgpixel, vbpixel; + + /* Load 8 pixels from src as ARGB */ + voverflow = (vector unsigned char) vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + vsrc1 = vec_perm(vsrc, vsrc, vpermute); + src += 16; + vsrc = (vector unsigned char) vec_ld(15, src); + voverflow = vec_perm(voverflow, vsrc, valigner); + vsrc2 = vec_perm(voverflow, voverflow, vpermute); + src += 16; + + /* Load 8 pixels from dst as XRGB */ + voverflow = vec_ld(0, dst); + vR = vec_and((vector unsigned short) voverflow, vf800); + vB = vec_sl((vector unsigned short) voverflow, v3_16); + vG = vec_sl(vB, v2_16); + vdst1 = + (vector unsigned char) vec_perm((vector unsigned char) vR, + (vector unsigned char) vR, + vredalpha1); + vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1); + vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1); + vdst2 = + (vector unsigned char) vec_perm((vector unsigned char) vR, + (vector unsigned char) vR, + vredalpha2); + vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2); + vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2); + + /* Alpha blend 8 pixels as ARGB */ + valpha = vec_perm(vsrc1, v0, valphaPermute); + VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, + v8_16); + valpha = vec_perm(vsrc2, v0, valphaPermute); + VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, + v8_16); + + /* Convert 8 pixels to 565 */ + vpixel = (vector unsigned short) vec_packpx((vector unsigned int) + vdst1, + (vector unsigned int) + vdst2); + vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge); + vgpixel = vec_and(vgpixel, vfc); + vgpixel = vec_sl(vgpixel, v3_16); + vrpixel = vec_sl(vpixel, v1_16); + vrpixel = vec_and(vrpixel, vf800); + vbpixel = vec_and(vpixel, v3f); + vdst1 = + vec_or((vector unsigned char) vrpixel, + (vector unsigned char) vgpixel); + vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel); + + /* Store 8 pixels */ + vec_st(vdst1, 0, dst); + + width -= 8; + dst += 16; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + src += srcskip; + dst += dstskip; + } +} + +static void +Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip >> 2; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip >> 2; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + unsigned sA = info->a; + unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; + Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask; + Uint32 ckey = info->colorkey; + vector unsigned char mergePermute; + vector unsigned char vsrcPermute; + vector unsigned char vdstPermute; + vector unsigned char vsdstPermute; + vector unsigned char valpha; + vector unsigned char valphamask; + vector unsigned char vbits; + vector unsigned char v0; + vector unsigned short v1; + vector unsigned short v8; + vector unsigned int vckey; + vector unsigned int vrgbmask; + + mergePermute = VEC_MERGE_PERMUTE(); + v0 = vec_splat_u8(0); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + + /* set the alpha to 255 on the destination surf */ + valphamask = VEC_ALPHA_MASK(); + + vsrcPermute = calc_swizzle32(srcfmt, NULL); + vdstPermute = calc_swizzle32(NULL, dstfmt); + vsdstPermute = calc_swizzle32(dstfmt, NULL); + + /* set a vector full of alpha and 255-alpha */ + ((unsigned char *) &valpha)[0] = sA; + valpha = vec_splat(valpha, 0); + vbits = (vector unsigned char) vec_splat_s8(-1); + + ckey &= rgbmask; + ((unsigned int *) (char *) &vckey)[0] = ckey; + vckey = vec_splat(vckey, 0); + ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask; + vrgbmask = vec_splat(vrgbmask, 0); + + while (height--) { + int width = info->dst_w; +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB, dR, dG, dB; \ + RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \ + if(sA && Pixel != ckey) { \ + RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \ + DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \ + ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ + } \ + dstp++; \ + srcp++; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char vsel; + vector unsigned char voverflow; + vector unsigned char vd; + vector unsigned char vd_orig; + + /* s = *srcp */ + voverflow = (vector unsigned char) vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + + /* vsel is set for items that match the key */ + vsel = + (vector unsigned char) vec_and((vector unsigned int) vs, + vrgbmask); + vsel = (vector unsigned char) vec_cmpeq((vector unsigned int) + vsel, vckey); + + /* permute to source format */ + vs = vec_perm(vs, valpha, vsrcPermute); + + /* d = *dstp */ + vd = (vector unsigned char) vec_ld(0, dstp); + vd_orig = vd = vec_perm(vd, v0, vsdstPermute); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha channel to full on */ + vd = vec_or(vd, valphamask); + + /* mask out color key */ + vd = vec_sel(vd, vd_orig, vsel); + + /* permute to dest format */ + vd = vec_perm(vd, vbits, vdstPermute); + + /* *dstp = res */ + vec_st((vector unsigned int) vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } +#undef ONE_PIXEL_BLEND + + srcp += srcskip; + dstp += dstskip; + } +} + + +static void +Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip >> 2; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip >> 2; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + vector unsigned char mergePermute; + vector unsigned char valphaPermute; + vector unsigned char vsrcPermute; + vector unsigned char vdstPermute; + vector unsigned char vsdstPermute; + vector unsigned char valphamask; + vector unsigned char vpixelmask; + vector unsigned char v0; + vector unsigned short v1; + vector unsigned short v8; + + v0 = vec_splat_u8(0); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + mergePermute = VEC_MERGE_PERMUTE(); + valphamask = VEC_ALPHA_MASK(); + valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC)); + vpixelmask = vec_nor(valphamask, v0); + vsrcPermute = calc_swizzle32(srcfmt, NULL); + vdstPermute = calc_swizzle32(NULL, dstfmt); + vsdstPermute = calc_swizzle32(dstfmt, NULL); + + while (height--) { + width = info->dst_w; +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB, dR, dG, dB, sA, dA; \ + DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \ + if(sA) { \ + DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \ + ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \ + } \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + /* vsrcPermute */ + /* vdstPermute */ + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + vector unsigned char valpha; + vector unsigned char vdstalpha; + /* s = *srcp */ + voverflow = (vector unsigned char) vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + vs = vec_perm(vs, v0, vsrcPermute); + + valpha = vec_perm(vs, v0, valphaPermute); + + /* d = *dstp */ + vd = (vector unsigned char) vec_ld(0, dstp); + vd = vec_perm(vd, v0, vsdstPermute); + vdstalpha = vec_and(vd, valphamask); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha to the dest alpha */ + vd = vec_and(vd, vpixelmask); + vd = vec_or(vd, vdstalpha); + vd = vec_perm(vd, v0, vdstPermute); + + /* *dstp = res */ + vec_st((vector unsigned int) vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } + srcp += srcskip; + dstp += dstskip; +#undef ONE_PIXEL_BLEND + } +} + +/* fast ARGB888->(A)RGB888 blending with pixel alpha */ +static void +BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip >> 2; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip >> 2; + vector unsigned char mergePermute; + vector unsigned char valphaPermute; + vector unsigned char valphamask; + vector unsigned char vpixelmask; + vector unsigned char v0; + vector unsigned short v1; + vector unsigned short v8; + v0 = vec_splat_u8(0); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + mergePermute = VEC_MERGE_PERMUTE(); + valphamask = VEC_ALPHA_MASK(); + valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC)); + + + vpixelmask = vec_nor(valphamask, v0); + while (height--) { + width = info->dst_w; +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while ((condition)) { \ + Uint32 dalpha; \ + Uint32 d; \ + Uint32 s1; \ + Uint32 d1; \ + Uint32 s = *srcp; \ + Uint32 alpha = s >> 24; \ + if(alpha) { \ + if(alpha == SDL_ALPHA_OPAQUE) { \ + *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \ + } else { \ + d = *dstp; \ + dalpha = d & 0xff000000; \ + s1 = s & 0xff00ff; \ + d1 = d & 0xff00ff; \ + d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \ + s &= 0xff00; \ + d &= 0xff00; \ + d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ + *dstp = d1 | d | dalpha; \ + } \ + } \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + vector unsigned char valpha; + vector unsigned char vdstalpha; + /* s = *srcp */ + voverflow = (vector unsigned char) vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + + valpha = vec_perm(vs, v0, valphaPermute); + + /* d = *dstp */ + vd = (vector unsigned char) vec_ld(0, dstp); + vdstalpha = vec_and(vd, valphamask); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha to the dest alpha */ + vd = vec_and(vd, vpixelmask); + vd = vec_or(vd, vdstalpha); + + /* *dstp = res */ + vec_st((vector unsigned int) vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } + srcp += srcskip; + dstp += dstskip; + } +#undef ONE_PIXEL_BLEND +} + +static void +Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info) +{ + /* XXX : 6 */ + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip >> 2; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip >> 2; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + unsigned sA = info->a; + unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; + vector unsigned char mergePermute; + vector unsigned char vsrcPermute; + vector unsigned char vdstPermute; + vector unsigned char vsdstPermute; + vector unsigned char valpha; + vector unsigned char valphamask; + vector unsigned char vbits; + vector unsigned short v1; + vector unsigned short v8; + + mergePermute = VEC_MERGE_PERMUTE(); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + + /* set the alpha to 255 on the destination surf */ + valphamask = VEC_ALPHA_MASK(); + + vsrcPermute = calc_swizzle32(srcfmt, NULL); + vdstPermute = calc_swizzle32(NULL, dstfmt); + vsdstPermute = calc_swizzle32(dstfmt, NULL); + + /* set a vector full of alpha and 255-alpha */ + ((unsigned char *) &valpha)[0] = sA; + valpha = vec_splat(valpha, 0); + vbits = (vector unsigned char) vec_splat_s8(-1); + + while (height--) { + int width = info->dst_w; +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB, dR, dG, dB; \ + DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \ + DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \ + ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + + /* s = *srcp */ + voverflow = (vector unsigned char) vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + vs = vec_perm(vs, valpha, vsrcPermute); + + /* d = *dstp */ + vd = (vector unsigned char) vec_ld(0, dstp); + vd = vec_perm(vd, vd, vsdstPermute); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha channel to full on */ + vd = vec_or(vd, valphamask); + vd = vec_perm(vd, vbits, vdstPermute); + + /* *dstp = res */ + vec_st((vector unsigned int) vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } +#undef ONE_PIXEL_BLEND + + srcp += srcskip; + dstp += dstskip; + } + +} + + +/* fast RGB888->(A)RGB888 blending */ +static void +BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info) +{ + unsigned alpha = info->a; + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip >> 2; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip >> 2; + vector unsigned char mergePermute; + vector unsigned char valpha; + vector unsigned char valphamask; + vector unsigned short v1; + vector unsigned short v8; + + mergePermute = VEC_MERGE_PERMUTE(); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + + /* set the alpha to 255 on the destination surf */ + valphamask = VEC_ALPHA_MASK(); + + /* set a vector full of alpha and 255-alpha */ + ((unsigned char *) &valpha)[0] = alpha; + valpha = vec_splat(valpha, 0); + + while (height--) { + int width = info->dst_w; +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ + Uint32 s = *srcp; \ + Uint32 d = *dstp; \ + Uint32 s1 = s & 0xff00ff; \ + Uint32 d1 = d & 0xff00ff; \ + d1 = (d1 + ((s1 - d1) * alpha >> 8)) \ + & 0xff00ff; \ + s &= 0xff00; \ + d &= 0xff00; \ + d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ + *dstp = d1 | d | 0xff000000; \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + + /* s = *srcp */ + voverflow = (vector unsigned char) vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + + /* d = *dstp */ + vd = (vector unsigned char) vec_ld(0, dstp); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha channel to full on */ + vd = vec_or(vd, valphamask); + + /* *dstp = res */ + vec_st((vector unsigned int) vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } +#undef ONE_PIXEL_BLEND + + srcp += srcskip; + dstp += dstskip; + } +} + +#if __MWERKS__ +#pragma altivec_model off +#endif +#endif /* SDL_ALTIVEC_BLITTERS */ + /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info) @@ -538,6 +1338,79 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info) } } +#ifdef __3dNOW__ +/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ +static void +BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info) +{ + int width = info->dst_w; + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip >> 2; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip >> 2; + SDL_PixelFormat *sf = info->src_fmt; + Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; + Uint32 amask = sf->Amask; + Uint32 ashift = sf->Ashift; + Uint64 multmask; + + __m64 src1, dst1, mm_alpha, mm_zero, dmask; + + mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ + multmask = 0xFFFF; + multmask <<= (ashift * 2); + multmask = ~multmask; + dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */ + + while (height--) { + /* *INDENT-OFF* */ + DUFFS_LOOP4({ + Uint32 alpha; + + _m_prefetch(srcp + 16); + _m_prefetch(dstp + 16); + + alpha = *srcp & amask; + if (alpha == 0) { + /* do nothing */ + } else if (alpha == amask) { + /* copy RGB, keep dst alpha */ + *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); + } else { + src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ + src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ + + dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ + dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ + + mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ + mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ + mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ + mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ + mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ + + /* blend */ + src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */ + src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */ + src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ + dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */ + dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ + + *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ + } + ++srcp; + ++dstp; + }, width); + /* *INDENT-ON* */ + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); +} + +#endif /* __MMX__ */ + /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ /* blend a single 16 bit pixel at 50% */ @@ -1257,10 +2130,17 @@ SDL_CalculateBlitA(SDL_Surface * surface) return BlitNto1PixelAlpha; case 2: - if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 - && sf->Gmask == 0xff00 - && ((sf->Rmask == 0xff && df->Rmask == 0x1f) - || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { +#if SDL_ALTIVEC_BLITTERS + if (sf->BytesPerPixel == 4 + && df->Gmask == 0x7e0 && df->Bmask == 0x1f + && SDL_HasAltiVec()) + return Blit32to565PixelAlphaAltivec; + else +#endif + if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 + && sf->Gmask == 0xff00 + && ((sf->Rmask == 0xff && df->Rmask == 0x1f) + || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { if (df->Gmask == 0x7e0) return BlitARGBto565PixelAlpha; else if (df->Gmask == 0x3e0) @@ -1272,20 +2152,35 @@ SDL_CalculateBlitA(SDL_Surface * surface) if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { -#if defined(__MMX__) +#if defined(__MMX__) || defined(__3dNOW__) if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && sf->Ashift % 8 == 0 && sf->Aloss == 0) { +#ifdef __3dNOW__ + if (SDL_Has3DNow()) + return BlitRGBtoRGBPixelAlphaMMX3DNOW; +#endif +#ifdef __MMX__ if (SDL_HasMMX()) return BlitRGBtoRGBPixelAlphaMMX; +#endif } -#endif /* __MMX__ */ +#endif /* __MMX__ || __3dNOW__ */ if (sf->Amask == 0xff000000) { +#if SDL_ALTIVEC_BLITTERS + if (SDL_HasAltiVec()) + return BlitRGBtoRGBPixelAlphaAltivec; +#endif return BlitRGBtoRGBPixelAlpha; } } - return BlitNtoNPixelAlpha; +#if SDL_ALTIVEC_BLITTERS + if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec()) + return Blit32to32PixelAlphaAltivec; + else +#endif + return BlitNtoNPixelAlpha; case 3: default: @@ -1331,10 +2226,19 @@ SDL_CalculateBlitA(SDL_Surface * surface) return BlitRGBtoRGBSurfaceAlphaMMX; #endif if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) { +#if SDL_ALTIVEC_BLITTERS + if (SDL_HasAltiVec()) + return BlitRGBtoRGBSurfaceAlphaAltivec; +#endif return BlitRGBtoRGBSurfaceAlpha; } } - return BlitNtoNSurfaceAlpha; +#if SDL_ALTIVEC_BLITTERS + if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec()) + return Blit32to32SurfaceAlphaAltivec; + else +#endif + return BlitNtoNSurfaceAlpha; case 3: default: @@ -1348,6 +2252,12 @@ SDL_CalculateBlitA(SDL_Surface * surface) if (df->BytesPerPixel == 1) return BlitNto1SurfaceAlphaKey; else +#if SDL_ALTIVEC_BLITTERS + if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && + SDL_HasAltiVec()) + return Blit32to32SurfaceAlphaKeyAltivec; + else +#endif return BlitNtoNSurfaceAlphaKey; } break; diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c index 803fa39a4..180efc82c 100644 --- a/src/video/SDL_blit_N.c +++ b/src/video/SDL_blit_N.c @@ -28,8 +28,840 @@ /* Functions to blit from N-bit surfaces to other surfaces */ +#if SDL_ALTIVEC_BLITTERS +#define assert(X) +#ifdef __MACOSX__ +#include +static size_t +GetL3CacheSize(void) +{ + const char key[] = "hw.l3cachesize"; + u_int64_t result = 0; + size_t typeSize = sizeof(result); + + + int err = sysctlbyname(key, &result, &typeSize, NULL, 0); + if (0 != err) + return 0; + + return result; +} +#else +static size_t +GetL3CacheSize(void) +{ + /* XXX: Just guess G4 */ + return 2097152; +} +#endif /* __MACOSX__ */ + +#if (defined(__MACOSX__) && (__GNUC__ < 4)) +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ + (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p ) +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \ + (vector unsigned short) ( a,b,c,d,e,f,g,h ) +#else +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \ + (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p } +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \ + (vector unsigned short) { a,b,c,d,e,f,g,h } +#endif + +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F) +#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \ + ( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \ + 0x04+a, 0x04+b, 0x04+c, 0x04+d, \ + 0x08+a, 0x08+b, 0x08+c, 0x08+d, \ + 0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d ) + +#define MAKE8888(dstfmt, r, g, b, a) \ + ( ((r<Rshift)&dstfmt->Rmask) | \ + ((g<Gshift)&dstfmt->Gmask) | \ + ((b<Bshift)&dstfmt->Bmask) | \ + ((a<Ashift)&dstfmt->Amask) ) + +/* + * Data Stream Touch...Altivec cache prefetching. + * + * Don't use this on a G5...however, the speed boost is very significant + * on a G4. + */ +#define DST_CHAN_SRC 1 +#define DST_CHAN_DEST 2 + +/* macro to set DST control word value... */ +#define DST_CTRL(size, count, stride) \ + (((size) << 24) | ((count) << 16) | (stride)) + +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \ + ? vec_lvsl(0, src) \ + : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) + +/* Calculate the permute vector used for 32->32 swizzling */ +static vector unsigned char +calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt) +{ + /* + * We have to assume that the bits that aren't used by other + * colors is alpha, and it's one complete byte, since some formats + * leave alpha with a zero mask, but we should still swizzle the bits. + */ + /* ARGB */ + const static const struct SDL_PixelFormat default_pixel_format = { + NULL, 32, 4, + 0, 0, 0, 0, + 16, 8, 0, 24, + 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000 + }; + if (!srcfmt) { + srcfmt = &default_pixel_format; + } + if (!dstfmt) { + dstfmt = &default_pixel_format; + } + const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0C, 0x0C, 0x0C, + 0x0C); + vector unsigned char vswiz; + vector unsigned int srcvec; +#define RESHIFT(X) (3 - ((X) >> 3)) + Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); + Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift); + Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift); + Uint32 amask; + /* Use zero for alpha if either surface doesn't have alpha */ + if (dstfmt->Amask) { + amask = + ((srcfmt->Amask) ? RESHIFT(srcfmt-> + Ashift) : 0x10) << (dstfmt->Ashift); + } else { + amask = + 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ + 0xFFFFFFFF); + } +#undef RESHIFT + ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask); + vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0)); + return (vswiz); +} + +static void Blit_RGB888_RGB565(SDL_BlitInfo * info); +static void +Blit_RGB888_RGB565Altivec(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint8 *src = (Uint8 *) info->src; + int srcskip = info->src_skip; + Uint8 *dst = (Uint8 *) info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + vector unsigned char valpha = vec_splat_u8(0); + vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); + vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06, + 0x00, 0x0a, 0x00, 0x0e, + 0x00, 0x12, 0x00, 0x16, + 0x00, 0x1a, 0x00, 0x1e); + vector unsigned short v1 = vec_splat_u16(1); + vector unsigned short v3 = vec_splat_u16(3); + vector unsigned short v3f = + VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f, + 0x003f, 0x003f, 0x003f, 0x003f); + vector unsigned short vfc = + VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc, + 0x00fc, 0x00fc, 0x00fc, 0x00fc); + vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + while (height--) { + vector unsigned char valigner; + vector unsigned char voverflow; + vector unsigned char vsrc; + + int width = info->dst_w; + int extrawidth; + + /* do scalar until we can align... */ +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB, sA; \ + DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, Pixel, \ + sR, sG, sB, sA); \ + *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \ + ((sG << 3) & 0x000007E0) | \ + ((sB >> 3) & 0x0000001F)); \ + dst += 2; \ + src += 4; \ + widthvar--; \ + } + + ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width); + + /* After all that work, here's the vector part! */ + extrawidth = (width % 8); /* trailing unaligned stores */ + width -= extrawidth; + vsrc = vec_ld(0, src); + valigner = VEC_ALIGNER(src); + + while (width) { + vector unsigned short vpixel, vrpixel, vgpixel, vbpixel; + vector unsigned int vsrc1, vsrc2; + vector unsigned char vdst; + + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + vsrc1 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute); + src += 16; + vsrc = voverflow; + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + vsrc2 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute); + /* 1555 */ + vpixel = (vector unsigned short) vec_packpx(vsrc1, vsrc2); + vgpixel = (vector unsigned short) vec_perm(vsrc1, vsrc2, vgmerge); + vgpixel = vec_and(vgpixel, vfc); + vgpixel = vec_sl(vgpixel, v3); + vrpixel = vec_sl(vpixel, v1); + vrpixel = vec_and(vrpixel, vf800); + vbpixel = vec_and(vpixel, v3f); + vdst = + vec_or((vector unsigned char) vrpixel, + (vector unsigned char) vgpixel); + /* 565 */ + vdst = vec_or(vdst, (vector unsigned char) vbpixel); + vec_st(vdst, 0, dst); + + width -= 8; + src += 16; + dst += 16; + vsrc = voverflow; + } + + assert(width == 0); + + /* do scalar until we can align... */ + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + + src += srcskip; /* move to next row, accounting for pitch. */ + dst += dstskip; + } + + +} + +static void +Blit_RGB565_32Altivec(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint8 *src = (Uint8 *) info->src; + int srcskip = info->src_skip; + Uint8 *dst = (Uint8 *) info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + unsigned alpha; + vector unsigned char valpha; + vector unsigned char vpermute; + vector unsigned short vf800; + vector unsigned int v8 = vec_splat_u32(8); + vector unsigned int v16 = vec_add(v8, v8); + vector unsigned short v2 = vec_splat_u16(2); + vector unsigned short v3 = vec_splat_u16(3); + /* + 0x10 - 0x1f is the alpha + 0x00 - 0x0e evens are the red + 0x01 - 0x0f odds are zero + */ + vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01, + 0x10, 0x02, 0x01, 0x01, + 0x10, 0x04, 0x01, 0x01, + 0x10, 0x06, 0x01, + 0x01); + vector unsigned char vredalpha2 = + (vector unsigned + char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16)) + ); + /* + 0x00 - 0x0f is ARxx ARxx ARxx ARxx + 0x11 - 0x0f odds are blue + */ + vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11, + 0x04, 0x05, 0x06, 0x13, + 0x08, 0x09, 0x0a, 0x15, + 0x0c, 0x0d, 0x0e, 0x17); + vector unsigned char vblue2 = + (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8) + ); + /* + 0x00 - 0x0f is ARxB ARxB ARxB ARxB + 0x10 - 0x0e evens are green + */ + vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03, + 0x04, 0x05, 0x12, 0x07, + 0x08, 0x09, 0x14, 0x0b, + 0x0c, 0x0d, 0x16, 0x0f); + vector unsigned char vgreen2 = + (vector unsigned + char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8)) + ); + + + assert(srcfmt->BytesPerPixel == 2); + assert(dstfmt->BytesPerPixel == 4); + + vf800 = (vector unsigned short) vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + if (dstfmt->Amask && info->a) { + ((unsigned char *) &valpha)[0] = alpha = info->a; + valpha = vec_splat(valpha, 0); + } else { + alpha = 0; + valpha = vec_splat_u8(0); + } + + vpermute = calc_swizzle32(NULL, dstfmt); + while (height--) { + vector unsigned char valigner; + vector unsigned char voverflow; + vector unsigned char vsrc; + + int width = info->dst_w; + int extrawidth; + + /* do scalar until we can align... */ +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + unsigned sR, sG, sB; \ + unsigned short Pixel = *((unsigned short *)src); \ + sR = (Pixel >> 8) & 0xf8; \ + sG = (Pixel >> 3) & 0xfc; \ + sB = (Pixel << 3) & 0xf8; \ + ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \ + src += 2; \ + dst += 4; \ + widthvar--; \ + } + ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width); + + /* After all that work, here's the vector part! */ + extrawidth = (width % 8); /* trailing unaligned stores */ + width -= extrawidth; + vsrc = vec_ld(0, src); + valigner = VEC_ALIGNER(src); + + while (width) { + vector unsigned short vR, vG, vB; + vector unsigned char vdst1, vdst2; + + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + + vR = vec_and((vector unsigned short) vsrc, vf800); + vB = vec_sl((vector unsigned short) vsrc, v3); + vG = vec_sl(vB, v2); + + vdst1 = + (vector unsigned char) vec_perm((vector unsigned char) vR, + valpha, vredalpha1); + vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1); + vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1); + vdst1 = vec_perm(vdst1, valpha, vpermute); + vec_st(vdst1, 0, dst); + + vdst2 = + (vector unsigned char) vec_perm((vector unsigned char) vR, + valpha, vredalpha2); + vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2); + vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2); + vdst2 = vec_perm(vdst2, valpha, vpermute); + vec_st(vdst2, 16, dst); + + width -= 8; + dst += 32; + src += 16; + vsrc = voverflow; + } + + assert(width == 0); + + + /* do scalar until we can align... */ + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + + src += srcskip; /* move to next row, accounting for pitch. */ + dst += dstskip; + } + +} + + +static void +Blit_RGB555_32Altivec(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint8 *src = (Uint8 *) info->src; + int srcskip = info->src_skip; + Uint8 *dst = (Uint8 *) info->dst; + int dstskip = info->dst_skip; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + unsigned alpha; + vector unsigned char valpha; + vector unsigned char vpermute; + vector unsigned short vf800; + vector unsigned int v8 = vec_splat_u32(8); + vector unsigned int v16 = vec_add(v8, v8); + vector unsigned short v1 = vec_splat_u16(1); + vector unsigned short v3 = vec_splat_u16(3); + /* + 0x10 - 0x1f is the alpha + 0x00 - 0x0e evens are the red + 0x01 - 0x0f odds are zero + */ + vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01, + 0x10, 0x02, 0x01, 0x01, + 0x10, 0x04, 0x01, 0x01, + 0x10, 0x06, 0x01, + 0x01); + vector unsigned char vredalpha2 = + (vector unsigned + char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16)) + ); + /* + 0x00 - 0x0f is ARxx ARxx ARxx ARxx + 0x11 - 0x0f odds are blue + */ + vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11, + 0x04, 0x05, 0x06, 0x13, + 0x08, 0x09, 0x0a, 0x15, + 0x0c, 0x0d, 0x0e, 0x17); + vector unsigned char vblue2 = + (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8) + ); + /* + 0x00 - 0x0f is ARxB ARxB ARxB ARxB + 0x10 - 0x0e evens are green + */ + vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03, + 0x04, 0x05, 0x12, 0x07, + 0x08, 0x09, 0x14, 0x0b, + 0x0c, 0x0d, 0x16, 0x0f); + vector unsigned char vgreen2 = + (vector unsigned + char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8)) + ); + + + assert(srcfmt->BytesPerPixel == 2); + assert(dstfmt->BytesPerPixel == 4); + + vf800 = (vector unsigned short) vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + if (dstfmt->Amask && info->a) { + ((unsigned char *) &valpha)[0] = alpha = info->a; + valpha = vec_splat(valpha, 0); + } else { + alpha = 0; + valpha = vec_splat_u8(0); + } + + vpermute = calc_swizzle32(NULL, dstfmt); + while (height--) { + vector unsigned char valigner; + vector unsigned char voverflow; + vector unsigned char vsrc; + + int width = info->dst_w; + int extrawidth; + + /* do scalar until we can align... */ +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + unsigned sR, sG, sB; \ + unsigned short Pixel = *((unsigned short *)src); \ + sR = (Pixel >> 7) & 0xf8; \ + sG = (Pixel >> 2) & 0xf8; \ + sB = (Pixel << 3) & 0xf8; \ + ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \ + src += 2; \ + dst += 4; \ + widthvar--; \ + } + ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width); + + /* After all that work, here's the vector part! */ + extrawidth = (width % 8); /* trailing unaligned stores */ + width -= extrawidth; + vsrc = vec_ld(0, src); + valigner = VEC_ALIGNER(src); + + while (width) { + vector unsigned short vR, vG, vB; + vector unsigned char vdst1, vdst2; + + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + + vR = vec_and(vec_sl((vector unsigned short) vsrc, v1), vf800); + vB = vec_sl((vector unsigned short) vsrc, v3); + vG = vec_sl(vB, v3); + + vdst1 = + (vector unsigned char) vec_perm((vector unsigned char) vR, + valpha, vredalpha1); + vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1); + vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1); + vdst1 = vec_perm(vdst1, valpha, vpermute); + vec_st(vdst1, 0, dst); + + vdst2 = + (vector unsigned char) vec_perm((vector unsigned char) vR, + valpha, vredalpha2); + vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2); + vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2); + vdst2 = vec_perm(vdst2, valpha, vpermute); + vec_st(vdst2, 16, dst); + + width -= 8; + dst += 32; + src += 16; + vsrc = voverflow; + } + + assert(width == 0); + + + /* do scalar until we can align... */ + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + + src += srcskip; /* move to next row, accounting for pitch. */ + dst += dstskip; + } + +} + +static void BlitNtoNKey(SDL_BlitInfo * info); +static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo * info); +static void +Blit32to32KeyAltivec(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint32 *srcp = (Uint32 *) info->src; + int srcskip = info->src_skip / 4; + Uint32 *dstp = (Uint32 *) info->dst; + int dstskip = info->dst_skip / 4; + SDL_PixelFormat *srcfmt = info->src_fmt; + int srcbpp = srcfmt->BytesPerPixel; + SDL_PixelFormat *dstfmt = info->dst_fmt; + int dstbpp = dstfmt->BytesPerPixel; + int copy_alpha = (srcfmt->Amask && dstfmt->Amask); + unsigned alpha = dstfmt->Amask ? info->a : 0; + Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask; + Uint32 ckey = info->colorkey; + vector unsigned int valpha; + vector unsigned char vpermute; + vector unsigned char vzero; + vector unsigned int vckey; + vector unsigned int vrgbmask; + vpermute = calc_swizzle32(srcfmt, dstfmt); + if (info->dst_w < 16) { + if (copy_alpha) { + BlitNtoNKeyCopyAlpha(info); + } else { + BlitNtoNKey(info); + } + return; + } + vzero = vec_splat_u8(0); + if (alpha) { + ((unsigned char *) &valpha)[0] = (unsigned char) alpha; + valpha = + (vector unsigned int) vec_splat((vector unsigned char) valpha, 0); + } else { + valpha = (vector unsigned int) vzero; + } + ckey &= rgbmask; + ((unsigned int *) (char *) &vckey)[0] = ckey; + vckey = vec_splat(vckey, 0); + ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask; + vrgbmask = vec_splat(vrgbmask, 0); + + while (height--) { +#define ONE_PIXEL_BLEND(condition, widthvar) \ + if (copy_alpha) { \ + while (condition) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB, sA; \ + DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, Pixel, \ + sR, sG, sB, sA); \ + if ( (Pixel & rgbmask) != ckey ) { \ + ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \ + sR, sG, sB, sA); \ + } \ + dstp = (Uint32 *) (((Uint8 *) dstp) + dstbpp); \ + srcp = (Uint32 *) (((Uint8 *) srcp) + srcbpp); \ + widthvar--; \ + } \ + } else { \ + while (condition) { \ + Uint32 Pixel; \ + unsigned sR, sG, sB; \ + RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, Pixel); \ + if ( Pixel != ckey ) { \ + RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \ + ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \ + sR, sG, sB, alpha); \ + } \ + dstp = (Uint32 *) (((Uint8 *)dstp) + dstbpp); \ + srcp = (Uint32 *) (((Uint8 *)srcp) + srcbpp); \ + widthvar--; \ + } \ + } + int width = info->dst_w; + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + assert(width > 0); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned int vs = vec_ld(0, srcp); + width -= extrawidth; + assert(width >= 4); + while (width) { + vector unsigned char vsel; + vector unsigned int vd; + vector unsigned int voverflow = vec_ld(15, srcp); + /* load the source vec */ + vs = vec_perm(vs, voverflow, valigner); + /* vsel is set for items that match the key */ + vsel = (vector unsigned char) vec_and(vs, vrgbmask); + vsel = (vector unsigned char) vec_cmpeq(vs, vckey); + /* permute the src vec to the dest format */ + vs = vec_perm(vs, valpha, vpermute); + /* load the destination vec */ + vd = vec_ld(0, dstp); + /* select the source and dest into vs */ + vd = (vector unsigned int) vec_sel((vector unsigned char) vs, + (vector unsigned char) vd, + vsel); + + vec_st(vd, 0, dstp); + srcp += 4; + width -= 4; + dstp += 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + srcp += srcskip; + dstp += dstskip; + } + } +} + +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */ +/* Use this on a G5 */ +static void +ConvertAltivec32to32_noprefetch(SDL_BlitInfo * info) +{ + int height = info->dst_h; + Uint32 *src = (Uint32 *) info->src; + int srcskip = info->src_skip / 4; + Uint32 *dst = (Uint32 *) info->dst; + int dstskip = info->dst_skip / 4; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + vector unsigned int vzero = vec_splat_u32(0); + vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt); + if (dstfmt->Amask && !srcfmt->Amask) { + if (info->a) { + vector unsigned char valpha; + ((unsigned char *) &valpha)[0] = info->a; + vzero = (vector unsigned int) vec_splat(valpha, 0); + } + } + + assert(srcfmt->BytesPerPixel == 4); + assert(dstfmt->BytesPerPixel == 4); + + while (height--) { + vector unsigned char valigner; + vector unsigned int vbits; + vector unsigned int voverflow; + Uint32 bits; + Uint8 r, g, b, a; + + int width = info->dst_w; + int extrawidth; + + /* do scalar until we can align... */ + while ((UNALIGNED_PTR(dst)) && (width)) { + bits = *(src++); + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + width--; + } + + /* After all that work, here's the vector part! */ + extrawidth = (width % 4); + width -= extrawidth; + valigner = VEC_ALIGNER(src); + vbits = vec_ld(0, src); + + while (width) { + voverflow = vec_ld(15, src); + src += 4; + width -= 4; + vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */ + vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */ + vec_st(vbits, 0, dst); /* store it back out. */ + dst += 4; + vbits = voverflow; + } + + assert(width == 0); + + /* cover pixels at the end of the row that didn't fit in 16 bytes. */ + while (extrawidth) { + bits = *(src++); /* max 7 pixels, don't bother with prefetch. */ + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + extrawidth--; + } + + src += srcskip; + dst += dstskip; + } + +} + +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */ +/* Use this on a G4 */ +static void +ConvertAltivec32to32_prefetch(SDL_BlitInfo * info) +{ + const int scalar_dst_lead = sizeof(Uint32) * 4; + const int vector_dst_lead = sizeof(Uint32) * 16; + + int height = info->dst_h; + Uint32 *src = (Uint32 *) info->src; + int srcskip = info->src_skip / 4; + Uint32 *dst = (Uint32 *) info->dst; + int dstskip = info->dst_skip / 4; + SDL_PixelFormat *srcfmt = info->src_fmt; + SDL_PixelFormat *dstfmt = info->dst_fmt; + vector unsigned int vzero = vec_splat_u32(0); + vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt); + if (dstfmt->Amask && !srcfmt->Amask) { + if (info->a) { + vector unsigned char valpha; + ((unsigned char *) &valpha)[0] = info->a; + vzero = (vector unsigned int) vec_splat(valpha, 0); + } + } + + assert(srcfmt->BytesPerPixel == 4); + assert(dstfmt->BytesPerPixel == 4); + + while (height--) { + vector unsigned char valigner; + vector unsigned int vbits; + vector unsigned int voverflow; + Uint32 bits; + Uint8 r, g, b, a; + + int width = info->dst_w; + int extrawidth; + + /* do scalar until we can align... */ + while ((UNALIGNED_PTR(dst)) && (width)) { + vec_dstt(src + scalar_dst_lead, DST_CTRL(2, 32, 1024), + DST_CHAN_SRC); + vec_dstst(dst + scalar_dst_lead, DST_CTRL(2, 32, 1024), + DST_CHAN_DEST); + bits = *(src++); + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + width--; + } + + /* After all that work, here's the vector part! */ + extrawidth = (width % 4); + width -= extrawidth; + valigner = VEC_ALIGNER(src); + vbits = vec_ld(0, src); + + while (width) { + vec_dstt(src + vector_dst_lead, DST_CTRL(2, 32, 1024), + DST_CHAN_SRC); + vec_dstst(dst + vector_dst_lead, DST_CTRL(2, 32, 1024), + DST_CHAN_DEST); + voverflow = vec_ld(15, src); + src += 4; + width -= 4; + vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */ + vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */ + vec_st(vbits, 0, dst); /* store it back out. */ + dst += 4; + vbits = voverflow; + } + + assert(width == 0); + + /* cover pixels at the end of the row that didn't fit in 16 bytes. */ + while (extrawidth) { + bits = *(src++); /* max 7 pixels, don't bother with prefetch. */ + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + extrawidth--; + } + + src += srcskip; + dst += dstskip; + } + + vec_dss(DST_CHAN_SRC); + vec_dss(DST_CHAN_DEST); +} + +static Uint32 +GetBlitFeatures(void) +{ + static Uint32 features = 0xffffffff; + if (features == 0xffffffff) { + /* Provide an override for testing .. */ + char *override = SDL_getenv("SDL_ALTIVEC_BLIT_FEATURES"); + if (override) { + features = 0; + SDL_sscanf(override, "%u", &features); + } else { + features = (0 + /* Feature 1 is has-MMX */ + | ((SDL_HasMMX())? 1 : 0) + /* Feature 2 is has-AltiVec */ + | ((SDL_HasAltiVec())? 2 : 0) + /* Feature 4 is dont-use-prefetch */ + /* !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4. */ + | ((GetL3CacheSize() == 0) ? 4 : 0) + ); + } + } + return features; +} + +#if __MWERKS__ +#pragma altivec_model off +#endif +#else /* Feature 1 is has-MMX */ #define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0)) +#endif /* This is now endian dependent */ #if SDL_BYTEORDER == SDL_LIL_ENDIAN @@ -1508,6 +2340,15 @@ static const struct blit_table normal_blit_1[] = { }; static const struct blit_table normal_blit_2[] = { +#if SDL_ALTIVEC_BLITTERS + /* has-altivec */ + {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00000000, 0x00000000, + 0x00000000, + 2, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA}, + {0x00007C00, 0x000003E0, 0x0000001F, 4, 0x00000000, 0x00000000, + 0x00000000, + 2, Blit_RGB555_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA}, +#endif {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00, 0x000000FF, 0, Blit_RGB565_ARGB8888, SET_ALPHA}, @@ -1531,6 +2372,22 @@ static const struct blit_table normal_blit_3[] = { }; static const struct blit_table normal_blit_4[] = { +#if SDL_ALTIVEC_BLITTERS + /* has-altivec | dont-use-prefetch */ + {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000, + 0x00000000, + 6, ConvertAltivec32to32_noprefetch, + NO_ALPHA | COPY_ALPHA | SET_ALPHA}, + /* has-altivec */ + {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000, + 0x00000000, + 2, ConvertAltivec32to32_prefetch, + NO_ALPHA | COPY_ALPHA | SET_ALPHA}, + /* has-altivec */ + {0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0, + 0x0000001F, + 2, Blit_RGB888_RGB565Altivec, NO_ALPHA}, +#endif {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0, 0x0000001F, 0, Blit_RGB888_RGB565, NO_ALPHA}, @@ -1628,6 +2485,12 @@ SDL_CalculateBlitN(SDL_Surface * surface) else if (dstfmt->BytesPerPixel == 1) return BlitNto1Key; else { +#if SDL_ALTIVEC_BLITTERS + if ((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4) + && SDL_HasAltiVec()) { + return Blit32to32KeyAltivec; + } else +#endif if (srcfmt->Amask && dstfmt->Amask) { return BlitNtoNKeyCopyAlpha; } else { diff --git a/test/automated/platform/platform.c b/test/automated/platform/platform.c index 6134807d3..4c371ef70 100644 --- a/test/automated/platform/platform.c +++ b/test/automated/platform/platform.c @@ -158,7 +158,9 @@ int test_platform (void) SDL_ATprintVerbose( 1, "CPU count: %d\n", SDL_GetCPUCount()); SDL_ATprintVerbose( 1, "Available extensions:\n" ); SDL_ATprintVerbose( 1, " RDTSC %s\n", SDL_HasRDTSC()? "detected" : "not detected" ); + SDL_ATprintVerbose( 1, " AltiVec %s\n", SDL_HasAltiVec()? "detected" : "not detected" ); SDL_ATprintVerbose( 1, " MMX %s\n", SDL_HasMMX()? "detected" : "not detected" ); + SDL_ATprintVerbose( 1, " 3DNow! %s\n", SDL_Has3DNow()? "detected" : "not detected" ); SDL_ATprintVerbose( 1, " SSE %s\n", SDL_HasSSE()? "detected" : "not detected" ); SDL_ATprintVerbose( 1, " SSE2 %s\n", SDL_HasSSE2()? "detected" : "not detected" ); SDL_ATprintVerbose( 1, " SSE3 %s\n", SDL_HasSSE3()? "detected" : "not detected" ); diff --git a/test/testplatform.c b/test/testplatform.c old mode 100644 new mode 100755 index f2b125479..6509f24c0 --- a/test/testplatform.c +++ b/test/testplatform.c @@ -140,9 +140,11 @@ TestCPUInfo(SDL_bool verbose) { if (verbose) { printf("CPU count: %d\n", SDL_GetCPUCount()); - printf("CPU cache line size: %d\n", SDL_GetCPUCacheLineSize()); + printf("CPU cache line size: %d\n", SDL_GetCPUCacheLineSize()); printf("RDTSC %s\n", SDL_HasRDTSC()? "detected" : "not detected"); + printf("AltiVec %s\n", SDL_HasAltiVec()? "detected" : "not detected"); printf("MMX %s\n", SDL_HasMMX()? "detected" : "not detected"); + printf("3DNow! %s\n", SDL_Has3DNow()? "detected" : "not detected"); printf("SSE %s\n", SDL_HasSSE()? "detected" : "not detected"); printf("SSE2 %s\n", SDL_HasSSE2()? "detected" : "not detected"); printf("SSE3 %s\n", SDL_HasSSE3()? "detected" : "not detected");