Re-added the 3DNow! and AltiVec instruction support.
author Sam Lantinga <slouken@libsdl.org>
Tue, 22 Feb 2011 21:44:36 -0800
changeset 5389 24903690f48a
parent 5388 6e586f65f8ef
child 5390 fefd4f7b5214
Re-added the 3DNow! and AltiVec instruction support.
VisualC/SDL/SDL_VS2005.vcproj
VisualC/SDL/SDL_VS2008.vcproj
VisualC/SDL/SDL_VS2010.vcxproj
configure
configure.in
include/SDL_config.h.in
include/SDL_config_macosx.h
include/SDL_cpuinfo.h
src/cpuinfo/SDL_cpuinfo.c
src/video/SDL_blit.c
src/video/SDL_blit.h
src/video/SDL_blit_A.c
src/video/SDL_blit_N.c
test/automated/platform/platform.c
test/testplatform.c
     1.1 --- a/VisualC/SDL/SDL_VS2005.vcproj	Mon Feb 21 23:45:48 2011 -0800
     1.2 +++ b/VisualC/SDL/SDL_VS2005.vcproj	Tue Feb 22 21:44:36 2011 -0800
     1.3 @@ -52,7 +52,7 @@
     1.4  				Name="VCCLCompilerTool"
     1.5  				Optimization="0"
     1.6  				AdditionalIncludeDirectories="..\..\include"
     1.7 -				PreprocessorDefinitions="_DEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__"
     1.8 +				PreprocessorDefinitions="_DEBUG;_WINDOWS"
     1.9  				RuntimeLibrary="2"
    1.10  				BufferSecurityCheck="false"
    1.11  				UsePrecompiledHeader="0"
    1.12 @@ -231,7 +231,7 @@
    1.13  				InlineFunctionExpansion="1"
    1.14  				EnableIntrinsicFunctions="false"
    1.15  				AdditionalIncludeDirectories="..\..\include"
    1.16 -				PreprocessorDefinitions="NDEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__"
    1.17 +				PreprocessorDefinitions="NDEBUG;_WINDOWS"
    1.18  				StringPooling="true"
    1.19  				RuntimeLibrary="2"
    1.20  				BufferSecurityCheck="false"
     2.1 --- a/VisualC/SDL/SDL_VS2008.vcproj	Mon Feb 21 23:45:48 2011 -0800
     2.2 +++ b/VisualC/SDL/SDL_VS2008.vcproj	Tue Feb 22 21:44:36 2011 -0800
     2.3 @@ -52,7 +52,7 @@
     2.4  				Name="VCCLCompilerTool"
     2.5  				Optimization="0"
     2.6  				AdditionalIncludeDirectories="..\..\include"
     2.7 -				PreprocessorDefinitions="_DEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__"
     2.8 +				PreprocessorDefinitions="_DEBUG;_WINDOWS"
     2.9  				RuntimeLibrary="3"
    2.10  				BufferSecurityCheck="false"
    2.11  				WarningLevel="3"
    2.12 @@ -223,7 +223,7 @@
    2.13  				InlineFunctionExpansion="1"
    2.14  				EnableIntrinsicFunctions="false"
    2.15  				AdditionalIncludeDirectories="..\..\include"
    2.16 -				PreprocessorDefinitions="NDEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__"
    2.17 +				PreprocessorDefinitions="NDEBUG;_WINDOWS"
    2.18  				StringPooling="true"
    2.19  				RuntimeLibrary="2"
    2.20  				BufferSecurityCheck="false"
     3.1 --- a/VisualC/SDL/SDL_VS2010.vcxproj	Mon Feb 21 23:45:48 2011 -0800
     3.2 +++ b/VisualC/SDL/SDL_VS2010.vcxproj	Tue Feb 22 21:44:36 2011 -0800
     3.3 @@ -83,7 +83,7 @@
     3.4      <ClCompile>
     3.5        <Optimization>Disabled</Optimization>
     3.6        <AdditionalIncludeDirectories>..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
     3.7 -      <PreprocessorDefinitions>_DEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     3.8 +      <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     3.9        <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
    3.10        <BufferSecurityCheck>false</BufferSecurityCheck>
    3.11        <PrecompiledHeader>
    3.12 @@ -152,7 +152,7 @@
    3.13        <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
    3.14        <IntrinsicFunctions>false</IntrinsicFunctions>
    3.15        <AdditionalIncludeDirectories>..\..\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
    3.16 -      <PreprocessorDefinitions>NDEBUG;_WINDOWS;_WIN32_WINNT=0x0400;__MMX__;__3dNOW__;__SSE__;__SSE2__;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    3.17 +      <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    3.18        <StringPooling>true</StringPooling>
    3.19        <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
    3.20        <BufferSecurityCheck>false</BufferSecurityCheck>
    3.21 @@ -446,4 +446,4 @@
    3.22    <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
    3.23    <ImportGroup Label="ExtensionTargets">
    3.24    </ImportGroup>
    3.25 -</Project>
    3.26 \ No newline at end of file
    3.27 +</Project>
     4.1 --- a/configure	Mon Feb 21 23:45:48 2011 -0800
     4.2 +++ b/configure	Tue Feb 22 21:44:36 2011 -0800
     4.3 @@ -1514,8 +1514,10 @@
     4.4    --enable-ssemath        Allow GCC to use SSE floating point math
     4.5                            [default=no]
     4.6    --enable-mmx            use MMX assembly routines [default=yes]
     4.7 +  --enable-3dnow          use MMX assembly routines [default=yes]
     4.8    --enable-sse            use SSE assembly routines [default=yes]
     4.9    --enable-sse2           use SSE2 assembly routines [default=no]
    4.10 +  --enable-altivec        use Altivec assembly routines [default=yes]
    4.11    --enable-oss            support the OSS audio API [default=yes]
    4.12    --enable-alsa           support the ALSA audio API [default=yes]
    4.13    --disable-alsatest      Do not try to compile and run a test Alsa program
    4.14 @@ -3768,13 +3770,13 @@
    4.15  else
    4.16    lt_cv_nm_interface="BSD nm"
    4.17    echo "int some_variable = 0;" > conftest.$ac_ext
    4.18 -  (eval echo "\"\$as_me:3771: $ac_compile\"" >&5)
    4.19 +  (eval echo "\"\$as_me:3773: $ac_compile\"" >&5)
    4.20    (eval "$ac_compile" 2>conftest.err)
    4.21    cat conftest.err >&5
    4.22 -  (eval echo "\"\$as_me:3774: $NM \\\"conftest.$ac_objext\\\"\"" >&5)
    4.23 +  (eval echo "\"\$as_me:3776: $NM \\\"conftest.$ac_objext\\\"\"" >&5)
    4.24    (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out)
    4.25    cat conftest.err >&5
    4.26 -  (eval echo "\"\$as_me:3777: output\"" >&5)
    4.27 +  (eval echo "\"\$as_me:3779: output\"" >&5)
    4.28    cat conftest.out >&5
    4.29    if $GREP 'External.*some_variable' conftest.out > /dev/null; then
    4.30      lt_cv_nm_interface="MS dumpbin"
    4.31 @@ -5001,7 +5003,7 @@
    4.32    ;;
    4.33  *-*-irix6*)
    4.34    # Find out which ABI we are using.
    4.35 -  echo '#line 5004 "configure"' > conftest.$ac_ext
    4.36 +  echo '#line 5006 "configure"' > conftest.$ac_ext
    4.37    if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
    4.38    (eval $ac_compile) 2>&5
    4.39    ac_status=$?
    4.40 @@ -7162,11 +7164,11 @@
    4.41     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    4.42     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    4.43     -e 's:$: $lt_compiler_flag:'`
    4.44 -   (eval echo "\"\$as_me:7165: $lt_compile\"" >&5)
    4.45 +   (eval echo "\"\$as_me:7167: $lt_compile\"" >&5)
    4.46     (eval "$lt_compile" 2>conftest.err)
    4.47     ac_status=$?
    4.48     cat conftest.err >&5
    4.49 -   echo "$as_me:7169: \$? = $ac_status" >&5
    4.50 +   echo "$as_me:7171: \$? = $ac_status" >&5
    4.51     if (exit $ac_status) && test -s "$ac_outfile"; then
    4.52       # The compiler can only warn and ignore the option if not recognized
    4.53       # So say no if there are warnings other than the usual output.
    4.54 @@ -7501,11 +7503,11 @@
    4.55     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    4.56     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    4.57     -e 's:$: $lt_compiler_flag:'`
    4.58 -   (eval echo "\"\$as_me:7504: $lt_compile\"" >&5)
    4.59 +   (eval echo "\"\$as_me:7506: $lt_compile\"" >&5)
    4.60     (eval "$lt_compile" 2>conftest.err)
    4.61     ac_status=$?
    4.62     cat conftest.err >&5
    4.63 -   echo "$as_me:7508: \$? = $ac_status" >&5
    4.64 +   echo "$as_me:7510: \$? = $ac_status" >&5
    4.65     if (exit $ac_status) && test -s "$ac_outfile"; then
    4.66       # The compiler can only warn and ignore the option if not recognized
    4.67       # So say no if there are warnings other than the usual output.
    4.68 @@ -7606,11 +7608,11 @@
    4.69     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    4.70     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    4.71     -e 's:$: $lt_compiler_flag:'`
    4.72 -   (eval echo "\"\$as_me:7609: $lt_compile\"" >&5)
    4.73 +   (eval echo "\"\$as_me:7611: $lt_compile\"" >&5)
    4.74     (eval "$lt_compile" 2>out/conftest.err)
    4.75     ac_status=$?
    4.76     cat out/conftest.err >&5
    4.77 -   echo "$as_me:7613: \$? = $ac_status" >&5
    4.78 +   echo "$as_me:7615: \$? = $ac_status" >&5
    4.79     if (exit $ac_status) && test -s out/conftest2.$ac_objext
    4.80     then
    4.81       # The compiler can only warn and ignore the option if not recognized
    4.82 @@ -7661,11 +7663,11 @@
    4.83     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    4.84     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    4.85     -e 's:$: $lt_compiler_flag:'`
    4.86 -   (eval echo "\"\$as_me:7664: $lt_compile\"" >&5)
    4.87 +   (eval echo "\"\$as_me:7666: $lt_compile\"" >&5)
    4.88     (eval "$lt_compile" 2>out/conftest.err)
    4.89     ac_status=$?
    4.90     cat out/conftest.err >&5
    4.91 -   echo "$as_me:7668: \$? = $ac_status" >&5
    4.92 +   echo "$as_me:7670: \$? = $ac_status" >&5
    4.93     if (exit $ac_status) && test -s out/conftest2.$ac_objext
    4.94     then
    4.95       # The compiler can only warn and ignore the option if not recognized
    4.96 @@ -10419,7 +10421,7 @@
    4.97    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
    4.98    lt_status=$lt_dlunknown
    4.99    cat > conftest.$ac_ext <<_LT_EOF
   4.100 -#line 10422 "configure"
   4.101 +#line 10424 "configure"
   4.102  #include "confdefs.h"
   4.103  
   4.104  #if HAVE_DLFCN_H
   4.105 @@ -10515,7 +10517,7 @@
   4.106    lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   4.107    lt_status=$lt_dlunknown
   4.108    cat > conftest.$ac_ext <<_LT_EOF
   4.109 -#line 10518 "configure"
   4.110 +#line 10520 "configure"
   4.111  #include "confdefs.h"
   4.112  
   4.113  #if HAVE_DLFCN_H
   4.114 @@ -14197,11 +14199,11 @@
   4.115     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
   4.116     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
   4.117     -e 's:$: $lt_compiler_flag:'`
   4.118 -   (eval echo "\"\$as_me:14200: $lt_compile\"" >&5)
   4.119 +   (eval echo "\"\$as_me:14202: $lt_compile\"" >&5)
   4.120     (eval "$lt_compile" 2>conftest.err)
   4.121     ac_status=$?
   4.122     cat conftest.err >&5
   4.123 -   echo "$as_me:14204: \$? = $ac_status" >&5
   4.124 +   echo "$as_me:14206: \$? = $ac_status" >&5
   4.125     if (exit $ac_status) && test -s "$ac_outfile"; then
   4.126       # The compiler can only warn and ignore the option if not recognized
   4.127       # So say no if there are warnings other than the usual output.
   4.128 @@ -14296,11 +14298,11 @@
   4.129     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
   4.130     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
   4.131     -e 's:$: $lt_compiler_flag:'`
   4.132 -   (eval echo "\"\$as_me:14299: $lt_compile\"" >&5)
   4.133 +   (eval echo "\"\$as_me:14301: $lt_compile\"" >&5)
   4.134     (eval "$lt_compile" 2>out/conftest.err)
   4.135     ac_status=$?
   4.136     cat out/conftest.err >&5
   4.137 -   echo "$as_me:14303: \$? = $ac_status" >&5
   4.138 +   echo "$as_me:14305: \$? = $ac_status" >&5
   4.139     if (exit $ac_status) && test -s out/conftest2.$ac_objext
   4.140     then
   4.141       # The compiler can only warn and ignore the option if not recognized
   4.142 @@ -14348,11 +14350,11 @@
   4.143     -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
   4.144     -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
   4.145     -e 's:$: $lt_compiler_flag:'`
   4.146 -   (eval echo "\"\$as_me:14351: $lt_compile\"" >&5)
   4.147 +   (eval echo "\"\$as_me:14353: $lt_compile\"" >&5)
   4.148     (eval "$lt_compile" 2>out/conftest.err)
   4.149     ac_status=$?
   4.150     cat out/conftest.err >&5
   4.151 -   echo "$as_me:14355: \$? = $ac_status" >&5
   4.152 +   echo "$as_me:14357: \$? = $ac_status" >&5
   4.153     if (exit $ac_status) && test -s out/conftest2.$ac_objext
   4.154     then
   4.155       # The compiler can only warn and ignore the option if not recognized
   4.156 @@ -20693,6 +20695,79 @@
   4.157          fi
   4.158      fi
   4.159  
   4.160 +    # Check whether --enable-3dnow was given.
   4.161 +if test "${enable_3dnow+set}" = set; then
   4.162 +  enableval=$enable_3dnow;
   4.163 +else
   4.164 +  enable_3dnow=yes
   4.165 +fi
   4.166 +
   4.167 +    if test x$enable_3dnow = xyes; then
   4.168 +        save_CFLAGS="$CFLAGS"
   4.169 +        have_gcc_3dnow=no
   4.170 +        { echo "$as_me:$LINENO: checking for GCC -m3dnow option" >&5
   4.171 +echo $ECHO_N "checking for GCC -m3dnow option... $ECHO_C" >&6; }
   4.172 +        amd3dnow_CFLAGS="-m3dnow"
   4.173 +        CFLAGS="$save_CFLAGS $amd3dnow_CFLAGS"
   4.174 +
   4.175 +        cat >conftest.$ac_ext <<_ACEOF
   4.176 +/* confdefs.h.  */
   4.177 +_ACEOF
   4.178 +cat confdefs.h >>conftest.$ac_ext
   4.179 +cat >>conftest.$ac_ext <<_ACEOF
   4.180 +/* end confdefs.h.  */
   4.181 +
   4.182 +        #include <mm3dnow.h>
   4.183 +        #ifndef __3dNOW__
   4.184 +        #error Assembler CPP flag not enabled
   4.185 +        #endif
   4.186 +
   4.187 +int
   4.188 +main ()
   4.189 +{
   4.190 +
   4.191 +
   4.192 +  ;
   4.193 +  return 0;
   4.194 +}
   4.195 +_ACEOF
   4.196 +rm -f conftest.$ac_objext
   4.197 +if { (ac_try="$ac_compile"
   4.198 +case "(($ac_try" in
   4.199 +  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   4.200 +  *) ac_try_echo=$ac_try;;
   4.201 +esac
   4.202 +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
   4.203 +  (eval "$ac_compile") 2>conftest.er1
   4.204 +  ac_status=$?
   4.205 +  grep -v '^ *+' conftest.er1 >conftest.err
   4.206 +  rm -f conftest.er1
   4.207 +  cat conftest.err >&5
   4.208 +  echo "$as_me:$LINENO: \$? = $ac_status" >&5
   4.209 +  (exit $ac_status); } && {
   4.210 +	 test -z "$ac_c_werror_flag" ||
   4.211 +	 test ! -s conftest.err
   4.212 +       } && test -s conftest.$ac_objext; then
   4.213 +
   4.214 +        have_gcc_3dnow=yes
   4.215 +
   4.216 +else
   4.217 +  echo "$as_me: failed program was:" >&5
   4.218 +sed 's/^/| /' conftest.$ac_ext >&5
   4.219 +
   4.220 +
   4.221 +fi
   4.222 +
   4.223 +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   4.224 +        { echo "$as_me:$LINENO: result: $have_gcc_3dnow" >&5
   4.225 +echo "${ECHO_T}$have_gcc_3dnow" >&6; }
   4.226 +        CFLAGS="$save_CFLAGS"
   4.227 +
   4.228 +        if test x$have_gcc_3dnow = xyes; then
   4.229 +            EXTRA_CFLAGS="$EXTRA_CFLAGS $amd3dnow_CFLAGS"
   4.230 +        fi
   4.231 +    fi
   4.232 +
   4.233      # Check whether --enable-sse was given.
   4.234  if test "${enable_sse+set}" = set; then
   4.235    enableval=$enable_sse;
   4.236 @@ -20856,6 +20931,260 @@
   4.237              EXTRA_CFLAGS="$EXTRA_CFLAGS $sse2_CFLAGS"
   4.238          fi
   4.239      fi
   4.240 +
   4.241 +    # Check whether --enable-altivec was given.
   4.242 +if test "${enable_altivec+set}" = set; then
   4.243 +  enableval=$enable_altivec;
   4.244 +else
   4.245 +  enable_altivec=yes
   4.246 +fi
   4.247 +
   4.248 +    if test x$enable_altivec = xyes; then
   4.249 +        save_CFLAGS="$CFLAGS"
   4.250 +        have_gcc_altivec=no
   4.251 +        have_altivec_h_hdr=no
   4.252 +        altivec_CFLAGS="-maltivec"
   4.253 +        CFLAGS="$save_CFLAGS $altivec_CFLAGS"
   4.254 +
   4.255 +        { echo "$as_me:$LINENO: checking for Altivec with GCC altivec.h and -maltivec option" >&5
   4.256 +echo $ECHO_N "checking for Altivec with GCC altivec.h and -maltivec option... $ECHO_C" >&6; }
   4.257 +        cat >conftest.$ac_ext <<_ACEOF
   4.258 +/* confdefs.h.  */
   4.259 +_ACEOF
   4.260 +cat confdefs.h >>conftest.$ac_ext
   4.261 +cat >>conftest.$ac_ext <<_ACEOF
   4.262 +/* end confdefs.h.  */
   4.263 +
   4.264 +        #include <altivec.h>
   4.265 +        vector unsigned int vzero() {
   4.266 +            return vec_splat_u32(0);
   4.267 +        }
   4.268 +
   4.269 +int
   4.270 +main ()
   4.271 +{
   4.272 +
   4.273 +
   4.274 +  ;
   4.275 +  return 0;
   4.276 +}
   4.277 +_ACEOF
   4.278 +rm -f conftest.$ac_objext
   4.279 +if { (ac_try="$ac_compile"
   4.280 +case "(($ac_try" in
   4.281 +  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   4.282 +  *) ac_try_echo=$ac_try;;
   4.283 +esac
   4.284 +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
   4.285 +  (eval "$ac_compile") 2>conftest.er1
   4.286 +  ac_status=$?
   4.287 +  grep -v '^ *+' conftest.er1 >conftest.err
   4.288 +  rm -f conftest.er1
   4.289 +  cat conftest.err >&5
   4.290 +  echo "$as_me:$LINENO: \$? = $ac_status" >&5
   4.291 +  (exit $ac_status); } && {
   4.292 +	 test -z "$ac_c_werror_flag" ||
   4.293 +	 test ! -s conftest.err
   4.294 +       } && test -s conftest.$ac_objext; then
   4.295 +
   4.296 +        have_gcc_altivec=yes
   4.297 +        have_altivec_h_hdr=yes
   4.298 +
   4.299 +else
   4.300 +  echo "$as_me: failed program was:" >&5
   4.301 +sed 's/^/| /' conftest.$ac_ext >&5
   4.302 +
   4.303 +
   4.304 +fi
   4.305 +
   4.306 +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   4.307 +        { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5
   4.308 +echo "${ECHO_T}$have_gcc_altivec" >&6; }
   4.309 +
   4.310 +        if test x$have_gcc_altivec = xno; then
   4.311 +            { echo "$as_me:$LINENO: checking for Altivec with GCC -maltivec option" >&5
   4.312 +echo $ECHO_N "checking for Altivec with GCC -maltivec option... $ECHO_C" >&6; }
   4.313 +            cat >conftest.$ac_ext <<_ACEOF
   4.314 +/* confdefs.h.  */
   4.315 +_ACEOF
   4.316 +cat confdefs.h >>conftest.$ac_ext
   4.317 +cat >>conftest.$ac_ext <<_ACEOF
   4.318 +/* end confdefs.h.  */
   4.319 +
   4.320 +            vector unsigned int vzero() {
   4.321 +                return vec_splat_u32(0);
   4.322 +            }
   4.323 +
   4.324 +int
   4.325 +main ()
   4.326 +{
   4.327 +
   4.328 +
   4.329 +  ;
   4.330 +  return 0;
   4.331 +}
   4.332 +_ACEOF
   4.333 +rm -f conftest.$ac_objext
   4.334 +if { (ac_try="$ac_compile"
   4.335 +case "(($ac_try" in
   4.336 +  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   4.337 +  *) ac_try_echo=$ac_try;;
   4.338 +esac
   4.339 +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
   4.340 +  (eval "$ac_compile") 2>conftest.er1
   4.341 +  ac_status=$?
   4.342 +  grep -v '^ *+' conftest.er1 >conftest.err
   4.343 +  rm -f conftest.er1
   4.344 +  cat conftest.err >&5
   4.345 +  echo "$as_me:$LINENO: \$? = $ac_status" >&5
   4.346 +  (exit $ac_status); } && {
   4.347 +	 test -z "$ac_c_werror_flag" ||
   4.348 +	 test ! -s conftest.err
   4.349 +       } && test -s conftest.$ac_objext; then
   4.350 +
   4.351 +            have_gcc_altivec=yes
   4.352 +
   4.353 +else
   4.354 +  echo "$as_me: failed program was:" >&5
   4.355 +sed 's/^/| /' conftest.$ac_ext >&5
   4.356 +
   4.357 +
   4.358 +fi
   4.359 +
   4.360 +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   4.361 +            { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5
   4.362 +echo "${ECHO_T}$have_gcc_altivec" >&6; }
   4.363 +        fi
   4.364 +
   4.365 +        if test x$have_gcc_altivec = xno; then
   4.366 +            { echo "$as_me:$LINENO: checking for Altivec with GCC altivec.h and -faltivec option" >&5
   4.367 +echo $ECHO_N "checking for Altivec with GCC altivec.h and -faltivec option... $ECHO_C" >&6; }
   4.368 +            altivec_CFLAGS="-faltivec"
   4.369 +            CFLAGS="$save_CFLAGS $altivec_CFLAGS"
   4.370 +            cat >conftest.$ac_ext <<_ACEOF
   4.371 +/* confdefs.h.  */
   4.372 +_ACEOF
   4.373 +cat confdefs.h >>conftest.$ac_ext
   4.374 +cat >>conftest.$ac_ext <<_ACEOF
   4.375 +/* end confdefs.h.  */
   4.376 +
   4.377 +            #include <altivec.h>
   4.378 +            vector unsigned int vzero() {
   4.379 +                return vec_splat_u32(0);
   4.380 +            }
   4.381 +
   4.382 +int
   4.383 +main ()
   4.384 +{
   4.385 +
   4.386 +
   4.387 +  ;
   4.388 +  return 0;
   4.389 +}
   4.390 +_ACEOF
   4.391 +rm -f conftest.$ac_objext
   4.392 +if { (ac_try="$ac_compile"
   4.393 +case "(($ac_try" in
   4.394 +  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   4.395 +  *) ac_try_echo=$ac_try;;
   4.396 +esac
   4.397 +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
   4.398 +  (eval "$ac_compile") 2>conftest.er1
   4.399 +  ac_status=$?
   4.400 +  grep -v '^ *+' conftest.er1 >conftest.err
   4.401 +  rm -f conftest.er1
   4.402 +  cat conftest.err >&5
   4.403 +  echo "$as_me:$LINENO: \$? = $ac_status" >&5
   4.404 +  (exit $ac_status); } && {
   4.405 +	 test -z "$ac_c_werror_flag" ||
   4.406 +	 test ! -s conftest.err
   4.407 +       } && test -s conftest.$ac_objext; then
   4.408 +
   4.409 +            have_gcc_altivec=yes
   4.410 +            have_altivec_h_hdr=yes
   4.411 +
   4.412 +else
   4.413 +  echo "$as_me: failed program was:" >&5
   4.414 +sed 's/^/| /' conftest.$ac_ext >&5
   4.415 +
   4.416 +
   4.417 +fi
   4.418 +
   4.419 +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   4.420 +            { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5
   4.421 +echo "${ECHO_T}$have_gcc_altivec" >&6; }
   4.422 +        fi
   4.423 +
   4.424 +        if test x$have_gcc_altivec = xno; then
   4.425 +            { echo "$as_me:$LINENO: checking for Altivec with GCC -faltivec option" >&5
   4.426 +echo $ECHO_N "checking for Altivec with GCC -faltivec option... $ECHO_C" >&6; }
   4.427 +            cat >conftest.$ac_ext <<_ACEOF
   4.428 +/* confdefs.h.  */
   4.429 +_ACEOF
   4.430 +cat confdefs.h >>conftest.$ac_ext
   4.431 +cat >>conftest.$ac_ext <<_ACEOF
   4.432 +/* end confdefs.h.  */
   4.433 +
   4.434 +            vector unsigned int vzero() {
   4.435 +                return vec_splat_u32(0);
   4.436 +            }
   4.437 +
   4.438 +int
   4.439 +main ()
   4.440 +{
   4.441 +
   4.442 +
   4.443 +  ;
   4.444 +  return 0;
   4.445 +}
   4.446 +_ACEOF
   4.447 +rm -f conftest.$ac_objext
   4.448 +if { (ac_try="$ac_compile"
   4.449 +case "(($ac_try" in
   4.450 +  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   4.451 +  *) ac_try_echo=$ac_try;;
   4.452 +esac
   4.453 +eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
   4.454 +  (eval "$ac_compile") 2>conftest.er1
   4.455 +  ac_status=$?
   4.456 +  grep -v '^ *+' conftest.er1 >conftest.err
   4.457 +  rm -f conftest.er1
   4.458 +  cat conftest.err >&5
   4.459 +  echo "$as_me:$LINENO: \$? = $ac_status" >&5
   4.460 +  (exit $ac_status); } && {
   4.461 +	 test -z "$ac_c_werror_flag" ||
   4.462 +	 test ! -s conftest.err
   4.463 +       } && test -s conftest.$ac_objext; then
   4.464 +
   4.465 +            have_gcc_altivec=yes
   4.466 +
   4.467 +else
   4.468 +  echo "$as_me: failed program was:" >&5
   4.469 +sed 's/^/| /' conftest.$ac_ext >&5
   4.470 +
   4.471 +
   4.472 +fi
   4.473 +
   4.474 +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
   4.475 +            { echo "$as_me:$LINENO: result: $have_gcc_altivec" >&5
   4.476 +echo "${ECHO_T}$have_gcc_altivec" >&6; }
   4.477 +        fi
   4.478 +        CFLAGS="$save_CFLAGS"
   4.479 +
   4.480 +        if test x$have_gcc_altivec = xyes; then
   4.481 +            cat >>confdefs.h <<\_ACEOF
   4.482 +#define SDL_ALTIVEC_BLITTERS 1
   4.483 +_ACEOF
   4.484 +
   4.485 +            if test x$have_altivec_h_hdr = xyes; then
   4.486 +              cat >>confdefs.h <<\_ACEOF
   4.487 +#define HAVE_ALTIVEC_H 1
   4.488 +_ACEOF
   4.489 +
   4.490 +            fi
   4.491 +            EXTRA_CFLAGS="$EXTRA_CFLAGS $altivec_CFLAGS"
   4.492 +        fi
   4.493 +    fi
   4.494  fi
   4.495  
   4.496  CheckOSS()
     5.1 --- a/configure.in	Mon Feb 21 23:45:48 2011 -0800
     5.2 +++ b/configure.in	Tue Feb 22 21:44:36 2011 -0800
     5.3 @@ -501,6 +501,33 @@
     5.4          fi
     5.5      fi
     5.6  
     5.7 +    AC_ARG_ENABLE(3dnow,
     5.8 +AC_HELP_STRING([--enable-3dnow], [use MMX assembly routines [[default=yes]]]),
     5.9 +                  , enable_3dnow=yes)
    5.10 +    if test x$enable_3dnow = xyes; then
    5.11 +        save_CFLAGS="$CFLAGS"
    5.12 +        have_gcc_3dnow=no
    5.13 +        AC_MSG_CHECKING(for GCC -m3dnow option)
    5.14 +        amd3dnow_CFLAGS="-m3dnow"
    5.15 +        CFLAGS="$save_CFLAGS $amd3dnow_CFLAGS"
    5.16 +
    5.17 +        AC_TRY_COMPILE([
    5.18 +        #include <mm3dnow.h>
    5.19 +        #ifndef __3dNOW__
    5.20 +        #error Assembler CPP flag not enabled
    5.21 +        #endif
    5.22 +        ],[
    5.23 +        ],[
    5.24 +        have_gcc_3dnow=yes
    5.25 +        ])
    5.26 +        AC_MSG_RESULT($have_gcc_3dnow)
    5.27 +        CFLAGS="$save_CFLAGS"
    5.28 +
    5.29 +        if test x$have_gcc_3dnow = xyes; then
    5.30 +            EXTRA_CFLAGS="$EXTRA_CFLAGS $amd3dnow_CFLAGS"
    5.31 +        fi
    5.32 +    fi
    5.33 +
    5.34      AC_ARG_ENABLE(sse,
    5.35  AC_HELP_STRING([--enable-sse], [use SSE assembly routines [[default=yes]]]),
    5.36                    , enable_sse=yes)
    5.37 @@ -572,6 +599,82 @@
    5.38              EXTRA_CFLAGS="$EXTRA_CFLAGS $sse2_CFLAGS"
    5.39          fi
    5.40      fi
    5.41 +
    5.42 +    AC_ARG_ENABLE(altivec,
    5.43 +AC_HELP_STRING([--enable-altivec], [use Altivec assembly routines [[default=yes]]]),
    5.44 +                  , enable_altivec=yes)
    5.45 +    if test x$enable_altivec = xyes; then
    5.46 +        save_CFLAGS="$CFLAGS"
    5.47 +        have_gcc_altivec=no
    5.48 +        have_altivec_h_hdr=no
    5.49 +        altivec_CFLAGS="-maltivec"
    5.50 +        CFLAGS="$save_CFLAGS $altivec_CFLAGS"
    5.51 +
    5.52 +        AC_MSG_CHECKING(for Altivec with GCC altivec.h and -maltivec option)
    5.53 +        AC_TRY_COMPILE([
    5.54 +        #include <altivec.h>
    5.55 +        vector unsigned int vzero() {
    5.56 +            return vec_splat_u32(0);
    5.57 +        }
    5.58 +        ],[
    5.59 +        ],[
    5.60 +        have_gcc_altivec=yes
    5.61 +        have_altivec_h_hdr=yes
    5.62 +        ])
    5.63 +        AC_MSG_RESULT($have_gcc_altivec)
    5.64 +
    5.65 +        if test x$have_gcc_altivec = xno; then
    5.66 +            AC_MSG_CHECKING(for Altivec with GCC -maltivec option)
    5.67 +            AC_TRY_COMPILE([
    5.68 +            vector unsigned int vzero() {
    5.69 +                return vec_splat_u32(0);
    5.70 +            }
    5.71 +            ],[
    5.72 +            ],[
    5.73 +            have_gcc_altivec=yes
    5.74 +            ])
    5.75 +            AC_MSG_RESULT($have_gcc_altivec)
    5.76 +        fi
    5.77 +
    5.78 +        if test x$have_gcc_altivec = xno; then
    5.79 +            AC_MSG_CHECKING(for Altivec with GCC altivec.h and -faltivec option)
    5.80 +            altivec_CFLAGS="-faltivec"
    5.81 +            CFLAGS="$save_CFLAGS $altivec_CFLAGS"
    5.82 +            AC_TRY_COMPILE([
    5.83 +            #include <altivec.h>
    5.84 +            vector unsigned int vzero() {
    5.85 +                return vec_splat_u32(0);
    5.86 +            }
    5.87 +            ],[
    5.88 +            ],[
    5.89 +            have_gcc_altivec=yes
    5.90 +            have_altivec_h_hdr=yes
    5.91 +            ])
    5.92 +            AC_MSG_RESULT($have_gcc_altivec)
    5.93 +        fi
    5.94 +
    5.95 +        if test x$have_gcc_altivec = xno; then
    5.96 +            AC_MSG_CHECKING(for Altivec with GCC -faltivec option)
    5.97 +            AC_TRY_COMPILE([
    5.98 +            vector unsigned int vzero() {
    5.99 +                return vec_splat_u32(0);
   5.100 +            }
   5.101 +            ],[
   5.102 +            ],[
   5.103 +            have_gcc_altivec=yes
   5.104 +            ])
   5.105 +            AC_MSG_RESULT($have_gcc_altivec)
   5.106 +        fi
   5.107 +        CFLAGS="$save_CFLAGS"
   5.108 +
   5.109 +        if test x$have_gcc_altivec = xyes; then
   5.110 +            AC_DEFINE(SDL_ALTIVEC_BLITTERS)
   5.111 +            if test x$have_altivec_h_hdr = xyes; then
   5.112 +              AC_DEFINE(HAVE_ALTIVEC_H)
   5.113 +            fi
   5.114 +            EXTRA_CFLAGS="$EXTRA_CFLAGS $altivec_CFLAGS"
   5.115 +        fi
   5.116 +    fi
   5.117  fi
   5.118  
   5.119  dnl See if the OSS audio interface is supported
     6.1 --- a/include/SDL_config.h.in	Mon Feb 21 23:45:48 2011 -0800
     6.2 +++ b/include/SDL_config.h.in	Tue Feb 22 21:44:36 2011 -0800
     6.3 @@ -34,7 +34,7 @@
     6.4  
     6.5  /* Make sure that this isn't included by Visual C++ */
     6.6  #ifdef _MSC_VER
     6.7 -#error You should copy include/SDL_config.h.default to include/SDL_config.h
     6.8 +#error You should run hg revert SDL_config.h 
     6.9  #endif
    6.10  
    6.11  /* C language features */
    6.12 @@ -82,6 +82,7 @@
    6.13  #undef HAVE_MATH_H
    6.14  #undef HAVE_ICONV_H
    6.15  #undef HAVE_SIGNAL_H
    6.16 +#undef HAVE_ALTIVEC_H
    6.17  
    6.18  /* C library functions */
    6.19  #undef HAVE_MALLOC
    6.20 @@ -302,5 +303,6 @@
    6.21  
    6.22  /* Enable assembly routines */
    6.23  #undef SDL_ASSEMBLY_ROUTINES
    6.24 +#undef SDL_ALTIVEC_BLITTERS
    6.25  
    6.26  #endif /* _SDL_config_h */
     7.1 --- a/include/SDL_config_macosx.h	Mon Feb 21 23:45:48 2011 -0800
     7.2 +++ b/include/SDL_config_macosx.h	Tue Feb 22 21:44:36 2011 -0800
     7.3 @@ -168,5 +168,8 @@
     7.4  
     7.5  /* Enable assembly routines */
     7.6  #define SDL_ASSEMBLY_ROUTINES	1
     7.7 +#ifdef __ppc__
     7.8 +#define SDL_ALTIVEC_BLITTERS	1
     7.9 +#endif
    7.10  
    7.11  #endif /* _SDL_config_macosx_h */
     8.1 --- a/include/SDL_cpuinfo.h	Mon Feb 21 23:45:48 2011 -0800
     8.2 +++ b/include/SDL_cpuinfo.h	Tue Feb 22 21:44:36 2011 -0800
     8.3 @@ -31,6 +31,34 @@
     8.4  
     8.5  #include "SDL_stdinc.h"
     8.6  
     8.7 +/* Need to do this here because intrin.h has C++ code in it */
     8.8 +/* Visual Studio 2005 has a bug where intrin.h conflicts with winnt.h */
     8.9 +#if defined(_MSC_VER) && (_MSC_VER >= 1500) && !defined(_WIN32_WCE)
    8.10 +#include <intrin.h>
    8.11 +#define __MMX__
    8.12 +#define __3dNOW__
    8.13 +#define __SSE__
    8.14 +#define __SSE2__
    8.15 +#elif defined(__MINGW64_VERSION_MAJOR)
    8.16 +#include <intrin.h>
    8.17 +#else
    8.18 +#ifdef __MMX__
    8.19 +#include <mmintrin.h>
    8.20 +#endif
    8.21 +#ifdef __3dNOW__
    8.22 +#include <mm3dnow.h>
    8.23 +#endif
    8.24 +#ifdef __SSE__
    8.25 +#include <xmmintrin.h>
    8.26 +#endif
    8.27 +#ifdef __SSE2__
    8.28 +#include <emmintrin.h>
    8.29 +#endif
    8.30 +#ifdef HAVE_ALTIVEC_H
    8.31 +#include <altivec.h>
    8.32 +#endif
    8.33 +#endif
    8.34 +
    8.35  #include "begin_code.h"
    8.36  /* Set up for C function definitions, even when using C++ */
    8.37  #ifdef __cplusplus
    8.38 @@ -65,11 +93,21 @@
    8.39  extern DECLSPEC SDL_bool SDLCALL SDL_HasRDTSC(void);
    8.40  
    8.41  /**
    8.42 + *  This function returns true if the CPU has AltiVec features.
    8.43 + */
    8.44 +extern DECLSPEC SDL_bool SDLCALL SDL_HasAltiVec(void);
    8.45 +
    8.46 +/**
    8.47   *  This function returns true if the CPU has MMX features.
    8.48   */
    8.49  extern DECLSPEC SDL_bool SDLCALL SDL_HasMMX(void);
    8.50  
    8.51  /**
    8.52 + *  This function returns true if the CPU has 3DNow! features.
    8.53 + */
    8.54 +extern DECLSPEC SDL_bool SDLCALL SDL_Has3DNow(void);
    8.55 +
    8.56 +/**
    8.57   *  This function returns true if the CPU has SSE features.
    8.58   */
    8.59  extern DECLSPEC SDL_bool SDLCALL SDL_HasSSE(void);
     9.1 --- a/src/cpuinfo/SDL_cpuinfo.c	Mon Feb 21 23:45:48 2011 -0800
     9.2 +++ b/src/cpuinfo/SDL_cpuinfo.c	Tue Feb 22 21:44:36 2011 -0800
     9.3 @@ -32,18 +32,37 @@
     9.4  #include <sys/types.h>
     9.5  #include <sys/sysctl.h>
     9.6  #endif
     9.7 +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
     9.8 +#include <sys/sysctl.h>         /* For AltiVec check */
     9.9 +#elif SDL_ALTIVEC_BLITTERS && HAVE_SETJMP
    9.10 +#include <signal.h>
    9.11 +#include <setjmp.h>
    9.12 +#endif
    9.13  #ifdef __WIN32__
    9.14  #include "../core/windows/SDL_windows.h"
    9.15  #endif
    9.16  
    9.17  #define CPU_HAS_RDTSC   0x00000001
    9.18 -#define CPU_HAS_MMX     0x00000002
    9.19 +#define CPU_HAS_ALTIVEC 0x00000002
    9.20 +#define CPU_HAS_MMX     0x00000004
    9.21 +#define CPU_HAS_3DNOW   0x00000008
    9.22  #define CPU_HAS_SSE     0x00000010
    9.23  #define CPU_HAS_SSE2    0x00000020
    9.24  #define CPU_HAS_SSE3    0x00000040
    9.25 -#define CPU_HAS_SSE41   0x00000080
    9.26 -#define CPU_HAS_SSE42   0x00000100
    9.27 +#define CPU_HAS_SSE41   0x00000100
    9.28 +#define CPU_HAS_SSE42   0x00000200
    9.29  
    9.30 +#if SDL_ALTIVEC_BLITTERS && HAVE_SETJMP && !__MACOSX__
    9.31 +/* This is the brute force way of detecting instruction sets...
    9.32 +   the idea is borrowed from the libmpeg2 library - thanks!
    9.33 + */
    9.34 +static jmp_buf jmpbuf;
    9.35 +static void
    9.36 +illegal_instruction(int sig)
    9.37 +{
    9.38 +    longjmp(jmpbuf, 1);
    9.39 +}
    9.40 +#endif /* SDL_ALTIVEC_BLITTERS && HAVE_SETJMP && !__MACOSX__ */
    9.41  
    9.42  static __inline__ int
    9.43  CPU_haveCPUID(void)
    9.44 @@ -193,6 +212,29 @@
    9.45  }
    9.46  
    9.47  static __inline__ int
    9.48 +CPU_haveAltiVec(void)
    9.49 +{
    9.50 +    volatile int altivec = 0;
    9.51 +#if defined(__MACOSX__) && (defined(__ppc__) || defined(__ppc64__))
    9.52 +    int selectors[2] = { CTL_HW, HW_VECTORUNIT };
    9.53 +    int hasVectorUnit = 0;
    9.54 +    size_t length = sizeof(hasVectorUnit);
    9.55 +    int error = sysctl(selectors, 2, &hasVectorUnit, &length, NULL, 0);
    9.56 +    if (0 == error)
    9.57 +        altivec = (hasVectorUnit != 0);
    9.58 +#elif SDL_ALTIVEC_BLITTERS && HAVE_SETJMP
    9.59 +    void (*handler) (int sig);
    9.60 +    handler = signal(SIGILL, illegal_instruction);
    9.61 +    if (setjmp(jmpbuf) == 0) {
    9.62 +        asm volatile ("mtspr 256, %0\n\t" "vand %%v0, %%v0, %%v0"::"r" (-1));
    9.63 +        altivec = 1;
    9.64 +    }
    9.65 +    signal(SIGILL, handler);
    9.66 +#endif
    9.67 +    return altivec;
    9.68 +}
    9.69 +
    9.70 +static __inline__ int
    9.71  CPU_haveMMX(void)
    9.72  {
    9.73      if (CPU_haveCPUID()) {
    9.74 @@ -202,6 +244,21 @@
    9.75  }
    9.76  
    9.77  static __inline__ int
    9.78 +CPU_have3DNow(void)
    9.79 +{
    9.80 +    if (CPU_haveCPUID()) {
    9.81 +        int a, b, c, d;
    9.82 +
    9.83 +        cpuid(0x80000000, a, b, c, d);
    9.84 +        if (a >= 0x80000001) {
    9.85 +            cpuid(0x80000001, a, b, c, d);
    9.86 +            return (d & 0x80000000);
    9.87 +        }
    9.88 +    }
    9.89 +    return 0;
    9.90 +}
    9.91 +
    9.92 +static __inline__ int
    9.93  CPU_haveSSE(void)
    9.94  {
    9.95      if (CPU_haveCPUID()) {
    9.96 @@ -431,9 +488,15 @@
    9.97          if (CPU_haveRDTSC()) {
    9.98              SDL_CPUFeatures |= CPU_HAS_RDTSC;
    9.99          }
   9.100 +        if (CPU_haveAltiVec()) {
   9.101 +            SDL_CPUFeatures |= CPU_HAS_ALTIVEC;
   9.102 +        }
   9.103          if (CPU_haveMMX()) {
   9.104              SDL_CPUFeatures |= CPU_HAS_MMX;
   9.105          }
   9.106 +        if (CPU_have3DNow()) {
   9.107 +            SDL_CPUFeatures |= CPU_HAS_3DNOW;
   9.108 +        }
   9.109          if (CPU_haveSSE()) {
   9.110              SDL_CPUFeatures |= CPU_HAS_SSE;
   9.111          }
   9.112 @@ -463,6 +526,15 @@
   9.113  }
   9.114  
   9.115  SDL_bool
   9.116 +SDL_HasAltiVec(void)
   9.117 +{
   9.118 +    if (SDL_GetCPUFeatures() & CPU_HAS_ALTIVEC) {
   9.119 +        return SDL_TRUE;
   9.120 +    }
   9.121 +    return SDL_FALSE;
   9.122 +}
   9.123 +
   9.124 +SDL_bool
   9.125  SDL_HasMMX(void)
   9.126  {
   9.127      if (SDL_GetCPUFeatures() & CPU_HAS_MMX) {
   9.128 @@ -472,6 +544,15 @@
   9.129  }
   9.130  
   9.131  SDL_bool
   9.132 +SDL_Has3DNow(void)
   9.133 +{
   9.134 +    if (SDL_GetCPUFeatures() & CPU_HAS_3DNOW) {
   9.135 +        return SDL_TRUE;
   9.136 +    }
   9.137 +    return SDL_FALSE;
   9.138 +}
   9.139 +
   9.140 +SDL_bool
   9.141  SDL_HasSSE(void)
   9.142  {
   9.143      if (SDL_GetCPUFeatures() & CPU_HAS_SSE) {
   9.144 @@ -528,7 +609,9 @@
   9.145      printf("CPU name: %s\n", SDL_GetCPUName());
   9.146      printf("CacheLine size: %d\n", SDL_GetCPUCacheLineSize());
   9.147      printf("RDTSC: %d\n", SDL_HasRDTSC());
   9.148 +    printf("Altivec: %d\n", SDL_HasAltiVec());
   9.149      printf("MMX: %d\n", SDL_HasMMX());
   9.150 +    printf("3DNow: %d\n", SDL_Has3DNow());
   9.151      printf("SSE: %d\n", SDL_HasSSE());
   9.152      printf("SSE2: %d\n", SDL_HasSSE2());
   9.153      printf("SSE3: %d\n", SDL_HasSSE3());
    10.1 --- a/src/video/SDL_blit.c	Mon Feb 21 23:45:48 2011 -0800
    10.2 +++ b/src/video/SDL_blit.c	Tue Feb 22 21:44:36 2011 -0800
    10.3 @@ -100,6 +100,30 @@
    10.4      return (okay ? 0 : -1);
    10.5  }
    10.6  
    10.7 +#ifdef __MACOSX__
    10.8 +#include <sys/sysctl.h>
    10.9 +
   10.10 +static SDL_bool
   10.11 +SDL_UseAltivecPrefetch()
   10.12 +{
   10.13 +    const char key[] = "hw.l3cachesize";
   10.14 +    u_int64_t result = 0;
   10.15 +    size_t typeSize = sizeof(result);
   10.16 +
   10.17 +    if (sysctlbyname(key, &result, &typeSize, NULL, 0) == 0 && result > 0) {
   10.18 +        return SDL_TRUE;
   10.19 +    } else {
   10.20 +        return SDL_FALSE;
   10.21 +    }
   10.22 +}
   10.23 +#else
   10.24 +static SDL_bool
   10.25 +SDL_UseAltivecPrefetch()
   10.26 +{
   10.27 +    /* Just guess G4 */
   10.28 +    return SDL_TRUE;
   10.29 +}
   10.30 +#endif /* __MACOSX__ */
   10.31  
   10.32  static SDL_BlitFunc
   10.33  SDL_ChooseBlitFunc(Uint32 src_format, Uint32 dst_format, int flags,
   10.34 @@ -121,12 +145,22 @@
   10.35              if (SDL_HasMMX()) {
   10.36                  features |= SDL_CPU_MMX;
   10.37              }
   10.38 +            if (SDL_Has3DNow()) {
   10.39 +                features |= SDL_CPU_3DNOW;
   10.40 +            }
   10.41              if (SDL_HasSSE()) {
   10.42                  features |= SDL_CPU_SSE;
   10.43              }
   10.44              if (SDL_HasSSE2()) {
   10.45                  features |= SDL_CPU_SSE2;
   10.46              }
   10.47 +            if (SDL_HasAltiVec()) {
   10.48 +                if (SDL_UseAltivecPrefetch()) {
   10.49 +                    features |= SDL_CPU_ALTIVEC_PREFETCH;
   10.50 +                } else {
   10.51 +                    features |= SDL_CPU_ALTIVEC_NOPREFETCH;
   10.52 +                }
   10.53 +            }
   10.54          }
   10.55      }
   10.56  
    11.1 --- a/src/video/SDL_blit.h	Mon Feb 21 23:45:48 2011 -0800
    11.2 +++ b/src/video/SDL_blit.h	Tue Feb 22 21:44:36 2011 -0800
    11.3 @@ -24,24 +24,6 @@
    11.4  #ifndef _SDL_blit_h
    11.5  #define _SDL_blit_h
    11.6  
    11.7 -#ifdef __MINGW32__
    11.8 -#include <_mingw.h>
    11.9 -#endif
   11.10 -
   11.11 -#if defined(__MINGW32__) && defined(__MINGW64_VERSION_MAJOR)
   11.12 -#include <intrin.h>
   11.13 -#else
   11.14 -#ifdef __MMX__
   11.15 -#include <mmintrin.h>
   11.16 -#endif
   11.17 -#ifdef __SSE__
   11.18 -#include <xmmintrin.h>
   11.19 -#endif
   11.20 -#ifdef __SSE2__
   11.21 -#include <emmintrin.h>
   11.22 -#endif
   11.23 -#endif
   11.24 -
   11.25  #include "SDL_cpuinfo.h"
   11.26  #include "SDL_endian.h"
   11.27  #include "SDL_surface.h"
   11.28 @@ -62,8 +44,11 @@
   11.29  /* SDL blit CPU flags */
   11.30  #define SDL_CPU_ANY                 0x00000000
   11.31  #define SDL_CPU_MMX                 0x00000001
   11.32 +#define SDL_CPU_3DNOW               0x00000002
   11.33  #define SDL_CPU_SSE                 0x00000004
   11.34  #define SDL_CPU_SSE2                0x00000008
   11.35 +#define SDL_CPU_ALTIVEC_PREFETCH    0x00000010
   11.36 +#define SDL_CPU_ALTIVEC_NOPREFETCH  0x00000020
   11.37  
   11.38  typedef struct
   11.39  {
    12.1 --- a/src/video/SDL_blit_A.c	Mon Feb 21 23:45:48 2011 -0800
    12.2 +++ b/src/video/SDL_blit_A.c	Tue Feb 22 21:44:36 2011 -0800
    12.3 @@ -419,6 +419,806 @@
    12.4  
    12.5  #endif /* __MMX__ */
    12.6  
    12.7 +#if SDL_ALTIVEC_BLITTERS
    12.8 +#if __MWERKS__
    12.9 +#pragma altivec_model on
   12.10 +#endif
   12.11 +#if HAVE_ALTIVEC_H
   12.12 +#include <altivec.h>
   12.13 +#endif
   12.14 +#include <assert.h>
   12.15 +
   12.16 +#if (defined(__MACOSX__) && (__GNUC__ < 4))
   12.17 +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   12.18 +        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   12.19 +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   12.20 +        (vector unsigned short) ( a,b,c,d,e,f,g,h )
   12.21 +#else
   12.22 +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   12.23 +        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   12.24 +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   12.25 +        (vector unsigned short) { a,b,c,d,e,f,g,h }
   12.26 +#endif
   12.27 +
   12.28 +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   12.29 +#define VECPRINT(msg, v) do { \
   12.30 +    vector unsigned int tmpvec = (vector unsigned int)(v); \
   12.31 +    unsigned int *vp = (unsigned int *)&tmpvec; \
   12.32 +    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   12.33 +} while (0)
   12.34 +
   12.35 +/* the permutation vector that takes the high bytes out of all the appropriate shorts
   12.36 +    (vector unsigned char)(
   12.37 +        0x00, 0x10, 0x02, 0x12,
   12.38 +        0x04, 0x14, 0x06, 0x16,
   12.39 +        0x08, 0x18, 0x0A, 0x1A,
   12.40 +        0x0C, 0x1C, 0x0E, 0x1E );
   12.41 +*/
   12.42 +#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   12.43 +#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   12.44 +#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   12.45 +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   12.46 +    ? vec_lvsl(0, src) \
   12.47 +    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   12.48 +
   12.49 +
   12.50 +#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   12.51 +    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   12.52 +    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   12.53 +    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   12.54 +    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   12.55 +    /* valpha2 is 255-alpha */ \
   12.56 +    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   12.57 +    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   12.58 +    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   12.59 +    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   12.60 +    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   12.61 +    /* add source and dest */ \
   12.62 +    vtemp1 = vec_add(vtemp1, vtemp3); \
   12.63 +    vtemp2 = vec_add(vtemp2, vtemp4); \
   12.64 +    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   12.65 +    vtemp1 = vec_add(vtemp1, v1_16); \
   12.66 +    vtemp3 = vec_sr(vtemp1, v8_16); \
   12.67 +    vtemp1 = vec_add(vtemp1, vtemp3); \
   12.68 +    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   12.69 +    vtemp2 = vec_add(vtemp2, v1_16); \
   12.70 +    vtemp4 = vec_sr(vtemp2, v8_16); \
   12.71 +    vtemp2 = vec_add(vtemp2, vtemp4); \
   12.72 +    /* (>>8) and get ARGBARGBARGBARGB */ \
   12.73 +    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   12.74 +} while (0)
   12.75 +
   12.76 +/* Calculate the permute vector used for 32->32 swizzling */
   12.77 +static vector unsigned char
   12.78 +calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   12.79 +{
   12.80 +    /*
   12.81 +     * We have to assume that the bits that aren't used by other
   12.82 +     *  colors are alpha, and that alpha is one complete byte, since some
   12.83 +     *  formats leave alpha with a zero mask, but we should still swizzle the bits.
   12.84 +     */
   12.85 +    /* ARGB */
   12.86 +    const static struct SDL_PixelFormat default_pixel_format = {
   12.87 +        NULL, 0, 0,
   12.88 +        0, 0, 0, 0,
   12.89 +        16, 8, 0, 24,
   12.90 +        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000
   12.91 +    };
   12.92 +    if (!srcfmt) {
   12.93 +        srcfmt = &default_pixel_format;
   12.94 +    }
   12.95 +    if (!dstfmt) {
   12.96 +        dstfmt = &default_pixel_format;
   12.97 +    }
   12.98 +    const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   12.99 +                                                       0x04, 0x04, 0x04, 0x04,
  12.100 +                                                       0x08, 0x08, 0x08, 0x08,
  12.101 +                                                       0x0C, 0x0C, 0x0C,
  12.102 +                                                       0x0C);
  12.103 +    vector unsigned char vswiz;
  12.104 +    vector unsigned int srcvec;
  12.105 +#define RESHIFT(X) (3 - ((X) >> 3))
  12.106 +    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
  12.107 +    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
  12.108 +    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
  12.109 +    Uint32 amask;
  12.110 +    /* Use zero for alpha if either surface doesn't have alpha */
  12.111 +    if (dstfmt->Amask) {
  12.112 +        amask =
  12.113 +            ((srcfmt->Amask) ? RESHIFT(srcfmt->
  12.114 +                                       Ashift) : 0x10) << (dstfmt->Ashift);
  12.115 +    } else {
  12.116 +        amask =
  12.117 +            0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
  12.118 +                          0xFFFFFFFF);
  12.119 +    }
  12.120 +#undef RESHIFT
  12.121 +    ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
  12.122 +    vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
  12.123 +    return (vswiz);
  12.124 +}
  12.125 +
  12.126 +static void
  12.127 +Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
  12.128 +{
  12.129 +    int height = info->dst_h;
  12.130 +    Uint8 *src = (Uint8 *) info->src;
  12.131 +    int srcskip = info->src_skip;
  12.132 +    Uint8 *dst = (Uint8 *) info->dst;
  12.133 +    int dstskip = info->dst_skip;
  12.134 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  12.135 +
  12.136 +    vector unsigned char v0 = vec_splat_u8(0);
  12.137 +    vector unsigned short v8_16 = vec_splat_u16(8);
  12.138 +    vector unsigned short v1_16 = vec_splat_u16(1);
  12.139 +    vector unsigned short v2_16 = vec_splat_u16(2);
  12.140 +    vector unsigned short v3_16 = vec_splat_u16(3);
  12.141 +    vector unsigned int v8_32 = vec_splat_u32(8);
  12.142 +    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
  12.143 +    vector unsigned short v3f =
  12.144 +        VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
  12.145 +                          0x003f, 0x003f, 0x003f, 0x003f);
  12.146 +    vector unsigned short vfc =
  12.147 +        VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
  12.148 +                          0x00fc, 0x00fc, 0x00fc, 0x00fc);
  12.149 +
  12.150 +    /* 
  12.151 +       0x10 - 0x1f is the alpha
  12.152 +       0x00 - 0x0e evens are the red
  12.153 +       0x01 - 0x0f odds are zero
  12.154 +     */
  12.155 +    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
  12.156 +                                                       0x10, 0x02, 0x01, 0x01,
  12.157 +                                                       0x10, 0x04, 0x01, 0x01,
  12.158 +                                                       0x10, 0x06, 0x01,
  12.159 +                                                       0x01);
  12.160 +    vector unsigned char vredalpha2 =
  12.161 +        (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
  12.162 +                                        vec_sl(v8_32, v16_32))
  12.163 +        );
  12.164 +    /*
  12.165 +       0x00 - 0x0f is ARxx ARxx ARxx ARxx
  12.166 +       0x11 - 0x1f odds are blue
  12.167 +     */
  12.168 +    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
  12.169 +                                                   0x04, 0x05, 0x06, 0x13,
  12.170 +                                                   0x08, 0x09, 0x0a, 0x15,
  12.171 +                                                   0x0c, 0x0d, 0x0e, 0x17);
  12.172 +    vector unsigned char vblue2 =
  12.173 +        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
  12.174 +        );
  12.175 +    /*
  12.176 +       0x00 - 0x0f is ARxB ARxB ARxB ARxB
  12.177 +       0x10 - 0x1e evens are green
  12.178 +     */
  12.179 +    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
  12.180 +                                                    0x04, 0x05, 0x12, 0x07,
  12.181 +                                                    0x08, 0x09, 0x14, 0x0b,
  12.182 +                                                    0x0c, 0x0d, 0x16, 0x0f);
  12.183 +    vector unsigned char vgreen2 =
  12.184 +        (vector unsigned
  12.185 +         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
  12.186 +        );
  12.187 +    vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
  12.188 +                                                    0x00, 0x0a, 0x00, 0x0e,
  12.189 +                                                    0x00, 0x12, 0x00, 0x16,
  12.190 +                                                    0x00, 0x1a, 0x00, 0x1e);
  12.191 +    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
  12.192 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
  12.193 +    vector unsigned char valphaPermute =
  12.194 +        vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  12.195 +
  12.196 +    vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
  12.197 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
  12.198 +
  12.199 +    while (height--) {
  12.200 +        int extrawidth;
  12.201 +        vector unsigned char valigner;
  12.202 +        vector unsigned char vsrc;
  12.203 +        vector unsigned char voverflow;
  12.204 +        int width = info->dst_w;
  12.205 +
  12.206 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  12.207 +        while (condition) { \
  12.208 +            Uint32 Pixel; \
  12.209 +            unsigned sR, sG, sB, dR, dG, dB, sA; \
  12.210 +            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  12.211 +            if(sA) { \
  12.212 +                unsigned short dstpixel = *((unsigned short *)dst); \
  12.213 +                dR = (dstpixel >> 8) & 0xf8; \
  12.214 +                dG = (dstpixel >> 3) & 0xfc; \
  12.215 +                dB = (dstpixel << 3) & 0xf8; \
  12.216 +                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  12.217 +                *((unsigned short *)dst) = ( \
  12.218 +                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
  12.219 +                ); \
  12.220 +            } \
  12.221 +            src += 4; \
  12.222 +            dst += 2; \
  12.223 +            widthvar--; \
  12.224 +        }
  12.225 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
  12.226 +        extrawidth = (width % 8);
  12.227 +        valigner = VEC_ALIGNER(src);
  12.228 +        vsrc = (vector unsigned char) vec_ld(0, src);
  12.229 +        width -= extrawidth;
  12.230 +        while (width) {
  12.231 +            vector unsigned char valpha;
  12.232 +            vector unsigned char vsrc1, vsrc2;
  12.233 +            vector unsigned char vdst1, vdst2;
  12.234 +            vector unsigned short vR, vG, vB;
  12.235 +            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
  12.236 +
  12.237 +            /* Load 8 pixels from src as ARGB */
  12.238 +            voverflow = (vector unsigned char) vec_ld(15, src);
  12.239 +            vsrc = vec_perm(vsrc, voverflow, valigner);
  12.240 +            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
  12.241 +            src += 16;
  12.242 +            vsrc = (vector unsigned char) vec_ld(15, src);
  12.243 +            voverflow = vec_perm(voverflow, vsrc, valigner);
  12.244 +            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
  12.245 +            src += 16;
  12.246 +
  12.247 +            /* Load 8 pixels from dst as XRGB */
  12.248 +            voverflow = vec_ld(0, dst);
  12.249 +            vR = vec_and((vector unsigned short) voverflow, vf800);
  12.250 +            vB = vec_sl((vector unsigned short) voverflow, v3_16);
  12.251 +            vG = vec_sl(vB, v2_16);
  12.252 +            vdst1 =
  12.253 +                (vector unsigned char) vec_perm((vector unsigned char) vR,
  12.254 +                                                (vector unsigned char) vR,
  12.255 +                                                vredalpha1);
  12.256 +            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
  12.257 +            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
  12.258 +            vdst2 =
  12.259 +                (vector unsigned char) vec_perm((vector unsigned char) vR,
  12.260 +                                                (vector unsigned char) vR,
  12.261 +                                                vredalpha2);
  12.262 +            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
  12.263 +            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
  12.264 +
  12.265 +            /* Alpha blend 8 pixels as ARGB */
  12.266 +            valpha = vec_perm(vsrc1, v0, valphaPermute);
  12.267 +            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
  12.268 +                               v8_16);
  12.269 +            valpha = vec_perm(vsrc2, v0, valphaPermute);
  12.270 +            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
  12.271 +                               v8_16);
  12.272 +
  12.273 +            /* Convert 8 pixels to 565 */
  12.274 +            vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
  12.275 +                                                        vdst1,
  12.276 +                                                        (vector unsigned int)
  12.277 +                                                        vdst2);
  12.278 +            vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
  12.279 +            vgpixel = vec_and(vgpixel, vfc);
  12.280 +            vgpixel = vec_sl(vgpixel, v3_16);
  12.281 +            vrpixel = vec_sl(vpixel, v1_16);
  12.282 +            vrpixel = vec_and(vrpixel, vf800);
  12.283 +            vbpixel = vec_and(vpixel, v3f);
  12.284 +            vdst1 =
  12.285 +                vec_or((vector unsigned char) vrpixel,
  12.286 +                       (vector unsigned char) vgpixel);
  12.287 +            vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
  12.288 +
  12.289 +            /* Store 8 pixels */
  12.290 +            vec_st(vdst1, 0, dst);
  12.291 +
  12.292 +            width -= 8;
  12.293 +            dst += 16;
  12.294 +        }
  12.295 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
  12.296 +#undef ONE_PIXEL_BLEND
  12.297 +        src += srcskip;
  12.298 +        dst += dstskip;
  12.299 +    }
  12.300 +}
  12.301 +
  12.302 +static void
  12.303 +Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
  12.304 +{
  12.305 +    int height = info->dst_h;
  12.306 +    Uint32 *srcp = (Uint32 *) info->src;
  12.307 +    int srcskip = info->src_skip >> 2;
  12.308 +    Uint32 *dstp = (Uint32 *) info->dst;
  12.309 +    int dstskip = info->dst_skip >> 2;
  12.310 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  12.311 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  12.312 +    unsigned sA = info->a;
  12.313 +    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  12.314 +    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
  12.315 +    Uint32 ckey = info->colorkey;
  12.316 +    vector unsigned char mergePermute;
  12.317 +    vector unsigned char vsrcPermute;
  12.318 +    vector unsigned char vdstPermute;
  12.319 +    vector unsigned char vsdstPermute;
  12.320 +    vector unsigned char valpha;
  12.321 +    vector unsigned char valphamask;
  12.322 +    vector unsigned char vbits;
  12.323 +    vector unsigned char v0;
  12.324 +    vector unsigned short v1;
  12.325 +    vector unsigned short v8;
  12.326 +    vector unsigned int vckey;
  12.327 +    vector unsigned int vrgbmask;
  12.328 +
  12.329 +    mergePermute = VEC_MERGE_PERMUTE();
  12.330 +    v0 = vec_splat_u8(0);
  12.331 +    v1 = vec_splat_u16(1);
  12.332 +    v8 = vec_splat_u16(8);
  12.333 +
  12.334 +    /* set the alpha to 255 on the destination surf */
  12.335 +    valphamask = VEC_ALPHA_MASK();
  12.336 +
  12.337 +    vsrcPermute = calc_swizzle32(srcfmt, NULL);
  12.338 +    vdstPermute = calc_swizzle32(NULL, dstfmt);
  12.339 +    vsdstPermute = calc_swizzle32(dstfmt, NULL);
  12.340 +
  12.341 +    /* set a vector full of alpha (255-alpha is derived in VEC_MULTIPLY_ALPHA) */
  12.342 +    ((unsigned char *) &valpha)[0] = sA;
  12.343 +    valpha = vec_splat(valpha, 0);
  12.344 +    vbits = (vector unsigned char) vec_splat_s8(-1);
  12.345 +
  12.346 +    ckey &= rgbmask;
  12.347 +    ((unsigned int *) (char *) &vckey)[0] = ckey;
  12.348 +    vckey = vec_splat(vckey, 0);
  12.349 +    ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  12.350 +    vrgbmask = vec_splat(vrgbmask, 0);
  12.351 +
  12.352 +    while (height--) {
  12.353 +        int width = info->dst_w;
  12.354 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  12.355 +        while (condition) { \
  12.356 +            Uint32 Pixel; \
  12.357 +            unsigned sR, sG, sB, dR, dG, dB; \
  12.358 +            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
  12.359 +            if(sA && Pixel != ckey) { \
  12.360 +                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  12.361 +                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  12.362 +                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  12.363 +                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  12.364 +            } \
  12.365 +            dstp++; \
  12.366 +            srcp++; \
  12.367 +            widthvar--; \
  12.368 +        }
  12.369 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  12.370 +        if (width > 0) {
  12.371 +            int extrawidth = (width % 4);
  12.372 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
  12.373 +            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  12.374 +            width -= extrawidth;
  12.375 +            while (width) {
  12.376 +                vector unsigned char vsel;
  12.377 +                vector unsigned char voverflow;
  12.378 +                vector unsigned char vd;
  12.379 +                vector unsigned char vd_orig;
  12.380 +
  12.381 +                /* s = *srcp */
  12.382 +                voverflow = (vector unsigned char) vec_ld(15, srcp);
  12.383 +                vs = vec_perm(vs, voverflow, valigner);
  12.384 +
  12.385 +                /* vsel is set for items that match the key */
  12.386 +                vsel =
  12.387 +                    (vector unsigned char) vec_and((vector unsigned int) vs,
  12.388 +                                                   vrgbmask);
  12.389 +                vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  12.390 +                                                        vsel, vckey);
  12.391 +
  12.392 +                /* permute to source format */
  12.393 +                vs = vec_perm(vs, valpha, vsrcPermute);
  12.394 +
  12.395 +                /* d = *dstp */
  12.396 +                vd = (vector unsigned char) vec_ld(0, dstp);
  12.397 +                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  12.398 +
  12.399 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  12.400 +
  12.401 +                /* set the alpha channel to full on */
  12.402 +                vd = vec_or(vd, valphamask);
  12.403 +
  12.404 +                /* mask out color key */
  12.405 +                vd = vec_sel(vd, vd_orig, vsel);
  12.406 +
  12.407 +                /* permute to dest format */
  12.408 +                vd = vec_perm(vd, vbits, vdstPermute);
  12.409 +
  12.410 +                /* *dstp = res */
  12.411 +                vec_st((vector unsigned int) vd, 0, dstp);
  12.412 +
  12.413 +                srcp += 4;
  12.414 +                dstp += 4;
  12.415 +                width -= 4;
  12.416 +                vs = voverflow;
  12.417 +            }
  12.418 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);
  12.419 +        }
  12.420 +#undef ONE_PIXEL_BLEND
  12.421 +
  12.422 +        srcp += srcskip;
  12.423 +        dstp += dstskip;
  12.424 +    }
  12.425 +}
  12.426 +
  12.427 +
  12.428 +static void
  12.429 +Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)   /* per-pixel alpha blend, 32-bit to 32-bit; both layouts go through calc_swizzle32() */
  12.430 +{
  12.431 +    int width = info->dst_w;
  12.432 +    int height = info->dst_h;
  12.433 +    Uint32 *srcp = (Uint32 *) info->src;
  12.434 +    int srcskip = info->src_skip >> 2;  /* skip / 4: srcp and dstp advance in 32-bit pixels */
  12.435 +    Uint32 *dstp = (Uint32 *) info->dst;
  12.436 +    int dstskip = info->dst_skip >> 2;
  12.437 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  12.438 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  12.439 +    vector unsigned char mergePermute;
  12.440 +    vector unsigned char valphaPermute;
  12.441 +    vector unsigned char vsrcPermute;
  12.442 +    vector unsigned char vdstPermute;
  12.443 +    vector unsigned char vsdstPermute;
  12.444 +    vector unsigned char valphamask;
  12.445 +    vector unsigned char vpixelmask;
  12.446 +    vector unsigned char v0;
  12.447 +    vector unsigned short v1;
  12.448 +    vector unsigned short v8;
  12.449 +
  12.450 +    v0 = vec_splat_u8(0);
  12.451 +    v1 = vec_splat_u16(1);
  12.452 +    v8 = vec_splat_u16(8);
  12.453 +    mergePermute = VEC_MERGE_PERMUTE();
  12.454 +    valphamask = VEC_ALPHA_MASK();
  12.455 +    valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));    /* indices 0,0,0,0,4,4,4,4,8,...: spreads byte 0 of each pixel over that pixel */
  12.456 +    vpixelmask = vec_nor(valphamask, v0);       /* bitwise complement of valphamask */
  12.457 +    vsrcPermute = calc_swizzle32(srcfmt, NULL); /* source layout -> calc_swizzle32's default ARGB layout */
  12.458 +    vdstPermute = calc_swizzle32(NULL, dstfmt); /* default ARGB layout -> destination layout */
  12.459 +    vsdstPermute = calc_swizzle32(dstfmt, NULL);        /* destination layout -> default ARGB layout */
  12.460 +
  12.461 +    while (height--) {
  12.462 +        width = info->dst_w;
  12.463 +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  12.464 +            Uint32 Pixel; \
  12.465 +            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  12.466 +            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  12.467 +            if(sA) { \
  12.468 +              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  12.469 +              ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  12.470 +              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  12.471 +            } \
  12.472 +            ++srcp; \
  12.473 +            ++dstp; \
  12.474 +            widthvar--; \
  12.475 +        }
  12.476 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);       /* scalar blend until dstp is 16-byte aligned */
  12.477 +        if (width > 0) {
  12.478 +            /* vsrcPermute */
  12.479 +            /* vdstPermute */
  12.480 +            int extrawidth = (width % 4);       /* leftover pixels, blended after the 4-pixel vector loop */
  12.481 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
  12.482 +            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  12.483 +            width -= extrawidth;
  12.484 +            while (width) {
  12.485 +                vector unsigned char voverflow;
  12.486 +                vector unsigned char vd;
  12.487 +                vector unsigned char valpha;
  12.488 +                vector unsigned char vdstalpha;
  12.489 +                /* s = *srcp */
  12.490 +                voverflow = (vector unsigned char) vec_ld(15, srcp);
  12.491 +                vs = vec_perm(vs, voverflow, valigner);
  12.492 +                vs = vec_perm(vs, v0, vsrcPermute);
  12.493 +
  12.494 +                valpha = vec_perm(vs, v0, valphaPermute);
  12.495 +
  12.496 +                /* d = *dstp */
  12.497 +                vd = (vector unsigned char) vec_ld(0, dstp);
  12.498 +                vd = vec_perm(vd, v0, vsdstPermute);
  12.499 +                vdstalpha = vec_and(vd, valphamask);    /* keep the destination alpha for after the blend */
  12.500 +
  12.501 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  12.502 +
  12.503 +                /* set the alpha to the dest alpha */
  12.504 +                vd = vec_and(vd, vpixelmask);
  12.505 +                vd = vec_or(vd, vdstalpha);
  12.506 +                vd = vec_perm(vd, v0, vdstPermute);
  12.507 +
  12.508 +                /* *dstp = res */
  12.509 +                vec_st((vector unsigned int) vd, 0, dstp);
  12.510 +
  12.511 +                srcp += 4;
  12.512 +                dstp += 4;
  12.513 +                width -= 4;
  12.514 +                vs = voverflow;
  12.515 +
  12.516 +            }
  12.517 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* scalar blend of the leftover pixels */
  12.518 +        }
  12.519 +        srcp += srcskip;
  12.520 +        dstp += dstskip;
  12.521 +#undef ONE_PIXEL_BLEND
  12.522 +    }
  12.523 +}
  12.524 +
  12.525 +/* fast ARGB888->(A)RGB888 blending with pixel alpha; the destination alpha byte is kept */
  12.526 +static void
  12.527 +BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  12.528 +{
  12.529 +    int width = info->dst_w;
  12.530 +    int height = info->dst_h;
  12.531 +    Uint32 *srcp = (Uint32 *) info->src;
  12.532 +    int srcskip = info->src_skip >> 2;  /* skip / 4: srcp and dstp advance in 32-bit pixels */
  12.533 +    Uint32 *dstp = (Uint32 *) info->dst;
  12.534 +    int dstskip = info->dst_skip >> 2;
  12.535 +    vector unsigned char mergePermute;
  12.536 +    vector unsigned char valphaPermute;
  12.537 +    vector unsigned char valphamask;
  12.538 +    vector unsigned char vpixelmask;
  12.539 +    vector unsigned char v0;
  12.540 +    vector unsigned short v1;
  12.541 +    vector unsigned short v8;
  12.542 +    v0 = vec_splat_u8(0);
  12.543 +    v1 = vec_splat_u16(1);
  12.544 +    v8 = vec_splat_u16(8);
  12.545 +    mergePermute = VEC_MERGE_PERMUTE();
  12.546 +    valphamask = VEC_ALPHA_MASK();
  12.547 +    valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));    /* indices 0,0,0,0,4,4,4,4,8,...: spreads byte 0 of each pixel over that pixel */
  12.548 +
  12.549 +
  12.550 +    vpixelmask = vec_nor(valphamask, v0);       /* bitwise complement of valphamask */
  12.551 +    while (height--) {
  12.552 +        width = info->dst_w;
  12.553 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  12.554 +        while ((condition)) { \
  12.555 +            Uint32 dalpha; \
  12.556 +            Uint32 d; \
  12.557 +            Uint32 s1; \
  12.558 +            Uint32 d1; \
  12.559 +            Uint32 s = *srcp; \
  12.560 +            Uint32 alpha = s >> 24; \
  12.561 +            if(alpha) { \
  12.562 +              if(alpha == SDL_ALPHA_OPAQUE) { \
  12.563 +                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  12.564 +              } else { \
  12.565 +                d = *dstp; \
  12.566 +                dalpha = d & 0xff000000; \
  12.567 +                s1 = s & 0xff00ff; \
  12.568 +                d1 = d & 0xff00ff; \
  12.569 +                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  12.570 +                s &= 0xff00; \
  12.571 +                d &= 0xff00; \
  12.572 +                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  12.573 +                *dstp = d1 | d | dalpha; \
  12.574 +              } \
  12.575 +            } \
  12.576 +            ++srcp; \
  12.577 +            ++dstp; \
  12.578 +            widthvar--; \
  12.579 +	    }
  12.580 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);       /* scalar blend until dstp is 16-byte aligned */
  12.581 +        if (width > 0) {
  12.582 +            int extrawidth = (width % 4);       /* leftover pixels, blended after the 4-pixel vector loop */
  12.583 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
  12.584 +            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  12.585 +            width -= extrawidth;
  12.586 +            while (width) {
  12.587 +                vector unsigned char voverflow;
  12.588 +                vector unsigned char vd;
  12.589 +                vector unsigned char valpha;
  12.590 +                vector unsigned char vdstalpha;
  12.591 +                /* s = *srcp */
  12.592 +                voverflow = (vector unsigned char) vec_ld(15, srcp);
  12.593 +                vs = vec_perm(vs, voverflow, valigner);
  12.594 +
  12.595 +                valpha = vec_perm(vs, v0, valphaPermute);
  12.596 +
  12.597 +                /* d = *dstp */
  12.598 +                vd = (vector unsigned char) vec_ld(0, dstp);
  12.599 +                vdstalpha = vec_and(vd, valphamask);    /* keep the destination alpha for after the blend */
  12.600 +
  12.601 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  12.602 +
  12.603 +                /* set the alpha to the dest alpha */
  12.604 +                vd = vec_and(vd, vpixelmask);
  12.605 +                vd = vec_or(vd, vdstalpha);
  12.606 +
  12.607 +                /* *dstp = res */
  12.608 +                vec_st((vector unsigned int) vd, 0, dstp);
  12.609 +
  12.610 +                srcp += 4;
  12.611 +                dstp += 4;
  12.612 +                width -= 4;
  12.613 +                vs = voverflow;
  12.614 +            }
  12.615 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* scalar blend of the leftover pixels */
  12.616 +        }
  12.617 +        srcp += srcskip;
  12.618 +        dstp += dstskip;
  12.619 +    }
  12.620 +#undef ONE_PIXEL_BLEND
  12.621 +}
  12.622 +
  12.623 +static void
  12.624 +Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info) /* blend 32-bit pixels with the single surface alpha info->a; both layouts go through calc_swizzle32() */
  12.625 +{
  12.626 +    /* XXX : 6 */
  12.627 +    int height = info->dst_h;
  12.628 +    Uint32 *srcp = (Uint32 *) info->src;
  12.629 +    int srcskip = info->src_skip >> 2;  /* skip / 4: srcp and dstp advance in 32-bit pixels */
  12.630 +    Uint32 *dstp = (Uint32 *) info->dst;
  12.631 +    int dstskip = info->dst_skip >> 2;
  12.632 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  12.633 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  12.634 +    unsigned sA = info->a;
  12.635 +    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; /* alpha stored by the scalar path: opaque when the destination has an alpha mask */
  12.636 +    vector unsigned char mergePermute;
  12.637 +    vector unsigned char vsrcPermute;
  12.638 +    vector unsigned char vdstPermute;
  12.639 +    vector unsigned char vsdstPermute;
  12.640 +    vector unsigned char valpha;
  12.641 +    vector unsigned char valphamask;
  12.642 +    vector unsigned char vbits;
  12.643 +    vector unsigned short v1;
  12.644 +    vector unsigned short v8;
  12.645 +
  12.646 +    mergePermute = VEC_MERGE_PERMUTE();
  12.647 +    v1 = vec_splat_u16(1);
  12.648 +    v8 = vec_splat_u16(8);
  12.649 +
  12.650 +    /* mask OR'ed into every blended vector below to force the alpha channel on */
  12.651 +    valphamask = VEC_ALPHA_MASK();
  12.652 +
  12.653 +    vsrcPermute = calc_swizzle32(srcfmt, NULL); /* source layout -> calc_swizzle32's default ARGB layout */
  12.654 +    vdstPermute = calc_swizzle32(NULL, dstfmt); /* default ARGB layout -> destination layout */
  12.655 +    vsdstPermute = calc_swizzle32(dstfmt, NULL);        /* destination layout -> default ARGB layout */
  12.656 +
  12.657 +    /* fill every byte of valpha with the surface alpha */
  12.658 +    ((unsigned char *) &valpha)[0] = sA;
  12.659 +    valpha = vec_splat(valpha, 0);
  12.660 +    vbits = (vector unsigned char) vec_splat_s8(-1);    /* all bits set */
  12.661 +
  12.662 +    while (height--) {
  12.663 +        int width = info->dst_w;
  12.664 +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  12.665 +            Uint32 Pixel; \
  12.666 +            unsigned sR, sG, sB, dR, dG, dB; \
  12.667 +            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  12.668 +            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  12.669 +            ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  12.670 +            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  12.671 +            ++srcp; \
  12.672 +            ++dstp; \
  12.673 +            widthvar--; \
  12.674 +        }
  12.675 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);       /* scalar blend until dstp is 16-byte aligned */
  12.676 +        if (width > 0) {
  12.677 +            int extrawidth = (width % 4);       /* leftover pixels, blended after the 4-pixel vector loop */
  12.678 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
  12.679 +            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  12.680 +            width -= extrawidth;
  12.681 +            while (width) {
  12.682 +                vector unsigned char voverflow;
  12.683 +                vector unsigned char vd;
  12.684 +
  12.685 +                /* s = *srcp */
  12.686 +                voverflow = (vector unsigned char) vec_ld(15, srcp);
  12.687 +                vs = vec_perm(vs, voverflow, valigner);
  12.688 +                vs = vec_perm(vs, valpha, vsrcPermute);
  12.689 +
  12.690 +                /* d = *dstp */
  12.691 +                vd = (vector unsigned char) vec_ld(0, dstp);
  12.692 +                vd = vec_perm(vd, vd, vsdstPermute);
  12.693 +
  12.694 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  12.695 +
  12.696 +                /* set the alpha channel to full on */
  12.697 +                vd = vec_or(vd, valphamask);
  12.698 +                vd = vec_perm(vd, vbits, vdstPermute);
  12.699 +
  12.700 +                /* *dstp = res */
  12.701 +                vec_st((vector unsigned int) vd, 0, dstp);
  12.702 +
  12.703 +                srcp += 4;
  12.704 +                dstp += 4;
  12.705 +                width -= 4;
  12.706 +                vs = voverflow;
  12.707 +            }
  12.708 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* scalar blend of the leftover pixels */
  12.709 +        }
  12.710 +#undef ONE_PIXEL_BLEND
  12.711 +
  12.712 +        srcp += srcskip;
  12.713 +        dstp += dstskip;
  12.714 +    }
  12.715 +
  12.716 +}
  12.717 +
  12.718 +
  12.719 +/* fast RGB888->(A)RGB888 blending with one surface-wide alpha (info->a) */
  12.720 +static void
  12.721 +BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  12.722 +{
  12.723 +    unsigned alpha = info->a;
  12.724 +    int height = info->dst_h;
  12.725 +    Uint32 *srcp = (Uint32 *) info->src;
  12.726 +    int srcskip = info->src_skip >> 2;  /* skip / 4: srcp and dstp advance in 32-bit pixels */
  12.727 +    Uint32 *dstp = (Uint32 *) info->dst;
  12.728 +    int dstskip = info->dst_skip >> 2;
  12.729 +    vector unsigned char mergePermute;
  12.730 +    vector unsigned char valpha;
  12.731 +    vector unsigned char valphamask;
  12.732 +    vector unsigned short v1;
  12.733 +    vector unsigned short v8;
  12.734 +
  12.735 +    mergePermute = VEC_MERGE_PERMUTE();
  12.736 +    v1 = vec_splat_u16(1);
  12.737 +    v8 = vec_splat_u16(8);
  12.738 +
  12.739 +    /* mask OR'ed into every blended vector below to force the alpha channel on */
  12.740 +    valphamask = VEC_ALPHA_MASK();
  12.741 +
  12.742 +    /* fill every byte of valpha with the surface alpha */
  12.743 +    ((unsigned char *) &valpha)[0] = alpha;
  12.744 +    valpha = vec_splat(valpha, 0);
  12.745 +
  12.746 +    while (height--) {
  12.747 +        int width = info->dst_w;
  12.748 +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  12.749 +            Uint32 s = *srcp; \
  12.750 +            Uint32 d = *dstp; \
  12.751 +            Uint32 s1 = s & 0xff00ff; \
  12.752 +            Uint32 d1 = d & 0xff00ff; \
  12.753 +            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  12.754 +                 & 0xff00ff; \
  12.755 +            s &= 0xff00; \
  12.756 +            d &= 0xff00; \
  12.757 +            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  12.758 +            *dstp = d1 | d | 0xff000000; \
  12.759 +            ++srcp; \
  12.760 +            ++dstp; \
  12.761 +            widthvar--; \
  12.762 +        }
  12.763 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);       /* scalar blend until dstp is 16-byte aligned */
  12.764 +        if (width > 0) {
  12.765 +            int extrawidth = (width % 4);       /* leftover pixels, blended after the 4-pixel vector loop */
  12.766 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
  12.767 +            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  12.768 +            width -= extrawidth;
  12.769 +            while (width) {
  12.770 +                vector unsigned char voverflow;
  12.771 +                vector unsigned char vd;
  12.772 +
  12.773 +                /* s = *srcp */
  12.774 +                voverflow = (vector unsigned char) vec_ld(15, srcp);
  12.775 +                vs = vec_perm(vs, voverflow, valigner);
  12.776 +
  12.777 +                /* d = *dstp */
  12.778 +                vd = (vector unsigned char) vec_ld(0, dstp);
  12.779 +
  12.780 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  12.781 +
  12.782 +                /* set the alpha channel to full on */
  12.783 +                vd = vec_or(vd, valphamask);
  12.784 +
  12.785 +                /* *dstp = res */
  12.786 +                vec_st((vector unsigned int) vd, 0, dstp);
  12.787 +
  12.788 +                srcp += 4;
  12.789 +                dstp += 4;
  12.790 +                width -= 4;
  12.791 +                vs = voverflow;
  12.792 +            }
  12.793 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* scalar blend of the leftover pixels */
  12.794 +        }
  12.795 +#undef ONE_PIXEL_BLEND
  12.796 +
  12.797 +        srcp += srcskip;
  12.798 +        dstp += dstskip;
  12.799 +    }
  12.800 +}
  12.801 +
  12.802 +#if __MWERKS__
  12.803 +#pragma altivec_model off
  12.804 +#endif
  12.805 +#endif /* SDL_ALTIVEC_BLITTERS */
  12.806 +
  12.807  /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  12.808  static void
  12.809  BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  12.810 @@ -538,6 +1338,79 @@
  12.811      }
  12.812  }
  12.813  
  12.814 +#ifdef __3dNOW__
  12.815 +/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  12.816 +static void
  12.817 +BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  12.818 +{
  12.819 +    int width = info->dst_w;
  12.820 +    int height = info->dst_h;
  12.821 +    Uint32 *srcp = (Uint32 *) info->src;
  12.822 +    int srcskip = info->src_skip >> 2;  /* skip / 4: srcp and dstp advance in 32-bit pixels */
  12.823 +    Uint32 *dstp = (Uint32 *) info->dst;
  12.824 +    int dstskip = info->dst_skip >> 2;
  12.825 +    SDL_PixelFormat *sf = info->src_fmt;
  12.826 +    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  12.827 +    Uint32 amask = sf->Amask;
  12.828 +    Uint32 ashift = sf->Ashift;
  12.829 +    Uint64 multmask;
  12.830 +
  12.831 +    __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  12.832 +
  12.833 +    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  12.834 +    multmask = 0xFFFF;
  12.835 +    multmask <<= (ashift * 2);  /* ashift * 2: each 8-bit channel fills a 16-bit lane once unpacked against zero */
  12.836 +    multmask = ~multmask;
  12.837 +    dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  12.838 +
  12.839 +    while (height--) {
  12.840 +	    /* *INDENT-OFF* */
  12.841 +	    DUFFS_LOOP4({
  12.842 +		Uint32 alpha;
  12.843 +
  12.844 +		_m_prefetch(srcp + 16);
  12.845 +		_m_prefetch(dstp + 16);
  12.846 +
  12.847 +		alpha = *srcp & amask;
  12.848 +		if (alpha == 0) {
  12.849 +			/* do nothing */
  12.850 +		} else if (alpha == amask) {
  12.851 +			/* copy RGB, keep dst alpha */
  12.852 +			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  12.853 +		} else {
  12.854 +			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  12.855 +			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  12.856 +
  12.857 +			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  12.858 +			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  12.859 +
  12.860 +			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  12.861 +			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  12.862 +			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  12.863 +			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  12.864 +			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  12.865 +
  12.866 +			/* blend */		    
  12.867 +			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  12.868 +			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  12.869 +			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  12.870 +			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  12.871 +			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  12.872 +			
  12.873 +			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  12.874 +		}
  12.875 +		++srcp;
  12.876 +		++dstp;
  12.877 +	    }, width);
  12.878 +	    /* *INDENT-ON* */
  12.879 +        srcp += srcskip;
  12.880 +        dstp += dstskip;
  12.881 +    }
  12.882 +    _mm_empty();        /* leave MMX state before returning */
  12.883 +}
  12.884 +
  12.885 +#endif /* __3dNOW__ */
  12.886 +
  12.887  /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  12.888  
  12.889  /* blend a single 16 bit pixel at 50% */
  12.890 @@ -1257,10 +2130,17 @@
  12.891              return BlitNto1PixelAlpha;
  12.892  
  12.893          case 2:
  12.894 -            if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  12.895 -                && sf->Gmask == 0xff00
  12.896 -                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  12.897 -                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  12.898 +#if SDL_ALTIVEC_BLITTERS
  12.899 +            if (sf->BytesPerPixel == 4
  12.900 +                && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  12.901 +                && SDL_HasAltiVec())
  12.902 +                return Blit32to565PixelAlphaAltivec;
  12.903 +            else
  12.904 +#endif
  12.905 +                if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  12.906 +                    && sf->Gmask == 0xff00
  12.907 +                    && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  12.908 +                        || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  12.909                  if (df->Gmask == 0x7e0)
  12.910                      return BlitARGBto565PixelAlpha;
  12.911                  else if (df->Gmask == 0x3e0)
  12.912 @@ -1272,20 +2152,35 @@
  12.913              if (sf->Rmask == df->Rmask
  12.914                  && sf->Gmask == df->Gmask
  12.915                  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  12.916 -#if defined(__MMX__)
  12.917 +#if defined(__MMX__) || defined(__3dNOW__)
  12.918                  if (sf->Rshift % 8 == 0
  12.919                      && sf->Gshift % 8 == 0
  12.920                      && sf->Bshift % 8 == 0
  12.921                      && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  12.922 +#ifdef __3dNOW__
  12.923 +                    if (SDL_Has3DNow())
  12.924 +                        return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  12.925 +#endif
  12.926 +#ifdef __MMX__
  12.927                      if (SDL_HasMMX())
  12.928                          return BlitRGBtoRGBPixelAlphaMMX;
  12.929 +#endif
  12.930                  }
  12.931 -#endif /* __MMX__ */
  12.932 +#endif /* __MMX__ || __3dNOW__ */
  12.933                  if (sf->Amask == 0xff000000) {
  12.934 +#if SDL_ALTIVEC_BLITTERS
  12.935 +                    if (SDL_HasAltiVec())
  12.936 +                        return BlitRGBtoRGBPixelAlphaAltivec;
  12.937 +#endif
  12.938                      return BlitRGBtoRGBPixelAlpha;
  12.939                  }
  12.940              }
  12.941 -            return BlitNtoNPixelAlpha;
  12.942 +#if SDL_ALTIVEC_BLITTERS
  12.943 +            if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
  12.944 +                return Blit32to32PixelAlphaAltivec;
  12.945 +            else
  12.946 +#endif
  12.947 +                return BlitNtoNPixelAlpha;
  12.948  
  12.949          case 3:
  12.950          default:
  12.951 @@ -1331,10 +2226,19 @@
  12.952                          return BlitRGBtoRGBSurfaceAlphaMMX;
  12.953  #endif
  12.954                      if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  12.955 +#if SDL_ALTIVEC_BLITTERS
  12.956 +                        if (SDL_HasAltiVec())
  12.957 +                            return BlitRGBtoRGBSurfaceAlphaAltivec;
  12.958 +#endif
  12.959                          return BlitRGBtoRGBSurfaceAlpha;
  12.960                      }
  12.961                  }
  12.962 -                return BlitNtoNSurfaceAlpha;
  12.963 +#if SDL_ALTIVEC_BLITTERS
  12.964 +                if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
  12.965 +                    return Blit32to32SurfaceAlphaAltivec;
  12.966 +                else
  12.967 +#endif
  12.968 +                    return BlitNtoNSurfaceAlpha;
  12.969  
  12.970              case 3:
  12.971              default:
  12.972 @@ -1348,6 +2252,12 @@
  12.973              if (df->BytesPerPixel == 1)
  12.974                  return BlitNto1SurfaceAlphaKey;
  12.975              else
  12.976 +#if SDL_ALTIVEC_BLITTERS
  12.977 +            if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  12.978 +                    SDL_HasAltiVec())
  12.979 +                return Blit32to32SurfaceAlphaKeyAltivec;
  12.980 +            else
  12.981 +#endif
  12.982                  return BlitNtoNSurfaceAlphaKey;
  12.983          }
  12.984          break;
    13.1 --- a/src/video/SDL_blit_N.c	Mon Feb 21 23:45:48 2011 -0800
    13.2 +++ b/src/video/SDL_blit_N.c	Tue Feb 22 21:44:36 2011 -0800
    13.3 @@ -28,8 +28,840 @@
    13.4  
    13.5  /* Functions to blit from N-bit surfaces to other surfaces */
    13.6  
    13.7 +#if SDL_ALTIVEC_BLITTERS
    13.8 +#define assert(X)
    13.9 +#ifdef __MACOSX__
   13.10 +#include <sys/sysctl.h>
   13.11 +static size_t
   13.12 +GetL3CacheSize(void)   /* value of the "hw.l3cachesize" sysctl, or 0 when the query fails */
   13.13 +{
   13.14 +    const char key[] = "hw.l3cachesize";
   13.15 +    u_int64_t result = 0;
   13.16 +    size_t typeSize = sizeof(result);
   13.17 +
   13.18 +
   13.19 +    int err = sysctlbyname(key, &result, &typeSize, NULL, 0);
   13.20 +    if (0 != err)
   13.21 +        return 0;       /* sysctlbyname() reported an error */
   13.22 +
   13.23 +    return result;      /* 64-bit value converted to size_t */
   13.24 +}
   13.25 +#else
   13.26 +static size_t
   13.27 +GetL3CacheSize(void)   /* variant used when __MACOSX__ is not defined: returns a fixed guess */
   13.28 +{
   13.29 +    /* XXX: Just guess G4 */
   13.30 +    return 2097152;     /* 2 * 1024 * 1024 */
   13.31 +}
   13.32 +#endif /* __MACOSX__ */
   13.33 +
   13.34 +#if (defined(__MACOSX__) && (__GNUC__ < 4))    /* this branch spells vector literals with parentheses, the other one with braces */
   13.35 +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   13.36 +        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   13.37 +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   13.38 +        (vector unsigned short) ( a,b,c,d,e,f,g,h )
   13.39 +#else
   13.40 +#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   13.41 +        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   13.42 +#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   13.43 +        (vector unsigned short) { a,b,c,d,e,f,g,h }
   13.44 +#endif
   13.45 +
   13.46 +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)   /* nonzero unless x is 16-byte aligned */
   13.47 +#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
   13.48 +                               ( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \
   13.49 +                                 0x04+a, 0x04+b, 0x04+c, 0x04+d, \
   13.50 +                                 0x08+a, 0x08+b, 0x08+c, 0x08+d, \
   13.51 +                                 0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d )
   13.52 +
   13.53 +#define MAKE8888(dstfmt, r, g, b, a)  \
   13.54 +    ( ((r<<dstfmt->Rshift)&dstfmt->Rmask) | \
   13.55 +      ((g<<dstfmt->Gshift)&dstfmt->Gmask) | \
   13.56 +      ((b<<dstfmt->Bshift)&dstfmt->Bmask) | \
   13.57 +      ((a<<dstfmt->Ashift)&dstfmt->Amask) )
   13.58 +
   13.59 +/*
   13.60 + * Data Stream Touch...Altivec cache prefetching.
   13.61 + *
   13.62 + *  Don't use this on a G5...however, the speed boost is very significant
   13.63 + *   on a G4.
   13.64 + */
   13.65 +#define DST_CHAN_SRC 1
   13.66 +#define DST_CHAN_DEST 2
   13.67 +
   13.68 +/* macro to set DST control word value: size in bits 24 and up, count from bit 16, stride in the low bits */
   13.69 +#define DST_CTRL(size, count, stride) \
   13.70 +    (((size) << 24) | ((count) << 16) | (stride))
   13.71 +
   13.72 +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   13.73 +    ? vec_lvsl(0, src) \
   13.74 +    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))      /* aligned src: indices 16..31, so vec_perm takes every byte from its second operand */
   13.75 +
   13.76 +/* Calculate the vec_perm control vector for 32->32 swizzling from srcfmt to dstfmt; a NULL format means the default ARGB layout below */
   13.77 +static vector unsigned char
   13.78 +calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   13.79 +{
   13.80 +    /*
   13.81 +     * We have to assume that the bits that aren't used by other
   13.82 +     *  colors are alpha, and that alpha is one complete byte, since some
   13.83 +     *  formats leave alpha with a zero mask, but we should still swizzle the bits.
   13.84 +     */
   13.85 +    /* ARGB; substituted for a NULL srcfmt or dstfmt */
   13.86 +    static const struct SDL_PixelFormat default_pixel_format = {
   13.87 +        NULL, 32, 4,
   13.88 +        0, 0, 0, 0,
   13.89 +        16, 8, 0, 24,
   13.90 +        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000
   13.91 +    };
   13.92 +    if (!srcfmt) {
   13.93 +        srcfmt = &default_pixel_format;
   13.94 +    }
   13.95 +    if (!dstfmt) {
   13.96 +        dstfmt = &default_pixel_format;
   13.97 +    }
   13.98 +    const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,  /* byte offset of each of the four pixels */
   13.99 +                                                       0x04, 0x04, 0x04, 0x04,
  13.100 +                                                       0x08, 0x08, 0x08, 0x08,
  13.101 +                                                       0x0C, 0x0C, 0x0C,
  13.102 +                                                       0x0C);
  13.103 +    vector unsigned char vswiz;
  13.104 +    vector unsigned int srcvec;
  13.105 +#define RESHIFT(X) (3 - ((X) >> 3))     /* channel bit shift -> byte index counted from the most significant byte */
  13.106 +    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
  13.107 +    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
  13.108 +    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
  13.109 +    Uint32 amask;
  13.110 +    /* Use zero for alpha if either surface doesn't have alpha: index 0x10 picks byte 0 of vec_perm's second operand */
  13.111 +    if (dstfmt->Amask) {
  13.112 +        amask =
  13.113 +            ((srcfmt->Amask) ? RESHIFT(srcfmt->
  13.114 +                                       Ashift) : 0x10) << (dstfmt->Ashift);
  13.115 +    } else {
  13.116 +        amask =
  13.117 +            0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
  13.118 +                          0xFFFFFFFF);
  13.119 +    }
  13.120 +#undef RESHIFT
  13.121 +    ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);   /* only element 0 is written; the splat below copies it to all four */
  13.122 +    vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
  13.123 +    return (vswiz);
  13.124 +}
  13.125 +
  13.126 +static void Blit_RGB888_RGB565(SDL_BlitInfo * info);
  13.127 +static void
  13.128 +Blit_RGB888_RGB565Altivec(SDL_BlitInfo * info)        /* 32-bit RGB (byte order handled by calc_swizzle32) -> RGB565, 8 pixels per vector iteration */
  13.129 +{
  13.130 +    int height = info->dst_h;
  13.131 +    Uint8 *src = (Uint8 *) info->src;
  13.132 +    int srcskip = info->src_skip;
  13.133 +    Uint8 *dst = (Uint8 *) info->dst;
  13.134 +    int dstskip = info->dst_skip;
  13.135 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  13.136 +    vector unsigned char valpha = vec_splat_u8(0);
  13.137 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);     /* source layout -> calc_swizzle32's default ARGB layout */
  13.138 +    vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,   /* low byte of each 16-bit lane = byte 2 (green) of one pixel */
  13.139 +                                                    0x00, 0x0a, 0x00, 0x0e,
  13.140 +                                                    0x00, 0x12, 0x00, 0x16,
  13.141 +                                                    0x00, 0x1a, 0x00, 0x1e);
  13.142 +    vector unsigned short v1 = vec_splat_u16(1);
  13.143 +    vector unsigned short v3 = vec_splat_u16(3);
  13.144 +    vector unsigned short v1f =       /* blue only: bits 4..0 of the packed 1555 pixel */
  13.145 +        VECUINT16_LITERAL(0x001f, 0x001f, 0x001f, 0x001f,
  13.146 +                          0x001f, 0x001f, 0x001f, 0x001f);
  13.147 +    vector unsigned short vfc =
  13.148 +        VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
  13.149 +                          0x00fc, 0x00fc, 0x00fc, 0x00fc);
  13.150 +    vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);   /* NOTE(review): -7 splats 0xF9, so the mask is 0xF900 after the shift; the extra bit looks harmless (the same green bit arrives via vgpixel) - confirm */
  13.151 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
  13.152 +
  13.153 +    while (height--) {
  13.154 +        vector unsigned char valigner;
  13.155 +        vector unsigned char voverflow;
  13.156 +        vector unsigned char vsrc;
  13.157 +
  13.158 +        int width = info->dst_w;
  13.159 +        int extrawidth;
  13.160 +
  13.161 +        /* do scalar until we can align... */
  13.162 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  13.163 +        while (condition) { \
  13.164 +            Uint32 Pixel; \
  13.165 +            unsigned sR, sG, sB, sA; \
  13.166 +            DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, Pixel, \
  13.167 +                          sR, sG, sB, sA); \
  13.168 +            *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
  13.169 +                                ((sG << 3) & 0x000007E0) | \
  13.170 +                                ((sB >> 3) & 0x0000001F)); \
  13.171 +            dst += 2; \
  13.172 +            src += 4; \
  13.173 +            widthvar--; \
  13.174 +        }
  13.175 +
  13.176 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
  13.177 +
  13.178 +        /* After all that work, here's the vector part! */
  13.179 +        extrawidth = (width % 8);       /* trailing unaligned stores */
  13.180 +        width -= extrawidth;
  13.181 +        vsrc = vec_ld(0, src);
  13.182 +        valigner = VEC_ALIGNER(src);
  13.183 +
  13.184 +        while (width) {
  13.185 +            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
  13.186 +            vector unsigned int vsrc1, vsrc2;
  13.187 +            vector unsigned char vdst;
  13.188 +
  13.189 +            voverflow = vec_ld(15, src);
  13.190 +            vsrc = vec_perm(vsrc, voverflow, valigner);
  13.191 +            vsrc1 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
  13.192 +            src += 16;
  13.193 +            vsrc = voverflow;
  13.194 +            voverflow = vec_ld(15, src);
  13.195 +            vsrc = vec_perm(vsrc, voverflow, valigner);
  13.196 +            vsrc2 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
  13.197 +            /* 1555 */
  13.198 +            vpixel = (vector unsigned short) vec_packpx(vsrc1, vsrc2);
  13.199 +            vgpixel = (vector unsigned short) vec_perm(vsrc1, vsrc2, vgmerge);
  13.200 +            vgpixel = vec_and(vgpixel, vfc);
  13.201 +            vgpixel = vec_sl(vgpixel, v3);    /* six green bits now sit in bits 10..5 */
  13.202 +            vrpixel = vec_sl(vpixel, v1);
  13.203 +            vrpixel = vec_and(vrpixel, vf800);
  13.204 +            vbpixel = vec_and(vpixel, v1f);   /* stop at bit 4: bit 5 of the 1555 pixel is a green bit that is not the 565 green LSB, so keeping it would diverge from the scalar path */
  13.205 +            vdst =
  13.206 +                vec_or((vector unsigned char) vrpixel,
  13.207 +                       (vector unsigned char) vgpixel);
  13.208 +            /* 565 */
  13.209 +            vdst = vec_or(vdst, (vector unsigned char) vbpixel);
  13.210 +            vec_st(vdst, 0, dst);
  13.211 +
  13.212 +            width -= 8;
  13.213 +            src += 16;
  13.214 +            dst += 16;
  13.215 +            vsrc = voverflow;
  13.216 +        }
  13.217 +
  13.218 +        assert(width == 0);
  13.219 +
  13.220 +        /* finish the leftover (width % 8) pixels with the scalar code */
  13.221 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
  13.222 +#undef ONE_PIXEL_BLEND
  13.223 +
  13.224 +        src += srcskip;         /* move to next row, accounting for pitch. */
  13.225 +        dst += dstskip;
  13.226 +    }
  13.227 +
  13.228 +
  13.229 +}
  13.230 +
  13.231 +static void
  13.232 +Blit_RGB565_32Altivec(SDL_BlitInfo * info)     /* RGB565 -> 32-bit blit: scalar head/tail per row, AltiVec for each run of 8 pixels */
  13.233 +{
  13.234 +    int height = info->dst_h;
  13.235 +    Uint8 *src = (Uint8 *) info->src;
  13.236 +    int srcskip = info->src_skip;
  13.237 +    Uint8 *dst = (Uint8 *) info->dst;
  13.238 +    int dstskip = info->dst_skip;
  13.239 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  13.240 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  13.241 +    unsigned alpha;
  13.242 +    vector unsigned char valpha;
  13.243 +    vector unsigned char vpermute;
  13.244 +    vector unsigned short vf800;
  13.245 +    vector unsigned int v8 = vec_splat_u32(8);
  13.246 +    vector unsigned int v16 = vec_add(v8, v8);
  13.247 +    vector unsigned short v2 = vec_splat_u16(2);
  13.248 +    vector unsigned short v3 = vec_splat_u16(3);
  13.249 +    /* 
  13.250 +       Selector for vec_perm(vR, valpha): 0x10 - 0x1f is the alpha
  13.251 +       0x00 - 0x0e evens are the red
  13.252 +       0x01 - 0x0f odds are zero
  13.253 +     */
  13.254 +    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
  13.255 +                                                       0x10, 0x02, 0x01, 0x01,
  13.256 +                                                       0x10, 0x04, 0x01, 0x01,
  13.257 +                                                       0x10, 0x06, 0x01,
  13.258 +                                                       0x01);
  13.259 +    vector unsigned char vredalpha2 =
  13.260 +        (vector unsigned
  13.261 +         char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
  13.262 +        );      /* adds 0x00080000 to each word: red selectors advance by 8 bytes, i.e. pixels 4-7 (big-endian byte order) */
  13.263 +    /*
  13.264 +       Selector for vec_perm(vdst, vB): 0x00 - 0x0f is ARxx ARxx ARxx ARxx
  13.265 +       0x11 - 0x17 odds are blue
  13.266 +     */
  13.267 +    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
  13.268 +                                                   0x04, 0x05, 0x06, 0x13,
  13.269 +                                                   0x08, 0x09, 0x0a, 0x15,
  13.270 +                                                   0x0c, 0x0d, 0x0e, 0x17);
  13.271 +    vector unsigned char vblue2 =
  13.272 +        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
  13.273 +        );      /* adds 8 to each word: blue selectors become 0x19 - 0x1f (pixels 4-7) */
  13.274 +    /*
  13.275 +       Selector for vec_perm(vdst, vG): 0x00 - 0x0f is ARxB ARxB ARxB ARxB
  13.276 +       0x10 - 0x16 evens are green
  13.277 +     */
  13.278 +    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
  13.279 +                                                    0x04, 0x05, 0x12, 0x07,
  13.280 +                                                    0x08, 0x09, 0x14, 0x0b,
  13.281 +                                                    0x0c, 0x0d, 0x16, 0x0f);
  13.282 +    vector unsigned char vgreen2 =
  13.283 +        (vector unsigned
  13.284 +         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
  13.285 +        );      /* adds 0x00000800 to each word: green selectors become 0x18 - 0x1e (pixels 4-7) */
  13.286 +
  13.287 +
  13.288 +    assert(srcfmt->BytesPerPixel == 2);
  13.289 +    assert(dstfmt->BytesPerPixel == 4);
  13.290 +
  13.291 +    vf800 = (vector unsigned short) vec_splat_u8(-7);  /* NOTE(review): -7 splats 0xF9 bytes, so the mask below is 0xF900, not 0xF800 (-8 would give 0xF800) - confirm intent */
  13.292 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
  13.293 +
  13.294 +    if (dstfmt->Amask && info->a) {     /* constant alpha is used only when the destination has an alpha mask and info->a is non-zero */
  13.295 +        ((unsigned char *) &valpha)[0] = alpha = info->a;
  13.296 +        valpha = vec_splat(valpha, 0);
  13.297 +    } else {
  13.298 +        alpha = 0;
  13.299 +        valpha = vec_splat_u8(0);
  13.300 +    }
  13.301 +
  13.302 +    vpermute = calc_swizzle32(NULL, dstfmt);    /* NOTE(review): NULL source format presumably selects calc_swizzle32's default layout - confirm in its definition */
  13.303 +    while (height--) {
  13.304 +        vector unsigned char valigner;
  13.305 +        vector unsigned char voverflow;
  13.306 +        vector unsigned char vsrc;
  13.307 +
  13.308 +        int width = info->dst_w;
  13.309 +        int extrawidth;
  13.310 +
  13.311 +        /* scalar one-pixel conversion: run first while UNALIGNED_PTR(dst) holds, then again for the row tail */
  13.312 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  13.313 +        while (condition) { \
  13.314 +            unsigned sR, sG, sB; \
  13.315 +            unsigned short Pixel = *((unsigned short *)src); \
  13.316 +            sR = (Pixel >> 8) & 0xf8; \
  13.317 +            sG = (Pixel >> 3) & 0xfc; \
  13.318 +            sB = (Pixel << 3) & 0xf8; \
  13.319 +            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
  13.320 +            src += 2; \
  13.321 +            dst += 4; \
  13.322 +            widthvar--; \
  13.323 +        }
  13.324 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
  13.325 +
  13.326 +        /* After all that work, here's the vector part! */
  13.327 +        extrawidth = (width % 8);       /* pixels left over after the 8-pixel vector iterations */
  13.328 +        width -= extrawidth;
  13.329 +        vsrc = vec_ld(0, src);
  13.330 +        valigner = VEC_ALIGNER(src);
  13.331 +
  13.332 +        while (width) {
  13.333 +            vector unsigned short vR, vG, vB;
  13.334 +            vector unsigned char vdst1, vdst2;
  13.335 +
  13.336 +            voverflow = vec_ld(15, src);
  13.337 +            vsrc = vec_perm(vsrc, voverflow, valigner); /* combine the two loads into the 16 source bytes (8 pixels) at src */
  13.338 +
  13.339 +            vR = vec_and((vector unsigned short) vsrc, vf800);
  13.340 +            vB = vec_sl((vector unsigned short) vsrc, v3);
  13.341 +            vG = vec_sl(vB, v2);        /* NOTE(review): the high byte picked by vgreen* is 6 green bits + the top 2 blue bits; the scalar path masks with 0xfc - confirm the low bits are acceptable */
  13.342 +
  13.343 +            vdst1 =
  13.344 +                (vector unsigned char) vec_perm((vector unsigned char) vR,
  13.345 +                                                valpha, vredalpha1);
  13.346 +            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
  13.347 +            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
  13.348 +            vdst1 = vec_perm(vdst1, valpha, vpermute);  /* reorder the assembled bytes with the calc_swizzle32 permute */
  13.349 +            vec_st(vdst1, 0, dst);      /* pixels 0-3 */
  13.350 +
  13.351 +            vdst2 =
  13.352 +                (vector unsigned char) vec_perm((vector unsigned char) vR,
  13.353 +                                                valpha, vredalpha2);
  13.354 +            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
  13.355 +            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
  13.356 +            vdst2 = vec_perm(vdst2, valpha, vpermute);
  13.357 +            vec_st(vdst2, 16, dst);     /* pixels 4-7 */
  13.358 +
  13.359 +            width -= 8;
  13.360 +            dst += 32;
  13.361 +            src += 16;
  13.362 +            vsrc = voverflow;   /* reuse the second load as the first half of the next iteration */
  13.363 +        }
  13.364 +
  13.365 +        assert(width == 0);
  13.366 +
  13.367 +
  13.368 +        /* finish the row: scalar conversion of the pixels that did not fill a vector */
  13.369 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
  13.370 +#undef ONE_PIXEL_BLEND
  13.371 +
  13.372 +        src += srcskip;         /* move to next row, accounting for pitch. */
  13.373 +        dst += dstskip;
  13.374 +    }
  13.375 +
  13.376 +}
  13.377 +
  13.378 +
  13.379 +static void
  13.380 +Blit_RGB555_32Altivec(SDL_BlitInfo * info)     /* RGB555 -> 32-bit blit: scalar head/tail per row, AltiVec for each run of 8 pixels */
  13.381 +{
  13.382 +    int height = info->dst_h;
  13.383 +    Uint8 *src = (Uint8 *) info->src;
  13.384 +    int srcskip = info->src_skip;
  13.385 +    Uint8 *dst = (Uint8 *) info->dst;
  13.386 +    int dstskip = info->dst_skip;
  13.387 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  13.388 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  13.389 +    unsigned alpha;
  13.390 +    vector unsigned char valpha;
  13.391 +    vector unsigned char vpermute;
  13.392 +    vector unsigned short vf800;
  13.393 +    vector unsigned int v8 = vec_splat_u32(8);
  13.394 +    vector unsigned int v16 = vec_add(v8, v8);
  13.395 +    vector unsigned short v1 = vec_splat_u16(1);
  13.396 +    vector unsigned short v3 = vec_splat_u16(3);
  13.397 +    /* 
  13.398 +       Selector for vec_perm(vR, valpha): 0x10 - 0x1f is the alpha
  13.399 +       0x00 - 0x0e evens are the red
  13.400 +       0x01 - 0x0f odds are zero
  13.401 +     */
  13.402 +    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
  13.403 +                                                       0x10, 0x02, 0x01, 0x01,
  13.404 +                                                       0x10, 0x04, 0x01, 0x01,
  13.405 +                                                       0x10, 0x06, 0x01,
  13.406 +                                                       0x01);
  13.407 +    vector unsigned char vredalpha2 =
  13.408 +        (vector unsigned
  13.409 +         char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
  13.410 +        );      /* adds 0x00080000 to each word: red selectors advance by 8 bytes, i.e. pixels 4-7 (big-endian byte order) */
  13.411 +    /*
  13.412 +       Selector for vec_perm(vdst, vB): 0x00 - 0x0f is ARxx ARxx ARxx ARxx
  13.413 +       0x11 - 0x17 odds are blue
  13.414 +     */
  13.415 +    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
  13.416 +                                                   0x04, 0x05, 0x06, 0x13,
  13.417 +                                                   0x08, 0x09, 0x0a, 0x15,
  13.418 +                                                   0x0c, 0x0d, 0x0e, 0x17);
  13.419 +    vector unsigned char vblue2 =
  13.420 +        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
  13.421 +        );      /* adds 8 to each word: blue selectors become 0x19 - 0x1f (pixels 4-7) */
  13.422 +    /*
  13.423 +       Selector for vec_perm(vdst, vG): 0x00 - 0x0f is ARxB ARxB ARxB ARxB
  13.424 +       0x10 - 0x16 evens are green
  13.425 +     */
  13.426 +    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
  13.427 +                                                    0x04, 0x05, 0x12, 0x07,
  13.428 +                                                    0x08, 0x09, 0x14, 0x0b,
  13.429 +                                                    0x0c, 0x0d, 0x16, 0x0f);
  13.430 +    vector unsigned char vgreen2 =
  13.431 +        (vector unsigned
  13.432 +         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
  13.433 +        );      /* adds 0x00000800 to each word: green selectors become 0x18 - 0x1e (pixels 4-7) */
  13.434 +
  13.435 +
  13.436 +    assert(srcfmt->BytesPerPixel == 2);
  13.437 +    assert(dstfmt->BytesPerPixel == 4);
  13.438 +
  13.439 +    vf800 = (vector unsigned short) vec_splat_u8(-7);  /* NOTE(review): -7 splats 0xF9 bytes, so the mask below is 0xF900, not 0xF800 (-8 would give 0xF800) - confirm intent */
  13.440 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
  13.441 +
  13.442 +    if (dstfmt->Amask && info->a) {     /* constant alpha is used only when the destination has an alpha mask and info->a is non-zero */
  13.443 +        ((unsigned char *) &valpha)[0] = alpha = info->a;
  13.444 +        valpha = vec_splat(valpha, 0);
  13.445 +    } else {
  13.446 +        alpha = 0;
  13.447 +        valpha = vec_splat_u8(0);
  13.448 +    }
  13.449 +
  13.450 +    vpermute = calc_swizzle32(NULL, dstfmt);    /* NOTE(review): NULL source format presumably selects calc_swizzle32's default layout - confirm in its definition */
  13.451 +    while (height--) {
  13.452 +        vector unsigned char valigner;
  13.453 +        vector unsigned char voverflow;
  13.454 +        vector unsigned char vsrc;
  13.455 +
  13.456 +        int width = info->dst_w;
  13.457 +        int extrawidth;
  13.458 +
  13.459 +        /* scalar one-pixel conversion: run first while UNALIGNED_PTR(dst) holds, then again for the row tail */
  13.460 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  13.461 +        while (condition) { \
  13.462 +            unsigned sR, sG, sB; \
  13.463 +            unsigned short Pixel = *((unsigned short *)src); \
  13.464 +            sR = (Pixel >> 7) & 0xf8; \
  13.465 +            sG = (Pixel >> 2) & 0xf8; \
  13.466 +            sB = (Pixel << 3) & 0xf8; \
  13.467 +            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
  13.468 +            src += 2; \
  13.469 +            dst += 4; \
  13.470 +            widthvar--; \
  13.471 +        }
  13.472 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
  13.473 +
  13.474 +        /* After all that work, here's the vector part! */
  13.475 +        extrawidth = (width % 8);       /* pixels left over after the 8-pixel vector iterations */
  13.476 +        width -= extrawidth;
  13.477 +        vsrc = vec_ld(0, src);
  13.478 +        valigner = VEC_ALIGNER(src);
  13.479 +
  13.480 +        while (width) {
  13.481 +            vector unsigned short vR, vG, vB;
  13.482 +            vector unsigned char vdst1, vdst2;
  13.483 +
  13.484 +            voverflow = vec_ld(15, src);
  13.485 +            vsrc = vec_perm(vsrc, voverflow, valigner); /* combine the two loads into the 16 source bytes (8 pixels) at src */
  13.486 +
  13.487 +            vR = vec_and(vec_sl((vector unsigned short) vsrc, v1), vf800);      /* shift left by 1 so the 5 red bits reach the top of the halfword */
  13.488 +            vB = vec_sl((vector unsigned short) vsrc, v3);
  13.489 +            vG = vec_sl(vB, v3);        /* NOTE(review): the high byte picked by vgreen* is 5 green bits + the top 3 blue bits; the scalar path masks with 0xf8 - confirm the low bits are acceptable */
  13.490 +
  13.491 +            vdst1 =
  13.492 +                (vector unsigned char) vec_perm((vector unsigned char) vR,
  13.493 +                                                valpha, vredalpha1);
  13.494 +            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
  13.495 +            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
  13.496 +            vdst1 = vec_perm(vdst1, valpha, vpermute);  /* reorder the assembled bytes with the calc_swizzle32 permute */
  13.497 +            vec_st(vdst1, 0, dst);      /* pixels 0-3 */
  13.498 +
  13.499 +            vdst2 =
  13.500 +                (vector unsigned char) vec_perm((vector unsigned char) vR,
  13.501 +                                                valpha, vredalpha2);
  13.502 +            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
  13.503 +            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
  13.504 +            vdst2 = vec_perm(vdst2, valpha, vpermute);
  13.505 +            vec_st(vdst2, 16, dst);     /* pixels 4-7 */
  13.506 +
  13.507 +            width -= 8;
  13.508 +            dst += 32;
  13.509 +            src += 16;
  13.510 +            vsrc = voverflow;   /* reuse the second load as the first half of the next iteration */
  13.511 +        }
  13.512 +
  13.513 +        assert(width == 0);
  13.514 +
  13.515 +
  13.516 +        /* finish the row: scalar conversion of the pixels that did not fill a vector */
  13.517 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
  13.518 +#undef ONE_PIXEL_BLEND
  13.519 +
  13.520 +        src += srcskip;         /* move to next row, accounting for pitch. */
  13.521 +        dst += dstskip;
  13.522 +    }
  13.523 +
  13.524 +}
  13.525 +
  13.526 +static void BlitNtoNKey(SDL_BlitInfo * info);
  13.527 +static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo * info);
  13.528 +static void
  13.529 +Blit32to32KeyAltivec(SDL_BlitInfo * info)      /* colorkeyed 32-bit -> 32-bit blit: pixels whose RGB bits equal the colorkey leave the destination untouched */
  13.530 +{
  13.531 +    int height = info->dst_h;
  13.532 +    Uint32 *srcp = (Uint32 *) info->src;
  13.533 +    int srcskip = info->src_skip / 4;
  13.534 +    Uint32 *dstp = (Uint32 *) info->dst;
  13.535 +    int dstskip = info->dst_skip / 4;
  13.536 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  13.537 +    int srcbpp = srcfmt->BytesPerPixel;
  13.538 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  13.539 +    int dstbpp = dstfmt->BytesPerPixel;
  13.540 +    int copy_alpha = (srcfmt->Amask && dstfmt->Amask);  /* both formats carry alpha: pass the source alpha through */
  13.541 +    unsigned alpha = dstfmt->Amask ? info->a : 0;
  13.542 +    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
  13.543 +    Uint32 ckey = info->colorkey;
  13.544 +    vector unsigned int valpha;
  13.545 +    vector unsigned char vpermute;
  13.546 +    vector unsigned char vzero;
  13.547 +    vector unsigned int vckey;
  13.548 +    vector unsigned int vrgbmask;
  13.549 +    vpermute = calc_swizzle32(srcfmt, dstfmt);
  13.550 +    if (info->dst_w < 16) {     /* narrow rows: hand the whole blit to the generic scalar colorkey blitters */
  13.551 +        if (copy_alpha) {
  13.552 +            BlitNtoNKeyCopyAlpha(info);
  13.553 +        } else {
  13.554 +            BlitNtoNKey(info);
  13.555 +        }
  13.556 +        return;
  13.557 +    }
  13.558 +    vzero = vec_splat_u8(0);
  13.559 +    if (alpha) {
  13.560 +        ((unsigned char *) &valpha)[0] = (unsigned char) alpha;
  13.561 +        valpha =
  13.562 +            (vector unsigned int) vec_splat((vector unsigned char) valpha, 0);
  13.563 +    } else {
  13.564 +        valpha = (vector unsigned int) vzero;
  13.565 +    }
  13.566 +    ckey &= rgbmask;    /* the key is compared on RGB bits only, so pixels must be masked the same way */
  13.567 +    ((unsigned int *) (char *) &vckey)[0] = ckey;
  13.568 +    vckey = vec_splat(vckey, 0);
  13.569 +    ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  13.570 +    vrgbmask = vec_splat(vrgbmask, 0);
  13.571 +
  13.572 +    while (height--) {
  13.573 +#define ONE_PIXEL_BLEND(condition, widthvar) \
  13.574 +        if (copy_alpha) { \
  13.575 +            while (condition) { \
  13.576 +                Uint32 Pixel; \
  13.577 +                unsigned sR, sG, sB, sA; \
  13.578 +                DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, Pixel, \
  13.579 +                          sR, sG, sB, sA); \
  13.580 +                if ( (Pixel & rgbmask) != ckey ) { \
  13.581 +                      ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
  13.582 +                            sR, sG, sB, sA); \
  13.583 +                } \
  13.584 +                dstp = (Uint32 *) (((Uint8 *) dstp) + dstbpp); \
  13.585 +                srcp = (Uint32 *) (((Uint8 *) srcp) + srcbpp); \
  13.586 +                widthvar--; \
  13.587 +            } \
  13.588 +        } else { \
  13.589 +            while (condition) { \
  13.590 +                Uint32 Pixel; \
  13.591 +                unsigned sR, sG, sB; \
  13.592 +                RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, Pixel); \
  13.593 +                if ( (Pixel & rgbmask) != ckey ) { \
  13.594 +                    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  13.595 +                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
  13.596 +                              sR, sG, sB, alpha); \
  13.597 +                } \
  13.598 +                dstp = (Uint32 *) (((Uint8 *)dstp) + dstbpp); \
  13.599 +                srcp = (Uint32 *) (((Uint8 *)srcp) + srcbpp); \
  13.600 +                widthvar--; \
  13.601 +            } \
  13.602 +        }
  13.603 +        int width = info->dst_w;
  13.604 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);      /* scalar head, while UNALIGNED_PTR(dstp) holds */
  13.605 +        assert(width > 0);
  13.606 +        if (width > 0) {
  13.607 +            int extrawidth = (width % 4);       /* pixels left over after the 4-pixel vector iterations */
  13.608 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
  13.609 +            vector unsigned int vs = vec_ld(0, srcp);
  13.610 +            width -= extrawidth;
  13.611 +            assert(width >= 4);
  13.612 +            while (width) {
  13.613 +                vector unsigned char vsel;
  13.614 +                vector unsigned int vd;
  13.615 +                vector unsigned int voverflow = vec_ld(15, srcp);
  13.616 +                /* load the source vec */
  13.617 +                vs = vec_perm(vs, voverflow, valigner);
  13.618 +                /* vsel is set for items whose RGB bits match the key: mask first, then compare the MASKED value */
  13.619 +                vsel = (vector unsigned char) vec_and(vs, vrgbmask);
  13.620 +                vsel = (vector unsigned char) vec_cmpeq((vector unsigned int) vsel, vckey);
  13.621 +                /* permute the src vec to the dest format */
  13.622 +                vs = vec_perm(vs, valpha, vpermute);
  13.623 +                /* load the destination vec */
  13.624 +                vd = vec_ld(0, dstp);
  13.625 +                /* keep the destination where vsel is set, take the converted source elsewhere */
  13.626 +                vd = (vector unsigned int) vec_sel((vector unsigned char) vs,
  13.627 +                                                   (vector unsigned char) vd,
  13.628 +                                                   vsel);
  13.629 +
  13.630 +                vec_st(vd, 0, dstp);
  13.631 +                srcp += 4;
  13.632 +                width -= 4;
  13.633 +                dstp += 4;
  13.634 +                vs = voverflow; /* reuse the second load as the first half of the next iteration */
  13.635 +            }
  13.636 +            ONE_PIXEL_BLEND((extrawidth), extrawidth); /* scalar tail of the row */
  13.637 +#undef ONE_PIXEL_BLEND
  13.638 +            srcp += srcskip;
  13.639 +            dstp += dstskip;
  13.640 +        }
  13.641 +    }
  13.642 +}
  13.643 +
  13.644 +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
  13.645 +/* Use this on a G5: same conversion loop as ConvertAltivec32to32_prefetch, without the vec_dstt/vec_dstst prefetch hints. */
  13.646 +static void
  13.647 +ConvertAltivec32to32_noprefetch(SDL_BlitInfo * info)
  13.648 +{
  13.649 +    int height = info->dst_h;
  13.650 +    Uint32 *src = (Uint32 *) info->src;
  13.651 +    int srcskip = info->src_skip / 4;   /* skips are divided by 4 because src/dst advance in Uint32 units */
  13.652 +    Uint32 *dst = (Uint32 *) info->dst;
  13.653 +    int dstskip = info->dst_skip / 4;
  13.654 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  13.655 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  13.656 +    vector unsigned int vzero = vec_splat_u32(0);       /* second vec_perm operand; replaced below by the splatted alpha when needed */
  13.657 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
  13.658 +    if (dstfmt->Amask && !srcfmt->Amask) {      /* destination has alpha but the source does not */
  13.659 +        if (info->a) {
  13.660 +            vector unsigned char valpha;
  13.661 +            ((unsigned char *) &valpha)[0] = info->a;
  13.662 +            vzero = (vector unsigned int) vec_splat(valpha, 0); /* every byte of vzero now holds info->a */
  13.663 +        }
  13.664 +    }
  13.665 +
  13.666 +    assert(srcfmt->BytesPerPixel == 4);
  13.667 +    assert(dstfmt->BytesPerPixel == 4);
  13.668 +
  13.669 +    while (height--) {
  13.670 +        vector unsigned char valigner;
  13.671 +        vector unsigned int vbits;
  13.672 +        vector unsigned int voverflow;
  13.673 +        Uint32 bits;
  13.674 +        Uint8 r, g, b, a;
  13.675 +
  13.676 +        int width = info->dst_w;
  13.677 +        int extrawidth;
  13.678 +
  13.679 +        /* do scalar until we can align... */
  13.680 +        while ((UNALIGNED_PTR(dst)) && (width)) {
  13.681 +            bits = *(src++);
  13.682 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
  13.683 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
  13.684 +            width--;
  13.685 +        }
  13.686 +
  13.687 +        /* After all that work, here's the vector part! */
  13.688 +        extrawidth = (width % 4);       /* pixels left over after the 4-pixel vector iterations */
  13.689 +        width -= extrawidth;
  13.690 +        valigner = VEC_ALIGNER(src);
  13.691 +        vbits = vec_ld(0, src);
  13.692 +
  13.693 +        while (width) {
  13.694 +            voverflow = vec_ld(15, src);
  13.695 +            src += 4;
  13.696 +            width -= 4;
  13.697 +            vbits = vec_perm(vbits, voverflow, valigner);       /* src is ready. */
  13.698 +            vbits = vec_perm(vbits, vzero, vpermute);   /* swizzle it. */
  13.699 +            vec_st(vbits, 0, dst);      /* store it back out. */
  13.700 +            dst += 4;
  13.701 +            vbits = voverflow;  /* reuse the second load as the first half of the next iteration */
  13.702 +        }
  13.703 +
  13.704 +        assert(width == 0);
  13.705 +
  13.706 +        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
  13.707 +        while (extrawidth) {
  13.708 +            bits = *(src++);    /* max 3 pixels (extrawidth is width % 4), don't bother with prefetch. */
  13.709 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
  13.710 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
  13.711 +            extrawidth--;
  13.712 +        }
  13.713 +
  13.714 +        src += srcskip;
  13.715 +        dst += dstskip;
  13.716 +    }
  13.717 +
  13.718 +}
  13.719 +
  13.720 +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
  13.721 +/* Use this on a G4: issues vec_dstt/vec_dstst prefetch hints ahead of the source and destination pointers. */
  13.722 +static void
  13.723 +ConvertAltivec32to32_prefetch(SDL_BlitInfo * info)
  13.724 +{
  13.725 +    const int scalar_dst_lead = sizeof(Uint32) * 4;    /* NOTE(review): computed with sizeof (a byte count) but added to Uint32 pointers below, so the lead is in Uint32 elements - confirm intended */
  13.726 +    const int vector_dst_lead = sizeof(Uint32) * 16;
  13.727 +
  13.728 +    int height = info->dst_h;
  13.729 +    Uint32 *src = (Uint32 *) info->src;
  13.730 +    int srcskip = info->src_skip / 4;   /* skips are divided by 4 because src/dst advance in Uint32 units */
  13.731 +    Uint32 *dst = (Uint32 *) info->dst;
  13.732 +    int dstskip = info->dst_skip / 4;
  13.733 +    SDL_PixelFormat *srcfmt = info->src_fmt;
  13.734 +    SDL_PixelFormat *dstfmt = info->dst_fmt;
  13.735 +    vector unsigned int vzero = vec_splat_u32(0);       /* second vec_perm operand; replaced below by the splatted alpha when needed */
  13.736 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
  13.737 +    if (dstfmt->Amask && !srcfmt->Amask) {      /* destination has alpha but the source does not */
  13.738 +        if (info->a) {
  13.739 +            vector unsigned char valpha;
  13.740 +            ((unsigned char *) &valpha)[0] = info->a;
  13.741 +            vzero = (vector unsigned int) vec_splat(valpha, 0); /* every byte of vzero now holds info->a */
  13.742 +        }
  13.743 +    }
  13.744 +
  13.745 +    assert(srcfmt->BytesPerPixel == 4);
  13.746 +    assert(dstfmt->BytesPerPixel == 4);
  13.747 +
  13.748 +    while (height--) {
  13.749 +        vector unsigned char valigner;
  13.750 +        vector unsigned int vbits;
  13.751 +        vector unsigned int voverflow;
  13.752 +        Uint32 bits;
  13.753 +        Uint8 r, g, b, a;
  13.754 +
  13.755 +        int width = info->dst_w;
  13.756 +        int extrawidth;
  13.757 +
  13.758 +        /* do scalar until we can align... */
  13.759 +        while ((UNALIGNED_PTR(dst)) && (width)) {
  13.760 +            vec_dstt(src + scalar_dst_lead, DST_CTRL(2, 32, 1024),
  13.761 +                     DST_CHAN_SRC);
  13.762 +            vec_dstst(dst + scalar_dst_lead, DST_CTRL(2, 32, 1024),
  13.763 +                      DST_CHAN_DEST);
  13.764 +            bits = *(src++);
  13.765 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
  13.766 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
  13.767 +            width--;
  13.768 +        }
  13.769 +
  13.770 +        /* After all that work, here's the vector part! */
  13.771 +        extrawidth = (width % 4);       /* pixels left over after the 4-pixel vector iterations */
  13.772 +        width -= extrawidth;
  13.773 +        valigner = VEC_ALIGNER(src);
  13.774 +        vbits = vec_ld(0, src);
  13.775 +
  13.776 +        while (width) {
  13.777 +            vec_dstt(src + vector_dst_lead, DST_CTRL(2, 32, 1024),
  13.778 +                     DST_CHAN_SRC);
  13.779 +            vec_dstst(dst + vector_dst_lead, DST_CTRL(2, 32, 1024),
  13.780 +                      DST_CHAN_DEST);
  13.781 +            voverflow = vec_ld(15, src);
  13.782 +            src += 4;
  13.783 +            width -= 4;
  13.784 +            vbits = vec_perm(vbits, voverflow, valigner);       /* src is ready. */
  13.785 +            vbits = vec_perm(vbits, vzero, vpermute);   /* swizzle it. */
  13.786 +            vec_st(vbits, 0, dst);      /* store it back out. */
  13.787 +            dst += 4;
  13.788 +            vbits = voverflow;  /* reuse the second load as the first half of the next iteration */
  13.789 +        }
  13.790 +
  13.791 +        assert(width == 0);
  13.792 +
  13.793 +        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
  13.794 +        while (extrawidth) {
  13.795 +            bits = *(src++);    /* max 3 pixels (extrawidth is width % 4), don't bother with prefetch. */
  13.796 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
  13.797 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
  13.798 +            extrawidth--;
  13.799 +        }
  13.800 +
  13.801 +        src += srcskip;
  13.802 +        dst += dstskip;
  13.803 +    }
  13.804 +
  13.805 +    vec_dss(DST_CHAN_SRC);      /* stop the two data streams started by the prefetch hints above */
  13.806 +    vec_dss(DST_CHAN_DEST);
  13.807 +}
  13.808 +
  13.809 +static Uint32
  13.810 +GetBlitFeatures(void)  /* returns the blit feature bitmask, computing it on first use */
  13.811 +{
  13.812 +    static Uint32 features = 0xffffffff;        /* cached result; 0xffffffff means "not computed yet" */
  13.813 +    Uint32 detected = 0;
  13.814 +    char *override;
  13.815 +    if (features != 0xffffffff) {
  13.816 +        return features;
  13.817 +    }
  13.818 +    /* Provide an override for testing .. */
  13.819 +    override = SDL_getenv("SDL_ALTIVEC_BLIT_FEATURES");
  13.820 +    if (override) {
  13.821 +        features = 0;           /* stays 0 when the value does not parse */
  13.822 +        SDL_sscanf(override, "%u", &features);
  13.823 +        return features;
  13.824 +    }
  13.825 +    detected |= (SDL_HasMMX())? 1 : 0;          /* Feature 1 is has-MMX */
  13.826 +    detected |= (SDL_HasAltiVec())? 2 : 0;      /* Feature 2 is has-AltiVec */
  13.827 +    /* Feature 4 is dont-use-prefetch */
  13.828 +    /* !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4. */
  13.829 +    detected |= (GetL3CacheSize() == 0) ? 4 : 0;
  13.830 +    features = detected;
  13.831 +    return features;
  13.832 +}
  13.833 +
  13.834 +#if __MWERKS__
  13.835 +#pragma altivec_model off
  13.836 +#endif
  13.837 +#else
  13.838  /* Feature 1 is has-MMX */
  13.839  #define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
  13.840 +#endif
  13.841  
  13.842  /* This is now endian dependent */
  13.843  #if SDL_BYTEORDER == SDL_LIL_ENDIAN
  13.844 @@ -1508,6 +2340,15 @@
  13.845  };
  13.846  
  13.847  static const struct blit_table normal_blit_2[] = {
  13.848 +#if SDL_ALTIVEC_BLITTERS
  13.849 +    /* has-altivec */
  13.850 +    {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00000000, 0x00000000,
  13.851 +     0x00000000,
  13.852 +     2, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA},
  13.853 +    {0x00007C00, 0x000003E0, 0x0000001F, 4, 0x00000000, 0x00000000,
  13.854 +     0x00000000,
  13.855 +     2, Blit_RGB555_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA},
  13.856 +#endif
  13.857      {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00,
  13.858       0x000000FF,
  13.859       0, Blit_RGB565_ARGB8888, SET_ALPHA},
  13.860 @@ -1531,6 +2372,22 @@
  13.861  };
  13.862  
  13.863  static const struct blit_table normal_blit_4[] = {
  13.864 +#if SDL_ALTIVEC_BLITTERS
  13.865 +    /* has-altivec | dont-use-prefetch */
  13.866 +    {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
  13.867 +     0x00000000,
  13.868 +     6, ConvertAltivec32to32_noprefetch,
  13.869 +     NO_ALPHA | COPY_ALPHA | SET_ALPHA},
  13.870 +    /* has-altivec */
  13.871 +    {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
  13.872 +     0x00000000,
  13.873 +     2, ConvertAltivec32to32_prefetch,
  13.874 +     NO_ALPHA | COPY_ALPHA | SET_ALPHA},
  13.875 +    /* has-altivec */
  13.876 +    {0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0,
  13.877 +     0x0000001F,
  13.878 +     2, Blit_RGB888_RGB565Altivec, NO_ALPHA},
  13.879 +#endif
  13.880      {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
  13.881       0x0000001F,
  13.882       0, Blit_RGB888_RGB565, NO_ALPHA},
  13.883 @@ -1628,6 +2485,12 @@
  13.884          else if (dstfmt->BytesPerPixel == 1)
  13.885              return BlitNto1Key;
  13.886          else {
  13.887 +#if SDL_ALTIVEC_BLITTERS
  13.888 +            if ((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4)
  13.889 +                && SDL_HasAltiVec()) {
  13.890 +                return Blit32to32KeyAltivec;
  13.891 +            } else
  13.892 +#endif
  13.893              if (srcfmt->Amask && dstfmt->Amask) {
  13.894                  return BlitNtoNKeyCopyAlpha;
  13.895              } else {
    14.1 --- a/test/automated/platform/platform.c	Mon Feb 21 23:45:48 2011 -0800
    14.2 +++ b/test/automated/platform/platform.c	Tue Feb 22 21:44:36 2011 -0800
    14.3 @@ -158,7 +158,9 @@
    14.4     SDL_ATprintVerbose( 1, "CPU count: %d\n", SDL_GetCPUCount());
    14.5     SDL_ATprintVerbose( 1, "Available extensions:\n" );
    14.6     SDL_ATprintVerbose( 1, "   RDTSC %s\n", SDL_HasRDTSC()? "detected" : "not detected" );
    14.7 +   SDL_ATprintVerbose( 1, "   AltiVec %s\n", SDL_HasAltiVec()? "detected" : "not detected" );
    14.8     SDL_ATprintVerbose( 1, "   MMX %s\n", SDL_HasMMX()? "detected" : "not detected" );
    14.9 +   SDL_ATprintVerbose( 1, "   3DNow! %s\n", SDL_Has3DNow()? "detected" : "not detected" );
   14.10     SDL_ATprintVerbose( 1, "   SSE %s\n", SDL_HasSSE()? "detected" : "not detected" );
   14.11     SDL_ATprintVerbose( 1, "   SSE2 %s\n", SDL_HasSSE2()? "detected" : "not detected" );
   14.12     SDL_ATprintVerbose( 1, "   SSE3 %s\n", SDL_HasSSE3()? "detected" : "not detected" );
    15.1 --- a/test/testplatform.c	Mon Feb 21 23:45:48 2011 -0800
    15.2 +++ b/test/testplatform.c	Tue Feb 22 21:44:36 2011 -0800
    15.3 @@ -140,9 +140,11 @@
    15.4  {
    15.5      if (verbose) {
    15.6          printf("CPU count: %d\n", SDL_GetCPUCount());
    15.7 -	printf("CPU cache line size: %d\n", SDL_GetCPUCacheLineSize());
    15.8 +        printf("CPU cache line size: %d\n", SDL_GetCPUCacheLineSize());
    15.9          printf("RDTSC %s\n", SDL_HasRDTSC()? "detected" : "not detected");
   15.10 +        printf("AltiVec %s\n", SDL_HasAltiVec()? "detected" : "not detected");
   15.11          printf("MMX %s\n", SDL_HasMMX()? "detected" : "not detected");
   15.12 +        printf("3DNow! %s\n", SDL_Has3DNow()? "detected" : "not detected");
   15.13          printf("SSE %s\n", SDL_HasSSE()? "detected" : "not detected");
   15.14          printf("SSE2 %s\n", SDL_HasSSE2()? "detected" : "not detected");
   15.15          printf("SSE3 %s\n", SDL_HasSSE3()? "detected" : "not detected");