Fixed crash with movaps instruction in SDL_memcpy(), due to unaligned Uint32* cast and -O3 vectorization optimizations with gcc 4.9.0
author Sam Lantinga <slouken@libsdl.org>
Mon, 27 May 2013 16:18:11 -0700
changeset 7236 81ebe816a6da
parent 7235 80fefd4af771
child 7237 ed32fc5d296b
Fixed crash with movaps instruction in SDL_memcpy(), due to unaligned Uint32* cast and -O3 vectorization optimizations with gcc 4.9.0
src/stdlib/SDL_string.c
     1.1 --- a/src/stdlib/SDL_string.c	Sun May 26 22:16:42 2013 -0700
     1.2 +++ b/src/stdlib/SDL_string.c	Mon May 27 16:18:11 2013 -0700
     1.3 @@ -310,29 +310,49 @@
     1.4  void *
     1.5  SDL_memcpy(void *dst, const void *src, size_t len)
     1.6  {
     1.7 -    size_t left = (len % 4);
     1.8 -    Uint32 *srcp4, *dstp4;
     1.9 -    Uint8 *srcp1, *dstp1;
    1.10 +#ifdef __GNUC__
    1.11 +    /* Presumably this is well tuned for speed.
    1.12 +       On my machine this is twice as fast as the C code below.
    1.13 +     */
    1.14 +    return __builtin_memcpy(dst, src, len);
    1.15 +#else
    1.16 +    /* GCC 4.9.0 with -O3 will generate movaps instructions with the loop
    1.17 +       using Uint32* pointers, so we need to make sure the pointers are
    1.18 +       aligned before we loop using them.
    1.19 +     */
    1.20 +    if (((intptr_t)src & 0x3) || ((intptr_t)dst & 0x3)) {
    1.21 +        /* Do an unaligned byte copy */
    1.22 +        Uint8 *srcp1 = (Uint8 *)src;
    1.23 +        Uint8 *dstp1 = (Uint8 *)dst;
    1.24  
    1.25 -    srcp4 = (Uint32 *) src;
    1.26 -    dstp4 = (Uint32 *) dst;
    1.27 -    len /= 4;
    1.28 -    while (len--) {
    1.29 -        *dstp4++ = *srcp4++;
    1.30 +        while (len--) {
    1.31 +            *dstp1++ = *srcp1++;
    1.32 +        }
    1.33 +    } else {
    1.34 +        size_t left = (len % 4);
    1.35 +        Uint32 *srcp4, *dstp4;
    1.36 +        Uint8 *srcp1, *dstp1;
    1.37 +
    1.38 +        srcp4 = (Uint32 *) src;
    1.39 +        dstp4 = (Uint32 *) dst;
    1.40 +        len /= 4;
    1.41 +        while (len--) {
    1.42 +            *dstp4++ = *srcp4++;
    1.43 +        }
    1.44 +
    1.45 +        srcp1 = (Uint8 *) srcp4;
    1.46 +        dstp1 = (Uint8 *) dstp4;
    1.47 +        switch (left) {
    1.48 +        case 3:
    1.49 +            *dstp1++ = *srcp1++;
    1.50 +        case 2:
    1.51 +            *dstp1++ = *srcp1++;
    1.52 +        case 1:
    1.53 +            *dstp1++ = *srcp1++;
    1.54 +        }
    1.55      }
    1.56 -
    1.57 -    srcp1 = (Uint8 *) srcp4;
    1.58 -    dstp1 = (Uint8 *) dstp4;
    1.59 -    switch (left) {
    1.60 -    case 3:
    1.61 -        *dstp1++ = *srcp1++;
    1.62 -    case 2:
    1.63 -        *dstp1++ = *srcp1++;
    1.64 -    case 1:
    1.65 -        *dstp1++ = *srcp1++;
    1.66 -    }
    1.67 -
    1.68      return dst;
    1.69 +#endif /* __GNUC__ */
    1.70  }
    1.71  #endif
    1.72