Added release/acquire memory barriers to the atomic API

* Added a destructor to clean up TLS memory at thread shutdown
* Refactored the TLS code into platform-independent code plus a small platform-dependent core, with a fallback to generic code if the platform-dependent functions fail.
* Fixed recursion issues with SDL_GetErrBuf()
libsdl-org · Jul 11, 2013 · 31d2b67 · 31d2b67
1 parent bd6696d
commit 31d2b67
Show file tree

Hide file tree

Showing 10 changed files with 334 additions and 319 deletions.
diff --git a/include/SDL_atomic.h b/include/SDL_atomic.h
@@ -45,6 +45,7 @@
  *
  * There's also lots of good information here:
  * http://www.1024cores.net/home/lock-free-algorithms
+ * http://preshing.com/
  *
  * These operations may or may not actually be implemented using
  * processor specific atomic operations. When possible they are
@@ -135,6 +136,32 @@ void _ReadWriteBarrier(void);
 { SDL_SpinLock _tmp = 0; SDL_AtomicLock(&_tmp); SDL_AtomicUnlock(&_tmp); }
 #endif
 
+/**
+ * Memory barriers are designed to prevent reads and writes from being
+ * reordered by the compiler and being seen out of order on multi-core CPUs.
+ *
+ * A typical pattern would be for thread A to write some data and a flag,
+ * and for thread B to read the flag and get the data. In this case you
+ * would insert a release barrier between writing the data and the flag,
+ * guaranteeing that the data write completes no later than the flag is
+ * written, and you would insert an acquire barrier between reading the
+ * flag and reading the data, to ensure that the data reads cannot be
+ * performed before the read of the flag has completed.
+ *
+ * In this pattern you should always see a release barrier paired with
+ * an acquire barrier and you should gate the data reads/writes with a
+ * single flag variable.
+ *
+ * For more information on these semantics, take a look at the blog post:
+ * http://preshing.com/20120913/acquire-and-release-semantics
+ */
+/* FIXME: Both barriers currently expand to a compiler-only barrier.
+   This is correct for x86 and x64 but not other CPUs:
+   for PPC we need the lwsync instruction, and on ARM some variant of dmb.
+ */
+#define SDL_MemoryBarrierRelease()  SDL_CompilerBarrier()
+#define SDL_MemoryBarrierAcquire()  SDL_CompilerBarrier()
+
+
 /* Platform specific optimized versions of the atomic functions,
  * you can disable these by defining SDL_DISABLE_ATOMIC_INLINE
  */

diff --git a/include/SDL_thread.h b/include/SDL_thread.h
@@ -48,8 +48,8 @@ typedef struct SDL_Thread SDL_Thread;
 /* The SDL thread ID */
 typedef unsigned long SDL_threadID;
 
-/* Thread local storage ID */
-typedef int SDL_TLSID;
+/* Thread local storage ID, 0 is the invalid ID */
+typedef unsigned SDL_TLSID;
 
 /* The SDL thread priority
  *
@@ -219,13 +219,14 @@ extern DECLSPEC void * SDLCALL SDL_TLSGet(SDL_TLSID id);
  *
  *  \param id The thread local storage ID
  *  \param value The value to associate with the ID for the current thread
+ *  \param destructor A function called when the thread exits, to free the value.
  *
  *  \return 0 on success, -1 on error
  *
  *  \sa SDL_TLSCreate()
  *  \sa SDL_TLSGet()
  */
-extern DECLSPEC int SDLCALL SDL_TLSSet(SDL_TLSID id, const void *value);
+extern DECLSPEC int SDLCALL SDL_TLSSet(SDL_TLSID id, const void *value, void (*destructor)(void*));
 
 
 /* Ends C function definitions when using C++ */

diff --git a/src/thread/SDL_systhread.h b/src/thread/SDL_systhread.h
@@ -50,6 +50,12 @@ extern int SDL_SYS_SetThreadPriority(SDL_ThreadPriority priority);
  */
 extern void SDL_SYS_WaitThread(SDL_Thread * thread);
 
+/* Get the thread local storage for this thread.
+   Callers in SDL_thread.c treat a NULL return as "no storage for this
+   thread yet". */
+extern SDL_TLSData *SDL_SYS_GetTLSData();
+
+/* Set the thread local storage for this thread.
+   Callers in SDL_thread.c treat a non-zero return as failure, and pass
+   NULL to detach the storage before freeing it. */
+extern int SDL_SYS_SetTLSData(SDL_TLSData *data);
+
 #endif /* _SDL_systhread_h */
 
 /* vi: set ts=4 sw=4 expandtab: */
diff --git a/src/thread/SDL_thread.c b/src/thread/SDL_thread.c
@@ -28,38 +28,219 @@
 #include "../SDL_error_c.h"
 
 
+/* Allocate a new thread local storage ID.
+
+   IDs are handed out from a single static atomic counter.  The counter
+   result is offset by one so that 0 can stay reserved as the invalid ID
+   (this assumes SDL_AtomicIncRef() yields the value before the increment
+   -- TODO confirm against SDL_atomic.h).
+ */
+SDL_TLSID
+SDL_TLSCreate(void)
+{
+    static SDL_atomic_t SDL_tls_id;
+    return SDL_AtomicIncRef(&SDL_tls_id)+1;
+}
+
+/* Fetch the value the calling thread stored under a TLS ID.
+
+   Returns NULL when the thread has no TLS block yet, when id is the
+   invalid ID 0, or when id lies beyond the slots currently allocated
+   for this thread.
+ */
+void *
+SDL_TLSGet(SDL_TLSID id)
+{
+    SDL_TLSData *tls = SDL_SYS_GetTLSData();
+
+    if (tls && id != 0 && id <= tls->limit) {
+        /* IDs are 1-based, slots are 0-based */
+        return tls->array[id-1].data;
+    }
+    return NULL;
+}
+
+/* Associate a value (and an optional destructor) with a TLS ID for the
+   calling thread.
+
+   id          TLS ID to store under; 0 is rejected as invalid.
+   value       Value to store for this thread.
+   destructor  Stored next to the value; SDL_TLSCleanup() calls it with the
+               value.  May be NULL.
+
+   Returns 0 on success and a negative value on error (invalid id, out of
+   memory, or failure to register the storage with the system layer).
+
+   Overwriting a slot does not invoke the destructor of the previous value.
+
+   When the per-thread block has to grow, a new block is allocated and
+   registered before the old one is released.  If registration fails the
+   thread keeps its previous, still valid, block and nothing is leaked.
+ */
+int
+SDL_TLSSet(SDL_TLSID id, const void *value, void (*destructor)(void *))
+{
+    SDL_TLSData *storage;
+
+    if (id == 0) {
+        return SDL_InvalidParamError("id");
+    }
+
+    storage = SDL_SYS_GetTLSData();
+    if (!storage || id > storage->limit) {
+        SDL_TLSData *grown;
+        int i, oldlimit, newlimit;
+
+        oldlimit = storage ? storage->limit : 0;
+        newlimit = (id + TLS_ALLOC_CHUNKSIZE);
+        /* SDL_TLSData already contains one array element, hence newlimit-1 */
+        grown = (SDL_TLSData *)SDL_malloc(sizeof(*grown)+(newlimit-1)*sizeof(grown->array[0]));
+        if (!grown) {
+            return SDL_OutOfMemory();
+        }
+        grown->limit = newlimit;
+        /* Carry over the existing slots, then clear the newly added ones */
+        for (i = 0; i < oldlimit; ++i) {
+            grown->array[i] = storage->array[i];
+        }
+        for (i = oldlimit; i < newlimit; ++i) {
+            grown->array[i].data = NULL;
+            grown->array[i].destructor = NULL;
+        }
+        if (SDL_SYS_SetTLSData(grown) != 0) {
+            /* The old block (if any) is untouched and still registered */
+            SDL_free(grown);
+            return -1;
+        }
+        SDL_free(storage);
+        storage = grown;
+    }
+
+    storage->array[id-1].data = SDL_const_cast(void*, value);
+    storage->array[id-1].destructor = destructor;
+    return 0;
+}
+
+/* Tear down the calling thread's TLS block: invoke each slot's destructor
+   (where one was registered) with the slot's value, detach the block from
+   the thread, then free it.  Does nothing if the thread has no TLS block.
+ */
+static void
+SDL_TLSCleanup()
+{
+    SDL_TLSData *tls = SDL_SYS_GetTLSData();
+    int slot;
+
+    if (!tls) {
+        return;
+    }
+
+    slot = 0;
+    while (slot < tls->limit) {
+        void (*destroy)(void *) = tls->array[slot].destructor;
+        if (destroy) {
+            destroy(tls->array[slot].data);
+        }
+        ++slot;
+    }
+    SDL_SYS_SetTLSData(NULL);
+    SDL_free(tls);
+}
+
+
+/* This is a generic implementation of thread-local storage which doesn't
+   require additional OS support.
+
+   It is not especially efficient and doesn't clean up thread-local storage
+   as threads exit.  If there is a real OS that doesn't support thread-local
+   storage this implementation should be improved to be production quality.
+*/
+
+typedef struct SDL_TLSEntry {
+    SDL_threadID thread;            /* ID of the thread this entry belongs to */
+    SDL_TLSData *storage;           /* TLS block registered for that thread */
+    struct SDL_TLSEntry *next;      /* next entry in the singly linked list */
+} SDL_TLSEntry;
+
+static SDL_mutex *SDL_generic_TLS_mutex;    /* guards the list below; created lazily in SDL_Generic_GetTLSData() */
+static SDL_TLSEntry *SDL_generic_TLS;       /* head of the list of per-thread entries */
+
+
+/* Generic lookup of the calling thread's TLS block.
+
+   The list mutex is created on first use, serialized by a static spinlock.
+   The list is then searched under the mutex for an entry whose thread ID
+   matches the caller.  Returns the registered block, or NULL if the thread
+   has none or the mutex could not be created.
+ */
+SDL_TLSData *
+SDL_Generic_GetTLSData()
+{
+    SDL_threadID self = SDL_ThreadID();
+    SDL_TLSEntry *node;
+    SDL_TLSData *found = NULL;
+
+    if (SDL_generic_TLS_mutex == NULL) {
+        static SDL_SpinLock tls_lock;
+        SDL_AtomicLock(&tls_lock);
+        /* Re-check under the lock: another thread may have created it */
+        if (SDL_generic_TLS_mutex == NULL) {
+            SDL_mutex *created = SDL_CreateMutex();
+            /* Barrier before the mutex pointer is published */
+            SDL_MemoryBarrierRelease();
+            SDL_generic_TLS_mutex = created;
+            if (created == NULL) {
+                SDL_AtomicUnlock(&tls_lock);
+                return NULL;
+            }
+        }
+        SDL_AtomicUnlock(&tls_lock);
+    }
+
+    SDL_MemoryBarrierAcquire();
+    SDL_LockMutex(SDL_generic_TLS_mutex);
+    node = SDL_generic_TLS;
+    while (node != NULL && node->thread != self) {
+        node = node->next;
+    }
+    if (node != NULL) {
+        found = node->storage;
+    }
+    SDL_UnlockMutex(SDL_generic_TLS_mutex);
+
+    return found;
+}
+
+/* Generic registration of the calling thread's TLS block.
+
+   storage  Block to register for this thread, or NULL to remove the
+            thread's entry from the list.
+
+   A non-NULL storage replaces the block of an existing entry, or creates a
+   new list entry if the thread has none.  A NULL storage unlinks and frees
+   the thread's entry; if the thread has no entry there is nothing to do.
+
+   Returns 0 on success, or the result of SDL_OutOfMemory() if a new list
+   entry could not be allocated.  The error is reported only after the list
+   mutex has been released.
+ */
+int
+SDL_Generic_SetTLSData(SDL_TLSData *storage)
+{
+    SDL_threadID thread = SDL_ThreadID();
+    SDL_TLSEntry *prev, *entry;
+    int out_of_memory = 0;
+
+    /* SDL_Generic_GetTLSData() is always called first, so we can assume SDL_generic_TLS_mutex */
+    SDL_LockMutex(SDL_generic_TLS_mutex);
+    prev = NULL;
+    for (entry = SDL_generic_TLS; entry; entry = entry->next) {
+        if (entry->thread == thread) {
+            break;
+        }
+        prev = entry;
+    }
+    if (entry) {
+        if (storage) {
+            entry->storage = storage;
+        } else {
+            /* Unlink this thread's entry; entry is not used after the free */
+            if (prev) {
+                prev->next = entry->next;
+            } else {
+                SDL_generic_TLS = entry->next;
+            }
+            SDL_free(entry);
+        }
+    } else if (storage) {
+        entry = (SDL_TLSEntry *)SDL_malloc(sizeof(*entry));
+        if (entry) {
+            entry->thread = thread;
+            entry->storage = storage;
+            entry->next = SDL_generic_TLS;
+            SDL_generic_TLS = entry;
+        } else {
+            out_of_memory = 1;
+        }
+    }
+    SDL_UnlockMutex(SDL_generic_TLS_mutex);
+
+    if (out_of_memory) {
+        return SDL_OutOfMemory();
+    }
+    return 0;
+}
+
 /* Routine to get the thread-specific error variable */
 SDL_error *
 SDL_GetErrBuf(void)
 {
-    static SDL_SpinLock spinlock;
+    static SDL_SpinLock tls_lock;
     static SDL_bool tls_being_created;
     static SDL_TLSID tls_errbuf;
     static SDL_error SDL_global_errbuf;
+    const SDL_error *ALLOCATION_IN_PROGRESS = (SDL_error *)-1;
     SDL_error *errbuf;
 
+    /* tls_being_created is there simply to prevent recursion if SDL_TLSCreate() fails.
+       It also means it's possible for another thread to also use SDL_global_errbuf,
+       but that's very unlikely and hopefully won't cause issues.
+     */
     if (!tls_errbuf && !tls_being_created) {
-        SDL_AtomicLock(&spinlock);
+        SDL_AtomicLock(&tls_lock);
         if (!tls_errbuf) {
-            /* SDL_TLSCreate() could fail and call SDL_SetError() */
+            SDL_TLSID slot;
             tls_being_created = SDL_TRUE;
-            tls_errbuf = SDL_TLSCreate();
+            slot = SDL_TLSCreate();
             tls_being_created = SDL_FALSE;
+            SDL_MemoryBarrierRelease();
+            tls_errbuf = slot;
         }
-        SDL_AtomicUnlock(&spinlock);
+        SDL_AtomicUnlock(&tls_lock);
     }
     if (!tls_errbuf) {
         return &SDL_global_errbuf;
     }
 
-    errbuf = SDL_TLSGet(tls_errbuf);
+    SDL_MemoryBarrierAcquire();
+    errbuf = (SDL_error *)SDL_TLSGet(tls_errbuf);
+    if (errbuf == ALLOCATION_IN_PROGRESS) {
+        return &SDL_global_errbuf;
+    }
     if (!errbuf) {
+        /* Mark that we're in the middle of allocating our buffer */
+        SDL_TLSSet(tls_errbuf, ALLOCATION_IN_PROGRESS, NULL);
         errbuf = (SDL_error *)SDL_malloc(sizeof(*errbuf));
         if (!errbuf) {
+            SDL_TLSSet(tls_errbuf, NULL, NULL);
             return &SDL_global_errbuf;
         }
         SDL_zerop(errbuf);
-        SDL_TLSSet(tls_errbuf, errbuf);
+        SDL_TLSSet(tls_errbuf, errbuf, SDL_free);
     }
     return errbuf;
 }
@@ -82,9 +263,7 @@ SDL_RunThread(void *data)
     void *userdata = args->data;
     int *statusloc = &args->info->status;
 
-    /* Perform any system-dependent setup
-       - this function cannot fail, and cannot use SDL_SetError()
-     */
+    /* Perform any system-dependent setup - this function may not fail */
     SDL_SYS_SetupThread(args->info->name);
 
     /* Get the thread id */
@@ -95,6 +274,9 @@ SDL_RunThread(void *data)
 
     /* Run the function */
     *statusloc = userfunc(userdata);
+
+    /* Clean up thread-local storage */
+    SDL_TLSCleanup();
 }
 
 #ifdef SDL_PASSED_BEGINTHREAD_ENDTHREAD

diff --git a/src/thread/SDL_thread_c.h b/src/thread/SDL_thread_c.h
@@ -56,6 +56,30 @@ struct SDL_Thread
 /* This is the function called to run a thread */
 extern void SDL_RunThread(void *data);
 
+/* This is the system-independent thread local storage structure.
+   It is allocated with room for 'limit' array entries, not just the one
+   declared here. */
+typedef struct {
+    int limit;                      /* number of usable entries in array[] */
+    struct {
+        void *data;                 /* value stored for this TLS ID */
+        void (*destructor)(void*);  /* called with data by SDL_TLSCleanup(); may be NULL */
+    } array[1];                     /* slot for TLS ID n is array[n-1] */
+} SDL_TLSData;
+
+/* This is how many TLS entries we allocate at once */
+#define TLS_ALLOC_CHUNKSIZE 4
+
+/* Get cross-platform, slow, thread local storage for this thread.
+   This is only intended as a fallback if getting real thread-local
+   storage fails or isn't supported on this platform.
+ */
+extern SDL_TLSData *SDL_Generic_GetTLSData();
+
+/* Set cross-platform, slow, thread local storage for this thread.
+   This is only intended as a fallback if getting real thread-local
+   storage fails or isn't supported on this platform.
+ */
+extern int SDL_Generic_SetTLSData(SDL_TLSData *data);
+
 #endif /* _SDL_thread_c_h */
 
 /* vi: set ts=4 sw=4 expandtab: */