diff --git a/src/coreclr/nativeaot/Runtime/threadstore.cpp b/src/coreclr/nativeaot/Runtime/threadstore.cpp
index c2b42491a387d9..c03907e4eed665 100644
--- a/src/coreclr/nativeaot/Runtime/threadstore.cpp
+++ b/src/coreclr/nativeaot/Runtime/threadstore.cpp
@@ -162,7 +162,7 @@ void ThreadStore::DetachCurrentThread()
     }
 
     // Unregister from OS notifications
-    // This can return false if detach notification is spurious and does not belong to this thread.
+    // This can return false if a thread did not register for OS notification.
     if (!PalDetachThread(pDetachingThread))
     {
         return;
diff --git a/src/coreclr/vm/ceemain.cpp b/src/coreclr/vm/ceemain.cpp
index 2784196ab2e959..eba08c952b21dc 100644
--- a/src/coreclr/vm/ceemain.cpp
+++ b/src/coreclr/vm/ceemain.cpp
@@ -1701,6 +1701,125 @@ BOOL STDMETHODCALLTYPE EEDllMain( // TRUE on success, FALSE on error.
 
 #endif // !defined(CORECLR_EMBEDDED)
 
+static void RuntimeThreadShutdown(void* thread)
+{
+    Thread* pThread = (Thread*)thread;
+    _ASSERTE(pThread == GetThreadNULLOk());
+
+    if (pThread)
+    {
+#ifdef FEATURE_COMINTEROP
+        // reset the CoInitialize state
+        // so we don't call CoUninitialize during thread detach
+        pThread->ResetCoInitialized();
+#endif // FEATURE_COMINTEROP
+        // For case where thread calls ExitThread directly, we need to reset the
+        // frame pointer. Otherwise stackwalk would AV. We need to do it in cooperative mode.
+        // We need to set m_GCOnTransitionsOK so this thread won't trigger GC when toggle GC mode
+        if (pThread->m_pFrame != FRAME_TOP)
+        {
+#ifdef _DEBUG
+            pThread->m_GCOnTransitionsOK = FALSE;
+#endif
+            GCX_COOP_NO_DTOR();
+            pThread->m_pFrame = FRAME_TOP;
+            GCX_COOP_NO_DTOR_END();
+        }
+
+        pThread->DetachThread(TRUE);
+    }
+    else
+    {
+        // Since we don't actually cleanup the TLS data along this path, verify that it is already cleaned up
+        AssertThreadStaticDataFreed();
+    }
+
+    ThreadDetaching();
+}
+
+#ifdef TARGET_WINDOWS
+
+// Index for the fiber local storage of the attached thread pointer
+static uint32_t g_flsIndex = FLS_OUT_OF_INDEXES;
+
+// This is called when each *fiber* is destroyed. When the home fiber of a thread is destroyed,
+// it means that the thread itself is destroyed.
+// Since we receive that notification outside of the Loader Lock, it allows us to safely acquire
+// the ThreadStore lock in the RuntimeThreadShutdown.
+static void __stdcall FiberDetachCallback(void* lpFlsData)
+{
+    ASSERT(g_flsIndex != FLS_OUT_OF_INDEXES);
+    ASSERT(lpFlsData == FlsGetValue(g_flsIndex));
+
+    if (lpFlsData != NULL)
+    {
+        // The current fiber is the home fiber of a thread, so the thread is shutting down
+        RuntimeThreadShutdown(lpFlsData);
+    }
+}
+
+void InitFlsSlot()
+{
+    // We use fiber detach callbacks to run our thread shutdown code because the fiber detach
+    // callback is made without the OS loader lock
+    g_flsIndex = FlsAlloc(FiberDetachCallback);
+    if (g_flsIndex == FLS_OUT_OF_INDEXES)
+    {
+        COMPlusThrowWin32();
+    }
+}
+
+// Register the thread with OS to be notified when thread is about to be destroyed
+// It fails fast if a different thread was already registered with the current fiber.
+// Parameters:
+//  thread        - thread to attach
+static void OsAttachThread(void* thread)
+{
+    void* threadFromCurrentFiber = FlsGetValue(g_flsIndex);
+
+    if (threadFromCurrentFiber != NULL)
+    {
+        _ASSERTE_ALL_BUILDS(!"Multiple threads encountered from a single fiber");
+    }
+
+    // Associate the current fiber with the current thread.  This makes the current fiber the thread's "home"
+    // fiber.  This fiber is the only fiber allowed to execute managed code on this thread.  When this fiber
+    // is destroyed, we consider the thread to be destroyed.
+    FlsSetValue(g_flsIndex, thread);
+}
+
+// Detach thread from OS notifications.
+// It fails fast if some other thread value was attached to the current fiber.
+// Parameters:
+//  thread        - thread to detach
+// Return:
+//  true if the thread was detached, false if there was no attached thread
+void OsDetachThread(void* thread)
+{
+    ASSERT(g_flsIndex != FLS_OUT_OF_INDEXES);
+    void* threadFromCurrentFiber = FlsGetValue(g_flsIndex);
+
+    if (threadFromCurrentFiber == NULL)
+    {
+        // we've seen this thread, but not this fiber.  It must be a "foreign" fiber that was
+        // borrowing this thread.
+        return;
+    }
+
+    if (threadFromCurrentFiber != thread)
+    {
+        _ASSERTE_ALL_BUILDS(!"Detaching a thread from the wrong fiber");
+    }
+
+    FlsSetValue(g_flsIndex, NULL);
+}
+
+void EnsureTlsDestructionMonitor()
+{
+    OsAttachThread(GetThread());
+}
+
+#else
 struct TlsDestructionMonitor
 {
     bool m_activated = false;
@@ -1714,36 +1833,7 @@ struct TlsDestructionMonitor
     {
         if (m_activated)
         {
-            Thread* thread = GetThreadNULLOk();
-            if (thread)
-            {
-#ifdef FEATURE_COMINTEROP
-                // reset the CoInitialize state
-                // so we don't call CoUninitialize during thread detach
-                thread->ResetCoInitialized();
-#endif // FEATURE_COMINTEROP
-                // For case where thread calls ExitThread directly, we need to reset the
-                // frame pointer. Otherwise stackwalk would AV. We need to do it in cooperative mode.
-                // We need to set m_GCOnTransitionsOK so this thread won't trigger GC when toggle GC mode
-                if (thread->m_pFrame != FRAME_TOP)
-                {
-#ifdef _DEBUG
-                    thread->m_GCOnTransitionsOK = FALSE;
-#endif
-                    GCX_COOP_NO_DTOR();
-                    thread->m_pFrame = FRAME_TOP;
-                    GCX_COOP_NO_DTOR_END();
-                }
-
-                thread->DetachThread(TRUE);
-            }
-            else
-            {
-                // Since we don't actually cleanup the TLS data along this path, verify that it is already cleaned up
-                AssertThreadStaticDataFreed();
-            }
-
-            ThreadDetaching();
+            RuntimeThreadShutdown(GetThreadNULLOk());
         }
     }
 };
@@ -1757,6 +1847,8 @@ void EnsureTlsDestructionMonitor()
     tls_destructionMonitor.Activate();
 }
 
+#endif
+
 #ifdef DEBUGGING_SUPPORTED
 //
 // InitializeDebugger initialized the Runtime-side COM+ Debugging Services
diff --git a/src/coreclr/vm/ceemain.h b/src/coreclr/vm/ceemain.h
index 1404a5a04237ff..120082d30572ca 100644
--- a/src/coreclr/vm/ceemain.h
+++ b/src/coreclr/vm/ceemain.h
@@ -46,6 +46,10 @@ void ForceEEShutdown(ShutdownCompleteAction sca = SCA_ExitProcessWhenShutdownCom
 void ThreadDetaching();
 
 void EnsureTlsDestructionMonitor();
+#ifdef TARGET_WINDOWS
+void InitFlsSlot();
+void OsDetachThread(void* thread);
+#endif
 
 void SetLatchedExitCode (INT32 code);
 INT32 GetLatchedExitCode (void);
diff --git a/src/coreclr/vm/threads.cpp b/src/coreclr/vm/threads.cpp
index e89b6c7d94c1d0..7ffb9f4746d94a 100644
--- a/src/coreclr/vm/threads.cpp
+++ b/src/coreclr/vm/threads.cpp
@@ -353,6 +353,7 @@ void SetThread(Thread* t)
 {
     LIMITED_METHOD_CONTRACT
 
+    Thread* origThread = gCurrentThreadInfo.m_pThread;
     gCurrentThreadInfo.m_pThread = t;
     if (t != NULL)
     {
@@ -360,6 +361,14 @@ void SetThread(Thread* t)
         EnsureTlsDestructionMonitor();
         t->InitRuntimeThreadLocals();
     }
+#ifdef TARGET_WINDOWS
+    else if (origThread != NULL)
+    {
+        // Unregister from OS notifications
+        // This can return false if a thread did not register for OS notification.
+        OsDetachThread(origThread);
+    }
+#endif
 
     // Clear or set the app domain to the one domain based on if the thread is being nulled out or set
     gCurrentThreadInfo.m_pAppDomain = t == NULL ? NULL : AppDomain::GetCurrentDomain();
@@ -1039,6 +1048,10 @@ void InitThreadManager()
     }
     CONTRACTL_END;
 
+#ifdef TARGET_WINDOWS
+    InitFlsSlot();
+#endif
+
     // All patched helpers should fit into one page.
     // If you hit this assert on retail build, there is most likely problem with BBT script.
     _ASSERTE_ALL_BUILDS((BYTE*)JIT_PatchedCodeLast - (BYTE*)JIT_PatchedCodeStart > (ptrdiff_t)0);