diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8e10f4950..e3339917a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -479,7 +479,7 @@ jobs:
         git diff --exit-code
     - name: Run clang-tidy
       run: |
-        clang-tidy-15  src/snmalloc/override/malloc.cc  -header-filter="`pwd`/*" -warnings-as-errors='*' -export-fixes=tidy.fail -- -std=c++17 -mcx16 -DSNMALLOC_PLATFORM_HAS_GETENTROPY=0 -Isrc
+        clang-tidy-15  src/snmalloc/override/malloc.cc  -header-filter="`pwd`/*" -warnings-as-errors='*' -export-fixes=tidy.fail -- -std=c++17 -mcx16 -DSNMALLOC_USE_WAIT_ON_ADDRESS=1 -DSNMALLOC_PLATFORM_HAS_GETENTROPY=0 -Isrc
         if [ -f tidy.fail ] ; then
           cat tidy.fail
           exit 1
diff --git a/.gitignore b/.gitignore
index 8737c737a..8057a72fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,5 @@ CMakeFiles/
 *~
 *.sw?
 
+# cache dirs
+.cache
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5aa31814..019feaa48 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ option(SNMALLOC_LINK_ICF "Link with Identical Code Folding" ON)
 option(SNMALLOC_IPO "Link with IPO/LTO support" OFF)
 option(SNMALLOC_BENCHMARK_INDIVIDUAL_MITIGATIONS "Build tests and ld_preload for individual mitigations" OFF)
 option(SNMALLOC_ENABLE_DYNAMIC_LOADING "Build such that snmalloc can be dynamically loaded. This is not required for LD_PRELOAD, and will harm performance if enabled." OFF)
+option(SNMALLOC_ENABLE_WAIT_ON_ADDRESS "Use wait on address backoff strategy if it is available" ON)
 # Options that apply only if we're not building the header-only library
 cmake_dependent_option(SNMALLOC_RUST_SUPPORT "Build static library for rust" OFF "NOT SNMALLOC_HEADER_ONLY_LIBRARY" OFF)
 cmake_dependent_option(SNMALLOC_STATIC_LIBRARY "Build static libraries" ON "NOT SNMALLOC_HEADER_ONLY_LIBRARY" OFF)
@@ -134,6 +135,9 @@ int main() {
 # this is why we check its existence here
 CHECK_INCLUDE_FILE_CXX(linux/random.h SNMALLOC_HAS_LINUX_RANDOM_H)
 
+# Check whether linux/futex.h is available (the Linux PAL needs it for wait-on-address)
+CHECK_INCLUDE_FILE_CXX(linux/futex.h SNMALLOC_HAS_LINUX_FUTEX_H)
+
 # Provide as function so other projects can reuse
 # FIXME: This modifies some variables that may or may not be the ones that
 # provide flags and so is broken by design.  It should be removed once Verona
@@ -193,6 +197,13 @@ if(SNMALLOC_USE_CXX17)
 else()
   target_compile_features(snmalloc INTERFACE cxx_std_20)
 endif()
+
+if(SNMALLOC_ENABLE_WAIT_ON_ADDRESS)
+  target_compile_definitions(snmalloc INTERFACE SNMALLOC_USE_WAIT_ON_ADDRESS=1)
+else()
+  target_compile_definitions(snmalloc INTERFACE SNMALLOC_USE_WAIT_ON_ADDRESS=0)
+endif()
+
 # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
 if(MSVC)
   target_compile_options(snmalloc INTERFACE "/Zc:__cplusplus")
@@ -248,6 +259,7 @@ add_as_define(SNMALLOC_TRACING)
 add_as_define(SNMALLOC_CI_BUILD)
 add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY)
 add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H)
+add_as_define(SNMALLOC_HAS_LINUX_FUTEX_H)
 if (SNMALLOC_NO_REALLOCARRAY)
   add_as_define(SNMALLOC_NO_REALLOCARRAY)
 endif()
diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h
index 1857713d1..89a4bc258 100644
--- a/src/snmalloc/ds/combininglock.h
+++ b/src/snmalloc/ds/combininglock.h
@@ -39,10 +39,45 @@ namespace snmalloc
    */
   class CombiningLockNode
   {
+    /**
+     * True when the combining lock should park waiting threads through
+     * `Pal`: the PAL must advertise `WaitOnAddress` and the build must set
+     * `SNMALLOC_USE_WAIT_ON_ADDRESS` to a non-zero value.
+     */
+    template<typename Pal>
+    static constexpr bool use_wait_on_address =
+      pal_supports<PalFeatures::WaitOnAddress, Pal> &&
+      SNMALLOC_USE_WAIT_ON_ADDRESS;
+
+    /**
+     * Chooses the integer type that backs `LockStatus`.  The primary template
+     * is the fallback used when wait-on-address is off; it never names
+     * `Pal::WaitingWord`, so PALs without that member still compile.
+     */
+    template<bool HasWaitOnAddress, typename Pal>
+    struct WaitWordTypeSelect
+    {
+      using type = int;
+    };
+
+    /**
+     * With wait-on-address enabled, the status word uses the PAL's own
+     * waiting word type.
+     */
+    template<typename Pal>
+    struct WaitWordTypeSelect<true, Pal>
+    {
+      using type = typename Pal::WaitingWord;
+    };
+
+    using WaitingWordType =
+      typename WaitWordTypeSelect<use_wait_on_address<DefaultPal>, DefaultPal>::
+        type;
+
     template<typename F>
     friend class CombiningLockNodeTempl;
 
-    enum class LockStatus
+    enum class LockStatus : WaitingWordType
     {
       // The work for this node has not been completed.
       WAITING,
@@ -53,7 +77,10 @@ namespace snmalloc
 
       // The work for this thread has not been completed, and it is the
       // head of the queue.
-      HEAD
+      HEAD,
+
+      // The waiter is currently sleeping.
+      SLEEPING
     };
 
     // Status of the queue, set by the thread at the head of the queue,
@@ -74,6 +101,75 @@ namespace snmalloc
       status.store(s, std::memory_order_release);
     }
 
+    /**
+     * Publish `message` as the new status of `node` and, if its owner has
+     * parked itself, wake that thread.
+     *
+     * Without wait-on-address support this is just `set_status`.  Otherwise
+     * the status is exchanged and the PAL is asked to wake one waiter only
+     * when the previous value was SLEEPING, so no notify call is made while
+     * the owner is still spinning.
+     */
+    template<typename Pal = DefaultPal>
+    static void wake(CombiningLockNode* node, LockStatus message)
+    {
+      if constexpr (!use_wait_on_address<Pal>)
+      {
+        node->set_status(message);
+      }
+      else
+      {
+        if (
+          node->status.exchange(message, std::memory_order_acq_rel) ==
+          LockStatus::SLEEPING)
+        {
+          Pal::notify_one_on_address(node->status);
+        }
+      }
+    }
+
+    /**
+     * Block until this node's status is neither WAITING nor SLEEPING.
+     *
+     * Without wait-on-address support this spins.  Otherwise it spins for a
+     * bounded number of iterations, then switches the status to SLEEPING and
+     * parks in the PAL until `wake` replaces it.
+     */
+    template<typename Pal = DefaultPal>
+    void wait()
+    {
+      if constexpr (!use_wait_on_address<Pal>)
+      {
+        while (status.load(std::memory_order_acquire) == LockStatus::WAITING)
+          Aal::pause();
+      }
+      else
+      {
+        // Spin iterations attempted before parking the thread.
+        constexpr int spin_limit = 100;
+        for (int spins = 0; spins < spin_limit; spins++)
+        {
+          if (status.load(std::memory_order_acquire) != LockStatus::WAITING)
+            return;
+          Aal::pause();
+        }
+        LockStatus expected = LockStatus::WAITING;
+        if (status.compare_exchange_strong(
+              expected, LockStatus::SLEEPING, std::memory_order_acq_rel))
+        {
+          // An address-based wait may return while the word is unchanged
+          // (a futex wait, for instance, can wake spuriously), and
+          // attach_slow inspects the status as soon as this returns.  Keep
+          // parking until `wake` has really replaced SLEEPING.
+          do
+          {
+            Pal::wait_on_address(status, LockStatus::SLEEPING);
+          } while (status.load(std::memory_order_acquire) ==
+                   LockStatus::SLEEPING);
+        }
+      }
+    }
+
     SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock)
     {
       // There is contention for the lock, we need to add our work to the
@@ -86,8 +158,7 @@ namespace snmalloc
         prev->next.store(this, std::memory_order_release);
 
         // Wait to for predecessor to complete
-        while (status.load(std::memory_order_relaxed) == LockStatus::WAITING)
-          Aal::pause();
+        wait();
 
         // Determine if another thread completed our work.
         if (status.load(std::memory_order_acquire) == LockStatus::DONE)
@@ -131,7 +202,7 @@ namespace snmalloc
           break;
         // Signal this work was completed and move on to
         // next item.
-        curr->set_status(LockStatus::DONE);
+        wake(curr, LockStatus::DONE);
         curr = n;
       }
 
@@ -146,7 +217,7 @@ namespace snmalloc
       {
         // Queue was successfully closed.
         // Notify last element the work was completed.
-        curr->set_status(LockStatus::DONE);
+        wake(curr, LockStatus::DONE);
         lock.release();
         return;
       }
@@ -160,13 +231,13 @@ namespace snmalloc
 
       // As we had to wait, give the job to the next thread
       // to carry on performing the work.
-      n->set_status(LockStatus::HEAD);
+      wake(n, LockStatus::HEAD);
 
       // Notify the thread that we completed its work.
       // Note that this needs to be before setting curr->status,
       // as after the status is set the thread may deallocate the
       // queue node.
-      curr->set_status(LockStatus::DONE);
+      wake(curr, LockStatus::DONE);
       return;
     }
   };
diff --git a/src/snmalloc/pal/pal_apple.h b/src/snmalloc/pal/pal_apple.h
index f023e195a..f6a7f1a2d 100644
--- a/src/snmalloc/pal/pal_apple.h
+++ b/src/snmalloc/pal/pal_apple.h
@@ -15,8 +15,39 @@
 #  include <sys/mman.h>
 #  include <unistd.h>
 
+#  if __has_include(<AvailabilityMacros.h>) && __has_include(<Availability.h>)
+#    include <Availability.h>
+#    include <AvailabilityMacros.h>
+#    if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \
+      defined(MAC_OS_X_VERSION_14_4)
+#      if __MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_14_4
+#        define SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
+#      endif
+#    endif
+#  endif
+
 namespace snmalloc
 {
+#  ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
+  // For macos 14.4+, we use os_sync_wait_on_address and friends. It is
+  // available as a part of stable API, and the usage is more straightforward.
+  extern "C" int os_sync_wait_on_address(
+    void* addr, uint64_t value, size_t size, uint32_t flags);
+
+  extern "C" int
+  os_sync_wake_by_address_any(void* addr, size_t size, uint32_t flags);
+
+  extern "C" int
+  os_sync_wake_by_address_all(void* addr, size_t size, uint32_t flags);
+#  else
+  // For platforms before macos 14.4, we use __ulock_wait and friends. It is
+  // available since macos 10.12.
+  extern "C" int
+  __ulock_wait(uint32_t lock_type, void* addr, uint64_t value, uint32_t);
+
+  extern "C" int __ulock_wake(uint32_t lock_type, void* addr, uint64_t);
+#  endif
+
   /**
    * PAL implementation for Apple systems (macOS, iOS, watchOS, tvOS...).
    */
@@ -28,7 +59,7 @@ namespace snmalloc
      * The features exported by this PAL.
      */
     static constexpr uint64_t pal_features =
-      AlignedAllocation | LazyCommit | Entropy | Time;
+      AlignedAllocation | LazyCommit | Entropy | Time | WaitOnAddress;
 
     /*
      * `page_size`
@@ -281,6 +312,91 @@ namespace snmalloc
 
       return result;
     }
+
+    /**
+     * Integer type whose size and alignment a waited-on atomic must match.
+     */
+    using WaitingWord = uint32_t;
+#  ifndef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
+    // Operation and flag bits passed to __ulock_wait / __ulock_wake.
+    static constexpr uint32_t UL_COMPARE_AND_WAIT = 0x0000'0001;
+    static constexpr uint32_t ULF_NO_ERRNO = 0x0100'0000;
+    static constexpr uint32_t ULF_WAKE_ALL = 0x0000'0100;
+#  endif
+
+    /**
+     * Block the calling thread until `addr` no longer holds `expected`.
+     *
+     * The result of the underlying wait call is deliberately ignored: a
+     * successful return does not prove that the value changed, and with
+     * ULF_NO_ERRNO `__ulock_wait` reports failures as negated error codes
+     * rather than -1.  Re-reading the word is the only exit condition.
+     * `errno` is restored on every path before returning.
+     */
+    template<class T>
+    static void wait_on_address(std::atomic<T>& addr, T expected)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      int errno_backup = errno;
+      while (addr.load(std::memory_order_relaxed) == expected)
+      {
+#  ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
+        os_sync_wait_on_address(
+          &addr, static_cast<uint64_t>(expected), sizeof(T), 0);
+#  else
+        __ulock_wait(
+          UL_COMPARE_AND_WAIT | ULF_NO_ERRNO,
+          &addr,
+          static_cast<uint64_t>(expected),
+          0);
+#  endif
+      }
+      errno = errno_backup;
+    }
+
+    /**
+     * Wake one thread blocked in `wait_on_address` on `addr`, if there is
+     * one.
+     */
+    template<class T>
+    static void notify_one_on_address(std::atomic<T>& addr)
+    {
+#  ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
+      os_sync_wake_by_address_any(&addr, sizeof(T), 0);
+#  else
+      // __ulock_wake can get interrupted, so retry until either waking up a
+      // waiter or failing because there are no waiters (ENOENT).
+      for (;;)
+      {
+        int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, &addr, 0);
+        if (ret >= 0 || ret == -ENOENT)
+          return;
+      }
+#  endif
+    }
+
+    /**
+     * Wake every thread blocked in `wait_on_address` on `addr`.
+     */
+    template<class T>
+    static void notify_all_on_address(std::atomic<T>& addr)
+    {
+#  ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
+      os_sync_wake_by_address_all(&addr, sizeof(T), 0);
+#  else
+      // __ulock_wake can get interrupted, so retry until either waking up
+      // waiters or failing because there are no waiters (ENOENT).
+      for (;;)
+      {
+        int ret = __ulock_wake(
+          UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, &addr, 0);
+        if (ret >= 0 || ret == -ENOENT)
+          return;
+      }
+#  endif
+    }
   };
 } // namespace snmalloc
 #endif
diff --git a/src/snmalloc/pal/pal_consts.h b/src/snmalloc/pal/pal_consts.h
index 5679c336e..c4c4c25a2 100644
--- a/src/snmalloc/pal/pal_consts.h
+++ b/src/snmalloc/pal/pal_consts.h
@@ -60,6 +60,11 @@ namespace snmalloc
      * modify which parts get dumped.
      */
     CoreDump = (1 << 6),
+
+    /**
+     * This Pal provides a way for parking threads at a specific address.
+     */
+    WaitOnAddress = (1 << 7),
   };
 
   /**
diff --git a/src/snmalloc/pal/pal_freebsd.h b/src/snmalloc/pal/pal_freebsd.h
index 199aef4cf..d7fba0f08 100644
--- a/src/snmalloc/pal/pal_freebsd.h
+++ b/src/snmalloc/pal/pal_freebsd.h
@@ -13,6 +13,8 @@
 #    endif
 #  endif
 
+#  include <sys/umtx.h>
+
 /**
  * Direct system-call wrappers so that we can skip libthr interception, which
  * won't work if malloc is broken.
@@ -44,7 +46,7 @@ namespace snmalloc
      * add new features that they should add any required feature flags.
      */
     static constexpr uint64_t pal_features =
-      PALBSD_Aligned::pal_features | CoreDump;
+      PALBSD_Aligned::pal_features | CoreDump | WaitOnAddress;
 
     /**
      * FreeBSD uses atypically small address spaces on its 64 bit RISC machines.
@@ -129,6 +131,66 @@ namespace snmalloc
           p.unsafe_ptr(), ~static_cast<unsigned int>(CHERI_PERM_SW_VMEM)));
     }
 #  endif
+
+    /**
+     * Integer type whose size and alignment a waited-on atomic must match.
+     */
+    using WaitingWord = unsigned int;
+
+    /**
+     * Block the calling thread until `addr` no longer holds `expected`.
+     *
+     * The result of `_umtx_op` is deliberately ignored: a zero return only
+     * says the thread was woken, not that the word changed, so re-reading
+     * the word is the sole exit condition.  `errno` is preserved.
+     */
+    template<typename T>
+    static void wait_on_address(std::atomic<T>& addr, T expected)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      int backup = errno;
+      while (addr.load(std::memory_order_relaxed) == expected)
+      {
+        _umtx_op(
+          &addr,
+          UMTX_OP_WAIT_UINT_PRIVATE,
+          static_cast<unsigned long>(expected),
+          nullptr,
+          nullptr);
+      }
+      errno = backup;
+    }
+
+    /**
+     * Ask the kernel to wake at most one thread waiting on `addr`.
+     */
+    template<typename T>
+    static void notify_one_on_address(std::atomic<T>& addr)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      _umtx_op(&addr, UMTX_OP_WAKE_PRIVATE, 1, nullptr, nullptr);
+    }
+
+    /**
+     * Ask the kernel to wake up to INT_MAX threads waiting on `addr`.
+     */
+    template<typename T>
+    static void notify_all_on_address(std::atomic<T>& addr)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      _umtx_op(
+        &addr,
+        UMTX_OP_WAKE_PRIVATE,
+        static_cast<unsigned long>(INT_MAX),
+        nullptr,
+        nullptr);
+    }
   };
 } // namespace snmalloc
 #endif
diff --git a/src/snmalloc/pal/pal_linux.h b/src/snmalloc/pal/pal_linux.h
index 2ff8add0c..0b043de8a 100644
--- a/src/snmalloc/pal/pal_linux.h
+++ b/src/snmalloc/pal/pal_linux.h
@@ -14,6 +14,10 @@
 #    include <linux/random.h>
 #  endif
 
+#  if defined(SNMALLOC_HAS_LINUX_FUTEX_H)
+#    include <linux/futex.h>
+#  endif
+
 extern "C" int puts(const char* str);
 
 namespace snmalloc
@@ -27,8 +31,12 @@ namespace snmalloc
      *
      * We always make sure that linux has entropy support.
      */
-    static constexpr uint64_t pal_features =
-      PALPOSIX::pal_features | Entropy | CoreDump;
+    static constexpr uint64_t pal_features = PALPOSIX::pal_features | Entropy |
+      CoreDump
+#  ifdef SNMALLOC_HAS_LINUX_FUTEX_H
+      | WaitOnAddress
+#  endif
+      ;
 
     static constexpr size_t page_size =
       Aal::aal_name == PowerPC ? 0x10000 : PALPOSIX::page_size;
@@ -232,6 +240,68 @@ namespace snmalloc
       // its APIs are not exception-free.
       return dev_urandom();
     }
+
+#  ifdef SNMALLOC_HAS_LINUX_FUTEX_H
+    /**
+     * Integer type whose size and alignment a waited-on atomic must match.
+     */
+    using WaitingWord = int;
+
+    /**
+     * Block the calling thread until `addr` no longer holds `expected`.
+     *
+     * The result of FUTEX_WAIT is deliberately ignored: a zero return can be
+     * a spurious wake-up, so re-reading the futex word is the sole exit
+     * condition.  `errno` is preserved across the system calls.
+     */
+    template<class T>
+    static void wait_on_address(std::atomic<T>& addr, T expected)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      int backup = errno;
+      while (addr.load(std::memory_order_relaxed) == expected)
+      {
+        // T may be a scoped enum; hand the variadic syscall a plain
+        // WaitingWord instead.
+        syscall(
+          SYS_futex,
+          &addr,
+          FUTEX_WAIT_PRIVATE,
+          static_cast<WaitingWord>(expected),
+          nullptr,
+          nullptr,
+          0);
+      }
+      errno = backup;
+    }
+
+    /**
+     * Ask the kernel to wake at most one thread waiting on `addr`.
+     */
+    template<class T>
+    static void notify_one_on_address(std::atomic<T>& addr)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      syscall(SYS_futex, &addr, FUTEX_WAKE_PRIVATE, 1, nullptr, nullptr, 0);
+    }
+
+    /**
+     * Ask the kernel to wake up to INT_MAX threads waiting on `addr`.
+     */
+    template<class T>
+    static void notify_all_on_address(std::atomic<T>& addr)
+    {
+      static_assert(
+        sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord),
+        "T must be the same size and alignment as WaitingWord");
+      syscall(
+        SYS_futex, &addr, FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0);
+    }
+#  endif
   };
 } // namespace snmalloc
 #endif
diff --git a/src/snmalloc/pal/pal_windows.h b/src/snmalloc/pal/pal_windows.h
index 2ab0bfc1f..4d94e4f67 100644
--- a/src/snmalloc/pal/pal_windows.h
+++ b/src/snmalloc/pal/pal_windows.h
@@ -20,6 +20,7 @@
 #    if (NTDDI_VERSION >= NTDDI_WIN10_RS5) && \
       (WINVER >= _WIN32_WINNT_WIN10) && !defined(USE_SYSTEMATIC_TESTING)
 #      define PLATFORM_HAS_VIRTUALALLOC2
+#      define PLATFORM_HAS_WAITONADDRESS
 #    endif
 #  endif
 
@@ -60,6 +61,9 @@ namespace snmalloc
       Time
 #  if defined(PLATFORM_HAS_VIRTUALALLOC2) && !defined(USE_SYSTEMATIC_TESTING)
       | AlignedAllocation
+#  endif
+#  if defined(PLATFORM_HAS_WAITONADDRESS)
+      | WaitOnAddress
 #  endif
       ;
 
@@ -231,6 +235,47 @@ namespace snmalloc
           std::chrono::steady_clock::now().time_since_epoch())
           .count());
     }
+
+#  ifdef PLATFORM_HAS_WAITONADDRESS
+    /**
+     * Word type this PAL offers for values that are waited on.
+     */
+    using WaitingWord = char;
+
+    /**
+     * Block the calling thread until `addr` no longer holds `expected`.
+     *
+     * The result of `WaitOnAddress` is deliberately ignored: a TRUE return
+     * does not guarantee that the value changed, so re-reading the word is
+     * the sole exit condition.
+     */
+    template<class T>
+    static void wait_on_address(std::atomic<T>& addr, T expected)
+    {
+      while (addr.load(std::memory_order_relaxed) == expected)
+      {
+        ::WaitOnAddress(&addr, &expected, sizeof(T), INFINITE);
+      }
+    }
+
+    /**
+     * Wake one thread waiting on `addr`, if there is one.
+     */
+    template<class T>
+    static void notify_one_on_address(std::atomic<T>& addr)
+    {
+      ::WakeByAddressSingle(&addr);
+    }
+
+    /**
+     * Wake every thread waiting on `addr`.
+     */
+    template<class T>
+    static void notify_all_on_address(std::atomic<T>& addr)
+    {
+      ::WakeByAddressAll(&addr);
+    }
+#  endif
   };
 }
 #endif