diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8e10f4950..e3339917a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -479,7 +479,7 @@ jobs: git diff --exit-code - name: Run clang-tidy run: | - clang-tidy-15 src/snmalloc/override/malloc.cc -header-filter="`pwd`/*" -warnings-as-errors='*' -export-fixes=tidy.fail -- -std=c++17 -mcx16 -DSNMALLOC_PLATFORM_HAS_GETENTROPY=0 -Isrc + clang-tidy-15 src/snmalloc/override/malloc.cc -header-filter="`pwd`/*" -warnings-as-errors='*' -export-fixes=tidy.fail -- -std=c++17 -mcx16 -DSNMALLOC_USE_WAIT_ON_ADDRESS=1 -DSNMALLOC_PLATFORM_HAS_GETENTROPY=0 -Isrc if [ -f tidy.fail ] ; then cat tidy.fail exit 1 diff --git a/.gitignore b/.gitignore index 8737c737a..8057a72fa 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ CMakeFiles/ *~ *.sw? +# cache dirs +.cache diff --git a/CMakeLists.txt b/CMakeLists.txt index a5aa31814..019feaa48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(SNMALLOC_LINK_ICF "Link with Identical Code Folding" ON) option(SNMALLOC_IPO "Link with IPO/LTO support" OFF) option(SNMALLOC_BENCHMARK_INDIVIDUAL_MITIGATIONS "Build tests and ld_preload for individual mitigations" OFF) option(SNMALLOC_ENABLE_DYNAMIC_LOADING "Build such that snmalloc can be dynamically loaded. This is not required for LD_PRELOAD, and will harm performance if enabled." OFF) +option(SNMALLOC_ENABLE_WAIT_ON_ADDRESS "Use wait on address backoff strategy if it is available" ON) # Options that apply only if we're not building the header-only library cmake_dependent_option(SNMALLOC_RUST_SUPPORT "Build static library for rust" OFF "NOT SNMALLOC_HEADER_ONLY_LIBRARY" OFF) cmake_dependent_option(SNMALLOC_STATIC_LIBRARY "Build static libraries" ON "NOT SNMALLOC_HEADER_ONLY_LIBRARY" OFF) @@ -134,6 +135,9 @@ int main() { # this is why we check its existence here CHECK_INCLUDE_FILE_CXX(linux/random.h SNMALLOC_HAS_LINUX_RANDOM_H) +# check if futex.h is available +CHECK_INCLUDE_FILE_CXX(linux/futex.h SNMALLOC_HAS_LINUX_FUTEX_H) + # Provide as function so other projects can reuse # FIXME: This modifies some variables that may or may not be the ones that # provide flags and so is broken by design. It should be removed once Verona @@ -193,6 +197,13 @@ if(SNMALLOC_USE_CXX17) else() target_compile_features(snmalloc INTERFACE cxx_std_20) endif() + +if(SNMALLOC_ENABLE_WAIT_ON_ADDRESS) + target_compile_definitions(snmalloc INTERFACE SNMALLOC_USE_WAIT_ON_ADDRESS=1) +else() + target_compile_definitions(snmalloc INTERFACE SNMALLOC_USE_WAIT_ON_ADDRESS=0) +endif() + # https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus if(MSVC) target_compile_options(snmalloc INTERFACE "/Zc:__cplusplus") @@ -248,6 +259,7 @@ add_as_define(SNMALLOC_TRACING) add_as_define(SNMALLOC_CI_BUILD) add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY) add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H) +add_as_define(SNMALLOC_HAS_LINUX_FUTEX_H) if (SNMALLOC_NO_REALLOCARRAY) add_as_define(SNMALLOC_NO_REALLOCARRAY) endif() diff --git a/src/snmalloc/ds/combininglock.h b/src/snmalloc/ds/combininglock.h index 1857713d1..89a4bc258 100644 --- a/src/snmalloc/ds/combininglock.h +++ b/src/snmalloc/ds/combininglock.h @@ -39,10 +39,34 @@ namespace snmalloc */ class CombiningLockNode { + template + static constexpr bool use_wait_on_address = + pal_supports && + SNMALLOC_USE_WAIT_ON_ADDRESS; + + template + struct WaitWordTypeSelect; + + template + struct WaitWordTypeSelect + { + using type = typename Pal::WaitingWord; + }; + + template + struct WaitWordTypeSelect + { + using type = int; + }; + + using WaitingWordType = + typename WaitWordTypeSelect, DefaultPal>:: + type; + template friend class CombiningLockNodeTempl; - enum class LockStatus + enum class LockStatus : WaitingWordType { // The work for this node has not been completed. WAITING, @@ -53,7 +77,10 @@ namespace snmalloc // The work for this thread has not been completed, and it is the // head of the queue. - HEAD + HEAD, + + // The waiter is currently sleeping. + SLEEPING }; // Status of the queue, set by the thread at the head of the queue, @@ -74,6 +101,51 @@ namespace snmalloc status.store(s, std::memory_order_release); } + template + static void wake(CombiningLockNode* node, LockStatus message) + { + if constexpr (!use_wait_on_address) + { + node->set_status(message); + } + else + { + if ( + node->status.exchange(message, std::memory_order_acq_rel) == + LockStatus::SLEEPING) + { + Pal::notify_one_on_address(node->status); + } + } + } + + template + void wait() + { + if constexpr (!use_wait_on_address) + { + while (status.load(std::memory_order_acquire) == LockStatus::WAITING) + Aal::pause(); + } + else + { + int remaining = 100; + while (remaining > 0) + { + if (status.load(std::memory_order_acquire) != LockStatus::WAITING) + return; + Aal::pause(); + remaining--; + } + LockStatus expected = LockStatus::WAITING; + if (status.compare_exchange_strong( + expected, LockStatus::SLEEPING, std::memory_order_acq_rel)) + { + Pal::wait_on_address(status, LockStatus::SLEEPING); + } + } + } + SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock) { // There is contention for the lock, we need to add our work to the @@ -86,8 +158,7 @@ namespace snmalloc prev->next.store(this, std::memory_order_release); // Wait to for predecessor to complete - while (status.load(std::memory_order_relaxed) == LockStatus::WAITING) - Aal::pause(); + wait(); // Determine if another thread completed our work. if (status.load(std::memory_order_acquire) == LockStatus::DONE) @@ -131,7 +202,7 @@ namespace snmalloc break; // Signal this work was completed and move on to // next item. - curr->set_status(LockStatus::DONE); + wake(curr, LockStatus::DONE); curr = n; } @@ -146,7 +217,7 @@ namespace snmalloc { // Queue was successfully closed. // Notify last element the work was completed. - curr->set_status(LockStatus::DONE); + wake(curr, LockStatus::DONE); lock.release(); return; } @@ -160,13 +231,13 @@ namespace snmalloc // As we had to wait, give the job to the next thread // to carry on performing the work. - n->set_status(LockStatus::HEAD); + wake(n, LockStatus::HEAD); // Notify the thread that we completed its work. // Note that this needs to be before setting curr->status, // as after the status is set the thread may deallocate the // queue node. - curr->set_status(LockStatus::DONE); + wake(curr, LockStatus::DONE); return; } }; diff --git a/src/snmalloc/pal/pal_apple.h b/src/snmalloc/pal/pal_apple.h index f023e195a..f6a7f1a2d 100644 --- a/src/snmalloc/pal/pal_apple.h +++ b/src/snmalloc/pal/pal_apple.h @@ -15,8 +15,39 @@ # include # include +# if __has_include() && __has_include() +# include +# include +# if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ + defined(MAC_OS_X_VERSION_14_4) +# if __MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_14_4 +# define SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS +# endif +# endif +# endif + namespace snmalloc { +# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS + // For macos 14.4+, we use os_sync_wait_on_address and friends. It is + // available as a part of stable API, and the usage is more straightforward. + extern "C" int os_sync_wait_on_address( + void* addr, uint64_t value, size_t size, uint32_t flags); + + extern "C" int + os_sync_wake_by_address_any(void* addr, size_t size, uint32_t flags); + + extern "C" int + os_sync_wake_by_address_all(void* addr, size_t size, uint32_t flags); +# else + // For platforms before macos 14.4, we use __ulock_wait and friends. It is + // available since macos 10.12. + extern "C" int + __ulock_wait(uint32_t lock_type, void* addr, uint64_t value, uint32_t); + + extern "C" int __ulock_wake(uint32_t lock_type, void* addr, uint64_t); +# endif + /** * PAL implementation for Apple systems (macOS, iOS, watchOS, tvOS...). */ @@ -28,7 +59,7 @@ namespace snmalloc * The features exported by this PAL. */ static constexpr uint64_t pal_features = - AlignedAllocation | LazyCommit | Entropy | Time; + AlignedAllocation | LazyCommit | Entropy | Time | WaitOnAddress; /* * `page_size` @@ -281,6 +312,76 @@ namespace snmalloc return result; } + + using WaitingWord = uint32_t; +# ifndef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS + static constexpr uint32_t UL_COMPARE_AND_WAIT = 0x0000'0001; + static constexpr uint32_t ULF_NO_ERRNO = 0x0100'0000; + static constexpr uint32_t ULF_WAKE_ALL = 0x0000'0100; +# endif + + template + static void wait_on_address(std::atomic& addr, T expected) + { + [[maybe_unused]] int errno_backup = errno; + while (addr.load(std::memory_order_relaxed) == expected) + { +# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS + if ( + os_sync_wait_on_address( + &addr, static_cast(expected), sizeof(T), 0) != -1) + { + errno = errno_backup; + return; + } +# else + if ( + __ulock_wait( + UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, + &addr, + static_cast(expected), + 0) != -1) + { + return; + } +# endif + } + } + + template + static void notify_one_on_address(std::atomic& addr) + { +# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS + os_sync_wake_by_address_any(&addr, sizeof(T), 0); +# else + // __ulock_wake can get interrupted, so retry until either waking up a + // waiter or failing because there are no waiters (ENOENT). + for (;;) + { + int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, &addr, 0); + if (ret >= 0 || ret == -ENOENT) + return; + } +# endif + } + + template + static void notify_all_on_address(std::atomic& addr) + { +# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS + os_sync_wake_by_address_all(&addr, sizeof(T), 0); +# else + // __ulock_wake can get interrupted, so retry until either waking up a + // waiter or failing because there are no waiters (ENOENT). + for (;;) + { + int ret = __ulock_wake( + UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, &addr, 0); + if (ret >= 0 || ret == -ENOENT) + return; + } +# endif + } }; } // namespace snmalloc #endif diff --git a/src/snmalloc/pal/pal_consts.h b/src/snmalloc/pal/pal_consts.h index 5679c336e..c4c4c25a2 100644 --- a/src/snmalloc/pal/pal_consts.h +++ b/src/snmalloc/pal/pal_consts.h @@ -60,6 +60,11 @@ namespace snmalloc * modify which parts get dumped. */ CoreDump = (1 << 6), + + /** + * This Pal provides a way for parking threads at a specific address. + */ + WaitOnAddress = (1 << 7), }; /** diff --git a/src/snmalloc/pal/pal_freebsd.h b/src/snmalloc/pal/pal_freebsd.h index 199aef4cf..d7fba0f08 100644 --- a/src/snmalloc/pal/pal_freebsd.h +++ b/src/snmalloc/pal/pal_freebsd.h @@ -13,6 +13,8 @@ # endif # endif +# include + /** * Direct system-call wrappers so that we can skip libthr interception, which * won't work if malloc is broken. @@ -44,7 +46,7 @@ namespace snmalloc * add new features that they should add any required feature flags. */ static constexpr uint64_t pal_features = - PALBSD_Aligned::pal_features | CoreDump; + PALBSD_Aligned::pal_features | CoreDump | WaitOnAddress; /** * FreeBSD uses atypically small address spaces on its 64 bit RISC machines. @@ -129,6 +131,53 @@ namespace snmalloc p.unsafe_ptr(), ~static_cast(CHERI_PERM_SW_VMEM))); } # endif + + using WaitingWord = unsigned int; + + template + static void wait_on_address(std::atomic& addr, T expected) + { + static_assert( + sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord), + "T must be the same size and alignment as WaitingWord"); + int backup = errno; + while (addr.load(std::memory_order_relaxed) == expected) + { + int ret = _umtx_op( + &addr, + UMTX_OP_WAIT_UINT_PRIVATE, + static_cast(expected), + nullptr, + nullptr); + + if (ret == 0) + break; + } + errno = backup; + } + + template + static void notify_one_on_address(std::atomic& addr) + { + static_assert( + sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord), + "T must be the same size and alignment as WaitingWord"); + _umtx_op(&addr, UMTX_OP_WAKE_PRIVATE, 1, nullptr, nullptr); + } + + template + static void notify_all_on_address(std::atomic& addr) + { + static_assert( + sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord), + "T must be the same size and alignment as WaitingWord"); + _umtx_op( + &addr, + UMTX_OP_WAKE_PRIVATE, + static_cast(INT_MAX), + nullptr, + nullptr); + } }; } // namespace snmalloc #endif diff --git a/src/snmalloc/pal/pal_linux.h b/src/snmalloc/pal/pal_linux.h index 2ff8add0c..0b043de8a 100644 --- a/src/snmalloc/pal/pal_linux.h +++ b/src/snmalloc/pal/pal_linux.h @@ -14,6 +14,10 @@ # include # endif +# if defined(SNMALLOC_HAS_LINUX_FUTEX_H) +# include +# endif + extern "C" int puts(const char* str); namespace snmalloc @@ -27,8 +31,12 @@ namespace snmalloc * * We always make sure that linux has entropy support. */ - static constexpr uint64_t pal_features = - PALPOSIX::pal_features | Entropy | CoreDump; + static constexpr uint64_t pal_features = PALPOSIX::pal_features | Entropy | + CoreDump +# ifdef SNMALLOC_HAS_LINUX_FUTEX_H + | WaitOnAddress +# endif + ; static constexpr size_t page_size = Aal::aal_name == PowerPC ? 0x10000 : PALPOSIX::page_size; @@ -232,6 +240,47 @@ namespace snmalloc // its APIs are not exception-free. return dev_urandom(); } + +# ifdef SNMALLOC_HAS_LINUX_FUTEX_H + using WaitingWord = int; + + template + static void wait_on_address(std::atomic& addr, T expected) + { + int backup = errno; + static_assert( + sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord), + "T must be the same size and alignment as WaitingWord"); + while (addr.load(std::memory_order_relaxed) == expected) + { + long ret = syscall( + SYS_futex, &addr, FUTEX_WAIT_PRIVATE, expected, nullptr, nullptr, 0); + + if (ret == 0) + break; + } + errno = backup; + } + + template + static void notify_one_on_address(std::atomic& addr) + { + static_assert( + sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord), + "T must be the same size and alignment as WaitingWord"); + syscall(SYS_futex, &addr, FUTEX_WAKE_PRIVATE, 1, nullptr, nullptr, 0); + } + + template + static void notify_all_on_address(std::atomic& addr) + { + static_assert( + sizeof(T) == sizeof(WaitingWord) && alignof(T) == alignof(WaitingWord), + "T must be the same size and alignment as WaitingWord"); + syscall( + SYS_futex, &addr, FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0); + } +# endif }; } // namespace snmalloc #endif diff --git a/src/snmalloc/pal/pal_windows.h b/src/snmalloc/pal/pal_windows.h index 2ab0bfc1f..4d94e4f67 100644 --- a/src/snmalloc/pal/pal_windows.h +++ b/src/snmalloc/pal/pal_windows.h @@ -20,6 +20,7 @@ # if (NTDDI_VERSION >= NTDDI_WIN10_RS5) && \ (WINVER >= _WIN32_WINNT_WIN10) && !defined(USE_SYSTEMATIC_TESTING) # define PLATFORM_HAS_VIRTUALALLOC2 +# define PLATFORM_HAS_WAITONADDRESS # endif # endif @@ -60,6 +61,9 @@ namespace snmalloc Time # if defined(PLATFORM_HAS_VIRTUALALLOC2) && !defined(USE_SYSTEMATIC_TESTING) | AlignedAllocation +# endif +# if defined(PLATFORM_HAS_WAITONADDRESS) + | WaitOnAddress # endif ; @@ -231,6 +235,29 @@ namespace snmalloc std::chrono::steady_clock::now().time_since_epoch()) .count()); } + +# ifdef PLATFORM_HAS_WAITONADDRESS + using WaitingWord = char; + template + static void wait_on_address(std::atomic& addr, T expected) + { + while (addr.load(std::memory_order_relaxed) == expected) + { + if (::WaitOnAddress(&addr, &expected, sizeof(T), INFINITE)) + break; + } + } + template + static void notify_one_on_address(std::atomic& addr) + { + ::WakeByAddressSingle(&addr); + } + template + static void notify_all_on_address(std::atomic& addr) + { + ::WakeByAddressAll(&addr); + } +# endif }; } #endif