Skip to content
Merged
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -479,7 +479,7 @@ jobs:
git diff --exit-code
- name: Run clang-tidy
run: |
clang-tidy-15 src/snmalloc/override/malloc.cc -header-filter="`pwd`/*" -warnings-as-errors='*' -export-fixes=tidy.fail -- -std=c++17 -mcx16 -DSNMALLOC_PLATFORM_HAS_GETENTROPY=0 -Isrc
clang-tidy-15 src/snmalloc/override/malloc.cc -header-filter="`pwd`/*" -warnings-as-errors='*' -export-fixes=tidy.fail -- -std=c++17 -mcx16 -DSNMALLOC_USE_WAIT_ON_ADDRESS=1 -DSNMALLOC_PLATFORM_HAS_GETENTROPY=0 -Isrc
if [ -f tidy.fail ] ; then
cat tidy.fail
exit 1
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ CMakeFiles/
*~
*.sw?

# cache dirs
.cache
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ option(SNMALLOC_LINK_ICF "Link with Identical Code Folding" ON)
option(SNMALLOC_IPO "Link with IPO/LTO support" OFF)
option(SNMALLOC_BENCHMARK_INDIVIDUAL_MITIGATIONS "Build tests and ld_preload for individual mitigations" OFF)
option(SNMALLOC_ENABLE_DYNAMIC_LOADING "Build such that snmalloc can be dynamically loaded. This is not required for LD_PRELOAD, and will harm performance if enabled." OFF)
option(SNMALLOC_ENABLE_WAIT_ON_ADDRESS "Use wait on address backoff strategy if it is available" ON)
# Options that apply only if we're not building the header-only library
cmake_dependent_option(SNMALLOC_RUST_SUPPORT "Build static library for rust" OFF "NOT SNMALLOC_HEADER_ONLY_LIBRARY" OFF)
cmake_dependent_option(SNMALLOC_STATIC_LIBRARY "Build static libraries" ON "NOT SNMALLOC_HEADER_ONLY_LIBRARY" OFF)
Expand Down Expand Up @@ -134,6 +135,9 @@ int main() {
# this is why we check its existence here
CHECK_INCLUDE_FILE_CXX(linux/random.h SNMALLOC_HAS_LINUX_RANDOM_H)

# check if futex.h is available
CHECK_INCLUDE_FILE_CXX(linux/futex.h SNMALLOC_HAS_LINUX_FUTEX_H)

# Provide as function so other projects can reuse
# FIXME: This modifies some variables that may or may not be the ones that
# provide flags and so is broken by design. It should be removed once Verona
Expand Down Expand Up @@ -193,6 +197,13 @@ if(SNMALLOC_USE_CXX17)
else()
target_compile_features(snmalloc INTERFACE cxx_std_20)
endif()

# Export the SNMALLOC_ENABLE_WAIT_ON_ADDRESS option to the sources as
# SNMALLOC_USE_WAIT_ON_ADDRESS=1 (option enabled) or =0 (option disabled).
target_compile_definitions(snmalloc INTERFACE
  SNMALLOC_USE_WAIT_ON_ADDRESS=$<BOOL:${SNMALLOC_ENABLE_WAIT_ON_ADDRESS}>)

# https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus
if(MSVC)
target_compile_options(snmalloc INTERFACE "/Zc:__cplusplus")
Expand Down Expand Up @@ -248,6 +259,7 @@ add_as_define(SNMALLOC_TRACING)
add_as_define(SNMALLOC_CI_BUILD)
add_as_define(SNMALLOC_PLATFORM_HAS_GETENTROPY)
add_as_define(SNMALLOC_HAS_LINUX_RANDOM_H)
add_as_define(SNMALLOC_HAS_LINUX_FUTEX_H)
if (SNMALLOC_NO_REALLOCARRAY)
add_as_define(SNMALLOC_NO_REALLOCARRAY)
endif()
Expand Down
87 changes: 79 additions & 8 deletions src/snmalloc/ds/combininglock.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,34 @@ namespace snmalloc
*/
class CombiningLockNode
{
/**
 * True when waiting on an address should be used with the given Pal:
 * the Pal must advertise the WaitOnAddress feature and the build must
 * have SNMALLOC_USE_WAIT_ON_ADDRESS set to a non-zero value.
 */
template<typename Pal>
static constexpr bool use_wait_on_address =
pal_supports<PalFeatures::WaitOnAddress, Pal> &&
SNMALLOC_USE_WAIT_ON_ADDRESS;

/**
 * Selects the integer type used for the status word.  Only the `true`
 * specialisation names `Pal::WaitingWord`, so a Pal that does not define
 * that member can still be used with the `false` specialisation.
 */
template<bool HasWaitOnAddress, typename Pal>
struct WaitWordTypeSelect;

// Wait-on-address is in use: take the word type declared by the Pal.
template<typename Pal>
struct WaitWordTypeSelect<true, Pal>
{
using type = typename Pal::WaitingWord;
};

// Wait-on-address is not in use: fall back to a plain int.
template<typename Pal>
struct WaitWordTypeSelect<false, Pal>
{
using type = int;
};

// Status word type resolved for DefaultPal; LockStatus uses it as its
// underlying type.
using WaitingWordType =
typename WaitWordTypeSelect<use_wait_on_address<DefaultPal>, DefaultPal>::
type;

template<typename F>
friend class CombiningLockNodeTempl;

enum class LockStatus
enum class LockStatus : WaitingWordType
{
// The work for this node has not been completed.
WAITING,
Expand All @@ -53,7 +77,10 @@ namespace snmalloc

// The work for this thread has not been completed, and it is the
// head of the queue.
HEAD
HEAD,

// The waiter is currently sleeping.
SLEEPING
};

// Status of the queue, set by the thread at the head of the queue,
Expand All @@ -74,6 +101,51 @@ namespace snmalloc
status.store(s, std::memory_order_release);
}

/**
 * Store `message` as the new status of `node`.
 *
 * When wait-on-address is in use for `Pal`, the status is swapped in
 * atomically and the Pal is asked to notify one waiter only if the value
 * that was replaced was SLEEPING.  Otherwise the status is simply set.
 */
template<typename Pal = DefaultPal>
static void wake(CombiningLockNode* node, LockStatus message)
{
  if constexpr (use_wait_on_address<Pal>)
  {
    // Swap in the message and inspect the state the node was left in.
    const LockStatus previous =
      node->status.exchange(message, std::memory_order_acq_rel);

    if (previous == LockStatus::SLEEPING)
      Pal::notify_one_on_address(node->status);
  }
  else
  {
    node->set_status(message);
  }
}

/**
 * Block until this node's status is neither WAITING nor SLEEPING.
 *
 * Without wait-on-address this spins on the status word.  With it, the
 * thread spins for a bounded number of iterations, then tries to move the
 * status from WAITING to SLEEPING and parks on the status word.
 */
template<typename Pal = DefaultPal>
void wait()
{
  if constexpr (!use_wait_on_address<Pal>)
  {
    while (status.load(std::memory_order_acquire) == LockStatus::WAITING)
      Aal::pause();
  }
  else
  {
    // Spin briefly first; the status may change before parking is needed.
    int remaining = 100;
    while (remaining > 0)
    {
      if (status.load(std::memory_order_acquire) != LockStatus::WAITING)
        return;
      Aal::pause();
      remaining--;
    }

    // Announce that we are about to sleep.  If the exchange fails the
    // status has already moved on from WAITING and there is nothing to
    // wait for.
    LockStatus expected = LockStatus::WAITING;
    if (status.compare_exchange_strong(
          expected, LockStatus::SLEEPING, std::memory_order_acq_rel))
    {
      // Re-check after every return from the Pal: the caller treats any
      // status other than DONE as "we are now the head", so returning
      // while the status is still SLEEPING (e.g. after a spurious or
      // interrupted wait) would be incorrect.
      while (status.load(std::memory_order_acquire) == LockStatus::SLEEPING)
        Pal::wait_on_address(status, LockStatus::SLEEPING);
    }
  }
}

SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock)
{
// There is contention for the lock, we need to add our work to the
Expand All @@ -86,8 +158,7 @@ namespace snmalloc
prev->next.store(this, std::memory_order_release);

// Wait for predecessor to complete
while (status.load(std::memory_order_relaxed) == LockStatus::WAITING)
Aal::pause();
wait();

// Determine if another thread completed our work.
if (status.load(std::memory_order_acquire) == LockStatus::DONE)
Expand Down Expand Up @@ -131,7 +202,7 @@ namespace snmalloc
break;
// Signal this work was completed and move on to
// next item.
curr->set_status(LockStatus::DONE);
wake(curr, LockStatus::DONE);
curr = n;
}

Expand All @@ -146,7 +217,7 @@ namespace snmalloc
{
// Queue was successfully closed.
// Notify last element the work was completed.
curr->set_status(LockStatus::DONE);
wake(curr, LockStatus::DONE);
lock.release();
return;
}
Expand All @@ -160,13 +231,13 @@ namespace snmalloc

// As we had to wait, give the job to the next thread
// to carry on performing the work.
n->set_status(LockStatus::HEAD);
wake(n, LockStatus::HEAD);

// Notify the thread that we completed its work.
// Note that this needs to be before setting curr->status,
// as after the status is set the thread may deallocate the
// queue node.
curr->set_status(LockStatus::DONE);
wake(curr, LockStatus::DONE);
return;
}
};
Expand Down
103 changes: 102 additions & 1 deletion src/snmalloc/pal/pal_apple.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,39 @@
# include <sys/mman.h>
# include <unistd.h>

# if __has_include(<AvailabilityMacros.h>) && __has_include(<Availability.h>)
# include <Availability.h>
# include <AvailabilityMacros.h>
# if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \
defined(MAC_OS_X_VERSION_14_4)
# if __MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_14_4
# define SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
# endif
# endif
# endif

namespace snmalloc
{
# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
// For macos 14.4+, we use os_sync_wait_on_address and friends. It is
// available as a part of stable API, and the usage is more straightforward.
extern "C" int os_sync_wait_on_address(
void* addr, uint64_t value, size_t size, uint32_t flags);

extern "C" int
os_sync_wake_by_address_any(void* addr, size_t size, uint32_t flags);

extern "C" int
os_sync_wake_by_address_all(void* addr, size_t size, uint32_t flags);
# else
// For platforms before macos 14.4, we use __ulock_wait and friends. It is
// available since macos 10.12.
extern "C" int
__ulock_wait(uint32_t lock_type, void* addr, uint64_t value, uint32_t);

extern "C" int __ulock_wake(uint32_t lock_type, void* addr, uint64_t);
# endif

/**
* PAL implementation for Apple systems (macOS, iOS, watchOS, tvOS...).
*/
Expand All @@ -28,7 +59,7 @@ namespace snmalloc
* The features exported by this PAL.
*/
static constexpr uint64_t pal_features =
AlignedAllocation | LazyCommit | Entropy | Time;
AlignedAllocation | LazyCommit | Entropy | Time | WaitOnAddress;

/*
* `page_size`
Expand Down Expand Up @@ -281,6 +312,76 @@ namespace snmalloc

return result;
}

// Word type this PAL exposes for wait-on-address operations.
using WaitingWord = uint32_t;
# ifndef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
// Flag values OR-ed into the first argument of __ulock_wait / __ulock_wake.
// They are only defined when the os_sync_* API is not being used.
// NOTE(review): values presumably mirror Apple's private ulock
// definitions - confirm against the XNU headers.
static constexpr uint32_t UL_COMPARE_AND_WAIT = 0x0000'0001;
static constexpr uint32_t ULF_NO_ERRNO = 0x0100'0000;
static constexpr uint32_t ULF_WAKE_ALL = 0x0000'0100;
# endif

/**
 * Park the calling thread while `addr` holds `expected`.
 *
 * Returns once the value read from `addr` differs from `expected`, or once
 * the underlying wait call reports a successful wake.  A failed wait (for
 * example one that was interrupted) is retried after re-reading `addr`.
 * The caller's `errno` is preserved on every exit path.
 */
template<class T>
static void wait_on_address(std::atomic<T>& addr, T expected)
{
  const int errno_backup = errno;
  while (addr.load(std::memory_order_relaxed) == expected)
  {
# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
    // Failure is reported as -1; anything else is a successful wake.
    if (
      os_sync_wait_on_address(
        &addr, static_cast<uint64_t>(expected), sizeof(T), 0) != -1)
    {
      break;
    }
# else
    // With ULF_NO_ERRNO failures come back as a negative errno value
    // (the notify functions rely on the same convention for -ENOENT), so
    // only a non-negative result counts as a wake.
    if (
      __ulock_wait(
        UL_COMPARE_AND_WAIT | ULF_NO_ERRNO,
        &addr,
        static_cast<uint64_t>(expected),
        0) >= 0)
    {
      break;
    }
# endif
  }
  // Restore errno regardless of how the loop was left, so that a failed
  // wait followed by a value change does not leak an error code.
  errno = errno_backup;
}

/**
 * Wake at most one thread parked on `addr`.
 *
 * The caller's `errno` is left untouched, matching wait_on_address.
 */
template<class T>
static void notify_one_on_address(std::atomic<T>& addr)
{
# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
  // The wake call reports failure (such as there being no waiter) through
  // errno; save and restore it so the caller's value is not clobbered.
  const int errno_backup = errno;
  os_sync_wake_by_address_any(&addr, sizeof(T), 0);
  errno = errno_backup;
# else
  // __ulock_wake can get interrupted, so retry until either waking up a
  // waiter or failing because there are no waiters (ENOENT).
  for (;;)
  {
    int ret = __ulock_wake(UL_COMPARE_AND_WAIT | ULF_NO_ERRNO, &addr, 0);
    if (ret >= 0 || ret == -ENOENT)
      return;
  }
# endif
}

/**
 * Wake every thread parked on `addr`.
 *
 * The caller's `errno` is left untouched, matching wait_on_address.
 */
template<class T>
static void notify_all_on_address(std::atomic<T>& addr)
{
# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
  // The wake call reports failure (such as there being no waiter) through
  // errno; save and restore it so the caller's value is not clobbered.
  const int errno_backup = errno;
  os_sync_wake_by_address_all(&addr, sizeof(T), 0);
  errno = errno_backup;
# else
  // __ulock_wake can get interrupted, so retry until either waking up a
  // waiter or failing because there are no waiters (ENOENT).
  for (;;)
  {
    int ret = __ulock_wake(
      UL_COMPARE_AND_WAIT | ULF_NO_ERRNO | ULF_WAKE_ALL, &addr, 0);
    if (ret >= 0 || ret == -ENOENT)
      return;
  }
# endif
}
};
} // namespace snmalloc
#endif
5 changes: 5 additions & 0 deletions src/snmalloc/pal/pal_consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ namespace snmalloc
* modify which parts get dumped.
*/
CoreDump = (1 << 6),

/**
* This Pal provides a way for parking threads at a specific address.
*/
WaitOnAddress = (1 << 7),
};

/**
Expand Down
Loading
Loading