Skip to content

Commit e93b204

Browse files
committed
[BACKPORT 2025.1][yugabyte#27557] DocDB: Don't schedule tasks on threadpool once shutdown flags are set at ObjectLockManager
Summary: Original commit: f5197a2 / D44662 In context of object locking, commit 6e80c56 / D44228 got rid of logic that signaled obsolete waiters corresponding to transactions that issued a release all locks request (could have been terminated to failures like timeout, deadlock etc) in order to early terminate failed waiting requests. Hence, now we let the obsolete requests terminate organically from the OLM resumed by the poller thread that runs at an interval of `olm_poll_interval_ms` (defaults to 100ms). This led to one of the itests failing with the below stack ``` * thread #1, name = 'yb-tserver', stop reason = signal SIGSEGV: address not mapped to object * frame #0: 0x0000aaaac8a093ec yb-tserver`yb::ThreadPoolToken::SubmitFunc(std::__1::function<void ()>) [inlined] yb::ThreadPoolToken::Submit(this=<unavailable>, r=<unavailable>) at threadpool.cc:146:10 frame #1: 0x0000aaaac8a093ec yb-tserver`yb::ThreadPoolToken::SubmitFunc(this=0x0000000000000000, f=<unavailable>) at threadpool.cc:142:10 frame #2: 0x0000aaaac73cdfe8 yb-tserver`yb::docdb::ObjectLockManagerImpl::DoSignal(this=0x00003342bfa0d400, entry=<unavailable>) at object_lock_manager.cc:767:3 frame #3: 0x0000aaaac73cc7c0 yb-tserver`yb::docdb::ObjectLockManagerImpl::DoLock(std::__1::shared_ptr<yb::docdb::(anonymous namespace)::TrackedTransactionLockEntry>, yb::docdb::LockData&&, yb::StronglyTypedBool<yb::docdb::(anonymous namespace)::IsLockRetry_Tag>, unsigned long, yb::Status) [inlined] yb::docdb::ObjectLockManagerImpl::PrepareAcquire(this=0x00003342bfa0d400, txn_lock=<unavailable>, transaction_entry=std::__1::shared_ptr<yb::docdb::(anonymous namespace)::TrackedTransactionLockEntry>::element_type @ 0x00003342bfa94a38, data=0x00003342b9a6a830, resume_it_offset=<unavailable>, resume_with_status=<unavailable>) at object_lock_manager.cc:523:5 frame #4: 0x0000aaaac73cc6a8 yb-tserver`yb::docdb::ObjectLockManagerImpl::DoLock(this=0x00003342bfa0d400, transaction_entry=std::__1::shared_ptr<yb::docdb::(anonymous namespace)::TrackedTransactionLockEntry>::element_type @ 0x00003342bfa94a38, data=0x00003342b9a6a830, is_retry=(value_ = true), resume_it_offset=<unavailable>, resume_with_status=Status @ 0x0000ffefaa036658) at object_lock_manager.cc:552:27 frame #5: 0x0000aaaac73cbcb4 yb-tserver`yb::docdb::WaiterEntry::Resume(this=0x00003342b9a6a820, lock_manager=0x00003342bfa0d400, resume_with_status=<unavailable>) at object_lock_manager.cc:381:17 frame #6: 0x0000aaaac85bdd4c yb-tserver`yb::tserver::TSLocalLockManager::Shutdown() at object_lock_manager.cc:752:13 frame #7: 0x0000aaaac85bda74 yb-tserver`yb::tserver::TSLocalLockManager::Shutdown() [inlined] yb::docdb::ObjectLockManager::Shutdown(this=<unavailable>) at object_lock_manager.cc:1092:10 frame yugabyte#8: 0x0000aaaac85bda6c yb-tserver`yb::tserver::TSLocalLockManager::Shutdown() [inlined] yb::tserver::TSLocalLockManager::Impl::Shutdown(this=<unavailable>) at ts_local_lock_manager.cc:411:26 frame yugabyte#9: 0x0000aaaac85bd7e8 yb-tserver`yb::tserver::TSLocalLockManager::Shutdown(this=<unavailable>) at ts_local_lock_manager.cc:566:10 frame yugabyte#10: 0x0000aaaac8665a34 yb-tserver`yb::tserver::YsqlLeasePoller::Poll() [inlined] yb::tserver::TabletServer::ResetAndGetTSLocalLockManager(this=0x000033423fc1ad80) at tablet_server.cc:797:28 frame yugabyte#11: 0x0000aaaac8665a18 yb-tserver`yb::tserver::YsqlLeasePoller::Poll() [inlined] yb::tserver::TabletServer::ProcessLeaseUpdate(this=0x000033423fc1ad80, lease_refresh_info=0x000033423a476b80) at tablet_server.cc:828:22 frame yugabyte#12: 0x0000aaaac8665950 yb-tserver`yb::tserver::YsqlLeasePoller::Poll(this=<unavailable>) at ysql_lease_poller.cc:143:18 frame yugabyte#13: 0x0000aaaac8438d58 yb-tserver`yb::tserver::MasterLeaderPollScheduler::Impl::Run(this=0x000033423ff5cc80) at master_leader_poller.cc:125:25 frame yugabyte#14: 0x0000aaaac89ffd18 yb-tserver`yb::Thread::SuperviseThread(void*) [inlined] std::__1::__function::__value_func<void ()>::operator()[abi:ne190100](this=0x000033423ffc7930) const at function.h:430:12 frame yugabyte#15: 0x0000aaaac89ffd04 yb-tserver`yb::Thread::SuperviseThread(void*) [inlined] std::__1::function<void ()>::operator()(this=0x000033423ffc7930) const at function.h:989:10 frame yugabyte#16: 0x0000aaaac89ffd04 yb-tserver`yb::Thread::SuperviseThread(arg=0x000033423ffc78c0) at thread.cc:937:3 frame yugabyte#17: 0x0000ffffac0378b8 libpthread.so.0`start_thread + 392 frame yugabyte#18: 0x0000ffffac093afc libc.so.6`thread_start + 12 ``` This is due to accessing unique_ptr `thread_pool_token_` after it has been reset. This revision fixes the issue by not scheduling any tasks on the threadpool once the shutdown flags has been set (hence not accessing `thread_pool_token_`). Since we wait for in-progress requests at the OLM and also in-progress resume tasks scheduled on the messenger using `waiters_amidst_resumption_on_messenger_`, it is safe to say that `thread_pool_token_` would not be accessed once it is reset. Jira: DB-17121 Test Plan: Jenkins ./yb_build.sh --cxx-test='TEST_F(PgObjectLocksTestRF1, TestShutdownWithWaiters) {' Reviewers: rthallam, amitanand, sergei Reviewed By: rthallam Subscribers: yql, ybase Differential Revision: https://phorge.dev.yugabyte.com/D44728
1 parent 582af54 commit e93b204

File tree

3 files changed

+51
-3
lines changed

3 files changed

+51
-3
lines changed

src/yb/docdb/object_lock_manager.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,8 +729,9 @@ void ObjectLockManagerImpl::Shutdown() {
729729
shutdown_in_progress_ = true;
730730
// Shutdown of thread pool token => no more tasks pending/scheduled for resumption.
731731
thread_pool_token_->Shutdown();
732-
thread_pool_token_.reset();
732+
// Wait for resumptions scheduled on the messenger as they might access thread_pool_token_.
733733
waiters_amidst_resumption_on_messenger_.Shutdown();
734+
thread_pool_token_.reset();
734735
// Since TSLocalLockManager waits for running requests before processing shutdown, and no new
735736
// requests are sent post initiating shutdown, the OLM should be empty after resuming waiters.
736737
std::vector<WaiterEntryPtr> waiters;
@@ -761,7 +762,7 @@ void ObjectLockManagerImpl::Shutdown() {
761762
}
762763

763764
void ObjectLockManagerImpl::DoSignal(ObjectLockedBatchEntry* entry) {
764-
if (FLAGS_TEST_olm_skip_scheduling_waiter_resumption) {
765+
if (shutdown_in_progress_ || FLAGS_TEST_olm_skip_scheduling_waiter_resumption) {
765766
return;
766767
}
767768
WARN_NOT_OK(

src/yb/tserver/ts_local_lock_manager.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
using namespace std::literals;
3636
DECLARE_bool(dump_lock_keys);
3737

38-
DEFINE_RUNTIME_int64(olm_poll_interval_ms, 100,
38+
DEFINE_NON_RUNTIME_int64(olm_poll_interval_ms, 100,
3939
"Poll interval for Object lock Manager. Waiting requests that are unblocked by other release "
4040
"requests are independent of this interval since the release schedules unblocking of potential "
4141
"waiters. Yet this might help release timedout requests soon and also avoid probable issues "

src/yb/yql/pgwrapper/pg_object_locks-test.cc

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "yb/util/async_util.h"
2424
#include "yb/util/backoff_waiter.h"
25+
#include "yb/util/logging_test_util.h"
2526
#include "yb/util/monotime.h"
2627
#include "yb/util/sync_point.h"
2728
#include "yb/util/test_macros.h"
@@ -46,20 +47,35 @@ DECLARE_uint64(pg_client_session_expiration_ms);
4647
DECLARE_uint64(pg_client_heartbeat_interval_ms);
4748
DECLARE_uint64(refresh_waiter_timeout_ms);
4849
DECLARE_uint64(transaction_heartbeat_usec);
50+
DECLARE_uint64(master_ysql_operation_lease_ttl_ms);
51+
DECLARE_bool(TEST_tserver_enable_ysql_lease_refresh);
52+
DECLARE_int64(olm_poll_interval_ms);
4953

5054
using namespace std::literals;
5155

5256
namespace yb::pgwrapper {
5357

5458
constexpr uint64_t kDefaultMasterYSQLLeaseTTLMilli = 5 * 1000;
5559
constexpr uint64_t kDefaultYSQLLeaseRefreshIntervalMilli = 500;
60+
constexpr uint64_t kDefaultLockManagerPollIntervalMs = 100;
5661

5762
class PgObjectLocksTestRF1 : public PgMiniTestBase {
5863
protected:
5964
void SetUp() override {
65+
// Set reasonable high poll interavl to disable polling in tests., would help catch issues
66+
// with signaling logic if any.
67+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_olm_poll_interval_ms) = 1000000;
68+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_master_ysql_operation_lease_ttl_ms) =
69+
kDefaultMasterYSQLLeaseTTLMilli;
70+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_tserver_enable_ysql_lease_refresh) =
71+
kDefaultYSQLLeaseRefreshIntervalMilli;
6072
ANNOTATE_UNPROTECTED_WRITE(FLAGS_enable_object_locking_for_table_locks) = true;
6173
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_check_broadcast_address) = false; // GH #26281
6274
PgMiniTestBase::SetUp();
75+
Init();
76+
}
77+
78+
void Init() {
6379
ts_lock_manager_ = cluster_->mini_tablet_server(0)->server()->ts_local_lock_manager();
6480
master_lock_manager_ = cluster_->mini_master()
6581
->master()
@@ -663,7 +679,14 @@ TEST_F(PgObjectLocksTest, ReleaseExpiredLocksInvalidatesCatalogCache) {
663679
ASSERT_OK(ts1->Restart());
664680
}
665681

682+
// This test relies on the OLM poller to run frequently and timedout waiters, in order to
683+
// check for deadlocks/transaction aborts and retry from pg_client_session if necessary.
666684
TEST_F(PgObjectLocksTestRF1, YB_DISABLE_TEST_IN_TSAN(TestDeadlock)) {
685+
// Restart the cluster_ with the poll interval set to default.
686+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_olm_poll_interval_ms) = kDefaultLockManagerPollIntervalMs;
687+
ASSERT_OK(cluster_->RestartSync());
688+
Init();
689+
667690
auto conn1 = ASSERT_RESULT(Connect());
668691
ASSERT_OK(conn1.Execute("CREATE TABLE test(k INT PRIMARY KEY, v INT)"));
669692
ASSERT_OK(conn1.Execute("INSERT INTO test SELECT generate_series(0, 10), 0"));
@@ -747,6 +770,30 @@ TEST_F(PgObjectLocksTestRF1, YB_DISABLE_TEST_IN_TSAN(TestDeadlock)) {
747770
ASSERT_STR_CONTAINS(s.ToString(), "aborted due to a deadlock");
748771
}
749772

773+
TEST_F(PgObjectLocksTestRF1, TestShutdownWithWaiters) {
774+
constexpr auto kNumWaiters = 3;
775+
auto conn = ASSERT_RESULT(Connect());
776+
ASSERT_OK(conn.Execute("CREATE TABLE test(k INT PRIMARY KEY, v INT)"));
777+
ASSERT_OK(conn.Execute("INSERT INTO test SELECT generate_series(0, 10), 0"));
778+
ASSERT_OK(conn.StartTransaction(IsolationLevel::SNAPSHOT_ISOLATION));
779+
ASSERT_OK(conn.Execute("LOCK TABLE test in ACCESS EXCLUSIVE mode"));
780+
781+
TestThreadHolder thread_holder;
782+
for (int i = 0 ; i < kNumWaiters; i++) {
783+
thread_holder.AddThreadFunctor([&]() {
784+
auto conn = ASSERT_RESULT(Connect());
785+
ASSERT_OK(conn.ExecuteFormat("SET statement_timeout=$0", 1000 * kTimeMultiplier));
786+
ASSERT_NOK(conn.Execute("UPDATE test SET v=v+1 WHERE k=1"));
787+
});
788+
}
789+
auto log_waiter = StringWaiterLogSink("has just lost its lease");
790+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_tserver_enable_ysql_lease_refresh) = false;
791+
ASSERT_OK(log_waiter.WaitFor(
792+
MonoDelta::FromMilliseconds(2 * kDefaultMasterYSQLLeaseTTLMilli * kTimeMultiplier)));
793+
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_tserver_enable_ysql_lease_refresh) = true;
794+
thread_holder.JoinAll();
795+
}
796+
750797
TEST_F_EX(
751798
PgObjectLocksTestRF1, YB_DISABLE_TEST_IN_TSAN(TestDeadlockFastPath),
752799
TestWithTransactionalDDL) {

0 commit comments

Comments
 (0)