Skip to content

Commit 96bc088

Browse files
committed
Harden CH watchdog diagnostics against pid race
- Validate selected pid is alive before gcore/gdb attach
- Log attach context (cmdline/parent pid) for troubleshooting
- Retry gdb attach against parent pid when target exits
- Use less aggressive default watchdog intervals
1 parent 2ed40a2 commit 96bc088

File tree

1 file changed

+61
-37
lines changed

1 file changed

+61
-37
lines changed

lisa/microsoft/testsuites/cloud_hypervisor/ch_tests_tool.py

Lines changed: 61 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -807,9 +807,9 @@ def _run_with_enhanced_diagnostics(
807807
inactivity watchdog, and comprehensive logging.
808808
"""
809809
# Tunables (pull from env if provided; else use sane defaults)
810-
idle_secs = int(os.environ.get("CH_IDLE_SECS", "60"))
811-
hang_kill_secs = int(os.environ.get("CH_HANG_KILL_SECS", "900"))
812-
check_interval = int(os.environ.get("CH_CHECK_INTERVAL", "5"))
810+
idle_secs = int(os.environ.get("CH_IDLE_SECS", "300"))
811+
hang_kill_secs = int(os.environ.get("CH_HANG_KILL_SECS", "1200"))
812+
check_interval = int(os.environ.get("CH_CHECK_INTERVAL", "10"))
813813

814814
# --- 1) Rich Rust diagnostics ---
815815
enhanced_env_vars = self.env_vars.copy()
@@ -904,40 +904,64 @@ def _run_with_enhanced_diagnostics(
904904
ps -eL -o pid,tid,ppid,stat,etime,comm,cmd | head -200 \\
905905
| tee -a "$log_file" || true
906906
907-
# Find a good target: prefer the integration test binary; otherwise a child
908-
# of the cargo/dev_cli process; otherwise fall back to the main pid.
909-
tpid="$(pgrep -n -f 'target/.*/deps/integration-' || true)"
910-
if [ -z "$tpid" ]; then
911-
# newest child of $pid (often cargo test or the binary)
912-
tpid="$(pgrep -P "$pid" | tail -n1 || true)"
913-
fi
914-
[ -z "$tpid" ] && tpid="$pid"
915-
916-
# Best-effort freeze to avoid the attach race
917-
sudo kill -STOP "$tpid" 2>/dev/null || true
918-
919-
# Use consistent core filename pattern that matches search pattern
920-
core_out="core.integration-$(date +%s)"
921-
echo "[watchdog] Generating core: $core_out" | tee -a "$log_file"
922-
if command -v gcore >/dev/null 2>&1; then
923-
sudo gcore -o "$core_out" "$tpid" 2>&1 | tee -a "$log_file" || true
924-
else
925-
sudo gdb -batch -p "$tpid" \\
926-
-ex "set pagination off" \\
927-
-ex "generate-core-file $core_out" \\
928-
-ex "detach" -ex "quit" 2>&1 | tee -a "$log_file" || true
929-
fi
930-
931-
# Write live backtrace to BOTH main log and side file
932-
echo "[watchdog] Attaching gdb to pid $tpid for live backtrace" \\
933-
| tee -a "$log_file"
934-
sudo gdb -batch -p "$tpid" \\
935-
-ex "set pagination off" \\
936-
-ex "set print elements 0" \\
937-
-ex "set backtrace limit 64" \\
938-
-ex "thread apply all bt" \\
939-
-ex "info threads" \\
940-
2>&1 | tee -a "$log_file" > "$live_bt_file" || true
907+
# Find a good target: prefer the integration test binary; otherwise a child
908+
# of the cargo/dev_cli process; otherwise fall back to the main pid.
909+
tpid="$(pgrep -n -f 'target/.*/deps/integration-' || true)"
910+
if [ -z "$tpid" ]; then
911+
# newest child of $pid (often cargo test or the binary)
912+
tpid="$(pgrep -P "$pid" | tail -n1 || true)"
913+
fi
914+
[ -z "$tpid" ] && tpid="$pid"
915+
916+
# Verify the target pid is still alive. If it raced away, fall back to $pid.
917+
if ! kill -0 "$tpid" 2>/dev/null; then
918+
echo "[watchdog] Selected pid $tpid is not alive; falling back to pid $pid" \\
919+
| tee -a "$log_file"
920+
tpid="$pid"
921+
fi
922+
923+
# Best-effort freeze to avoid the attach race
924+
sudo kill -STOP "$tpid" 2>/dev/null || true
925+
926+
# Use consistent core filename pattern that matches search pattern
927+
core_out="core.integration-$(date +%s)"
928+
echo "[watchdog] Generating core: $core_out" | tee -a "$log_file"
929+
if command -v gcore >/dev/null 2>&1; then
930+
sudo gcore -o "$core_out" "$tpid" 2>&1 | tee -a "$log_file" || true
931+
else
932+
sudo gdb -batch -p "$tpid" \\
933+
-ex "set pagination off" \\
934+
-ex "generate-core-file $core_out" \\
935+
-ex "detach" -ex "quit" 2>&1 | tee -a "$log_file" || true
936+
fi
937+
938+
# Write live backtrace to BOTH main log and side file
939+
echo "[watchdog] Attaching gdb to pid $tpid for live backtrace" \\
940+
| tee -a "$log_file"
941+
{{
942+
echo "[watchdog] gdb attach target pid=$tpid parent_pid=$pid";
943+
echo "[watchdog] cmdline(target)=$(tr '\\0' ' ' < /proc/$tpid/cmdline 2>/dev/null || echo n/a)";
944+
echo "[watchdog] cmdline(parent)=$(tr '\\0' ' ' < /proc/$pid/cmdline 2>/dev/null || echo n/a)";
945+
}} 2>/dev/null | tee -a "$log_file" || true
946+
sudo gdb -batch -p "$tpid" \\
947+
-ex "set pagination off" \\
948+
-ex "set print elements 0" \\
949+
-ex "set backtrace limit 64" \\
950+
-ex "thread apply all bt" \\
951+
-ex "info threads" \\
952+
2>&1 | tee -a "$log_file" > "$live_bt_file" || true
953+
# If attach failed (e.g. tpid exited/raced), retry once against the main pid
954+
if grep -q "No such process" "$live_bt_file" 2>/dev/null; then
955+
echo "[watchdog] gdb attach on pid $tpid failed; retrying against pid $pid" \\
956+
| tee -a "$log_file"
957+
sudo gdb -batch -p "$pid" \\
958+
-ex "set pagination off" \\
959+
-ex "set print elements 0" \\
960+
-ex "set backtrace limit 64" \\
961+
-ex "thread apply all bt" \\
962+
-ex "info threads" \\
963+
2>&1 | tee -a "$log_file" > "$live_bt_file" || true
964+
fi
941965
942966
# Let it run again
943967
sudo kill -CONT "$tpid" 2>/dev/null || true

0 commit comments

Comments
 (0)