@@ -807,9 +807,9 @@ def _run_with_enhanced_diagnostics(
807807 inactivity watchdog, and comprehensive logging.
808808 """
809809 # Tunables (pull from env if provided; else use sane defaults)
810- idle_secs = int (os .environ .get ("CH_IDLE_SECS" , "60 " ))
811- hang_kill_secs = int (os .environ .get ("CH_HANG_KILL_SECS" , "900 " ))
812- check_interval = int (os .environ .get ("CH_CHECK_INTERVAL" , "5 " ))
810+ idle_secs = int (os .environ .get ("CH_IDLE_SECS" , "300 " ))
811+ hang_kill_secs = int (os .environ .get ("CH_HANG_KILL_SECS" , "1200 " ))
812+ check_interval = int (os .environ .get ("CH_CHECK_INTERVAL" , "10 " ))
813813
814814 # --- 1) Rich Rust diagnostics ---
815815 enhanced_env_vars = self .env_vars .copy ()
@@ -904,40 +904,64 @@ def _run_with_enhanced_diagnostics(
904904 ps -eL -o pid,tid,ppid,stat,etime,comm,cmd | head -200 \\
905905 | tee -a "$log_file" || true
906906
907- # Find a good target: prefer the integration test binary; otherwise a child
908- # of the cargo/dev_cli process; otherwise fall back to the main pid.
909- tpid="$(pgrep -n -f 'target/.*/deps/integration-' || true)"
910- if [ -z "$tpid" ]; then
911- # newest child of $pid (often cargo test or the binary)
912- tpid="$(pgrep -P "$pid" | tail -n1 || true)"
913- fi
914- [ -z "$tpid" ] && tpid="$pid"
915-
916- # Best-effort freeze to avoid the attach race
917- sudo kill -STOP "$tpid" 2>/dev/null || true
918-
919- # Use consistent core filename pattern that matches search pattern
920- core_out="core.integration-$(date +%s)"
921- echo "[watchdog] Generating core: $core_out" | tee -a "$log_file"
922- if command -v gcore >/dev/null 2>&1; then
923- sudo gcore -o "$core_out" "$tpid" 2>&1 | tee -a "$log_file" || true
924- else
925- sudo gdb -batch -p "$tpid" \\
926- -ex "set pagination off" \\
927- -ex "generate-core-file $core_out" \\
928- -ex "detach" -ex "quit" 2>&1 | tee -a "$log_file" || true
929- fi
930-
931- # Write live backtrace to BOTH main log and side file
932- echo "[watchdog] Attaching gdb to pid $tpid for live backtrace" \\
933- | tee -a "$log_file"
934- sudo gdb -batch -p "$tpid" \\
935- -ex "set pagination off" \\
936- -ex "set print elements 0" \\
937- -ex "set backtrace limit 64" \\
938- -ex "thread apply all bt" \\
939- -ex "info threads" \\
940- 2>&1 | tee -a "$log_file" > "$live_bt_file" || true
907+ # Find a good target: prefer the integration test binary; otherwise a child
908+ # of the cargo/dev_cli process; otherwise fall back to the main pid.
909+ tpid="$(pgrep -n -f 'target/.*/deps/integration-' || true)"
910+ if [ -z "$tpid" ]; then
911+ # newest child of $pid (often cargo test or the binary)
912+ tpid="$(pgrep -P "$pid" | tail -n1 || true)"
913+ fi
914+ [ -z "$tpid" ] && tpid="$pid"
915+
916+ # Verify the target pid is still alive. If it raced away, fall back to $pid.
917+ if ! kill -0 "$tpid" 2>/dev/null; then
918+ echo "[watchdog] Selected pid $tpid is not alive; falling back to pid $pid" \\
919+ | tee -a "$log_file"
920+ tpid="$pid"
921+ fi
922+
923+ # Best-effort freeze to avoid the attach race
924+ sudo kill -STOP "$tpid" 2>/dev/null || true
925+
926+ # Use consistent core filename pattern that matches search pattern
927+ core_out="core.integration-$(date +%s)"
928+ echo "[watchdog] Generating core: $core_out" | tee -a "$log_file"
929+ if command -v gcore >/dev/null 2>&1; then
930+ sudo gcore -o "$core_out" "$tpid" 2>&1 | tee -a "$log_file" || true
931+ else
932+ sudo gdb -batch -p "$tpid" \\
933+ -ex "set pagination off" \\
934+ -ex "generate-core-file $core_out" \\
935+ -ex "detach" -ex "quit" 2>&1 | tee -a "$log_file" || true
936+ fi
937+
938+ # Write live backtrace to BOTH main log and side file
939+ echo "[watchdog] Attaching gdb to pid $tpid for live backtrace" \\
940+ | tee -a "$log_file"
941+ {{
942+ echo "[watchdog] gdb attach target pid=$tpid parent_pid=$pid";
943+ echo "[watchdog] cmdline(target)=$(tr '\\ 0' ' ' < /proc/$tpid/cmdline 2>/dev/null || echo n/a)";
944+ echo "[watchdog] cmdline(parent)=$(tr '\\ 0' ' ' < /proc/$pid/cmdline 2>/dev/null || echo n/a)";
945+ }} 2>/dev/null | tee -a "$log_file" || true
946+ sudo gdb -batch -p "$tpid" \\
947+ -ex "set pagination off" \\
948+ -ex "set print elements 0" \\
949+ -ex "set backtrace limit 64" \\
950+ -ex "thread apply all bt" \\
951+ -ex "info threads" \\
952+ 2>&1 | tee -a "$log_file" > "$live_bt_file" || true
953+ # If attach failed (e.g. tpid exited/raced), retry once against the main pid
954+ if grep -q "No such process" "$live_bt_file" 2>/dev/null; then
955+ echo "[watchdog] gdb attach on pid $tpid failed; retrying against pid $pid" \\
956+ | tee -a "$log_file"
957+ sudo gdb -batch -p "$pid" \\
958+ -ex "set pagination off" \\
959+ -ex "set print elements 0" \\
960+ -ex "set backtrace limit 64" \\
961+ -ex "thread apply all bt" \\
962+ -ex "info threads" \\
963+ 2>&1 | tee -a "$log_file" > "$live_bt_file" || true
964+ fi
941965
942966 # Let it run again
943967 sudo kill -CONT "$tpid" 2>/dev/null || true
0 commit comments