-
Notifications
You must be signed in to change notification settings - Fork 4
Open
Description
Problem
The application doesn't implement graceful shutdown handling, which causes:
- Database connections remain open when the service restarts
- In-flight requests are abruptly terminated
- Background tasks and threads are killed without cleanup
- Resources (sockets, file handles) are leaked
- Requires full machine restart to properly release connections
Evidence from Investigation
- No signal handlers: The main.rs doesn't handle SIGTERM/SIGINT signals
- No shutdown coordination: The axum server starts without graceful shutdown configuration
- Background tasks without cleanup: AWS credential refresh, traffic forwarders, and other background tasks have no shutdown mechanism
- Connection pool doesn't drain: Database connections aren't closed cleanly on shutdown
Current server startup (src/main.rs:2433-2439):
let listener = tokio::net::TcpListener::bind("127.0.0.1:3000")
.await
.unwrap();
tracing::info!("Listening on http://localhost:3000");
Ok(axum::serve(listener, app.into_make_service()).await?)
Proposed Solution
1. Implement Graceful Shutdown Signal Handler
use tokio::signal;
use std::sync::atomic::{AtomicBool, Ordering};
use tokio::sync::broadcast;
// Add to main.rs
/// Completes once the process has been asked to stop.
///
/// Waits for whichever arrives first: Ctrl+C (all platforms) or the Unix
/// terminate signal. On non-Unix targets the terminate side never resolves,
/// so only Ctrl+C can end the wait.
///
/// # Panics
///
/// Panics if either signal handler cannot be installed.
async fn shutdown_signal() {
    let interrupt = async {
        signal::ctrl_c()
            .await
            .expect("failed to install Ctrl+C handler");
    };

    #[cfg(unix)]
    let sigterm = async {
        let mut stream = signal::unix::signal(signal::unix::SignalKind::terminate())
            .expect("failed to install signal handler");
        stream.recv().await;
    };
    // Keeps the `select!` below identical on every platform.
    #[cfg(not(unix))]
    let sigterm = std::future::pending::<()>();

    tokio::select! {
        _ = interrupt => tracing::info!("Received Ctrl+C signal"),
        _ = sigterm => tracing::info!("Received terminate signal"),
    }

    tracing::info!("Starting graceful shutdown...");
}
2. Add Shutdown Coordinator
/// Fan-out shutdown notifier that can be cloned into every component.
///
/// All clones share the same broadcast channel and the same flag, so a call
/// to [`ShutdownCoordinator::shutdown`] on any clone is visible to all of them.
#[derive(Clone)]
pub struct ShutdownCoordinator {
    /// Sending half of the broadcast channel used to wake waiting tasks.
    shutdown_tx: broadcast::Sender<()>,
    /// Set to `true` once shutdown has been requested; never reset.
    is_shutting_down: Arc<AtomicBool>,
}

impl ShutdownCoordinator {
    /// Creates a coordinator in the "running" state with no subscribers.
    pub fn new() -> Self {
        // Capacity 1 is enough: at most one shutdown message is ever sent.
        let (shutdown_tx, _) = broadcast::channel(1);
        Self {
            shutdown_tx,
            is_shutting_down: Arc::new(AtomicBool::new(false)),
        }
    }

    /// Returns a receiver that is woken when shutdown is requested.
    ///
    /// A broadcast receiver only sees messages sent *after* it was created.
    /// Code that may subscribe late should also check
    /// [`ShutdownCoordinator::is_shutting_down`] after subscribing, otherwise
    /// it can wait forever for a signal that was already sent.
    pub fn subscribe(&self) -> broadcast::Receiver<()> {
        self.shutdown_tx.subscribe()
    }

    /// Reports whether shutdown has been requested.
    pub fn is_shutting_down(&self) -> bool {
        // Acquire pairs with the AcqRel swap in `shutdown`.
        self.is_shutting_down.load(Ordering::Acquire)
    }

    /// Marks the coordinator as shutting down and wakes every subscriber.
    ///
    /// Calling this more than once is harmless: only the first call
    /// broadcasts, so receivers never observe a lagged channel.
    pub async fn shutdown(&self) {
        if self.is_shutting_down.swap(true, Ordering::AcqRel) {
            return;
        }
        // `send` only fails when there are no receivers, which is fine here.
        let _ = self.shutdown_tx.send(());
    }
}

impl Default for ShutdownCoordinator {
    fn default() -> Self {
        Self::new()
    }
}
3. Update AppState with Shutdown Coordinator
pub struct AppState {
// ... existing fields ...
shutdown: ShutdownCoordinator,
}
impl AppStateBuilder {
/// Stores `shutdown` on the builder and returns the builder so calls can be chained.
pub fn shutdown(mut self, shutdown: ShutdownCoordinator) -> Self {
self.shutdown = Some(shutdown);
self
}
}4. Update Background Tasks to Handle Shutdown
// AWS Credential Refresh Task
//
// Every 300 seconds the task asks the credential manager (if one has been
// initialized) to refresh, and it exits as soon as the shutdown channel fires.
spawn(async move {
    let mut stop_signal = shutdown_coordinator.subscribe();
    loop {
        tokio::select! {
            _ = tokio::time::sleep(Duration::from_secs(300)) => {
                // The read guard is held for the duration of the refresh call.
                let guard = refresh_manager.read().await;
                if let Some(manager) = guard.as_ref() {
                    if let Err(err) = manager.refresh_credentials().await {
                        tracing::error!("Failed to refresh AWS credentials: {}", err);
                    }
                } else {
                    tracing::error!("AWS credential manager not initialized");
                }
            }
            _ = stop_signal.recv() => {
                tracing::info!("AWS credential refresh task shutting down");
                break;
            }
        }
    }
});
5. Add Database Connection Pool Cleanup
/// Logs when a `PostgresConnection` is dropped.
///
/// The body only emits a log line; it performs no explicit close itself.
// NOTE(review): if `PostgresConnection` is cloneable, this message is logged
// once per dropped clone, not once per pool — confirm against the type.
impl Drop for PostgresConnection {
fn drop(&mut self) {
tracing::info!("Closing database connection pool");
// The pool will automatically close all connections when dropped
// but we can add explicit cleanup if needed
}
}
// Add method to AppState
impl AppState {
/// Signals background tasks to stop and then pauses briefly before returning.
///
/// The steps run strictly in order: log, broadcast the shutdown signal,
/// sleep for two seconds, log completion.
pub async fn shutdown(&self) {
tracing::info!("Shutting down application state");
// Signal all background tasks to stop
self.shutdown.shutdown().await;
// Give tasks time to complete
// NOTE(review): this is a fixed grace period, not a join on the tasks;
// a task that needs longer than two seconds is not waited for.
tokio::time::sleep(Duration::from_secs(2)).await;
// Database pool will be dropped automatically
tracing::info!("Application state shutdown complete");
}
}6. Update Main Function with Graceful Shutdown
/// Starts the HTTP server and shuts it down gracefully on Ctrl+C / SIGTERM.
///
/// Shutdown sequence:
/// 1. `shutdown_signal()` resolves when a signal arrives.
/// 2. The coordinator is notified immediately, so background tasks stop and
///    the shutdown flag is set while in-flight requests are still draining.
/// 3. The server finishes draining and `server.await` returns.
/// 4. Application cleanup runs, bounded by a 30 second timeout.
///
/// # Errors
///
/// Returns an error if building the application state, binding the listener,
/// or serving fails.
#[tokio::main]
async fn main() -> Result<(), Error> {
    // ... existing setup code ...
    // Create shutdown coordinator
    let shutdown_coordinator = ShutdownCoordinator::new();
    // Build app state with shutdown coordinator
    let app_state = AppStateBuilder::default()
        // ... existing fields ...
        .shutdown(shutdown_coordinator.clone())
        .build()
        .await?;
    let app_state = Arc::new(app_state);
    // ... create router ...
    // Bind the server. A bind failure (e.g. port already in use) is an
    // ordinary I/O error, so propagate it instead of panicking.
    let listener = tokio::net::TcpListener::bind("127.0.0.1:3000").await?;
    tracing::info!("Listening on http://localhost:3000");
    // Create the server with graceful shutdown. The coordinator is notified
    // as soon as the signal arrives; waiting until the server has drained
    // would leave the shutdown flag unset for the whole drain period.
    let signal_coordinator = shutdown_coordinator.clone();
    let server = axum::serve(listener, app.into_make_service()).with_graceful_shutdown(
        async move {
            shutdown_signal().await;
            signal_coordinator.shutdown().await;
        },
    );
    // Run the server until it has drained after a shutdown signal (or failed).
    let result = server.await;
    // Run cleanup directly. Waiting on the coordinator's broadcast here would
    // never complete, because that broadcast is sent by the cleanup itself.
    let cleanup = tokio::time::timeout(Duration::from_secs(30), app_state.shutdown());
    if cleanup.await.is_err() {
        tracing::warn!("Timed out waiting for application cleanup");
    }
    tracing::info!("Server shutdown complete");
    result.map_err(Into::into)
}
7. Add Middleware for In-Flight Request Tracking
use std::sync::atomic::{AtomicUsize, Ordering};
#[derive(Clone)]
pub struct RequestTracker {
active_requests: Arc<AtomicUsize>,
}
impl RequestTracker {
pub fn new() -> Self {
Self {
active_requests: Arc::new(AtomicUsize::new(0)),
}
}
pub fn track<B>(&self) -> impl Fn(Request<B>, Next<B>) -> impl Future<Output = Response> + Clone {
let tracker = self.clone();
move |req: Request<B>, next: Next<B>| {
let tracker = tracker.clone();
async move {
tracker.active_requests.fetch_add(1, Ordering::Relaxed);
let response = next.run(req).await;
tracker.active_requests.fetch_sub(1, Ordering::Relaxed);
response
}
}
}
pub async fn wait_for_requests(&self, timeout: Duration) {
let start = tokio::time::Instant::now();
while self.active_requests.load(Ordering::Relaxed) > 0 {
if start.elapsed() > timeout {
tracing::warn!(
"Timeout waiting for {} requests to complete",
self.active_requests.load(Ordering::Relaxed)
);
break;
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}8. Add Health Check Endpoint for Load Balancer
/// Health endpoint for load balancers.
///
/// Responds with `503 Service Unavailable` once shutdown has been requested
/// so that no new traffic is routed here, and with `200 OK` plus the current
/// UTC timestamp otherwise.
async fn health_check(State(app_state): State<Arc<AppState>>) -> impl IntoResponse {
    let (status, body) = if app_state.shutdown.is_shutting_down() {
        (
            StatusCode::SERVICE_UNAVAILABLE,
            json!({
                "status": "shutting_down",
                "message": "Server is shutting down"
            }),
        )
    } else {
        (
            StatusCode::OK,
            json!({
                "status": "healthy",
                "timestamp": chrono::Utc::now().to_rfc3339()
            }),
        )
    };

    (status, Json(body))
}
9. Update systemd Service for Graceful Shutdown
For production deployment, update the systemd service to handle graceful shutdown:
[Unit]
Description=OpenSecret Enclave Service
After=network.target

[Service]
# Type=notify makes systemd wait for an sd_notify "READY=1" message. Nothing in
# this proposal sends one, so start-up would time out and the service would be
# killed and restarted in a loop. Use Type=simple unless sd_notify is added.
Type=simple
User=enclave
WorkingDirectory=/app
ExecStart=/app/opensecret
Restart=always
RestartSec=5
# Graceful shutdown settings
# SIGTERM goes to the main process first; anything still running after
# TimeoutStopSec is sent SIGKILL.
KillMode=mixed
KillSignal=SIGTERM
TimeoutStopSec=30
SendSIGKILL=yes
# Environment
Environment="RUST_LOG=info"

[Install]
WantedBy=multi-user.target
Testing Plan
-
Test Signal Handling:
# Start the server
./opensecret &
PID=$!
# Send SIGTERM
kill -TERM $PID
# Verify graceful shutdown in logs
-
Test In-Flight Requests:
# Start a long-running request
curl http://localhost:3000/api/long-operation &
# Immediately send shutdown signal
kill -TERM $(pgrep opensecret)
# Verify request completes before shutdown
-
Test Database Connection Cleanup:
# Monitor connections before shutdown
psql -c "SELECT count(*) FROM pg_stat_activity WHERE application_name = 'opensecret';"
# Shutdown the service
systemctl stop opensecret
# Verify connections are closed
psql -c "SELECT count(*) FROM pg_stat_activity WHERE application_name = 'opensecret';"
Expected Outcomes
- Clean Shutdown: All resources properly released on shutdown
- No Dropped Requests: In-flight requests complete before shutdown
- Database Connections Released: No need for machine restart
- Background Task Cleanup: All spawned tasks terminate cleanly
- Predictable Shutdown Time: Shutdown completes within 30 seconds
- Load Balancer Integration: Health check returns 503 during shutdown
Benefits
- Zero Downtime Deployments: Can restart service without dropping requests
- Resource Efficiency: No connection/resource leaks
- Better Monitoring: Know exactly when service is shutting down
- Improved Reliability: Predictable shutdown behavior
- Kubernetes/Container Ready: Proper signal handling for orchestration
This implementation ensures that all resources are properly cleaned up during shutdown, eliminating the need for full machine restarts when updating the service.
Metadata
Metadata
Assignees
Labels
No labels