Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
52bc77f
feat(serverless): add worker fitness check system
deanq Dec 12, 2025
3b3a27a
feat(serverless): add worker fitness check system
deanq Dec 12, 2025
757f334
docs: moved serverless architecture doc
deanq Dec 12, 2025
d607f4c
docs: document fitness check system
deanq Dec 13, 2025
65d08c8
feat(build): add GPU test binary build infrastructure
deanq Dec 17, 2025
f39e7fc
feat(serverless): implement GPU fitness check system
deanq Dec 17, 2025
87635e5
build(dist): package GPU test binary for distribution
deanq Dec 17, 2025
74cf3c1
test(serverless): add GPU fitness check tests
deanq Dec 17, 2025
473f10c
test(performance): disable GPU check in cold start benchmarks
deanq Dec 17, 2025
9e77a15
docs(serverless): document GPU fitness check system
deanq Dec 17, 2025
edf4694
fix(fitness): defer GPU check registration to avoid circular imports
deanq Dec 18, 2025
3fbc0ba
fix(logging): use warn() instead of warning() for RunPodLogger
deanq Dec 18, 2025
d8833ea
fix(logging): fix RunPodLogger.warning() call in rp_scale
deanq Dec 18, 2025
4e6ab53
fix(gpu-fitness): correct import path for rp_cuda
deanq Dec 18, 2025
bf5cb38
fix(test): correct mock patch target for binary path resolution test
deanq Dec 18, 2025
349cf43
build(gpu-binary): replace ARM binary with x86-64 compiled version
deanq Dec 18, 2025
f764971
feat(system-fitness): add system resource fitness checks
deanq Dec 18, 2025
83d68ee
refactor(fitness): integrate system fitness checks auto-registration
deanq Dec 18, 2025
06cdb94
build(deps): add psutil for system resource checking
deanq Dec 18, 2025
309e9c1
test(system-fitness): add comprehensive test suite for system fitness…
deanq Dec 18, 2025
9238ec1
test(fitness): update fixtures to handle system checks auto-registration
deanq Dec 18, 2025
4fc7c16
docs: document built-in system fitness checks with configuration
deanq Dec 18, 2025
e957958
feat(cuda-init): add CUDA device initialization fitness check
deanq Dec 18, 2025
d81e81a
docs: document CUDA device initialization fitness check
deanq Dec 18, 2025
5dde5cf
chore: reduce minimum disk space requirement to 1GB
deanq Dec 18, 2025
0e16910
fix(cuda): suppress nvidia-smi stderr on CPU-only workers
deanq Dec 18, 2025
1f7e83d
fix(cuda): parse actual CUDA version from nvidia-smi, not driver version
deanq Dec 18, 2025
5e35792
refactor(disk-check): use percentage-based disk space validation
deanq Dec 19, 2025
4b1cf9c
docs: update disk space check documentation for percentage-based vali…
deanq Dec 19, 2025
6c02a98
fix(disk-check): remove redundant /tmp check in containers
deanq Dec 19, 2025
bd1d464
fix(tests): update CUDA tests to match implementation
deanq Dec 19, 2025
3c69761
fix(fitness): address PR feedback on fitness checks system
deanq Dec 19, 2025
766807b
fix(fitness): resolve CodeQL code quality issues
deanq Dec 19, 2025
7a6774f
refactor(gpu-fitness): remove redundant is_available() call in fallback
deanq Dec 19, 2025
dcabce4
fix(fitness): resolve unresolved PR feedback comments
deanq Dec 19, 2025
7f31c04
fix(fitness): resolve CodeQL and Copilot feedback comments
deanq Dec 19, 2025
c4ccc0c
fix(fitness): remove unused mock variable assignments in tests
deanq Dec 19, 2025
bc1a809
fix(ruff): resolve all remaining linting errors
deanq Dec 19, 2025
bf0ff84
chore: remove .claude/CLAUDE.md from fitness checks PR
deanq Dec 25, 2025
788960a
refactor(tests): reorganize fitness check tests into dedicated subdir…
deanq Dec 25, 2025
cbe6f9f
fix(tests): resolve linting issues in fitness test reorganization
deanq Dec 25, 2025
aef8849
feat(fitness): add timing instrumentation to fitness checks
deanq Dec 25, 2025
92eefbe
fix(fitness): resolve exception handling and timing instrumentation i…
deanq Jan 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include runpod/serverless/binaries/gpu_test
include runpod/serverless/binaries/README.md
include build_tools/gpu_test.c
include build_tools/compile_gpu_test.sh
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,49 @@ You can also test your worker locally before deploying it to Runpod. This is use
python my_worker.py --rp_serve_api
```

### Worker Fitness Checks

Fitness checks allow you to validate your worker environment at startup before processing jobs. If any check fails, the worker exits immediately, allowing your orchestrator to restart it.

```python
# my_worker.py

import runpod
import torch

# Register fitness checks using the decorator
@runpod.serverless.register_fitness_check
def check_gpu_available():
"""Verify GPU is available."""
if not torch.cuda.is_available():
raise RuntimeError("GPU not available")

@runpod.serverless.register_fitness_check
def check_disk_space():
"""Verify sufficient disk space."""
import shutil
stat = shutil.disk_usage("/")
free_gb = stat.free / (1024**3)
if free_gb < 10:
raise RuntimeError(f"Insufficient disk space: {free_gb:.2f}GB free")

def handler(job):
job_input = job["input"]
# Your handler code here
return {"output": "success"}

# Fitness checks run before handler initialization (production only)
runpod.serverless.start({"handler": handler})
```

**Key Features:**
- Supports both synchronous and asynchronous check functions
- Checks run only once at worker startup (production mode)
- Runs before handler initialization and job processing begins
- Any check failure exits with code 1 (worker marked unhealthy)

See [Worker Fitness Checks](https://github.com/runpod/runpod-python/blob/main/docs/serverless/worker_fitness_checks.md) documentation for more examples and best practices.

## 📚 | API Language Library (GraphQL Wrapper)

When interacting with the Runpod API you can use this library to make requests to the API.
Expand Down
49 changes: 49 additions & 0 deletions build_tools/compile_gpu_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Compile the gpu_test binary for Linux x86_64 with CUDA support.
#
# Usage:   ./compile_gpu_test.sh
# Env:     CUDA_VERSION   - CUDA toolkit image tag (default: 11.8.0)
#          UBUNTU_VERSION - Ubuntu base image tag  (default: ubuntu22.04)
# Output:  ../runpod/serverless/binaries/gpu_test
#
# Requires Docker; the build runs inside an nvidia/cuda devel container so
# no CUDA toolkit is needed on the host.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_DIR="$SCRIPT_DIR/../runpod/serverless/binaries"
CUDA_VERSION="${CUDA_VERSION:-11.8.0}"
UBUNTU_VERSION="${UBUNTU_VERSION:-ubuntu22.04}"

# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Remove any stale artifact from a previous run so the success check at the
# bottom cannot be fooled by a leftover binary.
rm -f "$SCRIPT_DIR/gpu_test"

echo "Compiling gpu_test binary..."
echo "CUDA Version: $CUDA_VERSION"
echo "Ubuntu Version: $UBUNTU_VERSION"
echo "Output directory: $OUTPUT_DIR"

# Build in a Docker container with the NVIDIA CUDA development environment.
#
# SASS (native machine code) is emitted for Volta/Turing/Ampere
# (sm_70/75/80/86).  The final -gencode line additionally embeds PTX for
# compute_86 so the driver can JIT-compile the kernel-side code for GPUs
# newer than sm_86 (e.g. sm_89 Ada, sm_90 Hopper); without embedded PTX the
# binary would fail to run on those architectures.  The redundant
# -arch=sm_70 shorthand is dropped in favor of the explicit -gencode list.
docker run --rm \
    -v "$SCRIPT_DIR:/workspace" \
    "nvidia/cuda:${CUDA_VERSION}-devel-${UBUNTU_VERSION}" \
    bash -c "
        cd /workspace && \
        nvcc -O3 \
            -gencode=arch=compute_70,code=sm_70 \
            -gencode=arch=compute_75,code=sm_75 \
            -gencode=arch=compute_80,code=sm_80 \
            -gencode=arch=compute_86,code=sm_86 \
            -gencode=arch=compute_86,code=compute_86 \
            -o gpu_test \
            gpu_test.c -lnvidia-ml -lcudart_static && \
        echo 'Compilation successful' && \
        file gpu_test
    "

# Move binary to output directory
if [ -f "$SCRIPT_DIR/gpu_test" ]; then
    mv "$SCRIPT_DIR/gpu_test" "$OUTPUT_DIR/gpu_test"
    chmod +x "$OUTPUT_DIR/gpu_test"
    echo "Binary successfully created at: $OUTPUT_DIR/gpu_test"
    echo "Binary info:"
    file "$OUTPUT_DIR/gpu_test"
else
    echo "Error: Compilation failed, binary not found"
    exit 1
fi
77 changes: 77 additions & 0 deletions build_tools/gpu_test.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <nvml.h>
#include <sys/utsname.h>

/* Print the release string of the running Linux kernel (via uname(2)).
 * On failure the error is reported to stderr and nothing is printed. */
void log_linux_kernel_version() {
    struct utsname info;

    if (uname(&info) != 0) {
        perror("uname");
        return;
    }
    printf("Linux Kernel Version: %s\n", info.release);
}

void log_cuda_driver_version() {
int driver_version;
cudaError_t result = cudaDriverGetVersion(&driver_version);
if (result == cudaSuccess) {
printf("CUDA Driver Version: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
} else {
printf("Failed to get CUDA driver version. Error code: %d (%s)\n", result, cudaGetErrorString(result));
}
}

/*
 * Enumerate all GPUs via NVML and verify each one is usable from CUDA.
 *
 * For every device this prints its name and UUID, then performs a small
 * cudaMalloc/cudaFree round trip on that device to confirm the CUDA runtime
 * can actually allocate memory there.  All failures are logged to stdout;
 * the loop never aborts early on a single bad device so every GPU gets
 * reported.
 *
 * Fixes over the previous revision: nvmlDeviceGetName / nvmlDeviceGetUUID
 * return codes were unchecked, so a failed query printed uninitialized
 * stack buffers; cudaSetDevice was also unchecked, so a device-selection
 * failure was misattributed to the subsequent cudaMalloc.
 */
void enumerate_gpus_and_test() {
    nvmlReturn_t result;
    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return;
    }

    unsigned int device_count;
    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS) {
        printf("Failed to get GPU count: %s\n", nvmlErrorString(result));
        nvmlShutdown();
        return;
    }

    printf("Found %u GPUs:\n", device_count);
    for (unsigned int i = 0; i < device_count; i++) {
        nvmlDevice_t device;
        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
        char uuid[NVML_DEVICE_UUID_BUFFER_SIZE];

        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (result != NVML_SUCCESS) {
            printf("Failed to get handle for GPU %u: %s (Error code: %d)\n", i, nvmlErrorString(result), result);
            continue;
        }

        /* Fall back to a placeholder rather than printing a possibly
         * uninitialized buffer when an NVML query fails. */
        if (nvmlDeviceGetName(device, name, sizeof(name)) != NVML_SUCCESS) {
            snprintf(name, sizeof(name), "<unknown>");
        }
        if (nvmlDeviceGetUUID(device, uuid, sizeof(uuid)) != NVML_SUCCESS) {
            snprintf(uuid, sizeof(uuid), "<unknown>");
        }
        printf("GPU %u: %s (UUID: %s)\n", i, name, uuid);

        /* Allocate a tiny buffer on the GPU to test accessibility.  A GPU
         * can be visible to NVML yet unusable by CUDA (e.g. driver/runtime
         * mismatch), which is exactly what this check catches. */
        cudaError_t cuda_result = cudaSetDevice((int)i);
        if (cuda_result != cudaSuccess) {
            printf("GPU %u memory allocation test failed. Error code: %d (%s)\n", i, cuda_result, cudaGetErrorString(cuda_result));
            continue;
        }

        float *d_tensor;
        cuda_result = cudaMalloc((void**)&d_tensor, sizeof(float) * 10);
        if (cuda_result == cudaSuccess) {
            printf("GPU %u memory allocation test passed.\n", i);
            cudaFree(d_tensor);
        } else {
            printf("GPU %u memory allocation test failed. Error code: %d (%s)\n", i, cuda_result, cudaGetErrorString(cuda_result));
        }
    }

    nvmlShutdown();
}

/* Fitness-probe entry point: report host kernel and CUDA driver versions,
 * then exercise every visible GPU.  All findings go to stdout; the process
 * always exits successfully — consumers parse the output rather than the
 * exit status. */
int main() {
    log_linux_kernel_version();
    log_cuda_driver_version();
    enumerate_gpus_and_test();
    return EXIT_SUCCESS;
}
Loading
Loading