Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
52bc77f
feat(serverless): add worker fitness check system
deanq Dec 12, 2025
3b3a27a
feat(serverless): add worker fitness check system
deanq Dec 12, 2025
757f334
docs: moved serverless architecture doc
deanq Dec 12, 2025
d607f4c
docs: document fitness check system
deanq Dec 13, 2025
65d08c8
feat(build): add GPU test binary build infrastructure
deanq Dec 17, 2025
f39e7fc
feat(serverless): implement GPU fitness check system
deanq Dec 17, 2025
87635e5
build(dist): package GPU test binary for distribution
deanq Dec 17, 2025
74cf3c1
test(serverless): add GPU fitness check tests
deanq Dec 17, 2025
473f10c
test(performance): disable GPU check in cold start benchmarks
deanq Dec 17, 2025
9e77a15
docs(serverless): document GPU fitness check system
deanq Dec 17, 2025
edf4694
fix(fitness): defer GPU check registration to avoid circular imports
deanq Dec 18, 2025
3fbc0ba
fix(logging): use warn() instead of warning() for RunPodLogger
deanq Dec 18, 2025
d8833ea
fix(logging): fix RunPodLogger.warning() call in rp_scale
deanq Dec 18, 2025
4e6ab53
fix(gpu-fitness): correct import path for rp_cuda
deanq Dec 18, 2025
bf5cb38
fix(test): correct mock patch target for binary path resolution test
deanq Dec 18, 2025
349cf43
build(gpu-binary): replace ARM binary with x86-64 compiled version
deanq Dec 18, 2025
f764971
feat(system-fitness): add system resource fitness checks
deanq Dec 18, 2025
83d68ee
refactor(fitness): integrate system fitness checks auto-registration
deanq Dec 18, 2025
06cdb94
build(deps): add psutil for system resource checking
deanq Dec 18, 2025
309e9c1
test(system-fitness): add comprehensive test suite for system fitness…
deanq Dec 18, 2025
9238ec1
test(fitness): update fixtures to handle system checks auto-registration
deanq Dec 18, 2025
4fc7c16
docs: document built-in system fitness checks with configuration
deanq Dec 18, 2025
e957958
feat(cuda-init): add CUDA device initialization fitness check
deanq Dec 18, 2025
d81e81a
docs: document CUDA device initialization fitness check
deanq Dec 18, 2025
5dde5cf
chore: reduce minimum disk space requirement to 1GB
deanq Dec 18, 2025
0e16910
fix(cuda): suppress nvidia-smi stderr on CPU-only workers
deanq Dec 18, 2025
1f7e83d
fix(cuda): parse actual CUDA version from nvidia-smi, not driver version
deanq Dec 18, 2025
5e35792
refactor(disk-check): use percentage-based disk space validation
deanq Dec 19, 2025
4b1cf9c
docs: update disk space check documentation for percentage-based vali…
deanq Dec 19, 2025
6c02a98
fix(disk-check): remove redundant /tmp check in containers
deanq Dec 19, 2025
bd1d464
fix(tests): update CUDA tests to match implementation
deanq Dec 19, 2025
3c69761
fix(fitness): address PR feedback on fitness checks system
deanq Dec 19, 2025
766807b
fix(fitness): resolve CodeQL code quality issues
deanq Dec 19, 2025
7a6774f
refactor(gpu-fitness): remove redundant is_available() call in fallback
deanq Dec 19, 2025
dcabce4
fix(fitness): resolve unresolved PR feedback comments
deanq Dec 19, 2025
7f31c04
fix(fitness): resolve CodeQL and Copilot feedback comments
deanq Dec 19, 2025
c4ccc0c
fix(fitness): remove unused mock variable assignments in tests
deanq Dec 19, 2025
bc1a809
fix(ruff): resolve all remaining linting errors
deanq Dec 19, 2025
bf0ff84
chore: remove .claude/CLAUDE.md from fitness checks PR
deanq Dec 25, 2025
788960a
refactor(tests): reorganize fitness check tests into dedicated subdir…
deanq Dec 25, 2025
cbe6f9f
fix(tests): resolve linting issues in fitness test reorganization
deanq Dec 25, 2025
aef8849
feat(fitness): add timing instrumentation to fitness checks
deanq Dec 25, 2025
92eefbe
fix(fitness): resolve exception handling and timing instrumentation i…
deanq Jan 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include runpod/serverless/binaries/gpu_test
include runpod/serverless/binaries/README.md
include build_tools/gpu_test.c
include build_tools/compile_gpu_test.sh
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,49 @@ You can also test your worker locally before deploying it to Runpod. This is use
python my_worker.py --rp_serve_api
```

### Worker Fitness Checks

Fitness checks allow you to validate your worker environment at startup before processing jobs. If any check fails, the worker exits immediately, allowing your orchestrator to restart it.

```python
# my_worker.py

import runpod
import torch

# Register fitness checks using the decorator
@runpod.serverless.register_fitness_check
def check_gpu_available():
"""Verify GPU is available."""
if not torch.cuda.is_available():
raise RuntimeError("GPU not available")

@runpod.serverless.register_fitness_check
def check_disk_space():
"""Verify sufficient disk space."""
import shutil
stat = shutil.disk_usage("/")
free_gb = stat.free / (1024**3)
if free_gb < 10:
raise RuntimeError(f"Insufficient disk space: {free_gb:.2f}GB free")

def handler(job):
job_input = job["input"]
# Your handler code here
return {"output": "success"}

# Fitness checks run before handler initialization (production only)
runpod.serverless.start({"handler": handler})
```

**Key Features:**
- Supports both synchronous and asynchronous check functions
- Checks run only once at worker startup (production mode)
- Runs before handler initialization and job processing begins
- Any check failure exits with code 1 (worker marked unhealthy)

See [Worker Fitness Checks](https://github.com/runpod/runpod-python/blob/main/docs/serverless/worker_fitness_checks.md) documentation for more examples and best practices.

## 📚 | API Language Library (GraphQL Wrapper)

When interacting with the Runpod API you can use this library to make requests to the API.
Expand Down
49 changes: 49 additions & 0 deletions build_tools/compile_gpu_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/bash
# Compile the gpu_test binary for Linux x86_64 with CUDA support.
#
# Usage:   ./compile_gpu_test.sh
# Env:     CUDA_VERSION   - CUDA toolkit image tag (default: 11.8.0)
#          UBUNTU_VERSION - Ubuntu base image tag  (default: ubuntu22.04)
# Output:  ../runpod/serverless/binaries/gpu_test
#
# Requires Docker; the build runs inside an nvidia/cuda devel container so
# no CUDA toolkit is needed on the host.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_DIR="$SCRIPT_DIR/../runpod/serverless/binaries"
CUDA_VERSION="${CUDA_VERSION:-11.8.0}"
UBUNTU_VERSION="${UBUNTU_VERSION:-ubuntu22.04}"

# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Remove any stale artifact from a previous run so the success check at the
# bottom cannot be fooled by a leftover binary.
rm -f "$SCRIPT_DIR/gpu_test"

echo "Compiling gpu_test binary..."
echo "CUDA Version: $CUDA_VERSION"
echo "Ubuntu Version: $UBUNTU_VERSION"
echo "Output directory: $OUTPUT_DIR"

# Build in a Docker container with the NVIDIA CUDA development environment.
#
# SASS (native machine code) is emitted for Volta/Turing/Ampere
# (sm_70/75/80/86).  The final -gencode line additionally embeds PTX for
# compute_86 so the driver can JIT-compile the kernel-side code for GPUs
# newer than sm_86 (e.g. sm_89 Ada, sm_90 Hopper); without embedded PTX the
# binary would fail to run on those architectures.  The redundant
# -arch=sm_70 shorthand is dropped in favor of the explicit -gencode list.
docker run --rm \
    -v "$SCRIPT_DIR:/workspace" \
    "nvidia/cuda:${CUDA_VERSION}-devel-${UBUNTU_VERSION}" \
    bash -c "
        cd /workspace && \
        nvcc -O3 \
            -gencode=arch=compute_70,code=sm_70 \
            -gencode=arch=compute_75,code=sm_75 \
            -gencode=arch=compute_80,code=sm_80 \
            -gencode=arch=compute_86,code=sm_86 \
            -gencode=arch=compute_86,code=compute_86 \
            -o gpu_test \
            gpu_test.c -lnvidia-ml -lcudart_static && \
        echo 'Compilation successful' && \
        file gpu_test
    "

# Move binary to output directory
if [ -f "$SCRIPT_DIR/gpu_test" ]; then
    mv "$SCRIPT_DIR/gpu_test" "$OUTPUT_DIR/gpu_test"
    chmod +x "$OUTPUT_DIR/gpu_test"
    echo "Binary successfully created at: $OUTPUT_DIR/gpu_test"
    echo "Binary info:"
    file "$OUTPUT_DIR/gpu_test"
else
    echo "Error: Compilation failed, binary not found"
    exit 1
fi
77 changes: 77 additions & 0 deletions build_tools/gpu_test.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>
#include <nvml.h>
#include <sys/utsname.h>

/* Print the release string of the running Linux kernel (via uname(2)).
 * On failure the error is reported to stderr and nothing is printed. */
void log_linux_kernel_version() {
    struct utsname info;

    if (uname(&info) != 0) {
        perror("uname");
        return;
    }
    printf("Linux Kernel Version: %s\n", info.release);
}

void log_cuda_driver_version() {
int driver_version;
cudaError_t result = cudaDriverGetVersion(&driver_version);
if (result == cudaSuccess) {
printf("CUDA Driver Version: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
} else {
printf("Failed to get CUDA driver version. Error code: %d (%s)\n", result, cudaGetErrorString(result));
}
}

/*
 * Enumerate all GPUs via NVML and verify each one is usable from CUDA.
 *
 * For every device this prints its name and UUID, then performs a small
 * cudaMalloc/cudaFree round trip on that device to confirm the CUDA runtime
 * can actually allocate memory there.  All failures are logged to stdout;
 * the loop never aborts early on a single bad device so every GPU gets
 * reported.
 *
 * Fixes over the previous revision: nvmlDeviceGetName / nvmlDeviceGetUUID
 * return codes were unchecked, so a failed query printed uninitialized
 * stack buffers; cudaSetDevice was also unchecked, so a device-selection
 * failure was misattributed to the subsequent cudaMalloc.
 */
void enumerate_gpus_and_test() {
    nvmlReturn_t result;
    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return;
    }

    unsigned int device_count;
    result = nvmlDeviceGetCount(&device_count);
    if (result != NVML_SUCCESS) {
        printf("Failed to get GPU count: %s\n", nvmlErrorString(result));
        nvmlShutdown();
        return;
    }

    printf("Found %u GPUs:\n", device_count);
    for (unsigned int i = 0; i < device_count; i++) {
        nvmlDevice_t device;
        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
        char uuid[NVML_DEVICE_UUID_BUFFER_SIZE];

        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (result != NVML_SUCCESS) {
            printf("Failed to get handle for GPU %u: %s (Error code: %d)\n", i, nvmlErrorString(result), result);
            continue;
        }

        /* Fall back to a placeholder rather than printing a possibly
         * uninitialized buffer when an NVML query fails. */
        if (nvmlDeviceGetName(device, name, sizeof(name)) != NVML_SUCCESS) {
            snprintf(name, sizeof(name), "<unknown>");
        }
        if (nvmlDeviceGetUUID(device, uuid, sizeof(uuid)) != NVML_SUCCESS) {
            snprintf(uuid, sizeof(uuid), "<unknown>");
        }
        printf("GPU %u: %s (UUID: %s)\n", i, name, uuid);

        /* Allocate a tiny buffer on the GPU to test accessibility.  A GPU
         * can be visible to NVML yet unusable by CUDA (e.g. driver/runtime
         * mismatch), which is exactly what this check catches. */
        cudaError_t cuda_result = cudaSetDevice((int)i);
        if (cuda_result != cudaSuccess) {
            printf("GPU %u memory allocation test failed. Error code: %d (%s)\n", i, cuda_result, cudaGetErrorString(cuda_result));
            continue;
        }

        float *d_tensor;
        cuda_result = cudaMalloc((void**)&d_tensor, sizeof(float) * 10);
        if (cuda_result == cudaSuccess) {
            printf("GPU %u memory allocation test passed.\n", i);
            cudaFree(d_tensor);
        } else {
            printf("GPU %u memory allocation test failed. Error code: %d (%s)\n", i, cuda_result, cudaGetErrorString(cuda_result));
        }
    }

    nvmlShutdown();
}

/* Fitness-probe entry point: report host kernel and CUDA driver versions,
 * then exercise every visible GPU.  All findings go to stdout; the process
 * always exits successfully — consumers parse the output rather than the
 * exit status. */
int main() {
    log_linux_kernel_version();
    log_cuda_driver_version();
    enumerate_gpus_and_test();
    return EXIT_SUCCESS;
}
Loading
Loading