8 changes: 6 additions & 2 deletions Makefile
@@ -4,7 +4,11 @@ SELINUXOPT ?= $(shell test -x /usr/sbin/selinuxenabled && selinuxenabled && echo
PREFIX ?= /usr/local
BINDIR ?= ${PREFIX}/bin
SHAREDIR ?= ${PREFIX}/share
PYTHON ?= $(shell command -v python3 python|head -n1)
# PYTHON ?= $(shell command -v python3 python|head -n1)
ifndef PYTHON
UV_BIN := $(shell command -v uv 2>/dev/null)
PYTHON := $(if $(UV_BIN),uv run python3,$(shell command -v python3))
endif
DESTDIR ?= /
PATH := $(PATH):$(HOME)/.local/bin
MYPIP ?= pip
@@ -114,7 +118,7 @@ docsite-docs:
.PHONY: lint
lint:
ifneq (,$(wildcard /usr/bin/python3))
/usr/bin/python3 -m compileall -q -x '\.venv' .
${PYTHON} -m compileall -q -x '\.venv' .
endif
! grep -ri $(EXCLUDE_OPTS) "#\!/usr/bin/python3" .
flake8 $(PROJECT_DIR) $(PYTHON_SCRIPTS)
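
For reference, a minimal Python sketch of the same resolution order the Makefile now uses (prefer `uv run python3` when `uv` is on PATH, otherwise fall back to plain `python3`); the helper name is illustrative only:

```python
import shutil


def resolve_python() -> str:
    """Mirror the Makefile fallback: 'uv run python3' if uv exists, else plain python3."""
    if shutil.which("uv"):                       # UV_BIN := $(shell command -v uv ...)
        return "uv run python3"
    return shutil.which("python3") or "python3"  # $(shell command -v python3)


print(resolve_python())
```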
7 changes: 7 additions & 0 deletions docs/ramalama-chat.1.md
@@ -29,6 +29,9 @@ Show this help message and exit
#### **--list**
List the available models at an endpoint

#### **--max-tokens**=*integer*
Maximum number of tokens to generate. Set to 0 for unlimited output (default: 0).

#### **--mcp**=SERVER_URL
MCP (Model Context Protocol) servers to use for enhanced tool calling capabilities.
Can be specified multiple times to connect to multiple MCP servers.
@@ -49,6 +52,10 @@ When enabled, ramalama will periodically condense older messages into a summary,
keeping only recent messages and the summary. This prevents the context from growing
indefinitely during long chat sessions. Set to 0 to disable (default: 4).

#### **--temp**=*float*
Temperature of the response from the AI Model.
Lower numbers are more deterministic, higher numbers are more creative.

#### **--url**=URL
The host to send requests to (default: http://127.0.0.1:8080)

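As an illustration of how these two options typically land in an OpenAI-compatible chat-completions request (the payload shape and default temperature here are assumptions, not a description of RamaLama internals), note that a `--max-tokens` of 0 simply omits the cap:

```python
def build_chat_payload(prompt: str, max_tokens: int = 0, temp: float = 0.8) -> dict:
    """Assemble a chat request body; max_tokens == 0 means no explicit limit."""
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temp,        # lower => more deterministic responses
    }
    if max_tokens > 0:              # 0 = unlimited, so omit the cap entirely
        payload["max_tokens"] = max_tokens
    return payload


# e.g. roughly what `ramalama chat --max-tokens 512 --temp 0.2` would ask for
print(build_chat_payload("Hello", max_tokens=512, temp=0.2))
```
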
8 changes: 6 additions & 2 deletions docs/ramalama-run.1.md
@@ -17,13 +17,17 @@ ramalama\-run - run specified AI Model as a chatbot
| rlcr | rlcr:// | [`ramalama.com`](https://registry.ramalama.com) |
| OCI Container Registries | oci:// | [`opencontainers.org`](https://opencontainers.org)|
|||Examples: [`quay.io`](https://quay.io), [`Docker Hub`](https://docker.io), [`Artifactory`](https://artifactory.com)|
| Hosted API Providers | openai:// | [`api.openai.com`](https://api.openai.com)|

RamaLama defaults to the Ollama registry transport. This default can be overridden in the `ramalama.conf` file or via the RAMALAMA_TRANSPORT
environment variable. `export RAMALAMA_TRANSPORT=huggingface` changes RamaLama to use the huggingface transport.

Modify individual model transports by specifying the `huggingface://`, `oci://`, `ollama://`, `https://`, `http://`, `file://` prefix to the model.
Modify individual model transports by specifying the `huggingface://`, `oci://`, `ollama://`, `https://`, `http://`, `file://`, or hosted API
prefix (`openai://`).

URL support means if a model is on a web site or even on your local system, you can run it directly.
Hosted API transports connect directly to the remote provider and bypass the local container runtime. In this mode, flags that tune local
containers (for example `--image`, GPU settings, or `--network`) do not apply, and the provider's own capabilities and security posture govern
the execution. URL support means if a model is on a web site or even on your local system, you can run it directly.

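A rough sketch of the dispatch this paragraph describes: hosted API prefixes skip the container path entirely, while other transports are pulled and served locally. The function and split logic are hypothetical, not RamaLama's actual resolver.

```python
REMOTE_API_TRANSPORTS = {"openai"}  # hosted providers: no local container is started


def split_transport(model_spec: str, default: str = "ollama") -> tuple[str, str]:
    """Split 'openai://gpt-4o' into ('openai', 'gpt-4o'); bare names use the default transport."""
    if "://" in model_spec:
        transport, _, ref = model_spec.partition("://")
        return transport, ref
    return default, model_spec


transport, ref = split_transport("openai://gpt-4o")   # model name is just an example
if transport in REMOTE_API_TRANSPORTS:
    print(f"forwarding '{ref}' to the hosted provider; local container flags do not apply")
else:
    print(f"pulling '{ref}' via {transport} and serving it from a local container")
```
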
## OPTIONS

2 changes: 1 addition & 1 deletion docs/ramalama.1.md
@@ -37,7 +37,7 @@ RamaLama CLI defaults can be modified via ramalama.conf files. Default settings

### Test and run your models more securely

RamaLama defaults to running AI models inside rootless containers using Podman or Docker. These containers isolate the AI models from information on the underlying host. With RamaLama containers, the AI model is mounted as a volume into the container in read-only mode. This results in the process running the model, llama.cpp or vLLM, being isolated from the host. In addition, since `ramalama run` uses the --network=none option, the container cannot reach the network or leak any information out of the system. Finally, containers are run with the --rm option, which means that any content written during the running of the container is wiped out when the application exits.
RamaLama defaults to running AI models inside rootless containers using Podman or Docker. These containers isolate the AI models from information on the underlying host. With RamaLama containers, the AI model is mounted as a volume into the container in read-only mode. This results in the process running the model, llama.cpp or vLLM, being isolated from the host. In addition, since `ramalama run` uses the --network=none option, the container cannot reach the network or leak any information out of the system. Finally, containers are run with the --rm option, which means that any content written during the running of the container is wiped out when the application exits. Hosted API transports such as `openai://` bypass the container runtime entirely and connect directly to the remote provider; those transports inherit the provider's network access and security guarantees instead of RamaLama's container sandbox.

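To make those flags concrete, here is a hedged sketch of the kind of `podman run` invocation the paragraph describes; the image name, model path, and serving command are placeholders rather than what RamaLama actually generates:

```python
import subprocess

model_path = "/path/to/model.gguf"                    # placeholder host path
cmd = [
    "podman", "run",
    "--rm",                                           # container content wiped on exit
    "--network=none",                                 # no network access from inside
    "-v", f"{model_path}:/mnt/models/model.gguf:ro",  # model mounted read-only
    "quay.io/ramalama/ramalama:latest",               # placeholder image
    "llama-server", "-m", "/mnt/models/model.gguf",   # placeholder serving command
]
subprocess.run(cmd, check=True)
```
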
### Here’s how RamaLama delivers a robust security footprint:

18 changes: 16 additions & 2 deletions docs/ramalama.conf
@@ -197,12 +197,26 @@
# The maximum delay between retry attempts in seconds.
#
#max_retry_delay = 30


[ramalama.provider]
# Provider-specific hosted API configuration. Set per-provider options in the
# nested tables below.


[ramalama.provider.openai]
# Optional provider-specific API key used when contacting OpenAI-hosted
# transports. If unset, RamaLama falls back to the RAMALAMA_API_KEY value
# or environment variables expected by the provider.
Collaborator: Description should be above the conf option. Missing an empty line after

#
#api_key = "sk-..."


[ramalama.user]
# Suppress the interactive prompt when running on macOS with a Podman VM
# that doesn't support GPU acceleration (e.g., applehv provider).
# When set to true, RamaLama will automatically proceed without GPU support
# instead of asking for confirmation.
# Can also be set via the `RAMALAMA_USER__NO_MISSING_GPU_PROMPT` environment variable.
#

[ramalama.user]
#no_missing_gpu_prompt = false
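
A minimal sketch of the precedence those comments describe: a provider-specific `api_key` from `ramalama.conf` wins, otherwise `RAMALAMA_API_KEY` from the environment is used. The loader function and the path shown are illustrative only.

```python
import os
import tomllib  # stdlib TOML reader (Python 3.11+)


def provider_api_key(conf_path: str, provider: str = "openai") -> str | None:
    """Return [ramalama.provider.<name>].api_key, falling back to RAMALAMA_API_KEY."""
    try:
        with open(conf_path, "rb") as f:
            conf = tomllib.load(f)
    except FileNotFoundError:
        conf = {}
    key = conf.get("ramalama", {}).get("provider", {}).get(provider, {}).get("api_key")
    return key or os.environ.get("RAMALAMA_API_KEY")


print(provider_api_key("/etc/ramalama/ramalama.conf"))  # example path
```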
15 changes: 15 additions & 0 deletions docs/ramalama.conf.5.md
@@ -238,6 +238,21 @@ The maximum number of times to retry a failed download

The maximum delay between retry attempts in seconds

## RAMALAMA.PROVIDER TABLE
The `ramalama.provider` table configures hosted API providers that RamaLama can proxy to.

`[[ramalama.provider]]`
Collaborator: Might be cleaner to have a config section per provider, e.g.:

[[ramalama.provider.openai]]
api_key=...

Collaborator (Author): Love it.


**openai**=""

Configuration settings for the openai hosted provider

`[[ramalama.provider.openai]]`

**api_key**=""

Provider-specific API key used when invoking OpenAI-hosted transports. Overrides `RAMALAMA_API_KEY` when set.

## RAMALAMA.USER TABLE
The ramalama.user table contains user preference settings.

3 changes: 2 additions & 1 deletion ramalama/__init__.py
@@ -2,9 +2,10 @@

import sys

from ramalama import cli
from ramalama.cli import HelpException, init_cli, print_version
from ramalama.common import perror

assert sys.version_info >= (3, 10), "Python 3.10 or greater is required."

__all__ = ["perror", "init_cli", "print_version", "HelpException"]
__all__ = ["cli", "perror", "init_cli", "print_version", "HelpException"]
2 changes: 2 additions & 0 deletions ramalama/arg_types.py
@@ -56,6 +56,8 @@ class ChatSubArgsType(Protocol):
rag: str | None
api_key: str | None
ARGS: List[str] | None
max_tokens: int | None
temp: float | None


ChatSubArgs = protocol_to_dataclass(ChatSubArgsType)
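
A hedged sketch of how the two new protocol fields line up with the documented chat flags; the argparse wiring below is illustrative, not the project's actual parser:

```python
import argparse

parser = argparse.ArgumentParser(prog="ramalama chat")
parser.add_argument("--max-tokens", dest="max_tokens", type=int, default=0,
                    help="maximum tokens to generate; 0 means unlimited")
parser.add_argument("--temp", dest="temp", type=float, default=None,
                    help="sampling temperature; lower is more deterministic")

args = parser.parse_args(["--max-tokens", "512", "--temp", "0.2"])
# The resulting namespace carries the same optional fields the
# ChatSubArgsType protocol now declares: max_tokens and temp.
print(args.max_tokens, args.temp)
```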