Skip to content
Draft
8 changes: 8 additions & 0 deletions openml/_api/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from openml._api.runtime.core import APIContext

# Singleton context tracking the currently selected OpenML API version.
api_context = APIContext()


def set_api_version(version: str, *, strict: bool = False) -> None:
    """Select the active OpenML API version on the module-level context.

    Args:
        version: API version identifier (e.g. ``"v1"`` or ``"v2"``).
        strict: Forwarded to ``APIContext.set_version``; exact semantics are
            defined there (presumably stricter validation — confirm).
    """
    api_context.set_version(version=version, strict=strict)
Comment on lines +1 to +8
Copy link
Collaborator

@PGijsbers PGijsbers Jan 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear what the function of the APIContext is here. Why do we need it and cannot just use backend directly? E.g.:

Suggested change
from openml._api.runtime.core import APIContext
def set_api_version(version: str, *, strict: bool = False) -> None:
api_context.set_version(version=version, strict=strict)
api_context = APIContext()
from openml._api.runtime.core import build_backend
_backend = build_backend("v1", strict=False)
def set_api_version(version: str, *, strict: bool = False) -> None:
global _backend
_backend = build_backend(version=version, strict=strict)
def backend() -> APIBackend:
return _backend

If it is just to avoid the pitfall where users assign the returned value to a local variable with a scope that is too long lived, then the same would apply if users would assign api_context.backend to a variable. We could instead extend the APIBackend class to allow updates to its attributes?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you — it's not really useful. I'm going to iterate on the design and will keep this in mind.

6 changes: 6 additions & 0 deletions openml/_api/clients/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from .http import HTTPCache, HTTPClient

__all__ = [
"HTTPCache",
"HTTPClient",
]
211 changes: 211 additions & 0 deletions openml/_api/clients/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
from __future__ import annotations

import json
import time
from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urlencode, urljoin, urlparse

import requests
from requests import Response

from openml.__version__ import __version__

if TYPE_CHECKING:
from openml._api.config import DelayMethod


class HTTPCache:
    """Filesystem cache for HTTP responses.

    Each cached response lives in its own directory (derived from the request
    URL and query parameters) as three files: ``meta.json`` (status, reason,
    encoding, timestamps, request info), ``headers.json`` and ``body.bin``.
    """

    def __init__(self, *, path: Path, ttl: int) -> None:
        self.path = path  # root directory under which entries are stored
        self.ttl = ttl  # time-to-live in seconds

    def get_key(self, url: str, params: dict[str, Any]) -> str:
        """Build a relative cache path for *url* and *params*.

        The netloc is reversed (``www.openml.org`` -> ``org/openml/www``) so
        entries group by organisation, and the ``api_key`` parameter is
        dropped so secrets never appear in directory names.
        """
        parsed_url = urlparse(url)
        netloc_parts = parsed_url.netloc.split(".")[::-1]
        path_parts = parsed_url.path.strip("/").split("/")

        filtered_params = {k: v for k, v in params.items() if k != "api_key"}
        params_part = [urlencode(filtered_params)] if filtered_params else []

        return str(Path(*netloc_parts, *path_parts, *params_part))

    def _key_to_path(self, key: str) -> Path:
        """Resolve a cache key to an absolute entry directory."""
        return self.path.joinpath(key)

    def load(self, key: str) -> Response:
        """Reconstruct a cached :class:`requests.Response` for *key*.

        Raises:
            FileNotFoundError: entry directory missing or incomplete.
            ValueError: metadata lacks ``created_at``.
            TimeoutError: entry is older than ``self.ttl`` seconds.
        """
        path = self._key_to_path(key)

        if not path.exists():
            raise FileNotFoundError(f"Cache directory not found: {path}")

        meta_path = path / "meta.json"
        headers_path = path / "headers.json"
        body_path = path / "body.bin"

        if not (meta_path.exists() and headers_path.exists() and body_path.exists()):
            raise FileNotFoundError(f"Incomplete cache at {path}")

        with meta_path.open("r", encoding="utf-8") as f:
            meta = json.load(f)

        created_at = meta.get("created_at")
        if created_at is None:
            raise ValueError("Cache metadata missing 'created_at'")

        if time.time() - created_at > self.ttl:
            raise TimeoutError(f"Cache expired for {path}")

        with headers_path.open("r", encoding="utf-8") as f:
            headers = json.load(f)

        body = body_path.read_bytes()

        response = Response()
        response.status_code = meta["status_code"]
        response.url = meta["url"]
        response.reason = meta["reason"]
        # Bug fix: ``Response.headers`` is normally a CaseInsensitiveDict;
        # restoring a plain dict made header lookups case-sensitive.
        response.headers = requests.structures.CaseInsensitiveDict(headers)
        response._content = body  # noqa: SLF001  # only way to inject a body offline
        response.encoding = meta["encoding"]

        return response

    def save(self, key: str, response: Response) -> None:
        """Persist *response* (body, headers and metadata) under *key*."""
        path = self._key_to_path(key)
        path.mkdir(parents=True, exist_ok=True)

        (path / "body.bin").write_bytes(response.content)

        with (path / "headers.json").open("w", encoding="utf-8") as f:
            json.dump(dict(response.headers), f)

        meta = {
            "status_code": response.status_code,
            "url": response.url,
            "reason": response.reason,
            "encoding": response.encoding,
            "elapsed": response.elapsed.total_seconds(),
            "created_at": time.time(),
            "request": {
                "method": response.request.method if response.request else None,
                "url": response.request.url if response.request else None,
                "headers": dict(response.request.headers) if response.request else None,
                "body": response.request.body if response.request else None,
            },
        }

        with (path / "meta.json").open("w", encoding="utf-8") as f:
            json.dump(meta, f)


class HTTPClient:
    """HTTP client wrapping :mod:`requests` with base-URL resolution,
    optional API-key injection and optional response caching."""

    def __init__(  # noqa: PLR0913
        self,
        *,
        server: str,
        base_url: str,
        api_key: str,
        timeout: int,
        retries: int,
        delay_method: DelayMethod,
        delay_time: int,
        cache: HTTPCache | None = None,
    ) -> None:
        self.server = server  # e.g. "https://www.openml.org/"
        self.base_url = base_url  # path prefix joined onto server
        self.api_key = api_key
        self.timeout = timeout  # seconds; per-request default
        # NOTE(review): retries/delay_method/delay_time are stored but no
        # retry loop is implemented in ``request`` yet — confirm intent.
        self.retries = retries
        self.delay_method = delay_method
        self.delay_time = delay_time
        self.cache = cache

        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}

    def request(
        self,
        method: str,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> Response:
        """Issue an HTTP request, optionally served from / stored in the cache.

        Args:
            method: HTTP verb, e.g. ``"GET"``.
            path: Path relative to ``base_url``.
            use_cache: Try the cache first and store the fresh response.
            use_api_key: Add ``api_key`` to the query parameters.
            **request_kwargs: Extra keyword args forwarded to
                ``requests.request`` (``params``, ``headers`` and ``timeout``
                are extracted and merged with the client defaults).
        """
        url = urljoin(self.server, urljoin(self.base_url, path))

        # Query parameters: copy so the caller's dict is never mutated, and
        # tolerate an explicit ``params=None``.
        params = dict(request_kwargs.pop("params", None) or {})
        if use_api_key:
            params["api_key"] = self.api_key

        # Bug fix: caller-supplied headers now take precedence over the client
        # defaults; previously ``headers.update(self.headers)`` made it
        # impossible to override e.g. the user-agent per request.
        headers = {**self.headers, **(request_kwargs.pop("headers", None) or {})}

        timeout = request_kwargs.pop("timeout", self.timeout)

        # Compute the key once so the save path below cannot see an unbound name.
        cache_key: str | None = None
        if use_cache and self.cache is not None:
            cache_key = self.cache.get_key(url, params)
            try:
                return self.cache.load(cache_key)
            except (FileNotFoundError, TimeoutError):
                pass  # cache miss or expired entry: fall back to the network
            # any other cache error propagates naturally — no need to re-raise

        response = requests.request(
            method=method,
            url=url,
            params=params,
            headers=headers,
            timeout=timeout,
            **request_kwargs,
        )

        if cache_key is not None and self.cache is not None:
            self.cache.save(cache_key, response)

        return response

    def get(
        self,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> Response:
        """Send a GET request; caching and API-key use are opt-in."""
        return self.request(
            method="GET",
            path=path,
            use_cache=use_cache,
            use_api_key=use_api_key,
            **request_kwargs,
        )

    def post(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> Response:
        """Send a POST request; always authenticated, never cached."""
        return self.request(
            method="POST",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )

    def delete(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> Response:
        """Send a DELETE request; always authenticated, never cached."""
        return self.request(
            method="DELETE",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )
Empty file added openml/_api/clients/minio.py
Empty file.
61 changes: 61 additions & 0 deletions openml/_api/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from __future__ import annotations

from dataclasses import dataclass
from enum import Enum


class DelayMethod(str, Enum):
    """Identifier for the request-pacing strategy.

    NOTE(review): stored on ``ConnectionConfig``/``HTTPClient`` but not
    consulted in the visible request logic — confirm intended semantics.
    """

    HUMAN = "human"
    ROBOT = "robot"


@dataclass
class APIConfig:
    """Connection settings for one version of the OpenML REST API."""

    server: str  # base server URL, e.g. "https://www.openml.org/"
    base_url: str  # path prefix joined onto ``server``, e.g. "api/v1/xml/"
    api_key: str  # key sent as the ``api_key`` query parameter
    timeout: int = 10  # seconds
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Add a unit suffix (timeout_seconds) so the unit is clear without navigating to the source.

ps. I also considered typing it as datetime.timedelta but considering you probably only use it in seconds and there is a real risk of developers erroneously using datetime.timedelta.seconds instead of datetime.timedelta.total_seconds(), I think keeping it an integer is better.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense



@dataclass
class APISettings:
    """Per-version API configuration bundle."""

    v1: APIConfig  # XML-based v1 API
    v2: APIConfig  # v2 API


@dataclass
class ConnectionConfig:
    """Retry/backoff settings shared across API versions."""

    retries: int = 3  # retry attempts; NOTE(review): no retry loop visible yet
    delay_method: DelayMethod = DelayMethod.HUMAN
    delay_time: int = 1  # seconds
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: here too, including the unit makes sense (delay_time_seconds)



@dataclass
class CacheConfig:
    """Filesystem cache settings."""

    # NOTE(review): default should respect XDG_CACHE_HOME; also the "~" is
    # not expanded here — confirm where expansion happens.
    dir: str = "~/.openml/cache"
Copy link
Collaborator

@PGijsbers PGijsbers Jan 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Default should continue to respect XDG_CACHE_HOME.

ttl: int = 60 * 60 * 24 * 7 # one week
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Considering the TTL of the HTTP standard is already defined in seconds, maybe it is fine to exclude it in the variable name? Though as noted above there is a discussion to be had about having this as a cache level property in the first place.
For future reference, setting the value to timedelta(weeks=1).total_seconds() is preferred over the arithmetic+comment.



@dataclass
class Settings:
    """Top-level aggregate of all configuration sections."""

    api: APISettings
    connection: ConnectionConfig
    cache: CacheConfig


settings = Settings(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move the settings to the individual classes. I think this design introduces too high coupling of the classes to this file. You cannot move the classes around, or add a new API version without making non-extensible changes to this file here - because APISettings will require a constructor change and new classes it accepts.

Instead, a better design is to apply the strategy pattern cleanly to the different API definitions - v1 and v2 - and move the config either to their __init__, or a set_config (or similar) method.

api=APISettings(
v1=APIConfig(
server="https://www.openml.org/",
base_url="api/v1/xml/",
api_key="...",
),
v2=APIConfig(
server="http://127.0.0.1:8001/",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be hardcoded? I guess this is just for your local development

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are hard-coded default values. The local endpoints will be replaced by the remote server URLs when deployed — hopefully before this is merged into main.

base_url="",
api_key="...",
),
),
connection=ConnectionConfig(),
cache=CacheConfig(),
)
4 changes: 4 additions & 0 deletions openml/_api/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from openml._api.resources.datasets import DatasetsV1, DatasetsV2
from openml._api.resources.tasks import TasksV1, TasksV2

__all__ = ["DatasetsV1", "DatasetsV2", "TasksV1", "TasksV2"]
31 changes: 31 additions & 0 deletions openml/_api/resources/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from requests import Response

from openml._api.http import HTTPClient
from openml.datasets.dataset import OpenMLDataset
from openml.tasks.task import OpenMLTask


class ResourceAPI:
    """Base class for concrete resource APIs; stores the shared HTTP client."""

    def __init__(self, http: HTTPClient) -> None:
        self._http = http


class DatasetsAPI(ResourceAPI, ABC):
    """Abstract interface for dataset endpoints."""

    @abstractmethod
    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From an API design perspective, I am not sure what the usecase for the user is to want access to the requests.Response. The only case I can think of is to parse the data itself but if the user wants to do that I reckon we failed in our API design? I will be the first to admit that error handling needs to be improved (both on the server and client side), but I don't think this makes sense. Am I missing something?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment of course applies not just to the DatasetsAPI, but all of the resource apis

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes that is removed now, the user cannot access the response object, as it's not needed either



class TasksAPI(ResourceAPI, ABC):
    """Abstract interface for task endpoints.

    NOTE(review): unlike ``DatasetsAPI.get``, this signature exposes a
    ``return_response`` flag — the two interfaces should probably agree.
    """

    @abstractmethod
    def get(
        self,
        task_id: int,
        *,
        return_response: bool = False,
    ) -> OpenMLTask | tuple[OpenMLTask, Response]: ...
20 changes: 20 additions & 0 deletions openml/_api/resources/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from openml._api.resources.base import DatasetsAPI

if TYPE_CHECKING:
    from requests import Response
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In production this would be requests, right? You used responses for the mocking here during development.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this should be `requests` — I'll fix it.


from openml.datasets.dataset import OpenMLDataset


class DatasetsV1(DatasetsAPI):
    """v1 (XML) implementation of the dataset endpoints — stub, not yet implemented."""

    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
        raise NotImplementedError


class DatasetsV2(DatasetsAPI):
    """v2 (JSON) implementation of the dataset endpoints — stub, not yet implemented."""

    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
        raise NotImplementedError
Loading