diff --git a/openml/base.py b/openml/base.py index a282be8eb..62de13f96 100644 --- a/openml/base.py +++ b/openml/base.py @@ -1,26 +1,22 @@ # License: BSD 3-Clause from __future__ import annotations -import re import webbrowser from abc import ABC, abstractmethod -from collections.abc import Iterable, Sequence +from collections.abc import Sequence import xmltodict import openml._api_calls import openml.config +from openml.utils import ReprMixin from .utils import _get_rest_api_type_alias, _tag_openml_base -class OpenMLBase(ABC): +class OpenMLBase(ReprMixin, ABC): """Base object for functionality that is shared across entities.""" - def __repr__(self) -> str: - body_fields = self._get_repr_body_fields() - return self._apply_repr_template(body_fields) - @property @abstractmethod def id(self) -> int | None: @@ -60,34 +56,6 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | N """ # Should be implemented in the base class. - def _apply_repr_template( - self, - body_fields: Iterable[tuple[str, str | int | list[str] | None]], - ) -> str: - """Generates the header and formats the body for string representation of the object. - - Parameters - ---------- - body_fields: List[Tuple[str, str]] - A list of (name, value) pairs to display in the body of the __repr__. - """ - # We add spaces between capitals, e.g. 
ClassificationTask -> Classification Task - name_with_spaces = re.sub( - r"(\w)([A-Z])", - r"\1 \2", - self.__class__.__name__[len("OpenML") :], - ) - header_text = f"OpenML {name_with_spaces}" - header = f"{header_text}\n{'=' * len(header_text)}\n" - - _body_fields: list[tuple[str, str | int | list[str]]] = [ - (k, "None" if v is None else v) for k, v in body_fields - ] - longest_field_name_length = max(len(name) for name, _ in _body_fields) - field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" - body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) - return header + body - @abstractmethod def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self. diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index 0598763b0..2edc2a091 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -7,8 +7,10 @@ if TYPE_CHECKING: from IPython.lib import pretty +from openml.utils import ReprMixin -class OpenMLDataFeature: # noqa: PLW1641 + +class OpenMLDataFeature(ReprMixin): """ Data Feature (a.k.a. Attribute) object. 
@@ -74,11 +76,35 @@ def __init__( # noqa: PLR0913 self.number_missing_values = number_missing_values self.ontologies = ontologies - def __repr__(self) -> str: - return f"[{self.index} - {self.name} ({self.data_type})]" + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" + fields: dict[str, int | str | None] = { + "Index": self.index, + "Name": self.name, + "Data Type": self.data_type, + } + + order = [ + "Index", + "Name", + "Data Type", + ] + return [(key, fields[key]) for key in order if key in fields] def __eq__(self, other: Any) -> bool: return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__ + def __hash__(self) -> int: + return hash( + ( + self.index, + self.name, + self.data_type, + tuple(self.nominal_values) if self.nominal_values is not None else None, + self.number_missing_values, + tuple(self.ontologies) if self.ontologies is not None else None, + ) + ) + def _repr_pretty_(self, pp: pretty.PrettyPrinter, cycle: bool) -> None: # noqa: ARG002 pp.text(str(self)) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 0960ad4c1..82057f009 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -1,13 +1,15 @@ # License: BSD 3-Clause from __future__ import annotations +from collections.abc import Sequence from typing import Any import openml.config import openml.flows +from openml.utils import ReprMixin -class OpenMLSetup: +class OpenMLSetup(ReprMixin): """Setup object (a.k.a. Configuration). 
def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
    """Return the (label, value) rows shown in the ``__repr__`` body.

    Rows appear in the order: setup id, flow id, flow URL, parameter count.
    The parameter count is the string ``"nan"`` when no parameters are set.
    """
    flow_url = openml.flows.OpenMLFlow.url_for_id(self.flow_id)

    n_parameters: int | str
    if self.parameters is not None:
        n_parameters = len(self.parameters)
    else:
        n_parameters = "nan"

    # The list order is the order in which the information will be printed.
    return [
        ("Setup ID", self.setup_id),
        ("Flow ID", self.flow_id),
        ("Flow URL", flow_url),
        ("# of Parameters", n_parameters),
    ]
Parameters @@ -123,11 +116,9 @@ def _to_dict(self) -> dict[str, Any]: "value": self.value, } - def __repr__(self) -> str: - header = "OpenML Parameter" - header = f"{header}\n{'=' * len(header)}\n" - - fields = { + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" + fields: dict[str, int | str | None] = { "ID": self.id, "Flow ID": self.flow_id, # "Flow Name": self.flow_name, @@ -156,9 +147,4 @@ def __repr__(self) -> str: parameter_default, parameter_value, ] - _fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, _ in _fields) - field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" - body = "\n".join(field_line_format.format(name, value) for name, value in _fields) - return header + body + return [(key, fields[key]) for key in order if key in fields] diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 464e41b2a..2ae683297 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -3,6 +3,7 @@ import pickle from collections import OrderedDict +from collections.abc import Sequence from pathlib import Path from typing import Any from typing_extensions import NamedTuple @@ -10,6 +11,8 @@ import arff # type: ignore import numpy as np +from openml.utils import ReprMixin + class Split(NamedTuple): """A single split of a dataset.""" @@ -18,7 +21,7 @@ class Split(NamedTuple): test: np.ndarray -class OpenMLSplit: # noqa: PLW1641 +class OpenMLSplit(ReprMixin): """OpenML Split object. 
This class manages train-test splits for a dataset across multiple @@ -63,6 +66,22 @@ def __init__( self.folds = len(self.split[0]) self.samples = len(self.split[0][0]) + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" + fields = { + "Name": self.name, + "Description": ( + self.description if len(self.description) <= 80 else self.description[:77] + "..." + ), + "Repeats": self.repeats, + "Folds": self.folds, + "Samples": self.samples, + } + + order = ["Name", "Description", "Repeats", "Folds", "Samples"] + + return [(key, fields[key]) for key in order if key in fields] + def __eq__(self, other: Any) -> bool: if ( (not isinstance(self, type(other))) @@ -90,6 +109,29 @@ def __eq__(self, other: Any) -> bool: return False return True + def __hash__(self) -> int: + split_items = [] + for repetition in sorted(self.split): + for fold in sorted(self.split[repetition]): + for sample in sorted(self.split[repetition][fold]): + train, test = self.split[repetition][fold][sample] + split_items.append( + ( + repetition, + fold, + sample, + hash(train.tobytes()), + hash(test.tobytes()), + ) + ) + return hash( + ( + self.name, + self.description, + tuple(split_items), + ) + ) + @classmethod def _from_arff_file(cls, filename: Path) -> OpenMLSplit: # noqa: C901, PLR0912 repetitions = None diff --git a/openml/utils.py b/openml/utils.py index 3680bc0ff..0fd665724 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -2,9 +2,11 @@ from __future__ import annotations import contextlib +import re import shutil import warnings -from collections.abc import Callable, Mapping, Sized +from abc import ABC, abstractmethod +from collections.abc import Callable, Iterable, Mapping, Sequence, Sized from functools import wraps from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, TypeVar, overload @@ -470,3 +472,57 @@ def update(self, length: int) -> None: 
class ReprMixin(ABC):
    """A mixin class that provides a customizable string representation for OpenML objects.

    This mixin standardizes the __repr__ output format across OpenML classes:
    a header derived from the class name, underlined with ``=``, followed by
    one ``name: value`` line per field. Classes inheriting from this mixin
    should implement the _get_repr_body_fields method to specify which fields
    to display.
    """

    def __repr__(self) -> str:
        body_fields = self._get_repr_body_fields()
        return self._apply_repr_template(body_fields)

    @abstractmethod
    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
        """Collect all information to display in the __repr__ body.

        Returns
        -------
        body_fields : Sequence[Tuple[str, Union[str, int, List[str], None]]]
            A sequence of (name, value) pairs to display in the body of the __repr__.
            E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
            A value of ``None`` is displayed as the text ``None``. Any other value,
            including a list, is rendered on a single line with default string
            formatting.
        """
        # Must be implemented by the concrete subclass.

    def _apply_repr_template(
        self,
        body_fields: Iterable[tuple[str, str | int | list[str] | None]],
    ) -> str:
        """Generates the header and formats the body for string representation of the object.

        Parameters
        ----------
        body_fields: Iterable[Tuple[str, Union[str, int, List[str], None]]]
            (name, value) pairs to display in the body of the __repr__.

        Returns
        -------
        str
            The header followed by one line per field. Field names are padded
            with ``.`` to the length of the longest name. When there are no
            fields, only the header is returned.
        """
        class_name = self.__class__.__name__
        # Only strip the "OpenML" prefix when it is actually present, so a class
        # named differently keeps its full name instead of losing six characters.
        if class_name.startswith("OpenML"):
            class_name = class_name[len("OpenML") :]

        # We add spaces between capitals, e.g. ClassificationTask -> Classification Task
        name_with_spaces = re.sub(r"(\w)([A-Z])", r"\1 \2", class_name)
        header_text = f"OpenML {name_with_spaces}"
        header = f"{header_text}\n{'=' * len(header_text)}\n"

        # Materialize once: ``body_fields`` may be a one-shot iterable and is
        # traversed twice below (width computation, then formatting).
        _body_fields: list[tuple[str, str | int | list[str]]] = [
            (k, "None" if v is None else v) for k, v in body_fields
        ]
        if not _body_fields:
            # ``max()`` of an empty sequence raises ValueError; an object without
            # fields is represented by its header alone.
            return header

        longest_field_name_length = max(len(name) for name, _ in _body_fields)
        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
        body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields)
        return header + body