diff --git a/openml/extensions/base/__init__.py b/openml/extensions/base/__init__.py new file mode 100644 index 000000000..ed5289c48 --- /dev/null +++ b/openml/extensions/base/__init__.py @@ -0,0 +1,11 @@ +# License: BSD 3-Clause + +"""Base classes for OpenML extensions.""" + +from openml.extensions.base._executor import ModelExecutor +from openml.extensions.base._serializer import ModelSerializer + +__all__ = [ + "ModelExecutor", + "ModelSerializer", +] diff --git a/openml/extensions/base/_executor.py b/openml/extensions/base/_executor.py new file mode 100644 index 000000000..379fe1da4 --- /dev/null +++ b/openml/extensions/base/_executor.py @@ -0,0 +1,137 @@ +# License: BSD 3-Clause + +"""Base class for estimator executors.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import numpy as np + import scipy.sparse + + from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration + from openml.tasks.task import OpenMLTask + + +class ModelExecutor(ABC): + """Define runtime execution semantics for a specific API type.""" + + @classmethod + @abstractmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model can be handled by this extension. + + This is typically done by checking the type of the model, or the package it belongs to. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def seed_model(self, model: Any, seed: int | None) -> Any: + """Set the seed of all the unseeded components of a model and return the seeded model. + + Required so that all seed information can be uploaded to OpenML for reproducible results. 
+ + Parameters + ---------- + model : Any + The model to be seeded + seed : int or None + + Returns + ------- + model + """ + + @abstractmethod + def _run_model_on_fold( # noqa: PLR0913 + self, + model: Any, + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix, + rep_no: int, + fold_no: int, + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | None = None, + ) -> tuple[np.ndarray, np.ndarray | None, OrderedDict[str, float], OpenMLRunTrace | None]: + """Run a model on a (repeat, fold) split of the task. + + Returns the data that is necessary to construct the OpenML Run object. Is used by + :func:`openml.runs.run_flow_on_task`. + + Parameters + ---------- + model : Any + The UNTRAINED model to run. The model instance will be copied and not altered. + task : OpenMLTask + The task to run the model on. + X_train : array-like + Training data for the given repetition and fold. + rep_no : int + The repeat of the experiment (0-based; in case of 1 time CV, always 0) + fold_no : int + The fold nr of the experiment (0-based; in case of holdout, always 0) + y_train : Optional[np.ndarray] (default=None) + Target attributes for supervised tasks. In case of classification, these are integer + indices to the potential classes specified by dataset. + X_test : Optional, array-like (default=None) + Test attributes to test for generalization in supervised tasks. + + Returns + ------- + predictions : np.ndarray + Model predictions. + probabilities : Optional, np.ndarray + Predicted probabilities (only applicable for supervised classification tasks). + user_defined_measures : OrderedDict[str, float] + User defined measures that were generated on this fold. + trace : Optional, OpenMLRunTrace + Hyperparameter optimization trace (only applicable for supervised tasks with + hyperparameter optimization). 
+ """ + + @abstractmethod + def check_if_model_fitted(self, model: Any) -> bool: + """Returns True/False denoting if the model has already been fitted/trained. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + # Abstract methods for hyperparameter optimization + + @abstractmethod + def instantiate_model_from_hpo_class( + self, + model: Any, + trace_iteration: OpenMLTraceIteration, + ) -> Any: + """Instantiate a base model which can be searched over by the hyperparameter optimization + model. + + Parameters + ---------- + model : Any + A hyperparameter optimization model which defines the model to be instantiated. + trace_iteration : OpenMLTraceIteration + Describes the hyperparameter settings to instantiate. + + Returns + ------- + Any + """ + # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! diff --git a/openml/extensions/base/_serializer.py b/openml/extensions/base/_serializer.py new file mode 100644 index 000000000..e50296213 --- /dev/null +++ b/openml/extensions/base/_serializer.py @@ -0,0 +1,102 @@ +# License: BSD 3-Clause + +"""Base class for estimator serializers.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +class ModelSerializer(ABC): + """Handle the conversion between estimator instances and OpenML Flows.""" + + @classmethod + @abstractmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model can be handled by this extension. + + This is typically done by checking the type of the model, or the package it belongs to. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def model_to_flow(self, model: Any) -> OpenMLFlow: + """Transform a model to a flow for uploading it to OpenML. 
+ + Parameters + ---------- + model : Any + + Returns + ------- + OpenMLFlow + """ + + @abstractmethod + def flow_to_model( + self, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT002 + strict_version: bool = True, # noqa: FBT002 + ) -> Any: + """Instantiate a model from the flow representation. + + Parameters + ---------- + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a model with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + Any + """ + + @abstractmethod + def get_version_information(self) -> list[str]: + """Return dependency and version information.""" + + @abstractmethod + def obtain_parameter_values( + self, + flow: OpenMLFlow, + model: Any = None, + ) -> list[dict[str, Any]]: + """Extract all parameter settings required for the flow from the model. + + If no explicit model is provided, the parameters will be extracted from `flow.model` + instead. + + Parameters + ---------- + flow : OpenMLFlow + OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) + + model : Any, optional (default=None) + The model from which to obtain the parameter values. Must match the flow signature. + If None, use the model specified in ``OpenMLFlow.model``. 
+ + Returns + ------- + list + A list of dicts, where each dict has the following entries: + - ``oml:name`` : str: The OpenML parameter name + - ``oml:value`` : mixed: A representation of the parameter value + - ``oml:component`` : int: flow id to which the parameter belongs + """ diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py new file mode 100644 index 000000000..c65e063a2 --- /dev/null +++ b/openml/extensions/registry.py @@ -0,0 +1,96 @@ +# License: BSD 3-Clause + +"""Extension registries for serializers and executors.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from openml.exceptions import PyOpenMLError + +if TYPE_CHECKING: + from openml.extensions.base import ModelExecutor, ModelSerializer + + +SERIALIZER_REGISTRY: list[type[ModelSerializer]] = [] +EXECUTOR_REGISTRY: list[type[ModelExecutor]] = [] + + +def register_serializer(cls: type[ModelSerializer]) -> type[ModelSerializer]: + """Register a serializer class.""" + SERIALIZER_REGISTRY.append(cls) + return cls + + +def register_executor(cls: type[ModelExecutor]) -> type[ModelExecutor]: + """Register an executor class.""" + EXECUTOR_REGISTRY.append(cls) + return cls + + +def resolve_serializer(estimator: Any) -> ModelSerializer: + """ + Identify and return the appropriate serializer for a given estimator. + + Parameters + ---------- + estimator : Any + The estimator instance (e.g., sklearn estimator, sktime estimator). + + Returns + ------- + ModelSerializer + An instance of the matching serializer. + + Raises + ------ + PyOpenMLError + If no serializer supports the estimator or if multiple serializers match. 
+ """ + matches = [ + serializer_cls + for serializer_cls in SERIALIZER_REGISTRY + if serializer_cls.can_handle_model(estimator) + ] + + if len(matches) == 1: + return matches[0]() + + if len(matches) > 1: + raise PyOpenMLError("Multiple serializers support this estimator.") + + raise PyOpenMLError("No serializer supports this estimator.") + + +def resolve_executor(estimator: Any) -> ModelExecutor: + """ + Identify and return the appropriate executor for a given estimator. + + Parameters + ---------- + estimator : Any + The estimator instance. + + Returns + ------- + ModelExecutor + An instance of the matching executor. + + Raises + ------ + PyOpenMLError + If no executor supports the estimator or if multiple executors match. + """ + matches = [ + executor_cls + for executor_cls in EXECUTOR_REGISTRY + if executor_cls.can_handle_model(estimator) + ] + + if len(matches) == 1: + return matches[0]() + + if len(matches) > 1: + raise PyOpenMLError("Multiple executors support this estimator.") + + raise PyOpenMLError("No executor supports this estimator.") diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py index d455249de..752c32317 100644 --- a/openml/flows/__init__.py +++ b/openml/flows/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .flow import OpenMLFlow -from .functions import ( +from openml.flows.flow import OpenMLFlow +from openml.flows.functions import ( assert_flows_equal, delete_flow, flow_exists, @@ -9,12 +9,15 @@ get_flow_id, list_flows, ) +from openml.flows.utils import estimator_to_flow, flow_to_estimator __all__ = [ "OpenMLFlow", "assert_flows_equal", "delete_flow", + "estimator_to_flow", "flow_exists", + "flow_to_estimator", "get_flow", "get_flow_id", "list_flows", diff --git a/openml/flows/utils.py b/openml/flows/utils.py new file mode 100644 index 000000000..e87337de2 --- /dev/null +++ b/openml/flows/utils.py @@ -0,0 +1,60 @@ +# License: BSD 3-Clause + +"""Utility functions for OpenML extensions.""" + +from __future__ import 
annotations + +from typing import TYPE_CHECKING, Any + +from openml.extensions.registry import resolve_serializer + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +def flow_to_estimator( + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT002 + strict_version: bool = True, # noqa: FBT002 +) -> Any: + """Instantiate a model from the flow representation. + + Parameters + ---------- + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a model with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + estimator_instance : Any + The corresponding estimator instance. + """ + serializer = resolve_serializer(flow) + return serializer.flow_to_model( + flow, + initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version, + ) + + +def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: + """Convert an estimator instance to an OpenML flow. + + Parameters + ---------- + estimator_instance : Any + The estimator instance to convert. + + Returns + ------- + flow : openml.flows.OpenMLFlow + The corresponding OpenML flow. 
+ """ + serializer = resolve_serializer(estimator_instance) + return serializer.model_to_flow(estimator_instance) diff --git a/tests/test_extensions/test_base.py b/tests/test_extensions/test_base.py new file mode 100644 index 000000000..7297834a6 --- /dev/null +++ b/tests/test_extensions/test_base.py @@ -0,0 +1,93 @@ +# License: BSD 3-Clause + +"""Test OpenML extension base classes and registry.""" + +import pytest +from collections import OrderedDict + +from openml.exceptions import PyOpenMLError +from openml.extensions.base import ( + ModelSerializer, + ModelExecutor, +) +from openml.extensions.registry import resolve_serializer, resolve_executor + + +class TestModelSerializer: + """Test ModelSerializer abstract base class.""" + + def test_is_abstract(self): + """ModelSerializer should not be instantiable.""" + with pytest.raises(TypeError): + ModelSerializer() # noqa: B024 + + class DummySerializer(ModelSerializer): + @classmethod + def can_handle_model(cls, model): + return True + + def model_to_flow(self, model): + return "dummy_flow" + + def flow_to_model(self, flow, initialize_with_defaults=False, strict_version=True): + return "dummy_model" + + def get_version_information(self): + return ["dummy>=0.1"] + + def obtain_parameter_values(self, flow, model=None): + return [] + + def test_concrete_implementation(self): + serializer = self.DummySerializer() + + assert serializer.can_handle_model(object()) is True + assert serializer.model_to_flow("model") == "dummy_flow" + assert serializer.flow_to_model("flow") == "dummy_model" + assert serializer.get_version_information() == ["dummy>=0.1"] + + +class TestModelExecutor: + """Test ModelExecutor abstract base class.""" + + def test_is_abstract(self): + """ModelExecutor should not be instantiable.""" + with pytest.raises(TypeError): + ModelExecutor() # noqa: B024 + + class DummyExecutor(ModelExecutor): + @classmethod + def can_handle_model(cls, model): + return True + + def seed_model(self, model, seed): + return 
model + + def _run_model_on_fold( + self, + model, + task, + X_train, + rep_no, + fold_no, + y_train=None, + X_test=None, + ): + return ( + [], # predictions + None, # probabilities + OrderedDict(), # user_defined_measures + None, # trace + ) + + def check_if_model_fitted(self, model): + return False + + def instantiate_model_from_hpo_class(self, model, trace_iteration): + return model + + def test_concrete_implementation(self): + executor = self.DummyExecutor() + + assert executor.seed_model("model", 42) == "model" + assert executor.check_if_model_fitted("model") is False