# autogoal.ml - automl

import io
import pickle
import statistics

import numpy as np
from autogoal.contrib import find_classes

from autogoal.kb import build_pipeline_graph, SemanticType
from autogoal.ml.metrics import accuracy
from autogoal.search import PESearch
from autogoal.utils import nice_repr


@nice_repr
class AutoML:
    """
    Predefined pipeline search with automatic type inference.

    An `AutoML` instance represents a general-purpose machine learning
    algorithm, that can be applied to any input and output.

    Parameters:
        input: semantic type of the input data; inferred from `X` with
            `SemanticType.infer` during `fit` when not given.
        output: semantic type of the output data; inferred from `y` with
            `SemanticType.infer` during `fit` when not given.
        random_state: seed applied to NumPy's global RNG and forwarded to
            the search algorithm. `None` means no seeding.
        search_algorithm: search class to instantiate (defaults to `PESearch`).
        search_iterations: number of iterations passed to the search `run`.
        include_filter: `include` pattern forwarded to `find_classes`.
        exclude_filter: `exclude` pattern forwarded to `find_classes`.
        validation_split: fraction of the samples held out for validation
            in every cross-validation step.
        errors: error policy forwarded verbatim to the search algorithm.
        cross_validation: name of the function of the `statistics` module
            used to aggregate the per-step scores (e.g. `"median"`, `"mean"`).
        cross_validation_steps: number of random train/validation splits
            evaluated per pipeline.
        registry: explicit collection of algorithms; when falsy the registry
            is discovered with `find_classes`.
        score_metric: callable `(y_true, y_pred) -> score` (defaults to
            `accuracy`).
        **search_kwargs: extra keyword arguments forwarded to the search
            algorithm constructor.
    """

    def __init__(
        self,
        input=None,
        output=None,
        random_state=None,
        search_algorithm=None,
        search_iterations=100,
        include_filter=".*",
        exclude_filter=None,
        validation_split=0.3,
        errors="warn",
        cross_validation="median",
        cross_validation_steps=3,
        registry=None,
        score_metric=None,
        **search_kwargs
    ):
        self.input = input
        self.output = output
        self.search_algorithm = search_algorithm or PESearch
        self.search_iterations = search_iterations
        self.include_filter = include_filter
        self.exclude_filter = exclude_filter
        self.validation_split = validation_split
        self.errors = errors
        self.cross_validation = cross_validation
        self.cross_validation_steps = cross_validation_steps
        self.registry = registry
        self.random_state = random_state
        self.score_metric = score_metric or accuracy
        self.search_kwargs = search_kwargs
        self._unpickled = False

        # `0` is a perfectly valid seed, so test against None instead of
        # relying on truthiness (which would silently skip seeding for 0).
        if random_state is not None:
            np.random.seed(random_state)

    def _check_fitted(self):
        """Raise `TypeError` unless `fit` has already stored a best pipeline."""
        if not hasattr(self, "best_pipeline_"):
            raise TypeError(
                "This operation cannot be performed on an unfitted AutoML instance. Call `fit` first."
            )

    def make_pipeline_builder(self):
        """
        Build the pipeline graph connecting `self.input` to `self.output`.

        Uses `self.registry` when provided; otherwise the candidate classes
        are discovered with `find_classes` and the include/exclude filters.
        """
        registry = self.registry or find_classes(
            include=self.include_filter, exclude=self.exclude_filter
        )

        return build_pipeline_graph(
            input_types=self.input, output_type=self.output, registry=registry,
        )

    def fit(self, X, y, **kwargs):
        """
        Search for the best pipeline on `(X, y)` and train it on the full data.

        Input and output types are inferred from the data when they were not
        given to the constructor. The result of the search is stored in
        `best_pipeline_` and `best_score_`. Extra `kwargs` are forwarded to
        the search algorithm's `run` method.
        """
        self.input = self._input_type(X)
        self.output = self._output_type(y)

        search = self.search_algorithm(
            self.make_pipeline_builder(),
            self.make_fitness_fn(X, y),
            random_state=self.random_state,
            errors=self.errors,
            **self.search_kwargs,
        )

        self.best_pipeline_, self.best_score_ = search.run(
            self.search_iterations, **kwargs
        )

        # The search only trains on partial splits; refit on everything.
        self.fit_pipeline(X, y)

    def fit_pipeline(self, X, y):
        """Train the best pipeline found on `(X, y)` and leave it in eval mode."""
        self._check_fitted()

        self.best_pipeline_.send("train")
        self.best_pipeline_.run(X, y)
        self.best_pipeline_.send("eval")

    def save(self, fp: io.BytesIO):
        """
        Serializes the AutoML instance.

        The instance must be fitted; it is pickled into the binary
        file-like object `fp`.
        """
        self._check_fitted()
        pickle.Pickler(fp).dump(self)

    @classmethod
    def load(cls, fp: io.FileIO) -> "AutoML":
        """
        Deserializes an AutoML instance.

        After deserialization, the best pipeline found is ready to predict.

        Raises:
            ValueError: if the unpickled object is not an `AutoML` instance.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the stream.
        # Only load files that come from a trusted source; the isinstance
        # check below runs *after* deserialization and is not a safeguard.
        automl = pickle.Unpickler(fp).load()

        if not isinstance(automl, AutoML):
            raise ValueError("The serialized file does not contain an AutoML instance.")

        return automl

    def score(self, X, y):
        """Predict on `X` with the best pipeline and score against `y`."""
        self._check_fitted()

        # NOTE(review): `predict` and the fitness function pass `None` as the
        # second pipeline argument, while here a zero placeholder shaped like
        # `y` is passed -- confirm whether this difference is intentional.
        y_pred = self.best_pipeline_.run(X, np.zeros_like(y))
        return self.score_metric(y, y_pred)

    def _input_type(self, X):
        """Return the configured input type, or infer it from `X`."""
        return self.input or SemanticType.infer(X)

    def _output_type(self, y):
        """Return the configured output type, or infer it from `y`."""
        return self.output or SemanticType.infer(y)

    def _validation_size(self, n_samples):
        """
        Return how many of `n_samples` are held out for validation.

        Raises:
            ValueError: if the split would leave the train or the validation
                partition empty.
        """
        n_test = int(self.validation_split * n_samples)

        if not 0 < n_test < n_samples:
            raise ValueError(
                "validation_split=%r yields an empty train or validation "
                "partition for %d samples." % (self.validation_split, n_samples)
            )

        return n_test

    def make_fitness_fn(self, X, y):
        """
        Build the fitness function used by the search algorithm.

        The returned callable receives a pipeline, evaluates it on
        `cross_validation_steps` random train/validation splits of `(X, y)`
        and aggregates the scores with the `statistics` function named by
        `cross_validation`.

        Raises:
            ValueError: if `validation_split` would leave an empty partition.
        """
        y = np.asarray(y)

        # Loop-invariant facts about X, computed once.
        x_is_list = isinstance(X, list)
        n_samples = len(X) if x_is_list else X.shape[0]

        # Fail fast here: errors raised inside the fitness function may be
        # handled by the search according to its `errors` policy.
        self._validation_size(n_samples)

        def fitness_fn(pipeline):
            scores = []

            for _ in range(self.cross_validation_steps):
                indices = np.arange(0, n_samples)
                np.random.shuffle(indices)

                # Slice with a positive bound: a negative-index split such as
                # `indices[:-k]` silently inverts the partitions when k == 0.
                n_train = n_samples - self._validation_size(n_samples)
                train_indices = indices[:n_train]
                test_indices = indices[n_train:]

                if x_is_list:
                    # Plain lists do not support fancy indexing.
                    X_train = [X[i] for i in train_indices]
                    X_test = [X[i] for i in test_indices]
                else:
                    X_train = X[train_indices]
                    X_test = X[test_indices]

                y_train = y[train_indices]
                y_test = y[test_indices]

                pipeline.send("train")
                pipeline.run(X_train, y_train)
                pipeline.send("eval")
                y_pred = pipeline.run(X_test, None)
                scores.append(self.score_metric(y_test, y_pred))

            return getattr(statistics, self.cross_validation)(scores)

        return fitness_fn

    def predict(self, X):
        """Run the best pipeline found on `X` and return its output."""
        self._check_fitted()

        return self.best_pipeline_.run(X, None)