eval_bml

ResourceMonitor

A context manager for monitoring resource usage.

Parameters:

    name (str, optional): A description of the resource usage. Defaults to None.

Raises:

    ResourceMonitorError: If the resource monitor is already tracing memory usage.

Returns:

    ResourceMonitor: A ResourceMonitor object.

Examples:

>>> import time
>>> from spotriver.evaluation.eval_bml import ResourceMonitor
>>> with ResourceMonitor() as rm:
...     time.sleep(1)
...     print(rm.result())
Resource usage:
    Time [s]: 1.000000001
    Memory [b]: 0.0
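Note that r_time and memory are only filled in when the context manager exits, so result() is normally read after the with block has closed; this is also how the eval_* functions in this module use the monitor internally. A minimal sketch of that pattern (the name string and the sleep workload are purely illustrative):

    import time
    from spotriver.evaluation.eval_bml import ResourceMonitor

    rm = ResourceMonitor(name="toy example")  # illustrative name only
    with rm:
        time.sleep(0.1)  # the workload to be monitored goes here
    usage = rm.result()  # r_time and memory are populated in __exit__
    print(usage)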
Source code in spotriver/evaluation/eval_bml.py (lines 37-103):
class ResourceMonitor:
    """
    A context manager for monitoring resource usage.

    Args:
        name (str, optional): A description of the resource usage. Defaults to None.

    Raises:
        (ResourceMonitorError): If the resource monitor is already tracing memory usage.

    Returns:
        (ResourceMonitor): A ResourceMonitor object.

    Examples:
        >>> import time
        >>> from spotriver.evaluation.eval_bml import ResourceMonitor
        >>> with ResourceMonitor() as rm:
        ...     time.sleep(1)
        ...     print(rm.result())
        Resource usage:
            Time [s]: 1.000000001
            Memory [b]: 0.0
    """

    def __init__(self, name: Optional[str] = None):
        self.name = name
        self.r_time = None
        self.memory = None
        self.current_memory = None
        self.peak_memory = None
        self._start = None

    def __enter__(self):
        if tracemalloc.is_tracing():
            raise ResourceMonitorError("Already tracing memory usage!")
        tracemalloc.start()
        tracemalloc.reset_peak()
        self._start = time.perf_counter_ns()

    def __exit__(self, type, value, traceback):
        self.r_time = (time.perf_counter_ns() - self._start) / 1.0e9
        _, peak = tracemalloc.get_traced_memory()
        self.memory = peak / (1024 * 1024)
        tracemalloc.stop()

    def result(self):
        """Returns a ResourceUsage object with the results of the resource monitor.

        Raises:
            (ResourceMonitorError): If the resource monitor has not been used yet.

        Returns:
            (ResourceUsage): A ResourceUsage object.

        Examples:
            >>> import time
            >>> from spotriver.evaluation.eval_bml import ResourceMonitor
            >>> with ResourceMonitor() as rm:
            ...     time.sleep(1)
            ...     print(rm.result())
            Resource usage:
                Time [s]: 1.000000001
                Memory [b]: 0.0
        """
        if self.r_time is None or self.memory is None:
            raise ResourceMonitorError("No resources monitored yet.")
        return ResourceUsage(name=self.name, r_time=self.r_time, memory=self.memory)

result()

Returns a ResourceUsage object with the results of the resource monitor.

Raises:

    ResourceMonitorError: If the resource monitor has not been used yet.

Returns:

    ResourceUsage: A ResourceUsage object.

Examples:

>>> import time
>>> from spotriver.evaluation.eval_bml import ResourceMonitor
>>> with ResourceMonitor() as rm:
...     time.sleep(1)
...     print(rm.result())
Resource usage:
    Time [s]: 1.000000001
    Memory [b]: 0.0
Source code in spotriver/evaluation/eval_bml.py (lines 82-103):
def result(self):
    """Returns a ResourceUsage object with the results of the resource monitor.

    Raises:
        (ResourceMonitorError): If the resource monitor has not been used yet.

    Returns:
        (ResourceUsage): A ResourceUsage object.

    Examples:
        >>> import time
        >>> from spotriver.evaluation.eval_bml import ResourceMonitor
        >>> with ResourceMonitor() as rm:
        ...     time.sleep(1)
        ...     print(rm.result())
        Resource usage:
            Time [s]: 1.000000001
            Memory [b]: 0.0
    """
    if self.r_time is None or self.memory is None:
        raise ResourceMonitorError("No resources monitored yet.")
    return ResourceUsage(name=self.name, r_time=self.r_time, memory=self.memory)

eval_bml_horizon(model, train, test, target_column, horizon, include_remainder=True, metric=None)

Evaluate a machine learning model on a rolling horizon basis. The model is trained on the training data and then evaluated on the test data using a given evaluation metric. The evaluation results are returned as a tuple of two data frames: the first contains the evaluation metrics for each window, and the second contains the true and predicted values for each observation in the test set.

Parameters:

    model (object): The model to be evaluated.
    train (pd.DataFrame): The training data set.
    test (pd.DataFrame): The testing data set.
    target_column (str): The name of the column containing the target variable.
    horizon (int): The number of steps ahead to forecast.
    include_remainder (bool): Whether to include the remainder of the test dataframe if its length is not divisible by the horizon. Defaults to True.
    metric (object): An evaluation metric object that has an evaluate method. This metric will be used to evaluate the model's performance on the test dataset. Defaults to None.

Returns:

    tuple: A tuple of two data frames. The first one contains evaluation metrics for each window. The second one contains the true and predicted values for each observation in the test set.

Examples:

>>> from sklearn.linear_model import LinearRegression
>>> model = LinearRegression()
>>> train = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
>>> test = pd.DataFrame({"x": [4, 5], "y": [8, 10]})
>>> df_eval, df_true = eval_bml_horizon(model, train, test, "y", horizon=1)
>>> print(df_eval)
      Metric  Memory (MB)  CompTime (s)
0  0.000000          0.0           0.0
1  0.000000          0.0           0.0
...        ...          ...           ...
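The doctest above omits the metric argument, but eval_bml_horizon raises a ValueError when metric is None. A minimal sketch of a complete call with an sklearn metric (the toy data is illustrative only):

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error
    from spotriver.evaluation.eval_bml import eval_bml_horizon

    model = LinearRegression()
    train = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 8]})
    test = pd.DataFrame({"x": [5, 6, 7, 8], "y": [10, 12, 14, 16]})
    # horizon=2 splits the test set into two batches of two rows each
    df_eval, df_true = eval_bml_horizon(
        model, train, test, target_column="y", horizon=2, metric=mean_absolute_error
    )
    print(df_eval)  # row 0: cost of the initial fit; rows 1..n: one row per batch
    print(df_true)  # columns: y, Prediction, Difference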
Source code in spotriver/evaluation/eval_bml.py (lines 155-258):
def eval_bml_horizon(
    model: object,
    train: pd.DataFrame,
    test: pd.DataFrame,
    target_column: str,
    horizon: int,
    include_remainder: bool = True,
    metric: object = None,
) -> tuple:
    """
    Evaluate a machine learning model on a rolling horizon basis.
    This function evaluates a machine learning model on a rolling horizon basis.
    The model is trained on the training data and then evaluated on the test data
    using a given evaluation metric. The evaluation results are returned as a tuple
    of two data frames. The first one contains evaluation metrics for each window.
    The second one contains the true and predicted values for each observation in the test set.

    Args:
        model (object): The model to be evaluated.
        train (pd.DataFrame): The training data set.
        test (pd.DataFrame): The testing data set.
        target_column (str): The name of the column containing the target variable.
        horizon (int, optional): The number of steps ahead to forecast.
        include_remainder (bool):
            Whether to include the remainder of the test dataframe if its length
            is not divisible by the horizon. Defaults to True.
        metric (object):
            An evaluation metric object that has an `evaluate` method.
            This metric will be used to evaluate the model's performance on the test dataset.

    Returns:
        tuple: A tuple of two data frames.
        The first one contains evaluation metrics for each window.
        The second one contains the true and predicted values for each observation in the test set.

    Examples:
        >>> from sklearn.linear_model import LinearRegression
        >>> model = LinearRegression()
        >>> train = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
        >>> test = pd.DataFrame({"x": [4, 5], "y": [8, 10]})
        >>> df_eval, df_true = eval_bml_horizon(model, train, test, "y", horizon=1)
        >>> print(df_eval)
              Metric  Memory (MB)  CompTime (s)
        0  0.000000          0.0           0.0
        1  0.000000          0.0           0.0
        ...        ...          ...           ...

    """
    # Check if metric is None or null and raise ValueError if it is
    if metric is None:
        raise ValueError("The 'metric' parameter must not be None or null.")
    # Reset index of train and test dataframes
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    # Initialize lists for predictions and differences
    preds_list = []
    diffs_list = []
    # Fit the model on the training data
    rm = ResourceMonitor()
    with rm:
        try:
            model.fit(train.loc[:, train.columns != target_column], train[target_column])
        except Exception as e:
            print(f"Train data: {train}")
            print(f"An error occurred while fitting the model: {e}")
    # Evaluate the model on empty arrays to get initial resource usage
    df_eval = pd.DataFrame.from_dict(
        [evaluate_model(y_true=np.array([]), y_pred=np.array([]), memory=rm.memory, r_time=rm.r_time, metric=metric)]
    )
    # If include_remainder is False, remove remainder rows from test dataframe
    if include_remainder is False:
        remainder = len(test) % horizon
        if remainder > 0:
            test = test[:-remainder]
    # Evaluate the model on batches of size horizon from the test dataframe
    for batch_number, batch_df in test.groupby(np.arange(len(test)) // horizon):
        rm = ResourceMonitor()
        with rm:
            try:
                preds = model.predict(batch_df.loc[:, batch_df.columns != target_column])
            except Exception as e:
                print(f"Batch data: {batch_df}")
                print(f"An error occurred while predicting: {e}")
        diffs = batch_df[target_column].values - preds
        df_eval.loc[batch_number + 1] = pd.Series(
            evaluate_model(
                y_true=batch_df[target_column],
                y_pred=preds,
                memory=rm.memory,
                r_time=rm.r_time,
                metric=metric,
            )
        )
        # Append predictions and differences to their respective lists
        preds_list.append(preds)
        diffs_list.append(diffs)
    # Concatenate predictions and differences lists into series
    series_preds = pd.Series(np.concatenate(preds_list))
    series_diffs = pd.Series(np.concatenate(diffs_list))
    # Create a dataframe with true values and add columns for predictions and differences
    df_true = pd.DataFrame(test[target_column])
    df_true["Prediction"] = series_preds
    df_true["Difference"] = series_diffs
    return df_eval, df_true

eval_bml_landmark(model, train, test, target_column, horizon, include_remainder=True, metric=None)

Evaluate a machine learning model on a rolling landmark basis.

The model is trained on the training data and then evaluated on the test data using a given evaluation metric. The evaluation results are returned as a tuple of two data frames: the first contains the evaluation metrics for each window, and the second contains the true and predicted values for each observation in the test set.

Parameters:

    model (object): The model to be evaluated.
    train (pd.DataFrame): The training data set.
    test (pd.DataFrame): The testing data set.
    target_column (str): The name of the column containing the target variable.
    horizon (int): The number of steps ahead to forecast.
    include_remainder (bool): Whether to include the remainder of the test dataframe if its length is not divisible by the horizon. Defaults to True.
    metric (object): An evaluation metric object that has an evaluate method. This metric will be used to evaluate the model's performance on the test dataset. Defaults to None.

Returns:

    tuple: A tuple of two data frames. The first one contains evaluation metrics for each window. The second one contains the true and predicted values for each observation in the test set.

Examples:

>>> from sklearn.linear_model import LinearRegression
>>> model = LinearRegression()
>>> train = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
>>> test = pd.DataFrame({"x": [4, 5], "y": [8, 10]})
>>> df_eval, df_true = eval_bml_landmark(model, train, test, "y", horizon=1)
>>> print(df_eval)
        Metric  Memory (MB)  CompTime (s)
0  0.000000          0.0           0.0
1  0.000000          0.0           0.0
...        ...          ...           ...
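A callable metric is required here as well, since it is applied to each window's predictions. In contrast to eval_bml_horizon, the landmark scheme refits the model after every window on a training set that keeps growing. A minimal sketch with toy data (illustrative only):

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error
    from spotriver.evaluation.eval_bml import eval_bml_landmark

    model = LinearRegression()
    train = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 8]})
    test = pd.DataFrame({"x": [5, 6, 7, 8], "y": [10, 12, 14, 16]})
    # Each window of size `horizon` is first predicted with the current model,
    # then the model is refit on train plus all test rows seen so far.
    df_eval, df_true = eval_bml_landmark(
        model, train, test, target_column="y", horizon=2, metric=mean_absolute_error
    )
    print(df_eval)
    print(df_true)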
Source code in spotriver/evaluation/eval_bml.py (lines 261-343):
def eval_bml_landmark(
    model: object,
    train: pd.DataFrame,
    test: pd.DataFrame,
    target_column: str,
    horizon: int,
    include_remainder: bool = True,
    metric: object = None,
) -> tuple:
    """Evaluate a machine learning model on a rolling landmark basis.

    This function evaluates a machine learning model on a rolling landmark basis.
    The model is trained on the training data and then evaluated on the test data
    using a given evaluation metric. The evaluation results are returned as a tuple
    of two data frames. The first one contains evaluation metrics for each window.
    The second one contains the true and predicted values for each observation in the test set.

    Args:
        model (object): The model to be evaluated.
        train (pd.DataFrame): The training data set.
        test (pd.DataFrame): The testing data set.
        target_column (str): The name of the column containing the target variable.
        horizon (int, optional): The number of steps ahead to forecast.
        include_remainder (bool): Whether to include the remainder of the test dataframe if its length is not divisible by the horizon. Defaults to True.
        metric (object):
            An evaluation metric object that has an `evaluate` method.
            This metric will be used to evaluate the model's performance on the test dataset.

    Returns:
        tuple:
            A tuple of two data frames. The first one contains evaluation metrics for each window.
            The second one contains the true and predicted values for each observation in the test set.

    Examples:
        >>> from sklearn.linear_model import LinearRegression
        >>> model = LinearRegression()
        >>> train = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
        >>> test = pd.DataFrame({"x": [4, 5], "y": [8, 10]})
        >>> df_eval, df_true = eval_bml_landmark(model, train, test, "y", horizon=1)
        >>> print(df_eval)
                Metric  Memory (MB)  CompTime (s)
        0  0.000000          0.0           0.0
        1  0.000000          0.0           0.0
        ...        ...          ...           ...

    """
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    series_preds = pd.Series(dtype=float)
    series_diffs = pd.Series(dtype=float)
    rm = ResourceMonitor()
    with rm:
        model.fit(train.loc[:, train.columns != target_column], train[target_column])
    df_eval = pd.DataFrame.from_dict(
        [evaluate_model(y_true=np.array([]), y_pred=np.array([]), memory=rm.memory, r_time=rm.r_time, metric=metric)]
    )
    if include_remainder is False:
        rem = len(test) % horizon
        if rem > 0:
            test = test[:-rem]
    # Landmark Evaluation
    for i, new_df in enumerate(gen_sliding_window(test, horizon)):
        train = pd.concat([train, new_df], ignore_index=True)
        rm = ResourceMonitor()
        with rm:
            preds = pd.Series(model.predict(new_df.loc[:, new_df.columns != target_column]))
            model.fit(train.loc[:, train.columns != target_column], train[target_column])
        diffs = new_df[target_column].values - preds
        df_eval.loc[i + 1] = pd.Series(
            evaluate_model(
                y_true=new_df[target_column],
                y_pred=preds,
                memory=rm.memory,
                r_time=rm.r_time,
                metric=metric,
            )
        )
        series_preds = pd.concat([series_preds, preds], ignore_index=True)
        series_diffs = pd.concat([series_diffs, diffs], ignore_index=True)
    df_true = pd.DataFrame(test[target_column])
    df_true["Prediction"] = series_preds
    df_true["Difference"] = series_diffs
    return df_eval, df_true

eval_bml_window(model, train, test, target_column, horizon, include_remainder=True, metric=None)

Evaluate a model on a rolling window basis.

Parameters:

    model (object): The model to be evaluated.
    train (pd.DataFrame): The training data set.
    test (pd.DataFrame): The testing data set.
    target_column (str): The name of the column containing the target variable.
    horizon (int): The number of steps ahead to forecast.
    include_remainder (bool): Whether to include the remainder of the test dataframe if its length is not divisible by the horizon. Defaults to True.
    metric (object): An evaluation metric object that has an evaluate method. This metric will be used to evaluate the model's performance on the test dataset. Defaults to None.

Returns:

    tuple: A tuple of two data frames. The first one contains evaluation metrics for each window. The second one contains the true and predicted values for each observation in the test set.

Examples:

>>> from sklearn.linear_model import LinearRegression
>>> model = LinearRegression()
>>> train = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
>>> test = pd.DataFrame({"x": [4, 5], "y": [8, 10]})
>>> df_eval, df_true = eval_bml_window(model, train, test, "y", horizon=1)
>>> print(df_eval)
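The doctest above shows no output and omits the metric argument; in practice a callable metric has to be supplied, because it is applied to each window's predictions. In this variant the model is refit on the w_train part of every window produced by gen_horizon_shifted_window and evaluated on the corresponding w_test part. A minimal sketch with toy data (illustrative only):

    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_absolute_error
    from spotriver.evaluation.eval_bml import eval_bml_window

    model = LinearRegression()
    train = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 8]})
    test = pd.DataFrame({"x": [5, 6, 7, 8], "y": [10, 12, 14, 16]})
    df_eval, df_true = eval_bml_window(
        model, train, test, target_column="y", horizon=2, metric=mean_absolute_error
    )
    print(df_eval)  # one row per window, plus the initial fit row
    print(df_true)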
Source code in spotriver/evaluation/eval_bml.py (lines 385-452):
def eval_bml_window(
    model: object,
    train: pd.DataFrame,
    test: pd.DataFrame,
    target_column: str,
    horizon: int,
    include_remainder: bool = True,
    metric: object = None,
) -> tuple:
    """Evaluate a model on a rolling window basis.

    Args:
        model (object): The model to be evaluated.
        train (pd.DataFrame): The training data set.
        test (pd.DataFrame): The testing data set.
        target_column (str): The name of the column containing the target variable.
        horizon (int, optional): The number of steps ahead to forecast.

    Returns:
        tuple: A tuple of two data frames. The first one contains evaluation metrics for each window.
        The second one contains the true and predicted values for each observation in the test set.

    Examples:
        >>> from sklearn.linear_model import LinearRegression
        >>> model = LinearRegression()
        >>> train = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
        >>> test = pd.DataFrame({"x": [4, 5], "y": [8, 10]})
        >>> df_eval, df_true = eval_bml_window(model, train, test, "y", horizon=1)
        >>> print(df_eval)
    """
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    df_all = pd.concat([train, test], ignore_index=True)
    series_preds = pd.Series(dtype=float)
    series_diffs = pd.Series(dtype=float)
    rm = ResourceMonitor()
    with rm:
        model.fit(train.loc[:, train.columns != target_column], train[target_column])
    df_eval = pd.DataFrame.from_dict(
        [evaluate_model(y_true=np.array([]), y_pred=np.array([]), memory=rm.memory, r_time=rm.r_time, metric=metric)]
    )
    if include_remainder is False:
        rem = len(test) % horizon
        if rem > 0:
            test = test[:-rem]
    for i, (w_train, w_test) in enumerate(gen_horizon_shifted_window(df_all, len(train), horizon)):
        rm = ResourceMonitor()
        with rm:
            model.fit(w_train.loc[:, w_train.columns != target_column], w_train[target_column])
            preds = pd.Series(model.predict(w_test.loc[:, w_test.columns != target_column]))
        diffs = w_test[target_column].values - preds
        df_eval.loc[i + 1] = pd.Series(
            evaluate_model(
                y_true=w_test[target_column],
                y_pred=preds,
                memory=rm.memory,
                r_time=rm.r_time,
                metric=metric,
            )
        )

        series_preds = pd.concat([series_preds, preds], ignore_index=True)
        series_diffs = pd.concat([series_diffs, diffs], ignore_index=True)

    df_true = pd.DataFrame(test[target_column])
    df_true["Prediction"] = series_preds
    df_true["Difference"] = series_diffs
    return df_eval, df_true

eval_oml_horizon(model, train, test, target_column, horizon, include_remainder=True, metric=None, oml_grace_period=None)

Evaluate an online machine learning model on a rolling horizon basis using evaluations from batch machine learning.

This function evaluates an online machine learning model on a rolling horizon basis. The model is trained on the training data and then evaluated on the test data using a given evaluation metric. The evaluation results are returned as a tuple of two data frames: the first contains the evaluation metrics for each window, and the second contains the true and predicted values for each observation in the test set.

Notes

First, the model is trained on the (small) training data set. No predictions are made during this initial training phase, but the memory and computation time are measured. Then, the model is evaluated on the test data set using a given (sklearn) evaluation metric. The evaluation results are returned as a tuple of two data frames.

Parameters:

    model (object): The model to be evaluated. For example, a linear_model from river.
    train (pd.DataFrame): The training data set. Should be small compared to the test data set. See also oml_grace_period below.
    test (pd.DataFrame): The testing data set.
    target_column (str): The name of the column containing the target variable.
    horizon (int): The number of steps ahead to forecast. If set to 1, the model is evaluated and updated incrementally on the next observation in the test set. If set to 2, the model is evaluated and updated incrementally on the next two observations in the test set, and so on.
    include_remainder (bool): Whether to include the remainder of the test dataframe if its length is not divisible by the horizon. Defaults to True.
    metric (object): An evaluation metric object that has an evaluate method. This metric will be used to evaluate the model's performance on the test dataset. Metrics from sklearn, e.g., mean_absolute_error, can be used. Defaults to None.
    oml_grace_period (int, optional): The number of observations to use for (initial) training. Important: not the entire training set is used for initial training, but only the last oml_grace_period observations. This simulates the online setting, where the model is trained on a small subset of the training data set. Defaults to None, in which case the horizon is used.

Returns:

    Tuple[pd.DataFrame, pd.DataFrame]: A tuple of two data frames. The first one contains evaluation metrics for each window. The second one contains the true and predicted values for each observation in the test set.

Examples:

>>> from river import linear_model
    from river import preprocessing
    from sklearn.metrics import mean_absolute_error
    from spotriver.evaluation.eval_bml import eval_oml_horizon
    model = (
            preprocessing.StandardScaler() |
            linear_model.LinearRegression(intercept_lr=.5)
        )
    horizon = 10
    train = pd.DataFrame({"x": np.arange(1, 11), "y": np.arange(2, 22, 2)})
    test = pd.DataFrame({"x": np.arange(11, 111), "y": np.arange(22, 222, 2)})
    target_column = "y"
    metric = mean_absolute_error
    eval_oml_horizon(
        model = model,
        train = train,
        test = test,
        target_column = target_column,
        horizon = horizon,
        include_remainder = True,
        metric = metric,
        oml_grace_period = horizon,
    )
    (      Metric  Memory (MB)  CompTime (s)
    0        NaN     0.025515      0.001253
    1   1.721100     0.009296      0.001499
    2   1.700408     0.007614      0.000801
    3   1.690827     0.007833      0.002240
    4   1.685174     0.007614      0.000784
    5   1.681406     0.007614      0.000738
    6   1.678697     0.007937      0.001930
    7   1.676648     0.007614      0.000782
    8   1.675039     0.007431      0.000760
    9   1.673739     0.007431      0.000687
    10  1.672665     0.007431      0.000678,
        y  Prediction  Difference
    0    22   20.261831    1.738169
    1    24   22.267027    1.732973
    2    26   24.271507    1.728493
    3    28   26.275414    1.724586
    4    30   28.278854    1.721146
    ..  ...         ...         ...
    95  212  210.327390    1.672610
    96  214  212.327487    1.672513
    97  216  214.327581    1.672419
    98  218  216.327674    1.672326
    99  220  218.327766    1.672234
[100 rows x 3 columns])
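The oml_grace_period argument caps how much of train is used for the initial fit: only the trailing oml_grace_period rows are streamed through learn_one before the test evaluation starts. A minimal sketch, adapted from the example above with an artificially small grace period (the data values are illustrative only):

    import numpy as np
    import pandas as pd
    from river import linear_model, preprocessing
    from sklearn.metrics import mean_absolute_error
    from spotriver.evaluation.eval_bml import eval_oml_horizon

    model = preprocessing.StandardScaler() | linear_model.LinearRegression(intercept_lr=0.5)
    train = pd.DataFrame({"x": np.arange(1, 101), "y": np.arange(2, 202, 2)})
    test = pd.DataFrame({"x": np.arange(101, 151), "y": np.arange(202, 302, 2)})
    # Only the last 5 rows of `train` are used for the initial fit, which mimics
    # a short warm-up; the model then keeps learning incrementally on the test stream.
    df_eval, df_preds = eval_oml_horizon(
        model, train, test, target_column="y", horizon=10,
        metric=mean_absolute_error, oml_grace_period=5,
    )
    print(df_eval.head())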
Source code in spotriver/evaluation/eval_bml.py (lines 469-678):
def eval_oml_horizon(
    model: object,
    train: pd.DataFrame,
    test: pd.DataFrame,
    target_column: str,
    horizon: int,
    include_remainder: bool = True,
    metric: object = None,
    oml_grace_period: int = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluate an online machine learning model on a rolling horizon basis using
    evaluations from batch-machine learning.

    This function evaluates an online-machine learning model on a rolling horizon basis.
    The model is trained on the training data and then evaluated on the test data
    using a given evaluation metric. The evaluation results are returned as a tuple
    of two data frames. The first one contains evaluation metrics for each window.
    The second one contains the true and predicted values for each observation in the test set.

    Notes:
        First, the model is trained on the (small) training data set. No predictions
        are made during this initial training phase, but the memory and computation
        time are measured. Then, the model is evaluated on the test data set using a
        given (sklearn) evaluation metric.
        The evaluation results are returned as a tuple of two data frames.

    Args:
        model (object):
            The model to be evaluated. For example, a linear_model from river.
        train (pd.DataFrame):
            The training data set. Should be small compared to the test data set.
            See also oml_grace_period below.
        test (pd.DataFrame):
            The testing data set.
        target_column (str):
            The name of the column containing the target variable.
        horizon (int, optional):
            The number of steps ahead to forecast. If set to 1, the model is evaluated and updated
            incrementally on the next observation in the test set. If set to 2, the model is evaluated
            and updated incrementally on the next two observations in the test set, and so on.
        include_remainder (bool):
            Whether to include the remainder of the test dataframe if its
            length is not divisible by the horizon. Defaults to True.
        metric (object):
            An evaluation metric object that has an `evaluate` method.
            This metric will be used to evaluate the model's performance on the test dataset. Metrics
            from sklearn, e.g., mean_absolute_error can be used.
        oml_grace_period (int, optional):
            The number of observations to use for (initial) training. Defaults to None,
            in which case the horizon is used. Important: Not the entire training set is used
            for initial training, but only the last oml_grace_period observations. This is
            to simulate the online setting, where the model is trained on a small subset of
            the training data set. If None, the horizon is used.

    Returns:
        tuple:
            A tuple of two data frames.
            The first one contains evaluation metrics for each window.
            The second one contains the true and predicted values for each observation
            in the test set.

    Examples:
        >>> from river import linear_model
            from river import preprocessing
            from sklearn.metrics import mean_absolute_error
            from spotriver.evaluation.eval_bml import eval_oml_horizon
            model = (
                    preprocessing.StandardScaler() |
                    linear_model.LinearRegression(intercept_lr=.5)
                )
            horizon = 10
            train = pd.DataFrame({"x": np.arange(1, 11), "y": np.arange(2, 22, 2)})
            test = pd.DataFrame({"x": np.arange(11, 111), "y": np.arange(22, 222, 2)})
            target_column = "y"
            metric = mean_absolute_error
            eval_oml_horizon(
                model = model,
                train = train,
                test = test,
                target_column = target_column,
                horizon = horizon,
                include_remainder = True,
                metric = metric,
                oml_grace_period = horizon,
            )
            (      Metric  Memory (MB)  CompTime (s)
            0        NaN     0.025515      0.001253
            1   1.721100     0.009296      0.001499
            2   1.700408     0.007614      0.000801
            3   1.690827     0.007833      0.002240
            4   1.685174     0.007614      0.000784
            5   1.681406     0.007614      0.000738
            6   1.678697     0.007937      0.001930
            7   1.676648     0.007614      0.000782
            8   1.675039     0.007431      0.000760
            9   1.673739     0.007431      0.000687
            10  1.672665     0.007431      0.000678,
                y  Prediction  Difference
            0    22   20.261831    1.738169
            1    24   22.267027    1.732973
            2    26   24.271507    1.728493
            3    28   26.275414    1.724586
            4    30   28.278854    1.721146
            ..  ...         ...         ...
            95  212  210.327390    1.672610
            96  214  212.327487    1.672513
            97  216  214.327581    1.672419
            98  218  216.327674    1.672326
            99  220  218.327766    1.672234

            [100 rows x 3 columns])

    """
    # Check if metric is None or null and raise ValueError if it is
    if metric is None:
        raise ValueError("The 'metric' parameter must not be None or null.")
    if oml_grace_period is None:
        oml_grace_period = horizon
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    if include_remainder is False:
        rem = len(test) % horizon
        if rem > 0:
            test = test[:-rem]

    # Fit the model on the train data, i.e., initial Training on Train Data.
    # This is performed on a limited subset only (oml_grace_period).
    # No predictions are made here, only the model is fitted.
    # Memory and runtime are measured for the model fitting
    train_X = train.loc[:, train.columns != target_column]
    train_y = train[target_column]
    train_X = train_X.tail(oml_grace_period)
    train_y = train_y.tail(oml_grace_period)
    rm = ResourceMonitor()
    with rm:
        try:
            for xi, yi in river_stream.iter_pandas(train_X, train_y):
                # Before v0.19 we had to call predict_one before learn_one
                # in order for the whole pipeline to be updated.
                # Since v0.19, calling learn_one in a pipeline will update each part
                # of the pipeline in turn.
                # Before v0.19, predict_one has to be called for updating the unsupervised parts
                # of the pipeline.
                # The following line returns y_pred, which is not used after v0.19:
                # _ = model.predict_one(xi)
                # model = model.learn_one(xi, yi)
                # Starting with 0.21.0, the learn_one and learn_many methods of each estimator do not
                # return anything anymore.
                # This is to emphasize that the estimators are stateful.
                model.learn_one(xi, yi)
        except Exception as e:
            print(f"train_X data: {train_X}")
            print(f"train_y data: {train_y}")
            print(f"An error occurred while fitting the model: {e}")

    # Create empty lists to collect data
    eval_data = []
    series_preds = []
    series_diffs = []

    # Measure the costs of the initial training:
    # Add the evaluation of the model (memory and time, not predictions) on the train data to the eval_data list
    # A metric must not be passed to the evaluate_model function, because no predictions are made here
    # If a metric is passed, it will be ignored, because no predictions are passed to the evaluation function
    # So, metric=None and metric=mean_absolute_error will both work
    # Return res_dict = {"Metric": score, "Memory (MB)": memory, "CompTime (s)": r_time}
    eval_data.append(
        evaluate_model(y_true=np.array([]), y_pred=np.array([]), memory=rm.memory, r_time=rm.r_time, metric=metric)
    )

    # Test Data Evaluation
    # A sliding window of length horizon is used to evaluate the model on the test data
    for i, new_df in enumerate(gen_sliding_window(test, horizon)):
        preds = []
        test_X = new_df.loc[:, new_df.columns != target_column]
        test_y = new_df[target_column]
        rm = ResourceMonitor()
        with rm:
            try:
                for xi, yi in river_stream.iter_pandas(test_X, test_y):
                    pred = model.predict_one(xi)
                    preds.append(pred)
                    # model = model.learn_one(xi, yi)
                    # Starting with 0.21.0, the learn_one and learn_many methods of each estimator do not
                    # return anything anymore.
                    # This is to emphasize that the estimators are stateful.
                    model.learn_one(xi, yi)
            except Exception as e:
                print(f"test_X data: {test_X}")
                print(f"test_y data: {test_y}")
                print(f"An error occurred while predicting: {e}")
        preds = pd.Series(preds)
        diffs = new_df[target_column].values - preds

        # Collect data in lists
        eval_data.append(
            evaluate_model(
                y_true=new_df[target_column], y_pred=preds, memory=rm.memory, r_time=rm.r_time, metric=metric
            )
        )
        series_preds.extend(preds)
        series_diffs.extend(diffs)

    # Create DataFrames from the collected data
    df_eval = pd.DataFrame(eval_data)
    df_true = pd.DataFrame(test[target_column])
    df_true["Prediction"] = series_preds
    df_true["Difference"] = series_diffs
    return df_eval, df_true

evaluate_model(y_true, y_pred, memory, r_time, metric)

Evaluate a machine learning model on a test dataset.

This function evaluates a machine learning model on a test dataset using a given evaluation metric. The evaluation results are returned as a dictionary.

Parameters:

    y_true (np.ndarray): A numpy array containing the true values.
    y_pred (np.ndarray): A numpy array containing the predicted values.
    memory (float): The memory usage of the model.
    r_time (float): The computation time of the model.
    metric (object): An evaluation metric object that has an evaluate method. This metric will be used to evaluate the model's performance on the test dataset.

Returns:

    dict: A dictionary containing the evaluation results.

Examples:

>>> from sklearn.metrics import accuracy_score
>>> import numpy as np
>>> from spotriver.evaluation.eval_bml import evaluate_model
>>> y_true = np.array([0, 1, 0, 1])
>>> y_pred = np.array([0, 1, 1, 1])
>>> memory = 0.0
>>> r_time = 0.0
>>> metric = accuracy_score
>>> evaluate_model(y_true, y_pred, memory, r_time, metric)
{'Metric': 0.75, 'Memory (MB)': 0.0, 'CompTime (s)': 0.0}
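When both arrays are empty, evaluate_model does not call the metric at all and returns None for the score; the eval_* functions in this module rely on this to record the memory and time cost of the initial fit, where no predictions exist yet. A small sketch of that behavior:

    import numpy as np
    from sklearn.metrics import mean_absolute_error
    from spotriver.evaluation.eval_bml import evaluate_model

    # Empty arrays: the metric is skipped, only memory and time are reported.
    print(evaluate_model(np.array([]), np.array([]), memory=0.5, r_time=0.01,
                         metric=mean_absolute_error))
    # {'Metric': None, 'Memory (MB)': 0.5, 'CompTime (s)': 0.01}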
Source code in spotriver/evaluation/eval_bml.py (lines 106-152):
def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray, memory: float, r_time: float, metric) -> dict:
    """
    Evaluate a machine learning model on a test dataset.

    This function evaluates a machine learning model on a test dataset using a given evaluation metric.
    The evaluation results are returned as a dictionary.

    Args:
        y_true (np.ndarray): A numpy array containing the true values.
        y_pred (np.ndarray): A numpy array containing the predicted values.
        memory (float): The memory usage of the model.
        r_time (float): The computation time of the model.
        metric (object): An evaluation metric object that has an `evaluate` method.
            This metric will be used to evaluate the model's performance on the test dataset.

    Returns:
        dict: A dictionary containing the evaluation results.

    Examples:
        >>> from sklearn.metrics import accuracy_score
        >>> import numpy as np
        >>> from spotriver.evaluation.eval_bml import evaluate_model
        >>> y_true = np.array([0, 1, 0, 1])
        >>> y_pred = np.array([0, 1, 1, 1])
        >>> memory = 0.0
        >>> r_time = 0.0
        >>> metric = accuracy_score
        >>> evaluate_model(y_true, y_pred, memory, r_time, metric)
        {'Metric': 0.75, 'Memory (MB)': 0.0, 'CompTime (s)': 0.0}
    """
    if len(y_true) != len(y_pred):
        raise ValueError("y_true and y_pred must have the same size")
    if (len(y_true) == 0) or (len(y_pred) == 0):
        res_dict = {
            "Metric": None,
            "Memory (MB)": memory,
            "CompTime (s)": r_time,
        }
        return res_dict
    # if y_pred or y_true is bool convert to int
    if y_pred.dtype == bool:
        y_pred = y_pred.astype(int)
    if y_true.dtype == bool:
        y_true = y_true.astype(int)
    score = metric(y_true, y_pred)
    res_dict = {"Metric": score, "Memory (MB)": memory, "CompTime (s)": r_time}
    return res_dict

gen_sliding_window(df, horizon, include_remainder=True)

Generates sliding windows of a given size from a DataFrame.

Parameters:

    df (pd.DataFrame): The input DataFrame.
    horizon (int): The size of the sliding window.
    include_remainder (bool): Whether to include the remainder of the DataFrame if its length is not divisible by the horizon. Defaults to True.

Yields:

    pd.DataFrame: A sliding window of the input DataFrame.

Examples:

>>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
>>> for window in gen_sliding_window(df, 2):
...     print(window)
   A  B
0  1  4
1  2  5
   A  B
2  3  6
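With include_remainder=False, a final partial window is dropped instead of being yielded. A small sketch based on the same toy DataFrame:

    import pandas as pd
    from spotriver.evaluation.eval_bml import gen_sliding_window

    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    # Only the full-size window (rows 0 and 1) is yielded; the leftover row is dropped.
    for window in gen_sliding_window(df, horizon=2, include_remainder=False):
        print(window)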
Source code in spotriver/evaluation/eval_bml.py (lines 346-382):
def gen_sliding_window(
    df: pd.DataFrame, horizon: int, include_remainder: bool = True
) -> Generator[pd.DataFrame, None, None]:
    """Generates sliding windows of a given size from a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        horizon (int): The size of the sliding window.
        include_remainder (bool):
            Whether to include the remainder of the DataFrame
            if its length is not divisible by the horizon. Defaults to True.

    Yields:
        (pd.DataFrame):
            A sliding window of the input DataFrame.

    Examples:
        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> for window in gen_sliding_window(df, 2):
        ...     print(window)
           A  B
        0  1  4
        1  2  5
           A  B
        2  3  6
    """
    i = 0
    while True:
        subset = df[i * horizon : (i + 1) * horizon]
        if len(subset) == 0:
            break
        elif len(subset) < horizon:
            if include_remainder:
                yield subset
            break
        i += 1
        yield subset

plot_bml_oml_horizon_metrics(df_eval=None, df_labels=None, log_x=False, log_y=False, cumulative=True, grid=True, figsize=None, metric=None, filename=None, show=False, title='', skip_first_n=0, skip_last_n=0, tkagg=False, **kwargs)

Plot evaluation metrics for machine learning models.

This function plots the evaluation metrics for different machine learning models on a given dataset. The function takes a list of pandas dataframes as input, each containing the evaluation metrics for one model. The function also takes an optional list of labels for each model and boolean flags to indicate whether to use logarithmic scales for the x-axis and y-axis.

Parameters:

    df_eval (list[pd.DataFrame], optional): A list of pandas dataframes containing the evaluation metrics for each model. Each dataframe should have an index column with the dataset name and three columns with the label names, e.g., "Metric", "CompTime (s)" and "Memory (MB)". If None, no plot is generated. Default is None.
    df_labels (list, optional): A list of strings containing the labels for each model. The length of this list should match the length of df_eval. If None, numeric indices are used as labels. Default is None.
    log_x (bool, optional): Whether to use a logarithmic scale for the x-axis. Default is False.
    log_y (bool, optional): Whether to use a logarithmic scale for the y-axis. Default is False.
    cumulative (bool, optional): Whether to plot cumulative metrics. Default is True.
    grid (bool, optional): Whether to show a grid. Default is True.
    figsize (tuple, optional): The size of the figure. Default is None.
    metric (object): An evaluation metric object that has an evaluate method. This metric will be used to evaluate the model's performance on the test dataset. Default is None.
    filename (str, optional): The name of the file to save the plot to. If None, the plot is not saved. Default is None.
    title (str, optional): The title of the plot. Default is an empty string.
    skip_first_n (int, optional): The number of rows to skip from the beginning of the dataframe. Default is 0.
    skip_last_n (int, optional): The number of rows to skip from the end of the dataframe. Default is 0.
    show (bool, optional): Whether to display the plot. Default is False.
    tkagg (bool, optional): Whether to use the TkAgg backend for plotting. If False, the default backend is used. Default is False.
    **kwargs (Any): Additional keyword arguments to be passed to the plot function.

Returns:

    None: This function does not return anything.

Examples:

>>> from sklearn.metrics import accuracy_score
>>> from spotriver.evaluation.eval_bml import plot_bml_oml_horizon_metrics
>>> df_eval = pd.DataFrame({"Metric": [0.5, 0.75, 0.9], "CompTime (s)": [0.1, 0.2, 0.3], "Memory (MB)": [0.1, 0.2, 0.3]})
>>> df_labels = ["Model 1", "Model 2", "Model 3"]
>>> plot_bml_oml_horizon_metrics(df_eval, df_labels, metric=accuracy_score)
>>>
>>> from river import linear_model, datasets, preprocessing
    from spotriver.evaluation.eval_bml import eval_oml_horizon
    from spotriver.utils.data_conversion import convert_to_df
    from sklearn.metrics import mean_absolute_error
    metric = mean_absolute_error
    model = (preprocessing.StandardScaler() |
            linear_model.LinearRegression())
    dataset = datasets.TrumpApproval()
    target_column = "Approve"
    df = convert_to_df(dataset, target_column)
    train = df[:500]
    test = df[500:]
    horizon = 10
    df_eval, df_preds = eval_oml_horizon(
        model, train, test, target_column,
        horizon, metric=metric)
    from spotriver.evaluation.eval_bml import plot_bml_oml_horizon_metrics
    df_labels = ["OML Linear"]
    plot_bml_oml_horizon_metrics(df_eval, df_labels, metric=metric, filename=None)
Source code in spotriver/evaluation/eval_bml.py (lines 681-840):
def plot_bml_oml_horizon_metrics(
    df_eval: list[pd.DataFrame] = None,
    df_labels: list = None,
    log_x=False,
    log_y=False,
    cumulative=True,
    grid=True,
    figsize=None,
    metric=None,
    filename=None,
    show=False,
    title="",
    skip_first_n=0,
    skip_last_n=0,
    tkagg=False,
    **kwargs,
) -> None:
    """Plot evaluation metrics for machine learning models.

    This function plots the evaluation metrics for different machine learning models
    on a given dataset. The function takes a list of pandas dataframes as input,
    each containing the evaluation metrics for one model. The function also takes
    an optional list of labels for each model and boolean flags to indicate whether
    to use logarithmic scales for the x-axis and y-axis.

    Args:
        df_eval (list[pd.DataFrame], optional):
            A list of pandas dataframes containing the evaluation metrics for each model.
            Each dataframe should have an index column with the dataset name and three
            columns with the label names: e.g., "Metric", "CompTime (s)" and "Memory (MB)".
            If None, no plot is generated. Default is None.
        df_labels (list, optional):
            A list of strings containing the labels for each model.
            The length of this list should match the length of df_eval.
            If None, numeric indices are used as labels. Default is None.
        log_x (bool, optional):
            A flag indicating whether to use logarithmic scale for the x-axis.
            If True, log scale is used. If False, linear scale is used. Default is False.
        log_y (bool, optional):
            A flag indicating whether to use logarithmic scale for the y-axis.
            If True, log scale is used. If False, linear scale is used. Default is False.
        cumulative (bool, optional):
            A flag indicating whether to plot cumulative metrics.
            If True, cumulative metrics are plotted. If False, non-cumulative metrics are plotted.
            Default is True.
        grid (bool, optional):
            A flag indicating whether to plot a grid.
            If True, grid is shown. Default is True.
        figsize (tuple, optional):
            The size of the figure. Default is None.
        metric (object):
            An evaluation metric object that has an `evaluate` method.
            This metric will be used to evaluate the model's performance on the test dataset.
        filename (str, optional):
            The name of the file to save the plot to. If None, the plot is not saved. Default is None.
        title (str, optional):
            The title of the plot. Default is an empty string.
        skip_first_n (int, optional):
            The number of rows to skip from the beginning of the dataframe. Default is 0.
        skip_last_n (int, optional):
            The number of rows to skip from the end of the dataframe. Default is 0.
        show (bool, optional):
            A flag indicating whether to show the plot. If True, the plot is displayed.
            If False, the plot is not displayed. Default is False.
        tkagg (bool, optional):
            A flag indicating whether to use the TkAgg backend for plotting.
            If True, the TkAgg backend is used. If False, the default backend is used.
            Default: False.
        **kwargs (Any):
            Additional keyword arguments to be passed to the plot function.

    Returns:
        (NoneType): This function does not return anything.

    Examples:
        >>> from sklearn.metrics import accuracy_score
        >>> from spotriver.evaluation.eval_bml import plot_bml_oml_horizon_metrics
        >>> df_eval = pd.DataFrame({"Metric": [0.5, 0.75, 0.9], "CompTime (s)": [0.1, 0.2, 0.3], "Memory (MB)": [0.1, 0.2, 0.3]})
        >>> df_labels = ["Model 1", "Model 2", "Model 3"]
        >>> plot_bml_oml_horizon_metrics(df_eval, df_labels, metric=accuracy_score)
        >>>
        >>> from river import linear_model, datasets, preprocessing
            from spotriver.evaluation.eval_bml import eval_oml_horizon
            from spotriver.utils.data_conversion import convert_to_df
            from sklearn.metrics import mean_absolute_error
            metric = mean_absolute_error
            model = (preprocessing.StandardScaler() |
                    linear_model.LinearRegression())
            dataset = datasets.TrumpApproval()
            target_column = "Approve"
            df = convert_to_df(dataset, target_column)
            train = df[:500]
            test = df[500:]
            horizon = 10
            df_eval, df_preds = eval_oml_horizon(
                model, train, test, target_column,
                horizon, metric=metric)
            from spotriver.evaluation.eval_bml import plot_bml_oml_horizon_metrics
            df_labels = ["OML Linear"]
            plot_bml_oml_horizon_metrics(df_eval, df_labels, metric=metric, filename=None)
    """
    if tkagg:
        matplotlib.use("TkAgg")
    if figsize is None:
        figsize = (10, 5)
    # Check if metric is None or null and raise ValueError if it is
    if metric is None:
        raise ValueError("The 'metric' parameter must not be None or null.")
    # Check if input dataframes are provided
    if df_eval is not None:
        df_list = copy.deepcopy(df_eval)
        # Convert single dataframe input to a list if needed
        if df_list.__class__ != list:
            df_list = [df_list]
        # Define metric names and titles
        metric_name = metric.__name__
        metrics = ["Metric", "CompTime (s)", "Memory (MB)"]
        titles = [metric_name, "Computation time (s)", "Memory (MB)"]
        # Create subplots with shared x-axis
        fig, axes = plt.subplots(3, figsize=figsize, constrained_layout=True, sharex=True)
        # Loop over each dataframe in input list
        for j, df in enumerate(df_list):
            if cumulative:
                # df.MAE = np.cumsum(df.MAE) / range(1, (1 + df.MAE.size))
                df["Metric"] = np.cumsum(df["Metric"]) / range(1, (1 + df["Metric"].size))
                df["CompTime (s)"] = np.cumsum(df["CompTime (s)"])  # / range(1, (1 + df["CompTime (s)"].size))
                # df["Memory (MB)"] = np.cumsum(df["Memory (MB)"]) / range(1, (1 + df["Memory (MB)"].size))
            # Loop over each metric
            for i in range(3):
                # Assign label based on input or default value
                if df_labels is None:
                    label = f"{j}"
                else:
                    label = df_labels[j]
                # Define indices for slicing based on skip_first_n and skip_last_n
                start = skip_first_n
                end = None if skip_last_n == 0 else -skip_last_n

                # Plot metric values against dataset names, skipping specified entries
                axes[i].plot(
                    df.index.values.tolist()[start:end],
                    df[metrics[i]].values.tolist()[start:end],
                    label=label,
                    **kwargs,
                )
                # Set title and legend
                axes[i].set_title(titles[i])
                axes[i].legend(loc="upper right")
                axes[i].grid(grid)
                # Set logarithmic scales if specified
                if log_x:
                    axes[i].set_xscale("log")
                if log_y:
                    axes[i].set_yscale("log")
        if filename is not None:
            plt.savefig(filename)
    # add a title to the figure
    fig.suptitle(f"{title}")
    if show:
        plt.show()

plot_bml_oml_horizon_predictions(df_true=None, df_labels=None, target_column='Actual', log_x=False, log_y=False, skip_first_n=0, grid=True, figsize=None, filename=None, title='', tkagg=False, **kwargs)

Plot actual vs predicted values for machine learning models.

This function plots the actual vs predicted values for different machine learning models on a given dataset. The function takes a list of pandas dataframes as input, each containing the actual and predicted values for one model. The function also takes an optional list of labels for each model and boolean flags to indicate whether to use logarithmic scales for the x-axis and y-axis.

Parameters:

    df_true (list[pd.DataFrame], optional): A list of pandas dataframes containing the actual and predicted values for each model. Each dataframe should have an index column with the dataset name and two columns with the label names, e.g., "Actual" and "Prediction". If None, no plot is generated. Default is None.
    df_labels (list, optional): A list of strings containing the labels for each model. The length of this list should match the length of df_true. If None, numeric indices are used as labels. Default is None.
    target_column (str, optional): The name of the column containing the target variable. Default is "Actual".
    log_x (bool, optional): Whether to use a logarithmic scale for the x-axis. Default is False.
    log_y (bool, optional): Whether to use a logarithmic scale for the y-axis. Default is False.
    skip_first_n (int, optional): The number of rows to skip from the beginning of the dataframes. Default is 0.
    grid (bool, optional): Whether to show a grid. Default is True.
    figsize (tuple, optional): The size of the figure. Default is None.
    filename (str, optional): The name of the file to save the plot to. If None, the plot is not saved. Default is None.
    title (str, optional): The title of the plot. Default is an empty string.
    tkagg (bool, optional): Whether to use the TkAgg backend for plotting. If False, the default backend is used. Default is False.
    **kwargs (Any): Additional keyword arguments to be passed to the plot function.

Returns:

    None: This function does not return anything.

Examples:

>>> from sklearn.metrics import accuracy_score
>>> from spotriver.evaluation.eval_bml import plot_bml_oml_horizon_predictions
>>> df_true = pd.DataFrame({"Actual": [0.5, 0.75, 0.9], "Prediction": [0.1, 0.2, 0.3]})
>>> df_labels = ["Model 1", "Model 2", "Model 3"]
>>> plot_bml_oml_horizon_predictions(df_true, df_labels, target_column="Actual")
Source code in spotriver/evaluation/eval_bml.py (lines 843-936):
def plot_bml_oml_horizon_predictions(
    df_true: list[pd.DataFrame] = None,
    df_labels: list = None,
    target_column: str = "Actual",
    log_x=False,
    log_y=False,
    skip_first_n=0,
    grid=True,
    figsize: tuple = None,
    filename=None,
    title="",
    tkagg=False,
    **kwargs,
) -> None:
    """Plot actual vs predicted values for machine learning models.

    This function plots the actual vs predicted values for different machine learning models
    on a given dataset. The function takes a list of pandas dataframes as input,
    each containing the actual and predicted values for one model. The function also takes
    an optional list of labels for each model and boolean flags to indicate whether
    to use logarithmic scales for the x-axis and y-axis.

    Args:
        df_true (list[pd.DataFrame], optional):
            A list of pandas dataframes containing the actual and predicted values for each model.
            Each dataframe should have an index column with the dataset name and two columns with
            the label names: e.g., "Actual" and "Prediction".
            If None, no plot is generated. Default is None.
        df_labels (list, optional):
            A list of strings containing the labels for each model.
            The length of this list should match the length of df_true.
            If None, numeric indices are used as labels. Default is None.
        target_column (str, optional):
            The name of the column containing the target variable. Default is "Actual".
        log_x (bool, optional):
            A flag indicating whether to use logarithmic scale for the x-axis.
            If True, log scale is used. If False, linear scale is used. Default is False.
        log_y (bool, optional):
            A flag indicating whether to use logarithmic scale for the y-axis.
            If True, log scale is used. If False, linear scale is used. Default is False.
        skip_first_n (int, optional):
            The number of rows to skip from the beginning of the dataframes. Default is 0.
        grid (bool, optional):
            A flag indicating whether to plot a grid. If True, grid is shown. Default is True.
        figsize (tuple, optional):
            The size of the figure. Default is None.
        filename (str, optional):
            The name of the file to save the plot to. If None, the plot is not saved. Default is None.
        title (str, optional):
            The title of the plot. Default is an empty string.
        tkagg (bool, optional):
            A flag indicating whether to use the TkAgg backend for plotting.
            If True, the TkAgg backend is used. If False, the default backend is used.
            Default: False.
        **kwargs (Any): Additional keyword arguments to be passed to the plot function.

    Returns:
        (NoneType): This function does not return anything.

    Examples:
        >>> from sklearn.metrics import accuracy_score
        >>> from spotriver.evaluation.eval_bml import plot_bml_oml_horizon_predictions
        >>> df_true = pd.DataFrame({"Actual": [0.5, 0.75, 0.9], "Prediction": [0.1, 0.2, 0.3]})
        >>> df_labels = ["Model 1", "Model 2", "Model 3"]
        >>> plot_bml_oml_horizon_predictions(df_true, df_labels, target_column="Actual")

    """
    if tkagg:
        matplotlib.use("TkAgg")
    if figsize is None:
        figsize = (10, 5)
    if df_true is not None:
        df_plot = copy.deepcopy(df_true)
        if df_plot.__class__ != list:
            df_plot = [df_plot]
        plt.figure(figsize=figsize)
        for j, df in enumerate(df_plot):
            if df_labels is None:
                label = f"{j}"
            else:
                label = df_labels[j]
            df.loc[: skip_first_n - 1, "Prediction"] = np.nan
            plt.plot(df.index, df["Prediction"], label=label, **kwargs)
        plt.plot(df_plot[0].index, df_plot[0][target_column], label="Actual", color="black", **kwargs)
        plt.title(f"Actual vs Prediction for {title}")
        if log_x:
            plt.xscale("log")
        if log_y:
            plt.yscale("log")
        plt.grid(grid)
        plt.legend()
        if filename is not None:
            plt.savefig(filename)
    plt.show()