validation

plot_actual_vs_predicted(y_test, y_pred, title=None, show=True, filename=None)

Plot actual vs. predicted values.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `y_test` | `ndarray` | True values. | required |
| `y_pred` | `ndarray` | Predicted values. | required |
| `title` | `str` | Title of the plot. Defaults to None. | `None` |
| `show` | `bool` | If True, the plot is shown. Defaults to True. | `True` |
| `filename` | `str` | Name of the file to save the plot. Defaults to None. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `NoneType` | None |

Examples:

>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import LinearRegression
>>> from spotpython.plot.validation import plot_actual_vs_predicted
>>> X, y = load_diabetes(return_X_y=True)
>>> lr = LinearRegression()
>>> lr.fit(X, y)
>>> y_pred = lr.predict(X)
>>> plot_actual_vs_predicted(y, y_pred)
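
A variation (a sketch; the file name is arbitrary) that writes the figure to disk instead of opening a window:

>>> plot_actual_vs_predicted(y, y_pred, title="Diabetes, linear model", show=False, filename="actual_vs_predicted.png")
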
Source code in spotpython/plot/validation.py (lines 249-303)
def plot_actual_vs_predicted(y_test, y_pred, title=None, show=True, filename=None) -> None:
    """Plot actual vs. predicted values.

    Args:
        y_test (np.ndarray):
            True values.
        y_pred (np.ndarray):
            Predicted values.
        title (str, optional):
            Title of the plot. Defaults to None.
        show (bool, optional):
            If True, the plot is shown. Defaults to True.
        filename (str, optional):
            Name of the file to save the plot. Defaults to None.

    Returns:
        (NoneType): None

    Examples:
        >>> from sklearn.datasets import load_diabetes
        >>> from sklearn.linear_model import LinearRegression
        >>> from spotpython.plot.validation import plot_actual_vs_predicted
        >>> X, y = load_diabetes(return_X_y=True)
        >>> lr = LinearRegression()
        >>> lr.fit(X, y)
        >>> y_pred = lr.predict(X)
        >>> plot_actual_vs_predicted(y, y_pred)
    """
    fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
    PredictionErrorDisplay.from_predictions(
        y_test,
        y_pred=y_pred,
        kind="actual_vs_predicted",
        subsample=100,
        ax=axs[0],
        random_state=0,
        scatter_kwargs={"alpha": 0.5},
    )
    axs[0].set_title("Actual vs. Predicted values")
    PredictionErrorDisplay.from_predictions(
        y_test,
        y_pred=y_pred,
        kind="residual_vs_predicted",
        subsample=100,
        ax=axs[1],
        random_state=0,
    )
    axs[1].set_title("Residuals vs. Predicted Values")
    if title is not None:
        fig.suptitle(title)
    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename)
    if show:
        plt.show()

plot_confusion_matrix(model=None, fun_control=None, df=None, title=None, target_names=None, y_true_name=None, y_pred_name=None, show=False, ax=None)

Plots a confusion matrix. If a model and the fun_control dictionary are passed, the confusion matrix is computed from the model's predictions on the test data. If a dataframe is passed, the confusion matrix is computed from the dataframe; in this case, the names of the columns with the true and the predicted values must be specified. By default, the dataframe is None.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model` | `Any` | Sklearn model. The model used to compute predictions on the test data. Defaults to None. | `None` |
| `fun_control` | `Dict` | Dictionary containing the data and the target column. Defaults to None. | `None` |
| `title` | `str` | Title of the plot. Defaults to None. | `None` |
| `df` | `DataFrame` | Dataframe containing the predictions and the target column. Defaults to None. | `None` |
| `target_names` | `List[str]` | List of target names. Defaults to None. | `None` |
| `y_true_name` | `str` | Name of the column with the true values if a dataframe is specified. Defaults to None. | `None` |
| `y_pred_name` | `str` | Name of the column with the predicted values if a dataframe is specified. Defaults to None. | `None` |
| `show` | `bool` | If True, the plot is shown. Defaults to False. | `False` |
| `ax` | `AxesSubplot` | Axes to plot the confusion matrix. Defaults to None. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `NoneType` | None |
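
A minimal usage sketch for the dataframe path (the column names "y" and "Prediction" and the class labels are illustrative, not part of the original docstring):

>>> import pandas as pd
>>> from spotpython.plot.validation import plot_confusion_matrix
>>> df = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [1, 0, 1, 1]})
>>> plot_confusion_matrix(df=df, y_true_name="y", y_pred_name="Prediction", target_names=["neg", "pos"], show=True)
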

Source code in spotpython/plot/validation.py (lines 186-246)
def plot_confusion_matrix(
    model=None,
    fun_control=None,
    df=None,
    title=None,
    target_names=None,
    y_true_name=None,
    y_pred_name=None,
    show=False,
    ax=None,
):
    """
    Plots a confusion matrix. If a model and the fun_control dictionary are passed,
    the confusion matrix is computed from the model's predictions on the test data.
    If a dataframe is passed, the confusion matrix is computed from the dataframe;
    in this case, the names of the columns with the true and the predicted values
    must be specified. By default, the dataframe is None.

    Args:
        model (Any, optional):
            Sklearn model. The model used to compute predictions on the test data. Defaults to None.
        fun_control (Dict, optional):
            Dictionary containing the data and the target column. Defaults to None.
        title (str, optional):
            Title of the plot. Defaults to None.
        df (pd.DataFrame, optional):
            Dataframe containing the predictions and the target column. Defaults to None.
        target_names (List[str], optional):
            List of target names. Defaults to None.
        y_true_name (str, optional):
            Name of the column with the true values if a dataframe is specified. Defaults to None.
        y_pred_name (str, optional):
            Name of the column with the predicted values if a dataframe is specified. Defaults to None.
        show (bool, optional):
            If True, the plot is shown. Defaults to False.
        ax (matplotlib.axes._subplots.AxesSubplot, optional):
            Axes to plot the confusion matrix. Defaults to None.

    Returns:
        (NoneType): None

    """
    if df is not None:
        # assign the column y_true_name from df to y_true
        y_true = df[y_true_name]
        # assign the column y_pred_name from df to y_pred
        y_pred = df[y_pred_name]
    else:
        X_train, y_train = get_Xy_from_df(fun_control["train"], fun_control["target_column"])
        X_test, y_true = get_Xy_from_df(fun_control["test"], fun_control["target_column"])
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    if ax is None:
        fig, ax = plt.subplots(figsize=(10, 5))
    ConfusionMatrixDisplay.from_predictions(y_true=y_true, y_pred=y_pred, ax=ax, colorbar=False)
    if target_names is not None:
        ax.xaxis.set_ticklabels(target_names)
        ax.yaxis.set_ticklabels(target_names)
    if title is not None:
        _ = ax.set_title(title)
    if show:
        plt.show()

plot_cv_predictions(model, fun_control, show=True)

Plots cross-validated predictions for regression.

Uses sklearn.model_selection.cross_val_predict together with sklearn.metrics.PredictionErrorDisplay to visualize prediction errors. It is based on the example from the scikit-learn documentation: https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_predict.html#sphx-glr-download-auto-examples-model-selection-plot-cv-predict-py

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model` | `Any` | Sklearn model. The model to be used for cross-validation. | required |
| `fun_control` | `Dict` | Dictionary containing the data and the target column. | required |
| `show` | `bool` | If True, the plot is shown. Defaults to True. | `True` |

Returns:

| Type | Description |
| --- | --- |
| `NoneType` | None |

Examples:

>>> from sklearn.datasets import load_diabetes
>>> from sklearn.linear_model import LinearRegression
>>> from spotpython.plot.validation import plot_cv_predictions
>>> df = load_diabetes(as_frame=True).frame  # feature columns plus a "target" column
>>> fun_control = {"train": df, "test": df, "target_column": "target"}
>>> lr = LinearRegression()
>>> plot_cv_predictions(lr, fun_control)
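
Internally the function defers to scikit-learn's cross_val_predict and PredictionErrorDisplay (see the source below); assuming df and lr from the example above, the equivalent direct calls look roughly like this:

>>> import matplotlib.pyplot as plt
>>> from sklearn.metrics import PredictionErrorDisplay
>>> from sklearn.model_selection import cross_val_predict
>>> X_test, y_test = df.drop(columns="target"), df["target"]
>>> y_pred = cross_val_predict(lr, X_test, y_test, cv=10)
>>> PredictionErrorDisplay.from_predictions(y_test, y_pred=y_pred, kind="actual_vs_predicted")
>>> plt.show()
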
Source code in spotpython/plot/validation.py (lines 13-66)
def plot_cv_predictions(model: Any, fun_control: Dict, show=True) -> None:
    """
    Plots cross-validated predictions for regression.

    Uses `sklearn.model_selection.cross_val_predict` together with
    `sklearn.metrics.PredictionErrorDisplay` to visualize prediction errors.
    It is based on the example from the scikit-learn documentation:
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_predict.html#sphx-glr-download-auto-examples-model-selection-plot-cv-predict-py

    Args:
        model (Any):
            Sklearn model. The model to be used for cross-validation.
        fun_control (Dict):
            Dictionary containing the data and the target column.
        show (bool, optional):
            If True, the plot is shown. Defaults to True.

    Returns:
        (NoneType): None

    Examples:
        >>> from sklearn.datasets import load_diabetes
        >>> from sklearn.linear_model import LinearRegression
        >>> from spotpython.plot.validation import plot_cv_predictions
        >>> df = load_diabetes(as_frame=True).frame  # feature columns plus a "target" column
        >>> fun_control = {"train": df, "test": df, "target_column": "target"}
        >>> lr = LinearRegression()
        >>> plot_cv_predictions(lr, fun_control)
    """
    X_test, y_test = get_Xy_from_df(fun_control["test"], fun_control["target_column"])
    # cross_val_predict returns an array of the same size of y
    # where each entry is a prediction obtained by cross validation.
    y_pred = cross_val_predict(model, X_test, y_test, cv=10)
    fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
    PredictionErrorDisplay.from_predictions(
        y_test,
        y_pred=y_pred,
        kind="actual_vs_predicted",
        subsample=100,
        ax=axs[0],
        random_state=0,
    )
    axs[0].set_title("Actual vs. Predicted values")
    PredictionErrorDisplay.from_predictions(
        y_test,
        y_pred=y_pred,
        kind="residual_vs_predicted",
        subsample=100,
        ax=axs[1],
        random_state=0,
    )
    axs[1].set_title("Residuals vs. Predicted Values")
    fig.suptitle("Plotting cross-validated predictions")
    plt.tight_layout()
    if show:
        plt.show()

plot_roc(model_list, fun_control, alpha=0.8, model_names=None, show=True)

Plots ROC curves for a list of models using the Visualization API from scikit-learn.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `model_list` | `List[BaseEstimator]` | A list of scikit-learn models to plot ROC curves for. | required |
| `fun_control` | `Dict[str, Union[str, DataFrame]]` | A dictionary containing the train and test dataframes and the target column name. | required |
| `alpha` | `float` | The alpha value for the ROC curve. Defaults to 0.8. | `0.8` |
| `model_names` | `List[str]` | A list of names for the models. Defaults to None. | `None` |
| `show` | `bool` | If True, the plot is shown. Defaults to True. | `True` |

Returns:

| Type | Description |
| --- | --- |
| `NoneType` | None |

Examples:

>>> import pandas as pd
>>> from sklearn.datasets import load_iris
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.tree import DecisionTreeClassifier
>>> from spotpython.plot.validation import plot_roc
>>> iris = load_iris()
>>> # keep only the first two classes so the binary ROC curve is well defined
>>> X, y = iris.data[iris.target < 2], iris.target[iris.target < 2]
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
>>> train_df = pd.DataFrame(X_train, columns=iris.feature_names)
>>> train_df['target'] = y_train
>>> test_df = pd.DataFrame(X_test, columns=iris.feature_names)
>>> test_df['target'] = y_test
>>> fun_control = {"train": train_df, "test": test_df, "target_column": "target"}
>>> model_list = [LogisticRegression(), DecisionTreeClassifier()]
>>> model_names = ["Logistic Regression", "Decision Tree"]
>>> plot_roc(model_list, fun_control, model_names=model_names)
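
Note that plot_roc builds each curve from hard class predictions (model.predict), so every curve reduces to a single operating point. For a score-based ROC on the same split, one could plot directly with scikit-learn's RocCurveDisplay.from_estimator (a sketch reusing the variables from the example above):

>>> import matplotlib.pyplot as plt
>>> from sklearn.metrics import RocCurveDisplay
>>> ax = plt.gca()
>>> for name, model in zip(model_names, model_list):
...     model.fit(X_train, y_train)
...     RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax, name=name)
>>> plt.show()
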
Source code in spotpython/plot/validation.py (lines 69-124)
def plot_roc(
    model_list: List[BaseEstimator],
    fun_control: Dict[str, Union[str, pd.DataFrame]],
    alpha: float = 0.8,
    model_names: List[str] = None,
    show=True,
) -> None:
    """
    Plots ROC curves for a list of models using the Visualization API from scikit-learn.

    Args:
        model_list (List[BaseEstimator]):
            A list of scikit-learn models to plot ROC curves for.
        fun_control (Dict[str, Union[str, pd.DataFrame]]):
            A dictionary containing the train and test dataframes and the target column name.
        alpha (float, optional):
            The alpha value for the ROC curve. Defaults to 0.8.
        model_names (List[str], optional):
            A list of names for the models. Defaults to None.
        show (bool, optional):
            If True, the plot is shown. Defaults to True.

    Returns:
        (NoneType): None

    Examples:
        >>> import pandas as pd
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> from spotpython.plot.validation import plot_roc
        >>> iris = load_iris()
        >>> # keep only the first two classes so the binary ROC curve is well defined
        >>> X, y = iris.data[iris.target < 2], iris.target[iris.target < 2]
        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
        >>> train_df = pd.DataFrame(X_train, columns=iris.feature_names)
        >>> train_df['target'] = y_train
        >>> test_df = pd.DataFrame(X_test, columns=iris.feature_names)
        >>> test_df['target'] = y_test
        >>> fun_control = {"train": train_df, "test": test_df, "target_column": "target"}
        >>> model_list = [LogisticRegression(), DecisionTreeClassifier()]
        >>> model_names = ["Logistic Regression", "Decision Tree"]
        >>> plot_roc(model_list, fun_control, model_names=model_names)
    """
    X_train, y_train = get_Xy_from_df(fun_control["train"], fun_control["target_column"])
    X_test, y_test = get_Xy_from_df(fun_control["test"], fun_control["target_column"])
    ax = plt.gca()
    for i, model in enumerate(model_list):
        model.fit(X_train, y_train)
        if model_names is not None:
            model_name = model_names[i]
        else:
            model_name = None
        y_pred = model.predict(X_test)
        RocCurveDisplay.from_predictions(y_test, y_pred, ax=ax, alpha=alpha, name=model_name)
    if show:
        plt.show()

plot_roc_from_dataframes(df_list, alpha=0.8, model_names=None, target_column=None, show=True, title='', tkagg=False)

Plot ROC curve for a list of dataframes from model evaluations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df_list` | `List[DataFrame]` | List of dataframes with results from models. | required |
| `alpha` | `float` | Transparency of the plotted lines. | `0.8` |
| `model_names` | `List[str]` | List of model names. | `None` |
| `target_column` | `str` | Name of the target column. | `None` |
| `show` | `bool` | If True, the plot is shown. | `True` |
| `title` | `str` | Title of the plot. | `''` |
| `tkagg` | `bool` | If True, the TkAgg backend is used. Default is False. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `None` | None |

Examples:

>>> import pandas as pd
>>> from spotpython.plot.validation import plot_roc_from_dataframes
>>> df1 = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [1, 0, 0, 0]})
>>> df2 = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [1, 0, 1, 1]})
>>> df_list = [df1, df2]
>>> model_names = ["Model 1", "Model 2"]
>>> plot_roc_from_dataframes(df_list, model_names=model_names, target_column="y")
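
Because the "Prediction" column is forwarded to RocCurveDisplay.from_predictions as the score, it may also hold predicted probabilities instead of hard labels, which yields a smoother curve (the probability values below are made up for illustration):

>>> df1 = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [0.9, 0.2, 0.4, 0.7]})
>>> df2 = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [0.6, 0.3, 0.8, 0.9]})
>>> plot_roc_from_dataframes([df1, df2], model_names=["Model 1", "Model 2"], target_column="y")
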
Source code in spotpython/plot/validation.py (lines 127-183)
def plot_roc_from_dataframes(
    df_list: List[pd.DataFrame],
    alpha: float = 0.8,
    model_names: List[str] = None,
    target_column: str = None,
    show: bool = True,
    title: str = "",
    tkagg: bool = False,
) -> None:
    """
    Plot ROC curve for a list of dataframes from model evaluations.

    Args:
        df_list:
            List of dataframes with results from models.
        alpha:
            Transparency of the plotted lines.
        model_names:
            List of model names.
        target_column:
            Name of the target column.
        show:
            If True, the plot is shown.
        title:
            Title of the plot.
        tkagg:
            If True, the TkAgg backend is used.
            Default is False.

    Returns:
        None

    Examples:
        >>> import pandas as pd
        >>> from spotpython.plot.validation import plot_roc_from_dataframes
        >>> df1 = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [1, 0, 0, 0]})
        >>> df2 = pd.DataFrame({"y": [1, 0, 0, 1], "Prediction": [1, 0, 1, 1]})
        >>> df_list = [df1, df2]
        >>> model_names = ["Model 1", "Model 2"]
        >>> plot_roc_from_dataframes(df_list, model_names=model_names, target_column="y")

    """
    if tkagg:
        matplotlib.use("TkAgg")
    fig, ax = plt.subplots(figsize=(10, 5))
    for i, df in enumerate(df_list):
        y_test = df[target_column]
        y_pred = df["Prediction"]
        if model_names is not None:
            model_name = model_names[i]
        else:
            model_name = None
        RocCurveDisplay.from_predictions(y_test, y_pred, ax=ax, alpha=alpha, name=model_name)
    # add a title to the plot
    ax.set_title(title)
    if show:
        plt.show()