Skip to content

effects

plot_all_partial_dependence(df, df_target, model='GradientBoostingRegressor', nrows=5, ncols=6, figsize=(20, 15))

Generates Partial Dependence Plots (PDPs) for every feature in a DataFrame against a target variable, arranged in a grid.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing the features.

required
df_target Series

Series containing the target variable.

required
model str

Name of the model class to use (e.g., “GradientBoostingRegressor”). Defaults to “GradientBoostingRegressor”.

'GradientBoostingRegressor'
nrows int

Number of rows in the grid of subplots. Defaults to 5.

5
ncols int

Number of columns in the grid of subplots. Defaults to 6.

6
figsize tuple

Figure size (width, height) in inches. Defaults to (20, 15).

(20, 15)

Returns:

Type Description
None

None

Examples:

>>> from spotpython.utils.effects import plot_all_partial_dependence
>>> from sklearn.datasets import load_diabetes
>>> # load_boston was removed from scikit-learn; load_diabetes provides 10 numeric features
>>> df, df_target = load_diabetes(return_X_y=True, as_frame=True)
>>> plot_all_partial_dependence(df, df_target, model="GradientBoostingRegressor", nrows=5, ncols=6, figsize=(20, 15))
Source code in spotpython/utils/effects.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def plot_all_partial_dependence(df, df_target, model="GradientBoostingRegressor", nrows=5, ncols=6, figsize=(20, 15)) -> None:
    """
    Generates Partial Dependence Plots (PDPs) for every feature in a DataFrame against a target variable,
    arranged in a grid.

    The data is split 80/20 with a fixed seed (``random_state=42``), the selected
    regressor is fitted on the training part, and one PDP per column of ``df`` is
    drawn into an ``nrows`` x ``ncols`` grid. Unused grid cells are removed and the
    figure is shown with ``plt.show()``.

    Args:
        df (pd.DataFrame): DataFrame containing the features.
        df_target (pd.Series): Series containing the target variable.
        model (str, optional): Name of the model class to use. Supported values are
                               "GradientBoostingRegressor", "RandomForestRegressor" and
                               "DecisionTreeRegressor".
                               Defaults to "GradientBoostingRegressor".
        nrows (int, optional): Number of rows in the grid of subplots. Defaults to 5.
        ncols (int, optional): Number of columns in the grid of subplots. Defaults to 6.
        figsize (tuple, optional): Figure size (width, height) in inches. Defaults to (20, 15).

    Returns:
        None

    Raises:
        ValueError: If ``df`` has more columns than the grid has cells
            (``nrows * ncols``), or if ``model`` is not one of the supported names.

    Examples:
        >>> from spotpython.utils.effects import plot_all_partial_dependence
        >>> from sklearn.datasets import load_diabetes
        >>> df, df_target = load_diabetes(return_X_y=True, as_frame=True)
        >>> plot_all_partial_dependence(df, df_target, model="GradientBoostingRegressor", nrows=2, ncols=5, figsize=(20, 8))

    """
    features = list(df.columns)

    # Validate the grid before any expensive work: indexing past the last
    # axis would otherwise fail only after the model has been trained.
    n_cells = nrows * ncols
    if len(features) > n_cells:
        raise ValueError(
            f"Grid of {nrows} x {ncols} = {n_cells} subplots is too small for {len(features)} features."
        )

    # Instantiate the model
    if model == "GradientBoostingRegressor":
        estimator = GradientBoostingRegressor(random_state=42)
    elif model == "RandomForestRegressor":
        from sklearn.ensemble import RandomForestRegressor

        estimator = RandomForestRegressor(random_state=42)
    elif model == "DecisionTreeRegressor":
        from sklearn.tree import DecisionTreeRegressor

        estimator = DecisionTreeRegressor(random_state=42)
    else:
        raise ValueError(f"Unsupported model: {model}")

    # Only the training part is used, both for fitting and for the PDPs.
    X_train, _, y_train, _ = train_test_split(df, df_target, test_size=0.2, random_state=42)
    estimator.fit(X_train, y_train)

    # squeeze=False guarantees a 2D array of axes even for a 1x1 or 1xN grid,
    # so flattening is always valid.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    # Generate one PDP per feature, each in its own subplot
    for ax, feature in zip(axes, features):
        PartialDependenceDisplay.from_estimator(estimator, X_train, [feature], ax=ax)
        ax.set_title(feature)

    # Remove the grid cells that were not needed
    for unused_ax in axes[len(features):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()  # Adjust subplot parameters for a tight layout
    plt.show()

screening(X, fun, xi, p, labels, range=None, print=False)

Generates a DataFrame with elementary effect screening metrics.

This function calculates the mean and standard deviation of the elementary effects for a given set of design variables and returns the results as a Pandas DataFrame.

Parameters:

Name Type Description Default
X ndarray

The screening plan matrix, typically structured within a [0,1]^k box.

required
fun object

The objective function to evaluate at each design point in the screening plan.

required
xi float

The elementary effect step length factor.

required
p int

Number of discrete levels along each dimension.

required
labels list of str

A list of variable names corresponding to the design variables.

required
range ndarray

A 2xk matrix where the first row contains lower bounds and the second row contains upper bounds for each variable.

None

Returns:

Type Description
DataFrame

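pd.DataFrame: Returned only when print=True; when print=False the function draws a plot of the sample means against the sample standard deviations and returns None. The DataFrame is sorted by decreasing mean and contains three columns: - ‘varname’: The name of each variable. - ‘mean’: The absolute value of the mean of the elementary effects for each variable. - ‘sd’: The standard deviation of the elementary effects for each variable.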

Examples:

>>> import numpy as np
>>> from spotpython.fun.objectivefunctions import Analytical
>>> from spotpython.utils.effects import screening
>>>
>>> # Create a small test input with shape (n, 10)
>>> X_test = np.array([
...     [0.0]*10,
...     [1.0]*10
... ])
>>> fun = Analytical()
>>> labels = ["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10"]
>>> result = screening(X_test, fun.fun_wingwt, 0.1, 3, labels, range=np.array([[0]*10, [1]*10]), print=True)
>>> print(result)
Source code in spotpython/utils/effects.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
def screening(X, fun, xi, p, labels, range=None, print=False) -> pd.DataFrame:
    """Generates a DataFrame with elementary effect screening metrics.

    This function calculates the mean and standard deviation of the
    elementary effects for a given set of design variables. Depending on
    `print`, the result is either returned as a Pandas DataFrame or shown
    as a plot of sample means against sample standard deviations.

    The screening plan is read as `r = n // (k + 1)` consecutive groups of
    `k + 1` rows; within a group, each pair of neighbouring rows is expected
    to differ in at least one variable, and the effect is attributed to the
    first variable that differs.

    Args:
        X (np.ndarray): The screening plan matrix of shape (n, k), typically
            structured within a [0,1]^k box. It is not modified.
        fun (object): The objective function to evaluate at each
            design point in the screening plan. It is called with one row
            of the (scaled) plan at a time.
        xi (float): The elementary effect step length factor.
        p (int): Number of discrete levels along each dimension.
        labels (list of str): A list of variable names corresponding to
            the design variables.
        range (np.ndarray): A 2xk matrix where the first row contains
            lower bounds and the second row contains upper bounds for
            each variable. If None, the plan is evaluated unscaled.
        print (bool): If True, return the metrics as a DataFrame sorted by
            decreasing mean. If False, draw the plot and return None.

    Returns:
        pd.DataFrame: Only when `print` is True (otherwise None). A DataFrame
            containing three columns:
            - 'varname': The name of each variable.
            - 'mean': The absolute value of the mean of the elementary
            effects for each variable.
            - 'sd': The standard deviation of the elementary effects for
            each variable.

    Examples:
        >>> import numpy as np
        >>> from spotpython.utils.effects import screening
        >>> X = np.array([[0.0, 0.0], [0.5, 0.0], [0.5, 0.5]])
        >>> fun = lambda x: 2.0 * x[0] + 3.0 * x[1]
        >>> result = screening(X, fun, xi=1.0, p=3, labels=["a", "b"], print=True)
        >>> result["varname"].tolist()
        ['b', 'a']
        >>> result["mean"].tolist()
        [3.0, 2.0]
    """
    # The parameter names `range` and `print` shadow the builtins inside this
    # function, so the builtin range must be reached explicitly.
    import builtins

    # Work on a private float copy: scaling must not alter the caller's plan
    # (repeated calls would otherwise scale it again) nor truncate to integers.
    plan = np.array(X, dtype=float)

    # Number of points (n) and number of design variables (k)
    n_points, k = plan.shape
    # Number of repetitions (r)
    r = n_points // (k + 1)

    # Scale each design point to the given range
    if range is not None:
        bounds = np.asarray(range, dtype=float)
        plan = bounds[0, :] + plan * (bounds[1, :] - bounds[0, :])

    # Evaluate the objective function at every design point
    t = np.zeros(n_points)
    for i, point in enumerate(plan):
        t[i] = fun(point)

    # Calculate the elementary effects
    F = np.zeros((k, r))
    step = xi / (p - 1)
    for i in builtins.range(r):
        first = i * (k + 1)
        for j in builtins.range(first, first + k):
            # First variable that differs between two neighbouring rows
            index = np.flatnonzero(plan[j, :] - plan[j + 1, :])[0]
            F[index, i] = (t[j + 1] - t[j]) / step

    # Compute statistical measures
    ssd = np.std(F, axis=1)
    sm = np.abs(np.mean(F, axis=1))

    if print:
        # sort the variables by decreasing mean
        idx = np.argsort(-sm)
        labels = [labels[i] for i in idx]
        sm = sm[idx]
        ssd = ssd[idx]
        df = pd.DataFrame({"varname": labels, "mean": sm, "sd": ssd})

        return df

    # Generate plot: each variable is drawn as its label at (mean, sd)
    plt.figure()

    for i in builtins.range(k):
        plt.text(sm[i], ssd[i], labels[i], fontsize=10)

    plt.axis([min(sm), 1.1 * max(sm), min(ssd), 1.1 * max(ssd)])
    plt.xlabel("Sample means")
    plt.ylabel("Sample standard deviations")
    plt.gca().tick_params(labelsize=10)
    plt.grid(True)
    plt.show()