Skip to content

repair

apply_penalty_NA(y, penalty_NA, sd=0.1, stop_on_zero_return=False)

Replaces non-finite values (NaN as well as ±inf) in y with the penalty value penalty_NA plus Gaussian noise of standard deviation sd, and issues a warning if necessary.

Parameters:

Name Type Description Default
y ndarray

y array

required
penalty_NA float

penalty value to replace NaN values in y

required
sd float

standard deviation for the random noise added to penalty_NA. Default is 0.1.

0.1
stop_on_zero_return bool

whether to stop if the returned dimension is less than 1. Default is False.

False

Returns:

Type Description
ndarray

numpy.ndarray: y array with NaN values replaced by penalty value

Examples:

>>> import numpy as np
>>> from spotpython.utils.repair import apply_penalty_NA
>>> y = np.array([1, np.nan, 2])
>>> y_cleaned = apply_penalty_NA(y, 0, sd=0)
>>> print(y_cleaned)
[1. 0. 2.]
Source code in spotpython/utils/repair.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def apply_penalty_NA(y: np.ndarray, penalty_NA: float, sd=0.1, stop_on_zero_return: bool = False) -> np.ndarray:
    """
    Replace non-finite entries of y by a noisy penalty value.

    Every entry of y for which ``np.isfinite`` is False (NaN, +inf, -inf) is
    replaced by ``penalty_NA`` plus a draw from a normal distribution with
    mean 0 and standard deviation ``sd``. The input array is not modified;
    a copy is returned. The shape of the result always equals the shape of y.

    Args:
        y (numpy.ndarray): y array
        penalty_NA (float): penalty value that replaces non-finite values in y.
            Python and NumPy integer/floating scalars are accepted. If any
            other type is passed (for example None), y is returned unchanged
            and no replacement takes place.
        sd (float): standard deviation for the random noise added to penalty_NA.
            Use 0 for a deterministic replacement. Default is 0.1.
        stop_on_zero_return (bool): whether to raise an error if y contains no
            finite value at all. Default is False.

    Returns:
        numpy.ndarray: copy of y with non-finite values replaced by the
        (noisy) penalty value

    Raises:
        TypeError: if y is not a numpy array, if sd is not numeric, or if
            stop_on_zero_return is not a bool.
        ValueError: if stop_on_zero_return is True and y has no finite entry.

    Warns:
        UserWarning: if at least one value was replaced.

    Examples:
        >>> import numpy as np
        >>> from spotpython.utils.repair import apply_penalty_NA
        >>> y = np.array([1, np.nan, 2])
        >>> y_cleaned = apply_penalty_NA(y, 0, sd=0)
        >>> print(y_cleaned)
        [1. 0. 2.]
    """
    if not isinstance(y, np.ndarray):
        raise TypeError("Input y must be a numpy array.")

    # NumPy scalars such as np.int64 or np.float32 are not instances of the
    # builtin int/float, so they have to be listed explicitly.
    numeric_types = (int, float, np.integer, np.floating)

    # A non-numeric penalty (e.g. None) is treated as "no penalty configured":
    # the input is handed back untouched.
    if not isinstance(penalty_NA, numeric_types):
        return y

    if not isinstance(sd, numeric_types):
        raise TypeError("sd must be a numeric value.")

    if not isinstance(stop_on_zero_return, bool):
        raise TypeError("stop_on_zero_return must be a boolean value.")

    # Count elements (not rows) so that the comparison with the element-wise
    # count of non-finite values below is consistent for any array shape.
    n_total = y.size
    nan_ind = ~np.isfinite(y)
    nan_dim = int(np.count_nonzero(nan_ind))

    # Noise is drawn for the full shape (not only for the replaced entries) so
    # that the number of values consumed from the global random stream does
    # not depend on how many entries are non-finite.
    random_values = np.random.normal(0, sd, y.shape)
    penalty_values = penalty_NA + random_values

    y_cleaned = y.copy()
    y_cleaned[nan_ind] = penalty_values[nan_ind]

    # The array keeps its size, so report the replacement itself rather than
    # a (non-existent) reduction of the dimension.
    if nan_dim > 0:
        warnings.warn(f"\n!!! {nan_dim} of {n_total} value(s) in y were not finite " f"and have been replaced by the penalty value {penalty_NA} (noise sd={sd}).")

    if (n_total - nan_dim) < 1 and stop_on_zero_return:
        raise ValueError("!!!! The dimension of the returned y array is less than 1. Check the input data.")

    return y_cleaned

remove_nan(X, y, stop_on_zero_return=False)

Remove rows from X and y where y is not finite (NaN or ±inf) and issue a warning if the dimension of the returned y array is smaller than the dimension of the original y array. Raises a ValueError if the dimension of the returned y array is less than 1 and stop_on_zero_return is True.

Parameters:

Name Type Description Default
X ndarray

X array

required
y ndarray

y array

required
stop_on_zero_return bool

whether to stop if the returned dimension is less than 1. Default is False.

False

Returns:

Type Description
Tuple[ndarray, ndarray]

Tuple[numpy.ndarray, np.ndarray]: X and y arrays with rows containing NaN values in y removed.

Examples:

>>> import numpy as np
>>> from spotpython.utils.repair import remove_nan
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> y = np.array([1, np.nan, 2])
>>> X_cleaned, y_cleaned = remove_nan(X, y)
>>> print(X_cleaned, y_cleaned)
[[1 2]
 [5 6]] [1. 2.]
Source code in spotpython/utils/repair.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def remove_nan(X: np.ndarray, y: np.ndarray, stop_on_zero_return: bool = False) -> Tuple[np.ndarray, np.ndarray]:
    """Drop the rows of X and the entries of y at which y is not finite.

    Entries of y are kept only where ``np.isfinite(y)`` is True, so NaN as
    well as +inf/-inf are removed. The same boolean selection is applied to
    the first axis of X. A warning is issued whenever at least one entry
    was removed.

    Args:
        X (numpy.ndarray):
            X array; indexed as ``X[mask, :]``, i.e. it must have at least
            two dimensions.
        y (numpy.ndarray):
            y array
        stop_on_zero_return (bool):
            whether to raise a ValueError if no entry of y is left.
            Default is False.

    Returns:
        Tuple[numpy.ndarray, np.ndarray]:
            X and y restricted to the positions where y is finite.

    Raises:
        ValueError: if no entry of y remains and stop_on_zero_return is True.

    Examples:
        >>> import numpy as np
        >>> from spotpython.utils.repair import remove_nan
        >>> X = np.array([[1, 2], [3, 4], [5, 6]])
        >>> y = np.array([1, np.nan, 2])
        >>> X_cleaned, y_cleaned = remove_nan(X, y)
        >>> print(X_cleaned, y_cleaned)
        [[1 2]
         [5 6]] [1. 2.]
    """
    n_before = y.shape[0]

    # Boolean selector: True at every position whose y value is finite
    keep = np.isfinite(y)
    X_kept, y_kept = X[keep, :], y[keep]

    n_after = y_kept.shape[0]
    shrunk = n_after < n_before
    nothing_left = n_after < 1

    if shrunk:
        # Two separate warnings: first the facts, then the advice
        warnings.warn(f"\n!!! The dimension of the returned y array is {n_after}, which is smaller than the original dimension {n_before}.")
        warnings.warn("\n!!! Check whether to continue with the reduced dimension is useful.")

    # An empty result is only fatal when the caller asked for it
    if stop_on_zero_return and nothing_left:
        raise ValueError("!!!! The dimension of the returned y array is less than 1. Check the input data.")

    return X_kept, y_kept

repair_non_numeric(X, var_type)

Round non-numeric values to integers. This applies to all variables except for “num” and “float”.

Parameters:

Name Type Description Default
X ndarray

X array

required
var_type list

list with type information

required

Returns:

Type Description
ndarray

numpy.ndarray: X array with non-numeric values rounded to integers

Examples:

>>> X = np.array([[1.2, 2.3], [3.4, 4.5]])
>>> var_type = ["num", "factor"]
>>> repair_non_numeric(X, var_type)
array([[1.2, 2. ],
       [3.4, 4. ]])
Source code in spotpython/utils/repair.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def repair_non_numeric(X: np.ndarray, var_type: List[str]) -> np.ndarray:
    """
    Round the columns of X that are not continuous to whole numbers.

    A column counts as continuous when its entry in var_type is "num" or
    "float"; every other column is rounded with ``np.around``.
    The rounding is written back into X, i.e. the array passed in is
    modified in place and the same object is returned.

    Args:
        X (numpy.ndarray): X array; one column per entry of var_type
        var_type (list): list with type information

    Returns:
        numpy.ndarray: X array with non-numeric values rounded to integers

    Examples:
        >>> import numpy as np
        >>> X = np.array([[1.2, 2.3], [3.4, 4.5]])
        >>> var_type = ["num", "factor"]
        >>> repair_non_numeric(X, var_type)
        array([[1.2, 2. ],
               [3.4, 4. ]])
    """
    is_continuous = np.isin(var_type, ["num", "float"])
    to_round = ~is_continuous
    rounded_columns = np.around(X[:, to_round])
    # Write back into the caller's array (in-place update)
    X[:, to_round] = rounded_columns
    return X