Skip to content

aggregate

aggregate_mean_var(X, y, sort=False)

Aggregate duplicated rows of X to the mean and variance of their y values.

Parameters:

Name Type Description Default
X ndarray

X array, shape (n, k).

required
y ndarray

values, shape (n,).

required
sort bool

Whether to sort the groups (and therefore the rows of the returned arrays) by the group keys, i.e. by their X values.

False

Returns:

Type Description
ndarray

aggregated X values, shape (n-m, k), if m duplicates in X.

ndarray

aggregated (mean per group) y values, shape (n-m,), if m duplicates in X.

ndarray

aggregated (sample variance per group, NaN for groups with a single row) y values, shape (n-m,), if m duplicates in X.

Examples:

>>> X = np.array([[1, 2], [3, 4], [1, 2]])
    y = np.array([1, 2, 3])
    X_agg, y_mean, y_var = aggregate_mean_var(X, y)
    print(X_agg)
    [[1. 2.]
    [3. 4.]]
    print(y_mean)
    [2. 2.]
    print(y_var)
    [ 2. nan]
Source code in spotPython/utils/aggregate.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def aggregate_mean_var(X, y, sort=False) -> (np.ndarray, np.ndarray, np.ndarray):
    """
    Aggregate duplicated rows of `X` to the mean and variance of their `y` values.

    Identical rows of `X` form one group. For every group the row itself, the
    mean of the associated `y` values and their variance are returned.

    Note:
        The grouping is done with `pandas.DataFrame.groupby`, which by default
        drops rows whose group key (here: a row of `X`) contains NaN.

    Args:
        X (numpy.ndarray): X array, shape `(n, k)`.
        y (numpy.ndarray): values, shape `(n,)`.
        sort (bool): Whether to sort the groups, and therefore the rows of all
            returned arrays, by the group keys (the `X` values). If `False`,
            the groups appear in the order in which they first occur in `X`.

    Returns:
        (numpy.ndarray):
            aggregated `X` values, shape `(n-m, k)`, if `m` duplicates in `X`.
        (numpy.ndarray):
            aggregated (mean per group) `y` values, shape `(n-m,)`, if `m` duplicates in `X`.
        (numpy.ndarray):
            aggregated (variance per group) `y` values, shape `(n-m,)`, if `m` duplicates in `X`.
            This is the sample variance as computed by pandas' `GroupBy.var()`
            (default `ddof=1`), so a group that consists of a single row
            yields `nan`.

    Examples:
        >>> X = np.array([[1, 2], [3, 4], [1, 2]])
        >>> y = np.array([1, 2, 3])
        >>> X_agg, y_mean, y_var = aggregate_mean_var(X, y)
        >>> print(X_agg)
        [[1. 2.]
         [3. 4.]]
        >>> print(y_mean)
        [2. 2.]
        >>> print(y_var)
        [ 2. nan]
    """
    # Put X and y side by side so that y can be grouped by the rows of X.
    df = pd.DataFrame(X)
    df["y"] = y

    # Every column except 'y' is a group key. as_index=False keeps the keys as
    # regular columns, so the aggregated frames are laid out as [X columns..., y].
    grouped = df.groupby(list(df.columns.difference(["y"])), as_index=False, sort=sort)
    df_mean = grouped.mean()
    df_var = grouped.var()

    # Convert the aggregated frames to numpy arrays.
    mean_array = df_mean.to_numpy()
    var_array = df_var.to_numpy()

    # The last column holds the aggregated y; everything before it is X.
    X_agg = np.delete(mean_array, -1, axis=1)
    y_mean = mean_array[:, -1]
    y_var = var_array[:, -1]

    return X_agg, y_mean, y_var

get_ranks(x)

Returns a numpy array containing ranks of numbers within an input numpy array x.

Parameters:

Name Type Description Default
x ndarray

numpy array

required

Returns:

Type Description
ndarray

ranks

Examples:

>>> get_ranks(np.array([2, 1]))
    array([1, 0])
>>> get_ranks(np.array([20, 10, 100]))
    array([1, 0, 2])
Source code in spotPython/utils/aggregate.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def get_ranks(x):
    """
    Returns a numpy array containing ranks of numbers within an input numpy array x.

    The rank of an element is its zero-based position in the ascending sort
    order of `x`. Equal values receive distinct, consecutive ranks; which of
    them gets the lower rank is decided by `numpy.argsort`.

    Args:
        x (numpy.ndarray): one-dimensional numpy array. Other array-likes,
            e.g. a plain list, are accepted and converted with `numpy.asarray`.

    Returns:
        (numpy.ndarray): integer ranks, same length as `x`.

    Examples:
        >>> get_ranks(np.array([2, 1]))
        array([1, 0])
        >>> get_ranks([20, 10, 100])
        array([1, 0, 2])
    """
    # Plain sequences have no .argsort(); coerce so lists and tuples work too.
    values = np.asarray(x)
    # order[r] is the index of the r-th smallest element.
    order = values.argsort()
    ranks = np.empty_like(order)
    # Writing r at position order[r] inverts the sorting permutation, which
    # yields the rank of every element.
    ranks[order] = np.arange(len(values))
    return ranks

select_distant_points(X, y, k)

Selects k points that are distant from each other using a clustering approach.

Parameters:

Name Type Description Default
X ndarray

X array, shape (n, d), where d is the number of columns (not to be confused with the parameter k).

required
y ndarray

values, shape (n,).

required
k int

number of points to select.

required

Returns:

Type Description
ndarray

selected X values, shape (k, d), where d is the number of columns of X.

ndarray

selected y values, shape (k,).

Examples:

>>> from spotPython.utils.aggregate import select_distant_points
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
    y = np.array([1, 2, 3, 4, 5])
    selected_points, selected_y = select_distant_points(X, y, 3)
    print(selected_points)
    [[1 2]
    [7 8]
    [9 10]]
    print(selected_y)
    [1 4 5]
Source code in spotPython/utils/aggregate.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def select_distant_points(X, y, k):
    """
    Selects k points that are distant from each other using a clustering approach.

    `X` is partitioned into `k` clusters with k-means. For every cluster
    centre, the row of `X` with the smallest Euclidean distance to that centre
    is selected, together with its `y` value. No de-duplication is performed,
    so a row is returned twice if it is the closest row to two centres.

    Args:
        X (numpy.ndarray): X array, shape `(n, d)`.
        y (numpy.ndarray): values, shape `(n,)`.
        k (int): number of points to select.

    Returns:
        (numpy.ndarray):
            selected `X` values, shape `(k, d)`.
        (numpy.ndarray):
            selected `y` values, shape `(k,)`.

    Examples:
        >>> import numpy as np
        >>> from spotPython.utils.aggregate import select_distant_points
        >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
        >>> y = np.array([1, 2, 3, 4, 5])
        >>> selected_points, selected_y = select_distant_points(X, y, 3)
        >>> selected_points.shape
        (3, 2)
        >>> selected_y.shape
        (3,)

    """
    # Coerce so that array-likes (e.g. lists) can be fancy-indexed below.
    X = np.asarray(X)
    y = np.asarray(y)
    # Perform k-means clustering to find k clusters
    kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(X)
    # Index of the row of X closest to each cluster centre. Using the argmin
    # index directly avoids searching X a second time for the selected row.
    indices = np.array([np.argmin(np.linalg.norm(X - center, axis=1)) for center in kmeans.cluster_centers_])
    # The same indices select the points and their y values, keeping them aligned.
    selected_points = X[indices]
    selected_y = y[indices]
    return selected_points, selected_y