|
| 1 | +""" |
| 2 | +README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com) |
| 3 | +
|
| 4 | +Requirements: |
| 5 | + - numpy |
| 6 | + - matplotlib |
| | + - scikit-learn (pairwise_distances; the demo also loads the iris data from it) |
| 7 | +Python: |
| 8 | + - 3.6+ (f-strings are used) |
| 9 | +Inputs: |
| 10 | + - X: 2D numpy array of features |
| 11 | + - k: number of clusters |
| 12 | +Usage: |
| 13 | + 1. Define k and X |
| 14 | + 2. Create initial medoids: |
| 15 | + initial_medoids = get_initial_medoids(X, k, seed=0) |
| 16 | + 3. Run kmedoids: |
| 17 | + medoids, cluster_assignment = kmedoids( |
| 18 | + X, k, initial_medoids, maxiter=100, verbose=True |
| 19 | + ) |
| 20 | +""" |
| 21 | + |
| 22 | +import numpy as np |
| 23 | +from matplotlib import pyplot as plt |
| 24 | +from sklearn.metrics import pairwise_distances |
| 25 | + |
def get_initial_medoids(data, k, seed=None):
    """Pick ``k`` distinct rows of ``data`` at random to serve as starting medoids.

    Args:
        data: 2D numpy array with one point per row.
        k: Number of rows to draw; drawn without replacement.
        seed: Passed to ``np.random.default_rng``; a fixed value makes the
            draw reproducible.

    Returns:
        Array containing the ``k`` selected rows of ``data``.
    """
    generator = np.random.default_rng(seed)
    # Sample row positions rather than rows so no point is chosen twice.
    chosen_rows = generator.choice(data.shape[0], k, replace=False)
    return data[chosen_rows, :]
| 32 | + |
def assign_clusters(data, medoids):
    """Label every point with the index of its nearest medoid.

    Args:
        data: 2D array with one point per row.
        medoids: 2D array with one medoid per row.

    Returns:
        1D integer array; entry ``j`` is the row index in ``medoids`` with the
        smallest Euclidean distance to ``data[j]`` (the lowest index wins ties).
    """
    to_each_medoid = pairwise_distances(data, medoids, metric='euclidean')
    return to_each_medoid.argmin(axis=1)
| 37 | + |
def revise_medoids(data, k, cluster_assignment):
    """Re-select the medoid of each cluster from its current members.

    For every label in ``0..k-1`` the member with the smallest summed distance
    to all other members of the same cluster becomes the medoid.

    Args:
        data: 2D numpy array with one point per row.
        k: Number of cluster labels to process.
        cluster_assignment: 1D array giving the label of each row of ``data``.

    Returns:
        Array of medoids in increasing label order. Clusters without members
        are skipped, so the result can contain fewer than ``k`` rows.
    """
    chosen = []
    for label in range(k):
        cluster_points = data[cluster_assignment == label]
        if len(cluster_points) > 0:
            # Row sums of the within-cluster distance matrix give each
            # member's total distance to the rest of its cluster.
            within = pairwise_distances(cluster_points, cluster_points)
            best = within.sum(axis=1).argmin()
            chosen.append(cluster_points[best])
    return np.array(chosen)
| 49 | + |
def compute_heterogeneity(data, k, medoids, cluster_assignment):
    """Return the sum of squared Euclidean distances from points to their medoids.

    The squared differences are summed directly instead of computing
    distances (a square root) and squaring them again, and all clusters are
    handled in one vectorised pass.

    Args:
        data: 2D array-like with one point per row.
        k: Number of clusters; only points labelled ``0..k-1`` contribute.
        medoids: 2D array-like; row ``i`` is the medoid of label ``i``.
        cluster_assignment: 1D array-like of integer labels, one per row of
            ``data``.

    Returns:
        The total as a float; ``0.0`` when no point carries a label in
        ``0..k-1``.
    """
    points = np.asarray(data, dtype=float)
    centres = np.asarray(medoids, dtype=float)
    labels = np.asarray(cluster_assignment)

    # Labels outside 0..k-1 are ignored, exactly as a per-cluster loop over
    # range(k) would never visit them.
    counted = (labels >= 0) & (labels < k)
    if not counted.any():
        return 0.0

    offsets = points[counted] - centres[labels[counted]]
    return float(np.sum(offsets ** 2))
| 59 | + |
def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False):
    """Cluster ``data`` around ``k`` medoids by alternating assignment and update.

    Each iteration labels every point with its nearest medoid
    (``assign_clusters``) and then re-selects each cluster's medoid
    (``revise_medoids``). The loop stops as soon as the labels repeat, or
    after ``maxiter`` iterations.

    Args:
        data: 2D numpy array with one point per row.
        k: Number of clusters.
        initial_medoids: Array with one starting medoid per row. It is copied,
            not modified.
        maxiter: Maximum number of iterations. With ``0`` the initial medoids
            are returned together with the labels they induce.
        verbose: When true, print how many points changed cluster, starting
            with the second iteration.

    Returns:
        Tuple ``(medoids, cluster_assignment)``. Row ``i`` of ``medoids``
        belongs to label ``i``; a cluster that ends up without members keeps
        its previous medoid so that this correspondence is never broken.
        If ``maxiter`` is exhausted before the labels settle, the returned
        labels are the ones the returned medoids were computed from.
    """
    medoids = initial_medoids.copy()
    prev_assignment = None
    cluster_assignment = None
    for itr in range(maxiter):
        cluster_assignment = assign_clusters(data, medoids)

        # Unchanged labels would reproduce the current medoids exactly, so
        # stop before paying for another medoid update.
        if prev_assignment is not None and np.array_equal(prev_assignment, cluster_assignment):
            break

        revised = revise_medoids(data, k, cluster_assignment)
        occupied = np.unique(cluster_assignment)
        occupied = occupied[occupied < k]
        if len(occupied) == len(medoids):
            medoids = revised
        else:
            # revise_medoids returns rows only for non-empty clusters (in
            # label order). Writing them back by label keeps row i tied to
            # label i; empty clusters retain their previous medoid.
            merged = medoids.astype(np.result_type(medoids, revised))
            merged[occupied] = revised
            medoids = merged

        if verbose and prev_assignment is not None:
            changed = np.sum(prev_assignment != cluster_assignment)
            print(f"Iteration {itr}: {changed} points changed clusters")

        prev_assignment = cluster_assignment.copy()

    if cluster_assignment is None:
        # The loop body never ran (maxiter <= 0); still return valid labels.
        cluster_assignment = assign_clusters(data, medoids)

    return medoids, cluster_assignment
| 77 | + |
| 78 | +# Optional plotting |
def plot_clusters(data, medoids, cluster_assignment):
    """Show a 3D scatter plot of the points and their medoids.

    Only the first three columns of ``data`` and ``medoids`` are plotted, so
    both must have at least three columns.

    Args:
        data: 2D array of points; coloured by ``cluster_assignment``.
        medoids: 2D array of medoids; drawn as large red crosses.
        cluster_assignment: One label per row of ``data``.
    """
    ax = plt.axes(projection='3d')

    px, py, pz = data[:, 0], data[:, 1], data[:, 2]
    ax.scatter(px, py, pz, c=cluster_assignment, cmap='viridis')

    mx, my, mz = medoids[:, 0], medoids[:, 1], medoids[:, 2]
    ax.scatter(mx, my, mz, c='red', s=100, marker='x')

    for set_label, text in (
        (ax.set_xlabel, "X"),
        (ax.set_ylabel, "Y"),
        (ax.set_zlabel, "Z"),
    ):
        set_label(text)
    ax.set_title("3D K-Medoids Clustering")
    plt.show()
| 88 | + |
| 89 | +# Optional test |
if __name__ == "__main__":
    # Demo: cluster the iris measurements into three groups and plot them.
    from sklearn.datasets import load_iris

    X = load_iris()['data']
    k = 3
    starting_medoids = get_initial_medoids(X, k, seed=0)
    medoids, clusters = kmedoids(X, k, starting_medoids, maxiter=50, verbose=True)
    plot_clusters(X, medoids, clusters)
0 commit comments