Skip to content

Commit bba005a

Browse files
Implement K-Medoids clustering algorithm #13488
1 parent e2a78d4 commit bba005a

File tree

1 file changed

+96
-0
lines changed

1 file changed

+96
-0
lines changed

machine_learning/k_medoids.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""
2+
README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com)
3+
4+
Requirements:
5+
- numpy
6+
- matplotlib
- scikit-learn
7+
Python:
8+
- 3.6+
9+
Inputs:
10+
- X: 2D numpy array of features
11+
- k: number of clusters
12+
Usage:
13+
1. Define k and X
14+
2. Create initial medoids:
15+
initial_medoids = get_initial_medoids(X, k, seed=0)
16+
3. Run kmedoids:
17+
medoids, cluster_assignment = kmedoids(
18+
X, k, initial_medoids, maxiter=100, verbose=True
19+
)
20+
"""
21+
22+
import numpy as np
23+
from matplotlib import pyplot as plt
24+
from sklearn.metrics import pairwise_distances
25+
26+
def get_initial_medoids(data, k, seed=None):
    """Draw ``k`` distinct rows of ``data`` to serve as starting medoids.

    Args:
        data: 2D numpy array with one sample per row.
        k: Number of rows to draw (sampling is without replacement).
        seed: Seed passed to ``np.random.default_rng``; ``None`` gives a
            non-reproducible draw.

    Returns:
        Array holding the ``k`` selected rows of ``data``.
    """
    generator = np.random.default_rng(seed)
    # Sample row positions rather than rows so no row index is picked twice.
    chosen_rows = generator.choice(data.shape[0], size=k, replace=False)
    return data[chosen_rows, :]
32+
33+
def assign_clusters(data, medoids):
    """Label every sample with the index of its closest medoid.

    Distances are Euclidean and are computed with ``pairwise_distances``.

    Args:
        data: Samples, one per row.
        medoids: Current medoids, one per row.

    Returns:
        Array with one entry per row of ``data`` giving the row index in
        ``medoids`` of the nearest medoid.
    """
    to_each_medoid = pairwise_distances(data, medoids, metric="euclidean")
    return to_each_medoid.argmin(axis=1)
37+
38+
def revise_medoids(data, k, cluster_assignment):
    """Re-select the medoid of every non-empty cluster.

    For each label in ``range(k)`` the cluster member with the smallest
    summed distance to all members of the same cluster becomes the medoid.

    Args:
        data: Samples, one per row.
        k: Number of cluster labels to scan (``0 .. k-1``).
        cluster_assignment: Array with the cluster label of each sample.

    Returns:
        Array of the selected medoids, in increasing label order.

    Note:
        Labels that have no members are skipped, so the result can contain
        fewer than ``k`` rows; in that case row ``i`` is not necessarily the
        medoid of cluster ``i``.
    """
    chosen = []
    for label in range(k):
        cluster_points = data[cluster_assignment == label]
        if len(cluster_points) > 0:
            # Row j holds the distances from member j to every member.
            within = pairwise_distances(cluster_points, cluster_points)
            best = within.sum(axis=1).argmin()
            chosen.append(cluster_points[best])
    return np.array(chosen)
49+
50+
def compute_heterogeneity(data, k, medoids, cluster_assignment):
    """Sum the squared distances between samples and their cluster's medoid.

    Args:
        data: Samples, one per row.
        k: Number of cluster labels to scan (``0 .. k-1``).
        medoids: Medoids indexed by cluster label; ``medoids[i]`` is used
            for the samples labelled ``i``.
        cluster_assignment: Array with the cluster label of each sample.

    Returns:
        The accumulated sum of squared distances; clusters without members
        contribute nothing.
    """
    total = 0.0
    for label in range(k):
        cluster_points = data[cluster_assignment == label]
        if len(cluster_points) != 0:
            # Single-row second argument: distances to this cluster's medoid.
            to_medoid = pairwise_distances(cluster_points, [medoids[label]])
            total += np.sum(to_medoid**2)
    return total
59+
60+
def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False):
    """Cluster ``data`` around ``k`` medoids by alternating assign/update steps.

    Each iteration labels every sample with its nearest medoid
    (``assign_clusters``) and then re-selects the medoid of each cluster
    (``revise_medoids``). The loop stops as soon as the labels equal those of
    the previous iteration, or after ``maxiter`` iterations.

    Args:
        data: 2D array of samples, one per row.
        k: Number of clusters; ``initial_medoids`` is expected to have ``k``
            rows.
        initial_medoids: Starting medoids. The array is copied, not modified.
        maxiter: Maximum number of iterations. With ``maxiter <= 0`` no
            update is performed and the samples are simply assigned to the
            initial medoids.
        verbose: If true, print the number of samples that changed cluster
            for every iteration after the first.

    Returns:
        Tuple ``(medoids, cluster_assignment)``.
    """
    medoids = initial_medoids.copy()
    prev_assignment = None
    cluster_assignment = None
    for itr in range(maxiter):
        cluster_assignment = assign_clusters(data, medoids)
        revised = revise_medoids(data, k, cluster_assignment)
        # revise_medoids may drop clusters that lost all their members, which
        # would shrink the medoid array and renumber the remaining clusters.
        # Keep the previous medoid for those clusters so there are always k.
        if len(medoids) == k and len(revised) < k:
            revised = _keep_medoids_of_empty_clusters(
                medoids, revised, cluster_assignment
            )
        medoids = revised

        if (
            prev_assignment is not None
            and (prev_assignment == cluster_assignment).all()
        ):
            break

        if verbose and prev_assignment is not None:
            changed = np.sum(prev_assignment != cluster_assignment)
            print(f"Iteration {itr}: {changed} points changed clusters")

        prev_assignment = cluster_assignment.copy()

    if cluster_assignment is None:
        # The loop body never ran (maxiter <= 0): still return valid labels
        # instead of failing on an unbound variable.
        cluster_assignment = assign_clusters(data, medoids)

    return medoids, cluster_assignment


def _keep_medoids_of_empty_clusters(previous, revised, cluster_assignment):
    """Re-insert the previous medoid for every cluster that has no members."""
    has_members = np.bincount(cluster_assignment, minlength=len(previous)) > 0
    # `revised` holds one row per non-empty cluster, in increasing label order.
    revised_rows = iter(revised)
    return np.array(
        [
            next(revised_rows) if occupied else old
            for occupied, old in zip(has_members, previous)
        ]
    )
77+
78+
# Optional plotting
79+
def plot_clusters(data, medoids, cluster_assignment):
    """Show a 3D scatter plot of the samples and the medoids.

    Only the first three columns of ``data`` and ``medoids`` are plotted.
    Samples are coloured by cluster label; medoids are drawn as red crosses.

    Args:
        data: Samples, one per row, with at least three columns.
        medoids: Medoids, one per row, with at least three columns.
        cluster_assignment: Cluster label of each sample, used for colouring.
    """
    axes = plt.axes(projection="3d")

    xs, ys, zs = data[:, 0], data[:, 1], data[:, 2]
    axes.scatter(xs, ys, zs, c=cluster_assignment, cmap="viridis")

    mx, my, mz = medoids[:, 0], medoids[:, 1], medoids[:, 2]
    axes.scatter(mx, my, mz, c="red", s=100, marker="x")

    axes.set_xlabel("X")
    axes.set_ylabel("Y")
    axes.set_zlabel("Z")
    axes.set_title("3D K-Medoids Clustering")
    plt.show()
88+
89+
# Optional test
90+
if __name__ == "__main__":
    from sklearn import datasets

    # Demo: cluster the iris measurements into three groups and plot them.
    X = datasets.load_iris()["data"]
    k = 3
    medoids, clusters = kmedoids(
        X, k, get_initial_medoids(X, k, seed=0), maxiter=50, verbose=True
    )
    plot_clusters(X, medoids, clusters)

0 commit comments

Comments
 (0)