"""
Object clustering with k-mean algorithm

author: Atsushi Sakai (@Atsushi_twi)

State of the module after commit d696c7192eee ("onetime kmean is done",
2018-05-02).
"""

import math
import random

import numpy as np


class Clusters:
    """Container for the points being clustered and the current assignment.

    Attributes are plain lists so the helper functions can update them
    in place:

    * ``x`` / ``y``   -- point coordinates, as passed in
    * ``ndata``       -- number of points
    * ``nlabel``      -- number of clusters
    * ``labels``      -- cluster index of each point, initialised at random
    * ``cx`` / ``cy`` -- centroid coordinates per cluster, initialised to 0.0
    """

    def __init__(self, x, y, nlabel):
        if len(x) != len(y):
            raise ValueError("x and y must have the same length")

        self.x = x
        self.y = y
        self.ndata = len(self.x)
        self.nlabel = nlabel
        # Random initial assignment; uses the global ``random`` state, so
        # seed it beforehand for reproducible runs.
        self.labels = [random.randint(0, nlabel - 1)
                       for _ in range(self.ndata)]
        self.cx = [0.0 for _ in range(nlabel)]
        self.cy = [0.0 for _ in range(nlabel)]


def init_clusters(rx, ry, nc):
    """Return a ``Clusters`` object for the points with ``nc`` random labels."""
    return Clusters(rx, ry, nc)


def calc_labeled_points(ic, clusters):
    """Return the x and y coordinates of the points labelled ``ic``.

    Both results are numpy arrays; they are empty when no point currently
    carries the label.
    """
    # A boolean mask stays a valid index even when nothing matches.
    # An empty index *list* would become a float array, which numpy
    # refuses to use as an index.
    mask = np.asarray(clusters.labels) == ic
    x = np.asarray(clusters.x)[mask]
    y = np.asarray(clusters.y)[mask]

    return x, y


def calc_centroid(clusters):
    """Recompute each cluster centroid as the mean of its member points.

    A cluster without members keeps its previous centroid instead of
    dividing by zero. ``clusters`` is updated in place and also returned.
    """
    for ic in range(clusters.nlabel):
        x, y = calc_labeled_points(ic, clusters)
        if x.size == 0:
            continue
        clusters.cx[ic] = float(np.mean(x))
        clusters.cy[ic] = float(np.mean(y))

    return clusters


def update_clusters(clusters):
    """Assign every point to its nearest centroid.

    Returns ``(clusters, cost)`` where ``cost`` is the sum of the Euclidean
    distances from each point to the centroid it was assigned to.
    ``clusters.labels`` is updated in place.
    """
    cost = 0.0

    for ip, (px, py) in enumerate(zip(clusters.x, clusters.y)):
        dlist = [math.hypot(icx - px, icy - py)
                 for icx, icy in zip(clusters.cx, clusters.cy)]
        mind = min(dlist)
        # ``index`` returns the first match, so ties go to the lower label.
        clusters.labels[ip] = dlist.index(mind)
        # Accumulate the distance, not the label index.
        cost += mind

    return clusters, cost


def kmean_clustering(rx, ry, nc, max_loop=10, dcost_th=1.0):
    """Cluster the points ``(rx, ry)`` into ``nc`` groups with k-means.

    Iteration stops after ``max_loop`` passes, or earlier once the cost
    changes by less than ``dcost_th`` between two consecutive passes.
    Returns the resulting ``Clusters`` object.
    """
    clusters = init_clusters(rx, ry, nc)
    clusters = calc_centroid(clusters)

    # Infinity guarantees the first pass can never satisfy the stop test.
    pcost = float("inf")
    for loop in range(max_loop):
        print("Loop:", loop)
        clusters, cost = update_clusters(clusters)
        clusters = calc_centroid(clusters)

        dcost = abs(cost - pcost)
        if dcost < dcost_th:
            break
        pcost = cost

    return clusters


def calc_raw_data():
    """Generate random sample points scattered around two fixed centres."""
    # NOTE(review): this initialisation sits outside the visible diff hunks
    # and is inferred from the ``append`` calls below -- confirm.
    rx, ry = [], []

    cx = [0.0, 5.0]
    cy = [0.0, 5.0]
    npoints = 30
    rand_d = 3.0

    for (icx, icy) in zip(cx, cy):
        for _ in range(npoints):
            rx.append(icx + rand_d * (random.random() - 0.5))
            ry.append(icy + rand_d * (random.random() - 0.5))

    return rx, ry


def main():
    """Cluster a random data set and plot the points and centroids."""
    # Imported here so the clustering functions stay importable on
    # machines without matplotlib or a display.
    import matplotlib.pyplot as plt

    print(__file__ + " start!!")

    # NOTE(review): the next statement sits outside the visible diff hunks
    # and is inferred from how ``rx`` / ``ry`` are used -- confirm.
    rx, ry = calc_raw_data()

    ncluster = 2
    clusters = kmean_clustering(rx, ry, ncluster)

    for ic in range(clusters.nlabel):
        x, y = calc_labeled_points(ic, clusters)
        plt.plot(x, y, "x")
    plt.plot(clusters.cx, clusters.cy, "o")
    plt.show()


# NOTE(review): the script entry guard is outside the visible diff hunks --
# confirm it matches the original file.
if __name__ == '__main__':
    main()