K-means, EM Algorithm
PRMLを見ながら,K-meansによるクラスタリングと,EM Algorithmによるクラスタリングを実装してみた.
EM Algorithmの方は,あまり考えずに普通にループで書いてるのでかなり遅い.
うまく書けば行列演算で効率的にできるんだろうけど,まだ numpy, scipy の経験値が足りない….
#!/usr/bin/env python
# k-means clustering (following PRML chapter 9), demonstrated on the
# Old Faithful dataset.

import numpy as np
import numpy.random as random


def adjust_scale(dataset):
    """Standardize each column of *dataset* to mean 0 and stddev 1."""
    mean = np.mean(dataset, axis=0)
    std = np.std(dataset, axis=0)
    return (dataset - mean) / std


def plot_dataset(dataset, k, cluster):
    """Scatter-plot the 2-D points, one color/marker per cluster.

    Only the first 7 clusters get distinct markers.
    """
    # Imported lazily so the rest of the module works without matplotlib.
    import matplotlib.pyplot as plt
    options = ["b.", "r.", "g.", "c.", "m.", "y.", "k."]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i in range(k):
        subset = dataset[cluster == i]
        ax.plot(subset[:, 0], subset[:, 1], options[i])


def init_center(dataset, n, k):
    """Pick k distinct random data points as the initial cluster centers."""
    index = np.arange(n, dtype=int)
    random.shuffle(index)
    return dataset[index[:k]]


def e_step(dataset, n, k, cluster, center):
    """Assign each point to the cluster with the nearest center (in-place)."""
    for i in range(n):
        distance = np.sum((center - dataset[i]) ** 2, axis=1)
        cluster[i] = np.argmin(distance)


def m_step(dataset, n, k, cluster, center):
    """Move each center to the centroid of its assigned points (in-place)."""
    for i in range(k):
        members = dataset[cluster == i]
        # Guard against an empty cluster, whose mean would be NaN;
        # keep the previous center in that case.
        if members.shape[0] > 0:
            center[i] = np.mean(members, axis=0)


def k_means(dataset, k):
    """Divide *dataset* (n x d array) into k clusters.

    Returns an int array of length n holding each point's cluster index.
    Alternates E/M steps until the assignment stops changing, or at most
    100 iterations.
    """
    n = dataset.shape[0]                  # number of points
    # Initialize to -1 (an impossible label) so the first convergence
    # check cannot accidentally match uninitialized memory.
    cluster = np.full(n, -1, dtype=int)   # cluster assignments
    center = init_center(dataset, n, k)   # centers of the clusters
    max_iter = 100                        # maximum iteration count
    for _ in range(max_iter):
        old_cluster = cluster.copy()
        e_step(dataset, n, k, cluster, center)
        if np.all(cluster == old_cluster):
            break
        m_step(dataset, n, k, cluster, center)
    return cluster


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    k = 2
    random.seed()
    dataset = adjust_scale(np.loadtxt("faithful.txt"))
    cluster = k_means(dataset, k)
    plot_dataset(dataset, k, cluster)
    plt.show()
#!/usr/bin/env python
# EM algorithm for a Gaussian mixture model (following PRML chapter 9),
# demonstrated on the Old Faithful dataset.

import numpy as np
import numpy.linalg as la
import numpy.random as random


def mvndst(x, mean, cov):
    """Multivariate normal density of *x* given *mean* and covariance *cov*."""
    d = x.size
    a = (2.0 * np.pi) ** (-0.5 * d) * (la.det(cov) ** -0.5)
    b = -0.5 * np.dot(np.dot(x - mean, la.inv(cov)), x - mean)
    return a * np.exp(b)


def adjust_scale(dataset):
    """Standardize each column of *dataset* to mean 0 and stddev 1."""
    mean = np.mean(dataset, axis=0)
    std = np.std(dataset, axis=0)
    return (dataset - mean) / std


def plot_dataset(dataset, k, res):
    """Scatter-plot 2-D points colored by responsibility.

    NOTE(review): the color mapping reads only res[:, 0] and res[:, 1],
    so this plot assumes k == 2.
    """
    # Imported lazily so the rest of the module works without matplotlib.
    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    n = dataset.shape[0]
    colors = np.zeros((n, 3))
    colors[:, 0] = res[:, 0]
    colors[:, 2] = res[:, 1]
    # Normalize each RGB row so brightness is comparable between points.
    colors /= np.sqrt(np.sum(colors ** 2, axis=1))[:, np.newaxis]
    ax.scatter(dataset[:, 0], dataset[:, 1], c=colors)


def init_mean(dataset, n, k):
    """Pick k distinct random data points as the initial component means."""
    index = np.arange(n, dtype=int)
    random.shuffle(index)
    return dataset[index[:k]]


def e_step(dataset, n, k, mean, cov, mix, res):
    """Fill res[i, j] with the responsibility p(component j | point i)."""
    for i in range(n):
        # Evaluate each weighted component density exactly once per point
        # (the original computed every density twice).
        p = np.array([mix[j] * mvndst(dataset[i], mean[j], cov[j])
                      for j in range(k)])
        res[i] = p / np.sum(p)


def m_step(dataset, n, k, mean, cov, mix, res):
    """Re-estimate means, covariances and mixing weights in-place."""
    d = dataset.shape[1]
    for i in range(k):
        weight = np.sum(res[:, i])
        mean[i] = 0.0
        for j in range(n):
            mean[i] += res[j, i] * dataset[j]
        mean[i] /= weight
        # BUG FIX: a covariance matrix is d x d (data dimension), not
        # k x k; the original only worked because k == d == 2 here.
        cov[i] = np.zeros((d, d))
        for j in range(n):
            diff = dataset[j] - mean[i]
            cov[i] += res[j, i] * np.outer(diff, diff)
        cov[i] /= weight
        mix[i] = weight / n


def calc_likelihood(dataset, n, k, mean, cov, mix):
    """Log-likelihood of the dataset under the current mixture parameters."""
    likelihood = 0.0
    for i in range(n):
        s = 0.0
        for j in range(k):
            s += mix[j] * mvndst(dataset[i], mean[j], cov[j])
        likelihood += np.log(s)
    return likelihood


# Backward-compatible alias for the original (misspelled) name.
calc_likehood = calc_likelihood


def em_algorithm(dataset, k):
    """Fit a k-component Gaussian mixture to *dataset* (n x d array).

    Returns the n x k responsibility matrix. Iterates E/M steps until
    the log-likelihood improvement drops below the threshold, or at
    most 20 iterations.
    """
    n, d = dataset.shape
    mean = init_mean(dataset, n, k)
    cov = np.array([np.eye(d)] * k)
    mix = np.repeat(1.0 / k, k)
    res = np.empty((n, k))
    max_iter = 20
    threshold = 0.001
    likelihood = calc_likelihood(dataset, n, k, mean, cov, mix)
    for _ in range(max_iter):
        e_step(dataset, n, k, mean, cov, mix, res)
        m_step(dataset, n, k, mean, cov, mix, res)
        old_likelihood = likelihood
        likelihood = calc_likelihood(dataset, n, k, mean, cov, mix)
        if likelihood - old_likelihood < threshold:
            break
    return res


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    k = 2
    random.seed()
    dataset = adjust_scale(np.loadtxt("faithful.txt"))
    res = em_algorithm(dataset, k)
    plot_dataset(dataset, k, res)
    plt.show()