Spectral clustering is the last topic of our NLP learning group activity, hosted by Feng. Here is my homework; you may refer to this tutorial for the symbols used in this simple program. I still have no idea about the underlying principles of the algorithm, though.
#!/usr/bin/python
# copyright (c) 2008 Feng Zhu, Yong Sun
import heapq
from functools import partial
from numpy import *
from scipy.linalg import *
from scipy.cluster.vq import *
import pylab
def line_samples(n_per_line=40, n_lines=3, x_scale=3):
    """Generate random 2-D sample points lying on stacked horizontal lines.

    Line number ``m`` (1-based) sits at ``y == m``; the x coordinates are
    drawn uniformly from ``[0, x_scale)``.  Rows are grouped by line, so the
    first ``n_per_line`` rows belong to line 1, the next to line 2, and so on.

    With the default arguments this yields 120 points: 40 on each of the
    lines ``y = 1``, ``y = 2`` and ``y = 3`` with ``x`` in ``[0, 3)``.

    Args:
        n_per_line: Number of points on each line.
        n_lines: Number of horizontal lines.
        x_scale: Upper bound (exclusive) of the x coordinates.

    Returns:
        A float array of shape ``(n_per_line * n_lines, 2)`` holding one
        ``(x, y)`` point per row.
    """
    # Both columns are drawn at random so the sequence of random numbers
    # consumed does not depend on how the y column is filled afterwards.
    vecs = random.rand(n_per_line * n_lines, 2)
    vecs[:, 0] *= x_scale
    vecs[:, 1] = repeat(arange(1, n_lines + 1), n_per_line)
    return vecs
def gaussian_simfunc(v1, v2, sigma=1):
    """Return the Gaussian (RBF) similarity of two vectors.

    Computes ``exp(-||v1 - v2||**2 / (2 * sigma**2))``: 1 for identical
    vectors, decaying towards 0 as the Euclidean distance grows.

    Args:
        v1: First vector (array or plain sequence of numbers).
        v2: Second vector, same length as ``v1``.
        sigma: Kernel width; larger values make distant points more similar.
            Must be non-zero.

    Returns:
        The similarity as a float in ``[0, 1]``.
    """
    # asarray lets callers pass plain lists/tuples, for which ``v1 - v2``
    # is undefined; existing ndarrays are passed through without a copy.
    diff = asarray(v1) - asarray(v2)
    return exp(-norm(diff) ** 2 / (2.0 * sigma ** 2))
def construct_W(vecs, simfunc=gaussian_simfunc):
    """Build the dense, symmetric similarity matrix of a set of vectors.

    Args:
        vecs: Sequence of ``n`` sample vectors (e.g. an ``(n, d)`` array).
        simfunc: Callable ``simfunc(a, b)`` returning the similarity of two
            samples.  It is evaluated once per unordered pair, so it is
            expected to be symmetric.

    Returns:
        An ``(n, n)`` float array ``W`` with ``W[i, j] == W[j, i] ==
        simfunc(vecs[i], vecs[j])``.  The diagonal holds each sample's
        similarity to itself.
    """
    n = len(vecs)
    W = zeros((n, n))
    # ``range`` rather than the Python-2-only ``xrange`` so the function runs
    # on both Python 2 and Python 3.
    for i in range(n):
        # Only the upper triangle (diagonal included) is evaluated; each
        # value is mirrored into the lower triangle.
        for j in range(i, n):
            W[i, j] = W[j, i] = simfunc(vecs[i], vecs[j])
    return W
def knn(W, k, mutual=False):
    """Sparsify a similarity matrix in place into a k-nearest-neighbour graph.

    For every row the ``k + 1`` largest similarities are kept (one slot
    is normally taken by the sample's own self-similarity on the diagonal);
    every other entry is treated as "not a neighbour".  The edge between
    ``i`` and ``j`` is then resolved symmetrically:

    * neither sample selected the other   -> weight set to 0;
    * exactly one sample selected the other -> weight kept, or set to 0 when
      ``mutual`` is true;
    * both samples selected each other    -> weight kept.

    Args:
        W: Square, symmetric, non-negative similarity matrix (numpy array).
            Modified in place.
        k: Number of neighbours per sample; must satisfy ``0 < k < n - 1``.
        mutual: If true, build the mutual k-NN graph (an edge requires both
            endpoints to select each other) instead of the ordinary one.

    Returns:
        None.  ``W`` is updated in place.

    Raises:
        AssertionError: If ``k`` is outside ``0 < k < n - 1``.
    """
    n = W.shape[0]
    assert 0 < k < n - 1, "k must satisfy 0 < k < n - 1"

    # Pass 1: flag the non-neighbours of each row by flipping their sign.
    # The magnitude is preserved so one-sided edges can be restored below.
    for i in range(n):
        row = W[i]  # a view: changes to it are written through to W
        thr = heapq.nlargest(k + 1, row)[-1]
        row[row < thr] *= -1

    # Pass 2: resolve each pair from the sign pattern of W[i, j] / W[j, i].
    # For symmetric input the two entries have equal magnitude, so their sum
    # is negative (both flagged), exactly zero (one flagged) or positive.
    pair_sum = W + W.T
    dropped = pair_sum < 0
    one_sided = pair_sum == 0
    W[dropped] = 0
    W[one_sided] = 0 if mutual else abs(W[one_sided])
# --- Demo: normalized spectral clustering of the three-line toy data set ---

# Similarity graph: Gaussian similarities sparsified to a 10-NN graph.
vecs = line_samples()
W = construct_W(vecs, simfunc=partial(gaussian_simfunc, sigma=2))
knn(W, 10)

# Degree matrix (row sums of W on the diagonal) and unnormalized Laplacian.
D = diag(W.sum(axis=1))
L = D - W

# Solve the generalized eigenproblem L v = lambda D v.  L is symmetric and D
# is diagonal with positive entries, so eigh applies: it returns real
# eigenvalues in ascending order, which avoids ordering complex values and
# keeps every eigenvector even when eigenvalues repeat.
evals, evcts = eigh(L, D)

# Embed each sample using the eigenvectors of the 3 smallest eigenvalues
# (one row of Y per sample).  eigh returns D-orthonormal vectors; rescale the
# columns to unit Euclidean length, the convention of scipy.linalg.eig.
Y = evcts[:, :3]
Y = Y / norm(Y, axis=0)

# Cluster the embedded samples and plot the original points, one colour
# value (1, 2 or 3) per cluster label.
res, idx = kmeans2(Y, 3, minit='points')
colors = [(1, 2, 3)[i] for i in idx]
pylab.scatter(vecs[:, 0], vecs[:, 1], c=colors)
pylab.show()