Spectral clustering is the last topic of our NLP learning group activity, hosted by Feng. Here is my homework; you may refer to this tutorial for the symbols used in this simple program. I still do not understand the principles underlying the algorithm, though.

#!/usr/bin/python
# copyright (c) 2008 Feng Zhu, Yong Sun
"""Spectral clustering demo on three noisy horizontal lines.

Pipeline: sample points -> Gaussian similarity matrix W -> k-nearest-neighbour
sparsification of W -> generalized eigenproblem L v = lambda D v (with
L = D - W) -> k-means on the eigenvectors of the smallest eigenvalues -> plot.
"""

import heapq
from functools import partial

import numpy as np
from scipy.cluster.vq import kmeans2
from scipy.linalg import eigh


def line_samples():
    """Return a (120, 2) array of random points on three horizontal lines.

    The x coordinate is drawn by ``numpy.random.rand`` and scaled by 3; the
    y coordinate is fixed to 1, 2 and 3 for the first, second and third group
    of 40 points respectively.
    """
    vecs = np.random.rand(120, 2)
    vecs[:, 0] *= 3
    vecs[0:40, 1] = 1
    vecs[40:80, 1] = 2
    vecs[80:120, 1] = 3
    return vecs


def gaussian_simfunc(v1, v2, sigma=1):
    """Return the Gaussian similarity exp(-||v1 - v2||^2 / (2 * sigma^2)).

    ``v1`` and ``v2`` may be arrays or plain sequences of numbers of the same
    shape; ``sigma`` is the kernel width.
    """
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    # Sum of squares directly: avoids taking a square root only to square it.
    return np.exp(-np.sum(diff * diff) / (2.0 * sigma ** 2))


def construct_W(vecs, simfunc=gaussian_simfunc):
    """Return the dense symmetric similarity matrix of ``vecs``.

    ``simfunc(a, b)`` is evaluated once per unordered pair (including each
    point with itself) and mirrored, so the result is symmetric by
    construction even if ``simfunc`` is not.
    """
    n = len(vecs)
    W = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            W[i, j] = W[j, i] = simfunc(vecs[i], vecs[j])
    return W


def knn(W, k, mutual=False):
    """Sparsify the similarity matrix ``W`` in place to a k-NN graph.

    For every row the (k+1)-th largest entry is used as a threshold (the
    extra one accounts for the point's similarity with itself, assuming it
    is the row maximum -- true for the Gaussian kernel used in this file).
    An edge (i, j) survives when

    * ``mutual=False``: i is among j's neighbours OR j is among i's;
    * ``mutual=True``:  both hold.

    All other entries are set to 0.  ``W`` is expected to be square and
    symmetric.  Returns ``None``; the matrix is modified in place.

    Raises:
        ValueError: if ``k`` is not in the open interval (0, n - 1).
    """
    n = W.shape[0]
    if not 0 < k < n - 1:
        raise ValueError("k must satisfy 0 < k < n - 1 (k=%r, n=%r)" % (k, n))

    # Per-row threshold: the (k+1)-th largest similarity of that row.
    thresholds = np.array([heapq.nlargest(k + 1, row)[-1] for row in W])
    # keep[i, j] is True when j is one of i's k nearest neighbours (or i).
    keep = W >= thresholds[:, np.newaxis]
    edge = (keep & keep.T) if mutual else (keep | keep.T)
    W[~edge] = 0


def spectral_embedding(W, dims):
    """Return the ``dims`` generalized eigenvectors of smallest eigenvalue.

    Solves ``L v = lambda D v`` with ``D = diag(row sums of W)`` and
    ``L = D - W`` and returns an ``(n, dims)`` array whose columns are the
    eigenvectors belonging to the ``dims`` smallest eigenvalues.

    ``eigh`` is used instead of ``eig`` because the problem is symmetric:
    it yields real eigenvalues in ascending order and one eigenvector per
    eigenvalue, so repeated eigenvalues (one zero per connected component)
    are not lost.

    Raises:
        ValueError: if some row of ``W`` has a non-positive sum, because
            ``D`` must be positive definite for the generalized problem.
    """
    W = np.asarray(W, dtype=float)
    degrees = W.sum(axis=1)
    if np.any(degrees <= 0):
        raise ValueError("every row of W must have a positive sum")
    D = np.diag(degrees)
    L = D - W
    _, evcts = eigh(L, D)
    return evcts[:, :dims]


def main():
    """Cluster the sample points into three groups and show a scatter plot."""
    # Imported here so the numerical helpers above stay usable on machines
    # without a plotting backend.
    import pylab

    vecs = line_samples()
    W = construct_W(vecs, simfunc=partial(gaussian_simfunc, sigma=2))
    knn(W, 10)
    Y = spectral_embedding(W, 3)
    _, idx = kmeans2(Y, 3, minit='points')
    # Shift labels 0..2 to 1..3 for use as scatter colour values.
    pylab.scatter(vecs[:, 0], vecs[:, 1], c=idx + 1)
    pylab.show()


if __name__ == "__main__":
    main()