From 9c7a6eca5f92b0cc3cc23c40b23f8c982fccdd06 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Fri, 16 Nov 2012 14:20:34 +0100
Subject: Added tests with clusters.

---
 source/clusters.py           | 137 +++++++++++++++++++++++++++++++++++++++++++
 source/generatefeedvector.py |  65 ++++++++++++++++++++
 source/testclusters.py       |  23 ++++++++
 3 files changed, 225 insertions(+)
 create mode 100755 source/clusters.py
 create mode 100755 source/generatefeedvector.py
 create mode 100644 source/testclusters.py

diff --git a/source/clusters.py b/source/clusters.py
new file mode 100755
index 00000000..7122c55d
--- /dev/null
+++ b/source/clusters.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+import random
+
+from math import sqrt
+from PIL import Image, ImageDraw
+
+def readfile(filename):
+  lines=[line for line in file(filename)]
+
+  # First line is the column titles
+  colnames=lines[0].strip().split('\t')[1:]
+  rownames=[]
+  data=[]
+  for line in lines[1:]:
+    p=line.strip().split('\t')
+    # First column in each row is the rowname
+    rownames.append(p[0])
+    # The data for this row is the remainder of the row
+    data.append([float(x) for x in p[1:]])
+  return rownames,colnames,data
+
+def pearson(v1,v2):
+  # Simple sums
+  sum1=sum(v1)
+  sum2=sum(v2)
+
+  # Sums of the squares
+  sum1Sq=sum([pow(v,2) for v in v1])
+  sum2Sq=sum([pow(v,2) for v in v2])
+
+  # Sum of the products
+  pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
+
+  # Calculate r (Pearson score)
+  num=pSum-(sum1*sum2/len(v1))
+  den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
+  if den==0: return 0
+
+  return 1.0-num/den
+
+def kcluster(rows,distance=pearson,k=4):
+  # Determine the minimum and maximum values for each point
+  ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows]))
+  for i in range(len(rows[0]))]
+
+  # Create k randomly placed centroids
+  clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0]
+  for i in range(len(rows[0]))] for j in range(k)]
+
+  lastmatches=None
+  for t in range(100):
+    print 'Iteration %d' % t
+    bestmatches=[[] for i in range(k)]
+
+    # Find which centroid is the closest for each row
+    for j in range(len(rows)):
+      row=rows[j]
+      bestmatch=0
+      for i in range(k):
+        d=distance(clusters[i],row)
+        if d<distance(clusters[bestmatch],row): bestmatch=i
+      bestmatches[bestmatch].append(j)
+
+    # If the results are the same as last time, this is complete
+    if bestmatches==lastmatches: break
+    lastmatches=bestmatches
+
+    # Move the centroids to the average of their members
+    for i in range(k):
+      avgs=[0.0]*len(rows[0])
+      if len(bestmatches[i])>0:
+        for rowid in bestmatches[i]:
+          for m in range(len(rows[rowid])):
+            avgs[m]+=rows[rowid][m]
+        for j in range(len(avgs)):
+          avgs[j]/=len(bestmatches[i])
+        clusters[i]=avgs
+
+  return bestmatches
+
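+# scaledown performs multidimensional scaling: items start at random 2D
+# positions, and every point is repeatedly moved along the gradient of the
+# total error between the projected pairwise distances and the real
+# (Pearson) distances, until the error stops improving.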
error + totalerror+=abs(errorterm) + + + # If the answer got worse by moving the points, we are done + if lasterror and lasterror]+>').sub('',html) + + # Split words by all non-alpha characters + words=re.compile(r'[^A-Z^a-z]+').split(txt) + + # Convert to lowercase + return [word.lower() for word in words if word!=''] + + +apcount={} +wordcounts={} +mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ + conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) +feeds = mongo.get_all_feeds() +for feed in feeds: + try: + title,wc=getwordcounts(feed["feed_id"]) + wordcounts[title]=wc + for word,count in list(wc.items()): + apcount.setdefault(word,0) + if count>1: + apcount[word]+=1 + except: + print('Failed to parse feed %s' % feed["feed_title"]) + +wordlist=[] +for w,bc in list(apcount.items()): + frac=float(bc)/len(feeds) + if frac>0.1 and frac<0.5: + wordlist.append(w) + +out=open('blogdata1.txt','w') +out.write('Blog') +for word in wordlist: out.write('\t%s' % word) +out.write('\n') +for blog,wc in list(wordcounts.items()): + print(blog) + out.write(blog) + for word in wordlist: + if word in wc: out.write('\t%d' % wc[word]) + else: out.write('\t0') + out.write('\n') diff --git a/source/testclusters.py b/source/testclusters.py new file mode 100644 index 00000000..0e1e2a32 --- /dev/null +++ b/source/testclusters.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +import clusters + +K = 10 + +blognames,labels,data = clusters.readfile("blogdata1.txt") + +#coords = clusters.scaledown(data) + +print "Generating clusters..." +kclust = clusters.kcluster(data, k=K) +print +print "Clusters:" +for i in range(K): + print "Cluster" + str(i) + print ", ".join([blognames[r] for r in kclust[i]]) + print + + + + +#clusters.draw2d(coords,blognames,jpeg='mds2d.jpg') \ No newline at end of file -- cgit