From af21a33f233eccac6416062a6a0095631f102b36 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Wed, 5 Dec 2012 21:38:32 +0100 Subject: Removed files related to k-means sorting tests. --- source/clusters.py | 157 ------------------------------------------- source/generatefeedvector.py | 65 ------------------ source/testclusters.py | 24 ------- 3 files changed, 246 deletions(-) delete mode 100755 source/clusters.py delete mode 100755 source/generatefeedvector.py delete mode 100644 source/testclusters.py diff --git a/source/clusters.py b/source/clusters.py deleted file mode 100755 index e53fac9b..00000000 --- a/source/clusters.py +++ /dev/null @@ -1,157 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -import math -import random - -from math import sqrt -from PIL import Image, ImageDraw - -def readfile(filename): - lines=[line for line in file(filename)] - - # First line is the column titles - colnames=lines[0].strip().split('\t')[1:] - rownames=[] - data=[] - for line in lines[1:]: - p=line.strip().split('\t') - # First column in each row is the rowname - rownames.append(p[0]) - # The data for this row is the remainder of the row - data.append([float(x) for x in p[1:]]) - return rownames,colnames,data - -def pearson(v1,v2): - # Simple sums - sum1=sum(v1) - sum2=sum(v2) - - # Sums of the squares - sum1Sq=sum([pow(v,2) for v in v1]) - sum2Sq=sum([pow(v,2) for v in v2]) - - # Sum of the products - pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) - - # Calculate r (Pearson score) - num=pSum-(sum1*sum2/len(v1)) - den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) - if den==0: return 0 - - return 1.0-num/den - -def tanimoto(v1, v2): - c1, c2, shr = 0, 0, 0 - for i in range(len(v1)): - if v1[i] != 0: - c1 += 1 # in v1 - if v2[i] != 0: - c2 += 1 # in v2 - if v1[i] != 0 and v2[i] != 0: - shr += 1 # in both - return 1.0 - (float(shr) / (c1 + c2 - shr)) - -def euclidian(v1, v2): - d = 0.0 - for i in range(len(v1)): - d += (v1[i] - v2[i])**2 - return math.sqrt(d) - -def kcluster(rows,distance=pearson,k=4): - # Determine the minimum and maximum values for each point - ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) - for i in range(len(rows[0]))] - - # Create k randomly placed centroids - clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] - for i in range(len(rows[0]))] for j in range(k)] - - lastmatches=None - for t in range(100): - print 'Iteration %d' % t - bestmatches=[[] for i in range(k)] - - # Find which centroid is the closest for each row - for j in range(len(rows)): - row=rows[j] - bestmatch=0 - for i in range(k): - d=distance(clusters[i],row) - if d0: - for rowid in bestmatches[i]: - for m in range(len(rows[rowid])): - avgs[m]+=rows[rowid][m] - for j in range(len(avgs)): - avgs[j]/=len(bestmatches[i]) - clusters[i]=avgs - - return bestmatches - -def scaledown(data,distance=pearson,rate=0.01): - n=len(data) - - # The real distances between every pair of items - realdist=[[distance(data[i],data[j]) for j in range(n)] - for i in range(0,n)] - - # Randomly initialize the starting points of the locations in 2D - loc=[[random.random(),random.random()] for i in range(n)] - fakedist=[[0.0 for j in range(n)] for i in range(n)] - - lasterror=None - for m in range(0,1000): - # Find projected distances - for i in range(n): - for j in range(n): - fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) - for x in range(len(loc[i]))])) - - # Move points - grad=[[0.0,0.0] for i in range(n)] - - totalerror=0 - for k in range(n): - for j in range(n): - if j==k: continue - # The error is percent difference between the distances - errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] - - # Each point needs to be moved away from or towards the other - # point in proportion to how much error it has - grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm - grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm - - # Keep track of the total error - totalerror+=abs(errorterm) - - - # If the answer got worse by moving the points, we are done - if lasterror and lasterror]+>').sub('',html) - - # Split words by all non-alpha characters - words=re.compile(r'[^A-Z^a-z]+').split(txt) - - # Convert to lowercase - return [word.lower() for word in words if word!=''] - - -apcount={} -wordcounts={} -mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ - conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) -feeds = mongo.get_all_feeds() -for feed in feeds: - try: - title,wc=getwordcounts(feed["feed_id"]) - wordcounts[title]=wc - for word,count in list(wc.items()): - apcount.setdefault(word,0) - if count>1: - apcount[word]+=1 - except: - print('Failed to parse feed %s' % feed["feed_title"]) - -wordlist=[] -for w,bc in list(apcount.items()): - frac=float(bc)/len(feeds) - if frac>0.1 and frac<0.5: - wordlist.append(w) - -out=open('blogdata1.txt','w') -out.write('Blog') -for word in wordlist: out.write('\t%s' % word) -out.write('\n') -for blog,wc in list(wordcounts.items()): - print(blog) - out.write(blog) - for word in wordlist: - if word in wc: out.write('\t%d' % wc[word]) - else: out.write('\t0') - out.write('\n') diff --git a/source/testclusters.py b/source/testclusters.py deleted file mode 100644 index 728e9c1b..00000000 --- a/source/testclusters.py +++ /dev/null @@ -1,24 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -import clusters - -K = 7 - -blognames,words,data = clusters.readfile("blogdata1.txt") - -coords = clusters.scaledown(data) - -print "Generating clusters..." -kclust = clusters.kcluster(data, k=K, distance=clusters.pearson) -print -print "Clusters:" -for i in range(K): - print "Cluster" + str(i) - print ", ".join([blognames[r] for r in kclust[i]]) - print - - - - -clusters.draw2d(coords,blognames,jpeg='mds2d.jpg') -- cgit