diff options
-rwxr-xr-x | source/clusters.py | 157 | ||||
-rwxr-xr-x | source/generatefeedvector.py | 65 | ||||
-rw-r--r-- | source/testclusters.py | 24 |
3 files changed, 0 insertions, 246 deletions
diff --git a/source/clusters.py b/source/clusters.py deleted file mode 100755 index e53fac9b..00000000 --- a/source/clusters.py +++ /dev/null @@ -1,157 +0,0 @@ -#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-import math
-import random
-
-from math import sqrt
-from PIL import Image, ImageDraw
-
def readfile(filename):
    """Load a tab-separated data matrix from *filename*.

    The first line holds the column titles (its first cell is skipped).
    Every following non-blank line holds a row name in its first cell and
    numeric values in the remaining cells.

    Returns a ``(rownames, colnames, data)`` tuple in which ``data`` is a
    list with one list of floats per row. An empty file yields three empty
    lists.

    Raises ``ValueError`` if a data cell cannot be converted to ``float``.
    """
    # ``open`` inside ``with`` replaces the Python 2-only ``file()`` builtin
    # and guarantees the handle is closed.
    with open(filename) as handle:
        lines = handle.readlines()

    if not lines:
        return [], [], []

    # First line is the column titles
    colnames = lines[0].strip().split('\t')[1:]
    rownames = []
    data = []
    for line in lines[1:]:
        stripped = line.strip()
        # A blank line (typically a trailing newline) is not a data row.
        if not stripped:
            continue
        cells = stripped.split('\t')
        # First column in each row is the row name
        rownames.append(cells[0])
        # The data for this row is the remainder of the row
        data.append([float(x) for x in cells[1:]])
    return rownames, colnames, data
-
def pearson(v1, v2):
    """Return the Pearson correlation distance between *v1* and *v2*.

    The result is ``1.0 - r`` where ``r`` is the Pearson correlation
    coefficient, so smaller values mean the vectors are more alike.
    ``0`` is returned when the coefficient is undefined: empty input, or
    either vector has no variance.

    Both vectors are expected to have the same length; the products are
    taken over the indices of *v1*.
    """
    # Float length avoids integer division for int inputs on Python 2.
    n = float(len(v1))
    if n == 0:
        return 0

    # Simple sums
    sum1 = sum(v1)
    sum2 = sum(v2)

    # Sums of the squares
    sum1_sq = sum(pow(v, 2) for v in v1)
    sum2_sq = sum(pow(v, 2) for v in v2)

    # Sum of the products
    p_sum = sum(a * v2[i] for i, a in enumerate(v1))

    # Calculate r (Pearson score)
    num = p_sum - (sum1 * sum2 / n)
    var1 = sum1_sq - pow(sum1, 2) / n
    var2 = sum2_sq - pow(sum2, 2) / n
    # Rounding can push a zero variance slightly below zero; treat that like
    # zero instead of letting sqrt() raise on a negative product.
    if var1 <= 0 or var2 <= 0:
        return 0
    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    return 1.0 - num / den
-
def tanimoto(v1, v2):
    """Return the Tanimoto distance between *v1* and *v2*.

    Each vector is read as a set: position ``i`` is "present" when the
    value there is non-zero. The result is ``1.0 - shared / union``, so
    ``0.0`` means identical sets and ``1.0`` means nothing in common.

    When neither vector has any non-zero entry the ratio is undefined;
    two empty sets are treated as identical and ``0.0`` is returned
    instead of dividing by zero.
    """
    c1 = c2 = shared = 0
    for i, a in enumerate(v1):
        b = v2[i]
        if a != 0:
            c1 += 1  # in v1
        if b != 0:
            c2 += 1  # in v2
        if a != 0 and b != 0:
            shared += 1  # in both
    union = c1 + c2 - shared
    if union == 0:
        return 0.0
    return 1.0 - (float(shared) / union)
-
def euclidian(v1, v2):
    """Return the Euclidean distance between *v1* and *v2*.

    The squared differences are taken over the indices of *v1*, so *v2*
    must be at least as long as *v1*. Two empty vectors are ``0.0`` apart.
    """
    # Start the sum at 0.0 so the accumulation is done in floats.
    squared = sum(((a - v2[i]) ** 2 for i, a in enumerate(v1)), 0.0)
    return math.sqrt(squared)
-
def kcluster(rows, distance=pearson, k=4):
    """Group *rows* into *k* clusters with the k-means procedure.

    rows     -- list of equal-length numeric vectors
    distance -- function taking two vectors and returning their distance
    k        -- number of clusters

    Returns a list of *k* lists; list ``i`` holds the indices (into *rows*)
    of the rows assigned to centroid ``i``. With no rows, *k* empty lists
    are returned.

    Centroids start at random positions inside the per-column value range.
    The loop stops when an iteration produces the same assignment as the
    previous one, or after 100 iterations. Progress is printed each
    iteration.
    """
    if not rows:
        return [[] for _ in range(k)]

    ncols = len(rows[0])

    # Determine the minimum and maximum values for each column
    ranges = [(min(row[i] for row in rows), max(row[i] for row in rows))
              for i in range(ncols)]

    # Create k randomly placed centroids
    clusters = [[random.random() * (hi - lo) + lo for lo, hi in ranges]
                for _ in range(k)]

    lastmatches = None
    bestmatches = [[] for _ in range(k)]
    for t in range(100):
        # Parenthesised single-argument form works on Python 2 and 3.
        print('Iteration %d' % t)
        bestmatches = [[] for _ in range(k)]

        # Find which centroid is the closest for each row
        for j, row in enumerate(rows):
            bestmatch = 0
            # Remember the best distance rather than recomputing it for
            # every candidate centroid.
            bestdist = distance(clusters[0], row)
            for i in range(1, k):
                d = distance(clusters[i], row)
                if d < bestdist:
                    bestmatch = i
                    bestdist = d
            bestmatches[bestmatch].append(j)

        # If the results are the same as last time, this is complete
        if bestmatches == lastmatches:
            break
        lastmatches = bestmatches

        # Move the centroids to the average of their members; a centroid
        # with no members stays where it is.
        for i, members in enumerate(bestmatches):
            if not members:
                continue
            sums = [0.0] * ncols
            for rowid in members:
                for m, value in enumerate(rows[rowid]):
                    sums[m] += value
            clusters[i] = [s / len(members) for s in sums]

    return bestmatches
-
def scaledown(data, distance=pearson, rate=0.01):
    """Project *data* onto 2-D points whose spacing mimics *distance*.

    data     -- list of numeric vectors
    distance -- function taking two vectors and returning their distance
    rate     -- step size applied to the gradient on every iteration

    Returns a list of ``[x, y]`` pairs, one per item of *data*.

    Points start at random positions and are nudged for at most 1000
    iterations; the loop stops early as soon as the total relative error
    becomes larger than it was on the previous iteration.
    """
    n = len(data)

    # The real distances between every pair of items
    realdist = [[distance(data[i], data[j]) for j in range(n)]
                for i in range(n)]

    # Randomly initialize the starting points of the locations in 2D
    loc = [[random.random(), random.random()] for _ in range(n)]
    fakedist = [[0.0] * n for _ in range(n)]

    lasterror = None
    for _ in range(1000):
        # Find projected distances
        for i in range(n):
            for j in range(n):
                fakedist[i][j] = sqrt(sum(pow(loc[i][x] - loc[j][x], 2)
                                          for x in range(len(loc[i]))))

        # Move points
        grad = [[0.0, 0.0] for _ in range(n)]

        totalerror = 0
        for k in range(n):
            for j in range(n):
                if j == k:
                    continue
                real = realdist[j][k]
                fake = fakedist[j][k]
                # The relative error is undefined for a zero real distance;
                # skip the pair instead of dividing by zero.
                if real == 0:
                    continue

                # The error is percent difference between the distances
                errorterm = (fake - real) / real

                # Each point needs to be moved away from or towards the other
                # point in proportion to how much error it has. Coincident
                # points have no direction, so they add no gradient.
                if fake != 0:
                    grad[k][0] += ((loc[k][0] - loc[j][0]) / fake) * errorterm
                    grad[k][1] += ((loc[k][1] - loc[j][1]) / fake) * errorterm

                # Keep track of the total error
                totalerror += abs(errorterm)

        # If the answer got worse by moving the points, we are done.
        # Compare against None so a previous error of exactly 0 still counts.
        if lasterror is not None and lasterror < totalerror:
            break
        lasterror = totalerror

        # Move each of the points by the learning rate times the gradient
        for k in range(n):
            loc[k][0] -= rate * grad[k][0]
            loc[k][1] -= rate * grad[k][1]

    return loc
-
def draw2d(data, labels, jpeg='mds2d.jpg'):
    """Draw every label at its 2-D position and save the image as a JPEG.

    data   -- list of coordinate pairs; item ``i`` positions ``labels[i]``
    labels -- text to draw for each coordinate pair
    jpeg   -- path of the image file to write
    """
    canvas = Image.new('RGB', (2000, 2000), (255, 255, 255))
    pen = ImageDraw.Draw(canvas)
    for idx, point in enumerate(data):
        # Shift by 0.5 and scale by 1000 to map coordinates to pixels.
        px = (point[0] + 0.5) * 1000
        py = (point[1] + 0.5) * 1000
        pen.text((px, py), labels[idx], (0, 0, 0))
    canvas.save(jpeg, 'JPEG')
# diff --git a/source/generatefeedvector.py b/source/generatefeedvector.py deleted file mode 100755 index 3c33efa5..00000000 --- a/source/generatefeedvector.py +++ /dev/null @@ -1,65 +0,0 @@
# -*- coding: utf-8 -*-
# Build 'blogdata1.txt': a tab-separated matrix with one row per feed and one
# column per word, holding how often each word occurs in that feed's articles.
import feedparser
import re

import conf
import mongodb

# Compiled once instead of on every getwords() call.
_TAG_RE = re.compile(r'<[^>]+>')
# The original class was [^A-Z^a-z], whose inner '^' made a literal caret
# count as a word character.
_NON_ALPHA_RE = re.compile(r'[^A-Za-z]+')


def getwordcounts(feed_id, feed_title=None):
    """Return ``(title, counts)`` for the feed identified by *feed_id*.

    ``counts`` maps each lowercase word to the number of times it occurs
    across the feed's articles. The feed title is prepended to every
    article's text before the words are counted.

    feed_title -- title of the feed. When omitted, the title is read from
                  the module-level ``feed`` variable set by the main loop.
    """
    if feed_title is None:
        feed_title = feed["feed_title"]
    wc = {}
    # Loop over all the entries
    for article in mongo.get_articles_from_collection(feed_id):
        summary = article["article_content"]

        # Extract a list of words
        for word in getwords(feed_title + ' ' + summary):
            wc[word] = wc.get(word, 0) + 1
    return feed_title, wc


def getwords(html):
    """Return the lowercase alphabetic words found in *html*, tags removed."""
    # Remove all the HTML tags
    txt = _TAG_RE.sub('', html)

    # Split words by all non-alpha characters
    words = _NON_ALPHA_RE.split(txt)

    # Convert to lowercase
    return [word.lower() for word in words if word != '']


apcount = {}      # word -> number of feeds using the word more than once
wordcounts = {}   # feed title -> {word: count}
mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT,
                         conf.MONGODB_DBNAME, conf.MONGODB_USER,
                         conf.MONGODB_PASSWORD)
# Materialise the result so it can be iterated and measured with len() below,
# whatever kind of iterable get_all_feeds() returns.
feeds = list(mongo.get_all_feeds())
for feed in feeds:
    # ``except Exception`` keeps the best-effort behaviour without also
    # swallowing KeyboardInterrupt / SystemExit as a bare ``except`` does.
    try:
        title, wc = getwordcounts(feed["feed_id"], feed["feed_title"])
        wordcounts[title] = wc
        for word, count in wc.items():
            apcount.setdefault(word, 0)
            if count > 1:
                apcount[word] += 1
    except Exception:
        print('Failed to parse feed %s' % feed["feed_title"])

# Keep only words that are neither too rare nor too common across feeds.
wordlist = []
for w, bc in apcount.items():
    frac = float(bc) / len(feeds)
    if frac > 0.1 and frac < 0.5:
        wordlist.append(w)

# ``with`` makes sure the matrix is flushed and the file closed.
with open('blogdata1.txt', 'w') as out:
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        print(blog)
        out.write(blog)
        for word in wordlist:
            if word in wc:
                out.write('\t%d' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')
# diff --git a/source/testclusters.py b/source/testclusters.py deleted file mode 100644 index 728e9c1b..00000000 --- a/source/testclusters.py +++ /dev/null @@
# -1,24 +0,0 @@
#! /usr/bin/env python
#-*- coding: utf-8 -*-
# Cluster the feeds listed in 'blogdata1.txt', print each cluster's members
# and save a 2-D map of the feeds to 'mds2d.jpg'.

import clusters

K = 7  # number of clusters to build

blognames, words, data = clusters.readfile("blogdata1.txt")

coords = clusters.scaledown(data)

# print(...) with a single string argument behaves the same on Python 2 and 3.
# print("") is used for blank lines because a bare print() shows "()" on
# Python 2.
print("Generating clusters...")
kclust = clusters.kcluster(data, k=K, distance=clusters.pearson)
print("")
print("Clusters:")
for i in range(K):
    print("Cluster" + str(i))
    print(", ".join([blognames[r] for r in kclust[i]]))
    print("")

clusters.draw2d(coords, blognames, jpeg='mds2d.jpg')