diff options
Diffstat (limited to 'source')
-rwxr-xr-x | source/clusters.py | 137 | ||||
-rwxr-xr-x | source/generatefeedvector.py | 65 | ||||
-rw-r--r-- | source/testclusters.py | 23 |
3 files changed, 225 insertions, 0 deletions
diff --git a/source/clusters.py b/source/clusters.py new file mode 100755 index 00000000..7122c55d --- /dev/null +++ b/source/clusters.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*-
+import random
+
+from math import sqrt
+from PIL import Image, ImageDraw
+
+def readfile(filename):
+ lines=[line for line in file(filename)]
+
+ # First line is the column titles
+ colnames=lines[0].strip().split('\t')[1:]
+ rownames=[]
+ data=[]
+ for line in lines[1:]:
+ p=line.strip().split('\t')
+ # First column in each row is the rowname
+ rownames.append(p[0])
+ # The data for this row is the remainder of the row
+ data.append([float(x) for x in p[1:]])
+ return rownames,colnames,data
+
+def pearson(v1,v2):
+ # Simple sums
+ sum1=sum(v1)
+ sum2=sum(v2)
+
+ # Sums of the squares
+ sum1Sq=sum([pow(v,2) for v in v1])
+ sum2Sq=sum([pow(v,2) for v in v2])
+
+ # Sum of the products
+ pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
+
+ # Calculate r (Pearson score)
+ num=pSum-(sum1*sum2/len(v1))
+ den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
+ if den==0: return 0
+
+ return 1.0-num/den
+
+def kcluster(rows,distance=pearson,k=4):
+ # Determine the minimum and maximum values for each point
+ ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows]))
+ for i in range(len(rows[0]))]
+
+ # Create k randomly placed centroids
+ clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0]
+ for i in range(len(rows[0]))] for j in range(k)]
+
+ lastmatches=None
+ for t in range(100):
+ print 'Iteration %d' % t
+ bestmatches=[[] for i in range(k)]
+
+ # Find which centroid is the closest for each row
+ for j in range(len(rows)):
+ row=rows[j]
+ bestmatch=0
+ for i in range(k):
+ d=distance(clusters[i],row)
+ if d<distance(clusters[bestmatch],row): bestmatch=i
+ bestmatches[bestmatch].append(j)
+
+ # If the results are the same as last time, this is complete
+ if bestmatches==lastmatches: break
+ lastmatches=bestmatches
+
+ # Move the centroids to the average of their members
+ for i in range(k):
+ avgs=[0.0]*len(rows[0])
+ if len(bestmatches[i])>0:
+ for rowid in bestmatches[i]:
+ for m in range(len(rows[rowid])):
+ avgs[m]+=rows[rowid][m]
+ for j in range(len(avgs)):
+ avgs[j]/=len(bestmatches[i])
+ clusters[i]=avgs
+
+ return bestmatches
+
+def scaledown(data,distance=pearson,rate=0.01):
+ n=len(data)
+
+ # The real distances between every pair of items
+ realdist=[[distance(data[i],data[j]) for j in range(n)]
+ for i in range(0,n)]
+
+ # Randomly initialize the starting points of the locations in 2D
+ loc=[[random.random(),random.random()] for i in range(n)]
+ fakedist=[[0.0 for j in range(n)] for i in range(n)]
+
+ lasterror=None
+ for m in range(0,1000):
+ # Find projected distances
+ for i in range(n):
+ for j in range(n):
+ fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2)
+ for x in range(len(loc[i]))]))
+
+ # Move points
+ grad=[[0.0,0.0] for i in range(n)]
+
+ totalerror=0
+ for k in range(n):
+ for j in range(n):
+ if j==k: continue
+ # The error is percent difference between the distances
+ errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k]
+
+ # Each point needs to be moved away from or towards the other
+ # point in proportion to how much error it has
+ grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
+ grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
+
+ # Keep track of the total error
+ totalerror+=abs(errorterm)
+
+
+ # If the answer got worse by moving the points, we are done
+ if lasterror and lasterror<totalerror: break
+ lasterror=totalerror
+
+ # Move each of the points by the learning rate times the gradient
+ for k in range(n):
+ loc[k][0]-=rate*grad[k][0]
+ loc[k][1]-=rate*grad[k][1]
+
+ return loc
+
+def draw2d(data,labels,jpeg='mds2d.jpg'):
+ img=Image.new('RGB',(2000,2000),(255,255,255))
+ draw=ImageDraw.Draw(img)
+ for i in range(len(data)):
+ x=(data[i][0]+0.5)*1000
+ y=(data[i][1]+0.5)*1000
+ draw.text((x,y),labels[i],(0,0,0))
+ img.save(jpeg,'JPEG')
diff --git a/source/generatefeedvector.py b/source/generatefeedvector.py new file mode 100755 index 00000000..3c33efa5 --- /dev/null +++ b/source/generatefeedvector.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +import feedparser +import re + +import conf +import mongodb + +# Returns title and dictionary of word counts for an RSS feed +def getwordcounts(feed_id): + wc={} + # Loop over all the entries + for article in mongo.get_articles_from_collection(feed_id): + summary = article["article_content"] + + # Extract a list of words + words = getwords(feed["feed_title"] + ' ' + summary) + for word in words: + wc.setdefault(word,0) + wc[word] += 1 + return feed["feed_title"], wc + +def getwords(html): + # Remove all the HTML tags + txt=re.compile(r'<[^>]+>').sub('',html) + + # Split words by all non-alpha characters + words=re.compile(r'[^A-Z^a-z]+').split(txt) + + # Convert to lowercase + return [word.lower() for word in words if word!=''] + + +apcount={} +wordcounts={} +mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ + conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) +feeds = mongo.get_all_feeds() +for feed in feeds: + try: + title,wc=getwordcounts(feed["feed_id"]) + wordcounts[title]=wc + for word,count in list(wc.items()): + apcount.setdefault(word,0) + if count>1: + apcount[word]+=1 + except: + print('Failed to parse feed %s' % feed["feed_title"]) + +wordlist=[] +for w,bc in list(apcount.items()): + frac=float(bc)/len(feeds) + if frac>0.1 and frac<0.5: + wordlist.append(w) + +out=open('blogdata1.txt','w') +out.write('Blog') +for word in wordlist: out.write('\t%s' % word) +out.write('\n') +for blog,wc in list(wordcounts.items()): + print(blog) + out.write(blog) + for word in wordlist: + if word in wc: out.write('\t%d' % wc[word]) + else: out.write('\t0') + out.write('\n') diff --git a/source/testclusters.py b/source/testclusters.py new file mode 100644 index 00000000..0e1e2a32 --- /dev/null +++ b/source/testclusters.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +import clusters + +K = 10 + +blognames,labels,data = clusters.readfile("blogdata1.txt") + +#coords = clusters.scaledown(data) + +print "Generating clusters..." +kclust = clusters.kcluster(data, k=K) +print +print "Clusters:" +for i in range(K): + print "Cluster" + str(i) + print ", ".join([blognames[r] for r in kclust[i]]) + print + + + + +#clusters.draw2d(coords,blognames,jpeg='mds2d.jpg')
\ No newline at end of file |