From 9c7a6eca5f92b0cc3cc23c40b23f8c982fccdd06 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Fri, 16 Nov 2012 14:20:34 +0100
Subject: Added tests with clusters.

---
 source/clusters.py           | 137 +++++++++++++++++++++++++++++++++++++++++++
 source/generatefeedvector.py |  65 ++++++++++++++++++++
 source/testclusters.py       |  23 ++++++++
 3 files changed, 225 insertions(+)
 create mode 100755 source/clusters.py
 create mode 100755 source/generatefeedvector.py
 create mode 100644 source/testclusters.py

diff --git a/source/clusters.py b/source/clusters.py
new file mode 100755
index 00000000..7122c55d
--- /dev/null
+++ b/source/clusters.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+import random
+
+from math import sqrt
+from PIL import Image, ImageDraw
+
+def readfile(filename):
+  lines=[line for line in file(filename)]
+
+  # First line is the column titles
+  colnames=lines[0].strip().split('\t')[1:]
+  rownames=[]
+  data=[]
+  for line in lines[1:]:
+    p=line.strip().split('\t')
+    # First column in each row is the rowname
+    rownames.append(p[0])
+    # The data for this row is the remainder of the row
+    data.append([float(x) for x in p[1:]])
+  return rownames,colnames,data
+
+def pearson(v1,v2):
+  # Simple sums
+  sum1=sum(v1)
+  sum2=sum(v2)
+
+  # Sums of the squares
+  sum1Sq=sum([pow(v,2) for v in v1])
+  sum2Sq=sum([pow(v,2) for v in v2])
+
+  # Sum of the products
+  pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
+
+  # Calculate r (Pearson score)
+  num=pSum-(sum1*sum2/len(v1))
+  den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
+  if den==0: return 0
+
+  return 1.0-num/den
+
+def kcluster(rows,distance=pearson,k=4):
+  # Determine the minimum and maximum values for each point
+  ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows]))
+  for i in range(len(rows[0]))]
+
+  # Create k randomly placed centroids
+  clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0]
+  for i in range(len(rows[0]))] for j in range(k)]
+
+  lastmatches=None
+  for t in range(100):
+    print 'Iteration %d' % t
+    bestmatches=[[] for i in range(k)]
+
+    # Find which centroid is the closest for each row
+    for j in range(len(rows)):
+      row=rows[j]
+      bestmatch=0
+      for i in range(k):
+        d=distance(clusters[i],row)
+        if d<distance(clusters[bestmatch],row): bestmatch=i
+      bestmatches[bestmatch].append(j)
+
+    # If the results are the same as last time, this is complete
+    if bestmatches==lastmatches: break
+    lastmatches=bestmatches
+
+    # Move the centroids to the average of their members
+    for i in range(k):
+      avgs=[0.0]*len(rows[0])
+      if len(bestmatches[i])>0:
+        for rowid in bestmatches[i]:
+          for m in range(len(rows[rowid])):
+            avgs[m]+=rows[rowid][m]
+        for j in range(len(avgs)):
+          avgs[j]/=len(bestmatches[i])
+        clusters[i]=avgs
+
+  return bestmatches
+
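+# scaledown performs multidimensional scaling: items start at random 2D
+# positions, and every point is repeatedly moved along the gradient of the
+# total error between the projected pairwise distances and the real
+# (Pearson) distances, until the error stops improving.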
error + totalerror+=abs(errorterm) + + + # If the answer got worse by moving the points, we are done + if lasterror and lasterror]+>').sub('',html) + + # Split words by all non-alpha characters + words=re.compile(r'[^A-Z^a-z]+').split(txt) + + # Convert to lowercase + return [word.lower() for word in words if word!=''] + + +apcount={} +wordcounts={} +mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ + conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) +feeds = mongo.get_all_feeds() +for feed in feeds: + try: + title,wc=getwordcounts(feed["feed_id"]) + wordcounts[title]=wc + for word,count in list(wc.items()): + apcount.setdefault(word,0) + if count>1: + apcount[word]+=1 + except: + print('Failed to parse feed %s' % feed["feed_title"]) + +wordlist=[] +for w,bc in list(apcount.items()): + frac=float(bc)/len(feeds) + if frac>0.1 and frac<0.5: + wordlist.append(w) + +out=open('blogdata1.txt','w') +out.write('Blog') +for word in wordlist: out.write('\t%s' % word) +out.write('\n') +for blog,wc in list(wordcounts.items()): + print(blog) + out.write(blog) + for word in wordlist: + if word in wc: out.write('\t%d' % wc[word]) + else: out.write('\t0') + out.write('\n') diff --git a/source/testclusters.py b/source/testclusters.py new file mode 100644 index 00000000..0e1e2a32 --- /dev/null +++ b/source/testclusters.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +import clusters + +K = 10 + +blognames,labels,data = clusters.readfile("blogdata1.txt") + +#coords = clusters.scaledown(data) + +print "Generating clusters..." +kclust = clusters.kcluster(data, k=K) +print +print "Clusters:" +for i in range(K): + print "Cluster" + str(i) + print ", ".join([blognames[r] for r in kclust[i]]) + print + + + + +#clusters.draw2d(coords,blognames,jpeg='mds2d.jpg') \ No newline at end of file -- cgit