From af21a33f233eccac6416062a6a0095631f102b36 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <kimble.mandel@gmail.com>
Date: Wed, 5 Dec 2012 21:38:32 +0100
Subject: Removed files related to k-means sorting tests.

---
 source/clusters.py           | 157 -------------------------------------------
 source/generatefeedvector.py |  65 ------------------
 source/testclusters.py       |  24 -------
 3 files changed, 246 deletions(-)
 delete mode 100755 source/clusters.py
 delete mode 100755 source/generatefeedvector.py
 delete mode 100644 source/testclusters.py

(limited to 'source')

diff --git a/source/clusters.py b/source/clusters.py
deleted file mode 100755
index e53fac9b..00000000
--- a/source/clusters.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-import math
-import random
-
-from math import sqrt
-from PIL import Image, ImageDraw
-
-def readfile(filename):
-  lines=[line for line in file(filename)]
-
-  # First line is the column titles
-  colnames=lines[0].strip().split('\t')[1:]
-  rownames=[]
-  data=[]
-  for line in lines[1:]:
-    p=line.strip().split('\t')
-    # First column in each row is the rowname
-    rownames.append(p[0])
-    # The data for this row is the remainder of the row
-    data.append([float(x) for x in p[1:]])
-  return rownames,colnames,data
-
-def pearson(v1,v2):
-  # Simple sums
-  sum1=sum(v1)
-  sum2=sum(v2)
-
-  # Sums of the squares
-  sum1Sq=sum([pow(v,2) for v in v1])
-  sum2Sq=sum([pow(v,2) for v in v2])	
-
-  # Sum of the products
-  pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
-
-  # Calculate r (Pearson score)
-  num=pSum-(sum1*sum2/len(v1))
-  den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
-  if den==0: return 0
-
-  return 1.0-num/den
-
-def tanimoto(v1, v2):
-    c1, c2, shr = 0, 0, 0
-    for i in range(len(v1)):
-        if v1[i] != 0:
-            c1 += 1 # in v1
-        if v2[i] != 0:
-            c2 += 1 # in v2
-        if v1[i] != 0 and v2[i] != 0:
-            shr += 1 # in both
-    return 1.0 - (float(shr) / (c1 + c2 - shr))
-
-def euclidian(v1, v2):
-    d = 0.0
-    for i in range(len(v1)):
-        d += (v1[i] - v2[i])**2
-    return math.sqrt(d)
-
-def kcluster(rows,distance=pearson,k=4):
-  # Determine the minimum and maximum values for each point
-  ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) 
-  for i in range(len(rows[0]))]
-
-  # Create k randomly placed centroids
-  clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] 
-  for i in range(len(rows[0]))] for j in range(k)]
-
-  lastmatches=None
-  for t in range(100):
-    print 'Iteration %d' % t
-    bestmatches=[[] for i in range(k)]
-
-    # Find which centroid is the closest for each row
-    for j in range(len(rows)):
-      row=rows[j]
-      bestmatch=0
-      for i in range(k):
-        d=distance(clusters[i],row)
-        if d<distance(clusters[bestmatch],row): bestmatch=i
-      bestmatches[bestmatch].append(j)
-
-    # If the results are the same as last time, this is complete
-    if bestmatches==lastmatches: break
-    lastmatches=bestmatches
-
-    # Move the centroids to the average of their members
-    for i in range(k):
-      avgs=[0.0]*len(rows[0])
-      if len(bestmatches[i])>0:
-        for rowid in bestmatches[i]:
-          for m in range(len(rows[rowid])):
-            avgs[m]+=rows[rowid][m]
-        for j in range(len(avgs)):
-          avgs[j]/=len(bestmatches[i])
-        clusters[i]=avgs
-
-  return bestmatches
-
-def scaledown(data,distance=pearson,rate=0.01):
-  n=len(data)
-
-  # The real distances between every pair of items
-  realdist=[[distance(data[i],data[j]) for j in range(n)] 
-             for i in range(0,n)]
-
-  # Randomly initialize the starting points of the locations in 2D
-  loc=[[random.random(),random.random()] for i in range(n)]
-  fakedist=[[0.0 for j in range(n)] for i in range(n)]
-
-  lasterror=None
-  for m in range(0,1000):
-    # Find projected distances
-    for i in range(n):
-      for j in range(n):
-        fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 
-                                 for x in range(len(loc[i]))]))
-
-    # Move points
-    grad=[[0.0,0.0] for i in range(n)]
-
-    totalerror=0
-    for k in range(n):
-      for j in range(n):
-        if j==k: continue
-        # The error is percent difference between the distances
-        errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k]
-
-        # Each point needs to be moved away from or towards the other
-        # point in proportion to how much error it has
-        grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
-        grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
-
-        # Keep track of the total error
-        totalerror+=abs(errorterm)
-
-
-    # If the answer got worse by moving the points, we are done
-    if lasterror and lasterror<totalerror: break
-    lasterror=totalerror
-
-    # Move each of the points by the learning rate times the gradient
-    for k in range(n):
-      loc[k][0]-=rate*grad[k][0]
-      loc[k][1]-=rate*grad[k][1]
-
-  return loc
-
-def draw2d(data,labels,jpeg='mds2d.jpg'):
-  img=Image.new('RGB',(2000,2000),(255,255,255))
-  draw=ImageDraw.Draw(img)
-  for i in range(len(data)):
-    x=(data[i][0]+0.5)*1000
-    y=(data[i][1]+0.5)*1000
-    draw.text((x,y),labels[i],(0,0,0))
-  img.save(jpeg,'JPEG')
diff --git a/source/generatefeedvector.py b/source/generatefeedvector.py
deleted file mode 100755
index 3c33efa5..00000000
--- a/source/generatefeedvector.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# -*- coding: utf-8 -*-
-import feedparser
-import re
-
-import conf
-import mongodb
-
-# Returns title and dictionary of word counts for an RSS feed
-def getwordcounts(feed_id):
-  wc={}
-  # Loop over all the entries
-  for article in mongo.get_articles_from_collection(feed_id):
-    summary = article["article_content"]
-
-    # Extract a list of words
-    words = getwords(feed["feed_title"] + ' ' + summary)
-    for word in words:
-      wc.setdefault(word,0)
-      wc[word] += 1
-  return feed["feed_title"], wc
-
-def getwords(html):
-  # Remove all the HTML tags
-  txt=re.compile(r'<[^>]+>').sub('',html)
-
-  # Split words by all non-alpha characters
-  words=re.compile(r'[^A-Z^a-z]+').split(txt)
-
-  # Convert to lowercase
-  return [word.lower() for word in words if word!='']
-
-
-apcount={}
-wordcounts={}
-mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \
-                        conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD)
-feeds = mongo.get_all_feeds()
-for feed in feeds:
-  try:
-    title,wc=getwordcounts(feed["feed_id"])
-    wordcounts[title]=wc
-    for word,count in list(wc.items()):
-      apcount.setdefault(word,0)
-      if count>1:
-        apcount[word]+=1
-  except:
-    print('Failed to parse feed %s' % feed["feed_title"])
-
-wordlist=[]
-for w,bc in list(apcount.items()):
-  frac=float(bc)/len(feeds)
-  if frac>0.1 and frac<0.5:
-    wordlist.append(w)
-
-out=open('blogdata1.txt','w')
-out.write('Blog')
-for word in wordlist: out.write('\t%s' % word)
-out.write('\n')
-for blog,wc in list(wordcounts.items()):
-  print(blog)
-  out.write(blog)
-  for word in wordlist:
-    if word in wc: out.write('\t%d' % wc[word])
-    else: out.write('\t0')
-  out.write('\n')
diff --git a/source/testclusters.py b/source/testclusters.py
deleted file mode 100644
index 728e9c1b..00000000
--- a/source/testclusters.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-import clusters
-
-K = 7
-
-blognames,words,data = clusters.readfile("blogdata1.txt")
-
-coords = clusters.scaledown(data)
-
-print "Generating clusters..."
-kclust = clusters.kcluster(data, k=K, distance=clusters.pearson)
-print
-print "Clusters:"
-for i in range(K):
-    print "Cluster" + str(i)
-    print ", ".join([blognames[r] for r in kclust[i]])
-    print
-
-
-
-
-clusters.draw2d(coords,blognames,jpeg='mds2d.jpg')
-- 
cgit