#! /usr/bin/env python #-*- coding: utf-8 -*- import math import random from math import sqrt from PIL import Image, ImageDraw def readfile(filename): lines=[line for line in file(filename)] # First line is the column titles colnames=lines[0].strip().split('\t')[1:] rownames=[] data=[] for line in lines[1:]: p=line.strip().split('\t') # First column in each row is the rowname rownames.append(p[0]) # The data for this row is the remainder of the row data.append([float(x) for x in p[1:]]) return rownames,colnames,data def pearson(v1,v2): # Simple sums sum1=sum(v1) sum2=sum(v2) # Sums of the squares sum1Sq=sum([pow(v,2) for v in v1]) sum2Sq=sum([pow(v,2) for v in v2]) # Sum of the products pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) # Calculate r (Pearson score) num=pSum-(sum1*sum2/len(v1)) den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) if den==0: return 0 return 1.0-num/den def tanimoto(v1, v2): c1, c2, shr = 0, 0, 0 for i in range(len(v1)): if v1[i] != 0: c1 += 1 # in v1 if v2[i] != 0: c2 += 1 # in v2 if v1[i] != 0 and v2[i] != 0: shr += 1 # in both return 1.0 - (float(shr) / (c1 + c2 - shr)) def euclidian(v1, v2): d = 0.0 for i in range(len(v1)): d += (v1[i] - v2[i])**2 return math.sqrt(d) def kcluster(rows,distance=pearson,k=4): # Determine the minimum and maximum values for each point ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) for i in range(len(rows[0]))] # Create k randomly placed centroids clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(rows[0]))] for j in range(k)] lastmatches=None for t in range(100): print 'Iteration %d' % t bestmatches=[[] for i in range(k)] # Find which centroid is the closest for each row for j in range(len(rows)): row=rows[j] bestmatch=0 for i in range(k): d=distance(clusters[i],row) if d0: for rowid in bestmatches[i]: for m in range(len(rows[rowid])): avgs[m]+=rows[rowid][m] for j in range(len(avgs)): avgs[j]/=len(bestmatches[i]) clusters[i]=avgs return bestmatches def scaledown(data,distance=pearson,rate=0.01): n=len(data) # The real distances between every pair of items realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)] # Randomly initialize the starting points of the locations in 2D loc=[[random.random(),random.random()] for i in range(n)] fakedist=[[0.0 for j in range(n)] for i in range(n)] lasterror=None for m in range(0,1000): # Find projected distances for i in range(n): for j in range(n): fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) for x in range(len(loc[i]))])) # Move points grad=[[0.0,0.0] for i in range(n)] totalerror=0 for k in range(n): for j in range(n): if j==k: continue # The error is percent difference between the distances errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] # Each point needs to be moved away from or towards the other # point in proportion to how much error it has grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm # Keep track of the total error totalerror+=abs(errorterm) # If the answer got worse by moving the points, we are done if lasterror and lasterror