Package ML :: Package Cluster :: Module Butina
[hide private]
[frames] | no frames]

Source Code for Module ML.Cluster.Butina

 1  # $Id: Butina.py 746 2008-07-07 13:21:24Z glandrum $ 
 2  # 
 3  # Copyright (C) 2007-2008 Greg Landrum 
 4  #   All Rights Reserved 
 5  # 
 6  """ Implementation of the clustering algorithm published in: 
 7    Butina JCICS 39 747-750 (1999) 
 8   
 9  """ 
10  import numpy 
11  import RDLogger 
12  logger=RDLogger.logger() 
13   
14 -def ClusterData(data,nPts,distThresh,isDistData=False):
15 """ clusters the data points passed in and returns the list of clusters 16 17 **Arguments** 18 19 - data: a list of lists (or array, or whatever) with the input 20 data (see discussion of _isDistData_ argument for the exception) 21 22 - nPts: the number of points to be used 23 24 - distThresh: elements within this range of each other are considered 25 to be neighbors 26 27 - isDistData: set this toggle when the data passed in is a 28 distance matrix. The distance matrix should be stored 29 symmetrically so that _LookupDist (above) can retrieve 30 the results: 31 for i<j: d_ij = dists[j*(j-1)/2 + i] 32 33 **Returns** 34 35 - a tuple of tuples containing information about the clusters: 36 ( (cluster1_elem1, cluster1_elem2, ...), 37 (cluster2_elem1, cluster2_elem2, ...), 38 ... 39 ) 40 The first element for each cluster is its centroid. 41 42 """ 43 data = numpy.array(data) 44 if isDistData and len(data)>(nPts*(nPts-1)/2): 45 logger.warning("Distance matrix is too long") 46 nbrLists = [None]*nPts 47 for i in range(nPts): nbrLists[i] = [] 48 49 dmIdx=0 50 for i in range(nPts): 51 for j in range(i): 52 if not isDistData: 53 dv = data[i]-data[j] 54 dij = numpy.sqrt(dv*dv) 55 else: 56 dij = data[dmIdx] 57 dmIdx+=1 58 #print i,j,dij 59 if dij<=distThresh: 60 nbrLists[i].append(j) 61 nbrLists[j].append(i) 62 #print nbrLists 63 # sort by the number of neighbors: 64 tLists = [(len(y),x) for x,y in enumerate(nbrLists)] 65 tLists.sort() 66 tLists.reverse() 67 68 res = [] 69 seen = [0]*nPts 70 while tLists: 71 nNbrs,idx = tLists.pop(0) 72 if seen[idx]: 73 continue 74 tRes = [idx] 75 for nbr in nbrLists[idx]: 76 if not seen[nbr]: 77 tRes.append(nbr) 78 seen[nbr]=1 79 res.append(tuple(tRes)) 80 return tuple(res)
81