1
2
3
4
5
6 """ Implementation of the clustering algorithm published in:
7 Butina JCICS 39 747-750 (1999)
8
9 """
10 import numpy
11 import RDLogger
12 logger=RDLogger.logger()
13
15 """ clusters the data points passed in and returns the list of clusters
16
17 **Arguments**
18
19 - data: a list of lists (or array, or whatever) with the input
20 data (see discussion of _isDistData_ argument for the exception)
21
22 - nPts: the number of points to be used
23
24 - distThresh: elements within this range of each other are considered
25 to be neighbors
26
27 - isDistData: set this toggle when the data passed in is a
28 distance matrix. The distance matrix should be stored
29 symmetrically so that _LookupDist (above) can retrieve
30 the results:
31 for i<j: d_ij = dists[j*(j-1)/2 + i]
32
33 **Returns**
34
35 - a tuple of tuples containing information about the clusters:
36 ( (cluster1_elem1, cluster1_elem2, ...),
37 (cluster2_elem1, cluster2_elem2, ...),
38 ...
39 )
40 The first element for each cluster is its centroid.
41
42 """
43 data = numpy.array(data)
44 if isDistData and len(data)>(nPts*(nPts-1)/2):
45 logger.warning("Distance matrix is too long")
46 nbrLists = [None]*nPts
47 for i in range(nPts): nbrLists[i] = []
48
49 dmIdx=0
50 for i in range(nPts):
51 for j in range(i):
52 if not isDistData:
53 dv = data[i]-data[j]
54 dij = numpy.sqrt(dv*dv)
55 else:
56 dij = data[dmIdx]
57 dmIdx+=1
58
59 if dij<=distThresh:
60 nbrLists[i].append(j)
61 nbrLists[j].append(i)
62
63
64 tLists = [(len(y),x) for x,y in enumerate(nbrLists)]
65 tLists.sort()
66 tLists.reverse()
67
68 res = []
69 seen = [0]*nPts
70 while tLists:
71 nNbrs,idx = tLists.pop(0)
72 if seen[idx]:
73 continue
74 tRes = [idx]
75 for nbr in nbrLists[idx]:
76 if not seen[nbr]:
77 tRes.append(nbr)
78 seen[nbr]=1
79 res.append(tuple(tRes))
80 return tuple(res)
81