1
2
3
4
5 from SimDivFilters import rdSimDivPickers as rdsimdiv
6 import DataStructs
7 import numpy
8
10 """ Class to cluster a set of bits based on their correlation
11
12 The correlation matrix is first built by reading the fingerprints
13 from a database or a list of fingerprints
14 """
15
# State-initialisation fragment (presumably the constructor body; the enclosing
# def line is not visible in this listing). It records the bit-id list, the
# requested number of clusters and the clustering method on the instance.
# Starts with no clusters; they are filled in later by the clustering or
# setter code further down.
17 self._clusters = []
18 self._bidList = idList
19
20 self._nClusters = nCluster
# NOTE(review): `type` comes from the unseen signature and shadows the builtin
# of the same name inside this scope.
21 self._type = type
22
24
# Bit-clustering fragment (the enclosing def line is not visible in this
# listing). Turns the correlation matrix `corrMat` into distances, runs the
# hierarchical cluster picker, and stores the clusters as lists of bit ids.
#
# The picker is given distances, so each correlation is inverted.
# Assumes corrMat is a numpy array so the division is element-wise - TODO confirm.
# NOTE(review): any zero entry in corrMat makes this a division by zero;
# confirm the caller guarantees non-zero correlations.
25 distMat = 1/corrMat
26
# self._type selects the clustering method handed to the picker.
27 pkr = rdsimdiv.HierarchicalClusterPicker(self._type)
28
# Arguments: distances, number of items (one per bit id), number of clusters wanted.
# Each element of the result is iterated below as a collection of indices
# into self._bidList.
29 cls = pkr.Cluster(distMat, len(self._bidList), self._nClusters)
30
# Translate every cluster from positional indices to the actual bit ids.
31 self._clusters = []
32 for cl in cls :
33 bcls = []
34 for i in cl :
35 bid = self._bidList[i]
36 bcls.append(bid)
37 self._clusters.append(bcls)
38
# Cluster-setter fragment (the enclosing def line is not visible in this
# listing). Installs an externally supplied list of clusters, which must
# contain exactly self._nClusters entries.
# NOTE(review): the check is an assert, so it disappears when Python runs
# with -O; a mismatched list would then be accepted silently.
40 assert len(clusters) == self._nClusters
41 self._clusters = clusters
42
45
47 """ Map the fingerprint to a real-valued vector of scores based on the bit clusters
48
49 The dimension of the vector is the same as the number of clusters. Each value in the
50 vector corresponds to the number of bits in the corresponding cluster
51 that are turned on in the fingerprint
52
53 ARGUMENTS:
54 - fp : the fingerprint
55 """
56
# Score-counting fragment (the enclosing def line is not visible in this
# listing). For every cluster, counts how many of its bit ids are set in `fp`
# and returns the counts as a list.
# One counter per expected cluster, all starting at zero.
57 scores = [0]*self._nClusters
58
# i is the position in `scores` of the cluster currently being counted.
59 i = 0
60 for cls in self._clusters:
61 for bid in cls :
# fp is indexed by bit id; a truthy value means the bit is on.
62 if fp[bid] :
63 scores[i] += 1
64
# Presumably executed once per cluster, after its bits have been scanned
# (indentation was lost in this listing) - TODO confirm.
65 i += 1
66
67 return scores
68
70 """ Map the fingerprint to a smaller sized (= number of clusters) fingerprint
71
72 Each cluster gets a bit in the new fingerprint, which is turned on if any of the bits in
73 the cluster are turned on in the original fingerprint"""
74
# Folded-fingerprint fragment (the enclosing def line is not visible in this
# listing). Builds a bit vector with one bit per cluster and sets a cluster's
# bit when at least one of the cluster's bit ids is set in `fp`.
# The new vector has self._nClusters bits.
75 ebv = DataStructs.ExplicitBitVect(self._nClusters)
# i is the bit position in `ebv` that belongs to the current cluster.
76 i = 0
77
78 for cls in self._clusters:
79 for bid in cls :
80 if fp[bid] :
81 ebv.SetBit(i)
# One set bit is enough for this cluster, so stop scanning it.
82 break
# Presumably executed once per cluster, whether or not a bit was found
# (indentation was lost in this listing) - TODO confirm.
83 i += 1
84
85 return ebv
86