1
2
3
4
5
6
7
8
9
10
11 """ utility functionality for clustering molecules using fingerprints
12 includes a command line app for clustering
13
14
15 Sample Usage:
16 python ClusterMols.py -d data.gdb -t daylight_sig \
17 --idName="CAS_TF" -o clust1.pkl \
18 --actTable="dop_test" --actName="moa_quant"
19
20 """
21 from rdkit.Dbase.DbConnection import DbConnect
22 from rdkit.Dbase import DbInfo,DbUtils
23 from rdkit.ML.Data import DataUtils
24 from rdkit.ML.Cluster import Clusters
25 from rdkit.ML.Cluster import Murtagh
26 import sys,cPickle
27 from rdkit.Chem.Fingerprints import FingerprintMols,MolSimilarity
28 from rdkit import DataStructs
29 import numpy
# Version bookkeeping: the text between the first ':' and the closing '$'
# of the revision-control "$Id: ... $" keyword is used as the version string.
_cvsVersion = "$Id: ClusterMols.py 1528 2010-09-26 17:04:37Z glandrum $"
idx1 = 1 + _cvsVersion.find(':')
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2],)
34
# Module-level shortcuts to the message/error helpers defined in FingerprintMols.
message, error = FingerprintMols.message, FingerprintMols.error
37
39 """ data should be a list of tuples with fingerprints in position 1
40 (the rest of the elements of the tuple are not important)
41
42 Returns the symmetric distance matrix
43 (see ML.Cluster.Resemblance for layout documentation)
44
45 """
46 nPts = len(data)
47 res = numpy.zeros((nPts*(nPts-1)/2),numpy.float)
48 nSoFar=0
49 for col in xrange(1,nPts):
50 for row in xrange(col):
51 fp1 = data[col][1]
52 fp2 = data[row][1]
53 if fp1.GetNumBits()>fp2.GetNumBits():
54 fp1 = DataStructs.FoldFingerprint(fp1,fp1.GetNumBits()/fp2.GetNumBits())
55 elif fp2.GetNumBits()>fp1.GetNumBits():
56 fp2 = DataStructs.FoldFingerprint(fp2,fp2.GetNumBits()/fp1.GetNumBits())
57 sim = metric(fp1,fp2)
58 if isSimilarity:
59 sim = 1.-sim
60 res[nSoFar] = sim
61 nSoFar += 1
62 return res
63
def ClusterPoints(data,metric,algorithmId,haveLabels=False,haveActs=True,returnDistances=False):
    """ builds a cluster tree for a set of fingerprinted points

    **Arguments**

      - data: a sequence of tuples laid out as (id/label, fingerprint[, activity]);
        when present, the activity must be convertible to an int

      - metric: handed to GetDistanceMatrix() to compare pairs of fingerprints

      - algorithmId: clustering algorithm identifier, passed through
        unchanged to Murtagh.ClusterData()

      - haveLabels: if true, element 0 of each tuple is used verbatim as the
        point name; otherwise the name is 'Mol: <id>'

      - haveActs: if true and the first tuple has more than two elements,
        element 2 of every tuple is attached to its point as an integer

      - returnDistances: if true, the distance matrix is returned as well

    **Returns**

      the cluster tree, or the 2-tuple (cluster tree, distance matrix) when
      returnDistances is set

    """
    message('Generating distance matrix.\n')
    dMat = GetDistanceMatrix(data,metric)
    message('Clustering\n')
    clustTree = Murtagh.ClusterData(dMat,len(data),algorithmId,
                                    isDistData=1)[0]

    # Only the first tuple is inspected to decide whether activities exist.
    acts = []
    if haveActs and len(data[0])>2:
        acts = [int(x[2]) for x in data]

    if haveLabels:
        labels = [x[0] for x in data]
    else:
        labels = ['Mol: %s'%str(x[0]) for x in data]

    clustTree._ptLabels = labels
    if acts:
        clustTree._ptValues = acts

    for pt in clustTree.GetPoints():
        # Shift the point index down by one to address the 0-based
        # label/activity lists (presumably GetIndex() is 1-based).
        idx = pt.GetIndex()-1
        pt.SetName(labels[idx])
        if acts:
            # acts already holds ints, so no second conversion is needed.
            try:
                pt.SetData(acts[idx])
            except Exception:
                # Best effort: a point that rejects the value is simply left
                # without data.  Catching Exception (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                pass

    if returnDistances:
        return clustTree,dMat
    return clustTree
94
119
120 _usageDoc="""
121 Usage: ClusterMols.py [args] <fName>
122
123 If <fName> is provided and no tableName is specified (see below),
124 data will be read from the text file <fName>. Text files delimited
125 with either commas (extension .csv) or tabs (extension .txt) are
126 supported.
127
128 Command line arguments are:
129
130 - -d _dbName_: set the name of the database from which
131 to pull input fingerprint information.
132
133 - -t _tableName_: set the name of the database table
134 from which to pull input fingerprint information
135
136 - --idName=val: sets the name of the id column in the input
137 database. Default is *ID*.
138
139 - -o _outFileName_: name of the output file (output will
140 be a pickle (.pkl) file with the cluster tree)
141
142 - --actTable=val: name of table containing activity values
143 (used to color points in the cluster tree).
144
145 - --actName=val: name of column with activities in the activity
146 table. The values in this column should either be integers or
147 convertible into integers.
148
149 - --SLINK: use the single-linkage clustering algorithm
150 (default is Ward's minimum variance)
151
152 - --CLINK: use the complete-linkage clustering algorithm
153 (default is Ward's minimum variance)
154
155 - --UPGMA: use the group-average clustering algorithm
156 (default is Ward's minimum variance)
157
158 - --dice: use the DICE similarity metric instead of Tanimoto
159
160 - --cosine: use the cosine similarity metric instead of Tanimoto
161
162 - --fpColName=val: name to use for the column which stores
163 fingerprints (in pickled format) in the input db table.
164 Default is *AutoFragmentFP*
165
166 - --minPath=val: minimum path length to be included in
167 fragment-based fingerprints. Default is *2*.
168
169 - --maxPath=val: maximum path length to be included in
170 fragment-based fingerprints. Default is *7*.
171
172 - --nBitsPerHash: number of bits to be set in the output
173 fingerprint for each fragment. Default is *4*.
174
175 - --discrim: use of path-based discriminators to hash bits.
176 Default is *false*.
177
178 - -V: include valence information in the fingerprints
179 Default is *false*.
180
181 - -H: include Hs in the fingerprint
182 Default is *false*.
183
184 - --useMACCS: use the public MACCS keys to do the fingerprinting
185 (instead of a daylight-type fingerprint)
186
187
188 """
if __name__ == '__main__':
    # Command-line entry point: announce the version, hand this module's
    # usage text to FingerprintMols (presumably shown by its argument
    # parser), then cluster using the parsed settings.
    message("This is ClusterMols version %s\n\n" % __VERSION_STRING)
    FingerprintMols._usageDoc = _usageDoc
    ClusterFromDetails(FingerprintMols.ParseArgs())
194