1
2
3
4
5
6
7
8
9
10
""" Utility functionality for fingerprinting sets of molecules.

Includes a command line app for working with fingerprints
and databases.


Sample Usage:

  python FingerprintMols.py -d data.gdb \
        -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID" \
        --outTable="daylight_sig"


"""
24 from rdkit import Chem
25 from rdkit.Chem import MACCSkeys
26 from rdkit.ML.Cluster import Murtagh
27 from rdkit import DataStructs
28 import sys
29 import cPickle
30
# The version banner is cut out of the revision-control "$Id: ... $" keyword:
# everything after the first ':' and before the closing '$' is kept
# (surrounding blanks included).
_cvsVersion = "$Id: FingerprintMols.py 1605 2011-01-02 11:17:15Z glandrum $"
idx1, idx2 = _cvsVersion.find(':') + 1, _cvsVersion.rfind('$')
__VERSION_STRING = _cvsVersion[idx1:idx2]
35
36
41
46
58
75
76
81 """ fpArgs are passed as keyword arguments to the fingerprinter
82
83 Returns a list of 2-tuples: (id,fp)
84
85 """
86 res = []
87 nDone = 0
88 for entry in dataSource:
89 id,smi = str(entry[idCol]),str(entry[smiCol])
90 try:
91 mol = Chem.MolFromSmiles(smi)
92 except:
93 mol = None
94 if mol:
95 fp = FingerprintMol(mol,fingerprinter,**fpArgs)
96 res.append((id,fp))
97 nDone += 1
98 if reportFreq>0 and not nDone % reportFreq:
99 message('Done %d molecules\n'%(nDone))
100 if maxMols > 0 and nDone >= maxMols:
101 break
102 else:
103 error('Problems parsing SMILES: %s\n'%smi)
104 return res
105
110 """ fpArgs are passed as keyword arguments to the fingerprinter
111
112 Returns a list of 2-tuples: (id,fp)
113
114 """
115 res = []
116 nDone = 0
117 for id,mol in mols:
118 if mol:
119 fp = FingerprintMol(mol,fingerprinter,**fpArgs)
120 res.append((id,fp))
121 nDone += 1
122 if reportFreq>0 and not nDone % reportFreq:
123 message('Done %d molecules\n'%(nDone))
124 if maxMols > 0 and nDone >= maxMols:
125 break
126 else:
127 error('Problems parsing SMILES: %s\n'%smi)
128 return res
129
134 """ fpArgs are passed as keyword arguments to the fingerprinter
135
136 Returns a list of 2-tuples: (id,fp)
137
138 """
139 res = []
140 nDone = 0
141 for entry in dataSource:
142 id,pkl = str(entry[idCol]),str(entry[pklCol])
143 try:
144 mol = Chem.Mol(pkl)
145 except:
146 mol = None
147 if mol:
148 fp = FingerprintMol(mol,fingerprinter,**fpArgs)
149 res.append((id,fp))
150 nDone += 1
151 if reportFreq>0 and not nDone % reportFreq:
152 message('Done %d molecules\n'%(nDone))
153 if maxMols > 0 and nDone >= maxMols:
154 break
155 else:
156 error('Problems parsing pickle for id: %s\n'%id)
157 return res
158
160 data = None
161 if details.dbName and details.tableName:
162 from rdkit.Dbase.DbConnection import DbConnect
163 from rdkit.Dbase import DbInfo
164 from rdkit.ML.Data import DataUtils
165 try:
166 conn = DbConnect(details.dbName,details.tableName)
167 except:
168 import traceback
169 error('Problems establishing connection to database: %s|%s\n'%(details.dbName,
170 details.tableName))
171 traceback.print_exc()
172 if not details.idName:
173 details.idName=DbInfo.GetColumnNames(details.dbName,details.tableName)[0]
174 dataSet = DataUtils.DBToData(details.dbName,details.tableName,
175 what='%s,%s'%(details.idName,details.smilesName))
176 idCol = 0
177 smiCol = 1
178 elif details.inFileName and details.useSmiles:
179 from rdkit.ML.Data import DataUtils
180 conn = None
181 if not details.idName:
182 details.idName='ID'
183 try:
184 dataSet = DataUtils.TextFileToData(details.inFileName,
185 onlyCols=[details.idName,details.smilesName])
186 except IOError:
187 import traceback
188 error('Problems reading from file %s\n'%(details.inFileName))
189 traceback.print_exc()
190
191 idCol = 0
192 smiCol = 1
193 elif details.inFileName and details.useSD:
194 conn = None
195 dataset=None
196 if not details.idName:
197 details.idName='ID'
198 dataSet = []
199 try:
200 s = Chem.SDMolSupplier(details.inFileName)
201 except:
202 import traceback
203 error('Problems reading from file %s\n'%(details.inFileName))
204 traceback.print_exc()
205 else:
206 while 1:
207 try:
208 m = s.next()
209 except StopIteration:
210 break
211 if m:
212 dataSet.append(m)
213 if reportFreq>0 and not len(dataSet) % reportFreq:
214 message('Read %d molecules\n'%(len(dataSet)))
215 if details.maxMols > 0 and len(dataSet) >= details.maxMols:
216 break
217
218 for i,mol in enumerate(dataSet):
219 if mol.HasProp(details.idName):
220 nm = mol.GetProp(details.idName)
221 else:
222 nm = mol.GetProp('_Name')
223 dataSet[i] = (nm,mol)
224 else:
225 dataSet = None
226
227 fps = None
228 if dataSet and not details.useSD:
229 data = dataSet.GetNamedData()
230 if not details.molPklName:
231 fps = apply(FingerprintsFromSmiles,(data,idCol,smiCol),
232 details.__dict__)
233 else:
234 fps = apply(FingerprintsFromPickles,(data,idCol,smiCol),
235 details.__dict__)
236 elif dataSet and details.useSD:
237 fps = apply(FingerprintsFromMols,(dataSet,),details.__dict__)
238
239 if fps:
240 if details.outFileName:
241 outF = open(details.outFileName,'wb+')
242 for i in range(len(fps)):
243 cPickle.dump(fps[i],outF)
244 outF.close()
245 dbName = details.outDbName or details.dbName
246 if details.outTableName and dbName:
247 from rdkit.Dbase.DbConnection import DbConnect
248 from rdkit.Dbase import DbInfo,DbUtils,DbModule
249 conn = DbConnect(dbName)
250
251
252
253
254 colTypes = DbUtils.TypeFinder(data,len(data),len(data[0]))
255 typeStrs = DbUtils.GetTypeStrings([details.idName,details.smilesName],colTypes,
256 keyCol=details.idName)
257 cols = '%s, %s %s'%(typeStrs[0],details.fpColName,DbModule.binaryTypeName)
258
259
260
261
262
263
264
265
266 if details.replaceTable or \
267 details.outTableName.upper() not in [x.upper() for x in conn.GetTableNames()]:
268 conn.AddTable(details.outTableName,cols)
269
270
271
272
273 for id,fp in fps:
274 tpl = id,DbModule.binaryHolder(fp.ToBinary())
275 conn.InsertData(details.outTableName,tpl)
276 conn.Commit()
277 return fps
278
279
280
281
282
283
""" Class for storing the details of a fingerprinting run;
sensible defaults are generated on construction.

"""
293
# -- how the fingerprints are generated --
self.fingerprinter = Chem.RDKFingerprint
self.fpSize = 2048        # set from the command line with --maxSize
self.minSize = 64
self.tgtDensity = 0.3     # set from the command line with --density
self.minPath = 1
self.maxPath = 7
self.bitsPerHash = 2
self.discrimHash = 0      # --discrim
self.useHs = 0            # -H
self.useValence = 0       # -V

# -- where the molecules come from --
self.dbName = ''
self.tableName = ''
self.inFileName = ''
self.idName = ''
self.smilesName = 'SMILES'
self.molPklName = ''
self.useSmiles = True
self.useSD = False
self.maxMols = -1         # values <= 0 mean "no limit"

# -- where the fingerprints go --
self.outFileName = ''
self.outDbName = ''
self.outTableName = ''
self.fpColName = "AutoFragmentFP"
self.replaceTable = True  # cleared by --keepTable
319
# -- similarity metric and probe molecule --
self.metric = DataStructs.TanimotoSimilarity  # replaced by --dice / --cosine
self.probeSmiles = ''                         # --smiles
self.probeMol = None
self.smilesTableName = ''                     # --smilesTable

# -- which hits are kept --
self.doScreen = ''
self.doThreshold = 0      # 0 after --topN, 1 after --thresh
self.topN = 10
self.screenThresh = 0.75

self.noPickle = 0
330
# activity data (--actTable / --actName); empty by default
self.actTableName = ''
self.actName = ''
# clustering algorithm; --SLINK, --CLINK and --UPGMA select the others
self.clusterAlgo = Murtagh.WARDS
335
355
""" writes the usage message (_usageDoc) to standard output and
terminates the program with exit status -1

"""
sys.stdout.write(_usageDoc + '\n')
sys.exit(-1)
362
# Text shown by Usage().  Keep it in sync with the options handled in
# ParseArgs().
_usageDoc="""
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information. If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --useSD: Assume that the input file is an SD file, not a SMILES
      table.

    - --idName=val: sets the name of the id column in the input
      database. Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_: name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints. If this table already exists, it will be
      replaced unless --keepTable is also provided.

    - --keepTable: do not replace the output db table if it already
      exists; the new fingerprints are added to the existing table.

    - --outDbName: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val: base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens). Default is *64*

    - --density=val: target bit density in the fingerprint. The
      fingerprint will be folded until this density is
      reached. Default is *0.3*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *2*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

"""
440
442 """ parses the command line arguments and returns a
443 _FingerprinterDetails_ instance with the results.
444
445 **Note**:
446
447 - If you make modifications here, please update the global
448 _usageDoc string so the Usage message is up to date.
449
450 - This routine is used by both the fingerprinter, the clusterer and the
451 screener; not all arguments make sense for all applications.
452
453 """
454 import sys,getopt
455 try:
456 args = sys.argv[1:]
457 except:
458 Usage()
459 try:
460 args,extras = getopt.getopt(args,'HVs:d:t:o:h',
461 [
462 'minSize=','maxSize=',
463 'density=',
464 'minPath=','maxPath=',
465 'bitsPerHash=',
466 'smilesName=',
467 'molPkl=',
468 'useSD',
469 'idName=',
470 'discrim',
471 'outTable=',
472 'outDbName=',
473 'fpColName=',
474 'maxMols=',
475 'useMACCS',
476 'keepTable',
477
478 'smilesTable=',
479 'doScreen=',
480 'topN=',
481 'thresh=',
482 'smiles=',
483 'dice',
484 'cosine',
485
486 'actTable=',
487 'actName=',
488 'SLINK',
489 'CLINK',
490 'UPGMA',
491
492 ])
493 except:
494 import traceback
495 traceback.print_exc()
496 Usage()
497
498 if details is None:
499 details = FingerprinterDetails()
500 if len(extras):
501 details.inFileName=extras[0]
502
503 for arg,val in args:
504 if arg=='-H':
505 details.useHs=1
506 elif arg=='-V':
507 details.useValence=1
508 elif arg=='-d':
509 details.dbName = val
510 elif arg=='-t':
511 details.tableName = val
512 elif arg=='-o':
513 details.outFileName = val
514 elif arg=='--minSize':
515 details.minSize= int(val)
516 elif arg=='--maxSize':
517 details.fpSize= int(val)
518 elif arg=='--density':
519 details.tgtDensity = float(val)
520 elif arg=='--outTable':
521 details.outTableName = val
522 elif arg=='--outDbName':
523 details.outDbName = val
524 elif arg=='--fpColName':
525 details.fpColName = val
526 elif arg=='--minPath':
527 details.minPath= int(val)
528 elif arg=='--maxPath':
529 details.maxPath= int(val)
530 elif arg=='--nBitsPerHash':
531 details.bitsPerHash= int(val)
532 elif arg=='--discrim':
533 details.discrimHash=1
534 elif arg=='--smilesName':
535 details.smilesName = val
536 elif arg=='--molPkl':
537 details.molPklName = val
538 elif arg=='--useSD':
539 details.useSmiles=False
540 details.useSD=True
541 elif arg=='--idName':
542 details.idName = val
543 elif arg=='--maxMols':
544 details.maxMols = int(val)
545 elif arg=='--useMACCS':
546 details.fingerprinter = MACCSkeys.GenMACCSKeys
547 elif arg=='--keepTable':
548 details.replaceTable=False
549
550
551 elif arg=='--smilesTable':
552 details.smilesTableName=val;
553 elif arg=='--topN':
554 details.doThreshold=0
555 details.topN=int(val)
556 elif arg=='--thresh':
557 details.doThreshold=1
558 details.screenThresh=float(val)
559 elif arg=='--smiles':
560 details.probeSmiles=val;
561 elif arg=='--dice':
562 details.metric = DataStructs.DiceSimilarity
563 elif arg=='--cosine':
564 details.metric = DataStructs.CosineSimilarity
565
566
567 elif arg=='--SLINK':
568 details.clusterAlgo = Murtagh.SLINK
569 elif arg=='--CLINK':
570 details.clusterAlgo = Murtagh.CLINK
571 elif arg=='--UPGMA':
572 details.clusterAlgo = Murtagh.UPGMA
573 elif arg=='--actTable':
574 details.actTableName = val
575 elif arg=='--actName':
576 details.actName = val
577 elif arg=='-h':
578 Usage()
579 return details
580
if __name__ == '__main__':
    # command line entry point: announce the version, then fingerprint
    # whatever the command line arguments describe
    message("This is FingerprintMols version %s\n\n" % (__VERSION_STRING,))
    details = ParseArgs()
    FingerprintsFromDetails(details)
585