Package rdkit :: Package Chem :: Package Fingerprints :: Module FingerprintMols
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Fingerprints.FingerprintMols

  1  # $Id: FingerprintMols.py 1605 2011-01-02 11:17:15Z glandrum $ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ utility functionality for fingerprinting sets of molecules 
 12   includes a command line app for working with fingerprints 
 13   and databases 
 14   
 15   
 16  Sample Usage: 
 17   
 18    python FingerprintMols.py  -d data.gdb \ 
 19          -t 'raw_dop_data' --smilesName="Structure" --idName="Mol_ID"  \ 
 20          --outTable="daylight_sig" 
 21   
 22   
 23  """ 
 24  from rdkit import Chem 
 25  from rdkit.Chem import MACCSkeys 
 26  from rdkit.ML.Cluster import Murtagh 
 27  from rdkit import DataStructs 
 28  import sys 
 29  import cPickle 
 30   
 31  _cvsVersion="$Id: FingerprintMols.py 1605 2011-01-02 11:17:15Z glandrum $" 
 32  idx1 = _cvsVersion.find(':')+1 
 33  idx2 = _cvsVersion.rfind('$') 
 34  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 35   
 36   
def error(msg):
    """ writes the error text msg to stderr exactly as given
    (no newline is appended)

    """
    stream = sys.stderr
    stream.write(msg)
def message(msg):
    """ writes the informational text msg to stderr exactly as given
    (no newline is appended)

    """
    stream = sys.stderr
    stream.write(msg)
41
def GetRDKFingerprint(mol):
    """ fingerprints mol using the default parameters, i.e. the attribute
    values of a freshly constructed FingerprinterDetails

    """
    defaults = vars(FingerprinterDetails())
    return FingerprintMol(mol, **defaults)
46
def FoldFingerprintToTargetDensity(fp, **fpArgs):
    """ folds fp by a factor of two until its bit density is high enough

    Folding stops as soon as either:
      - the fraction of on bits is no longer below fpArgs['tgtDensity'], or
      - half the current size would not be larger than fpArgs['minSize']

    Returns the (possibly folded) fingerprint.

    """
    while True:
        onBits = fp.GetNumOnBits()
        totalBits = fp.GetNumBits()
        if not (float(onBits) / totalBits < fpArgs['tgtDensity']):
            # dense enough
            break
        if not (totalBits / 2 > fpArgs['minSize']):
            # another fold would take us to (or below) the minimum size
            break
        fp = DataStructs.FoldFingerprint(fp, 2)
    return fp
58
def FingerprintMol(mol,
                   fingerprinter=Chem.RDKFingerprint,
                   **fpArgs):
    """ generates a fingerprint for a single molecule

    If no keyword arguments are given, the attribute values of a default
    FingerprinterDetails are used in their place.

    Chem.RDKFingerprint is called with the minPath, maxPath, fpSize,
    bitsPerHash, useHs, tgtDensity and minSize entries of fpArgs as
    positional arguments.  Any other fingerprinter receives all of fpArgs
    as keyword arguments and its result is then passed through
    FoldFingerprintToTargetDensity.

    """
    if not fpArgs:
        fpArgs = FingerprinterDetails().__dict__

    if fingerprinter != Chem.RDKFingerprint:
        rawFp = fingerprinter(mol, **fpArgs)
        return FoldFingerprintToTargetDensity(rawFp, **fpArgs)

    positionalNames = ('minPath', 'maxPath', 'fpSize', 'bitsPerHash',
                       'useHs', 'tgtDensity', 'minSize')
    return fingerprinter(mol, *[fpArgs[nm] for nm in positionalNames])
75 76
def FingerprintsFromSmiles(dataSource, idCol, smiCol,
                           fingerprinter=Chem.RDKFingerprint,
                           reportFreq=10, maxMols=-1,
                           **fpArgs):
    """ fpArgs are passed as keyword arguments to the fingerprinter

    Arguments:
      - dataSource: iterable of rows; each row is indexed with idCol and smiCol
      - idCol, smiCol: indices of the id and SMILES entries in each row
      - fingerprinter: callable handed on to FingerprintMol
      - reportFreq: a progress message is written every reportFreq
        successfully fingerprinted molecules (<=0 turns this off)
      - maxMols: stop after this many successes (<=0 means no limit)

    Rows whose SMILES cannot be converted to a molecule are reported via
    error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, smi = str(entry[idCol]), str(entry[smiCol])
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            # a parser failure just means "no molecule"; KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate
            mol = None
        if not mol:
            error('Problems parsing SMILES: %s\n' % smi)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
105
def FingerprintsFromMols(mols,
                         fingerprinter=Chem.RDKFingerprint,
                         reportFreq=10, maxMols=-1,
                         **fpArgs):
    """ fpArgs are passed as keyword arguments to the fingerprinter

    Arguments:
      - mols: iterable of (id,mol) 2-tuples
      - fingerprinter: callable handed on to FingerprintMol
      - reportFreq: a progress message is written every reportFreq
        successfully fingerprinted molecules (<=0 turns this off)
      - maxMols: stop after this many successes (<=0 means no limit)

    Entries whose molecule is empty/None are reported via error() and
    skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for molId, mol in mols:
        if not mol:
            # there is no SMILES in this code path, so report the id
            # (the previous message referenced an undefined name and raised
            # a NameError instead of reporting the problem)
            error('Problems reading molecule for id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
129
def FingerprintsFromPickles(dataSource, idCol, pklCol,
                            fingerprinter=Chem.RDKFingerprint,
                            reportFreq=10, maxMols=-1,
                            **fpArgs):
    """ fpArgs are passed as keyword arguments to the fingerprinter

    Arguments:
      - dataSource: iterable of rows; each row is indexed with idCol and pklCol
      - idCol, pklCol: indices of the id and molecule-pickle entries in each row
      - fingerprinter: callable handed on to FingerprintMol
      - reportFreq: a progress message is written every reportFreq
        successfully fingerprinted molecules (<=0 turns this off)
      - maxMols: stop after this many successes (<=0 means no limit)

    Rows whose pickle cannot be turned into a molecule with Chem.Mol() are
    reported via error() and skipped.

    Returns a list of 2-tuples: (id,fp)

    """
    res = []
    nDone = 0
    for entry in dataSource:
        molId, pkl = str(entry[idCol]), str(entry[pklCol])
        try:
            mol = Chem.Mol(pkl)
        except Exception:
            # a bad pickle just means "no molecule"; KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate
            mol = None
        if not mol:
            error('Problems parsing pickle for id: %s\n' % molId)
            continue
        fp = FingerprintMol(mol, fingerprinter, **fpArgs)
        res.append((molId, fp))
        nDone += 1
        if reportFreq > 0 and not nDone % reportFreq:
            message('Done %d molecules\n' % (nDone))
        if maxMols > 0 and nDone >= maxMols:
            break
    return res
158
def FingerprintsFromDetails(details, reportFreq=10):
    """ generates fingerprints as described by a FingerprinterDetails object

    The input is taken from the first of these that is configured:
      1) a database table (details.dbName and details.tableName)
      2) a delimited text file (details.inFileName with details.useSmiles)
      3) an SD file (details.inFileName with details.useSD)

    If details.outFileName is set, every (id,fp) tuple is pickled to that
    file.  If details.outTableName is set and a database name is available
    (details.outDbName, falling back to details.dbName) the fingerprints
    are also inserted into that table.

    Arguments:
      - details: the settings for the run; details.idName is filled in
        here when it is empty
      - reportFreq: progress messages are written every reportFreq
        molecules (<=0 turns them off)

    Returns the list of (id,fp) 2-tuples, or None when no input data
    was loaded.

    """
    import traceback

    data = None
    # set up front so that a failed read leaves us with "no data" instead
    # of an unbound name further down
    dataSet = None
    idCol = 0
    smiCol = 1
    if details.dbName and details.tableName:
        from rdkit.Dbase.DbConnection import DbConnect
        from rdkit.Dbase import DbInfo
        from rdkit.ML.Data import DataUtils
        try:
            # NOTE(review): conn is not used for reading below; presumably
            # this only checks that the table can be reached - confirm
            conn = DbConnect(details.dbName, details.tableName)
        except Exception:
            error('Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                             details.tableName))
            traceback.print_exc()
        if not details.idName:
            details.idName = DbInfo.GetColumnNames(details.dbName, details.tableName)[0]
        dataSet = DataUtils.DBToData(details.dbName, details.tableName,
                                     what='%s,%s' % (details.idName, details.smilesName))
    elif details.inFileName and details.useSmiles:
        from rdkit.ML.Data import DataUtils
        if not details.idName:
            details.idName = 'ID'
        try:
            dataSet = DataUtils.TextFileToData(details.inFileName,
                                               onlyCols=[details.idName, details.smilesName])
        except IOError:
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
    elif details.inFileName and details.useSD:
        if not details.idName:
            details.idName = 'ID'
        dataSet = []
        try:
            supplier = Chem.SDMolSupplier(details.inFileName)
        except Exception:
            error('Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            for m in supplier:
                if not m:
                    continue
                dataSet.append(m)
                if reportFreq > 0 and not len(dataSet) % reportFreq:
                    message('Read %d molecules\n' % (len(dataSet)))
                # the cap is enforced on every molecule, independently of
                # whether a progress message was due
                if details.maxMols > 0 and len(dataSet) >= details.maxMols:
                    break

            # pair every molecule with a label: the idName property when
            # the molecule has one, otherwise its _Name property
            for i, mol in enumerate(dataSet):
                if mol.HasProp(details.idName):
                    nm = mol.GetProp(details.idName)
                else:
                    nm = mol.GetProp('_Name')
                dataSet[i] = (nm, mol)

    # the details attributes double as keyword arguments for the helpers;
    # the caller's reportFreq is forwarded unless details carries its own
    fpKwargs = dict(details.__dict__)
    fpKwargs.setdefault('reportFreq', reportFreq)

    fps = None
    if dataSet and not details.useSD:
        data = dataSet.GetNamedData()
        if not details.molPklName:
            fps = FingerprintsFromSmiles(data, idCol, smiCol, **fpKwargs)
        else:
            # NOTE(review): the pickle column handed over is smiCol, i.e. the
            # column loaded under details.smilesName; details.molPklName is
            # only used as a switch here - confirm that this is intended
            fps = FingerprintsFromPickles(data, idCol, smiCol, **fpKwargs)
    elif dataSet and details.useSD:
        fps = FingerprintsFromMols(dataSet, **fpKwargs)

    if fps:
        if details.outFileName:
            outF = open(details.outFileName, 'wb+')
            try:
                for entry in fps:
                    cPickle.dump(entry, outF)
            finally:
                # close the file even if pickling fails part way through
                outF.close()
        dbName = details.outDbName or details.dbName
        if details.outTableName and dbName:
            from rdkit.Dbase.DbConnection import DbConnect
            from rdkit.Dbase import DbUtils, DbModule
            conn = DbConnect(dbName)
            #
            #  We don't have a db open already, so we'll need to figure out
            #    the types of our columns...
            #
            if data is not None:
                typeRows = data
                nCols = len(data[0])
                headings = [details.idName, details.smilesName]
            else:
                # SD input has no tabular data; only the id column's type is
                # used below, so infer it from the molecule labels.
                # assumes TypeFinder only needs indexable rows - TODO confirm
                typeRows = [(molId,) for molId, _ in fps]
                nCols = 1
                headings = [details.idName]
            colTypes = DbUtils.TypeFinder(typeRows, len(typeRows), nCols)
            typeStrs = DbUtils.GetTypeStrings(headings, colTypes,
                                              keyCol=details.idName)
            cols = '%s, %s %s' % (typeStrs[0], details.fpColName, DbModule.binaryTypeName)

            # FIX: we should really check to see if the table
            #  is already there and, if so, add the appropriate
            #  column.

            #
            # create the new table
            #
            existing = [x.upper() for x in conn.GetTableNames()]
            if details.replaceTable or details.outTableName.upper() not in existing:
                conn.AddTable(details.outTableName, cols)

            #
            # And add the data
            #
            for molId, fp in fps:
                tpl = molId, DbModule.binaryHolder(fp.ToBinary())
                conn.InsertData(details.outTableName, tpl)
            conn.Commit()
    return fps
278 # ------------------------------------------------ 279 # 280 # Command line parsing stuff 281 # 282 # ------------------------------------------------ 283
class FingerprinterDetails(object):
    """ holds the settings of a fingerprinting (or screening/clustering)
    run; every setting gets a default value when the object is built

    """

    def __init__(self):
        for setup in (self._fingerprinterInit,
                      self._screenerInit,
                      self._clusterInit):
            setup()

    def _fingerprinterInit(self):
        """ sets the defaults used when generating fingerprints """
        # where the molecules come from
        self.dbName = ''
        self.tableName = ''
        self.inFileName = ''
        self.idName = ''
        self.smilesName = 'SMILES'
        self.molPklName = ''
        self.useSmiles = True
        self.useSD = False
        self.maxMols = -1

        # where the fingerprints go
        self.outFileName = ''
        self.outDbName = ''
        self.outTableName = ''
        self.fpColName = "AutoFragmentFP"
        self.replaceTable = True

        # how the fingerprints are built
        self.fingerprinter = Chem.RDKFingerprint
        self.fpSize = 2048
        self.minSize = 64
        self.tgtDensity = 0.3
        self.minPath = 1
        self.maxPath = 7
        self.bitsPerHash = 2
        self.discrimHash = 0
        self.useHs = 0
        self.useValence = 0

    def _screenerInit(self):
        """ sets the defaults used when screening """
        self.metric = DataStructs.TanimotoSimilarity
        self.doScreen = ''
        self.topN = 10
        self.screenThresh = 0.75
        self.doThreshold = 0
        self.smilesTableName = ''
        self.probeSmiles = ''
        self.probeMol = None
        self.noPickle = 0

    def _clusterInit(self):
        """ sets the defaults used when clustering """
        self.clusterAlgo = Murtagh.WARDS
        self.actTableName = ''
        self.actName = ''

    def GetMetricName(self):
        """ returns 'Tanimoto', 'Dice' or 'Cosine' when self.metric is the
        matching DataStructs similarity function; any other true value of
        self.metric is returned unchanged and a false one gives 'Unknown'

        """
        labelled = (('TanimotoSimilarity', 'Tanimoto'),
                    ('DiceSimilarity', 'Dice'),
                    ('CosineSimilarity', 'Cosine'))
        for attrName, label in labelled:
            if self.metric == getattr(DataStructs, attrName):
                return label
        return self.metric or 'Unknown'

    def SetMetricFromName(self, name):
        """ sets self.metric from a (case-insensitive) name: one of
        Tanimoto, Dice or Cosine.  Any other name leaves the metric
        untouched.

        """
        attrName = {'TANIMOTO': 'TanimotoSimilarity',
                    'DICE': 'DiceSimilarity',
                    'COSINE': 'CosineSimilarity'}.get(name.upper())
        if attrName is not None:
            self.metric = getattr(DataStructs, attrName)
355
def Usage():
    """ writes the module's usage text (_usageDoc) to stdout and then
    terminates the interpreter with exit status -1

    """
    print(_usageDoc)
    raise SystemExit(-1)
# Help text printed by Usage().
# NOTE(review): this text is maintained by hand alongside ParseArgs; when
# editing either one, check that every option named here is spelled the way
# ParseArgs registers it with getopt, and that options ParseArgs handles
# (e.g. --molPkl and --keepTable) are described here as well.
_usageDoc="""
Usage: FingerprintMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:
    - -d _dbName_: set the name of the database from which
      to pull input molecule information. If output is
      going to a database, this will also be used for that
      unless the --outDbName option is used.

    - -t _tableName_: set the name of the database table
      from which to pull input molecule information

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --useSD: Assume that the input file is an SD file, not a SMILES
      table.

    - --idName=val: sets the name of the id column in the input
      database. Defaults to be the name of the first db column
      (or *ID* for text files).

    - -o _outFileName_: name of the output file (output will
      be a pickle file with one label,fingerprint entry for each
      molecule).

    - --outTable=val: name of the output db table used to store
      fingerprints. If this table already exists, it will be
      replaced.

    - --outDbName: name of output database, if it's being used.
      Defaults to be the same as the input db.

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the output db table.
      Default is *AutoFragmentFP*

    - --maxSize=val: base size of the fingerprints to be generated
      Default is *2048*

    - --minSize=val: minimum size of the fingerprints to be generated
      (limits the amount of folding that happens). Default is *64*

    - --density=val: target bit density in the fingerprint. The
      fingerprint will be folded until this density is
      reached. Default is *0.3*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *2*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --maxMols=val: sets the maximum number of molecules to be
      fingerprinted.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)

"""
def ParseArgs(details=None):
    """ parses the command line arguments and returns a
      _FingerprinterDetails_ instance with the results.

    **Note**:

      - If you make modifications here, please update the global
        _usageDoc string so the Usage message is up to date.

      - This routine is used by both the fingerprinter, the clusterer and the
        screener; not all arguments make sense for all applications.

    Arguments:
      - details: (optional) the object to store the results on; a new
        FingerprinterDetails is created when this is None.

    The arguments are read from sys.argv[1:].  The first non-option
    argument, if any, is stored as details.inFileName.  An unrecognised
    option prints a traceback followed by the usage message, which exits.

    """
    import getopt
    import sys
    import traceback

    try:
        args = sys.argv[1:]
    except Exception:
        Usage()

    longOpts = [
        'minSize=', 'maxSize=',
        'density=',
        'minPath=', 'maxPath=',
        # the usage text documents --nBitsPerHash; --bitsPerHash is the
        # spelling that used to be registered, so both are accepted
        'bitsPerHash=', 'nBitsPerHash=',
        'smilesName=',
        'molPkl=',
        'useSD',
        'idName=',
        'discrim',
        'outTable=',
        'outDbName=',
        'fpColName=',
        'maxMols=',
        'useMACCS',
        'keepTable',
        # SCREENING:
        'smilesTable=',
        'doScreen=',
        'topN=',
        'thresh=',
        'smiles=',
        'dice',
        'cosine',
        # CLUSTERING:
        'actTable=',
        'actName=',
        'SLINK',
        'CLINK',
        'UPGMA',
    ]
    try:
        args, extras = getopt.getopt(args, 'HVs:d:t:o:h', longOpts)
    except getopt.GetoptError:
        traceback.print_exc()
        Usage()

    if details is None:
        details = FingerprinterDetails()
    if extras:
        details.inFileName = extras[0]

    # options whose argument is stored directly on details:
    #   option -> (attribute name, converter or None)
    valueOpts = {
        '-d': ('dbName', None),
        '-t': ('tableName', None),
        '-o': ('outFileName', None),
        '--minSize': ('minSize', int),
        '--maxSize': ('fpSize', int),
        '--density': ('tgtDensity', float),
        '--outTable': ('outTableName', None),
        '--outDbName': ('outDbName', None),
        '--fpColName': ('fpColName', None),
        '--minPath': ('minPath', int),
        '--maxPath': ('maxPath', int),
        '--bitsPerHash': ('bitsPerHash', int),
        '--nBitsPerHash': ('bitsPerHash', int),
        '--smilesName': ('smilesName', None),
        '--molPkl': ('molPklName', None),
        '--idName': ('idName', None),
        '--maxMols': ('maxMols', int),
        # SCREENER:
        '--smilesTable': ('smilesTableName', None),
        '--smiles': ('probeSmiles', None),
        # CLUSTERS:
        '--actTable': ('actTableName', None),
        '--actName': ('actName', None),
    }

    # NOTE(review): -s and --doScreen are accepted by getopt above but no
    # branch below acts on them - confirm whether that is intended
    for arg, val in args:
        if arg in valueOpts:
            attrName, convert = valueOpts[arg]
            if convert is not None:
                val = convert(val)
            setattr(details, attrName, val)
        elif arg == '-H':
            details.useHs = 1
        elif arg == '-V':
            details.useValence = 1
        elif arg == '--discrim':
            details.discrimHash = 1
        elif arg == '--useSD':
            details.useSmiles = False
            details.useSD = True
        elif arg == '--useMACCS':
            details.fingerprinter = MACCSkeys.GenMACCSKeys
        elif arg == '--keepTable':
            details.replaceTable = False

        # SCREENER:
        elif arg == '--topN':
            details.doThreshold = 0
            details.topN = int(val)
        elif arg == '--thresh':
            details.doThreshold = 1
            details.screenThresh = float(val)
        elif arg == '--dice':
            details.metric = DataStructs.DiceSimilarity
        elif arg == '--cosine':
            details.metric = DataStructs.CosineSimilarity

        # CLUSTERS:
        elif arg == '--SLINK':
            details.clusterAlgo = Murtagh.SLINK
        elif arg == '--CLINK':
            details.clusterAlgo = Murtagh.CLINK
        elif arg == '--UPGMA':
            details.clusterAlgo = Murtagh.UPGMA
        elif arg == '-h':
            Usage()
    return details
if __name__ == '__main__':
    # command line entry point: announce the version on stderr, read the
    # settings from the command line and run the fingerprinter with them
    message("This is FingerprintMols version %s\n\n" % (__VERSION_STRING,))
    details = ParseArgs()
    FingerprintsFromDetails(details)