Package Chem :: Module BuildFragmentCatalog
[hide private]
[frames] | no frames]

Source Code for Module Chem.BuildFragmentCatalog

  1  # $Id: BuildFragmentCatalog.py 742 2008-07-05 07:42:38Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """  command line utility for working with FragmentCatalogs (CASE-type analysis) 
  8   
  9  **Usage** 
 10   
 11    BuildFragmentCatalog [optional args] <filename> 
 12   
 13   filename, the name of a delimited text file containing InData, is required 
 14   for some modes of operation (see below) 
 15   
 16  **Command Line Arguments** 
 17   
 18   - -n *maxNumMols*:  specify the maximum number of molecules to be processed 
 19   
 20   - -b: build the catalog and OnBitLists 
 21      *requires InData* 
 22   
 23   - -s: score compounds 
 24      *requires InData and a Catalog, can use OnBitLists* 
 25   
 26   - -g: calculate info gains 
 27      *requires Scores* 
 28   
 29   - -d: show details about high-ranking fragments 
 30      *requires a Catalog and Gains* 
 31   
 32   - --catalog=*filename*: filename with the pickled catalog. 
 33      If -b is provided, this file will be overwritten. 
 34   
 35   - --onbits=*filename*: filename to hold the pickled OnBitLists. 
 36     If -b is provided, this file will be overwritten 
 37     
 38   - --scores=*filename*: filename to hold the text score data. 
 39     If -s is provided, this file will be overwritten 
 40   
 41   - --gains=*filename*: filename to hold the text gains data. 
 42     If -g is provided, this file will be overwritten 
 43   
 44   - --details=*filename*: filename to hold the text details data. 
 45     If -d is provided, this file will be overwritten. 
 46   
 47   - --minPath=2: specify the minimum length for a path 
 48   
 49   - --maxPath=6: specify the maximum length for a path 
 50   
 51   - --smiCol=1: specify which column in the input data file contains 
 52       SMILES 
 53   
 54   - --actCol=-1: specify which column in the input data file contains 
 55       activities 
 56   
 57   - --nActs=2: specify the number of possible activity values 
 58   
 59   - --nBits=-1: specify the maximum number of bits to show details for 
 60   
 61  """ 
 62  import sys,os,cPickle 
 63  import Chem 
 64  import RDConfig 
 65  from Chem import FragmentCatalog 
 66  from Dbase.DbConnection import DbConnect 
 67  import numpy 
 68  from ML import InfoTheory 
 69  import types,sets 
 70   
 71  _cvsVersion="$Revision: 742 $" 
 72  idx1 = _cvsVersion.find(':')+1 
 73  idx2 = _cvsVersion.rfind('$') 
 74  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 75   
76 -def message(msg,dest=sys.stdout):
77 dest.write(msg)
78
79 -def BuildCatalog(suppl,maxPts=-1,groupFileName=None, 80 minPath=2,maxPath=6,reportFreq=10):
81 """ builds a fragment catalog from a set of molecules in a delimited text block 82 83 **Arguments** 84 85 - suppl: a mol supplier 86 87 - maxPts: (optional) if provided, this will set an upper bound on the 88 number of points to be considered 89 90 - groupFileName: (optional) name of the file containing functional group 91 information 92 93 - minPath, maxPath: (optional) names of the minimum and maximum path lengths 94 to be considered 95 96 - reportFreq: (optional) how often to display status information 97 98 **Returns** 99 100 a FragmentCatalog 101 102 """ 103 if groupFileName is None: 104 groupFileName = os.path.join(RDConfig.RDDataDir,"FunctionalGroups.txt") 105 106 fpParams = FragmentCatalog.FragCatParams(minPath,maxPath,groupFileName) 107 catalog = FragmentCatalog.FragCatalog(fpParams) 108 fgen = FragmentCatalog.FragCatGenerator() 109 if maxPts >0: 110 nPts = maxPts 111 else: 112 if hasattr(suppl,'__len__'): 113 nPts = len(suppl) 114 else: 115 nPts = -1 116 for i,mol in enumerate(suppl): 117 if i == nPts: 118 break 119 if i and not i%reportFreq: 120 if nPts>-1: 121 message('Done %d of %d, %d paths\n'%(i,nPts,catalog.GetFPLength())) 122 else: 123 message('Done %d, %d paths\n'%(i,catalog.GetFPLength())) 124 fgen.AddFragsFromMol(mol,catalog) 125 return catalog
126
127 -def ScoreMolecules(suppl,catalog,maxPts=-1,actName='',acts=None, 128 nActs=2,reportFreq=10):
129 """ scores the compounds in a supplier using a catalog 130 131 **Arguments** 132 133 - suppl: a mol supplier 134 135 - catalog: the FragmentCatalog 136 137 - maxPts: (optional) the maximum number of molecules to be 138 considered 139 140 - actName: (optional) the name of the molecule's activity property. 141 If this is not provided, the molecule's last property will be used. 142 143 - acts: (optional) a sequence of activity values (integers). 144 If not provided, the activities will be read from the molecules. 145 146 - nActs: (optional) number of possible activity values 147 148 - reportFreq: (optional) how often to display status information 149 150 **Returns** 151 152 a 2-tuple: 153 154 1) the results table (a 3D array of ints nBits x 2 x nActs) 155 156 2) a list containing the on bit lists for each molecule 157 158 """ 159 nBits = catalog.GetFPLength() 160 resTbl = numpy.zeros((nBits,2,nActs),numpy.int) 161 obls = [] 162 163 if not actName and not acts: 164 actName = suppl[0].GetPropNames()[-1] 165 166 167 fpgen = FragmentCatalog.FragFPGenerator() 168 suppl.reset() 169 i = 1 170 for mol in suppl: 171 if i and not i%reportFreq: 172 message('Done %d.\n'%(i)) 173 if mol: 174 if not acts: 175 act = int(mol.GetProp(actName)) 176 else: 177 act = acts[i-1] 178 fp = fpgen.GetFPForMol(mol,catalog) 179 obls.append([x for x in fp.GetOnBits()]) 180 for j in range(nBits): 181 resTbl[j,0,act] += 1 182 for id in obls[i-1]: 183 resTbl[id-1,0,act] -= 1 184 resTbl[id-1,1,act] += 1 185 else: 186 obls.append([]) 187 i+=1 188 return resTbl,obls
189
190 -def ScoreFromLists(bitLists,suppl,catalog,maxPts=-1,actName='',acts=None, 191 nActs=2,reportFreq=10):
192 """ similar to _ScoreMolecules()_, but uses pre-calculated bit lists 193 for the molecules (this speeds things up a lot) 194 195 196 **Arguments** 197 198 - bitLists: sequence of on bit sequences for the input molecules 199 200 - suppl: the input supplier (we read activities from here) 201 202 - catalog: the FragmentCatalog 203 204 - maxPts: (optional) the maximum number of molecules to be 205 considered 206 207 - actName: (optional) the name of the molecule's activity property. 208 If this is not provided, the molecule's last property will be used. 209 210 - nActs: (optional) number of possible activity values 211 212 - reportFreq: (optional) how often to display status information 213 214 **Returns** 215 216 the results table (a 3D array of ints nBits x 2 x nActs) 217 218 """ 219 nBits = catalog.GetFPLength() 220 if maxPts >0: 221 nPts = maxPts 222 else: 223 nPts = len(bitLists) 224 resTbl = numpy.zeros((nBits,2,nActs),numpy.int) 225 if not actName and not acts: 226 actName = suppl[0].GetPropNames()[-1] 227 suppl.reset() 228 for i in range(1,nPts+1): 229 mol = suppl.next() 230 if not acts: 231 act = int(mol.GetProp(actName)) 232 else: 233 act = acts[i-1] 234 if i and not i%reportFreq: 235 message('Done %d of %d\n'%(i,nPts)) 236 ids = sets.Set() 237 for id in bitLists[i-1]: 238 ids.add(id-1) 239 for j in range(nBits): 240 resTbl[j,0,act] += 1 241 for id in ids: 242 resTbl[id,0,act] -= 1 243 resTbl[id,1,act] += 1 244 return resTbl
245
246 -def CalcGains(suppl,catalog,topN=-1,actName='',acts=None, 247 nActs=2,reportFreq=10,biasList=None,collectFps=0):
248 """ calculates info gains by constructing fingerprints 249 *DOC* 250 251 Returns a 2-tuple: 252 1) gains matrix 253 2) list of fingerprints 254 255 """ 256 nBits = catalog.GetFPLength() 257 if topN < 0: 258 topN = nBits 259 if not actName and not acts: 260 actName = suppl[0].GetPropNames()[-1] 261 262 gains = [0]*nBits 263 if hasattr(suppl,'__len__'): 264 nMols = len(suppl) 265 else: 266 nMols = -1 267 fpgen = FragmentCatalog.FragFPGenerator() 268 #ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 269 if biasList: 270 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY) 271 ranker.SetBiasList(biasList) 272 else: 273 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 274 i = 0 275 fps = [] 276 for mol in suppl: 277 if not acts: 278 try: 279 act = int(mol.GetProp(actName)) 280 except KeyError: 281 message('ERROR: Molecule has no property: %s\n'%(actName)) 282 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames()))) 283 raise KeyError,actName 284 else: 285 act = acts[i] 286 if i and not i%reportFreq: 287 if nMols>0: 288 message('Done %d of %d.\n'%(i,nMols)) 289 else: 290 message('Done %d.\n'%(i)) 291 fp = fpgen.GetFPForMol(mol,catalog) 292 ranker.AccumulateVotes(fp,act) 293 i+=1; 294 if collectFps: 295 fps.append(fp) 296 gains = ranker.GetTopN(topN) 297 return gains,fps
298
299 -def CalcGainsFromFps(suppl,fps,topN=-1,actName='',acts=None, 300 nActs=2,reportFreq=10,biasList=None):
301 """ calculates info gains from a set of fingerprints 302 303 *DOC* 304 305 """ 306 nBits = len(fps[0]) 307 if topN < 0: 308 topN = nBits 309 if not actName and not acts: 310 actName = suppl[0].GetPropNames()[-1] 311 312 gains = [0]*nBits 313 if hasattr(suppl,'__len__'): 314 nMols = len(suppl) 315 else: 316 nMols = -1 317 if biasList: 318 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.BIASENTROPY) 319 ranker.SetBiasList(biasList) 320 else: 321 ranker = InfoTheory.InfoBitRanker(nBits,nActs,InfoTheory.InfoType.ENTROPY) 322 for i,mol in enumerate(suppl): 323 if not acts: 324 try: 325 act = int(mol.GetProp(actName)) 326 except KeyError: 327 message('ERROR: Molecule has no property: %s\n'%(actName)) 328 message('\tAvailable properties are: %s\n'%(str(mol.GetPropNames()))) 329 raise KeyError,actName 330 else: 331 act = acts[i] 332 if i and not i%reportFreq: 333 if nMols>0: 334 message('Done %d of %d.\n'%(i,nMols)) 335 else: 336 message('Done %d.\n'%(i)) 337 fp = fps[i] 338 ranker.AccumulateVotes(fp,act) 339 gains = ranker.GetTopN(topN) 340 return gains
341
342 -def OutputGainsData(outF,gains,cat,nActs=2):
343 actHeaders = ['Act-%d'%(x) for x in range(nActs)] 344 if cat: 345 outF.write('id,Description,Gain,%s\n'%(','.join(actHeaders))) 346 else: 347 outF.write('id,Gain,%s\n'%(','.join(actHeaders))) 348 for entry in gains: 349 id = int(entry[0]) 350 outL = [str(id)] 351 if cat: 352 descr = cat.GetBitDescription(id) 353 outL.append(descr) 354 outL.append('%.6f'%entry[1]) 355 outL += ['%d'%x for x in entry[2:]] 356 outF.write(','.join(outL)) 357 outF.write('\n')
358 359
360 -def ProcessGainsData(inF,delim=',',idCol=0,gainCol=1):
361 """ reads a list of ids and info gains out of an input file 362 363 """ 364 res = [] 365 inL = inF.readline() 366 for line in inF.xreadlines(): 367 splitL = line.strip().split(delim) 368 res.append((splitL[idCol],float(splitL[gainCol]))) 369 return res
370 371
372 -def ShowDetails(catalog,gains,nToDo=-1,outF=sys.stdout,idCol=0,gainCol=1, 373 outDelim=','):
374 """ 375 gains should be a sequence of sequences. The idCol entry of each 376 sub-sequence should be a catalog ID. _ProcessGainsData()_ provides 377 suitable input. 378 379 """ 380 if nToDo < 0: 381 nToDo = len(gains) 382 for i in range(nToDo): 383 id = int(gains[i][idCol]) 384 gain = float(gains[i][gainCol]) 385 descr = catalog.GetFragDescription(id) 386 if descr: 387 outF.write('%s\n'%(outDelim.join((str(id),descr,str(gain)))))
388
389 -def SupplierFromDetails(details):
390 from VLib.NodeLib.DbMolSupply import DbMolSupplyNode 391 from VLib.NodeLib.SmilesSupply import SmilesSupplyNode 392 393 if details.dbName: 394 conn = DbConnect(details.dbName,details.tableName) 395 suppl = DbMolSupplyNode(conn.GetData()) 396 else: 397 suppl = SmilesSupplyNode(details.inFileName,delim=details.delim, 398 nameColumn=details.nameCol, 399 smilesColumn=details.smiCol, 400 titleLine=details.hasTitle) 401 if type(details.actCol)==types.IntType: 402 suppl.reset() 403 m = suppl.next() 404 actName = m.GetPropNames()[details.actCol] 405 details.actCol = actName 406 if type(details.nameCol)==types.IntType: 407 suppl.reset() 408 m = suppl.next() 409 nameName = m.GetPropNames()[details.nameCol] 410 details.nameCol = nameName 411 suppl.reset() 412 if type(details.actCol)==types.IntType: 413 suppl.reset() 414 m = suppl.next() 415 actName = m.GetPropNames()[details.actCol] 416 details.actCol = actName 417 if type(details.nameCol)==types.IntType: 418 suppl.reset() 419 m = suppl.next() 420 nameName = m.GetPropNames()[details.nameCol] 421 details.nameCol = nameName 422 suppl.reset() 423 return suppl
424 425
426 -def Usage():
427 print "This is BuildFragmentCatalog version %s"%(__VERSION_STRING) 428 print 'usage error' 429 #print __doc__ 430 sys.exit(-1)
431
432 -class RunDetails(object):
433 numMols=-1 434 doBuild=0 435 doSigs=0 436 doScore=0 437 doGains=0 438 doDetails=0 439 catalogName=None 440 onBitsName=None 441 scoresName=None 442 gainsName=None 443 dbName='' 444 tableName=None 445 detailsName=None 446 inFileName=None 447 fpName=None 448 minPath=2 449 maxPath=6 450 smiCol=1 451 actCol=-1 452 nameCol=-1 453 hasTitle=1 454 nActs = 2 455 nBits=-1 456 delim=',' 457 biasList=None 458 topN=-1
459
460 -def ParseArgs(details):
461 import getopt 462 try: 463 args,extras = getopt.getopt(sys.argv[1:],'n:d:cst', 464 ['catalog=','onbits=', 465 'scoresFile=','gainsFile=','detailsFile=','fpFile=', 466 'minPath=','maxPath=','smiCol=','actCol=','nameCol=','nActs=', 467 'nBits=','biasList=','topN=', 468 'build','sigs','gains','details','score','noTitle']) 469 except: 470 sys.stderr.write('Error parsing command line:\n') 471 import traceback 472 traceback.print_exc() 473 Usage() 474 for arg,val in args: 475 if arg=='-n': 476 details.numMols=int(val) 477 elif arg=='-c': 478 details.delim=',' 479 elif arg=='-s': 480 details.delim=' ' 481 elif arg=='-t': 482 details.delim='\t' 483 elif arg=='-d': 484 details.dbName=val 485 elif arg=='--build': 486 details.doBuild=1 487 elif arg=='--score': 488 details.doScore=1 489 elif arg=='--gains': 490 details.doGains=1 491 elif arg=='--sigs': 492 details.doSigs=1 493 elif arg=='-details': 494 details.doDetails=1 495 elif arg=='--catalog': 496 details.catalogName=val 497 elif arg=='--onbits': 498 details.onBitsName=val 499 elif arg=='--scoresFile': 500 details.scoresName=val 501 elif arg=='--gainsFile': 502 details.gainsName=val 503 elif arg=='--detailsFile': 504 details.detailsName=val 505 elif arg=='--fpFile': 506 details.fpName=val 507 elif arg=='--minPath': 508 details.minPath=int(val) 509 elif arg=='--maxPath': 510 details.maxPath=int(val) 511 elif arg=='--smiCol': 512 try: 513 details.smiCol=int(val) 514 except ValueError: 515 details.smiCol=val 516 elif arg=='--actCol': 517 try: 518 details.actCol=int(val) 519 except ValueError: 520 details.actCol=val 521 elif arg=='--nameCol': 522 try: 523 details.nameCol=int(val) 524 except ValueError: 525 details.nameCol=val 526 elif arg=='--nActs': 527 details.nActs=int(val) 528 elif arg=='--nBits': 529 details.nBits=int(val) 530 elif arg=='--noTitle': 531 details.hasTitle=0 532 elif arg=='--biasList': 533 details.biasList=tuple(eval(val)) 534 elif arg=='--topN': 535 details.topN=int(val) 536 elif arg=='-h': 537 Usage() 538 sys.exit(0) 539 else: 540 Usage() 541 if len(extras): 542 if details.dbName: 543 details.tableName=extras[0] 544 else: 545 details.inFileName = extras[0] 546 else: 547 Usage()
548 if __name__=='__main__': 549 import time 550 details = RunDetails() 551 ParseArgs(details) 552 from cStringIO import StringIO 553 suppl = SupplierFromDetails(details) 554 555 cat = None 556 obls = None 557 if details.doBuild: 558 if not suppl: 559 message("We require inData to generate a catalog\n") 560 sys.exit(-2) 561 message("Building catalog\n") 562 t1 = time.time() 563 cat = BuildCatalog(suppl,maxPts=details.numMols, 564 minPath=details.minPath,maxPath=details.maxPath) 565 t2 = time.time() 566 message("\tThat took %.2f seconds.\n"%(t2-t1)) 567 if details.catalogName: 568 message("Dumping catalog data\n") 569 cPickle.dump(cat,open(details.catalogName,'wb+')) 570 elif details.catalogName: 571 message("Loading catalog\n") 572 cat = cPickle.load(open(details.catalogName,'rb')) 573 if details.onBitsName: 574 try: 575 obls = cPickle.load(open(details.onBitsName,'rb')) 576 except: 577 obls = None 578 else: 579 if len(obls)<(inD.count('\n')-1): 580 obls = None 581 scores = None 582 if details.doScore: 583 if not suppl: 584 message("We require inData to score molecules\n") 585 sys.exit(-2) 586 if not cat: 587 message("We require a catalog to score molecules\n") 588 sys.exit(-2) 589 message("Scoring compounds\n") 590 if not obls or len(obls)<details.numMols: 591 scores,obls = ScoreMolecules(suppl,cat,maxPts=details.numMols, 592 actName=details.actCol, 593 nActs=details.nActs) 594 if details.scoresName: 595 cPickle.dump(scores,open(details.scoresName,'wb+')) 596 if details.onBitsName: 597 cPickle.dump(obls,open(details.onBitsName,'wb+')) 598 else: 599 scores = ScoreFromLists(obls,suppl,cat,maxPts=details.numMols, 600 actName=details.actCol, 601 nActs=details.nActs) 602 elif details.scoresName: 603 scores = cPickle.load(open(details.scoresName,'rb')) 604 605 if details.fpName and os.path.exists(details.fpName) and not details.doSigs: 606 message("Reading fingerprints from file.\n") 607 fps = cPickle.load(open(details.fpName,'rb')) 608 else: 609 fps = [] 610 gains = None 611 if details.doGains: 612 if not suppl: 613 message("We require inData to calculate gains\n") 614 sys.exit(-2) 615 if not (cat or fps): 616 message("We require either a catalog or fingerprints to calculate gains\n") 617 sys.exit(-2) 618 message("Calculating Gains\n") 619 t1 = time.time() 620 if details.fpName: 621 collectFps=1 622 else: 623 collectFps=0 624 if not fps: 625 gains,fps = CalcGains(suppl,cat,topN=details.topN,actName=details.actCol, 626 nActs=details.nActs,biasList=details.biasList, 627 collectFps=collectFps) 628 if details.fpName: 629 message("Writing fingerprint file.\n") 630 tmpF=open(details.fpName,'wb+') 631 cPickle.dump(fps,tmpF,1) 632 tmpF.close() 633 else: 634 gains = CalcGainsFromFps(suppl,fps,topN=details.topN,actName=details.actCol, 635 nActs=details.nActs,biasList=details.biasList) 636 t2=time.time() 637 message("\tThat took %.2f seconds.\n"%(t2-t1)) 638 if details.gainsName: 639 outF = open(details.gainsName,'w+') 640 OutputGainsData(outF,gains,cat,nActs=details.nActs) 641 else: 642 if details.gainsName: 643 inF = open(details.gainsName,'r') 644 gains = ProcessGainsData(inF) 645 646 647 if details.doDetails: 648 if not cat: 649 message("We require a catalog to get details\n") 650 sys.exit(-2) 651 if not gains: 652 message("We require gains data to get details\n") 653 sys.exit(-2) 654 io = StringIO() 655 io.write('id,SMILES,gain\n') 656 ShowDetails(cat,gains,nToDo=details.nBits,outF=io) 657 if details.detailsName: 658 open(details.detailsName,'w+').write(io.getvalue()) 659 else: 660 sys.stderr.write(io.getvalue()) 661