Package ML :: Module GrowComposite
[hide private]
[frames] | no frames]

Source Code for Module ML.GrowComposite

  1  # $Id: GrowComposite.py 778 2008-07-30 09:51:55Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2003-2006  greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7   
  8  """ command line utility for growing composite models 
  9   
 10  **Usage** 
 11   
 12    _GrowComposite [optional args] filename_ 
 13   
 14  **Command Line Arguments** 
 15   
 16    - -n *count*: number of new models to build 
 17   
 18    - -C *pickle file name*:  name of file containing composite upon which to build. 
 19   
 20    - --inNote *note*: note to be used in loading composite models from the database 
 21        for growing 
 22   
 23    - --balTable *table name*:  table from which to take the original data set 
 24       (for balancing) 
 25   
 26    - --balWeight *weight*: (between 0 and 1) weighting factor for the new data 
 27       (for balancing). OR, *weight* can be a list of weights 
 28   
 29    - --balCnt *count*: number of individual models in the balanced composite 
 30       (for balancing) 
 31   
 32    - --balH: use only the holdout set from the original data set in the balancing 
 33       (for balancing) 
 34   
 35    - --balT: use only the training set from the original data set in the balancing 
 36       (for balancing) 
 37   
 38    - -S: shuffle the original data set 
 39       (for balancing) 
 40   
 41    - -r: randomize the activities of the original data set 
 42       (for balancing) 
 43   
 44    - -N *note*: note to be attached to the grown composite when it's saved in the 
 45       database 
 46   
 47    - --outNote *note*: equivalent to -N 
 48   
 49    - -o *filename*: name of an output file to hold the pickled composite after 
 50       it has been grown. 
 51       If multiple balance weights are used, the weights will be added to 
 52       the filenames. 
 53   
 54    - -L *limit*: provide an (integer) limit on individual model complexity 
 55     
 56    - -d *database name*: instead of reading the data from a QDAT file, 
 57       pull it from a database.  In this case, the _filename_ argument 
 58       provides the name of the database table containing the data set. 
 59   
 60    - -p *tablename*: store persistence data in the database 
 61       in table *tablename* 
 62   
 63    - -l: locks the random number generator to give consistent sets 
 64       of training and hold-out data.  This is primarily intended 
 65       for testing purposes. 
 66   
 67    - -g: be less greedy when training the models. 
 68   
 69    - -G *number*: force trees to be rooted at descriptor *number*. 
 70   
 71    - -D: show a detailed breakdown of the composite model performance 
 72       across the training and, when appropriate, hold-out sets. 
 73        
 74    - -t *threshold value*: use high-confidence predictions for the final 
 75       analysis of the hold-out data. 
 76   
 77    - -q *list string*:  Add QuantTrees to the composite and use the list 
 78       specified in *list string* as the number of target quantization 
 79       bounds for each descriptor.  Don't forget to include 0's at the 
 80       beginning and end of *list string* for the name and value fields. 
 81       For example, if there are 4 descriptors and you want 2 quant bounds 
 82       apiece, you would use _-q "[0,2,2,2,2,0]"_. 
 83       Two special cases: 
 84         1) If you would like to ignore a descriptor in the model building, 
 85            use '-1' for its number of quant bounds. 
 86         2) If you have integer valued data that should not be quantized 
 87            further, enter 0 for that descriptor. 
 88   
 89    - -V: print the version number and exit 
 90   
 91  """ 
 92  import RDConfig 
 93  import numpy 
 94  from ML.Data import DataUtils,SplitData 
 95  from ML import ScreenComposite,BuildComposite 
 96  from ML.Composite import AdjustComposite 
 97  from Dbase.DbConnection import DbConnect 
 98  from ML import CompositeRun 
 99  import sys,cPickle,time,types 
100   
101  _runDetails = CompositeRun.CompositeRun()  # module-level run-details object: default target of SetDefaults(), filled by ParseArgs() in script mode, and where GrowIt() records its overall statistics
102   
103  __VERSION_STRING="0.5.0"  # version text reported by ShowVersion()
104   
105  _verbose = 1  # when false, message() writes nothing and GrowIt() passes silent=(not _verbose) to composite.Grow()
def message(msg):
    """ writes a message, followed by a newline, to _sys.stdout_

      Modules which import this one can replace this function in order
      to redirect the output.

      **Arguments**

        - msg: the string to be displayed

      **Notes**

        - nothing is written when the module-level _verbose flag is false

    """
    if not _verbose:
        return
    sys.stdout.write('%s\n' % (msg))
116
def GrowIt(details, composite, progressCallback=None,
           saveIt=1, setDescNames=0, data=None):
    """ does the actual work of growing a composite model

      **Arguments**

        - details: a _CompositeRun.CompositeRun_ object containing details
          (options, parameters, etc.) about the run

        - composite: the composite model to grow

        - progressCallback: (optional) passed through to the composite's
          _Grow()_ method when tree models are being built.

        - saveIt: (optional) accepted for backwards compatibility; this
          function does not currently use it (nothing is pickled here, the
          caller is responsible for saving the result).

        - setDescNames: (optional) if nonzero, and tree models are being
          built, the composite's _SetInputOrder()_ method is called with the
          results of the data set's _GetVarNames()_ method before growing.

        - data: (optional) the data set to be used.  If this is not provided,
          the data set described in details will be loaded.

      **Returns**

        the enlarged composite model

      **Notes**

        - _details.rundate_ is always updated; _details.outName_ and
          _details.tableName_ may be updated when the data set is loaded here.

        - the random number generator is re-seeded with the composite's
          _randomSeed before any randomization of the activities.

        - the overall statistics (overall_error, etc.) are stored on the
          module-level _runDetails object, *not* on _details_.

    """
    details.rundate = time.asctime()

    if data is None:
        fName = details.tableName.strip()
        if details.outName == '':
            details.outName = fName + '.pkl'
        if details.dbName == '':
            data = DataUtils.BuildQuantDataSet(fName)
        elif details.qBounds != []:
            details.tableName = fName
            data = details.GetDataSet()
        else:
            data = DataUtils.DBToQuantData(details.dbName, fName,
                                           quantName=details.qTableName,
                                           user=details.dbUser,
                                           password=details.dbPassword)

    DataUtils.InitRandomNumbers(composite._randomSeed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)

    # the full data set is used for training; no hold-out split is done here
    namedExamples = data.GetNamedData()
    trainExamples = namedExamples
    message('Training with %d examples' % (len(trainExamples)))
    message('\t%d descriptors' % (len(trainExamples[0]) - 2))
    nVars = data.GetNVars()
    nPossibleVals = composite.nPossibleVals
    attrs = list(range(1, nVars + 1))

    if details.useTrees:
        from ML.DecTree import CrossValidate, PruneTree
        if details.qBounds != []:
            from ML.DecTree import BuildQuantTree
            builder = BuildQuantTree.QuantTreeBoot
        else:
            from ML.DecTree import ID3
            builder = ID3.ID3Boot
        driver = CrossValidate.CrossValidationDriver
        pruner = PruneTree.PruneTree

        if setDescNames:
            composite.SetInputOrder(data.GetVarNames())
        composite.Grow(trainExamples, attrs, [0] + nPossibleVals,
                       buildDriver=driver,
                       pruner=pruner,
                       nTries=details.nModels, pruneIt=details.pruneIt,
                       lessGreedy=details.lessGreedy, needsQuantization=0,
                       treeBuilder=builder, nQuantBounds=details.qBounds,
                       startAt=details.startAt,
                       maxDepth=details.limitDepth,
                       progressCallback=progressCallback,
                       silent=not _verbose)
    else:
        from ML.Neural import CrossValidate
        driver = CrossValidate.CrossValidationDriver
        composite.Grow(trainExamples, attrs, [0] + nPossibleVals,
                       nTries=details.nModels,
                       buildDriver=driver, needsQuantization=0)

    composite.AverageErrors()
    composite.SortModels()
    modelList, counts, avgErrs = composite.GetAllData()
    counts = numpy.array(counts)
    avgErrs = numpy.array(avgErrs)
    composite._varNames = data.GetVarNames()

    for model in modelList:
        model.NameModel(composite._varNames)

    # do final statistics: count-weighted mean error and the count-weighted
    # mean absolute deviation from it
    totalCount = sum(counts)
    averageErr = sum(counts * avgErrs) / totalCount
    devs = numpy.absolute((avgErrs - averageErr) * counts)
    avgDev = sum(devs) / totalCount
    if _verbose:
        message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f' % (100. * averageErr, 100. * avgDev))

    if details.bayesModel:
        composite.Train(trainExamples, verbose=0)

    # NOTE(review): the statistics below are stored on the module-level
    # _runDetails rather than on the `details` argument.  That is the same
    # object when run as a script, but not necessarily for other callers;
    # confirm before changing.
    if not details.detailedRes:
        badExamples = []
        if _verbose:
            message('Testing all examples')
        wrong = BuildComposite.testall(composite, namedExamples, badExamples)
        if _verbose:
            message('%d examples (%% %5.2f) were misclassified' % (len(wrong), 100. * float(len(wrong)) / float(len(namedExamples))))
        _runDetails.overall_error = float(len(wrong)) / len(namedExamples)
    else:
        if _verbose:
            message('\nEntire data set:')
        resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()), data, composite,
                                                 nPossibleVals[-1], details.threshold)
        nGood, nBad, nSkip, avgGood, avgBad, avgSkip, voteTab = resTup
        nPts = len(namedExamples)
        nClass = nGood + nBad
        _runDetails.overall_error = float(nBad) / nClass
        _runDetails.overall_correct_conf = avgGood
        _runDetails.overall_incorrect_conf = avgBad
        _runDetails.overall_result_matrix = repr(voteTab)
        # points which were neither counted as good nor as bad were rejected.
        # (this used to be computed as nClass-nPts, which can never be
        # positive, so the dropped fraction was never recorded)
        nRej = nPts - nClass
        if nRej > 0:
            _runDetails.overall_fraction_dropped = float(nRej) / nPts

    return composite
262
def GetComposites(details):
    """ loads the composite model(s) which are to be grown

      **Arguments**

        - details: a _CompositeRun.CompositeRun_ object.  The attributes
          used are _persistTblName_, _inNote_, _dbName_ and _composFileName_.

      **Returns**

        a list of unpickled composites:

          - if both _details.persistTblName_ and _details.inNote_ are set,
            one entry for each row of that table (in database
            _details.dbName_) whose note matches _details.inNote_

          - otherwise, if _details.composFileName_ is set, the single
            composite pickled in that file

          - otherwise an empty list

      **Notes**

        - unpickling can execute arbitrary code, so this should only be
          pointed at databases and files from trusted sources.

    """
    res = []
    if details.persistTblName and details.inNote:
        conn = DbConnect(details.dbName, details.persistTblName)
        # the note is user-supplied text which is pasted into the where
        # clause: double any embedded single quotes so that it stays inside
        # the SQL string literal
        note = ('%s' % (details.inNote,)).replace("'", "''")
        mdls = conn.GetData(fields='MODEL', where="where note='%s'" % (note))
        for row in mdls:
            res.append(cPickle.loads(str(row[0])))
    elif details.composFileName:
        inF = open(details.composFileName, 'rb')
        try:
            res.append(cPickle.load(inF))
        finally:
            # do not rely on garbage collection to release the file handle
            inF.close()
    return res
274
def _PrepForBalancing(details, composite, data):
    """ copies the composite's split fraction and random seed onto details,
      re-seeds the random number generator and applies whatever activity
      randomization details asks for to data (modifying it in place)

    """
    details.splitFrac = composite._splitFrac
    details.randomSeed = composite._randomSeed
    DataUtils.InitRandomNumbers(details.randomSeed)
    if details.shuffleActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=1, runDetails=details)
    elif details.randomActivities == 1:
        DataUtils.RandomizeActivities(data, shuffle=0, runDetails=details)


def BalanceComposite(details, composite, data1=None, data2=None):
    """ balances the composite using the parameters provided in details

      **Arguments**

        - details: a _CompositeRun.CompositeRun_ object

        - composite: the composite model to be balanced

        - data1: (optional) if provided, this should be the
          data set used to construct the original models.  Otherwise it is
          read from table _details.balTable_ in database _details.balDb_.

        - data2: (optional) if provided, this should be the
          data set used to construct the new individual models.  Otherwise
          it is read using _details.GetDataSet()_.

      **Returns**

        - _composite_ itself, unchanged, if no balancing is done
          (_details.balCnt_ is zero or larger than the number of models in
          the composite, or one of the data sets could not be loaded)

        - otherwise a list with one balanced composite per entry of
          _details.balWeight_, in the same order

        Callers must be prepared for both shapes.

      **Notes**

        - _details.splitFrac_ and _details.randomSeed_ are overwritten with
          the values stored on the composite.

    """
    if not details.balCnt or details.balCnt > len(composite):
        return composite
    message("Balancing Composite")

    #
    # start by getting data set 1: which is the data set used to build the
    #  original models
    #
    if data1 is None:
        message("\tReading First Data Set")
        origTable = details.tableName
        origDb = details.dbName
        details.tableName = details.balTable.strip()
        details.dbName = details.balDb
        try:
            data1 = details.GetDataSet()
        finally:
            # always put the caller's settings back, even if the load fails
            details.tableName = origTable
            details.dbName = origDb
    if data1 is None:
        return composite
    _PrepForBalancing(details, composite, data1)
    namedExamples = data1.GetNamedData()
    if details.balDoHoldout or details.balDoTrain:
        trainIdx, testIdx = SplitData.SplitIndices(len(namedExamples), details.splitFrac,
                                                   silent=1)
        trainExamples = [namedExamples[x] for x in trainIdx]
        testExamples = [namedExamples[x] for x in testIdx]
        if details.filterFrac != 0.0:
            # whatever the filter rejects is moved to the hold-out set
            keepIdx, dropIdx = DataUtils.FilterData(trainExamples, details.filterVal,
                                                    details.filterFrac, -1,
                                                    indicesOnly=1)
            testExamples += [trainExamples[x] for x in dropIdx]
            trainExamples = [trainExamples[x] for x in keepIdx]
        if details.balDoHoldout:
            # balance against the hold-out set instead of the training set
            testExamples, trainExamples = trainExamples, testExamples
    else:
        trainExamples = namedExamples
    dataSet1 = trainExamples
    cols1 = [x.upper() for x in data1.GetVarNames()]
    data1 = None

    #
    # now grab data set 2: the data used to build the new individual models
    #
    if data2 is None:
        message("\tReading Second Data Set")
        data2 = details.GetDataSet()
    if data2 is None:
        return composite
    _PrepForBalancing(details, composite, data2)
    dataSet2 = data2.GetNamedData()
    cols2 = [x.upper() for x in data2.GetVarNames()]
    data2 = None

    # and balance it:
    weights = details.balWeight
    if not isinstance(weights, (tuple, list)):
        weights = (weights,)
    res = []
    for weight in weights:
        message("\tBalancing with Weight: %.4f" % (weight))
        res.append(AdjustComposite.BalanceComposite(composite, dataSet1, dataSet2,
                                                    weight,
                                                    details.balCnt,
                                                    names1=cols1, names2=cols2))
    return res
370
def ShowVersion(includeArgs=0):
    """ prints the version number of this module

      **Arguments**

        - includeArgs: (optional) if nonzero, the command line that was
          used (_sys.argv_) is printed as well

    """
    print('This is GrowComposite.py version %s' % (__VERSION_STRING))
    if not includeArgs:
        return
    import sys
    print('command line was:')
    print(' '.join(sys.argv))
380
def Usage():
    """ prints the module documentation (the list of command line
      arguments) and then terminates the program with exit status -1

    """
    import sys
    sys.stdout.write('%s\n' % (__doc__,))
    raise SystemExit(-1)
388
def SetDefaults(runDetails=None):
    """ initializes a details object with default values

      **Arguments**

        - runDetails: (optional) a _CompositeRun.CompositeRun_ object.
          If this is not provided, the module-level _runDetails is used.

      **Returns**

        whatever _CompositeRun.SetDefaults()_ returns for that object

    """
    if runDetails is not None:
        return CompositeRun.SetDefaults(runDetails)
    return CompositeRun.SetDefaults(_runDetails)
405
def ParseArgs(runDetails):
    """ parses the command line arguments in _sys.argv_ and updates _runDetails_

      **Arguments**

        - runDetails: a _CompositeRun.CompositeRun_ object.  It is modified
          in place.

      **Notes**

        - the balancing-related attributes (inNote, composFileName, balTable,
          balWeight, balCnt, balDoHoldout, balDoTrain, balDb) are reset to
          their defaults before the arguments are processed.

        - the first positional argument becomes _runDetails.tableName_;
          if it is missing, an error is reported and _Usage()_ is called
          (which exits).

        - if no --balDb is given, _runDetails.balDb_ is set to
          _runDetails.dbName_.

        - options which are accepted by getopt but not handled here are
          reported as bad arguments and _Usage()_ is called; options unknown
          to getopt raise _getopt.GetoptError_.

        - the values of --balWeight, -q and -Q are passed to _eval()_, so the
          command line must come from a trusted source.  An AssertionError is
          raised if the value of -q or -Q is not a list or tuple.

    """
    import getopt
    args, extra = getopt.getopt(sys.argv[1:], 'P:o:n:p:b:sf:F:v:hlgd:rSTt:Q:q:DVG:L:C:N:',
                                ['inNote=', 'outNote=', 'balTable=', 'balWeight=', 'balCnt=',
                                 'balH', 'balT', 'balDb=', ])
    runDetails.inNote = ''
    runDetails.composFileName = ''
    runDetails.balTable = ''
    runDetails.balWeight = (0.5,)
    runDetails.balCnt = 0
    runDetails.balDoHoldout = 0
    runDetails.balDoTrain = 0
    runDetails.balDb = ''
    for arg, val in args:
        if arg == '-n':
            runDetails.nModels = int(val)
        elif arg == '-C':
            runDetails.composFileName = val
        elif arg == '--balTable':
            runDetails.balTable = val
        elif arg == '--balWeight':
            # SECURITY: eval() of command line text, see the notes above
            weights = eval(val)
            if not isinstance(weights, (tuple, list)):
                weights = (weights,)
            runDetails.balWeight = weights
        elif arg == '--balCnt':
            runDetails.balCnt = int(val)
        elif arg == '--balH':
            runDetails.balDoHoldout = 1
        elif arg == '--balT':
            runDetails.balDoTrain = 1
        elif arg == '--balDb':
            runDetails.balDb = val
        elif arg == '--inNote':
            runDetails.inNote = val
        elif arg == '-N' or arg == '--outNote':
            runDetails.note = val
        elif arg == '-o':
            runDetails.outName = val
        elif arg == '-p':
            runDetails.persistTblName = val
        elif arg == '-r':
            runDetails.randomActivities = 1
        elif arg == '-S':
            runDetails.shuffleActivities = 1
        elif arg == '-h':
            Usage()
        elif arg == '-l':
            runDetails.lockRandom = 1
        elif arg == '-g':
            runDetails.lessGreedy = 1
        elif arg == '-G':
            runDetails.startAt = int(val)
        elif arg == '-d':
            runDetails.dbName = val
        elif arg == '-T':
            runDetails.useTrees = 0
        elif arg == '-t':
            runDetails.threshold = float(val)
        elif arg == '-D':
            runDetails.detailedRes = 1
        elif arg == '-L':
            runDetails.limitDepth = int(val)
        elif arg == '-q':
            # SECURITY: eval() of command line text, see the notes above
            qBounds = eval(val)
            # raised explicitly (instead of using an assert statement) so that
            # the check is not stripped when running with -O; the exception
            # type is unchanged
            if not isinstance(qBounds, (tuple, list)):
                raise AssertionError('bad argument type for -q, specify a list as a string')
            runDetails.qBoundCount = val
            runDetails.qBounds = qBounds
        elif arg == '-Q':
            # SECURITY: eval() of command line text, see the notes above
            qBounds = eval(val)
            if not isinstance(qBounds, (tuple, list)):
                raise AssertionError('bad argument type for -Q, specify a list as a string')
            runDetails.activityBounds = qBounds
            runDetails.activityBoundsVals = val
        elif arg == '-V':
            ShowVersion()
            sys.exit(0)
        else:
            sys.stderr.write('bad argument: %s\n' % (arg,))
            Usage()
    if not extra:
        # used to die with a bare IndexError on extra[0]
        sys.stderr.write('missing required argument: filename\n')
        Usage()
    runDetails.tableName = extra[0]
    if not runDetails.balDb:
        runDetails.balDb = runDetails.dbName
if __name__ == '__main__':
    # script entry point: grow (and optionally balance) every composite
    # selected on the command line, then pickle it and/or store it in the
    # database as requested
    if len(sys.argv) < 2:
        Usage()

    _runDetails.cmd = ' '.join(sys.argv)
    SetDefaults(_runDetails)
    ParseArgs(_runDetails)

    ShowVersion(includeArgs=1)

    initModels = GetComposites(_runDetails)
    nModels = len(initModels)
    if not nModels:
        message("No models found")
    for modelIdx, initModel in enumerate(initModels):
        if nModels > 1:
            sys.stderr.write('---------------------------------\n\tDoing %d of %d\n---------------------------------\n' % (modelIdx + 1, nModels))
        composite = GrowIt(_runDetails, initModel, setDescNames=1)
        if _runDetails.balTable and _runDetails.balCnt:
            composites = BalanceComposite(_runDetails, composite)
            # BalanceComposite() hands back the bare composite (not a list)
            # when it decides that no balancing can be done
            if not isinstance(composites, (list, tuple)):
                composites = [composites]
        else:
            composites = [composite]
        for mdl in composites:
            mdl.ClearModelExamples()

        if _runDetails.outName:
            # NOTE: when several initial models are grown, each one is
            # written to the same file name(s), so later ones overwrite
            # earlier ones.
            if len(composites) == 1:
                composites[0].Pickle(_runDetails.outName)
            else:
                # one file per balance weight, with the weight (as a
                # percentage) added to the file name
                baseName = _runDetails.outName.split('.pkl')[0]
                for weight, model in zip(_runDetails.balWeight, composites):
                    model.Pickle('%s.%d.pkl' % (baseName, int(100 * weight)))

        if _runDetails.persistTblName and _runDetails.dbName:
            message('Updating results table %s:%s' % (_runDetails.dbName, _runDetails.persistTblName))
            if len(composites) > 1:
                message('WARNING: updating results table with models having different weights')
            # save the composite(s)
            for model in composites:
                _runDetails.model = cPickle.dumps(model)
                _runDetails.Store(db=_runDetails.dbName, table=_runDetails.persistTblName)