Package ML :: Package DecTree :: Module BuildSigTree
[hide private]
[frames] | no frames]

Source Code for Module ML.DecTree.BuildSigTree

  1  ## Automatically adapted for numpy.oldnumeric Jun 27, 2008 by -c 
  2   
  3  # $Id: BuildSigTree.py 742 2008-07-05 07:42:38Z glandrum $ 
  4  # 
  5  #  Copyright (C) 2003-2008  Greg Landrum and Rational Discovery LLC 
  6  #    All Rights Reserved 
  7  # 
  8  """  
  9   
 10  """ 
 11   
 12  import numpy 
 13  from ML.DecTree import SigTree 
 14  from ML import InfoTheory 
 15  from ML.FeatureSelect import CMIM 
 16  from DataStructs.VectCollection import VectCollection 
 17  import copy 
 18  import random 
19 -def _GenerateRandomEnsemble(nToInclude,nBits):
20 """ Generates a random subset of a group of indices 21 22 **Arguments** 23 24 - nToInclude: the size of the desired set 25 26 - nBits: the maximum index to be included in the set 27 28 **Returns** 29 30 a list of indices 31 32 """ 33 # Before Python 2.3 added the random.sample() function, this was 34 # way more complicated: 35 res = random.sample(range(nBits),nToInclude) 36 return res
37
38 -def BuildSigTree(examples,nPossibleRes,ensemble=None,random=0, 39 metric=InfoTheory.InfoType.BIASENTROPY, 40 biasList=[1], 41 depth=0,maxDepth=-1, 42 useCMIM=0,allowCollections=False, 43 verbose=0,**kwargs):
44 """ 45 **Arguments** 46 47 - examples: the examples to be classified. Each example 48 should be a sequence at least three entries long, with 49 entry 0 being a label, entry 1 a BitVector and entry -1 50 an activity value 51 52 - nPossibleRes: the number of result codes possible 53 54 - ensemble: (optional) if this argument is provided, it 55 should be a sequence which is used to limit the bits 56 which are actually considered as potential descriptors. 57 The default is None (use all bits). 58 59 - random: (optional) If this argument is nonzero, it 60 specifies the number of bits to be randomly selected 61 for consideration at this node (i.e. this toggles the 62 growth of Random Trees). 63 The default is 0 (no random descriptor selection) 64 65 - metric: (optional) This is an _InfoTheory.InfoType_ and 66 sets the metric used to rank the bits. 67 The default is _InfoTheory.InfoType.BIASENTROPY_ 68 69 - biasList: (optional) If provided, this provides a bias 70 list for the bit ranker. 71 See the _InfoTheory.InfoBitRanker_ docs for an explanation 72 of bias. 73 The default value is [1], which biases towards actives. 74 75 - maxDepth: (optional) the maximum depth to which the tree 76 will be grown 77 The default is -1 (no depth limit). 78 79 - useCMIM: (optional) if this is >0, the CMIM algorithm 80 (conditional mutual information maximization) will be 81 used to select the descriptors used to build the trees. 82 The value of the variable should be set to the number 83 of descriptors to be used. This option and the 84 ensemble option are mutually exclusive (CMIM will not be 85 used if the ensemble is set), but it happily coexsts 86 with the random argument (to only consider random subsets 87 of the top N CMIM bits) 88 The default is 0 (do not use CMIM) 89 90 - depth: (optional) the current depth in the tree 91 This is used in the recursion and should not be set 92 by the client. 93 94 **Returns** 95 96 a SigTree.SigTreeNode with the root of the decision tree 97 98 """ 99 if verbose: print ' '*depth,'Build' 100 tree=SigTree.SigTreeNode(None,'node',level=depth) 101 tree.SetData(-666) 102 #tree.SetExamples(examples) 103 104 # counts of each result code: 105 #resCodes = map(lambda x:int(x[-1]),examples) 106 resCodes = [int(x[-1]) for x in examples] 107 #print 'resCodes:',resCodes 108 counts = [0]*nPossibleRes 109 for res in resCodes: 110 counts[res] += 1 111 #print ' '*depth,'counts:',counts 112 113 nzCounts = numpy.nonzero(counts)[0] 114 if verbose: print ' '*depth,'\tcounts:',counts 115 if len(nzCounts) == 1: 116 # bottomed out because there is only one result code left 117 # with any counts (i.e. there's only one type of example 118 # left... this is GOOD!). 119 res = nzCounts[0] 120 tree.SetLabel(res) 121 tree.SetName(str(res)) 122 tree.SetTerminal(1) 123 elif maxDepth>=0 and depth>maxDepth: 124 # Bottomed out: max depth hit 125 # We don't really know what to do here, so 126 # use the heuristic of picking the most prevalent 127 # result 128 v = numpy.argmax(counts) 129 tree.SetLabel(v) 130 tree.SetName('%d?'%v) 131 tree.SetTerminal(1) 132 else: 133 # find the variable which gives us the best improvement 134 # We do this with an InfoBitRanker: 135 fp = examples[0][1] 136 nBits = fp.GetNumBits() 137 ranker = InfoTheory.InfoBitRanker(nBits,nPossibleRes,metric) 138 if biasList: ranker.SetBiasList(biasList) 139 if useCMIM > 0 and not ensemble: 140 ensemble = CMIM.SelectFeatures(examples,useCMIM,bvCol=1) 141 if random: 142 if ensemble: 143 if len(ensemble)>random: 144 picks = _GenerateRandomEnsemble(random,len(ensemble)) 145 availBits = list(take(ensemble,picks)) 146 else: 147 availBits = range(len(ensemble)) 148 else: 149 availBits = _GenerateRandomEnsemble(random,nBits) 150 else: 151 availBits=None 152 if availBits: 153 ranker.SetMaskBits(availBits) 154 #print ' 2:'*depth,availBits 155 156 useCollections=isinstance(examples[0][1],VectCollection) 157 for example in examples: 158 #print ' '*depth,example[1].ToBitString(),example[-1] 159 if not useCollections: 160 ranker.AccumulateVotes(example[1],example[-1]) 161 else: 162 example[1].Reset() 163 ranker.AccumulateVotes(example[1].orVect,example[-1]) 164 165 try: 166 bitInfo = ranker.GetTopN(1)[0] 167 best = int(bitInfo[0]) 168 gain = bitInfo[1] 169 except: 170 import traceback 171 traceback.print_exc() 172 print 'get top n failed' 173 gain = -1.0 174 if gain <= 0.0: 175 v = numpy.argmax(counts) 176 tree.SetLabel(v) 177 tree.SetName('?%d?'%v) 178 tree.SetTerminal(1) 179 return tree 180 best = int(bitInfo[0]) 181 #print ' '*depth,'\tbest:',bitInfo 182 if verbose: print ' '*depth,'\tbest:',bitInfo 183 # set some info at this node 184 tree.SetName('Bit-%d'%(best)) 185 tree.SetLabel(best) 186 #tree.SetExamples(examples) 187 tree.SetTerminal(0) 188 189 # loop over possible values of the new variable and 190 # build a subtree for each one 191 onExamples = [] 192 offExamples = [] 193 for example in examples: 194 if example[1][best]: 195 if allowCollections and useCollections: 196 sig = copy.copy(example[1]) 197 sig.DetachVectsNotMatchingBit(best) 198 ex = [example[0],sig] 199 if len(example)>2: 200 ex.extend(example[2:]) 201 example = ex 202 onExamples.append(example) 203 else: 204 offExamples.append(example) 205 #print ' '*depth,len(offExamples),len(onExamples) 206 for ex in (offExamples,onExamples): 207 if len(ex) == 0: 208 v = numpy.argmax(counts) 209 tree.AddChild('%d??'%v,label=v,data=0.0,isTerminal=1) 210 else: 211 child = BuildSigTree(ex,nPossibleRes,random=random, 212 ensemble=ensemble, 213 metric=metric,biasList=biasList, 214 depth=depth+1,maxDepth=maxDepth, 215 verbose=verbose) 216 if child is None: 217 v = numpy.argmax(counts) 218 tree.AddChild('%d???'%v,label=v,data=0.0,isTerminal=1) 219 else: 220 tree.AddChildNode(child) 221 return tree
222 223
224 -def SigTreeBuilder(examples,attrs,nPossibleVals,initialVar=None,ensemble=None, 225 randomDescriptors=0, 226 **kwargs):
227 nRes = nPossibleVals[-1] 228 return BuildSigTree(examples,nRes,random=randomDescriptors,**kwargs)
229