1
2
3
4 """ ID3 Decision Trees
5
6 contains an implementation of the ID3 decision tree algorithm
7 as described in Tom Mitchell's book "Machine Learning"
8
9 It relies upon the _Tree.TreeNode_ data structure (or something
10 with the same API) defined locally to represent the trees
11
12 """
13
14 import numpy
15 from ML.DecTree import DecTree
16 from ML.InfoTheory import entropy
17
19 """ Calculates the total entropy of the data set (w.r.t. the results)
20
21 **Arguments**
22
23 - examples: a list (nInstances long) of lists of variable values + instance
24 values
25 - nPossibleVals: a list (nVars long) of the number of possible values each variable
26 can adopt.
27
28 **Returns**
29
30 a float containing the informational entropy of the data set.
31
32 """
33 nRes = nPossibleVals[-1]
34 resList = numpy.zeros(nRes,'i')
35 for example in examples:
36 res = example[-1]
37 resList[res] = resList[res] + 1
38 return entropy.InfoEntropy(resList)
39
41 """Generates a list of variable tables for the examples passed in.
42
43 The table for a given variable records the number of times each possible value
44 of that variable appears for each possible result of the function.
45
46 **Arguments**
47
48 - examples: a list (nInstances long) of lists of variable values + instance
49 values
50
51 - nPossibleVals: a list containing the number of possible values of
52 each variable + the number of values of the function.
53
54 - vars: a list of the variables to include in the var table
55
56
57 **Returns**
58
59 a list of variable result tables. Each table is a Numeric array
60 which is varValues x nResults
61 """
62 nVars = len(vars)
63 res = [None]*nVars
64 nFuncVals = nPossibleVals[-1]
65
66 for i in xrange(nVars):
67 res[i] = numpy.zeros((nPossibleVals[vars[i]],nFuncVals),'i')
68 for example in examples:
69 val = int(example[-1])
70 for i in xrange(nVars):
71 res[i][int(example[vars[i]]),val] += 1
72
73 return res
74
75 -def ID3(examples,target,attrs,nPossibleVals,depth=0,maxDepth=-1,
76 **kwargs):
77 """ Implements the ID3 algorithm for constructing decision trees.
78
79 From Mitchell's book, page 56
80
81 This is *slightly* modified from Mitchell's book because it supports
82 multivalued (non-binary) results.
83
84 **Arguments**
85
86 - examples: a list (nInstances long) of lists of variable values + instance
87 values
88
89 - target: an int
90
91 - attrs: a list of ints indicating which variables can be used in the tree
92
93 - nPossibleVals: a list containing the number of possible values of
94 every variable.
95
96 - depth: (optional) the current depth in the tree
97
98 - maxDepth: (optional) the maximum depth to which the tree
99 will be grown
100
101 **Returns**
102
103 a DecTree.DecTreeNode with the decision tree
104
105 **NOTE:** This code cannot bootstrap (start from nothing...)
106 use _ID3Boot_ (below) for that.
107 """
108 varTable = GenVarTable(examples,nPossibleVals,attrs)
109 tree=DecTree.DecTreeNode(None,'node')
110
111
112 totEntropy = CalcTotalEntropy(examples,nPossibleVals)
113 tree.SetData(totEntropy)
114
115
116
117 tMat = GenVarTable(examples,nPossibleVals,[target])[0]
118
119 counts = sum(tMat)
120 nzCounts = numpy.nonzero(counts)[0]
121
122 if len(nzCounts) == 1:
123
124
125
126 res = nzCounts[0]
127 tree.SetLabel(res)
128 tree.SetName(str(res))
129 tree.SetTerminal(1)
130 elif len(attrs) == 0 or (maxDepth>=0 and depth>=maxDepth):
131
132
133
134
135 v = numpy.argmax(counts)
136 tree.SetLabel(v)
137 tree.SetName('%d?'%v)
138 tree.SetTerminal(1)
139 else:
140
141
142 gains = [entropy.InfoGain(x) for x in varTable]
143 best = attrs[numpy.argmax(gains)]
144
145
146
147 nextAttrs = attrs[:]
148 if not kwargs.get('recycleVars',0):
149 nextAttrs.remove(best)
150
151
152 tree.SetName('Var: %d'%best)
153 tree.SetLabel(best)
154
155 tree.SetTerminal(0)
156
157
158
159 for val in xrange(nPossibleVals[best]):
160 nextExamples = []
161 for example in examples:
162 if example[best] == val:
163 nextExamples.append(example)
164 if len(nextExamples) == 0:
165
166
167
168 v = numpy.argmax(counts)
169 tree.AddChild('%d'%v,label=v,data=0.0,isTerminal=1)
170 else:
171
172 tree.AddChildNode(ID3(nextExamples,best,nextAttrs,nPossibleVals,depth+1,maxDepth,
173 **kwargs))
174 return tree
175
176 -def ID3Boot(examples,attrs,nPossibleVals,initialVar=None,depth=0,maxDepth=-1,
177 **kwargs):
178 """ Bootstrapping code for the ID3 algorithm
179
180 see ID3 for descriptions of the arguments
181
182 If _initialVar_ is not set, the algorithm will automatically
183 choose the first variable in the tree (the standard greedy
184 approach). Otherwise, _initialVar_ will be used as the first
185 split.
186
187 """
188 totEntropy = CalcTotalEntropy(examples,nPossibleVals)
189 varTable = GenVarTable(examples,nPossibleVals,attrs)
190
191 tree=DecTree.DecTreeNode(None,'node')
192
193 tree._nResultCodes = nPossibleVals[-1]
194
195
196
197 if initialVar is None:
198 best = attrs[numpy.argmax([entropy.InfoGain(x) for x in varTable])]
199 else:
200 best = initialVar
201
202 tree.SetName('Var: %d'%best)
203 tree.SetData(totEntropy)
204 tree.SetLabel(best)
205 tree.SetTerminal(0)
206 nextAttrs = attrs[:]
207 if not kwargs.get('recycleVars',0):
208 nextAttrs.remove(best)
209
210 for val in xrange(nPossibleVals[best]):
211 nextExamples = []
212 for example in examples:
213 if example[best] == val:
214 nextExamples.append(example)
215
216 tree.AddChildNode(ID3(nextExamples,best,nextAttrs,nPossibleVals,depth,maxDepth,
217 **kwargs))
218 return tree
219