1
2
3
4
5
6
7 """ Automatic search for quantization bounds
8
9 This uses the expected informational gain to determine where quantization bounds should
10 lie.
11
12 **Notes**:
13
14 - bounds are less than, so if the bounds are [1.,2.],
15 [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2]
16
17 """
18 import numpy
19 from ML.InfoTheory import entropy
20 try:
21 import cQuantize
22 except:
23 hascQuantize = 0
24 else:
25 hascQuantize = 1
26
27 _float_tol = 1e-8
29 """ floating point equality with a tolerance factor
30
31 **Arguments**
32
33 - v1: a float
34
35 - v2: a float
36
37 - tol: the tolerance for comparison
38
39 **Returns**
40
41 0 or 1
42
43 """
44 return abs(v1-v2) < tol
45
47 """ Uses FindVarMultQuantBounds, only here for historic reasons
48 """
49 bounds,gain = FindVarMultQuantBounds(vals,1,results,nPossibleRes)
50 return (bounds[0],gain)
51
52
54 """ Primarily intended for internal use
55
56 constructs a variable table for the data passed in
57 The table for a given variable records the number of times each possible value
58 of that variable appears for each possible result of the function.
59
60 **Arguments**
61
62 - vals: a 1D Numeric array with the values of the variables
63
64 - cuts: a list with the indices of the quantization bounds
65 (indices are into _starts_ )
66
67 - starts: a list of potential starting points for quantization bounds
68
69 - results: a 1D Numeric array of integer result codes
70
71 - nPossibleRes: an integer with the number of possible result codes
72
73 **Returns**
74
75 the varTable, a 2D Numeric array which is nVarValues x nPossibleRes
76
77 **Notes**
78
79 - _vals_ should be sorted!
80
81 """
82 nVals = len(cuts)+1
83 varTable = numpy.zeros((nVals,nPossibleRes),'i')
84 idx = 0
85 for i in range(nVals-1):
86 cut = cuts[i]
87 while idx < starts[cut]:
88 varTable[i,results[idx]] += 1
89 idx += 1
90 while idx < len(vals):
91 varTable[-1,results[idx]] += 1
92 idx += 1
93 return varTable
94
96 """ Primarily intended for internal use
97
98 Recursively finds the best quantization boundaries
99
100 **Arguments**
101
102 - vals: a 1D Numeric array with the values of the variables,
103 this should be sorted
104
105 - cuts: a list with the indices of the quantization bounds
106 (indices are into _starts_ )
107
108 - which: an integer indicating which bound is being adjusted here
109 (and index into _cuts_ )
110
111 - starts: a list of potential starting points for quantization bounds
112
113 - results: a 1D Numeric array of integer result codes
114
115 - nPossibleRes: an integer with the number of possible result codes
116
117 **Returns**
118
119 - a 2-tuple containing:
120
121 1) the best information gain found so far
122
123 2) a list of the quantization bound indices ( _cuts_ for the best case)
124
125 **Notes**
126
127 - this is not even remotely efficient, which is why a C replacement
128 was written
129
130 """
131 nBounds = len(cuts)
132 maxGain = -1e6
133 bestCuts = None
134 highestCutHere = len(starts) - nBounds + which
135 if varTable is None:
136 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
137 while cuts[which] <= highestCutHere:
138 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
139 gainHere = entropy.InfoGain(varTable)
140 if gainHere > maxGain:
141 maxGain = gainHere
142 bestCuts = cuts[:]
143
144 if which < nBounds-1:
145 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
146 varTable = varTable)
147 if gainHere > maxGain:
148 maxGain = gainHere
149 bestCuts = cutsHere
150
151 cuts[which] += 1
152 for i in range(which+1,nBounds):
153 if cuts[i] == cuts[i-1]:
154 cuts[i] += 1
155
156 return maxGain,bestCuts
157
159 """ Primarily intended for internal use
160
161 Recursively finds the best quantization boundaries
162
163 **Arguments**
164
165 - vals: a 1D Numeric array with the values of the variables,
166 this should be sorted
167
168 - cuts: a list with the indices of the quantization bounds
169 (indices are into _starts_ )
170
171 - which: an integer indicating which bound is being adjusted here
172 (and index into _cuts_ )
173
174 - starts: a list of potential starting points for quantization bounds
175
176 - results: a 1D Numeric array of integer result codes
177
178 - nPossibleRes: an integer with the number of possible result codes
179
180 **Returns**
181
182 - a 2-tuple containing:
183
184 1) the best information gain found so far
185
186 2) a list of the quantization bound indices ( _cuts_ for the best case)
187
188 **Notes**
189
190 - this is not even remotely efficient, which is why a C replacement
191 was written
192
193 """
194 nBounds = len(cuts)
195 maxGain = -1e6
196 bestCuts = None
197 highestCutHere = len(starts) - nBounds + which
198 if varTable is None:
199 varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
200 while cuts[which] <= highestCutHere:
201 gainHere = entropy.InfoGain(varTable)
202 if gainHere > maxGain:
203 maxGain = gainHere
204 bestCuts = cuts[:]
205
206 if which < nBounds-1:
207 gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
208 varTable = None)
209 if gainHere > maxGain:
210 maxGain = gainHere
211 bestCuts = cutsHere
212
213 oldCut = cuts[which]
214 cuts[which] += 1
215 bot = starts[oldCut]
216 if oldCut+1 < len(starts):
217 top = starts[oldCut+1]
218 else:
219 top = starts[-1]
220 for i in range(bot,top):
221 v = results[i]
222 varTable[which,v] += 1
223 varTable[which+1,v] -= 1
224 for i in range(which+1,nBounds):
225 if cuts[i] == cuts[i-1]:
226 cuts[i] += 1
227
228
229 return maxGain,bestCuts
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
247 startNext = []
248 tol = 1e-8
249 i = 0
250 start=0
251 actHomog=1
252 valHomog=1
253 while i<nData:
254
255 if sortVals[i]-sortVals[start]>tol:
256 valHomog=0
257 if sortResults[i]!=sortResults[start]:
258 actHomog=0
259 if not actHomog and not valHomog:
260
261
262 if( sortVals[i]-sortVals[i-1]<tol ):
263
264 while i>1 and sortVals[i]-sortVals[i-1]<tol:
265 i-=1
266
267
268 else:
269
270 pass
271 startNext.append(i)
272 start = i
273 actHomog = 1
274 valHomog = 1
275 i += 1
276 return startNext
277
279 """ finds multiple quantization bounds for a single variable
280
281 **Arguments**
282
283 - vals: sequence of variable values (assumed to be floats)
284
285 - nBounds: the number of quantization bounds to find
286
287 - results: a list of result codes (should be integers)
288
289 - nPossibleRes: an integer with the number of possible values of the
290 result variable
291
292 **Returns**
293
294 - a 2-tuple containing:
295
296 1) a list of the quantization bounds (floats)
297
298 2) the information gain associated with this quantization
299
300
301 """
302 assert len(vals) == len(results), 'vals/results length mismatch'
303
304 nData = len(vals)
305 if nData == 0:
306 return [],-1e8
307
308
309 sortIdx = numpy.argsort(vals)
310 sortVals = numpy.array([vals[x] for x in sortIdx],'d')
311 sortResults = numpy.array([results[x] for x in sortIdx],'i')
312 startNext=_FindStartPoints(sortVals,sortResults,nData)
313 if not len(startNext):
314 return [0],0.0
315 if len(startNext)<nBounds:
316 nBounds = len(startNext)-1
317 if nBounds == 0:
318 nBounds=1
319 initCuts = range(nBounds)
320 maxGain,bestCuts = _RecurseOnBounds(sortVals,initCuts,0,startNext,
321 sortResults,nPossibleRes)
322 quantBounds = []
323 nVs = len(sortVals)
324 for cut in bestCuts:
325 idx = startNext[cut]
326 if idx == nVs:
327 quantBounds.append(sortVals[-1])
328 elif idx == 0:
329 quantBounds.append(sortVals[idx])
330 else:
331 quantBounds.append((sortVals[idx]+sortVals[idx-1])/2.)
332
333 return quantBounds,maxGain
334
335 if hascQuantize:
336 _RecurseOnBounds = cQuantize._RecurseOnBounds
337 _FindStartPoints = cQuantize._FindStartPoints
338 else:
339 _RecurseOnBounds = _NewPyRecurseOnBounds
340 _FindStartPoints = _NewPyFindStartPoints
341
342 if __name__ == '__main__':
343 import sys
344 if 1:
345 d = [(1.,0),
346 (1.1,0),
347 (1.2,0),
348 (1.4,1),
349 (1.4,0),
350 (1.6,1),
351 (2.,1),
352 (2.1,0),
353 (2.1,0),
354 (2.1,0),
355 (2.2,1),
356 (2.3,0)]
357 varValues = map(lambda x:x[0],d)
358 resCodes = map(lambda x:x[1],d)
359 nPossibleRes = 2
360 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
361 print 'RES:',res
362 target = ([1.3, 2.05],.34707 )
363 else:
364 d = [(1.,0),
365 (1.1,0),
366 (1.2,0),
367 (1.4,1),
368 (1.4,0),
369 (1.6,1),
370 (2.,1),
371 (2.1,0),
372 (2.1,0),
373 (2.1,0),
374 (2.2,1),
375 (2.3,0)]
376 varValues = map(lambda x:x[0],d)
377 resCodes = map(lambda x:x[1],d)
378 nPossibleRes =2
379 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
380 print res
381
382 d = [(1.4,1),
383 (1.4,0)]
384
385 varValues = map(lambda x:x[0],d)
386 resCodes = map(lambda x:x[1],d)
387 nPossibleRes =2
388 res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
389 print res
390
391 d = [(1.4,0),
392 (1.4,0),(1.6,1)]
393 varValues = map(lambda x:x[0],d)
394 resCodes = map(lambda x:x[1],d)
395 nPossibleRes =2
396 res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
397 print res
398