Package ML :: Package Data :: Module MLData
[hide private]
[frames] | no frames]

Source Code for Module ML.Data.MLData

  1  # 
  2  #  Copyright (C) 2000-2008  greg Landrum and Rational Discovery LLC 
  3  #    All Rights Reserved 
  4  # 
  5  """ classes to be used to help work with data sets 
  6   
  7  """ 
  8  import numpy 
  9  import math 
 10  import copy,types 
 11   
 12   
 13  numericTypes = [type(1),type(1.0),type(1L)] 
class MLDataSet(object):
    """ A data set for holding general data (floats, ints, and strings)

    **Note**
      this is intended to be a read-only data structure
      (i.e. after calling the constructor you cannot touch it)

      _AddPoint()_, _AddPoints()_ and item assignment are nevertheless
      available; none of them recalculates the number of possible values.

    """

    def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
                 qBounds=None, varNames=None, ptNames=None, nResults=1):
        """ Constructor

        **Arguments**

          - data: a list of lists containing the data. The data are copied,
            so don't worry about us overwriting them.

          - nVars: the number of variables

          - nPts: the number of points

          - nPossibleVals: a list containing the number of possible values
            for each variable (should contain 0 when not relevant)
            This is _nVars_ long

          - qBounds: a list of lists containing quantization bounds for
            variables which are to be quantized (note, this class does not
            quantize the variables itself, it merely stores quantization
            bounds). An empty sublist indicates no quantization for a given
            variable.
            This is _nVars_ long

          - varNames: a list of the names of the variables.
            This is _nVars_ long

          - ptNames: the names (labels) of the individual data points
            This is _nPts_ long

          - nResults: the number of results columns in the data lists.
            This is usually 1, but can be higher.

        **Notes**

          - an empty _data_ list is accepted; _nVars_ then defaults to 0
            unless it is provided explicitly.

        """
        # one fresh list per row, so neither side sees the other's changes
        self.data = [list(x) for x in data]
        self.nResults = nResults
        if nVars is None:
            if self.data:
                nVars = len(self.data[0]) - self.nResults
            else:
                nVars = 0
        self.nVars = nVars
        if nPts is None:
            nPts = len(self.data)
        self.nPts = nPts
        if qBounds is None:
            if self.data:
                nCols = len(self.data[0])
            else:
                nCols = self.nVars + self.nResults
            # build a distinct empty list per column; [[]]*n would make every
            # column share (and therefore jointly mutate) one list object
            qBounds = [[] for _ in range(nCols)]
        self.qBounds = qBounds
        if nPossibleVals is None:
            nPossibleVals = self._CalcNPossible(self.data)
        self.nPossibleVals = nPossibleVals
        if varNames is None:
            varNames = [''] * self.nVars
        self.varNames = varNames
        if ptNames is None:
            ptNames = [''] * self.nPts
        # AddPoint()/AddPoints()/__setitem__ modify the name list in place, so
        # keep a private copy instead of the caller's object
        self.ptNames = list(ptNames)

    def _CalcNPossible(self, data):
        """calculates the number of possible values of each variable
        (where possible)

        **Arguments**

          - data: a list of examples to be used

        **Returns**

          a list of nPossible values for each variable

        **Notes**

          - a column with quantization bounds gets len(bounds)+1

          - a column containing only integral numeric values gets the
            largest value + 1

          - any other column (non-numeric or non-integral values) gets 0

        """
        nCols = self.GetNVars() + self.nResults
        nPossible = [-1] * nCols
        # columns which still have to be scanned; a real list is needed
        # because entries are removed as columns get resolved
        cols = list(range(nCols))
        for i, bounds in enumerate(self.qBounds):
            if len(bounds) > 0:
                nPossible[i] = len(bounds)
                cols.remove(i)

        for pt in data:
            if not cols:
                # every column has been resolved, nothing left to look at
                break
            # iterate over a copy: cols shrinks inside the loop
            for col in cols[:]:
                d = pt[col]
                if type(d) in numericTypes and math.floor(d) == d:
                    nPossible[col] = max(math.floor(d), nPossible[col])
                else:
                    nPossible[col] = -1
                    cols.remove(col)
        return [int(x) + 1 for x in nPossible]

    def GetNResults(self):
        """ returns the number of result columns """
        return self.nResults

    def GetNVars(self):
        """ returns the number of variables """
        return self.nVars

    def GetNPts(self):
        """ returns the number of points """
        return self.nPts

    def GetNPossibleVals(self):
        """ returns the list with the number of possible values per column """
        return self.nPossibleVals

    def GetQuantBounds(self):
        """ returns the list of quantization bounds """
        return self.qBounds

    def __getitem__(self, idx):
        """ returns the named example (name followed by the data) at _idx_ """
        return [self.ptNames[idx]] + list(self.data[idx])

    def __setitem__(self, idx, val):
        """ replaces the example at _idx_ with the named example _val_

        **Raises**

          ValueError if _val_ is not nVars + nResults + 1 long

        """
        if len(val) != self.GetNVars() + self.GetNResults() + 1:
            raise ValueError('bad value in assignment')
        self.ptNames[idx] = val[0]
        self.data[idx] = list(val[1:])
        return val

    def GetNamedData(self):
        """ returns a list of named examples

        **Note**

          a named example is the result of prepending the example
          name to the data list

        """
        return [[self.ptNames[i]] + list(self.data[i])
                for i in range(self.nPts)]

    def GetAllData(self):
        """ returns a *copy* of the data

        """
        return copy.deepcopy(self.data)

    def GetInputData(self):
        """ returns the input data

        **Note**

          _inputData_ means the examples without their result fields
          (the last _NResults_ entries)

        """
        v = self.GetNResults()
        # slice by an explicit end index: x[:-v] would return nothing at all
        # when v is 0
        return [x[:max(len(x) - v, 0)] for x in self.data]

    def GetResults(self):
        """ Returns the result fields from each example

        **Returns**

          a list with one entry per example: the bare result value when
          there is exactly one result column, otherwise the list of the
          last _NResults_ entries (empty when there are no result columns)

        """
        v = self.GetNResults()
        if v == 1:
            return [x[-1] for x in self.data]
        return [x[max(len(x) - v, 0):] for x in self.data]

    def GetVarNames(self):
        """ returns the list of variable names """
        return self.varNames

    def GetPtNames(self):
        """ returns the list of point names """
        return self.ptNames

    def AddPoint(self, pt):
        """ appends the named example _pt_ (name followed by the data) """
        self.data.append(list(pt[1:]))
        self.ptNames.append(pt[0])
        self.nPts += 1

    def AddPoints(self, pts, names):
        """ appends the examples in _pts_ using the labels in _names_

        **Raises**

          ValueError if _pts_ and _names_ differ in length

        """
        if len(pts) != len(names):
            raise ValueError("input length mismatch")
        # copy the rows, as the constructor does
        self.data.extend(list(x) for x in pts)
        self.ptNames.extend(names)
        self.nPts = len(self.data)
class MLQuantDataSet(MLDataSet):
    """ a data set for holding quantized data


    **Note**

      this is intended to be a read-only data structure
      (i.e. after calling the constructor you cannot touch it)

    **Big differences to MLDataSet**

      1) data are stored in a numpy array since they are homogenous

      2) results are assumed to be quantized (i.e. no qBounds entry is
         required)

    """

    def __init__(self, data, nVars=None, nPts=None, nPossibleVals=None,
                 qBounds=None, varNames=None, ptNames=None, nResults=1):
        """ Constructor

        **Arguments**

          - data: a list of lists containing the data. The data are copied,
            so don't worry about us overwriting them.

          - nVars: the number of variables

          - nPts: the number of points

          - nPossibleVals: a list containing the number of possible values
            for each variable (should contain 0 when not relevant)
            This is _nVars_ long

          - qBounds: a list of lists containing quantization bounds for
            variables which are to be quantized (note, this class does not
            quantize the variables itself, it merely stores quantization
            bounds). An empty sublist indicates no quantization for a given
            variable.
            This is _nVars_ long

          - varNames: a list of the names of the variables.
            This is _nVars_ long

          - ptNames: the names (labels) of the individual data points
            This is _nPts_ long

          - nResults: the number of results columns in the data lists.
            This is usually 1, but can be higher.

        **Notes**

          - an empty _data_ list is accepted; _nVars_ then defaults to 0
            unless it is provided explicitly.

        """
        # numpy.array() copies its input
        self.data = numpy.array(data)
        self.nResults = nResults
        if nVars is None:
            if len(data) > 0:
                nVars = len(data[0]) - self.nResults
            else:
                nVars = 0
        self.nVars = nVars
        if nPts is None:
            nPts = len(data)
        self.nPts = nPts
        if qBounds is None:
            # a distinct empty list per variable; [[]]*n would make every
            # variable share one list object
            qBounds = [[] for _ in range(self.nVars)]
        self.qBounds = qBounds
        if nPossibleVals is None:
            nPossibleVals = self._CalcNPossible(data)
        self.nPossibleVals = nPossibleVals
        if varNames is None:
            varNames = [''] * self.nVars
        self.varNames = varNames
        if ptNames is None:
            ptNames = [''] * self.nPts
        # the name list is modified in place by AddPoint()/AddPoints()/
        # __setitem__, so keep a private copy instead of the caller's object
        self.ptNames = list(ptNames)

    def _CalcNPossible(self, data):
        """calculates the number of possible values of each variable

        **Arguments**

          - data: a list of examples to be used

        **Returns**

          a list of nPossible values for each variable
          (the largest value found in each column + 1)

        """
        return [max(x) + 1 for x in numpy.transpose(data)]

    def _AppendRows(self, rows):
        """ *Internal use only* appends the examples in _rows_ to the array

        **Arguments**

          - rows: a sequence of data rows (without names)

        """
        newRows = numpy.array(rows)
        if newRows.size == 0:
            return
        if self.data.size == 0:
            # nothing stored yet, so there is no shape to concatenate onto
            self.data = newRows
        else:
            self.data = numpy.concatenate((self.data, newRows), axis=0)

    def __getitem__(self, idx):
        """ returns the named example (name followed by the data) at _idx_

        the inherited version assumes list storage and cannot be used
        with the numpy array held here

        """
        return [self.ptNames[idx]] + self.data[idx].tolist()

    def GetNamedData(self):
        """ returns a list of named examples

        **Note**

          a named example is the result of prepending the example
          name to the data list

        """
        return [[self.ptNames[i]] + self.data[i].tolist()
                for i in range(self.nPts)]

    def GetAllData(self):
        """ returns a *copy* of the data

        """
        return self.data.tolist()

    def GetInputData(self):
        """ returns the input data

        **Note**

          _inputData_ means the examples without their result fields
          (the last _NResults_ entries)

        """
        if self.data.ndim < 2:
            # only an empty data set is stored as a one dimensional array
            return []
        # slice by an explicit end index: [:, :-nResults] would return
        # nothing at all when nResults is 0
        nInputs = max(self.data.shape[1] - self.nResults, 0)
        return self.data[:, :nInputs].tolist()

    def GetResults(self):
        """ Returns the result fields from each example

        **Returns**

          a list with one entry per example: the bare result value when
          there is exactly one result column, otherwise an array with the
          last _NResults_ entries (empty when there are no result columns)

        """
        v = self.GetNResults()
        if v == 1:
            return [x[-1] for x in self.data]
        return [x[max(len(x) - v, 0):] for x in self.data]

    def AddPoint(self, pt):
        """ appends the named example _pt_ (name followed by the data)

        the inherited version calls list.append(), which a numpy array
        does not provide

        """
        self._AppendRows([pt[1:]])
        self.ptNames.append(pt[0])
        self.nPts += 1

    def AddPoints(self, pts, names):
        """ appends the examples in _pts_ using the labels in _names_

        the inherited version uses +=, which on a numpy array is an
        element-wise addition rather than an append

        **Raises**

          ValueError if _pts_ and _names_ differ in length

        """
        if len(pts) != len(names):
            raise ValueError("input length mismatch")
        self._AppendRows(pts)
        self.ptNames.extend(names)
        self.nPts = len(self.data)


if __name__ == '__main__':
    import DataUtils

    def _Report(dataSet):
        """ prints a summary of _dataSet_ """
        # print() with a single pre-formatted string produces the same output
        # whether print is a statement (python 2) or a function (python 3)
        print('nVars: %s' % (dataSet.GetNVars(),))
        print('nPts: %s' % (dataSet.GetNPts(),))
        print('nPoss: %s' % (dataSet.GetNPossibleVals(),))
        print('qBounds: %s' % (dataSet.GetQuantBounds(),))
        print('data: %s' % (dataSet.GetAllData(),))
        print('Input data: %s' % (dataSet.GetInputData(),))
        print('results: %s' % (dataSet.GetResults(),))
        print('nameddata: %s' % (dataSet.GetNamedData(),))

    examples = [[0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0],
                [1, 0, 0, 0, 1],
                [2, 1, 0, 0, 1],
                [2, 2, 1, 0, 1],
                ]
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    quantSet = MLQuantDataSet(examples, varNames=varNames, ptNames=ptNames)
    DataUtils.WritePickledData('test_data/test.qdat.pkl', quantSet)
    _Report(quantSet)

    examples = [
        ['foo', 1, 1.0, 1, 1.1],
        ['foo', 2, 1.0, 1, 2.1],
        ['foo', 3, 1.2, 1.1, 3.1],
        ['foo', 4, 1.0, 1, 4.1],
        ['foo', 5, 1.1, 1, 5.1],
    ]
    qBounds = [[], [], [], [], [2, 4]]
    # NOTE(review): varNames and ptNames are defined here but not passed to
    # the constructor, so this data set is pickled with empty names --
    # confirm whether that is intended.
    varNames = ['foo1', 'foo2', 'foo3', 'foo4', 'res']
    ptNames = ['p1', 'p2', 'p3', 'p4', 'p5']
    generalSet = MLDataSet(examples, qBounds=qBounds)
    DataUtils.WritePickledData('test_data/test.dat.pkl', generalSet)
    _Report(generalSet)