Package PySVD :: Module SVDSimilarity
[hide private]
[frames] | no frames]

Source Code for Module PySVD.SVDSimilarity

  1  # 
  2  #  Copyright (C) 2004 Rational Discovery LLC 
  3  #   All Rights Reserved 
  4  # 
  5  from Numeric import * 
  6  from DataStructs.TopNContainer import TopNContainer 
  7  import SVDPack 
  8  import PySVD 
  9   
10 -def showMat(mat):
11 for row in mat: 12 for col in row: 13 print '% 6.3f'%col, 14 print
15 16 17 """ 18 throughout we use the notation of: 19 Deerwester et al. J. Am. Soc. Inf. Sci. 41 391-407 (1990) 20 21 """
22 -class SimilarityCalculator(object):
23 - def __init__(self):
24 self._vects = None 25 self._idMap = None 26 self._tForm = None 27 self._DS = None 28 self._T = None
29
30 - def SetVects(self,vects):
31 """ 32 33 vects is a sequence of *sorted* sequences of bit IDs 34 35 36 >>> calc = SimilarityCalculator() 37 >>> calc.SetVects( ((1,2),(3,100),(1,2,2,4)) ) 38 >>> calc._vects 39 ((0, 1), (2, 4), (0, 1, 3)) 40 >>> calc._vals 41 (1, 1, 1, 1, 1, 2, 1) 42 >>> calc._idMap[100] 43 4 44 >>> calc._idMap[4] 45 3 46 47 """ 48 self._idMap = {} 49 self._vects = [] 50 self._vals = [] 51 self._tForm = None 52 self._DS = None 53 self._T = None 54 55 tmpD = {} 56 for vect in vects: 57 for bit in vect: 58 if not tmpD.has_key(bit): 59 tmpD[bit] = 1 60 ks = tmpD.keys() 61 ks.sort() 62 for i in range(len(ks)): 63 self._idMap[ks[i]] = i 64 ks = None 65 tmpD = None 66 67 for vect in vects: 68 tmp = [] 69 i = 0 70 nBits = len(vect) 71 while i<nBits: 72 bit = vect[i] 73 # we need to do a few things with this bit: 74 # 1) find its location in the global idMap 75 # (mapping bitID -> reduced space ID), 76 # adding a new location if necessary 77 # 2) add an entry to the reduced-space vector 78 # 3) add an entry to the reduced-space value array 79 idx = self._idMap.get(bit) 80 if idx < 0: 81 idx = len(self._idMap) 82 self._idMap[bit] = idx 83 # update the reduced-space vector: 84 tmp.append(idx) 85 # add an entry to the value array: 86 self._vals.append(1) 87 # now grab duplicates: 88 j=i+1 89 while(j<nBits and vect[j]==vect[i]): 90 self._vals[-1] += 1 91 j += 1 92 i=j 93 #if len(self._vects)==0: 94 # print vect 95 # print tuple(tmp) 96 self._vects.append(tuple(tmp)) 97 self._vects = tuple(self._vects) 98 self._vals = tuple(self._vals)
99
100 - def UpdateSingularValues1(self,k=-1,cleanup=1):
101 """ 102 >>> calc = SimilarityCalculator() 103 >>> try: 104 ... calc.UpdateSingularValues() 105 ... except ValueError: 106 ... ok=1 107 ... else: 108 ... ok=0 109 >>> ok 110 1 111 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) ) 112 >>> calc.UpdateSingularValues() 113 >>> calc._S.shape[0] 114 3 115 116 Unless the optional cleanup argument is unset,the local vects 117 (untransformed data points) will be destroyed after we're done 118 with them here. This can save significant memory: 119 >>> try: 120 ... calc.UpdateSingularValues(2) 121 ... except ValueError: 122 ... ok=1 123 ... else: 124 ... ok=0 125 >>> ok 126 1 127 128 Have to call SetVects again: 129 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) ) 130 >>> calc.UpdateSingularValues(2) 131 >>> calc._S.shape[0] 132 2 133 >>> print '%.4f'%calc._S[0] 134 2.1889 135 >>> print '%.4f'%calc._S[1] 136 1.4142 137 138 """ 139 if not self._vects: 140 raise ValueError,"SetVects() not called" 141 142 nRows = len(self._vects) 143 nCols = len(self._idMap) 144 if k==-1: 145 k = min(nRows,nCols) 146 #print self._vects 147 #print self._vals 148 #print 'shape:',nRows,nCols,len(self._vals) 149 D,s,T = PySVD.SparseSVD(self._vects,self._vals,nRows,nCols,k,1) 150 T = transpose(T) 151 D = transpose(D) 152 # sometimes the rank of the matrix is smaller than we think 153 # it should be: 154 k = min(k,len(s)) 155 self._S = s[:k] 156 #print 'k:',k,'T:',T.shape,'S:',s.shape,'D:',D.shape 157 self._T = T 158 self._DS = D*s 159 160 if 0: 161 print 'T:' 162 showMat(self._T) 163 164 print 'S:' 165 print s 166 167 print 'D:' 168 showMat(D) 169 170 171 print 'DS:' 172 showMat(self._DS) 173 174 print '------------------------' 175 showMat(matrixmultiply(transpose(T),T)) 176 177 print '------------------------' 178 showMat(matrixmultiply(transpose(D),D)) 179 180 print '------------------------' 181 print self._T.shape,self._DS.shape 182 showMat(matrixmultiply(self._T,transpose(self._DS))) 183 184 185 # save some memory: 186 if cleanup: 187 self._vects = None 188 self._vals = None
189
190 - def UpdateSingularValues(self,k=-1,cleanup=1):
191 """ 192 193 194 >>> calc = SimilarityCalculator() 195 >>> try: 196 ... calc.UpdateSingularValues() 197 ... except ValueError: 198 ... ok=1 199 ... else: 200 ... ok=0 201 >>> ok 202 1 203 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) ) 204 >>> calc.UpdateSingularValues() 205 >>> calc._S.shape[0] 206 3 207 208 Unless the optional cleanup argument is unset,the local vects 209 (untransformed data points) will be destroyed after we're done 210 with them here. This can save significant memory: 211 >>> try: 212 ... calc.UpdateSingularValues(2) 213 ... except ValueError: 214 ... ok=1 215 ... else: 216 ... ok=0 217 >>> ok 218 1 219 220 Have to call SetVects again: 221 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) ) 222 >>> calc.UpdateSingularValues(2) 223 >>> calc._S.shape[0] 224 2 225 >>> print '%.4f'%calc._S[0] 226 2.1889 227 >>> print '%.4f'%calc._S[1] 228 1.4142 229 230 """ 231 if not self._vects: 232 raise ValueError,"SetVects() not called" 233 234 nRows = len(self._vects) 235 nCols = len(self._idMap) 236 if k==-1: 237 k = min(nRows,nCols) 238 239 SVDPack.DoSVD(self,k) 240 241 # save some memory: 242 if cleanup: 243 self._vects = None 244 self._vals = None
245 - def ForceSingularValues(self,k,T,D,s,cleanup=1):
246 k = min(k,len(s)) 247 self._S = s[:k] 248 self._T = T 249 self._DS = D*s 250 if cleanup: 251 self._vects = None 252 self._vals = None
253 254 255 256
257 - def PackPoint(self,pt):
258 """ 259 260 converts a point from the normal space to the reduced (packed) space 261 262 >>> calc = SimilarityCalculator() 263 >>> calc.SetVects( ((1,2),(3,100),(1,2,2,4)) ) 264 >>> calc.PackPoint( (1,2) ) 265 array([ 1., 1., 0., 0., 0.]) 266 >>> calc.PackPoint( (1,2,2) ) 267 array([ 1., 2., 0., 0., 0.]) 268 >>> calc.PackPoint( (1,2,5) ) 269 array([ 1., 1., 0., 0., 0.]) 270 271 """ 272 if not self._idMap: 273 raise ValueError,"SetVects() not called" 274 res = zeros((len(self._idMap),),Float) 275 for val in pt: 276 idx = self._idMap.get(val,-1) 277 if idx>=0: 278 res[idx] += 1 279 return res
280
281 - def TransformPoint(self,pt):
282 """ Transforms a point into the SVD space 283 284 >>> calc = SimilarityCalculator() 285 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) ) 286 >>> calc.UpdateSingularValues(k=3) 287 288 if we pass in a point used for the SVD, we should just get the 289 transformed version of that point back: 290 >>> v2 = calc.TransformPoint( (0,2) ) 291 292 #>>> v1 = transpose(calc._singularVects[0]) 293 #>>> abs(max(v1-v2))<1e-6 294 #1 295 #>>> v1 = transpose(calc._singularVects[1]) 296 #>>> abs(max(v1-v2))>1e-6 297 #1 298 299 300 """ 301 if not self._T: 302 raise ValueError,"UpdateSingularValues() not called" 303 packed = self.PackPoint(pt) 304 #print packed.shape,self._T.shape,self._DS.shape 305 v = matrixmultiply(packed,self._T) 306 return v
307
308 - def ScorePoint(self,pt,against=None,isTransformed=0,topN=0, 309 threshold=-1.0, 310 excludeThese=[]):
311 """ 312 313 return value is a sequence of 2-tuples: (score, index) 314 315 >>> calc = SimilarityCalculator() 316 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) ) 317 >>> calc.UpdateSingularValues(k=3) 318 >>> r = calc.ScorePoint((0,2),against=[0])[0] 319 >>> print '%.2f'%r[0], r[1] 320 1.00 0 321 322 can transform the point in advance: 323 >>> pt = calc.TransformPoint( (0,2) ) 324 >>> r = calc.ScorePoint(pt,against=[0],isTransformed=1)[0] 325 >>> print '%.2f'%r[0], r[1] 326 1.00 0 327 328 default is to score against a variety of vectors at once: 329 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint(pt,isTransformed=1)] 330 [True, False, True] 331 332 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint((0,3,6))] 333 [True, True, True] 334 335 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint((0,3,6),topN=2)] 336 [True, True] 337 338 you can also put a threshold on the similarity metric: 339 >>> len(calc.ScorePoint((0,3,6))) 340 3 341 >>> len(calc.ScorePoint((0,3,6),threshold=0.50)) 342 2 343 344 345 346 "extra" bits (those that weren't in the training vectors) are 347 ignored: 348 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint((0,2,12))] 349 [True, False, True] 350 351 # look at the indices: 352 >>> v = [x[1] for x in calc.ScorePoint((0,3,6),topN=2)] 353 >>> v.sort() 354 >>> v 355 [0, 1] 356 >>> [x[1] for x in calc.ScorePoint((0,3,6),topN=2,excludeThese=[1])] 357 [2, 0] 358 359 360 """ 361 if not self._T or not self._DS: 362 raise ValueError,"UpdateSingularValues() not called" 363 if not isTransformed: 364 pt = self.TransformPoint(pt) 365 if against is None: 366 against = range(self._DS.shape[0]) 367 try: 368 nPts = len(against) 369 except AttributeError: 370 against = [against] 371 nPts =1 372 if topN: 373 res = TopNContainer(topN) 374 else: 375 res = [] 376 ptSize = sqrt(dot(pt,pt)) 377 #indices = range(nPts) 378 for idx in excludeThese: 379 try: 380 against.remove(idx) 381 except ValueError: 382 pass 383 384 #print 'PT:',pt 385 for i in against: 386 v = self._DS[i] 387 vSize = sqrt(dot(v,v)) 388 numer = dot(v,pt) 389 denom = vSize*ptSize 390 391 if denom != 0.0: 392 simVal = numer/denom 393 else: 394 simVal = 0.0 395 if simVal>threshold: 396 if not topN: 397 res.append((simVal,i)) 398 else: 399 res.Insert(simVal,i) 400 return res
401 402 #------------------------------------ 403 # 404 # doctest boilerplate 405 # 406 molTest=""" 407 408 This is a nice test because not only is it molecular with vaguely 409 sensible results, but the matrix is only of rank 2 (despite being 410 3x3), so it can catch boundary conditions in the solver. 411 412 >>> import Chem 413 >>> from Chem.AtomPairs import Torsions,Utils 414 >>> m1 = Chem.MolFromSmiles('CCN(CCO)CC') 415 >>> m2 = Chem.MolFromSmiles('CCN(CCO)CCO') 416 >>> m3 = Chem.MolFromSmiles('OCCN(CCO)CCO') 417 >>> fp1 = Torsions.GetTopologicalTorsionFingerprintAsIds(m1) 418 >>> print fp1 419 >>> fp2 = Torsions.GetTopologicalTorsionFingerprintAsIds(m2) 420 >>> fp3 = Torsions.GetTopologicalTorsionFingerprintAsIds(m3) 421 >>> calc = SimilarityCalculator() 422 >>> calc.SetVects((fp1,fp2,fp3)) 423 >>> calc.UpdateSingularValues() 424 >>> scores = calc.ScorePoint(fp1) 425 >>> print '%.3f'%scores[0][0],scores[0][1] 426 1.000 0 427 >>> print '%.3f'%scores[1][0],scores[1][1] 428 0.802 1 429 >>> print '%.3f'%scores[2][0],scores[2][1] 430 0.488 2 431 >>> scores = calc.ScorePoint(fp2) 432 >>> print '%.3f'%scores[0][0],scores[0][1] 433 0.802 0 434 >>> print '%.3f'%scores[1][0],scores[1][1] 435 1.000 1 436 >>> print '%.3f'%scores[2][0],scores[2][1] 437 0.913 2 438 >>> scores = calc.ScorePoint(fp3) 439 >>> print '%.3f'%scores[0][0],scores[0][1] 440 0.488 0 441 >>> print '%.3f'%scores[1][0],scores[1][1] 442 0.913 1 443 >>> print '%.3f'%scores[2][0],scores[2][1] 444 1.000 2 445 >>> scores = calc.ScorePoint(fp1,topN=2) 446 >>> scores.reverse() 447 >>> print '%.3f'%scores[0][0],scores[0][1] 448 1.000 0 449 >>> print '%.3f'%scores[1][0],scores[1][1] 450 0.802 1 451 452 453 """ 454 455 __test__={'molTest':molTest} 456
457 -def _test():
458 import doctest,sys 459 return doctest.testmod(sys.modules["__main__"])
460 461 462 if __name__ == '__main__': 463 import sys 464 failed,tried = _test() 465 sys.exit(failed) 466