Package rdkit :: Package Chem :: Package AtomPairs :: Module Utils
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.AtomPairs.Utils

  1  # $Id: Utils.py 1528 2010-09-26 17:04:37Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2004-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  from rdkit import Chem 
 12  from rdkit.Chem import rdMolDescriptors 
 13  import math 
 14   
15 -def ExplainAtomCode(code,branchSubtract=0):
16 """ 17 18 **Arguments**: 19 20 - the code to be considered 21 22 - branchSubtract: (optional) the constant that was subtracted off 23 the number of neighbors before integrating it into the code. 24 This is used by the topological torsions code. 25 26 27 >>> m = Chem.MolFromSmiles('C=CC(=O)O') 28 >>> code = GetAtomCode(m.GetAtomWithIdx(0)) 29 >>> ExplainAtomCode(code) 30 ('C', 1, 1) 31 >>> code = GetAtomCode(m.GetAtomWithIdx(1)) 32 >>> ExplainAtomCode(code) 33 ('C', 2, 1) 34 >>> code = GetAtomCode(m.GetAtomWithIdx(2)) 35 >>> ExplainAtomCode(code) 36 ('C', 3, 1) 37 >>> code = GetAtomCode(m.GetAtomWithIdx(3)) 38 >>> ExplainAtomCode(code) 39 ('O', 1, 1) 40 >>> code = GetAtomCode(m.GetAtomWithIdx(4)) 41 >>> ExplainAtomCode(code) 42 ('O', 1, 0) 43 44 """ 45 typeMask = (1<<rdMolDescriptors.AtomPairsParameters.numTypeBits)-1 46 branchMask = (1<<rdMolDescriptors.AtomPairsParameters.numBranchBits)-1 47 piMask = (1<<rdMolDescriptors.AtomPairsParameters.numPiBits)-1 48 49 nBranch = int(code&branchMask) 50 #print code, 51 code = code>>rdMolDescriptors.AtomPairsParameters.numBranchBits 52 nPi = int(code&piMask) 53 #print code, 54 code = code>>rdMolDescriptors.AtomPairsParameters.numPiBits 55 #print code, 56 typeIdx=int(code&typeMask) 57 if typeIdx<len(rdMolDescriptors.AtomPairsParameters.atomTypes): 58 atomNum = rdMolDescriptors.AtomPairsParameters.atomTypes[typeIdx] 59 atomSymbol=Chem.GetPeriodicTable().GetElementSymbol(atomNum) 60 else: 61 atomSymbol='X' 62 return (atomSymbol,nBranch,nPi)
63 64 GetAtomCode=rdMolDescriptors.GetAtomPairAtomCode 65
66 -def NumPiElectrons(atom):
67 """ Returns the number of electrons an atom is using for pi bonding 68 69 >>> m = Chem.MolFromSmiles('C=C') 70 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 71 1 72 73 >>> m = Chem.MolFromSmiles('C#CC') 74 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 75 2 76 >>> NumPiElectrons(m.GetAtomWithIdx(1)) 77 2 78 79 >>> m = Chem.MolFromSmiles('O=C=CC') 80 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 81 1 82 >>> NumPiElectrons(m.GetAtomWithIdx(1)) 83 2 84 >>> NumPiElectrons(m.GetAtomWithIdx(2)) 85 1 86 >>> NumPiElectrons(m.GetAtomWithIdx(3)) 87 0 88 89 FIX: this behaves oddly in these cases: 90 >>> m = Chem.MolFromSmiles('S(=O)(=O)') 91 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 92 2 93 94 >>> m = Chem.MolFromSmiles('S(=O)(=O)(O)O') 95 >>> NumPiElectrons(m.GetAtomWithIdx(0)) 96 0 97 98 In the second case, the S atom is tagged as sp3 hybridized. 99 100 """ 101 102 res = 0 103 if atom.GetIsAromatic(): 104 res = 1 105 elif atom.GetHybridization() != Chem.HybridizationType.SP3: 106 # the number of pi electrons is just the number of 107 # unsaturations (valence - degree): 108 res = atom.GetExplicitValence() - atom.GetDegree() 109 return res
110 111
112 -def BitsInCommon(v1,v2):
113 """ Returns the number of bits in common between two vectors 114 115 **Arguments**: 116 117 - two vectors (sequences of bit ids) 118 119 **Returns**: an integer 120 121 **Notes** 122 123 - the vectors must be sorted 124 125 - duplicate bit IDs are counted more than once 126 127 >>> BitsInCommon( (1,2,3,4,10), (2,4,6) ) 128 2 129 130 Here's how duplicates are handled: 131 >>> BitsInCommon( (1,2,2,3,4), (2,2,4,5,6) ) 132 3 133 134 """ 135 res = 0 136 v2Pos = 0 137 nV2 = len(v2) 138 for val in v1: 139 while v2Pos<nV2 and v2[v2Pos]<val: 140 v2Pos+=1 141 if v2Pos >= nV2: 142 break 143 if v2[v2Pos]==val: 144 res += 1 145 v2Pos += 1 146 return res
147 148
149 -def DiceSimilarity(v1,v2,bounds=None):
150 """ Implements the DICE similarity metric. 151 This is the recommended metric in both the Topological torsions 152 and Atom pairs papers. 153 154 **Arguments**: 155 156 - two vectors (sequences of bit ids) 157 158 **Returns**: a float. 159 160 **Notes** 161 162 - the vectors must be sorted 163 164 165 >>> DiceSimilarity( (1,2,3), (1,2,3) ) 166 1.0 167 >>> DiceSimilarity( (1,2,3), (5,6) ) 168 0.0 169 >>> DiceSimilarity( (1,2,3,4), (1,3,5,7) ) 170 0.5 171 >>> DiceSimilarity( (1,2,3,4,5,6), (1,3) ) 172 0.5 173 174 Note that duplicate bit IDs count multiple times: 175 >>> DiceSimilarity( (1,1,3,4,5,6), (1,1) ) 176 0.5 177 178 but only if they are duplicated in both vectors: 179 >>> DiceSimilarity( (1,1,3,4,5,6), (1,) )==2./7 180 True 181 182 """ 183 denom = 1.0*(len(v1)+len(v2)) 184 if not denom: 185 res = 0.0 186 else: 187 if bounds and (min(len(v1),len(v2))/denom) < bounds: 188 numer = 0.0 189 else: 190 numer = 2.0*BitsInCommon(v1,v2) 191 res = numer/denom 192 193 return res
194 195
196 -def Dot(v1,v2):
197 """ Returns the Dot product between two vectors: 198 199 **Arguments**: 200 201 - two vectors (sequences of bit ids) 202 203 **Returns**: an integer 204 205 **Notes** 206 207 - the vectors must be sorted 208 209 - duplicate bit IDs are counted more than once 210 211 >>> Dot( (1,2,3,4,10), (2,4,6) ) 212 2 213 214 Here's how duplicates are handled: 215 >>> Dot( (1,2,2,3,4), (2,2,4,5,6) ) 216 5 217 >>> Dot( (1,2,2,3,4), (2,4,5,6) ) 218 2 219 >>> Dot( (1,2,2,3,4), (5,6) ) 220 0 221 >>> Dot( (), (5,6) ) 222 0 223 224 """ 225 res = 0 226 nV1 = len(v1) 227 nV2 = len(v2) 228 i = 0 229 j = 0 230 while i<nV1: 231 v1Val = v1[i] 232 v1Count = 1 233 i+=1 234 while i<nV1 and v1[i]==v1Val: 235 v1Count += 1 236 i+=1 237 while j<nV2 and v2[j]<v1Val: 238 j+=1 239 if j < nV2 and v2[j]==v1Val: 240 v2Val = v2[j] 241 v2Count = 1 242 j+=1 243 while j<nV2 and v2[j]==v1Val: 244 v2Count+=1 245 j+=1 246 commonCount=min(v1Count,v2Count) 247 res += commonCount*commonCount 248 elif j>=nV2: 249 break 250 return res
251
252 -def CosineSimilarity(v1,v2):
253 """ Implements the Cosine similarity metric. 254 This is the recommended metric in the LaSSI paper 255 256 **Arguments**: 257 258 - two vectors (sequences of bit ids) 259 260 **Returns**: a float. 261 262 **Notes** 263 264 - the vectors must be sorted 265 266 >>> print '%.3f'%CosineSimilarity( (1,2,3,4,10), (2,4,6) ) 267 0.516 268 >>> print '%.3f'%CosineSimilarity( (1,2,2,3,4), (2,2,4,5,6) ) 269 0.714 270 >>> print '%.3f'%CosineSimilarity( (1,2,2,3,4), (1,2,2,3,4) ) 271 1.000 272 >>> print '%.3f'%CosineSimilarity( (1,2,2,3,4), (5,6,7) ) 273 0.000 274 >>> print '%.3f'%CosineSimilarity( (1,2,2,3,4), () ) 275 0.000 276 277 """ 278 d1 = Dot(v1,v1) 279 d2 = Dot(v2,v2) 280 denom = math.sqrt(d1*d2) 281 if not denom: 282 res = 0.0 283 else: 284 numer = Dot(v1,v2) 285 res = numer/denom 286 return res
287 288 289 290 291 #------------------------------------ 292 # 293 # doctest boilerplate 294 #
295 -def _test():
296 import doctest,sys 297 return doctest.testmod(sys.modules["__main__"])
298 299 300 if __name__ == '__main__': 301 import sys 302 failed,tried = _test() 303 sys.exit(failed) 304