Package rdkit :: Package Chem :: Module GraphDescriptors
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.GraphDescriptors

  1  # $Id: GraphDescriptors.py 1528 2010-09-26 17:04:37Z glandrum $ 
  2  # 
  3  # Copyright (C) 2003-2006 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ Calculation of topological/topochemical descriptors. 
 12   
 13   
 14   
 15  """ 
 16  from rdkit import Chem 
 17  from rdkit.Chem import Graphs 
 18  from rdkit.Chem import rdchem 
 19  # FIX: remove this dependency here and below 
 20  from rdkit.Chem import pyPeriodicTable as PeriodicTable 
 21  import numpy 
 22  import math 
 23  from rdkit.ML.InfoTheory import entropy 
 24   
 25  periodicTable = rdchem.GetPeriodicTable() 
 26   
 27  _log2val = math.log(2) 
28 -def _log2(x):
29 return math.log(x) / _log2val
30
31 -def _VertexDegrees(mat,onlyOnes=0):
32 """ *Internal Use Only* 33 34 this is just a row sum of the matrix... simple, neh? 35 36 """ 37 if not onlyOnes: 38 res = sum(mat) 39 else: 40 res = sum(numpy.equal(mat,1)) 41 return res
42
43 -def _NumAdjacencies(mol,dMat):
44 """ *Internal Use Only* 45 46 """ 47 res = mol.GetNumBonds() 48 return res
49
50 -def _GetCountDict(arr):
51 """ *Internal Use Only* 52 53 """ 54 res = {} 55 for v in arr: 56 res[v] = res.get(v,0)+1 57 return res
58 59
60 -def HallKierAlpha(m):
61 """ calculate the Hall-Kier alpha value for a molecule 62 63 From equations (58) of Rev. Comp. Chem. vol 2, 367-422, (1991) 64 65 """ 66 alphaSum = 0.0 67 rC = PeriodicTable.nameTable['C'][5] 68 for atom in m.GetAtoms(): 69 atNum=atom.GetAtomicNum() 70 if not atNum: continue 71 symb = atom.GetSymbol() 72 alphaV = PeriodicTable.hallKierAlphas.get(symb,None) 73 if alphaV is not None: 74 hyb = atom.GetHybridization()-2 75 if(hyb<len(alphaV)): 76 alpha = alphaV[hyb] 77 if alpha is None: 78 alpha = alphaV[-1] 79 else: 80 alpha = alphaV[-1] 81 else: 82 rA = PeriodicTable.nameTable[symb][5] 83 alpha = rA/rC - 1 84 alphaSum += alpha 85 return alphaSum
86 HallKierAlpha.version="1.0.2" 87
88 -def Ipc(mol, avg = 0, dMat = None, forceDMat = 0):
89 """This returns the information content of the coefficients of the characteristic 90 polynomial of the adjacency matrix of a hydrogen-suppressed graph of a molecule. 91 92 'avg = 1' returns the information content divided by the total population. 93 94 From D. Bonchev & N. Trinajstic, J. Chem. Phys. vol 67, 4517-4533 (1977) 95 96 """ 97 if forceDMat or dMat is None: 98 if forceDMat: 99 dMat = Chem.GetDistanceMatrix(mol,0) 100 mol._adjMat = dMat 101 else: 102 try: 103 dMat = mol._adjMat 104 except AttributeError: 105 dMat = Chem.GetDistanceMatrix(mol,0) 106 mol._adjMat = dMat 107 108 adjMat = numpy.equal(dMat,1) 109 cPoly = abs(Graphs.CharacteristicPolynomial(mol, adjMat)) 110 if avg: 111 return entropy.InfoEntropy(cPoly) 112 else: 113 return sum(cPoly)*entropy.InfoEntropy(cPoly)
114 Ipc.version="1.0.0" 115 116 117
118 -def Kappa1(mol):
119 """ Hall-Kier Kappa1 value 120 121 From equations (58) and (59) of Rev. Comp. Chem. vol 2, 367-422, (1991) 122 123 """ 124 P1 = mol.GetNumBonds(1) 125 A = mol.GetNumAtoms(onlyHeavy=1) 126 alpha = HallKierAlpha(mol) 127 denom = P1 + alpha 128 if denom: 129 kappa = (A + alpha)*(A + alpha - 1)**2 / denom**2 130 else: 131 kappa = 0.0 132 return kappa
133 Kappa1.version="1.0.0" 134 135
136 -def Kappa2(mol):
137 """ Hall-Kier Kappa2 value 138 139 From equations (58) and (60) of Rev. Comp. Chem. vol 2, 367-422, (1991) 140 141 """ 142 P2 = len(Chem.FindAllPathsOfLengthN(mol,2)) 143 A = mol.GetNumAtoms(onlyHeavy=1) 144 alpha = HallKierAlpha(mol) 145 denom = (P2 + alpha)**2 146 if denom: 147 kappa = (A + alpha - 1)*(A + alpha - 2)**2 / denom 148 else: 149 kappa = 0 150 return kappa
151 Kappa2.version="1.0.0" 152
153 -def Kappa3(mol):
154 """ Hall-Kier Kappa3 value 155 156 From equations (58), (61) and (62) of Rev. Comp. Chem. vol 2, 367-422, (1991) 157 158 """ 159 P3 = len(Chem.FindAllPathsOfLengthN(mol,3)) 160 A = mol.GetNumAtoms(1) 161 alpha = HallKierAlpha(mol) 162 denom = (P3 + alpha)**2 163 if denom: 164 if A % 2 == 1: 165 kappa = (A + alpha - 1)*(A + alpha - 3)**2 / denom 166 else: 167 kappa = (A + alpha - 2)*(A + alpha - 3)**2 / denom 168 else: 169 kappa = 0 170 return kappa
171 Kappa3.version="1.0.0" 172
173 -def Chi0(mol):
174 """ From equations (1),(9) and (10) of Rev. Comp. Chem. vol 2, 367-422, (1991) 175 176 """ 177 deltas = [x.GetDegree() for x in mol.GetAtoms()] 178 while 0 in deltas: 179 deltas.remove(0) 180 deltas = numpy.array(deltas,'d') 181 res = sum(numpy.sqrt(1./deltas)) 182 return res
183 Chi0.version="1.0.0" 184 185
186 -def Chi1(mol):
187 """ From equations (1),(11) and (12) of Rev. Comp. Chem. vol 2, 367-422, (1991) 188 189 """ 190 c1s = [x.GetBeginAtom().GetDegree()*x.GetEndAtom().GetDegree() for x in mol.GetBonds()] 191 while 0 in c1s: 192 c1s.remove(0) 193 c1s = numpy.array(c1s,'d') 194 res = sum(numpy.sqrt(1./c1s)) 195 return res
196 Chi1.version="1.0.0" 197
198 -def _nVal(atom):
199 return periodicTable.GetNOuterElecs(atom.GetAtomicNum())-atom.GetTotalNumHs()
200
201 -def _hkDeltas(mol,skipHs=1):
202 global periodicTable 203 res = [] 204 for atom in mol.GetAtoms(): 205 n = atom.GetAtomicNum() 206 if n>1: 207 nV = periodicTable.GetNOuterElecs(n) 208 nHs = atom.GetTotalNumHs() 209 if n <= 10: 210 # first row 211 res.append(float(nV-nHs)) 212 else: 213 # second row and up 214 res.append(float(nV-nHs)/float(n-nV-1)) 215 elif not skipHs: 216 res.append(0.0) 217 return res
218
219 -def Chi0v(mol):
220 """ From equations (5),(9) and (10) of Rev. Comp. Chem. vol 2, 367-422, (1991) 221 222 """ 223 deltas = _hkDeltas(mol) 224 while 0 in deltas: 225 deltas.remove(0) 226 res = sum(numpy.sqrt(1./numpy.array(deltas))) 227 return res
228 Chi0v.version="1.0.0" 229
230 -def Chi1v(mol):
231 """ From equations (5),(11) and (12) of Rev. Comp. Chem. vol 2, 367-422, (1991) 232 233 """ 234 deltas = numpy.array(_hkDeltas(mol,skipHs=0)) 235 res = 0.0 236 for bond in mol.GetBonds(): 237 v = deltas[bond.GetBeginAtomIdx()]*deltas[bond.GetEndAtomIdx()] 238 if v != 0.0: 239 res += numpy.sqrt(1./v) 240 return res
241 Chi1v.version="1.0.0" 242
243 -def ChiNv_(mol,order=2):
244 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 245 246 **NOTE**: because the current path finding code does, by design, 247 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 248 length 3), values of ChiNv with N >= 3 may give results that differ 249 from those provided by the old code in molecules that have rings of 250 size 3. 251 252 """ 253 deltas = numpy.array(_hkDeltas(mol,skipHs=0)) 254 accum = 0.0 255 #print 'DELTAS',deltas 256 for path in Chem.FindAllPathsOfLengthN(mol,order+1,useBonds=0): 257 ats = [] 258 cAccum = 1.0 259 #print 'PATH:',path 260 for idx in path: 261 cAccum *= deltas[idx] 262 if cAccum: 263 accum += 1./numpy.sqrt(cAccum) 264 return accum
265
266 -def Chi2v(mol):
267 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 268 269 """ 270 return ChiNv_(mol,2)
271 Chi2v.version="1.0.0" 272
273 -def Chi3v(mol):
274 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 275 276 """ 277 return ChiNv_(mol,3)
278 Chi3v.version="1.0.0" 279
280 -def Chi4v(mol):
281 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 282 283 **NOTE**: because the current path finding code does, by design, 284 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 285 length 3), values of Chi4v may give results that differ from those 286 provided by the old code in molecules that have 3 rings. 287 288 """ 289 return ChiNv_(mol,4)
290 Chi4v.version="1.0.0" 291
292 -def Chi0n(mol):
293 """ Similar to Hall Kier Chi0v, but uses nVal instead of valence 294 This makes a big difference after we get out of the first row. 295 296 """ 297 deltas = [_nVal(x) for x in mol.GetAtoms()] 298 while deltas.count(0): 299 deltas.remove(0) 300 deltas = numpy.array(deltas,'d') 301 res = sum(numpy.sqrt(1./deltas)) 302 return res
303 Chi0n.version="1.0.0" 304
305 -def Chi1n(mol):
306 """ Similar to Hall Kier Chi1v, but uses nVal instead of valence 307 308 """ 309 delts = numpy.array([_nVal(x) for x in mol.GetAtoms()],'d') 310 res = 0.0 311 for bond in mol.GetBonds(): 312 v = delts[bond.GetBeginAtomIdx()]*delts[bond.GetEndAtomIdx()] 313 if v != 0.0: 314 res += numpy.sqrt(1./v) 315 return res
316 Chi1n.version="1.0.0"
317 -def ChiNn_(mol,order=2):
318 """ Similar to Hall Kier ChiNv, but uses nVal instead of valence 319 This makes a big difference after we get out of the first row. 320 321 **NOTE**: because the current path finding code does, by design, 322 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 323 length 3), values of ChiNn with N >= 3 may give results that differ 324 from those provided by the old code in molecules that have rings of 325 size 3. 326 327 """ 328 deltas = numpy.array([_nVal(x) for x in mol.GetAtoms()],'d') 329 accum = 0.0 330 for path in Chem.FindAllPathsOfLengthN(mol,order+1,useBonds=0): 331 cAccum = 1.0 332 for idx in path: 333 cAccum *= deltas[idx] 334 if cAccum: 335 accum += 1./numpy.sqrt(cAccum) 336 return accum
337
338 -def Chi2n(mol):
339 """ Similar to Hall Kier Chi2v, but uses nVal instead of valence 340 This makes a big difference after we get out of the first row. 341 342 """ 343 return ChiNn_(mol,2)
344 Chi2n.version="1.0.0" 345
346 -def Chi3n(mol):
347 """ Similar to Hall Kier Chi3v, but uses nVal instead of valence 348 This makes a big difference after we get out of the first row. 349 350 """ 351 return ChiNn_(mol,3)
352 Chi3n.version="1.0.0" 353
354 -def Chi4n(mol):
355 """ Similar to Hall Kier Chi4v, but uses nVal instead of valence 356 This makes a big difference after we get out of the first row. 357 358 359 **NOTE**: because the current path finding code does, by design, 360 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 361 length 3), values of Chi4n may give results that differ from those 362 provided by the old code in molecules that have 3 rings. 363 364 """ 365 return ChiNn_(mol,4)
366 Chi4n.version="1.0.0" 367 368
369 -def BalabanJ(mol,dMat=None,forceDMat=0):
370 """ Calculate Balaban's J value for a molecule 371 372 **Arguments** 373 374 - mol: a molecule 375 376 - dMat: (optional) a distance/adjacency matrix for the molecule, if this 377 is not provide, one will be calculated 378 379 - forceDMat: (optional) if this is set, the distance/adjacency matrix 380 will be recalculated regardless of whether or not _dMat_ is provided 381 or the molecule already has one 382 383 **Returns** 384 385 - a float containing the J value 386 387 We follow the notation of Balaban's paper: 388 Chem. Phys. Lett. vol 89, 399-404, (1982) 389 390 """ 391 # if no dMat is passed in, calculate one ourselves 392 if forceDMat or dMat is None: 393 if forceDMat: 394 # FIX: should we be using atom weights here or not? 395 dMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=1) 396 mol._balabanMat = dMat 397 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 398 mol._adjMat = adjMat 399 else: 400 try: 401 # first check if the molecule already has one 402 dMat = mol._balabanMat 403 except AttributeError: 404 # nope, gotta calculate one 405 dMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=0,prefix="Balaban") 406 # now store it 407 mol._balabanMat = dMat 408 try: 409 adjMat = mol._adjMat 410 except AttributeError: 411 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 412 mol._adjMat = adjMat 413 else: 414 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 415 416 s = _VertexDegrees(dMat) 417 q = _NumAdjacencies(mol,dMat) 418 n = mol.GetNumAtoms() 419 mu = q - n + 1 420 421 sum = 0. 422 nS = len(s) 423 for i in range(nS): 424 si = s[i] 425 for j in range(i,nS): 426 if adjMat[i,j] == 1: 427 sum += 1./numpy.sqrt(si*s[j]) 428 429 if mu+1 != 0: 430 J = float(q) / float(mu + 1) * sum 431 else: 432 J = 0 433 434 return J
435 BalabanJ.version="1.0.0" 436 437 438 #------------------------------------------------------------------------ 439 # 440 # Start block of BertzCT stuff. 441 # 442 443
444 -def _AssignSymmetryClasses(mol, vdList, bdMat, forceBDMat, numAtoms, cutoff):
445 """ 446 Used by BertzCT 447 448 vdList: the number of neighbors each atom has 449 bdMat: "balaban" distance matrix 450 451 """ 452 if forceBDMat: 453 bdMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=1, 454 prefix="Balaban") 455 mol._balabanMat = bdMat 456 457 atomIdx = 0 458 keysSeen = [] 459 symList = [0]*numAtoms 460 for i in range(numAtoms): 461 tmpList = bdMat[i].tolist() 462 tmpList.sort() 463 theKey = tuple(['%.4f'%x for x in tmpList[:cutoff]]) 464 try: 465 idx = keysSeen.index(theKey) 466 except ValueError: 467 idx = len(keysSeen) 468 keysSeen.append(theKey) 469 symList[i] = idx+1 470 return tuple(symList)
471
472 -def _LookUpBondOrder(atom1Id, atom2Id, bondDic):
473 """ 474 Used by BertzCT 475 """ 476 if atom1Id < atom2Id: 477 theKey = (atom1Id,atom2Id) 478 else: 479 theKey = (atom2Id,atom1Id) 480 tmp = bondDic[theKey] 481 if tmp == Chem.BondType.AROMATIC: 482 tmp = 1.5 483 else: 484 tmp = float(tmp) 485 #tmp = int(tmp) 486 return tmp
487 488
489 -def _CalculateEntropies(connectionDict, atomTypeDict, numAtoms):
490 """ 491 Used by BertzCT 492 """ 493 connectionList = connectionDict.values() 494 totConnections = sum(connectionList) 495 connectionIE = totConnections*(entropy.InfoEntropy(numpy.array(connectionList)) + 496 math.log(totConnections)/_log2val) 497 atomTypeList = atomTypeDict.values() 498 atomTypeIE = numAtoms*entropy.InfoEntropy(numpy.array(atomTypeList)) 499 return atomTypeIE + connectionIE
500
501 -def _CreateBondDictEtc(mol, numAtoms):
502 """ _Internal Use Only_ 503 Used by BertzCT 504 505 """ 506 bondDict = {} 507 nList = [None]*numAtoms 508 vdList = [0]*numAtoms 509 for aBond in mol.GetBonds(): 510 atom1=aBond.GetBeginAtomIdx() 511 atom2=aBond.GetEndAtomIdx() 512 if atom1>atom2: atom2,atom1=atom1,atom2 513 if not aBond.GetIsAromatic(): 514 bondDict[(atom1,atom2)] = aBond.GetBondType() 515 else: 516 # mark Kekulized systems as aromatic 517 bondDict[(atom1,atom2)] = Chem.BondType.AROMATIC 518 if nList[atom1] is None: 519 nList[atom1] = [atom2] 520 elif atom2 not in nList[atom1]: 521 nList[atom1].append(atom2) 522 if nList[atom2] is None: 523 nList[atom2] = [atom1] 524 elif atom1 not in nList[atom2]: 525 nList[atom2].append(atom1) 526 527 for i,element in enumerate(nList): 528 try: 529 element.sort() 530 vdList[i] = len(element) 531 except: 532 vdList[i] = 0 533 return bondDict, nList, vdList
534
535 -def BertzCT(mol, cutoff = 100, dMat = None, forceDMat = 1):
536 """ A topological index meant to quantify "complexity" of molecules. 537 538 Consists of a sum of two terms, one representing the complexity 539 of the bonding, the other representing the complexity of the 540 distribution of heteroatoms. 541 542 From S. H. Bertz, J. Am. Chem. Soc., vol 103, 3599-3601 (1981) 543 544 "cutoff" is an integer value used to limit the computational 545 expense. A cutoff value tells the program to consider vertices 546 topologically identical if their distance vectors (sets of 547 distances to all other vertices) are equal out to the "cutoff"th 548 nearest-neighbor. 549 550 **NOTE** The original implementation had the following comment: 551 > this implementation treats aromatic rings as the 552 > corresponding Kekule structure with alternating bonds, 553 > for purposes of counting "connections". 554 Upon further thought, this is the WRONG thing to do. It 555 results in the possibility of a molecule giving two different 556 CT values depending on the kekulization. For example, in the 557 old implementation, these two SMILES: 558 CC2=CN=C1C3=C(C(C)=C(C=N3)C)C=CC1=C2C 559 CC3=CN=C2C1=NC=C(C)C(C)=C1C=CC2=C3C 560 which correspond to differentk kekule forms, yield different 561 values. 562 The new implementation uses consistent (aromatic) bond orders 563 for aromatic bonds. 564 565 THIS MEANS THAT THIS IMPLEMENTATION IS NOT BACKWARDS COMPATIBLE. 566 567 Any molecule containing aromatic rings will yield different 568 values with this implementation. The new behavior is the correct 569 one, so we're going to live with the breakage. 570 571 **NOTE** this barfs if the molecule contains a second (or 572 nth) fragment that is one atom. 573 574 """ 575 atomTypeDict = {} 576 connectionDict = {} 577 numAtoms = mol.GetNumAtoms() 578 if forceDMat or dMat is None: 579 if forceDMat: 580 # nope, gotta calculate one 581 dMat = Chem.GetDistanceMatrix(mol,useBO=0,useAtomWts=0,force=1) 582 mol._adjMat = dMat 583 else: 584 try: 585 dMat = mol._adjMat 586 except AttributeError: 587 dMat = Chem.GetDistanceMatrix(mol,useBO=0,useAtomWts=0,force=1) 588 mol._adjMat = dMat 589 590 if numAtoms < 2: 591 return 0 592 593 bondDict, neighborList, vdList = _CreateBondDictEtc(mol, numAtoms) 594 symmetryClasses = _AssignSymmetryClasses(mol, vdList, dMat, forceDMat, numAtoms, cutoff) 595 #print 'Symmm Classes:',symmetryClasses 596 for atomIdx in range(numAtoms): 597 hingeAtomNumber = mol.GetAtomWithIdx(atomIdx).GetAtomicNum() 598 atomTypeDict[hingeAtomNumber] = atomTypeDict.get(hingeAtomNumber,0)+1 599 600 hingeAtomClass = symmetryClasses[atomIdx] 601 numNeighbors = vdList[atomIdx] 602 for i in range(numNeighbors): 603 neighbor_iIdx = neighborList[atomIdx][i] 604 NiClass = symmetryClasses[neighbor_iIdx] 605 bond_i_order = _LookUpBondOrder(atomIdx, neighbor_iIdx, bondDict) 606 #print '\t',atomIdx,i,hingeAtomClass,NiClass,bond_i_order 607 if (bond_i_order > 1) and (neighbor_iIdx > atomIdx): 608 numConnections = bond_i_order*(bond_i_order - 1)/2 609 connectionKey = (min(hingeAtomClass, NiClass), max(hingeAtomClass, NiClass)) 610 connectionDict[connectionKey] = connectionDict.get(connectionKey,0)+numConnections 611 612 for j in range(i+1, numNeighbors): 613 neighbor_jIdx = neighborList[atomIdx][j] 614 NjClass = symmetryClasses[neighbor_jIdx] 615 bond_j_order = _LookUpBondOrder(atomIdx, neighbor_jIdx, bondDict) 616 numConnections = bond_i_order*bond_j_order 617 connectionKey = (min(NiClass, NjClass), hingeAtomClass, max(NiClass, NjClass)) 618 connectionDict[connectionKey] = connectionDict.get(connectionKey,0)+numConnections 619 620 if not connectionDict: 621 connectionDict = {'a':1} 622 623 ks = connectionDict.keys() 624 ks.sort() 625 return _CalculateEntropies(connectionDict, atomTypeDict, numAtoms)
626 BertzCT.version="2.0.0" 627 # Recent Revisions: 628 # 1.0.0 -> 2.0.0: 629 # - force distance matrix updates properly (Fixed as part of Issue 125) 630 # - handle single-atom fragments (Issue 136) 631 632 633 # 634 # End block of BertzCT stuff. 635 # 636 #------------------------------------------------------------------------ 637