Package Chem :: Module GraphDescriptors
[hide private]
[frames] | no frames]

Source Code for Module Chem.GraphDescriptors

  1  # $Id: GraphDescriptors.py 742 2008-07-05 07:42:38Z glandrum $ 
  2  # 
  3  # Copyright (C) 2003-2006 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ Calculation of topological/topochemical descriptors. 
  8   
  9   
 10   
 11  """ 
 12  import Chem 
 13  from Chem import Graphs 
 14  from Chem import rdchem 
 15  # FIX: remove this dependency here and below 
 16  from Chem import pyPeriodicTable as PeriodicTable 
 17  import numpy 
 18  import math 
 19  from ML.InfoTheory import entropy 
 20   
 21  periodicTable = rdchem.GetPeriodicTable() 
 22   
 23  _log2val = math.log(2) 
24 -def _log2(x):
25 return math.log(x) / _log2val
26
27 -def _VertexDegrees(mat,onlyOnes=0):
28 """ *Internal Use Only* 29 30 this is just a row sum of the matrix... simple, neh? 31 32 """ 33 if not onlyOnes: 34 res = sum(mat) 35 else: 36 res = sum(numpy.equal(mat,1)) 37 return res
38
39 -def _NumAdjacencies(mol,dMat):
40 """ *Internal Use Only* 41 42 """ 43 res = mol.GetNumBonds() 44 return res
45
46 -def _GetCountDict(arr):
47 """ *Internal Use Only* 48 49 """ 50 res = {} 51 for v in arr: 52 res[v] = res.get(v,0)+1 53 return res
54 55
56 -def HallKierAlpha(m):
57 """ calculate the Hall-Kier alpha value for a molecule 58 59 From equations (58) of Rev. Comp. Chem. vol 2, 367-422, (1991) 60 61 """ 62 alphaSum = 0.0 63 rC = PeriodicTable.nameTable['C'][5] 64 for atom in m.GetAtoms(): 65 atNum=atom.GetAtomicNum() 66 if not atNum: continue 67 symb = atom.GetSymbol() 68 alphaV = PeriodicTable.hallKierAlphas.get(symb,None) 69 if alphaV is not None: 70 hyb = atom.GetHybridization()-2 71 if(hyb<len(alphaV)): 72 alpha = alphaV[hyb] 73 if alpha is None: 74 alpha = alphaV[-1] 75 else: 76 alpha = alphaV[-1] 77 else: 78 rA = PeriodicTable.nameTable[symb][5] 79 alpha = rA/rC - 1 80 alphaSum += alpha 81 return alphaSum
82 HallKierAlpha.version="1.0.2" 83
84 -def Ipc(mol, avg = 0, dMat = None, forceDMat = 0):
85 """This returns the information content of the coefficients of the characteristic 86 polynomial of the adjacency matrix of a hydrogen-suppressed graph of a molecule. 87 88 'avg = 1' returns the information content divided by the total population. 89 90 From D. Bonchev & N. Trinajstic, J. Chem. Phys. vol 67, 4517-4533 (1977) 91 92 """ 93 if forceDMat or dMat is None: 94 if forceDMat: 95 dMat = Chem.GetDistanceMatrix(mol,0) 96 mol._adjMat = dMat 97 else: 98 try: 99 dMat = mol._adjMat 100 except AttributeError: 101 dMat = Chem.GetDistanceMatrix(mol,0) 102 mol._adjMat = dMat 103 104 adjMat = numpy.equal(dMat,1) 105 cPoly = abs(Graphs.CharacteristicPolynomial(mol, adjMat)) 106 if avg: 107 return entropy.InfoEntropy(cPoly) 108 else: 109 return sum(cPoly)*entropy.InfoEntropy(cPoly)
110 Ipc.version="1.0.0" 111 112 113
114 -def Kappa1(mol):
115 """ Hall-Kier Kappa1 value 116 117 From equations (58) and (59) of Rev. Comp. Chem. vol 2, 367-422, (1991) 118 119 """ 120 P1 = mol.GetNumBonds(1) 121 A = mol.GetNumAtoms(onlyHeavy=1) 122 alpha = HallKierAlpha(mol) 123 denom = P1 + alpha 124 if denom: 125 kappa = (A + alpha)*(A + alpha - 1)**2 / denom**2 126 else: 127 kappa = 0.0 128 return kappa
129 Kappa1.version="1.0.0" 130 131
132 -def Kappa2(mol):
133 """ Hall-Kier Kappa2 value 134 135 From equations (58) and (60) of Rev. Comp. Chem. vol 2, 367-422, (1991) 136 137 """ 138 P2 = len(Chem.FindAllPathsOfLengthN(mol,2)) 139 A = mol.GetNumAtoms(onlyHeavy=1) 140 alpha = HallKierAlpha(mol) 141 denom = (P2 + alpha)**2 142 if denom: 143 kappa = (A + alpha - 1)*(A + alpha - 2)**2 / denom 144 else: 145 kappa = 0 146 return kappa
147 Kappa2.version="1.0.0" 148
149 -def Kappa3(mol):
150 """ Hall-Kier Kappa3 value 151 152 From equations (58), (61) and (62) of Rev. Comp. Chem. vol 2, 367-422, (1991) 153 154 """ 155 P3 = len(Chem.FindAllPathsOfLengthN(mol,3)) 156 A = mol.GetNumAtoms(1) 157 alpha = HallKierAlpha(mol) 158 denom = (P3 + alpha)**2 159 if denom: 160 if A % 2 == 1: 161 kappa = (A + alpha - 1)*(A + alpha - 3)**2 / denom 162 else: 163 kappa = (A + alpha - 2)*(A + alpha - 3)**2 / denom 164 else: 165 kappa = 0 166 return kappa
167 Kappa3.version="1.0.0" 168
169 -def Chi0(mol):
170 """ From equations (1),(9) and (10) of Rev. Comp. Chem. vol 2, 367-422, (1991) 171 172 """ 173 deltas = [x.GetDegree() for x in mol.GetAtoms()] 174 while 0 in deltas: 175 deltas.remove(0) 176 deltas = numpy.array(deltas,'d') 177 res = sum(numpy.sqrt(1./deltas)) 178 return res
179 Chi0.version="1.0.0" 180 181
182 -def Chi1(mol):
183 """ From equations (1),(11) and (12) of Rev. Comp. Chem. vol 2, 367-422, (1991) 184 185 """ 186 c1s = [x.GetBeginAtom().GetDegree()*x.GetEndAtom().GetDegree() for x in mol.GetBonds()] 187 while 0 in c1s: 188 c1s.remove(0) 189 c1s = numpy.array(c1s,'d') 190 res = sum(numpy.sqrt(1./c1s)) 191 return res
192 Chi1.version="1.0.0" 193
194 -def _nVal(atom):
195 return periodicTable.GetNOuterElecs(atom.GetAtomicNum())-atom.GetTotalNumHs()
196
197 -def _hkDeltas(mol,skipHs=1):
198 global periodicTable 199 res = [] 200 for atom in mol.GetAtoms(): 201 n = atom.GetAtomicNum() 202 if n>1: 203 nV = periodicTable.GetNOuterElecs(n) 204 nHs = atom.GetTotalNumHs() 205 if n <= 10: 206 # first row 207 res.append(float(nV-nHs)) 208 else: 209 # second row and up 210 res.append(float(nV-nHs)/float(n-nV-1)) 211 elif not skipHs: 212 res.append(0.0) 213 return res
214
215 -def Chi0v(mol):
216 """ From equations (5),(9) and (10) of Rev. Comp. Chem. vol 2, 367-422, (1991) 217 218 """ 219 deltas = _hkDeltas(mol) 220 while 0 in deltas: 221 deltas.remove(0) 222 res = sum(numpy.sqrt(1./numpy.array(deltas))) 223 return res
224 Chi0v.version="1.0.0" 225
226 -def Chi1v(mol):
227 """ From equations (5),(11) and (12) of Rev. Comp. Chem. vol 2, 367-422, (1991) 228 229 """ 230 deltas = numpy.array(_hkDeltas(mol,skipHs=0)) 231 res = 0.0 232 for bond in mol.GetBonds(): 233 v = deltas[bond.GetBeginAtomIdx()]*deltas[bond.GetEndAtomIdx()] 234 if v != 0.0: 235 res += numpy.sqrt(1./v) 236 return res
237 Chi1v.version="1.0.0" 238
239 -def ChiNv_(mol,order=2):
240 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 241 242 **NOTE**: because the current path finding code does, by design, 243 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 244 length 3), values of ChiNv with N >= 3 may give results that differ 245 from those provided by the old code in molecules that have rings of 246 size 3. 247 248 """ 249 deltas = numpy.array(_hkDeltas(mol,skipHs=0)) 250 accum = 0.0 251 #print 'DELTAS',deltas 252 for path in Chem.FindAllPathsOfLengthN(mol,order+1,useBonds=0): 253 ats = [] 254 cAccum = 1.0 255 #print 'PATH:',path 256 for idx in path: 257 cAccum *= deltas[idx] 258 if cAccum: 259 accum += 1./numpy.sqrt(cAccum) 260 return accum
261
262 -def Chi2v(mol):
263 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 264 265 """ 266 return ChiNv_(mol,2)
267 Chi2v.version="1.0.0" 268
269 -def Chi3v(mol):
270 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 271 272 """ 273 return ChiNv_(mol,3)
274 Chi3v.version="1.0.0" 275
276 -def Chi4v(mol):
277 """ From equations (5),(15) and (16) of Rev. Comp. Chem. vol 2, 367-422, (1991) 278 279 **NOTE**: because the current path finding code does, by design, 280 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 281 length 3), values of Chi4v may give results that differ from those 282 provided by the old code in molecules that have 3 rings. 283 284 """ 285 return ChiNv_(mol,4)
286 Chi4v.version="1.0.0" 287
288 -def Chi0n(mol):
289 """ Similar to Hall Kier Chi0v, but uses nVal instead of valence 290 This makes a big difference after we get out of the first row. 291 292 """ 293 deltas = [_nVal(x) for x in mol.GetAtoms()] 294 while deltas.count(0): 295 deltas.remove(0) 296 deltas = numpy.array(deltas,'d') 297 res = sum(numpy.sqrt(1./deltas)) 298 return res
299 Chi0n.version="1.0.0" 300
301 -def Chi1n(mol):
302 """ Similar to Hall Kier Chi1v, but uses nVal instead of valence 303 304 """ 305 delts = numpy.array([_nVal(x) for x in mol.GetAtoms()],'d') 306 res = 0.0 307 for bond in mol.GetBonds(): 308 v = delts[bond.GetBeginAtomIdx()]*delts[bond.GetEndAtomIdx()] 309 if v != 0.0: 310 res += numpy.sqrt(1./v) 311 return res
312 Chi1n.version="1.0.0"
313 -def ChiNn_(mol,order=2):
314 """ Similar to Hall Kier ChiNv, but uses nVal instead of valence 315 This makes a big difference after we get out of the first row. 316 317 **NOTE**: because the current path finding code does, by design, 318 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 319 length 3), values of ChiNn with N >= 3 may give results that differ 320 from those provided by the old code in molecules that have rings of 321 size 3. 322 323 """ 324 deltas = numpy.array([_nVal(x) for x in mol.GetAtoms()],'d') 325 accum = 0.0 326 for path in Chem.FindAllPathsOfLengthN(mol,order+1,useBonds=0): 327 cAccum = 1.0 328 for idx in path: 329 cAccum *= deltas[idx] 330 if cAccum: 331 accum += 1./numpy.sqrt(cAccum) 332 return accum
333
334 -def Chi2n(mol):
335 """ Similar to Hall Kier Chi2v, but uses nVal instead of valence 336 This makes a big difference after we get out of the first row. 337 338 """ 339 return ChiNn_(mol,2)
340 Chi2n.version="1.0.0" 341
342 -def Chi3n(mol):
343 """ Similar to Hall Kier Chi3v, but uses nVal instead of valence 344 This makes a big difference after we get out of the first row. 345 346 """ 347 return ChiNn_(mol,3)
348 Chi3n.version="1.0.0" 349
350 -def Chi4n(mol):
351 """ Similar to Hall Kier Chi4v, but uses nVal instead of valence 352 This makes a big difference after we get out of the first row. 353 354 355 **NOTE**: because the current path finding code does, by design, 356 detect rings as paths (e.g. in C1CC1 there is *1* atom path of 357 length 3), values of Chi4n may give results that differ from those 358 provided by the old code in molecules that have 3 rings. 359 360 """ 361 return ChiNn_(mol,4)
362 Chi4n.version="1.0.0" 363 364
365 -def BalabanJ(mol,dMat=None,forceDMat=0):
366 """ Calculate Balaban's J value for a molecule 367 368 **Arguments** 369 370 - mol: a molecule 371 372 - dMat: (optional) a distance/adjacency matrix for the molecule, if this 373 is not provide, one will be calculated 374 375 - forceDMat: (optional) if this is set, the distance/adjacency matrix 376 will be recalculated regardless of whether or not _dMat_ is provided 377 or the molecule already has one 378 379 **Returns** 380 381 - a float containing the J value 382 383 We follow the notation of Balaban's paper: 384 Chem. Phys. Lett. vol 89, 399-404, (1982) 385 386 """ 387 # if no dMat is passed in, calculate one ourselves 388 if forceDMat or dMat is None: 389 if forceDMat: 390 # FIX: should we be using atom weights here or not? 391 dMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=1) 392 mol._balabanMat = dMat 393 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 394 mol._adjMat = adjMat 395 else: 396 try: 397 # first check if the molecule already has one 398 dMat = mol._balabanMat 399 except AttributeError: 400 # nope, gotta calculate one 401 dMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=0,prefix="Balaban") 402 # now store it 403 mol._balabanMat = dMat 404 try: 405 adjMat = mol._adjMat 406 except AttributeError: 407 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 408 mol._adjMat = adjMat 409 else: 410 adjMat = Chem.GetAdjacencyMatrix(mol,useBO=0,emptyVal=0,force=0,prefix="NoBO") 411 412 s = _VertexDegrees(dMat) 413 q = _NumAdjacencies(mol,dMat) 414 n = mol.GetNumAtoms() 415 mu = q - n + 1 416 417 sum = 0. 418 nS = len(s) 419 for i in range(nS): 420 si = s[i] 421 for j in range(i,nS): 422 if adjMat[i,j] == 1: 423 sum += 1./numpy.sqrt(si*s[j]) 424 425 if mu+1 != 0: 426 J = float(q) / float(mu + 1) * sum 427 else: 428 J = 0 429 430 return J
431 BalabanJ.version="1.0.0" 432 433 434 #------------------------------------------------------------------------ 435 # 436 # Start block of BertzCT stuff. 437 # 438 439
440 -def _AssignSymmetryClasses(mol, vdList, bdMat, forceBDMat, numAtoms, cutoff):
441 """ 442 Used by BertzCT 443 444 vdList: the number of neighbors each atom has 445 bdMat: "balaban" distance matrix 446 447 """ 448 if forceBDMat: 449 bdMat = Chem.GetDistanceMatrix(mol,useBO=1,useAtomWts=0,force=1, 450 prefix="Balaban") 451 mol._balabanMat = bdMat 452 453 atomIdx = 0 454 keysSeen = [] 455 symList = [0]*numAtoms 456 for i in range(numAtoms): 457 tmpList = bdMat[i].tolist() 458 tmpList.sort() 459 theKey = tuple(['%.4f'%x for x in tmpList[:cutoff]]) 460 try: 461 idx = keysSeen.index(theKey) 462 except ValueError: 463 idx = len(keysSeen) 464 keysSeen.append(theKey) 465 symList[i] = idx+1 466 return tuple(symList)
467
468 -def _LookUpBondOrder(atom1Id, atom2Id, bondDic):
469 """ 470 Used by BertzCT 471 """ 472 if atom1Id < atom2Id: 473 theKey = (atom1Id,atom2Id) 474 else: 475 theKey = (atom2Id,atom1Id) 476 tmp = bondDic[theKey] 477 if tmp == Chem.BondType.AROMATIC: 478 tmp = 1.5 479 else: 480 tmp = float(tmp) 481 #tmp = int(tmp) 482 return tmp
483 484
485 -def _CalculateEntropies(connectionDict, atomTypeDict, numAtoms):
486 """ 487 Used by BertzCT 488 """ 489 connectionList = connectionDict.values() 490 totConnections = sum(connectionList) 491 connectionIE = totConnections*(entropy.InfoEntropy(numpy.array(connectionList)) + 492 math.log(totConnections)/_log2val) 493 atomTypeList = atomTypeDict.values() 494 atomTypeIE = numAtoms*entropy.InfoEntropy(numpy.array(atomTypeList)) 495 return atomTypeIE + connectionIE
496
497 -def _CreateBondDictEtc(mol, numAtoms):
498 """ _Internal Use Only_ 499 Used by BertzCT 500 501 """ 502 bondDict = {} 503 nList = [None]*numAtoms 504 vdList = [0]*numAtoms 505 for aBond in mol.GetBonds(): 506 atom1=aBond.GetBeginAtomIdx() 507 atom2=aBond.GetEndAtomIdx() 508 if atom1>atom2: atom2,atom1=atom1,atom2 509 if not aBond.GetIsAromatic(): 510 bondDict[(atom1,atom2)] = aBond.GetBondType() 511 else: 512 # mark Kekulized systems as aromatic 513 bondDict[(atom1,atom2)] = Chem.BondType.AROMATIC 514 if nList[atom1] is None: 515 nList[atom1] = [atom2] 516 elif atom2 not in nList[atom1]: 517 nList[atom1].append(atom2) 518 if nList[atom2] is None: 519 nList[atom2] = [atom1] 520 elif atom1 not in nList[atom2]: 521 nList[atom2].append(atom1) 522 523 for i,element in enumerate(nList): 524 try: 525 element.sort() 526 vdList[i] = len(element) 527 except: 528 vdList[i] = 0 529 return bondDict, nList, vdList
530
531 -def BertzCT(mol, cutoff = 100, dMat = None, forceDMat = 1):
532 """ A topological index meant to quantify "complexity" of molecules. 533 534 Consists of a sum of two terms, one representing the complexity 535 of the bonding, the other representing the complexity of the 536 distribution of heteroatoms. 537 538 From S. H. Bertz, J. Am. Chem. Soc., vol 103, 3599-3601 (1981) 539 540 "cutoff" is an integer value used to limit the computational 541 expense. A cutoff value tells the program to consider vertices 542 topologically identical if their distance vectors (sets of 543 distances to all other vertices) are equal out to the "cutoff"th 544 nearest-neighbor. 545 546 **NOTE** The original implementation had the following comment: 547 > this implementation treats aromatic rings as the 548 > corresponding Kekule structure with alternating bonds, 549 > for purposes of counting "connections". 550 Upon further thought, this is the WRONG thing to do. It 551 results in the possibility of a molecule giving two different 552 CT values depending on the kekulization. For example, in the 553 old implementation, these two SMILES: 554 CC2=CN=C1C3=C(C(C)=C(C=N3)C)C=CC1=C2C 555 CC3=CN=C2C1=NC=C(C)C(C)=C1C=CC2=C3C 556 which correspond to differentk kekule forms, yield different 557 values. 558 The new implementation uses consistent (aromatic) bond orders 559 for aromatic bonds. 560 561 THIS MEANS THAT THIS IMPLEMENTATION IS NOT BACKWARDS COMPATIBLE. 562 563 Any molecule containing aromatic rings will yield different 564 values with this implementation. The new behavior is the correct 565 one, so we're going to live with the breakage. 566 567 **NOTE** this barfs if the molecule contains a second (or 568 nth) fragment that is one atom. 569 570 """ 571 atomTypeDict = {} 572 connectionDict = {} 573 numAtoms = mol.GetNumAtoms() 574 if forceDMat or dMat is None: 575 if forceDMat: 576 # nope, gotta calculate one 577 dMat = Chem.GetDistanceMatrix(mol,useBO=0,useAtomWts=0,force=1) 578 mol._adjMat = dMat 579 else: 580 try: 581 dMat = mol._adjMat 582 except AttributeError: 583 dMat = Chem.GetDistanceMatrix(mol,useBO=0,useAtomWts=0,force=1) 584 mol._adjMat = dMat 585 586 if numAtoms < 2: 587 return 0 588 589 bondDict, neighborList, vdList = _CreateBondDictEtc(mol, numAtoms) 590 symmetryClasses = _AssignSymmetryClasses(mol, vdList, dMat, forceDMat, numAtoms, cutoff) 591 #print 'Symmm Classes:',symmetryClasses 592 for atomIdx in range(numAtoms): 593 hingeAtomNumber = mol.GetAtomWithIdx(atomIdx).GetAtomicNum() 594 atomTypeDict[hingeAtomNumber] = atomTypeDict.get(hingeAtomNumber,0)+1 595 596 hingeAtomClass = symmetryClasses[atomIdx] 597 numNeighbors = vdList[atomIdx] 598 for i in range(numNeighbors): 599 neighbor_iIdx = neighborList[atomIdx][i] 600 NiClass = symmetryClasses[neighbor_iIdx] 601 bond_i_order = _LookUpBondOrder(atomIdx, neighbor_iIdx, bondDict) 602 #print '\t',atomIdx,i,hingeAtomClass,NiClass,bond_i_order 603 if (bond_i_order > 1) and (neighbor_iIdx > atomIdx): 604 numConnections = bond_i_order*(bond_i_order - 1)/2 605 connectionKey = (min(hingeAtomClass, NiClass), max(hingeAtomClass, NiClass)) 606 connectionDict[connectionKey] = connectionDict.get(connectionKey,0)+numConnections 607 608 for j in range(i+1, numNeighbors): 609 neighbor_jIdx = neighborList[atomIdx][j] 610 NjClass = symmetryClasses[neighbor_jIdx] 611 bond_j_order = _LookUpBondOrder(atomIdx, neighbor_jIdx, bondDict) 612 numConnections = bond_i_order*bond_j_order 613 connectionKey = (min(NiClass, NjClass), hingeAtomClass, max(NiClass, NjClass)) 614 connectionDict[connectionKey] = connectionDict.get(connectionKey,0)+numConnections 615 616 if not connectionDict: 617 connectionDict = {'a':1} 618 619 ks = connectionDict.keys() 620 ks.sort() 621 return _CalculateEntropies(connectionDict, atomTypeDict, numAtoms)
622 BertzCT.version="2.0.0" 623 # Recent Revisions: 624 # 1.0.0 -> 2.0.0: 625 # - force distance matrix updates properly (Fixed as part of Issue 125) 626 # - handle single-atom fragments (Issue 136) 627 628 629 # 630 # End block of BertzCT stuff. 631 # 632 #------------------------------------------------------------------------ 633