Package rdkit :: Package Chem :: Module MACCSkeys
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.MACCSkeys

  1  # $Id: MACCSkeys.py 1733 2011-05-27 03:19:13Z glandrum $ 
  2  # 
  3  # Copyright (C) 2001-2011 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ SMARTS definitions for the publicly available MACCS keys 
 12  and a MACCS fingerprinter 
 13   
 14  I compared the MACCS fingerprints generated here with those from two 
 15  other packages (not MDL, unfortunately). Of course there are 
 16  disagreements between the various fingerprints still, but I think 
 17  these definitions work pretty well. Some notes: 
 18   
 19  1) most of the differences have to do with aromaticity 
 20  2) there's a discrepancy sometimes because the current RDKit 
 21  definitions do not require multiple matches to be distinct. e.g. the 
 22  SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my 
 23  definition. It's not clear to me what the correct behavior is. 
 24  3) Some keys are not fully defined in the MDL documentation 
 25  4) Two keys, 125 and 166, have to be done outside of SMARTS. 
 26  5) Key 1 (ISOTOPE) isn't defined 
 27   
 28  Rev history: 
 29  2006 (gl): Original open-source release 
 30  May 2011 (gl): Update some definitions based on feedback from Andrew Dalke 
 31   
 32  """ 
 33  from rdkit import Chem 
 34  from rdkit import DataStructs 
 # SMARTS patterns corresponding to the MDL MACCS keys.
 #
 # Each entry maps the 1-based key number to a (SMARTS, count) tuple:
 #   - count == 0: the bit is set when the pattern matches at all
 #   - count  > 0: the bit is set when the pattern matches more than count times
 #   - a SMARTS of '?' marks a key that is not expressed as SMARTS; such keys
 #     are skipped when the patterns are compiled (GenMACCSKeys computes keys
 #     125 and 166 in code)
 #
 # In the trailing comments Q stands for a heteroatom, A for any atom and
 # X for a halogen.
smartsPatts = {
  1: ('?', 0),  # ISOTOPE
  # The full definition of key 2 (atomic num > 103) would be
  # [#104,#105,#106,#107,#108,#109,#110,#111,#112] (itself not complete),
  # but the RDKit only accepts up to #104, so the pattern is limited to that.
  2: ('[#104]', 0),
  3: ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),  # Group IVa,Va,VIa Rows 4-6
  4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),  # actinide
  5: ('[Sc,Ti,Y,Zr,Hf]', 0),  # Group IIIB,IVB (Sc...)
  6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),  # Lanthanide
  7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),  # Group VB,VIB,VIIB
  8: ('[!#6;!#1]1~*~*~*~1', 0),  # QAAA@1
  9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),  # Group VIII (Fe...)
  10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),  # Group IIa (Alkaline earth)
  11: ('*1~*~*~*~1', 0),  # 4M Ring
  12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),  # Group IB,IIB (Cu..)
  13: ('[#8]~[#7](~[#6])~[#6]', 0),  # ON(C)C
  14: ('[#16]-[#16]', 0),  # S-S
  15: ('[#8]~[#6](~[#8])~[#8]', 0),  # OC(O)O
  16: ('[!#6;!#1]1~*~*~1', 0),  # QAA@1
  17: ('[#6]#[#6]', 0),  # CTC
  18: ('[#5,#13,#31,#49,#81]', 0),  # Group IIIA (B...)
  19: ('*1~*~*~*~*~*~*~1', 0),  # 7M Ring
  20: ('[Si]', 0),  # Si
  21: ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),  # C=C(Q)Q
  22: ('*1~*~*~1', 0),  # 3M Ring
  23: ('[#7]~[#6](~[#8])~[#8]', 0),  # NC(O)O
  24: ('[#7]-[#8]', 0),  # N-O
  25: ('[#7]~[#6](~[#7])~[#7]', 0),  # NC(N)N
  26: ('[#6]=;@[#6](@*)@*', 0),  # C$=C($A)$A
  27: ('[I]', 0),  # I
  28: ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),  # QCH2Q
  29: ('[#15]', 0),  # P
  30: ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),  # CQ(C)(C)A
  31: ('[!#6;!#1]~[F,Cl,Br,I]', 0),  # QX
  32: ('[#6]~[#16]~[#7]', 0),  # CSN
  33: ('[#7]~[#16]', 0),  # NS
  34: ('[CH2]=*', 0),  # CH2=A
  35: ('[Li,Na,K,Rb,Cs,Fr]', 0),  # Group IA (Alkali Metal)
  36: ('[#16R]', 0),  # S Heterocycle
  37: ('[#7]~[#6](~[#8])~[#7]', 0),  # NC(O)N
  38: ('[#7]~[#6](~[#6])~[#7]', 0),  # NC(C)N
  39: ('[#8]~[#16](~[#8])~[#8]', 0),  # OS(O)O
  40: ('[#16]-[#8]', 0),  # S-O
  41: ('[#6]#[#7]', 0),  # CTN
  42: ('F', 0),  # F
  43: ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),  # QHAQH
  44: ('?', 0),  # OTHER
  45: ('[#6]=[#6]~[#7]', 0),  # C=CN
  46: ('Br', 0),  # BR
  47: ('[#16]~*~[#7]', 0),  # SAN
  48: ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),  # OQ(O)O
  49: ('[!+0]', 0),  # CHARGE
  50: ('[#6]=[#6](~[#6])~[#6]', 0),  # C=C(C)C
  51: ('[#6]~[#16]~[#8]', 0),  # CSO
  52: ('[#7]~[#7]', 0),  # NN
  53: ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),  # QHAAAQH
  54: ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),  # QHAAQH
  55: ('[#8]~[#16]~[#8]', 0),  # OSO
  56: ('[#8]~[#7](~[#8])~[#6]', 0),  # ON(O)C
  57: ('[#8R]', 0),  # O Heterocycle
  58: ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),  # QSQ
  59: ('[#16]!:*:*', 0),  # Snot%A%A
  60: ('[#16]=[#8]', 0),  # S=O
  61: ('*~[#16](~*)~*', 0),  # AS(A)A
  62: ('*@*!@*@*', 0),  # A$!A$A
  63: ('[#7]=[#8]', 0),  # N=O
  64: ('*@*!@[#16]', 0),  # A$A!S
  65: ('c:n', 0),  # C%N
  66: ('[#6]~[#6](~[#6])(~[#6])~*', 0),  # CC(C)(C)A
  67: ('[!#6;!#1]~[#16]', 0),  # QS
  68: ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),  # QHQH (&...) SPEC Incomplete
  69: ('[!#6;!#1]~[!#6;!#1;!H0]', 0),  # QQH
  70: ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),  # QNQ
  71: ('[#7]~[#8]', 0),  # NO
  72: ('[#8]~*~*~[#8]', 0),  # OAAO
  73: ('[#16]=*', 0),  # S=A
  74: ('[CH3]~*~[CH3]', 0),  # CH3ACH3
  75: ('*!@[#7]@*', 0),  # A!N$A
  76: ('[#6]=[#6](~*)~*', 0),  # C=C(A)A
  77: ('[#7]~*~[#7]', 0),  # NAN
  78: ('[#6]=[#7]', 0),  # C=N
  79: ('[#7]~*~*~[#7]', 0),  # NAAN
  80: ('[#7]~*~*~*~[#7]', 0),  # NAAAN
  81: ('[#16]~*(~*)~*', 0),  # SA(A)A
  82: ('*~[CH2]~[!#6;!#1;!H0]', 0),  # ACH2QH
  83: ('[!#6;!#1]1~*~*~*~*~1', 0),  # QAAAA@1
  84: ('[NH2]', 0),  # NH2
  85: ('[#6]~[#7](~[#6])~[#6]', 0),  # CN(C)C
  86: ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),  # CH2QCH2
  87: ('[F,Cl,Br,I]!@*@*', 0),  # X!A$A
  88: ('[#16]', 0),  # S
  89: ('[#8]~*~*~*~[#8]', 0),  # OAAAO
  90: ('[$([!#6;!#1;!H0]~*~*~[CH2]~*),'
       '$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),'
       '$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]', 0),  # QHAACH2A
  91: ('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),'
       '$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),'
       '$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),'
       '$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]', 0),  # QHAAACH2A
  92: ('[#8]~[#6](~[#7])~[#6]', 0),  # OC(N)C
  93: ('[!#6;!#1]~[CH3]', 0),  # QCH3
  94: ('[!#6;!#1]~[#7]', 0),  # QN
  95: ('[#7]~*~*~[#8]', 0),  # NAAO
  96: ('*1~*~*~*~*~1', 0),  # 5 M ring
  97: ('[#7]~*~*~*~[#8]', 0),  # NAAAO
  98: ('[!#6;!#1]1~*~*~*~*~*~1', 0),  # QAAAAA@1
  99: ('[#6]=[#6]', 0),  # C=C
  100: ('*~[CH2]~[#7]', 0),  # ACH2N
  # 8M Ring or larger. This only handles up to ring sizes of 14
  # (one alternative per ring size, 8 through 14)
  101: ('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
        '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
        '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
        '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
        '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
        '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
        '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]', 0),
  102: ('[!#6;!#1]~[#8]', 0),  # QO
  103: ('Cl', 0),  # CL
  104: ('[!#6;!#1;!H0]~*~[CH2]~*', 0),  # QHACH2A
  105: ('*@*(@*)@*', 0),  # A$A($A)$A
  106: ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),  # QA(Q)Q
  107: ('[F,Cl,Br,I]~*(~*)~*', 0),  # XA(A)A
  108: ('[CH3]~*~*~*~[CH2]~*', 0),  # CH3AAACH2A
  109: ('*~[CH2]~[#8]', 0),  # ACH2O
  110: ('[#7]~[#6]~[#8]', 0),  # NCO
  111: ('[#7]~*~[CH2]~*', 0),  # NACH2A
  112: ('*~*(~*)(~*)~*', 0),  # AA(A)(A)A
  113: ('[#8]!:*:*', 0),  # Onot%A%A
  114: ('[CH3]~[CH2]~*', 0),  # CH3CH2A
  115: ('[CH3]~*~[CH2]~*', 0),  # CH3ACH2A
  116: ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),  # CH3AACH2A
  117: ('[#7]~*~[#8]', 0),  # NAO
  118: ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),  # ACH2CH2A > 1
  119: ('[#7]=*', 0),  # N=A
  120: ('[!#6;R]', 1),  # Heterocyclic atom > 1 (&...) Spec Incomplete
  121: ('[#7;R]', 0),  # N Heterocycle
  122: ('*~[#7](~*)~*', 0),  # AN(A)A
  123: ('[#8]~[#6]~[#8]', 0),  # OCO
  124: ('[!#6;!#1]~[!#6;!#1]', 0),  # QQ
  125: ('?', 0),  # Aromatic Ring > 1
  126: ('*!@[#8]!@*', 0),  # A!O!A
  127: ('*@*!@[#8]', 1),  # A$A!O > 1 (&...) Spec Incomplete
  128: ('[$(*~[CH2]~*~*~*~[CH2]~*),'
        '$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),'
        '$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),'
        '$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]', 0),  # ACH2AAACH2A
  129: ('[$(*~[CH2]~*~*~[CH2]~*),'
        '$([R]1@[CH2]@[R]@[R]@[CH2;R]1),'
        '$(*~[CH2]~[R]1@[R]@[CH2;R]1)]', 0),  # ACH2AACH2A
  130: ('[!#6;!#1]~[!#6;!#1]', 1),  # QQ > 1 (&...)  Spec Incomplete
  131: ('[!#6;!#1;!H0]', 1),  # QH > 1
  132: ('[#8]~*~[CH2]~*', 0),  # OACH2A
  133: ('*@*!@[#7]', 0),  # A$A!N
  134: ('[F,Cl,Br,I]', 0),  # X (HALOGEN)
  135: ('[#7]!:*:*', 0),  # Nnot%A%A
  136: ('[#8]=*', 1),  # O=A>1
  137: ('[!C;!c;R]', 0),  # Heterocycle
  138: ('[!#6;!#1]~[CH2]~*', 1),  # QCH2A>1 (&...) Spec Incomplete
  139: ('[O;!H0]', 0),  # OH
  140: ('[#8]', 3),  # O > 3 (&...) Spec Incomplete
  141: ('[CH3]', 2),  # CH3 > 2  (&...) Spec Incomplete
  142: ('[#7]', 1),  # N > 1
  143: ('*@*!@[#8]', 0),  # A$A!O
  144: ('*!:*:*!:*', 0),  # Anot%A%Anot%A
  145: ('*1~*~*~*~*~*~1', 1),  # 6M ring > 1
  146: ('[#8]', 2),  # O > 2
  147: ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),  # ACH2CH2A
  148: ('*~[!#6;!#1](~*)~*', 0),  # AQ(A)A
  149: ('[C;H3,H4]', 1),  # CH3 > 1
  150: ('*!@*@*!@*', 0),  # A!A$A!A
  151: ('[#7;!H0]', 0),  # NH
  152: ('[#8]~[#6](~[#6])~[#6]', 0),  # OC(C)C
  153: ('[!#6;!#1]~[CH2]~*', 0),  # QCH2A
  154: ('[#6]=[#8]', 0),  # C=O
  155: ('*!@[CH2]!@*', 0),  # A!CH2!A
  156: ('[#7]~*(~*)~*', 0),  # NA(A)A
  157: ('[#6]-[#8]', 0),  # C-O
  158: ('[#6]-[#7]', 0),  # C-N
  159: ('[#8]', 1),  # O>1
  160: ('[C;H3,H4]', 0),  # CH3
  161: ('[#7]', 0),  # N
  162: ('a', 0),  # Aromatic
  163: ('*1~*~*~*~*~*~1', 0),  # 6M Ring
  164: ('[#8]', 0),  # O
  165: ('[R]', 0),  # Ring
  166: ('?', 0),  # Fragments  FIX: this can't be done in SMARTS
}

# cache of (compiled pattern, count) tuples; stays None until the first
# fingerprint is requested
maccsKeys = None
207   
208 -def _InitKeys(keyList,keyDict):
209 """ *Internal Use Only* 210 211 generates SMARTS patterns for the keys, run once 212 213 """ 214 assert len(keyList) == len(keyDict.keys()),'length mismatch' 215 for key in keyDict.keys(): 216 patt,count = keyDict[key] 217 if patt != '?': 218 try: 219 sma = Chem.MolFromSmarts(patt) 220 except: 221 sma = None 222 if not sma: 223 print 'SMARTS parser error for key #%d: %s'%(key,patt) 224 else: 225 keyList[key-1] = sma,count
226
def GenMACCSKeys(mol,**kwargs):
  """ generates the MACCS fingerprint for a molecule

   **Arguments**

     - mol: the molecule to be fingerprinted

     - ctor: (optional, keyword only) callable used to create the result;
       it is called with the number of bits and defaults to
       DataStructs.SparseBitVect

     - any other keyword arguments are ignored

   **Returns**

      a _DataStructs.SparseBitVect_ (or whatever _ctor_ creates) containing
      the fingerprint. Bit N corresponds to MACCS key N, so bit 0 is
      never set.

  >>> m = Chem.MolFromSmiles('CNO')
  >>> bv = GenMACCSKeys(m)
  >>> tuple(bv.GetOnBits())
  (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
  >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
  >>> tuple(bv.GetOnBits())
  (74, 114, 149, 155, 160)

  """
  global maccsKeys
  if maccsKeys is None:
    # compile into a local list and publish it only once initialization
    # has finished, so an error cannot leave a half-filled cache behind
    # that later calls would silently reuse
    keys = [(None,0)]*len(smartsPatts)
    _InitKeys(keys,smartsPatts)
    maccsKeys = keys
  ctor=kwargs.get('ctor',DataStructs.SparseBitVect)

  # one extra bit so that key numbers can be used directly as bit ids
  res = ctor(len(maccsKeys)+1)
  for i,(patt,count) in enumerate(maccsKeys):
    keyId = i+1
    if patt is not None:
      if count==0:
        res[keyId] = mol.HasSubstructMatch(patt)
      else:
        matches = mol.GetSubstructMatches(patt)
        if len(matches) > count:
          res[keyId] = 1
    elif keyId==125:
      # special case: num aromatic rings > 1
      # a ring counts as aromatic when every one of its bonds is aromatic
      nArom=0
      res[125]=0
      for ring in mol.GetRingInfo().BondRings():
        if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic() for bondIdx in ring):
          nArom+=1
          if nArom>1:
            res[125]=1
            break
    elif keyId==166:
      res[166]=0
      # special case: num frags > 1
      if len(Chem.GetMolFrags(mol))>1:
        res[166]=1

  return res

FingerprintMol = GenMACCSKeys

#------------------------------------
#
#  doctest boilerplate
#
294 -def _test():
295 import doctest,sys 296 return doctest.testmod(sys.modules["__main__"])

if __name__ == '__main__':
  # run the doctests and use the number of failures as the exit status
  import sys
  nFailed,nTried = _test()
  sys.exit(nFailed)