Package Chem :: Module MACCSkeys
[hide private]
[frames | no frames]

Source Code for Module Chem.MACCSkeys

  1  # $Id: MACCSkeys.py 775 2008-07-24 18:38:28Z glandrum $ 
  2  # 
  3  # Copyright (C) 2001-2008 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ SMARTS definitions for the publicly available MACCS keys 
  8  and a MACCS fingerprinter 
  9   
 10  I compared the MACCS fingerprints generated here with those from two 
 11  other packages (not MDL, unfortunately). Of course there are 
 12  disagreements between the various fingerprints still, but I think 
 13  these definitions work pretty well. Some notes: 
 14   
 15  1) most of the differences have to do with aromaticity 
 16  2) there's a discrepancy sometimes because the current RDKit 
 17  definitions do not require multiple matches to be distinct. e.g. the 
 18  SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my 
 19  definition. It's not clear to me what the correct behavior is. 
 20  3) Some keys are not fully defined in the MDL documentation 
 21  4) Two keys, 125 and 166, have to be done outside of SMARTS. 
 22  5) Key 1 (ISOTOPE) isn't defined 
 23   
 24  """ 
 25  import Chem 
 26  import DataStructs 
# these are SMARTS patterns corresponding to the MDL MACCS keys
#
# each entry maps a key number to a (SMARTS, count) tuple:
#   - count == 0: GenMACCSKeys sets the bit if the pattern matches at all
#   - otherwise:  GenMACCSKeys sets the bit only if the number of matches
#                 is greater than count
#   - a SMARTS of '?' marks a key that has no pattern; _InitKeys skips
#     these entries.  GenMACCSKeys computes keys 125 and 166 by hand; the
#     other '?' keys (1 and 44) are never set.
smartsPatts={
  1:('?',0), # ISOTOPE
  #2:('[#103,#104,#105,#106,#107,#106,#109,#110,#111,#112]',0),  # ISOTOPE Not complete
  2:('[#103,#104]',0),  # ISOTOPE Not complete
  3:('[Ge,As,Se,Sn,Sb,Te,Tl,Pb,Bi]',0), # Group IVa,Va,VIa Periods 4-6 (Ge...)  *NOTE* spec wrong
  4:('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]',0), # actinide
  5:('[Sc,Ti,Y,Zr,Hf]',0), # Group IIIB,IVB (Sc...)  *NOTE* spec wrong
  6:('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]',0), # Lanthanide
  7:('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]',0), # Group VB,VIB,VIIB (V...) *NOTE* spec wrong
  8:('[!#6;!#1]1~*~*~*~1',0), # QAAA@1
  9:('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]',0), # Group VIII (Fe...)
  10:('[Be,Mg,Ca,Sr,Ba,Ra]',0), # Group IIa (Alkaline earth)
  11:('*1~*~*~*~1',0), # 4M Ring
  12:('[Cu,Zn,Ag,Cd,Au,Hg]',0), # Group IB,IIB (Cu..)
  13:('[#8]~[#7](~[#6])~[#6]',0), # ON(C)C
  14:('[#16]-[#16]',0), # S-S
  15:('[#8]~[#6](~[#8])~[#8]',0), # OC(O)O
  16:('[!#6;!#1]1~*~*~1',0), # QAA@1
  17:('[#6]#[#6]',0), #CTC
  18:('[B,Al,Ga,In,Tl]',0), # Group IIIA (B...) *NOTE* spec wrong
  19:('*1~*~*~*~*~*~*~1',0), # 7M Ring
  20:('[Si]',0), #Si
  21:('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]',0), # C=C(Q)Q
  22:('*1~*~*~1',0), # 3M Ring
  23:('[#7]~[#6](~[#8])~[#8]',0), # NC(O)O
  24:('[#7]-[#8]',0), # N-O
  25:('[#7]~[#6](~[#7])~[#7]',0), # NC(N)N
  26:('[#6]=;@[#6](@*)@*',0), # C$=C($A)$A
  27:('[I]',0), # I
  28:('[!#6;!#1]~[CH2]~[!#6;!#1]',0), # QCH2Q
  29:('[#15]',0),# P
  30:('[#6]~[!#6;!#1](~[#6])(~[#6])~*',0), # CQ(C)(C)A
  31:('[!#6;!#1]~[F,Cl,Br,I]',0), # QX
  32:('[#6]~[#16]~[#7]',0), # CSN
  33:('[#7]~[#16]',0), # NS
  34:('[CH2]=*',0), # CH2=A
  35:('[Li,Na,K,Rb,Cs,Fr]',0), # Group IA (Alkali Metal)
  36:('[#16R]',0), # S Heterocycle
  37:('[#7]~[#6](~[#8])~[#7]',0), # NC(O)N
  38:('[#7]~[#6](~[#6])~[#7]',0), # NC(C)N
  39:('[#8]~[#16](~[#8])~[#8]',0), # OS(O)O
  40:('[#16]-[#8]',0), # S-O
  41:('[#6]#[#7]',0), # CTN
  42:('F',0), # F
  43:('[!C;!c;!#1;!H0]~*~[!C;!c;!#1;!H0]',0), # QHAQH
  44:('?',0), # OTHER
  45:('[#6]=[#6]~[#7]',0), # C=CN
  46:('Br',0), # BR
  47:('[#16]~*~[#7]',0), # SAN
  48:('[#8]~[!#6;!#1](~[#8])(~[#8])',0), # OQ(O)O
  49:('[!+0]',0), # CHARGE
  50:('[#6]=[#6](~[#6])~[#6]',0), # C=C(C)C
  51:('[#6]~[#16]~[#8]',0), # CSO
  52:('[#7]~[#7]',0), # NN
  53:('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]',0), # QHAAAQH
  54:('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]',0), # QHAAQH
  55:('[#8]~[#16]~[#8]',0), #OSO
  56:('[#8]~[#7](~[#8])~[#6]',0), # ON(O)C
  57:('[#8R]',0), # O Heterocycle
  58:('[!#6;!#1]~[#16]~[!#6;!#1]',0), # QSQ
  59:('[#16]!:*:*',0), # Snot%A%A
  60:('[#16]=[#8]',0), # S=O
  61:('*~[#16](~*)~*',0), # AS(A)A
  62:('*@*!@*@*',0), # A$!A$A
  63:('[#7]=[#8]',0), # N=O
  64:('*@*!@[#16]',0), # A$A!S
  65:('c:n',0), # C%N
  66:('[#6]~[#6](~[#6])(~[#6])~*',0), # CC(C)(C)A
  67:('[!#6;!#1]~[#16]',0), # QS
  68:('[!#6;!#1;!H0]~[!#6;!#1;!H0]',0), # QHQH (&...) FIX: incomplete definition
  69:('[!#6;!#1]~[!#6;!#1;!H0]',0), # QQH
  70:('[!#6;!#1]~[#7]~[!#6;!#1]',0), # QNQ
  71:('[#7]~[#8]',0), # NO
  72:('[#8]~*~*~[#8]',0), # OAAO
  73:('[#16]=*',0), # S=A
  74:('[CH3]~*~[CH3]',0), # CH3ACH3
  75:('*!@[#7]@*',0), # A!N$A
  76:('[#6]=[#6](~*)~*',0), # C=C(A)A
  77:('[#7]~*~[#7]',0), # NAN
  78:('[#6]=[#7]',0), # C=N
  79:('[#7]~*~*~[#7]',0), # NAAN
  80:('[#7]~*~*~*~[#7]',0), # NAAAN
  81:('[#16]~*(~*)~*',0), # SA(A)A
  82:('*~[CH2]~[!#6;!#1;!H0]',0), # ACH2QH
  83:('[!#6;!#1]1~*~*~*~*~1',0), # QAAAA@1
  84:('[NH2]',0), #NH2
  85:('[#6]~[#7](~[#6])~[#6]',0), # CN(C)C
  86:('[C;H2,H3][!#6;!#1][C;H2,H3]',0), # CH2QCH2
  87:('[F,Cl,Br,I]!@*@*',0), # X!A$A
  88:('[#16]',0), # S
  89:('[#8]~*~*~*~[#8]',0), # OAAAO
  90:('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',0), # QHAACH2A
  91:('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',0), # QHAAACH2A
  92:('[#8]~[#6](~[#7])~[#6]',0), # OC(N)C
  93:('[!#6;!#1]~[CH3]',0), # QCH3
  94:('[!#6;!#1]~[#7]',0), # QN
  95:('[#7]~*~*~[#8]',0), # NAAO
  96:('*1~*~*~*~*~1',0), # 5 M ring
  97:('[#7]~*~*~*~[#8]',0), # NAAAO
  98:('[!#6;!#1]1~*~*~*~*~*~1',0), # QAAAAA@1
  99:('[#6]=[#6]',0), # C=C
  100:('*~[CH2]~[#7]',0), # ACH2N
  101:('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',0), # 8M Ring or larger. This only handles up to ring sizes of 14
  102:('[!#6;!#1]~[#8]',0), # QO
  103:('Cl',0), # CL
  104:('[!#6;!#1;!H0]~*~[CH2]~*',0), # QHACH2A
  105:('*@*(@*)@*',0), # A$A($A)$A
  106:('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]',0), # QA(Q)Q
  107:('[F,Cl,Br,I]~*(~*)~*',0), # XA(A)A
  108:('[CH3]~*~*~*~[CH2]~*',0), # CH3AAACH2A
  109:('*~[CH2]~[#8]',0), # ACH2O
  110:('[#7]~[#6]~[#8]',0), # NCO
  111:('[#7]~*~[CH2]~*',0), # NACH2A
  112:('*~*(~*)(~*)~*',0), # AA(A)(A)A
  113:('[#8]!:*:*',0), # Onot%A%A
  114:('[CH3]~[CH2]~*',0), # CH3CH2A
  115:('[CH3]~*~[CH2]~*',0), # CH3ACH2A
  116:('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]',0), # CH3AACH2A
  117:('[#7]~*~[#8]',0), # NAO
  118:('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]',1), # ACH2CH2A > 1
  119:('[#7]=*',0), # N=A
  120:('[!#6;R]',1), # Heterocyclic atom > 1 (&...) FIX: incomplete definition
  121:('[#7;R]',0), # N Heterocycle
  122:('*~[#7](~*)~*',0), # AN(A)A
  123:('[#8]~[#6]~[#8]',0), # OCO
  124:('[!#6;!#1]~[!#6;!#1]',0), # QQ
  125:('?',0), # Aromatic Ring > 1
  126:('*!@[#8]!@*',0), # A!O!A
  127:('*@*!@[#8]',1), # A$A!O > 1 (&...) FIX: incomplete definition
  128:('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',0), # ACH2AAACH2A
  129:('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',0), # ACH2AACH2A
  130:('[!#6;!#1]~[!#6;!#1]',1), # QQ > 1 (&...)  FIX: incomplete definition
  131:('[!#6;!#1;!H0]',1), # QH > 1
  132:('[#8]~*~[CH2]~*',0), # OACH2A
  133:('*@*!@[#7]',0), # A$A!N
  134:('[F,Cl,Br,I]',0), # X (HALOGEN)
  135:('[#7]!:*:*',0), # Nnot%A%A
  136:('[#8]=*',1), # O=A>1
  137:('[!C;!c;R]',0), # Heterocycle
  138:('[!#6;!#1]~[CH2]~*',1), # QCH2A>1 (&...) FIX: incomplete definition
  139:('[O;!H0]',0), # OH
  140:('[#8]',3), # O > 3 (&...) FIX: incomplete definition
  141:('[CH3]',2), # CH3 > 2  (&...) FIX: incomplete definition
  142:('[#7]',1), # N > 1
  143:('*@*!@[#8]',0), # A$A!O
  144:('*!:*:*!:*',0), # Anot%A%Anot%A
  145:('*1~*~*~*~*~*~1',1), # 6M ring > 1
  146:('[#8]',2), # O > 2
  147:('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]',0), # ACH2CH2A
  148:('*~[!#6;!#1](~*)~*',0), # AQ(A)A
  149:('[C;H3,H4]',1), # CH3 > 1
  150:('*!@*@*!@*',0), # A!A$A!A
  151:('[#7;!H0]',0), # NH
  152:('[#8]~[#6](~[#6])~[#6]',0), # OC(C)C
  153:('[!#6;!#1]~[CH2]~*',0), # QCH2A
  154:('[#6]=[#8]',0), # C=O
  155:('*!@[CH2]!@*',0), # A!CH2!A
  156:('[#7]~*(~*)~*',0), # NA(A)A
  157:('[#6]-[#8]',0), # C-O
  158:('[#6]-[#7]',0), # C-N
  159:('[#8]',1), # O>1
  160:('[C;H3,H4]',0), #CH3
  161:('[#7]',0), # N
  162:('a',0), # Aromatic
  163:('*1~*~*~*~*~*~1',0), # 6M Ring
  164:('[#8]',0), # O
  165:('[R]',0), # Ring
  166:('?',0), # Fragments  FIX: this can't be done in SMARTS
  }

# cache of (compiled pattern, count) tuples, indexed by key number - 1;
# stays None until GenMACCSKeys builds it on its first call
maccsKeys = None
199   
200 -def _InitKeys(keyList,keyDict):
201 """ *Internal Use Only* 202 203 generates SMARTS patterns for the keys, run once 204 205 """ 206 assert len(keyList) == len(keyDict.keys()),'length mismatch' 207 for key in keyDict.keys(): 208 patt,count = keyDict[key] 209 if patt != '?': 210 try: 211 sma = Chem.MolFromSmarts(patt) 212 except: 213 sma = None 214 if not sma: 215 print 'SMARTS parser error for key #%d: %s'%(key,patt) 216 else: 217 keyList[key-1] = sma,count
218
def GenMACCSKeys(mol,**kwargs):
  """ generates the MACCS fingerprint for a molecule

   **Arguments**

     - mol: the molecule to be fingerprinted

     - ctor: (optional, keyword only) callable used to construct the
       result; it is called with the number of bits and defaults to
       DataStructs.SparseBitVect

     - any other keyword arguments are ignored

   **Returns**

      a _DataStructs.SparseBitVect_ (or whatever ctor returns)
      containing the fingerprint.  Bit N corresponds to MACCS key N;
      this function never sets bit 0.

  >>> m = Chem.MolFromSmiles('CNO')
  >>> bv = GenMACCSKeys(m)
  >>> tuple(bv.GetOnBits())
  (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
  >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
  >>> tuple(bv.GetOnBits())
  (74, 114, 149, 155, 160)

  """
  global maccsKeys
  if maccsKeys is None:
    # build the table in a local and publish it only once it is complete,
    # so a failure inside _InitKeys cannot leave an empty cache behind
    # that later calls would silently use
    keys = [(None,0)]*len(smartsPatts)
    _InitKeys(keys,smartsPatts)
    maccsKeys = keys
  ctor = kwargs.get('ctor',DataStructs.SparseBitVect)

  res = ctor(len(maccsKeys)+1)
  for i,(patt,count) in enumerate(maccsKeys):
    keyId = i+1
    if patt is not None:
      if count==0:
        # plain presence/absence key
        res[keyId] = mol.HasSubstructMatch(patt)
      else:
        # counted key: needs more than `count` matches
        matches = mol.GetSubstructMatches(patt)
        if len(matches) > count:
          res[keyId] = 1
    elif keyId==125:
      # special case: num aromatic rings > 1
      res[125] = 0
      nArom = 0
      for ring in mol.GetRingInfo().BondRings():
        # a ring counts as aromatic only if every one of its bonds is
        if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic() for bondIdx in ring):
          nArom += 1
          if nArom>1:
            res[125] = 1
            break
    elif keyId==166:
      # special case: num frags > 1
      res[166] = 0
      if len(Chem.GetMolFrags(mol))>1:
        res[166] = 1

  return res

# FingerprintMol is a second name bound to the same function object
# as GenMACCSKeys
FingerprintMol = GenMACCSKeys

#------------------------------------
#
#  doctest boilerplate
#
286 -def _test():
287 import doctest,sys 288 return doctest.testmod(sys.modules["__main__"])

if __name__ == '__main__':
  # run the doctests and hand the failure count back as the exit status
  import sys
  nFailed,nTried = _test()
  sys.exit(nFailed)
