1
2
3
4
5
6
""" SMARTS definitions for the publicly available MACCS keys
8 and a MACCS fingerprinter
9
10 I compared the MACCS fingerprints generated here with those from two
11 other packages (not MDL, unfortunately). Of course there are
12 disagreements between the various fingerprints still, but I think
13 these definitions work pretty well. Some notes:
14
15 1) most of the differences have to do with aromaticity
16 2) there's a discrepancy sometimes because the current RDKit
17 definitions do not require multiple matches to be distinct. e.g. the
18 SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my
19 definition. It's not clear to me what the correct behavior is.
20 3) Some keys are not fully defined in the MDL documentation
21 4) Two keys, 125 and 166, have to be done outside of SMARTS.
22 5) Key 1 (ISOTOPE) isn't defined
23
24 """
25 import Chem
26 import DataStructs
27
# Key table: key number -> (SMARTS pattern, count).
#  - count == 0: the bit is set when the pattern matches at least once
#  - otherwise:  the bit is set when the pattern matches more than count times
#  - a pattern of '?' marks a key that has no SMARTS definition
smartsPatts = {
    1: ('?', 0),  # ISOTOPE; not defined
    2: ('[#103,#104]', 0),
    3: ('[Ge,As,Se,Sn,Sb,Te,Tl,Pb,Bi]', 0),
    4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    5: ('[Sc,Ti,Y,Zr,Hf]', 0),
    6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    8: ('[!#6;!#1]1~*~*~*~1', 0),
    9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    11: ('*1~*~*~*~1', 0),
    12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    13: ('[#8]~[#7](~[#6])~[#6]', 0),
    14: ('[#16]-[#16]', 0),
    15: ('[#8]~[#6](~[#8])~[#8]', 0),
    16: ('[!#6;!#1]1~*~*~1', 0),
    17: ('[#6]#[#6]', 0),
    18: ('[B,Al,Ga,In,Tl]', 0),
    19: ('*1~*~*~*~*~*~*~1', 0),
    20: ('[Si]', 0),
    21: ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),
    22: ('*1~*~*~1', 0),
    23: ('[#7]~[#6](~[#8])~[#8]', 0),
    24: ('[#7]-[#8]', 0),
    25: ('[#7]~[#6](~[#7])~[#7]', 0),
    26: ('[#6]=;@[#6](@*)@*', 0),
    27: ('[I]', 0),
    28: ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),
    29: ('[#15]', 0),
    30: ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),
    31: ('[!#6;!#1]~[F,Cl,Br,I]', 0),
    32: ('[#6]~[#16]~[#7]', 0),
    33: ('[#7]~[#16]', 0),
    34: ('[CH2]=*', 0),
    35: ('[Li,Na,K,Rb,Cs,Fr]', 0),
    36: ('[#16R]', 0),
    37: ('[#7]~[#6](~[#8])~[#7]', 0),
    38: ('[#7]~[#6](~[#6])~[#7]', 0),
    39: ('[#8]~[#16](~[#8])~[#8]', 0),
    40: ('[#16]-[#8]', 0),
    41: ('[#6]#[#7]', 0),
    42: ('F', 0),
    43: ('[!C;!c;!#1;!H0]~*~[!C;!c;!#1;!H0]', 0),
    44: ('?', 0),  # no SMARTS definition
    45: ('[#6]=[#6]~[#7]', 0),
    46: ('Br', 0),
    47: ('[#16]~*~[#7]', 0),
    48: ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),
    49: ('[!+0]', 0),
    50: ('[#6]=[#6](~[#6])~[#6]', 0),
    51: ('[#6]~[#16]~[#8]', 0),
    52: ('[#7]~[#7]', 0),
    53: ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),
    54: ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),
    55: ('[#8]~[#16]~[#8]', 0),
    56: ('[#8]~[#7](~[#8])~[#6]', 0),
    57: ('[#8R]', 0),
    58: ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),
    59: ('[#16]!:*:*', 0),
    60: ('[#16]=[#8]', 0),
    61: ('*~[#16](~*)~*', 0),
    62: ('*@*!@*@*', 0),
    63: ('[#7]=[#8]', 0),
    64: ('*@*!@[#16]', 0),
    65: ('c:n', 0),
    66: ('[#6]~[#6](~[#6])(~[#6])~*', 0),
    67: ('[!#6;!#1]~[#16]', 0),
    68: ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),
    69: ('[!#6;!#1]~[!#6;!#1;!H0]', 0),
    70: ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),
    71: ('[#7]~[#8]', 0),
    72: ('[#8]~*~*~[#8]', 0),
    73: ('[#16]=*', 0),
    74: ('[CH3]~*~[CH3]', 0),
    75: ('*!@[#7]@*', 0),
    76: ('[#6]=[#6](~*)~*', 0),
    77: ('[#7]~*~[#7]', 0),
    78: ('[#6]=[#7]', 0),
    79: ('[#7]~*~*~[#7]', 0),
    80: ('[#7]~*~*~*~[#7]', 0),
    81: ('[#16]~*(~*)~*', 0),
    82: ('*~[CH2]~[!#6;!#1;!H0]', 0),
    83: ('[!#6;!#1]1~*~*~*~*~1', 0),
    84: ('[NH2]', 0),
    85: ('[#6]~[#7](~[#6])~[#6]', 0),
    86: ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),
    87: ('[F,Cl,Br,I]!@*@*', 0),
    88: ('[#16]', 0),
    89: ('[#8]~*~*~*~[#8]', 0),
    90: ('[$([!#6;!#1;!H0]~*~*~[CH2]~*),'
         '$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),'
         '$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]', 0),
    91: ('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),'
         '$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),'
         '$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),'
         '$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]', 0),
    92: ('[#8]~[#6](~[#7])~[#6]', 0),
    93: ('[!#6;!#1]~[CH3]', 0),
    94: ('[!#6;!#1]~[#7]', 0),
    95: ('[#7]~*~*~[#8]', 0),
    96: ('*1~*~*~*~*~1', 0),
    97: ('[#7]~*~*~*~[#8]', 0),
    98: ('[!#6;!#1]1~*~*~*~*~*~1', 0),
    99: ('[#6]=[#6]', 0),
    100: ('*~[CH2]~[#7]', 0),
    101: ('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),'
          '$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]', 0),
    102: ('[!#6;!#1]~[#8]', 0),
    103: ('Cl', 0),
    104: ('[!#6;!#1;!H0]~*~[CH2]~*', 0),
    105: ('*@*(@*)@*', 0),
    106: ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),
    107: ('[F,Cl,Br,I]~*(~*)~*', 0),
    108: ('[CH3]~*~*~*~[CH2]~*', 0),
    109: ('*~[CH2]~[#8]', 0),
    110: ('[#7]~[#6]~[#8]', 0),
    111: ('[#7]~*~[CH2]~*', 0),
    112: ('*~*(~*)(~*)~*', 0),
    113: ('[#8]!:*:*', 0),
    114: ('[CH3]~[CH2]~*', 0),
    115: ('[CH3]~*~[CH2]~*', 0),
    116: ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),
    117: ('[#7]~*~[#8]', 0),
    118: ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),
    119: ('[#7]=*', 0),
    120: ('[!#6;R]', 1),
    121: ('[#7;R]', 0),
    122: ('*~[#7](~*)~*', 0),
    123: ('[#8]~[#6]~[#8]', 0),
    124: ('[!#6;!#1]~[!#6;!#1]', 0),
    125: ('?', 0),  # computed in GenMACCSKeys
    126: ('*!@[#8]!@*', 0),
    127: ('*@*!@[#8]', 1),
    128: ('[$(*~[CH2]~*~*~*~[CH2]~*),'
          '$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),'
          '$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),'
          '$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]', 0),
    129: ('[$(*~[CH2]~*~*~[CH2]~*),'
          '$([R]1@[CH2]@[R]@[R]@[CH2;R]1),'
          '$(*~[CH2]~[R]1@[R]@[CH2;R]1)]', 0),
    130: ('[!#6;!#1]~[!#6;!#1]', 1),
    131: ('[!#6;!#1;!H0]', 1),
    132: ('[#8]~*~[CH2]~*', 0),
    133: ('*@*!@[#7]', 0),
    134: ('[F,Cl,Br,I]', 0),
    135: ('[#7]!:*:*', 0),
    136: ('[#8]=*', 1),
    137: ('[!C;!c;R]', 0),
    138: ('[!#6;!#1]~[CH2]~*', 1),
    139: ('[O;!H0]', 0),
    140: ('[#8]', 3),
    141: ('[CH3]', 2),
    142: ('[#7]', 1),
    143: ('*@*!@[#8]', 0),
    144: ('*!:*:*!:*', 0),
    145: ('*1~*~*~*~*~*~1', 1),
    146: ('[#8]', 2),
    147: ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),
    148: ('*~[!#6;!#1](~*)~*', 0),
    149: ('[C;H3,H4]', 1),
    150: ('*!@*@*!@*', 0),
    151: ('[#7;!H0]', 0),
    152: ('[#8]~[#6](~[#6])~[#6]', 0),
    153: ('[!#6;!#1]~[CH2]~*', 0),
    154: ('[#6]=[#8]', 0),
    155: ('*!@[CH2]!@*', 0),
    156: ('[#7]~*(~*)~*', 0),
    157: ('[#6]-[#8]', 0),
    158: ('[#6]-[#7]', 0),
    159: ('[#8]', 1),
    160: ('[C;H3,H4]', 0),
    161: ('[#7]', 0),
    162: ('a', 0),
    163: ('*1~*~*~*~*~*~1', 0),
    164: ('[#8]', 0),
    165: ('[R]', 0),
    166: ('?', 0),  # computed in GenMACCSKeys
}
197
# Cache of (compiled pattern, count) pairs, indexed by key number - 1.
# Stays None until the first GenMACCSKeys() call builds it.
maccsKeys = None
199
201 """ *Internal Use Only*
202
203 generates SMARTS patterns for the keys, run once
204
205 """
206 assert len(keyList) == len(keyDict.keys()),'length mismatch'
207 for key in keyDict.keys():
208 patt,count = keyDict[key]
209 if patt != '?':
210 try:
211 sma = Chem.MolFromSmarts(patt)
212 except:
213 sma = None
214 if not sma:
215 print 'SMARTS parser error for key #%d: %s'%(key,patt)
216 else:
217 keyList[key-1] = sma,count
218
def GenMACCSKeys(mol,**kwargs):
    """ generates the MACCS fingerprint for a molecule

    **Arguments**

      - mol: the molecule to be fingerprinted

      - ctor: (optional keyword argument) callable used to build the result;
        it is called with the number of bits, and the object it returns must
        support item assignment by bit index. Defaults to
        DataStructs.SparseBitVect

      - any other keyword arguments are ignored

    **Returns**

      the object built by ctor (by default a _DataStructs.SparseBitVect_)
      containing the fingerprint. Bit k corresponds to key k; the extra
      bit 0 is never set by this function.

    **Notes**

      - the compiled key table is built on the first call and cached in the
        module-level maccsKeys

      - keys 125 and 166 have no SMARTS definition and are computed directly:
        125 is set when more than one ring consists solely of aromatic bonds,
        166 when Chem.GetMolFrags reports more than one fragment

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = GenMACCSKeys(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)

    """
    global maccsKeys
    if maccsKeys is None:
        # fill a local table and publish it only once it is complete, so an
        # error raised during initialisation cannot leave a partly filled
        # table cached for every later call
        keyTable = [(None,0)]*len(smartsPatts)
        _InitKeys(keyTable,smartsPatts)
        maccsKeys = keyTable
    ctor=kwargs.get('ctor',DataStructs.SparseBitVect)

    # one extra bit so that bit indices equal the 1-based key numbers
    res = ctor(len(maccsKeys)+1)
    for i,(patt,count) in enumerate(maccsKeys):
        keyId = i+1
        if patt is not None:
            if count==0:
                # presence key: a single match is enough
                res[keyId] = mol.HasSubstructMatch(patt)
            elif len(mol.GetSubstructMatches(patt)) > count:
                # counted key: needs more than `count` matches
                res[keyId] = 1
        elif keyId==125:
            # special case: more than one ring made up only of aromatic bonds
            res[125]=0
            nArom=0
            for ring in mol.GetRingInfo().BondRings():
                if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic()
                       for bondIdx in ring):
                    nArom+=1
                    if nArom>1:
                        res[125]=1
                        break
        elif keyId==166:
            # special case: more than one fragment
            res[166]=0
            if len(Chem.GetMolFrags(mol))>1:
                res[166]=1

    return res
279
# Alternate name bound to the same function object as GenMACCSKeys.
FingerprintMol = GenMACCSKeys
281
282
283
284
285
def _test():
    """ *Internal Use Only*

    Runs the doctests found in the module registered as __main__ and
    returns the value produced by doctest.testmod().

    """
    import sys
    import doctest
    mainModule = sys.modules["__main__"]
    return doctest.testmod(mainModule)
289
if __name__ == '__main__':
    # run the doctests and use the number of failures as the exit status
    import sys
    nFailed,nTried = _test()
    sys.exit(nFailed)
294