1
2
3
4
5
6
7
8
9
10
""" SMARTS definitions for the publicly available MACCS keys
12 and a MACCS fingerprinter
13
14 I compared the MACCS fingerprints generated here with those from two
15 other packages (not MDL, unfortunately). Of course there are
16 disagreements between the various fingerprints still, but I think
17 these definitions work pretty well. Some notes:
18
19 1) most of the differences have to do with aromaticity
20 2) there's a discrepancy sometimes because the current RDKit
21 definitions do not require multiple matches to be distinct. e.g. the
22 SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my
23 definition. It's not clear to me what the correct behavior is.
24 3) Some keys are not fully defined in the MDL documentation
25 4) Two keys, 125 and 166, have to be done outside of SMARTS.
26 5) Key 1 (ISOTOPE) isn't defined
27
28 Rev history:
29 2006 (gl): Original open-source release
30 May 2011 (gl): Update some definitions based on feedback from Andrew Dalke
31
32 """
33 from rdkit import Chem
34 from rdkit import DataStructs
35
# MACCS key definitions.
#
# Maps the 1-based key number to a (SMARTS, count) 2-tuple:
#   - SMARTS: the substructure query for the key. '?' marks a key without a
#     SMARTS definition; _InitKeys does not build a query for such keys.
#   - count: match threshold used by GenMACCSKeys. With count == 0 the bit is
#     set if the pattern matches at all; otherwise the bit is set only when
#     the molecule has more than `count` matches.
smartsPatts={
    1:('?',0),  # ISOTOPE: not defined, this bit is never set

    2:('[#104]',0),
    3:('[#32,#33,#34,#50,#51,#52,#82,#83,#84]',0),
    4:('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]',0),
    5:('[Sc,Ti,Y,Zr,Hf]',0),
    6:('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]',0),
    7:('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]',0),
    8:('[!#6;!#1]1~*~*~*~1',0),
    9:('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]',0),
    10:('[Be,Mg,Ca,Sr,Ba,Ra]',0),
    11:('*1~*~*~*~1',0),
    12:('[Cu,Zn,Ag,Cd,Au,Hg]',0),
    13:('[#8]~[#7](~[#6])~[#6]',0),
    14:('[#16]-[#16]',0),
    15:('[#8]~[#6](~[#8])~[#8]',0),
    16:('[!#6;!#1]1~*~*~1',0),
    17:('[#6]#[#6]',0),
    18:('[#5,#13,#31,#49,#81]',0),
    19:('*1~*~*~*~*~*~*~1',0),
    20:('[Si]',0),
    21:('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]',0),
    22:('*1~*~*~1',0),
    23:('[#7]~[#6](~[#8])~[#8]',0),
    24:('[#7]-[#8]',0),
    25:('[#7]~[#6](~[#7])~[#7]',0),
    26:('[#6]=;@[#6](@*)@*',0),
    27:('[I]',0),
    28:('[!#6;!#1]~[CH2]~[!#6;!#1]',0),
    29:('[#15]',0),
    30:('[#6]~[!#6;!#1](~[#6])(~[#6])~*',0),
    31:('[!#6;!#1]~[F,Cl,Br,I]',0),
    32:('[#6]~[#16]~[#7]',0),
    33:('[#7]~[#16]',0),
    34:('[CH2]=*',0),
    35:('[Li,Na,K,Rb,Cs,Fr]',0),
    36:('[#16R]',0),
    37:('[#7]~[#6](~[#8])~[#7]',0),
    38:('[#7]~[#6](~[#6])~[#7]',0),
    39:('[#8]~[#16](~[#8])~[#8]',0),
    40:('[#16]-[#8]',0),
    41:('[#6]#[#7]',0),
    42:('F',0),
    43:('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]',0),
    44:('?',0),  # no SMARTS and no special-case code in GenMACCSKeys: this bit is never set
    45:('[#6]=[#6]~[#7]',0),
    46:('Br',0),
    47:('[#16]~*~[#7]',0),
    48:('[#8]~[!#6;!#1](~[#8])(~[#8])',0),
    49:('[!+0]',0),
    50:('[#6]=[#6](~[#6])~[#6]',0),
    51:('[#6]~[#16]~[#8]',0),
    52:('[#7]~[#7]',0),
    53:('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]',0),
    54:('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]',0),
    55:('[#8]~[#16]~[#8]',0),
    56:('[#8]~[#7](~[#8])~[#6]',0),
    57:('[#8R]',0),
    58:('[!#6;!#1]~[#16]~[!#6;!#1]',0),
    59:('[#16]!:*:*',0),
    60:('[#16]=[#8]',0),
    61:('*~[#16](~*)~*',0),
    62:('*@*!@*@*',0),
    63:('[#7]=[#8]',0),
    64:('*@*!@[#16]',0),
    65:('c:n',0),
    66:('[#6]~[#6](~[#6])(~[#6])~*',0),
    67:('[!#6;!#1]~[#16]',0),
    68:('[!#6;!#1;!H0]~[!#6;!#1;!H0]',0),
    69:('[!#6;!#1]~[!#6;!#1;!H0]',0),
    70:('[!#6;!#1]~[#7]~[!#6;!#1]',0),
    71:('[#7]~[#8]',0),
    72:('[#8]~*~*~[#8]',0),
    73:('[#16]=*',0),
    74:('[CH3]~*~[CH3]',0),
    75:('*!@[#7]@*',0),
    76:('[#6]=[#6](~*)~*',0),
    77:('[#7]~*~[#7]',0),
    78:('[#6]=[#7]',0),
    79:('[#7]~*~*~[#7]',0),
    80:('[#7]~*~*~*~[#7]',0),
    81:('[#16]~*(~*)~*',0),
    82:('*~[CH2]~[!#6;!#1;!H0]',0),
    83:('[!#6;!#1]1~*~*~*~*~1',0),
    84:('[NH2]',0),
    85:('[#6]~[#7](~[#6])~[#6]',0),
    86:('[C;H2,H3][!#6;!#1][C;H2,H3]',0),
    87:('[F,Cl,Br,I]!@*@*',0),
    88:('[#16]',0),
    89:('[#8]~*~*~*~[#8]',0),
    90:('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',0),
    91:('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',0),
    92:('[#8]~[#6](~[#7])~[#6]',0),
    93:('[!#6;!#1]~[CH3]',0),
    94:('[!#6;!#1]~[#7]',0),
    95:('[#7]~*~*~[#8]',0),
    96:('*1~*~*~*~*~1',0),
    97:('[#7]~*~*~*~[#8]',0),
    98:('[!#6;!#1]1~*~*~*~*~*~1',0),
    99:('[#6]=[#6]',0),
    100:('*~[CH2]~[#7]',0),
    101:('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',0),
    102:('[!#6;!#1]~[#8]',0),
    103:('Cl',0),
    104:('[!#6;!#1;!H0]~*~[CH2]~*',0),
    105:('*@*(@*)@*',0),
    106:('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]',0),
    107:('[F,Cl,Br,I]~*(~*)~*',0),
    108:('[CH3]~*~*~*~[CH2]~*',0),
    109:('*~[CH2]~[#8]',0),
    110:('[#7]~[#6]~[#8]',0),
    111:('[#7]~*~[CH2]~*',0),
    112:('*~*(~*)(~*)~*',0),
    113:('[#8]!:*:*',0),
    114:('[CH3]~[CH2]~*',0),
    115:('[CH3]~*~[CH2]~*',0),
    116:('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]',0),
    117:('[#7]~*~[#8]',0),
    118:('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]',1),
    119:('[#7]=*',0),
    120:('[!#6;R]',1),
    121:('[#7;R]',0),
    122:('*~[#7](~*)~*',0),
    123:('[#8]~[#6]~[#8]',0),
    124:('[!#6;!#1]~[!#6;!#1]',0),
    125:('?',0),  # computed in GenMACCSKeys: set when more than one ring is fully aromatic
    126:('*!@[#8]!@*',0),
    127:('*@*!@[#8]',1),
    128:('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',0),
    129:('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',0),
    130:('[!#6;!#1]~[!#6;!#1]',1),
    131:('[!#6;!#1;!H0]',1),
    132:('[#8]~*~[CH2]~*',0),
    133:('*@*!@[#7]',0),
    134:('[F,Cl,Br,I]',0),
    135:('[#7]!:*:*',0),
    136:('[#8]=*',1),
    137:('[!C;!c;R]',0),
    138:('[!#6;!#1]~[CH2]~*',1),
    139:('[O;!H0]',0),
    140:('[#8]',3),
    141:('[CH3]',2),
    142:('[#7]',1),
    143:('*@*!@[#8]',0),
    144:('*!:*:*!:*',0),
    145:('*1~*~*~*~*~*~1',1),
    146:('[#8]',2),
    147:('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]',0),
    148:('*~[!#6;!#1](~*)~*',0),
    149:('[C;H3,H4]',1),
    150:('*!@*@*!@*',0),
    151:('[#7;!H0]',0),
    152:('[#8]~[#6](~[#6])~[#6]',0),
    153:('[!#6;!#1]~[CH2]~*',0),
    154:('[#6]=[#8]',0),
    155:('*!@[CH2]!@*',0),
    156:('[#7]~*(~*)~*',0),
    157:('[#6]-[#8]',0),
    158:('[#6]-[#7]',0),
    159:('[#8]',1),
    160:('[C;H3,H4]',0),
    161:('[#7]',0),
    162:('a',0),
    163:('*1~*~*~*~*~*~1',0),
    164:('[#8]',0),
    165:('[R]',0),
    166:('?',0),  # computed in GenMACCSKeys: set when the molecule has more than one fragment
    }

# Cache of (query, count) tuples indexed by key number - 1. It stays None
# until the first call to GenMACCSKeys, which builds it from smartsPatts.
maccsKeys = None
207
209 """ *Internal Use Only*
210
211 generates SMARTS patterns for the keys, run once
212
213 """
214 assert len(keyList) == len(keyDict.keys()),'length mismatch'
215 for key in keyDict.keys():
216 patt,count = keyDict[key]
217 if patt != '?':
218 try:
219 sma = Chem.MolFromSmarts(patt)
220 except:
221 sma = None
222 if not sma:
223 print 'SMARTS parser error for key #%d: %s'%(key,patt)
224 else:
225 keyList[key-1] = sma,count
226
def GenMACCSKeys(mol,**kwargs):
    """ generates the MACCS fingerprint for a molecule

    **Arguments**

      - mol: the molecule to be fingerprinted

      - ctor: (optional, keyword only) callable used to build the result.
        It is called with the number of bits (number of keys + 1) and must
        return an object that supports item assignment by bit index.
        Defaults to DataStructs.SparseBitVect.

      - any other keyword arguments are ignored

    **Returns**

      a _DataStructs.SparseBitVect_ containing the fingerprint (or the
      object built by ctor, if one was provided). Bit N corresponds to
      key N; bit 0 is never set.

    **Notes**

      - the key table is compiled from smartsPatts on the first call and
        cached in the module-level maccsKeys

      - a key with a count of 0 is set if its pattern matches at all; any
        other key is set only if the molecule has more than count matches

      - keys 125 and 166 are computed in code; other keys without a
        pattern are never set

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = GenMACCSKeys(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)

    """
    global maccsKeys
    if maccsKeys is None:
        # Build the table in a local and publish it only once it is complete,
        # so an error during initialization cannot leave a half-filled cache
        # behind for later calls to pick up.
        keys = [(None,0)]*len(smartsPatts)
        _InitKeys(keys,smartsPatts)
        maccsKeys = keys
    ctor = kwargs.get('ctor',DataStructs.SparseBitVect)

    # one extra bit so that key N can live at bit N
    res = ctor(len(maccsKeys)+1)
    for i,(patt,count) in enumerate(maccsKeys):
        keyNum = i+1
        if patt is not None:
            if count==0:
                res[keyNum] = mol.HasSubstructMatch(patt)
            elif len(mol.GetSubstructMatches(patt)) > count:
                res[keyNum] = 1
        elif keyNum==125:
            # special case: more than one aromatic ring, where a ring counts
            # as aromatic only if every one of its bonds is aromatic
            ri = mol.GetRingInfo()
            nArom = 0
            res[125] = 0
            for ring in ri.BondRings():
                if all(mol.GetBondWithIdx(bondIdx).GetIsAromatic() for bondIdx in ring):
                    nArom += 1
                    if nArom>1:
                        res[125] = 1
                        break
        elif keyNum==166:
            # special case: more than one fragment
            res[166] = 0
            if len(Chem.GetMolFrags(mol))>1:
                res[166] = 1

    return res
287
# Alias: the MACCS fingerprinter can also be called as FingerprintMol.
FingerprintMol = GenMACCSKeys
289
290
291
292
293
295 import doctest,sys
296 return doctest.testmod(sys.modules["__main__"])
297
if __name__ == '__main__':
    # Script mode: run the doctests and use the number of failures as the
    # process exit status, so a clean run exits with 0.
    import sys
    nFailed,nTried = _test()
    sys.exit(nFailed)
302