1
2
3
4
5
6 import RDConfig
7 import DataStructs
8 from DataStructs.TopNContainer import TopNContainer
9 import bisect
10
12 _picks = None
# Always raises: per the message, GenericPicker is a virtual base class and
# this stub is not meant to be executed directly.  The call form of `raise`
# behaves the same as the old `raise Cls, "msg"` form on Python 2 and is
# also valid on Python 3.
raise NotImplementedError("GenericPicker is a virtual base class")
23
25 """ A class for picking the top N overall best matches across a library
26
27 Connect to a database and build molecules:
28 >>> import Chem
29 >>> import os.path
30 >>> from Dbase.DbConnection import DbConnect
31 >>> dbName = RDConfig.RDTestDatabase
32 >>> conn = DbConnect(dbName,'simple_mols1')
33 >>> [x.upper() for x in conn.GetColumnNames()]
34 ['SMILES', 'ID']
35 >>> mols = []
36 >>> for smi,id in conn.GetData():
37 ... mol = Chem.MolFromSmiles(str(smi))
38 ... mol.SetProp('_Name',str(id))
39 ... mols.append(mol)
40 >>> len(mols)
41 12
42
43 Calculate fingerprints:
44 >>> probefps = []
45 >>> for mol in mols:
46 ... fp = Chem.RDKFingerprint(mol)
47 ... fp._id = mol.GetProp('_Name')
48 ... probefps.append(fp)
49
50 Start by finding the top matches for a single probe. This ether should pull
51 other ethers from the db:
52 >>> mol = Chem.MolFromSmiles('COC')
53 >>> probeFp = Chem.RDKFingerprint(mol)
54 >>> picker = TopNOverallPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
55 >>> len(picker)
56 2
57 >>> fp,score = picker[0]
58 >>> id = fp._id
59 >>> str(id)
60 'ether-1'
61 >>> score
62 1.0
63
64 The results come back in order:
65 >>> fp,score = picker[1]
66 >>> id = fp._id
67 >>> str(id)
68 'ether-2'
69
70 Now find the top matches for 2 probes. We'll get one ether and one acid:
71 >>> fps = []
72 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
73 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
74 >>> picker = TopNOverallPicker(numToPick=3,probeFps=fps,dataSet=probefps)
75 >>> len(picker)
76 3
77 >>> fp,score = picker[0]
78 >>> id = fp._id
79 >>> str(id)
80 'acid-1'
81 >>> fp,score = picker[1]
82 >>> id = fp._id
83 >>> str(id)
84 'ether-1'
85 >>> score
86 1.0
87 >>> fp,score = picker[2]
88 >>> id = fp._id
89 >>> str(id)
90 'acid-2'
91
92 """
"""Store the picker settings; nothing is picked at construction time.

dataSet should be a sequence of BitVectors.

The arguments are kept as-is on the instance:
  numToPick -> self.numToPick
  probeFps  -> self.probes
  dataSet   -> self.data
  simMetric -> self.simMetric
"""
# No picks exist yet; _picks stays None until they are generated.
self._picks = None
self.simMetric = simMetric
self.data = dataSet
self.probes = probeFps
self.numToPick = numToPick
105
122
124 """ A class for picking the best matches across a library
125
126 Connect to a database:
127 >>> import Chem
128 >>> import os.path
129 >>> from Dbase.DbConnection import DbConnect
130 >>> dbName = RDConfig.RDTestDatabase
131 >>> conn = DbConnect(dbName,'simple_mols1')
132 >>> [x.upper() for x in conn.GetColumnNames()]
133 ['SMILES', 'ID']
134 >>> mols = []
135 >>> for smi,id in conn.GetData():
136 ... mol = Chem.MolFromSmiles(str(smi))
137 ... mol.SetProp('_Name',str(id))
138 ... mols.append(mol)
139 >>> len(mols)
140 12
141
142 Calculate fingerprints:
143 >>> probefps = []
144 >>> for mol in mols:
145 ... fp = Chem.RDKFingerprint(mol)
146 ... fp._id = mol.GetProp('_Name')
147 ... probefps.append(fp)
148
149 Start by finding the top matches for a single probe. This ether should pull
150 other ethers from the db:
151 >>> mol = Chem.MolFromSmiles('COC')
152 >>> probeFp = Chem.RDKFingerprint(mol)
153 >>> picker = SpreadPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
154 >>> len(picker)
155 2
156 >>> fp,score = picker[0]
157 >>> id = fp._id
158 >>> str(id)
159 'ether-1'
160 >>> score
161 1.0
162
163 The results come back in order:
164 >>> fp,score = picker[1]
165 >>> id = fp._id
166 >>> str(id)
167 'ether-2'
168
169 Now find the top matches for 2 probes. We'll get one ether and one acid:
170 >>> fps = []
171 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
172 >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
173 >>> picker = SpreadPicker(numToPick=3,probeFps=fps,dataSet=probefps)
174 >>> len(picker)
175 3
176 >>> fp,score = picker[0]
177 >>> id = fp._id
178 >>> str(id)
179 'ether-1'
180 >>> score
181 1.0
182 >>> fp,score = picker[1]
183 >>> id = fp._id
184 >>> str(id)
185 'acid-1'
186 >>> score
187 1.0
188 >>> fp,score = picker[2]
189 >>> id = fp._id
190 >>> str(id)
191 'ether-2'
192
193 """
"""Store the picker settings; nothing is picked at construction time.

dataSet should be a sequence of BitVectors or, if expectPickles
is False, a set of strings that can be converted to bit vectors.

NOTE(review): expectPickles is only stored here; the picking code
visible in this file never reads it -- confirm that the string case
is actually handled somewhere.
"""
# No picks exist yet; _picks stays None until they are generated.
self._picks = None

# Behaviour switches.
self.onlyNames = onlyNames
self.expectPickles = expectPickles

# Inputs to the similarity search.
self.simMetric = simMetric
self.data = dataSet
self.probes = probeFps
self.numToPick = numToPick
211
213 if self._picks is not None and not force:
214 return
215
216
217
218 nProbes = len(self.probes)
219 scores = [None]*nProbes
220 for i in range(nProbes):
221 scores[i] = []
222 j = 0
223 fps = []
224 for origFp in self.data:
225 for i in range(nProbes):
226 score = DataStructs.FingerprintSimilarity(self.probes[i],origFp,
227 self.simMetric)
228 bisect.insort(scores[i],(score,j))
229 if len(scores[i])>=self.numToPick:
230 del scores[self.numToPick:]
231 if self.onlyNames and hasattr(origFp,'_fieldsFromDb'):
232 fps.append(origFp._fieldsFromDb[0])
233 else:
234 fps.append(origFp)
235 j+=1
236 if not silent and not j%1000:
237 print 'scored %d fps'%j
238
239
240
241
242
243
244 nPicked = 0
245 self._picks = []
246 taken = [0]*len(fps)
247 while nPicked < self.numToPick:
248 rowIdx = nPicked%len(scores)
249 row = scores[rowIdx]
250 score,idx = row.pop()
251
252 while taken[idx] and len(row):
253 score,idx = row.pop()
254 if not taken[idx]:
255 fp = fps[idx]
256 self._picks.append((fp,score))
257 taken[idx]=1
258 nPicked += 1
259
260
261
262
263
265 import doctest,sys
266 return doctest.testmod(sys.modules["__main__"])
267
if __name__ == '__main__':
    # Script entry point: call _test() and use the number of failures it
    # reports as the process exit status (0 when nothing failed).
    import sys
    nFailed, nTried = _test()
    sys.exit(nFailed)
272