Package SimDivFilters :: Module SimilarityPickers
[hide private]
[frames | no frames]

Source Code for Module SimDivFilters.SimilarityPickers

  1  # $Id: SimilarityPickers.py 672 2008-05-17 05:28:32Z glandrum $ 
  2  # 
  3  # Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC 
  4  #  All Rights Reserved 
  5  # 
  6  import RDConfig 
  7  import DataStructs 
  8  from DataStructs.TopNContainer import TopNContainer 
  9  import bisect 
 10   
class GenericPicker(object):
    """ Virtual base class for the pickers in this module.

    A picker behaves like a read-only sequence of picks.  The picks are
    computed lazily: the first call to len() or the first index access
    invokes MakePicks(), which a subclass must implement so that it fills
    in self._picks.

    """
    # cached picks; None means MakePicks() has not filled them in yet
    _picks = None

    def MakePicks(self, force=0):
        """ Computes the picks and stores them in self._picks.

        This base-class version always raises NotImplementedError.

        """
        # the call form of raise is valid on both Python 2 and Python 3;
        # the older "raise Class, message" form is a syntax error on Python 3
        raise NotImplementedError("GenericPicker is a virtual base class")

    def __len__(self):
        """ Returns the number of picks, computing them first if necessary. """
        if self._picks is None:
            self.MakePicks()
        return len(self._picks)

    def __getitem__(self, which):
        """ Returns pick number `which`, computing the picks first if necessary. """
        if self._picks is None:
            self.MakePicks()
        return self._picks[which]
23
class TopNOverallPicker(GenericPicker):
    """ A class for picking the top N overall best matches across a library

    Each entry of the data set is scored by its best similarity to any of the
    probes; the numToPick highest-scoring entries are the picks.

    Connect to a database and build molecules:
    >>> import Chem
    >>> import os.path
    >>> from Dbase.DbConnection import DbConnect
    >>> dbName = RDConfig.RDTestDatabase
    >>> conn = DbConnect(dbName,'simple_mols1')
    >>> [x.upper() for x in conn.GetColumnNames()]
    ['SMILES', 'ID']
    >>> mols = []
    >>> for smi,id in conn.GetData():
    ...   mol = Chem.MolFromSmiles(str(smi))
    ...   mol.SetProp('_Name',str(id))
    ...   mols.append(mol)
    >>> len(mols)
    12

    Calculate fingerprints:
    >>> probefps = []
    >>> for mol in mols:
    ...   fp = Chem.RDKFingerprint(mol)
    ...   fp._id = mol.GetProp('_Name')
    ...   probefps.append(fp)

    Start by finding the top matches for a single probe.  This ether should pull
    other ethers from the db:
    >>> mol = Chem.MolFromSmiles('COC')
    >>> probeFp = Chem.RDKFingerprint(mol)
    >>> picker = TopNOverallPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
    >>> len(picker)
    2
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0

    The results come back in order:
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    Now find the top 3 matches for 2 probes.  The picks are the best overall,
    whichever probe they matched; here that is two acids and one ether:
    >>> fps = []
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
    >>> picker = TopNOverallPicker(numToPick=3,probeFps=fps,dataSet=probefps)
    >>> len(picker)
    3
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'acid-1'
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0
    >>> fp,score = picker[2]
    >>> id = fp._id
    >>> str(id)
    'acid-2'

    """

    def __init__(self, numToPick=10, probeFps=None, dataSet=None,
                 simMetric=DataStructs.TanimotoSimilarity):
        """

          numToPick: capacity handed to the TopNContainer that collects the picks
          probeFps:  the fingerprints each data set entry is compared against
          dataSet:   should be a sequence of BitVectors
          simMetric: passed through to DataStructs.FingerprintSimilarity

        """
        self._picks = None
        self.simMetric = simMetric
        self.data = dataSet
        self.probes = probeFps
        self.numToPick = numToPick

    def MakePicks(self, force=0):
        """ Scores the data set and fills in self._picks with (fp, score) tuples.

        Does nothing if picks are already cached, unless force is true.

        """
        if not force and self._picks is not None:
            return

        best = TopNContainer(self.numToPick)
        for candidate in self.data:
            # a candidate is ranked by its best similarity to any of the probes
            topScore = -1.0
            for probe in self.probes:
                sim = DataStructs.FingerprintSimilarity(candidate, probe,
                                                        self.simMetric)
                topScore = max(sim, topScore)
            best.Insert(topScore, candidate)

        # the container yields (score, item); callers get (item, score).
        # NOTE(review): presumably the container iterates worst-first, so the
        # reversal puts the best pick at index 0 (as the doctest above expects)
        self._picks = [(pt, score) for score, pt in best]
        self._picks.reverse()
122
class SpreadPicker(GenericPicker):
    """ A class for picking the best matches across a library

    The picks are spread across the probes: the picker cycles through the
    probes and, for each one in turn, takes that probe's best-scoring entry
    that has not been picked already.

    Connect to a database:
    >>> import Chem
    >>> import os.path
    >>> from Dbase.DbConnection import DbConnect
    >>> dbName = RDConfig.RDTestDatabase
    >>> conn = DbConnect(dbName,'simple_mols1')
    >>> [x.upper() for x in conn.GetColumnNames()]
    ['SMILES', 'ID']
    >>> mols = []
    >>> for smi,id in conn.GetData():
    ...   mol = Chem.MolFromSmiles(str(smi))
    ...   mol.SetProp('_Name',str(id))
    ...   mols.append(mol)
    >>> len(mols)
    12

    Calculate fingerprints:
    >>> probefps = []
    >>> for mol in mols:
    ...   fp = Chem.RDKFingerprint(mol)
    ...   fp._id = mol.GetProp('_Name')
    ...   probefps.append(fp)

    Start by finding the top matches for a single probe.  This ether should pull
    other ethers from the db:
    >>> mol = Chem.MolFromSmiles('COC')
    >>> probeFp = Chem.RDKFingerprint(mol)
    >>> picker = SpreadPicker(numToPick=2,probeFps=[probeFp],dataSet=probefps)
    >>> len(picker)
    2
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0

    The results come back in order:
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    Now find the top matches for 2 probes.  The picks alternate between the
    probes, so we get an ether, an acid, and then another ether:
    >>> fps = []
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('COC')))
    >>> fps.append(Chem.RDKFingerprint(Chem.MolFromSmiles('CC(=O)O')))
    >>> picker = SpreadPicker(numToPick=3,probeFps=fps,dataSet=probefps)
    >>> len(picker)
    3
    >>> fp,score = picker[0]
    >>> id = fp._id
    >>> str(id)
    'ether-1'
    >>> score
    1.0
    >>> fp,score = picker[1]
    >>> id = fp._id
    >>> str(id)
    'acid-1'
    >>> score
    1.0
    >>> fp,score = picker[2]
    >>> id = fp._id
    >>> str(id)
    'ether-2'

    """

    def __init__(self, numToPick=10, probeFps=None, dataSet=None,
                 simMetric=DataStructs.TanimotoSimilarity,
                 expectPickles=True, onlyNames=False):
        """

          dataSet should be a sequence of BitVectors or, if expectPickles
          is False, a set of strings that can be converted to bit vectors

          numToPick:     the number of picks to make
          probeFps:      the fingerprints each data set entry is compared against
          simMetric:     passed through to DataStructs.FingerprintSimilarity
          expectPickles: stored on the instance; MakePicks() in this class
                         does not consult it
          onlyNames:     if true, a data set entry that has a _fieldsFromDb
                         attribute is reported as _fieldsFromDb[0] instead of
                         as the entry itself

        """
        self.numToPick = numToPick
        self.probes = probeFps
        self.data = dataSet
        self.simMetric = simMetric
        self.expectPickles = expectPickles
        self.onlyNames = onlyNames

        self._picks = None

    def MakePicks(self, force=0, silent=True):
        """ Scores the data set and fills in self._picks with (fp, score) tuples.

        Does nothing if picks are already cached, unless force is true.
        Unless silent is true, a progress line is printed after every 1000
        scored entries.

        At most numToPick picks are made; fewer if the data set is smaller
        than that, and none at all if there are no probes.

        """
        if self._picks is not None and not force:
            return

        # start by getting the NxM score matrix
        #  (N=num probes, M=num fps)
        # each row is kept sorted ascending, so its best entry is at the end
        nProbes = len(self.probes)
        scores = [[] for _ in range(nProbes)]
        fps = []
        for j, origFp in enumerate(self.data):
            for i in range(nProbes):
                score = DataStructs.FingerprintSimilarity(self.probes[i], origFp,
                                                          self.simMetric)
                row = scores[i]
                bisect.insort(row, (score, j))
                # Only the numToPick best entries of a row can ever be reached
                # by the selection loop below (fewer than numToPick entries are
                # taken at any point while it runs), so drop the rest.  The
                # worst entries sit at the front of the row.
                excess = len(row) - self.numToPick
                if excess > 0:
                    del row[:excess]
            if self.onlyNames and hasattr(origFp, '_fieldsFromDb'):
                fps.append(origFp._fieldsFromDb[0])
            else:
                fps.append(origFp)
            nScored = j + 1
            if not silent and not nScored % 1000:
                print('scored %d fps' % nScored)

        # we cannot pick more entries than were scored, and without probes
        # there is nothing to pick from
        nToPick = 0
        if nProbes:
            nToPick = min(self.numToPick, len(fps))

        # now go probe by probe and select the current top entry until we are finished:
        nPicked = 0
        self._picks = []
        taken = [0] * len(fps)
        while nPicked < nToPick:
            row = scores[nPicked % nProbes]
            score, idx = row.pop()
            # make sure we haven't taken this one already (from another row):
            while taken[idx] and row:
                score, idx = row.pop()
            # every row holds at least nToPick entries and fewer than nToPick
            # are taken, so an untaken entry is always found; the check is
            # kept as a safeguard
            if not taken[idx]:
                self._picks.append((fps[idx], score))
                taken[idx] = 1
                nPicked += 1
259 260 #------------------------------------ 261 # 262 # doctest boilerplate 263 #
264 -def _test():
265 import doctest,sys 266 return doctest.testmod(sys.modules["__main__"])
if __name__ == '__main__':
    import sys
    # the number of failed doctests becomes the exit status of the process
    nFailed, nTried = _test()
    sys.exit(nFailed)