Package Chem :: Package Fingerprints :: Module MolSimilarity
[hide private]
[frames] | no frames]

Source Code for Module Chem.Fingerprints.MolSimilarity

  1  # $Id: MolSimilarity.py 2 2006-05-06 22:54:39Z glandrum $ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ utility functionality for molecular similarity 
  8   includes a command line app for screening databases 
  9   
 10   
 11  Sample Usage: 
 12   
 13    python MolSimilarity.py  -d data.gdb -t daylight_sig --idName="Mol_ID" \ 
 14        --topN=100 --smiles='c1(C=O)ccc(Oc2ccccc2)cc1' --smilesTable=raw_dop_data \ 
 15        --smilesName="structure" -o results.csv  
 16   
 17  """ 
 18  import RDConfig 
 19  import DataStructs 
 20  import Chem 
 21  from Dbase.DbConnection import DbConnect 
 22  from Dbase import DbModule 
 23  from DataStructs.TopNContainer import TopNContainer 
 24  import sys,types 
 25  import cPickle 
 26  from Chem.Fingerprints import FingerprintMols,DbFpSupplier 
 27  try:   
 28    from VLib.NodeLib.DbPickleSupplier import _lazyDataSeq as _dataSeq 
 29  except ImportError: 
 30    _dataSeq=None 
 31     
 32   
 33  import DataStructs 
 34   
 35  _cvsVersion="$Id: MolSimilarity.py 2 2006-05-06 22:54:39Z glandrum $" 
 36  idx1 = _cvsVersion.find(':')+1 
 37  idx2 = _cvsVersion.rfind('$') 
 38  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 39   
 40   
def _ConstructSQL(details,extraFields=''):
  """ builds the text of the select statement used to pull screening data

  **Arguments**

    - details: object providing _tableName_ and _idName_ plus the optional
      _smilesTableName_/_smilesName_ and _actTableName_/_actName_ attributes

    - extraFields: (optional) text appended verbatim, after a comma, to the
      list of selected fields

  **Returns**

    the select statement as a string

  **Notes**

    - the id column of the main table is always the first field selected

    - a join clause is added for each of the SMILES and activity tables that
      is named in _details_; the tables are aliased _smi_ and _act_

    - table and column names are interpolated directly into the SQL, so they
      must come from a trusted source

  """
  fields = ['%s.%s'%(details.tableName,details.idName)]
  joins = []
  if details.smilesTableName:
    if details.smilesName:
      fields.append('%s'%(details.smilesName))
    joins.append('join %s smi on smi.%s=%s.%s'%(details.smilesTableName,
                                                 details.idName,
                                                 details.tableName,
                                                 details.idName))
  if details.actTableName:
    if details.actName:
      fields.append('%s'%(details.actName))
    joins.append('join %s act on act.%s=%s.%s'%(details.actTableName,
                                                 details.idName,
                                                 details.tableName,
                                                 details.idName))
  if extraFields:
    fields.append(extraFields)
  # the join clauses must be separated by whitespace, otherwise asking for
  # both the SMILES and the activity table yields "...<id>join <table>..."
  cmd = 'select %s from %s %s'%(','.join(fields),details.tableName,' '.join(joins))
  return cmd
63
def ScreenInDb(details,mol):
  """ screens the database for similarity to a probe molecule, doing the
  similarity calculation inside the database when the metric allows it

  **Arguments**

    - details: the screening parameters (database, table and column names,
      metric, threshold/topN settings, fingerprinting arguments)

    - mol: the probe molecule

  **Returns**

    a sequence of results:

      - for the Tanimoto, Dice and cosine metrics: the rows fetched from the
        database, ordered by decreasing similarity

      - for any other metric: whatever _ScreenFingerprints()_ returns

      - an empty list if the probe cannot be fingerprinted or no database
        connection is available

  **Notes**

    - the in-database path calls SQL functions named _rd_tanimoto_,
      _rd_dice_ and _rd_cosine_; these are assumed to be installed in the
      target database (TODO confirm)

  """
  try:
    probeFp = FingerprintMols.FingerprintMol(mol,**details.__dict__)
  except Exception:
    import traceback
    FingerprintMols.error('Error: problems fingerprinting molecule.\n')
    traceback.print_exc()
    return []

  conn = None
  if details.dbName and details.tableName:
    try:
      conn = DbConnect(details.dbName,details.tableName)
      if hasattr(details,'dbUser'):
        conn.user = details.dbUser
      if hasattr(details,'dbPassword'):
        conn.password = details.dbPassword
    except Exception:
      import traceback
      FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n'%(details.dbName,
                                                                                            details.tableName))
      traceback.print_exc()
      conn = None

  # map the metric onto the name of the corresponding database function
  func = None
  for metric,dbFunc in ((DataStructs.TanimotoSimilarity,'rd_tanimoto'),
                        (DataStructs.DiceSimilarity,'rd_dice'),
                        (DataStructs.CosineSimilarity,'rd_cosine')):
    if details.metric == metric:
      func = dbFunc
      break

  if func is None:
    # no database-side implementation of this metric: pull the fingerprints
    # and screen them here, reusing the probe fingerprint computed above
    data = GetFingerprints(details)
    return ScreenFingerprints(details,data,mol,probeFp=probeFp)

  if not probeFp:
    # same convention as ScreenFingerprints(): no usable probe, no results
    return []
  if conn is None:
    # either the connection attempt failed (already reported above) or no
    # database/table was specified; there is nothing to run the query against
    FingerprintMols.error('Error: no database connection available for screening.\n')
    return []

  pkl = probeFp.ToBitString()
  extraFields = "%s(%s,%s) as tani"%(func,DbModule.placeHolder,details.fpColName)
  cmd = _ConstructSQL(details,extraFields=extraFields)

  if details.doThreshold:
    # we need to do a subquery here:
    # NOTE(review): this keeps scores strictly above the threshold, while
    # ScreenFingerprints() uses >=; confirm which one is intended.
    cmd = "select * from (%s) tmp where tani>%f"%(cmd,details.screenThresh)
  cmd += " order by tani desc"
  if not details.doThreshold and details.topN>0:
    cmd += " limit %d"%details.topN
  curs = conn.GetCursor()
  # the probe's bit string is passed as a bound parameter for the placeholder
  curs.execute(cmd,(pkl,))
  return curs.fetchall()
115
def GetFingerprints(details):
  """ returns an iterable sequence of fingerprints
  each fingerprint will have a _fieldsFromDb member whose first entry is
  the id.

  **Arguments**

    - details: the screening parameters; either _dbName_ and _tableName_
      (database input) or _inFileName_ (a file of pickled (id,fingerprint)
      tuples) is used to locate the data

  **Returns**

    - database input: a lazy supplier built on a database cursor

    - file input: a list of the fingerprints read from the file

    - an empty list if the database connection or the input file cannot be
      opened (the problem is reported via _FingerprintMols.error()_)

    - None if neither a database table nor an input file is specified

  **Notes**

    - the input file is unpickled; only use files from a trusted source

  """
  if details.dbName and details.tableName:
    try:
      conn = DbConnect(details.dbName,details.tableName)
      if hasattr(details,'dbUser'):
        conn.user = details.dbUser
      if hasattr(details,'dbPassword'):
        conn.password = details.dbPassword
    except Exception:
      import traceback
      FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n'%(details.dbName,
                                                                                            details.tableName))
      traceback.print_exc()
      # there is no connection to read from
      return []
    cmd = _ConstructSQL(details,extraFields=details.fpColName)
    curs = conn.GetCursor()
    if _dataSeq:
      suppl = _dataSeq(curs,cmd,depickle=not details.noPickle,klass=DataStructs.ExplicitBitVect)
      # keep a reference to the connection on the supplier class
      _dataSeq._conn = conn
    else:
      # NOTE(review): `data` is not defined anywhere in this function, so this
      # fallback (used when VLib's lazy sequence cannot be imported) raises a
      # NameError.  It presumably needs the result set of running `cmd`;
      # confirm what ForwardDbFpSupplier expects before wiring it up.
      suppl = DbFpSupplier.ForwardDbFpSupplier(data,fpColName=details.fpColName)
  elif details.inFileName:
    try:
      # pickles are binary data, so the file is opened in binary mode
      inF = open(details.inFileName,'rb')
    except IOError:
      import traceback
      FingerprintMols.error('Error: Problems reading from file %s\n'%(details.inFileName))
      traceback.print_exc()
      return []

    suppl = []
    try:
      while 1:
        try:
          # SECURITY: unpickling executes arbitrary code from the file
          molId,fp = cPickle.load(inF)
        except Exception:
          # end of file (or an unreadable record): stop reading, keep what we have
          break
        fp._fieldsFromDb = [molId]
        suppl.append(fp)
    finally:
      inF.close()
  else:
    suppl = None

  return suppl
166
def ScreenFingerprints(details,data,mol=None,probeFp=None):
  """ Returns a list of results

  **Arguments**

    - details: the screening parameters (_metric_, _doThreshold_,
      _screenThresh_, _topN_, _noPickle_ and, optionally, _stopAfter_)

    - data: an iterable of the fingerprints to screen.  Each entry is either
      an (id,fingerprint) sequence or a fingerprint carrying its id as the
      first element of _fieldsFromDb; when _details.noPickle_ is set each
      entry must be an (id,data) pair and the data is passed to the metric
      as a string

    - mol: (optional) the probe molecule, only used if _probeFp_ is not
      provided

    - probeFp: (optional) the probe fingerprint

  **Returns**

    a list of (id,score) 2-tuples:

      - if thresholding is off and _topN_ is positive: the contents of a
        _TopNContainer_ of size _topN_

      - otherwise: every entry (thresholding off) or every entry with a
        score >= _screenThresh_ (thresholding on), in input order

    The list is empty if the probe cannot be fingerprinted, the probe
    fingerprint is empty, or _data_ is None.

  """
  if probeFp is None:
    try:
      probeFp = FingerprintMols.FingerprintMol(mol,**details.__dict__)
    except Exception:
      import traceback
      FingerprintMols.error('Error: problems fingerprinting molecule.\n')
      traceback.print_exc()
      return []
  if not probeFp:
    return []
  if data is None:
    # GetFingerprints() returns None when no data source is configured
    return []

  # decide once whether we are in top-N mode instead of relying on the
  # truthiness of the container
  useTopN = not details.doThreshold and details.topN>0
  if useTopN:
    topN = TopNContainer(details.topN)
  else:
    topN = []
  res = []
  count = 0
  for pt in data:
    if not details.noPickle:
      if isinstance(pt,(tuple,list)):
        ptId,fp = pt
      else:
        fp = pt
        ptId = pt._fieldsFromDb[0]
      score = DataStructs.FingerprintSimilarity(probeFp,fp,details.metric)
    else:
      ptId,pkl = pt
      score = details.metric(probeFp,str(pkl))
    if useTopN:
      topN.Insert(score,ptId)
    elif not details.doThreshold or score>=details.screenThresh:
      res.append((ptId,score))
    # count every point examined, not just the hits
    count += 1
    if hasattr(details,'stopAfter') and count >= details.stopAfter:
      break
  for score,ptId in topN:
    res.append((ptId,score))

  return res
213
def ScreenFromDetails(details,mol=None):
  """ Returns a list of results

  **Arguments**

    - details: the screening parameters.  _probeMol_ or _probeSmiles_
      provide the probe when _mol_ is not given; _outFileName_ (if set)
      names a file that receives the results; _useDbSimilarity_ (if present
      and true) selects in-database screening

    - mol: (optional) the probe molecule

  **Returns**

    the results of the screen, or None if no probe molecule is available
    or the output file cannot be opened

  **Notes**

    - when an output file is requested, each result is written to it as one
      line of comma-separated values

  """
  if not mol:
    if not details.probeMol:
      smi = details.probeSmiles
      try:
        mol = Chem.MolFromSmiles(smi)
      except Exception:
        import traceback
        FingerprintMols.error('Error: problems generating molecule for smiles: %s\n'%(smi))
        traceback.print_exc()
        return
    else:
      mol = details.probeMol
  if not mol:
    return

  if details.outFileName:
    try:
      outF = open(details.outFileName,'w+')
    except IOError:
      FingerprintMols.error("Error: could not open output file %s for writing\n"%(details.outFileName))
      return None
  else:
    outF = None

  # make sure the output file is closed (and flushed) however the screen ends
  try:
    if not getattr(details,'useDbSimilarity',False):
      data = GetFingerprints(details)
      res = ScreenFingerprints(details,data,mol)
    else:
      res = ScreenInDb(details,mol)
    if outF is not None:
      for pt in res:
        outF.write(','.join([str(x) for x in pt]))
        outF.write('\n')
  finally:
    if outF is not None:
      outF.close()
  return res

# Usage text for the command line app; it is handed to FingerprintMols below
# so that the shared argument parser reports this module's usage message.
_usageDoc="""
Usage: MolSimilarity.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the pickled file <fName>. This file should
  contain a series of pickled (id,fingerprint) tuples.

  NOTE: at the moment the user is responsible for ensuring that the
  fingerprint parameters given at run time (used to fingerprint the
  probe molecule) match those used to generate the input fingerprints.

  Command line arguments are:
    - --smiles=val: sets the SMILES for the input molecule. This is
      a required argument.

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --smilesTable=val: sets the name of the database table
      which contains SMILES for the input fingerprints. If this
      information is provided along with smilesName (see below),
      the output file will contain SMILES data

    - --smilesName=val: sets the name of the SMILES column
      in the input database. Default is *SMILES*.

    - --topN=val: sets the number of results to return.
      Default is *10*.

    - --thresh=val: sets the similarity threshold.

    - --idName=val: sets the name of the id column in the input
      database. Default is *ID*.

    - -o _outFileName_: name of the output file (output will
      be a CSV file with one line for each of the output molecules)

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the input db table.
      Default is *AutoFragmentFP*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *1*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash=val: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
if __name__ == '__main__':
  FingerprintMols.message("This is MolSimilarity version %s\n\n"%(__VERSION_STRING))
  # install this module's usage text before the arguments are parsed
  FingerprintMols._usageDoc=_usageDoc
  details = FingerprintMols.ParseArgs()
  ScreenFromDetails(details)