Package rdkit :: Package Chem :: Package Fingerprints :: Module ClusterMols
[hide private]
[frames | no frames]

Source Code for Module rdkit.Chem.Fingerprints.ClusterMols

  1  # $Id: ClusterMols.py 1528 2010-09-26 17:04:37Z glandrum $ 
  2  # 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
 11  """ utility functionality for clustering molecules using fingerprints 
 12   includes a command line app for clustering 
 13   
 14   
 15  Sample Usage: 
 16    python ClusterMols.py  -d data.gdb -t daylight_sig \ 
 17      --idName="CAS_TF" -o clust1.pkl \ 
 18      --actTable="dop_test" --actName="moa_quant" 
 19   
 20  """ 
 21  from rdkit.Dbase.DbConnection import DbConnect 
 22  from rdkit.Dbase import DbInfo,DbUtils 
 23  from rdkit.ML.Data import DataUtils 
 24  from rdkit.ML.Cluster import Clusters 
 25  from rdkit.ML.Cluster import Murtagh 
 26  import sys,cPickle 
 27  from rdkit.Chem.Fingerprints import FingerprintMols,MolSimilarity 
 28  from rdkit import DataStructs 
 29  import numpy 
 30  _cvsVersion="$Id: ClusterMols.py 1528 2010-09-26 17:04:37Z glandrum $" 
 31  idx1 = _cvsVersion.find(':')+1 
 32  idx2 = _cvsVersion.rfind('$') 
 33  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 34   
 35  message=FingerprintMols.message 
 36  error=FingerprintMols.error 
 37   
def GetDistanceMatrix(data,metric,isSimilarity=1):
  """ Calculates the pairwise distances between a set of fingerprints

    **Arguments**

      - data: a list of tuples with fingerprints in position 1
        (the rest of the elements of the tuple are not important)

      - metric: a callable taking two fingerprints and returning a number

      - isSimilarity: if true (the default), the metric is treated as a
        similarity and 1-metric is stored; otherwise the metric value
        is stored unchanged.

    **Returns**

      the symmetric distance matrix as a flat numpy array of floats with
      nPts*(nPts-1)/2 elements, filled column by column:
      (1,0), (2,0), (2,1), (3,0), ...
      (see ML.Cluster.Resemblance for layout documentation)

    **Notes**

      - if two fingerprints have different lengths, the longer one is
        folded by the integer ratio of the two lengths before the metric
        is applied.

  """
  nPts = len(data)
  # floor division: the array length has to stay an integer even when '/'
  # means true division; the builtin float is used as the dtype because the
  # numpy.float alias no longer exists in current numpy releases.
  res = numpy.zeros(nPts*(nPts-1)//2,float)
  nSoFar = 0
  for col in range(1,nPts):
    for row in range(col):
      fp1 = data[col][1]
      fp2 = data[row][1]
      nBits1 = fp1.GetNumBits()
      nBits2 = fp2.GetNumBits()
      # bring both fingerprints to the same size; the fold factor must be
      # an integer
      if nBits1>nBits2:
        fp1 = DataStructs.FoldFingerprint(fp1,nBits1//nBits2)
      elif nBits2>nBits1:
        fp2 = DataStructs.FoldFingerprint(fp2,nBits2//nBits1)
      sim = metric(fp1,fp2)
      if isSimilarity:
        sim = 1.-sim
      res[nSoFar] = sim
      nSoFar += 1
  return res
63
def ClusterPoints(data,metric,algorithmId,haveLabels=False,haveActs=True,returnDistances=False):
  """ Builds a cluster tree from a set of fingerprinted points

    **Arguments**

      - data: a sequence of tuples; element 0 is used to label the point,
        element 1 is the fingerprint and, optionally, element 2 is an
        activity value convertible to an integer

      - metric: the fingerprint metric; it is handed to GetDistanceMatrix()
        with that function's default settings, i.e. it is treated as a
        similarity

      - algorithmId: passed through to Murtagh.ClusterData()

      - haveLabels: if false, labels are generated as 'Mol: <element 0>',
        otherwise element 0 is used as the label directly

      - haveActs: if true and the first tuple has more than two elements,
        element 2 of every tuple is converted to int and attached to the
        tree and its points

      - returnDistances: if true, the distance matrix is returned as well

    **Returns**

      the cluster tree, or a 2-tuple (cluster tree, distance matrix) if
      returnDistances is set

  """
  message('Generating distance matrix.\n')
  dMat = GetDistanceMatrix(data,metric)
  message('Clustering\n')
  clustTree = Murtagh.ClusterData(dMat,len(data),algorithmId,
                                  isDistData=1)[0]
  acts = []
  # only the first tuple is inspected to decide whether activities exist
  if haveActs and len(data[0])>2:
    # we've got activities... use them:
    acts = [int(x[2]) for x in data]

  if not haveLabels:
    labels = ['Mol: %s'%str(x[0]) for x in data]
  else:
    labels = [x[0] for x in data]
  clustTree._ptLabels = labels
  if acts:
    clustTree._ptValues = acts
  for pt in clustTree.GetPoints():
    # NOTE(review): the -1 suggests point indices are 1-based; confirm
    # against ML.Cluster.Clusters
    idx = pt.GetIndex()-1
    pt.SetName(labels[idx])
    if acts:
      # attaching the activity is best-effort, but only ordinary errors are
      # ignored: KeyboardInterrupt and SystemExit still propagate
      try:
        pt.SetData(int(acts[idx]))
      except Exception:
        pass
  if not returnDistances:
    return clustTree
  else:
    return clustTree,dMat
94
def ClusterFromDetails(details):
  """ Clusters the fingerprints described by a details object

    **Arguments**

      - details: the settings object; it is passed to
        MolSimilarity.GetFingerprints() and its maxMols, outFileName,
        metric and clusterAlgo attributes are read here

    **Returns**

      the cluster tree, or None if the output file could not be opened
      or no fingerprints were obtained

    **Notes**

      - if details.maxMols is positive, only that many fingerprints are used

      - if details.outFileName is set, the tree is pickled to that file.
        The file is opened before clustering starts, so an unwritable
        path is reported before any expensive work is done.

  """
  data = MolSimilarity.GetFingerprints(details)
  if details.maxMols > 0:
    data = data[:details.maxMols]
  if details.outFileName:
    try:
      outF = open(details.outFileName,'wb+')
    except IOError:
      error("Error: could not open output file %s for writing\n"%(details.outFileName))
      return None
  else:
    outF = None

  # the finally clause guarantees the output file is closed on every exit
  # path, including the early return for empty data and any exception
  # raised while clustering or pickling
  try:
    if not data:
      return None

    clustTree = ClusterPoints(data,details.metric,details.clusterAlgo,
                              haveLabels=0,haveActs=1)
    if outF is not None:
      cPickle.dump(clustTree,outF)
    return clustTree
  finally:
    if outF is not None:
      outF.close()
# Usage text for the command-line application.  It is installed on
# FingerprintMols (see the __main__ section below) before the arguments
# are parsed.
# NOTE(review): the leading whitespace inside this string was reconstructed
# from a flattened listing - confirm against the canonical file.
_usageDoc="""
Usage: ClusterMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --idName=val: sets the name of the id column in the input
      database. Default is *ID*.

    - -o _outFileName_: name of the output file (output will
      be a pickle (.pkl) file with the cluster tree)

    - --actTable=val: name of table containing activity values
      (used to color points in the cluster tree).

    - --actName=val: name of column with activities in the activity
      table. The values in this column should either be integers or
      convertible into integers.

    - --SLINK: use the single-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --CLINK: use the complete-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --UPGMA: use the group-average clustering algorithm
      (default is Ward's minimum variance)

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the input db table.
      Default is *AutoFragmentFP*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *2*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Command-line entry point: announce the version, install this module's
# usage text on FingerprintMols, parse the arguments with
# FingerprintMols.ParseArgs() and run the clustering.
if __name__ == '__main__':
  message("This is ClusterMols version %s\n\n"%(__VERSION_STRING))
  # replace the usage text stored on FingerprintMols with the one for this app
  FingerprintMols._usageDoc=_usageDoc
  details = FingerprintMols.ParseArgs()
  # the returned tree is not used here; output happens inside
  # ClusterFromDetails() when details.outFileName is set
  ClusterFromDetails(details)