00001 // $Id: InfoBitRanker.h 1528 2010-09-26 17:04:37Z glandrum $ 00002 // 00003 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC 00004 // @@ All Rights Reserved @@ 00005 // This file is part of the RDKit. 00006 // The contents are covered by the terms of the BSD license 00007 // which is included in the file license.txt, found at the root 00008 // of the RDKit source tree. 00009 // 00010 00011 #ifndef _RD_INFORANKER_H_ 00012 #define _RD_INFORANKER_H_ 00013 00014 #include <RDGeneral/types.h> 00015 #include <DataStructs/BitVects.h> 00016 #include <iostream> 00017 00018 00019 /*! \brief Class used to rank bits based on a specified measure of infomation 00020 * 00021 * Basically a primitive mimic of the CombiChem "signal" functionality 00022 * To use: 00023 * - create an instance of this class 00024 * - loop over the fingerprints in the dataset by calling accumulateVotes method 00025 * - call getTopN to get the top n ranked bits 00026 * 00027 * Sample usage and results from the python wrapper: 00028 * Here's a small set of vectors: 00029 * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i] 00030 * ... 00031 * 0001 0 00032 * 0101 0 00033 * 0010 1 00034 * 1110 1 00035 * 00036 * Default ranker, using infogain: 00037 * >>> ranker = InfoBitRanker(4,2) 00038 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00039 * ... 00040 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00041 * ... 00042 * 3 1.000 2 0 00043 * 2 1.000 0 2 00044 * 0 0.311 0 1 00045 * 00046 * Using the biased infogain: 00047 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY) 00048 * >>> ranker.SetBiasList((1,)) 00049 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00050 * ... 00051 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00052 * ... 00053 * 2 1.000 0 2 00054 * 0 0.311 0 1 00055 * 1 0.000 1 1 00056 * 00057 * A chi squared ranker is also available: 00058 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE) 00059 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00060 * ... 00061 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00062 * ... 00063 * 3 4.000 2 0 00064 * 2 4.000 0 2 00065 * 0 1.333 0 1 00066 * 00067 * As is a biased chi squared: 00068 * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE) 00069 * >>> ranker.SetBiasList((1,)) 00070 * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i]) 00071 * ... 00072 * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1) 00073 * ... 00074 * 2 4.000 0 2 00075 * 0 1.333 0 1 00076 * 1 0.000 1 1 00077 */ 00078 namespace RDInfoTheory { 00079 typedef std::vector<RDKit::USHORT> USHORT_VECT; 00080 typedef std::vector<USHORT_VECT> VECT_USHORT_VECT; 00081 00082 class InfoBitRanker { 00083 public: 00084 00085 /*! \brief the type of measure for information 00086 * 00087 */ 00088 typedef enum { 00089 ENTROPY=1, 00090 BIASENTROPY=2, 00091 CHISQUARE=3, 00092 BIASCHISQUARE=4 00093 } InfoType; 00094 00095 /*! \brief Constructor 00096 * 00097 * ARGUMENTS: 00098 * 00099 * - nBits: the dimension of the bit vectors or the fingerprint length 00100 * - nClasses: the number of classes used in the classification problem (e.g. active, 00101 * moderately active, inactive etc.). It is assumed that the classes are 00102 * numbered from 0 to (nClasses - 1) 00103 * - infoType: the type of information metric 00104 */ 00105 InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) : 00106 d_dims(nBits), d_classes(nClasses), d_type(infoType) { 00107 d_counts.resize(0); 00108 for (unsigned int i = 0; i < nClasses; i++) { 00109 USHORT_VECT cCount; 00110 cCount.resize(d_dims, 0); 00111 d_counts.push_back(cCount); 00112 } 00113 d_clsCount.resize(d_classes, 0); 00114 d_nInst = 0; 00115 d_top = 0; 00116 dp_topBits=0; 00117 d_biasList.resize(0); 00118 dp_maskBits=0; 00119 } 00120 00121 ~InfoBitRanker() { 00122 if(dp_topBits) 00123 delete [] dp_topBits; 00124 if(dp_maskBits) 00125 delete dp_maskBits; 00126 } 00127 00128 /*! \brief Accumulate the votes for all the bits turned on in a bit vector 00129 * 00130 * ARGUMENTS: 00131 * 00132 * - bv : bit vector that supports [] operator 00133 * - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses 00134 */ 00135 void accumulateVotes(const ExplicitBitVect &bv, unsigned int label); 00136 void accumulateVotes(const SparseBitVect &bv, unsigned int label); 00137 00138 /*! \brief Returns the top n bits ranked by the information metric 00139 * 00140 * This is actually the function where most of the work of ranking is happening 00141 * 00142 * \param num the number of top ranked bits that are required 00143 * 00144 * \return a pointer to an information array. The client should *not* 00145 * delete this 00146 */ 00147 double *getTopN(unsigned int num); 00148 00149 /*! \brief return the number of labelled instances(examples) or fingerprints seen so far 00150 * 00151 */ 00152 unsigned int getNumInstances() const { 00153 return d_nInst; 00154 } 00155 00156 /*! \brief return the number of classes 00157 * 00158 */ 00159 unsigned int getNumClasses() const { 00160 return d_classes; 00161 } 00162 00163 /*! \brief Set the classes to which the entropy calculation should be biased 00164 * 00165 * This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits. 00166 * In this mode, a bit must be correllated higher with one of the biased classes than all the 00167 * other classes. For example, in a two class problem with actives and inactives, the fraction of 00168 * actives that hit the bit has to be greater than the fraction of inactives that hit the bit 00169 * 00170 * ARGUMENTS: 00171 * classList - list of class ids that we want a bias towards 00172 */ 00173 void setBiasList(RDKit::INT_VECT &classList); 00174 00175 00176 /*! \brief Set the bits to be used as a mask 00177 * 00178 * If this function is called, only the bits which are present in the 00179 * maskBits list will be used. 00180 * 00181 * ARGUMENTS: 00182 * maskBits - the bits to be considered 00183 */ 00184 void setMaskBits(RDKit::INT_VECT &maskBits); 00185 00186 /*! \brief Write the top N bits to a stream 00187 * 00188 */ 00189 void writeTopBitsToStream(std::ostream *outStream) const; 00190 00191 /*! \brief Write the top bits to a file 00192 * 00193 */ 00194 void writeTopBitsToFile(std::string fileName) const; 00195 00196 private: 00197 /*! \brief check if we want to compute the info content for a bit based on the bias list 00198 * 00199 * This what happens here: 00200 * - the fraction of items in each class that hit a particular bit are computed 00201 * - the maximum of these fractions for classes that are not in the biasList are computed 00202 * - If this maximum is less than the fraction for atleast one of classes in the biaslist 00203 * the bit is considered good 00204 * ARGUMENTS: 00205 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes)) 00206 * a 2D structure is assumed with the first row containing number of items of each class 00207 * with the bit set and the second row to entires of each class with the bit turned off 00208 */ 00209 bool BiasCheckBit(RDKit::USHORT *resMat) const; 00210 00211 /*! \brief Compute the biased info entropy gain based on the bias list 00212 * 00213 * This what happens here: 00214 * - we call BiasCheckBit to see if the bit qualifies to compute the infocontent 00215 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 00216 * 00217 * ARGUMENTS: 00218 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes)) 00219 * a 2D structure is assumed with the first row containing number of items of each class 00220 * with the bit set and the second row to entires of each class with the bit turned off 00221 */ 00222 double BiasInfoEntropyGain(RDKit::USHORT *resMat) const; 00223 00224 /*! \brief Compute the biased chi qsure value based on the bias list 00225 * 00226 * This what happens here: 00227 * - we call BiasCheckBit to see if the bit qualifies to compute the infocontent 00228 * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 00229 * 00230 * ARGUMENTS: 00231 * - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes)) 00232 * a 2D structure is assumed with the first row containing number of items of each class 00233 * with the bit set and the second row to entires of each class with the bit turned off 00234 */ 00235 double BiasChiSquareGain(RDKit::USHORT *resMat) const; 00236 00237 unsigned int d_dims; // the number of bits in the fingerprints 00238 unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.) 00239 InfoType d_type; // the type of information meassure - currently we support only entropy 00240 VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class 00241 USHORT_VECT d_clsCount; // counter for the number of instances of each class 00242 double *dp_topBits; // storage for the top ranked bits and the corresponding statistics 00243 unsigned int d_top; // the number of bits that have been ranked 00244 unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes 00245 RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits 00246 ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered 00247 00248 }; 00249 } 00250 #endif
1.7.1