InfoBitRanker.h

Go to the documentation of this file.
00001 // $Id: InfoBitRanker.h 1528 2010-09-26 17:04:37Z glandrum $
00002 //
00003 //  Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
00004 //   @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 
00011 #ifndef _RD_INFORANKER_H_
00012 #define _RD_INFORANKER_H_
00013 
00014 #include <RDGeneral/types.h>
00015 #include <DataStructs/BitVects.h>
00016 #include <iostream>
00017 
00018 
00019 /*! \brief Class used to rank bits based on a specified measure of infomation
00020  *
00021  * Basically a primitive mimic of the CombiChem "signal" functionality
00022  * To use:
00023  *  - create an instance of this class 
00024  *  - loop over the fingerprints in the dataset by calling accumulateVotes method
00025  *  - call getTopN to get the top n ranked bits
00026  *
00027  * Sample usage and results from the python wrapper:
00028  * Here's a small set of vectors:
00029  * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
00030  * ... 
00031  * 0001 0
00032  * 0101 0
00033  * 0010 1
00034  * 1110 1
00035  * 
00036  * Default ranker, using infogain:
00037  * >>> ranker = InfoBitRanker(4,2)  
00038  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00039  * ... 
00040  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00041  * ... 
00042  * 3 1.000 2 0
00043  * 2 1.000 0 2
00044  * 0 0.311 0 1
00045  * 
00046  * Using the biased infogain:
00047  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
00048  * >>> ranker.SetBiasList((1,))
00049  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00050  * ... 
00051  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00052  * ... 
00053  * 2 1.000 0 2
00054  * 0 0.311 0 1
00055  * 1 0.000 1 1
00056  * 
00057  * A chi squared ranker is also available:
00058  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
00059  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00060  * ... 
00061  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00062  * ... 
00063  * 3 4.000 2 0
00064  * 2 4.000 0 2
00065  * 0 1.333 0 1
00066  * 
00067  * As is a biased chi squared:
00068  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
00069  * >>> ranker.SetBiasList((1,))
00070  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
00071  * ... 
00072  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
00073  * ... 
00074  * 2 4.000 0 2
00075  * 0 1.333 0 1
00076  * 1 0.000 1 1
00077  */
00078 namespace RDInfoTheory {
00079   typedef std::vector<RDKit::USHORT> USHORT_VECT;
00080   typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
00081 
00082   class InfoBitRanker {
00083   public:
00084     
00085     /*! \brief the type of measure for information
00086      * 
00087      */
00088     typedef enum {
00089       ENTROPY=1,
00090       BIASENTROPY=2,
00091       CHISQUARE=3,
00092       BIASCHISQUARE=4
00093     } InfoType;
00094     
00095     /*! \brief Constructor 
00096      *
00097      * ARGUMENTS:
00098      *
00099      *   - nBits: the dimension of the bit vectors or the fingerprint length
00100      *   - nClasses: the number of classes used in the classification problem (e.g. active,
00101      *              moderately active, inactive etc.). It is assumed that the classes are 
00102      *              numbered from 0 to (nClasses - 1) 
00103      *   - infoType: the type of information metric
00104      */
00105     InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) :
00106     d_dims(nBits), d_classes(nClasses), d_type(infoType) {
00107       d_counts.resize(0);
00108       for (unsigned int i = 0; i < nClasses; i++) {
00109         USHORT_VECT cCount;
00110         cCount.resize(d_dims, 0);
00111         d_counts.push_back(cCount);
00112       }
00113       d_clsCount.resize(d_classes, 0);
00114       d_nInst = 0;
00115       d_top = 0;
00116       dp_topBits=0;
00117       d_biasList.resize(0);
00118       dp_maskBits=0;
00119     }
00120     
00121     ~InfoBitRanker() {
00122       if(dp_topBits)
00123         delete [] dp_topBits;
00124       if(dp_maskBits)
00125         delete dp_maskBits;
00126     }
00127 
00128     /*! \brief Accumulate the votes for all the bits turned on in a bit vector
00129      *  
00130      *  ARGUMENTS:
00131      *
00132      *   - bv : bit vector that supports [] operator
00133      *   - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses
00134      */
00135     void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
00136     void accumulateVotes(const SparseBitVect &bv, unsigned int label);
00137     
00138     /*! \brief Returns the top n bits ranked by the information metric
00139      *
00140      * This is actually the function where most of the work of ranking is happening
00141      * 
00142      *  \param num the number of top ranked bits that are required
00143      *
00144      *  \return a pointer to an information array. The client should *not*
00145      *          delete this
00146      */
00147     double *getTopN(unsigned int num);
00148     
00149     /*! \brief return the number of labelled instances(examples) or fingerprints seen so far
00150      *
00151      */
00152     unsigned int getNumInstances() const {
00153       return d_nInst;
00154     }
00155     
00156     /*! \brief return the number of classes 
00157      *
00158      */
00159     unsigned int getNumClasses() const {
00160       return d_classes;
00161     }
00162 
00163     /*! \brief Set the classes to which the entropy calculation should be biased
00164      *
00165      * This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits. 
00166      * In this mode, a bit must be correllated higher with one of the biased classes than all the 
00167      * other classes. For example, in a two class problem with actives and inactives, the fraction of 
00168      * actives that hit the bit has to be greater than the fraction of inactives that hit the bit
00169      *
00170      * ARGUMENTS:
00171      *   classList - list of class ids that we want a bias towards
00172      */
00173     void setBiasList(RDKit::INT_VECT &classList);
00174 
00175 
00176     /*! \brief Set the bits to be used as a mask
00177      *
00178      * If this function is called, only the bits which are present in the
00179      *   maskBits list will be used.
00180      *
00181      * ARGUMENTS:
00182      *   maskBits - the bits to be considered
00183      */
00184     void setMaskBits(RDKit::INT_VECT &maskBits);
00185 
00186     /*! \brief Write the top N bits to a stream
00187      *
00188      */
00189     void writeTopBitsToStream(std::ostream *outStream) const;
00190     
00191     /*! \brief Write the top bits to a file
00192      *
00193      */
00194     void writeTopBitsToFile(std::string fileName) const;
00195 
00196   private:
00197     /*! \brief check if we want to compute the info content for a bit based on the bias list
00198      *
00199      * This what happens here:
00200      *    - the fraction of items in each class that hit a particular bit are computed
00201      *    - the maximum of these fractions for classes that are not in the biasList are computed
00202      *    - If this maximum is less than the fraction for atleast one of classes in the biaslist
00203      *      the bit is considered good
00204      * ARGUMENTS:
00205      *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
00206      *              a 2D structure is assumed with the first row containing number of items of each class 
00207      *              with the bit set and the second row to entires of each class with the bit turned off
00208      */
00209     bool BiasCheckBit(RDKit::USHORT *resMat) const;
00210 
00211     /*! \brief Compute the biased info entropy gain based on the bias list
00212      *
00213      *  This what happens here:
00214      *    - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
00215      *    - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 
00216      *
00217      * ARGUMENTS:
00218      *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
00219      *              a 2D structure is assumed with the first row containing number of items of each class 
00220      *              with the bit set and the second row to entires of each class with the bit turned off
00221      */
00222     double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
00223 
00224     /*! \brief Compute the biased chi qsure value based on the bias list
00225      *
00226      *  This what happens here:
00227      *    - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
00228      *    - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0 
00229      *
00230      * ARGUMENTS:
00231      *   - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
00232      *              a 2D structure is assumed with the first row containing number of items of each class 
00233      *              with the bit set and the second row to entires of each class with the bit turned off
00234      */
00235     double BiasChiSquareGain(RDKit::USHORT *resMat) const;
00236 
00237     unsigned int d_dims; // the number of bits in the fingerprints
00238     unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.)
00239     InfoType d_type; // the type of information meassure - currently we support only entropy
00240     VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class
00241     USHORT_VECT d_clsCount; // counter for the number of instances of each class 
00242     double *dp_topBits; // storage for the top ranked bits and the corresponding statistics
00243     unsigned int d_top; // the number of bits that have been ranked
00244     unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes
00245     RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits
00246     ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
00247     
00248   };
00249 }
00250 #endif