MaxMinPicker.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 #ifndef __RD_MAXMINPICKER_H__
00011 #define __RD_MAXMINPICKER_H__
00012 
00013 #include <RDGeneral/types.h>
00014 #include <RDGeneral/utils.h>
00015 #include <RDGeneral/Invariant.h>
00016 #include <RDGeneral/RDLog.h>
00017 #include <RDBoost/Exceptions.h>
00018 #include <cstdlib>
00019 #include "DistPicker.h"
00020 #include <boost/random.hpp>
00021 
00022 namespace RDPickers {
00023 
00024   namespace {
00025     class distmatFunctor{
00026     public:
00027       distmatFunctor(const double *distMat) : dp_distMat(distMat) {};
00028       double operator()(unsigned int i,unsigned int j) {
00029         return getDistFromLTM(this->dp_distMat,i,j);
00030       }
00031     private:
00032       const double *dp_distMat;
00033     };
00034   }
00035 
00036   /*! \brief Implements the MaxMin algorithm for picking a subset of item from a pool
00037    *
00038    *  This class inherits from the DistPicker and implements a specific picking strategy
00039    *  aimed at diversity. See documentation for "pick()" member function for the algorithm details
00040    */
00041   class MaxMinPicker : public DistPicker {
00042   public:
00043     /*! \brief Default Constructor
00044      *
00045      */
00046     MaxMinPicker() {};
00047 
00048     /*! \brief Contains the implementation for a lazy MaxMin diversity picker
00049      *
00050      * See the documentation for the pick() method for details about the algorithm
00051      *
00052      *   \param func - a function (or functor) taking two unsigned ints as arguments
00053      *              and returning the distance (as a double) between those two elements.   
00054      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00055      *              distance matrix above contains the right number of elements; i.e.
00056      *              poolSize*(poolSize-1) 
00057      *   \param pickSize - the number items to pick from pool (<= poolSize)
00058      *   \param firstPicks - (optional)the first items in the pick list
00059      *   \param seed - (optional) seed for the random number generator
00060      */
00061     template <typename T>
00062     RDKit::INT_VECT lazyPick(T &func, 
00063                              unsigned int poolSize, unsigned int pickSize,
00064                              RDKit::INT_VECT firstPicks=RDKit::INT_VECT(),
00065                              int seed=-1) const;
00066 
00067     /*! \brief Contains the implementation for the MaxMin diversity picker
00068      *
00069      * Here is how the picking algorithm works, refer to
00070      *   Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
00071      * for more detail:
00072      *
00073      * A subset of k items is to be selected from a pool containing N molecules. 
00074      * Then the MaxMin method is as follows:
00075      *  -# Initialise Subset with some appropriately chosen seed
00076      *     compound and set x = 1.
00077      *  -# For each of the N-x remaining compounds in Dataset
00078      *     calculate its dissimilarity with each of the x compounds in
00079      *     Subset and retain the smallest of these x dissimilarities for
00080      *     each compound in Dataset.
00081      *  -# Select the molecule from Dataset with the largest value
00082      *     for the smallest dissimilarity calculated in Step 2 and
00083      *     transfer it to Subset.
00084      *  -# Set x = x + 1 and return to Step 2 if x < k.
00085      *
00086      *  
00087      *
00088      *   \param distMat - distance matrix - a vector of double. It is assumed that only the 
00089      *              lower triangle element of the matrix are supplied in a 1D array\n
00090      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00091      *              distance matrix above contains the right number of elements; i.e.
00092      *              poolSize*(poolSize-1) \n
00093      *   \param pickSize - the number items to pick from pool (<= poolSize)
00094      *   \param firstPicks - indices of the items used to seed the pick set.
00095      *   \param seed - (optional) seed for the random number generator
00096     */
00097     RDKit::INT_VECT pick(const double *distMat, 
00098                          unsigned int poolSize, unsigned int pickSize,
00099                          RDKit::INT_VECT firstPicks,
00100                          int seed=-1) const {
00101       CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
00102       if(poolSize<pickSize)
00103         throw ValueErrorException("pickSize cannot be larger than the poolSize");
00104       distmatFunctor functor(distMat);
00105       return this->lazyPick(functor,poolSize,pickSize,firstPicks,seed);
00106     }
00107 
00108     /*! \overload */
00109     RDKit::INT_VECT pick(const double *distMat, 
00110                          unsigned int poolSize, unsigned int pickSize) const {
00111       RDKit::INT_VECT iv;
00112       return pick(distMat,poolSize,pickSize,iv);
00113     }
00114 
00115 
00116 
00117   };
00118   // we implement this here in order to allow arbitrary functors without link errors
00119   template <typename T>
00120   RDKit::INT_VECT MaxMinPicker::lazyPick(T &func,
00121                                          unsigned int poolSize, unsigned int pickSize,
00122                                          RDKit::INT_VECT firstPicks,
00123                                          int seed) const {
00124     if(poolSize<pickSize)
00125       throw ValueErrorException("pickSize cannot be larger than the poolSize");
00126 
00127     RDKit::INT_LIST pool;
00128 
00129     RDKit::INT_VECT picks;
00130     picks.reserve(pickSize);
00131     unsigned int pick=0;
00132 
00133     // enter the pool into a list so that we can pick out of it easily
00134     for (unsigned int i = 0; i < poolSize; i++) {
00135       pool.push_back(i);
00136     }
00137 
00138 
00139     // get a seeded random number generator:
00140     typedef boost::mt19937 rng_type;
00141     typedef boost::uniform_int<> distrib_type;
00142     typedef boost::variate_generator<rng_type &,distrib_type> source_type;
00143     rng_type generator(42u);
00144     distrib_type dist(0,poolSize);
00145     source_type randomSource(generator,dist);
00146     if(seed>0) generator.seed(static_cast<rng_type::result_type>(seed));
00147 
00148     // pick the first entry
00149     if(!firstPicks.size()){
00150       pick = randomSource();
00151       // add the pick to the picks
00152       picks.push_back(pick);
00153       // and remove it from the pool
00154       pool.remove(pick);
00155     } else{
00156       for(RDKit::INT_VECT::const_iterator pIdx=firstPicks.begin();
00157           pIdx!=firstPicks.end();++pIdx){
00158         pick = static_cast<unsigned int>(*pIdx);
00159         if(pick>=poolSize)
00160           throw ValueErrorException("pick index was larger than the poolSize");
00161         picks.push_back(pick);
00162         pool.remove(pick);
00163       }
00164     }
00165     // now pick 1 compound at a time
00166     while (picks.size() < pickSize) {
00167       double maxOFmin = -1.0;
00168       RDKit::INT_LIST_I plri=pool.end();
00169       for(RDKit::INT_LIST_I pli=pool.begin();
00170           pli!=pool.end(); ++pli){
00171         unsigned int poolIdx = (*pli);
00172         double minTOi = RDKit::MAX_DOUBLE;
00173         for (RDKit::INT_VECT_CI pi = picks.begin(); 
00174              pi != picks.end(); ++pi) {
00175           unsigned int pickIdx = (*pi);
00176           CHECK_INVARIANT(poolIdx!=pickIdx,"");
00177           double dist = func(poolIdx,pickIdx);
00178           if (dist <= minTOi) {
00179             minTOi = dist;
00180           }
00181         }
00182         if (minTOi > maxOFmin || (RDKit::feq(minTOi,maxOFmin) && poolIdx<pick) ) {
00183           maxOFmin = minTOi;
00184           pick = poolIdx;
00185           plri = pli;
00186         }
00187       }
00188       
00189       // now add the new pick to  picks and remove it from the pool
00190       picks.push_back(pick);
00191       CHECK_INVARIANT(plri!=pool.end(),"");
00192       pool.erase(plri);
00193     }
00194     return picks;
00195   }
00196 
00197 
00198 };
00199 
00200 #endif