HierarchicalClusterPicker.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2006 Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 #ifndef _HIERARCHCLUSTERPICKER_H
00011 #define _HIERARCHCLUSTERPICKER_H
00012 
00013 #include <RDGeneral/types.h>
00014 #include "DistPicker.h"
00015 
00016 namespace RDPickers {
00017   
00018   /*! \brief Diversity picker based on hierarchical clustering
00019    *
00020    *  This class inherits from DistPicker since it uses the distance matrix
00021    *  for diversity picking. The clustering itself is done using the Murtagh
00022    *  code in $RDBASE/Code/ML/Cluster/Murtagh/
00023    */
00024   class HierarchicalClusterPicker : public DistPicker {
00025   public:
00026 
00027     /*! \brief The type of hierarchical clustering algorithm to use
00028      *  Each method carries an explicit integer value (1 through 7). */
00029     typedef enum {
00030       WARD=1,
00031       SLINK=2,
00032       CLINK=3,
00033       UPGMA=4,
00034       MCQUITTY=5,
00035       GOWER=6,
00036       CENTROID=7 } ClusterMethod;
00037 
00038     /*! \brief Constructor - takes a ClusterMethod as an argument
00039      *
00040      * Stores the hierarchical clustering method in d_method. The constructor is
00041      * explicit, so a ClusterMethod is never implicitly converted into a picker. */
00042     explicit HierarchicalClusterPicker(ClusterMethod clusterMethod) : d_method(clusterMethod) {;};
00043 
00044     /*! \brief This is the function that does the picking
00045      *
00046      * Here is how the algorithm works \n
00047      *  FIX: Supply reference
00048      *
00049      * - The entire pool is clustered using the distance matrix using one of the
00050      *   hierarchical clustering methods (specified via the constructor). \n
00051      * - Starting with the individual items in the pool, clusters are merged based
00052      *   on the output from the clustering method. \n
00053      * - The merging is stopped when the number of clusters is the same as
00054      *   the number of picks.
00055      * - For each item in a cluster the sum of the squared distances to the rest
00056      *   of the items (in the cluster) is computed. The item with the smallest value is
00057      *   picked as the representative of the cluster, i.e. the item closest
00058      *   to the centroid of the cluster.
00059      *
00060      *    \param distMat - distance matrix - a vector of double. It is assumed that only the
00061      *              lower triangle elements of the matrix are supplied in a 1D array\n
00062      *              NOTE: this matrix WILL BE ALTERED during the picking (TODO confirm: it is passed as const double *)\n
00063      *    \param poolSize - the size of the pool to pick the items from. It is assumed that the
00064      *              distance matrix above contains the right number of elements for a
00065      *              lower triangle; i.e. poolSize*(poolSize-1)/2 \n
00066      *    \param pickSize - the number of items to pick from the pool (<= poolSize)
00067      *    \return an RDKit::INT_VECT holding the picks (presumably pool indices - TODO confirm)
00068      */
00069     RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const ;
00070 
00071     /*! \brief This is the function that does the clustering of the items - used by the picker
00072      *
00073      *   \param distMat - distance matrix - a vector of double. It is assumed that only the
00074      *              lower triangle elements of the matrix are supplied in a 1D array\n
00075      *              NOTE: this matrix WILL BE ALTERED during the clustering (TODO confirm: it is passed as const double *)\n
00076      *   \param poolSize - the size of the pool the items come from. It is assumed that the
00077      *              distance matrix above contains the right number of elements for a
00078      *              lower triangle; i.e. poolSize*(poolSize-1)/2 \n
00079      *   \param pickSize - the number of clusters to divide the pool into (<= poolSize)
00080      *
00081      *   \return an RDKit::VECT_INT_VECT (presumably one vector of pool indices per cluster - TODO confirm)
00082      */
00083     RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const;
00084 
00085   private:
00086     ClusterMethod d_method; //!< clustering method supplied to the constructor
00087   };
00088 }; // namespace RDPickers
00089 
00090 #endif