00001 // 00002 // Copyright (C) 2003-2006 Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // 00006 #ifndef _HIERARCHCLUSTERPICKER_H 00007 #define _HIERARCHCLUSTERPICKER_H 00008 00009 #include <RDGeneral/types.h> 00010 #include "DistPicker.h" 00011 00012 namespace RDPickers { 00013 00014 /*! \brief Diversity picker based on hierarchical clustering 00015 * 00016 * This class inherits from DistPicker since it uses the distance matrix 00017 * for diversity picking. The clustering itself is done using the Murtagh 00018 * code in $RDBASE/Code/ML/Cluster/Mutagh/ 00019 */ 00020 class HierarchicalClusterPicker : public DistPicker { 00021 public: 00022 00023 /*! \brief The type of hierarchical clustering algorithm to use 00024 */ 00025 typedef enum { 00026 WARD=1, 00027 SLINK=2, 00028 CLINK=3, 00029 UPGMA=4, 00030 MCQUITTY=5, 00031 GOWER=6, 00032 CENTROID=7 } ClusterMethod; 00033 00034 /*! \brief Constructor - takes a ClusterMethod as an argument 00035 * 00036 * Sets the hierarch clustering method 00037 */ 00038 explicit HierarchicalClusterPicker(ClusterMethod clusterMethod) : d_method(clusterMethod) {;}; 00039 00040 /*! \brief This is the function that does the picking 00041 * 00042 * Here is how the algorithm works \n 00043 * FIX: Supply reference 00044 * 00045 * - The entire pool is clustered using the distance matrix using one of the 00046 * hierachical clustering method (specified via the constructor). \n 00047 * - Starting with the individaul items in the pool, clusters are merged based 00048 * on the output from clustering method. \n 00049 * - The merging is stopped when the number of clusters is same as 00050 * the number of picks. 00051 * - For each item in a cluster the sum of square of the distances to the rest of 00052 * of the items (in the cluster) is computed. The item with the smallest of values is 00053 * picked as a representative of the cluster. Basically trying to pick the item closest 00054 * to the centroid of the cluster. 
00055 * 00056 * 00057 * \param distMat - distance matrix - a vector of double. It is assumed that only the 00058 * lower triangle element of the matrix are supplied in a 1D array\n 00059 * NOTE: this matrix WILL BE ALTERED during the picking\n 00060 * \param poolSize - the size of the pool to pick the items from. It is assumed that the 00061 * distance matrix above contains the right number of elements; i.e. 00062 * poolSize*(poolSize-1) \n 00063 * \param pickSize - the number items to pick from pool (<= poolSize) 00064 */ 00065 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const ; 00066 00067 /*! \brief This is the function that does the clustering of the items - used by the picker 00068 * 00069 * ARGUMENTS: 00070 * 00071 * \param distMat - distance matrix - a vector of double. It is assumed that only the 00072 * lower triangle element of the matrix are supplied in a 1D array\n 00073 * NOTE: this matrix WILL BE ALTERED during the picking\n 00074 * \param poolSize - the size of the pool to pick the items from. It is assumed that the 00075 * distance matrix above contains the right number of elements; i.e. 00076 * poolSize*(poolSize-1) \n 00077 * \param pickSize - the number clusters to divide the pool into (<= poolSize) 00078 */ 00079 RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const; 00080 00081 private: 00082 ClusterMethod d_method; 00083 }; 00084 }; 00085 00086 #endif
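// 1.5.5  (presumably the version stamp of the documentation generator that produced this listing; not part of the header)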