00001 // 00002 // Copyright (C) 2003-2006 Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // This file is part of the RDKit. 00006 // The contents are covered by the terms of the BSD license 00007 // which is included in the file license.txt, found at the root 00008 // of the RDKit source tree. 00009 // 00010 #ifndef _HIERARCHCLUSTERPICKER_H 00011 #define _HIERARCHCLUSTERPICKER_H 00012 00013 #include <RDGeneral/types.h> 00014 #include "DistPicker.h" 00015 00016 namespace RDPickers { 00017 00018 /*! \brief Diversity picker based on hierarchical clustering 00019 * 00020 * This class inherits from DistPicker since it uses the distance matrix 00021 * for diversity picking. The clustering itself is done using the Murtagh 00022 * code in $RDBASE/Code/ML/Cluster/Mutagh/ 00023 */ 00024 class HierarchicalClusterPicker : public DistPicker { 00025 public: 00026 00027 /*! \brief The type of hierarchical clustering algorithm to use 00028 */ 00029 typedef enum { 00030 WARD=1, 00031 SLINK=2, 00032 CLINK=3, 00033 UPGMA=4, 00034 MCQUITTY=5, 00035 GOWER=6, 00036 CENTROID=7 } ClusterMethod; 00037 00038 /*! \brief Constructor - takes a ClusterMethod as an argument 00039 * 00040 * Sets the hierarch clustering method 00041 */ 00042 explicit HierarchicalClusterPicker(ClusterMethod clusterMethod) : d_method(clusterMethod) {;}; 00043 00044 /*! \brief This is the function that does the picking 00045 * 00046 * Here is how the algorithm works \n 00047 * FIX: Supply reference 00048 * 00049 * - The entire pool is clustered using the distance matrix using one of the 00050 * hierachical clustering method (specified via the constructor). \n 00051 * - Starting with the individaul items in the pool, clusters are merged based 00052 * on the output from clustering method. \n 00053 * - The merging is stopped when the number of clusters is same as 00054 * the number of picks. 
00055 * - For each item in a cluster the sum of square of the distances to the rest of 00056 * of the items (in the cluster) is computed. The item with the smallest of values is 00057 * picked as a representative of the cluster. Basically trying to pick the item closest 00058 * to the centroid of the cluster. 00059 * 00060 * 00061 * \param distMat - distance matrix - a vector of double. It is assumed that only the 00062 * lower triangle element of the matrix are supplied in a 1D array\n 00063 * NOTE: this matrix WILL BE ALTERED during the picking\n 00064 * \param poolSize - the size of the pool to pick the items from. It is assumed that the 00065 * distance matrix above contains the right number of elements; i.e. 00066 * poolSize*(poolSize-1) \n 00067 * \param pickSize - the number items to pick from pool (<= poolSize) 00068 */ 00069 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const ; 00070 00071 /*! \brief This is the function that does the clustering of the items - used by the picker 00072 * 00073 * ARGUMENTS: 00074 * 00075 * \param distMat - distance matrix - a vector of double. It is assumed that only the 00076 * lower triangle element of the matrix are supplied in a 1D array\n 00077 * NOTE: this matrix WILL BE ALTERED during the picking\n 00078 * \param poolSize - the size of the pool to pick the items from. It is assumed that the 00079 * distance matrix above contains the right number of elements; i.e. 00080 * poolSize*(poolSize-1) \n 00081 * \param pickSize - the number clusters to divide the pool into (<= poolSize) 00082 */ 00083 RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const; 00084 00085 private: 00086 ClusterMethod d_method; 00087 }; 00088 }; 00089 00090 #endif
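// 1.7.1  -- NOTE(review): trailing version stamp left behind by the source-listing generator (presumably Doxygen); not part of the header itself.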