Catalog.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2006 Rational Discovery LLC
00003 //
00004 //  @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 
00011 #ifndef __RD_CATALOG_H__
00012 #define __RD_CATALOG_H__
00013 
00014 // Boost graph stuff
00015 #include <boost/graph/graph_traits.hpp>
00016 #include <boost/graph/adjacency_list.hpp>
00017 #include <boost/version.hpp>
00018 #if BOOST_VERSION >= 104000
00019 #include <boost/property_map/property_map.hpp>
00020 #else
00021 #include <boost/property_map.hpp>
00022 #endif
00023 
00024 
00025 // for some typedefs
00026 #include <RDGeneral/types.h>
00027 #include <RDGeneral/StreamOps.h>
00028 
00029 namespace RDCatalog {
00030   const int versionMajor=1;
00031   const int versionMinor=0;
00032   const int versionPatch=0;
00033   const int endianId=0xDEADBEEF;
00034   
00035   //-----------------------------------------------------------------------------
00036   //! abstract base class for a catalog object
00037   template <class entryType, class paramType>
00038   class Catalog {
00039   public:
00040     //------------------------------------
00041     Catalog() : d_fpLength(0), dp_cParams(0) {};
00042 
00043     //------------------------------------
00044     virtual ~Catalog(){
00045       if (dp_cParams) {
00046         delete dp_cParams;
00047       }
00048     }
00049     
00050     //------------------------------------
00051     //! return a serialized form of the Catalog as an std::string
00052     virtual std::string Serialize() const = 0;
00053     
00054     //------------------------------------
00055     //! adds an entry to the catalog
00056     /*!
00057 
00058       \param entry          the entry to be added
00059       \param updateFPLength (optional) if this is true, our internal
00060       fingerprint length will also be updated.
00061       
00062     */
00063     virtual unsigned int addEntry(entryType *entry, bool updateFPLength = true) = 0;
00064     
00065     //------------------------------------
00066     //! returns a particular entry in the Catalog
00067     virtual const entryType* getEntryWithIdx(unsigned int idx) const = 0;
00068     
00069     //------------------------------------
00070     //! returns the number of entries
00071     virtual unsigned int getNumEntries() const = 0;
00072     
00073     //------------------------------------
00074     //! returns the length of our fingerprint
00075     unsigned int getFPLength() const {return d_fpLength;}
00076     
00077     //------------------------------------
00078     //! sets our fingerprint length
00079     void setFPLength(unsigned int val) {d_fpLength = val;}
00080     
00081     //------------------------------------
00082     //! sets our parameters by copying the \c params argument
00083     void setCatalogParams(paramType *params) {
00084       PRECONDITION(params,"bad parameter object");
00085       //if we already have a paramter object throw an exception
00086       PRECONDITION(!dp_cParams,"A parameter object already exists on the catalog" );
00087       /*
00088         if (dp_cParams) {
00089         // we already have parameter object on the catalog
00090         // can't overwrite it
00091         PRECONDITION(0, "A parameter object already exist on the catalog");
00092         }*/
00093       dp_cParams = new paramType(*params);
00094     }
00095     
00096     //------------------------------------
00097     //! returns a pointer to our parameters
00098     const paramType *getCatalogParams() const { return dp_cParams;}
00099 
00100   protected:
00101     // this is the ID that will be assigned to the next entry 
00102     // added to the catalog - need not be same as the number of entries 
00103     // in the catalog and does not correspond with the
00104     // id of the entry in the catalog.
00105     // this is more along the lines of bitId
00106     unsigned int d_fpLength; //!< the length of our fingerprint
00107     paramType *dp_cParams;   //!< our params object
00108 
00109   };
00110 
00111 
00112 
00113   //-----------------------------------------------------------------------------
00114   //! A Catalog with a hierarchical structure
00115   /*!
00116 
00117     The entries of a HierarchCatalog are arranged in a directed graph
00118 
00119     <b>The difference between <i>Indices</i> and <i>Bit Ids</i></b>
00120     
00121     A HierarchCatalog may contain more entries than the user is actually
00122     interested in.  For example a HierarchCatalog constructed to contain
00123     orders 5 through 8 may well contain information about orders 1-5,
00124     in order to facilitate some search optimizations.
00125 
00126     - <i>Bit Ids</i> refer to the "interesting" bits.  
00127     So, in the above example, Bit Id \c 0 will be the first entry
00128     with order 5.
00129     - <i>Indices</i> refer to the underlying structure of the catalog.
00130     So, in the above example, the entry with index \c 0 will be
00131     the first entry with order 1.
00132 
00133   */
00134   template <class entryType, class paramType, class orderType> 
00135   class HierarchCatalog : public Catalog <entryType, paramType> {
00136     // the entries in the catalog can be traversed using the edges
00137     // in a desired order
00138   public:
00139     //! used by the BGL to set up the node properties in our graph
00140     struct vertex_entry_t {
00141       enum { num=1003 };
00142       typedef boost::vertex_property_tag kind;
00143     };
00144     typedef boost::property<vertex_entry_t, entryType *> EntryProperty;
00145     
00146     //! the type of the graph itself:
00147     typedef boost::adjacency_list<boost::vecS,
00148                                          boost::vecS, // FIX: should be using setS for edges so that parallel edges are never added (page 225 BGL book)
00149       // but that seems result in compile errors
00150       boost::bidirectionalS,
00151              EntryProperty> CatalogGraph;
00152     
00153     typedef boost::graph_traits<CatalogGraph> CAT_GRAPH_TRAITS;
00154     typedef typename CAT_GRAPH_TRAITS::vertex_iterator VER_ITER;
00155     typedef std::pair<VER_ITER, VER_ITER> ENT_ITER_PAIR;
00156     typedef typename CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER;
00157     typedef std::pair<DOWN_ENT_ITER, DOWN_ENT_ITER> DOWN_ENT_ITER_PAIR;
00158     
00159     //------------------------------------
00160     HierarchCatalog<entryType, paramType, orderType>() {};
00161     
00162     //------------------------------------
00163     //! Construct by making a copy of the input \c params object
00164     HierarchCatalog<entryType, paramType, orderType>(paramType *params) : Catalog<entryType,paramType>() {
00165       this->setCatalogParams(params);
00166     }
00167 
00168     //------------------------------------
00169     //! Construct from a \c pickle (a serialized form of the HierarchCatalog)
00170     HierarchCatalog<entryType, paramType, orderType>(const std::string &pickle) {
00171       this->initFromString(pickle);
00172     }
00173     
00174     //------------------------------------
00175     ~HierarchCatalog() {
00176       destroy();
00177     }
00178 
00179     //------------------------------------
00180     //! serializes this object to a stream
00181     void toStream(std::ostream &ss) const {
00182       PRECONDITION(this->getCatalogParams(),"NULL parameter object");
00183       
00184       // the i/o header:
00185       RDKit::streamWrite(ss,endianId);
00186       RDKit::streamWrite(ss,versionMajor);
00187       RDKit::streamWrite(ss,versionMinor);
00188       RDKit::streamWrite(ss,versionPatch);
00189 
00190       // information about the catalog itself:
00191       int tmpUInt;
00192       tmpUInt = this->getFPLength();
00193       RDKit::streamWrite(ss,tmpUInt);
00194       tmpUInt = this->getNumEntries();
00195       RDKit::streamWrite(ss,tmpUInt);
00196 
00197       //std::cout << ">>>>-------------------------------" << std::endl;
00198       //std::cout << "\tlength: " << getFPLength() << " " << getNumEntries() << std::endl;
00199 
00200       // add the params object:
00201       this->getCatalogParams()->toStream(ss);
00202       //std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
00203       //std::cout << " " << getCatalogParams()->getUpperFragLength();
00204       //std::cout << " " << getCatalogParams()->getNumFuncGroups();
00205       //std::cout << std::endl;
00206       
00207       // write the entries in order:
00208       for(unsigned int i=0;i<getNumEntries();i++){
00209         this->getEntryWithIdx(i)->toStream(ss);
00210       }
00211 
00212       // finally the adjacency list:
00213       for(unsigned int i=0;i<getNumEntries();i++){
00214         RDKit::INT_VECT children=this->getDownEntryList(i);
00215         tmpUInt = children.size();
00216         RDKit::streamWrite(ss,tmpUInt);
00217         for(RDKit::INT_VECT::const_iterator ivci=children.begin();
00218             ivci!=children.end();
00219             ivci++){
00220           RDKit::streamWrite(ss,*ivci);
00221         }
00222       }
00223     }
00224 
00225     //------------------------------------
00226     //! serializes this object and returns the resulting \c pickle
00227     std::string Serialize() const {
00228       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00229       this->toStream(ss);
00230       return ss.str();
00231     }
00232 
00233     //------------------------------------
00234     //! fills the contents of this object from a stream containing a \c pickle
00235     void initFromStream(std::istream &ss) {
00236       int tmpInt;
00237       // FIX: at the moment we ignore the header info:
00238       RDKit::streamRead(ss,tmpInt);
00239       RDKit::streamRead(ss,tmpInt);
00240       RDKit::streamRead(ss,tmpInt);
00241       RDKit::streamRead(ss,tmpInt);
00242       
00243       unsigned int tmpUInt;
00244       RDKit::streamRead(ss,tmpUInt);// fp length
00245       this->setFPLength(tmpUInt);
00246       
00247       unsigned int numEntries;
00248       RDKit::streamRead(ss,numEntries);
00249       //std::cout << "<<<-------------------------------" << std::endl;
00250       //std::cout << "\tlength: " << getFPLength() << " " << numEntries << std::endl;
00251 
00252 
00253       // grab the params:
00254       paramType *params = new paramType();
00255       params->initFromStream(ss);
00256       this->setCatalogParams(params);
00257 
00258       //std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
00259       //std::cout << " " << getCatalogParams()->getUpperFragLength();
00260       //std::cout << " " << getCatalogParams()->getNumFuncGroups();
00261       //std::cout << std::endl;
00262 
00263       // now all of the entries:
00264       for(unsigned int i=0;i<numEntries;i++){
00265         entryType *entry = new entryType();
00266         entry->initFromStream(ss);
00267         this->addEntry(entry,false);
00268       }
00269 
00270       // and, finally, the adjacency list:
00271       for(unsigned int i=0;i<numEntries;i++){
00272         unsigned int nNeighbors;
00273         RDKit::streamRead(ss,nNeighbors);
00274         for(unsigned int j=0;j<nNeighbors;j++){
00275           RDKit::streamRead(ss,tmpInt);
00276           this->addEdge(i,tmpInt);
00277         }
00278       }
00279     }
00280     
00281     //------------------------------------
00282     unsigned int getNumEntries() const {
00283       return boost::num_vertices(d_graph);
00284     }
00285 
00286     //------------------------------------
00287     //! fills the contents of this object from a string containing a \c pickle
00288     void initFromString(const std::string &text){
00289       std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00290       // initialize the stream:
00291       ss.write(text.c_str(),text.length());
00292       // now start reading out values:
00293       this->initFromStream(ss);
00294     }
00295 
00296     //------------------------------------
00297     //! add a new entry to the catalog
00298     /*!
00299 
00300       \param entry          the entry to be added
00301       \param updateFPLength (optional) if this is true, our internal
00302       fingerprint length will also be updated.
00303 
00304     */
00305     unsigned int addEntry(entryType *entry, bool updateFPLength = true){ 
00306       PRECONDITION(entry,"bad arguments");
00307       if (updateFPLength) {
00308         unsigned int fpl = this->getFPLength();
00309         entry->setBitId(fpl);
00310         fpl++;
00311         this->setFPLength(fpl);
00312       }
00313       unsigned int eid = boost::add_vertex(EntryProperty(entry), d_graph);
00314       orderType etype = entry->getOrder();
00315       // REVIEW: this initialization is not required: the STL map, in
00316       // theory, will create a new object when operator[] is called
00317       // for a new item
00318       if (d_orderMap.find(etype) == d_orderMap.end()) {
00319         RDKit::INT_VECT nets;
00320         d_orderMap[etype] = nets;
00321       }
00322       d_orderMap[etype].push_back(eid);
00323       return eid;
00324     }
00325 
00326     //------------------------------------
00327     //! adds an edge between two entries in the catalog
00328     /*!
00329       Since we are using a bidirectional graph - the order in 
00330       which the ids are supplied here makes a difference
00331 
00332       \param id1 index of the edge's beginning
00333       \param id2 index of the edge's end
00334 
00335     */
00336     void addEdge(unsigned int id1, unsigned int id2) {
00337       unsigned int nents = getNumEntries();
00338       RANGE_CHECK(0, id1, nents-1);
00339       RANGE_CHECK(0, id2, nents-1);
00340       // FIX: if we boost::setS for the edgeList BGL will
00341       // do the checking for duplicity (parallel edges)
00342       // But for reasons unknown setS results in compile
00343       // errors while using adjacent_vertices.
00344       typename CAT_GRAPH_TRAITS::edge_descriptor edge;
00345       bool found;
00346       boost::tie(edge,found) = boost::edge(boost::vertex(id1,d_graph),
00347                                            boost::vertex(id2,d_graph),
00348                                            d_graph);
00349       if (!found) {
00350         boost::add_edge(id1, id2, d_graph);
00351       }
00352     }
00353 
00354     //------------------------------------
00355     //! returns a pointer to our entry with a particular index 
00356     const entryType *getEntryWithIdx(unsigned int idx) const {
00357       RANGE_CHECK(0,idx,getNumEntries()-1);
00358       int vd = boost::vertex(idx, d_graph);
00359       typename boost::property_map < CatalogGraph, vertex_entry_t>::const_type 
00360         pMap = boost::get(vertex_entry_t(), d_graph);
00361       return pMap[vd];
00362     }
00363 
00364     //------------------------------------
00365     //! returns a pointer to our entry with a particular bit ID
00366     const entryType *getEntryWithBitId(unsigned int idx) const {
00367       RANGE_CHECK(0,idx,this->getFPLength()-1);
00368       typename boost::property_map < CatalogGraph, vertex_entry_t>::const_type 
00369         pMap = boost::get(vertex_entry_t(), d_graph);
00370       const entryType *res=NULL;
00371       for(unsigned int i=idx;i<this->getNumEntries();i++){
00372         const entryType *e=pMap[i];
00373         if(e->getBitId()==static_cast<int>(idx)){
00374           res=e;
00375           break;
00376         }
00377       }
00378       return res;
00379     }
00380 
00381     //------------------------------------
00382     //! returns the index of the entry with a particular bit ID
00383     int getIdOfEntryWithBitId(unsigned int idx) const {
00384       RANGE_CHECK(0,idx,this->getFPLength()-1);
00385       typename boost::property_map < CatalogGraph, vertex_entry_t>::const_type 
00386         pMap = boost::get(vertex_entry_t(), d_graph);
00387       int res=-1;
00388       for(unsigned int i=idx;i<this->getNumEntries();i++){
00389         const entryType *e=pMap[i];
00390         if(static_cast<unsigned int>(e->getBitId())==idx){
00391           res=i;
00392           break;
00393         }
00394       }
00395       return res;
00396     }
00397 
00398     //------------------------------------
00399     //! returns a list of the indices of entries below the one passed in
00400     RDKit::INT_VECT getDownEntryList(unsigned int idx) const {
00401       RDKit::INT_VECT res;
00402       DOWN_ENT_ITER nbrIdx, endIdx;
00403       boost::tie(nbrIdx, endIdx) = boost::adjacent_vertices(idx, d_graph);
00404       while (nbrIdx != endIdx) {
00405         res.push_back(*nbrIdx);
00406         nbrIdx++;
00407       }
00408       //std::cout << res.size() << "\n";
00409       return res;
00410     }
00411 
00412     //------------------------------------
00413     //! returns a list of the indices that have a particular order 
00414     const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) {
00415       return d_orderMap[ord];
00416     }
00417 
00418     //------------------------------------
00419     //! returns a list of the indices that have a particular order
00420     /*!
00421       \overload
00422     */
00423     const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) const {
00424       typename std::map<orderType, RDKit::INT_VECT>::const_iterator elem;
00425       elem = d_orderMap.find(ord);
00426       CHECK_INVARIANT(elem!=d_orderMap.end()," catalog does not contain any entries of the order specified");
00427       return elem->second;
00428     }
00429 
00430     
00431   private:
00432     // graphs that store the entries in the catalog in a hierachical manner
00433     CatalogGraph d_graph;
00434     // a  map that maps the order type of entries in the catalog to 
00435     // a vector of vertex indices in the graphs above
00436     // e.g.  for a catalog with molecular fragments, the order of a fragment can 
00437     // simply be the number of bond in it. The list this oder maps to is all the
00438     // vertex ids of these fragment in the catalog that have this many bonds in them
00439     std::map<orderType, RDKit::INT_VECT> d_orderMap;
00440 
00441     //------------------------------------
00442     //! clear any memory that we've used
00443     void destroy() {
00444       ENT_ITER_PAIR entItP = boost::vertices(d_graph);
00445       typename boost::property_map < CatalogGraph, vertex_entry_t>::type 
00446         pMap = boost::get(vertex_entry_t(), d_graph);
00447       while (entItP.first != entItP.second) {
00448         delete pMap[*(entItP.first++)];
00449       }
00450     }
00451 
00452 
00453     
00454   };
00455 
00456 
00457   //-----------------------------------------------------------------------------
00458   //! a linear Catalog (analogous to an std::vector)
00459   /*!
00460     Here there is no particular hierarchy, simply a
00461     collection of entries.
00462   */
00463   template <class entryType, class orderType> 
00464   class LinearCatalog : public Catalog <entryType, orderType> {
00465     // here there is no particular hierarchy of entries
00466     // we simply model it as a vector of entries
00467     // FIX: for retrieval purposes a better model map be std::map
00468 
00469   public:
00470     std::string Serialize();
00471     
00472     unsigned int addEntry(entryType *entry, bool updateFPLength = true);
00473     
00474     const entryType *getEntryWithIdx(unsigned int idx) const;
00475 
00476   private:
00477     std::vector<entryType*> d_vector;
00478   };
00479 }
00480 
00481 #endif