MolSupplier.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2002-2008 greg landrum and Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 #ifndef _RD_MOLSUPPLIER_H
00011 #define _RD_MOLSUPPLIER_H
00012 
00013 #include <RDGeneral/types.h>
00014 
00015 #include <string>
00016 #include <list>
00017 #include <vector>
00018 #include <iostream>
00019 #include <GraphMol/ROMol.h>
00020 
00021 namespace RDKit {
00022   std::string strip(const std::string &orig); // declaration only, defined outside this header. NOTE(review): presumably returns a copy of orig without leading/trailing whitespace - confirm against the definition
00023 
00024   /*! 
00025   //
00026   //  Here are a couple of ways one can interact with MolSuppliers:
00027   //
00028   //  1) Lazy (ForwardIterator):
00029   //     while(!supplier.atEnd()){
00030   //       ROMol *mol = supplier.next();
00031   //       if(mol){
00032   //           do something;
00033   //       }
00034   //     }
00035   //  2) Random Access (only for suppliers that declare length() and operator[]):
00036   //     for(unsigned int i=0;i<supplier.length();i++){
00037   //       ROMol *mol = supplier[i];
00038   //       if(mol){
00039   //           do something;
00040   //       }
00041   //     }
00042   //
00043   //
00044   */
00045   class MolSupplier {
00046     // Abstract base class for objects that supply molecules one at a time.
00047   public:
00048     MolSupplier() : dp_inStream(NULL), df_owner(false) {} // no stream and no ownership until a subclass sets them, so neither member is ever indeterminate
00049     virtual ~MolSupplier() {} // does not delete dp_inStream itself; see e.g. ForwardSDMolSupplier's destructor for the owning case
00050     virtual void init() = 0;  // set up the supplier's state; semantics are defined by each subclass
00051     virtual void reset() = 0; // NOTE(review): presumably returns to the first molecule - confirm in the subclasses
00052     virtual bool atEnd() = 0; // loop-termination test used together with next() in the usage example above
00053     virtual ROMol *next() = 0; // next molecule; the usage example above checks the result for NULL before using it
00054 
00055   private:
00056     // disable automatic copy constructors and assignment operators
00057     // for this class and its subclasses.  They will likely be
00058     // carrying around stream pointers and copying those is a recipe
00059     // for disaster.
00060     MolSupplier(const MolSupplier&); // private and given no definition in this header
00061     MolSupplier &operator=(const MolSupplier&);
00062   protected:
00063     // stream to read the molecules from (NULL when no stream is attached):
00064     std::istream *dp_inStream;
00065     // do we own dp_inStream? (if so, a subclass destructor is expected to delete it)
00066     bool df_owner; 
00067   };
00068 
00069 
00070   //! \brief a supplier from an SD file that only reads forward:
00071   class ForwardSDMolSupplier : public MolSupplier {
00072     /*************************************************************************
00073      * A lazy, forward-only mol supplier that reads SD data from a std::istream.
00074      *  - Unlike SDMolSupplier it declares no storage for per-molecule stream positions and no length()/operator[].
00075      ***********************************************************************************/
00076   public:
00077     ForwardSDMolSupplier() { init(); }; // default construction only runs this class's init()
00078 
00079     explicit ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true,
00080                                   bool sanitize=true,bool removeHs=true); // NOTE(review): takeOwnership presumably ends up in df_owner - confirm in the definition
00081 
00082     virtual ~ForwardSDMolSupplier() {
00083       if (df_owner && dp_inStream) { // only delete a stream that we own
00084         delete dp_inStream;
00085         df_owner=false; // clear both members so no stale ownership flag or pointer remains
00086         dp_inStream=NULL;
00087       }
00088     };
00089 
00090     virtual void init();  // implements MolSupplier::init()
00091     virtual void reset(); // implements MolSupplier::reset()
00092     virtual ROMol *next(); // implements MolSupplier::next()
00093     virtual bool atEnd(); // implements MolSupplier::atEnd()
00094 
00095   protected:
00096     virtual void checkForEnd(); // virtual hook; SDMolSupplier declares its own version
00097     ROMol *_next(); // non-virtual helper. NOTE(review): presumably does the actual record parsing for next() - confirm
00098     virtual void readMolProps(ROMol *); // NOTE(review): presumably reads the SD data fields into the molecule - confirm
00099     bool df_end; // NOTE(review): presumably "have we reached the end of the input?", as for SmilesMolSupplier::df_end
00100     int d_line; // line number we are currently on
00101     bool df_sanitize,df_removeHs; // NOTE(review): presumably hold the sanitize/removeHs constructor options - confirm
00102   };
00103 
00104 
00105   //! \brief a lazy supplier from an SD file
00106   class SDMolSupplier : public ForwardSDMolSupplier {
00107     /*************************************************************************
00108      * A lazy mol supplier from an SD file that adds random access to ForwardSDMolSupplier.
00109      *  - When new molecules are read using "next" their positions in the file are noted. 
00110      *  - A call to "length" will automatically parse the entire file and cache all the mol
00111      *    block positions
00112      *  - The [] operator is used to access the molecule at "idx"; calling next after this will
00113      *    return the molecule following "idx"
00114      ***********************************************************************************/
00115 
00116   public:
00117     SDMolSupplier() { init(); }; // the base-class constructor has already run ForwardSDMolSupplier::init(); this runs SDMolSupplier::init()
00118 
00119     /*! 
00120      *   \param fileName - the name of the SD file
00121      *   \param sanitize - if true sanitize the molecule before returning it
00122      *   \param removeHs - if true remove Hs from the molecule before returning it
00123      *                     (triggers sanitization)
00124      */
00125     explicit SDMolSupplier(const std::string &fileName, bool sanitize=true,
00126                            bool removeHs=true);
00127     // stream-based constructor. NOTE(review): takeOwnership presumably decides whether the supplier deletes inStream - confirm
00128     explicit SDMolSupplier(std::istream *inStream, bool takeOwnership=true,
00129                            bool sanitize=true,bool removeHs=true);
00130 
00131     
00132     ~SDMolSupplier() {}; // nothing extra to release here; ~ForwardSDMolSupplier() deletes an owned stream
00133     void init();  // overrides ForwardSDMolSupplier::init()
00134     void reset(); // overrides ForwardSDMolSupplier::reset()
00135     ROMol *next(); // overrides ForwardSDMolSupplier::next()
00136     bool atEnd(); // overrides ForwardSDMolSupplier::atEnd()
00137     void moveTo(unsigned int idx); // NOTE(review): presumably positions the stream at molecule idx without parsing it - confirm
00138     ROMol * operator[](unsigned int idx); // random access; a following next() yields the molecule after idx (see class comment)
00139     /*! \brief returns the text block for a particular item
00140      *  
00141      *  \param idx - which item to return
00142      */
00143     std::string getItemText(unsigned int idx);
00144     unsigned int length(); // number of molecules; parses the whole file on demand (see class comment)
00145     void setData(const std::string &text,bool sanitize=true, bool removeHs=true); // NOTE(review): presumably replaces the input with the SD text given - confirm
00146 
00147     /*! Resets our internal state and sets the indices of molecules in the stream.
00148      *  The client should be *very* careful about calling this method, as it's trivial
00149      *  to end up with a completely useless supplier.
00150      *
00151      *   \param locs - the vector of stream positions.
00152      *
00153      *  Note that this can be used not only to make reading selected molecules from a
00154      *  large SD file much faster, but it can also allow subsetting an SD file or
00155      *  rearranging the order of the molecules.
00156      */
00157     void setStreamIndices(const std::vector<std::streampos> &locs);
00158 
00159   private:
00160     void checkForEnd(); // overrides the protected virtual in ForwardSDMolSupplier
00161     int d_len; // total number of mol blocks in the file (initialized to -1)
00162     int d_last; // the molecule we are ready to read
00163     std::vector<std::streampos> d_molpos; // stream positions of the mol blocks (cf. setStreamIndices())
00164 
00165   };
00166 
00167   //! lazy file parser for Smiles tables
00168   class SmilesMolSupplier : public MolSupplier {
00169     /**************************************************************************
00170      * Lazy file parser for Smiles table file, similar to the lazy SD
00171      * file parser above
00172      *  - As and when new molecules are read using "next" their
00173      *    positions in the file are noted.
00174      *  - A call to "length" will automatically parse the entire
00175      *    file and cache all the molecule positions
00176      *  - The [] operator is used to access the molecule at "idx"; calling
00177      *    next after this will return the molecule following
00178      *    "idx"
00179      ***************************************************************************/ 
00180   public:
00181 
00182     /*! 
00183      *   \param fileName - the name of smiles table file
00184      *   \param delimiter - delimiting characters between records on each
00185      *     line. NOTE that this is not a string, the tokenizer looks for
00186      *     the individual characters in delimiter, not the full string
00187      *     itself.  So the default delimiter: " \t", means " " or "\t".
00188      *   \param smilesColumn - column number for the SMILES string (defaults
00189      *     to the first column)
00190      *   \param nameColumn - column number for the molecule name (defaults to
00191      *     the second column). If set to -1 we assume that no name is
00192      *     available for the molecule and the name is defaulted to the
00193      *     smiles string
00194      *   \param titleLine - if true, the first line is assumed to list the
00195      *     names of properties in order, separated by 'delimiter'. It is
00196      *     also assumed that the 'SMILES' column and the 'name' column
00197      *     are not specified here. If false, no title line is assumed
00198      *     and the properties are recorded as "columnX" where "X" is
00199      *     the column number
00200      *   \param sanitize - if true sanitize the molecule before returning it
00201      */
00202     explicit SmilesMolSupplier(const std::string &fileName, 
00203                                const std::string &delimiter=" \t",
00204                                int smilesColumn=0,
00205                                int nameColumn=1, 
00206                                bool titleLine=true,                
00207                                bool sanitize=true);
00208     SmilesMolSupplier(); // default constructor, defined outside this header
00209     explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, // NOTE(review): takeOwnership presumably decides whether the supplier deletes inStream - confirm
00210                                const std::string &delimiter=" \t",
00211                                int smilesColumn=0,
00212                                int nameColumn=1, 
00213                                bool titleLine=true,                
00214                                bool sanitize=true);                               
00215 
00216     ~SmilesMolSupplier();
00217     void setData(const std::string &text,
00218                  const std::string &delimiter=" ", // NOTE(review): default differs from the constructors' " \t" - confirm this is intentional
00219                  int smilesColumn=0,
00220                  int nameColumn=1, 
00221                  bool titleLine=true,              
00222                  bool sanitize=true);
00223     void init();  // implements MolSupplier::init()
00224     void reset(); // implements MolSupplier::reset()
00225     ROMol *next(); // implements MolSupplier::next()
00226     bool atEnd(); // implements MolSupplier::atEnd()
00227     void moveTo(unsigned int idx); // NOTE(review): presumably positions the stream at molecule idx without parsing it - confirm
00228     ROMol * operator[](unsigned int idx); // random access; a following next() yields the molecule after idx (see class comment)
00229     /*! \brief returns the text block for a particular item
00230      *  
00231      *  \param idx - which item to return
00232      */
00233     std::string getItemText(unsigned int idx);
00234     unsigned int length(); // number of molecules; parses the whole file on demand (see class comment)
00235 
00236   private:
00237     ROMol *processLine(std::string inLine); // NOTE(review): presumably builds a molecule from one line of the table - confirm
00238     void processTitleLine(); // NOTE(review): presumably fills d_props from the title line - confirm
00239     std::string nextLine();
00240     int skipComments(); // NOTE(review): meaning of the returned int is not visible in this header
00241     void checkForEnd();
00242     
00243     bool df_end; // have we reached the end of the file?
00244     int d_len; // total number of smiles in the file
00245     int d_next; // the molecule we are ready to read
00246     int d_line; // line number we are currently on
00247     std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
00248     std::vector<int> d_lineNums; // NOTE(review): presumably the line number matching each entry of d_molpos - confirm
00249     std::string d_delim; // the delimiter characters (each character is a separate delimiter, see constructor docs)
00250     bool df_sanitize; // sanitize molecules before returning them?
00251     STR_VECT d_props; // vector of property names
00252     bool df_title; // do we have a title line?
00253     int d_smi; // column id for the smiles string
00254     int d_name; // column id for the name
00255   };
00256 
00257   //! lazy file parser for TDT files
00258   class TDTMolSupplier : public MolSupplier {
00259     /**************************************************************************
00260      * Lazy file parser for TDT files, similar to the lazy SD
00261      * file parser above
00262      *  - As and when new molecules are read using "next" their
00263      *    positions in the file are noted.
00264      *  - A call to "length" will automatically parse the entire
00265      *    file and cache all the molecule positions
00266      *  - The [] operator is used to access the molecule at "idx"; calling
00267      *    next after this will return the molecule following
00268      *    "idx"
00269      ***************************************************************************/ 
00270   public:
00271 
00272     /*! 
00273      *   \param fileName - the name of the TDT file
00274      *   \param nameRecord - property name for the molecule name.
00275      *     If empty (the default), the name defaults to be empty
00276      *   \param confId2D - if >=0 and 2D coordinates are provided, the 2D
00277      *                   structure (depiction) in the input will be read into the
00278      *                   corresponding conformer id (the default is -1).
00279      *   \param confId3D - if >=0 and 3D coordinates are provided, the 3D
00280      *                   structure in the input will be read into the
00281      *                   corresponding conformer id (the default is 0).
00282      *   \param sanitize - if true sanitize the molecule before returning it
00283      */
00284     explicit TDTMolSupplier(const std::string &fileName, 
00285                             const std::string &nameRecord="",
00286                             int confId2D=-1,int confId3D=0,
00287                             bool sanitize=true);
00288     explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, // NOTE(review): takeOwnership presumably decides whether the supplier deletes inStream - confirm
00289                             const std::string &nameRecord="",
00290                             int confId2D=-1,int confId3D=0,
00291                             bool sanitize=true);
00292     TDTMolSupplier(); // default constructor, defined outside this header
00293     ~TDTMolSupplier();
00294     void setData(const std::string &text, // NOTE(review): presumably replaces the input with the TDT text given - confirm
00295                  const std::string &nameRecord="",
00296                  int confId2D=-1,int confId3D=0,
00297                  bool sanitize=true);
00298     void init();  // implements MolSupplier::init()
00299     void reset(); // implements MolSupplier::reset()
00300     ROMol *next(); // implements MolSupplier::next()
00301     bool atEnd(); // implements MolSupplier::atEnd()
00302     void moveTo(unsigned int idx); // NOTE(review): presumably positions the stream at molecule idx without parsing it - confirm
00303     ROMol * operator[](unsigned int idx); // random access; a following next() yields the molecule after idx (see class comment)
00304     /*! \brief returns the text block for a particular item
00305      *  
00306      *  \param idx - which item to return
00307      */
00308     std::string getItemText(unsigned int idx);
00309     unsigned int length(); // number of molecules; parses the whole file on demand (see class comment)
00310 
00311   private:
00312     bool advanceToNextRecord(); // NOTE(review): meaning of the returned bool is not visible in this header
00313     void checkForEnd();
00314     ROMol *parseMol(std::string inLine); // NOTE(review): presumably builds a molecule from the text of one record - confirm
00315 
00316     bool df_end; // have we reached the end of the file?
00317     int d_len; // total number of mols in the file
00318     int d_next; // the molecule we are ready to read
00319     int d_last; // NOTE(review): original comment duplicated d_next's ("the molecule we are ready to read"); actual meaning not visible here
00320     int d_line; // line number we are currently on
00321     int d_confId2D; // id to use for 2D conformers
00322     int d_confId3D; // id to use for 3D conformers
00323     std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
00324     bool df_sanitize; // sanitize molecules before returning them?
00325     std::string d_nameProp; // local storage for the property providing mol names
00326   };
00327 
00328 }
00329 
00330 #endif