00001 // 00002 // Copyright (C) 2002-2008 greg landrum and Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // This file is part of the RDKit. 00006 // The contents are covered by the terms of the BSD license 00007 // which is included in the file license.txt, found at the root 00008 // of the RDKit source tree. 00009 // 00010 #ifndef _RD_MOLSUPPLIER_H 00011 #define _RD_MOLSUPPLIER_H 00012 00013 #include <RDGeneral/types.h> 00014 00015 #include <string> 00016 #include <list> 00017 #include <vector> 00018 #include <iostream> 00019 #include <GraphMol/ROMol.h> 00020 00021 namespace RDKit { 00022 std::string strip(const std::string &orig); 00023 00024 /*! 00025 // 00026 // Here are a couple of ways one can interact with MolSuppliers: 00027 // 00028 // 1) Lazy (ForwardIterator): 00029 // while(!supplier.atEnd()){ 00030 // ROMol *mol = supplier.next(); 00031 // if(mol){ 00032 // do something; 00033 // } 00034 // } 00035 // 2) Random Access: 00036 // for(int i=0;i<supplier.length();i++){ 00037 // ROMol *mol = supplier[i]; 00038 // if(mol){ 00039 // do something; 00040 // } 00041 // } 00042 // 00043 // 00044 */ 00045 class MolSupplier { 00046 // this is an abstract base class to supply molecules one at a time 00047 public: 00048 MolSupplier() {}; 00049 virtual ~MolSupplier() {}; 00050 virtual void init() = 0; 00051 virtual void reset() = 0; 00052 virtual bool atEnd() = 0; 00053 virtual ROMol *next() = 0; 00054 00055 private: 00056 // disable automatic copy constructors and assignment operators 00057 // for this class and its subclasses. They will likely be 00058 // carrying around stream pointers and copying those is a recipe 00059 // for disaster. 00060 MolSupplier(const MolSupplier&); 00061 MolSupplier &operator=(const MolSupplier&); 00062 protected: 00063 // stream to read the molecules from: 00064 std::istream *dp_inStream; 00065 // do we own dp_inStream? 00066 bool df_owner; 00067 }; 00068 00069 00070 // \brief a supplier from an SD file that only reads forward: 00071 class ForwardSDMolSupplier : public MolSupplier { 00072 /************************************************************************* 00073 * A lazy mol supplier from a SD file. 00074 * - When new molecules are read using "next" their positions in the file are noted. 00075 ***********************************************************************************/ 00076 public: 00077 ForwardSDMolSupplier() { init(); }; 00078 00079 explicit ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, 00080 bool sanitize=true,bool removeHs=true); 00081 00082 virtual ~ForwardSDMolSupplier() { 00083 if (df_owner && dp_inStream) { 00084 delete dp_inStream; 00085 df_owner=false; 00086 dp_inStream=NULL; 00087 } 00088 }; 00089 00090 virtual void init(); 00091 virtual void reset(); 00092 virtual ROMol *next(); 00093 virtual bool atEnd(); 00094 00095 protected: 00096 virtual void checkForEnd(); 00097 ROMol *_next(); 00098 virtual void readMolProps(ROMol *); 00099 bool df_end; 00100 int d_line; // line number we are currently on 00101 bool df_sanitize,df_removeHs; 00102 }; 00103 00104 00105 // \brief a lazy supplier from an SD file 00106 class SDMolSupplier : public ForwardSDMolSupplier { 00107 /************************************************************************* 00108 * A lazy mol supplier from a SD file. 00109 * - When new molecules are read using "next" their positions in the file are noted. 00110 * - A call to the "length" will automatically parse the entire file and cache all the mol 00111 * block positions 00112 * - [] operator is used to access a molecule at "idx", calling next following this will result 00113 * in the next molecule after "idx" 00114 ***********************************************************************************/ 00115 00116 public: 00117 SDMolSupplier() { init(); }; 00118 00119 /*! 00120 * \param fileName - the name of the SD file 00121 * \param sanitize - if true sanitize the molecule before returning it 00122 * \param removeHs - if true remove Hs from the molecule before returning it 00123 * (triggers sanitization) 00124 */ 00125 explicit SDMolSupplier(const std::string &fileName, bool sanitize=true, 00126 bool removeHs=true); 00127 00128 explicit SDMolSupplier(std::istream *inStream, bool takeOwnership=true, 00129 bool sanitize=true,bool removeHs=true); 00130 00131 00132 ~SDMolSupplier() {}; 00133 void init(); 00134 void reset(); 00135 ROMol *next(); 00136 bool atEnd(); 00137 void moveTo(unsigned int idx); 00138 ROMol * operator[](unsigned int idx); 00139 /*! \brief returns the text block for a particular item 00140 * 00141 * \param idx - which item to return 00142 */ 00143 std::string getItemText(unsigned int idx); 00144 unsigned int length(); 00145 void setData(const std::string &text,bool sanitize=true, bool removeHs=true); 00146 00147 /*! Resets our internal state and sets the indices of molecules in the stream. 00148 * The client should be *very* careful about calling this method, as it's trivial 00149 * to end up with a completely useless supplier. 00150 * 00151 * \param locs - the vector of stream positions. 00152 * 00153 * Note that this can be used not only to make reading selected molecules from a 00154 * large SD file much faster, but it can also allow subsetting an SD file or 00155 * rearranging the order of the molecules. 00156 */ 00157 void setStreamIndices(const std::vector<std::streampos> &locs); 00158 00159 private: 00160 void checkForEnd(); 00161 int d_len; // total number of mol blocks in the file (initialized to -1) 00162 int d_last; // the molecule we are ready to read 00163 std::vector<std::streampos> d_molpos; 00164 00165 }; 00166 00167 //! lazy file parser for Smiles tables 00168 class SmilesMolSupplier : public MolSupplier { 00169 /************************************************************************** 00170 * Lazy file parser for Smiles table file, similar to the lazy SD 00171 * file parser above 00172 * - As an when new molecules are read using "next" their 00173 * positions in the file are noted. 00174 * - A call to the "length" will autamatically parse the entire 00175 * file and cache all the mol block positions 00176 * - [] operator is used to access a molecule at "idx", calling 00177 * next following this will result in the next molecule after 00178 * "idx" 00179 ***************************************************************************/ 00180 public: 00181 00182 /*! 00183 * \param fileName - the name of smiles table file 00184 * \param delimiter - delimiting characters between records on a each 00185 * line NOTE that this is not a string, the tokenizer looks for 00186 * the individual characters in delimiter, not the full string 00187 * itself. So the default delimiter: " \t", means " " or "\t". 00188 * \param smilesColumn - column number for the SMILES string (defaults 00189 * to the first column) 00190 * \param nameColumn - column number for the molecule name (defaults to 00191 * the second column) If set to -1 we assume that no name is 00192 * available for the molecule and the name is defaulted to the 00193 * smiles string 00194 * \param titleLine - if true, the first line is assumed to list the 00195 * names of properties in order seperated by 'delimiter'. It is 00196 * also assume that the 'SMILES' column and the 'name' column 00197 * are not specified here if false - no title line is assumed 00198 * and the properties are recorded as the "columnX" where "X" is 00199 * the cloumn number 00200 * \param sanitize - if true sanitize the molecule before returning it 00201 */ 00202 explicit SmilesMolSupplier(const std::string &fileName, 00203 const std::string &delimiter=" \t", 00204 int smilesColumn=0, 00205 int nameColumn=1, 00206 bool titleLine=true, 00207 bool sanitize=true); 00208 SmilesMolSupplier(); 00209 explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, 00210 const std::string &delimiter=" \t", 00211 int smilesColumn=0, 00212 int nameColumn=1, 00213 bool titleLine=true, 00214 bool sanitize=true); 00215 00216 ~SmilesMolSupplier(); 00217 void setData(const std::string &text, 00218 const std::string &delimiter=" ", 00219 int smilesColumn=0, 00220 int nameColumn=1, 00221 bool titleLine=true, 00222 bool sanitize=true); 00223 void init(); 00224 void reset(); 00225 ROMol *next(); 00226 bool atEnd(); 00227 void moveTo(unsigned int idx); 00228 ROMol * operator[](unsigned int idx); 00229 /*! \brief returns the text block for a particular item 00230 * 00231 * \param idx - which item to return 00232 */ 00233 std::string getItemText(unsigned int idx); 00234 unsigned int length(); 00235 00236 private: 00237 ROMol *processLine(std::string inLine); 00238 void processTitleLine(); 00239 std::string nextLine(); 00240 int skipComments(); 00241 void checkForEnd(); 00242 00243 bool df_end; // have we reached the end of the file? 00244 int d_len; // total number of smiles in the file 00245 int d_next; // the molecule we are ready to read 00246 int d_line; // line number we are currently on 00247 std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules 00248 std::vector<int> d_lineNums; 00249 std::string d_delim; // the delimiter string 00250 bool df_sanitize; // sanitize molecules before returning them? 00251 STR_VECT d_props; // vector of property names 00252 bool df_title; // do we have a title line? 00253 int d_smi; // column id for the smile string 00254 int d_name; // column id for the name 00255 }; 00256 00257 //! lazy file parser for TDT files 00258 class TDTMolSupplier : public MolSupplier { 00259 /************************************************************************** 00260 * Lazy file parser for TDT files, similar to the lazy SD 00261 * file parser above 00262 * - As an when new molecules are read using "next" their 00263 * positions in the file are noted. 00264 * - A call to the "length" will autamatically parse the entire 00265 * file and cache all the mol block positions 00266 * - [] operator is used to access a molecule at "idx", calling 00267 * next following this will result in the next molecule after 00268 * "idx" 00269 ***************************************************************************/ 00270 public: 00271 00272 /*! 00273 * \param fileName - the name of the TDT file 00274 * \param nameRecord - property name for the molecule name. 00275 * If empty (the default), the name defaults to be empty 00276 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D 00277 * structure (depiction) in the input will be read into the 00278 * corresponding conformer id. 00279 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D 00280 * structure (depiction) in the input will be read into the 00281 * corresponding conformer id. 00282 * \param sanitize - if true sanitize the molecule before returning it 00283 */ 00284 explicit TDTMolSupplier(const std::string &fileName, 00285 const std::string &nameRecord="", 00286 int confId2D=-1,int confId3D=0, 00287 bool sanitize=true); 00288 explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, 00289 const std::string &nameRecord="", 00290 int confId2D=-1,int confId3D=0, 00291 bool sanitize=true); 00292 TDTMolSupplier(); 00293 ~TDTMolSupplier(); 00294 void setData(const std::string &text, 00295 const std::string &nameRecord="", 00296 int confId2D=-1,int confId3D=0, 00297 bool sanitize=true); 00298 void init(); 00299 void reset(); 00300 ROMol *next(); 00301 bool atEnd(); 00302 void moveTo(unsigned int idx); 00303 ROMol * operator[](unsigned int idx); 00304 /*! \brief returns the text block for a particular item 00305 * 00306 * \param idx - which item to return 00307 */ 00308 std::string getItemText(unsigned int idx); 00309 unsigned int length(); 00310 00311 private: 00312 bool advanceToNextRecord(); 00313 void checkForEnd(); 00314 ROMol *parseMol(std::string inLine); 00315 00316 bool df_end; // have we reached the end of the file? 00317 int d_len; // total number of mols in the file 00318 int d_next; // the molecule we are ready to read 00319 int d_last; // the molecule we are ready to read 00320 int d_line; // line number we are currently on 00321 int d_confId2D; // id to use for 2D conformers 00322 int d_confId3D; // id to use for 3D conformers 00323 std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules 00324 bool df_sanitize; // sanitize molecules before returning them? 00325 std::string d_nameProp; // local storage for the property providing mol names 00326 }; 00327 00328 } 00329 00330 #endif
1.7.1