RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolSupplier.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2024 greg landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_MOLSUPPLIER_H
12#define RD_MOLSUPPLIER_H
13
14#include <RDGeneral/types.h>
15
16#include <string>
17#include <string_view>
18#include <list>
19#include <memory>
20#include <vector>
21#include <fstream>
22#include <GraphMol/ROMol.h>
24#include "FileParsers.h"
26
27#ifdef RDK_BUILD_MAEPARSER_SUPPORT
28namespace schrodinger {
29namespace mae {
30class Reader;
31class Block;
32} // namespace mae
33} // namespace schrodinger
34#endif // RDK_BUILD_MAEPARSER_SUPPORT
35
36namespace RDKit {
37RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
38
39namespace v2 {
40namespace FileParsers {
41/*!
42//
43// Here are a couple of ways one can interact with MolSuppliers:
44//
45// 1) Lazy (ForwardIterator):
46// while(!supplier.atEnd()){
47// ROMol *mol = supplier.next();
48// if(mol){
49// do something;
50// }
51// }
52// 2) Random Access:
53// for(int i=0;i<supplier.length();i++){
54// ROMol *mol = supplier[i];
55// if(mol){
56// do something;
57// }
58// }
59//
60//
61*/
63 // this is an abstract base class to supply molecules one at a time
64 public:
66 virtual ~MolSupplier() {}
67 virtual void init() = 0;
68 virtual void reset() = 0;
69 virtual bool atEnd() = 0;
70 virtual std::unique_ptr<RWMol> next() = 0;
71
72 virtual void close() {
73 if (df_owner) {
74 delete dp_inStream;
75 df_owner = false;
76 }
77 dp_inStream = nullptr;
78 }
79
80 private:
81 // disable automatic copy constructors and assignment operators
82 // for this class and its subclasses. They will likely be
83 // carrying around stream pointers and copying those is a recipe
84 // for disaster.
85 MolSupplier(const MolSupplier &);
86 MolSupplier &operator=(const MolSupplier &);
87
88 protected:
89 // stream to read the molecules from:
90 std::istream *dp_inStream = nullptr;
91 // do we own dp_inStream?
92 bool df_owner = false;
93 // opens a stream for reading and verifies that it can be read from.
94 // if not it throws an exception
95 // the caller owns the resulting stream
96 std::istream *openAndCheckStream(const std::string &filename) {
97 // FIX: this binary mode of opening file is here because of a bug in
98 // VC++ 6.0
99 // the function "tellg" does not work correctly if we do not open it this
100 // way
101 // Jan 2009: Confirmed that this is still the case in visual studio 2008
102 std::ifstream *strm =
103 new std::ifstream(filename.c_str(), std::ios_base::binary);
104 if ((!(*strm)) || strm->bad()) {
105 std::ostringstream errout;
106 errout << "Bad input file " << filename;
107 delete strm;
108 throw BadFileException(errout.str());
109 }
110 strm->peek();
111 if (strm->bad() || strm->eof()) {
112 std::ostringstream errout;
113 errout << "Invalid input file " << filename;
114 delete strm;
115 throw BadFileException(errout.str());
116 }
117 return static_cast<std::istream *>(strm);
118 }
119};
120
121// \brief a supplier from an SD file that only reads forward:
123 /*************************************************************************
124 * A lazy mol supplier from a SD file.
125 * - When new molecules are read using "next" their positions in the file are
126 *noted.
127 ***********************************************************************************/
128 public:
130
132 std::istream *inStream, bool takeOwnership = true,
133 const MolFileParserParams &params = MolFileParserParams());
134
135 ~ForwardSDMolSupplier() override { close(); }
136
137 void init() override;
138 void reset() override;
139 std::unique_ptr<RWMol> next() override;
140 bool atEnd() override;
141
144
145 bool getEOFHitOnRead() const { return df_eofHitOnRead; }
146
147 protected:
148 virtual void checkForEnd();
149 std::unique_ptr<RWMol> _next();
150 virtual void readMolProps(ROMol &);
151 bool df_end = false;
152 int d_line = 0; // line number we are currently on
155 bool df_eofHitOnRead = false;
156};
157// \brief a lazy supplier from an SD file
159 /*************************************************************************
160 * A lazy mol supplier from a SD file.
161 * - When new molecules are read using "next" their positions in the file are
162 *noted.
163 * - A call to the "length" will automatically parse the entire file and
164 *cache all the mol
165 * block positions
166 * - [] operator is used to access a molecule at "idx", calling next
167 *following this will result
168 * in the next molecule after "idx"
169 ***********************************************************************************/
170
171 public:
173
174 /*!
175 * \param fileName - the name of the SD file
176 * \param sanitize - if true sanitize the molecule before returning it
177 * \param removeHs - if true remove Hs from the molecule before returning it
178 * (triggers sanitization)
179 * \param strictParsing - if set to false, the parser is more lax about
180 * correctness
181 * of the contents.
182 */
184 const std::string &fileName,
185 const MolFileParserParams &params = MolFileParserParams());
186
188 std::istream *inStream, bool takeOwnership = true,
189 const MolFileParserParams &params = MolFileParserParams());
190
191 ~SDMolSupplier() override { close(); }
192 void init() override;
193 void reset() override;
194 std::unique_ptr<RWMol> next() override;
195 bool atEnd() override;
196 void moveTo(unsigned int idx);
197 std::unique_ptr<RWMol> operator[](unsigned int idx);
198 /*! \brief returns the text block for a particular item
199 *
200 * \param idx - which item to return
201 */
202 std::string getItemText(unsigned int idx);
203 unsigned int length();
204 void setData(const std::string &text);
205 void setData(const std::string &text, const MolFileParserParams &params);
206
207 /*! Resets our internal state and sets the indices of molecules in the stream.
208 * The client should be *very* careful about calling this method, as it's
209 *trivial
210 * to end up with a completely useless supplier.
211 *
212 * \param locs - the vector of stream positions.
213 *
214 * Note that this can be used not only to make reading selected molecules
215 *from a
216 * large SD file much faster, but it can also allow subsetting an SD file or
217 * rearranging the order of the molecules.
218 */
219 void setStreamIndices(const std::vector<std::streampos> &locs);
220
221 private:
222 void checkForEnd() override;
223 void peekCheckForEnd(char* bufPtr, char* bufEnd, std::streampos molStartPos);
224 void buildIndexTo(unsigned int targetIdx);
225 int d_len = 0; // total number of mol blocks in the file (initialized to -1)
226 int d_last = 0; // the molecule we are ready to read
227 std::vector<std::streampos> d_molpos;
228};
229
231 std::string delimiter = " \t";
233 int nameColumn = 1;
234 bool titleLine = true;
236 true, // sanitize
237 false, // allowCXSMILES
238 true, // strictCXSMILES
239 false, // parseName
240 true, // removeHs
241 false, // skipCleanup
242 false, // debugParse
243 {} // replacements
244 };
245};
246
247//! lazy file parser for Smiles tables
249 /**************************************************************************
250 * Lazy file parser for Smiles table file, similar to the lazy SD
251 * file parser above
252 * - As and when new molecules are read using "next" their
253 * positions in the file are noted.
254 * - A call to the "length" will automatically parse the entire
255 * file and cache all the mol block positions
256 * - [] operator is used to access a molecule at "idx", calling
257 * next following this will result in the next molecule after
258 * "idx"
259 ***************************************************************************/
260 public:
261 /*!
262 * \param fileName - the name of smiles table file
263 * \param delimiter - delimiting characters between records on each
264 * line. NOTE that this is not a string, the tokenizer looks for
265 * the individual characters in delimiter, not the full string
266 * itself. So the default delimiter: " \t", means " " or "\t".
267 * \param smilesColumn - column number for the SMILES string (defaults
268 * to the first column)
269 * \param nameColumn - column number for the molecule name (defaults to
270 * the second column) If set to -1 we assume that no name is
271 * available for the molecule and the name is defaulted to the
272 * smiles string
273 * \param titleLine - if true, the first line is assumed to list the
274 * names of properties in order separated by 'delimiter'. It is
275 * also assumed that the 'SMILES' column and the 'name' column
276 * are not specified here. If false, no title line is assumed
277 * and the properties are recorded as the "columnX" where "X" is
278 * the column number
279 * \param sanitize - if true sanitize the molecule before returning it
280 */
282 const std::string &fileName,
286 std::istream *inStream, bool takeOwnership = true,
288
289 ~SmilesMolSupplier() override { close(); }
290 void setData(const std::string &text, const SmilesMolSupplierParams &params =
292 void init() override;
293 void reset() override;
294 std::unique_ptr<RWMol> next() override;
295 bool atEnd() override;
296 void moveTo(unsigned int idx);
297 std::unique_ptr<RWMol> operator[](unsigned int idx);
298 /*! \brief returns the text block for a particular item
299 *
300 * \param idx - which item to return
301 */
302 std::string getItemText(unsigned int idx);
303 unsigned int length();
304
305 private:
306 std::unique_ptr<RWMol> processLine(std::string inLine);
307 void processTitleLine();
308 std::string nextLine();
309 long int skipComments();
310 void checkForEnd();
311
312 bool df_end = false; // have we reached the end of the file?
313 long d_len = 0; // total number of smiles in the file
314 long d_next = 0; // the molecule we are ready to read
315 size_t d_line = 0; // line number we are currently on
317 std::vector<std::streampos>
318 d_molpos; // vector of positions in the file for molecules
319 std::vector<int> d_lineNums;
320 STR_VECT d_props; // vector of property names
321};
322
324 std::string nameRecord = "";
325 int confId2D = -1;
326 int confId3D = -1;
328 true, // sanitize
329 false, // allowCXSMILES
330 true, // strictCXSMILES
331 false, // parseName
332 true, // removeHs
333 false, // skipCleanup
334 false, // debugParse
335 {} // replacements
336 };
337};
338
339//! lazy file parser for TDT files
341 /**************************************************************************
342 * Lazy file parser for TDT files, similar to the lazy SD
343 * file parser above
344 * - As and when new molecules are read using "next" their
345 * positions in the file are noted.
346 * - A call to the "length" will automatically parse the entire
347 * file and cache all the mol block positions
348 * - [] operator is used to access a molecule at "idx", calling
349 * next following this will result in the next molecule after
350 * "idx"
351 ***************************************************************************/
352 public:
353 /*!
354 * \param fileName - the name of the TDT file
355 * \param nameRecord - property name for the molecule name.
356 * If empty (the default), the name defaults to be empty
357 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
358 * structure (depiction) in the input will be read into the
359 * corresponding conformer id.
360 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
361 * structure (depiction) in the input will be read into the
362 * corresponding conformer id.
363 * \param sanitize - if true sanitize the molecule before returning it
364 */
366 const std::string &fileName,
369 std::istream *inStream, bool takeOwnership = true,
372 ~TDTMolSupplier() override { close(); }
373 void setData(const std::string &text,
375 void init() override;
376 void reset() override;
377 std::unique_ptr<RWMol> next() override;
378 bool atEnd() override;
379 void moveTo(unsigned int idx);
380 std::unique_ptr<RWMol> operator[](unsigned int idx);
381 /*! \brief returns the text block for a particular item
382 *
383 * \param idx - which item to return
384 */
385 std::string getItemText(unsigned int idx);
386 unsigned int length();
387
388 private:
389 bool advanceToNextRecord();
390 void checkForEnd();
391 std::unique_ptr<RWMol> parseMol(std::string inLine);
392
393 bool df_end = false; // have we reached the end of the file?
394 int d_len = 0; // total number of mols in the file
395 int d_last = 0; // the molecule we are ready to read
396 int d_line = 0; // line number we are currently on
397 std::vector<std::streampos>
398 d_molpos; // vector of positions in the file for molecules
399 TDTMolSupplierParams d_params;
400};
401
402#ifdef RDK_BUILD_MAEPARSER_SUPPORT
//! Options accepted by MaeMolSupplier; both flags are enabled by default.
struct MaeMolSupplierParams {
  // NOTE(review): presumably "sanitize the molecule before returning it",
  // as documented for the other suppliers in this file - confirm against
  // the implementation
  bool sanitize{true};
  // NOTE(review): presumably "remove Hs from the molecule before returning
  // it", as documented for the other suppliers in this file - confirm
  bool removeHs{true};
};
407//! lazy file parser for MAE files
408class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
409 /**
410 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
411 * always requires taking ownership of the istream ptr, as the shared ptr will
412 * always clear it upon destruction.
413 */
414
415 public:
416 MaeMolSupplier() {}
417
418 explicit MaeMolSupplier(
419 std::shared_ptr<std::istream> inStream,
420 const MaeMolSupplierParams &params = MaeMolSupplierParams());
421
422 explicit MaeMolSupplier(
423 std::istream *inStream, bool takeOwnership = true,
424 const MaeMolSupplierParams &params = MaeMolSupplierParams());
425
426 explicit MaeMolSupplier(
427 const std::string &fname,
428 const MaeMolSupplierParams &params = MaeMolSupplierParams());
429
430 ~MaeMolSupplier() override {}
431
432 void init() override;
433 void reset() override;
434 std::unique_ptr<RWMol> next() override;
435 bool atEnd() override;
436 void moveTo(unsigned int idx);
437 std::unique_ptr<RWMol> operator[](unsigned int idx);
438 unsigned int length();
439
440 void close() override { dp_sInStream.reset(); }
441
442 void setData(const std::string &text,
443 const MaeMolSupplierParams &params = MaeMolSupplierParams());
444
445 private:
446 void moveToNextBlock();
447
448 protected:
449 MaeMolSupplierParams d_params;
450 std::shared_ptr<schrodinger::mae::Reader> d_reader;
451 std::shared_ptr<schrodinger::mae::Block> d_next_struct;
452 std::shared_ptr<std::istream> dp_sInStream;
453 std::string d_stored_exc;
454 unsigned d_position;
455 unsigned d_length;
456};
457#endif // RDK_BUILD_MAEPARSER_SUPPORT
458
459} // namespace FileParsers
460} // namespace v2
461} // namespace RDKit
462
463#include "MolSupplier.v1API.h"
464
465#endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, const MolFileParserParams &params=MolFileParserParams())
std::unique_ptr< RWMol > next() override
std::istream * openAndCheckStream(const std::string &filename)
Definition MolSupplier.h:96
virtual std::unique_ptr< RWMol > next()=0
void setStreamIndices(const std::vector< std::streampos > &locs)
void setData(const std::string &text)
std::unique_ptr< RWMol > next() override
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, const MolFileParserParams &params=MolFileParserParams())
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const MolFileParserParams &params)
SDMolSupplier(const std::string &fileName, const MolFileParserParams &params=MolFileParserParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
std::unique_ptr< RWMol > next() override
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
SmilesMolSupplier(const std::string &fileName, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
void setData(const std::string &text, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
void setData(const std::string &text, const TDTMolSupplierParams &params=TDTMolSupplierParams())
TDTMolSupplier(const std::string &fileName, const TDTMolSupplierParams &params=TDTMolSupplierParams())
std::string getItemText(unsigned int idx)
returns the text block for a particular item
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const TDTMolSupplierParams &params=TDTMolSupplierParams())
std::unique_ptr< RWMol > next() override
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:177
Std stuff.
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition Dict.h:29
v2::SmilesParse::SmilesParserParams parseParameters
v2::SmilesParse::SmilesParserParams parseParameters