RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
FileParsers.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2024 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FILEPARSERS_H
12#define RD_FILEPARSERS_H
13
14#include <RDGeneral/types.h>
15#include <GraphMol/RDKitBase.h>
17#include "CDXMLParser.h"
18#include <string>
19#include <string_view>
20#include <vector>
21#include <exception>
22
23#include <boost/shared_ptr.hpp>
24
25namespace RDKit {
26
27RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
28
29namespace v2 {
30namespace FileParsers {
32 : public std::exception {
33 public:
34 //! construct with an error message
35 explicit MolFileUnhandledFeatureException(const char *msg) : _msg(msg) {}
36 //! construct with an error message
37 explicit MolFileUnhandledFeatureException(const std::string msg)
38 : _msg(msg) {}
39 //! get the error message
40 const char *what() const noexcept override { return _msg.c_str(); }
41 ~MolFileUnhandledFeatureException() noexcept override = default;
42
43 private:
44 std::string _msg;
45};
46
48 bool sanitize = true; /**< sanitize the molecule after building it */
49 bool removeHs = true; /**< remove Hs after constructing the molecule */
50 bool strictParsing = true; /**< if set to false, the parser is more lax about
51 correctness of the contents. */
53 false; /**< toggle conversion of attachment points into dummy atoms */
54 bool parsingSCSRMol = false; /**< if true, we are parsing a SCSR mol file */
55};
57 AsEntered, //!< use the name of the template as entered in the SCSR Mol
58 UseFirstName, //!< use the first name in the template
59 //!< def (for AA, the 3 letter code)
60 UseSecondName //!< use the second name in the template def
61 //!< (for AA, the 1 letter code)
62};
63
65 Ignore, //!< do not include base Hbonds in expanded output
66 UseSapAll, //!< use all hbonds defined in SAPs;
67 //!< can be more than one per base
68 UseSapOne, //!< use only one SAP hbond per base.
69 //!< If multiple SAPs are defined, use the first
70 //!< even if it is not the best
71 //!< (this just maintains the relationship between
72 //!< the two base pairs)
73 Auto //!< for bases that are C,G,A,T,U,In (and
74 //!< derivatives) use the standard Watson-Crick
75 //!< Hbonding. No SAPs need to be defined, and if
76 //!< defined, they are ignored.
77};
78
81 true; /**< when true, leaving groups on atoms that are not exo-bonded are
82 retained. When false, no leaving groups are retained */
84
86};
88 std::istream &inStream, unsigned int &line,
91 const std::string &molBlock,
94 const std::string &fName,
96
//! \brief construct a molecule from SCSR data in a stream
/*!
 * \param inStream            - stream containing the data
 * \param line                - line counter, passed by reference
 *                              (presumably the current line number used for
 *                              error reporting, as in the mol-file readers -
 *                              TODO confirm)
 * \param molFileParserParams - mol-file parser options; default-constructed
 *                              when omitted
 * \param molFromSCSRParams   - SCSR-specific options; default-constructed
 *                              when omitted
 * \return the molecule, owned by the returned std::unique_ptr
 */
97RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RDKit::RWMol> MolFromSCSRDataStream(
98 std::istream &inStream, unsigned int &line,
99 const MolFileParserParams &molFileParserParams = MolFileParserParams(),
100 const MolFromSCSRParams &molFromSCSRParams = MolFromSCSRParams());
//! \brief construct a molecule from an SCSR mol block
/*!
 * \param molBlock            - string containing the block
 * \param molFileParserParams - mol-file parser options; default-constructed
 *                              when omitted
 * \param molFromSCSRParams   - SCSR-specific options; default-constructed
 *                              when omitted
 * \return the molecule, owned by the returned std::unique_ptr
 */
101RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RDKit::RWMol> MolFromSCSRBlock(
102 const std::string &molBlock,
103 const MolFileParserParams &molFileParserParams = MolFileParserParams(),
104 const MolFromSCSRParams &molFromSCSRParams = MolFromSCSRParams());
//! \brief construct a molecule from an SCSR mol file
/*!
 * \param fName               - string containing the file name
 * \param molFileParserParams - mol-file parser options; default-constructed
 *                              when omitted
 * \param molFromSCSRParams   - SCSR-specific options; default-constructed
 *                              when omitted
 * \return the molecule, owned by the returned std::unique_ptr
 */
105RDKIT_FILEPARSERS_EXPORT std::unique_ptr<RDKit::RWMol> MolFromSCSRFile(
106 const std::string &fName,
107 const MolFileParserParams &molFileParserParams = MolFileParserParams(),
108 const MolFromSCSRParams &molFromSCSRParams = MolFromSCSRParams());
109
110} // namespace FileParsers
111} // namespace v2
112
113inline namespace v1 {
115//-----
116// mol files
117//-----
118// \brief construct a molecule from MDL mol data in a stream
119/*!
120 * \param inStream - stream containing the data
121 * \param line - current line number (used for error reporting)
122 * \param sanitize - toggles sanitization and stereochemistry
123 * perception of the molecule
124 * \param removeHs - toggles removal of Hs from the molecule. H removal
125 * is only done if the molecule is sanitized
126 * \param line - current line number (used for error reporting)
127 * \param strictParsing - if set to false, the parser is more lax about
128 * correctness of the contents.
129 *
130 */
131inline RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line,
132 bool sanitize = true, bool removeHs = true,
133 bool strictParsing = true) {
135 ps.sanitize = sanitize;
136 ps.removeHs = removeHs;
137 ps.strictParsing = strictParsing;
138 return v2::FileParsers::MolFromMolDataStream(*inStream, line, ps).release();
139};
140// \overload
141inline RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line,
142 bool sanitize = true, bool removeHs = true,
143 bool strictParsing = true) {
144 return MolDataStreamToMol(&inStream, line, sanitize, removeHs, strictParsing);
145};
146// \brief construct a molecule from an MDL mol block
147/*!
148 * \param molBlock - string containing the mol block
149 * \param sanitize - toggles sanitization and stereochemistry
150 * perception of the molecule
151 * \param removeHs - toggles removal of Hs from the molecule. H removal
152 * is only done if the molecule is sanitized
153 * \param strictParsing - if set to false, the parser is more lax about
154 * correctness of the contents.
155 */
156inline RWMol *MolBlockToMol(const std::string &molBlock, bool sanitize = true,
157 bool removeHs = true, bool strictParsing = true) {
159 ps.sanitize = sanitize;
160 ps.removeHs = removeHs;
161 ps.strictParsing = strictParsing;
162 return v2::FileParsers::MolFromMolBlock(molBlock, ps).release();
163};
164
165// \brief construct a molecule from an MDL mol file
166/*!
167 * \param fName - string containing the file name
168 * \param sanitize - toggles sanitization and stereochemistry
169 * perception of the molecule
170 * \param removeHs - toggles removal of Hs from the molecule. H removal
171 * is only done if the molecule is sanitized
172 * \param strictParsing - if set to false, the parser is more lax about
173 * correctness of the contents.
174 */
175inline RWMol *MolFileToMol(const std::string &fName, bool sanitize = true,
176 bool removeHs = true, bool strictParsing = true) {
178 ps.sanitize = sanitize;
179 ps.removeHs = removeHs;
180 ps.strictParsing = strictParsing;
181 return v2::FileParsers::MolFromMolFile(fName, ps).release();
182};
183} // namespace v1
184
185//-----
186// TPL handling:
187//-----
188
189namespace v2 {
190namespace FileParsers {
192 bool sanitize = true; /**< sanitize the molecule after building it */
194 false; /**< if set to true, the first conformer will be skipped */
195};
197 std::istream &inStream, unsigned int &line,
198 const TPLParserParams &params = TPLParserParams());
200 const std::string &fName,
201 const TPLParserParams &params = TPLParserParams());
202
203} // namespace FileParsers
204} // namespace v2
205
206inline namespace v1 {
207//! \brief translate TPL data (BioCad format) into a multi-conf molecule
208/*!
209 \param inStream: the stream from which to read
210 \param line: used to track the line number of errors
211 \param sanitize: toggles sanitization and stereochemistry
212 perception of the molecule
213 \param skipFirstConf: according to the TPL format description, the atomic
214 coords in the atom-information block describe the first
215 conformation and the first conf block describes second
216 conformation. The CombiCode, on the other hand, writes
217 the first conformation data both to the atom-information
218 block and to the first conf block. We want to be able to
219 read CombiCode-style tpls, so we'll allow this
220 mis-feature
221 to be parsed when this flag is set.
222*/
223inline RWMol *TPLDataStreamToMol(std::istream *inStream, unsigned int &line,
224 bool sanitize = true,
225 bool skipFirstConf = false) {
227 ps.sanitize = sanitize;
228 ps.skipFirstConf = skipFirstConf;
229 return v2::FileParsers::MolFromTPLDataStream(*inStream, line, ps).release();
230}
231
232//! \brief construct a multi-conf molecule from a TPL (BioCad format) file
233/*!
234 \param fName: the name of the file from which to read
235 \param sanitize: toggles sanitization and stereochemistry
236 perception of the molecule
237 \param skipFirstConf: according to the TPL format description, the atomic
238 coords in the atom-information block describe the first
239 conformation and the first conf block describes second
240 conformation. The CombiCode, on the other hand, writes
241 the first conformation data both to the atom-information
242 block and to the first conf block. We want to be able to
243 read CombiCode-style tpls, so we'll allow this
244 mis-feature
245 to be parsed when this flag is set.
246*/
247inline RWMol *TPLFileToMol(const std::string &fName, bool sanitize = true,
248 bool skipFirstConf = false) {
250 ps.sanitize = sanitize;
251 ps.skipFirstConf = skipFirstConf;
252 return v2::FileParsers::MolFromTPLFile(fName, ps).release();
253}
254} // namespace v1
255
256namespace v2 {
257namespace FileParsers {
258
259//-----
260// MOL2 handling
261//-----
262
//! Selects which set of mol2 atom type definitions the parser uses.
//! Kept as an unscoped enum so both `CORINA` and `Mol2Type::CORINA` resolve.
enum Mol2Type {
  CORINA = 0  //!< supports output from Corina and some dbtranslate output
};
266
268 bool sanitize = true; /**< sanitize the molecule after building it */
269 bool removeHs = true; /**< remove Hs after constructing the molecule */
270 Mol2Type variant = Mol2Type::CORINA; /**< the atom type definitions to use */
272 true; /**< toggles recognition and cleanup of common substructures */
273};
274
276 std::istream &inStream,
277 const Mol2ParserParams &params = Mol2ParserParams());
279 const std::string &molBlock,
280 const Mol2ParserParams &params = Mol2ParserParams());
282 const std::string &fName,
283 const Mol2ParserParams &params = Mol2ParserParams());
284
285} // namespace FileParsers
286} // namespace v2
287
288inline namespace v1 {
290
291// \brief construct a molecule from a Tripos mol2 file
292/*!
293 *
294 * \param fName - string containing the file name
295 * \param sanitize - toggles sanitization of the molecule
296 * \param removeHs - toggles removal of Hs from the molecule. H removal
297 * is only done if the molecule is sanitized
298 * \param variant - the atom type definitions to use
299 * \param cleanupSubstructures - toggles recognition and cleanup of common
300 * substructures
301 */
302inline RWMol *Mol2FileToMol(const std::string &fName, bool sanitize = true,
303 bool removeHs = true,
304 Mol2Type variant = Mol2Type::CORINA,
305 bool cleanupSubstructures = true) {
307 ps.sanitize = sanitize;
308 ps.removeHs = removeHs;
309 ps.variant = variant;
310 ps.cleanupSubstructures = cleanupSubstructures;
311 return v2::FileParsers::MolFromMol2File(fName, ps).release();
312}
313
314// \brief construct a molecule from Tripos mol2 data in a stream
315/*!
316 * \param inStream - stream containing the data
317 * \param sanitize - toggles sanitization of the molecule
318 * \param removeHs - toggles removal of Hs from the molecule. H removal
319 * is only done if the molecule is sanitized
320 * \param variant - the atom type definitions to use
321 * \param cleanupSubstructures - toggles recognition and cleanup of common
322 * substructures
323 */
324inline RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize = true,
325 bool removeHs = true,
326 Mol2Type variant = Mol2Type::CORINA,
327 bool cleanupSubstructures = true) {
329 ps.sanitize = sanitize;
330 ps.removeHs = removeHs;
331 ps.variant = variant;
332 ps.cleanupSubstructures = cleanupSubstructures;
333 return v2::FileParsers::MolFromMol2DataStream(inStream, ps).release();
334}
335// \overload
336inline RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize = true,
337 bool removeHs = true,
338 Mol2Type variant = Mol2Type::CORINA,
339 bool cleanupSubstructures = true) {
340 return Mol2DataStreamToMol(*inStream, sanitize, removeHs, variant,
341 cleanupSubstructures);
342}
343
344// \brief construct a molecule from a Tripos mol2 block
345/*!
346 * \param molBlock - string containing the mol block
347 * \param sanitize - toggles sanitization of the molecule
348 * \param removeHs - toggles removal of Hs from the molecule. H removal
349 * is only done if the molecule is sanitized
350 * \param variant - the atom type definitions to use
351 * \param cleanupSubstructures - toggles recognition and cleanup of common
352 * substructures
353 */
354inline RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize = true,
355 bool removeHs = true,
356 Mol2Type variant = Mol2Type::CORINA,
357 bool cleanupSubstructures = true) {
359 ps.sanitize = sanitize;
360 ps.removeHs = removeHs;
361 ps.variant = variant;
362 ps.cleanupSubstructures = cleanupSubstructures;
363 return v2::FileParsers::MolFromMol2Block(molBlock, ps).release();
364}
365} // namespace v1
366
367namespace v2 {
368namespace FileParsers {
369
371 std::istream &inStream);
372// \brief construct a molecule from an xyz block
373/*!
374 * \param xyzBlock - string containing the xyz block
375 */
377 const std::string &xyzBlock);
378// \brief construct a molecule from an xyz file
379/*!
380 * \param fName - string containing the file name
381 */
383 const std::string &fName);
384} // namespace FileParsers
385} // namespace v2
386inline namespace v1 {
387inline RWMol *XYZDataStreamToMol(std::istream &inStream) {
388 return v2::FileParsers::MolFromXYZDataStream(inStream).release();
389}
390// \brief construct a molecule from an xyz block
391/*!
392 * \param xyzBlock - string containing the xyz block
393 */
394inline RWMol *XYZBlockToMol(const std::string &xyzBlock) {
395 return v2::FileParsers::MolFromXYZBlock(xyzBlock).release();
396}
397// \brief construct a molecule from an xyz file
398/*!
399 * \param fName - string containing the file name
400 */
401inline RWMol *XYZFileToMol(const std::string &fName) {
402 return v2::FileParsers::MolFromXYZFile(fName).release();
403}
404
405} // namespace v1
406
407namespace v2 {
408namespace FileParsers {
410 bool sanitize = true; /**< sanitize the molecule after building it */
411 bool removeHs = true; /**< remove Hs after constructing the molecule */
412 bool proximityBonding = true; /**< if set to true, proximity bonding will be
413 performed */
414 unsigned int flavor = 0; /**< flavor to use */
415};
416
418 std::istream &inStream, const PDBParserParams &params = PDBParserParams());
420 const std::string &fname,
421 const PDBParserParams &params = PDBParserParams());
423 const std::string &str, const PDBParserParams &params = PDBParserParams());
424} // namespace FileParsers
425} // namespace v2
426
427inline namespace v1 {
429inline RWMol *PDBBlockToMol(const std::string &str, bool sanitize = true,
430 bool removeHs = true, unsigned int flavor = 0,
431 bool proximityBonding = true) {
433 ps.sanitize = sanitize;
434 ps.removeHs = removeHs;
435 ps.flavor = flavor;
436 ps.proximityBonding = proximityBonding;
437 return v2::FileParsers::MolFromPDBBlock(str, ps).release();
438}
439inline RWMol *PDBBlockToMol(const char *str, bool sanitize = true,
440 bool removeHs = true, unsigned int flavor = 0,
441 bool proximityBonding = true) {
442 return PDBBlockToMol(std::string(str), sanitize, removeHs, flavor,
443 proximityBonding);
444}
445inline RWMol *PDBFileToMol(const std::string &fname, bool sanitize = true,
446 bool removeHs = true, unsigned int flavor = 0,
447 bool proximityBonding = true) {
449 ps.sanitize = sanitize;
450 ps.removeHs = removeHs;
451 ps.flavor = flavor;
452 ps.proximityBonding = proximityBonding;
453 return v2::FileParsers::MolFromPDBFile(fname, ps).release();
454}
455inline RWMol *PDBDataStreamToMol(std::istream &inStream, bool sanitize = true,
456 bool removeHs = true, unsigned int flavor = 0,
457 bool proximityBonding = true) {
459 ps.sanitize = sanitize;
460 ps.removeHs = removeHs;
461 ps.flavor = flavor;
462 ps.proximityBonding = proximityBonding;
463 return v2::FileParsers::MolFromPDBDataStream(inStream, ps).release();
464}
465inline RWMol *PDBDataStreamToMol(std::istream *inStream, bool sanitize = true,
466 bool removeHs = true, unsigned int flavor = 0,
467 bool proximityBonding = true) {
468 return PDBDataStreamToMol(*inStream, sanitize, removeHs, flavor,
469 proximityBonding);
470}
471} // namespace v1
472
473// \brief reads a molecule from the metadata in an RDKit-generated SVG file
474/*!
475 * \param svg - string containing the SVG
476 * \param sanitize - toggles sanitization of the molecule
477 * \param removeHs - toggles removal of Hs from the molecule. H removal
478 * is only done if the molecule is sanitized
479 *
480 * **NOTE** This functionality should be considered beta.
481 */
483 bool sanitize = true,
484 bool removeHs = true);
485/*! \overload
486 */
488 bool sanitize = true,
489 bool removeHs = true);
490
491inline std::unique_ptr<RDKit::RWMol> operator""_ctab(const char *text,
492 size_t len) {
493 std::string data(text, len);
494 try {
496 } catch (const RDKit::MolSanitizeException &) {
497 return nullptr;
498 }
499}
500inline std::unique_ptr<RDKit::RWMol> operator""_mol2(const char *text,
501 size_t len) {
502 std::string data(text, len);
503 try {
505 } catch (const RDKit::MolSanitizeException &) {
506 return nullptr;
507 }
508}
509
510inline std::unique_ptr<RDKit::RWMol> operator""_pdb(const char *text,
511 size_t len) {
512 std::string data(text, len);
513 try {
515 } catch (const RDKit::MolSanitizeException &) {
516 return nullptr;
517 }
518}
519
520} // namespace RDKit
521
522#endif
pulls in the core RDKit functionality
class for flagging sanitization errors
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
MolFileUnhandledFeatureException(const char *msg)
construct with an error message
Definition FileParsers.h:35
const char * what() const noexcept override
get the error message
Definition FileParsers.h:40
MolFileUnhandledFeatureException(const std::string msg)
construct with an error message
Definition FileParsers.h:37
~MolFileUnhandledFeatureException() noexcept override=default
MolFileUnhandledFeatureException(const char *msg)
construct with an error message
Definition FileParsers.h:35
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:177
RWMol * Mol2BlockToMol(const std::string &molBlock, bool sanitize=true, bool removeHs=true, Mol2Type variant=Mol2Type::CORINA, bool cleanupSubstructures=true)
RWMol * XYZFileToMol(const std::string &fName)
RWMol * Mol2FileToMol(const std::string &fName, bool sanitize=true, bool removeHs=true, Mol2Type variant=Mol2Type::CORINA, bool cleanupSubstructures=true)
RWMol * MolFileToMol(const std::string &fName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RWMol * Mol2DataStreamToMol(std::istream &inStream, bool sanitize=true, bool removeHs=true, Mol2Type variant=Mol2Type::CORINA, bool cleanupSubstructures=true)
RWMol * MolBlockToMol(const std::string &molBlock, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RWMol * PDBDataStreamToMol(std::istream &inStream, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RWMol * MolDataStreamToMol(std::istream *inStream, unsigned int &line, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RWMol * TPLFileToMol(const std::string &fName, bool sanitize=true, bool skipFirstConf=false)
construct a multi-conf molecule from a TPL (BioCad format) file
RWMol * PDBFileToMol(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RWMol * XYZDataStreamToMol(std::istream &inStream)
RWMol * TPLDataStreamToMol(std::istream *inStream, unsigned int &line, bool sanitize=true, bool skipFirstConf=false)
translate TPL data (BioCad format) into a multi-conf molecule
RWMol * XYZBlockToMol(const std::string &xyzBlock)
RWMol * PDBBlockToMol(const std::string &str, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromTPLFile(const std::string &fName, const TPLParserParams &params=TPLParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromPDBFile(const std::string &fname, const PDBParserParams &params=PDBParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromPDBDataStream(std::istream &inStream, const PDBParserParams &params=PDBParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RDKit::RWMol > MolFromSCSRDataStream(std::istream &inStream, unsigned int &line, const MolFileParserParams &molFileParserParams=MolFileParserParams(), const MolFromSCSRParams &molFromSCSRParams=MolFromSCSRParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromXYZFile(const std::string &fName)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromXYZBlock(const std::string &xyzBlock)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMolFile(const std::string &fName, const MolFileParserParams &params=MolFileParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMol2DataStream(std::istream &inStream, const Mol2ParserParams &params=Mol2ParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromPDBBlock(const std::string &str, const PDBParserParams &params=PDBParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMol2File(const std::string &fName, const Mol2ParserParams &params=Mol2ParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromXYZDataStream(std::istream &inStream)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RDKit::RWMol > MolFromSCSRFile(const std::string &fName, const MolFileParserParams &molFileParserParams=MolFileParserParams(), const MolFromSCSRParams &molFromSCSRParams=MolFromSCSRParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RDKit::RWMol > MolFromSCSRBlock(const std::string &molBlock, const MolFileParserParams &molFileParserParams=MolFileParserParams(), const MolFromSCSRParams &molFromSCSRParams=MolFromSCSRParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMolBlock(const std::string &molBlock, const MolFileParserParams &params=MolFileParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMol2Block(const std::string &molBlock, const Mol2ParserParams &params=Mol2ParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMolDataStream(std::istream &inStream, unsigned int &line, const MolFileParserParams &params=MolFileParserParams())
@ CORINA
supports output from Corina and some dbtranslate output
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromTPLDataStream(std::istream &inStream, unsigned int &line, const TPLParserParams &params=TPLParserParams())
Std stuff.
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
RDKIT_FILEPARSERS_EXPORT RWMol * RDKitSVGToMol(const std::string &svg, bool sanitize=true, bool removeHs=true)