RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
FileParserUtils.h
Go to the documentation of this file.
1//
2// Copyright (C) 2010-2025 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FILEPARSERUTILS_H
12#define RD_FILEPARSERUTILS_H
13
14#include <string>
15#include <iostream>
17#include <boost/lexical_cast.hpp>
18#include <boost/algorithm/string.hpp>
19#include <boost/format.hpp>
21#include "FileParsers.h"
22#include <string_view>
23
24namespace RDKit {
25class RWMol;
26class Conformer;
27
28namespace FileParserUtils {
29RDKIT_FILEPARSERS_EXPORT inline std::string_view strip(
30 std::string_view orig, std::string stripChars = " \t\r\n") {
31 std::string_view res = orig;
32 auto start = res.find_first_not_of(stripChars);
33 if (start != std::string_view::npos) {
34 auto end = res.find_last_not_of(stripChars) + 1;
35 res = res.substr(start, end - start);
36 } else {
37 res = "";
38 }
39 return res;
40}
41
42template <typename T>
43T stripSpacesAndCast(std::string_view input, bool acceptSpaces = false) {
44 auto trimmed = strip(input, " ");
45 if (acceptSpaces && trimmed.empty()) {
46 return 0;
47 } else {
48 return boost::lexical_cast<T>(trimmed);
49 }
50}
//! \overload
/*!
  The view handed to the std::string_view overload is built from
  \c input.c_str(), so it stops at the first NUL character in \c input.
*/
template <typename T>
T stripSpacesAndCast(const std::string &input, bool acceptSpaces = false) {
  const std::string_view cstrView(input.c_str());
  return stripSpacesAndCast<T>(cstrView, acceptSpaces);
}
55RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input,
56 bool acceptSpaces = true);
57RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input,
58 bool acceptSpaces = true);
59RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input,
60 bool acceptSpaces = true);
61RDKIT_FILEPARSERS_EXPORT int toInt(const std::string_view input,
62 bool acceptSpaces = true);
63RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(std::string_view input,
64 bool acceptSpaces = true);
65RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string_view input,
66 bool acceptSpaces = true);
67
68// gets a V3000 CTAB for a molecule
70 const ROMol &tmol, const boost::dynamic_bitset<> &wasAromatic,
71 int confId = -1, unsigned int precision = 6);
72//! \overload
73inline std::string getV3000CTAB(const ROMol &tmol, int confId = -1,
74 unsigned int precision = 6) {
75 boost::dynamic_bitset<> wasAromatic(tmol.getNumBonds());
76 return getV3000CTAB(tmol, wasAromatic, confId, precision);
77};
78// reads a line from an MDL v3K CTAB
79RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream,
80 unsigned int &line);
81
82// nAtoms and nBonds are ignored on input, set on output
84 std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
85 bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
86 bool strictParsing = true, bool expectMEND = true,
87 bool expectMacroAtoms = false);
88
89// nAtoms and nBonds are used
91 std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
92 bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
93 bool strictParsing = true);
94
95//! finishes up the processing (sanitization, etc.) of a molecule read from
96//! CTAB
98 RWMol *res, bool chiralityPossible,
100//! \overload
101inline void finishMolProcessing(RWMol *res, bool chiralityPossible,
102 bool sanitize, bool removeHs) {
104 ps.sanitize = sanitize;
105 ps.removeHs = removeHs;
106 finishMolProcessing(res, chiralityPossible, ps);
107}
108
109//! Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead
111
112//! applies a particular property to the atoms as an atom property list
113template <typename T, typename U>
114void applyMolListProp(ROMol &mol, const std::string &pn,
115 const std::string &prefix,
116 const std::string &missingValueMarker, size_t nItems,
117 U getter) {
118 std::string itempn = pn.substr(prefix.size());
119 std::string strVect = mol.getProp<std::string>(pn);
120 std::vector<std::string> tokens;
121 boost::split(tokens, strVect, boost::is_any_of(" \t\n"),
122 boost::token_compress_on);
123 std::string mv = missingValueMarker;
124 size_t first_token = 0;
125 if (tokens.size() == nItems + 1 && tokens[0].front() == '[' &&
126 tokens[0].back() == ']') {
127 mv = std::string(tokens[0].begin() + 1, tokens[0].end() - 1);
128 first_token = 1;
129 }
130 if (mv.empty()) {
131 BOOST_LOG(rdWarningLog) << "Missing value marker for property " << pn
132 << " is empty." << std::endl;
133 }
134 if(tokens.size() - first_token != nItems) {
135 BOOST_LOG(rdWarningLog) << "Property list " << pn << " has incompatible size, "
136 << tokens.size() << " elements found; expecting "
137 << nItems << ". Ignoring it." << std::endl;
138 return;
139 }
140 for (size_t i = first_token; i < tokens.size(); ++i) {
141 if (tokens[i] != mv) {
142 unsigned int itemid = i - first_token;
143 try {
144 T apv = boost::lexical_cast<T>(tokens[i]);
145 getter(itemid)->setProp(itempn, apv);
146 } catch (const boost::bad_lexical_cast &) {
148 << "Value " << tokens[i] << " for property " << pn << " of item "
149 << itemid << " can not be parsed. Ignoring it." << std::endl;
150 }
151 }
152 }
153}
154
155//! applies a particular property to the atoms as an atom property list
156template <typename T>
157[[deprecated("use applyMolListProp instead")]]
158void applyMolListPropToAtoms(ROMol &mol, const std::string &pn,
159 const std::string &prefix,
160 const std::string &missingValueMarker = "n/a") {
161 auto getter = [&mol](size_t which) { return mol.getAtomWithIdx(which); };
162 applyMolListProp<T>(mol, pn, prefix, missingValueMarker, mol.getNumAtoms(),
163 getter);
164}
165
166template <typename T, typename U>
167void applyMolListProps(ROMol &mol, const std::string &prefix, size_t nItems,
168 U getter, const std::string missingValueMarker = "n/a") {
169 for (auto pn : mol.getPropList()) {
170 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
171 applyMolListProp<T>(mol, pn, prefix, missingValueMarker, nItems, getter);
172 }
173 }
174}
175
176//! applies all properties matching a particular prefix as an atom property
177//! list
178template <typename T>
179[[deprecated("use applyMolListProps instead")]]
180void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix,
181 const std::string missingValueMarker = "n/a") {
182 auto getter = [&mol](size_t which) { return mol.getAtomWithIdx(which); };
183 auto nItems = mol.getNumAtoms();
184 for (auto pn : mol.getPropList()) {
185 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
186 applyMolListProp<T>(mol, pn, prefix, missingValueMarker, nItems, getter);
187 }
188 }
189}
190
//! name prefixes of molecule properties that hold atom / bond property lists;
//! each is available as a view, as its length and as a std::string
static constexpr std::string_view atomPropPrefixView{"atom."};
static constexpr size_t atomPropPrefixLength = atomPropPrefixView.size();
static const std::string atomPropPrefix(atomPropPrefixView);
static constexpr std::string_view bondPropPrefixView{"bond."};
static constexpr size_t bondPropPrefixLength = bondPropPrefixView.size();
static const std::string bondPropPrefix(bondPropPrefixView);
197
198//! if the property name matches our rules for atom property lists, we'll
199//! apply it to the atoms
201 ROMol &mol, const std::string &pn,
202 const std::string &missingValueMarker = "n/a") {
203 auto propSetter = [&](const std::string &propPrefix, auto getter,
204 size_t nItems) {
205 std::string prefix = propPrefix + "prop.";
206 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
207 applyMolListProp<std::string>(mol, pn, prefix, missingValueMarker, nItems,
208 getter);
209 } else {
210 prefix = propPrefix + "iprop.";
211 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
212 applyMolListProp<int>(mol, pn, prefix, missingValueMarker, nItems,
213 getter);
214 } else {
215 prefix = propPrefix + "dprop.";
216 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
217 applyMolListProp<double>(mol, pn, prefix, missingValueMarker, nItems,
218 getter);
219 } else {
220 prefix = propPrefix + "bprop.";
221 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
222 applyMolListProp<bool>(mol, pn, prefix, missingValueMarker, nItems,
223 getter);
224 }
225 }
226 }
227 }
228 };
229
230 if (pn.find(atomPropPrefix) == 0 && pn.length() > atomPropPrefixLength) {
231 propSetter(
233 [&mol](size_t which) { return mol.getAtomWithIdx(which); },
234 mol.getNumAtoms());
235 } else if (pn.find(bondPropPrefix) == 0 &&
236 pn.length() > bondPropPrefixLength) {
237 propSetter(
239 [&mol](size_t which) { return mol.getBondWithIdx(which); },
240 mol.getNumBonds());
241 }
242}
243//! loops over all properties and applies the ones that match the rules for
244//! atom property lists to the atoms and bonds
246 ROMol &mol, const std::string &missingValueMarker = "n/a") {
247 for (const auto &pn : mol.getPropList()) {
248 processMolPropertyList(mol, pn, missingValueMarker);
249 }
250}
251
//! default for the \c lineSize argument of the property list writers
static constexpr unsigned int DEFAULT_LINESIZE = 190U;
253
254template <typename T, typename U>
255std::string getPropertyList(U getter, const std::string &propName,
256 std::string missingValueMarker = "",
257 unsigned int lineSize = DEFAULT_LINESIZE) {
258 std::string res;
259 std::string propVal;
260 if (!missingValueMarker.empty()) {
261 propVal += boost::str(boost::format("[%s] ") % missingValueMarker);
262 } else {
263 missingValueMarker = "n/a";
264 }
265 for (const auto item : getter()) {
266 std::string apVal = missingValueMarker;
267 T tVal;
268 if (item->getPropIfPresent(propName, tVal)) {
269 apVal = boost::lexical_cast<std::string>(tVal);
270 }
271 if (propVal.length() + apVal.length() + 1 >= lineSize) {
272 // remove trailing space:
273 propVal.pop_back();
274 res += propVal + "\n";
275 propVal = "";
276 }
277 propVal += apVal + " ";
278 }
279 if (!propVal.empty()) {
280 // remove the trailing space:
281 propVal.pop_back();
282 res += propVal;
283 }
284 return res;
285}
286
287template <typename T>
288[[deprecated("use getPropertyList() instead")]]
289std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName,
290 std::string missingValueMarker = "",
291 unsigned int lineSize = DEFAULT_LINESIZE) {
292 return getPropertyList<T>([&mol]() { return mol.atoms(); }, atomPropName,
293 missingValueMarker, lineSize);
294}
295
296template <typename T, typename U>
297void createPropertyList(ROMol &mol, U getter, const std::string &prefix,
298 const std::string &typeMarker,
299 const std::string &propName,
300 const std::string &missingValueMarker = "",
301 unsigned int lineSize = DEFAULT_LINESIZE) {
302 std::string molPropName = prefix + "." + typeMarker + "." + propName;
303 mol.setProp(molPropName, getPropertyList<T>(getter, propName,
304 missingValueMarker, lineSize));
305}
306
308 ROMol &mol, const std::string &atomPropName,
309 const std::string &missingValueMarker = "",
310 unsigned int lineSize = DEFAULT_LINESIZE) {
312 mol, [&mol]() { return mol.atoms(); }, "atom", "iprop", atomPropName,
313 missingValueMarker, lineSize);
314}
316 ROMol &mol, const std::string &atomPropName,
317 const std::string &missingValueMarker = "",
318 unsigned int lineSize = DEFAULT_LINESIZE) {
320 mol, [&mol]() { return mol.atoms(); }, "atom", "dprop", atomPropName,
321 missingValueMarker, lineSize);
322}
324 ROMol &mol, const std::string &atomPropName,
325 const std::string &missingValueMarker = "",
326 unsigned int lineSize = DEFAULT_LINESIZE) {
328 mol, [&mol]() { return mol.atoms(); }, "atom", "bprop", atomPropName,
329 missingValueMarker, lineSize);
330}
332 ROMol &mol, const std::string &atomPropName,
333 const std::string &missingValueMarker = "",
334 unsigned int lineSize = DEFAULT_LINESIZE) {
336 mol, [&mol]() { return mol.atoms(); }, "atom", "prop", atomPropName,
337 missingValueMarker, lineSize);
338}
339
341 ROMol &mol, const std::string &bondPropName,
342 const std::string &missingValueMarker = "",
343 unsigned int lineSize = DEFAULT_LINESIZE) {
345 mol, [&mol]() { return mol.bonds(); }, "bond", "iprop", bondPropName,
346 missingValueMarker, lineSize);
347}
349 ROMol &mol, const std::string &bondPropName,
350 const std::string &missingValueMarker = "",
351 unsigned int lineSize = DEFAULT_LINESIZE) {
353 mol, [&mol]() { return mol.bonds(); }, "bond", "dprop", bondPropName,
354 missingValueMarker, lineSize);
355}
357 ROMol &mol, const std::string &bondPropName,
358 const std::string &missingValueMarker = "",
359 unsigned int lineSize = DEFAULT_LINESIZE) {
361 mol, [&mol]() { return mol.bonds(); }, "bond", "bprop", bondPropName,
362 missingValueMarker, lineSize);
363}
365 ROMol &mol, const std::string &bondPropName,
366 const std::string &missingValueMarker = "",
367 unsigned int lineSize = DEFAULT_LINESIZE) {
369 mol, [&mol]() { return mol.bonds(); }, "bond", "prop", bondPropName,
370 missingValueMarker, lineSize);
371}
372
374
375} // namespace FileParserUtils
376} // namespace RDKit
377
378#endif
#define BOOST_LOG(__arg__)
Definition RDLog.h:109
RDKIT_RDGENERAL_EXPORT RDLogger rdWarningLog
The class for representing atoms.
Definition Atom.h:74
The class for representing 2D or 3D conformation of a molecule.
Definition Conformer.h:46
void getProp(const std::string &key, T &res) const
allows retrieval of a particular property value
Definition RDProps.h:107
void setProp(const std::string &key, T val, bool computed=false) const
sets a property value
Definition RDProps.h:77
STR_VECT getPropList(bool includePrivate=true, bool includeComputed=true) const
returns a list with the names of our properties
Definition RDProps.h:45
unsigned int getNumBonds(bool onlyHeavy=1) const
returns our number of Bonds
Atom * getAtomWithIdx(unsigned int idx)
returns a pointer to a particular Atom
unsigned int getNumAtoms() const
returns our number of atoms
Definition ROMol.h:421
CXXAtomIterator< MolGraph, Atom * > atoms()
C++11 Range iterator.
Definition ROMol.h:277
CXXBondIterator< MolGraph, Bond * > bonds()
Definition ROMol.h:316
Bond * getBondWithIdx(unsigned int idx)
returns a pointer to a particular Bond
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:177
RDKIT_FILEPARSERS_EXPORT void moveAdditionalPropertiesToSGroups(RWMol &mol)
void createAtomDoublePropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
RDKIT_FILEPARSERS_EXPORT bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true, bool expectMEND=true, bool expectMacroAtoms=false)
RDKIT_FILEPARSERS_EXPORT std::string getV3000CTAB(const ROMol &tmol, const boost::dynamic_bitset<> &wasAromatic, int confId=-1, unsigned int precision=6)
RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input, bool acceptSpaces=true)
RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input, bool acceptSpaces=true)
void createBondDoublePropertyList(ROMol &mol, const std::string &bondPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
void createPropertyList(ROMol &mol, U getter, const std::string &prefix, const std::string &typeMarker, const std::string &propName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
static constexpr std::string_view bondPropPrefixView
RDKIT_FILEPARSERS_EXPORT Atom * replaceAtomWithQueryAtom(RWMol *mol, Atom *atom)
Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead.
static constexpr unsigned int DEFAULT_LINESIZE
T stripSpacesAndCast(std::string_view input, bool acceptSpaces=false)
void applyMolListPropToAtoms(ROMol &mol, const std::string &pn, const std::string &prefix, const std::string &missingValueMarker="n/a")
applies a particular property to the atoms as an atom property list
void createAtomStringPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
void createAtomBoolPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
void applyMolListProp(ROMol &mol, const std::string &pn, const std::string &prefix, const std::string &missingValueMarker, size_t nItems, U getter)
applies a particular property to the atoms as an atom property list
static constexpr size_t bondPropPrefixLength
std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName, std::string missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
void createAtomIntPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
static const std::string bondPropPrefix
RDKIT_FILEPARSERS_EXPORT void finishMolProcessing(RWMol *res, bool chiralityPossible, const v2::FileParsers::MolFileParserParams &ps)
RDKIT_FILEPARSERS_EXPORT std::string_view strip(std::string_view orig, std::string stripChars=" \t\r\n")
void createBondBoolPropertyList(ROMol &mol, const std::string &bondPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix, const std::string missingValueMarker="n/a")
void createBondIntPropertyList(ROMol &mol, const std::string &bondPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
static constexpr std::string_view atomPropPrefixView
void processMolPropertyLists(ROMol &mol, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true)
static const std::string atomPropPrefix
void processMolPropertyList(ROMol &mol, const std::string &pn, const std::string &missingValueMarker="n/a")
static constexpr size_t atomPropPrefixLength
std::string getPropertyList(U getter, const std::string &propName, std::string missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream, unsigned int &line)
RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input, bool acceptSpaces=true)
void applyMolListProps(ROMol &mol, const std::string &prefix, size_t nItems, U getter, const std::string missingValueMarker="n/a")
void createBondStringPropertyList(ROMol &mol, const std::string &bondPropName, const std::string &missingValueMarker="", unsigned int lineSize=DEFAULT_LINESIZE)
Std stuff.