RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SmilesWrite.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_SMILESWRITE_H_012020
12#define RD_SMILESWRITE_H_012020
13
14#include <string>
15#include <vector>
16#include <memory>
17#include <cstdint>
18#include <limits>
20
21#include <boost/shared_ptr.hpp>
22
23namespace RDKit {
24class Atom;
25class Bond;
26class ROMol;
27
28typedef std::vector<boost::shared_ptr<ROMol>> MOL_SPTR_VECT;
29
//! \brief Options controlling how a SMILES string is generated.
/*!
  Every member has an in-class default, so a default-constructed instance
  can be used as-is.

  NOTE(review): the line opening this struct is absent from the listing this
  text was recovered from. The name is taken from its uses elsewhere in the
  header; confirm against upstream whether an export macro belongs on the
  declaration.
*/
struct SmilesWriteParams {
  bool doIsomericSmiles =
      true; /**< include stereochemistry and isotope information */
  bool doKekule = false; /**< kekulize the molecule before generating the
                            SMILES and output single/double bonds. NOTE that
                            the output is not canonical and that this will
                            throw an exception if the molecule cannot be
                            kekulized. */
  bool canonical = true;         /**< generate canonical SMILES */
  bool cleanStereo = true;       /**< clean up stereo */
  bool allBondsExplicit = false; /**< include symbols for all bonds */
  bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
  bool doRandom = false; /**< randomize the output order. The resulting SMILES
                            is not canonical and the value of the canonical
                            parameter will be ignored. */
  int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
                            atom. The resulting SMILES is not canonical and
                            the value of the canonical parameter will be
                            ignored. */
  bool includeDativeBonds =
      true; /**< include the RDKit extension for dative bonds. Otherwise
               dative bonds will be written as single bonds */
  bool ignoreAtomMapNumbers = false; /**< if true, ignores any atom map
                                        numbers when canonicalizing the
                                        molecule */
};
54
55namespace SmilesWrite {
56
57BETTER_ENUM(CXSmilesFields, uint32_t, // clang-format off
58 CX_NONE = 0,
59 CX_ATOM_LABELS = 1 << 0,
60 CX_MOLFILE_VALUES = 1 << 1,
61 CX_COORDS = 1 << 2,
62 CX_RADICALS = 1 << 3,
63 CX_ATOM_PROPS = 1 << 4,
64 CX_LINKNODES = 1 << 5,
65 CX_ENHANCEDSTEREO = 1 << 6,
66 CX_SGROUPS = 1 << 7,
67 CX_POLYMER = 1 << 8,
68 CX_BOND_CFG = 1 << 9,
69 CX_BOND_ATROPISOMER = 1 << 10,
70 CX_COORDINATE_BONDS = 1 << 11,
71 CX_HYDROGEN_BONDS = 1 << 12,
72 CX_ZERO_BONDS = 1 << 13,
73 CX_ALL = 0x7fffffff,
75);
76
77//! \brief returns the cxsmiles data for a molecule
79 const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL);
80
81//! \brief returns the cxsmiles data for a vector of molecules
83 const std::vector<ROMol *> &mols, std::uint32_t flags);
84
85//! \brief returns true if the atom number is in the SMILES organic subset
87
88//! \brief returns the SMILES for an atom
89/*!
90 \param atom : the atom to work with
91 \param ps : the parameters controlling the SMILES generation
92*/
94 const SmilesWriteParams &ps);
95
96//! \brief returns the SMILES for an atom
97/*!
98 \param atom : the atom to work with
99 \param doKekule : we're doing kekulized smiles (e.g. don't use
100 lower case for the atom label)
101 \param bondIn : the bond we came into the atom on (unused)
102 \param allHsExplicit : if true, hydrogen counts will be provided for every
103 atom.
104 \param isomericSmiles : if true, isomeric SMILES will be generated
105*/
106inline std::string GetAtomSmiles(const Atom *atom, bool doKekule = false,
107 const Bond * = nullptr,
108 bool allHsExplicit = false,
109 bool isomericSmiles = true) {
110 // RDUNUSED_PARAM(bondIn);
113 ps.doKekule = doKekule;
114 ps.allHsExplicit = allHsExplicit;
115 return GetAtomSmiles(atom, ps);
116};
117
118//! \brief returns the SMILES for a bond
119/*!
120 \param bond : the bond to work with
121 \param ps : the parameters controlling the SMILES generation
122 \param atomToLeftIdx : the index of the atom preceding \c bond
123 in the SMILES
124*/
126 const SmilesWriteParams &ps,
127 int atomToLeftIdx = -1);
128//! \brief returns the SMILES for a bond
129/*!
130 \param bond : the bond to work with
131 \param atomToLeftIdx : the index of the atom preceding \c bond
132 in the SMILES
133 \param doKekule : we're doing kekulized smiles (e.g. write out
134 bond orders for aromatic bonds)
135 \param allBondsExplicit : if true, symbols will be included for all bonds.
136*/
137inline std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx = -1,
138 bool doKekule = false,
139 bool allBondsExplicit = false) {
141 ps.doKekule = doKekule;
142 ps.allBondsExplicit = allBondsExplicit;
143 ps.doIsomericSmiles = false;
144 return GetBondSmiles(bond, ps, atomToLeftIdx);
145};
146
147namespace detail {
149 const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles);
150}
151
152} // namespace SmilesWrite
153
154//! \brief returns canonical SMILES for a molecule
156 const ROMol &mol, const SmilesWriteParams &params);
157
158//! \brief returns SMILES for a molecule, canonical by default
159/*!
160 \param mol : the molecule in question.
161 \param doIsomericSmiles : include stereochemistry and isotope information
162 in the SMILES
163
164 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
165 this will throw an exception if the molecule cannot be kekulized.
166
167 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
168 The resulting SMILES is not, of course, canonical.
169 \param canonical : if false, no attempt will be made to canonicalize the
170 SMILES
171 \param allBondsExplicit : if true, symbols will be included for all bonds.
172 \param allHsExplicit : if true, hydrogen counts will be provided for every
173 atom.
174 \param doRandom : if true, the first atom in the SMILES string will be
175 selected at random and the SMILES string will not be canonical
176 \param ignoreAtomMapNumbers : if true, ignores any atom map numbers when
177 canonicalizing the molecule
178 */
179inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
180 bool doKekule = false, int rootedAtAtom = -1,
181 bool canonical = true,
182 bool allBondsExplicit = false,
183 bool allHsExplicit = false,
184 bool doRandom = false,
185 bool ignoreAtomMapNumbers = false) {
187 ps.doIsomericSmiles = doIsomericSmiles;
188 ps.doKekule = doKekule;
189 ps.rootedAtAtom = rootedAtAtom;
190 ps.canonical = canonical;
191 ps.allBondsExplicit = allBondsExplicit;
192 ps.allHsExplicit = allHsExplicit;
193 ps.doRandom = doRandom;
194 ps.ignoreAtomMapNumbers = ignoreAtomMapNumbers;
195 return MolToSmiles(mol, ps);
196};
197
198//! \brief returns a vector of random SMILES for a molecule (may contain
199//! duplicates)
200/*!
201 \param mol : the molecule in question.
202 \param numSmiles : the number of SMILES to return
203 \param randomSeed : if >0, will be used to seed the random number generator
204 \param doIsomericSmiles : include stereochemistry and isotope information
205 in the SMILES
206 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
207 \param allBondsExplicit : if true, symbols will be included for all bonds.
208 \param allHsExplicit : if true, hydrogen counts will be provided for every
209 atom.
210 */
212 const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
213 bool doIsomericSmiles = true, bool doKekule = false,
214 bool allBondsExplicit = false, bool allHsExplicit = false);
215
216//! \brief returns canonical SMILES for part of a molecule
218 const ROMol &mol, const SmilesWriteParams &params,
219 const std::vector<int> &atomsToUse,
220 const std::vector<int> *bondsToUse = nullptr,
221 const std::vector<std::string> *atomSymbols = nullptr,
222 const std::vector<std::string> *bondSymbols = nullptr);
223
224//! \brief returns canonical SMILES for part of a molecule
225/*!
226 \param mol : the molecule in question.
227 \param atomsToUse : indices of the atoms in the fragment
228 \param bondsToUse : indices of the bonds in the fragment. If this is not
229 provided,
230 all bonds between the atoms in atomsToUse will be included
231 \param atomSymbols : symbols to use for the atoms in the output SMILES
232 \param bondSymbols : symbols to use for the bonds in the output SMILES
233 \param doIsomericSmiles : include stereochemistry and isotope information
234 in the SMILES
235 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
236 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
237 The resulting SMILES is not, of course, canonical.
238 \param canonical : if false, no attempt will be made to canonicalize the
239 SMILES
240 \param allBondsExplicit : if true, symbols will be included for all bonds.
241 \param allHsExplicit : if true, hydrogen counts will be provided for every
242 atom.
243 \param doRandom : generate a randomized smiles string by randomly choosing
244 the priority to follow in the DFS traversal. [default false]
245
246 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
247
248 */
249inline std::string MolFragmentToSmiles(
250 const ROMol &mol, const std::vector<int> &atomsToUse,
251 const std::vector<int> *bondsToUse = nullptr,
252 const std::vector<std::string> *atomSymbols = nullptr,
253 const std::vector<std::string> *bondSymbols = nullptr,
254 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
255 bool canonical = true, bool allBondsExplicit = false,
256 bool allHsExplicit = false) {
258 ps.doIsomericSmiles = doIsomericSmiles;
259 ps.doKekule = doKekule;
260 ps.rootedAtAtom = rootedAtAtom;
261 ps.canonical = canonical;
262 ps.allBondsExplicit = allBondsExplicit;
263 ps.allHsExplicit = allHsExplicit;
264 return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
266}
267
269 RestoreBondDirOptionTrue = 0, //<!DO restore bond dirs
270 RestoreBondDirOptionClear = 1 //<!clear all bond dir information
271);
272
273//! \brief returns canonical CXSMILES for a molecule
275 const ROMol &mol, const SmilesWriteParams &ps,
276 std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL,
278 RestoreBondDirOption::RestoreBondDirOptionClear);
279
280//! \brief returns canonical CXSMILES for a molecule
281/*!
282 \param mol : the molecule in question.
283 \param doIsomericSmiles : include stereochemistry and isotope information
284 in the SMILES
285 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
286 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
287 The resulting SMILES is not, of course, canonical.
288 \param canonical : if false, no attempt will be made to canonicalize the
289 SMILES
290 \param allBondsExplicit : if true, symbols will be included for all bonds.
291 \param allHsExplicit : if true, hydrogen counts will be provided for every
292 \param doRandom : generate a randomized smiles string by randomly choosing
293 the priority to follow in the DFS traversal. [default false]
294 atom.
295 */
296inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
297 bool doKekule = false, int rootedAtAtom = -1,
298 bool canonical = true,
299 bool allBondsExplicit = false,
300 bool allHsExplicit = false,
301 bool doRandom = false) {
303 ps.doIsomericSmiles = doIsomericSmiles;
304 ps.doKekule = doKekule;
305 ps.rootedAtAtom = rootedAtAtom;
306 ps.canonical = canonical;
307 ps.allBondsExplicit = allBondsExplicit;
308 ps.allHsExplicit = allHsExplicit;
309 ps.doRandom = doRandom;
310 return MolToCXSmiles(mol, ps, SmilesWrite::CXSmilesFields::CX_ALL);
311};
312
313//! \brief returns canonical CXSMILES for part of a molecule
315 const ROMol &mol, const SmilesWriteParams &params,
316 const std::vector<int> &atomsToUse,
317 const std::vector<int> *bondsToUse = nullptr,
318 const std::vector<std::string> *atomSymbols = nullptr,
319 const std::vector<std::string> *bondSymbols = nullptr);
320
321//! \brief returns canonical CXSMILES for part of a molecule
322/*!
323 \param mol : the molecule in question.
324 \param atomsToUse : indices of the atoms in the fragment
325 \param bondsToUse : indices of the bonds in the fragment. If this is not
326 provided,
327 all bonds between the atoms in atomsToUse will be included
328 \param atomSymbols : symbols to use for the atoms in the output SMILES
329 \param bondSymbols : symbols to use for the bonds in the output SMILES
330 \param doIsomericSmiles : include stereochemistry and isotope information
331 in the SMILES
332 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
333 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
334 The resulting SMILES is not, of course, canonical.
335 \param canonical : if false, no attempt will be made to canonicalize the
336 SMILES
337 \param allBondsExplicit : if true, symbols will be included for all bonds.
338 \param allHsExplicit : if true, hydrogen counts will be provided for every
339 atom.
340
341 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
342
343 */
344inline std::string MolFragmentToCXSmiles(
345 const ROMol &mol, const std::vector<int> &atomsToUse,
346 const std::vector<int> *bondsToUse = nullptr,
347 const std::vector<std::string> *atomSymbols = nullptr,
348 const std::vector<std::string> *bondSymbols = nullptr,
349 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
350 bool canonical = true, bool allBondsExplicit = false,
351 bool allHsExplicit = false) {
353 ps.doIsomericSmiles = doIsomericSmiles;
354 ps.doKekule = doKekule;
355 ps.rootedAtAtom = rootedAtAtom;
356 ps.canonical = canonical;
357 ps.allBondsExplicit = allBondsExplicit;
358 ps.allHsExplicit = allHsExplicit;
359 return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
361}
362
363} // namespace RDKit
364#endif
#define BETTER_ENUM(Enum, Underlying,...)
Definition BetterEnums.h:17
The class for representing atoms.
Definition Atom.h:75
class for representing a bond
Definition Bond.h:47
#define RDKIT_SMILESPARSE_EXPORT
Definition export.h:505
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles)
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, const SmilesWriteParams &ps)
returns the SMILES for an atom
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atom number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, const SmilesWriteParams &ps, int atomToLeftIdx=-1)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
Std stuff.
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
bool rdvalue_is(const RDValue_cast_t)
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL, RestoreBondDirOption restoreBondDirs=RestoreBondDirOption::RestoreBondDirOptionClear)
returns canonical CXSMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule
std::vector< boost::shared_ptr< ROMol > > MOL_SPTR_VECT