RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SynthonSpace.h
Go to the documentation of this file.
1//
2// Copyright (C) David Cosgrove 2024.
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10// This file and others here contain an implementation of
11// synthonspace substructure search similar to that described in
12// 'Fast Substructure Search in Combinatorial Library Spaces',
13// Thomas Liphardt and Thomas Sander,
14// J. Chem. Inf. Model. 2023, 63, 16, 5133–5141
15// https://doi.org/10.1021/acs.jcim.3c00290
16
17#ifndef RDKIT_SYNTHONSPACE_H
18#define RDKIT_SYNTHONSPACE_H
19
20/*! \file SynthonSpace.h
21
22 \brief contains a class for searching combinatorial libraries in
23 Synthon format such as Enamine REAL.
24
25 \b Note that this functionality is experimental and the API may change
26 in future releases.
27*/
28
29#include <map>
30#include <sstream>
31#include <string>
32#include <vector>
33
34#include <boost/dynamic_bitset.hpp>
35
36#include <RDGeneral/export.h>
42
43namespace RDKit {
44class ROMol;
45
46namespace RascalMCES {
47struct RascalOptions;
48}
49
50namespace SynthonSpaceSearch {
51
52// This is the maximum number of connectors that we can deal with at the moment.
53// In reality, there may be fewer than this. However, the key limit is in
54// the symbols used for the connectors in Enamine REAL etc.
55const std::vector<std::string> CONNECTOR_SYMBOLS{"[U]", "[Np]", "[Pu]", "[Am]"};
56constexpr unsigned int MAX_CONNECTOR_NUM{4};
57
59 std::int64_t maxHits{1000}; // The maximum number of hits to return.
60 // Use -1 for no maximum.
61 std::uint64_t maxNumFragSets{
62 100000}; // The maximum number of fragment sets the query can
63 // be broken into. Big molecules will create huge
64 // numbers of fragment sets that may cause excessive
65 // memory use. If the number of fragment sets hits this
66 // number, fragmentation stops and the search results
67 // will likely be incomplete.
68 std::int64_t toTryChunkSize{2500000}; // For similarity searching, especially
69 // fingerprint similarity, there can be a
70 // very large number of possible hits to
71 // screen which can use a lot of memory and
72 // crash the program. It will also be very
73 // slow. To alleviate the memory use, the
74 // possible hits are processed in chunks.
75 // This parameter sets the chunk size.
76
77 std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
78 // you can return the next N hits of a search
79 // having already obtained N-1.
80 bool randomSample{false}; // If true, returns a random sample of the hit
81 // list, up to maxHits in number.
82 int randomSeed{-1}; // Seed for random-number generator. -1 means use
83 // a random seed (std::random_device).
84 bool buildHits{true}; // If false, reports the maximum number of hits that
85 // the search could produce, but doesn't return them.
86 int numRandomSweeps{10}; // The random sampling doesn't always produce the
87 // required number of hits in 1 go. This parameter
88 // controls how many loops it makes to try and get
89 // the hits before giving up.
90 double similarityCutoff{0.5}; // Similarity cutoff for returning hits by
91 // fingerprint similarity. The default is
92 // appropriate for a Morgan fingerprint of
93 // radius=2, it may need changing for other
94 // fingerprint types.
95 double fragSimilarityAdjuster{
96 0.1}; // Similarity values for fragments are generally low
97 // due to low bit densities. For the fragment
98 // matching, reduce the similarity cutoff
99 // by this amount. A higher number will give slower search
100 // times, a lower number will give faster searches at the
101 // risk of missing some hits. The value you use should have
102 // a positive correlation with your FOMO.
103 double approxSimilarityAdjuster{
104 0.1}; // The fingerprint search uses an approximate similarity method
105 // before building a product and doing a final check. The
106 // similarityCutoff is reduced by this value for the approximate
107 // check. A lower value will give faster run times at the
108 // risk of missing some hits. The value you use should have a
109 // positive correlation with your FOMO. The default is
110 // appropriate for Morgan fingerprints. With RDKit fingerprints,
111 // 0.05 is adequate, and higher than that has been seen to
112 // produce long run times.
113 unsigned int minHitHeavyAtoms{0}; // Minimum number of heavy atoms in a hit.
114 int maxHitHeavyAtoms{-1}; // Maximum number of heavy atoms in a hit.
115 // -1 means no maximum.
116 double minHitMolWt{0}; // Minimum molecular weight for a hit.
117 double maxHitMolWt{0}; // Maximum molecular weight for a hit. 0.0 means
118 // no maximum.
119 unsigned int minHitChiralAtoms{
120 0}; // Minimum number of chiral atoms in a hit.
121 int maxHitChiralAtoms{-1}; // Maximum number of chiral atoms in a hit.
122 // -1 means no maximum.
123 std::uint64_t timeOut{600}; // Maximum number of seconds to spend on a single
124 // search. 0 means no maximum.
125 int numThreads = 1; // The number of threads to use. If > 0, will use that
126 // number. If <= 0, will use the number of hardware
127 // threads plus this number. So if the number of
128 // hardware threads is 8, and numThreads is -1, it will
129 // use 7 threads.
130};
131
132class Synthon;
133
135 friend class SynthonSet;
139
140 public:
141 explicit SynthonSpace() = default;
142 ~SynthonSpace() = default;
143 SynthonSpace(const SynthonSpace &other) = delete;
144 SynthonSpace &operator=(const SynthonSpace &other) = delete;
145 /*!
146 * Get the number of different reactions in the SynthonSpace.
147 *
148 * @return size_t
149 */
150 size_t getNumReactions() const;
151 /*!
152 * Get a list of the names of all the reactions in the SynthonSpace.
153 *
154 * @return the reaction names, as a std::vector<std::string>
155 */
156 std::vector<std::string> getReactionNames() const;
157 const std::shared_ptr<SynthonSet> getReaction(std::string reactionName);
158 // The Synthons have a PatternFingerprint for screening in substructure
159 // searches. It's important that the screening process creates ones
160 // of the same size, so this finds out what size that is.
161 unsigned int getPatternFPSize() const;
162 // Likewise for the fingerprints used for similarity searching
163 unsigned int getFPSize() const;
164
165 std::string getInputFileName() const;
166
167 /*!
168 * Get the total number of products that the SynthonSpace could produce.
169 *
170 * @return std::uint64_t
171 */
172 std::uint64_t getNumProducts() const;
173
174 /*!
175 * Get the info string for the fingerprint generator used to
176 * generate the stored fingerprints, so the user can query with
177 * the same type.
178 *
179 * @return the fingerprint generator info string
180 */
181 std::string getSynthonFingerprintType() const { return d_fpType; }
182
183 /*!
184 * Perform a substructure search with the given query molecule across
185 * the synthonspace library. Duplicate SMILES strings produced by
186 * different reactions will be returned.
187 *
188 * @param query : query molecule
189 * @param params : (optional) settings for the search
190 * @return : the hits as a SearchResults object.
191 */
193 const ROMol &query,
196
197 /*!
198 * Perform a substructure search with the given generalized query
199 * molecule across the synthonspace library. Duplicate SMILES strings
200 * produced by different reactions will be returned.
201 *
202 * @param query : query molecule
203 * @param params : (optional) settings for the search
204 * @return : the hits as a SearchResults object.
205 */
210
211 /*!
212 * Perform a fingerprint similarity search with the given query molecule
213 * across the synthonspace library. Duplicate SMILES strings produced by
214 * different reactions will be returned.
215 * @param query : query molecule
216 * @param fpGen: a FingerprintGenerator object that will provide the
217 * fingerprints for the similarity calculation
218 * @param params : (optional) settings for the search
219 * @return : the hits as a SearchResults object.
220 */
222 const ROMol &query, const FingerprintGenerator<std::uint64_t> &fpGen,
224
225 // Perform a RASCAL similarity search with the given query molecule
226 // across the synthonspace library. Duplicate SMILES strings produced by
227 // different reactions will be returned.
228 /*!
229 *
230 * @param query : query molecule
231 * @param rascalOptions: RASCAL options. The similarityThreshold value
232 * in the rascalOptions will be used rather than
233 * params.similarityCutoff,
234 * but params.fragSimilarityAdjuster will be used
235 * to adjust the threshold for the fragment
236 * comparisons.
237 * @param params : (optional) settings for the search
238 * @return : the hits as a SearchResults object.
239 */
241 const ROMol &query, const RascalMCES::RascalOptions &rascalOptions,
243
244 /*!
245 *
246 * @param inFilename: name of the file containing the synthon-based library.
247 *
248 * The original format is:
249 * all lines are tab-separated
250 * first line:SMILES synton_id synton# reaction_id
251 * Note the spelling "synton" from the original paper/example file.
252 * Subsequent lines have a single reagent e.g.
253 * OCC([U])=NN=[Np] 1-1 0 triazole-1
254 * C1CCCC1N([Pu])[U] 2-1 1 triazole-1
255 * CC1CCN(C1)C(=[Np])[Pu] 3-1 2 triazole-1
256 *
257 * Other acceptable formats are as above, but with a 5th column "release":
258 * SMILES synton_id synton# reaction_id release
259 *
260 * or a comma-separated equivalent of the first format:
261 * SMILES,synton_id,synton_role,reaction_id
262 * but with the 3rd column named differently but with the same meaning.
263 * The formatting of the first 2 formats has been relaxed such that any
264 * whitespace may be used as the field separator.
265 *
266 * Attachment points are U, Np, Pu and Am for up to 4 synthons per reaction.
267 * A product is created by taking a synthon from each synton# value and
268 * combining them by replacing matching trans-uranic elements
269 * with a direct bond of the appropriate type.
270 * A more (for RDKit) conventional connection flag of isotope labelled
271 * dummy atoms is also accepted ([1*] etc.).
272 * Throws a std::runtime_error if it doesn't think the format is correct,
273 * which it does by checking that the first line is as above and subsequent
274 * lines have appropriate number of fields.
275 * If it receives a SIGINT, sets cancelled to true.
276 */
277 void readTextFile(const std::string &inFilename, bool &cancelled);
278 void readStream(std::istream &is, bool &cancelled);
279
280 /*!
281 * Writes to a binary DB File in our format.
282 *
283 * @param outFilename: the name of the file to write.
284 */
285 void writeDBFile(const std::string &outFilename) const;
286
287 /*!
288 * Reads from a binary DB File in our format.
289 *
290 * @param inFilename: the name of the file to read.
291 * @param numThreads: number of threads to use in reading. If negative,
292 * adds the number to the number of hardware threads
293 * available.
294 */
295 void readDBFile(const std::string &inFilename, int numThreads = 1);
296
297 /*!
298 * Write a summary of the SynthonSpace to given stream.
299 *
300 * @param os: stream
301 */
302 void summarise(std::ostream &os);
303
304 /*!
305 * Writes the enumerated library to file in SMILES format
306 * (1 compound per line, SMILES name)
307 *
308 * @param outFilename: name of the file to write
309 */
310 void writeEnumeratedFile(const std::string &outFilename) const;
311 void enumerateToStream(std::ostream &os) const;
312
313 /*!
314 * Create the fingerprints for the synthons ready for fingerprint searches.
315 * Will be done by the fingerprint search if not done ahead of time.
316 *
317 * @param fpGen: a fingerprint generator of the appropriate type
318 */
321
322 protected:
323 unsigned int getMaxNumSynthons() const { return d_maxNumSynthons; }
324
325 bool hasFingerprints() const;
326
328
329 // Take the SMILES for a Synthon and if it's not in
330 // d_synthonPool make it and add it. If it is in the pool,
331 // just look it up. Either way, return a pointer to the
332 // Synthon.
333 Synthon *addSynthonToPool(const std::string &smiles);
334 std::shared_ptr<SynthonSet> addReactionToPool(
335 const std::string &reactionName);
336
337 // Just do the lookup, and return nullptr if not found.
338 Synthon *getSynthonFromPool(const std::string &smiles) const;
339
340 private:
341 std::string d_fileName;
342 // The reactions, keyed on their IDs as the first value
343 // in the pair.
344 std::vector<std::pair<std::string, std::shared_ptr<SynthonSet>>> d_reactions;
345 // Keep the value of the maximum number of synthon sets used by
346 // any of the reactions. There's no point fragmenting any
347 // query into more than this number of fragments. Shouldn't
348 // ever be higher than 4 at present.
349 unsigned int d_maxNumSynthons{0};
350 std::uint64_t d_numProducts{0};
351
352 // This is actually 1000 * major version + 10 * minor version
353 // and hence the full version number.
354 std::int32_t d_fileMajorVersion{-1};
355
356 // The pool of all synthons, keyed on SMILES string. Synthons
357 // are frequently re-used in different reactions, so this means
358 // they're only stored once. They will be sorted and searched
359 // for via first, which is its SMILES string.
360 std::vector<std::pair<std::string, std::unique_ptr<Synthon>>> d_synthonPool;
361
362 // For the similarity search, this records the generator used for
363 // creating synthon fingerprints that are read from a binary file.
364 std::string d_fpType;
365
366 SearchResults extendedSearch(const MolBundle &query,
367 const SubstructMatchParameters &matchParams,
368 const SynthonSpaceSearchParams &params);
369 SearchResults extendedSearch(
370 const GeneralizedSubstruct::ExtendedQueryMol::TautomerBundle_T &query,
371 const SubstructMatchParameters &matchParams,
372 const SynthonSpaceSearchParams &params);
373 SearchResults extendedSearch(const TautomerQuery &query,
374 const SubstructMatchParameters &matchParams,
375 const SynthonSpaceSearchParams &params);
376};
377
378/*!
379 * Convert the text file into the binary DB file in our format.
380 * Equivalent to readTextFile() followed by writeDBFile().
381 * If a fingerprint generator is provided, fingerprints will
382 * be created for all the synthons, which can be time-consuming.
383 * @param inFilename name of the text file to read
384 * @param outFilename name of the binary file to write
385 * @param cancelled whether it received a SIGINT
386 * @param fpGen optional fingerprint generator
387 */
389 const std::string &inFilename, const std::string &outFilename,
390 bool &cancelled,
392
393/*!
394 * Format an integer with spaces every 3 digits for ease
395 * of reading.
396 *
397 * @return std::string
398 */
400 std::int64_t value);
401
402} // namespace SynthonSpaceSearch
403} // namespace RDKit
404
405#endif // RDKIT_SYNTHONSPACE_H
class that generates same fingerprint style for different output formats
std::string getSynthonFingerprintType() const
void readTextFile(const std::string &inFilename, bool &cancelled)
SearchResults substructureSearch(const GeneralizedSubstruct::ExtendedQueryMol &query, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
const std::shared_ptr< SynthonSet > getReaction(std::string reactionName)
SearchResults rascalSearch(const ROMol &query, const RascalMCES::RascalOptions &rascalOptions, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
Synthon * addSynthonToPool(const std::string &smiles)
SynthonSpace & operator=(const SynthonSpace &other)=delete
SynthonSpace(const SynthonSpace &other)=delete
void writeEnumeratedFile(const std::string &outFilename) const
void writeDBFile(const std::string &outFilename) const
void enumerateToStream(std::ostream &os) const
SearchResults fingerprintSearch(const ROMol &query, const FingerprintGenerator< std::uint64_t > &fpGen, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
std::shared_ptr< SynthonSet > addReactionToPool(const std::string &reactionName)
void readDBFile(const std::string &inFilename, int numThreads=1)
SearchResults substructureSearch(const ROMol &query, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
Synthon * getSynthonFromPool(const std::string &smiles) const
std::vector< std::string > getReactionNames() const
void buildSynthonFingerprints(const FingerprintGenerator< std::uint64_t > &fpGen)
void readStream(std::istream &is, bool &cancelled)
#define RDKIT_SYNTHONSPACESEARCH_EXPORT
Definition export.h:545
RDKIT_SYNTHONSPACESEARCH_EXPORT void convertTextToDBFile(const std::string &inFilename, const std::string &outFilename, bool &cancelled, const FingerprintGenerator< std::uint64_t > *fpGen=nullptr)
RDKIT_SYNTHONSPACESEARCH_EXPORT std::string formattedIntegerString(std::int64_t value)
constexpr unsigned int MAX_CONNECTOR_NUM
const std::vector< std::string > CONNECTOR_SYMBOLS
Std stuff.
bool rdvalue_is(const RDValue_cast_t)