RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SynthonSpace.h
Go to the documentation of this file.
1//
2// Copyright (C) David Cosgrove 2024.
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10// This file and others here contain an implementation of
11// synthonspace substructure search similar to that described in
12// 'Fast Substructure Search in Combinatorial Library Spaces',
13// Thomas Liphardt and Thomas Sander,
14// J. Chem. Inf. Model. 2023, 63, 16, 5133–5141
15// https://doi.org/10.1021/acs.jcim.3c00290
16
17#ifndef RDKIT_SYNTHONSPACE_H
18#define RDKIT_SYNTHONSPACE_H
19
20/*! \file SynthonSpace.h
21
22 \brief contains a class for searching combinatorial libraries in
23 Synthon format such as Enamine REAL.
24
25 \b Note that this functionality is experimental and the API may change
26 in future releases.
27*/
28
29#include <map>
30#include <sstream>
31#include <string>
32#include <vector>
33
34#include <boost/dynamic_bitset.hpp>
35
36#include <RDGeneral/export.h>
42
43namespace RDKit {
44class ROMol;
45
46namespace RascalMCES {
47struct RascalOptions;
48}
49
50namespace SynthonSpaceSearch {
51
52// This is the maximum number of connectors that we can deal with at the moment.
53// In reality, there may be fewer than this. However, the key limit is in
54// the symbols used for the connectors in Enamine REAL etc.
55const std::vector<std::string> CONNECTOR_SYMBOLS{"[U]", "[Np]", "[Pu]", "[Am]"};
56constexpr unsigned int MAX_CONNECTOR_NUM{4};
57
59 std::int64_t maxHits{1000}; // The maximum number of hits to return.
60 // Use -1 for no maximum.
61 std::uint64_t maxNumFragSets{
62 100000}; // The maximum number of fragment sets the query can
63 // be broken into. Big molecules will create huge
64 // numbers of fragment sets that may cause excessive
65 // memory use. If the number of fragment sets hits this
66 // number, fragmentation stops and the search results
67 // will likely be incomplete.
68 std::int64_t toTryChunkSize{2500000}; // For similarity searching, especially
69 // fingerprint similarity, there can be a
70 // very large number of possible hits to
71 // screen which can use a lot of memory and
72 // crash the program. It will also be very
73 // slow. To alleviate the memory use, the
74 // possible hits are processed in chunks.
75 // This parameter sets the chunk size.
76
77 std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
78 // you can return the next N hits of a search
79 // having already obtained N-1.
80 bool randomSample{false}; // If true, returns a random sample of the
81 // hits, up to maxHits in number.
82 int randomSeed{-1}; // Seed for random-number generator. -1 means use
83 // a random seed (std::random_device).
84 bool buildHits{true}; // If false, reports the maximum number of hits that
85 // the search could produce, but doesn't return them.
86 int numRandomSweeps{10}; // The random sampling doesn't always produce the
87 // required number of hits in 1 go. This parameter
88 // controls how many loops it makes to try and get
89 // the hits before giving up.
90 double similarityCutoff{0.5}; // Similarity cutoff for returning hits by
91 // fingerprint similarity. The default is
92 // appropriate for a Morgan fingerprint of
93 // radius=2, it may need changing for other
94 // fingerprint types.
96 0.1}; // Similarity values for fragments are generally low
97 // due to low bit densities. For the fragment
98 // matching, reduce the similarity cutoff
99 // by this amount. A higher number will give slower search
100 // times, a lower number will give faster searches at the
101 // risk of missing some hits. The value you use should have
102 // a positive correlation with your FOMO.
104 0.1}; // The fingerprint search uses an approximate similarity method
105 // before building a product and doing a final check. The
106 // similarityCutoff is reduced by this value for the approximate
107 // check. A lower value will give faster run times at the
108 // risk of missing some hits. The value you use should have a
109 // positive correlation with your FOMO. The default is
110 // appropriate for Morgan fingerprints. With RDKit fingerprints,
111 // 0.05 is adequate, and higher than that has been seen to
112 // produce long run times.
113 unsigned int minHitHeavyAtoms{0}; // Minimum number of heavy atoms in a hit.
114 int maxHitHeavyAtoms{-1}; // Maximum number of heavy atoms in a hit.
115 // -1 means no maximum.
116 double minHitMolWt{0}; // Minimum molecular weight for a hit.
117 double maxHitMolWt{0}; // Maximum molecular weight for a hit. 0.0 means
118 // no maximum.
119 unsigned int minHitChiralAtoms{
120 0}; // Minimum number of chiral atoms in a hit.
121 int maxHitChiralAtoms{-1}; // Maximum number of chiral atoms in a hit.
122 // -1 means no maximum.
123 std::uint64_t timeOut{600}; // Maximum number of seconds to spend on a single
124 // search. 0 means no maximum.
125 int numThreads = 1; // The number of threads to use. If > 0, will use that
126 // number. If <= 0, will use the number of hardware
127 // threads plus this number. So if the number of
128 // hardware threads is 8, and numThreads is -1, it will
129 // use 7 threads.
130};
131
132class Synthon;
133
135 friend class SynthonSet;
140
141 public:
142 explicit SynthonSpace() = default;
143 ~SynthonSpace() = default;
144 SynthonSpace(const SynthonSpace &other) = delete;
145 SynthonSpace &operator=(const SynthonSpace &other) = delete;
146 /*!
147 * Get the number of different reactions in the SynthonSpace.
148 *
149 * @return size_t
150 */
151 size_t getNumReactions() const;
152 /*!
153 * Get a list of the names of all the reactions in the SynthonSpace.
154 *
155 * @return the reaction names as a std::vector<std::string>
156 */
157 std::vector<std::string> getReactionNames() const;
158 const std::shared_ptr<SynthonSet> getReaction(std::string reactionName);
159 // The Synthons have a PatternFingerprint for screening in substructure
160 // searches. It's important that the screening process creates ones
161 // of the same size, so this finds out what size that is.
162 unsigned int getPatternFPSize() const;
163 // Likewise for the fingerprints used for similarity searching
164 unsigned int getFPSize() const;
165
166 std::string getInputFileName() const;
167
168 /*!
169 * Get the total number of products that the SynthonSpace could produce.
170 *
171 * @return std::uint64_t
172 */
173 std::uint64_t getNumProducts() const;
174
175 /*!
176 * Get the info string for the fingerprint generator used to
177 * generate the stored fingerprints, so the user can query with
178 * the same type.
179 *
180 * @return the fingerprint type info string
181 */
182 std::string getSynthonFingerprintType() const { return d_fpType; }
183
184 /*!
185 * Perform a substructure search with the given query molecule across
186 * the synthonspace library. Duplicate SMILES strings produced by
187 * different reactions will be returned.
188 *
189 * @param query : query molecule
190 * @param params : (optional) settings for the search
191 * @return : the hits as a SearchResults object.
192 */
194 const ROMol &query,
197
198 /*!
199 * Perform a substructure search with the given query molecule across
200 * the synthonspace library. Duplicate SMILES strings produced by
201 * different reactions will be returned. Search results are returned
202 * incrementally through the provided callback, which will receive
203 * at most `toTryChunkSize` sized lists of ROMols at a time, thereby
204 * reducing the amount of memory required to hold search results.
205 *
206 * @param query : query molecule
207 * @param callback: user-provided callback receiving chunks of ROMols.
208 * @param params : (optional) settings for the search
209 */
211 const ROMol &query, const SearchResultCallback &callback,
214
215 /*!
216 * Perform a substructure search with the given generalized query
217 * molecule across the synthonspace library. Duplicate SMILES strings
218 * produced by different reactions will be returned.
219 *
220 * @param query : query molecule
221 * @param params : (optional) settings for the search
222 * @return : the hits as a SearchResults object.
223 */
228
229 /*!
230 * Perform a fingerprint similarity search with the given query molecule
231 * across the synthonspace library. Duplicate SMILES strings produced by
232 * different reactions will be returned.
233 * @param query : query molecule
234 * @param fpGen: a FingerprintGenerator object that will provide the
235 * fingerprints for the similarity calculation
236 * @param params : (optional) settings for the search
237 * @return : the hits as a SearchResults object.
238 */
240 const ROMol &query, const FingerprintGenerator<std::uint64_t> &fpGen,
242
243 // Perform a fingerprint similarity search with the given query molecule
244 /*!
245 * Perform a fingerprint similarity search with the given query molecule
246 * across the synthonspace library. Duplicate SMILES strings produced by
247 * different reactions will be returned. Search results are returned
248 * incrementally through the provided callback, which will receive
249 * at most `toTryChunkSize` sized lists of ROMols at a time, thereby
250 * reducing the amount of memory required to hold search results.
251 *
252 * @param query : query molecule
253 * @param fpGen: a FingerprintGenerator object that will provide the
254 * fingerprints for the similarity calculation
255 * @param callback: user-provided callback receiving chunks of ROMols.
256 * @param params : (optional) settings for the search
257 */
259 const ROMol &query, const FingerprintGenerator<std::uint64_t> &fpGen,
260 const SearchResultCallback &callback,
262
263 /*!
264 * Perform a RASCAL similarity search with the given query molecule
265 * across the synthonspace library. Duplicate SMILES strings produced by
266 * different reactions will be returned.
267 *
268 * @param query : query molecule
269 * @param rascalOptions: RASCAL options. The similarityThreshold value
270 * in the rascalOptions will be used rather than
271 * params.similarityCutoff,
272 * but params.fragSimilarityAdjuster will be used
273 * to adjust the threshold for the fragment
274 * comparisons.
275 * @param params : (optional) settings for the search
276 * @return : the hits as a SearchResults object.
277 */
279 const ROMol &query, const RascalMCES::RascalOptions &rascalOptions,
281
282 /*! Perform a RASCAL similarity search with the given query molecule
283 * across the synthonspace library. Duplicate SMILES strings produced by
284 * different reactions will be returned. Search results are returned
285 * incrementally through the provided callback, which will receive
286 * at most `toTryChunkSize` sized lists of ROMols at a time, thereby
287 * reducing the amount of memory required to hold search results.
288 *
289 * @param query : query molecule
290 * @param callback: user-provided callback receiving chunks of ROMols.
291 * @param rascalOptions: RASCAL options. The similarityThreshold value
292 * in the rascalOptions will be used rather than
293 * params.similarityCutoff,
294 * but params.fragSimilarityAdjuster will be used
295 * to adjust the threshold for the fragment
296 * comparisons.
297 * @param params : (optional) settings for the search
298 */
300 const ROMol &query, const RascalMCES::RascalOptions &rascalOptions,
301 const SearchResultCallback &callback,
303 /*!
304 *
305 * @param inFilename: name of the file containing the synthon-based library.
306 *
307 * The original format is:
308 * all lines are tab-separated
309 * first line:SMILES synton_id synton# reaction_id
310 * Note the spelling "synton" from the original paper/example file.
311 * Subsequent lines have a single reagent e.g.
312 * OCC([U])=NN=[Np] 1-1 0 triazole-1
313 * C1CCCC1N([Pu])[U] 2-1 1 triazole-1
314 * CC1CCN(C1)C(=[Np])[Pu] 3-1 2 triazole-1
315 *
316 * Other acceptable formats are as above, but with a 5th column "release":
317 * SMILES synton_id synton# reaction_id release
318 *
319 * or a comma-separated equivalent of the first format:
320 * SMILES,synton_id,synton_role,reaction_id
321 * where the 3rd column is named differently but has the same meaning.
322 * The formatting of the first 2 formats has been relaxed such that any
323 * whitespace may be used as the field separator, but a tab is tried first
324 * so that a tab-separated file may have spaces in the columns.
325 *
326 * Attachment points are U, Np, Pu and Am for up to 4 synthons per reaction.
327 * A product is created by taking a synthon from each synton# value and
328 * combining them by finding matching trans-uranic elements and replacing them
329 * with a direct bond of the appropriate type.
330 * A more (for RDKit) conventional connection flag of isotope labelled
331 * dummy atoms is also accepted ([1*] etc.).
332 * Throws a std::runtime_error if it doesn't think the format is correct,
333 * which it does by checking that the first line is as above and subsequent
334 * lines have appropriate number of fields.
335 * If it receives a SIGINT, returns cancelled=true.
336 */
337 void readTextFile(const std::string &inFilename, bool &cancelled);
338 void readStream(std::istream &is, bool &cancelled);
339
340 /*!
341 * Writes to a binary DB File in our format.
342 *
343 * @param outFilename: the name of the file to write.
344 */
345 void writeDBFile(const std::string &outFilename) const;
346
347 /*!
348 * Reads from a binary DB File in our format.
349 *
350 * @param inFilename: the name of the file to read.
351 * @param numThreads: number of threads to use in reading. If negative,
352 * adds the number to the number of hardware threads
353 * available.
354 */
355 void readDBFile(const std::string &inFilename, int numThreads = 1);
356
357 /*!
358 * Write a summary of the SynthonSpace to given stream.
359 *
360 * @param os: stream
361 */
362 void summarise(std::ostream &os);
363
364 /*!
365 * Writes the enumerated library to file in SMILES format
366 * (1 compound per line, SMILES name)
367 *
368 * @param outFilename: name of the file to write
369 */
370 void writeEnumeratedFile(const std::string &outFilename) const;
371 void enumerateToStream(std::ostream &os) const;
372
373 /*!
374 * Create the fingerprints for the synthons ready for fingerprint searches.
375 * Will be done by the fingerprint search if not done ahead of time.
376 *
377 * @param fpGen: a fingerprint generator of the appropriate type
378 */
381
382 protected:
383 unsigned int getMaxNumSynthons() const { return d_maxNumSynthons; }
384 unsigned int getMaxNumConnectors() const;
385 bool hasFingerprints() const;
386
388 // Return whether the space contains a ring-forming reaction.
389 bool getHasRingFormer() const { return d_hasRingFormer; }
390 // Take the SMILES for a Synthon and if it's not in
391 // d_synthonPool make it and add it. If it is in the pool,
392 // just look it up. Either way, return a pointer to the
393 // Synthon.
394 Synthon *addSynthonToPool(const std::string &smiles);
395 std::shared_ptr<SynthonSet> addReactionToPool(
396 const std::string &reactionName);
397
398 // Just do the lookup, and return nullptr if not found.
399 Synthon *getSynthonFromPool(const std::string &smiles) const;
400
401 private:
402 std::string d_fileName;
403 // The reactions, keyed on their IDs as the first value
404 // in the pair.
405 std::vector<std::pair<std::string, std::shared_ptr<SynthonSet>>> d_reactions;
406 // Keep the value of the maximum number of synthon sets used by
407 // any of the reactions. There's no point fragmenting any
408 // query into more than this number of fragments. Shouldn't
409 // ever be higher than 4 at present.
410 unsigned int d_maxNumSynthons{0};
411 std::uint64_t d_numProducts{0};
412
413 // This is actually 1000 * major version + 10 * minor version
414 // and hence the full version number.
415 std::int32_t d_fileMajorVersion{-1};
416
417 // The pool of all synthons, keyed on SMILES string. Synthons
418 // are frequently re-used in different reactions, so this means
419 // they're only stored once. They will be sorted and searched
420 // for via first, which is its SMILES string.
421 std::vector<std::pair<std::string, std::unique_ptr<Synthon>>> d_synthonPool;
422
423 // For the similarity search, this records the generator used for
424 // creating synthon fingerprints that are read from a binary file.
425 std::string d_fpType;
426
427 // Whether there is a ring-forming reaction in the space.
428 bool d_hasRingFormer{false};
429
430 SearchResults extendedSearch(const MolBundle &query,
431 const SubstructMatchParameters &matchParams,
432 const SynthonSpaceSearchParams &params);
433 SearchResults extendedSearch(
434 const GeneralizedSubstruct::ExtendedQueryMol::TautomerBundle_T &query,
435 const SubstructMatchParameters &matchParams,
436 const SynthonSpaceSearchParams &params);
437 SearchResults extendedSearch(const TautomerQuery &query,
438 const SubstructMatchParameters &matchParams,
439 const SynthonSpaceSearchParams &params);
440};
441
442/*!
443 * Convert the text file into the binary DB file in our format.
444 * Equivalent to readTextFile() followed by writeDBFile().
445 * If a fingerprint generator is provided, fingerprints will
446 * be created for all the synthons, which can be time-consuming.
447 * @param inFilename name of the text file to read
448 * @param outFilename name of the binary file to write
449 * @param cancelled whether it received a SIGINT
450 * @param fpGen optional fingerprint generator
451 */
453 const std::string &inFilename, const std::string &outFilename,
454 bool &cancelled,
455 const FingerprintGenerator<std::uint64_t> *fpGen = nullptr);
456
457/*!
458 * Format an integer with spaces every 3 digits for ease
459 * of reading.
460 *
461 * @return std::string
462 */
464 std::int64_t value);
465
466} // namespace SynthonSpaceSearch
467} // namespace RDKit
468
469#endif // RDKIT_SYNTHONSPACE_H
class that generates same fingerprint style for different output formats
std::string getSynthonFingerprintType() const
void fingerprintSearch(const ROMol &query, const FingerprintGenerator< std::uint64_t > &fpGen, const SearchResultCallback &callback, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
void readTextFile(const std::string &inFilename, bool &cancelled)
SearchResults substructureSearch(const GeneralizedSubstruct::ExtendedQueryMol &query, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
const std::shared_ptr< SynthonSet > getReaction(std::string reactionName)
SearchResults rascalSearch(const ROMol &query, const RascalMCES::RascalOptions &rascalOptions, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
Synthon * addSynthonToPool(const std::string &smiles)
SynthonSpace & operator=(const SynthonSpace &other)=delete
SynthonSpace(const SynthonSpace &other)=delete
void writeEnumeratedFile(const std::string &outFilename) const
void writeDBFile(const std::string &outFilename) const
void substructureSearch(const ROMol &query, const SearchResultCallback &callback, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
void enumerateToStream(std::ostream &os) const
SearchResults fingerprintSearch(const ROMol &query, const FingerprintGenerator< std::uint64_t > &fpGen, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
std::shared_ptr< SynthonSet > addReactionToPool(const std::string &reactionName)
void rascalSearch(const ROMol &query, const RascalMCES::RascalOptions &rascalOptions, const SearchResultCallback &callback, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
unsigned int getMaxNumConnectors() const
void readDBFile(const std::string &inFilename, int numThreads=1)
SearchResults substructureSearch(const ROMol &query, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
Synthon * getSynthonFromPool(const std::string &smiles) const
std::vector< std::string > getReactionNames() const
void buildSynthonFingerprints(const FingerprintGenerator< std::uint64_t > &fpGen)
void readStream(std::istream &is, bool &cancelled)
#define RDKIT_SYNTHONSPACESEARCH_EXPORT
Definition export.h:577
RDKIT_SYNTHONSPACESEARCH_EXPORT void convertTextToDBFile(const std::string &inFilename, const std::string &outFilename, bool &cancelled, const FingerprintGenerator< std::uint64_t > *fpGen=nullptr)
std::function< bool(std::vector< std::unique_ptr< ROMol > > &)> SearchResultCallback
RDKIT_SYNTHONSPACESEARCH_EXPORT std::string formattedIntegerString(std::int64_t value)
constexpr unsigned int MAX_CONNECTOR_NUM
const std::vector< std::string > CONNECTOR_SYMBOLS
Std stuff.