RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
FPBReader.h
Go to the documentation of this file.
1//
2// Copyright (c) 2016 Greg Landrum
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FPBREADER_H_DEC2015
12#define RD_FPBREADER_H_DEC2015
13/*! \file FPBReader.h
14
15 \brief contains a simple class for reading and searching FPB files
16
17 \b Note that this functionality is experimental and the API may change
18 in future releases.
19*/
20
21#include <fstream>
22#include <sstream>
23#include <string>
26
27#include <cstdint>
28#include <boost/shared_ptr.hpp>
29#include <boost/shared_array.hpp>
30
31namespace RDKit {
32namespace detail {
33struct FPBReader_impl;
34}
35
36//! class for reading and searching FPB files
37/*!
38 basic usage:
39 \code
40 FPBReader reader("foo.fpb");
41 reader.init();
42 boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
43 std::vector<std::pair<double, unsigned int> > nbrs =
44 reader.getTanimotoNeighbors(*ebv.get(), 0.70);
45 \endcode
46
47 \b Note: this functionality is experimental and the API may change
48 in future releases.
49
50 <b>Note on thread safety</b>
51 Operations that involve reading from the FPB file are not thread safe.
52 This means that the \c init() method is not thread safe and none of the
53 search operations are thread safe when an \c FPBReader is initialized in
54 \c lazyRead mode.
55
56*/
58 public:
60
61 //! ctor for reading from a named file
62 /*!
63 \param fname the name of the file to read
64 \param lazyRead if set to \c false all fingerprints from the file will be read
65 into memory when \c init() is called.
66 */
67 FPBReader(const char *fname, bool lazyRead = false) {
68 _initFromFilename(fname, lazyRead);  // opens the file; throws BadFileException if the stream is not usable
69 }
70 //! \overload
71 FPBReader(const std::string &fname, bool lazyRead = false) {
72 _initFromFilename(fname.c_str(), lazyRead);  // same setup as the const char* ctor, using fname's C string
73 }
74 //! ctor for reading from an open istream
75 /*!
76 \param inStream the stream to read from
77 \param takeOwnership if set, we will take over ownership of the stream pointer
78 \param lazyRead if set to \c false all fingerprints from the stream will be read
79 into memory when \c init() is called.
80
81 Some additional notes:
82 - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
83 tellg() operations.
84 - the constructor only stores its arguments; nothing is read from the stream here.
85 */
86 FPBReader(std::istream *inStream, bool takeOwnership = true,
87 bool lazyRead = false)
88 : dp_istrm(inStream),
89 dp_impl(nullptr),  // no implementation object is created by the ctor
90 df_owner(takeOwnership),
91 df_init(false),  // the reader starts out uninitialized
92 df_lazyRead(lazyRead) {}
94 destroy();
95 if (df_owner) {
96 delete dp_istrm;
97 }
98 dp_istrm = nullptr;
99 df_init = false;
100 }
101
102 //! Read the data from the file and initialize internal data structures
103 /*!
104 This must be called before most of the other methods of this class.
105
106 Some notes:
107 \li if \c lazyRead is not set, all fingerprints will be read into memory. This
108 can require substantial amounts of memory for large files.
109 \li For large files, this can take a long time.
110 \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
111 and delete inStream after calling \c init()
112 */
113 void init();
114 //! cleanup
115 /*!
116 Cleans up whatever memory was allocated during init(). Does nothing if the reader is not currently marked as initialized.
117 */
118 void cleanup() {
119 if (!df_init) {  // not initialized: skip destroy()
120 return;
121 }
122 destroy();
123 df_init = false;  // mark the reader as uninitialized again
124 }
125 //! returns the requested fingerprint as an \c ExplicitBitVect
126 boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
127 //! returns the requested fingerprint as an array of bytes
128 boost::shared_array<std::uint8_t> getBytes(unsigned int idx) const;
129
130 //! returns the id of the requested fingerprint
131 std::string getId(unsigned int idx) const;
132 //! returns a (fingerprint, id) pair for the requested fingerprint, built from getFP(idx) and getId(idx)
133 std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
134 unsigned int idx) const {
135 return std::make_pair(getFP(idx), getId(idx));
136 }
137
138 //! returns beginning and end indices of fingerprints having on-bit counts
139 //! within the range (including end points)
140 std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
141 unsigned int minCount, unsigned int maxCount);
142
143 //! returns the number of fingerprints
144 unsigned int length() const;
145 //! returns the number of bits in our fingerprints
146 unsigned int nBits() const;
147
148 //! returns the tanimoto similarity between the specified fingerprint and the
149 //! provided fingerprint
150 double getTanimoto(unsigned int idx, const std::uint8_t *bv) const;
151 //! \overload
152 double getTanimoto(unsigned int idx,
153 boost::shared_array<std::uint8_t> bv) const {
154 return getTanimoto(idx, bv.get());  // forwards the raw byte pointer to the const std::uint8_t* overload
155 }
156 //! \overload
157 double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
158
159 //! returns tanimoto neighbors that are within a similarity threshold
160 /*!
161 The result vector of (similarity,index) pairs is sorted in order
162 of decreasing similarity
163
164 \param bv the query fingerprint
165 \param threshold the minimum similarity to return
166 \param usePopcountScreen if this is true (the default) the popcount of the
167 neighbors will be used to reduce the number of calculations that need
168 to be done
169
170 */
171 std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
172 const std::uint8_t *bv, double threshold = 0.7,
173 bool usePopcountScreen = true) const;
174 //! \overload
175 std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
176 boost::shared_array<std::uint8_t> bv, double threshold = 0.7,
177 bool usePopcountScreen = true) const {
178 return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);  // forwards to the const std::uint8_t* overload
179 }
180 //! \overload
181 std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
182 const ExplicitBitVect &ebv, double threshold = 0.7,
183 bool usePopcountScreen = true) const;
184
185 //! returns the Tversky similarity between the specified fingerprint and the
186 //! provided fingerprint
187 /*!
188
189 \param idx the fingerprint to compare to
190 \param bv the query fingerprint
191 \param ca the Tversky a coefficient
192 \param cb the Tversky b coefficient
193
194 */
195 double getTversky(unsigned int idx, const std::uint8_t *bv, double ca,
196 double cb) const;
197 //! \overload
198 double getTversky(unsigned int idx, boost::shared_array<std::uint8_t> bv,
199 double ca, double cb) const {
200 return getTversky(idx, bv.get(), ca, cb);  // forwards the raw byte pointer to the const std::uint8_t* overload
201 }
202 //! \overload
203 double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
204 double cb) const;
205
206 //! returns Tversky neighbors that are within a similarity threshold
207 /*!
208 The result vector of (similarity,index) pairs is sorted in order
209 of decreasing similarity
210
211 \param bv the query fingerprint
212 \param ca the Tversky a coefficient
213 \param cb the Tversky b coefficient
214 \param threshold the minimum similarity to return
215 \param usePopcountScreen if this is true (the default) the popcount of the
216 neighbors will be used to reduce the number of calculations that need
217 to be done
218
219 */
220 std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
221 const std::uint8_t *bv, double ca, double cb, double threshold = 0.7,
222 bool usePopcountScreen = true) const;
223 //! \overload
224 std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
225 boost::shared_array<std::uint8_t> bv, double ca, double cb,
226 double threshold = 0.7, bool usePopcountScreen = true) const {
227 return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);  // forwards to the const std::uint8_t* overload
228 }
229 //! \overload
230 std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
231 const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
232 bool usePopcountScreen = true) const;
233
234 //! returns indices of all fingerprints that completely contain this one
235 /*! (i.e. where all the bits set in the query are also set in the db
236 molecule)
237 */
238 std::vector<unsigned int> getContainingNeighbors(
239 const std::uint8_t *bv) const;
240 //! \overload
241 std::vector<unsigned int> getContainingNeighbors(
242 boost::shared_array<std::uint8_t> bv) const {
243 return getContainingNeighbors(bv.get());  // forwards the raw byte pointer to the const std::uint8_t* overload
244 }
245 //! \overload
246 std::vector<unsigned int> getContainingNeighbors(
247 const ExplicitBitVect &ebv) const;
248
249 private:
250 std::istream *dp_istrm{nullptr};
251 detail::FPBReader_impl *dp_impl{nullptr}; // implementation details
252 bool df_owner{false};
253 bool df_init{false};
254 bool df_lazyRead{false};
255
256 // disable automatic copy constructors and assignment operators
257 // for this class and its subclasses. They will likely be
258 // carrying around stream pointers and copying those is a recipe
259 // for disaster.
260 FPBReader(const FPBReader &);
261 FPBReader &operator=(const FPBReader &);
262 void destroy();
263 void _initFromFilename(const char *fname, bool lazyRead) {  // shared setup for the filename ctors: open fname and reset the reader's state
264 std::istream *tmpStream = static_cast<std::istream *>(
265 new std::ifstream(fname, std::ios_base::binary));  // the file is opened in binary mode
266 if (!(*tmpStream) || (tmpStream->bad())) {  // the stream is in a failed or bad state after opening
267 std::ostringstream errout;
268 errout << "Bad input file " << fname;
269 delete tmpStream;  // release the stream before throwing so it is not leaked
270 throw BadFileException(errout.str());
271 }
272 dp_istrm = tmpStream;
273 dp_impl = nullptr;  // no implementation object is created here
274 df_owner = true;  // the stream was allocated here, so it is flagged as owned by this reader
275 df_init = false;  // the reader is left uninitialized
276 df_lazyRead = lazyRead;
277 }
278};
279} // namespace RDKit
280#endif
a class for bit vectors that are densely occupied
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:241
std::pair< unsigned int, unsigned int > getFPIdsInCountRange(unsigned int minCount, unsigned int maxCount)
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const std::uint8_t *bv, double threshold=0.7, bool usePopcountScreen=true) const
returns tanimoto neighbors that are within a similarity threshold
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const ExplicitBitVect &ebv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const std::uint8_t *bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
returns Tversky neighbors that are within a similarity threshold
void cleanup()
cleanup
Definition FPBReader.h:118
std::vector< unsigned int > getContainingNeighbors(const std::uint8_t *bv) const
returns indices of all fingerprints that completely contain this one
double getTversky(unsigned int idx, const std::uint8_t *bv, double ca, double cb) const
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< std::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:224
unsigned int length() const
returns the number of fingerprints
boost::shared_array< std::uint8_t > getBytes(unsigned int idx) const
returns the requested fingerprint as an array of bytes
double getTanimoto(unsigned int idx, const std::uint8_t *bv) const
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const ExplicitBitVect &ebv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
boost::shared_ptr< ExplicitBitVect > getFP(unsigned int idx) const
returns the requested fingerprint as an ExplicitBitVect
double getTanimoto(unsigned int idx, boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:152
double getTversky(unsigned int idx, boost::shared_array< std::uint8_t > bv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:198
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition FPBReader.h:133
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< std::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:175
double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition FPBReader.h:86
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition FPBReader.h:67
FPBReader(const std::string &fname, bool lazyRead=false)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:71
unsigned int nBits() const
returns the number of bits in our fingerprints
std::vector< unsigned int > getContainingNeighbors(const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::string getId(unsigned int idx) const
returns the id of the requested fingerprint
void init()
Read the data from the file and initialize internal data structures.
#define RDKIT_DATASTRUCTS_EXPORT
Definition export.h:89
Std stuff.