RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MHFP.h
Go to the documentation of this file.
1//
2// 2019, Daniel Probst, Reymond Group @ University of Bern
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10
11/*! \file MHFP.h
12
13*/
14#include <RDGeneral/export.h>
15#ifndef RD_MHFPFPS_H
16#define RD_MHFPFPS_H
17#include <string>
18#include <vector>
19#include <GraphMol/ROMol.h>
21
22class SparseBitVect;
23namespace RDKit {
24namespace MHFPFingerprints {
25const std::string mhfpFingerprintVersion = "1.0.0";
26
27namespace FNV {
28const uint32_t prime = 0x01000193;
29const uint32_t seed = 0x811C9DC5;
30
31//! A simple implementation of the Fowler–Noll–Vo hash function.
32inline uint32_t hash(const std::string& str, uint32_t hash = seed) {
33 const unsigned char* ptr = (const unsigned char*)str.c_str();
34 size_t len = str.length();
35
36 while (len--) {
37 hash = (*ptr++ ^ hash) * prime;
38 }
39
40 return hash;
41};
42} // namespace FNV
43
45 public:
46 //! Constructor
47 /*!
48 \brief Construct a MHFPEncoder
49
50 The MHFPEncoder class is instantieted with a given number of permutations
51 and a seed. Fingerprints / minhashes created with a different number of
52 permutations or a different seed are not compatible.
53
54 \param n_permutations the number of permutations used to create hash
55 functions. This will be the dimensionality of the resulting vector.
56 Default: <tt>2048</tt>.
57 \param seed a random seed. Default: <tt>42</tt>.
58 */
59 MHFPEncoder(unsigned int n_permutations = 2048, unsigned int seed = 42);
60
61 /*!
62 \brief Creates a MinHash from a vector of strings.
63
64 This method is exposed in order to enable advanced usage of this MHFP
65 implementation such as customizing the properties that are hashed in order
66 to create an MHFP instance. In theory, any number of values that can be
67 represented as strings can be minhashed. This method is called
68 by MHFPEncoder::Encode.
69
70 \param vec a vector containg strings (e.g. the smiles shingling of a
71 molecule).
72
73 \returns the MinHash of the input.
74 */
75 std::vector<uint32_t> FromStringArray(const std::vector<std::string>& vec);
76
77 /*!
78 \brief Creates a MinHash from a list of unsigned integers.
79
80 This method is exposed in order to enable advanced usage of this MHFP
81 implementation such as MinHashing a sparse array generated by another
82 fingerprint (e.g. Morgan / ECFP).
83
84 \param vec a vector containg unsigned integers.
85
86 \returns the MinHash of the input.
87 */
88 std::vector<uint32_t> FromArray(const std::vector<uint32_t>& vec);
89
90 /*!
91 \brief Creates a molecular shingling based on circular substructures.
92
93 A molecular shingling is a vector of SMILES that were extracted from and
94 represent a molecule. This method extracts substructures centered at each
95 atom of the molecule with different radii. A molecule with 10 atoms will
96 generate <tt>10 * 3</tt> shingles when a radius of <tt>3</tt> is chosen.
97
98 \param radius the maximum radius of the substructure that is generated at
99 each atom. Default: <tt>3</tt>.
100 \param rings whether the rings (SSSR) are extrected from the molecule and
101 added to the shingling. Given the molecule
102 <tt>"C1CCCCCC1C(=O)C"</tt>, "<tt>C1CCCCCC1"</tt> would be added
103 to the shingling. Default: <tt>true</tt>.
104 \param isomeric whether the SMILES added to the shingling are isomeric.
105 Default: <tt>false</tt>.
106 \param kekulize whether the SMILES added to the shingling are kekulized.
107 Default: <tt>true</tt>. NOTE that this will throw an exception if
108 the molecule cannot be kekulized.
109 \param min_radius the minimum radius that is used to extract n-grams.
110 Default: <tt>1</tt>.
111
112 \returns the shingling of a molecule.
113 */
114 std::vector<std::string> CreateShingling(const ROMol& mol,
115 unsigned char radius = 3,
116 bool rings = true,
117 bool isomeric = false,
118 bool kekulize = false,
119 unsigned char min_radius = 1);
120
121 //! \overload
122 std::vector<std::string> CreateShingling(const std::string& smiles,
123 unsigned char radius = 3,
124 bool rings = true,
125 bool isomeric = false,
126 bool kekulize = false,
127 unsigned char min_radius = 1);
128
129 /*!
130 \brief Creates a MinHash vector from a molecule.
131
132 This methods is a wrapper around MHFPEncoder::CreateShingling and
133 MHFPEncoder::FromStringArray. When a vector of molecules or SMILES is passed
134 and RDKit was compiled with OpenMP, it is parallelized and will speed up by
135 a factor of the number of cores.
136
137 \param radius the maximum radius of the substructure that is generated at
138 each atom. Default: <tt>3</tt>.
139 \param rings whether the rings (SSSR) are extrected from the molecule and
140 added to the shingling. Given the molecule
141 <tt>"C1CCCCCC1C(=O)C"</tt>, "<tt>C1CCCCCC1"</tt> would be added
142 to the shingling. Default: <tt>true</tt>.
143 \param isomeric whether the SMILES added to the shingling are isomeric.
144 Default: <tt>false</tt>.
145 \param kekulize whether the SMILES added to the shingling are kekulized.
146 Default: <tt>true</tt>. NOTE that this will throw an exception if
147 the molecule cannot be kekulized.
148 \param min_radius the minimum radius that is used to extract n-grams.
149 Default: <tt>1</tt>.
150
151 \returns the MHFP fingerprint.
152 */
153 std::vector<uint32_t> Encode(ROMol& mol, unsigned char radius = 3,
154 bool rings = true, bool isomeric = false,
155 bool kekulize = false,
156 unsigned char min_radius = 1);
157
158 //! \overload
159 std::vector<std::vector<uint32_t>> Encode(std::vector<ROMol>& mols,
160 unsigned char radius = 3,
161 bool rings = true,
162 bool isomeric = false,
163 bool kekulize = false,
164 unsigned char min_radius = 1);
165
166 //! \overload
167 std::vector<uint32_t> Encode(std::string& smiles, unsigned char radius = 3,
168 bool rings = true, bool isomeric = false,
169 bool kekulize = false,
170 unsigned char min_radius = 1);
171
172 //! \overload
173 std::vector<std::vector<uint32_t>> Encode(std::vector<std::string>& smiles,
174 unsigned char radius = 3,
175 bool rings = true,
176 bool isomeric = false,
177 bool kekulize = false,
178 unsigned char min_radius = 1);
179
180 /*!
181 \brief Creates a binary fingerprint based on circular sub-SMILES.
182
183 Creates a binary fingerprint similar to ECFP. However, instead of using
184 a Morgan-style hashing, circular n-grams (sub-SMILES) are created, hashed
185 directly and folded.
186
187 \param radius the maximum radius of the substructure that is generated at
188 each atom. Default: <tt>3</tt>.
189 \param rings whether the rings (SSSR) are extrected from the molecule and
190 added to the shingling. Given the molecule
191 <tt>"C1CCCCCC1C(=O)C"</tt>, "<tt>C1CCCCCC1"</tt> would be added
192 to the shingling. Default: <tt>true</tt>.
193 \param isomeric whether the SMILES added to the shingling are isomeric.
194 Default: <tt>false</tt>.
195 \param kekulize whether the SMILES added to the shingling are kekulized.
196 Default: <tt>true</tt>. NOTE that this will throw an exception if
197 the molecule cannot be kekulized.
198 \param min_radius the minimum radius that is used to extract n-grams.
199 Default: <tt>1</tt>.
200 \param length the length into which the fingerprint is folded.
201 Default: <tt>2048</tt>.
202
203 \returns the SECFP fingerprint.
204 */
205 ExplicitBitVect EncodeSECFP(ROMol& mol, unsigned char radius = 3,
206 bool rings = true, bool isomeric = false,
207 bool kekulize = false,
208 unsigned char min_radius = 1,
209 size_t length = 2048);
210
211 //! \overload
212 std::vector<ExplicitBitVect> EncodeSECFP(
213 std::vector<ROMol>& mols, unsigned char radius = 3, bool rings = true,
214 bool isomeric = false, bool kekulize = false,
215 unsigned char min_radius = 1, size_t length = 2048);
216
217 //! \overload
218 ExplicitBitVect EncodeSECFP(std::string& smiles, unsigned char radius = 3,
219 bool rings = true, bool isomeric = false,
220 bool kekulize = false,
221 unsigned char min_radius = 1,
222 size_t length = 2048);
223
224 //! \overload
225 std::vector<ExplicitBitVect> EncodeSECFP(
226 std::vector<std::string>& smiles, unsigned char radius = 3,
227 bool rings = true, bool isomeric = false, bool kekulize = false,
228 unsigned char min_radius = 1, size_t length = 2048);
229
230 /*!
231 \brief Calculates the Hamming distance between two MHFP
232 fingerprints.
233
234 \param a an MHFP fingerprint vector.
235 \param b an MHFP fingerprint vector.
236
237 \returns the Hamming distance between the two fingerprints.
238 */
239 static double Distance(const std::vector<uint32_t>& a,
240 const std::vector<uint32_t>& b) {
241 size_t mismatches = 0;
242
243 for (size_t i = 0; i < a.size(); i++) {
244 if (a[i] != b[i]) {
245 mismatches++;
246 }
247 }
248
249 return mismatches / (double)a.size();
250 }
251
252 private:
253 //! The fastest mod implementation.
254 uint64_t FastMod(const uint64_t input, const uint64_t ceil) {
255 return input >= ceil ? input % ceil : input;
256 }
257
258 ExplicitBitVect Fold(const std::vector<uint32_t>& vec,
259 uint32_t length = 2048) {
260 ExplicitBitVect ebv(length);
261 for (size_t i = 0; i < vec.size(); i++) {
262 ebv.setBit(vec[i] % length);
263 }
264 return ebv;
265 }
266
267 std::vector<uint32_t> HashShingling(std::vector<std::string> vec) {
268 std::vector<uint32_t> result(vec.size());
269 for (size_t i = 0; i < vec.size(); i++) {
270 result[i] = FNV::hash(vec[i]);
271 }
272 return result;
273 }
274
275 unsigned int n_permutations_, seed_;
276 uint64_t prime_ = 2305843009213693951UL;
277 uint32_t max_hash_ = 4294967295;
278 std::vector<uint32_t> perms_a_;
279 std::vector<uint32_t> perms_b_;
280};
281
282} // namespace MHFPFingerprints
283} // namespace RDKit
284
285#endif
Defines the primary molecule class ROMol as well as associated typedefs.
a class for bit vectors that are densely occupied
static double Distance(const std::vector< uint32_t > &a, const std::vector< uint32_t > &b)
Calculates the Hamming distance between two MHFP fingerprints.
Definition MHFP.h:239
ExplicitBitVect EncodeSECFP(std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< ExplicitBitVect > EncodeSECFP(std::vector< std::string > &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::string > CreateShingling(const std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::string > CreateShingling(const ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
Creates a molecular shingling based on circular substructures.
std::vector< ExplicitBitVect > EncodeSECFP(std::vector< ROMol > &mols, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::vector< uint32_t > > Encode(std::vector< ROMol > &mols, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< uint32_t > Encode(std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
ExplicitBitVect EncodeSECFP(ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
Creates a binary fingerprint based on circular sub-SMILES.
std::vector< uint32_t > Encode(ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
Creates a MinHash vector from a molecule.
MHFPEncoder(unsigned int n_permutations=2048, unsigned int seed=42)
Constructor.
std::vector< uint32_t > FromArray(const std::vector< uint32_t > &vec)
Creates a MinHash from a list of unsigned integers.
std::vector< uint32_t > FromStringArray(const std::vector< std::string > &vec)
Creates a MinHash from a vector of strings.
std::vector< std::vector< uint32_t > > Encode(std::vector< std::string > &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
a class for bit vectors that are sparsely occupied.
#define RDKIT_FINGERPRINTS_EXPORT
Definition export.h:177
const uint32_t seed
Definition MHFP.h:29
const uint32_t prime
Definition MHFP.h:28
uint32_t hash(const std::string &str, uint32_t hash=seed)
A simple implementation of the Fowler–Noll–Vo hash function.
Definition MHFP.h:32
const std::string mhfpFingerprintVersion
Definition MHFP.h:25
Std stuff.
bool rdvalue_is(const RDValue_cast_t)