RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
BitOps.h
Go to the documentation of this file.
1//
2// Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef __RD_BITOPS_H__
12#define __RD_BITOPS_H__
13/*! \file BitOps.h
14
15 \brief Contains general bit-comparison and similarity operations.
16
17 The notation used to document the similarity metrics is:
18 - \c V1_n: number of bits in vector 1
19 - \c V1_o: number of on bits in vector 1
20 - <tt>(V1&V2)_o</tt>: number of on bits in the intersection of vectors 1 and
21 2
22
23 */
24
#include "BitVects.h"
#include <memory>
#include <string>
27
//! general purpose wrapper for calculating the similarity between two bvs
//! that may be of unequal size (will automatically fold as appropriate)
/*!
  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param metric         the two-argument similarity function to apply
  \param returnDistance (optional) if true, <tt>1 - metric(...)</tt> is
                        returned instead of the similarity itself

  \return the value produced by \c metric for the (size-matched) pair, or one
          minus that value when \c returnDistance is set.

  When the vectors differ in length, the longer one is folded with
  FoldFingerprint() using the integer ratio of the two lengths as the fold
  factor, and \c metric is evaluated on the folded copy and the shorter
  vector. Vectors of equal length are passed to \c metric unchanged.
*/
template <typename T>
double SimilarityWrapper(const T &bv1, const T &bv2,
                         double (*metric)(const T &, const T &),
                         bool returnDistance = false) {
  // NOTE(review): the fold factor is an integer division by the shorter
  // vector's bit count; a zero-length shorter vector would divide by zero.
  // Confirm callers never pass one.
  double similarity = 0.0;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // FoldFingerprint hands back an owning raw pointer; keeping it in a
    // unique_ptr releases the folded copy even if metric throws.
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits()));
    similarity = metric(*folded, bv2);
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits()));
    similarity = metric(bv1, *folded);
  } else {
    similarity = metric(bv1, bv2);
  }
  return returnDistance ? 1.0 - similarity : similarity;
}
//! \overload
/*!
  Variant for metrics that take two extra weighting parameters.

  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param a              first extra parameter, forwarded unchanged to \c metric
  \param b              second extra parameter, forwarded unchanged to \c metric
  \param metric         the four-argument similarity function to apply
  \param returnDistance (optional) if true, <tt>1 - metric(...)</tt> is
                        returned instead of the similarity itself

  \return the value produced by \c metric for the (size-matched) pair, or one
          minus that value when \c returnDistance is set.

  When the vectors differ in length, the longer one is folded with
  FoldFingerprint() using the integer ratio of the two lengths as the fold
  factor before \c metric is evaluated.
*/
template <typename T>
double SimilarityWrapper(const T &bv1, const T &bv2, double a, double b,
                         double (*metric)(const T &, const T &, double, double),
                         bool returnDistance = false) {
  // NOTE(review): the fold factor is an integer division by the shorter
  // vector's bit count; a zero-length shorter vector would divide by zero.
  // Confirm callers never pass one.
  double similarity = 0.0;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // FoldFingerprint hands back an owning raw pointer; keeping it in a
    // unique_ptr releases the folded copy even if metric throws.
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits()));
    similarity = metric(*folded, bv2, a, b);
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits()));
    similarity = metric(bv1, *folded, a, b);
  } else {
    similarity = metric(bv1, bv2, a, b);
  }
  return returnDistance ? 1.0 - similarity : similarity;
}
73
75 const char *ref);
76RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const std::string &probe,
77 const std::string &ref);
79 const ExplicitBitVect &ref);
80
81template <typename T1>
83 const std::string &pkl);
84
85template <typename T1>
86RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1 &probe, const T1 &ref);
87
88//! returns the number of on bits in common between two bit vectors
89/*!
90 \return (bv1&bv2)_o
91*/
92template <typename T1, typename T2>
93RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2);
94
96 const ExplicitBitVect &bv2);
97
98//! returns the Tanimoto similarity between two bit vects
99/*!
100 \return <tt>(bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o]</tt>
101*/
102template <typename T1, typename T2>
104 const T2 &bv2);
105
106//! returns the Cosine similarity between two bit vects
107/*!
108 \return <tt>(bv1&bv2)_o / sqrt(bv1_o * bv2_o)</tt>
109*/
110template <typename T1, typename T2>
111RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1 &bv1, const T2 &bv2);
112
113//! returns the Kulczynski similarity between two bit vects
114/*!
115 \return <tt>(bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o]</tt>
116*/
117template <typename T1, typename T2>
119 const T2 &bv2);
120
121//! returns the Dice similarity between two bit vects
122/*!
123 \return <tt>2*(bv1&bv2)_o / [bv1_o + bv2_o]</tt>
124*/
125template <typename T1, typename T2>
126RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1 &bv1, const T2 &bv2);
127
128//! returns the Tversky similarity between two bit vects
129/*!
130 \return <tt>(bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o]</tt>
131
132 Notes:
133 # 0 <= a,b <= 1
134 # Tversky(a=1,b=1) = Tanimoto
135 # Tversky(a=1/2,b=1/2) = Dice
136
137*/
138template <typename T1, typename T2>
139RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1 &bv1, const T2 &bv2,
140 double a, double b);
141
142//! returns the Sokal similarity between two bit vects
143/*!
144 \return <tt>(bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o]</tt>
145*/
146template <typename T1, typename T2>
147RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1 &bv1, const T2 &bv2);
148
149//! returns the McConnaughey similarity between two bit vects
150/*!
151 \return <tt>[(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o *
152 bv2_o)</tt>
153*/
154template <typename T1, typename T2>
156 const T2 &bv2);
157
158//! returns the Asymmetric similarity between two bit vects
159/*!
160 \return <tt>(bv1&bv2)_o / min(bv1_o,bv2_o)</tt>
161*/
162template <typename T1, typename T2>
164 const T2 &bv2);
165
166//! returns the Braun-Blanquet similarity between two bit vects
167/*!
168 \return <tt>(bv1&bv2)_o / max(bv1_o,bv2_o)</tt>
169*/
170template <typename T1, typename T2>
172 const T2 &bv2);
173
174//! returns the Russel similarity between two bit vects
175/*!
176 \return <tt>(bv1&bv2)_o / bv1_o</tt>
177
178 <b>Note:</b> this operation is non-commutative:
179 RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1)
180
181*/
182template <typename T1, typename T2>
183RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1 &bv1, const T2 &bv2);
184
185//! returns the Rogot-Goldberg similarity between two bit vects
186/*!
187 \return <tt>(bv1&bv2)_o / (bv1_o + bv2_o)
188 + (bv1_n - bv1_o - bv2_o + (bv1&bv2)_o) / (2*bv1_n - bv1_o - bv2_o) </tt>
189*/
190template <typename T1, typename T2>
192 const T2 &bv2);
193
194//! returns the on bit similarity between two bit vects
195/*!
196 \return <tt>(bv1&bv2)_o / (bv1|bv2)_o </tt>
197*/
198template <typename T1, typename T2>
199RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1 &bv1, const T2 &bv2);
200
201//! returns the number of common bits (on and off) between two bit vects
202/*!
203 \return <tt>bv1_n - (bv1^bv2)_o</tt>
204*/
205template <typename T1, typename T2>
206RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1 &bv1, const T2 &bv2);
207
209 const ExplicitBitVect &bv2);
210
211//! returns the common-bit similarity (on and off) between two bit vects
212//! This is also called Manhattan similarity.
213/*!
214 \return <tt>[bv1_n - (bv1^bv2)_o] / bv1_n</tt>
215*/
216template <typename T1, typename T2>
217RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1 &bv1, const T2 &bv2);
218
219//! returns an IntVect with indices of all on bits in common between two bit
220/// vects
221template <typename T1, typename T2>
222RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2);
223
224//! returns an IntVect with indices of all off bits in common between two bit
225/// vects
226template <typename T1, typename T2>
228
229//! returns the on-bit projected similarities between two bit vects
230/*!
231 \return two values, as a DoubleVect:
232 - <tt>(bv1&bv2)_o / bv1_o</tt>
233 - <tt>(bv1&bv2)_o / bv2_o</tt>
234*/
235template <typename T1, typename T2>
237 const T2 &bv2);
238
239//! returns the off-bit projected similarities between two bit vects
240/*!
241 \return two values, as a DoubleVect:
242 - <tt>[bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o]</tt>
243 - <tt>[bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o]</tt>
244
245 <b>Note:</b> <tt>bv1_n = bv2_n</tt>
246
247*/
248template <typename T1, typename T2>
250 const T2 &bv2);
251
252//! folds a bit vector \c factor times and returns the result
253/*!
254 \param bv1 the vector to be folded
255 \param factor (optional) the number of times to fold it
256
257 \return a pointer to the folded fingerprint, which is
258 <tt>bv1_n/factor</tt> long.
259
260 <b>Note:</b> The caller is responsible for <tt>delete</tt>ing the result.
261 */
262template <typename T1>
264 unsigned int factor = 2);
265
266//! returns a text representation of a bit vector (a string of 0s and 1s)
267/*!
268 \param bv1 the vector to use
269
270 \return an std::string
271
272 */
273template <typename T1>
274RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1 &bv1);
275
276//! returns a hex representation of a bit vector compatible with Andrew Dalke's
277/// FPS format
278/*!
279 \param bv1 the vector to use
280
281 \return an std::string
282
283 */
284template <typename T1>
286
287//! returns a binary string representation of a bit vector (an array of bytes)
288/*!
289 \param bv1 the vector to use
290
291 \return an std::string
292
293 */
294template <typename T1>
296
297//! updates a bit vector from Andrew Dalke's FPS format
298/*!
299 \param bv1 the vector to use
300 \param fps the FPS hex string
301
302
303 */
304template <typename T1>
306 const std::string &fps);
307
308//! updates a bit vector from a binary string representation of a bit vector (an
309/// array of bytes)
310/*!
311 \param bv1 the vector to use
312 \param fps the binary string
313
314
315 */
316template <typename T1>
318 T1 &bv1, const std::string &fps);
319
320// FIX: docs and tests please
321
323 const unsigned char *bv1, unsigned int nBytes);
324
325RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char *bv1,
326 const unsigned char *bv2,
327 unsigned int nBytes);
328RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char *bv1,
329 const unsigned char *bv2,
330 unsigned int nBytes);
331RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char *bv1,
332 const unsigned char *bv2,
333 unsigned int nBytes,
334 double ca, double cb);
336 const unsigned char *probe, const unsigned char *ref, unsigned int nBytes);
337#endif
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(const T1 &bv1)
returns a binary string representation of a bit vector (an array of bytes)
RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(const T1 &bv1, const T2 &bv2)
returns the McConnaughey similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of common bits (on and off) between two bit vects
RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(const T1 &bv1, const T2 &bv2)
returns the Asymmetric similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
RDKIT_DATASTRUCTS_EXPORT unsigned int CalcBitmapPopcount(const unsigned char *bv1, unsigned int nBytes)
double SimilarityWrapper(const T &bv1, const T &bv2, double(*metric)(const T &, const T &), bool returnDistance=false)
Definition BitOps.h:31
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(const T1 &bv1)
RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1 &bv1, const T2 &bv2, double a, double b)
returns the Tversky similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1 &bv1, const T2 &bv2)
returns the Russel similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1 &bv1, const T2 &bv2)
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1 &bv1)
returns a text representation of a bit vector (a string of 0s and 1s)
RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(const T1 &bv1, const T2 &bv2)
returns the Braun-Blanquet similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1 &bv1, const T2 &bv2)
returns the on bit similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT bool CalcBitmapAllProbeBitsMatch(const unsigned char *probe, const unsigned char *ref, unsigned int nBytes)
RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2)
RDKIT_DATASTRUCTS_EXPORT T1 * FoldFingerprint(const T1 &bv1, unsigned int factor=2)
folds a bit vector factor times and returns the result
RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1 &bv1, const T2 &bv2)
returns the Sokal similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1 &bv1, const T2 &bv2)
returns the Cosine similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1 &bv1, const T2 &bv2)
returns the Dice similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(const T1 &bv1, const T2 &bv2)
returns the Kulczynski similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(const T1 &bv1, const T2 &bv2)
returns the Rogot-Goldberg similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT IntVect OffBitsInCommon(const T1 &bv1, const T2 &bv2)
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(const T1 &bv1, const T2 &bv2)
returns the Tanimoto similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText(T1 &bv1, const std::string &fps)
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes, double ca, double cb)
RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(T1 &bv1, const std::string &fps)
updates a bit vector from Andrew Dalke's FPS format
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
RDKIT_DATASTRUCTS_EXPORT DoubleVect OffBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the off-bit projected similarities between two bit vects
RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of on bits in common between two bit vectors
RDKIT_DATASTRUCTS_EXPORT DoubleVect OnBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the on-bit projected similarities between two bit vects
std::vector< int > IntVect
Definition BitVect.h:17
std::vector< double > DoubleVect
Definition BitVect.h:19
Pulls in all the BitVect classes.
a class for bit vectors that are densely occupied
#define RDKIT_DATASTRUCTS_EXPORT
Definition export.h:81