BitOps.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
00003 //
00004 //  @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 #ifndef __RD_BITOPS_H__
00011 #define __RD_BITOPS_H__
00012 /*! \file BitOps.h
00013 
00014   \brief Contains general bit-comparison and similarity operations.
00015 
00016   The notation used to document the similarity metrics is:
00017     - \c V1_n: number of bits in vector 1
00018     - \c V1_o: number of on bits in vector 1
00019     - <tt>(V1&V2)_o</tt>: number of on bits in the intersection of vectors 1 and 2
00020   
00021  */
00022 
00023 #include "BitVects.h"
00024 #include <string>
00025 
00026 
//! general purpose wrapper for calculating the similarity between two bvs
//! that may be of unequal size (will automatically fold as appropriate)
/*!
  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param metric         the similarity function applied to the (possibly folded) pair
  \param returnDistance (optional) if set, <tt>1 - similarity</tt> is returned

  \return the value produced by \c metric, or one minus that value when
     \c returnDistance is set.

  When the two vectors report different sizes, the larger one is passed to
  FoldFingerprint() with the factor <tt>larger_n / smaller_n</tt> and the folded
  copy is used in its place; the inputs themselves are not modified.

  NOTE(review): the factor is presumably computed with integer division, so
  sizes that are not exact multiples of each other would not fold to equal
  lengths, and a zero-length smaller vector would divide by zero -- confirm
  against getNumBits() and the callers.
*/
template <typename T>
double SimilarityWrapper(const T &bv1,const T &bv2,
                         double (*metric)(const T &,const T &),
                         bool returnDistance=false){
  // Owns the folded temporary so that it is released on every exit path,
  // including an exception escaping from metric().
  struct FoldedGuard {
    T *ptr;
    explicit FoldedGuard(T *p) : ptr(p) {}
    ~FoldedGuard() { delete ptr; }
  private:
    // non-copyable: exactly one guard may delete the pointer
    FoldedGuard(const FoldedGuard &);
    FoldedGuard &operator=(const FoldedGuard &);
  };

  double res=0.0;
  if(bv1.getNumBits()>bv2.getNumBits()){
    // bv1 is the larger vector: fold it down before comparing
    FoldedGuard folded(FoldFingerprint(bv1,bv1.getNumBits()/bv2.getNumBits()));
    res = metric(*folded.ptr,bv2);
  } else if(bv2.getNumBits()>bv1.getNumBits()){
    // bv2 is the larger vector: fold it down before comparing
    FoldedGuard folded(FoldFingerprint(bv2,bv2.getNumBits()/bv1.getNumBits()));
    res = metric(bv1,*folded.ptr);
  } else {
    // equal sizes: compare directly
    res = metric(bv1,bv2);
  }
  if(returnDistance) res = 1.0-res;
  return res;
}
//! \overload
/*!
  Variant for metrics that take two additional \c double parameters.

  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param a              forwarded unchanged to \c metric as its third argument
  \param b              forwarded unchanged to \c metric as its fourth argument
  \param metric         the similarity function applied to the (possibly folded) pair
  \param returnDistance (optional) if set, <tt>1 - similarity</tt> is returned

  \return the value produced by \c metric, or one minus that value when
     \c returnDistance is set.

  When the two vectors report different sizes, the larger one is passed to
  FoldFingerprint() with the factor <tt>larger_n / smaller_n</tt> and the folded
  copy is used in its place; the inputs themselves are not modified.

  NOTE(review): the factor is presumably computed with integer division, so
  sizes that are not exact multiples of each other would not fold to equal
  lengths, and a zero-length smaller vector would divide by zero -- confirm
  against getNumBits() and the callers.
*/
template <typename T>
double SimilarityWrapper(const T &bv1,const T &bv2,double a,double b,
                         double (*metric)(const T &,const T &,double,double),
                         bool returnDistance=false){
  // Owns the folded temporary so that it is released on every exit path,
  // including an exception escaping from metric().
  struct FoldedGuard {
    T *ptr;
    explicit FoldedGuard(T *p) : ptr(p) {}
    ~FoldedGuard() { delete ptr; }
  private:
    // non-copyable: exactly one guard may delete the pointer
    FoldedGuard(const FoldedGuard &);
    FoldedGuard &operator=(const FoldedGuard &);
  };

  double res=0.0;
  if(bv1.getNumBits()>bv2.getNumBits()){
    // bv1 is the larger vector: fold it down before comparing
    FoldedGuard folded(FoldFingerprint(bv1,bv1.getNumBits()/bv2.getNumBits()));
    res = metric(*folded.ptr,bv2,a,b);
  } else if(bv2.getNumBits()>bv1.getNumBits()){
    // bv2 is the larger vector: fold it down before comparing
    FoldedGuard folded(FoldFingerprint(bv2,bv2.getNumBits()/bv1.getNumBits()));
    res = metric(bv1,*folded.ptr,a,b);
  } else {
    // equal sizes: compare directly
    res = metric(bv1,bv2,a,b);
  }
  if(returnDistance) res = 1.0-res;
  return res;
}
00068 
00069 
00070 bool AllProbeBitsMatch(const char *probe,const char *ref);
00071 bool AllProbeBitsMatch(const std::string &probe,const std::string &ref);
00072 
00073   
00074 template <typename T1>
00075 bool AllProbeBitsMatch(const T1 &probe,const std::string &pkl);
00076 
00077 template <typename T1>
00078 bool AllProbeBitsMatch(const T1 &probe,const T1 &ref);
00079 
00080 
00081 //! returns the number of on bits in common between two bit vectors
00082 /*!
00083   \return (bv1&bv2)_o
00084 */
00085 template <typename T1, typename T2>
00086 int
00087 NumOnBitsInCommon(const T1& bv1,const T2& bv2);
00088 
00089 int
00090 NumOnBitsInCommon(const ExplicitBitVect & bv1,const ExplicitBitVect & bv2);
00091 
00092 //! returns the Tanimoto similarity between two bit vects
00093 /*!
00094   \return <tt>(bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o]</tt>
00095 */
00096 template <typename T1, typename T2>
00097 double
00098 TanimotoSimilarity(const T1& bv1,const T2& bv2);
00099 
00100 //! returns the Cosine similarity between two bit vects
00101 /*!
00102   \return <tt>(bv1&bv2)_o / sqrt(bv1_o * bv2_o)</tt>
00103 */
00104 template <typename T1, typename T2>
00105 double
00106 CosineSimilarity(const T1& bv1,
00107                  const T2& bv2);
00108 
00109 //! returns the Kulczynski similarity between two bit vects
00110 /*!
00111   \return <tt>(bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o]</tt>
00112 */
00113 template <typename T1, typename T2>
00114 double
00115 KulczynskiSimilarity(const T1& bv1,
00116                      const T2& bv2);
00117 
00118 //! returns the Dice similarity between two bit vects
00119 /*!
00120   \return <tt>2*(bv1&bv2)_o / [bv1_o + bv2_o]</tt>
00121 */
00122 template <typename T1, typename T2>
00123 double
00124 DiceSimilarity(const T1& bv1,
00125                const T2& bv2);
00126 
00127 //! returns the Tversky similarity between two bit vects
00128 /*!
00129   \return <tt>(bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o]</tt>
00130 
00131   Notes:  
00132    # 0 <= a,b <= 1
00133    # Tversky(a=1,b=1) = Tanimoto
00134    # Tversky(a=1/2,b=1/2) = Dice
00135  
00136 */
00137 template <typename T1, typename T2>
00138 double
00139 TverskySimilarity(const T1& bv1,
00140                   const T2& bv2,double a,double b);
00141 
00142 //! returns the Sokal similarity between two bit vects
00143 /*!
00144   \return <tt>(bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o]</tt>
00145 */
00146 template <typename T1, typename T2>
00147 double
00148 SokalSimilarity(const T1& bv1,
00149                 const T2& bv2);
00150 
00151 //! returns the McConnaughey similarity between two bit vects
00152 /*!
00153   \return <tt>[(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o * bv2_o)</tt>
00154 */
00155 template <typename T1, typename T2>
00156 double
00157 McConnaugheySimilarity(const T1& bv1,
00158                        const T2& bv2);
00159 
00160 //! returns the Asymmetric similarity between two bit vects
00161 /*!
00162   \return <tt>(bv1&bv2)_o / min(bv1_o,bv2_o)</tt>
00163 */
00164 template <typename T1, typename T2>
00165 double
00166 AsymmetricSimilarity(const T1& bv1,
00167                      const T2& bv2);
00168 
00169 //! returns the Braun-Blanquet similarity between two bit vects
00170 /*!
00171   \return <tt>(bv1&bv2)_o / max(bv1_o,bv2_o)</tt>
00172 */
00173 template <typename T1, typename T2>
00174 double
00175 BraunBlanquetSimilarity(const T1& bv1,
00176                         const T2& bv2);
00177 
00178 //! returns the Russel similarity between two bit vects
00179 /*!
00180   \return <tt>(bv1&bv2)_o / bv1_o</tt>
00181 
00182   <b>Note:</b> this operation is non-commutative:
00183     RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1)
00184 
00185 */
00186 template <typename T1, typename T2>
00187 double
00188 RusselSimilarity(const T1& bv1,
00189                  const T2& bv2);
00190 
00191 
00192 //! returns the on bit similarity between two bit vects
00193 /*!
00194   \return <tt>(bv1&bv2)_o / (bv1|bv2)_o </tt>
00195 */
00196 template <typename T1, typename T2>
00197 double
00198 OnBitSimilarity(const T1& bv1,const T2& bv2);
00199 
00200 //! returns the number of common bits (on and off) between two bit vects
00201 /*!
00202   \return <tt>bv1_n - (bv1^bv2)_o</tt>
00203 */
00204 template <typename T1, typename T2>
00205 int
00206 NumBitsInCommon(const T1& bv1,const T2& bv2);
00207 
00208 //! returns the common-bit similarity (on and off) between two bit vects
00209 /*!
00210   \return <tt>[bv1_n - (bv1^bv2)_o] / bv1_n</tt>
00211 */
00212 template <typename T1, typename T2>
00213 double
00214 AllBitSimilarity(const T1& bv1,const T2& bv2);
00215 
00216 //! returns an IntVect with indices of all on bits in common between two bit vects
00217 template <typename T1, typename T2>
00218 IntVect
00219 OnBitsInCommon(const T1& bv1,const T2& bv2);
00220 
00221 //! returns an IntVect with indices of all off bits in common between two bit vects
00222 template <typename T1, typename T2>
00223 IntVect
00224 OffBitsInCommon(const T1& bv1,const T2& bv2);
00225 
00226 //! returns the on-bit projected similarities between two bit vects
00227 /*!
00228   \return two values, as a DoubleVect:
00229       - <tt>(bv1&bv2)_o / bv1_o</tt> 
00230       - <tt>(bv1&bv2)_o / bv2_o</tt> 
00231 */
00232 template <typename T1, typename T2>
00233 DoubleVect
00234 OnBitProjSimilarity(const T1& bv1,const T2& bv2);
00235 
00236 //! returns the off-bit projected similarities between two bit vects
00237 /*!
00238   \return two values, as a DoubleVect:
00239      - <tt>[bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o]</tt> 
00240      - <tt>[bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o]</tt> 
00241 
00242    <b>Note:</b> <tt>bv1_n = bv2_n</tt>
00243       
00244 */
00245 template <typename T1, typename T2>
00246 DoubleVect
00247 OffBitProjSimilarity(const T1& bv1,const T2& bv2);
00248 
00249 
00250 //! folds a bit vector by \c factor and returns the result
00251 /*!
00252   \param bv1    the vector to be folded
00253   \param factor (optional, defaults to 2) the factor by which the vector's length is reduced
00254   
00255   \return a pointer to the folded fingerprint, which is
00256      <tt>bv1_n/factor</tt> long.
00257      
00258    <b>Note:</b> The caller is responsible for <tt>delete</tt>ing the result.
00259  */
00260 template <typename T1>
00261 T1 *
00262 FoldFingerprint(const T1& bv1,unsigned int factor=2);
00263 
00264 //! returns a text representation of a bit vector (a string of 0s and 1s)
00265 /*!
00266   \param bv1    the vector to use
00267   
00268   \return an std::string
00269 
00270  */
00271 template <typename T1>
00272 std::string
00273 BitVectToText(const T1& bv1);
00274 
00275 //! returns a hex representation of a bit vector compatible with Andrew Dalke's FPS format
00276 /*!
00277   \param bv1    the vector to use
00278   
00279   \return an std::string
00280 
00281  */
00282 template <typename T1>
00283 std::string
00284 BitVectToFPSText(const T1& bv1);
00285 
00286 //! updates a bit vector from Andrew Dalke's FPS format
00287 /*!
00288   \param bv1    the vector to use
00289   \param fps    the FPS hex string
00290 
00291 
00292  */
00293 template <typename T1>
00294 void
00295 UpdateBitVectFromFPSText(T1& bv1,const std::string &fps);
00296 
00297 
00298 
00299 #endif