MorganFingerprints.h

Go to the documentation of this file.
00001 //
00002 //
00003 //  Copyright (c) 2009-2010, Novartis Institutes for BioMedical Research Inc.
00004 //  All rights reserved.
00005 // 
00006 // Redistribution and use in source and binary forms, with or without
00007 // modification, are permitted provided that the following conditions are
00008 // met: 
00009 //
00010 //     * Redistributions of source code must retain the above copyright 
00011 //       notice, this list of conditions and the following disclaimer.
00012 //     * Redistributions in binary form must reproduce the above
00013 //       copyright notice, this list of conditions and the following 
00014 //       disclaimer in the documentation and/or other materials provided 
00015 //       with the distribution.
00016 //     * Neither the name of Novartis Institutes for BioMedical Research Inc. 
00017 //       nor the names of its contributors may be used to endorse or promote 
00018 //       products derived from this software without specific prior written permission.
00019 //
00020 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00021 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00022 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00023 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00024 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00025 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00026 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00028 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00029 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00030 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00031 //
00032 //  Created by Greg Landrum, July 2008
00033 //
00034 //
00035 
00036 /*! \file MorganFingerprints.h
00037 
00038 */
00039 #ifndef __RD_MORGANFPS_H__
00040 #define __RD_MORGANFPS_H__
00041 
00042 #include <vector>
00043 #include <map>
00044 #include <DataStructs/SparseIntVect.h>
00045 #include <DataStructs/ExplicitBitVect.h>
00046 #include <boost/cstdint.hpp>
00047 
00048 namespace RDKit {
00049   class ROMol;
00050   namespace MorganFingerprints {
00051     extern std::vector<std::string> defaultFeatureSmarts;  //!< defined elsewhere; presumably the SMARTS behind the default feature definitions of getFeatureInvariants() (confirm in the implementation)
00052     extern std::vector<ROMOL_SPTR> defaultFeatureMatchers;  //!< defined elsewhere; presumably the query molecules for defaultFeatureSmarts (confirm). NOTE(review): ROMOL_SPTR and std::string are not declared by any include visible in this header
00053     //! maps a bit id to the list of (atomId, radius) pairs that set it; filled through the \c atomsSettingBits argument of the fingerprint functions
00054     typedef std::map<boost::uint32_t,std::vector<std::pair<boost::uint32_t,boost::uint32_t> > > BitInfoMap;
00055     //! version tag for the Morgan fingerprint; presumably bumped when the generated fingerprints change (confirm)
00056     const std::string morganFingerprintVersion="1.0.0";
00057     
00058     //! returns the Morgan fingerprint for a molecule as a sparse integer vector
00059     /*!
00060       These fingerprints are similar to the well-known ECFP or
00061       FCFP fingerprints, depending on which invariants are used.
00062
00063       The algorithm used is described in the paper
00064       Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54 (2010)
00065       http://dx.doi.org/10.1021/ci100050t
00066
00067       The original implementation was done using this paper:
00068       D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
00069       and an unpublished technical report:
00070       http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
00071
00072       \param mol :    the molecule to be fingerprinted
00073       \param radius : the number of iterations to grow the fingerprint
00074       \param invariants : optional pointer to a set of atom invariants to
00075             be used. By default (null pointer) ECFP-type invariants are used
00076             (calculated by getConnectivityInvariants())
00077       \param fromAtoms : if this is provided (default: null), only the atoms in the vector will be
00078                          used as centers in the fingerprint
00079       \param useChirality : if set (default: false), additional information will be added to the fingerprint
00080                             when chiral atoms are discovered. This will cause \verbatim C[C@H](F)Cl,
00081                             C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate different fingerprints.
00082       \param useBondTypes : if set (default: true), bond types will be included as part of the hash for
00083                             calculating bits
00084       \param onlyNonzeroInvariants : if set (default: false), bits will only be set from atoms that
00085                                      have a nonzero invariant.
00086       \param atomsSettingBits : if non-null (default: null), this will be used to return information
00087                                about the atoms that set each particular bit.
00088                                The keys of the map are bit ids, the values
00089                                are lists of (atomId, radius) pairs.
00090
00091       \return a pointer to the fingerprint. The client is
00092       responsible for calling delete on this.
00093
00094     */
00095     SparseIntVect<boost::uint32_t> *
00096       getFingerprint(const ROMol &mol,
00097                      unsigned int radius,
00098                      std::vector<boost::uint32_t> *invariants=0,
00099                      const std::vector<boost::uint32_t> *fromAtoms=0,
00100                      bool useChirality=false,
00101                      bool useBondTypes=true,
00102                      bool onlyNonzeroInvariants=false,
00103                      BitInfoMap *atomsSettingBits=0);
00104 
00105 
00106     //! returns the Morgan fingerprint for a molecule as a bit vector
00107     /*!
00108       see documentation for getFingerprint() for theory/references
00109
00110       \param mol :    the molecule to be fingerprinted
00111       \param radius : the number of iterations to grow the fingerprint
00112       \param nBits :  the number of bits in the final fingerprint
00113       \param invariants : optional pointer to a set of atom invariants to
00114             be used. By default (null pointer) ECFP-type invariants are used
00115             (calculated by getConnectivityInvariants())
00116       \param fromAtoms : if this is provided (default: null), only the atoms in the vector will be
00117                          used as centers in the fingerprint
00118       \param useChirality : if set (default: false), additional information will be added to the fingerprint
00119                             when chiral atoms are discovered. This will cause \verbatim C[C@H](F)Cl,
00120                             C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate different fingerprints.
00121       \param useBondTypes : if set (default: true), bond types will be included as part of the hash for
00122                             calculating bits
00123       \param onlyNonzeroInvariants : if set (default: false), bits will only be set from atoms that
00124                                      have a nonzero invariant.
00125       \param atomsSettingBits : if non-null (default: null), this will be used to return information
00126                                about the atoms that set each particular bit.
00127                                The keys of the map are bit ids, the values
00128                                are lists of (atomId, radius) pairs.
00129
00130       \return a pointer to the fingerprint. The client is
00131       responsible for calling delete on this.
00132
00133     */
00134     ExplicitBitVect *
00135       getFingerprintAsBitVect(const ROMol &mol,
00136                               unsigned int radius,
00137                               unsigned int nBits,
00138                               std::vector<boost::uint32_t> *invariants=0,
00139                               const std::vector<boost::uint32_t> *fromAtoms=0,
00140                               bool useChirality=false,
00141                               bool useBondTypes=true,
00142                               bool onlyNonzeroInvariants=false,
00143                               BitInfoMap *atomsSettingBits=0);
00144       
00145     //! returns the connectivity invariants for a molecule
00146     /*!
00147       These are the ECFP-type invariants that the fingerprint functions use by default.
00148       \param mol :    the molecule to be considered
00149       \param invars : used to return the results
00150       \param includeRingMembership : if set (default: true), whether or not the atom is in
00151                  a ring will be used in the invariant list.
00152     */
00153     void getConnectivityInvariants(const ROMol &mol,
00154                                    std::vector<boost::uint32_t> &invars,
00155                                    bool includeRingMembership=true);
00156     const std::string morganConnectivityInvariantVersion="1.0.0";  //!< version tag for the connectivity invariants; presumably bumped when their definition changes (confirm)
00157 
00158     //! returns the feature invariants for a molecule
00159     /*!
00160       NOTE(review): presumably the FCFP-type counterpart of getConnectivityInvariants() - confirm.
00161       \param mol :    the molecule to be considered
00162       \param invars : used to return the results
00163       \param patterns : if provided should contain the queries used to assign atom-types.
00164                        if not provided (default: null), feature definitions adapted from reference:
00165                        Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
00166                        will be used for Donor, Acceptor, Aromatic, Halogen, Basic, Acidic
00167
00168     */
00169     void getFeatureInvariants(const ROMol &mol,
00170                               std::vector<boost::uint32_t> &invars,
00171                               std::vector<ROMOL_SPTR> *patterns=0);
00172     const std::string morganFeatureInvariantVersion="0.1.0";  //!< version tag for the feature invariants; presumably bumped when the feature definitions change (confirm)
00173 
00174   } // end of namespace MorganFingerprints
00175 }
00176 
00177 #endif