AtomPairs.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2007-2011 Greg Landrum
00003 //
00004 //   @@ All Rights Reserved @@
00005 //  This file is part of the RDKit.
00006 //  The contents are covered by the terms of the BSD license
00007 //  which is included in the file license.txt, found at the root
00008 //  of the RDKit source tree.
00009 //
00010 
00011 /*! \file AtomPairs.h
00012 
00013 */
00014 #ifndef __RD_ATOMPAIRS_H__
00015 #define __RD_ATOMPAIRS_H__
00016 
00017 #include <DataStructs/SparseIntVect.h>
00018 #include <DataStructs/BitVects.h>
00019 #include <boost/cstdint.hpp>
00020 namespace RDKit {
00021   class Atom;  // forward declaration only. NOTE(review): ROMol is used by the declarations below but is not forward-declared in this file - presumably an earlier include provides it; confirm.
00022 
00023   namespace AtomPairs {
00024     const std::string atomPairsVersion="1.1.0";  //!< version string for the atom-pair code
00025     const unsigned int numTypeBits=4;  //!< contributes to codeSize; also sizes atomNumberTypes (1<<4 = 16 slots)
00026     const unsigned int atomNumberTypes[1<<numTypeBits]={5,6,7,8,9,14,15,16,17,33,34,35,51,52,43};  //!< 16 slots but only 15 initializers, so the last slot is zero-initialized. Presumably atomic numbers - confirm against the implementation
00027     const unsigned int numPiBits=2;  //!< contributes to codeSize
00028     const unsigned int maxNumPi=(1<<numPiBits)-1;  //!< largest value that fits in numPiBits bits (3)
00029     const unsigned int numBranchBits=3;  //!< contributes to codeSize
00030     const unsigned int maxNumBranches=(1<<numBranchBits)-1;  //!< largest value that fits in numBranchBits bits (7)
00031     const unsigned int codeSize=numTypeBits+numPiBits+numBranchBits;  //!< 4+2+3 = 9 bits
00032     const unsigned int numPathBits=5;  //!< contributes to numAtomPairFingerprintBits
00033     const unsigned int maxPathLen=(1<<numPathBits)-1;  //!< largest value that fits in numPathBits bits (31); maxPathLen-1 is the default maxLength of the hashed atom-pair functions
00034     const unsigned int numAtomPairFingerprintBits=numPathBits+2*codeSize;  //!< 5 + 2*9 = 23 bits
00035     
00036     //! returns a numeric code for the atom (the atom's hash in the
00037     //! atom-pair scheme)
00038     /*!
00039       \param atom            the atom to be considered
00040       \param branchSubtract  (optional, defaults to 0) a constant to
00041       subtract from the number of neighbors when the hash is
00042       calculated (used in the topological torsions code)
00043       \return the atom's code as an unsigned 32-bit integer
00044     */
00045     boost::uint32_t getAtomCode(const Atom *atom,unsigned int branchSubtract=0);
00046 
00047     //! returns an atom pair hash based on two atom hashes and the
00048     //! distance between the atoms.
00049     /*!
00050       \param codeI  the hash for the first atom
00051       \param codeJ  the hash for the second atom
00052       \param dist   the distance (number of bonds) between the two atoms
00053       \return the atom-pair hash as an unsigned 32-bit integer
00054     */
00055     boost::uint32_t getAtomPairCode(boost::uint32_t codeI,boost::uint32_t codeJ,
00056                                     unsigned int dist);
00057 
00058     //! returns the atom-pair fingerprint for a molecule
00059     /*!
00060       The algorithm used is described here:
00061       R.E. Carhart, D.H. Smith, R. Venkataraghavan; "Atom Pairs as
00062       Molecular Features in Structure-Activity Studies: Definition
00063       and Applications" JCICS 25, 64-73 (1985).
00064 
00065       Two overloads are declared: one with explicit length limits, one without.
00066       \param mol          the molecule to be fingerprinted
00067       \param minLength    minimum distance (in bonds) between atoms to be
00068                           considered in a pair. No default on this overload.
00069       \param maxLength    maximum distance (in bonds) between atoms to be
00070                           considered in a pair. No default on this overload.
00071                           NOTE(review): the overload without lengths presumably uses 1 and maxPathLen-1 - confirm.
00072       \param fromAtoms    (optional, defaults to 0) if provided, only atom
00073                           pairs that involve the specified atoms will be
00074                           included in the fingerprint
00075       \param ignoreAtoms  (optional, defaults to 0) if provided, any atom
00076                           pairs that include the specified atoms will not
00077                           be included in the fingerprint
00078 
00079       \return a pointer to the fingerprint. The client is
00080       responsible for calling delete on this.
00081 
00082     */
00083     SparseIntVect<boost::int32_t> *
00084     getAtomPairFingerprint(const ROMol &mol,
00085                            unsigned int minLength,unsigned int maxLength,
00086                            const std::vector<boost::uint32_t> *fromAtoms=0,
00087                            const std::vector<boost::uint32_t> *ignoreAtoms=0);
00088     //! \overload
00089     SparseIntVect<boost::int32_t> *
00090     getAtomPairFingerprint(const ROMol &mol,
00091                            const std::vector<boost::uint32_t> *fromAtoms=0,
00092                            const std::vector<boost::uint32_t> *ignoreAtoms=0);
00093 
00094 
00095     //! returns the hashed atom-pair fingerprint for a molecule
00096     /*!
00097       \param mol          the molecule to be fingerprinted
00098       \param nBits        the length of the fingerprint to generate (default 2048)
00099       \param minLength    minimum distance (in bonds) between atoms to be
00100                           considered in a pair. Default is 1 bond.
00101       \param maxLength    maximum distance (in bonds) between atoms to be
00102                           considered in a pair.
00103                           Default is maxPathLen-1 bonds (i.e. 30).
00104       \param fromAtoms    (optional, defaults to 0) if provided, only atom
00105                           pairs that involve the specified atoms will be
00106                           included in the fingerprint
00107       \param ignoreAtoms  (optional, defaults to 0) if provided, any atom
00108                           pairs that include the specified atoms will not
00109                           be included in the fingerprint
00110 
00111       \return a pointer to the fingerprint. The client is
00112       responsible for calling delete on this.
00113 
00114     */
00115     SparseIntVect<boost::int32_t> *
00116     getHashedAtomPairFingerprint(const ROMol &mol,
00117                                  unsigned int nBits=2048,
00118                                  unsigned int minLength=1,
00119                                  unsigned int maxLength=maxPathLen-1,
00120                                  const std::vector<boost::uint32_t> *fromAtoms=0,
00121                                  const std::vector<boost::uint32_t> *ignoreAtoms=0);
00122     //! returns the hashed atom-pair fingerprint for a molecule as a bit vector
00123     /*!
00124       \param mol          the molecule to be fingerprinted
00125       \param nBits        the length of the fingerprint to generate (default 2048)
00126       \param minLength    minimum distance (in bonds) between atoms to be
00127                           considered in a pair. Default is 1 bond.
00128       \param maxLength    maximum distance (in bonds) between atoms to be
00129                           considered in a pair.
00130                           Default is maxPathLen-1 bonds (i.e. 30).
00131       \param fromAtoms    (optional, defaults to 0) if provided, only atom
00132                           pairs that involve the specified atoms will be
00133                           included in the fingerprint
00134       \param ignoreAtoms  (optional, defaults to 0) if provided, any atom
00135                           pairs that include the specified atoms will not
00136                           be included in the fingerprint
00137       \param nBitsPerEntry  number of bits to use in simulating counts (default 4)
00138 
00139       \return a pointer to the fingerprint. The client is
00140       responsible for calling delete on this.
00141 
00142     */
00143     ExplicitBitVect *
00144     getHashedAtomPairFingerprintAsBitVect(const ROMol &mol,
00145                                           unsigned int nBits=2048,
00146                                           unsigned int minLength=1,
00147                                           unsigned int maxLength=maxPathLen-1,
00148                                           const std::vector<boost::uint32_t> *fromAtoms=0,
00149                                           const std::vector<boost::uint32_t> *ignoreAtoms=0,
00150                                           unsigned int nBitsPerEntry=4);
00151 
00152 
00153     //! returns a topological torsion hash (an unsigned 64-bit value)
00154     //! based on the atom hashes passed in
00155     /*!
00156       \param atomCodes  the vector of atom hashes
00157     */
00158     boost::uint64_t getTopologicalTorsionCode(const std::vector<boost::uint32_t> &atomCodes);
00159 
00160     //! returns the topological-torsion fingerprint for a molecule
00161     /*!
00162       The algorithm used is described here:
00163       R. Nilakantan, N. Bauman, J. S. Dixon, R. Venkataraghavan;
00164       "Topological Torsion: A New Molecular Descriptor for SAR Applications.
00165       Comparison with Other Descriptors" JCICS 27, 82-85 (1987).
00166 
00167       \param mol          the molecule to be fingerprinted
00168       \param targetSize   the number of atoms to include in the "torsions" (default 4)
00169       \param fromAtoms    (optional, defaults to 0) if provided, only torsions
00170                           that start or end at the specified atoms will be
00171                           included in the fingerprint
00172       \param ignoreAtoms  (optional, defaults to 0) if provided, any torsions
00173                           that include the specified atoms will not be
00174                           included in the fingerprint
00175 
00176       \return a pointer to the fingerprint. The client is
00177       responsible for calling delete on this.
00178 
00179     */
00180     SparseIntVect<boost::int64_t > *
00181     getTopologicalTorsionFingerprint(const ROMol &mol,
00182                                      unsigned int targetSize=4,
00183                                      const std::vector<boost::uint32_t> *fromAtoms=0,
00184                                      const std::vector<boost::uint32_t> *ignoreAtoms=0);
00185     //! returns a hashed topological-torsion fingerprint for a molecule
00186     /*!
00187       The algorithm used is described here:
00188       R. Nilakantan, N. Bauman, J. S. Dixon, R. Venkataraghavan;
00189       "Topological Torsion: A New Molecular Descriptor for SAR Applications.
00190       Comparison with Other Descriptors" JCICS 27, 82-85 (1987).
00191 
00192       \param mol          the molecule to be fingerprinted
00193       \param nBits        number of bits to include in the fingerprint (default 2048)
00194       \param targetSize   the number of atoms to include in the "torsions" (default 4)
00195       \param fromAtoms    (optional, defaults to 0) if provided, only torsions
00196                           that start or end at the specified atoms will be
00197                           included in the fingerprint
00198       \param ignoreAtoms  (optional, defaults to 0) if provided, any torsions
00199                           that include the specified atoms will not be
00200                           included in the fingerprint
00201 
00202       \return a pointer to the fingerprint. The client is
00203       responsible for calling delete on this.
00204 
00205     */
00206     SparseIntVect<boost::int64_t > *
00207     getHashedTopologicalTorsionFingerprint(const ROMol &mol,
00208                                            unsigned int nBits=2048,
00209                                            unsigned int targetSize=4,
00210                                            const std::vector<boost::uint32_t> *fromAtoms=0,
00211                                            const std::vector<boost::uint32_t> *ignoreAtoms=0);
00212     //! returns a hashed topological-torsion fingerprint for a molecule as a bit vector
00213     /*!
00214       \param mol          the molecule to be fingerprinted
00215       \param nBits        number of bits to include in the fingerprint (default 2048)
00216       \param targetSize   the number of atoms to include in the "torsions" (default 4)
00217       \param fromAtoms    (optional, defaults to 0) if provided, only torsions
00218                           that start or end at the specified atoms will be
00219                           included in the fingerprint
00220       \param ignoreAtoms  (optional, defaults to 0) if provided, any torsions
00221                           that include the specified atoms will not be
00222                           included in the fingerprint
00223       \param nBitsPerEntry  number of bits to use in simulating counts (default 4)
00224 
00225       \return a pointer to the fingerprint. The client is
00226       responsible for calling delete on this.
00227 
00228     */
00229     ExplicitBitVect *
00230     getHashedTopologicalTorsionFingerprintAsBitVect(const ROMol &mol,
00231                                                     unsigned int nBits=2048,
00232                                                     unsigned int targetSize=4,
00233                                                     const std::vector<boost::uint32_t> *fromAtoms=0,
00234                                                     const std::vector<boost::uint32_t> *ignoreAtoms=0,
00235                                                     unsigned int nBitsPerEntry=4);
00236   }    
00237 }
00238 
00239 #endif