00001 // 00002 // Copyright (C) 2001-2008 Greg Landrum and Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // 00006 #ifndef _RD_MOL_OPS_H_ 00007 #define _RD_MOL_OPS_H_ 00008 00009 #include <RDGeneral/types.h> 00010 #include <boost/tuple/tuple.hpp> 00011 #include <boost/smart_ptr.hpp> 00012 00013 extern const int ci_LOCAL_INF; 00014 namespace RDKit{ 00015 class ROMol; 00016 class RWMol; 00017 class Atom; 00018 class Bond; 00019 typedef std::vector<double> INVAR_VECT; 00020 typedef INVAR_VECT::iterator INVAR_VECT_I; 00021 typedef INVAR_VECT::const_iterator INVAR_VECT_CI; 00022 00023 00024 //! used to return atomic discriminators (three doubles) 00025 typedef boost::tuples::tuple<double,double,double> DiscrimTuple; 00026 00027 //! \brief Groups a variety of molecular query and transformation operations. 00028 namespace MolOps { 00029 00030 //! return the number of electrons available on an atom to donate for aromaticity 00031 /*! 00032 The result is determined using the default valency, number of lone pairs, 00033 number of bonds and the formal charge. Note that the atom may not donate 00034 all of these electrons to a ring for aromaticity (also used in Conjugation 00035 and hybridization code). 00036 00037 \param at the atom of interest 00038 00039 \return the number of electrons 00040 */ 00041 int countAtomElec(const Atom *at); 00042 00043 //! sums up all atomic formal charges and returns the result 00044 int getFormalCharge(const ROMol &mol); 00045 00046 //! returns whether or not the given Atom is involved in a conjugated bond 00047 bool atomHasConjugatedBond(const Atom *at); 00048 00049 //! find fragments (disconnected components of the molecular graph) 00050 /*! 00051 00052 \param mol the molecule of interest 00053 \param mapping used to return the mapping of Atoms->fragments. 00054 On return \c mapping will be <tt>mol->getNumAtoms()</tt> long 00055 and will contain the fragment assignment for each Atom 00056 00057 \return the number of fragments found. 00058 00059 */ 00060 unsigned int getMolFrags(const ROMol &mol,INT_VECT &mapping); 00061 //! find fragments (disconnected components of the molecular graph) 00062 /*! 00063 00064 \param mol the molecule of interest 00065 \param frags used to return the Atoms in each fragment 00066 On return \c mapping will be \c numFrags long, and each entry 00067 will contain the indices of the Atoms in that fragment. 00068 00069 \return the number of fragments found. 00070 00071 */ 00072 unsigned int getMolFrags(const ROMol &mol, VECT_INT_VECT &frags); 00073 00074 //! splits a molecule into its component fragments 00075 // (disconnected components of the molecular graph) 00076 /*! 00077 00078 \param mol the molecule of interest 00079 \param sanitizeFrags toggles sanitization of the fragments after 00080 they are built 00081 \param frags used to return the mapping of Atoms->fragments. 00082 if provided, \c frags will be <tt>mol->getNumAtoms()</tt> long 00083 on return and will contain the fragment assignment for each Atom 00084 00085 \return a vector of the fragments as smart pointers to ROMols 00086 00087 */ 00088 std::vector<boost::shared_ptr<ROMol> > getMolFrags(const ROMol &mol, 00089 bool sanitizeFrags=true, 00090 INT_VECT *frags=0); 00091 00092 //! finds a molecule's minimium spanning tree (MST) 00093 /*! 00094 \param mol the molecule of interest 00095 \param mst used to return the MST as a vector of bond indices 00096 */ 00097 void findSpanningTree(const ROMol &mol,INT_VECT &mst); 00098 00099 //! calculates a set of molecular discriminators from the distance matrix 00100 /*! 00101 Computes: 00102 -# BalabanJ 00103 -# the first eigenvalue of the distance matrix 00104 -# the last but one eigenvalue of the distance matrix 00105 00106 \param mol the molecule of interest 00107 \param useBO toggles inclusion of the bond order in the discriminators 00108 (when false, the discriminators are purely topological) 00109 \param force forces the calculation (instead of using cached results) 00110 00111 \return a \c DiscrimTuple with the results 00112 00113 */ 00114 DiscrimTuple computeDiscriminators(const ROMol &mol, 00115 bool useBO=true, 00116 bool force=false); 00117 //! \brief Same as MolOps::computeDiscriminators(const ROMol &mol), 00118 //! except that this directly uses the user-supplied distance matrix 00119 DiscrimTuple computeDiscriminators(double *distMat, int nb, int na); 00120 00121 00122 //! calculates Balaban's J index for the molecule 00123 /*! 00124 \param mol the molecule of interest 00125 \param useBO toggles inclusion of the bond order in the calculation 00126 (when false, we're not really calculating the J value) 00127 \param force forces the calculation (instead of using cached results) 00128 \param bondPath when included, only paths using bonds whose indices occur 00129 in this vector will be included in the calculation 00130 \param cacheIt If this is true, the calculated value will be cached 00131 as a property on the molecule 00132 \return the J index 00133 00134 */ 00135 double computeBalabanJ(const ROMol &mol, 00136 bool useBO=true, 00137 bool force=false, 00138 const std::vector<int> *bondPath=0, 00139 bool cacheIt=true); 00140 00141 00142 00143 //! \name Dealing with hydrogens 00144 //{@ 00145 00146 //! returns a copy of a molecule with hydrogens added in as explicit Atoms 00147 /*! 00148 \param mol the molecule to add Hs to 00149 \param explicitOnly (optional) if this \c true, only explicit Hs will be added 00150 \param addCoords (optional) If this is true, estimates for the atomic coordinates 00151 of the added Hs will be used. 00152 00153 \return the new molecule 00154 00155 <b>Notes:</b> 00156 - it makes no sense to use the \c addCoords option if the molecule's heavy 00157 atoms don't already have coordinates. 00158 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00159 */ 00160 ROMol *addHs(const ROMol &mol,bool explicitOnly=false,bool addCoords=false); 00161 00162 00163 //! returns a copy of a molecule with hydrogens removed 00164 /*! 00165 \param mol the molecule to remove Hs from 00166 \param implicitOnly (optional) if this \c true, only implicit Hs will be removed 00167 \param updateExplicitCount (optional) If this is \c true, when explicit Hs are removed 00168 from the graph, the heavy atom to which they are bound will have its counter of 00169 explicit Hs increased. 00170 \param sanitize: (optional) If this is \c true, the final molecule will be 00171 sanitized 00172 00173 \return the new molecule 00174 00175 <b>Notes:</b> 00176 - Hydrogens which aren't connected to a heavy atom will not be 00177 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00178 all atoms removed. 00179 - Labelled hydrogen (e.g. atoms with atomic number=1, but mass > 1), 00180 will not be removed. 00181 00182 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00183 */ 00184 ROMol *removeHs(const ROMol &mol,bool implicitOnly=false, 00185 bool updateExplicitCount=false,bool sanitize=true); 00186 00187 //! returns a copy of a molecule with hydrogens removed and added as queries 00188 //! to the heavy atoms to which they are bound. 00189 /*! 00190 This is really intended to be used with molecules that contain QueryAtoms 00191 00192 \param mol the molecule to remove Hs from 00193 00194 \return the new molecule 00195 00196 <b>Notes:</b> 00197 - Atoms that do not already have hydrogen count queries will have one 00198 added, other H-related queries will not be touched. Examples: 00199 - C[H] -> [C;!H0] 00200 - [C;H1][H] -> [C;H1] 00201 - [C;H2][H] -> [C;H2] 00202 - Hydrogens which aren't connected to a heavy atom will not be 00203 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00204 all atoms removed. 00205 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00206 00207 */ 00208 ROMol *mergeQueryHs(const ROMol &mol); 00209 //@} 00210 00211 //! \name Sanitization 00212 //@{ 00213 00214 //! \brief carries out a collection of tasks for cleaning up a molecule and ensuring 00215 //! that it makes "chemical sense" 00216 /*! 00217 This functions calls the following in sequence 00218 -# MolOps::cleanUp() 00219 -# MolOps::Kekulize() 00220 -# MolOps::setAromaticity() 00221 -# MolOps::setConjugation() 00222 -# MolOps::setHybridization() 00223 -# MolOps::cleanupChirality() 00224 -# MolOps::adjustHs() 00225 00226 \param mol the RWMol to be cleaned 00227 00228 <b>Notes:</b> 00229 - If there is a failure in the sanitization, a \c SanitException 00230 will be thrown. 00231 - in general the user of this function should cast the molecule following this 00232 function to a ROMol, so that new atoms and bonds cannot be added to the 00233 molecule and screw up the sanitizing that has been done here 00234 */ 00235 void sanitizeMol(RWMol &mol); 00236 00237 //! Sets up the aromaticity for a molecule 00238 /*! 00239 00240 This is what happens here: 00241 -# find all the simple rings by calling the findSSSR function 00242 -# loop over all the Atoms in each ring and mark them if they are candidates 00243 for aromaticity. A ring atom is a candidate if it can spare electrons 00244 to the ring and if it's from the first two rows of the periodic table. 00245 -# ased on the candidate atoms, mark the rings to be either candidates 00246 or non-candidates. A ring is a candidate only if all its atoms are candidates 00247 -# apply Hueckel rule to each of the candidate rings to check if the ring can be 00248 aromatic 00249 00250 \param mol the RWMol of interest 00251 00252 \return 1 on succes, 0 otherwise 00253 00254 <b>Assumptions:</b> 00255 - Kekulization has been done (i.e. \c MolOps::Kekulize() has already 00256 been called) 00257 00258 */ 00259 int setAromaticity(RWMol &mol); 00260 00261 00262 //! Designed to be called by the sanitizer to handle special cases before anything is done. 00263 /*! 00264 00265 Currently this: 00266 - modifies nitro groups, so that the nitrogen does not have a unreasonable 00267 valence of 5, as follows: 00268 - the nitrogen gets a positve charge 00269 - one of the oxygens gets a negative chage and the double bond to this 00270 oxygen is changed to a single bond 00271 The net result is that nitro groups can be counted on to be: 00272 \c "[N+](=O)[O-]" 00273 00274 \param mol the molecule of interest 00275 00276 */ 00277 void cleanUp(RWMol &mol); 00278 00279 00280 //! adjust the number of implicit and explicit Hs for special cases 00281 /*! 00282 00283 Currently this: 00284 - modifies aromatic nitrogens so that, when appropriate, they have an 00285 explicit H marked (e.g. so that we get things like \c "c1cc[nH]cc1" 00286 00287 \param mol the molecule of interest 00288 00289 <b>Assumptions</b> 00290 - this is called after the molecule has been sanitized, 00291 aromaticity has been perceived, and the implicit valence of 00292 everything has been calculated. 00293 00294 */ 00295 void adjustHs(RWMol &mol); 00296 00297 //! Kekulizes the molecule 00298 /*! 00299 00300 \param mol the molecule of interest 00301 \param markAtomsBonds if this is set to true, \c isAromatic boolean settings 00302 on both the Bonds and Atoms are turned to false following 00303 the Kekulization, otherwise they are left alone in their 00304 original state. 00305 \param maxBackTracks the maximum number of attempts at back-tracking. The algorithm 00306 uses a back-tracking procedure to revist a previous setting of 00307 double bond if we hit a wall in the kekulization process 00308 00309 <b>Notes:</b> 00310 - even if \c markAtomsBonds is \c false the \c BondType for all aromatic 00311 bonds will be changed from \c RDKit::Bond::AROMATIC to \c RDKit::Bond::SINGLE 00312 or RDKit::Bond::DOUBLE during Kekulization. 00313 00314 */ 00315 void Kekulize(RWMol &mol, bool markAtomsBonds=true, unsigned int maxBackTracks=100); 00316 00317 //! flags the molecule's conjugated bonds 00318 void setConjugation(ROMol &mol); 00319 00320 //! calculates and sets the hybridization of all a molecule's Stoms 00321 void setHybridization(ROMol &mol); 00322 00323 00324 // @} 00325 00326 //! \name Ring finding and SSSR 00327 //@{ 00328 00329 //! finds a molecule's Smallest Set of Smallest Rings 00330 /*! 00331 Currently this implements a modified form of Figueras algorithm 00332 (JCICS - Vol. 36, No. 5, 1996, 986-991) 00333 00334 \param mol the molecule of interest 00335 \param res used to return the vector of rings. Each entry is a vector with 00336 atom indices. This information is also stored in the molecule's 00337 RingInfo structure, so this argument is optional (see overload) 00338 00339 \return number of smallest rings found 00340 00341 Base algorithm: 00342 - The original algorithm starts by finding representative degree 2 00343 nodes. 00344 - Representative because if a series of deg 2 nodes are found only 00345 one of them is picked. 00346 - The smallest ring around each of them is found. 00347 - The bonds that connect to this degree 2 node are them chopped off, yielding 00348 new deg two nodes 00349 - The process is repeated on the new deg 2 nodes. 00350 - If no deg 2 nodes are found, a deg 3 node is picked. The smallest ring 00351 with it is found. A bond from this is "carefully" (look in the paper) 00352 selected and chopped, yielding deg 2 nodes. The process is same as 00353 above once this is done. 00354 00355 Our Modifications: 00356 - If available, more than one smallest ring around a representative deg 2 00357 node will be computed and stored 00358 - Typically 3 rings are found around a degree 3 node (when no deg 2s are available) 00359 and all the bond to that node are chopped. 00360 - The extra rings that were found in this process are removed after all the nodes 00361 have been covered. 00362 00363 These changes were motivated by several factors: 00364 - We believe the original algorithm fails to find the correct SSSR 00365 (finds the correct number of them but the wrong ones) on some sample mols 00366 - Since SSSR may not be unique, a post-SSSR step to symmetrize may be done. 00367 The extra rings this process adds can be quite useful. 00368 */ 00369 int findSSSR(const ROMol &mol, VECT_INT_VECT &res); 00370 //! \overload 00371 int findSSSR(const ROMol &mol, VECT_INT_VECT *res=0); 00372 00373 //! symmetrize the molecule's Smallest Set of Smallest Rings 00374 /*! 00375 SSSR rings obatined from "findSSSR" can be non-unique in some case. 00376 For example, cubane has five SSSR rings, not six as one would hope. 00377 00378 This function adds additional rings to the SSSR list if necessary 00379 to make the list symmetric, e.g. all atoms in cubane will be part of the same number 00380 of SSSRs. This function choses these extra rings from the extra rings computed 00381 and discarded during findSSSR. The new ring are chosen such that: 00382 - replacing a same sized ring in the SSSR list with an extra ring yields 00383 the same union of bond IDs as the orignal SSSR list 00384 00385 \param mol - the molecule of interest 00386 \param res used to return the vector of rings. Each entry is a vector with 00387 atom indices. This information is also stored in the molecule's 00388 RingInfo structure, so this argument is optional (see overload) 00389 00390 \return the total number of rings = (new rings + old SSSRs) 00391 00392 <b>Notes:</b> 00393 - if no SSSR rings are found on the molecule - MolOps::findSSSR() is called first 00394 */ 00395 int symmetrizeSSSR(ROMol &mol, VECT_INT_VECT &res); 00396 //! \overload 00397 int symmetrizeSSSR(ROMol &mol); 00398 00399 //@} 00400 00401 //! \name Shortest paths and other matrices 00402 //@{ 00403 00404 //! returns a molecule's adjacency matrix 00405 /*! 00406 \param mol the molecule of interest 00407 \param useBO toggles use of bond orders in the matrix 00408 \param emptyVal sets the empty value (for non-adjacent atoms) 00409 \param force forces calculation of the matrix, even if already computed 00410 \param propNamePrefix used to set the cached property name 00411 00412 \return the adjacency matrix. 00413 00414 <b>Notes</b> 00415 - The result of this is cached in the molecule's local property dictionary, 00416 which will handle deallocation. Do the caller should <b>not</b> \c delete 00417 this pointer. 00418 00419 */ 00420 double * getAdjacencyMatrix(const ROMol &mol, 00421 bool useBO=false, 00422 int emptyVal=0, 00423 bool force=false, 00424 const char *propNamePrefix=0); 00425 00426 //! Computes the molecule's topological distance matrix 00427 /*! 00428 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00429 00430 \param mol the molecule of interest 00431 \param useBO toggles use of bond orders in the matrix 00432 \param useAtomWts sets the diagonal elements of the result to 00433 6.0/(atomic number) so that the matrix can be used to calculate 00434 Balaban J values. This does not affect the bond weights. 00435 \param force forces calculation of the matrix, even if already computed 00436 \param propNamePrefix used to set the cached property name 00437 00438 \return the distance matrix. 00439 00440 <b>Notes</b> 00441 - The result of this is cached in the molecule's local property dictionary, 00442 which will handle deallocation. Do the caller should <b>not</b> \c delete 00443 this pointer. 00444 00445 00446 */ 00447 double *getDistanceMat(const ROMol &mol, 00448 bool useBO=false, 00449 bool useAtomWts=false, 00450 bool force=false, 00451 const char *propNamePrefix=0); 00452 00453 00454 //! Computes the molecule's topological distance matrix 00455 /*! 00456 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00457 00458 \param mol the molecule of interest 00459 \param activeAtoms only elements corresponding to these atom indices 00460 will be included in the calculation 00461 \param bonds only bonds found in this list will be included in the 00462 calculation 00463 \param useBO toggles use of bond orders in the matrix 00464 \param useAtomWts sets the diagonal elements of the result to 00465 6.0/(atomic number) so that the matrix can be used to calculate 00466 Balaban J values. This does not affect the bond weights. 00467 00468 \return the distance matrix. 00469 00470 <b>Notes</b> 00471 - The results of this call are not cached, the caller <b>should</b> \c delete 00472 this pointer. 00473 00474 00475 */ 00476 double *getDistanceMat(const ROMol &mol, 00477 const std::vector<int> &activeAtoms, 00478 const std::vector<const Bond *> &bonds, 00479 bool useBO=false, 00480 bool useAtomWts=false); 00481 00482 00483 //! Find the shortest path between two atoms 00484 /*! 00485 Uses the Bellman-Ford algorithm 00486 00487 \param mol molecule of interest 00488 \param aid1 index of the first atom 00489 \param aid2 index of the second atom 00490 00491 \return an std::list with the indices of the atoms along the shortest 00492 path 00493 00494 <b>Notes:</b> 00495 - the starting and end atoms are not include in the path 00496 00497 */ 00498 INT_LIST getShortestPath(const ROMol &mol, int aid1, int aid2); 00499 00500 //@} 00501 00502 //! \name Canonicalization 00503 //@{ 00504 00505 //! assign a canonical ordering to a molecule's atoms 00506 /*! 00507 The algorithm used here is a modification of the published Daylight canonical 00508 smiles algorithm (i.e. it uses atom invariants and products of primes). 00509 00510 \param mol the molecule of interest 00511 \param ranks used to return the ranks 00512 \param breakTies toggles breaking of ties (see below) 00513 \param rankHistory used to return the rank history (see below) 00514 00515 <b>Notes:</b> 00516 - Tie breaking should be done when it's important to have a full ordering 00517 of the atoms (e.g. when generating canonical traversal trees). If it's 00518 acceptable to have ties between symmetry-equivalent atoms (e.g. when 00519 generating CIP codes), tie breaking can/should be skipped. 00520 - if the \c rankHistory argument is provided, the evolution of the ranks of 00521 individual atoms will be tracked. The \c rankHistory pointer should be 00522 to a VECT_INT_VECT that has at least \c mol.getNumAtoms() elements. 00523 */ 00524 void rankAtoms(const ROMol &mol,INT_VECT &ranks, 00525 bool breakTies=true,VECT_INT_VECT *rankHistory=0); 00526 00527 // @} 00528 00529 //! \name Stereochemistry 00530 //@{ 00531 00532 //! removes bogus chirality markers (those on non-sp3 centers): 00533 void cleanupChirality(RWMol &mol); 00534 00535 //! \brief Uses a conformer to assign ChiralType to a molecule's atoms 00536 /*! 00537 \param mol the molecule of interest 00538 \param confId the conformer to use 00539 \param replaceExistingTags if this flag is true, any existing atomic chiral 00540 tags will be replaced 00541 00542 If the conformer provided is not a 3D conformer, nothing will be done. 00543 */ 00544 void assignChiralTypesFrom3D(ROMol &mol,int confId=-1,bool replaceExistingTags=true); 00545 00546 //! Assign stereochemistry tags to atoms (i.e. R/S) and bonds (i.e. Z/E) 00547 /*! 00548 00549 \param mol the molecule of interest 00550 \param cleanIt toggles removal of stereo flags from double bonds that can 00551 not have stereochemistry 00552 \param force forces the calculation to be repeated even if it has 00553 already been done 00554 00555 <b>Notes:M</b> 00556 - Throughout we assume that we're working with a hydrogen-suppressed 00557 graph. 00558 00559 */ 00560 void assignStereochemistry(ROMol &mol,bool cleanIt=false,bool force=false); 00561 00562 //! \deprecated Use assignStereochemistry() instead 00563 static void assignAtomChiralCodes(ROMol &mol,bool cleanIt=false,bool force=false){ 00564 assignStereochemistry(mol,cleanIt,force); 00565 }; 00566 //! \deprecated Use assignStereochemistry() instead 00567 static void assignBondStereoCodes(ROMol &mol,bool cleanIt=false,bool force=false){ 00568 assignStereochemistry(mol,cleanIt,force); 00569 }; 00570 00571 00572 //! \brief finds bonds that could be cis/trans in a molecule and mark them as 00573 //! Bond::STEREONONE 00574 /*! 00575 \param mol the molecule of interest 00576 \param cleanIt toggles removal of stereo flags from double bonds that can 00577 not have stereochemistry 00578 00579 This function is usefuly in two situations 00580 - when parsing a mol file; for the bonds marked here, coordinate informations 00581 on the neighbors can be used to indentify cis or trans states 00582 - when writing a mol file; bonds that can be cis/trans but not marked as either 00583 need to be specially marked in the mol file 00584 */ 00585 void findPotentialStereoBonds(ROMol &mol,bool cleanIt=false); 00586 //@} 00587 00588 }; // end of namespace MolOps 00589 }; // end of namespace RDKit 00590 00591 #endif
1.5.5