00001 // 00002 // Copyright (C) 2001-2010 Greg Landrum and Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // This file is part of the RDKit. 00006 // The contents are covered by the terms of the BSD license 00007 // which is included in the file license.txt, found at the root 00008 // of the RDKit source tree. 00009 // 00010 #ifndef _RD_MOL_OPS_H_ 00011 #define _RD_MOL_OPS_H_ 00012 00013 #include <RDGeneral/types.h> 00014 #include <boost/tuple/tuple.hpp> 00015 #include <boost/smart_ptr.hpp> 00016 00017 extern const int ci_LOCAL_INF; 00018 namespace RDKit{ 00019 class ROMol; 00020 class RWMol; 00021 class Atom; 00022 class Bond; 00023 typedef std::vector<double> INVAR_VECT; 00024 typedef INVAR_VECT::iterator INVAR_VECT_I; 00025 typedef INVAR_VECT::const_iterator INVAR_VECT_CI; 00026 00027 //! \brief Groups a variety of molecular query and transformation operations. 00028 namespace MolOps { 00029 00030 //! return the number of electrons available on an atom to donate for aromaticity 00031 /*! 00032 The result is determined using the default valency, number of lone pairs, 00033 number of bonds and the formal charge. Note that the atom may not donate 00034 all of these electrons to a ring for aromaticity (also used in Conjugation 00035 and hybridization code). 00036 00037 \param at the atom of interest 00038 00039 \return the number of electrons 00040 */ 00041 int countAtomElec(const Atom *at); 00042 00043 //! sums up all atomic formal charges and returns the result 00044 int getFormalCharge(const ROMol &mol); 00045 00046 //! returns whether or not the given Atom is involved in a conjugated bond 00047 bool atomHasConjugatedBond(const Atom *at); 00048 00049 //! find fragments (disconnected components of the molecular graph) 00050 /*! 00051 00052 \param mol the molecule of interest 00053 \param mapping used to return the mapping of Atoms->fragments. 00054 On return \c mapping will be <tt>mol->getNumAtoms()</tt> long 00055 and will contain the fragment assignment for each Atom 00056 00057 \return the number of fragments found. 00058 00059 */ 00060 unsigned int getMolFrags(const ROMol &mol,std::vector<int> &mapping); 00061 //! find fragments (disconnected components of the molecular graph) 00062 /*! 00063 00064 \param mol the molecule of interest 00065 \param frags used to return the Atoms in each fragment 00066 On return \c mapping will be \c numFrags long, and each entry 00067 will contain the indices of the Atoms in that fragment. 00068 00069 \return the number of fragments found. 00070 00071 */ 00072 unsigned int getMolFrags(const ROMol &mol, std::vector<std::vector<int> > &frags); 00073 00074 //! splits a molecule into its component fragments 00075 // (disconnected components of the molecular graph) 00076 /*! 00077 00078 \param mol the molecule of interest 00079 \param sanitizeFrags toggles sanitization of the fragments after 00080 they are built 00081 \param frags used to return the mapping of Atoms->fragments. 00082 if provided, \c frags will be <tt>mol->getNumAtoms()</tt> long 00083 on return and will contain the fragment assignment for each Atom 00084 00085 \return a vector of the fragments as smart pointers to ROMols 00086 00087 */ 00088 std::vector<boost::shared_ptr<ROMol> > getMolFrags(const ROMol &mol, 00089 bool sanitizeFrags=true, 00090 std::vector<int> *frags=0); 00091 00092 #if 0 00093 //! finds a molecule's minimium spanning tree (MST) 00094 /*! 00095 \param mol the molecule of interest 00096 \param mst used to return the MST as a vector of bond indices 00097 */ 00098 void findSpanningTree(const ROMol &mol,std::vector<int> &mst); 00099 #endif 00100 00101 //! calculates Balaban's J index for the molecule 00102 /*! 00103 \param mol the molecule of interest 00104 \param useBO toggles inclusion of the bond order in the calculation 00105 (when false, we're not really calculating the J value) 00106 \param force forces the calculation (instead of using cached results) 00107 \param bondPath when included, only paths using bonds whose indices occur 00108 in this vector will be included in the calculation 00109 \param cacheIt If this is true, the calculated value will be cached 00110 as a property on the molecule 00111 \return the J index 00112 00113 */ 00114 double computeBalabanJ(const ROMol &mol, 00115 bool useBO=true, 00116 bool force=false, 00117 const std::vector<int> *bondPath=0, 00118 bool cacheIt=true); 00119 //! \overload 00120 double computeBalabanJ(double *distMat, int nb, int nAts); 00121 00122 //! \name Dealing with hydrogens 00123 //{@ 00124 00125 //! returns a copy of a molecule with hydrogens added in as explicit Atoms 00126 /*! 00127 \param mol the molecule to add Hs to 00128 \param explicitOnly (optional) if this \c true, only explicit Hs will be added 00129 \param addCoords (optional) If this is true, estimates for the atomic coordinates 00130 of the added Hs will be used. 00131 00132 \return the new molecule 00133 00134 <b>Notes:</b> 00135 - it makes no sense to use the \c addCoords option if the molecule's heavy 00136 atoms don't already have coordinates. 00137 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00138 */ 00139 ROMol *addHs(const ROMol &mol,bool explicitOnly=false,bool addCoords=false); 00140 00141 00142 //! returns a copy of a molecule with hydrogens removed 00143 /*! 00144 \param mol the molecule to remove Hs from 00145 \param implicitOnly (optional) if this \c true, only implicit Hs will be removed 00146 \param updateExplicitCount (optional) If this is \c true, when explicit Hs are removed 00147 from the graph, the heavy atom to which they are bound will have its counter of 00148 explicit Hs increased. 00149 \param sanitize: (optional) If this is \c true, the final molecule will be 00150 sanitized 00151 00152 \return the new molecule 00153 00154 <b>Notes:</b> 00155 - Hydrogens which aren't connected to a heavy atom will not be 00156 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00157 all atoms removed. 00158 - Labelled hydrogen (e.g. atoms with atomic number=1, but mass > 1), 00159 will not be removed. 00160 00161 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00162 */ 00163 ROMol *removeHs(const ROMol &mol,bool implicitOnly=false, 00164 bool updateExplicitCount=false,bool sanitize=true); 00165 00166 //! returns a copy of a molecule with hydrogens removed and added as queries 00167 //! to the heavy atoms to which they are bound. 00168 /*! 00169 This is really intended to be used with molecules that contain QueryAtoms 00170 00171 \param mol the molecule to remove Hs from 00172 00173 \return the new molecule 00174 00175 <b>Notes:</b> 00176 - Atoms that do not already have hydrogen count queries will have one 00177 added, other H-related queries will not be touched. Examples: 00178 - C[H] -> [C;!H0] 00179 - [C;H1][H] -> [C;H1] 00180 - [C;H2][H] -> [C;H2] 00181 - Hydrogens which aren't connected to a heavy atom will not be 00182 removed. This prevents molecules like <tt>"[H][H]"</tt> from having 00183 all atoms removed. 00184 - the caller is responsible for <tt>delete</tt>ing the pointer this returns. 00185 00186 */ 00187 ROMol *mergeQueryHs(const ROMol &mol); 00188 //@} 00189 00190 //! \name Sanitization 00191 //@{ 00192 00193 //! \brief carries out a collection of tasks for cleaning up a molecule and ensuring 00194 //! that it makes "chemical sense" 00195 /*! 00196 This functions calls the following in sequence 00197 -# MolOps::cleanUp() 00198 -# MolOps::Kekulize() 00199 -# MolOps::setAromaticity() 00200 -# MolOps::setConjugation() 00201 -# MolOps::setHybridization() 00202 -# MolOps::cleanupChirality() 00203 -# MolOps::adjustHs() 00204 00205 \param mol the RWMol to be cleaned 00206 00207 <b>Notes:</b> 00208 - If there is a failure in the sanitization, a \c SanitException 00209 will be thrown. 00210 - in general the user of this function should cast the molecule following this 00211 function to a ROMol, so that new atoms and bonds cannot be added to the 00212 molecule and screw up the sanitizing that has been done here 00213 */ 00214 void sanitizeMol(RWMol &mol); 00215 00216 //! Sets up the aromaticity for a molecule 00217 /*! 00218 00219 This is what happens here: 00220 -# find all the simple rings by calling the findSSSR function 00221 -# loop over all the Atoms in each ring and mark them if they are candidates 00222 for aromaticity. A ring atom is a candidate if it can spare electrons 00223 to the ring and if it's from the first two rows of the periodic table. 00224 -# ased on the candidate atoms, mark the rings to be either candidates 00225 or non-candidates. A ring is a candidate only if all its atoms are candidates 00226 -# apply Hueckel rule to each of the candidate rings to check if the ring can be 00227 aromatic 00228 00229 \param mol the RWMol of interest 00230 00231 \return 1 on succes, 0 otherwise 00232 00233 <b>Assumptions:</b> 00234 - Kekulization has been done (i.e. \c MolOps::Kekulize() has already 00235 been called) 00236 00237 */ 00238 int setAromaticity(RWMol &mol); 00239 00240 00241 //! Designed to be called by the sanitizer to handle special cases before anything is done. 00242 /*! 00243 00244 Currently this: 00245 - modifies nitro groups, so that the nitrogen does not have a unreasonable 00246 valence of 5, as follows: 00247 - the nitrogen gets a positve charge 00248 - one of the oxygens gets a negative chage and the double bond to this 00249 oxygen is changed to a single bond 00250 The net result is that nitro groups can be counted on to be: 00251 \c "[N+](=O)[O-]" 00252 00253 \param mol the molecule of interest 00254 00255 */ 00256 void cleanUp(RWMol &mol); 00257 00258 //! Called by the sanitizer to assign radical counts to atoms 00259 void assignRadicals(RWMol &mol); 00260 00261 //! adjust the number of implicit and explicit Hs for special cases 00262 /*! 00263 00264 Currently this: 00265 - modifies aromatic nitrogens so that, when appropriate, they have an 00266 explicit H marked (e.g. so that we get things like \c "c1cc[nH]cc1" 00267 00268 \param mol the molecule of interest 00269 00270 <b>Assumptions</b> 00271 - this is called after the molecule has been sanitized, 00272 aromaticity has been perceived, and the implicit valence of 00273 everything has been calculated. 00274 00275 */ 00276 void adjustHs(RWMol &mol); 00277 00278 //! Kekulizes the molecule 00279 /*! 00280 00281 \param mol the molecule of interest 00282 \param markAtomsBonds if this is set to true, \c isAromatic boolean settings 00283 on both the Bonds and Atoms are turned to false following 00284 the Kekulization, otherwise they are left alone in their 00285 original state. 00286 \param maxBackTracks the maximum number of attempts at back-tracking. The algorithm 00287 uses a back-tracking procedure to revist a previous setting of 00288 double bond if we hit a wall in the kekulization process 00289 00290 <b>Notes:</b> 00291 - even if \c markAtomsBonds is \c false the \c BondType for all aromatic 00292 bonds will be changed from \c RDKit::Bond::AROMATIC to \c RDKit::Bond::SINGLE 00293 or RDKit::Bond::DOUBLE during Kekulization. 00294 00295 */ 00296 void Kekulize(RWMol &mol, bool markAtomsBonds=true, unsigned int maxBackTracks=100); 00297 00298 //! flags the molecule's conjugated bonds 00299 void setConjugation(ROMol &mol); 00300 00301 //! calculates and sets the hybridization of all a molecule's Stoms 00302 void setHybridization(ROMol &mol); 00303 00304 00305 // @} 00306 00307 //! \name Ring finding and SSSR 00308 //@{ 00309 00310 //! finds a molecule's Smallest Set of Smallest Rings 00311 /*! 00312 Currently this implements a modified form of Figueras algorithm 00313 (JCICS - Vol. 36, No. 5, 1996, 986-991) 00314 00315 \param mol the molecule of interest 00316 \param res used to return the vector of rings. Each entry is a vector with 00317 atom indices. This information is also stored in the molecule's 00318 RingInfo structure, so this argument is optional (see overload) 00319 00320 \return number of smallest rings found 00321 00322 Base algorithm: 00323 - The original algorithm starts by finding representative degree 2 00324 nodes. 00325 - Representative because if a series of deg 2 nodes are found only 00326 one of them is picked. 00327 - The smallest ring around each of them is found. 00328 - The bonds that connect to this degree 2 node are them chopped off, yielding 00329 new deg two nodes 00330 - The process is repeated on the new deg 2 nodes. 00331 - If no deg 2 nodes are found, a deg 3 node is picked. The smallest ring 00332 with it is found. A bond from this is "carefully" (look in the paper) 00333 selected and chopped, yielding deg 2 nodes. The process is same as 00334 above once this is done. 00335 00336 Our Modifications: 00337 - If available, more than one smallest ring around a representative deg 2 00338 node will be computed and stored 00339 - Typically 3 rings are found around a degree 3 node (when no deg 2s are available) 00340 and all the bond to that node are chopped. 00341 - The extra rings that were found in this process are removed after all the nodes 00342 have been covered. 00343 00344 These changes were motivated by several factors: 00345 - We believe the original algorithm fails to find the correct SSSR 00346 (finds the correct number of them but the wrong ones) on some sample mols 00347 - Since SSSR may not be unique, a post-SSSR step to symmetrize may be done. 00348 The extra rings this process adds can be quite useful. 00349 */ 00350 int findSSSR(const ROMol &mol, std::vector<std::vector<int> > &res); 00351 //! \overload 00352 int findSSSR(const ROMol &mol, std::vector<std::vector<int> > *res=0); 00353 00354 //! use a DFS algorithm to identify ring bonds and atoms in a molecule 00355 /*! 00356 \b NOTE: though the RingInfo structure is populated by this function, 00357 the only really reliable calls that can be made are to check if 00358 mol.getRingInfo().numAtomRings(idx) or mol.getRingInfo().numBondRings(idx) 00359 return values >0 00360 */ 00361 void fastFindRings(const ROMol &mol); 00362 00363 00364 //! symmetrize the molecule's Smallest Set of Smallest Rings 00365 /*! 00366 SSSR rings obatined from "findSSSR" can be non-unique in some case. 00367 For example, cubane has five SSSR rings, not six as one would hope. 00368 00369 This function adds additional rings to the SSSR list if necessary 00370 to make the list symmetric, e.g. all atoms in cubane will be part of the same number 00371 of SSSRs. This function choses these extra rings from the extra rings computed 00372 and discarded during findSSSR. The new ring are chosen such that: 00373 - replacing a same sized ring in the SSSR list with an extra ring yields 00374 the same union of bond IDs as the orignal SSSR list 00375 00376 \param mol - the molecule of interest 00377 \param res used to return the vector of rings. Each entry is a vector with 00378 atom indices. This information is also stored in the molecule's 00379 RingInfo structure, so this argument is optional (see overload) 00380 00381 \return the total number of rings = (new rings + old SSSRs) 00382 00383 <b>Notes:</b> 00384 - if no SSSR rings are found on the molecule - MolOps::findSSSR() is called first 00385 */ 00386 int symmetrizeSSSR(ROMol &mol, std::vector<std::vector<int> > &res); 00387 //! \overload 00388 int symmetrizeSSSR(ROMol &mol); 00389 00390 //@} 00391 00392 //! \name Shortest paths and other matrices 00393 //@{ 00394 00395 //! returns a molecule's adjacency matrix 00396 /*! 00397 \param mol the molecule of interest 00398 \param useBO toggles use of bond orders in the matrix 00399 \param emptyVal sets the empty value (for non-adjacent atoms) 00400 \param force forces calculation of the matrix, even if already computed 00401 \param propNamePrefix used to set the cached property name 00402 00403 \return the adjacency matrix. 00404 00405 <b>Notes</b> 00406 - The result of this is cached in the molecule's local property dictionary, 00407 which will handle deallocation. Do the caller should <b>not</b> \c delete 00408 this pointer. 00409 00410 */ 00411 double * getAdjacencyMatrix(const ROMol &mol, 00412 bool useBO=false, 00413 int emptyVal=0, 00414 bool force=false, 00415 const char *propNamePrefix=0); 00416 00417 //! Computes the molecule's topological distance matrix 00418 /*! 00419 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00420 00421 \param mol the molecule of interest 00422 \param useBO toggles use of bond orders in the matrix 00423 \param useAtomWts sets the diagonal elements of the result to 00424 6.0/(atomic number) so that the matrix can be used to calculate 00425 Balaban J values. This does not affect the bond weights. 00426 \param force forces calculation of the matrix, even if already computed 00427 \param propNamePrefix used to set the cached property name 00428 00429 \return the distance matrix. 00430 00431 <b>Notes</b> 00432 - The result of this is cached in the molecule's local property dictionary, 00433 which will handle deallocation. Do the caller should <b>not</b> \c delete 00434 this pointer. 00435 00436 00437 */ 00438 double *getDistanceMat(const ROMol &mol, 00439 bool useBO=false, 00440 bool useAtomWts=false, 00441 bool force=false, 00442 const char *propNamePrefix=0); 00443 00444 00445 //! Computes the molecule's topological distance matrix 00446 /*! 00447 Uses the Floyd-Warshall all-pairs-shortest-paths algorithm. 00448 00449 \param mol the molecule of interest 00450 \param activeAtoms only elements corresponding to these atom indices 00451 will be included in the calculation 00452 \param bonds only bonds found in this list will be included in the 00453 calculation 00454 \param useBO toggles use of bond orders in the matrix 00455 \param useAtomWts sets the diagonal elements of the result to 00456 6.0/(atomic number) so that the matrix can be used to calculate 00457 Balaban J values. This does not affect the bond weights. 00458 00459 \return the distance matrix. 00460 00461 <b>Notes</b> 00462 - The results of this call are not cached, the caller <b>should</b> \c delete 00463 this pointer. 00464 00465 00466 */ 00467 double *getDistanceMat(const ROMol &mol, 00468 const std::vector<int> &activeAtoms, 00469 const std::vector<const Bond *> &bonds, 00470 bool useBO=false, 00471 bool useAtomWts=false); 00472 00473 00474 //! Find the shortest path between two atoms 00475 /*! 00476 Uses the Bellman-Ford algorithm 00477 00478 \param mol molecule of interest 00479 \param aid1 index of the first atom 00480 \param aid2 index of the second atom 00481 00482 \return an std::list with the indices of the atoms along the shortest 00483 path 00484 00485 <b>Notes:</b> 00486 - the starting and end atoms are included in the path 00487 - if no path is found, an empty path is returned 00488 00489 */ 00490 std::list<int> getShortestPath(const ROMol &mol, int aid1, int aid2); 00491 00492 //@} 00493 00494 //! \name Canonicalization 00495 //@{ 00496 00497 //! assign a canonical ordering to a molecule's atoms 00498 /*! 00499 The algorithm used here is a modification of the published Daylight canonical 00500 smiles algorithm (i.e. it uses atom invariants and products of primes). 00501 00502 \param mol the molecule of interest 00503 \param ranks used to return the ranks 00504 \param breakTies toggles breaking of ties (see below) 00505 \param rankHistory used to return the rank history (see below) 00506 00507 <b>Notes:</b> 00508 - Tie breaking should be done when it's important to have a full ordering 00509 of the atoms (e.g. when generating canonical traversal trees). If it's 00510 acceptable to have ties between symmetry-equivalent atoms (e.g. when 00511 generating CIP codes), tie breaking can/should be skipped. 00512 - if the \c rankHistory argument is provided, the evolution of the ranks of 00513 individual atoms will be tracked. The \c rankHistory pointer should be 00514 to a VECT_INT_VECT that has at least \c mol.getNumAtoms() elements. 00515 */ 00516 void rankAtoms(const ROMol &mol,std::vector<int> &ranks, 00517 bool breakTies=true,std::vector<std::vector<int> > *rankHistory=0); 00518 00519 // @} 00520 00521 //! \name Stereochemistry 00522 //@{ 00523 00524 //! removes bogus chirality markers (those on non-sp3 centers): 00525 void cleanupChirality(RWMol &mol); 00526 00527 //! \brief Uses a conformer to assign ChiralType to a molecule's atoms 00528 /*! 00529 \param mol the molecule of interest 00530 \param confId the conformer to use 00531 \param replaceExistingTags if this flag is true, any existing atomic chiral 00532 tags will be replaced 00533 00534 If the conformer provided is not a 3D conformer, nothing will be done. 00535 */ 00536 void assignChiralTypesFrom3D(ROMol &mol,int confId=-1,bool replaceExistingTags=true); 00537 00538 //! Assign stereochemistry tags to atoms (i.e. R/S) and bonds (i.e. Z/E) 00539 /*! 00540 00541 \param mol the molecule of interest 00542 \param cleanIt toggles removal of stereo flags from double bonds that can 00543 not have stereochemistry 00544 \param force forces the calculation to be repeated even if it has 00545 already been done 00546 \param flagPossibleStereoCenters set the _ChiralityPossible property on 00547 atoms that are possible stereocenters 00548 00549 <b>Notes:M</b> 00550 - Throughout we assume that we're working with a hydrogen-suppressed 00551 graph. 00552 00553 */ 00554 void assignStereochemistry(ROMol &mol,bool cleanIt=false,bool force=false, 00555 bool flagPossibleStereoCenters=false); 00556 //! Removes all stereochemistry information from atoms (i.e. R/S) and bonds (i.e. Z/E) 00557 /*! 00558 00559 \param mol the molecule of interest 00560 */ 00561 void removeStereochemistry(ROMol &mol); 00562 00563 //! \brief finds bonds that could be cis/trans in a molecule and mark them as 00564 //! Bond::STEREONONE 00565 /*! 00566 \param mol the molecule of interest 00567 \param cleanIt toggles removal of stereo flags from double bonds that can 00568 not have stereochemistry 00569 00570 This function is usefuly in two situations 00571 - when parsing a mol file; for the bonds marked here, coordinate informations 00572 on the neighbors can be used to indentify cis or trans states 00573 - when writing a mol file; bonds that can be cis/trans but not marked as either 00574 need to be specially marked in the mol file 00575 */ 00576 void findPotentialStereoBonds(ROMol &mol,bool cleanIt=false); 00577 //@} 00578 00579 }; // end of namespace MolOps 00580 }; // end of namespace RDKit 00581 00582 #endif
1.7.1