Package rdkit :: Package Chem :: Package MolDb :: Module Loader_sa
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MolDb.Loader_sa

  1  # $Id: Loader_sa.py 1647 2011-02-24 08:00:39Z glandrum $ 
  2  # 
  3  #  Copyright (C) 2007-2009 Greg Landrum 
  4  #   @@ All Rights Reserved @@ 
  5  #  This file is part of the RDKit. 
  6  #  The contents are covered by the terms of the BSD license 
  7  #  which is included in the file license.txt, found at the root 
  8  #  of the RDKit source tree. 
  9  # 
 10  import sqlalchemy 
 11   
 12  from rdkit import Chem 
 13  from rdkit.Chem import AllChem 
 14  from rdkit.Chem import Lipinski,Descriptors,Crippen 
 15  from rdkit.Dbase.DbConnection import DbConnect 
 16  from rdkit.Dbase import DbModule 
 17  import os 
 18   
 19  from sqlalchemy.ext.declarative import declarative_base 
 20  from sqlalchemy import Table,Column,MetaData 
 21  from sqlalchemy import Integer,Text,String,ForeignKey,Binary,DateTime,Float 
 22  from sqlalchemy.orm import relation,mapper,sessionmaker,backref 
 23  from sqlalchemy import create_engine 
 24   
 25  decBase = declarative_base() 
 26   
27 -class Compound(decBase):
28 __tablename__='molecules' 29 guid=Column(Integer,primary_key=True) 30 molpkl=Column(Binary)
31 32
33 -def RegisterSchema(dbUrl,echo=False):
34 engine = create_engine(dbUrl,echo=echo) 35 decBase.metadata.create_all(engine) 36 maker = sessionmaker(bind=engine) 37 return maker
38 ConnectToSchema=RegisterSchema 39 40 #set up the logger: 41 import rdkit.RDLogger as logging 42 logger = logging.logger() 43 logger.setLevel(logging.INFO) 44
45 -def ProcessMol(session,mol,globalProps,nDone,nameProp='_Name',nameCol='compound_id', 46 redraw=False,keepHs=False, 47 skipProps=False,addComputedProps=False, 48 skipSmiles=False):
49 if not mol: 50 raise ValueError,'no molecule' 51 if keepHs: 52 Chem.SanitizeMol(mol) 53 try: 54 nm = mol.GetProp(nameProp) 55 except KeyError: 56 nm = None 57 if not nm: 58 nm = 'Mol_%d'%nDone 59 60 cmpd = Compound() 61 session.add(cmpd) 62 63 if redraw: 64 AllChem.Compute2DCoords(m) 65 66 if not skipSmiles: 67 cmpd.smiles=Chem.MolToSmiles(mol,True) 68 cmpd.molpkl=mol.ToBinary() 69 setattr(cmpd,nameCol,nm) 70 71 if not skipProps: 72 if addComputedProps: 73 cmpd.DonorCount=Lipinski.NumHDonors(mol) 74 cmpd.AcceptorCount=Lipinski.NumHAcceptors(mol) 75 cmpd.RotatableBondCount=Lipinski.NumRotatableBonds(mol) 76 cmpd.AMW=Descriptors.MolWt(mol) 77 cmpd.MolLogP=Crippen.MolLogP(mol) 78 pns = list(mol.GetPropNames()) 79 for pi,pn in enumerate(pns): 80 if pn.lower()==nameCol.lower(): continue 81 pv = mol.GetProp(pn).strip() 82 if globalProps.has_key(pn): 83 setattr(cmpd,pn.lower(),pv) 84 return cmpd
85
86 -def LoadDb(suppl,dbName,nameProp='_Name',nameCol='compound_id',silent=False, 87 redraw=False,errorsTo=None,keepHs=False,defaultVal='N/A',skipProps=False, 88 regName='molecules',skipSmiles=False,maxRowsCached=-1, 89 uniqNames=False,addComputedProps=False,lazySupplier=False, 90 numForPropScan=10,startAnew=True):
91 if not lazySupplier: 92 nMols = len(suppl) 93 else: 94 nMols=-1 95 if not silent: 96 logger.info("Generating molecular database in file %s"%dbName) 97 if not lazySupplier: 98 logger.info(" Processing %d molecules"%nMols) 99 100 globalProps = {} 101 if startAnew: 102 if os.path.exists(dbName): 103 os.unlink(dbName) 104 sIter=iter(suppl) 105 setattr(Compound,nameCol.lower(),Column(nameCol.lower(),String,default=defaultVal,unique=uniqNames)) 106 if not skipSmiles: 107 Compound.smiles = Column(Text,unique=True) 108 if not skipProps: 109 while numForPropScan>0: 110 try: 111 m = sIter.next() 112 except StopIteration: 113 numForPropScan=0 114 break 115 if not m: continue 116 for pn in m.GetPropNames(): 117 if pn.lower()==nameCol.lower(): continue 118 if not globalProps.has_key(pn): 119 globalProps[pn]=1 120 setattr(Compound,pn.lower(),Column(pn.lower(),String,default=defaultVal)) 121 numForPropScan-=1 122 if addComputedProps: 123 Compound.DonorCount=Column(Integer) 124 Compound.AcceptorCount=Column(Integer) 125 Compound.RotatableBondCount=Column(Integer) 126 Compound.AMW=Column(Float) 127 Compound.MolLogP=Column(Float) 128 session=RegisterSchema('sqlite:///%s'%(dbName))() 129 else: 130 raise NotImplementedError,'updating existing databases is not yet supported' 131 132 133 nDone = 0 134 cache=[] 135 for m in suppl: 136 nDone +=1 137 if not m: 138 if errorsTo: 139 if hasattr(suppl,'GetItemText'): 140 d = suppl.GetItemText(nDone-1) 141 errorsTo.write(d) 142 else: 143 logger.warning('full error file support not complete') 144 continue 145 146 cmpd=ProcessMol(session,m,globalProps,nDone,nameProp=nameProp, 147 nameCol=nameCol,redraw=redraw, 148 keepHs=keepHs,skipProps=skipProps, 149 addComputedProps=addComputedProps,skipSmiles=skipSmiles) 150 if cmpd is not None: 151 cache.append(cmpd) 152 153 if not silent and not nDone%100: 154 logger.info(' done %d'%nDone) 155 try: 156 session.commit() 157 except: 158 session.rollback() 159 for cmpd in cache: 160 try: 161 session.add(cmpd) 162 session.commit() 163 except: 164 session.rollback() 165 cache=[] 166 167 168 try: 169 session.commit() 170 except: 171 session.rollback() 172 for cmpd in cache: 173 try: 174 session.add(cmpd) 175 session.commit() 176 except: 177 session.rollback()
178 179 if __name__=='__main__': 180 import sys 181 sdf =Chem.SDMolSupplier(sys.argv[1]) 182 db =sys.argv[2] 183 LoadDb(sdf,db,addComputedProps=False) 184 session = RegisterSchema('sqlite:///%s'%(db))() 185 print '>>>>', len(session.query(Compound).all()) 186