Package Dbase :: Package Pubmed :: Module Searches
[hide private]
[frames] | no frames]

Source Code for Module Dbase.Pubmed.Searches

  1  # $Id: Searches.py 746 2008-07-07 13:21:24Z glandrum $ 
  2  # 
  3  # Copyright (C) 2003-2006 Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ Tools for doing PubMed searches and processing the results 
  8   
  9  NOTE: much of the example code in the documentation here uses XML 
 10  files from the test_data directory in order to avoid having to call 
 11  out to PubMed itself.  Actual calls to the functions would not include 
 12  the _conn_ argument. 
 13   
 14  """ 
 15  import RDConfig 
 16  import QueryParams,Records 
 17  import urllib,urllib2 
 18  from xml.etree import ElementTree 
 19   
20 -def openURL(url,args):
21 proxy_support = urllib2.ProxyHandler({}) 22 opener = urllib2.build_opener(proxy_support) 23 conn = urllib2.urlopen(url,args) 24 return conn
25
26 -def GetNumHits(query,url=QueryParams.searchBase):
27 """ returns a tuple of pubmed ids (strings) for the query provided 28 29 To do a search, we need a query object: 30 >>> query = QueryParams.details() 31 32 set up the search parameters: 33 >>> query['term'] = 'penzotti je AND grootenhuis pd' 34 >>> query['field'] = 'auth' 35 36 now get the search ids: 37 >>> counts = GetNumHits(query) 38 >>> counts 39 2 40 41 alternately, we can search using field specifiers: 42 >>> query = QueryParams.details() 43 >>> query['term'] = 'penzotti je[au] AND hydrogen bonding[mh]' 44 >>> counts = GetNumHits(query) 45 >>> counts 46 3 47 48 49 """ 50 query['rettype']='count' 51 conn = openURL(url,urllib.urlencode(query)) 52 pubmed = ElementTree.parse(conn) 53 countText = pubmed.findtext('Count') 54 if countText: 55 res = int(countText) 56 else: 57 res = 0 58 return res
59 60
61 -def GetSearchIds(query,url=QueryParams.searchBase):
62 """ returns a tuple of pubmed ids (strings) for the query provided 63 64 To do a search, we need a query object: 65 >>> query = QueryParams.details() 66 67 set up the search parameters: 68 >>> query['term'] = 'penzotti je AND grootenhuis pd' 69 >>> query['field'] = 'auth' 70 71 now get the search ids: 72 >>> ids = GetSearchIds(query) 73 >>> len(ids) 74 2 75 >>> ids[0] 76 '11960484' 77 >>> ids[1] 78 '10893315' 79 80 81 """ 82 conn = openURL(url,urllib.urlencode(query)) 83 pubmed = ElementTree.parse(conn) 84 res = [id.text for id in pubmed.getiterator('Id')] 85 return tuple(res)
86
87 -def GetSummaries(ids,query=None,url=QueryParams.summaryBase,conn=None):
88 """ gets a set of document summary records for the ids provided 89 90 >>> ids = ['11960484'] 91 >>> summs = GetSummaries(ids,conn=open(os.path.join(testDataDir,'summary.xml'),'r')) 92 >>> len(summs) 93 1 94 >>> rec = summs[0] 95 >>> isinstance(rec,Records.SummaryRecord) 96 1 97 >>> rec.PubMedId 98 '11960484' 99 >>> rec.Authors 100 'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD' 101 >>> rec.Title 102 'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.' 103 >>> rec.Source 104 'J Med Chem' 105 >>> rec.Volume 106 '45' 107 >>> rec.Pages 108 '1737-40' 109 >>> rec.HasAbstract 110 '1' 111 112 """ 113 if not conn: 114 try: 115 iter(ids) 116 except TypeError: 117 ids = [ids,] 118 if not query: 119 query = QueryParams.details() 120 ids = map(str,ids) 121 query['id'] = ','.join(ids) 122 conn = openURL(url,urllib.urlencode(query)) 123 pubmed = ElementTree.parse(conn) 124 res = [] 125 for summary in pubmed.getiterator('DocSum'): 126 rec = Records.SummaryRecord(summary) 127 if rec.PubMedId in ids: 128 res.append(rec) 129 ids.remove(rec.PubMedId) 130 131 return tuple(res)
132
133 -def GetRecords(ids,query=None,url=QueryParams.fetchBase,conn=None):
134 """ gets a set of document summary records for the ids provided 135 136 >>> ids = ['11960484'] 137 >>> recs = GetRecords(ids,conn=open(os.path.join(testDataDir,'records.xml'),'r')) 138 >>> len(recs) 139 1 140 >>> rec = recs[0] 141 >>> rec.PubMedId 142 '11960484' 143 >>> rec.Authors 144 u'Penzotti JE, Lamb ML, Evensen E, Grootenhuis PD' 145 >>> rec.Title 146 u'A computational ensemble pharmacophore model for identifying substrates of P-glycoprotein.' 147 >>> rec.Source 148 u'J Med Chem' 149 >>> rec.Volume 150 '45' 151 >>> rec.Pages 152 '1737-40' 153 >>> rec.PubYear 154 '2002' 155 >>> rec.Abstract[:10] 156 u'P-glycopro' 157 158 We've also got access to keywords: 159 >>> str(rec.keywords[0]) 160 'Combinatorial Chemistry Techniques' 161 >>> str(rec.keywords[3]) 162 'Indinavir / chemistry' 163 164 and chemicals: 165 >>> rec.chemicals[0] 166 'P-Glycoprotein' 167 >>> rec.chemicals[2] 168 'Nicardipine <55985-32-5>' 169 170 171 """ 172 if not conn: 173 try: 174 iter(ids) 175 except TypeError: 176 ids = [ids,] 177 if not query: 178 query = QueryParams.details() 179 query['id'] = ','.join(map(str,ids)) 180 conn = openURL(url,urllib.urlencode(query)) 181 182 pubmed = ElementTree.parse(conn) 183 res = [] 184 for article in pubmed.getiterator('PubmedArticle'): 185 rec = Records.JournalArticleRecord(article) 186 if rec.PubMedId in ids: 187 res.append(rec) 188 return tuple(res)
189 211 239 240 241 #------------------------------------ 242 # 243 # doctest boilerplate 244 #
245 -def _test():
246 import doctest,sys 247 return doctest.testmod(sys.modules["__main__"])
248 249 if __name__ == '__main__': 250 import sys,os.path 251 testDataDir = os.path.join(RDConfig.RDCodeDir,'Dbase','Pubmed','test_data') 252 failed,tried = _test() 253 sys.exit(failed) 254 #query = QueryParams.details() 255 #query['term']='landrum ga' 256 #query['field']='auth' 257 #ids = GetSearchIds(query) 258 #print ids 259 #ids = ids[:2] 260 ids = ['11666868','11169640'] 261 if 0: 262 summs = GetSummaries(ids,conn=open('summary.xml','r')) 263 print 'summs:',summs 264 for summary in summs: 265 print summary.Authors 266 print '\t',summary.Title 267 print '\t',summary.Source, 268 print summary.Volume, 269 print summary.Pages, 270 print summary.PubDate 271 272 if 0: 273 ids = ['11666868'] 274 res = GetRecords(ids,conn=open('records.xml','r')) 275 for record in res: 276 print record.Authors 277 print '\t',record.Title 278 print '\t',record.Journal, 279 print record.Volume, 280 print record.Pages, 281 print record.PubYear 282 print 283 284 if 0: 285 ids = ['11666868','11169640'] 286 res = CheckForLinks(ids,conn=open('haslinks.xml','r')) 287 print res 288 289 if 0: 290 ids = ['11666868'] 291 res = GetLinks(ids,conn=open('links.xml','r')) 292 #res = GetLinks(ids) 293 for id,score in res[:10]: 294 print id,score 295