Python script that recovers the GenBank IDs for all the nucleotide entries linked to a taxon ID.


This Python script recovers the GenBank IDs for all the nucleotide entries linked to a taxon ID. The number of requests is minimized by using the retmax and retstart parameters provided by the Entrez Utilities.
taxid_2_gbids.py
Python

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465

#!/usr/bin/env python


 


import xml.etree.ElementTree as ET


import sys, urllib, urllib2


 


# Base URL shared by every E-utilities request in this script.
# HTTPS is used because NCBI no longer serves E-utilities over plain HTTP,
# and a redirect would turn the POST issued by esummary into a GET.
eutils_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"


 


def get_ids(taxid):
    """Collect the 'Caption' value of every nucleotide record under a taxon.

    The taxon is searched with esearch in pages of `iteration_step` ids;
    each page of ids is then passed to esummary, and the text of every
    `Item` named 'Caption' in the returned `DocSum` elements is collected.

    taxid -- taxon identifier, interpolated into "txid<taxid>[Organism:exp]".

    Returns a list of the collected strings. If a page cannot be parsed or
    summarized, the error is printed and the results gathered so far are
    returned.
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000
    term = "txid%s[Organism:exp]" % taxid

    while True:
        page = esearch(db = "nucleotide", term = term, retstart = retstart, retmax = iteration_step)
        try:
            id_list = ET.fromstring(page).find('IdList')
            if id_list is None:
                ids = []
            else:
                ids = [node.text for node in id_list.findall('Id')]

            # A page without ids means we are past the last result. Testing
            # only for a missing IdList element is not enough: an empty
            # IdList would keep the loop running forever.
            if not ids:
                break

            summary = ET.fromstring(esummary(db = "nucleotide", ids = ids, retmax = iteration_step))
            for docsum in summary.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)
        except Exception as e:
            # Best effort: report the problem and stop paging instead of
            # retrying endlessly against a response that keeps failing.
            print(e)
            break

        retstart += iteration_step

    return accession_numbers


 


def esearch(db, term, retstart = 0, retmax = 20):
    """Send an ESearch request and return the response body as a string.

    db       -- Entrez database name (e.g. "nucleotide").
    term     -- search term; it is URL-encoded here, so pass it unescaped.
    retstart -- index of the first result to return (formatted with %i).
    retmax   -- maximum number of results to return (formatted with %i).
    """
    # Encode every parameter so that brackets, colons or spaces in the
    # search term cannot corrupt the query string.
    query = urllib.urlencode([
        ("db", db),
        ("term", term),
        ("retstart", "%i" % retstart),
        ("retmax", "%i" % retmax),
    ])
    response = urllib.urlopen("%sesearch.fcgi?%s" % (eutils_base_url, query))
    try:
        content = str(response.read())
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    return content


 


def esummary(db, ids, retstart = 0, retmax = 20):
    """Send an ESummary request and return the response body as a string.

    The ids are joined with commas and sent, together with the database
    name, as url-encoded request data (which makes urllib2 issue a POST).

    db       -- Entrez database name (e.g. "nucleotide").
    ids      -- sequence of id strings to summarize.
    retstart -- accepted for signature compatibility; not sent to the server.
    retmax   -- accepted for signature compatibility; not sent to the server.
    """
    payload = urllib.urlencode({
        'db': db,
        'id': ','.join(ids),
    })
    request = urllib2.Request("%sesummary.fcgi" % eutils_base_url, payload)
    response = urllib2.urlopen(request)
    try:
        content = str(response.read())
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    return content


 


if __name__ == '__main__':
    # Command-line entry point: expects "-id <taxid>", prints the collected
    # ids followed by their count.
    taxid = None

    if "-id" in sys.argv:
        value_index = sys.argv.index("-id") + 1
        # "-id" given as the last argument has no value; fall through to the
        # usage message instead of raising IndexError.
        if value_index < len(sys.argv):
            taxid = sys.argv[value_index]

    if not taxid:
        print("Usage: taxid_2_gbids.py -id taxid")
        print("Example: taxid_2_gbids.py -id 4754")
        sys.exit(-1)

    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))