Python script to retrieve the GenBank IDs of all nucleotide entries linked to a taxon ID.

This Python script retrieves the GenBank IDs (accession numbers) of all nucleotide entries linked to a taxon ID. It keeps the number of requests to a minimum by paging through the results with the retstart and retmax parameters provided by the Entrez Utilities.

taxid_2_gbids.py

Python

#!/usr/bin/env python

import xml.etree.ElementTree as ET

import sys, urllib, urllib2

# Base URL prepended to every E-utilities endpoint name (esearch.fcgi,
# esummary.fcgi). NCBI documents the HTTPS form of this address; using it
# directly avoids depending on an HTTP redirect, which is fragile for POSTs.
eutils_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"

def get_ids(taxid):
    """Collect the accession numbers of all nucleotide entries of a taxon.

    The nucleotide database is searched for ``txid<taxid>[Organism:exp]`` one
    page at a time (``retstart``/``retmax``). For every page of ids, the
    document summaries are requested and the text of each ``Caption`` item is
    kept.

    Args:
        taxid: Taxon id, interpolated into the search term with ``%s``.

    Returns:
        A list with the ``Caption`` text of every document summary found,
        in the order the pages were returned.
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000

    while True:
        search_xml = esearch(
            db="nucleotide",
            term="txid%s[Organism:exp]" % taxid,
            retstart=retstart,
            retmax=iteration_step,
        )

        try:
            id_list = ET.fromstring(search_xml).find('IdList')
        except ET.ParseError as e:
            # Without a readable page there is no way to know whether more
            # results exist, so stop instead of requesting pages forever.
            print(e)
            break

        if id_list is None:
            ids = []
        else:
            ids = [node.text for node in id_list.findall('Id')]

        # A missing *or empty* IdList means the previous page was the last
        # one; checking only for a missing element would never end the loop.
        if not ids:
            break

        try:
            summary_xml = esummary(db="nucleotide", ids=ids, retmax=iteration_step)
            summary = ET.fromstring(summary_xml)
            for docsum in summary.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)
        except Exception as e:
            # Best effort: report the failed page and carry on with the next.
            print(e)

        retstart += iteration_step

    return accession_numbers

def esearch(db, term, retstart = 0, retmax = 20):
    """Run an ESearch request and return the raw response body.

    Args:
        db: Name of the Entrez database to search.
        term: Search term; it is URL-encoded before being sent.
        retstart: Integer index of the first result to return.
        retmax: Integer maximum number of results to return.

    Returns:
        The response body as a string (XML as sent by the server).
    """
    # Encode every value so that spaces, brackets or '&' inside the term
    # cannot corrupt the query string. '%i' keeps the integer formatting.
    query = urllib.urlencode([
        ('db', db),
        ('term', term),
        ('retstart', '%i' % retstart),
        ('retmax', '%i' % retmax),
    ])
    response = urllib.urlopen("%sesearch.fcgi?%s" % (eutils_base_url, query))
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading fails.
        response.close()

def esummary(db, ids, retstart = 0, retmax = 20):
    """Run an ESummary request for a list of ids and return the raw body.

    The ids are joined with commas and sent as form data, so the request is
    issued as a POST and long id lists do not have to fit into the URL.

    Args:
        db: Name of the Entrez database the ids belong to.
        ids: Iterable of id strings.
        retstart: Accepted for interface compatibility; not sent.
        retmax: Accepted for interface compatibility; not sent.

    Returns:
        The response body as a string (XML as sent by the server).
    """
    payload = urllib.urlencode({
        'db': db,
        'id': ','.join(ids),
    })
    request = urllib2.Request("%sesummary.fcgi" % eutils_base_url, payload)
    response = urllib2.urlopen(request)
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading fails.
        response.close()

if __name__ == '__main__':
    # Command line entry point: taxid_2_gbids.py -id <taxid>
    taxid = None

    if "-id" in sys.argv:
        value_pos = sys.argv.index("-id") + 1
        # "-id" given as the very last argument has no value; fall through
        # to the usage message instead of failing with an IndexError.
        if value_pos < len(sys.argv):
            taxid = sys.argv[value_pos]

    if not taxid:
        print("Usage: taxid_2_gbids.py -id taxid")
        print("Example: taxid_2_gbids.py -id 4754")
        sys.exit(-1)

    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))

blogtest

Latest News

Python script recovers the genbank ids for all the nucleotide entries linked to a taxon id.

0 comments:

Post a Comment

Popular Posts

Recent Posts

Social

More Links

About Me

Blog Archive

8,521,717

44,112

2,358

RSS Feeds

Featured Posts

Labels

Popular Tags

About

Featured Posts

Featured Posts

Recent Comments