# genbank.py - utilities for downloading and parsing GenBank files
 
from Bio import GenBank # (1)
from Bio import SeqIO
 
def download(accession_list):
    '''Download and save all GenBank records in accession_list.

    Each record is written to the relative path '<accession>.gb', pairing
    accessions with downloaded records in the order given.  An empty
    accession_list is a no-op: no network request is made.

    Raises IndexError if fewer records come back than accessions were
    requested; no file is written in that case.  Any error raised by the
    download itself is re-raised after a connectivity hint is printed.
    '''

    if not accession_list:
        return

    try:
        handle = GenBank.download_many(accession_list) # (2)
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C / SystemExit
        # are not answered with a misleading connectivity hint.
        print("Are you connected to the internet?")
        raise

    # Drop whitespace-only fragments (such as the piece that follows the
    # final terminator) so they can never be saved as an empty "record".
    genbank_strings = [fragment
                       for fragment in handle.read().split('//\n') # (3)
                       if fragment.strip()]

    # Check up front so a short response does not leave a partial set of
    # files behind.
    if len(genbank_strings) < len(accession_list):
        raise IndexError('expected %d GenBank records but received %d'
                         % (len(accession_list), len(genbank_strings)))

    for accession_number, genbank_string in zip(accession_list,
                                                genbank_strings):
        #Save raw file as .gb
        gb_file_name = accession_number+'.gb'
        # The with-block closes the file even if a write fails.
        with open(gb_file_name,'w') as f:
            f.write(genbank_string) # (4)
            f.write('//\n')
 
 
def parse(accession_list):
    '''Parse all records in accession_list.

    For every accession number the file '<accession>.gb' is opened and its
    first GenBank record is parsed with Bio.SeqIO.  The accession, the
    record id and the record sequence are printed as a progress report.

    Returns a list of parsed record objects, one per accession, in input
    order.  Raises IOError (after printing a hint) when a file cannot be
    opened; a file that yields no record makes next() raise StopIteration.
    '''

    parsed = []
    for accession_number in accession_list:
        gb_file_name = accession_number+'.gb'
        # Single pre-formatted string: prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('Parsing ...  %s' % (accession_number,))
        try:
            gb_file = open(gb_file_name,'r')
        except IOError:
            print('Is the file %s downloaded?' % gb_file_name)
            raise

        # The with-block closes the file even when parsing fails.
        with gb_file:
            # Only the first record in the file is used.
            gb_parsed_record = next(SeqIO.parse(gb_file,"genbank")) # (5)

        print(gb_parsed_record.id)  # (6)
        print(gb_parsed_record.seq)

        parsed.append(gb_parsed_record) # (7)

    return parsed


# (1) The Biopython package is called Bio. The Bio.GenBank module is used to download records from GenBank, and the Bio.SeqIO module provides a general interface for parsing a variety of biological formats, including GenBank. 
# 
# (2) The Bio.GenBank.download_many function is used in the genbank.download function to download GenBank records over the internet. It takes a list of GenBank accession numbers identifying the records to be downloaded. 
# 
# (3) GenBank records are separated by the character string '//\n'. Here we manually separate GenBank records that are part of the same character string. 
# 
# (4) When we save the GenBank records as individual files to disk, we include the '//\n' separator again. 
# 
# (5) The Bio.SeqIO.parse method can parse a variety of formats. Here we use it to parse the GenBank files on our local disk using the "genbank" format parameter. The method returns a generator, from which next() is used to retrieve an object representing the parsed file. 
# 
# (6) The object representing the parsed GenBank file has a variety of methods to extract the record id and sequence. See Example 2 for more details. 
# 
# (7) The genbank.parse function returns a list of parsed objects, one for each input sequence file. 
# 
# (8) To run the code in genbank.py, Biopython 1.44 must first be installed (reference). Executing the following code should create a file called 'NC_001416.gb' on the local disk (see Figure 1), as well as produce the following output: 
# Parsing ...  NC_001416
# NC_001416.1
# Seq('GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCG ...', IUPACAmbiguousDNA())



if __name__ == '__main__':
    # Demo run: fetch a single record to disk, then parse the saved file.
    demo_accessions = ['NC_001416']
    download(demo_accessions)
    parse(demo_accessions)
