#! /usr/bin/perl

#gb_feature_fasta_list.pl
#Gene W. Tyson
#John M. Eppley
#VR comment: make a file with a return-separated list of the genbank file names to parse
# these files must be in the same directory as the script
# usage: genbank_parser.pl list_of_genbanks

use strict;
use warnings;
use Bio::SeqIO;
use Bio::Perl;
use Getopt::Long;
use Pod::Usage;

# Process command line options                                                            
my ($opt_verbose, $opt_help, $opt_genes, $opt_proteins, $opt_names, $opt_gb_args);
my $result = GetOptions("verbose"  =>  \$opt_verbose,
						"genes"    =>  \$opt_genes,
						"files"    =>   \$opt_gb_args,
						"name_genes" => \$opt_names,
						"proteins"    =>  \$opt_proteins,
                        "help"      =>  \$opt_help);

die "Failed to parse command line options\n" unless $result;
# print useage if asked to
pod2usage({-exitval => 0, -verbose => 2}) if $opt_help;
# print usage if there is not exactly one argument
pod2usage({-exitval => 1, -verbose => 2}) unless (($#ARGV==0) || ($opt_gb_args && $#ARGV>=0));

### Input files ###
my @files;

### Read in and store list of genbank files to parse###
if ($opt_gb_args) {
	# args are GB file names
	@files = @ARGV;
} else {
	# first arg is file containing gb file names
	open (FILE, $ARGV[0]) || die "Couldn't open $ARGV[0]";

	while (my $file = <FILE>) {
    	chomp $file;
    	push (@files, $file);
	}
	close FILE;
}

### What type of files are we creating?
my $default = !defined($opt_genes) && !defined($opt_proteins);
my $use_fna = defined ($opt_genes) || $default;
my $use_faa = defined($opt_proteins) || $default;

### Loop over files 
foreach my $file (@files) {
    my $infile = Bio::SeqIO->new(-file => "$file", -format => 'genbank');

	open FNA, ">$file.fna" if $use_fna;
	open FAA, ">$file.faa" if $use_faa;

	# loop over entries in file
	while ( my $seq = $infile->next_seq() ) {

		# get organism identifier to use in gene name prefixes
		#  ...use the LOCUS for now (called via display_id method)
		my $genePrefix = $seq->display_id;
		
		# $seq->species if defined, should give us the organism name, but
		#  its a bit unpreditable...we'll add to the header line, but not as the name
		my $organism;
		if (defined($seq->species) && defined($seq->species->binomial)) {
			$organism = $seq->species->binomial;
		}
		
		# start gene count
		my $count = 1;

		for my $feat_object ($seq->get_SeqFeatures){
	    	if ($feat_object->primary_tag eq "CDS") {
				# get sequence
				my $fna = $feat_object->spliced_seq->seq;
				my $faa = translate_as_string($fna) if $use_faa;

				# set the fasta header
				#  ...use gene numbers unless the names option is set
				my $gene = $genePrefix . "_";
				if ($opt_names) {
					$gene .= get_gene_name($feat_object);
				} else {
					$gene .= "gene_" . $count;
				}

				# if the organism was defined, add to header line with space
				$gene .= " " . $organism if defined($organism);
				
				# print to file
				print FNA ">$gene\n" , join("\n",segment_string($fna),60), "\n" if $use_fna;
				print FAA ">$gene\n" , join("\n",segment_string($faa),60), "\n" if $use_faa;
	    	}
	    	$count++;
		}
    }
    close FNA if $use_fna;
   	close FAA if $use_fna;
}
exit;

sub get_gene_name {
	my $feature = shift;
	
	
	#print STDERR "tags: ", join(',',$feature->get_all_tags), "\n";
	
	
	# get first gene name
	my @genes = $feature->get_tag_values("gene");
	my $gene;
	if ($#genes>=0) {
		$gene = $genes[0];
		#print STDERR join(',',@genes), "\n";
	}
	
	# get first descripton from "function","product", or "note"
	foreach my $tag ("function","product","note") {
		my @values = $feature->get_tag_values($tag);
	    #print STDERR $tag, "\n";
		if ($#values>=0) {
			#print STDERR join(',',@values), "\n";
			my $return = "";
			# prepend with gene name if present
			if (defined $gene) {
				$return  = $return . $gene . "_";
			}
			# just take first value
			$return = $return . $values[0];
			# remove white space
			$return =~ s/ /_/g;
			$return =~ s/\t/_/g;
			# return name
			return $return;
		}
	}
	
	# if we get here, just use gene
	return $gene;
}

sub segment_string {
	my $string = shift;
	my $length = shift;
	$length = 60 unless defined $length;
		
	my $i=0;
	my @segments;
	while ($i < length($string)) {
	 	my $segment = substr($string, $i, 60);
		chomp $segment;
	 	push(@segments,$segment);
		$i= $i+60;
    }

	return @segments;
}

__END__

=head1 NAME                                                                               
                                                                                          
B<gb_feature_fasta_list.pl> - Pull out all the featres fomr a list of genbank files and create 
fasta databases of CDS sequences  
                                                                                          
=head1 SYNOPSIS                                                                           
                                                                                          
gb_feature_fasta_list.pl [options] gb_file_list                                      
                                                                                          
=head1 OPTIONS                                                                            
                                                                                          
=over 2                                                                                   
                                                                                          
=item B<--name_genes>                                                                       
                                                                                          
Use annotations to name genes. Default is to name by count. If tru, looks for "gene" tag and 
first available of "function","prodduct","note"                      
                                                                                          
=item B<--genes>                                                                     
                                                                                          
Print nucleotide gene sequences for each CDS in ACCESSION.fna
If neither --genes or --proteins is specified, both are created.
                                                                                          
=item B<--proteins>                                                                         
                                                                                          
Print amino acids sequences for each CDS in ACCESSION.fna
If neither --genes or --proteins is specified, both are created.
                                                                                        
=item B<--verbose>                                                                        
                                                                                          
Print detailed output.                                                                    
                                                                                          
=item B<gb_file_list>                                                                        
                                                                                          
The one argument is the name of a file containing a list of GenBank formatted files to process
                                                                                          
=back                                                                                     
                                                                                          
=head1 DESCRIPTION                                                                        
                                                                                          
This script takes a list of genbank files. For each file a pir of files is created:
 FILE.faa and FILE.fna where FILE is the name of the genbank file. Each of these contains
 a fasta list of the CDS features in the GenBank file.
                                                                                          
=head1 EXIT CODES                                                                         
                                                                                          
This script returns 0 on success and a non-zero value on errors.                          
                                                                                          
=head1 BUGS                                                                               
                                                                                          
Please report them to <jmeppley@mit.edu>

=head1 COPYRIGHT                                                                          
                                                                                          
2007 MIT (?)                                                                              
                                                                                          
=cut