#!/usr/bin/env perl
use strict;
use warnings;
###BREWCONDA###
use Bio::SeqIO;
use Bio::Tools::CodonTable;
use Data::Dumper;

#     gene            complement(1..1200)
#                     /locus_tag="P148_SR1C00001G0001"
#     CDS             complement(1..1200)
#                     /locus_tag="P148_SR1C00001G0001"
#                     /note="RAAC1_SR1_1_1"
#                     /codon_start=1
#                     /transl_table=25
#                     /product="MAEBL"
#                     /protein_id="AHB40819.1"
#                     /db_xref="GI:563351664"

# Feature tags tried, in this order, when choosing the FASTA ID.
my @IDTAG = qw(protein_id locus_tag db_xref);

# Option table plus one variable per command-line switch; all of them are
# filled in by setOptions().
my (
  @Options,
  $verbose, $format, $hypo, $idtag, $sep,
  $blank, $pseudo, $minlen, $gcode
);
setOptions();

# Input is taken from the ARGV handle in the chosen format; output is always
# FASTA on STDOUT.
my $in = Bio::SeqIO->new(
  -fh     => \*ARGV,
  -format => $format,
);
my $out = Bio::SeqIO->new(
  -fh     => \*STDOUT,
  -format => 'Fasta',
);

# An explicit --idtag replaces the built-in candidate list.
my @TRYIDTAG = @IDTAG;
@TRYIDTAG = ($idtag) if $idtag;
print STDERR "Will use first of (@TRYIDTAG) as FASTA ID\n";

# support different /transl_table=XX tags in CDS
#my $table = Bio::Tools::CodonTable->new();
#printf STDERR "Default /transl_table is %d - %s\n", $table->id, $table->name;

# Main loop: for every sequence record read from $in, translate each
# qualifying CDS feature and write the resulting protein to $out as FASTA.
# The FASTA description is EC number, gene and product joined by $sep.
while (my $seq = $in->next_seq) {
  print STDERR "\rParsing: ",$seq->display_id, "\n";
  my $counter = 0;  # CDS features that reached ID lookup in this record (log only)

  for my $f ($seq->get_SeqFeatures) {

    next unless $f->primary_tag eq 'CDS';

    # Cheap pre-filter on the feature's nucleotide length: a feature shorter
    # than $minlen bases cannot encode $minlen residues. The authoritative
    # peptide-length check is done after translation below.
    next unless $f->length >= $minlen;

    next if !$pseudo and $f->has_tag('pseudo');

    my $prod = TAG($f, 'product') || $blank;

    next if !$hypo and $prod eq 'hypothetical protein';

    my $cds = $f->spliced_seq;  # don't forget eukaryotes!

    # FASTA ID is the first candidate tag that yields a true value.
    my $id;
    for my $tag (@TRYIDTAG) {
      $id = TAG($f, $tag);
      last if $id;
    }
    $id or die "\nFeature #$counter does not have any of these tags: @TRYIDTAG";
    $counter++;
    print STDERR "[$counter] ", $seq->display_id, " | $id | $prod\n" if $verbose;

    # /codon_start other than 1 means the reading frame begins later in the
    # feature (partial / fuzzy features), so drop the leading bases.
    if ($f->has_tag('codon_start')) {
      my($cs) = $f->get_tag_values('codon_start');
      if ($cs != 1) {
        print STDERR "\t/codon_start=$cs - trimming mRNA!\n";
        $cds = $cds->trunc($cs, $cds->length);
      }
    }

    # DNA -> AA. Genetic code precedence: --gcode, then the feature's own
    # /transl_table, then 11.
    my $tt = $gcode || TAG($f, 'transl_table') || 11;
    print STDERR "\tUsing specified /transl_table=$tt\n" if $verbose && $tt != 1;

    # http://www.bioperl.org/wiki/HOWTO:Beginners#Translating
    my $pep = $cds->translate(-codontable_id => $tt, -complete => 1);

    # --minlen is documented as a minimum *peptide* length, so enforce it on
    # the translated product rather than on the nucleotide feature.
    if ($pep->length < $minlen) {
      print STDERR "\tSkipping: peptide length ", $pep->length,
                   " is below --minlen $minlen\n" if $verbose;
      next;
    }

    my $ec = TAG($f, 'EC_number') || $blank;
    my $gene = TAG($f, 'gene') || $blank;

    $pep->desc( join $sep, $ec, $gene, $prod );
    $pep->display_id($id);

    $out->write_seq($pep);
  }
}
print STDERR "Done.\n";

#----------------------------------------------------------------------

# Return the first value of tag $tag on feature $f, ignoring any value that
# contains "PRJNA". Returns nothing (undef in scalar context, the empty list
# in list context) when the tag is absent or every value is filtered out.
sub TAG {
  my($f, $tag) = @_;
  return unless $f->has_tag($tag);

  # Seems new submissions have 2 protein IDs - one useful and one annoying!
  # /protein_id="REF_PRJNA193299:EFAU085_p1001"
  # /protein_id="YP_008390691.1"
  for my $value ($f->get_tag_values($tag)) {
    next if $value =~ m/PRJNA/;
    return $value;
  }
  return;
}

#----------------------------------------------------------------------
# Option setting routines

# Build the option table, parse the command line with Getopt::Long and then
# apply each option's DEFAULT to any variable still undefined afterwards.
# Calls usage() when there are no command-line arguments at all or when
# GetOptions reports a parsing failure.
sub setOptions {
  use Getopt::Long;

  # OPT is the Getopt::Long spec, VAR the destination (or handler), DEFAULT
  # the fallback value and DESC the help text printed by usage().
  @Options = (
    { OPT => "help",     VAR => \&usage,                        DESC => "This help" },
    { OPT => "verbose!", VAR => \$verbose, DEFAULT => 0,         DESC => "Verbose progress" },
    { OPT => "format=s", VAR => \$format,  DEFAULT => 'genbank', DESC => "Input format" },
    { OPT => "idtag=s",  VAR => \$idtag,   DEFAULT => '',        DESC => "What tag to use as Fasta ID (default = try first of: @IDTAG)" },
    { OPT => "sep=s",    VAR => \$sep,     DEFAULT => '~~~',     DESC => "Separator between EC/gene/product" },
    { OPT => "blank=s",  VAR => \$blank,   DEFAULT => '',        DESC => "Replace empty EC/gene/product with this" },
    { OPT => "pseudo!",  VAR => \$pseudo,  DEFAULT => 0,         DESC => "Include /pseudo genes" },
    { OPT => "hypo!",    VAR => \$hypo,    DEFAULT => 0,         DESC => "Include 'hypothetical protein' genes" },
    { OPT => "gcode=i",  VAR => \$gcode,   DEFAULT => 0,         DESC => "Force this genetic code for translation (otherwise /transl_table)" },
    { OPT => "minlen=i", VAR => \$minlen,  DEFAULT => 0,         DESC => "Minimum peptide length" },
  );

  usage() unless @ARGV;

  # Flatten the table into the (spec, destination) pairs GetOptions expects.
  my @spec;
  for my $opt (@Options) {
    push @spec, $opt->{OPT}, $opt->{VAR};
  }
  GetOptions(@spec) or usage();

  # Now setup default values. Entries without a DEFAULT (the help handler)
  # are skipped before their VAR is ever dereferenced.
  for my $opt (@Options) {
    next unless defined $opt->{DEFAULT};
    my $target = $opt->{VAR};
    next if defined $$target;
    $$target = $opt->{DEFAULT};
  }
}

# Print the usage line and one line per entry of @Options, then exit.
#
# The sub is reached in two ways:
#   * as the Getopt::Long handler for --help, which passes arguments: the
#     text is printed on STDOUT and the exit status is 0;
#   * called directly with no arguments (empty or invalid command line): the
#     text is printed on STDERR and the exit status is 1, so that it does not
#     end up inside a redirected "> proteins.faa" output file.
sub usage {
  my $help_requested = @_ ? 1 : 0;
  my $fh = $help_requested ? \*STDOUT : \*STDERR;

  print {$fh} "Usage: $0 [options] [genome1.gbk ...] > proteins.faa\n";
  foreach my $opt (@Options) {
    printf {$fh} "  --%-13s %s%s.\n", $opt->{OPT}, $opt->{DESC},
           defined($opt->{DEFAULT}) ? " (default '$opt->{DEFAULT}')" : "";
  }
  exit($help_requested ? 0 : 1);
}
 
#----------------------------------------------------------------------
