#!/usr/bin/perl -w
use POSIX;
use Getopt::Long qw(:config bundling require_order auto_version);
use Pod::Usage;
use FAST;
use FAST::Bio::SeqIO;
use strict;

use vars qw($VERSION $DESC $NAME $COMMAND $DATE);
$VERSION = $FAST::VERSION; 
$DESC    = "Filter sequences in a fasta file.\n";
$NAME    = $0;
$NAME    =~ s/^.*\///;
$COMMAND = join " ",$NAME,@ARGV;
$DATE = POSIX::strftime("%c",localtime());

use constant { true => 1, false => 0 };

## DEFAULT OPTION VALUES
my $def_format  = $FAST::DEF_FORMAT;  #7/1/13 "fasta";
my $def_logname = $FAST::DEF_LOGNAME; #7/1/13 "FAST.log.txt";
my $def_split_on_regex = ' ';

## OPTION VARIABLES
my $man                  = undef;  # --man
my $help                 = undef;  # -h
my $moltype              = undef;  # -m, in case bioperl can't tell
my $format               = $def_format;  # --format
my $log                  = undef;        # -l
my $logname              = $def_logname; # -L
my $comment              = undef;        # -C

my $description    = undef;  # -d
my $field          = undef;  # -f
my $tag            = undef;  # -t 
my $regex          = undef;  # -x
my $split_on_regex       = $def_split_on_regex;   # -S 


my $negate                = undef;  # -v
my $fastq                = undef;


GetOptions('help|h'         		 => \$help, 
	   'man'            		 => \$man,
	   'moltype|m=s'                 => sub{  my (undef,$val) = @_; 
						  die "$NAME: --moltype or -m option argument must be \"dna\", \"rna\" or \"protein\"\n" 
						    unless $val =~ /dna|rna|protein/i; 
						  $moltype = $val;
						},
	   'format=s'                    => \$format,
	   'fastq|q'                     => sub{$format = 'fastq';},
	   'log|l'                       => \$log,
	   'logname|L=s'                 => \$logname,
	   'comment|C=s'                 => \$comment,
	   'negate|v'                    => \$negate,

           'description|d'               => \$description,
	   'field|f=i'                   => sub{  my (undef,$val) = @_; 
						  die "$NAME: --field or -f option expects non-zero integer argument\n" 
						    unless $val != 0; 
						  $field = $val;
						},
	   'split-on-regex|S=s'          => \$split_on_regex,
	   'tag|t=s'                     => sub{  my (undef,$val) = @_; 
						  die "$NAME: --tag or -t option expects string argument exclusively from [a-zA-Z0-9_-]\n" 
						    unless $val =~ /^[a-zA-Z0-9_-]+$/; 
						  $tag = $val;
						},
	   'regex|x=s'                   => sub{  my (undef,$val) = @_; 
						  die "$NAME: --regex or -x option expects regex argument with exactly one capture buffer\n" 
						    unless $val =~ /^[^()]*\([^)]+\)[^()]*$/; 
						  $regex = $val;
						},

	  ) 
  or exit(1);
		  
pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2) if $man;
my $fromSTDIN = ((-t STDIN) ? false : true);

pod2usage("$NAME: Requires at least two arguments RANGES FILE [FILE2...]. Try $NAME -h for help.") if ((-t STDIN) && (@ARGV < 2));
pod2usage("$NAME: Requires exactly one argument RANGES when reading from STDIN. Try $NAME -h for help.") if (!(-t STDIN) && @ARGV != 1);

&FAST::log($logname, $DATE, $COMMAND, $comment, $fromSTDIN) if ($log); 

my $RANGES = shift @ARGV;
my @RANGES = undef;
if ($RANGES =~ /,/) {
    @RANGES = split /,/,$RANGES;
  }
else {
  $RANGES[0] = $RANGES;
}

my @intervals = ();
foreach my $RANGE (@RANGES) {
  my ($L,$U);
  if ($RANGE =~ /^(-?\d+|([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)$/) { ## THIS REGEX IS FROM PERL COOKBOOK RECIPE 2.2
    $L = $1;
    $U = $1;
    push @intervals, [$1,$1];
  }
  elsif ($RANGE =~ /^(\.\.|:)(-?\d+|([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)$/) { 
    push @intervals, [undef,$2];
  } 
  elsif ($RANGE =~ /^(-?\d+|([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)(\.\.|:)$/) {
    push @intervals, [$1,undef];
  }
  elsif ($RANGE =~ /^(-?\d+|([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)(\.\.|:|-)(-?\d+|([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)$/) {
    $L = $1;
    $U = $7;
    die "$NAME: RANGE $RANGE should have form L..U or L:U with L <= U\n" unless ($L <= $U);
    push @intervals, [$L,$U];
  }
  else {
    die "$NAME: RANGE $RANGE has an invalid syntax; Try \"$NAME -h\"\n";
  }
}


my $split_re = ' ';
if ($split_on_regex) {
  $split_re = qr/$split_on_regex/;
}
my $index;
if ($field and $field > 0) {
  $index = $field - 1;
}
elsif ($field and $field < 0) { 
  $index = $field;
}

my $valf;
if ($tag) {
  $valf = sub { my $s = shift; my $re = qr/$tag[:=](-?\d+|([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?)/; my $r = $s->desc(); if ($r =~ $re){ return $1;} else {return;} };
}
elsif ($regex) {
  my $m = 'id'; 
     $m = 'desc' if ($description); 
  $valf = sub { my $s = shift;  my $re = qr/$regex/; my $r = $s->$m(); if ($r =~ $re) { return $1;} else {return;} };
}
elsif ($field) {
  $valf = sub { my $s = shift; my @fields = split $split_re,$s->desc(); if (exists $fields[$index]) {return $fields[$index]} else {return;} };
}
elsif ($description) {
  $valf = sub { my $s = shift; $s->desc() };
}
else {
  $valf = sub { my $s = shift; $s->id() };
}



my $OUT = FAST::Bio::SeqIO->newFh('-format' => $format);
my $IN; 
unless (@ARGV) {
    if ($moltype) {
	$IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
	$IN = FAST::Bio::SeqIO->new(-fh => *STDIN{IO}, '-format' => $format);
    }
}

while ($IN or @ARGV) {
  if (@ARGV) {
    my $file = shift (@ARGV);
    unless (-e $file) {
      warn "$NAME: Could not find file $file. Skipping.\n";
      next;
    }
    elsif ($moltype) {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format, '-alphabet' => $moltype);
    }
    else {
      $IN = FAST::Bio::SeqIO->new(-file => $file, '-format' => $format);
    }
  }
  if ($IN) { 
    while (my $seq = $IN->next_seq()) {
      my $val = &$valf($seq);
      my $pass   = undef;
      foreach my $interval (@intervals) {
	my ($L,$U) = @$interval;
	if (((not defined $U) && $val >= $L) || ((not defined $L) && $val <= $U) || (defined $U && defined $L && $val >= $L && $val <= $U)){
	  $pass = 1;
	  last;
	}
      }
      print $OUT $seq if ($pass and not $negate);
      print $OUT $seq if (not $pass and $negate);
    }
    undef $IN;
  }
}



__END__

=head1 NAME

B<fasfilter> -- Select sequences by numerical values of name,value pairs

=head1 SYNOPSIS

B<fasfilter> [OPTION]... [RANGES] [MULTIFASTA-FILE]...

=head1 RANGE SPECIFICATION

         RANGES : { RANGE | RANGE,RANGES }
         RANGE  : { POINT | BOUND>SEP<UBOUND }
         >SEP<  : {    ..     |       :       |       -      }
         POINT  : a number, as defined in perlnumber 
         BOUND  : a number, as defined in perlnumber, or the empty string 
         UBOUND : a number >= BOUND, as defined in perlnumber, or the empty string 

=head1 DESCRIPTION

B<fasfilter> takes sequence or alignment data on input, and outputs
only sequence records that have numerical data within specific value
ranges. By default, entire sequence identifiers are compared, assuming
they are numbers. Optionally, B<fasfilter> may be directed to other
parts of sequence records including entire descriptions, fields in
descriptions, the values of tagged (name,value) pairs in descriptions,
or the values of regex captures on identifiers or descriptions.

More specifically, B<fasfilter> tests whether specific numerical data
of sequence records falls within one of more RANGES specified by its
only mandatory argument. If one record's value falls within one of the
RANGES, the sequence will be printed to output. If a sequence record
does not have the specified component, or its value falls outside all
specified ranges, the sequence will not be printed to output. Range
bounds are inclusive. Ranges are not simplified with respect to one
another (e.g. through taking their union).

If the first range begins with a negative number, you should terminate
option processing with "--".

Options specific to B<fasfilter>:
  B<-d>, B<--description>	                filter on description
  B<-f>, B<--field>=<int>	                filter on field in description
  B<-t>, B<--tag>=<string>                filter on value of a (tag,value) pair in the description
  B<-x>, B<--regex>=<regex>               filter on value of a regex capture
  B<-S>, B<--split-on-regex>=<regex>      split description for field-based filtering using regex 
  B<-v>, B<--negate>                      print records whose values fall outside of all specified ranges

Options general to FAST:
  B<-h>, B<--help>                  	 print a brief help message
  B<--man>             	           print full documentation
  B<--version>                         print version
  B<-l>, B<--log>                         create/append to logfile	
  B<-L>, B<--logname>=<string>            use logfile name <string>
  B<-C>, B<--comment>=<string>            save comment <string> to log
  B<--format>=<format>                 use alternative format for input  
  B<--moltype>=<[dna|rna|protein]>     specify input sequence type
  B<-q>, B<--fastq>                       use fastq format as input and output

=head1 INPUT AND OUTPUT

B<fasfilter> is part of FAST, the FAST Analysis of Sequences Toolbox, based
on Bioperl. Most core FAST utilities expect input and return output in
multifasta format. Input can occur in one or more files or on
STDIN. Output occurs to STDOUT. The FAST utility B<fasconvert> can
reformat other formats to and from multifasta.

=head1 DEFAULT AND OPTIONAL FILTERING BEHAVIOR

By default, B<fasfilter> tests identifiers of sequence
records. Options described below modify which parts of sequence
records get tested. These options take effect as follows with
decreasing priority: B<-t> > B<-xd> > B<-x> > B<-f> > B<-d> 

=head1 OPTIONS

=over 8

=item B<-d>,
      B<--description> 		

Filter sequence records by their descriptions. If used in combination
with the B<-x>, B<--regex> option, filters records by values of
a regex capture applied to their descriptions.

=item B<-f [int]>,
      B<--field=[int]>     

Filter sequece records by a specific field in their descriptions. With
this option, the description is split into fields using strings of
white space as field delimiters (the default Perl behavior for
splitting lines of data into fields).

This option takes a mandatory integer option argument giving the index
for which field to filter on. One-based indexing is used, so the first
field after the identifier has index 1. As standard in Perl, negative
indices count backwards from the last field in the description; field
"-1" is the last field, "-2" is the second-to-last etc.

In fasta files, the identifier occurs between the record separator
(">") and the first whitespace on the identifier line, and the
description is everything after the first string of white space on the
identifier line. Therefore the identifier is counted as the 0th field,
which is what B<fasfilter> tests by default. 

=item B<-S [regex]>,
      B<--split-on-regex=[regex]>   

Use regex <regex> to split the description for the B<-f> option
instead of the perl default (which splits on one or more whitespace
characters). Special characters must be quoted to protect them from
the shell. The regex is often helpfully enclosed by single-quote (')
or double-quote (") brackets but never by forward slashes (/).

=item B<-t [string]>,
      B<--tag=[string]>   

Filter sequence records by values of a named tag in the description.
Name-value pairs in the description are expected to have the format
"name:value" as generated by FAST tools (such as B<faslen>) or
"name=value" as common in General Feature Format. The "name" must
contain only characters from the set [a-zA-Z0-9_-]. The "value" is any
string of non-whitespace characters. Sequence records for which the
specified tag does not exist will not be written to output. Currently
only the first (left-most) occcurence of a tag is compared.

=item B<-x [regex]>,
      B<--regex=[regex]>   

Filter sequence records by values of a regex capture applied to
identifers (by default) or descriptions (in combination with the -d
option). The format of the regex is a perl regular expression that
must contain exactly one "capture buffer" specified by a pair or
parentheses. The value captured in this buffer is tested. Sequence
records that do not match the regex will will not be written to
output. Currently only the first (left-most) match is tested. Special
characters must be quoted to protect them from the shell. The regex is
often helpfully enclosed by single-quote (') or double-quote (")
brackets but never by forward slashes (/).

=item B<-v>,
      B<--negate> 	   

Output sequences that B<do not> meet the specified numerical criteria.

=item B<-h>,
      B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--version>

Print version information and exit.

=item B<-l>,
      B<--log>

Creates, or appends to, a generic FAST logfile in the current working
directory. The logfile records date/time of execution, full command
with options and arguments, and an optional comment.

=item B<-L [string]>,
      B<--logname=[string]>

Use [string] as the name of the logfile. Default is "FAST.log.txt".

=item B<-C [string]>,
      B<--comment=[string]>

Include comment [string] in logfile. No comment is saved by default.

=item B<--format=[format]> 		  

Use alternative format for input. See man page for "fasconvert" for
allowed formats. This is for convenience; the FAST tools are designed
to exchange data in Fasta format, and "fasta" is the default format
for this tool.

=item B<-m [dna|rna|protein]>,
      B<--moltype=[dna|rna|protein]> 		  

Specify the type of sequence on input (should not be needed in most
cases, but sometimes Bioperl cannot guess and complains when
processing data).

=item B<-q>
      B<--fastq>
use fastq format as input and output.

=back

=head1 EXAMPLES

Output sequences with gi number identifiers between 200 million and 500 million

=over 8

B<fasfilter> -x "gi\|(\d+)" 2e8..5e8  t/data/P450.fas 

=back

Output sequences with length greater than or equal to 515

=over 8

B<faslen> t/data/P450.fas | B<fasfilter> -t length 515..

=back

=head1 SEE ALSO

=over 8

=item C<man perlre>

=item C<perldoc perlre>

Documentation on perl regular expressions.

=item C<man FAST>

=item C<perldoc FAST>

Introduction and cookbook for FAST

=item L<The FAST Home Page|http://compbio.ucmerced.edu/ardell/FAST>"

=back 

=head1 CITING

If you use FAST, please cite I<Lawrence et al. (2015). FAST: FAST Analysis of
Sequences Toolbox.> and Bioperl I<Stajich et al.>. 

=cut
