#!/usr/bin/env perl
# ABSTRACT: Filter FASTA/FASTQ files by sequence length
# PODNAME: fu-len

use 5.014;
use strict;
use warnings;
use FASTX::Reader;
use Getopt::Long qw(:config no_ignore_case);
use File::Basename;
use FindBin qw($RealBin);
use Carp qw(croak);
use Try::Tiny;

# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) {
    use lib "$RealBin/../lib";
}
use Proch::Seqfu;

my $BASENAME = basename($0);
my $VERSION = $Proch::Seqfu::VERSION // "<Dev>";

# Valid name schemes: the key is the value accepted by -n, the value is the
# description printed in the usage text.
my %schemes = (
    'raw'  => 'Do not change sequence name (default)',
    'num'  => 'Numbered sequence (see also -p)',
    'file' => 'Use file basename as prefix',
);

# Command line options (left undefined unless given on the command line)
my (
    $opt_minlen,
    $opt_maxlen,
    $opt_fasta_format,
    $opt_fasta_width,
    $opt_verbose,
    $opt_prefix,
    $opt_strip_comment,
    $opt_addlength,
    $opt_version
);
my $opt_name = 'raw';
my $opt_separator = '.';

# Parse the command line. GetOptions() warns about each offending option on
# STDERR and returns false; it only dies on internal errors, which the catch
# block turns into a fatal message.
my $options_ok = try {
    GetOptions(
        'm|min=i'         => \$opt_minlen,
        'x|max=i'         => \$opt_maxlen,
        'f|fasta'         => \$opt_fasta_format,
        'w|fasta-width=i' => \$opt_fasta_width,
        'c|strip-comment' => \$opt_strip_comment,
        'n|namescheme=s'  => \$opt_name,
        'v|verbose'       => \$opt_verbose,
        'l|len'           => \$opt_addlength,
        's|separator=s'   => \$opt_separator,
        'p|prefix=s'      => \$opt_prefix,
        'version'         => \$opt_version,
    );
} catch {
    croak "Error processing command line options: $_";
};

# Invalid options are an error: stop with a non-zero exit status so that
# calling scripts and pipelines can detect the failure.
unless ($options_ok) {
    say STDERR "ERROR: Invalid command line options. Run '$BASENAME' without arguments for usage.";
    exit(1);
}

# Print the program name and version to STDOUT, the versions of the
# supporting modules to STDERR, then exit with status 0.
sub version {
    # Either module may lack a $VERSION (e.g. when run from a development
    # checkout), so fall back to a placeholder instead of printing undef.
    my $seqfu_version  = $Proch::Seqfu::VERSION  // '<Dev>';
    my $reader_version = $FASTX::Reader::VERSION // '<unknown>';

    say $BASENAME, " ", $VERSION;
    say STDERR "Using Proch::Seqfu=", $seqfu_version, " and FASTX::Reader=", $reader_version;
    exit(0);
}

version() if ($opt_version);
usage() unless defined $ARGV[0];

# Validate name scheme
croak "ERROR: Name scheme -n '$opt_name' is not valid. Choose from: " . join(", ", sort keys %schemes)
    unless exists $schemes{$opt_name};

# Validate length parameters
if (defined $opt_minlen and defined $opt_maxlen) {
    croak "ERROR: Minimum length ($opt_minlen) cannot be greater than maximum length ($opt_maxlen)"
        if $opt_minlen > $opt_maxlen;
}
croak "ERROR: Minimum length must be positive"
    if defined $opt_minlen and $opt_minlen < 0;
croak "ERROR: Maximum length must be positive"
    if defined $opt_maxlen and $opt_maxlen < 0;
croak "ERROR: FASTA width must be positive"
    if defined $opt_fasta_width and $opt_fasta_width < 1;

# The separator is only used when a non-empty prefix was supplied. Test with
# defined/length rather than truthiness so that a prefix of "0" still counts.
my $sep = (defined $opt_prefix and length $opt_prefix) ? $opt_separator : '';
my $global_counter = 0;    # sequences written across all input files
my %check_reads;           # every header written so far, for duplicate detection

# Process each input file
foreach my $input_file (@ARGV) {
    my $filename = $input_file;

    # Handle STDIN
    if ($input_file eq "-") {
        $filename = "{{STDIN}}";
    } else {
        # Check file existence and readability
        unless (-e $input_file) {
            verbose(qq(Skipping "$input_file": file not found));
            next;
        }
        unless (-r $input_file) {
            verbose(qq(Skipping "$input_file": file not readable));
            next;
        }
    }

    # Initialize FASTX reader with error handling
    my $reader;
    try {
        $reader = FASTX::Reader->new({ filename => $filename });
    } catch {
        croak "ERROR: Failed to initialize FASTX reader for $input_file: $_";
    };

    my $local_counter = 0;    # sequences written from this file

    # Process sequences
    while (my $seq = $reader->getRead()) {
        my $len = length($seq->{seq});

        # Length filtering
        next if defined $opt_minlen and $len < $opt_minlen;
        next if defined $opt_maxlen and $len > $opt_maxlen;

        $global_counter++;
        $local_counter++;

        # Generate sequence name based on scheme
        my $name = generate_name($seq, $input_file, $local_counter, $global_counter);

        # Keep the original comment unless -c was given. A missing or empty
        # comment is skipped so the header does not end in a stray space.
        my $comment = $seq->{comment};
        if (!$opt_strip_comment && defined $comment && length $comment) {
            $name .= " " . $comment;
        }
        $name .= " length=" . $len if $opt_addlength;

        # Check for duplicate names
        if ($check_reads{$name}) {
            croak sprintf("Duplicate read name <%s> using scheme %s\n" .
                         "Reading <%s>, sequence number %d (total sequences %d)",
                         $name, $opt_name, $input_file, $local_counter, $global_counter);
        }
        $check_reads{$name}++;

        # Output the record: FASTA when forced or when there is no quality
        # string, FASTQ otherwise. print() returns false on failure instead of
        # dying, so its return value is checked explicitly.
        if ($opt_fasta_format or not defined $seq->{qual}) {
            print '>', $name, "\n", format_dna($seq->{seq}, $opt_fasta_width)
                or croak "ERROR: Failed to write sequence: $!";
        } else {
            print '@', $name, "\n", $seq->{seq}, "\n+\n", $seq->{qual}, "\n"
                or croak "ERROR: Failed to write sequence: $!";
        }
    }
}

# Subroutines
# Build the output name of a record according to the selected name scheme.
#
# Parameters:
#   $seq          - record hash reference; its {name} is used by the 'raw' scheme
#   $file         - input file name, whose basename is used by the 'file' scheme
#   $local_count  - number of the record within the current file
#   $global_count - number of the record across all files
#
# Returns the new name. Croaks if the scheme is not one of raw, file or num.
sub generate_name {
    my ($seq, $file, $local_count, $global_count) = @_;

    return $seq->{name} if $opt_name eq 'raw';

    return basename($file) . $opt_separator . $local_count
        if $opt_name eq 'file';

    if ($opt_name eq 'num') {
        # -p is optional: without it the name is just the running number, so
        # avoid concatenating an undefined prefix.
        my $prefix = $opt_prefix // '';
        return $prefix . $sep . $global_count;
    }

    croak "ERROR: Unsupported name scheme '$opt_name'";
}

# Return a sequence as newline-terminated text ready to be printed.
#
# Parameters:
#   $sequence - the sequence string
#   $width    - optional line width; when undefined the sequence is returned
#               on a single line
#
# With a width, the sequence is cut into consecutive pieces of at most $width
# characters, each followed by a newline.
sub format_dna {
    my ($sequence, $width) = @_;
    return $sequence . "\n" if not defined $width;

    my $total  = length($sequence);
    my $offset = 0;
    my @pieces;
    while ($offset < $total) {
        push @pieces, substr($sequence, $offset, $width);
        $offset += $width;
    }
    return join('', map { $_ . "\n" } @pieces);
}

# Print a diagnostic message to STDERR, prefixed with " - ", but only when
# the verbose option is enabled.
sub verbose {
    my ($message) = @_;
    if ($opt_verbose) {
        say STDERR " - $message";
    }
}

# Print the help text to STDERR and exit with status 0.
# The list of name schemes is generated from %schemes, sorted by scheme name,
# and inserted between the two fixed parts of the text.
sub usage {
    print STDERR <<'USAGE_HEAD';
fu-len - Filter FASTA/FASTQ files by sequence length

SYNOPSIS
    fu-len [options] FILE1 [FILE2 ...]

DESCRIPTION
    This tool filters sequences from FASTA/FASTQ files based on their length.
    It can also reformat sequences and modify sequence names according to
    different schemes.

OPTIONS
    Input/Output Control:
    -m, --min INT          Minimum length to keep a sequence
    -x, --max INT          Maximum length to keep a sequence
    -f, --fasta           Force FASTA output (default: same as input)
    -w, --fasta-width INT Wrap FASTA sequences to specified width
    
    Sequence Naming:
    -n, --namescheme STR  Sequence name scheme:
USAGE_HEAD

    # One tab-indented line per scheme: quoted name, then its description
    for my $key (sort keys %schemes) {
        printf STDERR "\t%-12s %s\n", qq("$key":), $schemes{$key};
    }

    print STDERR <<'USAGE_TAIL';
    -p, --prefix STR      Prefix for sequence names (used with --namescheme)
    -s, --separator STR   Separator between prefix and number (default: '.')
    
    Sequence Annotation:
    -l, --len            Add sequence length as comment
    -c, --strip-comment  Remove sequence comments
    
    Other Options:
    -v, --verbose        Print verbose output
    --version           Print version information and exit

EXAMPLES
    # Keep sequences between 100 and 1000 bp
    fu-len -m 100 -x 1000 input.fa > filtered.fa

    # Convert to FASTA and wrap sequences to 60 characters
    fu-len -f -w 60 input.fastq > output.fa

    # Number sequences and add length information
    fu-len -n num -p 'seq' -l input.fa > numbered.fa

EXIT STATUS
    Returns 0 on success, non-zero on error.

USAGE_TAIL
    exit(0);
}

__END__

=pod

=encoding UTF-8

=head1 NAME

fu-len - Filter FASTA/FASTQ files by sequence length

=head1 VERSION

version 1.7.0

=head1 SYNOPSIS

    fu-len [options] FILE1 [FILE2 ...]

=head1 DESCRIPTION

fu-len is a versatile tool for filtering sequences from FASTA/FASTQ files based on
their length. It provides additional functionality for sequence reformatting and
name manipulation. The tool can process both FASTA and FASTQ files, including
gzipped files, and can handle input from standard input using '-' as the filename.

=head1 NAME

fu-len - Filter and manipulate FASTA/FASTQ files based on sequence length

=head1 OPTIONS

=head2 Input/Output Control

=over 4

=item B<-m>, B<--min> I<INT>

Minimum length to keep a sequence. Sequences shorter than this will be filtered out.

=item B<-x>, B<--max> I<INT>

Maximum length to keep a sequence. Sequences longer than this will be filtered out.

=item B<-f>, B<--fasta>

Force output in FASTA format, regardless of input format.

=item B<-w>, B<--fasta-width> I<INT>

Wrap FASTA sequence lines to the specified width. If not specified, sequences will
be written as single lines.

=back

=head2 Sequence Naming

=over 4

=item B<-n>, B<--namescheme> I<STR>

Choose how sequence names should be generated. Available schemes:

=over 4

=item * B<raw> - Use original sequence names (default)

=item * B<num> - Number sequences sequentially (see B<--prefix>)

=item * B<file> - Use input filename as prefix followed by sequence number

=back

=item B<-p>, B<--prefix> I<STR>

Prefix to use for sequence names when using the 'num' name scheme.

=item B<-s>, B<--separator> I<STR>

Separator to use between prefix and number (default: '.').

=back

=head2 Sequence Annotation

=over 4

=item B<-l>, B<--len>

Append the sequence length to each sequence header, after any retained comment, as C<length=N>.

=item B<-c>, B<--strip-comment>

Remove existing sequence comments.

=back

=head2 Other Options

=over 4

=item B<-v>, B<--verbose>

Print verbose information to STDERR.

=item B<--version>

Print version information and exit.

=back

=head1 EXAMPLES

Filter sequences by length:

    # Keep sequences between 100 and 1000 bp
    fu-len -m 100 -x 1000 input.fa > filtered.fa

Convert FASTQ to wrapped FASTA:

    # Convert to FASTA and wrap to 60 characters per line
    fu-len -f -w 60 input.fastq > output.fa

Number sequences with custom prefix:

    # Add sequential numbers and length information
    fu-len -n num -p 'seq' -l input.fa > numbered.fa

Process multiple files:

    # Filter all sequences and force FASTA output
    fu-len -m 500 -f file1.fq file2.fa > combined.fa

=head1 NOTES

When processing multiple files, be aware that:

=over 4

=item * Duplicate sequence names abort the run with an error

=item * Mixing FASTA and FASTQ files without B<--fasta> produces output that mixes FASTA and FASTQ records

=item * Every sequence name written is kept in memory to detect duplicates, so memory usage grows with the number of sequences

=back

=head1 MODERN ALTERNATIVE

This suite of tools has been superseded by B<SeqFu>, a compiled program providing
faster and safer tools for sequence analysis. This suite is maintained for the
higher portability of Perl scripts under certain circumstances.

SeqFu is available at L<https://github.com/telatin/seqfu2>, and can be installed
with BioConda: C<conda install -c bioconda seqfu>.

=head1 CITING

Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>

=cut

=head1 AUTHOR

Andrea Telatin <andrea@telatin.com>

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience.

This is free software, licensed under:

  The MIT (X11) License

=cut
