#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2022-07-31 09:41:50 +0300 (Sun, 31 Jul 2022) $
#$Revision: 9352 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.7.0/scripts/cif_sort_atoms $
#------------------------------------------------------------------------------
#*
#* Sort atoms in a CIF file in given order. Accepts more than one sorting
#* criterion.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;

use COD::AtomProperties;
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::Manage qw( contains_data_item set_loop_tag );
use COD::CIF::Tags::Print qw( print_cif );
use COD::ErrorHandler qw( process_errors
                          process_warnings
                          process_parser_messages
                          report_message );
use COD::SOptions qw( getOptions get_value );
use COD::SUsage qw( usage options );
use COD::ToolsVersion qw( get_version_string );
use List::MoreUtils qw( first_index );

my @order_functions;
my $direction = 1;
my $record_original_order = 0;
my $order_child_loops = 0;
my $sort_by_atomic_number = 0;

my $use_parser = 'c';

my $die_on_errors   = 1;
my $die_on_warnings = 0;
my $die_on_notes    = 0;
my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes
};

#* OPTIONS:
#*   -l, --lexicographic
#*                     Sort by lexicographic order (default).
#*   -Z, --atomic-number
#*                     Sort by atomic number.
#*   --ascending-numerical _atom_site_occupancy
#*                     Sort by values of CIF data item in ascending numerical
#*                     order.
#*   --descending-numerical _atom_site_occupancy
#*                     Sort by values of CIF data item in descending numerical
#*                     order.
#*   --ascending-lexical _atom_site_label
#*                     Sort by values of CIF data item in ascending lexical
#*                     order.
#*   --descending-lexical _atom_site_label
#*                     Sort by values of CIF data item in descending lexical
#*                     order.
#*
#*   -r, --reverse
#*                     Reverse the ordering.
#*
#*   --order-child-loops
#*                     Also order loops with data items referencing
#*                     '_atom_site_label'.
#*   --no-order-child-loops
#*                     Preserve the original order of loops with data items
#*                     referencing '_atom_site_label' (default).
#*
#*   --record-original-order
#*                     Record the original atom order.
#*   --no-record-original-order
#*                     Do not record the original atom order (default).
#*
#*   --use-perl-parser
#*                     Use Perl parser for CIF parsing.
#*   --use-c-parser
#*                     Use Perl & C parser for CIF parsing (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**

@ARGV = getOptions(
    '-l,--lexicographic' => sub { push @order_functions, \&get_lexicographic_order },
    '-Z,--atomic-number' =>
        sub {
            push @order_functions, \&get_atomic_order;
            $sort_by_atomic_number = 1;
        },
    '--ascending-numerical' =>
        sub { push @order_functions,
                   make_comparator_numeric( get_value, 1 ) },
    '--descending-numerical' =>
        sub { push @order_functions,
                   make_comparator_numeric( get_value, -1 ) },
    '--ascending-lexical' =>
        sub { push @order_functions,
                   make_comparator_lexic( get_value, 1 ) },
    '--descending-lexical' =>
        sub { push @order_functions,
                   make_comparator_lexic( get_value, -1 ) },

    '-r,--reverse' => sub { $direction = -1 },

    '--order-child-loops'    => sub { $order_child_loops = 1 },
    '--no-order-child-loops' => sub { $order_child_loops = 1 },

    '--record-original-order'    => sub { $record_original_order = 1 },
    '--no-record-original-order' => sub { $record_original_order = 0 },

    '--use-perl-parser' => sub{ $use_parser = 'perl' },
    '--use-c-parser'    => sub{ $use_parser = 'c' },

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

@order_functions = ( \&get_lexicographic_order ) if !@order_functions;

@ARGV = ( '-' ) unless @ARGV;

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

for my $filename (@ARGV) {
    my $options = { parser   => $use_parser,
                    no_print => 1 };

    my( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    if( !@{$data} ) {
        report_message( {
           'program'   => $0,
           'filename'  => $filename,
           'err_level' => 'WARNING',
           'message'   => 'file seems to be empty'
        }, 0 );
        next;
    }

    canonicalize_all_names( $data );

    for my $dataset (@$data) {
        my $values = $dataset->{values};
        my $dataname = 'data_' . $dataset->{name};

        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => @_,
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => $dataname
            }, $die_on_error_level )
        };

        eval {
            my $atom_loop = $dataset->{inloop}{_atom_site_label};
            my $type_loop;
            if( contains_data_item( $dataset, '_atom_type_symbol' ) &&
                exists $dataset->{inloop}{_atom_type_symbol} &&
                $dataset->{inloop}{_atom_type_symbol} != $atom_loop ) {
                $type_loop = $dataset->{inloop}{_atom_type_symbol};
            }

            my @atoms;
            for my $i (0..$#{$values->{_atom_site_label}}) {
                my %atom = map { $_ => $values->{$_}[$i] }
                               @{$dataset->{loops}[$atom_loop]};

                if( contains_data_item( $dataset, '_atom_site_type_symbol' ) &&
                    defined $type_loop ) {
                    my $index = first_index { $_ eq $atom{_atom_site_type_symbol} }
                                            @{$values->{'_atom_type_symbol'}};
                    if( defined $index ) {
                        %atom = ( %atom, map { $_ => $values->{$_}[$index] }
                                             @{$dataset->{loops}[$type_loop]} );
                    }
                }

                $atom{index} = $i;
                push @atoms, \%atom;
            }

            if ($sort_by_atomic_number) {
                check_chemical_symbols(\@atoms)
            }

            my @order = map  { $_->{index} }
                        sort { for my $order (@order_functions) {
                                   my $cmp = $order->( $a, $b );
                                   return $direction * $cmp if $cmp;
                               } }
                             @atoms;
            if( defined $atom_loop ) {
                reorder_loop( $dataset, '_atom_site_label', \@order );

                if( $order_child_loops ) {
                    my %order = map { $atoms[$order[$_]]->{_atom_site_label} => $_ }
                                    0..$#atoms;

                    my @packets = (
                        [ qw( _atom_site_aniso_label ) ],
                        [ qw( _geom_bond_atom_site_label_1
                              _geom_bond_atom_site_label_2 ) ],
                        [ qw( _geom_angle_atom_site_label_1
                              _geom_angle_atom_site_label_2
                              _geom_angle_atom_site_label_3 ) ],
                        [ qw( _geom_torsion_atom_site_label_1
                              _geom_torsion_atom_site_label_2
                              _geom_torsion_atom_site_label_3
                              _geom_torsion_atom_site_label_4 ) ],
                        [ qw( _geom_hbond_atom_site_label_D ) ],
                    );
                    for my $packet (@packets) {
                        my @tags = @$packet;

                        next if grep { !contains_data_item( $dataset, $_ ) } @tags;
                        if( grep { grep { !exists $order{$_} } @{$values->{$_}} } @tags ) {
                            warn 'not all values of data item' . (@tags > 1 ? 's' : '') .
                                 ' ' . join( ', ', map { "'$_'" } @tags ) . ' refer to ' .
                                 'existing values of data item \'_atom_site_label\' ' .
                                 '-- cannot order child loop.';
                            next;
                        }

                        if( @tags == 2 || @tags == 3 ) {
                            for my $i (0..$#{$values->{$tags[0]}}) {
                                next if $order{$values->{$tags[ 0]}[$i]} <=
                                        $order{$values->{$tags[-1]}[$i]};
                                ( $values->{$tags[ 0]}[$i],
                                  $values->{$tags[-1]}[$i] ) =
                                ( $values->{$tags[-1]}[$i],
                                  $values->{$tags[ 0]}[$i] );
                            }
                        } elsif( @tags == 4 ) {
                            # FIXME: CIF dictionaries have to be consulted
                            # to check whether by inverting the order of
                            # atom enumeration we can leave the value of the
                            # torsion angle unchanged.
                            for my $i (0..$#{$values->{$tags[0]}}) {
                                next if $order{$values->{$tags[0]}[$i]} <
                                        $order{$values->{$tags[3]}[$i]};
                                next if $order{$values->{$tags[1]}[$i]} <=
                                        $order{$values->{$tags[2]}[$i]};
                                ( $values->{$tags[0]}[$i],
                                  $values->{$tags[1]}[$i],
                                  $values->{$tags[2]}[$i],
                                  $values->{$tags[3]}[$i] ) =
                                ( $values->{$tags[3]}[$i],
                                  $values->{$tags[2]}[$i],
                                  $values->{$tags[1]}[$i],
                                  $values->{$tags[0]}[$i] );
                            }
                        }

                        my @order = sort {  for my $tag (@tags) {
                                                if( $order{$values->{$tag}[$a]} <=>
                                                    $order{$values->{$tag}[$b]} ) {
                                                    return $order{$values->{$tag}[$a]} <=>
                                                           $order{$values->{$tag}[$b]};
                                                }
                                            }
                                            return 0;
                                         } 0..$#{$values->{$tags[0]}};
                        reorder_loop( $dataset, $tags[0], \@order );
                    }
                }

                if( $record_original_order ) {
                    set_loop_tag( $dataset,
                                '_[local]_cod_original_atom_order',
                                '_atom_site_label',
                                [ map { $atoms[$_]->{index} } @order ] );
                }
            }

            print_cif( $dataset,
                       {
                            preserve_loop_order => 1,
                            keep_tag_order => 1
                       } );
        };
        if ( $@ ) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
    }
}

sub get_lexicographic_order
{
    my( $a, $b ) = @_;
    return $a->{_atom_site_label} cmp $b->{_atom_site_label};
}

sub get_atomic_order
{
    my( $a, $b ) = @_;

    my $a_type = extract_chemical_symbol($a);
    my $b_type = extract_chemical_symbol($b);

    if( !defined $a_type || !defined $b_type ) {
        return (defined $b_type) - (defined $a_type);
    }

    my( $a_number, $b_number );
    if( exists $COD::AtomProperties::atoms{$a_type} ) {
        $a_number = $COD::AtomProperties::atoms{$a_type}{atomic_number};
    }
    if( exists $COD::AtomProperties::atoms{$b_type} ) {
        $b_number = $COD::AtomProperties::atoms{$b_type}{atomic_number};
    }
    if( defined $a_number && defined $b_number ) {
        return $a_number <=> $b_number || $a_type cmp $b_type;
    } else {
        return (defined $b_number) - (defined $a_number) ||
               $a_type cmp $b_type;
    }
}

sub make_comparator_numeric
{
    my( $data_name, $direction ) = @_;
    return sub { $direction * ( $a->{$data_name} =~ /^[.?]$/ ||
                                $b->{$data_name} =~ /^[.?]$/
                                ? $a->{$data_name} cmp $b->{$data_name}
                                : drop_precision( $a->{$data_name} ) <=>
                                  drop_precision( $b->{$data_name} ) ) };
}

sub make_comparator_lexic
{
    my( $data_name, $direction ) = @_;
    return sub { $direction * ( $a->{$data_name} cmp $b->{$data_name} ) };
}

sub drop_precision
{
    my( $value ) = @_;
    $value =~ s/\(.+\)$//;
    return $value;
}

sub reorder_loop
{
    my( $dataset, $data_name, $order ) = @_;

    return if !exists $dataset->{inloop}{$data_name};
    my $loop_nr =$dataset->{inloop}{$data_name};

    for my $tag (@{$dataset->{loops}[$loop_nr]}) {
        for my $key (qw( precisions types values )) {
            $dataset->{$key}{$tag} = [ @{$dataset->{$key}{$tag}}[@$order] ];
        }
    }

    return;
}

sub extract_chemical_symbol
{
    my ( $atom ) = @_;

    my $chemical_symbol = exists $atom->{'_atom_site_type_symbol'} ?
                                 $atom->{'_atom_site_type_symbol'} :
                                 $atom->{'_atom_site_label'};

    if ($chemical_symbol =~ m/^([A-Za-z]{1,2})/) {
        $chemical_symbol = $1;
    }

    return $chemical_symbol;
}

sub check_chemical_symbols
{
    my ( $atoms ) = @_;

    my $symbol;
    for my $atom (@{$atoms}) {
        $symbol = extract_chemical_symbol($atom);
        if (!exists $COD::AtomProperties::atoms{$symbol}) {
            warn "unknown chemical type '$symbol'\n";
        }
    }

    return;
}
