#!/usr/local/bin/perl -w

=head1 NAME

count2huge.pl - Convert the output of count.pl into the sorted order produced by huge-count.pl

=head1 SYNOPSIS

count2huge.pl takes the output of count.pl and sorts the bigrams in
alphabetical order. The resulting bigrams are in the same order as the
output of huge-count.pl.

=head1 DESCRIPTION

count2huge.pl converts the output of count.pl into the output that
huge-count.pl produces for the same input text and options. We do this
because the vector relatedness measure of UMLS-Similarity requires that
bigrams which start with the same term be grouped together. Having the
bigrams sorted can also reduce the processing time. For details, please
see vector-input.pl of UMLS-Similarity.

See perldoc count2huge.pl 

=head1 USAGE

count2huge.pl [OPTION] SOURCE DESTINATIONDIR

=head1 INPUT

=head2 Required Arguments:

=head3 SOURCE

Input to count2huge.pl is a single flat bigrams list file generated by count.pl. 

=head3 DESTINATIONDIR 

count2huge.pl sorts the bigrams in alphabetical order. The result file
count2huge.output is written to the destination directory. It has the
same content as the output of huge-count.pl run on the same text with the
same options as count.pl. The destination directory must not already
exist; count2huge.pl creates it.

=head4 --split N

huge-split will divide the output bigrams token list generated by count.pl.
Each part created with --split N will contain N lines. The value of N should
be chosen such that huge-sort.pl can be run efficiently on any part
containing N lines of the file that contains all the bigrams.

We suggest that N is equal to the number of KB of memory you have. If the
computer has 8 GB RAM, which is 8,000,000 KB, N should be set to 8000000. 

=head3 Other Options:

=head4 --help

Displays the help information.

=head4 --version

Displays the version information.

=head1 AUTHOR

Ying Liu, University of Minnesota, Twin Cities.
liux0395@umn.edu

=head1 COPYRIGHT

Copyright (C) 2009, Ying Liu 

Ying Liu, University of Minnesota, Twin Cities.
liux0395@umn.edu

Ted Pedersen, University of Minnesota, Duluth.
tpederse@umn.edu


This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

=cut


###############################################################################
#-----------------------------------------------------------------------------
#                              Start of program
#-----------------------------------------------------------------------------
###############################################################################

use File::Basename;
use Getopt::Long;

# Command-line handling.
#
# Expected invocation: count2huge.pl [OPTIONS] SOURCE DESTINATIONDIR
#   SOURCE          file to read; it is opened here on the COUNT handle and
#                   consumed by the code that follows.
#   DESTINATIONDIR  directory that must not exist yet; it is created here.
#   --split N       optional number of lines per split file (N >= 1).

# With no command-line arguments at all there is nothing to do: print the
# short usage note and stop.
if ( @ARGV == 0 )
{
    minimalUsageNotes();
    exit;
}

# GetOptions (called without a hash) stores every recognised option in the
# package variable $opt_<name>. Declaring them with "our" documents that and
# keeps -w from reporting them as used only once.
our ($opt_help, $opt_version, $opt_split);

# GetOptions returns false for unknown or malformed options and has already
# reported the problem on STDERR, so only the usage note is added here.
if (!GetOptions ("help","version","split=i"))
{
    minimalUsageNotes();
    exit 1;
}

if (defined $opt_help)
{
    showHelp();
    exit;
}

if (defined $opt_version)
{
    showVersion();
    exit;
}

# A split size below 1 can never be reached by the line counter of the split
# stage, so reject it before any file or directory is touched.
if (defined $opt_split and $opt_split < 1)
{
    print STDERR "Error($0): the value of --split must be an integer N >= 1.\n";
    askHelp();
    exit 1;
}

#input bigrams by count.pl
my $input = $ARGV[0];
if (!defined $input)
{
    minimalUsageNotes();
    exit;
}

# Three-argument open: characters such as '>' or '|' in the file name are
# treated as part of the name, never as an open mode.
open(COUNT, '<', $input) or die("Error: cannot open file '$input'\n");

#output bigrams by count2huge.pl
my $output_dir = $ARGV[1];

if (!defined $output_dir)
{
    minimalUsageNotes();
    exit;
}
elsif (-d $output_dir)
{
    print "$output_dir is already exist, please use a new directory name.\n";
    minimalUsageNotes();
    exit;
}
else
{
    # Built-in mkdir instead of a shell command: no quoting problems with
    # unusual directory names, and a failure is reported instead of ignored.
    mkdir($output_dir)
        or die("Error: cannot create directory '$output_dir': $!\n");
}

if (!defined $opt_split)
{
    print STDERR "Warning($0): You do not specify the split size. count2huge.pl\n";
    print STDERR "does not split the whole bigrams file into smaller pieces.\n";
}

# generate the split files
#
# The first line of the input is read as the total number of bigrams; every
# following line is copied, unchanged, into numbered part files
# "$output_dir/<input file name>.<k>" holding at most $split lines each.
# The paths of the part files are collected in @split_files.
my $total = <COUNT>;
defined $total or die("Error: input file '$input' is empty.\n");
chomp $total;

# Without --split the total is used as the part size, so that everything
# ends up in a single part.
my $split = defined $opt_split ? $opt_split : $total;

# Only the file name of the input is used for the part files: the input may
# have been given with a directory component, which must not leak into the
# path below the destination directory.
my $input_name = basename($input);

my $sub_i = 1;
my $sub_file = "$output_dir/$input_name" . ".$sub_i";
open(SUB, '>', $sub_file) or die("Error: cannot open file '$sub_file' for output index.\n");
my @split_files = ($sub_file);

# split the bigrams list of count.pl
my $split_num = 0;
while (my $line = <COUNT>)
{
    # print, not printf: the line is data and may contain '%' characters.
    print SUB $line;
    $split_num++;

    if ($split_num == $split)
    {
        # The current part is full. Stop if the input is exhausted, so that
        # no empty trailing part is created.
        last if eof(COUNT);

        close(SUB) or die("Error: cannot close file '$sub_file': $!\n");

        $sub_i++;
        $split_num = 0;
        $sub_file = "$output_dir/$input_name" . ".$sub_i";
        open(SUB, '>', $sub_file) or die("Error: cannot open file '$sub_file' for output index.\n");
        push (@split_files, $sub_file);
    }
}
close(SUB) or die("Error: cannot close file '$sub_file': $!\n");
close COUNT;

# sort each split file
#
# Every file in @split_files is read completely, its lines are sorted by
# their "word1<>word2<>" prefix and written to "<split file>-sorted"; the
# split file itself is then removed. The sorted files are collected, in
# order, in @sort_files.
my @sort_files = ();
foreach my $f (@split_files)
{
    open(FILE, '<', $f) or die("Error: cannot open file '$f'\n");

    # "word1<>word2<>" => remainder of the line after the second "<>"
    my %bigrams = ();
    while (my $line = <FILE>)
    {
        chomp ($line);
        next if $line !~ /\S/;    # a blank line carries no bigram

        my @words = split(/<>/, $line);
        my $bigram = "$words[0]<>$words[1]<>";
        $bigrams{$bigram} = $words[2];
    }
    close FILE;

    # sort the bigrams in the alphabet order
    my $sorted = "$f" . "-sorted";
    open(SORT, '>', $sorted) or die("Error: cannot open file '$sorted'\n");
    push (@sort_files, $sorted);

    # If there is only one split file, its sorted version becomes the final
    # output and must start with the total number of bigrams. The number of
    # split files is the reliable test: comparing --split with $total misses
    # the case where the input has no more than N lines although N < $total.
    if (@split_files == 1)
    {
        print SORT "$total\n";
    }

    # print out the sorted bigrams (print, not printf: the text is data and
    # may contain '%' characters)
    foreach my $bigram (sort (keys %bigrams))
    {
        print SORT "$bigram$bigrams{$bigram}\n";
    }
    close(SORT) or die("Error: cannot close file '$sorted': $!\n");

    unlink($f) or warn("Warning: cannot remove file '$f': $!\n");
}

# only one file, so, no combining files: the single sorted file is the
# result and only has to be given its final name.
if (@sort_files == 1)
{
    my $output = "count2huge.output";

    # Built-in rename instead of "mv" through the shell: works for directory
    # names with spaces or shell metacharacters, and a failure is reported.
    rename($sort_files[0], "$output_dir/$output")
        or die("Error: cannot rename '$sort_files[0]' to '$output_dir/$output': $!\n");
}

# combine bigrams.
#
# As long as more than one sorted file is left, the two files at the front of
# @sort_files are merged into "$output_dir/merge.<i>", which is appended to
# the end of @sort_files; the two inputs are removed. Lines are compared as
# whole strings. While merging, the frequencies of all lines are added up in
# $bigramTotal. When a single file remains, $bigramTotal is compared with
# $total and the file is rewritten with that number as its first line and
# renamed to "$output_dir/count2huge.output".
my $i = 0;
my $bigramTotal = 0;

# Returns the next line of the given handle without its line ending, or
# undef at end of file.
my $next_line = sub
{
    my ($fh) = @_;
    my $line = <$fh>;
    chomp($line) if defined $line;
    return $line;
};

# Returns the frequency stored on a bigram line: the first whitespace
# separated value after the second "<>", or 0 if the line has no such field.
my $bigram_frequency = sub
{
    my ($line) = @_;
    my @fields = split(/<>/, $line);
    return 0 unless defined $fields[2];
    my @counts = split(' ', $fields[2]);
    return defined $counts[0] ? $counts[0] : 0;
};

while (@sort_files > 1)
{
    $i++;
    $bigramTotal = 0;

    my $file1 = shift @sort_files;
    my $file2 = shift @sort_files;
    open(FILE1, '<', $file1) or die("Error: cannot open file '$file1'\n");
    open(FILE2, '<', $file2) or die("Error: cannot open file '$file2'\n");

    my $merge = "$output_dir" . "/merge." . "$i";
    open(MERGE, '>', $merge) or die("Error: cannot open file '$merge'\n");

    # Standard two-way merge: keep one pending line per input and always
    # write the smaller one; undef marks an exhausted input.
    my $line1 = $next_line->(\*FILE1);
    my $line2 = $next_line->(\*FILE2);

    while (defined $line1 or defined $line2)
    {
        my $take_first;
        if (!defined $line2)
        {
            $take_first = 1;
        }
        elsif (!defined $line1)
        {
            $take_first = 0;
        }
        elsif ($line1 eq $line2)
        {
            # The same line in both inputs means the input data is not a
            # valid bigram list; stop with a failure status.
            die("two bigrams are the same, input data is wrong!\n");
        }
        else
        {
            $take_first = ($line1 lt $line2) ? 1 : 0;
        }

        # print, not printf: the line is data and may contain '%' characters.
        if ($take_first)
        {
            print MERGE "$line1\n";
            $bigramTotal += $bigram_frequency->($line1);
            $line1 = $next_line->(\*FILE1);
        }
        else
        {
            print MERGE "$line2\n";
            $bigramTotal += $bigram_frequency->($line2);
            $line2 = $next_line->(\*FILE2);
        }
    } # end of merge two files

    close FILE1;
    close FILE2;
    close(MERGE) or die("Error: cannot close file '$merge': $!\n");

    push (@sort_files, $merge);
    unlink($file1) or warn("Warning: cannot remove file '$file1': $!\n");
    unlink($file2) or warn("Warning: cannot remove file '$file2': $!\n");

    # print the total number of bigrams
    if (@sort_files == 1)
    {
        # check the total number of bigrams before producing the final file
        if ($bigramTotal != $total)
        {
            die("Total number of bigrams $bigramTotal is different from $total !\n");
        }

        open(MERGE, '<', $merge) or die("Error: cannot open file '$merge'\n");

        $i++;
        my $final = "$output_dir" . "/merge." . "$i";
        open(FINAL, '>', $final) or die("Error: cannot open file '$final'\n");

        # The total has to be the first line, so the merged file is copied
        # behind it.
        print FINAL "$bigramTotal\n";
        while (my $line = <MERGE>)
        {
            chomp ($line);
            print FINAL "$line\n";
        }
        close MERGE;
        close(FINAL) or die("Error: cannot close file '$final': $!\n");

        unlink($merge) or warn("Warning: cannot remove file '$merge': $!\n");

        my $output = "count2huge.output";
        rename($final, "$output_dir/$output")
            or die("Error: cannot rename '$final' to '$output_dir/$output': $!\n");
    }
} # end of merge all files


print STDERR "Check the output in $output_dir/count2huge.output.\n";
exit;

#-----------------------------------------------------------------------------
#                       User Defined Function Definitions
#-----------------------------------------------------------------------------

# Writes the one-line usage summary to STDERR and then, through askHelp(),
# the hint on how to obtain the full help text. Takes no arguments.
sub minimalUsageNotes
{
    my $usage_line = "Usage: count2huge.pl [OPTIONS] SOURCE DESTINATIONDIR\n";

    print {*STDERR} $usage_line;
    askHelp();
}

# Writes a single line to STDERR telling the user how to get the help text.
# Takes no arguments.
sub askHelp
{
    print {*STDERR} "Type count2huge.pl --help for help.\n";
}

# Prints the full help text (usage line, short description and the list of
# options) to the currently selected output handle. Takes no arguments.
sub showHelp
{
    # The pieces of the help text, in output order.
    my @help_text = (
        "\n",
        "Usage: count2huge.pl [OPTIONS] SOURCE DESTINATIONDIR\n\n",
        "count2huge.pl takes the bigrams file generated by\n",
        "count.pl as input, and sort the bigrams in the alphabet\n",
        "order. The bigrams list has the same order with the output\n",
        "of huge-count.pl. \n\n",

        "OPTIONS:\n\n",

        "  --split N          Split the bigrams list into smaller pieces. Each file has\n",
        "                     N bigrams. N is an integer (N>=1). \n",

        "  --help             Prints this help message.\n",
        "  --version          Prints this version message.\n",
    );

    print join('', @help_text);
}

# Writes the version number, copyright line and date of the last update to
# STDERR. Takes no arguments.
sub showVersion
{
    my @version_lines = (
        "count2huge.pl      -        version 0.1\n",
        "Copyright (C) 2010, Ying Liu\n",
        "Date of Last Update 04/26/2010\n",
    );

    print {*STDERR} join('', @version_lines);
}

