#!/usr/bin/perl
#
# split-log-into-buckets-cached x:output [y:output2 ...] input-log
#
# Split a mass-check log into several similarly-sized buckets, evenly
# taking messages from all checked corpora and preserving comments,
# writing output to the files listed on the command line.
# It does this by assigning each line to one of the buckets
# as the line is read.
# 
# Each output file must be listed, prefixed with the number of buckets
# ("n") that file should receive; so for example "9:file" makes 1 file
# contain 9 buckets.
#
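# This variant assigns lines to buckets randomly (as per -random, using a
# fixed seed) but caches results: the outputs are only regenerated when
# one of them is missing or is not newer than the input log.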

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Command line.  An argument of the form "N:FILE" declares an output file
# that receives N buckets; any other argument names the input log (if
# several are given, the last one wins).

my $input;
my @outs = ();
foreach my $arg (@ARGV) {
  if ($arg =~ /^(\d+):(.*)$/) {
    my ($count, $file) = ($1, $2);
    print "Creating $file with $count buckets\n";
    push (@outs, { c => $count, out => $file });
  }
  else {
    print "Reading from $arg\n";
    $input = $arg;
  }
}

# An input log and at least one output are both required; without outputs
# there is nothing to build (and nothing that could be "up-to-date").
die "usage\n" unless (defined($input) && length($input) && @outs);

my $numbuckets = 0;
$numbuckets += $_->{c} foreach @outs;
die "usage: at least one bucket is required\n" unless ($numbuckets > 0);

# ---------------------------------------------------------------------------
# Cache check.  The outputs count as up-to-date only if every one of them
# exists and has a modification time strictly later than the input's.
# Element 9 of stat() is the mtime; it is undef when stat() fails.

my $input_mtime = (stat($input))[9];
my $rebuild = 0;
foreach my $out (@outs) {
  my $output_mtime = (stat($out->{out}))[9];
  unless ($input_mtime && $output_mtime && $input_mtime < $output_mtime) {
    $rebuild = 1;
  }
}

if ($rebuild == 0) {
  print "Existing outputs are up-to-date\n";
  exit;
}

# ---------------------------------------------------------------------------
# Rebuild.  Open the input first, so that an unreadable input does not
# leave freshly created temporary files behind.

open (my $in, '<', $input) or die "cannot open $input: $!";

# Distinct output names, in command-line order; a name listed more than
# once shares a single temporary file and a single handle.
my %seen = ();
my @files = grep { !$seen{$_}++ } map { $_->{out} } @outs;

# Remove temporary files left over from an interrupted run; they are
# opened in append mode below, so stale content would otherwise end up
# in the final output.
foreach my $file (@files) {
  my $tmp = $file.".tmp";
  next unless -e $tmp;
  unlink ($tmp) or die "cannot remove stale $tmp: $!";
}

my %handle_for = ();    # output name => handle on its temporary file
foreach my $file (@files) {
  my $tmp = $file.".tmp";
  # exploit the auto-syncing semantics of >>: with autoflush on, every
  # line is appended at the current end of the file
  open ($handle_for{$file}, '>>', $tmp) or die "cannot write to $tmp: $!";
  my $previous = select ($handle_for{$file});
  $| = 1;
  select ($previous);
}

# One slot per bucket, indexed 0 .. $numbuckets-1.  An output declared
# with N buckets owns N slots, so it receives N shares of the input.
my @buckets = ();
foreach my $out (@outs) {
  push (@buckets, ($handle_for{$out->{out}}) x $out->{c});
}

srand (1);      # explicitly static seed, for reproducibility

while (my $line = <$in>) {
  # int(rand()*$numbuckets) is in 0 .. $numbuckets-1, matching @buckets
  my $fh = $buckets[int(rand()*$numbuckets)];
  print {$fh} $line or die "cannot write bucket output: $!";
}
close ($in);

foreach my $file (@files) {
  close ($handle_for{$file}) or die "cannot close $file.tmp: $!";
}

# Move the finished temporary files into place only after everything has
# been written and closed successfully.
foreach my $file (@files) {
  my $tmp = $file.".tmp";
  rename ($tmp, $file) or die "cannot rename $tmp to $file: $!";
}

