#!/usr/bin/perl -w

use 5.6.1;

# Exactly two arguments (input file, output file) are required.  A first
# argument of "-h" is treated as a request for the usage message, which is
# printed to STDERR before exiting with status 1.
unless (@ARGV == 2 && $ARGV[0] ne '-h') {
    print STDERR "usage: $0 <infile> <outfile>\n";
    exit 1;
}

###
### HTML 4.0 character entities, except for the Greek and Mathematical
### characters.  Onindex(8) recognizes only part of them as
### alphanumeric characters.
###
### Keys are entity names without the leading "&" and trailing ";".
### Names are case sensitive ("Agrave" and "agrave" are different
### characters).  Values are decimal code points kept as strings; the
### table does not hold the actual characters, entity_to_char() lets
### Perl do the chr() translation.
###
my $char_entities = {
    'quot'   => '34',   'amp'    => '38',   'lt'     => '60',
    'gt'     => '62',   'nbsp'   => '160',  'iexcl'  => '161',
    'cent'   => '162',  'pound'  => '163',  'curren' => '164',
    'yen'    => '165',  'brvbar' => '166',  'sect'   => '167',
    'uml'    => '168',  'copy'   => '169',  'ordf'   => '170',
    'laquo'  => '171',  'not'    => '172',  'shy'    => '173',
    'reg'    => '174',  'macr'   => '175',  'deg'    => '176',
    'plusmn' => '177',  'sup2'   => '178',  'sup3'   => '179',
    'acute'  => '180',  'micro'  => '181',  'para'   => '182',
    'middot' => '183',  'cedil'  => '184',  'sup1'   => '185',
    'ordm'   => '186',  'raquo'  => '187',  'frac14' => '188',
    'frac12' => '189',  'frac34' => '190',  'iquest' => '191',
    'Agrave' => '192',  'Aacute' => '193',  'Acirc'  => '194',
    'Atilde' => '195',  'Auml'   => '196',  'Aring'  => '197',
    'AElig'  => '198',  'Ccedil' => '199',  'Egrave' => '200',
    'Eacute' => '201',  'Ecirc'  => '202',  'Euml'   => '203',
    'Igrave' => '204',  'Iacute' => '205',  'Icirc'  => '206',
    'Iuml'   => '207',  'ETH'    => '208',  'Ntilde' => '209',
    'Ograve' => '210',  'Oacute' => '211',  'Ocirc'  => '212',
    'Otilde' => '213',  'Ouml'   => '214',  'times'  => '215',
    'Oslash' => '216',  'Ugrave' => '217',  'Uacute' => '218',
    'Ucirc'  => '219',  'Uuml'   => '220',  'Yacute' => '221',
    'THORN'  => '222',  'szlig'  => '223',  'agrave' => '224',
    'aacute' => '225',  'acirc'  => '226',  'atilde' => '227',
    'auml'   => '228',  'aring'  => '229',  'aelig'  => '230',
    'ccedil' => '231',  'egrave' => '232',  'eacute' => '233',
    'ecirc'  => '234',  'euml'   => '235',  'igrave' => '236',
    'iacute' => '237',  'icirc'  => '238',  'iuml'   => '239',
    'eth'    => '240',  'ntilde' => '241',  'ograve' => '242',
    'oacute' => '243',  'ocirc'  => '244',  'otilde' => '245',
    'ouml'   => '246',  'divide' => '247',  'oslash' => '248',
    'ugrave' => '249',  'uacute' => '250',  'ucirc'  => '251',
    'uuml'   => '252',  'yacute' => '253',  'thorn'  => '254',
    'yuml'   => '255',  'OElig'  => '338',  'oelig'  => '339',
    'Scaron' => '352',  'scaron' => '353',  'Yuml'   => '376',
    'circ'   => '710',  'tilde'  => '732',  'ensp'   => '8194',
    'emsp'   => '8195', 'thinsp' => '8201', 'zwnj'   => '8204',
    'zwj'    => '8205', 'lrm'    => '8206', 'rlm'    => '8207',
    'ndash'  => '8211', 'mdash'  => '8212', 'lsquo'  => '8216',
    'rsquo'  => '8217', 'sbquo'  => '8218', 'ldquo'  => '8220',
    'rdquo'  => '8221', 'bdquo'  => '8222', 'dagger' => '8224',
    'Dagger' => '8225', 'permil' => '8240',
    'lsaquo' => '8249', 'rsaquo' => '8250', 'euro'   => '8364',
};

###
### Translate one character entity reference into the character it
### stands for.
###
###   $_[0] - the entity text without its leading "&" and trailing ";":
###           a name looked up in $char_entities (for example "amp"),
###           a decimal reference ("#169"), or a hexadecimal reference
###           ("#xA9" or "#XA9").
###
### Returns the character, or a single space when the entity is empty,
### unknown, or a malformed numeric reference.
###
### Note - Not tested with Perl earlier than 5.6.1.
###
sub entity_to_char {
    my ($entity) = @_;
    return ' ' unless $entity;

    # Hexadecimal numeric reference.  Without this branch "#x41" would
    # reach chr() as the string "x41", which numifies to 0.
    if ($entity =~ /^\#[xX]([0-9a-fA-F]+)\z/) {
        return chr(hex($1));
    }

    # Decimal numeric reference.
    if ($entity =~ /^\#([0-9]+)\z/) {
        return chr($1);
    }

    # Anything else starting with "#" is malformed; treat it like an
    # unknown entity instead of turning it into a NUL character.
    return ' ' if $entity =~ /^\#/;

    # Named entity: the table stores the decimal code point.
    my $code_point = $char_entities->{$entity};
    return chr($code_point) if $code_point;
    return ' ';
}

###
### Main program: read the HTML input file, remove style sheets and
### tags, decode character entities, and write the remaining text to
### the output file.
###

# Read the entire input file at once.  $/ is localized so the slurp
# setting does not leak into the rest of the program, and the
# three-argument open keeps characters in the file name from being
# interpreted as an open mode.
my $txt = do {
    local $/;
    open my $in, '<', $ARGV[0] or die "$0 $ARGV[0]: $!";
    my $content = <$in>;
    close $in;
    $content;
};
$txt = '' unless defined $txt;

# Remove style sheets.  The match is non-greedy so that text between
# two separate <style> elements is kept.
$txt =~ s{<style\b.*?</style>}{}gis;

# Remove HTML tags.  Each tag is replaced with white space in case the
# tag doesn't include surrounding space.
$txt =~ s/<.*?>/ /gs;

# Replace character entities in a single left-to-right pass.  Each
# reference is decoded exactly once, so "&amp;lt;" becomes "&lt;" and
# not "<", and the match is case sensitive so "&Agrave;" and "&agrave;"
# stay distinct.
$txt =~ s/&(\#?\w+);/entity_to_char($1)/ge;

open my $out, '>', $ARGV[1] or die "$0 $ARGV[1]: $!";
{
    # Entities above code point 255 decode to wide characters, and
    # printing those to a handle with no encoding layer can raise a
    # "Wide character" warning -- presumably why warnings were
    # silenced around the output here.
    no warnings;
    print {$out} $txt;
}
# Buffered write errors only surface at close.
close $out or die "$0 $ARGV[1]: $!";

exit 0;
