#!/usr/local/bin/perl

use strict;

my $VERSION = "1.1.0";

##### Configuration Options #####

# Don't forget to set the perl path on the first line (the #! line)!!

# Run mode: 0 = run from the command line, 1 = run as a CGI script.
# In CGI mode the source file name is taken from the query string; in
# command-line mode the source and output files are given as arguments.
my $CGI_mode = 1;

# Directory that CGI requests are resolved against. The requested file
# name is appended directly to this string, so it must end with a
# trailing slash. The value only matters in CGI mode, but do not comment
# the line out: the script refers to the variable either way and will
# not compile under "use strict" without this declaration.
my $xmlroot = "/home/mydir/pub_html/";

# Column width at which output text is wrapped (72 is standard).
$Text::Wrap::columns = 72;

# OUTPUT CONFIGURATION
# Default settings are fine for most standard HTML tags. Change
# the defaults only if you need to add tags or want to alter the 
# output.

# Define all the BLOCK-LEVEL element tags you wish to output. 
# Anything undefined will be ignored. Set each one to an empty string.
# List items require some special handling. I recommend that you leave
# 'li' defined as is, and do not define 'ul' or 'ol'. Feel free to 
# experiment though.

# Every tag listed here gets its own text buffer, initially empty.
my %text_buffer = map { $_ => '' } qw(p li h1 h2 h3 h4);

# Line prefixes used when a block is wrapped. Each row below reads:
#
#   tag => prefix for the first line, prefix for every following line
#
# NOTE: The first-line prefix for list items must use a '*' character!!
# The program will replace the bullet (*) with numbers for numbered lists.

my (%init_str, %other_str);

foreach my $row (
    [ p  => "\n    ",  '    '    ],
    [ li => "     * ", '       ' ],
    [ h1 => "\n",      ''        ],
    [ h2 => "\n  ",    ' '       ],
    [ h3 => "\n  ",    '  '      ],
    [ h4 => "\n    ",  '    '    ],
) {
    my ($tag, $first_line, $later_lines) = @{$row};
    $init_str{$tag}  = $first_line;
    $other_str{$tag} = $later_lines;
}

# Tags (usually headings) whose text is converted to uppercase on output
my @uctags = qw(h1 h2);

### DONE ########################

use XML::Parser;
use Text::Wrap; 

# Work out which XHTML file to read and where the text output goes.
#
#   CGI mode:          the file is named in the query string as {path}
#                      and resolved relative to $xmlroot; output goes to
#                      the default handle after a text/plain header.
#   Command-line mode: expects "sourcefile outfile"; outfile is opened
#                      and made the default handle for print.
my ($xmlsource);

if ($CGI_mode) {
  print "Content-type: text/plain\n\n";

  # The request must have the form {relative/path}. Anything else is
  # rejected here; previously a non-matching query left the file name
  # undefined and $xmlroot itself was handed to the parser.
  my $query = defined $ENV{'QUERY_STRING'} ? $ENV{'QUERY_STRING'} : '';
  my ($xmlfile) = $query =~ /^\{([^\}]+)\}\z/;
  if ( ! defined $xmlfile ) {
    print "No source file specified."; exit;
  }

  # Refuse parent-directory references, absolute paths and NUL bytes so
  # that a request cannot point outside $xmlroot.
  if ( $xmlfile =~ /\.\./ || $xmlfile =~ /^\// || $xmlfile =~ /\0/ ) {
    print "Dangerous path statement supplied!"; exit;
  }

  $xmlsource = $xmlroot.$xmlfile;
  # -f rather than -e: a directory exists too, but cannot be parsed.
  if ( ! -f $xmlsource ) {
    print "Source file not found."; exit;
  }
}

else {
  if ( scalar @ARGV != 2 ) {
    print STDERR "\nUsage: $0 sourcefile outfile\n\n";
    exit 1;   # a usage error is a failure, not a success
  }
  my $output;
  ($xmlsource, $output) = @ARGV;
  if ( ! -e $xmlsource ) {
    die "ERROR: Input file doesn't exist.";
  }
  # Three-argument open: the file name is never scanned for mode
  # characters, whatever it contains.
  open ( OUTPUT, '>', $output) or die "ERROR: Can't open output file: $!";
  select OUTPUT;
}

# State shared by the StartTag / EndTag / Text handler subs.
my $current_tag = undef;   # block-level tag whose text is being collected
my $is_ol = 0;             # true while list items are to be numbered
my $ol_counter = 0;        # number given to the latest ordered-list item

# Direct method call instead of the indirect-object form "new XML::Parser",
# which perl can mis-parse.
my $parser = XML::Parser->new ( Style => 'Stream',
                                NoLWP => 1 );

$parser->parsefile ($xmlsource);

# Buffered write errors (a full disk, for example) only surface when the
# handle is closed, so check the close in command-line mode, where the
# OUTPUT handle was opened for the result file.
if ( ! $CGI_mode ) {
  close ( OUTPUT ) or die "ERROR: Can't write output file: $!";
}

# Handler for character data; the text to process is read from $_.
# The text is appended to the buffer of the block-level tag currently
# being collected. Text seen while no such tag is open is discarded.
sub Text {
     return unless defined $current_tag;

     my $chunk = $_;

     # Collapse every run of whitespace (newlines included) to one space.
     $chunk =~ s/\s+/ /g;

     # Whitespace between the opening tag and the first word (typical of
     # indented markup) would otherwise show up as a stray space right
     # after the line prefix. Spaces between later chunks are kept.
     $chunk =~ s/^ // if $text_buffer{$current_tag} eq '';

     $text_buffer{$current_tag} .= $chunk;
     return;
}

# Handler for an opening tag.
#   first argument  - ignored
#   second argument - the element name
# Starts text collection when the tag has an entry in %text_buffer, and
# switches the list-item prefix between bullet and number when a list
# opens. Nested lists are not tracked; the most recent <ol>/<ul> wins.
sub StartTag {
     my (undef, $tag) = @_;

     if ( defined $text_buffer{$tag} ) {
          $current_tag = $tag;
     }

     if ( $tag eq 'ol' ) {
          $is_ol = 1;
          $ol_counter = 0;
          # Replace the bullet, or the number left behind by an earlier
          # ordered list, so that numbering restarts for this list.
          # Matching only " *" left the old number in place, and every
          # item of a second list was then printed with that stale number.
          $init_str{'li'} =~ s/(?: \*|\d+\.)/$ol_counter\./;
     }
     elsif ( $tag eq 'ul' ) {
          $is_ol = 0;
          # \d+ so that a multi-digit number is removed completely.
          $init_str{'li'} =~ s/\d+\./ \*/;
     }
     return;
}

# Handler for a closing tag.
#   first argument  - ignored
#   second argument - the element name
# When the tag has an entry in %text_buffer, the collected text is
# uppercased if the tag is listed in @uctags, wrapped with the tag's
# prefixes, printed followed by a newline, and the buffer is cleared.
# Closing an ordered list resets the list-item state.
sub EndTag {
     my (undef, $tag) = @_;

     if ( defined $text_buffer{$tag} ) {
          if ( grep { $tag eq $_ } @uctags ) {
               $text_buffer{$tag} = uc $text_buffer{$tag};
          }

          if ( $tag eq 'li' && $is_ol ) {
               # Advance whatever number stands in front of the dot.
               # Matching on the current counter value instead fails as
               # soon as the prefix and the counter disagree.
               $init_str{'li'} =~ s/\d+(?=\.)/++$ol_counter/e;
          }

          my $wrapped = wrap ( $init_str{$tag},
                               $other_str{$tag},
                               $text_buffer{$tag} );
          print $wrapped, "\n";

          $text_buffer{$tag} = '';
          $current_tag = undef;
     }

     if ( $tag eq 'ol' ) {
          # Leave the list state clean: restore the bullet and stop
          # numbering, so that a later list starts from a known prefix.
          $ol_counter = 0;
          $is_ol = 0;
          $init_str{'li'} =~ s/\d+\./ \*/;
     }
     return;
}

# Start-of-document hook (a name XML::Parser's Stream style looks for).
# Nothing needs setting up, so it is deliberately a no-op.
sub StartDocument { return; }
# End-of-document hook (a name XML::Parser's Stream style looks for).
# All output is produced as tags close, so it is deliberately a no-op.
sub EndDocument { return; }

__END__

=head1 NAME

xhtml2txt - generate formatted plain text from XHTML documents

=head1 README

Generates formatted plain-text from valid XHTML documents using XML-based
parsing methods. Can be run from the command line with output to file, 
or on-the-fly as a CGI script.

Complete documentation at http://www.sfu.ca/~ajdelore/xhtml2txt/

=head1 PREREQUISITES

This script runs under C<strict> and requires C<XML::Parser> and
C<Text::Wrap>.

=head1 SCRIPT CATEGORIES

C<CGI>,C<Web>

=head1 AUTHOR

Anthony DeLorenzo (ajdelore@sfu.ca)

=cut