#!/usr/local/bin/perl

use strict;
use warnings;

use XML::Parser;
use Text::Wrap;

our $VERSION = "1.1.0";

##### Configuration Options #####

# Don't forget to set perl path on first line!!

# Run script from command line (0) or as CGI script (1)
my $CGI_mode = 1;

# This must be set to your home directory, with trailing slash,
# for CGI mode. Otherwise, it doesn't matter but do not comment it out.
my $xmlroot = "/home/mydir/pub_html/";

# Set this to the column width you wish to wrap text (72 is standard)
$Text::Wrap::columns = 72;

# OUTPUT CONFIGURATION
# Default settings are fine for most standard HTML tags. Change
# the defaults only if you need to add tags or want to alter the
# output.

# Define all the BLOCK-LEVEL element tags you wish to output.
# Anything undefined will be ignored. Set each one to an empty string.
# List items require some special handling. I recommend that you leave
# 'li' defined as is, and do not define 'ul' or 'ol'. Feel free to
# experiment though.
my %text_buffer = ( p=>'',li=>'',h1=>'', h2=>'',h3=>'',h4=>'');

# Define the string you want output for the first line of the block
# NOTE: You must use a '*' character for list items!! The program will
# replace the bullet (*) with numbers for numbered lists.
my %init_str = (
    p  => "\n ",
    li => " * ",
    h1 => "\n",
    h2 => "\n ",
    h3 => "\n ",
    h4 => "\n "
);

# Define the string you want for subsequent lines of the block
my %other_str = (
    p  => ' ',
    li => ' ',
    h1 => '',
    h2 => ' ',
    h3 => ' ',
    h4 => ' '
);

# Define the strings (usually headings) that you want set to uppercase
my @uctags = ('h1','h2');

### DONE ########################

my $xmlsource;    # path of the XHTML document to convert
my $out_fh;       # output file handle (command-line mode only)

if ($CGI_mode) {
    print "Content-type: text/plain\n\n";

    # The document path arrives as the whole query string, wrapped in
    # braces.  Take it from the capture so that a missing or malformed
    # query is rejected instead of silently resolving to $xmlroot itself.
    my $query = defined $ENV{'QUERY_STRING'} ? $ENV{'QUERY_STRING'} : '';
    my ($xmlfile) = $query =~ /\A\{([^\}]+)\}\z/;
    if ( !defined $xmlfile ) {
        print "No source file specified.";
        exit;
    }

    # Refuse anything that could escape $xmlroot.
    if ( $xmlfile =~ /\.\./ || $xmlfile =~ /^\// ) {
        print "Dangerous path statement supplied!";
        exit;
    }

    $xmlsource = $xmlroot . $xmlfile;

    # -f rather than -e: a directory exists but cannot be parsed.
    if ( !-f $xmlsource ) {
        print "Source file not found.";
        exit;
    }
}
else {
    if ( scalar @ARGV != 2 ) {
        print STDERR "\nUsage: html2txt.pl sourcefile outfile\n\n";
        exit 1;
    }

    my $output;
    ( $xmlsource, $output ) = @ARGV;

    if ( !-e $xmlsource ) {
        die "ERROR: Input file doesn't exist.";
    }

    open( $out_fh, '>', $output )
        or die "ERROR: Can't open output file: $!";

    # All handler output goes through plain print, so redirect the
    # default output handle to the file.
    select $out_fh;
}

# Parser state shared by the Stream-style handlers below.  These must be
# initialised before parsefile() runs.
my $current_tag;    # configured block-level tag currently collecting text
my @list_stack;     # one { ordered, count } record per open <ol>/<ul>

my $parser = XML::Parser->new( Style => 'Stream', NoLWP => 1 );
$parser->parsefile($xmlsource);

if ( defined $out_fh ) {
    select STDOUT;

    # Buffered write errors only surface at close time.
    close $out_fh or die "ERROR: Can't write output file: $!";
}

# Stream handler: receives character data in $_.  Whitespace runs are
# collapsed to a single space and the text is appended to the buffer of
# the block-level tag currently being collected; text outside any
# configured tag is dropped.
sub Text {
    return if !defined $current_tag;

    ( my $text = $_ ) =~ s/\s+/ /g;
    $text_buffer{$current_tag} .= $text;
    return;
}

# Stream handler: called with (expat, tag name) for every start tag.
# Starts collecting text for configured block-level tags and records the
# kind of list being opened.
sub StartTag {
    my ( undef, $tag ) = @_;

    if ( defined $text_buffer{$tag} ) {
        $current_tag = $tag;
    }

    # Every list gets its own record, so numbering restarts for each
    # <ol> and nested lists do not disturb the enclosing one.
    if ( $tag eq 'ol' ) {
        push @list_stack, { ordered => 1, count => 0 };
    }
    elsif ( $tag eq 'ul' ) {
        push @list_stack, { ordered => 0, count => 0 };
    }
    return;
}

# Stream handler: called with (expat, tag name) for every end tag.
# Prints the wrapped text collected for a configured block-level tag and
# resets its buffer; closes the innermost list on </ol> or </ul>.
sub EndTag {
    my ( undef, $tag ) = @_;

    if ( defined $text_buffer{$tag} ) {
        my $text = $text_buffer{$tag};
        if ( grep { $_ eq $tag } @uctags ) {
            $text = uc $text;
        }

        my $first_indent = defined $init_str{$tag}  ? $init_str{$tag}  : '';
        my $next_indent  = defined $other_str{$tag} ? $other_str{$tag} : '';

        # Inside an ordered list the bullet in the 'li' template becomes
        # the item number.  The prefix is built from the untouched
        # template each time, so %init_str is never modified.
        if ( $tag eq 'li' && @list_stack && $list_stack[-1]{ordered} ) {
            my $number = ++$list_stack[-1]{count};
            $first_indent =~ s/ ?\*/${number}./;
        }

        print wrap( $first_indent, $next_indent, $text ), "\n";

        $text_buffer{$tag} = '';
        $current_tag = undef;
    }

    if ( $tag eq 'ol' || $tag eq 'ul' ) {
        pop @list_stack;
    }
    return;
}

# Defined as no-ops so the Stream style finds a handler for these events.
sub StartDocument { }
sub EndDocument   { }

__END__

=head1 NAME

xhtml2txt

=head1 README

Generates formatted plain-text from valid XHTML documents using
XML-based parsing methods. Can be run from the command line with
output to file, or on-the-fly as a CGI script.

Complete documentation at http://www.sfu.ca/~ajdelore/xhtml2txt/

=head1 PREREQUISITES

This script runs under C<strict>, and requires C<XML::Parser>,
and C<Text::Wrap>.

=head1 SCRIPT CATEGORIES

C,C

=head1 AUTHOR

Anthony DeLorenzo (ajdelore@sfu.ca)

=cut