#!/usr/bin/perl

use 5.010;
use strict;
use warnings;
use English qw( -no_match_vars );
use Marpa::UrHTML 'alpha';
use HTML::Tagset;
use Fatal qw(open close);

my $locator = shift;
my $document;
if ($locator =~ /^[a-zA-Z0-9]+[:]/) {
    require WWW::Mechanize;
    my $mech = WWW::Mechanize->new( autocheck => 1 );
    $mech->get( $locator );
    $document = $mech->content;
    undef $mech;
} else {
    local $RS = undef;
    open my $fh, q{<}, $locator;
    $document = <$fh>;
    close $fh;
}

my $urhtml_args = {
    handlers => [
        [   'pre' => sub {
                my @new_line_data      = ();
                my @following_comments = ();
                CHILD:
                for my $child_data ( @{ Marpa::UrHTML::child_data('value') } )
                {
                    next CHILD if not defined( my $value = $child_data->[0] );
                    for my $line_data ( @{$value} ) {
                        given ( $line_data->[0] ) {    # depending on the type
                            when ( [ 'cruft', 'missing start tag', 'missing end tag' ] ) {
                                push @new_line_data,
                                    [
                                    $_, 0,
                                    'following pre',
                                    @{$line_data}[ 3 .. $#{$line_data} ]
                                    ];
                            } ## end when ( [ 'cruft', 'missing start tag' ] )
                        } ## end given
                    } ## end for my $line_data ( @{$value} )
                } ## end for my $child_data ( @{ Marpa::UrHTML::child_data(...)})
                my $original = Marpa::UrHTML::original();
                push @new_line_data, [ 'line', 0, $original ];
                push @new_line_data, @following_comments;
                return \@new_line_data;
            },
        ],
        [   ':CRUFT' => sub {
                my $literal = Marpa::UrHTML::literal();
                my @new_line_data = ( [ 'cruft', 0, 'following', $literal ] );
                $literal =~ s/^\s+//gxms;
                $literal =~ s/\s+$//gxms;
                $literal =~ s/\s+/ /gxms;
                push @new_line_data, [ 'line', 0, $literal ];
                return \@new_line_data;
            },
        ],
        [   q{*} => sub {
                my $tagname = Marpa::UrHTML::tagname();
                my @new_line_data = ();
                my @child_data = @{ Marpa::UrHTML::child_data('token_type,value,original') };
                my $first_child = $child_data[0];
                my $first_content_child = 0;

                if ( defined $first_child->[0] and $first_child->[0] eq 'S' )
                {
                    push @new_line_data, [ 'line', 0, $first_child->[2] ];
                    $first_content_child = 1;
                }
                else {
                    push @new_line_data, [ 'missing start tag', 0, 'following', $tagname ];
                } ## end else [ if ( defined $first_child->[0] and $first_child->[0]...)]

                my $last_child = $child_data[-1];
                my $last_content_child = $#child_data;
                my $end_tag_child;
                if ( defined $last_child->[0] and $last_child->[0] eq 'E' )
                {
                    $end_tag_child = $last_child;
                    $last_content_child -= 1;
                }

                CHILD:
                for my $child_data_ix ( $first_content_child .. $last_content_child ) {
                    my ( $token_type, $value, $original ) =
                        @{ $child_data[$child_data_ix] };
                    if ( defined $value ) {
                        for my $line_data ( @{$value} ) {
                            my ( $type, $indent, @data ) = @{$line_data};
                            push @new_line_data, [ $type, $indent + 1, @data ];
                        }
                        next CHILD;
                    } ## end if ( defined $value )
                    for my $line ( split /\n/xms, $original ) {
                        $line =~ s/^\s+//gxms;
                        $line =~ s/\s+$//gxms;
                        $line =~ s/\s+/ /gxms;
                        push @new_line_data, [ 'line', 1, $line ];
                    }
                } ## end for my $child_data ( @{ Marpa::UrHTML::child_data(...)})

                given (1) {
                    when ( defined $end_tag_child ) {
                        push @new_line_data,
                            [ 'line', 0, $end_tag_child->[2] ];
                    }
                    when ( not $HTML::Tagset::emptyElement{$tagname} ) {
                        push @new_line_data,
                            [ 'missing end tag', 0, 'preceding', $tagname ];
                    }
                } ## end given

                return \@new_line_data;
            },
        ],
        [   ':TOP' => sub {
                my $result = q{};
                CHILD:
                for my $child_data (
                    @{ Marpa::UrHTML::child_data('value,original') } )
                {
                    my ( $value, $original ) = @{$child_data};
                    if ( defined $value ) {
                        LINE: for my $line_data ( @{$value} ) {
                            my $type = shift @{$line_data};
                            my $indent = shift @{$line_data};
                            my $line_prefix = q{  } x $indent;
                            if ( $type eq 'line' ) {
                                my ( $line ) = @{$line_data};
                                next LINE if $line =~ /^\s*$/;
                                $result .= "$line_prefix$line\n";
                                next LINE;
                            } ## end if ( $type eq 'line' )
                            if ( $type eq 'missing start tag' ) {
                                my ( $location, $tagname ) = @{$line_data};
                                given ($location) {
                                    when ('following') {
                                        $result
                                            .= $line_prefix
                                            . qq{<!-- Following start tag is replacement for a missing one -->\n}
                                            . $line_prefix
                                            . "<$tagname>\n";
                                    } ## end when ('following')
                                    when ('following pre') {
                                        $result .= $line_prefix
                                            . qq{<!-- Inside following <pre>, a start tag is missing: <$tagname> -->\n};
                                    }
                                    default {
                                        Carp::croak(
                                            "Internal error: unprovided-for missing start tag location: $_"
                                        );
                                    }
                                } ## end given
                                next LINE;
                            } ## end if ( $type eq 'missing start tag' )
                            if ( $type eq 'missing end tag' ) {
                                my ( $location, $tagname ) = @{$line_data};
                                given ($location) {
                                    when ('preceding') {
                                        $result
                                            .= "$line_prefix</$tagname>\n"
                                            . $line_prefix
                                            . qq{<!-- Preceding end tag is replacement for a missing one -->\n};
                                    } ## end when ('preceding')
                                    when ('following pre') {
                                        $result .= $line_prefix
                                            . qq{<!-- Inside following <pre>, an end tag is missing: <$tagname> -->\n};
                                    }
                                    default {
                                        Carp::croak(
                                            "Internal error: unprovided-for missing end tag location: $_"
                                        );
                                    }
                                } ## end given
                                next LINE;
                            } ## end if ( $type eq 'missing end tag' )
                            if ( $type eq 'cruft' ) {
                                my ( $location, $cruft ) = @{$line_data};
                                given ($location) {
                                    when ('following') {
                                        $result
                                            .= "$line_prefix<!-- Next line is cruft -->\n";
                                    }
                                    when ('following pre') {

                                        # Make sure the cruft quoted inside
                                        # the HTML comment does not
                                        # disrupt the comment.
                                        ( my $safe_cruft = $cruft )
                                            =~ s/--/- -/xms;
                                        $safe_cruft
                                            =~ s/^/  $line_prefix/gxms;
                                        $result
                                            .= qq{$line_prefix<!-- Inside the following <pre>, there is this cruft:\n}
                                            . qq{$safe_cruft\n}
                                            . qq{$line_prefix-->\n};
                                    } ## end when ('following pre')
                                    default {
                                        Carp::croak(
                                            "Internal error: unprovided-for cruft location: $_"
                                        );
                                    }
                                } ## end given
                                next LINE;
                            } ## end if ( $type eq 'cruft' )
                            Carp::croak(qq{Internal error: unknown line data type: "$type"});
                        } ## end for my $line_data ( @{$value} )
                        next CHILD;
                    } ## end if ( defined $value )
                    LINE: for my $line ( split /\n/xms, $original ) {
                        next LINE if $line =~ /^\s*$/;
                        $line =~ s/^\s+//gxms;
                        $line =~ s/\s+$//gxms;
                        $line =~ s/\s+/ /gxms;
                        $result .= "$line\n";
                    } ## end for my $line ( split /\n/xms, $original )
                } ## end for my $child_data ( @{ Marpa::UrHTML::child_data(...)})
                return $result;
            },
        ],
    ]
};

print Marpa::UrHTML->new($urhtml_args)->parse( \$document );

exit 0;

__END__

=head1 NAME

C<urhtml_fmt> - Reformat HTML, indented according to structure

=head1 SYNOPSIS

    urhtml_fmt [uri|file]

=head1 EXAMPLE

    urhtml_fmt http://perl.org

=head1 DESCRIPTION

Given the URI or the name of a file,
writes it to C<STDOUT>
reformatted and
indented according to the HTML structure.
Missing start and end tags are supplied and
comments added to indicate this.
Text inside
C<< <pre> >> elements 
is not altered.

L<urhtml_fmt> tries to parse everything that is actually out there on the Web.
In fact,
L<urhtml_fmt> will assume any file fed to it was intended as HTML,
and will produce its best guess of the author's intent.

L<urhtml_fmt> supplies missing start and end tags.
L<urhtml_fmt>'s parser is extremely liberal in what it accepts.
When its liberalization of the standards is not sufficient to make
a document into valid HTML,
L<urhtml_fmt>
will pick characters to treat as noise or "cruft".
The parser ignores cruft in determining
the structure of the document.

When
L<urhtml_fmt> adds
a missing start tag,
it precedes the new start tag with a comment.
When
L<urhtml_fmt> adds
a missing end tag,
it follows the new end tag with a comment.
When L<urhtml_fmt> classifies characters
as "cruft",
it adds a comment to that effect before the "cruft".

C<pre> elements receive special treatment.
The contents of 
C<pre> elements are not reformatted.
When missing tags or cruft occur inside a C<pre> element,
the comments to that effect are placed 
before the C<< <pre> >> start tag.

The argument to L<urhtml_score> can be either as a URI or a file
name.  If it starts with alphanumerics followed by a colon, it is treated
as a URI.  Otherwise it is treated as file name.

=head1 SAMPLE OUTPUT

Given this input:

    <title>Test page<tr>x<head attr="I am cruft"><p>Final graf

L<urhtml_fmt> returns

    <!-- Following start tag is replacement for a missing one -->
    <html>
      <!-- Following start tag is replacement for a missing one -->
      <head>
        <title>
          Test page
        </title>
        <!-- Preceding end tag is replacement for a missing one -->
      </head>
      <!-- Preceding end tag is replacement for a missing one -->
      <!-- Following start tag is replacement for a missing one -->
      <body>
        <!-- Following start tag is replacement for a missing one -->
        <table>
          <!-- Following start tag is replacement for a missing one -->
          <tbody>
            <tr>
              <!-- Following start tag is replacement for a missing one -->
              <td>
                x
                <!-- Next line is cruft -->
                <head attr="I am cruft">
                <p>
                  Final graf
                </p>
                <!-- Preceding end tag is replacement for a missing one -->
              </td>
              <!-- Preceding end tag is replacement for a missing one -->
            </tr>
            <!-- Preceding end tag is replacement for a missing one -->
          </tbody>
          <!-- Preceding end tag is replacement for a missing one -->
        </table>
        <!-- Preceding end tag is replacement for a missing one -->
      </body>
      <!-- Preceding end tag is replacement for a missing one -->
    </html>
    <!-- Preceding end tag is replacement for a missing one -->

=head1 PURPOSE

This program is a demo of a demo.
It purpose is to show how easy it is to write applications which look
at the structure of web pages using L<Marpa::UrHTML>.
And the purpose of L<Marpa::UrHTML>
is to demonstrate the power of its parse engine,
L<Marpa>.
L<Marpa::UrHTML> was written in a few days,
and its logic 
is a straightforward,
natural expression of the structure of HTML.

=head1 ACKNOWLEDGMENTS

The starting template for this code was
L<HTML::TokeParser>, by Gisle Aas.
See also the
L<acknowledgments for Marpa as a whole|Marpa/"ACKNOWLEDGMENTS">.

=head1 LICENSE AND COPYRIGHT

Copyright 2007-2009 Jeffrey Kegler, all rights reserved.
Marpa is free software under the Perl license.
For details see the LICENSE file in the Marpa distribution.

=cut
