#!/usr/bin/perl
##############################################################################
#      $URL: http://mishin.narod.ru $
#     $Date: 2011-04-11 20:53:20 +0300 (Mon, 14 Feb 2011) $
#   $Author: mishnik $
# $Revision: 1.02 $
#   $Source: split_xml.pl $
#   $Description: split big xml file $
# This script splits a JMS messages log into smaller files.
#
# Input parameters for script:
# - full path to file for splitting
# - full path to folder for splitting results
# - maximum size of splitting volume
# - start message expression
# - end message expression
#
# Limitations:
# - execution time depends on the input file size, but it must not exceed 1 hour
#
#example:
# perl ../split_xml.pl -f rez.xml -o out -z 500000 -s '\<\?xml version="1.0" encoding="UTF-8"\?\>'
###############################################################################
use 5.006;
use strict;
use warnings;
use File::Basename;
use POSIX qw/strftime difftime mktime/;
use Getopt::Long;
$File::Find::no_chdir = 0;
use IO::File;
use English qw(-no_match_vars);
use Carp;
use Digest::MD5 qw(md5_hex);
use IPC::Open3 'open3';
use File::Find;

our $VERSION = '0.09';

# Punctuation constants shared by the whole script; the subs below read
# $EMPTY and $DASH directly.
my $EMPTY = q{};
my $SPACE = q{ };
my $COMMA = q{,};
my $QUOTE = q{'};
my $PLUS  = q{+};
my $DASH  = q{-};

# NOTE(review): $TAB holds three spaces, not a tab character - confirm intent.
my $TAB = q{   };

# Empty containers and references to them.  $ref_hash_empty must point at
# the hash, not at the array.
my @empty          = ();
my $ref_arr_empty  = \@empty;
my %hash_empty     = ();
my $ref_hash_empty = \%hash_empty;
my (@dir2index);

# Working variables.  Each one starts as the empty string except $p, which
# starts at 0.  $ret is the scratch variable that receives the result of
# print calls throughout the script.
my (
    $regexp2file,      $tradelist,      $dir2xml,
    $end_tag,          $start_tag,      $tradeid_tag,
    $tradeversion_tag, $rebuld_index,   $zipfile,
    $zipsize,          $p,              $len_hash,
    $hash_ref_cvs,     $txt,            $split,
    $split_start_cnt,  $split_filename, $split_cnt_trade_in_file,
    $utp_skip_format,  $ret,            $href_check_nofind,
    $start_time,       $elapsed_time,   $index_by_result,
    $file,             $outdir,         $size,
  )
  = (
    $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY,
    $EMPTY, 0,      $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY,
    $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY, $EMPTY,
  );
# Parse the command line.  $end_tag has no option of its own: it is passed
# on with its initial value and gen_multi_file does not read it.
my $result = GetOptions(
    'file|f=s'      => \$file,         # input file to split
    'outdir|o=s'    => \$outdir,       # folder that receives the volumes
    'size|z=s'      => \$size,         # volume size threshold in bytes
    'start_tag|s=s' => \$start_tag,    # regexp matching a message's first line
);

my $usage = 'usage: '
  . basename($PROGRAM_NAME)
  . ' -f FILE -o OUTDIR -z SIZE -s START_REGEXP';

# GetOptions returns false on unknown or malformed options.
if ( !$result ) {
    croak $usage;
}

# Without an output folder the volumes would be written relative to "/".
if ( $file eq $EMPTY || $outdir eq $EMPTY ) {
    croak "input file and output folder are required\n$usage";
}

# An omitted size used to be compared as the empty string (numerically 0,
# with a warning on every comparison); make that value explicit.
if ( $size eq $EMPTY ) {
    $size = 0;
}

$start_time = time;
my @split_vars = ( $file, $outdir, $size, $start_tag, $end_tag, );
$ret          = split_big_file( \@split_vars );
$elapsed_time = wdhms( time - $start_time );
$ret          = print "Time elapsed: $elapsed_time\n";

# Open the input file and hand it to gen_multi_file for splitting.
#
# Parameter:
#   $arr_ref - array ref whose first element is the input file name; the
#              whole reference is passed on unchanged to gen_multi_file.
# Returns 1.  Croaks, including the OS error text, when the input file
# cannot be opened or closed.
sub split_big_file {
    my ($arr_ref) = @_;
    my ($file)    = @{$arr_ref};

    $ret = print "split file $file. \n";
    open my $FH, q{<}, $file or croak "unable to open: $file $ERRNO";
    gen_multi_file( $FH, $arr_ref );
    close $FH or croak "unable to close: $file $ERRNO";
    return 1;
}

# Split the lines read from an open handle into numbered volume files.
#
# Parameters:
#   $FH      - read handle for the input
#   $arr_ref - array ref holding, in order: input file name, output folder,
#              size threshold in bytes, start-of-message regexp
#              (any further elements are ignored)
# Returns 1.  Croaks when the output folder cannot be created; add2file
# croaks when a volume cannot be written.
#
# Volumes are named "<outdir>/<input basename>_NNN.txt".  A new volume is
# started only on a line that matches the start regexp, and only once the
# current volume on disk is larger than the threshold, so the lines of one
# message always end up in the same volume.
sub gen_multi_file {
    my ( $FH, $arr_ref ) = @_;
    my ( $file, $outdir, $size, $start_tag ) = @{$arr_ref};

    if ( !-d $outdir ) {
        mkdir $outdir
          or croak "unable to create directory: $outdir $ERRNO";
    }
    else {

        # Volumes are opened in append mode, so leftovers of an earlier
        # run have to be removed first.
        finddepth( \&remove_dir, "$outdir" );
    }

    my $flush_every  = 100;                 # buffered lines per write
    my $out_name     = basename($file);
    my $start_re     = qr/$start_tag/;
    my $count4suffix = 0;
    my @tmp_out      = ();
    my $announced    = 0;

    # The volume name is known up front, so the final flush always has a
    # valid target even when the input is too short to cause an earlier one.
    my $fname = _volume_name( $outdir, $out_name, $count4suffix );

    # Append the buffered lines to the current volume and empty the buffer.
    my $flush = sub {
        return if !@tmp_out;
        if ( !$announced ) {
            $ret       = print "write file $fname\n";
            $announced = 1;
        }
        add2file( $fname, join $EMPTY, @tmp_out );
        @tmp_out = ();
        return;
    };

    while ( my $line = <$FH> ) {
        if ( $line =~ $start_re ) {

            # Finish the previous message, then decide whether the next
            # one still belongs into the current volume.
            $flush->();
            my $filesize = -s $fname;
            if ( defined $filesize && $filesize > $size ) {
                $count4suffix++;
                $fname = _volume_name( $outdir, $out_name, $count4suffix );
                $announced = 0;
            }
        }
        push @tmp_out, $line;

        # Keep memory use bounded for very long messages.
        if ( @tmp_out >= $flush_every ) {
            $flush->();
        }
    }
    $flush->();
    return 1;
}

# Build the path of volume number $index inside $outdir.
sub _volume_name {
    my ( $outdir, $out_name, $index ) = @_;
    return sprintf '%s/%s_%03d.txt', $outdir, $out_name, $index;
}

# File::Find callback: delete the file currently being visited.
#
# Takes no arguments; File::Find supplies the entry through $_ and
# $File::Find::name.  Directories are left in place.  A file that cannot
# be removed produces a warning instead of being skipped silently.
sub remove_dir {

    # $_ is used on purpose: unless $File::Find::no_chdir is set, File::Find
    # has changed into the directory being scanned, so a relative
    # $File::Find::name no longer resolves from the current directory.
    # With no_chdir set, $_ equals $File::Find::name, so this works too.
    if ( -f $_ || -l $_ ) {
        unlink $_
          or carp "unable to remove: $File::Find::name $ERRNO";
    }
    return;
}

# Append a string to a file, creating the file when it does not exist.
#
# Parameters:
#   $file    - path of the file to append to
#   $message - text to append
# Returns 1.  Croaks with the OS error text when the file cannot be
# opened, written or closed.
sub add2file {
    my ( $file, $message ) = @_;
    open my $fh, q{>>}, $file or croak "unable to open:$file $ERRNO";

    # A failed write must not go unnoticed: the caller discards its buffer
    # as soon as this sub returns.
    print {$fh} $message or croak "unable to write: $file $ERRNO";
    close $fh or croak "unable to close: $file $ERRNO";
    return 1;
}

# Run a command through open3 and echo its STDOUT and STDERR.
#
# Parameter:
#   $cmd - command line to run; it is passed to open3 as a single string
# Returns 1.  Croaks when one of the child's handles cannot be closed.
# A non-zero wait status of the child is reported on STDOUT, not raised.
#
# The line "stuff\n" is written to the child's STDIN before it is closed.
# NOTE(review): this looks like a leftover from an example - confirm that
# callers expect it.
sub run_shell {
    my ($cmd) = @_;
    my @args = ();

    # open3 creates the STDIN and STDOUT handles itself, but a false
    # STDERR argument makes it send the child's STDERR to the STDOUT
    # handle.  A separate STDERR stream therefore needs a real glob.
    require Symbol;
    my ( $HIS_IN, $HIS_OUT );
    my $HIS_ERR = Symbol::gensym();

    my $childpid = open3( $HIS_IN, $HIS_OUT, $HIS_ERR, $cmd, @args );
    $ret = print {$HIS_IN} "stuff\n";

    # Give end of file to kid.
    close $HIS_IN or croak "unable to close child STDIN: $ERRNO";

    my @outlines = <$HIS_OUT>;    # Read till EOF.
    $ret = print " STDOUT:\n", @outlines, "\n";

    # XXX: STDERR is read only after STDOUT hit EOF; a child that writes a
    # large amount to STDERR first can block.
    my @errlines = <$HIS_ERR>;
    $ret = print " STDERR:\n", @errlines, "\n";

    close $HIS_OUT or croak "unable to close child STDOUT: $ERRNO";
    close $HIS_ERR or croak "unable to close child STDERR: $ERRNO";
    waitpid $childpid, 0;
    if ($CHILD_ERROR) {
        $ret = print "That child exited with wait status of $CHILD_ERROR\n";
    }
    return 1;
}

# Render a number of seconds as weeks, days, hours, minutes and seconds.
#
# Parameter: the duration in seconds; a negative value gets a leading dash.
# Returns a string such as "1h 2m 3s ".  Weeks, days, hours and minutes are
# left out when they are zero, seconds are always present, and every
# component is followed by a single space.
sub wdhms {
    use constant M_IN_HOUR => 60;    # also serves as seconds per minute
    use constant H_IN_DAY  => 24;
    use constant D_IN_WEEK => 7;

    my ($elapsed) = @_;

    my $prefix = $DASH;
    if ( $elapsed == abs $elapsed ) {
        $prefix = $EMPTY;
    }

    # Peel off one unit at a time: the remainder is the amount for the
    # current unit, the quotient carries over to the next larger unit.
    my $rest = abs $elapsed;
    my @amounts;
    for my $base ( M_IN_HOUR, M_IN_HOUR, H_IN_DAY, D_IN_WEEK ) {
        push @amounts, $rest % $base;
        $rest = int( $rest / $base );
    }
    my ( $secs, $mins, $hrs, $dys ) = @amounts;
    my $wks = $rest;

    my @pieces;
    for my $pair (
        [ $wks,  '%dw ' ],
        [ $dys,  '%dd ' ],
        [ $hrs,  '%dh ' ],
        [ $mins, '%dm ' ],
      )
    {
        my ( $amount, $format ) = @{$pair};
        if ($amount) {
            push @pieces, sprintf $format, $amount;
        }
    }
    push @pieces, sprintf '%ds ', $secs;

    return $prefix . join q{}, @pieces;
}

