File Coverage

File:blib/lib/Stats/LikeR.pm
Coverage:80.0%

linestmtbrancondsubtimecode
1#!/usr/bin/env perl
2# ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance
3
1
1
54479
1
use 5.010;
4package Stats::LikeR;
5our $VERSION = 0.02;
6require XSLoader;
7
1
1
1
1
0
6
use DDP {output => 'STDOUT', array_max => 10, show_memsize => 1};
8
1
1
1
37
1
1
use Devel::Confess 'color';
9
1
1
1
33
2
19
use warnings FATAL => 'all';
10
1
1
1
200
5202
1
use autodie ':default';
11
1
1
1
2235
1
648
use Exporter 'import';
12XSLoader::load('Stats::LikeR', $VERSION);
13our @EXPORT_OK = qw(aov chisq_test cor cor_test cov fisher_test glm hist kruskal_test ks_test lm matrix mean median min max p_adjust power_t_test quantile rbinom read_table rnorm runif scale sd seq shapiro_test sum t_test var var_test wilcox_test write_table);
14our @EXPORT = @EXPORT_OK;
15
16require XSLoader;
17
18# Wrapper to mimic R's structure
19sub chisq_test {
20
4
14284
        my ($data) = @_;
21
22
4
9
        die 'Input must be an array reference' unless ref($data) eq 'ARRAY';
23
24        # The XS function handles the heavy lifting
25
3
14
        my $result = _chisq_c($data);
26
27        # Format the output to look like R's htest object
28        return {
29          'statistic' => { 'X-squared' => $result->{statistic} },
30          'parameter' => { 'df' => $result->{df} },
31          'p.value'   => $result->{p_value},
32          'method'    => $result->{method},
33          'data.name' => 'Perl ArrayRef',
34          'observed'  => $data,
35          'expected'  => $result->{expected}
36
3
12
        };
37}
38
39sub read_table {
40
14
224878
        my $file = shift;
41
14
78
        die "\"$file\" is either unreadable or not a file" unless -r -f $file;
42
14
24
        my %args = (
43                sep => ',', comment => '#',
44                @_,
45        );
46
14
84
13
60
        my %allowed_args = map {$_ => 1} (
47                'comment',      'output.type',  'filter', 'row.names', 'sep',   'substitutions'
48        );
49
14
40
19
29
        my @undef_args = sort grep {!$allowed_args{$_}} keys %args;
50
14
23
        my $current_sub = (split(/::/,(caller(0))[3]))[-1];
51
14
145
        if (scalar @undef_args > 0) {
52
0
0
                p @undef_args;
53
0
0
                die "the above args aren't defined for $current_sub";
54        }
55
14
24
        $args{'output.type'} = $args{'output.type'} // 'aoh';
56
14
27
        if ($args{'output.type'} !~ m/^(?:aoh|hoa|hoh)$/) {
57
1
9
                die "\"$args{'output.type'}\" isn't allowed";
58        }
59        # Normalize the filter argument
60
13
7
        my $filter = $args{filter};
61
13
28
        if (defined $filter && ref($filter) eq 'CODE') {
62
0
0
                $filter = { 0 => $filter }; # 0 means the whole row
63        } elsif (defined $filter && ref($filter) ne 'HASH') {
64
0
0
                die "'filter' must be a CODE or HASH reference";
65        }
66
13
5
        my (@data, %data, @header, %mapped_filters);
67        # Execute the fast C-state machine. Pass an anonymous coderef to process streams.
68        # This bypasses creating an intermediate AoA memory spike.
69        _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {
70
3961
1955
                my ($line_ref) = @_;
71
3961
4218
                my @line = @$line_ref;
72
3961
1855
                if (!@header) {
73                        # --- HEADER PROCESSING ---
74
13
62
                        $line[0] =~ s/^\Q$args{comment}\E// if @line && defined $line[0];
75
13
0
21
0
                        while (@line && $line[-1] eq '') { pop @line }
76
13
16
                        @header = @line;
77                        # R-LIKE BEHAVIOR
78
13
16
                        if ((scalar @header > 0) && ($header[0] eq '')) {
79
7
5
                                $header[0] = 'row_name';
80                        }
81
13
14
                        if (($args{'output.type'} eq 'hoh') && (not defined $args{'row.names'})) {
82
2
3
                                $args{'row.names'} = $header[0];
83                        }
84
13
28
13
15
                        if ((defined $args{'row.names'}) && (!grep {$_ eq $args{'row.names'}} @header)) {
85
0
0
                                die "\"$args{'row.names'}\" isn't in the header of $file";
86                        }
87                        # Map filters to 1-based indices (or 0 for whole row)
88
13
13
                        if ($filter) {
89
3
5
                                for my $k (keys %$filter) {
90
3
4
                                        if ($k =~ /^\d+$/) {
91
0
0
                                                $mapped_filters{$k} = $filter->{$k};
92                                        } else {
93
3
42
4
21
                                                my ($idx) = grep { $header[$_] eq $k } 0..$#header;
94
3
4
                                                die "Filter column '$k' not found in header" unless defined $idx;
95
3
4
                                                $mapped_filters{$idx + 1} = $filter->{$k};
96                                        }
97                                }
98                        }
99
13
61
                        return; # Equivalent to 'next' out of the closure
100                }
101                # Check for column alignment
102
3948
2036
                if (scalar @line != scalar @header) {
103
0
0
                        die "Alignment error on $file (" . scalar(@line) . " fields vs " . scalar(@header) . " headers).";
104                }
105                # --- DATA PROCESSING ---
106
3948
1531
                my %line_hash;
107
3948
2009
                for my $i (0 .. $#header) {
108
55460
38036
                        if (!defined($line[$i]) || $line[$i] eq '') {
109
0
0
                $line_hash{$header[$i]} = 'NA';
110         } else {
111
55460
30054
                $line_hash{$header[$i]} = $line[$i];
112         }
113                }
114                # --- APPLY FILTERS ---
115
3948
1568
                my $skip = 0;
116
3948
1942
                if (%mapped_filters) {
117
1845
0
996
0
                        foreach my $fld (sort { $a <=> $b } keys %mapped_filters) {
118
1845
4335
                                local %_ = %line_hash; # Make %_ available to the callback
119
1845
1723
                                local $_ = $fld == 0 ? $line_ref : $line[$fld - 1]; # Localize $_
120
121
1845
1077
                                my $keep = $mapped_filters{$fld}->($line_ref, \%line_hash);
122
1845
1643
                                if (!$keep) {
123
1131
420
                                        $skip = 1;
124
1131
1016
                                        last;
125                                }
126
127                                # If the callback modified $_, write the mutation back to the data
128
714
373
                                if ($fld > 0) {
129
714
284
                                        $line[$fld - 1] = $_;
130
714
1114
                                        $line_hash{$header[$fld - 1]} = (defined($_) && $_ eq '') ? 'NA' : $_;
131                                }
132                        }
133                }
134
3948
5525
                return if $skip; # Reject the row if it failed the filter, skipping memory allocation
135
136                # Populate requested data structure
137
2817
1690
                if ($args{'output.type'} eq 'aoh') {
138
855
2733
                        push @data, \%line_hash;
139                } elsif ($args{'output.type'} eq 'hoa') {
140
1109
484
                        foreach my $col (@header) {
141
15738
15738
5200
13069
                                push @{ $data{$col} }, $line_hash{$col};
142                        }
143                } elsif ($args{'output.type'} eq 'hoh') {
144
853
414
                        my $row_name = $line_hash{$args{'row.names'}};
145
853
359
                        foreach my $col (@header) {
146
11942
5606
                                next if $col eq $args{'row.names'};
147
11089
9759
                                $data{$col}{$row_name} = $line_hash{$col};
148                        }
149                }
150
13
2356
        });
151
13
156
        @header = ();
152
13
11
        %mapped_filters = ();
153
13
31
        if ($args{'output.type'} eq 'aoh') {
154
7
4
                undef %data;
155
7
4
                my $final_ref = \@data;
156
7
500
                return $final_ref;
157        } elsif ($args{'output.type'} =~ m/^(?:hoa|hoh)$/) {
158
6
5
                @data = ();
159
6
761
                return \%data;
160        }
161}
162
163#sub write_table {
164#       my $data_ref = (ref($_[0]) eq 'HASH' || ref($_[0]) eq 'ARRAY') ? shift : undef;
165#       my $file = shift;
166#       my %args = (
167#               sep         => ',',
168#               'row.names' => 1,      
169#               @_,
170#       );
171#       my $current_sub = (split(/::/,(caller(0))[3]))[-1];
172#       $args{data} //= $data_ref;
173#       my %allowed = map { $_ => 1 } qw(data file row.names sep col.names);
174#       my @err = grep { !$allowed{$_} } keys %args;
175#       if (@err > 0) {
176#               die "$current_sub: Unknown arguments passed: " . join(", ", @err) . "\n";
177#       }
178#       die "$current_sub: 'data' must be a HASH or ARRAY reference\n"
179#               unless defined $args{data} && (ref($args{data}) eq 'HASH' || ref($args{data}) eq 'ARRAY');
180
181#       my $col_names = $args{'col.names'};
182#       if (defined $col_names && ref($col_names) ne 'ARRAY') {
183#               die "$current_sub: 'col.names' must be an ARRAY reference\n";
184#       }
185#       my $data         = $args{data};
186#       my $sep          = $args{sep};
187#       my $inc_rownames = $args{'row.names'};
188#       my $quote_field = sub {
189#               my ($val, $sep) = @_;
190#               return '' unless defined $val;
191#               die "$current_sub: Cannot write nested reference types to table\n" if ref($val);
192#               
193#               my $str = "$val";  
194#               if (index($str, $sep) != -1 || index($str, '"') != -1 || $str =~ /[\r\n]/) {
195#                       $str =~ s/"/""/g;
196#                       $str = qq{"$str"};
197#               }
198#               return $str;
199#       };
200#       my $data_type = ref $data;
201#       my ($is_hoh, $is_hoa, $is_aoh) = (0, 0, 0);
202#       my @rows;
203#       if ($data_type eq 'HASH') {
204#               @rows = keys %$data;
205#               return if @rows == 0;
206#               my $first_type = ref $data->{$rows[0]};
207#               die "$current_sub: Data values must be either all HASHes or all ARRAYs\n"
208#                       unless $first_type eq 'HASH' || $first_type eq 'ARRAY';
209
210#               my @type_err = grep { ref $data->{$_} ne $first_type } @rows;
211#               if (@type_err > 0) {
212#                       die "$current_sub: Mixed data types detected. Ensure all values are $first_type references.\n";
213#               }
214#               $is_hoh = ($first_type eq 'HASH');
215#               $is_hoa = ($first_type eq 'ARRAY');
216#       } else {
217#               return if @$data == 0;
218
219#               my $first_elem = $data->[0];
220#               die "$current_sub: For ARRAY data, all elements must be HASH references (Array of Hashes)\n"
221#                       unless defined $first_elem && ref($first_elem) eq 'HASH';
222
223#               my @type_err = grep { !defined($_) || ref($_) ne 'HASH' } @$data;
224#               if (@type_err > 0) {
225#                       die "$current_sub: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n";
226#               }
227#               $is_aoh = 1;
228#       }
229#       open my $fh, '>', $file or die "$current_sub: Could not open '$file' for writing: $!\n";
230#       if ($is_hoh) {
231#               my @headers;
232#               if (defined $col_names) {  # Bug 6 fix: was "if ($col_names)" — use defined
233#                       @headers = @$col_names;
234#               } else {
235#                       my %col_map;
236#                       for my $r (@rows) {
237#                               $col_map{$_} = 1 for keys %{ $data->{$r} };
238#                       }
239#                       @headers = sort keys %col_map;
240#               }
241
242#               my @header_row = @headers;
243#               unshift @header_row, '' if $inc_rownames;
244#               @header_row = map { $quote_field->($_, $sep) } @header_row;
245#               print $fh join($sep, @header_row) . "\n";
246
247#               for my $r (sort @rows) {
248#                       my @row_data = map { defined $data->{$r}{$_} ? $data->{$r}{$_} : "NA" } @headers;
249#                       unshift @row_data, $r if $inc_rownames;
250#                       my @quoted = map { $quote_field->($_, $sep) } @row_data;
251#                       print $fh join($sep, @quoted) . "\n";
252#               }
253#       } elsif ($is_hoa) {
254#               # 1. Find the maximum number of rows
255#               my $max_rows = 0;
256#               foreach my $col (keys %$data) {
257#                       my $len = scalar @{ $data->{$col} };
258#                       $max_rows = $len if $len > $max_rows;
259#               }
260#               $max_rows--; # Convert length to max index
261
262#               # 2. Determine headers
263#               my @headers;
264#               if (defined $col_names) {
265#                       @headers = @$col_names;
266#               } else {
267#                       @headers = sort keys %$data;
268#               }
269#               
270#               die "Could not get headers in $current_sub" if @headers == 0;
271
272#               # Bug 5 fix: if row.names is a column name (non-numeric string), pull it out to be first
273#               my $rownames_col;
274#               if ($inc_rownames && $inc_rownames =~ /\D/) {
275#                       $rownames_col = $inc_rownames;
276#                       @headers = grep { $_ ne $rownames_col } @headers;
277#               }
278
279#               # 3. Print Header Row
280#               my @header_row = @headers;
281#               unshift @header_row, '' if $inc_rownames;
282#               @header_row = map { $quote_field->($_, $sep) } @header_row;
283#               print $fh join($sep, @header_row) . "\n";
284
285#               # 4. Process and Print Data Rows
286#               for my $i (0 .. $max_rows) {
287#                       my @row_data;
288#                       
289#                       foreach my $col (@headers) {
290#                               push @row_data, defined($data->{$col}[$i]) ? $data->{$col}[$i] : 'NA';
291#                       }
292#                       
293#                       if ($inc_rownames) {
294#                               # Bug 5 fix: use named column value if row.names is a column name
295#                               my $rn_val = defined $rownames_col
296#                                       ? (defined $data->{$rownames_col}[$i] ? $data->{$rownames_col}[$i] : 'NA')
297#                                       : $i + 1;
298#                               unshift @row_data, $rn_val;
299#                       }
300#                       
301#                       my @quoted = map { $quote_field->($_, $sep) } @row_data;
302#                       print $fh join($sep, @quoted) . "\n";
303#               }
304#       } elsif ($is_aoh) {
305#               my @headers;
306#               if (defined $col_names) {  # Bug 6 fix: was "if ($col_names)" — use defined
307#                       @headers = @$col_names;
308#               } else {
309#                       my %col_map;
310#                       for my $row_hash (@$data) {
311#                               $col_map{$_} = 1 for keys %$row_hash;
312#                       }
313#                       @headers = sort keys %col_map;
314#               }
315
316#               # Bug 5 fix: if row.names is a column name (non-numeric string), pull it out to be first
317#               my $rownames_col;
318#               if ($inc_rownames && $inc_rownames =~ /\D/) {
319#                       $rownames_col = $inc_rownames;
320#                       @headers = grep { $_ ne $rownames_col } @headers;
321#               }
322
323#               my @header_row = @headers;
324#               unshift @header_row, "" if $inc_rownames;
325#               @header_row = map { $quote_field->($_, $sep) } @header_row;
326#               print $fh join($sep, @header_row) . "\n";  # Bug 7 fix: was "say $fh" — use print consistently
327#               for my $i (0 .. $#$data) {
328#                       my $row_hash = $data->[$i];
329#                       my @row_data = map { defined $row_hash->{$_} ? $row_hash->{$_} : "NA" } @headers;
330#                       if ($inc_rownames) {
331#                               # Bug 5 fix: use named column value if row.names is a column name
332#                               my $rn_val = defined $rownames_col
333#                                       ? (defined $row_hash->{$rownames_col} ? $row_hash->{$rownames_col} : 'NA')
334#                                       : $i + 1;
335#                               unshift @row_data, $rn_val;
336#                       }
337#                       my @quoted = map { $quote_field->($_, $sep) } @row_data;
338#                       print $fh join($sep, @quoted) . "\n";
339#               }
340#       }
341#       close $fh;  # Bug 8 fix: file handle was never closed
342#}
3431;
344=encoding utf8
345
346 - 900
=head1 Synopsis

Get basic R statistical functions working in Perl as if they were part of List::Util, like C<min>, C<max>, C<sum>, etc.
I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.
There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.
This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.

There B<are> other modules on CPAN that can do B<PARTS> of this, but this works the way that I B<want> it to.

=head1 Functions/Subroutines

=head2 aov

 aov(
 {
     yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
     ctrl  => [1,     1,   1,   0,   0,   0]
 },
 'yield ~ ctrl');

which returns

 {
     ctrl        {
         Df          1,
         "F value"   25.6000000000001,
         "Mean Sq"   1.70666666666667,
         Pr(>F)      0.00718232855871859,
         "Sum Sq"    1.70666666666667
     },
     Residuals   {
         Df          4,
         "Mean Sq"   0.0666666666666665,
         "Sum Sq"    0.266666666666666
    }
 }

You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:

 my $res_2way = aov($data_2way, 'len ~ supp * dose');

It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.

=head2 chisq_test

 my @test_data = ([762, 327, 468], [484, 239, 477]);
 my $test_data = chisq_test(\@test_data);

which outputs:

 {
 data.name   "Perl ArrayRef",
 expected    [
     [0] [
             [0] 703.671381936888,
             [1] 319.645266594124,
             [2] 533.683351468988
         ],
     [1] [
             [0] 542.328618063112,
             [1] 246.354733405876,
             [2] 411.316648531012
         ]
 ],
 method      "Pearson's Chi-squared test",
 observed    [
     [0] [
             [0] 762,
             [1] 327,
             [2] 468
         ],
     [1] [
             [0] 484,
             [1] 239,
             [2] 477
         ]
 ],
 p.value     2.95358918321176e-07,
 parameter   {
     df   2
 },
 statistic   {
     X-squared   30.0701490957547
 }
 }

It also supports 1D arrays for Goodness of Fit tests:

 my $chisq_1d = chisq_test([10, 20, 30]);

For 2x2 matrices, Yates' Continuity Correction is applied automatically, exactly like in R.

=head2 cor

 cor($array1, $array2, $method = 'pearson'),

that is, C<pearson> is the default and will be used if C<$method> is not specified.

Just like R, C<pearson>, C<spearman>, and C<kendall> are available

If you provide an array of arrays (a matrix), C<cor> will compute the correlation matrix automatically. 

=head2 cor_test

 my $result = cor_test(
         'x'         => $x,
         'y'         => $y,
         alternative => 'two.sided',
         method      => 'pearson',
         continuity  => 1
     );

C<cor_test> safely handles C<undef> (or C<NA>) values seamlessly by computing over pairwise complete observations. 

=head2 cov

 cov($array1, $array2, 'pearson')

or

 cov($array1, $array2, 'spearman')

or

 cov($array1, $array2, 'kendall')

=head2 fisher_test

=head3 array reference entry

 my $array_data = [
     [10, 2],
     [3, 15]
 ];
 my $res1 = fisher_test($array_data);

which returns a hash reference:

 {
 alternative   "two.sided",
 conf_int      [
     [0] 2.75338278824932,
     [1] 301.462337971516
 ],
 estimate      {
     "odds ratio"   21.3053175567504
 },
 method        "Fisher's Exact Test for Count Data",
 p_value       0.00053672411914343
 }

=head3 hash reference entry

 $ft = fisher_test( {
     Guess => {
         Milk => 3, Tea => 1
     },
     Truth => {
         Milk => 1, Tea => 3
     }
 });

I have the p-value calculated very precisely, but there is some inexactness (approximately 1% in the confidence intervals) which I couldn't rectify.  The p-values are identical to R's, and the confidence intervals are very close.

=head2 glm

takes a hash of an array as input

 my %tooth_growth = (
     dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
 2.0 2.0 2.0)],
     len  => [qw(4.2 11.5  7.3  5.8  6.4 10.0 11.2 11.2  5.2  7.0 16.5 16.5 15.2 17.3 22.5
 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
 15.2 21.5 17.6  9.7 14.5 10.0  8.2  9.4 16.5  9.7 19.7 23.3 23.6 26.4 20.0
 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],
     supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
 VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
 OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]
 );
 
 my $glm_teeth = glm(
     data    => \%tooth_growth,
     formula => 'len ~ dose + supp',
     family  => 'gaussian'
 );

I'm not completely confident that this is working perfectly, though I've gotten this subroutine to work for simple cases.

In addition to the C<gaussian> default, it fully supports logistic regression using the C<binomial> family parameter via Iteratively Reweighted Least Squares (IRLS):

 my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');

=head2 hist

Computes the histogram of the given data values in a single O(N) pass. It returns the bin counts, computed breaks, midpoints, and density. 

 my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);

If C<breaks> is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.

=head2 kruskal_test

Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)

Performs a Kruskal-Wallis rank sum test, see 
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test

=head3 R-like array entry

 my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects
 my @yk = (3.8, 2.7, 4.0, 2.4);      # with obstructive airway disease
 my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis
 my @x = (@xk, @yk, @zk);
 my @g = (
     (map {'Normal subjects'} 0..4),
     (map {'Subjects with obstructive airway disease'} 0..3),
     map {'Subjects with asbestosis'} 0..4
 );
 my $t0 = Time::HiRes::time();
 my $kt = kruskal_test(\@x, \@g);
 my $t1 = Time::HiRes::time();
 printf("Kruskal calculation in %g seconds.\n", $t1-$t0);
 p $kt;

=head3 hash of array entry

I feel that this is better, and more easily read, than what you get in R:

 my %x = (
 'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],
 'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],
 'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]
 );
 $t0 = Time::HiRes::time();
 $kt = kruskal_test(\%x);
 $t1 = Time::HiRes::time();
 printf("Kruskal calculation via HoA in %g seconds.\n", $t1-$t0);
 p $kt;

=head2 ks_test

The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data come from the same distribution, is implemented simply:

 $ks = ks_test(\@x, \@y, alternative => 'greater');

returning a hash reference.

Also, a single array can be tested against a normal distribution:

 $ks = ks_test($ksx, 'pnorm');

The p-value precision is about 1e-8, which I want to improve, but am not sure how.

=head2 lm

This is the linear models function.

 $lm = lm(formula =>  'mpg ~ wt + hp', data => $mtcars);

where C<$mtcars> is a hash of hashes

C<lm> also supports generating interaction terms directly within the formula using the C<*> operator:

 my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);

If your data contains missing numbers (C<NA> or C<undef>), C<lm> handles listwise deletion dynamically to ensure mathematical integrity before fitting.

the dot operator also works:

 $lm = lm(formula => 'y ~ .', data => $dot_data);

=head2 matrix

 my $mat1 = matrix(
     data => [1..6],
     nrow => 2
 );

You can also pass C<< byrow =E<gt> 1 >> if you want the matrix populated row-wise instead of column-wise.

=head2 max

 max(1,2,3);

or

 my @arr = 1..8;
 max(@arr, 4, 5)

as of version 0.02, max will die if any undefined values are provided

=head2 mean

 mean(1,2,3);

or

 my @arr = 1..8;
 mean(@arr, 4, 5)

or

 mean([1,1], [2,2]) # 1.5

as of version 0.02, mean will die if any undefined values are provided

=head2 median

works like mean, taking array references and arrays:

 median( $test_data[$i][0] )

as of version 0.02, median will die if any undefined values are provided

=head2 min

 min(1,2,3);

or

 my @arr = 1..8;
 min(@arr, 4, 5)

as of version 0.02, min will die if any undefined values are provided

=head2 p_adjust

Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"

 my @q = p_adjust(\@pvalues, $method);

=head2 power_t_test

 $test_data = power_t_test(
     n   => 30,  delta     => 0.5, 
     sd  => 1.0, sig_level => 0.05
 );

It also allows configuring the test type (C<< type =E<gt> 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E<gt> 'one.sided' >>). You can also pass C<< strict =E<gt> 1 >> to strictly evaluate both tails of the distribution.

=head2 quantile

Calculates sample quantiles using R's continuous Type 7 interpolation. 

 my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);

If the C<probs> parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C<c(0, .25, .5, .75, 1)>). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).

=head2 rbinom

Create a binomial distribution of numbers

 my $binom = rbinom( n => $n, prob => 0.5, size => 9);

It hooks directly into Perl's internal PRNG system, respecting C<srand()> seeds. 

=head2 read_table

I've tried to make this as simple as possible, trying to follow from R:

 my $test_data = read_table('t/HepatitisCdata.csv');

output types can be AOH (aoh), HOA (hoa), HOH (hoh)

 read_table($filename, 'output.type' => 'aoh');
 
 read_table($filename, 'output.type' => 'hoa');

and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:

 $test_data = read_table(
     't/HepatitisCdata.csv',
     filter => {
         Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column
     },
     'output.type' => 'aoh'
 );

=head2 rnorm

Make a normal distribution of numbers, with pre-set mean C<mean>, standard deviation C<sd>, and number C<n>.

 my ($rmean, $sd, $n) = (10, 2, 9999);
 my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);

=head2 runif

=head3 named arguments

Make a distribution of approximately uniform distribution

 my $unif = runif( n => $n, min => 0, max => 1);

where C<n> is the number of items, the values are between C<min> and C<max>

=head3 positional args

this is to match R's behavior:

 runif( 9 )

will make 9 numbers in [0,1]

 runif(9, 0, 99)

will match C<n>, C<min>, and C<max> respectively

=head2 scale

 my @scaled_results = scale(1..5);

You can also pass an options hash to disable centering or scaling:

 my @scaled_results = scale(1..5, { center => false, scale => true });

It fully supports matrix operations. By passing an array of arrays, C<scale> processes the data column by column independently:

 my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);

=head2 sd

 my $stdev = sd(2,4,4,4,5,5,7,9);

Correct answer is 2.1380899352994;

As of version 0.02, sd will croak/die if any undefined values are provided.

=head2 seq

Works as closely as I can to R's seq, which is very similar to Perl's C<for> loops.  Returns an array, not an array reference.

=head3 Example 1: Standard integer sequence

 say 'seq(1, 5):';
 my @seq = seq(1, 5);
 say join(', ', @seq), "\n";
 
 say 'seq(1, 2, 0.25):';
 @seq = seq(1, 2, 0.25);

=head3 Example 2: Fractional steps

 say 'seq(1, 2, 0.25):';
 @seq = seq(1, 2, 0.25);
 say join(", ", @seq), "\n";
 for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop
     is_approx(pop @seq, $idx, "seq item $idx with fractional step");
 }

=head3 Example 3: Negative steps

 say 'seq(10, 5, -1):';
 @seq = seq(10, 5, -1);
 say join(", ", @seq), "\n";
 for (my $idx = 5; $idx <= 10; $idx++) { # count up to pop
     is_approx(pop @seq, $idx, "seq item $idx with negative step");
 }

=head2 shapiro_test

tests to see if an array reference is normally distributed, returns a p-value and a statistic

 my $shapiro = shapiro_test(
     [1..5]
 );

and returns the hash reference:

 {
 p.value     0.589650577093106,
 p_value     0.589650577093106,
 statistic   0.960870680168535,
 W           0.960870680168535
 }

=head2 sum

returns sum, but using both arrays and array references.

 my $test_data = [1..8];
 sum($test_data)

which I prefer, compared to List::Util's required casting into an array:

 sum(@{ $test_data });

which is shorter and much easier to read

as of version 0.02, C<sum> will cause the script to die if any undefined values are provided

=head2 t_test

There are 1-sample and 2-sample t-tests:

 my $t_test = t_test( $test_data[$i][$j], mu => mean( $test_data[$i][$j] ));

or 2-sample:

 $t_test = t_test(
     $test_data[3][0],
     $test_data[3][1],
     paired => true
 );

returns a hash reference, which looks like:

 conf_int     => [
     -0.06672889, 0.25672889
 ],
 df        => 5,
 estimate  => 0.095,
 p_value   => 0.19143688433660,
 statistic => 1.50996688705414

the two groups compared can be specified, though not necessarily, as C<x> and C<y>, just like in R:

 $t_test = t_test(
     'x' => $test_data[3][0],
     'y' => $test_data[3][1],
     paired => true
 );

=head2 var

as simple as possible:

 var(2, 4, 5, 8, 9)

as of version 0.02, C<var> will die if any undefined values are provided

like C<min>, C<max>, etc., C<var> can accept array references, to make code simpler:

 my $ref = \@arr;
 var($ref) = var(@arr)

=head2 wilcox_test

 $test_data = wilcox_test(
     [1.83,  0.50,  1.62,  2.48, 1.68, 1.88, 1.55, 3.06, 1.30],
     [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]
 );

It fully supports paired tests (C<< paired =E<gt> true >>) and can calculate exact p-values (the default for $N < 50$ without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.

=head2 write_table

mimics R's "write.table", with data as first argument to subroutine, and output file as second

 write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => true);

You can also precisely filter and reorder which columns are written by passing an array reference to C<col.names>:

 write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);