File Coverage

File: blib/lib/Stats/LikeR.pm
Coverage: 79.7%

line  stmt  bran  cond  sub  time  code
#!/usr/bin/env perl
require 5.010;

package Stats::LikeR;

# R-like statistical functions for Perl, implemented in XS for speed.
our $VERSION = 0.01;

use strict;                      # was missing; only warnings were enabled
use warnings FATAL => 'all';
use autodie ':default';

# NOTE(review): DDP and Devel::Confess are debugging aids; consider loading
# them conditionally (e.g. only under a DEBUG env var) for a release build.
use DDP {output => 'STDOUT', array_max => 10, show_memsize => 1};
use Devel::Confess 'color';
use Exporter 'import';

# Load the compiled XS half of this module (was require'd twice; once is enough).
require XSLoader;
XSLoader::load('Stats::LikeR', $VERSION);

our @EXPORT_OK = qw(aov chisq_test cor cor_test cov fisher_test glm hist kruskal_test ks_test lm matrix mean median min max p_adjust power_t_test quantile rbinom read_table rnorm runif scale sd seq shapiro_test t_test var wilcox_test write_table);
# Everything is exported by default to mimic R's flat namespace.
# NOTE(review): this shadows builtins/List::Util names such as min, max, and seq
# in the caller's package — confirm this is intentional.
our @EXPORT = @EXPORT_OK;
17# Wrapper to mimic R's structure
# Wrapper around the XS chi-squared routine that shapes the result
# like R's "htest" object (statistic, parameter, p.value, ...).
sub chisq_test {
	my ($data) = @_;

	# Guard clause: only array references (1D or 2D) are accepted.
	ref($data) eq 'ARRAY' or die 'Input must be an array reference';

	# The XS function handles the heavy lifting.
	my $res = _chisq_c($data);

	# Assemble the R-style result before returning a reference to it.
	my %htest = (
		'statistic' => { 'X-squared' => $res->{statistic} },
		'parameter' => { 'df' => $res->{df} },
		'p.value'   => $res->{p_value},
		'method'    => $res->{method},
		'data.name' => 'Perl ArrayRef',
		'observed'  => $data,
		'expected'  => $res->{expected},
	);
	return \%htest;
}
37
# Read a delimited text file into an R-like data structure, streaming rows
# through the fast XS CSV parser so no intermediate AoA is built.
#
# Usage: read_table($file, %args), where %args may contain:
#   sep           - field separator (default ',')
#   comment       - comment character stripped from the header line (default '#')
#   output.type   - 'aoh' (default), 'hoa', or 'hoh'
#   row.names     - column used for row names with 'hoh'; defaults to the first header
#   filter        - CODE ref (applied to the whole row) or HASH ref of column => CODE
#   substitutions - accepted by validation but not used in this sub; TODO confirm intent
#
# Returns an array ref ('aoh') or a hash ref ('hoa'/'hoh'). Dies on an
# unreadable file, unknown args, bad output.type, or misaligned rows.
sub read_table {
	my $file = shift;
	die "\"$file\" is either unreadable or not a file" unless -r -f $file;
	my %args = (
		sep => ',', comment => '#',
		@_,
	);

	# Reject any argument not in the supported set.
	my %allowed_args = map {$_ => 1} (
		'comment',      'output.type',  'filter', 'row.names', 'sep',   'substitutions'
	);
	my @undef_args = sort grep {!$allowed_args{$_}} keys %args;
	if (scalar @undef_args > 0) {
		# Compute the caller-facing sub name only on the failure path.
		my $current_sub = (split(/::/,(caller(0))[3]))[-1];
		p @undef_args;
		die "the above args aren't defined for $current_sub";
	}
	$args{'output.type'} = $args{'output.type'} // 'aoh';
	if ($args{'output.type'} !~ m/^(?:aoh|hoa|hoh)$/) {
		die "\"$args{'output.type'}\" isn't allowed";
	}

	# Normalize the filter argument: a bare CODE ref means "filter the whole row"
	# and is stored under pseudo-column 0.
	my $filter = $args{filter};
	if (defined $filter && ref($filter) eq 'CODE') {
		$filter = { 0 => $filter }; # 0 means the whole row
	} elsif (defined $filter && ref($filter) ne 'HASH') {
		die "'filter' must be a CODE or HASH reference";
	}

	my (@data, %data, @header, %mapped_filters);
	# Execute the fast C state machine, handing it a closure so rows are
	# processed as a stream. This bypasses an intermediate AoA memory spike.
	_parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {
		my ($line_ref) = @_;
		my @line = @$line_ref;
		if (!@header) {
			# --- HEADER PROCESSING ---
			$line[0] =~ s/^\Q$args{comment}\E// if @line && defined $line[0];
			# Drop trailing empty header cells.
			while (@line && $line[-1] eq '') { pop @line }
			@header = @line;
			# R-LIKE BEHAVIOR: an unnamed first column becomes "row_name".
			if ((scalar @header > 0) && ($header[0] eq '')) {
				$header[0] = 'row_name';
			}
			# 'hoh' needs row names; default to the first column like R.
			if (($args{'output.type'} eq 'hoh') && (not defined $args{'row.names'})) {
				$args{'row.names'} = $header[0];
			}
			if ((defined $args{'row.names'}) && (!grep {$_ eq $args{'row.names'}} @header)) {
				die "\"$args{'row.names'}\" isn't in the header of $file";
			}
			# Map filters to 1-based column indices (0 keeps whole-row meaning).
			if ($filter) {
				for my $k (keys %$filter) {
					if ($k =~ /^\d+$/) {
						$mapped_filters{$k} = $filter->{$k};
					} else {
						my ($idx) = grep { $header[$_] eq $k } 0..$#header;
						die "Filter column '$k' not found in header" unless defined $idx;
						$mapped_filters{$idx + 1} = $filter->{$k};
					}
				}
			}
			return; # Equivalent to 'next' out of the closure
		}
		# Check for column alignment.
		if (scalar @line != scalar @header) {
			die "Alignment error on $file (" . scalar(@line) . " fields vs " . scalar(@header) . " headers).";
		}
		# --- DATA PROCESSING --- (empty/undef cells become 'NA', as in R)
		my %line_hash;
		for my $i (0 .. $#header) {
			if (!defined($line[$i]) || $line[$i] eq '') {
				$line_hash{$header[$i]} = 'NA';
			} else {
				$line_hash{$header[$i]} = $line[$i];
			}
		}
		# --- APPLY FILTERS ---
		my $skip = 0;
		if (%mapped_filters) {
			foreach my $fld (sort { $a <=> $b } keys %mapped_filters) {
				local %_ = %line_hash; # Make %_ available to the callback
				local $_ = $fld == 0 ? $line_ref : $line[$fld - 1]; # Localize $_
				my $keep = $mapped_filters{$fld}->($line_ref, \%line_hash);
				if (!$keep) {
					$skip = 1;
					last;
				}
				# If the callback modified $_, write the mutation back to the data.
				if ($fld > 0) {
					$line[$fld - 1] = $_;
					$line_hash{$header[$fld - 1]} = (defined($_) && $_ eq '') ? 'NA' : $_;
				}
			}
		}
		return if $skip; # Reject the row if it failed the filter, skipping memory allocation

		# Populate the requested output structure.
		if ($args{'output.type'} eq 'aoh') {
			push @data, \%line_hash;
		} elsif ($args{'output.type'} eq 'hoa') {
			foreach my $col (@header) {
				push @{ $data{$col} }, $line_hash{$col};
			}
		} elsif ($args{'output.type'} eq 'hoh') {
			my $row_name = $line_hash{$args{'row.names'}};
			foreach my $col (@header) {
				next if $col eq $args{'row.names'};
				$data{$col}{$row_name} = $line_hash{$col};
			}
		}
	});

	# Clear per-call state promptly in case the XS layer retains the closure.
	@header = ();
	%mapped_filters = ();
	if ($args{'output.type'} eq 'aoh') {
		undef %data; # free the unused container
		return \@data;
	}
	# output.type was validated above, so anything else is 'hoa' or 'hoh';
	# an unconditional return here closes the old implicit-undef fall-through.
	@data = ();
	return \%data;
}
161
162#sub write_table {
163#       my $data_ref = (ref($_[0]) eq 'HASH' || ref($_[0]) eq 'ARRAY') ? shift : undef;
164#       my $file = shift;
165#       my %args = (
166#               sep         => ',',
167#               'row.names' => 1,      
168#               @_,
169#       );
170#       my $current_sub = (split(/::/,(caller(0))[3]))[-1];
171#       $args{data} //= $data_ref;
172#       my %allowed = map { $_ => 1 } qw(data file row.names sep col.names);
173#       my @err = grep { !$allowed{$_} } keys %args;
174#       if (@err > 0) {
175#               die "$current_sub: Unknown arguments passed: " . join(", ", @err) . "\n";
176#       }
177#       die "$current_sub: 'data' must be a HASH or ARRAY reference\n"
178#               unless defined $args{data} && (ref($args{data}) eq 'HASH' || ref($args{data}) eq 'ARRAY');
179
180#       my $col_names = $args{'col.names'};
181#       if (defined $col_names && ref($col_names) ne 'ARRAY') {
182#               die "$current_sub: 'col.names' must be an ARRAY reference\n";
183#       }
184#       my $data         = $args{data};
185#       my $sep          = $args{sep};
186#       my $inc_rownames = $args{'row.names'};
187#       my $quote_field = sub {
188#               my ($val, $sep) = @_;
189#               return '' unless defined $val;
190#               die "$current_sub: Cannot write nested reference types to table\n" if ref($val);
191#               
192#               my $str = "$val";  
193#               if (index($str, $sep) != -1 || index($str, '"') != -1 || $str =~ /[\r\n]/) {
194#                       $str =~ s/"/""/g;
195#                       $str = qq{"$str"};
196#               }
197#               return $str;
198#       };
199#       my $data_type = ref $data;
200#       my ($is_hoh, $is_hoa, $is_aoh) = (0, 0, 0);
201#       my @rows;
202#       if ($data_type eq 'HASH') {
203#               @rows = keys %$data;
204#               return if @rows == 0;
205#               my $first_type = ref $data->{$rows[0]};
206#               die "$current_sub: Data values must be either all HASHes or all ARRAYs\n"
207#                       unless $first_type eq 'HASH' || $first_type eq 'ARRAY';
208
209#               my @type_err = grep { ref $data->{$_} ne $first_type } @rows;
210#               if (@type_err > 0) {
211#                       die "$current_sub: Mixed data types detected. Ensure all values are $first_type references.\n";
212#               }
213#               $is_hoh = ($first_type eq 'HASH');
214#               $is_hoa = ($first_type eq 'ARRAY');
215#       } else {
216#               return if @$data == 0;
217
218#               my $first_elem = $data->[0];
219#               die "$current_sub: For ARRAY data, all elements must be HASH references (Array of Hashes)\n"
220#                       unless defined $first_elem && ref($first_elem) eq 'HASH';
221
222#               my @type_err = grep { !defined($_) || ref($_) ne 'HASH' } @$data;
223#               if (@type_err > 0) {
224#                       die "$current_sub: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n";
225#               }
226#               $is_aoh = 1;
227#       }
228#       open my $fh, '>', $file or die "$current_sub: Could not open '$file' for writing: $!\n";
229#       if ($is_hoh) {
230#               my @headers;
231#               if (defined $col_names) {  # Bug 6 fix: was "if ($col_names)" — use defined
232#                       @headers = @$col_names;
233#               } else {
234#                       my %col_map;
235#                       for my $r (@rows) {
236#                               $col_map{$_} = 1 for keys %{ $data->{$r} };
237#                       }
238#                       @headers = sort keys %col_map;
239#               }
240
241#               my @header_row = @headers;
242#               unshift @header_row, '' if $inc_rownames;
243#               @header_row = map { $quote_field->($_, $sep) } @header_row;
244#               print $fh join($sep, @header_row) . "\n";
245
246#               for my $r (sort @rows) {
247#                       my @row_data = map { defined $data->{$r}{$_} ? $data->{$r}{$_} : "NA" } @headers;
248#                       unshift @row_data, $r if $inc_rownames;
249#                       my @quoted = map { $quote_field->($_, $sep) } @row_data;
250#                       print $fh join($sep, @quoted) . "\n";
251#               }
252#       } elsif ($is_hoa) {
253#               # 1. Find the maximum number of rows
254#               my $max_rows = 0;
255#               foreach my $col (keys %$data) {
256#                       my $len = scalar @{ $data->{$col} };
257#                       $max_rows = $len if $len > $max_rows;
258#               }
259#               $max_rows--; # Convert length to max index
260
261#               # 2. Determine headers
262#               my @headers;
263#               if (defined $col_names) {
264#                       @headers = @$col_names;
265#               } else {
266#                       @headers = sort keys %$data;
267#               }
268#               
269#               die "Could not get headers in $current_sub" if @headers == 0;
270
271#               # Bug 5 fix: if row.names is a column name (non-numeric string), pull it out to be first
272#               my $rownames_col;
273#               if ($inc_rownames && $inc_rownames =~ /\D/) {
274#                       $rownames_col = $inc_rownames;
275#                       @headers = grep { $_ ne $rownames_col } @headers;
276#               }
277
278#               # 3. Print Header Row
279#               my @header_row = @headers;
280#               unshift @header_row, '' if $inc_rownames;
281#               @header_row = map { $quote_field->($_, $sep) } @header_row;
282#               print $fh join($sep, @header_row) . "\n";
283
284#               # 4. Process and Print Data Rows
285#               for my $i (0 .. $max_rows) {
286#                       my @row_data;
287#                       
288#                       foreach my $col (@headers) {
289#                               push @row_data, defined($data->{$col}[$i]) ? $data->{$col}[$i] : 'NA';
290#                       }
291#                       
292#                       if ($inc_rownames) {
293#                               # Bug 5 fix: use named column value if row.names is a column name
294#                               my $rn_val = defined $rownames_col
295#                                       ? (defined $data->{$rownames_col}[$i] ? $data->{$rownames_col}[$i] : 'NA')
296#                                       : $i + 1;
297#                               unshift @row_data, $rn_val;
298#                       }
299#                       
300#                       my @quoted = map { $quote_field->($_, $sep) } @row_data;
301#                       print $fh join($sep, @quoted) . "\n";
302#               }
303#       } elsif ($is_aoh) {
304#               my @headers;
305#               if (defined $col_names) {  # Bug 6 fix: was "if ($col_names)" — use defined
306#                       @headers = @$col_names;
307#               } else {
308#                       my %col_map;
309#                       for my $row_hash (@$data) {
310#                               $col_map{$_} = 1 for keys %$row_hash;
311#                       }
312#                       @headers = sort keys %col_map;
313#               }
314
315#               # Bug 5 fix: if row.names is a column name (non-numeric string), pull it out to be first
316#               my $rownames_col;
317#               if ($inc_rownames && $inc_rownames =~ /\D/) {
318#                       $rownames_col = $inc_rownames;
319#                       @headers = grep { $_ ne $rownames_col } @headers;
320#               }
321
322#               my @header_row = @headers;
323#               unshift @header_row, "" if $inc_rownames;
324#               @header_row = map { $quote_field->($_, $sep) } @header_row;
325#               print $fh join($sep, @header_row) . "\n";  # Bug 7 fix: was "say $fh" — use print consistently
326#               for my $i (0 .. $#$data) {
327#                       my $row_hash = $data->[$i];
328#                       my @row_data = map { defined $row_hash->{$_} ? $row_hash->{$_} : "NA" } @headers;
329#                       if ($inc_rownames) {
330#                               # Bug 5 fix: use named column value if row.names is a column name
331#                               my $rn_val = defined $rownames_col
332#                                       ? (defined $row_hash->{$rownames_col} ? $row_hash->{$rownames_col} : 'NA')
333#                                       : $i + 1;
334#                               unshift @row_data, $rn_val;
335#                       }
336#                       my @quoted = map { $quote_field->($_, $sep) } @row_data;
337#                       print $fh join($sep, @quoted) . "\n";
338#               }
339#       }
340#       close $fh;  # Bug 8 fix: file handle was never closed
341#}
3421;
343=encoding utf8
344
345 - 848
=head1 Synopsis

Get basic R statistical functions working in Perl as if they were part of List::Util, like C<min>, C<max>, C<sum>, etc.
I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.
There are other similar tools on CPAN, but I wanted speed and a List::Util-like interface, which I've achieved here with the help of AI — though it often took many attempts to get right.
This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.

There B<are> other modules on CPAN that can do B<PARTS> of this, but this works the way that I B<want> it to.

=head1 Functions/Subroutines

=head2 aov

 aov(
 {
     yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
     ctrl  => [1,     1,   1,   0,   0,   0]
 },
 'yield ~ ctrl');

which returns

 {
     ctrl        {
         Df          1,
         "F value"   25.6000000000001,
         "Mean Sq"   1.70666666666667,
         Pr(>F)      0.00718232855871859,
         "Sum Sq"    1.70666666666667
     },
     Residuals   {
         Df          4,
         "Mean Sq"   0.0666666666666665,
         "Sum Sq"    0.266666666666666
    }
 }

You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:

 my $res_2way = aov($data_2way, 'len ~ supp * dose');

It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.

=head2 chisq_test

 my @test_data = ([762, 327, 468], [484, 239, 477]);
 my $test_data = chisq_test(\@test_data);

which outputs:

 {
 data.name   "Perl ArrayRef",
 expected    [
     [0] [
             [0] 703.671381936888,
             [1] 319.645266594124,
             [2] 533.683351468988
         ],
     [1] [
             [0] 542.328618063112,
             [1] 246.354733405876,
             [2] 411.316648531012
         ]
 ],
 method      "Pearson's Chi-squared test",
 observed    [
     [0] [
             [0] 762,
             [1] 327,
             [2] 468
         ],
     [1] [
             [0] 484,
             [1] 239,
             [2] 477
         ]
 ],
 p.value     2.95358918321176e-07,
 parameter   {
     df   2
 },
 statistic   {
     X-squared   30.0701490957547
 }
 }

It also supports 1D arrays for Goodness of Fit tests:

 my $chisq_1d = chisq_test([10, 20, 30]);

For 2x2 matrices, Yates' Continuity Correction is applied automatically, exactly like in R.

=head2 cor

 cor($array1, $array2, $method = 'pearson'),

that is, C<pearson> is the default and will be used if C<$method> is not specified.

Just like R, C<pearson>, C<spearman>, and C<kendall> are available

If you provide an array of arrays (a matrix), C<cor> will compute the correlation matrix automatically. 

=head2 cor_test

 my $result = cor_test(
         'x'         => $x,
         'y'         => $y,
         alternative => 'two.sided',
         method      => 'pearson',
         continuity  => 1
     );

C<cor_test> safely handles C<undef> (or C<NA>) values seamlessly by computing over pairwise complete observations. 

=head2 cov

 cov($array1, $array2, 'pearson')

or

 cov($array1, $array2, 'spearman')

or

 cov($array1, $array2, 'kendall')

=head2 fisher_test

=head3 array reference entry

 my $array_data = [
     [10, 2],
     [3, 15]
 ];
 my $res1 = fisher_test($array_data);

which returns a hash reference:

 {
 alternative   "two.sided",
 conf_int      [
     [0] 2.75338278824932,
     [1] 301.462337971516
 ],
 estimate      {
     "odds ratio"   21.3053175567504
 },
 method        "Fisher's Exact Test for Count Data",
 p_value       0.00053672411914343

}

=head3 hash reference entry

 $ft = fisher_test( {
     Guess => {
         Milk => 3, Tea => 1
     },
     Truth => {
         Milk => 1, Tea => 3
     }
 });

The p-value is calculated very precisely and matches R exactly; the confidence intervals, however, carry a small inexactness (approximately 1%) that I couldn't rectify, so they are only very close to R's.

=head2 glm

takes a hash of an array as input

 my %tooth_growth = (
     dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
 2.0 2.0 2.0)],
     len  => [qw(4.2 11.5  7.3  5.8  6.4 10.0 11.2 11.2  5.2  7.0 16.5 16.5 15.2 17.3 22.5
 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
 15.2 21.5 17.6  9.7 14.5 10.0  8.2  9.4 16.5  9.7 19.7 23.3 23.6 26.4 20.0
 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],
     supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
 VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
 OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]
 );
 
 my $glm_teeth = glm(
     data    => \%tooth_growth,
     formula => 'len ~ dose + supp',
     family  => 'gaussian'
 );

I'm not completely confident that this is working perfectly, though I've gotten this subroutine to work for simple cases.

In addition to the C<gaussian> default, it fully supports logistic regression using the C<binomial> family parameter via Iteratively Reweighted Least Squares (IRLS):

 my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');

=head2 hist

Computes the histogram of the given data values in a single O(N) pass. It returns the bin counts, computed breaks, midpoints, and density.

 my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);

If C<breaks> is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.

=head2 kruskal_test

Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)

Performs a Kruskal-Wallis rank sum test, see 
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test

=head3 R-like array entry

 my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects
 my @yk = (3.8, 2.7, 4.0, 2.4);      # with obstructive airway disease
 my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis
 my @x = (@xk, @yk, @zk);
 my @g = (
     (map {'Normal subjects'} 0..4),
     (map {'Subjects with obstructive airway disease'} 0..3),
     map {'Subjects with asbestosis'} 0..4
 );
 my $t0 = Time::HiRes::time();
 my $kt = kruskal_test(\@x, \@g);
 my $t1 = Time::HiRes::time();
 printf("Kruskal calculation in %g seconds.\n", $t1-$t0);
 p $kt;

=head3 hash of array entry

 my %x = (
 'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],
 'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],
 'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]
 );
 $t0 = Time::HiRes::time();
 $kt = kruskal_test(\%x);
 $t1 = Time::HiRes::time();
 printf("Kruskal calculation via HoA in %g seconds.\n", $t1-$t0);
 p $kt;

=head2 ks_test

The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are part of the same distribution is implemented simply:

 $ks = ks_test(\@x, \@y, alternative => 'greater');

returning a hash reference.

Also, a single array can be tested against a normal distribution:

 $ks = ks_test($ksx, 'pnorm');

The p-value precision is about 1e-8, which I want to improve, but am not sure how.

=head2 lm

This is the linear models function.

 $lm = lm(formula =>  'mpg ~ wt + hp', data => $mtcars);

where C<$mtcars> is a hash of hashes

C<lm> also supports generating interaction terms directly within the formula using the C<*> operator:

 my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);

If your data contains missing numbers (C<NA> or C<undef>), C<lm> handles listwise deletion dynamically to ensure mathematical integrity before fitting.

the dot operator also works:

 $lm = lm(formula => 'y ~ .', data => $dot_data);

=head2 matrix

 my $mat1 = matrix(
     data => [1..6],
     nrow => 2
 );

You can also pass C<< byrow =E<gt> 1 >> if you want the matrix populated row-wise instead of column-wise.

=head2 mean

 mean(1,2,3);

or

 my @arr = 1..8;
 mean(@arr, 4, 5)

or

 mean([1,1], [2,2]) # 1.5

=head2 median

works like mean, taking array references and arrays:

 median( $test_data[$i][0] )

=head2 p_adjust

Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"

 my @q = p_adjust(\@pvalues, $method);

=head2 power_t_test

 $test_data = power_t_test(
     n   => 30,  delta     => 0.5, 
     sd  => 1.0, sig_level => 0.05
 );

It also allows configuring the test type (C<< type =E<gt> 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E<gt> 'one.sided' >>). You can also pass C<< strict =E<gt> 1 >> to strictly evaluate both tails of the distribution.

=head2 quantile

Calculates sample quantiles using R's continuous Type 7 interpolation. 

 my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);

If the C<probs> parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C<c(0, .25, .5, .75, 1)>). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).

=head2 rbinom

Create a binomial distribution of numbers

 my $binom = rbinom( n => $n, prob => 0.5, size => 9);

It hooks directly into Perl's internal PRNG system, respecting C<srand()> seeds. 

=head2 read_table

I've tried to make this as simple as possible, trying to follow from R:

 my $test_data = read_table('t/HepatitisCdata.csv');

output types can be AOH (aoh), HOA (hoa), HOH (hoh)

 read_table($filename, 'output.type' => 'aoh');
 
 read_table($filename, 'output.type' => 'hoa');

and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:

 $test_data = read_table(
     't/HepatitisCdata.csv',
     filter => {
         Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column
     },
     'output.type' => 'aoh'
 );

=head2 rnorm

Make a normal distribution of numbers, with pre-set mean C<mean>, standard deviation C<sd>, and number C<n>.

 my ($rmean, $sd, $n) = (10, 2, 9999);
 my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);

=head2 runif

=head3 named arguments

Make a distribution of approximately uniform distribution

 my $unif = runif( n => $n, min => 0, max => 1);

where C<n> is the number of items, the values are between C<min> and C<max>

=head3 positional args

this is to match R's behavior:

 runif( 9 )

will make 9 numbers in [0,1]

 runif(9, 0, 99)

will match C<n>, C<min>, and C<max> respectively

=head2 scale

 my @scaled_results = scale(1..5);

You can also pass an options hash to disable centering or scaling:

 my @scaled_results = scale(1..5, { center => false, scale => true });

It fully supports matrix operations. By passing an array of arrays, C<scale> processes the data column by column independently:

 my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);

=head2 sd

 my $stdev = sd(2,4,4,4,5,5,7,9);

Correct answer is 2.1380899352994;

=head2 seq

Works as closely as I can to R's seq, which is very similar to Perl's C<for> loops.  Returns an array, not an array reference.

=head3 Example 1: Standard integer sequence

 say 'seq(1, 5):';
 my @seq = seq(1, 5);
 say join(', ', @seq), "\n";
 
 say 'seq(1, 2, 0.25):';
 @seq = seq(1, 2, 0.25);

=head3 Example 2: Fractional steps

 say 'seq(1, 2, 0.25):';
 @seq = seq(1, 2, 0.25);
 say join(", ", @seq), "\n";
 for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop
     is_approx(pop @seq, $idx, "seq item $idx with fractional step");
 }

=head3 Example 3: Negative steps

 say 'seq(10, 5, -1):';
 @seq = seq(10, 5, -1);
 say join(", ", @seq), "\n";
 for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop
     is_approx(pop @seq, $idx, "seq item $idx with negative step");
 }

=head2 shapiro_test

tests to see if an array reference is normally distributed, returns a p-value and a statistic

 my $shapiro = shapiro_test(
     [1..5]
 );

and returns the hash reference:

 {
 p.value     0.589650577093106,
 p_value     0.589650577093106,
 statistic   0.960870680168535,
 W           0.960870680168535
 }

=head2 t_test

There are 1-sample and 2-sample t-tests:

 my $t_test = t_test( $test_data[$i][$j], mu => mean( $test_data[$i][$j] ));

or 2-sample:

 $t_test = t_test(
     $test_data[3][0],
     $test_data[3][1],
     paired => true
 );

returns a hash reference, which looks like:

 conf_int     => [
     -0.06672889, 0.25672889
 ],
 df        => 5,
 estimate  => 0.095,
 p_value   => 0.19143688433660,
 statistic => 1.50996688705414

the two groups compared can be specified, though not necessarily, as C<x> and C<y>, just like in R:

 $t_test = t_test(
     'x' => $test_data[3][0],
     'y' => $test_data[3][1],
     paired => true
 );

=head2 var

as simple as possible:

 var(2, 4, 5, 8, 9)

=head2 wilcox_test

 $test_data = wilcox_test(
     [1.83,  0.50,  1.62,  2.48, 1.68, 1.88, 1.55, 3.06, 1.30],
     [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]
 );

It fully supports paired tests (C<< paired =E<gt> true >>) and can calculate exact p-values (the default for N < 50 without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.

=head2 write_table

mimics R's "write.table", with data as first argument to subroutine, and output file as second

 write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => true);

You can also precisely filter and reorder which columns are written by passing an array reference to C<col.names>:

 write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);