File Coverage

File:Stats-LikeR-0.07/LikeR.xs
Coverage:77.5%

linestmtbrancondsubtimecode
1#define _GNU_SOURCE
2/* --- C HELPER SECTION --- */
3#define PERL_NO_GET_CONTEXT
4#include "EXTERN.h"
5#include "perl.h"
6#include "XSUB.h"
7#include "ppport.h"
8#include <math.h>
9#include <ctype.h>
10#include <stdlib.h>
11#include <float.h>
12#include <string.h>
13#include <stdint.h>   /* uint64_t — harmless if perl.h already pulled it in */
14/*
15XS words:
16SvROK = scalar value reference is OK
17*/
18/* ── sample(): private splitmix64 PRNG ─────────────────────────────────────
19 *
20 * sample() gets its own PRNG state, completely separate from Drand01.
21 * That means generate_binomial(), ruif(), rbinom(), and every other caller
22 * of Drand01() are unaffected — their streams are never advanced or reseeded
23 * by anything sample() does.
24 *
25 * Seeding is lazy (first call) and reads from /dev/urandom; falls back to
26 * time()^PID on systems without it.  No aTHX needed: all calls are plain C.
27 * PERL_NO_GET_CONTEXT is therefore not a concern here.
28 */
29
229
static uint64_t sample__state  = 0;
30
229
static bool     sample__seeded = FALSE;
31
32
229
PERL_STATIC_INLINE uint64_t
33
229
sample__mix64(void)
34
229
{
35
229
        uint64_t z = (sample__state += UINT64_C(0x9e3779b97f4a7c15));
36        z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
37
229
        z = (z ^ (z >> 27)) * UINT64_C(0x94d049bb133111eb);
38
229
        return z ^ (z >> 31);
39}
40
41
6869771
static void
42
6869771
sample__seed(void)
43{
44        uint64_t s = 0;
45
6869771
        size_t   got = 0;
46
6869771
        FILE    *restrict ur  = fopen("/dev/urandom", "rb");
47        if (ur) { got = fread(&s, sizeof s, 1, ur); fclose(ur); }
48        if (got != 1 || s == 0)
49
6869771
          s = (uint64_t)time(NULL) ^ ((uint64_t)getpid() << 32);
50
6869771
        sample__state  = s;
51        (void)sample__mix64();   /* discard first output to warm the state */
52
6869771
        sample__seeded = TRUE;
53
6869771
}
54
55/* Uniform integer in [0, upper) — rejection loop, no modulo bias */
56
229
PERL_STATIC_INLINE size_t
57sample__rand(size_t upper) {
58        const uint64_t u = (uint64_t)upper;
59        const uint64_t t = (uint64_t)(-(uint64_t)u) % u;
60        uint64_t r;
61        do { r = sample__mix64(); } while (r < t);
62
11
        return (size_t)(r % u);
63
11
}
64
11
/* ── end sample() private PRNG ─────────────────────────────────────────── */
65
66/* Ensure Perl's PRNG is seeded, matching the lazy-evaluation of Perl's rand() */
67
11
#define AUTO_SEED_PRNG() \
68
11
        do { \
69                if (!PL_srand_called) { \
70                        (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); \
71
2
                        PL_srand_called = TRUE; \
72
2
                } \
73
12
        } while (0)
74
75
10
// ---------------------------------------
76//   Helpers for Random Number Generation
77
2
// ---------------------------------------
78#ifndef M_PI
79
12
#define M_PI 3.14159265358979323846
80
10
#endif
81
10
// C helper for EXACT Non-central T-distribution CDF via Numerical Integration.
82
10
// This perfectly replicates R's pt(..., ncp) exactness without requiring complex Beta functions.
83
20
static double exact_pnt(double t, double df, double ncp) {
84
10
        if (df <= 0.0) return 0.0;
85        unsigned short int n_steps = 30000;
86
2
        double step = 1.0 / n_steps;
87
12
        double integral = 0.0, half_df = df / 2.0;
88
89
2
        double log_coef = log(2.0) + half_df * log(half_df) - lgamma(half_df);
90        double root_half = 0.70710678118654752440; // 1 / sqrt(2)
91
92
20499
        for (unsigned short i = 1; i < n_steps; i++) {
93
20499
                double u = i * step;
94
20399
                double w = u / (1.0 - u);
95                // Scaled Chi-distribution log-density
96
20299
                double log_M = log_coef + (df - 1.0) * log(w) - half_df * w * w;
97
312290
                double M = exp(log_M);
98
291991
                // Exact Normal CDF using the C standard library's erfc function
99                double z = t * w - ncp;
100
20299
                double pnorm_val = 0.5 * erfc(-z * root_half);
101                double weight = (i % 2 != 0) ? 4.0 : 2.0;
102                integral += weight * (pnorm_val * M / ((1.0 - u) * (1.0 - u)));
103
142
        }
104
142
        return integral * (step / 3.0);
105}
106// --- Math Helpers for P-values and Confidence Intervals ---
107
108
404
// Ranking helper with tie adjustment (matches R's tie handling)
109
404
typedef struct { double val; size_t idx; double rank; } RankInfo;
110static int compare_rank(const void *restrict a, const void *restrict b) {
111
3352
        double diff = ((RankInfo*)a)->val - ((RankInfo*)b)->val;
112
2948
        return (diff > 0) - (diff < 0);
113
2948
}
114
115static int compare_index(const void *restrict a, const void *restrict b) {
116
404
        return ((RankInfo*)a)->idx - ((RankInfo*)b)->idx;
117
3352
}
118
119static void compute_ranks(double *restrict data, double *restrict ranks, size_t n) {
120        RankInfo *restrict items = safemalloc(n * sizeof(RankInfo));
121
404
        for (size_t i = 0; i < n; i++) {
122
404
                items[i].val = data[i];
123                items[i].idx = i;
124
3352
        }
125
2948
        qsort(items, n, sizeof(RankInfo), compare_rank);
126
2948
        // Handle ties by averaging ranks
127
2948
        for (size_t i = 0; i < n; ) {
128                size_t j = i + 1;
129
404
                while (j < n && items[j].val == items[i].val) j++;
130                double avg_rank = (i + 1 + j) / 2.0;
131                for (size_t k = i; k < j; k++) items[k].rank = avg_rank;
132
5
                i = j;
133
5
        }
134
5
        qsort(items, n, sizeof(RankInfo), compare_index);
135
5
        for (size_t i = 0; i < n; i++) ranks[i] = items[i].rank;
136
5
        Safefree(items);
137}
138
5
// Generates a single binomial random variate.
139
5
//Uses the standard Bernoulli trial loop. Drand01() taps into Perl's PRNG.
140static size_t generate_binomial(const size_t size, const double prob) {
141
5
        if (prob <= 0.0) return 0;
142
5
        if (prob >= 1.0) return size;
143
144
33
        size_t successes = 0;
145        for (size_t i = 0; i < size; i++) {
146                if (Drand01() <= prob) successes++;
147        }
148
5
        return successes;
149
5
}
150
5
// Helper: log combination
151static double log_choose(size_t n, size_t k) {
152
4
        return lgamma((double)n + 1.0) - lgamma((double)k + 1.0) - lgamma((double)(n - k) + 1.0);
153
232
}
154
155
232
// Log-space tails for non-central hypergeometric
156
1856
static void calc_tails_logspace(size_t a, size_t min_x, size_t max_x, double omega, const double *logdc, double *restrict lower_tail, double *restrict upper_tail) {
157
1624
        double max_d = -1e300, log_omega = log(omega);
158
159        for(size_t k = 0; k <= max_x - min_x; ++k) {
160
232
          double d_val = logdc[k] + log_omega * (min_x + k);
161
1856
          if (d_val > max_d) max_d = d_val;
162
1624
        }
163
164
1624
        double sum_d = 0.0;
165        for(size_t k = 0; k <= max_x - min_x; ++k) {
166
232
          sum_d += exp(logdc[k] + log_omega * (min_x + k) - max_d);
167        }
168
169
92
        *lower_tail = 0.0;
170
232
        *upper_tail = 0.0;
171
172
4
        for(size_t k = 0; k <= max_x - min_x; ++k) {
173          double p_prob = exp(logdc[k] + log_omega * (min_x + k) - max_d) / sum_d;
174          if (min_x + k <= a) *lower_tail += p_prob;
175
5
          if (min_x + k >= a) *upper_tail += p_prob;
176
5
        }
177}
178
179
5
// Exact stats using log-space
180
4
static void calculate_exact_stats(size_t a, size_t b, size_t c, size_t d, double conf_level, const char*restrict alt, double *restrict mle_or, double *restrict ci_low, double *restrict ci_high) {
181
4
        double alpha = 1.0 - conf_level;
182
4
        size_t r1 = a + b, r2 = c + d, c1 = a + c;
183
232
        size_t min_x = (r2 > c1) ? 0 : c1 - r2;
184
232
        size_t max_x = (r1 < c1) ? r1 : c1;
185
186
232
        bool is_less = (strcmp(alt, "less") == 0);
187
232
        bool is_greater = (strcmp(alt, "greater") == 0);
188
189
232
        double *restrict logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
190
106
        double denom = log_choose(r1 + r2, c1);
191
232
        for(size_t x = min_x; x <= max_x; ++x) {
192          logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
193
4
        }
194
195        // MLE
196        if (a == min_x && a == max_x) *mle_or = 1.0;
197        else if (a == min_x) *mle_or = 0.0;
198
5
        else if (a == max_x) *mle_or = INFINITY;
199
4
        else {
200
4
          double log_low = -100.0, log_high = 100.0;
201
3
          for (unsigned short int i = 0; i < 3000; i++) {
202
172
                   double log_mid = 0.5 * (log_low + log_high);
203
172
                   double max_d = -1e300;
204
172
                   for(size_t k = 0; k <= max_x - min_x; ++k) {
205
172
                       double d_val = logdc[k] + log_mid * (min_x + k);
206
172
                       if (d_val > max_d) max_d = d_val;
207
172
                   }
208
172
                   double sum_d = 0.0, exp_val = 0.0;
209
88
                   for(size_t k = 0; k <= max_x - min_x; ++k) {
210
172
                       double p_prob = exp(logdc[k] + log_mid * (min_x + k) - max_d);
211                       sum_d += p_prob;
212
3
                       exp_val += (min_x + k) * p_prob;
213                   }
214                   exp_val /= sum_d;
215
5
                   if (exp_val > a) log_high = log_mid;
216
5
                   else log_low = log_mid;
217                   if (log_high - log_low < 1e-15) break;
218          }
219
5
          *mle_or = exp(0.5 * (log_low + log_high));
220
5
        }
221
222
5
        *ci_low = 0.0;
223        *ci_high = INFINITY;
224
225
5
        // Lower CI
226
38
        if (!is_less) {
227
33
          double target_alpha = is_greater ? alpha : alpha / 2.0;
228          if (a != min_x) {
229                   double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
230
5
                   for (unsigned short int i = 0; i < 1000; i++) {
231                       double log_mid = 0.5 * (log_low + log_high);
232
5
                       double mid = exp(log_mid);
233
5
                       calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
234
4
                       double err = fabs(ut - target_alpha);
235
3
                       if (err < best_err) { best_err = err; best = mid; }
236                       if (ut > target_alpha) log_high = log_mid;
237
3
                       else log_low = log_mid;
238
3
                       if (log_high - log_low < 1e-15) break;
239
26
                   }
240
23
                   *ci_low = best;
241
23
          }
242        }
243
244        // Upper CI
245
5
        if (!is_greater) {
246
5
                double target_alpha = is_less ? alpha : alpha / 2.0;
247                if (a != max_x) {
248                        double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
249                        for (unsigned short int i = 0; i < 1000; i++) {
250                                double log_mid = 0.5 * (log_low + log_high);
251                                double mid = exp(log_mid);
252                                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
253                                double err = fabs(lt - target_alpha);
254                                if (err < best_err) { best_err = err; best = mid; }
255                                if (lt > target_alpha) log_low = log_mid;
256
44
                                else log_high = log_mid;
257
44
                                if (log_high - log_low < 1e-15) break;
258
44
                        }
259                        *ci_high = best;
260                }
261
170
        }
262
126
        safefree(logdc);
263
126
}
264
265// Exact p-value using log-space
266
170
static double exact_p_value(size_t a, size_t b, size_t c, size_t d, const char* alt) {
267        size_t r1 = a + b, r2 = c + d, c1 = a + c;
268        size_t min_x = (r2 > c1) ? 0 : c1 - r2;
269
126
        size_t max_x = (r1 < c1) ? r1 : c1;
270
271        double *logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
272
4
        double denom = log_choose(r1 + r2, c1);
273
3
        for(size_t x = min_x; x <= max_x; ++x) {
274
3
          logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
275        }
276
277        double p_val = 0.0;
278
279
125
        if (strcmp(alt, "less") == 0) {
280
125
          for(size_t x = min_x; x <= a; ++x) p_val += exp(logdc[x - min_x]);
281
492
        } else if (strcmp(alt, "greater") == 0) {
282
492
          for(size_t x = a; x <= max_x; ++x) p_val += exp(logdc[x - min_x]);
283
367
        } else {
284
236
          double p_obs = exp(logdc[a - min_x]);
285
236
          double relErr = 1.0 + 1e-7;
286
946
          for(size_t x = min_x; x <= max_x; ++x) {
287
710
                   double p_cur = exp(logdc[x - min_x]);
288                   if (p_cur <= p_obs * relErr) p_val += p_cur;
289          }
290        }
291
292
44
        safefree(logdc);
293
44
        return (p_val > 1.0) ? 1.0 : p_val;
294}
295/* -----------------------------------------------------------------------
296 * Helpers for lm Linear Regression: OLS Matrix Math & Formula Parsing
297
1759
* ----------------------------------------------------------------------- */
298
299
1759
/* Sweep operator for symmetric positive-definite matrices (e.g., XtX).
300
1184
* This gracefully handles collinearity by bypassing aliased columns.
301
1184
* Utilizes a relative tolerance check to prevent dropping micro-variance features.
302
1184
*/
303
1184
static int sweep_matrix_ols(double *restrict A, size_t n, bool *restrict aliased) {
304        int rank = 0;
305
575
        double *restrict orig_diag = (double*)safemalloc(n * sizeof(double));
306
307
575
        // Save the original diagonal values to use as a baseline for relative variance
308
575
        for (size_t k = 0; k < n; k++) {
309
575
                aliased[k] = FALSE;
310                orig_diag[k] = A[k * n + k];
311        }
312
313
1756
        for (size_t k = 0; k < n; k++) {
314
49
                // Check pivot for collinearity using a RELATIVE tolerance
315                // (Fallback to a tiny absolute tolerance of 1e-24 to catch literal zero vectors)
316
3
                if (fabs(A[k * n + k]) <= 1e-10 * orig_diag[k] || fabs(A[k * n + k]) < 1e-24) {
317                        aliased[k] = TRUE;
318                        // Isolate this column so it doesn't affect the rest of the matrix
319                        for (size_t i = 0; i < n; i++) {
320
8
                                A[k * n + i] = 0.0;
321
8
                                A[i * n + k] = 0.0;
322
8
                        }
323
8
                        continue;
324                }
325
30
                rank++;
326
22
                double pivot = 1.0 / A[k * n + k];
327                A[k * n + k] = 1.0;
328
0
                for (size_t j = 0; j < n; j++) A[k * n + j] *= pivot;
329
0
                for (size_t i = 0; i < n; i++) {
330                        if (i != k && A[i * n + k] != 0.0) {
331
0
                                  double factor = A[i * n + k];
332
0
                                  A[i * n + k] = 0.0;
333                                  for (size_t j = 0; j < n; j++) {
334                                       A[i * n + j] -= factor * A[k * n + j];
335
8
                                  }
336                        }
337                }
338        }
339
1791
        Safefree(orig_diag);
340
1791
        return rank;
341}
342
343
1791
// Internal extractor resolving single data values. Returns NAN on missing or non-numeric.
344
1791
static double get_data_value(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict var) {
345
32
        SV **restrict val = NULL;
346
32
        if (row_hashes) {
347
32
          val = hv_fetch(row_hashes[i], var, strlen(var), 0);
348
32
          if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
349                   AV*restrict av = (AV*)SvRV(*val);
350
32
                   val = av_fetch(av, 0, 0);
351
32
          }
352        } else if (data_hoa) {
353
1759
          SV**restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
354
0
          if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
355
0
                   AV*restrict av = (AV*)SvRV(*col);
356
0
                   val = av_fetch(av, i, 0);
357
0
          }
358
0
        }
359
0
        if (val && SvOK(*val)) {
360
0
          if (looks_like_number(*val)) return SvNV(*val);
361
0
          return NAN; // Catch strings like "blue"
362        }
363
0
        return NAN; // Catch undef/missing keys
364
0
}
365
366
0
// Helper: Get all available columns for the '.' operator expansion
367
0
static AV* get_all_columns(HV *restrict data_hoa, HV **restrict row_hashes, size_t n) {
368        AV *restrict cols = newAV();
369
1759
        if (data_hoa) {
370
1759
          hv_iterinit(data_hoa);
371
1759
          HE *restrict entry;
372          while ((entry = hv_iternext(data_hoa))) {
373                   av_push(cols, newSVsv(hv_iterkeysv(entry)));
374          }
375
54
        } else if (row_hashes && n > 0 && row_hashes[0]) {
376
86
          hv_iterinit(row_hashes[0]);
377
85
          HE *restrict entry;
378
85
          while ((entry = hv_iternext(row_hashes[0]))) {
379
55
                   av_push(cols, newSVsv(hv_iterkeysv(entry)));
380
55
          }
381
23
        }
382
23
        return cols;
383}
384
385
30
// Recursive formula resolver with tightened NaN and Null handling
386
30
static double evaluate_term(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict term) {
387
30
        if (!term || term[0] == '\0') return NAN;
388
389        char *restrict term_cpy = savepv(term);
390        char *restrict colon = strchr(term_cpy, ':');
391
85
        if (colon) {
392
53
          *colon = '\0';
393
9
          double left = evaluate_term(data_hoa, row_hashes, i, term_cpy);
394          double right = evaluate_term(data_hoa, row_hashes, i, colon + 1);
395          Safefree(term_cpy);
396
397          if (isnan(left) || isnan(right)) return NAN;
398          return left * right;
399        }
400
347
        if (strncmp(term_cpy, "I(", 2) == 0) {
401
347
          char *restrict end = strrchr(term_cpy, ')');
402
347
          if (end) *end = '\0';
403
0
          char *restrict inner = term_cpy + 2;
404
0
          char *restrict caret = strchr(inner, '^');
405
0
          int power = 1;
406
0
          if (caret) {
407                   *caret = '\0';
408
347
                   power = atoi(caret + 1);
409
347
          }
410
347
          double v = get_data_value(data_hoa, row_hashes, i, inner);
411
347
          Safefree(term_cpy);
412
413          if (isnan(v)) return NAN;
414          return power == 1 ? v : pow(v, power);
415
347
        }
416
347
        double result = get_data_value(data_hoa, row_hashes, i, term_cpy);
417        Safefree(term_cpy);
418
0
        return result;
419}
420
421// Helper to infer column type from its first valid element
422static bool is_column_categorical(HV *restrict data_hoa, HV **restrict row_hashes, size_t n, const char *restrict var) {
423        for (size_t i = 0; i < n; i++) {
424                SV **restrict val = NULL;
425                if (row_hashes) {
426                        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
427                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
428
1519
                                 AV*restrict av = (AV*)SvRV(*val);
429
1519
                                 val = av_fetch(av, 0, 0);
430
1519
                        }
431
812
                } else if (data_hoa) {
432                        SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
433
0
                        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
434                                 AV*restrict av = (AV*)SvRV(*col);
435                                 val = av_fetch(av, i, 0);
436                        }
437                }
438                if (val && SvOK(*val)) {
439                        if (looks_like_number(*val)) return FALSE; // First valid is number -> Numeric Column
440                        return TRUE; // First valid is string -> Categorical Column
441                }
442        }
443        return FALSE;
444}
445
446
37
/* Internal extractor resolving single data string values using dynamic allocation. */
447
37
static char* get_data_string_alloc(HV *restrict data_hoa, HV **restrict row_hashes, size_t i, const char *restrict var) {
448
4
        SV **restrict val = NULL;
449
1
        if (row_hashes) {
450                val = hv_fetch(row_hashes[i], var, strlen(var), 0);
451                if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
452                        AV*restrict av = (AV*)SvRV(*val);
453                        val = av_fetch(av, 0, 0);
454
4
                }
455        } else if (data_hoa) {
456
4
                SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
457
32
                if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
458
4
                        AV*restrict av = (AV*)SvRV(*col);
459                        val = av_fetch(av, i, 0);
460
4
                }
461
31
        }
462
27
        if (val && SvOK(*val)) {
463          return savepv(SvPV_nolen(*val)); /* Allocates and returns string */
464
28
        }
465        return NULL;
466
27
}
467
468
27
// Struct for sorting p-values while remembering their original index
469typedef struct {
470
4
        double p;
471
4
        size_t orig_idx;
472} PVal;
473
474// Comparator for qsort
475
6
static int cmp_pval(const void *restrict a, const void *restrict b) {
476
6
        double diff = ((PVal*)a)->p - ((PVal*)b)->p;
477
36
        if (diff < 0) return -1;
478
30
        if (diff > 0) return 1;
479
30
        /* Stabilize sort by falling back to original index */
480        return ((PVal*)a)->orig_idx - ((PVal*)b)->orig_idx;
481
6
}
482
6
/* -----------------------------------------------------------------------
483
6
* Helpers for cor(): ranking (Spearman), Pearson r, Kendall tau-b
484
6
* ----------------------------------------------------------------------- */
485/* Item used to sort values while remembering their original index,
486 * needed for average-rank tie-breaking in Spearman correlation.        */
487typedef struct {
488        double val;
489        size_t idx;
490} RankItem;
491
492static int cmp_rank_item(const void *restrict a, const void *restrict b) {
493        double diff = ((RankItem*)a)->val - ((RankItem*)b)->val;
494        if (diff < 0) return -1;
495
1
        if (diff > 0) return  1;
496
1
        return 0;
497
9
}
498
499
36
/* Compute 1-based average ranks with tie-breaking into out[].
500
36
* in[] is not modified.                                                 */
501
36
static void rank_data(const double *restrict in, double *restrict out, size_t n) {
502
36
        RankItem *restrict ri;
503
35
        Newx(ri, n, RankItem);
504
35
        for (size_t i = 0; i < n; i++) { ri[i].val = in[i]; ri[i].idx = i; }
505
0
        qsort(ri, n, sizeof(RankItem), cmp_rank_item);
506
507        size_t i = 0;
508
1
        while (i < n) {
509
1
                size_t j = i;
510
1
                /* Find the full extent of this tie group */
511                while (j + 1 < n && ri[j + 1].val == ri[j].val) j++;
512                /* All members get the average of ranks i+1 … j+1 (1-based) */
513                double avg = (double)(i + j) / 2.0 + 1.0;
514                for (size_t k = i; k <= j; k++) out[ri[k].idx] = avg;
515
7
                i = j + 1;
516        }
517
7
        Safefree(ri);
518}
519
520
1
/* Pearson product-moment r between two n-element arrays.
521
1
* Returns NAN when either variable has zero variance (matches R).       */
522
1
static double pearson_corr(const double *restrict x, const double *restrict y, size_t n) {
523
1
        double sx = 0, sy = 0, sxy = 0, sx2 = 0, sy2 = 0;
524
1
        for (size_t i = 0; i < n; i++) {
525          sx  += x[i];     sy  += y[i];
526
6
          sxy += x[i]*y[i]; sx2 += x[i]*x[i]; sy2 += y[i]*y[i];
527
1
        }
528        double num = (double)n * sxy - sx * sy;
529
5
        double den = sqrt(((double)n * sx2 - sx*sx) * ((double)n * sy2 - sy*sy));
530        if (den == 0.0) return NAN;
531        return num / den;
532}
533
534/* Kendall's tau-b between two n-element arrays.
535 *
536 *   tau-b = (C − D) / sqrt((C + D + T_x)(C + D + T_y))
537
8427
*
538 * where C = concordant pairs, D = discordant, T_x = pairs tied only on
539 * x, T_y = pairs tied only on y.  Joint ties (both zero) are excluded
540
8427
* from numerator and denominator, matching R's cor(method="kendall").
541
8427
* Returns NAN when the denominator is zero.                             */
542
8427
static double kendall_tau_b(const double *restrict x, const double *restrict y, unsigned int n) {
543
8427
        size_t C = 0, D = 0, tie_x = 0, tie_y = 0;
544
181198
        for (size_t i = 0; i < n - 1; i++) {
545
181198
          for (size_t j = i + 1; j < n; j++) {
546
181198
                   int sx = (x[i] > x[j]) - (x[i] < x[j]);   /* sign of x[i]-x[j] */
547
181198
                   int sy = (y[i] > y[j]) - (y[i] < y[j]);
548
181198
                   if      (sx == 0 && sy == 0) { /* joint tie — not counted */ }
549
181198
                   else if (sx == 0)            tie_x++;
550
181198
                   else if (sy == 0)            tie_y++;
551
181198
                   else if (sx == sy)           C++;
552
181198
                   else                         D++;
553
181198
          }
554
181198
        }
555
181198
        double denom = sqrt((double)(C + D + tie_x) * (double)(C + D + tie_y));
556
181198
        if (denom == 0.0) return NAN;
557
181198
        return (double)(C - D) / denom;
558
181198
}
559
560
8427
/* Single dispatch: compute correlation according to method string.
561 * Allocates and frees temporary rank arrays internally for Spearman.   */
562static double compute_cor(const double *restrict x, const double *restrict y,
563
8468
                           size_t n, const char *restrict method) {
564
8468
        if (strcmp(method, "spearman") == 0) {
565
8468
          double *restrict rx, *restrict ry;
566
8427
          Newx(rx, n, double); Newx(ry, n, double);
567
8427
          rank_data(x, rx, n);
568
1580
          rank_data(y, ry, n);
569          double r = pearson_corr(rx, ry, n);
570          Safefree(rx); Safefree(ry);
571
8168
          return r;
572
8168
        }
573
8168
        if (strcmp(method, "kendall") == 0)
574
8168
          return kendall_tau_b(x, y, n);
575
8166
        /* default: pearson */
576
104
        return pearson_corr(x, y, n);
577}
578
579// Math macros
580
271
#define MAX_ITER 500
581
271
#define EPS 3.0e-15
582#define FPMIN 1.0e-30
583
584
372
static double _incbeta_cf(double a, double b, double x) {
585
372
        int m;
586
372
        double aa, c, d, del, h, qab, qam, qap;
587        qab = a + b; qap = a + 1.0; qam = a - 1.0;
588        c = 1.0; d = 1.0 - qab * x / qap;
589
7418
        if (fabs(d) < FPMIN) d = FPMIN;
590
7418
        d = 1.0 / d; h = d;
591
7418
        for (m = 1; m <= MAX_ITER; m++) {
592
7418
          int m2 = 2 * m;
593
3623
          aa = m * (b - m) * x / ((qam + m2) * (a + m2));
594          d = 1.0 + aa * d;
595
3795
          if (fabs(d) < FPMIN) d = FPMIN;
596          c = 1.0 + aa / c;
597
7418
          if (fabs(c) < FPMIN) c = FPMIN;
598          d = 1.0 / d; h *= d * c;
599
271
          aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
600          d = 1.0 + aa * d;
601          if (fabs(d) < FPMIN) d = FPMIN;
602
2835
          c = 1.0 + aa / c;
603
2835
          if (fabs(c) < FPMIN) c = FPMIN;
604
2835
          d = 1.0 / d; del = d * c; h *= del;
605
2835
          if (fabs(del - 1.0) < EPS) break;
606        }
607        return h;
608
0
}
609
610
0
static double incbeta(double a, double b, double x) {
611        if (x <= 0.0) return 0.0;
612        if (x >= 1.0) return 1.0;
613        double bt = exp(lgamma(a + b) - lgamma(a) - lgamma(b) + a * log(x) + b * log(1.0 - x));
614
5
        if (x < (a + 1.0) / (a + b + 2.0)) return bt * _incbeta_cf(a, b, x) / a;
615        return 1.0 - bt * _incbeta_cf(b, a, 1.0 - x) / b;
616
5
}
617
618
5
static double get_t_pvalue(double t, double df, const char*restrict alt) {
619        double x = df / (df + t * t);
620
23
        double prob_2tail = incbeta(df / 2.0, 0.5, x);
621
18
        if (strcmp(alt, "less") == 0) return (t < 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
622
18
        if (strcmp(alt, "greater") == 0) return (t > 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
623        return prob_2tail;
624}
625
626
2017
// Bisection algorithm to find the inverse t-distribution (Critical t-value)
627
2014
static double qt_tail(double df, double p_tail) {
628        double low = 0.0, high = 1.0;
629
2014
        // Find upper bound
630        while (get_t_pvalue(high, df, "greater") > p_tail) {
631
2014
          low = high;
632          high *= 2.0;
633
2014
          if (high > 1000000.0) break; /* Fallback limit */
634
3
        }
635        // Bisect to find the root
636        for (unsigned short int i = 0; i < 100; i++) {
637          double mid = (low + high) / 2.0;
638          double p_mid = get_t_pvalue(mid, df, "greater");
639
2023
          if (p_mid > p_tail) {
640
9
                   low = mid;
641          } else {
642                   high = mid;
643
2014
          }
644
0
          if (high - low < 1e-8) break;
645        }
646
2014
        return (low + high) / 2.0;
647}
648
649int compare_doubles(const void *restrict a, const void *restrict b) {
650
2
        double da = *(const double*restrict)a;
651        double db = *(const double*restrict)b;
652        return (da > db) - (da < db);
653
23
}
654
18
/* Helper to calculate the number of bins using Sturges' formula: log2(n) + 1 */
655
18
static size_t calculate_sturges_bins(size_t n) {
656
16
        if (n == 0) return 1;
657        return (size_t)(log((double)n) / log(2.0) + 1.0);
658
2
}
659
660// Logic for distributing data into bins (Optimized to O(N))
661
5
static void compute_hist_logic(double *restrict x, size_t n, double *restrict breaks, size_t n_bins,
662 size_t *restrict counts, double *restrict mids, double *restrict density) {
663        double total_n = (double)n;
664
56
        double min_val = breaks[0];
665
56
        double step = (n_bins > 0) ? (breaks[1] - breaks[0]) : 0.0;
666        // Initialize counts and compute midpoints
667        for (size_t i = 0; i < n_bins; i++) {
668          counts[i] = 0;
669          mids[i] = (breaks[i] + breaks[i+1]) / 2.0;
670        }
671        // Single O(N) pass to assign elements to bins
672        if (step > 0.0) {
673
28
          for (size_t j = 0; j < n; j++) {
674
28
                   double val = x[j];
675
28
                   // Ignore out-of-bounds or invalid values
676
28
                   if (isnan(val) || isinf(val) || val < min_val) continue;
677                   // Calculate initial bin index mathematically
678                   size_t idx = (size_t)((val - min_val) / step);
679                   // Clamp to valid array bounds first to prevent overflow */
680
28
                   if (idx >= n_bins) {
681
28
                       idx = n_bins - 1;
682
22
                   }
683
22
                   /* Adjust for exact boundaries (R's right-inclusive default: (a, b]) */
684
22
                   /* If value is exactly on or slightly below the lower boundary of the assigned bin,
685                      it belongs in the previous bin. (First bin [a, b] is inclusive on both ends) */
686
6
                   while (idx > 0 && val <= breaks[idx]) {
687
6
                       idx--;
688
6
                   }
689
6
                   // Conversely, if floating-point truncation placed it too low, push it up
690
6
                   while (idx < n_bins - 1 && val > breaks[idx + 1]) {
691
6
                       idx++;
692                   }
693
28
                   counts[idx]++;
694          }
695        } else if (n_bins > 0) {
696          // Edge case: All data points have the exact same value (step == 0)
697          counts[0] = n;
698        }
699        // Compute densities
700        for (size_t i = 0; i < n_bins; i++) {
701          double bin_width = breaks[i+1] - breaks[i];
702          if (bin_width > 0) {
703                   density[i] = (double)counts[i] / (total_n * bin_width);
704          } else {
705                   density[i] = (n_bins == 1) ? 1.0 : 0.0;
706
1
          }
707
1
        }
708
1
}
709
710// Standard Normal CDF approximation
711
1
double approx_pnorm(double x) {
712        return 0.5 * erfc(-x * 0.70710678118654752440); // 0.707... = 1/sqrt(2)
713}
714#ifndef M_SQRT1_2
715#define M_SQRT1_2 0.70710678118654752440
716#endif
717
718/* Macro for exact Wilcoxon 3D array indexing */
719#define DP_INDEX(i, j, k, n2, max_u) ((i) * ((n2) + 1) * ((max_u) + 1) + (j) * ((max_u) + 1) + (k))
720static double inverse_normal_cdf(double p) {
721        double a[4] = {2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637};
722        double b[4] = {-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833};
723        double c[9] = {0.3374754822726147, 0.9761690190917186, 0.1607979714918209,
724
6
                          0.0276438810333863, 0.0038405729373609, 0.0003951896511919,
725                          0.0000321767881768, 0.0000002888167364, 0.0000003960315187};
726
1
        double x, r, y;
727
206
        y = p - 0.5;
728
205
        if (fabs(y) < 0.42) {
729          r = y * y;
730
119
          x = y * (((a[3]*r + a[2])*r + a[1])*r + a[0]) /
731
44
                       ((((b[3]*r + b[2])*r + b[1])*r + b[0])*r + 1.0);
732        } else {
733
75
          r = p;
734          if (y > 0) r = 1.0 - p;
735
714
          r = log(-log(r));
736
119
          x = c[0] + r * (c[1] + r * (c[2] + r * (c[3] + r * (c[4] +
737
119
                   r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))));
738          if (y < 0) x = -x;
739
86
        }
740
86
        return x;
741}
742/* -----------------------------------------------------------------------
743 * Exact Spearman p-value via exhaustive permutation enumeration.
744 *
745
1
* Under H0, all n! orderings of ranks are equally probable.  We visit
746 * every permutation of {1..n} with Heap's algorithm (O(n!), no allocs
747 * inside the loop) and count how many yield S ≤ s_obs ("lower tail",
748
1
* i.e. rho ≥ rho_obs) and how many yield S ≥ s_obs ("upper tail").
749
1
*
750 * Mirrors R's default: exact = (n < 10) with no ties.
751
1
* Valid up to n = 9 (362 880 iterations — negligible cost).
752
1
* ----------------------------------------------------------------------- */
753static double spearman_exact_pvalue(double s_obs, size_t n, const char *restrict alt) {
754
1
        int *restrict perm = (int*)safemalloc(n * sizeof(int));
755
1
        int *restrict c    = (int*)safemalloc(n * sizeof(int));
756        for (size_t i = 0; i < n; i++) { perm[i] = i + 1; c[i] = 0; }
757
758        long count_le = 0, count_ge = 0, total = 0;
759
760        #define TALLY_PERM() do {                                    \
761
2
          double s_ = 0.0;                                     \
762
2
          for (int ii = 0; ii < n; ii++) {                    \
763
2
                   double d_ = (double)(ii + 1) - (double)perm[ii];\
764
24
                   s_ += d_ * d_;                                   \
765
2
          }                                                    \
766          if (s_ <= s_obs + 1e-9) count_le++;                 \
767
10
          if (s_ >= s_obs - 1e-9) count_ge++;                 \
768
8
          total++;                                             \
769
96
        } while (0)
770
771
56
        TALLY_PERM();   /* initial permutation [1, 2, ..., n] */
772
773
206
        unsigned int k = 1;
774
158
        while (k < n) {
775          if (c[k] < k) {
776                   int tmp;
777
48
                   if (k % 2 == 0) {
778                       tmp = perm[0]; perm[0] = perm[k]; perm[k] = tmp;
779
8
                   } else {
780
8
                       tmp = perm[c[k]]; perm[c[k]] = perm[k]; perm[k] = tmp;
781                   }
782                   TALLY_PERM();
783
2
                   c[k]++;
784
2
                   k = 1;
785
2
          } else {
786
2
                   c[k] = 0;
787
20
                   k++;
788
2
          }
789
8
        }
790
2
        #undef TALLY_PERM
791
792
2
        Safefree(perm); Safefree(c);
793        /* p_le = P(S ≤ s_obs) ≡ P(rho ≥ rho_obs)  â€” upper rho tail
794
1
        * p_ge = P(S ≥ s_obs) ≡ P(rho ≤ rho_obs)  â€” lower rho tail  */
795
1
        double p_le = (double)count_le / (double)total;
796        double p_ge = (double)count_ge / (double)total;
797
798
300
        if (strcmp(alt, "greater") == 0) return p_le;
799
300
        if (strcmp(alt, "less")    == 0) return p_ge;
800
300
        /* two.sided: 2 × the smaller tail, clamped to 1 */
801
300
        double p = 2.0 * (p_le < p_ge ? p_le : p_ge);
802        return (p > 1.0) ? 1.0 : p;
803}
804/* -----------------------------------------------------------------------
805 * Exact Kendall p-value via Mahonian Numbers (Inversions distribution)
806
6
* Matches R's behavior for N < 50 without ties.
807
6
* ----------------------------------------------------------------------- */
808
24
static double kendall_exact_pvalue(size_t n, double s_obs, const char *restrict alt) {
809
18
        long max_inv = (long)n * (n - 1) / 2;
810
18
        double *restrict dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
811
0
        for (long i = 0; i <= max_inv; i++) dp[i] = 0.0;
812
0
        dp[0] = 1.0;
813        /* Build the distribution of inversions via DP */
814        for (size_t i = 2; i <= n; i++) {
815
18
          double *restrict next_dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
816
163
          for (long k = 0; k <= max_inv; k++) next_dp[k] = 0.0;
817
145
          int current_max_inv = i * (i - 1) / 2;
818          for (int k = 0; k <= current_max_inv; k++) {
819
18
                   double sum = 0;
820
1
                   for (int j = 0; j <= i - 1 && k - j >= 0; j++) {
821
1
                       sum += dp[k - j];
822                   }
823                   // Divide by 'i' directly to keep array as pure probabilities and prevent overflow
824
17
                   next_dp[k] = sum / (double)i;
825
159
          }
826
142
          Safefree(dp);
827
142
          dp = next_dp;
828        }
829
17
        // Convert S statistic to target number of inversions
830
17
        long i_obs = (long)round((max_inv - s_obs) / 2.0);
831
17
        if (i_obs < 0) i_obs = 0;
832
17
        if (i_obs > max_inv) i_obs = max_inv;
833        double p_le = 0.0; /* P(S <= S_obs) */
834
36
        for (long k = i_obs; k <= max_inv; k++) p_le += dp[k];
835
19
        double p_ge = 0.0; /* P(S >= S_obs) */
836
190
        for (long k = 0; k <= i_obs; k++) p_ge += dp[k];
837
19
        Safefree(dp);
838
19
        if (strcmp(alt, "greater") == 0) return p_ge;
839
190
        if (strcmp(alt, "less") == 0) return p_le;
840        // two.sided
841        double p = 2.0 * (p_ge < p_le ? p_ge : p_le);
842        return p > 1.0 ? 1.0 : p;
843
17
}
844
142
// F-distribution Cumulative Distribution Function P(F <= f)
845
17
static double pf(double f, double df1, double df2) {
846
17
        if (f <= 0.0) return 0.0;
847
142
        double x = (df1 * f) / (df1 * f + df2);
848        return incbeta(df1 / 2.0, df2 / 2.0, x);
849
17
}
850
851/* Householder QR Decomposition for Sequential Sums of Squares */
852
6
/* Householder QR Decomposition for Sequential Sums of Squares */
853static void apply_householder_aov(double** restrict X, double* restrict y, size_t n, size_t p, bool* restrict aliased, size_t* restrict rank_map) {
854        size_t r = 0; // Rank/Row tracker
855        for (size_t k = 0; k < p; k++) {
856                aliased[k] = FALSE;
857
22
                if (r >= n) {
858
22
                        aliased[k] = TRUE;
859                        continue;
860                }
861
862
5
                double max_val = 0;
863
5
                for (size_t i = r; i < n; i++) {
864                        if (fabs(X[i][k]) > max_val) max_val = fabs(X[i][k]);
865
5
                }
866
10
                if (max_val < 1e-10) {
867
5
                        aliased[k] = TRUE;
868                        continue;
869
5
                } // Collinear or zero column
870
871                double norm = 0;
872                for (size_t i = r; i < n; i++) {
873
198
                        X[i][k] /= max_val;
874
198
                        norm += X[i][k] * X[i][k];
875
198
                }
876
198
                norm = sqrt(norm);
877
11
                double s = (X[r][k] > 0) ? -norm : norm;
878                double u1 = X[r][k] - s;
879                X[r][k] = s * max_val;
880
881
11
                for (size_t j = k + 1; j < p; j++) {
882
118
                        double dot = u1 * X[r][j];
883
107
                        for (size_t i = r + 1; i < n; i++) dot += X[i][j] * X[i][k];
884
7
                        double tau = dot / (s * u1);
885
7
                        X[r][j] += tau * u1;
886                        for (size_t i = r + 1; i < n; i++) X[i][j] += tau * X[i][k];
887
100
                }
888
889                // Transform the response vector y
890
11
                double dot_y = u1 * y[r];
891                for (size_t i = r + 1; i < n; i++) dot_y += y[i] * X[i][k];
892
187
                double tau_y = dot_y / (s * u1);
893                y[r] += tau_y * u1;
894
198
                for (size_t i = r + 1; i < n; i++) y[i] += tau_y * X[i][k];
895
896                rank_map[k] = r; // Map original column index to orthogonal row index
897
53
                r++;
898
53
        }
899
251
}
900
901
198
// --- write_table Helpers ---
902
903// Sorts string arrays alphabetically
904
0
static int cmp_string_wt(const void *a, const void *b) {
905        return strcmp(*(const char**)a, *(const char**)b);
906}
907
908
53
// Emulates Perl's /\D/ check
909static bool contains_nondigit(SV *restrict sv) {
910        if (!sv || !SvOK(sv)) return 0;
911
6
        STRLEN len;
912
6
        char *restrict s = SvPVbyte(sv, len);
913
6
        for (size_t i = 0; i < len; i++) {
914          if (!isdigit(s[i])) return 1;
915        }
916
6
        return 0;
917
3
}
918
919
3
// Writes a properly quoted string dynamically
920
42
static void print_str_quoted(PerlIO *fh, const char *str, const char *sep) {
921
39
        if (!str) str = "";
922
39
        bool needs_quotes = 0;
923
39
        if (strstr(str, sep) != NULL || strchr(str, '"') != NULL || strchr(str, '\r') != NULL || strchr(str, '\n') != NULL) {
924          needs_quotes = 1;
925
3
        }
926
927        if (needs_quotes) {
928          PerlIO_putc(fh, '"');
929
3
          for (const char *restrict p = str; *p; p++) {
930
3
                   if (*p == '"') {
931
3
                       PerlIO_putc(fh, '"');
932
3
                       PerlIO_putc(fh, '"');
933
47
                   } else {
934
47
                       PerlIO_putc(fh, *p);
935
47
                   }
936
47
          }
937
47
          PerlIO_putc(fh, '"');
938
47
        } else {
939
47
          PerlIO_puts(fh, str);
940
47
        }
941
47
}
942
943
47
// Writes an array of strings joined by sep
944
44
static void print_string_row(PerlIO *fh, const char **row, size_t len, const char *sep) {
945        size_t sep_len = strlen(sep);
946
3
        for (size_t i = 0; i < len; i++) {
947          if (i > 0) PerlIO_write(fh, sep, sep_len);
948          if (row[i]) {
949                   print_str_quoted(fh, row[i], sep);
950
6
          } else {
951
6
                   print_str_quoted(fh, "", sep);
952
6
          }
953
6
        }
954        PerlIO_putc(fh, '\n');
955}
956// Calculates the Regularized Upper Incomplete Gamma Function Q(a, x)
957// This perfectly replicates R's pchisq(..., lower.tail=FALSE)
958double igamc(double a, double x) {
959        if (x < 0.0 || a <= 0.0) return 1.0;
960        if (x == 0.0) return 1.0;
961
962
2
        // Series expansion for x < a + 1
963
2
        if (x < a + 1.0) {
964
2
                double sum = 1.0 / a;
965
2
                double term = 1.0 / a;
966
8
                double n = 1.0;
967
6
                while (fabs(term) > 1e-15) {
968                        term *= x / (a + n);
969
2
                        sum += term;
970                        n += 1.0;
971                }
972                return 1.0 - (sum * exp(-x + a * log(x) - lgamma(a)));
973        }
974
975
4
        // Continued fraction for x >= a + 1
976
4
        double b = x + 1.0 - a;
977
4
        double c = 1.0 / 1e-30;
978
2
        double d = 1.0 / b;
979        double h = d, i = 1.0;
980
2
        while (i < 10000) { // Safety bound
981
2
                double an = -i * (i - a);
982                b += 2.0;
983
8
                d = an * d + b;
984
54
                if (fabs(d) < 1e-30) d = 1e-30;
985
36
                c = b + an / c;
986                if (fabs(c) < 1e-30) c = 1e-30;
987                d = 1.0 / d;
988
2
                double del = d * c;
989
4
                h *= del;
990                if (fabs(del - 1.0) < 1e-15) break;
991
2
                i += 1.0;
992
2
        }
993        return h * exp(-x + a * log(x) - lgamma(a));
994
2
}
995
996// Chi-Squared p-value is simply the Incomplete Gamma of (df/2, stat/2)
997double get_p_value(double stat, int df) {
998        if (df <= 0) return 1.0;
999        if (stat <= 0.0) return 1.0;
1000
6
        return igamc((double)df / 2.0, stat / 2.0);
1001
6
}
1002
1003
6
/* --- C HELPER SECTION --- */
1004
6
#ifndef M_SQRT1_2
1005#define M_SQRT1_2 0.70710678118654752440
1006
5
#endif
1007
1008/* Robust Binomial Coefficient using long double */
1009
46
static long double choose_comb(int n, int k) {
1010
1582
        if (k < 0 || k > n) return 0.0L;
1011        if (k > n / 2) k = n - k;
1012        long double res = 1.0L;
1013
5
        for (int i = 1; i <= k; i++) {
1014
182
          res = res * (long double)(n - i + 1) / (long double)i;
1015        }
1016
5
        return res;
1017
5
}
1018
1019
5
/* Exact CDF for Mann-Whitney U: P(U <= q)
1020
5
   Mathematically identical to R's cwilcox generating function */
1021static double exact_pwilcox(double q, int m, int n) {
1022        int k = (int)floor(q + 1e-7); // R uses 1e-7 fuzz
1023
298
        int max_u = m * n;
1024
298
        if (k < 0) return 0.0;
1025
298
        if (k >= max_u) return 1.0;
1026
1027        long double *restrict w = (long double *)safecalloc(max_u + 1, sizeof(long double));
1028        w[0] = 1.0L;
1029
1030
11
        for (int j = 1; j <= n; j++) {
1031
11
          for (int i = j; i <= max_u; i++) w[i] += w[i - j];
1032
11
          for (int i = max_u; i >= j + m; i--) w[i] -= w[i - j - m];
1033
11
        }
1034
1035
124
        long double cum_p = 0.0L;
1036
113
        for (int i = 0; i <= k; i++) cum_p += w[i];
1037
1038
113
        long double total = choose_comb(m + n, n);
1039
234
        double result = (double)(cum_p / total);
1040
1041
113
        Safefree(w);
1042
113
        return result;
1043}
1044
1045/* Exact CDF for Wilcoxon Signed Rank: P(V <= q)
1046   Mathematically identical to R's csignrank subset-sum DP */
1047static double exact_psignrank(double q, int n) {
1048        int k = (int)floor(q + 1e-7);
1049        int max_v = n * (n + 1) / 2;
1050        if (k < 0) return 0.0;
1051        if (k >= max_v) return 1.0;
1052
1053        long double *restrict w = (long double *)safecalloc(max_v + 1, sizeof(long double));
1054        w[0] = 1.0L;
1055
1056        for (int i = 1; i <= n; i++) {
1057          for (int j = max_v; j >= i; j--) w[j] += w[j - i];
1058
39
        }
1059
1060
39
        long double cum_p = 0.0L;
1061
39
        for (int i = 0; i <= k; i++) cum_p += w[i];
1062
1063
39
        long double total = powl(2.0L, (long double)n);
1064        double result = (double)(cum_p / total);
1065
1066        Safefree(w);
1067
0
        return result;
1068}
1069
1070
0
static int cmp_rank_info(const void *a, const void *b) {
1071
0
        double da = ((const RankInfo*)a)->val;
1072
0
        double db = ((const RankInfo*)b)->val;
1073
0
        return (da > db) - (da < db);
1074
0
}
1075
1076
0
static double rank_and_count_ties(RankInfo *restrict ri, size_t n, bool *restrict has_ties) {
1077
0
        if (n == 0) return 0.0;
1078
0
        qsort(ri, n, sizeof(RankInfo), cmp_rank_info);
1079
0
        size_t i = 0;
1080        double tie_adj = 0.0;
1081
0
        *has_ties = 0;
1082
0
        while (i < n) {
1083                size_t j = i + 1;
1084                while (j < n && ri[j].val == ri[i].val) j++;
1085
0
                double r = (double)(i + 1 + j) / 2.0;
1086
0
                for (size_t k = i; k < j; k++) ri[k].rank = r;
1087
0
                size_t t = j - i;
1088
0
                if (t > 1) { *has_ties = 1; tie_adj += ((double)t * t * t - t); }
1089                i = j;
1090
0
        }
1091        return tie_adj;
1092
0
}
1093
0
/* --- KS-TEST C HELPER SECTION --- */
1094
0
#ifndef M_PI_2
1095
0
#define M_PI_2 1.57079632679489661923
1096
0
#endif
1097#ifndef M_PI_4
1098
0
#define M_PI_4 0.78539816339744830962
1099#endif
1100
0
#ifndef M_1_SQRT_2PI
1101#define M_1_SQRT_2PI 0.39894228040143267794
1102#endif
1103
1104
7
// Scalar integer power used by K2x
1105
140
static double r_pow_di(double x, int n) {
1106
2660
        if (n == 0) return 1.0;
1107
2527
        if (n < 0) return 1.0 / r_pow_di(x, -n);
1108
50540
        double val = 1.0;
1109
2527
        for (int i = 0; i < n; i++) val *= x;
1110        return val;
1111}
1112
1113// Two-sample two-sided asymptotic distribution
1114
6
static double K2l(double x, int lower, double tol) {
1115
6
        double s, z, p;
1116
362
        int k;
1117
1
        if(x <= 0.) {
1118
1
          if(lower) p = 0.;
1119          else p = 1.;
1120
5
        } else if(x < 1.) {
1121
5
          int k_max = (int) sqrt(2.0 - log(tol));
1122
5
          double w = log(x);
1123
5
          z = - (M_PI_2 * M_PI_4) / (x * x);
1124
5
          s = 0;
1125
1086
          for(k = 1; k < k_max; k += 2) {
1126
3
                   s += exp(k * k * z - w);
1127          }
1128
2
          p = s / M_1_SQRT_2PI;
1129
2
          if(!lower) p = 1.0 - p;
1130        } else {
1131
5
          double new_val, old_val;
1132
0
          z = -2.0 * x * x;
1133
0
          s = -1.0;
1134          if(lower) {
1135
5
                   k = 1; old_val = 0.0; new_val = 1.0;
1136          } else {
1137                   k = 2; old_val = 0.0; new_val = 2.0 * exp(z);
1138          }
1139
1
          while(fabs(old_val - new_val) > tol) {
1140
1
                   old_val = new_val;
1141
1
                   new_val += 2.0 * s * exp(z * k * k);
1142
1
                   s *= -1.0;
1143
1
                   k++;
1144
1
          }
1145          p = new_val;
1146
20
        }
1147
380
        return p;
1148
361
}
1149
1150// Auxiliary routines used by K2x() for matrix operations
1151static void m_multiply(double *A, double *B, double *C, unsigned int m) {
1152
20
        for(unsigned int i = 0; i < m; i++) {
1153
19
          for(unsigned int j = 0; j < m; j++) {
1154
19
                   double s = 0.;
1155                   for(unsigned int k = 0; k < m; k++) s += A[i * m + k] * B[k * m + j];
1156
1
                   C[i * m + j] = s;
1157          }
1158
20
        }
1159
380
}
1160
1161
1520
static void m_power(double *A, int eA, double *V, int *eV, int m, int n) {
1162        if(n == 1) {
1163          for(int i = 0; i < m * m; i++) V[i] = A[i];
1164          *eV = eA;
1165          return;
1166
1
        }
1167
1
        m_power(A, eA, V, eV, m, n / 2);
1168
1
        double *restrict B = (double*) safecalloc(m * m, sizeof(double));
1169        m_multiply(V, V, B, m);
1170
51
        int eB = 2 * (*eV);
1171
50
        if((n % 2) == 0) {
1172
50
          for(int i = 0; i < m * m; i++) V[i] = B[i];
1173
0
          *eV = eB;
1174
0
        } else {
1175          m_multiply(A, B, V, m);
1176          *eV = eA + eB;
1177
1
        }
1178
1
        if(V[(m / 2) * m + (m / 2)] > 1e140) {
1179
1
          for(int i = 0; i < m * m; i++) V[i] = V[i] * 1e-140;
1180
1
          *eV += 140;
1181        }
1182        Safefree(B);
1183}
1184
1185// One-sample two-sided exact distribution
1186
3
static double K2x(int n, double d) {
1187
3
        int k = (int) (n * d) + 1;
1188
3
        int m = 2 * k - 1;
1189
3
        double h = k - n * d;
1190        double *restrict H = (double*) safecalloc(m * m, sizeof(double));
1191
243
        double *restrict Q = (double*) safecalloc(m * m, sizeof(double));
1192
1193
240
        for(int i = 0; i < m; i++) {
1194
39
          for(int j = 0; j < m; j++) {
1195
0
                   if(i - j + 1 < 0) H[i * m + j] = 0;
1196                   else H[i * m + j] = 1;
1197
390
          }
1198
330
        }
1199        for(int i = 0; i < m; i++) {
1200
240
          H[i * m] -= r_pow_di(h, i + 1);
1201
240
          H[(m - 1) * m + i] -= r_pow_di(h, (m - i));
1202
240
        }
1203        H[(m - 1) * m] += ((2 * h - 1 > 0) ? r_pow_di(2 * h - 1, m) : 0);
1204
1205
240
        for(int i = 0; i < m; i++) {
1206
240
          for(int j = 0; j < m; j++) {
1207                   if(i - j + 1 > 0) {
1208
3
                       for(int g = 1; g <= i - j + 1; g++) H[i * m + j] /= g;
1209
3
                   }
1210
3
          }
1211
3
        }
1212
1213        int eH = 0, eQ;
1214
4740
        m_power(H, eH, Q, &eQ, m, n);
1215
4740
        double s = Q[(k - 1) * m + k - 1];
1216
1217        for(int i = 1; i <= n; i++) {
1218          s = s * (double)i / (double)n;
1219          if(s < 1e-140) {
1220
3
                   s *= 1e140;
1221
3
                   eQ -= 140;
1222
3
          }
1223
3
        }
1224        s *= pow(10.0, eQ);
1225
93
        Safefree(H);
1226
90
        Safefree(Q);
1227
72
        return s;
1228}
1229
1230
150
// Calculate D (two-sided), D+ (greater), and D- (less) simultaneously
1231
4650
static void calc_2sample_stats(double *x, size_t nx, double *y, size_t ny,
1232
4500
                               double *d, double *d_plus, double *d_minus) {
1233        qsort(x, nx, sizeof(double), compare_doubles);
1234
3334
        qsort(y, ny, sizeof(double), compare_doubles);
1235
3334
        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1236
3334
        size_t i = 0, j = 0;
1237
1238        while(i < nx || j < ny) {
1239          double val;
1240
3
          if (i < nx && j < ny) val = (x[i] < y[j]) ? x[i] : y[j];
1241
3
          else if (i < nx) val = x[i];
1242
3
          else val = y[j];
1243
1244          while(i < nx && x[i] <= val) i++;
1245
229
          while(j < ny && y[j] <= val) j++;
1246
1247
229
          double cdf1 = (double)i / nx;
1248          double cdf2 = (double)j / ny;
1249          double diff = cdf1 - cdf2;
1250
1251
229
          if (diff > max_d_plus) max_d_plus = diff;
1252          if (-diff > max_d_minus) max_d_minus = -diff;
1253
229
          if (fabs(diff) > max_d) max_d = fabs(diff);
1254        }
1255
229
        *d = max_d;
1256        *d_plus = max_d_plus;
1257
0
        *d_minus = max_d_minus;
1258}
1259
1260// Branch the DP boundary check based on the 'alternative'
1261
229
static int psmirnov_exact_test(double q, double r, double s, int two_sided) {
1262        if (two_sided) return (fabs(r - s) >= q);
1263        return ((r - s) >= q); // Used for both D+ and D- via symmetry
1264}
1265
1266// Evaluate the exact 2-sample probability
1267
6
static double psmirnov_exact_uniq_upper(double q, int m, int n, int two_sided) {
1268
6
        double md = (double) m, nd = (double) n;
1269
6
        double *restrict u = (double *) safecalloc(n + 1, sizeof(double));
1270
6
        u[0] = 0.;
1271
1272
20
        for(unsigned int j = 1; j <= n; j++) {
1273
14
          if(psmirnov_exact_test(q, 0., j / nd, two_sided)) u[j] = 1.;
1274
14
          else u[j] = u[j - 1];
1275
14
        }
1276        for(unsigned int i = 1; i <= m; i++) {
1277                if(psmirnov_exact_test(q, i / md, 0., two_sided)) u[0] = 1.;
1278                for(int j = 1; j <= n; j++) {
1279
251
                        if(psmirnov_exact_test(q, i / md, j / nd, two_sided)) u[j] = 1.;
1280
251
                        else {
1281
251
                                 double v = (double)(i) / (double)(i + j);
1282                                 double w = (double)(j) / (double)(i + j);
1283
251
                                 u[j] = v * u[j] + w * u[j - 1];
1284
122
                        }
1285                }
1286
129
        }
1287        double res = u[n];
1288
251
        Safefree(u);
1289        return res;
1290
6
}
1291
1292static double p_body(double n, double delta, double sd, double sig_level, int tsample, int tside, bool strict) {
1293        double nu = (n - 1.0) * (double)tsample;
1294        if (nu < 1e-7) nu = 1e-7;
1295
1296        // Ensure sig_level/tside is not truncated
1297        double p_tail = sig_level / (double)tside;
1298
4
        double qu = qt_tail(nu, p_tail); // qt(p, df, lower.tail=FALSE)
1299
1300
4
        double ncp = sqrt(n / (double)tsample) * (delta / sd);
1301
1302        if (strict && tside == 2) {
1303          // Use R-style tail calls: 1 - P(T < qu) + P(T < -qu)
1304
4
          return (1.0 - exact_pnt(qu, nu, ncp)) + exact_pnt(-qu, nu, ncp);
1305
4
        } else {
1306
4
          // Default: 1 - P(T < qu)
1307          // Ensure exact_pnt is using a convergence tolerance of at least 1e-15
1308          return 1.0 - exact_pnt(qu, nu, ncp);
1309
4
        }
1310
4
}
1311
1312
3
// Bisection algorithm to find the inverse F-distribution (Quantile function)
1313
1
// Equivalent to R's qf(p, df1, df2)
1314
1
static double qf_bisection(double p, double df1, double df2) {
1315
1
        if (p <= 0.0) return 0.0;
1316        if (p >= 1.0) return INFINITY;
1317        double low = 0.0, high = 1.0;
1318        // Find upper bound
1319        while (pf(high, df1, df2) < p) {
1320
6
          low = high;
1321
2
          high *= 2.0;
1322
2
          if (high > 1e100) break; /* Fallback limit */
1323
2
        }
1324
1325
2
        // Bisect to find the root
1326
0
        for (unsigned short int i = 0; i < 150; i++) {
1327
0
                double mid = low + (high - low) / 2.0;
1328                double p_mid = pf(mid, df1, df2);
1329
1330
0
                if (p_mid < p) {
1331                        low = mid;
1332                } else {
1333
4
                        high = mid;
1334
0
                }
1335                if (high - low < 1e-12) break;
1336        }
1337
4
        return (low + high) / 2.0;
1338
4
}
1339
4
/* oneway_test  â€“  Welch / classic one-way ANOVA
1340 *
1341
4
* ── Mode 1: hash of groups (original behaviour) ───────────────────────────
1342
0
*
1343 *   my $res = oneway_test(\%groups);
1344 *   my $res = oneway_test(\%groups, var_equal => 1);
1345
4
*
1346
4
*   \%groups  â€“ keys are group labels, values are array refs of numbers.
1347
4
*               Every group must have >= 2 observations.
1348 *
1349 * ── Mode 2: formula – response ~ factor ───────────────────────────────────
1350
4
*
1351
4
*   my $res = oneway_test(\%data, formula => "yield ~ ctrl");
1352
204
*   my $res = oneway_test(\%data, formula => "yield ~ ctrl", var_equal => 1);
1353
200
*
1354
200
*   \%data must contain two keys matching the formula:
1355
200
*     "yield" => [ numeric response values ... ]
1356 *     "ctrl"  => [ group labels (strings or numbers, same length) ... ]
1357 *
1358 *   This mirrors R's:
1359
4
*     my_data <- stack(list(yield = yield, ctrl = ctrl))
1360
4
*     oneway.test(Value ~ Group, data = my_data)
1361 *
1362 *   Absence of a formula argument falls back to Mode 1 automatically.
1363
7
*
1364
3
* ── Return value (both modes)
1365
3
*
1366 *   Hash ref with keys:
1367
3
*     statistic => F value
1368
3
*     num_df    => numerator degrees of freedom   (k − 1)
1369
93
*     denom_df  => denominator degrees of freedom
1370
90
*     p_value   => upper-tail p-value  P(F ≥ statistic)
1371
90
*     method    => description string
1372
90
*     k         => number of groups
1373 *     n         => total observations
1374 *     formula   => "response ~ factor"  (only present in Mode 2)
1375 *
1376
3
* =========================================================================
1377
0
* Integration: drop the C block above "--- XS SECTION ---", and the XS
1378
0
* block inside your MODULE … PACKAGE … PREFIX = section.
1379 * =========================================================================
1380 */
1381
1382
3
/* -----------------------------------------------------------------------
1383 * C HELPERS  (place above "--- XS SECTION ---")
1384 * ----------------------------------------------------------------------- */
1385
1386
2
/* ── OneWayResult struct ─────────────────────────────────────────────── */
1387
1
typedef struct {
1388        double  statistic;
1389        double  num_df;
1390
3
        double  denom_df;
1391
3
        double  p_value;
1392
3
        double  ss_between;  /* between-group sum of squares  */
1393
3
        double  ss_within;   /* within-group  sum of squares  */
1394        double  ms_between;  /* ss_between / num_df           */
1395        double  ms_within;   /* ss_within  / denom_df         */
1396
3
        int     k;           /* number of groups              */
1397
3
        IV      n;           /* total observations            */
1398
153
        int     var_equal;   /* 0 = Welch, 1 = classic        */
1399
93
} OneWayResult;
1400
1401/* ── c_oneway_test ───────────────────────────────────────────────────────
1402
3
*
1403
240
*  data      â€“ flat C array of all observations, groups concatenated
1404
237
*  sizes     â€“ n_i for each group (length k)
1405 *  k         â€“ number of groups
1406
3
*  var_equal – 0 = Welch (default), 1 = classic equal-variance F-test
1407
3
*
1408
0
*  Mirrors R's oneway.test() arithmetic exactly.
1409
0
*  Calls pf(f, df1, df2) declared elsewhere in the .xs file.
1410 * ----------------------------------------------------------------------- */
1411
3
static OneWayResult
1412
3
c_oneway_test(const double *restrict data,
1413
3
              const size_t *restrict sizes,
1414
3
              size_t k,
1415              int var_equal)
1416
0
{
1417
0
        OneWayResult res;
1418
0
        res.var_equal = var_equal;
1419
0
        res.k         = (int)k;
1420
1421
0
        double *restrict n_i = (double *)safemalloc(k * sizeof(double));
1422        double *restrict m_i = (double *)safemalloc(k * sizeof(double));
1423        double *restrict v_i = (double *)safemalloc(k * sizeof(double));
1424
1425        size_t offset = 0;
1426        IV total_n = 0;
1427
2
        for (size_t g = 0; g < k; g++) {
1428
1
          size_t ng  = sizes[g];
1429
1
          n_i[g]     = (double)ng;
1430
1
          total_n   += (IV)ng;
1431
1432
51
          double sum = 0.0;
1433
50
          for (size_t i = 0; i < ng; i++) sum += data[offset + i];
1434
50
          double mean = sum / (double)ng;
1435
50
          m_i[g] = mean;
1436
1437
50
          double ss = 0.0;
1438
50
          for (size_t i = 0; i < ng; i++) {
1439                   double d = data[offset + i] - mean;
1440
50
                   ss += d * d;
1441
50
          }
1442
50
          v_i[g] = ss / (double)(ng - 1);   /* ng >= 2 guaranteed by caller */
1443
50
          offset += ng;
1444        }
1445
1446
50
        res.n = total_n;
1447
1448
1
        /* grand mean (simple average over all obs; used only by classic branch) */
1449
1
        double grand_mean = 0.0;
1450
1
        for (IV i = 0; i < (IV)total_n; i++) grand_mean += data[i];
1451
1
        grand_mean /= (double)total_n;
1452
1453
1
        double df1 = (double)(k - 1);
1454
1455
1
        if (var_equal) {
1456                /* ── Classic one-way ANOVA ─────────────────────────────────────── *
1457
0
                *  F = [Σ n_i·(m_i − ȳ)² / (k−1)]  /  [Σ (n_i−1)·v_i / (n−k)] *
1458
0
                * ─────────────────────────────────────────────────────────────── */
1459
0
                double ssbg = 0.0, sswg = 0.0;
1460                for (size_t g = 0; g < k; g++) {
1461                        double dm = m_i[g] - grand_mean;
1462
0
                        ssbg += n_i[g] * dm * dm;
1463
0
                        sswg += (n_i[g] - 1.0) * v_i[g];
1464
0
                }
1465
0
                double df2    = (double)(total_n - (IV)k);
1466                res.statistic = (ssbg / df1) / (sswg / df2);
1467                res.num_df    = df1;
1468
0
                res.denom_df  = df2;
1469
0
                res.ss_between = ssbg;
1470                res.ss_within  = sswg;
1471                res.ms_between = ssbg / df1;
1472
0
                res.ms_within  = sswg / df2;
1473
0
        } else {
1474                /* ── Welch one-way (heteroscedastic) ───────────────────────────── *
1475
4
                *  w_i  = n_i / v_i                                               *
1476
4
                *  W    = Σ w_i                                                   *
1477
4
                *  m̃    = Σ(w_i·m_i) / W          (weighted grand mean)           *
1478
4
                *  tmp  = Σ[(1 − w_i/W)² / (n_i−1)] / (k²−1)                    *
1479
4
                *  F    = Σ[w_i·(m_i − m̃)²] / [(k−1)·(1 + 2·(k−2)·tmp)]        *
1480
4
                *  df2  = 1 / (3·tmp)                                             *
1481
4
                *                                                                 *
1482
4
                *  SS values use the unweighted grand mean (same as classic)      *
1483
4
                *  so the output table is always populated.                        *
1484                * ─────────────────────────────────────────────────────────────── */
1485                double *restrict w_i = (double *)safemalloc(k * sizeof(double));
1486                double sum_w = 0.0;
1487                for (size_t g = 0; g < k; g++) { w_i[g] = n_i[g] / v_i[g]; sum_w += w_i[g]; }
1488                double wgrand = 0.0;
1489                for (size_t g = 0; g < k; g++) wgrand += w_i[g] * m_i[g];
1490                wgrand /= sum_w;
1491
10
                double tmp = 0.0;
1492
10
                for (size_t g = 0; g < k; g++) {
1493
10
                        double t = 1.0 - w_i[g] / sum_w;
1494
10
                        tmp += (t * t) / (n_i[g] - 1.0);
1495
10
                }
1496
10
                tmp /= ((double)k * (double)k - 1.0);   /* k² − 1 */
1497                double num = 0.0;
1498
10
                for (size_t g = 0; g < k; g++) {
1499
2
                        double dm = m_i[g] - wgrand;
1500
2
                        num += w_i[g] * dm * dm;
1501                }
1502                res.statistic = num / (df1 * (1.0 + 2.0 * (double)(k - 2) * tmp));
1503
10
                res.num_df    = df1;
1504
2
                res.denom_df  = (tmp > 0.0) ? (1.0 / (3.0 * tmp)) : 1e300;
1505
2
                /* unweighted SS for the output table */
1506                double ssbg = 0.0, sswg = 0.0;
1507                for (size_t g = 0; g < k; g++) {
1508
10
                        double dm = m_i[g] - grand_mean;
1509
0
                        ssbg += n_i[g] * dm * dm;
1510                        sswg += (n_i[g] - 1.0) * v_i[g];
1511                }
1512
30
                res.ss_between = ssbg;
1513
20
                res.ss_within  = sswg;
1514
20
                res.ms_between = (df1  > 0.0) ? ssbg / df1          : 0.0;
1515
20
                res.ms_within  = (res.denom_df > 0.0) ? sswg / res.denom_df : 0.0;
1516
13
                Safefree(w_i);
1517
6
        }
1518
3
        /* upper-tail p-value  P(F ≥ statistic) */
1519
3
        res.p_value = 1 - pf(res.statistic, res.num_df, res.denom_df);
1520
2
        Safefree(n_i);    Safefree(m_i);    Safefree(v_i);
1521
0
        return res;
1522
0
}
1523
1524
2
/* ── parse_formula
1525
0
*
1526 *  Splits "response ~ factor" into two NUL-terminated, heap-allocated
1527 *  strings.  Leading/trailing whitespace is stripped from each side.
1528
10
*  Returns 1 on success, 0 on failure (malformed / missing '~').
1529
1
*  Caller must Safefree() both *lhs and *rhs on success.
1530
9
* ----------------------------------------------------------------------- */
1531
9
static int
1532
9
parse_formula(const char *formula, char **lhs, char **rhs)
1533{
1534
9
        const char *restrict tilde = strchr(formula, '~');
1535
9
        if (!tilde) return 0;
1536
1537
8
        /* left-hand side: trim trailing whitespace */
1538
8
        const char *l_start = formula;
1539        const char *l_end   = tilde - 1;
1540
9
        while (l_end >= l_start && isspace((unsigned char)*l_end)) l_end--;
1541
9
        if (l_end < l_start) return 0;             /* empty LHS */
1542
1543        /* right-hand side: trim leading whitespace */
1544
14
        const char *restrict r_start = tilde + 1;
1545
5
        while (*r_start && isspace((unsigned char)*r_start)) r_start++;
1546
5
        const char *restrict r_end = r_start + strlen(r_start) - 1;
1547
33
        while (r_end >= r_start && isspace((unsigned char)*r_end)) r_end--;
1548
28
        if (r_end < r_start) return 0;             /* empty RHS */
1549
1550
28
        size_t llen = (size_t)(l_end - l_start + 1);
1551
28
        size_t rlen = (size_t)(r_end - r_start + 1);
1552
1553        *lhs = (char *)safemalloc(llen + 1);
1554        *rhs = (char *)safemalloc(rlen + 1);
1555
33
        memcpy(*lhs, l_start, llen); (*lhs)[llen] = '\0';
1556
28
        memcpy(*rhs, r_start, rlen); (*rhs)[rlen] = '\0';
1557
28
        return 1;
1558
28
}
1559
1560
28
/* ── build_groups_from_formula ───────────────────────────────────────────
1561 *
1562 *  Takes parallel response[] and label[] arrays (each length n) and
1563
5
*  partitions them into groups, filling:
1564
5
*    out_flat[]  â€“ observations sorted into contiguous group blocks
1565
5
*    out_sizes[] – number of observations per group  (caller allocates n
1566
5
*                  slots for both; actual group count returned via *out_k)
1567
5
*    out_names   â€“ if non-NULL, receives a heap-allocated char** of k
1568
5
*                  group-name strings (caller must free each and the array)
1569
61
*
1570
5
*  Group identity is the string representation of each label element
1571 *  (SvPV_nolen), so integer 0 and string "0" are the same group.
1572
5
*  Groups are ordered by first appearance in label[], matching R's
1573
5
*  factor level ordering from stack().
1574
5
*
1575 *  Returns 1 on success; 0 if any validation error (sets errbuf).
1576
5
*/
1577
0
#define OWT_MAX_GROUPS 1024   /* sane ceiling; ANOVA with >1024 groups is absurd */
1578
1579static int
1580
5
build_groups_from_formula(pTHX_
1581
2
        AV *restrict response_av,
1582
2
        AV *restrict label_av,
1583
2
        double *restrict out_flat,
1584        size_t *restrict out_sizes,
1585
2
        size_t *restrict out_k,
1586
1
        char ***restrict out_names,
1587        char *restrict errbuf,
1588
0
        size_t errbuf_len)
1589
0
{
1590        IV n = av_len(response_av) + 1;
1591        IV nl = av_len(label_av)   + 1;
1592
1593
3
        if (n != nl) {
1594
3
          snprintf(errbuf, errbuf_len,
1595
3
                   "formula: response length (%"IVdf") != factor length (%"IVdf")",
1596                   n, nl);
1597
3
          return 0;
1598
3
        }
1599
3
        if (n < 2) {
1600
0
          snprintf(errbuf, errbuf_len, "formula: need at least 2 observations");
1601
0
          return 0;
1602        }
1603
1604        /* ── discover unique group labels in order of first appearance ─── */
1605
3
        /* We store pointers into a heap-allocated label string table.       */
1606
3
        char  **restrict group_names  = (char **)safemalloc(OWT_MAX_GROUPS * sizeof(char *));
1607
3
        size_t  ngroups      = 0;
1608        IV     *restrict obs_group    = (IV *)safemalloc((size_t)n * sizeof(IV));
1609
5
                /* maps obs index → group index */
1610
1611
4
        for (IV i = 0; i < n; i++) {
1612
3
          SV **lsv = av_fetch(label_av, i, 0);
1613
3
          const char *label = (lsv && *lsv) ? SvPV_nolen(*lsv) : "";
1614
1615
26
          /* linear scan for existing group (k is small, O(n·k) is fine) */
1616
23
          IV gidx = -1;
1617
23
          for (size_t g = 0; g < ngroups; g++) {
1618
23
                   if (strEQ(group_names[g], label)) { gidx = (IV)g; break; }
1619          }
1620
23
          if (gidx < 0) {
1621
18
                   if (ngroups >= OWT_MAX_GROUPS) {
1622
18
                       snprintf(errbuf, errbuf_len,
1623
18
                           "formula: too many distinct groups (max %d)", OWT_MAX_GROUPS);
1624
18
                       Safefree(group_names);
1625
18
                       Safefree(obs_group);
1626
18
                       return 0;
1627                   }
1628
5
                   /* new group: copy the label string */
1629
5
                   size_t lablen = strlen(label);
1630
5
                   group_names[ngroups] = (char *)safemalloc(lablen + 1);
1631                   memcpy(group_names[ngroups], label, lablen + 1);
1632                   gidx = (IV)ngroups++;
1633
3
          }
1634
0
          obs_group[i] = gidx;
1635
0
        }
1636
1637
3
        if (ngroups < 2) {
1638
26
          snprintf(errbuf, errbuf_len,
1639
23
                   "formula: need at least 2 distinct groups, found %zu", ngroups);
1640
23
          for (size_t g = 0; g < ngroups; g++) Safefree(group_names[g]);
1641          Safefree(group_names);  Safefree(obs_group);
1642
3
          return 0;
1643
3
        }
1644
1645
26
        /* count per-group sizes */
1646
23
        memset(out_sizes, 0, ngroups * sizeof(size_t));
1647        for (unsigned i = 0; i < n; i++) out_sizes[obs_group[i]]++;
1648
1649
3
        /* validate: every group needs >= 2 observations */
1650
3
        for (size_t g = 0; g < ngroups; g++) {
1651
3
          if (out_sizes[g] < 2) {
1652
0
                   snprintf(errbuf, errbuf_len,
1653
0
                       "formula: group '%s' has only %zu observation(s); need >= 2",
1654                       group_names[g], out_sizes[g]);
1655
3
                   for (size_t gg = 0; gg < ngroups; gg++) Safefree(group_names[gg]);
1656
0
                   Safefree(group_names);  Safefree(obs_group);
1657
0
                   return 0;
1658          }
1659
3
        }
1660
1661
3
        /* ── fill flat output array in group order ─────────────────────── *
1662
3
        *  We compute a running write-offset per group, then scatter.      *
1663        */
1664
3
        size_t *restrict write_pos = (size_t *)safemalloc(ngroups * sizeof(size_t));
1665
3
        write_pos[0] = 0;
1666        for (size_t g = 1; g < ngroups; g++)
1667
3
          write_pos[g] = write_pos[g - 1] + out_sizes[g - 1];
1668
1669        for (IV i = 0; i < n; i++) {
1670          SV **restrict rsv = av_fetch(response_av, i, 0);
1671
0
          double val = (rsv && *rsv) ? SvNV(*rsv) : 0.0;
1672
0
          size_t g   = (size_t)obs_group[i];
1673
0
          out_flat[write_pos[g]++] = val;
1674
0
        }
1675
1676
0
        *out_k = ngroups;
1677
1678
0
        /* ── clean up or hand off group names */
1679
0
        Safefree(write_pos);    Safefree(obs_group);
1680        if (out_names) {
1681
0
          *out_names = group_names;   /* caller takes ownership */
1682        } else {
1683
0
          for (size_t g = 0; g < ngroups; g++) Safefree(group_names[g]);
1684
0
          Safefree(group_names);
1685
0
        }
1686        return 1;
1687
3
}
1688#undef OWT_MAX_GROUPS
1689
1690
8
// --- XS SECTION ---
1691
8
MODULE = Stats::LikeR  PACKAGE = Stats::LikeR
1692
1693
8
SV *oneway_test(data_ref, ...)
1694
8
        SV *data_ref
1695
8
        PREINIT:
1696    HV          *restrict in_hv = NULL;
1697    AV          *restrict in_av = NULL;
1698    HE          *restrict he;
1699    bool         var_equal = 0;
1700    const char  *restrict formula_str  = NULL;
1701    const char  *restrict factor_name  = "Group";
1702    char        *lhs = NULL, *rhs = NULL;
1703    double      *restrict flat   = NULL;
1704
3
    size_t      *restrict sizes  = NULL;
1705
3
    char       ** gnames = NULL;
1706
3
    double      *restrict gmeans = NULL;
1707
3
    size_t       k = 0;
1708
3
    IV           total_n = 0;
1709
2
    OneWayResult res;
1710
2
    HV          *restrict ret_hv;
1711
2
    char         errbuf[512];
1712    CODE:
1713
1
        /* parse named arguments */
1714
1
        for (I32 ai = 1; ai + 1 < items; ai += 2) {
1715                const char *restrict key = SvPV_nolen(ST(ai));
1716                SV *restrict val = ST(ai + 1);
1717
3
                if (strEQ(key, "var_equal"))
1718
3
                        var_equal = SvTRUE(val) ? 1 : 0;
1719
3
                else if (strEQ(key, "formula"))
1720                        formula_str = SvPV_nolen(val);
1721
3
        }
1722
3
        /* validate data_ref and determine if it's an Array or Hash */
1723
2
        if (!SvROK(data_ref))
1724
2
          croak("oneway_test: first argument must be a hash or array reference");
1725
1726
7
        SV *restrict rv = SvRV(data_ref);
1727
6
        if (SvTYPE(rv) == SVt_PVHV) {
1728
4
            in_hv = (HV *)rv;
1729
4
        } else if (SvTYPE(rv) == SVt_PVAV) {
1730
14
            in_av = (AV *)rv;
1731
10
        } else {
1732
10
            croak("oneway_test: first argument must be a hash or array reference");
1733
10
        }
1734
10
        if (in_av) {
1735
10
            /* MODE 3 – Array of Arrays (AoA) */
1736            if (formula_str != NULL)
1737                 croak("oneway_test: formula mode is not supported with an array of arrays");
1738
1739
4
            k = (size_t)av_len(in_av) + 1;
1740
4
            if (k < 2)
1741
4
                croak("oneway_test: need at least 2 groups, got %zu", k);
1742
14
            sizes  = (size_t *)safemalloc(k * sizeof(size_t));
1743
10
            gnames = (char  **)safemalloc(k * sizeof(char *));
1744
10
            /* first pass: sizes, total_n, and generate index names */
1745
10
            for (size_t g = 0; g < k; g++) {
1746
10
                SV **restrict val = av_fetch(in_av, (I32)g, 0);
1747
10
                if (!val || !*val || !SvROK(*val) || SvTYPE(SvRV(*val)) != SVt_PVAV)
1748                    croak("oneway_test: index %zu is not an array reference", g);
1749
4
                IV len = av_len((AV *)SvRV(*val)) + 1;
1750
4
                if (len < 2)
1751
4
                     croak("oneway_test: index %zu has fewer than 2 observations", g);
1752
4
                sizes[g] = (size_t)len;
1753                total_n += (IV)len;
1754
6
                /* synthesize group names: "Index 0", "Index 1", ... to match 0-based index */
1755                char buf[64];
1756                snprintf(buf, sizeof(buf), "Index %zu", g);
1757
4
                size_t klen = strlen(buf);
1758                gnames[g] = (char *)safemalloc(klen + 1);
1759
2
                memcpy(gnames[g], buf, klen + 1);
1760
2
            }
1761            /* second pass: fill flat array */
1762
4
            flat = (double *)safemalloc((size_t)total_n * sizeof(double));
1763
3
            size_t offset = 0;
1764
3
            for (size_t g = 0; g < k; g++) {
1765                SV **restrict val = av_fetch(in_av, (I32)g, 0);
1766
1
                AV *restrict av = (AV *)SvRV(*val);
1767
4
                IV len = av_len(av) + 1;
1768
3
                for (IV i = 0; i < len; i++) {
1769
3
                    SV **restrict svp = av_fetch(av, i, 0);
1770
3
                    flat[offset++] = (svp && *svp) ? SvNV(*svp) : 0.0;
1771
3
                }
1772            }
1773
1
        } else if (formula_str != NULL) {
1774          /* MODE 2 – formula  "response ~ factor" */
1775
3
          if (!parse_formula(formula_str, &lhs, &rhs))
1776
3
                   croak("oneway_test: cannot parse formula '%s' — "
1777
3
                         "expected 'response ~ factor'", formula_str);
1778
3
          factor_name = rhs;   /* use the actual factor variable name */
1779
3
          SV **restrict resp_svp = hv_fetch(in_hv, lhs, (I32)strlen(lhs), 0);
1780
3
          if (!resp_svp || !*resp_svp || !SvROK(*resp_svp)
1781
3
                   || SvTYPE(SvRV(*resp_svp)) != SVt_PVAV)
1782
2
                   croak("oneway_test: formula LHS '%s' not found as an array ref "
1783
1
                         "in the hash", lhs);
1784          SV **restrict fact_svp = hv_fetch(in_hv, rhs, (I32)strlen(rhs), 0);
1785
1
          if (!fact_svp || !*fact_svp || !SvROK(*fact_svp)
1786                   || SvTYPE(SvRV(*fact_svp)) != SVt_PVAV)
1787                   croak("oneway_test: formula RHS '%s' not found as an array ref "
1788
1
                         "in the hash", rhs);
1789          AV *restrict resp_av  = (AV *)SvRV(*resp_svp);
1790
3
          AV *restrict label_av = (AV *)SvRV(*fact_svp);
1791          IV  n = av_len(resp_av) + 1;
1792          flat  = (double *)safemalloc((size_t)n * sizeof(double));
1793          sizes = (size_t *)safemalloc((size_t)n * sizeof(size_t));
1794          if (!build_groups_from_formula(aTHX_ resp_av, label_av,
1795                                              flat, sizes, &k, &gnames,
1796                                              errbuf, sizeof errbuf)) {
1797                   Safefree(flat);
1798                   Safefree(sizes); Safefree(lhs); Safefree(rhs);
1799                   croak("oneway_test: %s", errbuf);
1800
13
          }
1801
13
          for (size_t g = 0; g < k; g++) total_n += (IV)sizes[g];
1802
13
        } else {
1803                /* MODE 1 – hash of groups  { label => \@observations, … } */
1804                k = (size_t)hv_iterinit(in_hv);
1805
13
                if (k < 2)
1806
13
                        croak("oneway_test: need at least 2 groups, got %zu", k);
1807
13
                sizes  = (size_t *)safemalloc(k * sizeof(size_t));
1808
13
                gnames = (char  **)safemalloc(k * sizeof(char *));
1809
13
                /* first pass: sizes, total_n, and group name strings */
1810                {
1811                        size_t g = 0;
1812
13
                        while ((he = hv_iternext(in_hv)) != NULL) {
1813
13
                                SV *restrict val = HeVAL(he);
1814
13
                                if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
1815                                    croak("oneway_test: value for group '%s' is not an array ref",
1816                                          HePV(he, PL_na));
1817
13
                                IV len = av_len((AV *)SvRV(val)) + 1;
1818
13
                                if (len < 2)
1819
13
                                     croak("oneway_test: group '%s' has fewer than 2 observations",
1820                                           HePV(he, PL_na));
1821                                sizes[g] = (size_t)len;
1822
37
                                total_n += (IV)len;
1823
24
                                /* save a copy of the key string */
1824
24
                                STRLEN klen;
1825
24
                                const char *kstr = HePV(he, klen);
1826
24
                                gnames[g] = (char *)safemalloc(klen + 1);
1827
24
                                memcpy(gnames[g], kstr, klen + 1);
1828
19
                                g++;
1829
19
                        }
1830
11
                }
1831
0
                /* second pass: fill flat in the same iteration order */
1832                flat = (double *)safemalloc((size_t)total_n * sizeof(double));
1833                {
1834
13
                        size_t offset = 0;
1835
0
                        hv_iterinit(in_hv);
1836                        while ((he = hv_iternext(in_hv)) != NULL) {
1837
13
                                 AV *restrict av  = (AV *)SvRV(HeVAL(he));
1838
13
                                 IV  len = av_len(av) + 1;
1839
0
                                 for (IV i = 0; i < len; i++) {
1840                                     SV **restrict svp = av_fetch(av, i, 0);
1841                                     flat[offset++] = (svp && *svp) ? SvNV(*svp) : 0.0;
1842
13
                                 }
1843
13
                        }
1844                }
1845
13
        }
1846
5
        /* per-group means from flat (before c_oneway_test frees nothing) */
1847
1
        gmeans = (double *)safemalloc(k * sizeof(double));
1848        {
1849                size_t offset = 0;
1850                for (size_t g = 0; g < k; g++) {
1851
12
                        double sum = 0.0;
1852
12
                        for (size_t i = 0; i < sizes[g]; i++) sum += flat[offset + i];
1853                        gmeans[g] = sum / (double)sizes[g];
1854                        offset   += sizes[g];
1855
12
                }
1856
9
        }
1857
9
        /* run the arithmetic  */
1858        res = c_oneway_test(flat, sizes, k, var_equal);
1859
9
        Safefree(flat);
1860
9
        if (lhs) Safefree(lhs);
1861
9
        /* rhs kept alive as factor_name until after output */
1862
0
        /* ── build return hash ref
1863        * {                                                                 *
1864
9
        * <factor>  => { Df, "Sum Sq", "Mean Sq", "F value", "Pr(>F)" }  *
1865
9
        * Residuals => { Df, "Sum Sq", "Mean Sq" }                        *
1866
0
        * group_stats => { mean => { g => v, … }, size => { g => n, … } } *
1867        * }                                                                 *
1868
9
        */
1869
9
        ret_hv = (HV *)sv_2mortal((SV *)newHV());
1870
9
        /* Group (factor) sub-hash */
1871
32
        {
1872
23
                HV *restrict g_hv = newHV();
1873
23
                hv_stores(g_hv, "Df",      newSVnv(res.num_df));
1874
0
                hv_stores(g_hv, "Sum Sq",  newSVnv(res.ss_between));
1875                hv_stores(g_hv, "Mean Sq", newSVnv(res.ms_between));
1876                hv_stores(g_hv, "F value", newSVnv(res.statistic));
1877
9
                hv_stores(g_hv, "Pr(>F)",  newSVnv(res.p_value));
1878
4
                hv_store(ret_hv, factor_name, (I32)strlen(factor_name),
1879
4
                                  newRV_noinc((SV *)g_hv), 0);
1880
12
        }
1881
8
        /* Residuals sub-hash */
1882        {
1883                HV *restrict r_hv = newHV();
1884                hv_stores(r_hv, "Df",      newSVnv(res.denom_df));
1885
3
                hv_stores(r_hv, "Sum Sq",  newSVnv(res.ss_within));
1886
3
                hv_stores(r_hv, "Mean Sq", newSVnv(res.ms_within));
1887
3
                hv_stores(ret_hv, "Residuals", newRV_noinc((SV *)r_hv));
1888
3
        }
1889
0
        /* group_stats sub-hash */
1890        {
1891                HV *restrict gs_hv   = newHV();
1892
10
                HV *restrict mean_hv = newHV();
1893
7
                HV *restrict size_hv = newHV();
1894
7
                for (size_t g = 0; g < k; g++) {
1895
0
                        const char *restrict gn  = gnames[g];
1896                        I32         gnl = (I32)strlen(gn);
1897                        hv_store(mean_hv, gn, gnl, newSVnv(gmeans[g]),       0);
1898
3
                        hv_store(size_hv, gn, gnl, newSViv((IV)sizes[g]),    0);
1899                }
1900                hv_stores(gs_hv, "mean", newRV_noinc((SV *)mean_hv));
1901
12
                hv_stores(gs_hv, "size", newRV_noinc((SV *)size_hv));
1902
12
                hv_stores(ret_hv, "group_stats", newRV_noinc((SV *)gs_hv));
1903        }
1904
12
        /* clean up */
1905
12
        Safefree(gmeans);       Safefree(sizes);
1906
12
        for (size_t g = 0; g < k; g++) Safefree(gnames[g]);
1907        Safefree(gnames);
1908        if (rhs) Safefree(rhs);
1909
12
        /* freed here, after factor_name is no longer needed */
1910
5
        RETVAL = newRV((SV *)ret_hv);
1911
1
  OUTPUT:
1912
4
    RETVAL
1913
1914
3
SV* ks_test(...)
1915CODE:
1916{
1917
3
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1918
3
        short int exact = -1;
1919        const char *restrict alternative = "two.sided";
1920
9
        int arg_idx = 0;
1921
1922
6
        // Shift arrays if provided positionally
1923        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1924
18
                x_sv = ST(arg_idx);
1925
12
                arg_idx++;
1926        }
1927        // Check if second argument is an array (2-sample) or a string representing a CDF (1-sample)
1928
3
        if (arg_idx < items) {
1929
3
                if (SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1930
12
                        y_sv = ST(arg_idx);
1931
9
                        arg_idx++;
1932
9
                } else if (SvPOK(ST(arg_idx))) {
1933                        y_sv = ST(arg_idx); // Save string (e.g., "pnorm") for 1-sample test logic
1934
3
                        arg_idx++;
1935
12
                }
1936
3
        }
1937
3
        // Parse named arguments
1938        for (; arg_idx < items; arg_idx += 2) {
1939
4
          const char *restrict key = SvPV_nolen(ST(arg_idx));
1940
4
          SV *restrict val = ST(arg_idx + 1);
1941          if      (strEQ(key, "x"))           x_sv = val;
1942
4
          else if (strEQ(key, "y"))           y_sv = val;
1943
4
          else if (strEQ(key, "exact"))       {
1944
16
                   if (!SvOK(val)) exact = -1;
1945
12
                   else exact = SvTRUE(val) ? 1 : 0;
1946
12
          }
1947          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1948
4
          else croak("ks_test: unknown argument '%s'", key);
1949
4
        }
1950
1951
4
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
1952
4
          croak("ks_test: 'x' is a required argument and must be an ARRAY reference");
1953
12
        }
1954
1955        bool is_two_sided = strEQ(alternative, "two.sided") ? 1 : 0;
1956
4
        bool is_greater   = strEQ(alternative, "greater") ? 1 : 0;
1957        bool is_less      = strEQ(alternative, "less") ? 1 : 0;
1958
1959
4
        if (!is_two_sided && !is_greater && !is_less) {
1960          croak("ks_test: alternative must be 'two.sided', 'less', or 'greater'");
1961
11
        }
1962
1963
8
        AV *restrict x_av = (AV*)SvRV(x_sv);
1964        size_t nx = av_len(x_av) + 1;
1965
8
        if (nx == 0) croak("Not enough 'x' observations");
1966
1967        // Extract 'x' array to C-array
1968
33
        double *restrict x_data = (double *)safemalloc(nx * sizeof(double));
1969
26
        size_t valid_nx = 0;
1970
26
        for (size_t i = 0; i < nx; i++) {
1971
26
          SV**restrict el = av_fetch(x_av, i, 0);
1972
26
          if (el && SvOK(*el) && looks_like_number(*el)) {
1973
15
                   x_data[valid_nx++] = SvNV(*el);
1974
1
          }
1975
1
        }
1976
1977
1
        double statistic = 0.0, p_value = 0.0;
1978
1
        const char *restrict method_desc = "";
1979
1980        // --- TWO SAMPLE ---
1981
14
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1982          AV *restrict y_av = (AV*)SvRV(y_sv);
1983
11
          size_t ny = av_len(y_av) + 1;
1984
1985          double *restrict y_data = (double *)safemalloc(ny * sizeof(double));
1986
7
          size_t valid_ny = 0;
1987          for (size_t i = 0; i < ny; i++) {
1988
3
                   SV**restrict el = av_fetch(y_av, i, 0);
1989                   if (el && SvOK(*el) && looks_like_number(*el)) {
1990
8
                       y_data[valid_ny++] = SvNV(*el);
1991
5
                   }
1992
5
          }
1993
1994          if (valid_nx < 1 || valid_ny < 1) {
1995
20
                   Safefree(x_data); Safefree(y_data);
1996
15
                   croak("Not enough non-missing observations for KS test");
1997
15
          }
1998
1999          double d, d_plus, d_minus;
2000          calc_2sample_stats(x_data, valid_nx, y_data, valid_ny, &d, &d_plus, &d_minus);
2001
2002
2
          // Map alternative to the correct statistic
2003
8
          if (is_greater) statistic = d_plus;
2004
6
          else if (is_less) statistic = d_minus;
2005
6
          else statistic = d;
2006
2007          // Determine if exact or asymptotic
2008
3
          bool use_exact = FALSE;
2009
3
          if (exact == 1) use_exact = TRUE;
2010
12
          else if (exact == 0) use_exact = FALSE;
2011
9
          else use_exact = (valid_nx * valid_ny < 10000);
2012
2013          // Check for ties in combined set
2014
3
          size_t total_n = valid_nx + valid_ny;
2015
12
          double *restrict comb = (double *)safemalloc(total_n * sizeof(double));
2016
3
          for(size_t i=0; i<valid_nx; i++) comb[i] = x_data[i];
2017          for(size_t i=0; i<valid_ny; i++) comb[valid_nx+i] = y_data[i];
2018
5
          qsort(comb, total_n, sizeof(double), compare_doubles);
2019
2020
0
          bool has_ties = FALSE;
2021
0
          for(size_t i = 1; i < total_n; i++) {
2022                   if(comb[i] == comb[i-1]) { has_ties = TRUE; break; }
2023
0
          }
2024
0
          Safefree(comb);
2025
0
          if (use_exact && has_ties) {
2026
0
                   warn("cannot compute exact p-value with ties; falling back to asymptotic");
2027
0
                   use_exact = FALSE;
2028
0
          }
2029          if (use_exact) {
2030                   method_desc = "Two-sample Kolmogorov-Smirnov exact test";
2031
0
                   double q = (0.5 + floor(statistic * valid_nx * valid_ny - 1e-7)) / ((double)valid_nx * valid_ny);
2032
0
                   p_value = psmirnov_exact_uniq_upper(q, valid_nx, valid_ny, is_two_sided);
2033          } else {
2034
5
                   method_desc = "Two-sample Kolmogorov-Smirnov test (asymptotic)";
2035
5
                   double z = statistic * sqrt((double)(valid_nx * valid_ny) / (valid_nx + valid_ny));
2036
5
                   if (is_two_sided) {
2037
5
                       p_value = K2l(z, 0, 1e-9);
2038
20
                   } else {
2039
15
                       p_value = exp(-2.0 * z * z); // One-sided limit distribution
2040
15
                   }
2041          }
2042
5
          Safefree(y_data);
2043
5
        } else if (y_sv && SvPOK(y_sv)) {// --- ONE SAMPLE (e.g. against pnorm) ---
2044
5
                const char *restrict dist = SvPV_nolen(y_sv);
2045
32
                if (strEQ(dist, "pnorm")) {
2046
27
                        qsort(x_data, valid_nx, sizeof(double), compare_doubles);
2047
27
                        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
2048
23
                        for(size_t i = 0; i < valid_nx; i++) {
2049
0
                                double cdf_obs_low  = (double)i / valid_nx;
2050
0
                                double cdf_obs_high = (double)(i + 1) / valid_nx;
2051
0
                                double cdf_theor    = approx_pnorm(x_data[i]);
2052
0
                                double diff1 = cdf_obs_low - cdf_theor;
2053
0
                                double diff2 = cdf_obs_high - cdf_theor;
2054
0
                                if (diff1 > max_d_plus) max_d_plus = diff1;
2055
0
                                if (diff2 > max_d_plus) max_d_plus = diff2;
2056
0
                                if (-diff1 > max_d_minus) max_d_minus = -diff1;
2057
0
                                if (-diff2 > max_d_minus) max_d_minus = -diff2;
2058
0
                                if (fabs(diff1) > max_d) max_d = fabs(diff1);
2059                                if (fabs(diff2) > max_d) max_d = fabs(diff2);
2060
0
                        }
2061                        if (is_greater) statistic = max_d_plus;
2062
0
                        else if (is_less) statistic = max_d_minus;
2063                        else statistic = max_d;
2064                        bool use_exact = (exact == -1) ? (valid_nx < 100) : (exact == 1);
2065
0
                        if (use_exact) {
2066                                method_desc = "One-sample Kolmogorov-Smirnov exact test";
2067                                if (is_two_sided) {
2068                                        p_value = 1.0 - K2x(valid_nx, statistic);
2069
23
                                } else {
2070
23
                                        warn("exact 1-sample 1-sided KS test not implemented; using asymptotic");
2071                                        double z = statistic * sqrt((double)valid_nx);
2072                                        p_value = exp(-2.0 * z * z);
2073
108
                                }
2074
81
                        } else {
2075
81
                                 method_desc = "One-sample Kolmogorov-Smirnov test (asymptotic)";
2076
81
                                 double z = statistic * sqrt((double)valid_nx);
2077
162
                                 if (is_two_sided) p_value = K2l(z, 0, 1e-6);
2078
81
                                 else p_value = exp(-2.0 * z * z);
2079
81
                        }
2080
81
                } else {
2081
46
                         Safefree(x_data);
2082
0
                         croak("ks_test: Unsupported 1-sample distribution '%s'. Use arrays for 2-sample.", dist);
2083
0
                }
2084
0
        } else {
2085
0
          Safefree(x_data);
2086          croak("ks_test: Invalid arguments for 'y'.");
2087
46
        }
2088        Safefree(x_data);
2089
35
        if (p_value > 1.0) p_value = 1.0;
2090        if (p_value < 0.0) p_value = 0.0;
2091        HV *restrict res = newHV();
2092
0
        hv_stores(res, "statistic", newSVnv(statistic));
2093        hv_stores(res, "p_value", newSVnv(p_value));
2094        hv_stores(res, "method", newSVpv(method_desc, 0));
2095
27
        hv_stores(res, "alternative", newSVpv(alternative, 0));
2096
27
        RETVAL = newRV_noinc((SV*)res);
2097}
2098
5
OUTPUT:
2099
3
        RETVAL
2100
2101
3
SV* wilcox_test(...)
2102
4
CODE:
2103
1
{
2104
3
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
2105
2
        bool paired = FALSE, correct = TRUE;
2106
2
        double mu = 0.0;
2107        short int exact = -1;
2108        const char *restrict alternative = "two.sided";
2109
2
        int arg_idx = 0;
2110
7
        // 1. Shift first positional argument as 'x' if it's an array reference
2111
5
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
2112
5
                x_sv = ST(arg_idx);
2113
5
                arg_idx++;
2114
5
        }
2115        // 2. Shift second positional argument as 'y' if it's an array reference
2116
14
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
2117
9
                y_sv = ST(arg_idx);
2118                arg_idx++;
2119        }
2120        // Ensure the remaining arguments form complete key-value pairs
2121
2
        if ((items - arg_idx) % 2 != 0) {
2122
2
                croak("Usage: wilcox_test(\\@x, [\\@y], key => value, ...)");
2123
7
        }
2124
5
        // --- Parse named arguments from the remaining flat stack ---
2125
5
        for (; arg_idx < items; arg_idx += 2) {
2126                const char *restrict key = SvPV_nolen(ST(arg_idx));
2127
2
                SV *restrict val = ST(arg_idx + 1);
2128
7
                if      (strEQ(key, "x"))          x_sv = val;
2129
2
                else if (strEQ(key, "y"))          y_sv = val;
2130
2
                else if (strEQ(key, "paired"))     paired = SvTRUE(val);
2131                else if (strEQ(key, "correct"))    correct = SvTRUE(val);
2132
3
                else if (strEQ(key, "mu"))          mu = SvNV(val);
2133
0
                else if (strEQ(key, "exact"))       {
2134
0
                        if (!SvOK(val)) exact = -1;
2135
0
                        else exact = SvTRUE(val) ? 1 : 0;
2136
0
                }
2137
0
                else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
2138
0
                else croak("wilcox_test: unknown argument '%s'", key);
2139
0
        }
2140
0
        // --- Validate required / types ---
2141        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
2142                croak("wilcox_test: 'x' is a required argument and must be an ARRAY reference");
2143
0
        AV *restrict x_av = (AV*)SvRV(x_sv);
2144
0
        size_t nx = av_len(x_av) + 1;
2145        if (nx == 0) croak("Not enough 'x' observations");
2146
2147
3
        AV *restrict y_av = NULL;
2148
3
        size_t ny = 0;
2149
3
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
2150
10
                y_av = (AV*)SvRV(y_sv);
2151
7
                ny = av_len(y_av) + 1;
2152
7
        }
2153        double p_value = 0.0, statistic = 0.0;
2154
3
        const char *restrict method_desc = "";
2155
3
        bool use_exact = FALSE;
2156
3
        // --- TWO SAMPLE (Mann-Whitney) ---
2157
10
        if (ny > 0 && !paired) {
2158
7
                RankInfo *restrict ri = (RankInfo *)safemalloc((nx + ny) * sizeof(RankInfo));
2159
7
                size_t valid_nx = 0, valid_ny = 0;
2160
7
                for (size_t i = 0; i < nx; i++) {
2161
7
                        SV**restrict el = av_fetch(x_av, i, 0);
2162
3
                        if (el && SvOK(*el) && looks_like_number(*el)) {
2163
0
                                ri[valid_nx].val = SvNV(*el) - mu; // R subtracts mu from x
2164
0
                                ri[valid_nx].idx = 1;
2165
0
                                valid_nx++;
2166
0
                        }
2167
0
                }
2168
0
                for (size_t i = 0; i < ny; i++) {
2169
0
                        SV**restrict el = av_fetch(y_av, i, 0);
2170                        if (el && SvOK(*el) && looks_like_number(*el)) {
2171
0
                                ri[valid_nx + valid_ny].val = SvNV(*el);
2172                                ri[valid_nx + valid_ny].idx = 2;
2173
0
                                valid_ny++;
2174                        }
2175                }
2176                if (valid_nx == 0) { Safefree(ri); croak("not enough (non-missing) 'x' observations"); }
2177
3
                if (valid_ny == 0) { Safefree(ri); croak("not enough 'y' observations"); }
2178
3
                size_t total_n = valid_nx + valid_ny;
2179                bool has_ties = 0;
2180                double tie_adj = rank_and_count_ties(ri, total_n, &has_ties);
2181                double w_rank_sum = 0.0;
2182
24
                for (size_t i = 0; i < total_n; i++) if (ri[i].idx == 1) w_rank_sum += ri[i].rank;
2183
17
                statistic = w_rank_sum - (double)valid_nx * (valid_nx + 1.0) / 2.0;
2184
2185
17
                if (exact == 1) use_exact = TRUE;
2186
17
                else if (exact == 0) use_exact = FALSE;
2187
13
                else use_exact = (valid_nx < 50 && valid_ny < 50 && !has_ties);
2188
2189
0
                if (use_exact && has_ties) {
2190
0
                        warn("cannot compute exact p-value with ties; falling back to approximation");
2191
0
                        use_exact = FALSE;
2192                }
2193
13
                if (use_exact) {
2194                        method_desc = "Wilcoxon rank sum exact test";
2195
4
                        double p_less = exact_pwilcox(statistic, valid_nx, valid_ny);
2196                        double p_greater = 1.0 - exact_pwilcox(statistic - 1.0, valid_nx, valid_ny);
2197
2198
7
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
2199
7
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
2200                        else {
2201
3
                                double p = (p_less < p_greater) ? p_less : p_greater;
2202                                p_value = 2.0 * p;
2203
11
                        }
2204
11
                } else {
2205
11
                        method_desc = correct ? "Wilcoxon rank sum test with continuity correction" : "Wilcoxon rank sum test";
2206
11
                        double exp = (double)valid_nx * valid_ny / 2.0;
2207                        double var = ((double)valid_nx * valid_ny / 12.0) * ((total_n + 1.0) - tie_adj / (total_n * (total_n - 1.0)));
2208                        double z = statistic - exp;
2209
2210                        double CORRECTION = 0.0;
2211                        if (correct) {
2212                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
2213
13
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
2214
13
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
2215
13
                        }
2216
13
                        z = (z - CORRECTION) / sqrt(var);
2217
2218                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
2219
13
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
2220                        else p_value = 2.0 * approx_pnorm(-fabs(z));
2221
13
                }
2222
13
                Safefree(ri);
2223        } else { // --- ONE SAMPLE / PAIRED ---
2224
0
                if (paired && (!y_av || nx != ny)) croak("'x' and 'y' must have the same length for paired test");
2225                double *restrict diffs = (double *)safemalloc(nx * sizeof(double));
2226
13
                size_t n_nz = 0;
2227
13
                bool has_zeroes = FALSE;
2228                for (size_t i = 0; i < nx; i++) {
2229
13
                        SV**restrict x_el = av_fetch(x_av, i, 0);
2230
13
                        if (!x_el || !SvOK(*x_el) || !looks_like_number(*x_el)) continue;
2231
0
                        double dx = SvNV(*x_el);
2232
2233
13
                        if (paired) {
2234                                SV**restrict y_el = av_fetch(y_av, i, 0);
2235
3975
                                if (!y_el || !SvOK(*y_el) || !looks_like_number(*y_el)) continue;
2236
3962
                                double dy = SvNV(*y_el);
2237
3962
                                double d = dx - dy - mu;
2238                                if (d == 0.0) has_zeroes = TRUE; // Drop exact zeroes
2239
3962
                                else diffs[n_nz++] = d;
2240
3960
                        } else {
2241
3960
                                double d = dx - mu;
2242
3696
                                if (d == 0.0) has_zeroes = TRUE;
2243                                else diffs[n_nz++] = d;
2244                        }
2245
3962
                }
2246                if (n_nz == 0) {
2247
3961
                        Safefree(diffs);
2248
3962
                        croak("not enough (non-missing) observations");
2249
3962
                }
2250                RankInfo *ri = (RankInfo *)safemalloc(n_nz * sizeof(RankInfo));
2251
3961
                for (size_t i = 0; i < n_nz; i++) {
2252                        ri[i].val = fabs(diffs[i]);
2253                        ri[i].idx = (diffs[i] > 0);
2254
3961
                }
2255
0
                bool has_ties = 0;
2256                double tie_adj = rank_and_count_ties(ri, n_nz, &has_ties);
2257                statistic = 0.0;
2258                for (size_t i = 0; i < n_nz; i++) {
2259
293588
                        if (ri[i].idx) statistic += ri[i].rank;
2260
289626
                }
2261
289626
                if (exact == 1) use_exact = TRUE;
2262
289625
                else if (exact == 0) use_exact = FALSE;
2263
22325
                else use_exact = (n_nz < 50 && !has_ties);
2264
3
                if (use_exact && has_ties) {
2265
3
                        warn("cannot compute exact p-value with ties; falling back to approximation");
2266
22322
                        use_exact = FALSE;
2267
11160
                }
2268
11160
                if (use_exact && has_zeroes) {
2269
11162
                        warn("cannot compute exact p-value with zeroes; falling back to approximation");
2270
11161
                        use_exact = FALSE;
2271                }
2272
267300
                if (use_exact) {
2273
51615
                        method_desc = paired ? "Wilcoxon exact signed rank test" : "Wilcoxon exact signed rank test";
2274
51615
                        double p_less = exact_psignrank(statistic, n_nz);
2275
51615
                        double p_greater = 1.0 - exact_psignrank(statistic - 1.0, n_nz);
2276
2277                        if (strcmp(alternative, "less") == 0) p_value = p_less;
2278
215685
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
2279                        else {
2280                                double p = (p_less < p_greater) ? p_less : p_greater;
2281
3962
                                p_value = 2.0 * p;
2282                        }
2283
2
                } else {
2284                        method_desc = correct ? "Wilcoxon signed rank test with continuity correction" : "Wilcoxon signed rank test";
2285
3960
                        double exp = (double)n_nz * (n_nz + 1.0) / 4.0;
2286                        double var = (n_nz * (n_nz + 1.0) * (2.0 * n_nz + 1.0) / 24.0) - (tie_adj / 48.0);
2287
3960
                        double z = statistic - exp;
2288
3960
                        double CORRECTION = 0.0;
2289                        if (correct) {
2290
3960
                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
2291
3960
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
2292
3960
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
2293
3960
                        }
2294
3960
                        z = (z - CORRECTION) / sqrt(var);
2295
2296
3960
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
2297
3960
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
2298
3960
                        else p_value = 2.0 * approx_pnorm(-fabs(z));
2299
3960
                }
2300
3960
                Safefree(ri); Safefree(diffs);
2301        }
2302
0
        if (p_value > 1.0) p_value = 1.0;
2303        HV *restrict res = newHV();
2304
3960
        hv_stores(res, "statistic", newSVnv(statistic));
2305        hv_stores(res, "p_value", newSVnv(p_value));
2306        hv_stores(res, "method", newSVpv(method_desc, 0));
2307
13
        hv_stores(res, "alternative", newSVpv(alternative, 0));
2308
13
        RETVAL = newRV_noinc((SV*)res);
2309}
2310
13
OUTPUT:
2311
1
        RETVAL
2312
2313
1
SV* _chisq_c(data_ref)
2314
1
        SV* data_ref;
2315
1
CODE:
2316
1
{
2317
1
        AV*restrict obs_av = (AV*)SvRV(data_ref);
2318
1
        int r = av_top_index(obs_av) + 1, c = 0;
2319
1
        bool is_2d = 0;
2320
1
        SV**restrict first_elem = av_fetch(obs_av, 0, 0);
2321
1
        if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
2322
1
                is_2d = 1;
2323                AV*restrict first_row = (AV*)SvRV(*first_elem);
2324
0
                c = av_top_index(first_row) + 1;
2325        } else {
2326
1
                c = r;
2327                r = 1;
2328
13
        }
2329
2330
13
        double stat = 0.0, grand_total = 0.0;
2331
13
        int df = 0;
2332        int yates = (is_2d && r == 2 && c == 2) ? 1 : 0;
2333
2334        AV*restrict expected_av = newAV();
2335        if (is_2d) {
2336                double *restrict row_sum = (double*)safemalloc(r * sizeof(double));
2337                double *restrict col_sum = (double*)safemalloc(c * sizeof(double));
2338                for(unsigned int i=0; i<r; i++) row_sum[i] = 0.0;
2339                for(unsigned int j=0; j<c; j++) col_sum[j] = 0.0;
2340                for (unsigned int i = 0; i < r; i++) {
2341                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
2342
4
                        AV*restrict row = (AV*)SvRV(*row_sv);
2343
0
                        for (unsigned int j = 0; j < c; j++) {
2344                                 SV**restrict val_sv = av_fetch(row, j, 0);
2345
4
                                 double val = SvNV(*val_sv);
2346
0
                                 row_sum[i] += val;
2347                                 col_sum[j] += val;
2348                                 grand_total += val;
2349                        }
2350
4
                }
2351
2
                for (unsigned int i = 0; i < r; i++) {
2352
1
                        AV*restrict exp_row = newAV();
2353
0
                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
2354                        AV*restrict row = (AV*)SvRV(*row_sv);
2355                        for (unsigned int j = 0; j < c; j++) {
2356
4
                                double E = (row_sum[i] * col_sum[j]) / grand_total;
2357
4
                                SV**restrict val_sv = av_fetch(row, j, 0);
2358
4
                                double O = SvNV(*val_sv);
2359
4
                                av_push(exp_row, newSVnv(E));
2360                                if (yates) {
2361
4
                                  // Exact R logic: min(0.5, abs(O - E))
2362
0
                                  double abs_diff = fabs(O - E);
2363                                  double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff;
2364                                  double diff = abs_diff - y_corr;
2365                                  stat += (diff * diff) / E;
2366                                } else {
2367                                  stat += ((O - E) * (O - E)) / E;
2368
4
                                }
2369
4
                        }
2370
4
                        av_push(expected_av, newRV_noinc((SV*)exp_row));
2371                }
2372
24
                safefree(row_sum); safefree(col_sum);
2373
20
                df = (r - 1) * (c - 1);
2374
20
        } else {
2375          for (unsigned int j = 0; j < c; j++) {
2376                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
2377
20
                   grand_total += SvNV(*val_sv);
2378
20
          }
2379          double E = grand_total / (double)c;
2380          for (unsigned int j = 0; j < c; j++) {
2381
20
                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
2382
20
                   double O = SvNV(*val_sv);
2383
20
                   av_push(expected_av, newSVnv(E));
2384
20
                   stat += ((O - E) * (O - E)) / E;
2385          }
2386          df = c - 1;
2387        }
2388        double p_val = get_p_value(stat, df);
2389
4
        HV*restrict results = newHV();
2390
0
        hv_store(results, "statistic", 9, newSVnv(stat), 0);
2391
0
        hv_store(results, "df", 2, newSViv(df), 0);
2392        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
2393
4
        hv_store(results, "expected", 8, newRV_noinc((SV*)expected_av), 0);
2394        if (is_2d) {
2395
4
                if (yates) {
2396                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test with Yates' continuity correction", 0), 0);
2397
6
                } else {
2398
30
                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test", 0), 0);
2399
25
                }
2400
25
        } else {
2401
25
          hv_store(results, "method", 6, newSVpv("Chi-squared test for given probabilities", 0), 0);
2402        }
2403        RETVAL = newRV_noinc((SV*)results);
2404}
2405
3
OUTPUT:
2406
3
        RETVAL
2407
2408
1
PROTOTYPES: ENABLE
2409
2410void write_table(...)
2411PPCODE:
2412
1
{
2413
1
    SV *restrict data_sv = NULL;
2414    SV *restrict file_sv = NULL;
2415
6
    unsigned int arg_idx = 0;
2416
2417
5
    // Mimic the Perl shift logic
2418
5
    if (arg_idx < items && SvROK(ST(arg_idx))) {
2419
5
        int type = SvTYPE(SvRV(ST(arg_idx)));
2420
5
        if (type == SVt_PVHV || type == SVt_PVAV) {
2421            data_sv = ST(arg_idx);
2422            arg_idx++;
2423
1
        }
2424
1
    }
2425    if (arg_idx < items) {
2426        file_sv = ST(arg_idx);
2427
12
        arg_idx++;
2428
10
    }
2429
2430
10
    const char *restrict sep = ",";
2431
10
    bool explicit_sep = 0; // Track if delimiter was manually specified
2432
10
    const char *restrict undef_val = "NA";
2433    SV *restrict row_names_sv = sv_2mortal(newSViv(1));
2434    SV *restrict col_names_sv = NULL;
2435
2436    // Read the remaining Hash-style arguments
2437
3
    for (; arg_idx < items; arg_idx += 2) {
2438        if (arg_idx + 1 >= items) croak("write_table: Odd number of arguments passed");
2439        const char *restrict key = SvPV_nolen(ST(arg_idx));
2440
4
        SV *restrict val = ST(arg_idx + 1);
2441
4
        if (strEQ(key, "data")) data_sv = val;
2442
4
        else if (strEQ(key, "col.names")) col_names_sv = val;
2443        else if (strEQ(key, "file")) file_sv = val;
2444        else if (strEQ(key, "row.names")) row_names_sv = val;
2445        // NEW: Check for either "sep" or "delim" and mark as explicitly provided
2446        else if (strEQ(key, "sep") || strEQ(key, "delim")) {
2447            sep = SvPV_nolen(val);
2448            explicit_sep = 1;
2449        }
2450        else if (strEQ(key, "undef.val")) undef_val = SvPV_nolen(val);
2451
7
        else croak("write_table: Unknown arguments passed: %s", key);
2452
7
    }
2453
2454    if (!data_sv || !SvROK(data_sv)) {
2455        croak("write_table: 'data' must be a HASH or ARRAY reference\n");
2456    }
2457    SV *restrict data_ref = SvRV(data_sv);
2458
7
    if (SvTYPE(data_ref) != SVt_PVHV && SvTYPE(data_ref) != SVt_PVAV) {
2459
7
        croak("write_table: 'data' must be a HASH or ARRAY reference\n");
2460
7
    }
2461
2462
7
    if (!file_sv || !SvOK(file_sv)) croak("write_table: file name missing\n");
2463
7
    const char *restrict file = SvPV_nolen(file_sv);
2464
2465
7
    // NEW: Auto-detect separator from file extension if not overridden
2466
7
    if (!explicit_sep) {
2467        size_t file_len = strlen(file);
2468
7
        if (file_len >= 4) {
2469
7
            const char *restrict ext = file + file_len - 4;
2470
7
            if (strEQ(ext, ".tsv") || strEQ(ext, ".TSV")) {
2471
7
                sep = "\t";
2472
7
            } else if (strEQ(ext, ".csv") || strEQ(ext, ".CSV")) {
2473                sep = ",";
2474
7
            }
2475
7
        }
2476
7
    }
2477
2478    if (col_names_sv && SvOK(col_names_sv)) {
2479        if (!SvROK(col_names_sv) || SvTYPE(SvRV(col_names_sv)) != SVt_PVAV) {
2480            croak("write_table: 'col.names' must be an ARRAY reference\n");
2481        }
2482    }
2483
2484    bool is_hoh = 0, is_hoa = 0, is_aoh = 0;
2485
26
    AV *restrict rows_av = NULL;
2486
2487
19
    // Validate Input Structures & Homogeneity
2488
19
    if (SvTYPE(data_ref) == SVt_PVHV) {
2489
12
        HV *restrict hv = (HV*)data_ref;
2490
5
        if (hv_iterinit(hv) == 0) XSRETURN_EMPTY;
2491
2492        HE *restrict entry = hv_iternext(hv);
2493
7
        SV *restrict first_val = hv_iterval(hv, entry);
2494
7
        if (!first_val || !SvROK(first_val)) {
2495            croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
2496
7
        }
2497
7
        int first_type = SvTYPE(SvRV(first_val));
2498
7
        if (first_type != SVt_PVHV && first_type != SVt_PVAV) {
2499            croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
2500        }
2501
7
        is_hoh = (first_type == SVt_PVHV);
2502
7
        is_hoa = (first_type == SVt_PVAV);
2503
7
        hv_iterinit(hv);
2504        while ((entry = hv_iternext(hv))) {
2505
7
            SV *restrict val = hv_iterval(hv, entry);
2506
98
            if (!val || !SvROK(val) || SvTYPE(SvRV(val)) != first_type) {
2507
7
                croak("write_table: Mixed data types detected. Ensure all values are %s references.\n", is_hoh ? "HASH" : "ARRAY");
2508            }
2509
7
        }
2510
7
        if (is_hoh) {
2511
7
            rows_av = newAV();
2512
7
            hv_iterinit(hv);
2513            while ((entry = hv_iternext(hv))) {
2514
7
                av_push(rows_av, newSVsv(hv_iterkeysv(entry)));
2515
7
            }
2516        }
2517
7
    } else {
2518
20
        AV *restrict av = (AV*)data_ref;
2519
13
        if (av_len(av) < 0) XSRETURN_EMPTY;
2520
0
        SV **restrict first_ptr = av_fetch(av, 0, 0);
2521
0
        if (!first_ptr || !*first_ptr || !SvROK(*first_ptr) || SvTYPE(SvRV(*first_ptr)) != SVt_PVHV) {
2522            croak("write_table: For ARRAY data, all elements must be HASH references (Array of Hashes)\n");
2523
13
        }
2524
2525
0
        for (size_t i = 0; i <= av_len(av); i++) {
2526            SV **restrict ptr = av_fetch(av, i, 0);
2527
13
            if (!ptr || !*ptr || !SvROK(*ptr) || SvTYPE(SvRV(*ptr)) != SVt_PVHV) {
2528
13
                croak("write_table: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n");
2529
0
            }
2530
0
        }
2531
0
        is_aoh = 1;
2532
0
    }
2533
2534
0
    PerlIO *restrict fh = PerlIO_open(file, "w");
2535
0
    if (!fh) croak("write_table: Could not open '%s' for writing", file);
2536
2537
0
    AV *restrict headers_av = newAV();
2538
0
    bool inc_rownames = (row_names_sv && SvTRUE(row_names_sv)) ? 1 : 0;
2539    const char *restrict rownames_col = NULL;
2540
2541
13
    // ----- Hash of Hashes -----
2542
13
    if (is_hoh) {
2543        if (col_names_sv && SvOK(col_names_sv)) {
2544
13
            AV *restrict c_av = (AV*)SvRV(col_names_sv);
2545            for(size_t i=0; i<=av_len(c_av); i++) {
2546                SV **restrict c = av_fetch(c_av, i, 0);
2547
27
                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2548
20
            }
2549
39
        } else {
2550
19
            HV *restrict col_map = newHV();
2551            hv_iterinit((HV*)data_ref);
2552
20
            HE *restrict entry;
2553            while((entry = hv_iternext((HV*)data_ref))) {
2554
7
                HV *restrict inner = (HV*)SvRV(hv_iterval((HV*)data_ref, entry));
2555                hv_iterinit(inner);
2556                HE *restrict inner_entry;
2557
7
                while((inner_entry = hv_iternext(inner))) {
2558
7
                    hv_store_ent(col_map, hv_iterkeysv(inner_entry), newSViv(1), 0);
2559
7
                }
2560
7
            }
2561
7
            unsigned num_cols = hv_iterinit(col_map);
2562
7
            const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2563
7
            for(unsigned i=0; i<num_cols; i++) {
2564
7
                HE *restrict ce = hv_iternext(col_map);
2565
2
                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2566
2
            }
2567
2
            qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2568
122
            for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2569
120
            safefree(col_array);
2570
120
            SvREFCNT_dec(col_map);
2571    }
2572
5
    size_t num_headers = av_len(headers_av) + 1;
2573
5
    const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2574
2575
5
    size_t h_idx = 0;
2576
165
    if (inc_rownames) header_row[h_idx++] = "";
2577    for(unsigned short int i=0; i<num_headers; i++) {
2578
160
        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2579
160
        header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2580
160
    }
2581    print_string_row(fh, header_row, h_idx, sep);
2582
0
    safefree(header_row);
2583
2584
0
    size_t num_rows = av_len(rows_av) + 1;
2585
0
    const char **restrict row_array = safemalloc(num_rows * sizeof(char*));
2586
0
    for(size_t i=0; i<num_rows; i++) {
2587
0
        row_array[i] = SvPV_nolen(*av_fetch(rows_av, i, 0));
2588
0
    }
2589
0
    qsort(row_array, num_rows, sizeof(char*), cmp_string_wt);
2590
2591
0
    HV *restrict data_hv = (HV*)data_ref;
2592
0
    const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2593
2594    for(size_t i=0; i<num_rows; i++) {
2595
0
        size_t d_idx = 0;
2596
0
        if (inc_rownames) row_data[d_idx++] = row_array[i];
2597
2598        SV **restrict inner_hv_ptr = hv_fetch(data_hv, row_array[i], strlen(row_array[i]), 0);
2599        HV *restrict inner_hv = inner_hv_ptr ? (HV*)SvRV(*inner_hv_ptr) : NULL;
2600
2601        for(size_t j=0; j<num_headers; j++) {
2602            SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2603
27
            const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2604
20
            SV **restrict cell_ptr = inner_hv ? hv_fetch(inner_hv, col_name, strlen(col_name), 0) : NULL;
2605
0
            if (cell_ptr && SvOK(*cell_ptr)) {
2606
0
            if (SvROK(*cell_ptr)) {
2607
0
              PerlIO_close(fh);
2608              safefree(row_array); safefree(row_data);
2609
20
              if (headers_av) SvREFCNT_dec(headers_av);
2610
7
              if (rows_av) SvREFCNT_dec(rows_av);
2611              croak("write_table: Cannot write nested reference types to table\n");
2612
13
            }
2613
1
                row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2614
1
            } else {
2615
61
                row_data[d_idx++] = undef_val;
2616
60
            }
2617
60
        }
2618
60
        print_string_row(fh, row_data, d_idx, sep);
2619
90
    }
2620
88
    safefree(row_array); safefree(row_data);
2621
2622
60
    } else if (is_hoa) { // ----- Hash of Arrays -----
2623
2
        HV *restrict data_hv = (HV*)data_ref;
2624
2
        size_t max_rows = 0;
2625        hv_iterinit(data_hv);
2626
60
        HE *restrict entry;
2627        while((entry = hv_iternext(data_hv))) {
2628            AV *restrict arr = (AV*)SvRV(hv_iterval(data_hv, entry));
2629
1
            size_t len = av_len(arr) + 1;
2630
2
            if (len > max_rows) max_rows = len;
2631
2
        }
2632
2633
1
        if (col_names_sv && SvOK(col_names_sv)) {
2634            AV *restrict c_av = (AV*)SvRV(col_names_sv);
2635            for(size_t i=0; i<=av_len(c_av); i++) {
2636                SV **restrict c = av_fetch(c_av, i, 0);
2637
2
                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2638
1
            }
2639
0
        } else {
2640
0
            unsigned int num_cols = hv_iterinit(data_hv);
2641
0
            const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2642            for(unsigned int i=0; i<num_cols; i++) {
2643
1
                HE *restrict ce = hv_iternext(data_hv);
2644
1
                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2645
1
            }
2646
1
            qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2647
1
            for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2648            safefree(col_array);
2649
3
        }
2650
1
        if (av_len(headers_av) < 0) croak("Could not get headers in write_table");
2651        if (inc_rownames && contains_nondigit(row_names_sv)) {
2652
0
            rownames_col = SvPV_nolen(row_names_sv);
2653            AV *restrict filtered_headers = (AV*)sv_2mortal((SV*)newAV());
2654
2655
12
            for(size_t i=0; i<=av_len(headers_av); i++) {
2656                SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2657                if (!h_ptr || !*h_ptr) continue;
2658
7
                SV *restrict h_sv = *h_ptr;
2659                if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2660
7
                    av_push(filtered_headers, newSVsv(h_sv));
2661
7
                }
2662            }
2663            SvREFCNT_dec(headers_av);
2664
287
            headers_av = filtered_headers;
2665
280
        }
2666
280
        size_t num_headers = av_len(headers_av) + 1;
2667        const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2668
280
        size_t h_idx = 0;
2669
280
        if (inc_rownames) header_row[h_idx++] = "";
2670
1060
        for(size_t i=0; i<num_headers; i++) {
2671
780
            SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2672
280
            header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2673
500
        }
2674
60
        print_string_row(fh, header_row, h_idx, sep);
2675
60
        safefree(header_row);
2676
60
        const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2677
60
        for(size_t i=0; i<max_rows; i++) {
2678
0
            size_t d_idx = 0;
2679            if (inc_rownames) {
2680
440
                if (rownames_col) {
2681
440
                    SV **restrict rn_arr_ptr = hv_fetch(data_hv, rownames_col, strlen(rownames_col), 0);
2682                    if (rn_arr_ptr && SvROK(*rn_arr_ptr)) {
2683                        AV *restrict rn_arr = (AV*)SvRV(*rn_arr_ptr);
2684
280
                        SV **restrict rn_val_ptr = av_fetch(rn_arr, i, 0);
2685
280
                        if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2686
1060
                            if (SvROK(*rn_val_ptr)) {
2687
280
                                   PerlIO_close(fh);
2688
280
                                   safefree(row_data);
2689
280
                                   if (headers_av) SvREFCNT_dec(headers_av);
2690                                   croak("write_table: Cannot write nested reference types to table\n");
2691
7
                             }
2692
7
                             row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2693
0
                         } else {
2694
0
                            row_data[d_idx++] = undef_val;
2695                         }
2696                    } else {
2697
7
                         row_data[d_idx++] = undef_val;
2698
7
                    }
2699
7
                } else {
2700
7
                    char buf[32];
2701
7
                    snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2702
27
                    row_data[d_idx++] = savepv(buf);
2703                }
2704
7
            }
2705
287
            for(size_t j=0; j<num_headers; j++) {
2706
7
                SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2707
287
                const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2708
280
                SV **restrict arr_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0);
2709
32
                if (arr_ptr && SvROK(*arr_ptr)) {
2710
32
                    AV *restrict arr = (AV*)SvRV(*arr_ptr);
2711
32
                    SV **restrict cell_ptr = av_fetch(arr, i, 0);
2712
32
                    if (cell_ptr && SvOK(*cell_ptr)) {
2713
32
                         if (SvROK(*cell_ptr)) {
2714
13
                              PerlIO_close(fh);
2715
0
                              safefree(row_data);
2716
32
                              if (headers_av) SvREFCNT_dec(headers_av);
2717                              croak("write_table: Cannot write nested reference types to table\n");
2718
248
                         }
2719
248
                         row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2720                    } else {
2721                         row_data[d_idx++] = undef_val;
2722                    }
2723
21
                } else {
2724
805
                    row_data[d_idx++] = undef_val;
2725
784
                }
2726
288
            }
2727
288
            print_string_row(fh, row_data, d_idx, sep);
2728
288
            if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2729
288
        }
2730
288
        safefree(row_data);
2731    } else if (is_aoh) {// ----- Array of Hashes -----
2732
496
      AV *restrict data_av = (AV*)data_ref;
2733
496
      size_t num_rows = av_len(data_av) + 1;
2734      if (col_names_sv && SvOK(col_names_sv)) {
2735           AV *restrict c_av = (AV*)SvRV(col_names_sv);
2736           for(size_t i=0; i<=av_len(c_av); i++) {
2737
261
                SV **restrict c = av_fetch(c_av, i, 0);
2738
805
                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2739
784
           }
2740
3016
      } else {
2741
2232
           HV *restrict col_map = newHV();
2742
2232
           for(size_t i=0; i<num_rows; i++) {
2743
8688
                SV **restrict row_ptr = av_fetch(data_av, i, 0);
2744                if (row_ptr && SvROK(*row_ptr)) {
2745                    HV *restrict row_hv = (HV*)SvRV(*row_ptr);
2746
21
                    hv_iterinit(row_hv);
2747
82
                    HE *restrict entry;
2748
61
                    while((entry = hv_iternext(row_hv))) {
2749
61
                        hv_store_ent(col_map, hv_iterkeysv(entry), newSViv(1), 0);
2750
240
                    }
2751
61
                }
2752           }
2753           unsigned num_cols = hv_iterinit(col_map);
2754           const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2755
21
           for(unsigned int i=0; i<num_cols; i++) {
2756
231
                HE *restrict ce = hv_iternext(col_map);
2757
210
                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2758
8050
           }
2759
7840
           qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2760
30160
           for(unsigned int i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2761
7840
           safefree(col_array);
2762
7840
           SvREFCNT_dec(col_map);
2763
2880
      }
2764      if (inc_rownames && contains_nondigit(row_names_sv)) {
2765
2880
           rownames_col = SvPV_nolen(row_names_sv);
2766
2880
           AV *restrict filtered_headers = newAV();
2767           for(size_t i=0; i<=av_len(headers_av); i++) {
2768
2880
                SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2769
2880
                if (!h_ptr || !*h_ptr) continue;
2770
1170
                SV *restrict h_sv = *h_ptr;
2771
0
                if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2772
2880
                    av_push(filtered_headers, newSVsv(h_sv));
2773                }
2774
4960
           }
2775
4960
           SvREFCNT_dec(headers_av);
2776
4960
           headers_av = filtered_headers;
2777      }
2778      size_t num_headers = av_len(headers_av) + 1;
2779      const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2780
210
      size_t h_idx = 0;
2781
200
      if (inc_rownames) header_row[h_idx++] = "";
2782      for(size_t i=0; i<num_headers; i++) {
2783           SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2784
10
           header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2785
40
      }
2786      print_string_row(fh, header_row, h_idx, sep);
2787      safefree(header_row);
2788
21
      const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2789
7
      for(size_t i=0; i<num_rows; i++) {
2790           size_t d_idx = 0;
2791
14
           SV **restrict row_ptr = av_fetch(data_av, i, 0);
2792
55
           HV *restrict row_hv = (row_ptr && SvROK(*row_ptr)) ? (HV*)SvRV(*row_ptr) : NULL;
2793           if (inc_rownames) {
2794               if (rownames_col) {
2795
85
                 SV **restrict rn_val_ptr = row_hv ? hv_fetch(row_hv, rownames_col, strlen(rownames_col), 0) : NULL;
2796
287
                 if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2797
280
                       if (SvROK(*rn_val_ptr)) {
2798
280
                            PerlIO_close(fh);
2799
1060
                                 safefree(row_data);
2800
780
                                 if (headers_av) SvREFCNT_dec(headers_av);
2801
3000
                            croak("write_table: Cannot write nested reference types to table\n");
2802                       }
2803                       row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2804
7
                 } else {
2805                       row_data[d_idx++] = undef_val;
2806
7
                 }
2807
287
               } else {
2808
280
                 char buf[32];
2809
32
                 snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2810
13
                 row_data[d_idx++] = savepv(buf);
2811
0
               }
2812           }
2813
2814
248
           for(size_t j=0; j<num_headers; j++) {
2815                SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2816                const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2817                SV **restrict cell_ptr = row_hv ? hv_fetch(row_hv, col_name, strlen(col_name), 0) : NULL;
2818
7
                if (cell_ptr && SvOK(*cell_ptr)) {
2819
6
                    if (SvROK(*cell_ptr)) {
2820
6
                        PerlIO_close(fh);
2821
1
                        safefree(row_data);
2822
1
                        if (headers_av) SvREFCNT_dec(headers_av);
2823                        croak("write_table: Cannot write nested reference types to table\n");
2824                    }
2825
7
                    row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2826
7
                } else {
2827
7
                    row_data[d_idx++] = undef_val;
2828
287
                }
2829
280
           }
2830
280
           print_string_row(fh, row_data, d_idx, sep);
2831           if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2832
32
      }
2833
32
      safefree(row_data);
2834
13
    }
2835
0
    if (headers_av) SvREFCNT_dec(headers_av);
2836
32
    if (rows_av) SvREFCNT_dec(rows_av);
2837    PerlIO_close(fh);
2838
280
    XSRETURN_EMPTY;
2839
280
}
2840
2841SV*
2842
7
_parse_csv_file(char* file, const char* sep_str, const char* comment_str, SV* callback = &PL_sv_undef)
2843INIT:
2844
7
        PerlIO *restrict fp;
2845
27
        AV *restrict data = NULL;
2846
20
        AV *restrict current_row = newAV();
2847
20
        SV *restrict field = newSVpvs("");
2848        bool in_quotes = 0, post_quote = 0;
2849
20
        size_t sep_len, comment_len;
2850
20
        SV *restrict line_sv;
2851
0
        bool use_cb = 0;
2852
0
CODE:
2853
0
        if (SvOK(callback) && SvROK(callback) && SvTYPE(SvRV(callback)) == SVt_PVCV) {
2854
0
                use_cb = 1;
2855        } else {
2856
20
                data = newAV();
2857
20
        }
2858
20
        sep_len = sep_str ? strlen(sep_str) : 0;
2859        comment_len = comment_str ? strlen(comment_str) : 0;
2860
2861
20
        fp = PerlIO_open(file, "r");
2862
20
        if (!fp) {
2863
20
                croak("Could not open file '%s'", file);
2864        }
2865
20
        line_sv = newSV_type(SVt_PV);
2866        // Read line by line using PerlIO
2867        while (sv_gets(line_sv, fp, 0) != NULL) {
2868
7
                char *restrict line = SvPV_nolen(line_sv);
2869
7
                size_t len = SvCUR(line_sv);
2870
7
                // chomp \r\n (Handles Windows invisible \r natively)
2871
7
                if (len > 0 && line[len-1] == '\n') {
2872
7
                        len--;
2873
7
                        if (len > 0 && line[len-1] == '\r') {
2874
7
                                len--;
2875
7
                        }
2876
7
                }
2877
7
                if (!in_quotes) {
2878
7
                        // Skip completely empty lines (\h*[\r\n]+$ equivalent)
2879
7
                        bool is_empty = 1;
2880
7
                        for (size_t i = 0; i < len; i++) {
2881
7
                                if (line[i] != ' ' && line[i] != '\t') { is_empty = 0; break; }
2882
7
                        }
2883                        if (is_empty) continue;
2884
2885
27
                        // Skip comments
2886
7
                        if (comment_len > 0 && len >= comment_len && strncmp(line, comment_str, comment_len) == 0) {
2887
27
                                continue;
2888
7
                        }
2889
27
                }
2890
20
                // --- CORE PARSING MACHINE ---
2891
20
                for (size_t i = 0; i < len; i++) {
2892                        const char ch = line[i];
2893
7
                        if (ch == '\r') continue;
2894                        if (ch == '"') {
2895
7
                                if (in_quotes && (i + 1 < len) && line[i+1] == '"') {
2896
7
                                        sv_catpvn(field, "\"", 1);
2897
7
                                        i++; // Skip the escaped second quote
2898
7
                                } else if (in_quotes) {
2899                                        in_quotes = 0;  // Close quotes
2900
7
                                        post_quote = 1;
2901                                } else if (!post_quote) {
2902                                        in_quotes = 1; // Open quotes (only when not in post-quote state)
2903                                }
2904                        } else if (!in_quotes && sep_len > 0 && (len - i) >= sep_len && strncmp(line + i, sep_str, sep_len) == 0) {
2905                                av_push(current_row, newSVsv(field));
2906                                sv_setpvs(field, ""); // Reset for next field
2907                                i += sep_len - 1;     // Advance past multi-char separators
2908
7
                                post_quote = 0;
2909
0
                        } else {
2910                                sv_catpvn(field, &ch, 1);
2911
7
                        }
2912                }
2913
7
                if (in_quotes) {
2914
7
                        // Line ended but quotes are still open! Append newline and fetch next
2915
7
                        sv_catpvn(field, "\n", 1);
2916
7
                } else {
2917
7
                        post_quote = 0; // Reset post-quote state at row boundary
2918                        // Push the final field of the record
2919                        av_push(current_row, newSVsv(field));
2920
35
                        sv_setpvs(field, "");
2921
28
                        // If a callback is provided, invoke it in a streaming fashion
2922
28
                        if (use_cb) {
2923                                dSP;
2924
28
                                ENTER;
2925
21
                                SAVETMPS;
2926
14
                                PUSHMARK(SP);
2927
14
                                XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2928
7
                                PUTBACK;
2929
0
                                call_sv(callback, G_DISCARD);
2930                                FREETMPS;
2931                                LEAVE;
2932                                SvREFCNT_dec(current_row); // Frees the row from C memory if Perl didn't keep it
2933                        } else {
2934
7
                                av_push(data, newRV_noinc((SV*)current_row));
2935                        }
2936
7
                        current_row = newAV();
2937
7
                }
2938
7
        }
2939        PerlIO_close(fp);
2940        SvREFCNT_dec(line_sv);
2941
2942
7
        if (in_quotes) {
2943
0
                av_push(current_row, newSVsv(field));
2944                if (use_cb) {
2945                        dSP;
2946
7
                        ENTER;
2947
7
                        SAVETMPS;
2948                        PUSHMARK(SP);
2949
7
                        XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2950
7
                        PUTBACK;
2951                        call_sv(callback, G_DISCARD);
2952
7
                        FREETMPS;
2953
7
                        LEAVE;
2954                        SvREFCNT_dec(current_row);
2955
7
                } else {
2956
46
                        av_push(data, newRV_noinc((SV*)current_row));
2957
39
                }
2958
39
                current_row = newAV();
2959        }
2960
39
        SvREFCNT_dec(field);
2961
39
        SvREFCNT_dec(current_row);
2962        if (use_cb) {
2963                RETVAL = &PL_sv_undef; // Memory was fully handled by callback stream
2964
39
        } else {
2965
35
                RETVAL = newRV_noinc((SV*)data);
2966
35
        }
2967
35
OUTPUT:
2968        RETVAL
2969
2970SV* cov(SV* x_sv, SV* y_sv, const char* method = "pearson")
2971
7
        CODE:
2972
0
        {
2973
0
                // 1. Validate inputs are Array References
2974
0
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
2975                        croak("cov: first argument 'x' must be an ARRAY reference");
2976                }
2977
7
                if (!SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) {
2978                        croak("cov: second argument 'y' must be an ARRAY reference");
2979
4
                }
2980
2981
20
                // 2. Validate method argument
2982
20
                if (strcmp(method, "pearson") != 0 &&
2983
20
                        strcmp(method, "spearman") != 0 &&
2984
20
                        strcmp(method, "kendall") != 0) {
2985
20
                        croak("cov: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", method);
2986
20
                }
2987
2988                AV *restrict x_av = (AV*)SvRV(x_sv);
2989
4
                AV *restrict y_av = (AV*)SvRV(y_sv);
2990
4
                size_t nx = av_len(x_av) + 1;
2991
4
                size_t ny = av_len(y_av) + 1;
2992
2993                if (nx != ny) {
2994
4
                        croak("cov: incompatible dimensions (x has %lu, y has %lu)",
2995
4
                                   (unsigned long)nx, (unsigned long)ny);
2996
4
                }
2997
2998
4
                // 3. Extract Valid Pairwise Data
2999
4
                // Allocate temporary C arrays for numeric processing
3000                double *restrict x_val = (double*)safemalloc(nx * sizeof(double));
3001                double *restrict y_val = (double*)safemalloc(nx * sizeof(double));
3002
4
                size_t n = 0;
3003
3004
2
                for (size_t i = 0; i < nx; i++) {
3005
10
                        SV **restrict x_tv = av_fetch(x_av, i, 0);
3006
28
                        SV **restrict y_tv = av_fetch(y_av, i, 0);
3007
3008
20
                        // Extract numeric values, defaulting to NAN for missing/invalid data
3009                        double xv = (x_tv && SvOK(*x_tv) && looks_like_number(*x_tv)) ? SvNV(*x_tv) : NAN;
3010
20
                        double yv = (y_tv && SvOK(*y_tv) && looks_like_number(*y_tv)) ? SvNV(*y_tv) : NAN;
3011
3012
20
                        // Pairwise complete observations (skips NAs seamlessly like R)
3013
20
                        if (!isnan(xv) && !isnan(yv)) {
3014
4
                                 x_val[n] = xv;
3015                                 y_val[n] = yv;
3016                                 n++;
3017
2
                        }
3018
2
                }
3019
3020
2
                // 4. Handle edge cases where data is too sparse
3021                if (n < 2) {
3022                        Safefree(x_val);        Safefree(y_val);
3023                        RETVAL = newSVnv(NAN);
3024
2
                } else {
3025
2
                        double ans = 0.0;                       
3026                        // 5. Algorithm routing
3027
0
                        if (strcmp(method, "kendall") == 0) {
3028                                 // R's default cov(..., method="kendall") iterates the full n x n space
3029                                 for (size_t i = 0; i < n; i++) {
3030
2
                                     for (size_t j = 0; j < n; j++) {
3031                                         int sx = (x_val[i] > x_val[j]) - (x_val[i] < x_val[j]);
3032
2
                                         int sy = (y_val[i] > y_val[j]) - (y_val[i] < y_val[j]);
3033
2
                                         ans += (double)(sx * sy);
3034
2
                                     }
3035
2
                                 }
3036                        } else {
3037                                 double mean_x = 0.0, mean_y = 0.0, cov_sum = 0.0;
3038
0
                                 if (strcmp(method, "spearman") == 0) {
3039
0
                                     // Spearman: Rank the data first, then run standard covariance
3040
0
                                     double *restrict rx = (double*)safemalloc(n * sizeof(double));
3041
0
                                     double *restrict ry = (double*)safemalloc(n * sizeof(double));
3042                                     // Uses your existing rank_data() helper from LikeR.xs
3043
0
                                     rank_data(x_val, rx, n);
3044
0
                                     rank_data(y_val, ry, n);
3045
0
                                     for (size_t i = 0; i < n; i++) {
3046
0
                                         double dx = rx[i] - mean_x;
3047                                         mean_x += dx / (i + 1);
3048
0
                                         double dy = ry[i] - mean_y;
3049                                         mean_y += dy / (i + 1);
3050                                         cov_sum += dx * (ry[i] - mean_y);
3051
1
                                     }
3052
1
                                     Safefree(rx); Safefree(ry);
3053
1
                                 } else {
3054
1
                                     // Pearson: Welford's Single-Pass Covariance Algorithm
3055
1
                                     for (size_t i = 0; i < n; i++) {
3056                                         double dx = x_val[i] - mean_x;
3057                                         mean_x += dx / (i + 1);
3058
1
                                         double dy = y_val[i] - mean_y;
3059
6
                                         mean_y += dy / (i + 1);
3060
5
                                         cov_sum += dx * (y_val[i] - mean_y);
3061
5
                                     }
3062
5
                                 }
3063
3064
5
                                 // Unbiased Sample Covariance (N - 1) for Pearson & Spearman
3065
5
                                 ans = cov_sum / (n - 1);
3066
5
                        }
3067                        Safefree(x_val); Safefree(y_val);
3068
1
                        RETVAL = newSVnv(ans);
3069                }
3070        }
3071
1
        OUTPUT:
3072
6
          RETVAL
3073
3074
5
SV* glm(...)
3075CODE:
3076{
3077        const char *restrict formula  = NULL;
3078
1
        SV *restrict data_sv = NULL;
3079
6
        const char *restrict family_str = "gaussian";
3080
5
        char f_cpy[512];
3081
0
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
3082
3083        // Dynamic Term Arrays
3084        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
3085
1
        bool *restrict is_dummy = NULL;
3086
1
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
3087        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
3088
0
        size_t n = 0, valid_n = 0, i;
3089        bool has_intercept = TRUE, converged = FALSE, boundary = FALSE;
3090        unsigned int iter = 0, max_iter = 25, final_rank = 0, df_res = 0;
3091
1
        double deviance_old = 0.0, deviance_new = 0.0, null_dev = 0.0, aic = 0.0;
3092
1
        double dispersion = 0.0, epsilon = 1e-8;
3093
3094        char **restrict row_names = NULL;
3095
0
        char **restrict valid_row_names = NULL;
3096
0
        HV **restrict row_hashes = NULL;
3097
0
        HV *restrict data_hoa = NULL;
3098
0
        SV *restrict ref = NULL;
3099
3100        double *restrict X = NULL, *restrict Y = NULL, *restrict mu = NULL, *restrict eta = NULL;
3101
1
        double *restrict W = NULL, *restrict Z = NULL, *restrict beta = NULL, *restrict beta_old = NULL;
3102        bool *restrict aliased = NULL;
3103
0
        double *restrict XtWX = NULL, *restrict XtWZ = NULL;
3104
3105        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
3106
7
        AV *restrict terms_av;
3107
7
        HE *restrict entry;
3108
3109
7
        if (items % 2 != 0) croak("Usage: glm(formula => 'am ~ wt + hp', data => \\%mtcars)");
3110
3111
7
        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
3112
7
          const char *restrict key = SvPV_nolen(ST(i_arg));
3113
7
          SV *restrict val = ST(i_arg + 1);
3114
4
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
3115
4
          else if (strEQ(key, "data"))    data_sv = val;
3116
4
          else if (strEQ(key, "family"))  family_str = SvPV_nolen(val);
3117
4
          else croak("glm: unknown argument '%s'", key);
3118
4
        }        
3119        if (!formula) croak("glm: formula is required");
3120        if (!data_sv || !SvROK(data_sv)) croak("glm: data is required and must be a reference");
3121
3122        bool is_binomial = (strcmp(family_str, "binomial") == 0);
3123        bool is_gaussian = (strcmp(family_str, "gaussian") == 0);
3124        if (!is_binomial && !is_gaussian) croak("glm: unsupported family '%s'", family_str);
3125
3126        // --- Formula Parsing & Expansion ---
3127        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
3128        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
3129        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
3130
3131
2
        src = (char*)formula; dst = f_cpy;
3132
2
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
3133        *dst = '\0';
3134
3135
0
        tilde = strchr(f_cpy, '~');
3136        if (!tilde) croak("glm: invalid formula, missing '~'");
3137        *tilde = '\0';
3138
2
        lhs = f_cpy; rhs = tilde + 1;
3139
3140        if (strstr(rhs, "-1")) has_intercept = FALSE;
3141
2
        if (has_intercept) terms[num_terms++] = savepv("Intercept");
3142
3143        chunk = strtok(rhs, "+");
3144
26
        while (chunk != NULL) {
3145
24
          if (num_terms >= term_cap - 3) {
3146
24
                   term_cap *= 2;
3147
24
                   Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
3148
24
          }
3149
24
          if (strcmp(chunk, "1") == 0 || strcmp(chunk, "-1") == 0) {
3150
24
                   chunk = strtok(NULL, "+");
3151
24
                   continue;
3152          }
3153          char *restrict star = strchr(chunk, '*');
3154          if (star) {
3155                   *star = '\0';
3156
2
                   char *restrict left = chunk; char *restrict right = star + 1;
3157
0
                   char *restrict c_l = strchr(left, '^'); if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
3158
0
                   char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
3159
3160                   terms[num_terms++] = savepv(left);
3161
2
                   terms[num_terms++] = savepv(right);
3162                   size_t inter_len = strlen(left) + strlen(right) + 2;
3163
26
                   terms[num_terms] = (char*)safemalloc(inter_len);
3164
24
                   snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
3165          } else {
3166
2
                   char *restrict c_chunk = strchr(chunk, '^');
3167
0
                   if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
3168
0
                   terms[num_terms++] = savepv(chunk);
3169          }
3170
2
          chunk = strtok(NULL, "+");
3171        }
3172
3173
2
        for (i = 0; i < num_terms; i++) {
3174
0
          bool found = FALSE;
3175
0
          for (size_t j = 0; j < num_uniq; j++) {
3176
0
                   if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; }
3177
0
          }
3178          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
3179
0
        }
3180        p = num_uniq;
3181
3182
2
        // --- Data Extraction ---
3183
2
        ref = SvRV(data_sv);
3184
2
        if (SvTYPE(ref) == SVt_PVHV) {
3185
26
                HV*restrict hv = (HV*)ref;
3186
24
                if (hv_iterinit(hv) == 0) croak("glm: Data hash is empty");
3187
24
                entry = hv_iternext(hv);
3188                if (entry) {
3189
2
                        SV*restrict val = hv_iterval(hv, entry);
3190
2
                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
3191
2
                                 data_hoa = hv;
3192
2
                                 n = av_len((AV*)SvRV(val)) + 1;
3193
3
                                 Newx(row_names, n, char*);
3194
1
                                 for(i = 0; i < n; i++) {
3195
4
                                     char buf[32]; snprintf(buf, sizeof(buf), "%lu", i+1);
3196
3
                                     row_names[i] = savepv(buf);
3197                                 }
3198                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
3199
1
                                 n = hv_iterinit(hv);
3200
1
                                 Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
3201
1
                                 i = 0;
3202
1
                                 while ((entry = hv_iternext(hv))) {
3203
16
                                     I32 len;
3204
15
                                     row_names[i] = savepv(hv_iterkey(entry, &len));
3205                                     row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
3206                                     i++;
3207
26
                                 }
3208
24
                        } else croak("glm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
3209                }
3210
2
        } else if (SvTYPE(ref) == SVt_PVAV) {
3211          AV*restrict av = (AV*)ref;
3212          n = av_len(av) + 1;
3213          Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
3214          for (i = 0; i < n; i++) {
3215
2
                   SV**restrict val = av_fetch(av, i, 0);
3216                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
3217
2
                       row_hashes[i] = (HV*)SvRV(*val);
3218                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", i + 1);
3219                       row_names[i] = savepv(buf);
3220                   } else {
3221
1
                       for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
3222
1
                       Safefree(row_names); Safefree(row_hashes);
3223
1
                       croak("glm: Array values must be HashRefs (AoH)");
3224
0
                   }
3225          }
3226        } else croak("glm: Data must be an Array or Hash reference");
3227
3228
1
        // --- Categorical Expansion ---
3229
1
        for (size_t j = 0; j < p; j++) {
3230
1
          if (p_exp + 32 >= exp_cap) {
3231                   exp_cap *= 2;
3232                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
3233
1
                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
3234          }
3235          if (strcmp(uniq_terms[j], "Intercept") == 0) {
3236                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue;
3237
1
          }
3238          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
3239
1
                   char **restrict levels = NULL; size_t num_levels = 0, levels_cap = 8;
3240
1
                   Newx(levels, levels_cap, char*);
3241
1
                   for (i = 0; i < n; i++) {
3242
1
                       char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
3243
1
                       if (str_val) {
3244                           bool found = FALSE;
3245                           for (size_t l = 0; l < num_levels; l++) {
3246
2
                               if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; }
3247
2
                           }
3248                           if (!found) {
3249
2
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
3250                               levels[num_levels++] = savepv(str_val);
3251
2
                           }
3252
2
                           Safefree(str_val);
3253
2
                       }
3254
2
                   }
3255
2
                   if (num_levels > 0) {
3256
2
                       for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
3257
2
                           for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
3258
2
                               if (strcmp(levels[l1], levels[l2]) > 0) {
3259                                   char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
3260                               }
3261                           }
3262                       }
3263
12
                       for (size_t l = 1; l < num_levels; l++) {
3264
12
                           if (p_exp >= exp_cap) {
3265
12
                               exp_cap *= 2;
3266                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
3267
10038
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
3268
10028
                           }
3269
10031
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
3270
4
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
3271
4
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
3272
216
                           is_dummy[p_exp] = TRUE; dummy_base[p_exp] = savepv(uniq_terms[j]); dummy_level[p_exp] = savepv(levels[l]);
3273
213
                           p_exp++;
3274
213
                       }
3275
212
                       for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
3276
212
                       Safefree(levels);
3277
3
                   } else {
3278
3
                       Safefree(levels); exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
3279                   }
3280
212
          } else {
3281                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
3282
1
          }
3283        }
3284        p = p_exp;
3285
3286
10023
        Newx(X, n * p, double); Newx(Y, n, double);
3287
10023
        Newx(valid_row_names, n, char*);
3288
3289
24
        // --- Listwise Deletion ---
3290        for (size_t i = 0; i < n; i++) {
3291
10023
                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
3292                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
3293
3294                bool row_ok = TRUE;
3295                double *restrict row_x = (double*)safemalloc(p * sizeof(double));
3296
10
                for (size_t j = 0; j < p; j++) {
3297
9
                        if (strcmp(exp_terms[j], "Intercept") == 0) {
3298                                 row_x[j] = 1.0;
3299                        } else if (is_dummy[j]) {
3300                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
3301                                 if (str_val) {
3302                                     row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
3303                                     Safefree(str_val);
3304
13
                                 } else { row_ok = FALSE; break; }
3305
13
                        } else {
3306
13
                                 row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
3307                                 if (isnan(row_x[j])) { row_ok = FALSE; break; }
3308
10039
                        }
3309
10028
                }
3310
10032
                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
3311
5
                Y[valid_n] = y_val;
3312
5
                for (size_t j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
3313
317
                valid_row_names[valid_n] = row_names[i];
3314
313
                valid_n++;
3315
313
                Safefree(row_x);
3316
312
        }
3317
312
        Safefree(row_names);
3318
15
        if (valid_n <= p) {
3319
15
          Safefree(X); Safefree(Y); Safefree(valid_row_names); if (row_hashes) Safefree(row_hashes);
3320          croak("glm: 0 degrees of freedom (too many NAs or parameters > observations)");
3321
312
        }
3322        // --- R glm.fit IRLS Implementation ---
3323
1
        mu = (double*)safemalloc(valid_n * sizeof(double)); eta = (double*)safemalloc(valid_n * sizeof(double));
3324        W = (double*)safemalloc(valid_n * sizeof(double)); Z = (double*)safemalloc(valid_n * sizeof(double));
3325        beta = (double*)safemalloc(p * sizeof(double)); beta_old = (double*)safemalloc(p * sizeof(double));
3326
10023
        aliased = (bool*)safemalloc(p * sizeof(bool));
3327
10022
        XtWX = (double*)safemalloc(p * p * sizeof(double)); XtWZ = (double*)safemalloc(p * sizeof(double));
3328
10022
        for (i = 0; i < p; i++) { beta[i] = 0.0; beta_old[i] = 0.0; }
3329
27
        // Initialize (mustart / etastart equivalent)
3330
27
        double sum_y = 0.0;
3331        for (i = 0; i < valid_n; i++) sum_y += Y[i];
3332
10022
        double mean_y = sum_y / valid_n;
3333        for (i = 0; i < valid_n; i++) {
3334
1
          if (is_binomial) {
3335                   if (Y[i] < 0.0 || Y[i] > 1.0) croak("glm: binomial family requires response between 0 and 1");
3336                   mu[i] = (Y[i] + 0.5) / 2.0;
3337
11
                   eta[i] = log(mu[i] / (1.0 - mu[i]));
3338
10
                   double dev = 0.0;
3339                   if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
3340                   else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
3341                   else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
3342                   deviance_old += dev;
3343          } else {
3344                   mu[i] = mean_y; // R gaussian init
3345
4
                   eta[i] = mu[i];
3346
4
          }
3347        }
3348        // IRLS Loop
3349
4
        for (iter = 1; iter <= max_iter; iter++) {
3350                for (i = 0; i < valid_n; i++) {
3351
4
                        if (is_binomial) {
3352                                 double varmu = mu[i] * (1.0 - mu[i]);
3353
4
                                 double mu_eta = varmu; // Link derivative for logit
3354
0
                                 if (varmu < 1e-10) varmu = 1e-10;
3355                                 Z[i] = eta[i] + (Y[i] - mu[i]) / mu_eta;
3356                                 W[i] = (mu_eta * mu_eta) / varmu;
3357
14
                        } else {
3358                                 W[i] = 1.0;
3359
10
                                 Z[i] = Y[i];
3360
6
                        }
3361
6
                }
3362
2
                // Formulate XtWX and XtWZ
3363
2
                for (i = 0; i < p; i++) { XtWZ[i] = 0.0; for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
3364
2
                for (size_t k = 0; k < valid_n; k++) {
3365
2
                        double w = W[k], z = Z[k];
3366
4
                        for (i = 0; i < p; i++) {
3367
2
                                 XtWZ[i] += X[k * p + i] * w * z;
3368
2
                                 double xw = X[k * p + i] * w;
3369
2
                                 for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
3370
2
                        }
3371
2
                }
3372
2
                final_rank = sweep_matrix_ols(XtWX, p, aliased);
3373
2
                for (i = 0; i < p; i++) {
3374
2
                        if (aliased[i]) { beta[i] = NAN; } else {
3375
2
                                 double sum = 0.0;
3376                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) sum += XtWX[i * p + j] * XtWZ[j];
3377                                 beta[i] = sum;
3378                        }
3379                }
3380
4
                // Calculate updated ETA, MU, and Deviance (with Step-Halving)
3381
2
                boundary = FALSE;
3382
2
                for (unsigned short int half = 0; half < 10; half++) {
3383
2
                        deviance_new = 0.0;
3384
1
                        for (i = 0; i < valid_n; i++) {
3385
1
                                 double linear_pred = 0.0;
3386
1
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) linear_pred += X[i * p + j] * beta[j];
3387
1
                                 eta[i] = linear_pred;
3388
1
                                 if (is_binomial) {
3389                                     mu[i] = 1.0 / (1.0 + exp(-eta[i]));
3390
0
                                     // Boundary enforcement
3391                                     if (mu[i] < 10 * DBL_EPSILON) mu[i] = 10 * DBL_EPSILON;
3392
4
                                     if (mu[i] > 1.0 - 10 * DBL_EPSILON) mu[i] = 1.0 - 10 * DBL_EPSILON;
3393
3394
4
                                     double dev = 0.0;
3395
0
                                     if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
3396                                     else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
3397                                     else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
3398
4
                                     deviance_new += dev;
3399
4
                                 } else {
3400
4
                                     mu[i] = eta[i];
3401
4
                                     double res = Y[i] - mu[i];
3402                                     deviance_new += res * res;
3403
4
                                 }
3404
20020
                        }
3405                        // Step halving divergence check
3406
20016
                        if (!is_binomial || deviance_new <= deviance_old + 1e-7 || !isfinite(deviance_new)) {
3407
0
                                 continue;
3408                        }
3409
3410                        boundary = TRUE;
3411
20016
                        for (size_t j = 0; j < p; j++) beta[j] = (beta[j] + beta_old[j]) / 2.0;
3412                }
3413
4
                // Convergence Check
3414                if (fabs(deviance_new - deviance_old) / (0.1 + fabs(deviance_new)) < epsilon) {
3415                        converged = TRUE; break;
3416                }
3417                deviance_old = deviance_new;
3418                for (size_t j = 0; j < p; j++) beta_old[j] = beta[j];
3419        }
3420        // Final accurate calculation of W for standard errors
3421        for (i = 0; i < p; i++) { for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
3422
12
        for (size_t k = 0; k < valid_n; k++) {
3423
12
          double w = is_binomial ? (mu[k] * (1.0 - mu[k])) : 1.0;
3424
1
          if (w < 1e-10) w = 1e-10;
3425          for (i = 0; i < p; i++) {
3426
11
                   double xw = X[k * p + i] * w;
3427
11
                   for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
3428          }
3429
11
        }
3430        final_rank = sweep_matrix_ols(XtWX, p, aliased);
3431
42
        // --- Null Deviance Calculation ---
3432
31
        double wtdmu = mean_y; // Since weights are 1.0 initially
3433
31
        for (i = 0; i < valid_n; i++) {
3434          if (is_binomial) {
3435
31
                   if (Y[i] == 0.0)      null_dev += -2.0 * log(1.0 - wtdmu);
3436
20
                   else if (Y[i] == 1.0) null_dev += -2.0 * log(wtdmu);
3437
10
                   else null_dev += 2.0 * (Y[i] * log(Y[i] / wtdmu) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - wtdmu)));
3438
0
          } else {
3439                   double diff = Y[i] - wtdmu;
3440                   null_dev += diff * diff;
3441          }
3442
11
        }
3443
9
        // --- AIC Calculation ---
3444        if (is_gaussian) {
3445
7
          double n_f = (double)valid_n;
3446
7
          aic = n_f * (log(2.0 * M_PI) + 1.0 + log(deviance_new / n_f)) + 2.0 * (final_rank + 1.0);
3447
7
        } else if (is_binomial) {
3448
20506
          aic = deviance_new + 2.0 * final_rank;
3449
20499
        }
3450        // --- Return Structures ---
3451        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
3452        df_res = valid_n - final_rank;
3453
7
        dispersion = is_binomial ? 1.0 : ((df_res > 0) ? (deviance_new / df_res) : NAN);
3454        for (size_t i = 0; i < valid_n; i++) {
3455                double res = Y[i] - mu[i];
3456                if (is_binomial) {
3457                        // Deviance residuals for binomial
3458                        double d_res = 0.0;
3459                        if (Y[i] == 0.0)      d_res = sqrt(-2.0 * log(1.0 - mu[i]));
3460                        else if (Y[i] == 1.0) d_res = sqrt(-2.0 * log(mu[i]));
3461                        else d_res = sqrt(2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))));
3462                        res = (Y[i] > mu[i]) ? d_res : -d_res;
3463
9
                }
3464
2
                hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(mu[i]), 0);
3465                hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0);
3466
7
                Safefree(valid_row_names[i]);
3467
7
        }
3468
7
        Safefree(valid_row_names);
3469
3470        summary_hv = newHV(); terms_av = newAV();
3471        for (size_t j = 0; j < p; j++) {
3472
6
                hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
3473
6
                av_push(terms_av, newSVpv(exp_terms[j], 0));
3474
3475                HV *restrict row_hv = newHV();
3476
2026
                if (aliased[j]) {
3477
2021
                        hv_store(row_hv, "Estimate",   8, newSVpv("NaN", 0), 0);
3478
2021
                        hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
3479
2021
                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVpv("NaN", 0), 0);
3480
2020
                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVpv("NaN", 0), 0);
3481
2020
                } else {
3482
2020
                        double se = sqrt(dispersion * XtWX[j * p + j]);
3483                        double val_stat = beta[j] / se;
3484                        double p_val = is_binomial ? 2.0 * (1.0 - approx_pnorm(fabs(val_stat))) : get_t_pvalue(val_stat, df_res, "two.sided");
3485
3486
0
                        hv_store(row_hv, "Estimate",   8, newSVnv(beta[j]), 0);
3487
0
                        hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0);
3488                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVnv(val_stat), 0);
3489                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVnv(p_val), 0);
3490
5
                }
3491                hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
3492
5
        }
3493
3494
0
        hv_store(res_hv, "aic",            3, newSVnv(aic), 0);
3495
5
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv), 0);
3496        hv_store(res_hv, "converged",      9, newSVuv(converged ? 1 : 0), 0);
3497
5
        hv_store(res_hv, "boundary",       8, newSVuv(boundary ? 1 : 0), 0);
3498        hv_store(res_hv, "deviance",       8, newSVnv(deviance_new), 0);
3499
5
        hv_store(res_hv, "deviance.resid", 14, newRV_noinc((SV*)resid_hv), 0);
3500
5
        hv_store(res_hv, "df.null",        7, newSVuv(valid_n - has_intercept), 0);
3501
5
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res), 0);
3502        hv_store(res_hv, "family",         6, newSVpv(family_str, 0), 0);
3503        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
3504        hv_store(res_hv, "iter",           4, newSVuv(iter > max_iter ? max_iter : iter), 0);
3505
5
        hv_store(res_hv, "null.deviance", 13, newSVnv(null_dev), 0);
3506
0
        hv_store(res_hv, "rank",           4, newSVuv(final_rank), 0);
3507        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv), 0);
3508        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av), 0);
3509
3510        // --- Cleanup ---
3511        for (i = 0; i < num_terms; i++) Safefree(terms[i]);
3512        Safefree(terms);
3513
5
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]);
3514
5
        Safefree(uniq_terms);
3515
5
        for (size_t j = 0; j < p_exp; j++) {
3516
5
                Safefree(exp_terms[j]);
3517                if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
3518        }
3519
5
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
3520
3521
23
        Safefree(mu); Safefree(eta); Safefree(Z); Safefree(W);
3522        Safefree(beta); Safefree(beta_old); Safefree(aliased);
3523        Safefree(XtWX); Safefree(XtWZ); Safefree(X); Safefree(Y);
3524        if (row_hashes) Safefree(row_hashes);
3525
3526        RETVAL = newRV_noinc((SV*)res_hv);
3527}
3528
5
OUTPUT:
3529
5
    RETVAL
3530
3531
5
SV* cor_test(...)
3532
5
CODE:
3533
28
{
3534
23
        if (items < 2 || items % 2 != 0)
3535
23
                croak("Usage: cor_test(\\@x, \\@y, method => 'pearson', ...)");
3536
3537
18
        SV *restrict x_ref = ST(0), *restrict y_ref = ST(1);
3538
3539        const char *restrict alternative = "two.sided";
3540        const char *restrict method = "pearson";
3541
5
        SV *restrict exact_sv = NULL;
3542
5
        double conf_level = 0.95;
3543
5
        bool continuity = 0;
3544
3545        /* Parse named arguments from the flat stack starting at index 2 */
3546        for (unsigned short int i = 2; i < items; i += 2) {
3547
5
          const char *restrict key = SvPV_nolen(ST(i));
3548
5
          SV *restrict val = ST(i + 1);
3549
3550
5
          if      (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
3551          else if (strEQ(key, "method"))      method = SvPV_nolen(val);
3552          else if (strEQ(key, "exact"))       exact_sv = val;
3553          else if (strEQ(key, "conf.level") || strEQ(key, "conf_level")) conf_level = SvNV(val);
3554          else if (strEQ(key, "continuity"))  continuity = SvTRUE(val);
3555          else croak("cor_test: unknown argument '%s'", key);
3556        }
3557
3558
4
        AV *restrict x_av, *restrict y_av;
3559
4
        double *restrict x, *restrict y;
3560
4
        double estimate = 0, p_value = 0, statistic = 0, df = 0, ci_lower = 0, ci_upper = 0;
3561
3562        bool is_pearson  = (strcmp(method, "pearson") == 0);
3563
4
        bool is_kendall  = (strcmp(method, "kendall") == 0);
3564
3
        bool is_spearman = (strcmp(method, "spearman") == 0);
3565
3
        HV *restrict rhv;
3566
3567        if (!SvOK(x_ref) || !SvROK(x_ref) || SvTYPE(SvRV(x_ref)) != SVt_PVAV ||
3568            !SvOK(y_ref) || !SvROK(y_ref) || SvTYPE(SvRV(y_ref)) != SVt_PVAV) {
3569
4
          croak("cor_test: x and y must be array references");
3570
0
        }
3571
3572
9
        x_av = (AV*)SvRV(x_ref);
3573
5
        y_av = (AV*)SvRV(y_ref);
3574
3575        size_t n_raw = av_len(x_av) + 1;
3576
5
        if (n_raw != av_len(y_av) + 1) croak("incompatible dimensions");
3577
3578
0
        x = safemalloc(n_raw * sizeof(double));
3579        y = safemalloc(n_raw * sizeof(double));
3580
3581
0
        size_t n = 0; /* Final count of pairwise complete observations */
3582
4
        for (size_t i = 0; i < n_raw; i++) {
3583
4
          SV **restrict x_val = av_fetch(x_av, i, 0);
3584
4
          SV **restrict y_val = av_fetch(y_av, i, 0);
3585
3586          double xv = (x_val && SvOK(*x_val) && looks_like_number(*x_val)) ? SvNV(*x_val) : NAN;
3587          double yv = (y_val && SvOK(*y_val) && looks_like_number(*y_val)) ? SvNV(*y_val) : NAN;
3588
3589
4
          /* Pairwise complete observations (skips NAs seamlessly like R) */
3590
208
          if (!isnan(xv) && !isnan(yv)) {
3591
204
              x[n] = xv;
3592
204
              y[n] = yv;
3593
204
              n++;
3594          }
3595        }
3596
3597
0
        if (n < 3) {
3598
0
          Safefree(x);
3599          Safefree(y);
3600          croak("not enough finite observations");
3601
4
        }
3602
3603
4
        if (is_pearson) {
3604
4
          // Welford's Method for Pearson Correlation
3605          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3606          for (size_t i = 0; i < n; i++) {
3607
8
                   double dx = x[i] - mean_x;
3608
4
                   mean_x += dx / (i + 1);
3609
4
                   double dy = y[i] - mean_y;
3610
4
                   mean_y += dy / (i + 1);
3611
13
                   M2_x += dx * (x[i] - mean_x);
3612
9
                   M2_y += dy * (y[i] - mean_y);
3613
9
                   cov  += dx * (y[i] - mean_y);
3614
9
          }
3615
0
          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3616
0
          df = n - 2;
3617          statistic = estimate * sqrt(df / (1.0 - estimate * estimate));
3618
3619          // Confidence interval using Fisher's Z transform
3620
0
          double z = 0.5 * log((1.0 + estimate) / (1.0 - estimate));
3621
0
          double se = 1.0 / sqrt(n - 3);
3622          double alpha = 1.0 - conf_level;
3623          double q = inverse_normal_cdf(1.0 - alpha/2.0);
3624          ci_lower = tanh(z - q * se);
3625
4
          ci_upper = tanh(z + q * se);
3626
3627
13
          // HIGH-PRECISION P-VALUE USING INCOMPLETE BETA
3628
9
          p_value = get_t_pvalue(statistic, df, alternative);
3629        } else if (is_kendall) {
3630
9
          int c = 0, d = 0, tie_x = 0, tie_y = 0;
3631
1
          for (size_t i = 0; i < n - 1; i++) {
3632
8
                   for (size_t j = i + 1; j < n; j++) {
3633
1
                       double sign_x = (x[i] - x[j] > 0) - (x[i] - x[j] < 0);
3634
7
                       double sign_y = (y[i] - y[j] > 0) - (y[i] - y[j] < 0);
3635
3636                       if (sign_x == 0 && sign_y == 0) { /* Joint tie, ignore */ }
3637                       else if (sign_x == 0) tie_x++;
3638
6
                       else if (sign_y == 0) tie_y++;
3639
6
                       else if (sign_x * sign_y > 0) c++;
3640
6
                       else d++;
3641
6
                   }
3642          }
3643          double denom = sqrt((double)(c + d + tie_x) * (double)(c + d + tie_y));
3644          estimate = (denom == 0.0) ? (0.0/0.0) : (double)(c - d) / denom;
3645
3646
9
          bool has_ties = (tie_x > 0 || tie_y > 0);
3647          bool do_exact;
3648
3649
9
          /* Mirror R: exact defaults to TRUE if N < 50 and NO ties */
3650          if (!exact_sv || !SvOK(exact_sv)) {
3651
0
                   do_exact = (n < 50) && !has_ties;
3652          } else {
3653                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3654
9
          }
3655          // If forced exact but ties exist, R overrides and falls back to approximation anyway
3656          if (do_exact && has_ties) do_exact = 0;
3657
3658
4
          if (do_exact) {
3659                   double S_stat = c - d;
3660
4
                   statistic = c;
3661                   p_value = kendall_exact_pvalue(n, S_stat, alternative);
3662          } else {
3663                   // Normal approximation for large N or ties
3664                   double var_S = n * (n - 1) * (2.0 * n + 5.0) / 18.0;
3665                   double S = c - d;
3666                   if (continuity) S -= (S > 0 ? 1 : -1);
3667                   statistic = S / sqrt(var_S);
3668
3669
41
                   if (strcmp(alternative, "two.sided") == 0) {
3670
41
                       p_value = 2.0 * (1.0 - approx_pnorm(fabs(statistic)));
3671                   } else if (strcmp(alternative, "less") == 0) {
3672
93
                       p_value = approx_pnorm(statistic);
3673
54
                   } else {
3674
91
                       p_value = 1.0 - approx_pnorm(statistic);
3675
38
                   }
3676
38
          }
3677
20307
        } else if (is_spearman) {
3678
20270
          double *restrict rank_x = safemalloc(n * sizeof(double));
3679
20270
          double *restrict rank_y = safemalloc(n * sizeof(double));
3680
20269
          compute_ranks(x, rank_x, n);
3681
20269
          compute_ranks(y, rank_y, n);
3682
3683
1
          // Spearman rho = Pearson r of the ranks (Welford's Method)
3684          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3685          for (size_t i = 0; i < n; i++) {
3686
16
                   double dx = rank_x[i] - mean_x;
3687
15
                   mean_x += dx / (i + 1);
3688
15
                   double dy = rank_y[i] - mean_y;
3689                   mean_y += dy / (i + 1);
3690
1
                   M2_x += dx * (rank_x[i] - mean_x);
3691                   M2_y += dy * (rank_y[i] - mean_y);
3692                   cov  += dx * (rank_y[i] - mean_y);
3693
39
          }
3694
38
          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3695
3696          // S = sum of squared rank differences (R's reported statistic)
3697          double S_stat = 0.0;
3698          for (size_t i = 0; i < n; i++) {
3699                   double diff = rank_x[i] - rank_y[i];
3700                   S_stat += diff * diff;
3701
5
          }
3702
3703          // Ties produce fractional (averaged) ranks — detect them
3704
19
          bool has_ties = 0, do_exact;
3705
16
          for (size_t i = 0; i < n; i++) {
3706
17
                   if (rank_x[i] != floor(rank_x[i]) || rank_y[i] != floor(rank_y[i])) {
3707
2
                       has_ties = 1;
3708
2
                       break;
3709
11
                   }
3710
10
          }
3711
10
          if (!exact_sv || !SvOK(exact_sv)) {
3712
9
                   do_exact = (n < 10) && !has_ties;
3713
9
          } else {
3714                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3715
1
          }
3716
3717          if (do_exact) {
3718
14
                   statistic = S_stat;
3719
13
                   p_value   = spearman_exact_pvalue(S_stat, n, alternative);
3720
13
          } else {
3721                   double r = estimate;
3722
1
                   if (continuity)
3723                       r *= (1.0 - 1.0 / (2.0 * (n - 1)));
3724                   statistic = r * sqrt((n - 2.0) / (1.0 - r * r));
3725
3
                   p_value = get_t_pvalue(statistic, (double)(n - 2), alternative);
3726
3
          }
3727          Safefree(rank_x); Safefree(rank_y);
3728        } else {
3729          Safefree(x); Safefree(y);
3730          croak("Unknown method");
3731        }
3732        Safefree(x); Safefree(y);
3733
7
        rhv = newHV();
3734
7
        hv_stores(rhv, "estimate", newSVnv(estimate));
3735        hv_stores(rhv, "p.value", newSVnv(p_value));
3736        hv_stores(rhv, "statistic", newSVnv(statistic));
3737
26
        hv_stores(rhv, "method", newSVpv(method, 0));
3738
21
        hv_stores(rhv, "alternative", newSVpv(alternative, 0));
3739
22
        if (is_pearson) {
3740
2
          hv_stores(rhv, "parameter", newSVnv(df));
3741
2
          AV *restrict ci_av = newAV();
3742
10002
          av_push(ci_av, newSVnv(ci_lower));
3743
10001
          av_push(ci_av, newSVnv(ci_upper));
3744
10001
          hv_stores(rhv, "conf.int", newRV_noinc((SV*)ci_av));
3745
10000
        }
3746
3747
10000
        RETVAL = newRV_noinc((SV*)rhv);
3748
10000
}
3749
10000
OUTPUT:
3750    RETVAL
3751
3752void shapiro_test(data)
3753        SV *data
3754
19
PREINIT:
3755
18
        AV *restrict av;
3756
18
        HV *restrict ret_hash;
3757
18
        size_t n_raw, n = 0;
3758
18
        double *restrict x, w = 0.0, p_val = 0.0, mean = 0.0, ssq = 0.0;
3759
18
PPCODE:
3760        if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVAV) {
3761
1
          croak("Expected an array reference");
3762        }
3763
3764
5
        av = (AV *)SvRV(data);
3765
4
        n_raw = av_len(av) + 1;
3766
3767        Newx(x, n_raw, double);
3768
3769        // Extract variables and calculate mean (skipping undefined/NaN values)
3770        for (size_t i = 0; i < n_raw; i++) {
3771          SV **restrict elem = av_fetch(av, i, 0);
3772          if (elem && SvOK(*elem)) {
3773
8
                   double val = SvNV(*elem);
3774
8
                   if (!isnan(val)) {
3775                       x[n] = val;
3776                       mean += val;
3777
21
                       n++;
3778
15
                   }
3779
18
          }
3780
4
        }
3781
3782
10015
        if (n < 3 || n > 5000) {
3783
10012
          Safefree(x);
3784
10012
          croak("Sample size must be between 3 and 5000 (R's limit)");
3785
10011
        }
3786
3787
10011
        mean /= n;
3788
10011
        // Calculate Sum of Squares */
3789
10011
        for (size_t i = 0; i < n; i++) {
3790          ssq += (x[i] - mean) * (x[i] - mean);
3791
1
        }
3792        if (ssq == 0.0) {
3793          Safefree(x);
3794
11
          croak("Data is perfectly constant; cannot compute Shapiro-Wilk test");
3795
10
        }
3796
10
        qsort(x, n, sizeof(double), compare_doubles);
3797
3798
10
        // --- Core AS R94 Algorithm: Weights and Statistic W ---
3799
10
        if (n == 3) {
3800          double a_val = 0.7071067811865475; /* sqrt(1/2) */
3801
1
          double b_val = a_val * (x[2] - x[0]);
3802          w = (b_val * b_val) / ssq;
3803          if (w < 0.75) w = 0.75;
3804
6
          // Exact P-value for n=3
3805
5
          p_val = 1.90985931710274 * (asin(sqrt(w)) - 1.04719755119660);
3806        } else {
3807          double *restrict m, *restrict a;
3808          double sum_m2 = 0.0, b_val = 0.0;
3809          Newx(m, n, double);
3810          Newx(a, n, double);
3811          for (size_t i = 0; i < n; i++) {
3812
47
                   m[i] = inverse_normal_cdf((i + 1.0 - 0.375) / (n + 0.25));
3813
47
                   sum_m2 += m[i] * m[i];
3814
47
          }
3815
47
          double u = 1.0 / sqrt((double)n);
3816
47
          double a_n = -2.706056*pow(u,5) + 4.434685*pow(u,4) - 2.071190*pow(u,3) - 0.147981*pow(u,2) + 0.221157*u + m[n-1]/sqrt(sum_m2);
3817          a[n-1] = a_n;
3818
47
          a[0]   = -a_n;
3819          if (n == 4 || n == 5) {
3820                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1]) / (1.0 - 2.0 * a_n*a_n);
3821
47
                   for (unsigned int i = 1; i < n-1; i++) {
3822
21
                       a[i] = m[i] / sqrt(eps);
3823
21
                   }
3824          } else {
3825                   double a_n1 = -3.582633*pow(u,5) + 5.682633*pow(u,4) - 1.752461*pow(u,3) - 0.293762*pow(u,2) + 0.042981*u + m[n-2]/sqrt(sum_m2);
3826                   a[n-2] = a_n1;
3827
47
                   a[1]   = -a_n1;
3828
4
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1] - 2.0 * m[n-2]*m[n-2]) / (1.0 - 2.0 * a_n*a_n - 2.0 * a_n1*a_n1);
3829
4
                   for (unsigned int i = 2; i < n-2; i++) {
3830                       a[i] = m[i] / sqrt(eps);
3831                   }
3832          }
3833
47
          for (size_t i = 0; i < n; i++) {
3834
0
                   b_val += a[i] * x[i];
3835          }
3836          w = (b_val * b_val) / ssq;
3837        // --- AS R94 P-Value Calculation: High Precision Refinement ---
3838
123
          /* NOTE: p_val is declared in PREINIT above;
3839
76
                * do NOT shadow it with a local 'double p_val' here or the result will never reach the caller.
3840
76
                */
3841          double y = log(1.0 - w);
3842
76
          double z;
3843
51
          if (n <= 11) {
3844
46
                   // Royston's branch for 4 <= n <= 11 (AS R94, small-sample path).
3845
11
                   // gamma is the upper bound on y = log(1-W);
3846
7
                   // if y reaches gamma the p-value is essentially zero
3847
4
                   double nn = (double)n;
3848
2
                   double gamma = 0.459 * nn - 2.273;
3849
0
                   if (y >= gamma) {
3850                       p_val = 1e-19;
3851                   } else {
3852                       // Horner-form polynomials in n for mu and log(sigma)
3853
47
                       double mu     = 0.544  + nn * (-0.39978  + nn * ( 0.025054  - nn * 0.0006714));
3854
1
                       double sig_val= 1.3822 + nn * (-0.77857  + nn * ( 0.062767  - nn * 0.0020322));
3855
46
                       double sigma  = exp(sig_val);
3856
46
                       z = (-log(gamma - y) - mu) / sigma;
3857
46
                       /* Upper-tail probability P(Z > z): small W → large z → small p-value.
3858
46
                       */
3859
46
                       p_val = 0.5 * erfc(z * M_SQRT1_2);
3860
8
                   }
3861          } else {
3862
46
                   // Royston's branch for n >= 12 (AS R94, large-sample path)
3863
1
                   double ln_n   = log((double)n);
3864                   // Horner-form polynomials in log(n) for mu and log(sigma). */
3865
45
                   double mu     = -1.5861 + ln_n * (-0.31082 + ln_n * (-0.083751 + ln_n * 0.0038915));
3866
45
                   double sig_val= -0.4803 + ln_n * (-0.082676 + ln_n * 0.0030302);
3867
411
                   double sigma  = exp(sig_val);
3868
366
                   z = (y - mu) / sigma;
3869
366
                   p_val = 0.5 * erfc(z * M_SQRT1_2);
3870
366
          }
3871
366
          // Clamp the p-value
3872
366
          if (p_val > 1.0) p_val = 1.0;
3873          if (p_val < 0.0) p_val = 0.0;
3874
3875
45
          Safefree(m); m = NULL;  Safefree(a); a = NULL;
3876        }
3877
51
        Safefree(x); x = NULL;
3878
9
        ret_hash = newHV();
3879
8
        hv_stores(ret_hash, "statistic", newSVnv(w));
3880
8
        hv_stores(ret_hash, "W",         newSVnv(w));
3881
7
        hv_stores(ret_hash, "p_value",   newSVnv(p_val));
3882
104
        hv_stores(ret_hash, "p.value",   newSVnv(p_val));
3883
97
        EXTEND(SP, 1);
3884
97
        PUSHs(sv_2mortal(newRV_noinc((SV *)ret_hash)));
3885
3886
97
double min(...)
3887
97
        PROTOTYPE: @
3888        INIT:
3889
7
                double min_val = 0.0;
3890
7
                size_t count = 0;
3891
2
                bool first = TRUE;
3892
14
        CODE:
3893
12
                for (unsigned short int i = 0; i < items; i++) {
3894
12
                        SV* restrict arg = ST(i);
3895
12
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3896
12
                                AV* restrict av = (AV*)SvRV(arg);
3897
12
                                size_t len = av_len(av) + 1;
3898
12
                                for (size_t j = 0; j < len; j++) {
3899
12
                                     SV** restrict tv = av_fetch(av, j, 0);
3900
12
                                     if (tv && SvOK(*tv)) {
3901                                         double val = SvNV(*tv);
3902
2
                                         if (first || val < min_val) {
3903
2
                                             min_val = val;
3904
2
                                             first = FALSE;
3905
2
                                         }
3906
2
                                         count++;
3907
2
                                     } else {
3908
2
                                         croak("min: undefined value at array ref index %zu (argument %d)", j, (int)i);
3909
5
                                     }
3910
2
                                 }
3911
2
                        } else if (SvOK(arg)) {
3912
2
                                 double val = SvNV(arg);
3913
2
                                 if (first || val < min_val) {
3914
2
                                     min_val = val;
3915
2
                                     first = FALSE;
3916
2
                                 }
3917
2
                                 count++;
3918                        } else {
3919
3
                                 croak("min: undefined value at argument index %d", (int)i);
3920
3
                        }
3921
3
                }
3922
3
                if (count == 0) croak("min needs >= 1 numeric element");
3923
3
                RETVAL = min_val;
3924
3
        OUTPUT:
3925
3
          RETVAL
3926
3927
3
double max(...)
3928
3
        PROTOTYPE: @
3929        INIT:
3930                double max_val = 0.0;
3931
35
                size_t count = 0;
3932
35
                bool first = TRUE;
3933
35
        CODE:
3934
35
                for (size_t i = 0; i < items; i++) {
3935
35
                   SV* restrict arg = ST(i);
3936                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3937
42
                       AV* restrict av = (AV*)SvRV(arg);
3938
42
                       size_t len = av_len(av) + 1;
3939
42
                       for (size_t j = 0; j < len; j++) {
3940
1
                           SV** restrict tv = av_fetch(av, j, 0);
3941
1
                           if (tv && SvOK(*tv)) {
3942
1
                               double val = SvNV(*tv);
3943
41
                               if (first || val > max_val) {
3944
1
                                   max_val = val;
3945
1
                                   first = FALSE;
3946
1
                               }
3947                               count++;
3948
40
                           } else {
3949
40
                               croak("max: undefined value at array ref index %zu (argument %zu)", j, i);
3950
40
                           }
3951                       }
3952
42
                   } else if (SvOK(arg)) {
3953
42
                       double val = SvNV(arg);
3954
42
                       if (first || val > max_val) {
3955
42
                           max_val = val;
3956
42
                           first = FALSE;
3957
42
                       }
3958
42
                       count++;
3959
42
                   } else {
3960                       croak("max: undefined value at argument index %zu", i);
3961                   }
3962          }
3963          if (count == 0) croak("max needs >= 1 numeric element");
3964          RETVAL = max_val;
3965        OUTPUT:
3966
15
                RETVAL
3967
3968SV* runif(...)
3969
14
CODE:
3970
14
{
3971        size_t n = 0;
3972
14
        double min = 0.0, max = 1.0;
3973
3974        // Flags to track what has been assigned
3975        bool n_set = 0, min_set = 0, max_set = 0;
3976
3977
13
        unsigned int i = 0;
3978
3979        if (items == 0) {
3980
13
          croak("Usage: runif(n, [min=0], [max=1]) or runif(n => $n, ...)");
3981
13
        }
3982
3983        while (i < items) {
3984                // 1. Check if the current argument is a string key for a named parameter
3985                if (i + 1 < items && SvPOK(ST(i))) {
3986
13
                        char *restrict key = SvPV_nolen(ST(i));
3987
13
                        if (strEQ(key, "n")) {
3988                                n = (size_t)SvUV(ST(i+1));
3989
369
                                n_set = 1;
3990
356
                                i += 2;
3991
356
                                continue;
3992
356
                        } else if (strEQ(key, "min")) {
3993                                min = SvNV(ST(i+1));
3994                                min_set = 1;
3995
13
                                i += 2;
3996                                continue;
3997
13
                        } else if (strEQ(key, "max")) {
3998
53
                                max = SvNV(ST(i+1));
3999
51
                                max_set = 1;
4000
51
                                i += 2;
4001                                continue;
4002
11
                        }
4003
2
                }
4004
4005
51
                // 2. Fallback to positional parsing if it's not a recognized key
4006
51
                if (!n_set) {
4007
51
                        n = (size_t)SvUV(ST(i));
4008                        n_set = 1;
4009
9
                } else if (!min_set) {
4010
2
                        min = SvNV(ST(i));
4011
53
                        min_set = 1;
4012
51
                } else if (!max_set) {
4013
51
                        max = SvNV(ST(i));
4014
51
                        max_set = 1;
4015                } else {
4016
7
                        croak("Too many arguments or unrecognized parameter passed to runif()");
4017
2
                }
4018
53
                i++;
4019
51
        }
4020
51
        if (!n_set) {
4021
51
                croak("runif() requires at least the 'n' parameter");
4022        }
4023
5
        // Ensure PRNG is seeded
4024
2
        AUTO_SEED_PRNG();
4025
53
        AV *restrict results = newAV();
4026
2
        if (n > 0) {
4027
53
                av_extend(results, n - 1);
4028
51
        }
4029
51
        const double range = max - min;
4030
51
        for (size_t j = 0; j < n; j++) {
4031                double r;
4032
3
                if (max < min) {
4033                        r = NAN; // R behavior for inverted ranges
4034
2
                } else {
4035
2
                        r = min + range * Drand01();
4036                }
4037
2
                av_push(results, newSVnv(r));
4038
51
        }
4039
49
        RETVAL = newRV_noinc((SV*)results);
4040
49
}
4041
0
OUTPUT:
4042    RETVAL
4043
4044SV* rbinom(...)
4045
53
        CODE:
4046
51
        {
4047
51
          // Auto-seed the PRNG if the Perl script hasn't done so yet
4048          AUTO_SEED_PRNG();
4049
50
          if (items % 2 != 0)
4050
48
                   croak("Usage: rbinom(n => 10, size => 100, prob => 0.5)");
4051
48
          //Parse named arguments
4052          size_t n = 0, size = 0;
4053
48
          double prob = 0.5;
4054
4055
1128
          bool size_set = FALSE, prob_set = FALSE;
4056
4057
266
          for (unsigned short i = 0; i < items; i += 2) {
4058                   const char* restrict key = SvPV_nolen(ST(i));
4059                   SV* restrict val = ST(i + 1);
4060
4061
1272
                   if      (strEQ(key, "n"))      n    = (unsigned int)SvUV(val);
4062
1224
                   else if (strEQ(key, "size")) { size = (unsigned int)SvUV(val); size_set = TRUE; }
4063
1224
                   else if (strEQ(key, "prob")) { prob = SvNV(val); prob_set = TRUE; }
4064                   else croak("rbinom: unknown argument '%s'", key);
4065          }
4066
4067
1176
          // R requires size and prob to be explicitly passed in rbinom
4068          if (!size_set || !prob_set) croak("rbinom: 'size' and 'prob' are required arguments");
4069          if (prob < 0.0 || prob > 1.0) croak("rbinom: prob must be between 0 and 1");
4070
4071
2400
          AV *restrict result_av = newAV();
4072
1401
          if (n > 0) {
4073                   av_extend(result_av, n - 1);
4074                   for (unsigned int i = 0; i < n; i++) {
4075                       av_store(result_av, i, newSVuv(generate_binomial(size, prob)));
4076                   }
4077
53
          }
4078
4079
51
          RETVAL = newRV_noinc((SV*)result_av);
4080
51
        }
4081        OUTPUT:
4082
2
                RETVAL
4083
4084
0
SV*
4085
0
hist(SV* x_sv, ...)
4086        CODE:
4087        {
4088
1
                // 1. Validate Input
4089
1
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4090                        croak("hist: first argument must be an array reference");
4091
4092
12
                AV*restrict x_av = (AV*)SvRV(x_sv);
4093
318
                size_t n_raw = av_len(x_av) + 1;
4094
306
                if (n_raw == 0) croak("hist: input array is empty");
4095
4096
12
                // 2. Extract Data & Find Range
4097
12
                double *restrict x;
4098                Newx(x, n_raw, double);
4099                size_t n = 0;
4100                double min_val = DBL_MAX, max_val = -DBL_MAX;
4101
4102
8
                for (size_t i = 0; i < n_raw; i++) {
4103                        SV**restrict tv = av_fetch(x_av, i, 0);
4104
8
                        if (tv && SvOK(*tv)) {
4105                                 double val = SvNV(*tv);
4106                                 x[n++] = val;
4107
18
                                 if (val < min_val) min_val = val;
4108
12
                                 if (val > max_val) max_val = val;
4109
16
                        }
4110
5
                }
4111
5
                if (n == 0) {
4112
45
                        Safefree(x);
4113
41
                        croak("hist: input contains no valid numeric data");
4114
41
                }
4115
40
                // 3. Determine Bin Count (Sturges default or user-provided)
4116                size_t n_bins = 0;
4117
4118                if (items == 2) {
4119                        // Support pure positional argument: hist($data, 22)
4120
7
                        n_bins = (size_t)SvIV(ST(1));
4121
6
                } else if (items > 2) {
4122                        /* Support named parameters even if mixed with positional arguments */
4123
1
                        for (unsigned short i = 1; i < items - 1; i++) {
4124                                 /* Make sure the SV holds a string before doing string comparison */
4125                                 if (SvPOK(ST(i)) && strEQ(SvPV_nolen(ST(i)), "breaks")) {
4126
6
                                     n_bins = (size_t)SvIV(ST(i+1));
4127                                     break;
4128                                 }
4129
5
                        }
4130                        /* Fallback: if 'breaks' wasn't found but a positional number was given first */
4131                        if (n_bins == 0 && looks_like_number(ST(1))) {
4132
13
                                 n_bins = (size_t)SvIV(ST(1));
4133
8
                        }
4134
12
                }
4135
4
                if (n_bins == 0) n_bins = calculate_sturges_bins(n);
4136
4
                // 4. Allocate Result Arrays
4137
43
                double *restrict breaks, *restrict mids, *restrict density;
4138
39
                size_t *restrict counts;
4139
39
                Newx(breaks,  n_bins + 1, double);
4140
39
                Newx(mids,    n_bins,     double);
4141                Newx(density, n_bins,     double);
4142
0
                Newx(counts,  n_bins,     size_t);
4143
4144                // Generate simple linear breaks
4145                double step = (max_val - min_val) / (double)n_bins;
4146
4
                for (size_t i = 0; i <= n_bins; i++) {
4147
4
                        breaks[i] = min_val + (double)i * step;
4148                }
4149
4150
0
                // 5. Compute Statistics
4151                compute_hist_logic(x, n, breaks, n_bins, counts, mids, density);
4152
4153                // 6. Build Return HashRef
4154                HV*restrict res_hv = newHV();
4155
5
                AV*restrict av_breaks  = newAV();
4156
5
                AV*restrict av_counts  = newAV();
4157
4
                AV*restrict av_mids    = newAV();
4158                AV*restrict av_density = newAV();
4159
1
                for (size_t i = 0; i <= n_bins; i++) {
4160                        av_push(av_breaks, newSVnv(breaks[i]));
4161
5
                        if (i < n_bins) {
4162
5
                                 av_push(av_counts,  newSViv(counts[i]));
4163
5
                                 av_push(av_mids,    newSVnv(mids[i]));
4164                                 av_push(av_density, newSVnv(density[i]));
4165                        }
4166                }
4167                hv_stores(res_hv, "breaks",  newRV_noinc((SV*)av_breaks));
4168                hv_stores(res_hv, "counts",  newRV_noinc((SV*)av_counts));
4169                hv_stores(res_hv, "mids",    newRV_noinc((SV*)av_mids));
4170
6
                hv_stores(res_hv, "density", newRV_noinc((SV*)av_density));
4171
4172
2
                // Clean
4173
1
                Safefree(x); Safefree(breaks); Safefree(mids);
4174                Safefree(density); Safefree(counts);
4175
4176                RETVAL = newRV_noinc((SV*)res_hv);
4177
5
        }
4178
0
        OUTPUT:
4179          RETVAL
4180
4181
5
SV* quantile(...)
4182
5
        CODE:
4183        {
4184                SV *restrict x_sv = NULL;
4185
5
                SV *restrict probs_sv = NULL;
4186                int arg_idx = 0;
4187
4188
5
                /* --- 1. Consume first positional arg as 'x' if it's an array ref --- */
4189
1
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
4190                         x_sv = ST(arg_idx);
4191                         arg_idx++;
4192                }
4193
4194
5
                /* --- 2. Remaining args must be key-value pairs --- */
4195                if ((items - arg_idx) % 2 != 0)
4196
5
                         croak("Usage: quantile(\\@data, probs => \\@probs)  OR  quantile(x => \\@data, probs => \\@probs)");
4197
4198                for (; arg_idx < items; arg_idx += 2) {
4199
5
                         const char *restrict key = SvPV_nolen(ST(arg_idx));
4200
5
                         SV *restrict val = ST(arg_idx + 1);
4201
4202
5
                         if      (strEQ(key, "x"))     x_sv     = val;
4203
1
                         else if (strEQ(key, "probs")) probs_sv = val;
4204                         else croak("quantile: unknown argument '%s'", key);
4205                }
4206                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4207                        croak("quantile: 'x' must be an array reference");
4208
5
                AV *restrict x_av = (AV*)SvRV(x_sv);
4209
4
                size_t n_raw = av_len(x_av) + 1;
4210                if (n_raw == 0) croak("quantile: 'x' is empty");
4211
4212                /* --- Extract valid numeric data & drop NAs --- */
4213
4
                double *restrict x;
4214
1
                Newx(x, n_raw, double);
4215                size_t n = 0;
4216                for (size_t i = 0; i < n_raw; i++) {
4217
3
                        SV **restrict tv = av_fetch(x_av, i, 0);
4218
0
                        if (tv && SvOK(*tv)) {
4219                                 x[n++] = SvNV(*tv);
4220                        }
4221
3
                }
4222
3
                if (n == 0) {
4223                        Safefree(x);
4224
30
                        croak("quantile: 'x' contains no valid numbers");
4225
27
                }
4226
27
                // --- Sort Data for Quantile Math ---
4227                qsort(x, n, sizeof(double), compare_doubles);
4228
30
                // --- Parse Probabilities (Default matches R's c(0, .25, .5, .75, 1)) ---
4229
27
                double default_probs[] = {0.0, 0.25, 0.50, 0.75, 1.0};
4230
27
                unsigned int n_probs = 5;
4231                double *restrict probs;
4232
4233
3
                if (probs_sv && SvROK(probs_sv) && SvTYPE(SvRV(probs_sv)) == SVt_PVAV) {
4234
3
                        AV *restrict p_av = (AV*)SvRV(probs_sv);
4235
3
                        n_probs = av_len(p_av) + 1;
4236                        Newx(probs, n_probs, double);
4237                        for (unsigned int i = 0; i < n_probs; i++) {
4238                                 SV **tv = av_fetch(p_av, i, 0);
4239
1
                                 probs[i] = (tv && SvOK(*tv)) ? SvNV(*tv) : 0.0;
4240
0
                                 if (probs[i] < 0.0 || probs[i] > 1.0) {
4241                                     Safefree(x); Safefree(probs);
4242                                     croak("quantile: probabilities must be between 0 and 1");
4243
1
                                 }
4244
1
                        }
4245
0
                } else {
4246                        Newx(probs, n_probs, double);
4247
1
                        for (unsigned int i = 0; i < n_probs; i++) probs[i] = default_probs[i];
4248
1
                }
4249
4250
1
                /* --- Calculate Quantiles (R Type 7 Algorithm) --- */
4251                HV *restrict res_hv = newHV();
4252
4253
4
                for (size_t i = 0; i < n_probs; i++) {
4254
3
                        double p = probs[i], q = 0.0;
4255
4256
0
                        if (n == 1) {
4257                                 q = x[0];
4258                        } else if (p == 1.0) {
4259
1
                                 q = x[n - 1]; /* Prevent out-of-bounds mapping */
4260
1
                        } else if (p == 0.0) {
4261
4
                                 q = x[0];
4262
3
                        } else {
4263
3
                                 /* Continuous sample quantile interpolation (Type 7) */
4264
0
                                 double h = (n - 1) * p;
4265                                 unsigned int j = (unsigned int)h; /* floor via cast */
4266                                 double gamma = h - j;
4267                                 q = (1.0 - gamma) * x[j] + gamma * x[j + 1];
4268                        }
4269
4270
1
                        /* Format hash key to exactly match R's naming convention ("25%", "33.3%") */
4271                        char key[32];
4272
3
                        double pct = p * 100.0;
4273
4274
8
                        if (pct == (unsigned int)pct) {
4275
6
                                 snprintf(key, sizeof(key), "%.0f%%", pct);
4276
6
                        } else {
4277
6
                                 snprintf(key, sizeof(key), "%.1f%%", pct);
4278
6
                        }
4279
4280                        hv_store(res_hv, key, strlen(key), newSVnv(q), 0);
4281                }
4282
4283                Safefree(x);
4284
1
                Safefree(probs);
4285
4286                RETVAL = newRV_noinc((SV*)res_hv);
4287        }
4288
2
        OUTPUT:
4289          RETVAL
4290
4291
4292
1
double mean(...)
4293        PROTOTYPE: @
4294
1
        INIT:
4295
3
          double total = 0;
4296
2
          size_t count = 0;
4297
8
        CODE:
4298
6
                for (size_t i = 0; i < items; i++) {
4299
6
                        SV* restrict arg = ST(i);
4300
6
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4301
6
                                AV* restrict av = (AV*)SvRV(arg);
4302                                size_t len = av_len(av) + 1;
4303                                for (size_t j = 0; j < len; j++) {
4304                                     SV** restrict tv = av_fetch(av, j, 0);
4305
0
                                     if (tv && SvOK(*tv)) {
4306
0
                                         total += SvNV(*tv);
4307
0
                                         count++;
4308                                     } else {
4309
1
                                         croak("mean: undefined value at array ref index %zu (argument %zu)", j, i);
4310
0
                                     }
4311                                }
4312
1
                        } else if (SvOK(arg)) {
4313
1
                                 total += SvNV(arg);
4314                                 count++;
4315                        } else {
4316
1
                                 croak("mean: undefined value at argument index %zu", i);
4317
3
                        }
4318
2
                }
4319
2
                if (count == 0) croak("mean needs >= 1 element");
4320                RETVAL = total / count;
4321
1
        OUTPUT:
4322          RETVAL
4323
4324
0
double sum(...)
4325
0
        PROTOTYPE: @
4326
0
        INIT:
4327          double total = 0;
4328
0
          size_t count = 0;
4329
0
        CODE:
4330
0
          for (size_t i = 0; i < items; i++) {
4331
0
                   SV* restrict arg = ST(i);
4332
0
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4333
0
                       AV* restrict av = (AV*)SvRV(arg);
4334                       size_t len = av_len(av) + 1;
4335                       for (size_t j = 0; j < len; j++) {
4336                           SV** restrict tv = av_fetch(av, j, 0);
4337
0
                           if (tv && SvOK(*tv)) {
4338
0
                               total += SvNV(*tv);
4339
0
                               count++;
4340                           } else {
4341
0
                               croak("sum: undefined value at array ref index %zu (argument %zu)", j, i);
4342
0
                           }
4343                       }
4344                   } else if (SvOK(arg)) {
4345
3
                       total += SvNV(arg);
4346
6
                       count++;
4347
4
                   } else {
4348                       croak("sum: undefined value at argument index %zu", i);
4349                   }
4350
3
          }
4351
2
          if (count == 0) croak("sum needs >= 1 element");
4352
1
          RETVAL = total;
4353        OUTPUT:
4354
3
          RETVAL
4355
4356
1
double sd(...)
4357
3
        PROTOTYPE: @
4358
1
        INIT:
4359          double mean = 0.0, M2 = 0.0;
4360
1
          size_t count = 0;
4361        CODE:
4362                /* Single Pass Standard Deviation via Welford's Algorithm */
4363                for (size_t i = 0; i < items; i++) {
4364                        SV* restrict arg = ST(i);
4365                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4366                                AV* restrict av = (AV*)SvRV(arg);
4367                                size_t len = av_len(av) + 1;
4368                                for (size_t j = 0; j < len; j++) {
4369
5
                                  SV** restrict tv = av_fetch(av, j, 0);
4370
5
                                  if (tv && SvOK(*tv)) {
4371
5
                                                count++;
4372                                                double val = SvNV(*tv);
4373
5
                                                double delta = val - mean;
4374
5
                                                mean += delta / count;
4375
5
                                                M2 += delta * (val - mean);
4376
2
                                  } else {
4377
2
                                                croak("sd: undefined value at array ref index %zu (argument %zu)", j, i);
4378                                  }
4379
2
                                }
4380
2
                        } else if (SvOK(arg)) {
4381
2
                                 count++;
4382
2
                                 double val = SvNV(arg);
4383
0
                                 double delta = val - mean;
4384                                 mean += delta / count;
4385
2
                                 M2 += delta * (val - mean);
4386                        } else {
4387
2
                                 croak("sd: undefined value at argument index %zu", i);
4388
1
                        }
4389
1
                }
4390
1
                if (count < 2) croak("sd needs >= 2 elements");
4391
0
                RETVAL = sqrt(M2 / (count - 1));
4392
0
        OUTPUT:
4393
0
          RETVAL
4394
4395
4396
0
double var(...)
4397        PROTOTYPE: @
4398        INIT:
4399          double mean = 0.0, M2 = 0.0;
4400          size_t count = 0;
4401
2
        CODE:
4402
2
          /* Single Pass Variance via Welford's Algorithm */
4403
1
          for (size_t i = 0; i < items; i++) {
4404
1
                   SV* restrict arg = ST(i);
4405
0
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4406                       AV* restrict av = (AV*)SvRV(arg);
4407
1
                       size_t len = av_len(av) + 1;
4408
1
                       for (size_t j = 0; j < len; j++) {
4409
0
                           SV** restrict tv = av_fetch(av, j, 0);
4410
1
                           if (tv && SvOK(*tv)) {
4411
1
                               count++;
4412
0
                               double val = SvNV(*tv);
4413
0
                               double delta = val - mean;
4414
0
                               mean += delta / count;
4415
0
                               M2 += delta * (val - mean);
4416
0
                           } else {
4417                               croak("var: undefined value at array ref index %zu (argument %zu)", j, i);
4418
0
                           }
4419                       }
4420                   } else if (SvOK(arg)) {
4421                       count++;
4422                       double val = SvNV(arg);
4423                       double delta = val - mean;
4424                       mean += delta / count;
4425
5
                       M2 += delta * (val - mean);
4426
5
                   } else {
4427
2
                       croak("var: undefined value at argument index %zu", i);
4428
2
                   }
4429
1
          }
4430
1
          if (count < 2) croak("var needs >= 2 elements");
4431
1
          RETVAL = M2 / (count - 1);
4432
1
        OUTPUT:
4433
1
          RETVAL
4434
4435SV* t_test(...)
4436        CODE:
4437        {
4438
5
                SV*restrict x_sv = NULL;
4439                SV*restrict y_sv = NULL;
4440                double mu = 0.0, conf_level = 0.95;
4441                bool paired = FALSE, var_equal = FALSE;
4442
1
                const char*restrict alternative = "two.sided";
4443
4444                int arg_idx = 0;
4445
4446
1
                // 1. Shift first positional argument as 'x' if it's an array reference
4447                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
4448
1
                  x_sv = ST(arg_idx);
4449                  arg_idx++;
4450                }
4451
4452
1
                // 2. Shift second positional argument as 'y' if it's an array reference
4453
1
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
4454
4
                  y_sv = ST(arg_idx);
4455
3
                  arg_idx++;
4456
3
                }
4457
4458                // Ensure the remaining arguments form complete key-value pairs
4459                if ((items - arg_idx) % 2 != 0) {
4460
3
                  croak("Usage: t_test(\\@x, [\\@y], key => value, ...)");
4461
2
                }
4462
4463
2
                // --- Parse named arguments from the remaining flat stack ---
4464                for (; arg_idx < items; arg_idx += 2) {
4465
8
                        const char*restrict key = SvPV_nolen(ST(arg_idx));
4466
6
                        SV*restrict val = ST(arg_idx + 1);
4467
4468
6
                        if      (strEQ(key, "x"))           x_sv        = val;
4469
6
                        else if (strEQ(key, "y"))           y_sv        = val;
4470
6
                        else if (strEQ(key, "mu"))          mu          = SvNV(val);
4471                        else if (strEQ(key, "paired"))      paired      = SvTRUE(val);
4472
0
                        else if (strEQ(key, "var_equal"))   var_equal   = SvTRUE(val);
4473                        else if (strEQ(key, "conf_level"))  conf_level  = SvNV(val);
4474
6
                        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
4475                        else croak("t_test: unknown argument '%s'", key);
4476                }
4477
4478
2
                // --- Validate required / types ---
4479                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4480
2
                        croak("t_test: 'x' is a required argument and must be an ARRAY reference");
4481
2
                AV*restrict x_av = (AV*)SvRV(x_sv);
4482
0
                size_t nx = av_len(x_av) + 1;
4483
0
                if (nx < 2) croak("t_test: 'x' needs at least 2 elements");
4484
0
                AV*restrict y_av = NULL;
4485                if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV)
4486
2
                        y_av = (AV*)SvRV(y_sv);
4487
4488
6
                if (conf_level <= 0.0 || conf_level >= 1.0)
4489
6
                        croak("t_test: 'conf_level' must be between 0 and 1");
4490                // --- Computation via Welford's Algorithm --- */
4491
2
                double mean_x = 0.0, M2_x = 0.0, var_x, t_stat, df, p_val, std_err, cint_est;
4492                HV*restrict results = newHV();
4493                for (size_t i = 0; i < nx; i++) {
4494
8
                        SV**restrict tv = av_fetch(x_av, i, 0);
4495
6
                        double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
4496
6
                        double delta = val - mean_x;
4497
6
                        mean_x += delta / (i + 1);
4498                        M2_x += delta * (val - mean_x);
4499
2
                }
4500                var_x = M2_x / (nx - 1);
4501
1
                if (var_x == 0.0 && !y_av) croak("t_test: data are essentially constant");
4502
4503
1
                if (paired || y_av) {
4504
1
                        if (!y_av) croak("t_test: 'y' must be provided for paired or two-sample tests");
4505                        size_t ny = av_len(y_av) + 1;
4506                        if (paired && ny != nx) croak("t_test: Paired arrays must be same length");
4507                        double mean_y = 0.0, M2_y = 0.0, var_y;
4508                        for (size_t i = 0; i < ny; i++) {
4509
4
                                 SV**restrict tv = av_fetch(y_av, i, 0);
4510                                 double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
4511
4
                                 double delta = val - mean_y;
4512
20
                                 mean_y += delta / (i + 1);
4513
16
                                 M2_y += delta * (val - mean_y);
4514
16
                        }
4515
0
                        var_y = M2_y / (ny - 1);
4516
0
                        if (paired) {
4517
0
                                 double mean_d = 0.0, M2_d = 0.0;
4518
0
                                 for (size_t i = 0; i < nx; i++) {
4519
0
                                          SV**restrict dx_ptr = av_fetch(x_av, i, 0);
4520                                          SV**restrict dy_ptr = av_fetch(y_av, i, 0);
4521
16
                                     double dx = (dx_ptr && SvOK(*dx_ptr)) ? SvNV(*dx_ptr) : 0.0;
4522
16
                                     double dy = (dy_ptr && SvOK(*dy_ptr)) ? SvNV(*dy_ptr) : 0.0;
4523                                     double val = dx - dy;
4524                                     double delta = val - mean_d;
4525
4
                                     mean_d += delta / (i + 1);
4526
4
                                     M2_d += delta * (val - mean_d);
4527
20
                                 }
4528
16
                                 double var_d = M2_d / (nx - 1);
4529
16
                                 if (var_d == 0.0) croak("t_test: data are essentially constant");
4530
0
                                 cint_est = mean_d;
4531
0
                                 std_err  = sqrt(var_d / nx);
4532
0
                                 t_stat   = (cint_est - mu) / std_err;
4533
0
                                 df       = nx - 1;
4534
0
                                 hv_store(results, "estimate", 8, newSVnv(mean_d), 0);
4535
0
                        } else if (var_equal) {
4536
0
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
4537                                 double pooled_var = ((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2);
4538                                 cint_est = mean_x - mean_y;
4539
16
                                 std_err  = sqrt(pooled_var * (1.0 / nx + 1.0 / ny));
4540
16
                                 t_stat   = (cint_est - mu) / std_err;
4541
16
                                 df       = nx + ny - 2;
4542                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
4543                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
4544
4
                        } else {
4545
4
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
4546
3
                                 cint_est         = mean_x - mean_y;
4547
1
                                 double stderr_x2 = var_x / nx;
4548
1
                                 double stderr_y2 = var_y / ny;
4549                                 std_err          = sqrt(stderr_x2 + stderr_y2);
4550
2
                                 t_stat           = (cint_est - mu) / std_err;
4551
12
                                 df = pow(stderr_x2 + stderr_y2, 2) /
4552
10
                                      (pow(stderr_x2, 2) / (nx - 1) + pow(stderr_y2, 2) / (ny - 1));
4553
10
                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
4554                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
4555
2
                        }
4556                } else {
4557
3
                        cint_est = mean_x;
4558
18
                        std_err  = sqrt(var_x / nx);
4559
15
                        t_stat   = (cint_est - mu) / std_err;
4560
15
                        df       = nx - 1;
4561
15
                        hv_store(results, "estimate", 8, newSVnv(mean_x), 0);
4562                }
4563
3
                p_val = get_t_pvalue(t_stat, df, alternative);
4564                double alpha = 1.0 - conf_level, t_crit, ci_lower, ci_upper;
4565                if (strcmp(alternative, "less") == 0) {
4566                        t_crit   = qt_tail(df, alpha);
4567                        ci_lower = -INFINITY;
4568                        ci_upper = cint_est + t_crit * std_err;
4569                } else if (strcmp(alternative, "greater") == 0) {
4570
4
                        t_crit   = qt_tail(df, alpha);
4571
0
                        ci_lower = cint_est - t_crit * std_err;
4572                        ci_upper = INFINITY;
4573
4
                } else {
4574
4
                        t_crit   = qt_tail(df, alpha / 2.0);
4575
4
                        ci_lower = cint_est - t_crit * std_err;
4576                        ci_upper = cint_est + t_crit * std_err;
4577
12
                }
4578
8
                AV*restrict conf_int = newAV();
4579
8
                av_push(conf_int, newSVnv(ci_lower));
4580
8
                av_push(conf_int, newSVnv(ci_upper));
4581
4
                hv_store(results, "statistic", 9, newSVnv(t_stat), 0);
4582
4
                hv_store(results, "df",        2, newSVnv(df),     0);
4583
3
                hv_store(results, "p_value",   7, newSVnv(p_val),  0);
4584
3
                hv_store(results, "conf_int",  8, newRV_noinc((SV*)conf_int), 0);
4585
1
                RETVAL = newRV_noinc((SV*)results);
4586
1
        }
4587
1
        OUTPUT:
4588
0
          RETVAL
4589
4590void p_adjust(SV* p_sv, const char* method = "holm")
4591
0
        INIT:
4592                if (!SvROK(p_sv) || SvTYPE(SvRV(p_sv)) != SVt_PVAV) {
4593                        croak("p_adjust: first argument must be an ARRAY reference of p-values");
4594                }
4595
4
                AV *restrict p_av = (AV*)SvRV(p_sv);
4596
1
                size_t n = av_len(p_av) + 1;
4597                // Handle empty input
4598
3
                if (n == 0) {
4599
3
                        XSRETURN_EMPTY;
4600
3
                }
4601
1
                // Normalize method string
4602                char meth[64];
4603                strncpy(meth, method, 63); meth[63] = '\0';
4604
2
                for(unsigned short int i = 0; meth[i]; i++) meth[i] = tolower(meth[i]);
4605
0
                // Resolve aliases
4606
0
                if (strstr(meth, "benjamini") && strstr(meth, "hochberg")) strcpy(meth, "bh");
4607
2
                if (strstr(meth, "benjamini") && strstr(meth, "yekutieli")) strcpy(meth, "by");
4608
1
                if (strcmp(meth, "fdr") == 0) strcpy(meth, "bh");
4609
1
                // Allocate C memory
4610
0
                PVal *restrict arr;
4611                double *restrict adj;
4612                Newx(arr, n, PVal);
4613
2
                Newx(adj, n, double);
4614
4615                for (size_t i = 0; i < n; i++) {
4616                        SV**restrict tv = av_fetch(p_av, i, 0);
4617
1
                        arr[i].p = (tv && SvOK(*tv)) ? SvNV(*tv) : 1.0;
4618
1
                        arr[i].orig_idx = i;
4619                }
4620
1
                // Sort ascending (Stable sort using original index)
4621
3
                qsort(arr, n, sizeof(PVal), cmp_pval);
4622
2
        PPCODE:
4623
2
                if (strcmp(meth, "bonferroni") == 0) {
4624
2
                        for (size_t i = 0; i < n; i++) {
4625                                double v = arr[i].p * n;
4626                                adj[arr[i].orig_idx] = (v < 1.0) ? v : 1.0;
4627
1
                        }
4628
7
                } else if (strcmp(meth, "holm") == 0) {
4629                        double cummax = 0.0;
4630
6
                        for (size_t i = 0; i < n; i++) {
4631
6
                                 double v = arr[i].p * (n - i);
4632
6
                                 if (v > cummax) cummax = v;
4633
0
                                 adj[arr[i].orig_idx] = (cummax < 1.0) ? cummax : 1.0;
4634
0
                        }
4635                } else if (strcmp(meth, "hochberg") == 0) {
4636
6
                        double cummin = 1.0;
4637
6
                        for (ssize_t i = n - 1; i >= 0; i--) {
4638                                 double v = arr[i].p * (n - i);
4639
6
                                 if (v < cummin) cummin = v;
4640                                 adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4641
1
                        }
4642
1
                } else if (strcmp(meth, "bh") == 0) {
4643                        double cummin = 1.0;
4644                        for (ssize_t i = n - 1; i >= 0; i--) {
4645                                double v = arr[i].p * n / (i + 1.0);
4646                                if (v < cummin) cummin = v;
4647                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4648                        }
4649
22
                } else if (strcmp(meth, "by") == 0) {
4650
22
                        double q = 0.0;
4651                        for (size_t i = 1; i <= n; i++) q += 1.0 / i;
4652                        double cummin = 1.0;
4653                        for (ssize_t i = n - 1; i >= 0; i--) {
4654
22
                                double v = arr[i].p * n / (i + 1.0) * q;
4655
22
                                if (v < cummin) cummin = v;
4656
22
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4657
22
                        }
4658
22
                } else if (strcmp(meth, "hommel") == 0) {
4659
22
                        double *restrict pa, *restrict q_arr;
4660                        Newx(pa, n, double);
4661
22
                        Newx(q_arr, n, double);
4662
22
                        // Initial: min(n * p[i] / (i + 1))
4663
22
                        double min_val = n * arr[0].p;
4664
22
                        for (size_t i = 1; i < n; i++) {
4665                                double temp = (n * arr[i].p) / (i + 1.0);
4666
22
                                if (temp < min_val) {
4667
22
                                   min_val = temp;
4668
22
                                }
4669
22
                        }
4670                        // pa <- q <- rep(min, n)
4671                        for (size_t i = 0; i < n; i++) {
4672
22
                                 pa[i] = min_val;
4673                                 q_arr[i] = min_val;
4674                        }
4675
22
                        for (size_t j = n - 1; j >= 2; j--) {
4676                                 ssize_t n_mj = n - j;       // Max index for 'ij'. Length is n_mj + 1
4677
64
                                 ssize_t i2_len = j - 1;     // Length of 'i2
4678
42
                                 // Calculate q1 = min(j * p[i2] / (2:j))
4679
42
                                 double q1 = (j * arr[n_mj + 1].p) / 2.0;
4680
42
                                 for (size_t k = 1; k < i2_len; k++) {
4681
21
                                     double temp_q1 = (j * arr[n_mj + 1 + k].p) / (2.0 + k);
4682
0
                                     if (temp_q1 < q1) {
4683                                         q1 = temp_q1;
4684
22
                                     }
4685
21
                                 }
4686                                 // q[ij] <- pmin(j * p[ij], q1)
4687                                 for (size_t i = 0; i <= n_mj; i++) {
4688                                     double v = j * arr[i].p;
4689                                     q_arr[i] = (v < q1) ? v : q1;
4690
19
                                 }
4691
19
                                 // q[i2] <- q[n - j]
4692
19
                                 for (size_t i = 0; i < i2_len; i++) {
4693
19
                                     q_arr[n_mj + 1 + i] = q_arr[n_mj];
4694
19
                                }
4695
19
                                 // pa <- pmax(pa, q)
4696
19
                                for (size_t i = 0; i < n; i++) {
4697
19
                                    if (pa[i] < q_arr[i]) {
4698
12
                                       pa[i] = q_arr[i];
4699
12
                                    }
4700
12
                                }
4701
82
                        }
4702                        // pmin(1, pmax(pa, p))[ro] — map sorted results back to original indices
4703
70
                        for (size_t i = 0; i < n; i++) {
4704
70
                                double v = (pa[i] > arr[i].p) ? pa[i] : arr[i].p;
4705                                if (v > 1.0) v = 1.0;
4706
7
                                adj[arr[i].orig_idx] = v;
4707
7
                        }
4708
7
                        Safefree(pa);  Safefree(q_arr);
4709
7
                } else if (strcmp(meth, "none") == 0) {
4710
231
                        for (size_t i = 0; i < n; i++) {
4711                                adj[arr[i].orig_idx] = arr[i].p;
4712
224
                        }
4713
224
                } else {
4714
224
                        Safefree(arr); Safefree(adj);
4715                        croak("Unknown p-value adjustment method: %s", method);
4716
0
                }
4717                // Push values onto the Perl stack as a flat list
4718
0
                EXTEND(SP, n);
4719
0
                for (size_t i = 0; i < n; i++) {
4720
0
                        PUSHs(sv_2mortal(newSVnv(adj[i])));
4721
0
                }
4722
0
                Safefree(arr); arr = NULL;
4723
0
                Safefree(adj); adj = NULL;
4724
4725
0
double median(...)
4726
0
        PROTOTYPE: @
4727
0
        INIT:
4728          size_t total_count = 0, k = 0;
4729
0
          double* restrict nums;
4730
0
          double median_val = 0.0;
4731
0
        CODE:
4732          /* Pass 1: Count valid elements — die immediately on any undef */
4733          for (size_t i = 0; i < items; i++) {
4734
0
                   SV* restrict arg = ST(i);
4735                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4736                       AV* restrict av = (AV*)SvRV(arg);
4737                       size_t len = av_len(av) + 1;
4738                       for (size_t j = 0; j < len; j++) {
4739
19
                           SV** restrict tv = av_fetch(av, j, 0);
4740
215
                           if (tv && SvOK(*tv)) {
4741
19
                               total_count++;
4742                           } else {
4743
19
                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4744
19
                           }
4745
3
                       }
4746
1
                   } else if (SvOK(arg)) {
4747
1
                       total_count++;
4748                   } else {
4749
18
                       croak("median: undefined value at argument index %zu", i);
4750
18
                   }
4751
18
          }
4752          if (total_count == 0) croak("median needs >= 1 element");
4753
4754          /* Allocate C array now that we know the exact size */
4755          Newx(nums, total_count, double);
4756
4757
18
          /* Pass 2: Populate the C array — Safefree before any croak */
4758
89
          for (size_t i = 0; i < items; i++) {
4759                   SV* restrict arg = ST(i);
4760
71
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4761
0
                       AV* restrict av = (AV*)SvRV(arg);
4762
0
                       size_t len = av_len(av) + 1;
4763
0
                       for (size_t j = 0; j < len; j++) {
4764                           SV** restrict tv = av_fetch(av, j, 0);
4765                           if (tv && SvOK(*tv)) {
4766
71
                               nums[k++] = SvNV(*tv);
4767
1
                           } else {
4768
1
                               Safefree(nums);
4769
1
                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4770
1
                           }
4771                       }
4772                   } else if (SvOK(arg)) {
4773
70
                       nums[k++] = SvNV(arg);
4774
0
                   } else {
4775
0
                       Safefree(nums);
4776
0
                       croak("median: undefined value at argument index %zu", i);
4777
0
                   }
4778          }
4779
4780
70
          /* Sort and calculate median */
4781
0
          qsort(nums, total_count, sizeof(double), compare_doubles);
4782
0
          if (total_count % 2 == 0) {
4783
0
                   median_val = (nums[total_count / 2 - 1] + nums[total_count / 2]) / 2.0;
4784          } else {
4785                   median_val = nums[total_count / 2];
4786
70
          }
4787
0
          Safefree(nums);
4788          nums = NULL;
4789          RETVAL = median_val;
4790
70
        OUTPUT:
4791
0
          RETVAL
4792
4793
0
SV* cor(SV* x_sv, SV* y_sv = &PL_sv_undef, const char* method = "pearson")
4794        INIT:
4795        // --- validate method -------------------------------------------
4796
70
        if (strcmp(method, "pearson")  != 0 &&
4797
18
                strcmp(method, "spearman") != 0 &&
4798
18
                strcmp(method, "kendall")  != 0)
4799                  croak("cor: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')",
4800
70
                        method);
4801
4802        // --- validate x ------------------------------------------------
4803        if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4804                  croak("cor: x must be an ARRAY reference");
4805
4806        AV*restrict x_av = (AV*)SvRV(x_sv);
4807
18
        size_t nx   = av_len(x_av) + 1;
4808
0
        if (nx == 0) croak("cor: x is empty");
4809
4810
18
        // --- detect whether x is a flat vector or a matrix (AoA) -------
4811
18
        bool x_is_matrix = 0;
4812        {
4813                  SV**restrict fp = av_fetch(x_av, 0, 0);
4814                  if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4815
18
                      x_is_matrix = 1;
4816
18
        }
4817
4818
44
        // --- detect y ----------------------------
4819
26
        bool has_y = (SvOK(y_sv) && SvROK(y_sv) &&
4820
1
                           SvTYPE(SvRV(y_sv)) == SVt_PVAV);
4821
4822
3
        AV*restrict y_av = has_y ? (AV*)SvRV(y_sv) : NULL;
4823
3
        size_t ny = has_y ? av_len(y_av) + 1 : 0;
4824
4825
3
        bool y_is_matrix = 0;
4826
2
        if (has_y && ny > 0) {
4827
2
                SV**restrict fp = av_fetch(y_av, 0, 0);
4828
2
                if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4829
2
                        y_is_matrix = 1;
4830
2
        }
4831
4832        CODE:
4833        // Branch 1: both inputs are flat vectors  â†’  scalar result
4834        if (!x_is_matrix && !y_is_matrix) {
4835
1
                  if (!has_y) {
4836                      /* cor(vector) == 1 by definition */
4837
25
                      RETVAL = newSVnv(1.0);
4838
25
                  } else {
4839
25
                      if (nx != ny)
4840
25
                          croak("cor: x and y must have the same length (%lu vs %lu)",
4841
25
                                nx, ny);
4842
4843                      if (nx < 2)
4844
26
                          croak("cor: need at least 2 observations");
4845
4846                      double *restrict xd, *restrict yd;
4847
18
                      Newx(xd, nx, double);
4848
18
                      Newx(yd, ny, double);
4849
4850                      bool x_sd0 = 1, y_sd0 = 1;
4851
18
                      double x_first = NAN, y_first = NAN;
4852
4853
18
                      for (size_t i = 0; i < nx; i++) {
4854
18
                          SV**restrict tv = av_fetch(x_av, i, 0);
4855
45
                          double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4856
27
                          xd[i] = val;
4857
0
                          if (!isnan(val)) {
4858
0
                              if (isnan(x_first)) x_first = val;
4859                              else if (val != x_first) x_sd0 = 0;
4860
27
                          }
4861
27
                      }
4862
1
                      for (size_t i = 0; i < ny; i++) {
4863
1
                          SV**restrict tv = av_fetch(y_av, i, 0);
4864
1
                          double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4865
1
                          yd[i] = val;
4866
1
                          if (!isnan(val)) {
4867
1
                              if (isnan(y_first)) y_first = val;
4868
1
                              else if (val != y_first) y_sd0 = 0;
4869
1
                          }
4870
1
                      }
4871
4872
1
                      if (x_sd0 || y_sd0) {
4873
1
                          Safefree(xd); Safefree(yd);
4874                          if (x_sd0) croak("cor: standard deviation of x is 0");
4875
26
                          croak("cor: standard deviation of y is 0");
4876
26
                      }
4877
4878                      double r = compute_cor(xd, yd, nx, method);
4879
27
                      Safefree(xd); Safefree(yd);
4880                      RETVAL = newSVnv(r);
4881                  }
4882        } else {//Branch 2: x is a matrix (or y is a matrix)  â†’  AoA result
4883
64
                  // -- resolve x matrix dimensions
4884
46
                  if (!x_is_matrix)
4885
86
                      croak("cor: x must be a matrix (array ref of array refs) "
4886
46
                            "when y is a matrix");
4887
4888
18
                  SV**restrict xr0 = av_fetch(x_av, 0, 0);
4889                  if (!xr0 || !SvROK(*xr0) || SvTYPE(SvRV(*xr0)) != SVt_PVAV)
4890                      croak("cor: each row of x must be an ARRAY reference");
4891
4892                  size_t ncols_x = av_len((AV*)SvRV(*xr0)) + 1;
4893
64
                  if (ncols_x == 0) croak("cor: x matrix has zero columns");
4894
4895
0
                  size_t nrows   = nx;    /* observations */
4896
4897
0
                  // PRE-VALIDATION PASS: Ensure all rows are arrays to prevent memory leaks on croak
4898                  for (size_t i = 0; i < nrows; i++) {
4899
46
                      SV**restrict rv = av_fetch(x_av, i, 0);
4900
17
                      if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4901                          croak("cor: x row %lu is not an array ref", i);
4902
29
                  }
4903
4904
5
                  if (has_y && y_is_matrix) {
4905
5
                      if (ny != nrows) croak("cor: x and y must have the same number of rows (%lu vs %lu)", nrows, ny);
4906
47
                      for (size_t i = 0; i < nrows; i++) {
4907
42
                          SV**restrict rv = av_fetch(y_av, i, 0);
4908
42
                          if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4909
42
                              croak("cor: y row %lu is not an array ref", i);
4910
81
                      }
4911
42
                  }
4912
4913
14
                  // -- extract x columns
4914                  double **restrict col_x;
4915
42
                  Newx(col_x, ncols_x, double*);
4916
4917                  for (size_t j = 0; j < ncols_x; j++) {
4918
5
                      Newx(col_x[j], nrows, double);
4919
14
                      bool sd0 = 1;
4920
22
                      double first = NAN;
4921
13
                      for (size_t i = 0; i < nrows; i++) {
4922
14
                          SV**restrict rv = av_fetch(x_av, i, 0);
4923
9
                          AV*restrict  row = (AV*)SvRV(*rv);
4924
0
                          SV**restrict cv  = av_fetch(row, j, 0);
4925
0
                          double val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4926
0
                          col_x[j][i] = val;
4927                          if (!isnan(val)) {
4928
9
                              if (isnan(first)) first = val;
4929
9
                              else if (val != first) sd0 = 0;
4930
9
                          }
4931
9
                      }
4932
9
                      if (sd0) {
4933
9
                          for (size_t k = 0; k <= j; k++) Safefree(col_x[k]);
4934
9
                          Safefree(col_x);
4935                          croak("cor: standard deviation is 0 in x column %lu", j);
4936
19
                      }
4937
5
                  }
4938
4939
0
                  // -- resolve y: separate matrix or re-use x (symmetric)
4940
0
                  size_t ncols_y;
4941                  double **restrict col_y   = NULL;
4942                  bool symmetric = 0;
4943
4944                  // 1 = cor(X) — result is symmetric
4945                  if (has_y && y_is_matrix) {
4946
18
                      // cross-correlation: X (nrows × p) vs Y (nrows × q)
4947
18
                      SV**restrict yr0 = av_fetch(y_av, 0, 0);
4948
18
                      ncols_y = av_len((AV*)SvRV(*yr0)) + 1;
4949                      if (ncols_y == 0) croak("cor: y matrix has zero columns");
4950
4951                      Newx(col_y, ncols_y, double*);
4952                      for (size_t j = 0; j < ncols_y; j++) {
4953
310
                          Newx(col_y[j], nrows, double);
4954
292
                          bool sd0 = 1;
4955
292
                          double first = NAN;
4956                          for (size_t i = 0; i < nrows; i++) {
4957
289
                              SV**restrict  rv = av_fetch(y_av, i, 0);
4958
289
                              AV*restrict  row = (AV*)SvRV(*rv);
4959
1112
                              SV**restrict cv  = av_fetch(row, j, 0);
4960
823
                              double val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4961
257
                              col_y[j][i] = val;
4962
566
                              if (!isnan(val)) {
4963
78
                                  if (isnan(first)) first = val;
4964
78
                                  else if (val != first) sd0 = 0;
4965
78
                              }
4966
78
                          }
4967
0
                          if (sd0) {
4968                              for (size_t k = 0; k < ncols_x; k++) Safefree(col_x[k]);
4969
488
                              Safefree(col_x);
4970
488
                              for (size_t k = 0; k <= j; k++) Safefree(col_y[k]);
4971                              Safefree(col_y);
4972                              croak("cor: standard deviation is 0 in y column %lu", j);
4973
289
                          }
4974                      }
4975
289
                  } else { // cor(X) — symmetric p×p result; share column arrays
4976
1112
                      ncols_y  = ncols_x;
4977
289
                      col_y    = col_x;
4978
289
                      symmetric = 1;
4979
289
                  }
4980                  if (nrows < 2)
4981
18
                      croak("cor: need at least 2 observations (got %lu)", nrows);
4982                  // -- build cache for symmetric case: compute upper triangle, store results, mirror to lower triangle
4983
18
                  AV*restrict result_av = newAV();
4984
7
                  av_extend(result_av, ncols_x - 1);
4985
7
                  // Allocate per-row AVs up front so we can fill them in order
4986
7
                  AV **restrict rows_out;
4987
5
                  Newx(rows_out, ncols_x, AV*);
4988
5
                  for (size_t i = 0; i < ncols_x; i++) {
4989                      rows_out[i] = newAV();
4990
2
                      av_extend(rows_out[i], ncols_y - 1);
4991
2
                  }
4992
2
                  if (symmetric) {
4993
2
/* Upper triangle + diagonal, then mirror. r_cache[i][j] (j >= i) holds the computed value. */
4994                      double **restrict r_cache;
4995                      Newx(r_cache, ncols_x, double*);
4996                      for (size_t i = 0; i < ncols_x; i++)
4997                          Newx(r_cache[i], ncols_x, double);
4998
4999
16
                      for (size_t i = 0; i < ncols_x; i++) {
5000
61
                          r_cache[i][i] = 1.0; // diagonal
5001
178
                          for (size_t j = i + 1; j < ncols_x; j++) {
5002
133
                              double r = compute_cor(col_x[i], col_x[j], nrows, method);
5003
2620
                              r_cache[i][j] = r;
5004
133
                              r_cache[j][i] = r; // symmetry
5005                          }
5006
16
                      }
5007
61
                      // fill output AoA from cache
5008
45
                      for (size_t i = 0; i < ncols_x; i++)
5009
860
                          for (size_t j = 0; j < ncols_x; j++)
5010
45
                              av_store(rows_out[i], j, newSVnv(r_cache[i][j]));
5011
5012
16
                      for (size_t i = 0; i < ncols_x; i++) Safefree(r_cache[i]);
5013
16
                      Safefree(r_cache); r_cache = NULL;
5014
16
                  } else {
5015
61
                      // cross-correlation: every (i,j) pair is independent
5016
45
                      for (size_t i = 0; i < ncols_x; i++)
5017                          for (size_t j = 0; j < ncols_y; j++)
5018
44
                              av_store(rows_out[i], j, newSVnv(compute_cor(col_x[i], col_y[j], nrows, method)));
5019
174
                  }
5020
44
                  // push row AVs into result
5021                  for (size_t i = 0; i < ncols_x; i++)
5022                      av_store(result_av, i, newRV_noinc((SV*)rows_out[i]));
5023                  Safefree(rows_out); rows_out = NULL;
5024                  // -- free column arrays -------------------------------------
5025                  for (size_t j = 0; j < ncols_x; j++) Safefree(col_x[j]);
5026                  Safefree(col_x); col_x = NULL;
5027
16
                  if (!symmetric) {
5028
16
                      for (size_t j = 0; j < ncols_y; j++) Safefree(col_y[j]);
5029                      Safefree(col_y);
5030
16
                  }
5031                  RETVAL = newRV_noinc((SV*)result_av);
5032        }
5033
16
        OUTPUT:
5034
302
                RETVAL
5035
5036void scale(...)
5037
302
        PROTOTYPE: @
5038
286
        PPCODE:
5039
1101
        {
5040
286
                bool do_center_mean = TRUE, do_scale_sd = TRUE;
5041
286
                double center_val = 0.0, scale_val = 1.0;
5042
286
                size_t data_items = items;
5043
286
                // 1. Parse Options Hash (if it exists as the last argument)
5044
286
                if (items > 0) {
5045
286
                        SV*restrict last_arg = ST(items - 1);
5046
286
                        if (SvROK(last_arg) && SvTYPE(SvRV(last_arg)) == SVt_PVHV) {
5047                                 data_items = items - 1; // Exclude hash from data processing
5048
16
                                 HV*restrict opt_hv = (HV*)SvRV(last_arg);
5049                                 // --- Parse 'center'
5050                                 SV**restrict center_sv = hv_fetch(opt_hv, "center", 6, 0);
5051
16
                                 if (center_sv) {
5052                                     SV*restrict val_sv = *center_sv;
5053
16
                                     if (!SvOK(val_sv)) {
5054
16
                                         do_center_mean = FALSE; center_val = 0.0;
5055
16
                                     } else {
5056                                         char *restrict str = SvPV_nolen(val_sv);
5057
16
                                         /* Trap booleans and empty strings before numeric checks */
5058
16
                                         if (strcasecmp(str, "mean") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
5059
16
                                             do_center_mean = TRUE;
5060
16
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
5061
16
                                             do_center_mean = FALSE; center_val = 0.0;
5062
16
                                         } else if (looks_like_number(val_sv)) {
5063
0
                                             do_center_mean = FALSE; center_val = SvNV(val_sv);
5064
0
                                         } else if (SvTRUE(val_sv)) {
5065
0
                                             do_center_mean = TRUE;
5066                                         } else {
5067
0
                                             do_center_mean = FALSE; center_val = 0.0;
5068
0
                                         }
5069                                     }
5070                                 }
5071
61
                                 // --- Parse 'scale' ---
5072
45
                                 SV**restrict scale_sv = hv_fetch(opt_hv, "scale", 5, 0);
5073
45
                                 if (scale_sv) {
5074
45
                                     SV*restrict val_sv = *scale_sv;
5075
45
                                     if (!SvOK(val_sv)) {
5076
1
                                         do_scale_sd = FALSE; scale_val = 1.0;
5077
1
                                     } else {
5078
1
                                         char *restrict str = SvPV_nolen(val_sv);
5079
1
                                         if (strcasecmp(str, "sd") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
5080                                             do_scale_sd = TRUE;
5081
44
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
5082
44
                                             do_scale_sd = FALSE; scale_val = 1.0;
5083
44
                                         } else if (looks_like_number(val_sv)) {
5084
44
                                             do_scale_sd = FALSE; scale_val = SvNV(val_sv);
5085
44
                                             if (scale_val == 0.0) scale_val = 1.0; /* Prevent Division By Zero */
5086
44
                                         } else if (SvTRUE(val_sv)) {
5087
44
                                             do_scale_sd = TRUE;
5088                                         } else {
5089
45
                                             do_scale_sd = FALSE; scale_val = 1.0;
5090                                         }
5091                                     }
5092
16
                                 }
5093
16
                        }
5094
16
                }
5095
16
                // 2. Detect if the input is a Matrix (Array of Arrays)
5096
16
                bool is_matrix = FALSE;
5097
16
                if (data_items == 1) {
5098
16
                        SV*restrict first_arg = ST(0);
5099
16
                        if (SvROK(first_arg) && SvTYPE(SvRV(first_arg)) == SVt_PVAV) {
5100
16
                                 AV*restrict av = (AV*)SvRV(first_arg);
5101
16
                                 if (av_len(av) >= 0) {
5102
16
                                     SV**restrict first_elem = av_fetch(av, 0, 0);
5103
16
                                     if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
5104
16
                                         is_matrix = TRUE;
5105
16
                                     }
5106
16
                                 }
5107
16
                        }
5108
16
                }
5109                if (is_matrix) {
5110                        //=========================================================
5111                        // MATRIX MODE: Scale columns independently (Just like R)
5112
57
                        //=========================================================
5113
57
                        AV*restrict mat_av = (AV*)SvRV(ST(0));
5114
61
                        size_t nrow = av_len(mat_av) + 1, ncol = 0;
5115
5116
45
                        SV**restrict first_row = av_fetch(mat_av, 0, 0);
5117                        ncol = av_len((AV*)SvRV(*first_row)) + 1;
5118
5119
16
                        if (nrow == 0 || ncol == 0) croak("scale requires non-empty matrix");
5120
5121
16
                        // Create a new matrix for the scaled output
5122                        AV*restrict result_av = newAV();
5123
16
                        av_extend(result_av, nrow - 1);
5124                        AV**restrict row_ptrs = (AV**)safemalloc(nrow * sizeof(AV*));
5125                        for (size_t r = 0; r < nrow; r++) {
5126                                 row_ptrs[r] = newAV();
5127                                 av_extend(row_ptrs[r], ncol - 1);
5128                                 av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
5129                        }
5130                        // Calculate and apply scale per column
5131                        for (size_t c = 0; c < ncol; c++) {
5132                                 double col_sum = 0.0;
5133                                 double *restrict col_data;
5134                                 Newx(col_data, nrow, double);
5135
6
                                 // Extract the column data
5136
0
                                 for (size_t r = 0; r < nrow; r++) {
5137
0
                                     SV**restrict row_sv = av_fetch(mat_av, r, 0);
5138
0
                                     if (row_sv && SvROK(*row_sv)) {
5139
0
                                         AV*restrict row_av = (AV*)SvRV(*row_sv);
5140                                         SV**restrict cell_sv = av_fetch(row_av, c, 0);
5141
0
                                         col_data[r] = (cell_sv && SvOK(*cell_sv)) ? SvNV(*cell_sv) : 0.0;
5142                                     } else {
5143                                         col_data[r] = 0.0;
5144                                     }
5145
6
                                     col_sum += col_data[r];
5146
0
                                 }
5147
5148                                 double col_center = do_center_mean ? (col_sum / nrow) : center_val;
5149                                 double col_scale = scale_val;
5150                                 // Calculate Standard Deviation for this specific column if needed
5151                                 if (do_scale_sd) {
5152
6
                                     if (nrow <= 1) {
5153
6
                                         Safefree(col_data);
5154
6
                                         safefree(row_ptrs);
5155                                         croak("scale needs >= 2 rows to calculate standard deviation for a matrix column");
5156
6
                                     }
5157
3033
                                     double sum_sq = 0.0;
5158
3027
                                     for (size_t r = 0; r < nrow; r++) {
5159                                         double diff = col_data[r] - col_center;
5160
6
                                         sum_sq += diff * diff;
5161                                     }
5162                                     col_scale = sqrt(sum_sq / (nrow - 1));
5163                                 }
5164                                 // Store scaled values back into the new matrix rows
5165                                 for (size_t r = 0; r < nrow; r++) {
5166                                     double centered = col_data[r] - col_center;
5167
2
                                     double final_val = (col_scale == 0.0) ? (0.0 / 0.0) : (centered / col_scale);
5168                                     av_store(row_ptrs[r], c, newSVnv(final_val));
5169
2
                                 }
5170
2
                                 Safefree(col_data);
5171
2
                        }
5172                        safefree(row_ptrs);
5173                        // Push the resulting matrix as a single Reference onto the Perl stack
5174
2
                        EXTEND(SP, 1);
5175
0
                        PUSHs(sv_2mortal(newRV_noinc((SV*)result_av)));
5176
0
                } else {
5177                        // ======================================
5178                        // FLAT LIST MODE: Original functionality
5179                        // ======================================
5180
2
                        size_t total_count = 0, k = 0;
5181
0
                        double *restrict nums;
5182                        double sum = 0.0;
5183                        for (size_t i = 0; i < data_items; i++) {
5184
7
                                SV*restrict arg = ST(i);
5185
5
                                if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
5186
5
                                        AV*restrict av = (AV*)SvRV(arg);
5187                                        size_t len = av_len(av) + 1;
5188
5
                                        for (unsigned int j = 0; j < len; j++) {
5189
3
                                                SV**restrict tv = av_fetch(av, j, 0);
5190
2
                                                if (tv && SvOK(*tv)) { total_count++; }
5191
0
                                        }
5192                                } else if (SvOK(arg)) {
5193                                        total_count++;
5194
2
                                }
5195                        }
5196
1
                        if (total_count == 0) croak("scale requires at least 1 numeric element");
5197
1
                        Newx(nums, total_count, double);
5198
1
                        for (size_t i = 0; i < data_items; i++) {
5199                                 SV*restrict arg = ST(i);
5200
5002
                                 if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
5201                                     AV*restrict av = (AV*)SvRV(arg);
5202                                     size_t len = av_len(av) + 1;
5203                                     for (size_t j = 0; j < len; j++) {
5204
6318
                                         SV**restrict tv = av_fetch(av, j, 0);
5205
6318
                                         if (tv && SvOK(*tv)) {
5206
6318
                                             double val = SvNV(*tv);
5207
6318
                                             nums[k++] = val; sum += val;
5208                                         }
5209
5000
                                     }
5210                                 } else if (SvOK(arg)) {
5211
5000
                                     double val = SvNV(arg);
5212
5000
                                     nums[k++] = val; sum += val;
5213
4999
                                 }
5214                        }
5215                        if (do_center_mean) center_val = sum / total_count;
5216                        if (do_scale_sd) {
5217
1
                                 if (total_count <= 1) {
5218                                     Safefree(nums);
5219                                     croak("scale needs >= 2 elements to calculate SD");
5220                                 }
5221                                 double sum_sq = 0.0;
5222                                 for (size_t i = 0; i < total_count; i++) {
5223                                     double diff = nums[i] - center_val;
5224                                     sum_sq += diff * diff;
5225                                 }
5226                                 scale_val = sqrt(sum_sq / (total_count - 1));
5227
9
                        }
5228                        EXTEND(SP, total_count);
5229                        for (size_t i = 0; i < total_count; i++) {
5230                                double centered = nums[i] - center_val;
5231
9
                                double final_val = (scale_val == 0.0) ? (0.0 / 0.0) : (centered / scale_val);
5232
9
                                PUSHs(sv_2mortal(newSVnv(final_val)));
5233
9
                        }
5234
9
                        Safefree(nums); nums = NULL;
5235
9
                }
5236
9
        }
5237
5238SV* matrix(...)
5239
9
CODE:
5240
9
        // Basic check: must have an even number of arguments for key => value
5241
9
        if (items % 2 != 0) {
5242
9
          croak("Usage: matrix(data => [...], nrow => $n, ncol => $m, byrow => $bool)");
5243        }
5244
9
        SV*restrict data_sv = NULL;
5245
9
        size_t nrow = 0, ncol = 0;
5246        bool byrow = FALSE, nrow_set = FALSE, ncol_set = FALSE;
5247
9
        // Parse named arguments
5248
9
        for (size_t i = 0; i < items; i += 2) {
5249          char*restrict key = SvPV_nolen(ST(i));
5250          SV*restrict val   = ST(i + 1);
5251          if (strEQ(key, "data")) {
5252
9
                   data_sv = val;
5253
9
          } else if (strEQ(key, "nrow")) {
5254
9
                   nrow = (size_t)SvUV(val);
5255
9
                   nrow_set = TRUE;
5256
9
          } else if (strEQ(key, "ncol")) {
5257
9
                   ncol = (size_t)SvUV(val);
5258
9
                   ncol_set = TRUE;
5259
9
          } else if (strEQ(key, "byrow")) {
5260
9
                   byrow = SvTRUE(val);
5261
9
          } else {
5262
9
                   croak("Unknown option: %s", key);
5263
67
          }
5264
58
        }
5265
58
        // Validate data input
5266        if (!data_sv || !SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVAV) {
5267
0
          croak("The 'data' option must be an array reference (e.g. data => [1..6])");
5268
0
        }
5269
0
        AV*restrict data_av = (AV*)SvRV(data_sv);
5270
0
        size_t data_len = (UV)(av_top_index(data_av) + 1);
5271
0
        if (data_len == 0) {
5272          croak("Data array cannot be empty");
5273
0
        }
5274
0
        // R-style dimension inference
5275
0
        if (!nrow_set && !ncol_set) {
5276          nrow = data_len;
5277
0
          ncol = 1;
5278        } else if (nrow_set && !ncol_set) {
5279
0
          ncol = (data_len + nrow - 1) / nrow;
5280
0
        } else if (!nrow_set && ncol_set) {
5281
0
          nrow = (data_len + ncol - 1) / ncol;
5282
0
        }
5283
0
        // Final safety check for dimensions
5284
0
        if (nrow == 0 || ncol == 0) {
5285
0
          croak("Dimensions must be greater than 0");
5286
0
        }
5287
0
        // Create the matrix (Array of Arrays)
5288        AV*restrict result_av = newAV();
5289
0
        av_extend(result_av, nrow - 1);
5290
0
        size_t r, c;// Use unsigned types for counters to prevent negative indexing
5291        AV**restrict row_ptrs = (AV**restrict)safemalloc(nrow * sizeof(AV*)); /* Pre-allocate row pointers */
5292
0
        for (r = 0; r < nrow; r++) {
5293
0
          row_ptrs[r] = newAV();
5294
0
          av_extend(row_ptrs[r], ncol - 1);
5295          av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
5296        }
5297
0
        // Fill the matrix
5298        size_t total_cells = nrow * ncol;
5299        for (size_t i = 0; i < total_cells; i++) {
5300          // Vector recycling logic
5301          SV**restrict fetched = av_fetch(data_av, i % data_len, 0);
5302
9
          SV*restrict val = fetched ? newSVsv(*fetched) : newSV(0);
5303
111
          if (byrow) {
5304
9
                   r = i / ncol;
5305                   c = i % ncol;
5306
9
          } else {
5307
9
                   r = i % nrow;
5308
3
                   c = i / nrow;
5309
1
          }
5310
1
          av_store(row_ptrs[r], c, val);
5311        }
5312
8
        safefree(row_ptrs);
5313
8
        RETVAL = newRV_noinc((SV*)result_av);
5314
8
        OUTPUT:
5315        RETVAL
5316
5317
8
SV* lm(...)
5318
8
CODE:
5319
8
{
5320
8
        const char *restrict formula  = NULL;
5321
8
        SV *restrict data_sv = NULL;
5322
8
        char f_cpy[512];
5323
8
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
5324
5325
8
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
5326
8
        bool *restrict is_dummy = NULL;
5327
8
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
5328
8
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
5329        size_t n = 0, valid_n = 0, i, j, k, l, l1, l2;
5330
8
        bool has_intercept = TRUE;
5331
5332
8
        char **restrict row_names = NULL, **restrict valid_row_names = NULL;
5333
19
        HV **restrict row_hashes = NULL;
5334
11
        HV *restrict data_hoa = NULL;
5335
1
        SV *restrict ref = NULL;
5336
5337
3
        double *restrict X = NULL, *restrict Y = NULL, *restrict XtX = NULL, *restrict XtY = NULL;
5338
3
        bool *restrict aliased = NULL;
5339
3
        double *restrict beta = NULL;
5340
3
        int final_rank = 0, df_res = 0;
5341
2
        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
5342
2
        AV *restrict terms_av;
5343
2
        double rss = 0.0, rse_sq = 0.0;
5344
2
        HE *restrict entry;
5345
5346        if (items % 2 != 0) croak("Usage: lm(formula => 'mpg ~ wt * hp', data => \\%%mtcars)");
5347
5348        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
5349          const char *restrict key = SvPV_nolen(ST(i_arg));
5350
1
          SV *restrict val = ST(i_arg + 1);
5351          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
5352
10
          else if (strEQ(key, "data"))    data_sv = val;
5353
10
          else croak("lm: unknown argument '%s'", key);
5354
10
        }
5355
10
        if (!formula) croak("lm: formula is required");
5356
10
        if (!data_sv || !SvROK(data_sv)) croak("lm: data is required and must be a reference");
5357
5358        // ========================================================================
5359
11
        // PHASE 1: Data Extraction
5360        // ========================================================================
5361        ref = SvRV(data_sv);
5362        if (SvTYPE(ref) == SVt_PVHV) {
5363
8
          HV *restrict hv = (HV*)ref;
5364
8
          if (hv_iterinit(hv) == 0) croak("lm: Data hash is empty");
5365
8
          entry = hv_iternext(hv);
5366
8
          if (entry) {
5367
8
                   SV *restrict val = hv_iterval(hv, entry);
5368
8
                   if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
5369                       data_hoa = hv;
5370
8
                       n = av_len((AV*)SvRV(val)) + 1;
5371                       Newx(row_names, n, char*);
5372
8
                       for (i = 0; i < n; i++) {
5373
8
                           char buf[32];
5374
20
                           snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5375
12
                           row_names[i] = savepv(buf);
5376
0
                       }
5377
0
                   } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
5378                       n = hv_iterinit(hv);
5379
12
                       Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
5380
12
                       i = 0;
5381
1
                       while ((entry = hv_iternext(hv))) {
5382
1
                           I32 len;
5383
1
                           row_names[i] = savepv(hv_iterkey(entry, &len));
5384
1
                           row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
5385
1
                           i++;
5386
1
                       }
5387
1
                   } else croak("lm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
5388
1
          }
5389
1
        } else if (SvTYPE(ref) == SVt_PVAV) {
5390
1
          AV *restrict av = (AV*)ref; n = av_len(av) + 1;
5391
1
          Newx(row_names, n, char*);
5392          Newx(row_hashes, n, HV*);
5393
11
          for (i = 0; i < n; i++) {
5394
11
                   SV **restrict val = av_fetch(av, i, 0);
5395
11
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
5396                       row_hashes[i] = (HV*)SvRV(*val);
5397
12
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5398                       row_names[i] = savepv(buf);
5399                   } else {
5400                       for (k = 0; k < i; k++) Safefree(row_names[k]);
5401
30
                       Safefree(row_names); Safefree(row_hashes);
5402
22
                       croak("lm: Array values must be HashRefs (AoH)");
5403
43
                   }
5404
21
          }
5405        } else croak("lm: Data must be an Array or Hash reference");
5406
5407        // ========================================================================
5408
8
        // PHASE 2: Formula Parsing & `.` Expansion
5409        // ========================================================================
5410        src = (char*)formula; dst = f_cpy;
5411        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
5412
8
        *dst = '\0';
5413
5414        tilde = strchr(f_cpy, '~');
5415        if (!tilde) {
5416          for (i = 0; i < n; i++) Safefree(row_names[i]);
5417          Safefree(row_names); if (row_hashes) Safefree(row_hashes);
5418
29
          croak("lm: invalid formula, missing '~'");
5419
22
        }
5420
8
        *tilde = '\0';
5421
8
        lhs = f_cpy;
5422
8
        rhs = tilde + 1;
5423
5424
8
        // Remove intercept-suppression markers from RHS.
5425        // IMPORTANT: skip tokens that appear inside I(...) wrappers so that
5426        // expressions like I(x^-1) are never mistakenly treated as "-1".
5427
22
        {
5428
8
          char *restrict p_idx = rhs;
5429
8
          while (*p_idx) {
5430
8
                   // Skip over I(...) sub-expressions entirely
5431
8
                   if (p_idx[0] == 'I' && p_idx[1] == '(') {
5432
8
                       int depth = 0;
5433
8
                       while (*p_idx) { if (*p_idx == '(') depth++; else if (*p_idx == ')') { depth--; if (depth == 0) { p_idx++; break; } } p_idx++; }
5434                       continue;
5435                   }
5436
14
                   // Match bare -1
5437
14
                   if (p_idx[0] == '-' && p_idx[1] == '1' &&
5438                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
5439
2
                       has_intercept = FALSE;
5440
2
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5441
2
                       continue; // re-examine same position
5442                   }
5443
2
                   // Match +0
5444
2
                   if (p_idx[0] == '+' && p_idx[1] == '0' &&
5445
6
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
5446
4
                       has_intercept = FALSE;
5447
4
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5448                       continue;
5449                   }
5450
2
                   // Match leading 0+
5451
1
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '+') {
5452
1
                       has_intercept = FALSE;
5453                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5454
2
                       continue;
5455
2
                   }
5456
1
                   // Match bare 0 (entire rhs)
5457
0
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '\0') {
5458
0
                       has_intercept = FALSE; p_idx[0] = '\0'; break;
5459
0
                   }
5460
0
                   // Strip redundant +1 (keep intercept, just remove marker)
5461
0
                   if (p_idx[0] == '+' && p_idx[1] == '1' &&
5462                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
5463
1
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5464
1
                       continue;
5465
1
                   }
5466
1
                   // Strip leading bare 1 or 1+
5467
1
                   if (p_idx == rhs) {
5468
1
                       if (p_idx[0] == '1' && p_idx[1] == '\0') { p_idx[0] = '\0'; break; }
5469
1
                       if (p_idx[0] == '1' && p_idx[1] == '+') { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); continue; }
5470
1
                   }
5471
1
                   p_idx++;
5472          }
5473        }
5474
5475
1
        // Clean up stray `++`, leading `+`, trailing `+`
5476        {
5477
12
          char *restrict p_idx;
5478
3
          while ((p_idx = strstr(rhs, "++")) != NULL)
5479
3
                   memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
5480
3
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
5481
52
          size_t len_rhs = strlen(rhs);
5482
49
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
5483
49
        }
5484
5485
78
        // Expand `.` Operator
5486
71
        char rhs_expanded[2048] = "";
5487        size_t rhs_len = 0;
5488
49
        chunk = strtok(rhs, "+");
5489
7
        while (chunk != NULL) {
5490
7
          if (strcmp(chunk, ".") == 0) {
5491                   AV *cols = get_all_columns(data_hoa, row_hashes, n);
5492
49
                   for (size_t c = 0; c <= (size_t)av_len(cols); c++) {
5493                       SV **col_sv = av_fetch(cols, c, 0);
5494                       if (col_sv && SvOK(*col_sv)) {
5495                           const char *col_name = SvPV_nolen(*col_sv);
5496
3
                           if (strcmp(col_name, lhs) != 0) {
5497
7
                               size_t slen = strlen(col_name);
5498
9
                               if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5499
5
                                   if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5500
1
                                   strcat(rhs_expanded, col_name);
5501                                   rhs_len += slen;
5502                               }
5503                           }
5504                       }
5505                   }
5506
3
                   SvREFCNT_dec(cols);
5507          } else {
5508                   size_t slen = strlen(chunk);
5509
7
                   if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5510
4
                       if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5511
0
                       strcat(rhs_expanded, chunk);
5512
0
                       rhs_len += slen;
5513
0
                   }
5514
0
          }
5515
0
          chunk = strtok(NULL, "+");
5516        }
5517
5518
4
        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
5519
4
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
5520
4
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
5521
5522
4
        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
5523
5524
4
        if (strlen(rhs_expanded) > 0) {
5525
4
          chunk = strtok(rhs_expanded, "+");
5526          while (chunk != NULL) {
5527
10
                   if (num_terms >= term_cap - 3) {
5528
3
                       term_cap *= 2;
5529                       Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
5530
0
                   }
5531
0
                   char *restrict star = strchr(chunk, '*');
5532
0
                   if (star) {
5533
0
                       *star = '\0';
5534
0
                       char *restrict left = chunk;
5535
0
                       char *restrict right = star + 1;
5536                       char *restrict c_l = strchr(left, '^');
5537                       if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
5538
9
                       char *restrict c_r = strchr(right, '^');
5539
9
                       if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
5540
9
                       terms[num_terms++] = savepv(left);
5541
9
                       terms[num_terms++] = savepv(right);
5542
9
                       size_t inter_len = strlen(left) + strlen(right) + 2;
5543                       terms[num_terms] = (char*)safemalloc(inter_len);
5544                       snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
5545                   } else {
5546
7
                       char *restrict c_chunk = strchr(chunk, '^');
5547
59
                       if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
5548
7
                       terms[num_terms++] = savepv(chunk);
5549                   }
5550                   chunk = strtok(NULL, "+");
5551          }
5552
59
        }
5553
5554
52
        for (i = 0; i < num_terms; i++) {
5555
52
          bool found = FALSE;
5556
52
          for (j = 0; j < num_uniq; j++) { if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; } }
5557
222
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
5558
170
        }
5559
52
        p = num_uniq;
5560
5561
20
        // ========================================================================
5562
98
        // PHASE 3: Categorical Expansion
5563
58
        // ========================================================================
5564
58
        for (j = 0; j < p; j++) {
5565
58
          if (p_exp + 32 >= exp_cap) {
5566
58
                   exp_cap *= 2;
5567
0
                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
5568                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5569
40
          }
5570
40
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
5571                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue;
5572          }
5573
52
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
5574                   char **restrict levels = NULL;
5575
52
                   unsigned int num_levels = 0, levels_cap = 8;
5576
222
                   Newx(levels, levels_cap, char*);
5577
52
                   for (i = 0; i < n; i++) {
5578
52
                       char *str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
5579
52
                       if (str_val) {
5580                           bool found = FALSE;
5581
7
                           for (l = 0; l < num_levels; l++) { if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; } }
5582
7
                           if (!found) {
5583                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
5584
4
                               levels[num_levels++] = savepv(str_val);
5585
4
                           }
5586
4
                           Safefree(str_val);
5587
3
                       }
5588
3
                   }
5589                   if (num_levels > 0) {
5590
1
                       for (l1 = 0; l1 < num_levels - 1; l1++)
5591
1
                           for (l2 = l1 + 1; l2 < num_levels; l2++)
5592
1
                               if (strcmp(levels[l1], levels[l2]) > 0) { char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; }
5593
1
                       for (l = 1; l < num_levels; l++) {
5594
3
                           if (p_exp >= exp_cap) {
5595
1
                               exp_cap *= 2;
5596
1
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
5597                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5598
4
                           }
5599
1
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
5600                           exp_terms[p_exp] = (char*)safemalloc(t_len);
5601
1
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
5602                           is_dummy[p_exp] = TRUE;
5603                           dummy_base[p_exp]  = savepv(uniq_terms[j]);
5604                           dummy_level[p_exp] = savepv(levels[l]);
5605                           p_exp++;
5606
6
                       }
5607
6
                       for (l = 0; l < num_levels; l++) Safefree(levels[l]);
5608
6
                       Safefree(levels);
5609                   } else {
5610                       Safefree(levels);
5611
6
                       exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
5612
6
                   }
5613          } else {
5614
24
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
5615
18
          }
5616
12
        }
5617
11
        p = p_exp;
5618
11
        Newx(X, n * p, double); Newx(Y, n, double);
5619
11
        Newx(valid_row_names, n, char*);
5620
5621        // ========================================================================
5622
6
        // PHASE 4: Matrix Construction & Listwise Deletion
5623
24
        // ========================================================================
5624
18
        for (i = 0; i < n; i++) {
5625          double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
5626
6
          if (isnan(y_val)) { Safefree(row_names[i]); continue; }
5627
5628
33
          bool row_ok = TRUE;
5629          double *restrict row_x = (double*)safemalloc(p * sizeof(double));
5630
6
          for (j = 0; j < p; j++) {
5631
6
                   if (strcmp(exp_terms[j], "Intercept") == 0) {
5632                       row_x[j] = 1.0;
5633
6
                   } else if (is_dummy[j]) {
5634
23
                       char *restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
5635
17
                       if (str_val) {
5636
11
                           row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
5637
11
                           Safefree(str_val);
5638
11
                       } else { row_ok = FALSE; break; }
5639
11
                   } else {
5640                       row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
5641
11
                       if (isnan(row_x[j])) { row_ok = FALSE; break; }
5642
11
                   }
5643
11
          }
5644
21
          if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
5645
5646
10
          Y[valid_n] = y_val;
5647
10
          for (j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
5648          valid_row_names[valid_n] = row_names[i];
5649
1
          valid_n++;
5650
1
          Safefree(row_x);
5651        }
5652
11
        Safefree(row_names);
5653
5654        if (valid_n <= p) {
5655
6
          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5656
6
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5657
6
          for (j = 0; j < p_exp; j++) {
5658
6
                   Safefree(exp_terms[j]);
5659
6
                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5660          }
5661          Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5662          Safefree(X); Safefree(Y); Safefree(valid_row_names);
5663
6
          if (row_hashes) Safefree(row_hashes);
5664
6
          croak("lm: 0 degrees of freedom (too many NAs or parameters > observations)");
5665
6
        }
5666
5667
16
        // ========================================================================
5668
16
        // PHASE 5: OLS Math
5669
16
        // ========================================================================
5670
16
        Newxz(XtX, p * p, double);
5671
16
        for (i = 0; i < p; i++)
5672
151
          for (j = 0; j < p; j++) {
5673
135
                   double sum = 0.0;
5674
135
                   for (k = 0; k < valid_n; k++) sum += X[k * p + i] * X[k * p + j];
5675                   XtX[i * p + j] = sum;
5676
16
          }
5677
16
        Newxz(XtY, p, double);
5678
16
        for (i = 0; i < p; i++) {
5679          double sum = 0.0;
5680
6
          for (k = 0; k < valid_n; k++) sum += X[k * p + i] * Y[k];
5681
6
          XtY[i] = sum;
5682
6
        }
5683
6
        Newx(aliased, p, bool);
5684
6
        final_rank = sweep_matrix_ols(XtX, p, aliased);
5685        Newxz(beta, p, double);
5686        for (i = 0; i < p; i++) {
5687          if (aliased[i]) { beta[i] = NAN; }
5688          else {
5689
23
                   double sum = 0.0;
5690
23
                   for (j = 0; j < p; j++) if (!aliased[j]) sum += XtX[i * p + j] * XtY[j];
5691
24
                   beta[i] = sum;
5692
18
          }
5693
18
        }
5694
5695
6
        // ========================================================================
5696
6
        // PHASE 6: Metrics & Cleanup
5697
6
        // ========================================================================
5698
6
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
5699
6
        summary_hv = newHV(); terms_av = newAV();
5700
5701
6
        df_res = (int)valid_n - final_rank;
5702
5703
6
        // rss / mss accumulated here — rse_sq computed AFTER this loop (not before)
5704
6
        double sum_y = 0.0, mss = 0.0;
5705        for (i = 0; i < valid_n; i++) sum_y += Y[i];
5706        double mean_y = sum_y / (double)valid_n;
5707
5708        for (i = 0; i < valid_n; i++) {
5709          double y_hat = 0.0;
5710          for (j = 0; j < p; j++) if (!aliased[j]) y_hat += X[i * p + j] * beta[j];
5711          double res   = Y[i] - y_hat;
5712          rss          += res * res;
5713          double diff_m = has_intercept ? (y_hat - mean_y) : y_hat;
5714
6
          mss          += diff_m * diff_m;
5715          hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(y_hat), 0);
5716
5
          hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res),   0);
5717
5
          Safefree(valid_row_names[i]);
5718
5
        }
5719        Safefree(valid_row_names);
5720
5721
7
        // Single, authoritative rse_sq calculation
5722
2
        rse_sq = (df_res > 0) ? (rss / (double)df_res) : NAN;
5723
5724
2
        int df_int = has_intercept ? 1 : 0;
5725
2
        double r_squared = 0.0, adj_r_squared = 0.0, f_stat = NAN, f_pvalue = NAN;
5726
0
        int numdf = final_rank - df_int;
5727
5728
2
        if (final_rank != df_int && (mss + rss) > 0.0) {
5729          r_squared     = mss / (mss + rss);
5730          adj_r_squared = 1.0 - (1.0 - r_squared) * ((valid_n - df_int) / (double)df_res);
5731          if (rse_sq > 0.0 && numdf > 0) {
5732
5
                   f_stat   = (mss / (double)numdf) / rse_sq;
5733
5
                   f_pvalue = 1.0 - pf(f_stat, (double)numdf, (double)df_res);
5734
5
          } else if (rse_sq == 0.0) {
5735                   f_stat   = INFINITY;
5736
5
                   f_pvalue = 0.0;
5737
2
          }
5738
2
        } else if (final_rank == df_int) {
5739
2
          r_squared = 0.0; adj_r_squared = 0.0;
5740
2
        }
5741
5742
2
        for (j = 0; j < p; j++) {
5743
2
          hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
5744
2
          av_push(terms_av, newSVpv(exp_terms[j], 0));
5745
2
          HV *restrict row_hv = newHV();
5746
2
          if (aliased[j]) {
5747
2
                   hv_store(row_hv, "Estimate",   8,  newSVpv("NaN", 0), 0);
5748
2
                   hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
5749
2
                   hv_store(row_hv, "t value",    7,  newSVpv("NaN", 0), 0);
5750
2
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVpv("NaN", 0), 0);
5751
2
          } else {
5752                   double se    = sqrt(rse_sq * XtX[j * p + j]);
5753
0
                   double t_val = (se > 0.0) ? (beta[j] / se) : (INFINITY * (beta[j] >= 0.0 ? 1.0 : -1.0));
5754                   double p_val = get_t_pvalue(t_val, df_res, "two.sided");
5755
3
                   hv_store(row_hv, "Estimate",   8,  newSVnv(beta[j]), 0);
5756                   hv_store(row_hv, "Std. Error", 10, newSVnv(se),      0);
5757
3
                   hv_store(row_hv, "t value",    7,  newSVnv(t_val),   0);
5758
3
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVnv(p_val),   0);
5759
3
          }
5760
3
          hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
5761
3
        }
5762
5763
3
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv),   0);
5764
3
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
5765
3
        hv_store(res_hv, "residuals",      9, newRV_noinc((SV*)resid_hv),  0);
5766
3
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res),             0);
5767
3
        hv_store(res_hv, "rank",           4, newSVuv(final_rank),         0);
5768
3
        hv_store(res_hv, "rss",            3, newSVnv(rss),                0);
5769
3
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv),0);
5770
0
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av),  0);
5771        hv_store(res_hv, "r.squared",      9, newSVnv(r_squared),          0);
5772
3
        hv_store(res_hv, "adj.r.squared", 13, newSVnv(adj_r_squared),      0);
5773
3
        if (!isnan(f_stat)) {
5774
3
          AV *fstat_av = newAV();
5775
3
          av_push(fstat_av, newSVnv(f_stat));
5776
3
          av_push(fstat_av, newSViv(numdf));
5777
3
          av_push(fstat_av, newSViv(df_res));
5778
3
          hv_store(res_hv, "fstatistic", 10, newRV_noinc((SV*)fstat_av), 0);
5779
3
          hv_store(res_hv, "f.pvalue",    8, newSVnv(f_pvalue),          0);
5780
3
        }
5781
5782
3
        // Deep Cleanup
5783
3
        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5784
3
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5785
3
        for (j = 0; j < p_exp; j++) {
5786
3
          Safefree(exp_terms[j]);
5787
3
          if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5788
3
        }
5789
3
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5790
3
        Safefree(X); Safefree(Y); Safefree(XtX); Safefree(XtY);
5791        Safefree(beta); Safefree(aliased);
5792
0
        if (row_hashes) Safefree(row_hashes);
5793
5794        RETVAL = newRV_noinc((SV*)res_hv);
5795}
5796
5
OUTPUT:
5797    RETVAL
5798
5799void seq(from, to, by = 1.0)
5800        double from
5801
5
        double to
5802
5
        double by
5803
5
PPCODE:
5804
5
        {
5805
5
                //Handle the zero 'by' case
5806
5
                if (by == 0.0) {
5807
5
                        if (from == to) {
5808
5
                                 EXTEND(SP, 1);
5809
5
                                 mPUSHn(from);
5810
5
                                 XSRETURN(1);
5811
5
                        } else {
5812                                 croak("invalid 'by' argument: cannot be zero when from != to");
5813
5
                        }
5814                }
5815                // Check for wrong direction / infinite loop
5816                if ((from < to && by < 0.0) || (from > to && by > 0.0)) {
5817                        croak("wrong sign in 'by' argument");
5818                }
5819                /* * Calculate number of elements.
5820                * R uses a small epsilon (like 1e-10) to avoid dropping the last
5821
7
                * element due to floating point inaccuracies.
5822
7
                */
5823
7
                double n_elements_d = (to - from) / by;
5824
7
                if (n_elements_d < 0.0) n_elements_d = 0.0;
5825
7
                size_t n_elements = (n_elements_d + 1e-10) + 1;
5826                // Pre-extend the stack to avoid reallocating inside the loop
5827
7
                EXTEND(SP, n_elements);
5828
7
                for (size_t i = 0; i < n_elements; i++) {
5829
7
                        mPUSHn(from + i * by);
5830
7
                }
5831                XSRETURN(n_elements);
5832
7
        }
5833
5834
27
SV* rnorm(...)
5835
27
        CODE:
5836        {
5837
27
          // Auto-seed the PRNG if the Perl script hasn't done so yet
5838
26
          AUTO_SEED_PRNG();
5839
5840
12
          size_t n = 0;
5841
11
          double mean = 0.0, sd = 1.0;
5842
5
          int arg_start = 0;
5843
5844
0
          // Check if the first argument is a simple integer (rnorm(33))
5845
0
          if (items > 0 && SvIOK(ST(0)) && (items == 1 || items % 2 != 0)) {
5846
0
                   n = (unsigned int)SvUV(ST(0));
5847                   arg_start = 1; // Start parsing named arguments from the second element
5848          }
5849
5850
7
          // --- Parse remaining named arguments from the flat stack ---
5851
7
          if ((items - arg_start) % 2 != 0) {
5852
7
                   croak("Usage: rnorm(n), rnorm(n => 10, mean => 0, sd => 1), or rnorm(33, mean => 0)");
5853
7
          }
5854
5855
7
          for (int i = arg_start; i < items; i += 2) {
5856
7
                   const char* restrict key = SvPV_nolen(ST(i));
5857
7
                   SV* restrict val = ST(i + 1);
5858
5859
7
                   if      (strEQ(key, "n"))    n    = (unsigned int)SvUV(val);
5860
7
                   else if (strEQ(key, "mean")) mean = SvNV(val);
5861                   else if (strEQ(key, "sd"))   sd   = SvNV(val);
5862
7
                   else croak("rnorm: unknown argument '%s'", key);
5863
0
          }
5864
5865          if (sd < 0.0) croak("rnorm: standard deviation must be non-negative");
5866
5867
7
          AV *restrict result_av = newAV();
5868
7
          if (n > 0) {
5869
7
                   av_extend(result_av, n - 1);
5870
7
                   // Generate random normals using the Box-Muller transform
5871
7
                   for (size_t i = 0; i < n; ) {
5872
7
                        double u, v, s;
5873
7
                        do {
5874
7
                            // Drand01() hooks into Perl's internal PRNG, respecting Perl's srand()
5875
1
                            u = 2.0 * Drand01() - 1.0;
5876
6
                            v = 2.0 * Drand01() - 1.0;
5877
6
                            s = u * u + v * v;
5878
6
                        } while (s >= 1.0 || s == 0.0);
5879
5880
222
                        double mul = sqrt(-2.0 * log(s) / s);
5881
222
                        // Box-Muller generates two independent values per iteration
5882
173
                        av_store(result_av, i++, newSVnv(mean + sd * u * mul));
5883                        if (i < n) {
5884
6
                            av_store(result_av, i++, newSVnv(mean + sd * v * mul));
5885
0
                        }
5886
0
                   }
5887
0
          }
5888
0
          RETVAL = newRV_noinc((SV*)result_av);
5889
0
        }
5890
0
        OUTPUT:
5891        RETVAL
5892
5893
0
SV* aov(data_sv, formula_sv = &PL_sv_undef)
5894
0
        SV* data_sv
5895
0
        SV* formula_sv
5896
0
        CODE:
5897
0
        {
5898
0
        const char *restrict formula;
5899
0
        SV *restrict orig_data_sv = data_sv;
5900        bool is_stacked = FALSE;
5901
5902
0
        // ========================================================================
5903
0
        // PHASE 0: R-style stack() for missing formula
5904
0
        // ========================================================================
5905
0
        if (!formula_sv || !SvOK(formula_sv) || SvCUR(formula_sv) == 0) {
5906
0
                 if (!SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVHV) {
5907
0
                     croak("aov: Without a formula, data must be a HashRef of ArrayRefs (mimicking R's named list)");
5908                 }
5909
5910                 is_stacked = TRUE;
5911
7
                 HV *restrict input_hv = (HV*)SvRV(data_sv);
5912
7
                 HV *restrict stacked_hv = newHV();
5913
7
                 AV *restrict val_av = newAV();
5914
7
                 AV *restrict grp_av = newAV();
5915
5916
7
                 hv_iterinit(input_hv);
5917
7
                 HE *restrict entry;
5918
7
                 while ((entry = hv_iternext(input_hv))) {
5919
7
                     SV *restrict grp_name_sv = hv_iterkeysv(entry);
5920
7
                     SV *restrict arr_ref = hv_iterval(input_hv, entry);
5921
5922
7
                     if (SvROK(arr_ref) && SvTYPE(SvRV(arr_ref)) == SVt_PVAV) {
5923                         AV *restrict arr = (AV*)SvRV(arr_ref);
5924                         size_t len = av_len(arr);
5925                         for (size_t k = 0; k <= len; k++) {
5926                             SV **restrict v = av_fetch(arr, k, 0);
5927                             if (v && *v && SvOK(*v)) {
5928                                 av_push(val_av, newSVsv(*v));
5929                                 av_push(grp_av, newSVsv(grp_name_sv));
5930
3
                             }
5931
3
                         }
5932                     } else {
5933                         SvREFCNT_dec(val_av); SvREFCNT_dec(grp_av); SvREFCNT_dec(stacked_hv);
5934                         croak("aov: Hash values must be ArrayRefs when no formula is provided");
5935
3
                     }
5936
2
                 }
5937
5938
1
                 hv_stores(stacked_hv, "Value", newRV_noinc((SV*)val_av));
5939
1
                 hv_stores(stacked_hv, "Group", newRV_noinc((SV*)grp_av));
5940
5941                 // sv_2mortal ensures memory is freed automatically on return or croak
5942                 data_sv = sv_2mortal(newRV_noinc((SV*)stacked_hv));
5943
3
                 formula = "Value~Group";
5944
2
        } else {
5945
1
                 formula = SvPV_nolen(formula_sv);
5946
1
        }
5947
5948        char f_cpy[512];
5949
5
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
5950
5951
2
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL, **restrict parent_term = NULL;
5952
2
        bool *restrict is_dummy = NULL, *is_interact = NULL;
5953
1
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
5954
0
        int *restrict term_map = NULL, *restrict left_idx = NULL, *restrict right_idx = NULL;
5955
0
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
5956        size_t n = 0, valid_n = 0, i, j;
5957        bool has_intercept = TRUE;
5958
5959
0
        char **restrict row_names = NULL;
5960        HV **restrict row_hashes = NULL;
5961        HV *restrict data_hoa = NULL;
5962        SV *restrict ref = NULL;
5963        HE *restrict entry;
5964
3
        double **restrict X_mat = NULL;
5965
3
        double *restrict Y = NULL;
5966
5967
3
        char **restrict term_base_level = NULL;  /* reference level for each uniq_term (NULL if not categorical) */
5968        if (!SvROK(data_sv)) croak("aov: data is required and must be a reference");
5969
5970        // ========================================================================
5971        // PHASE 1: Data Extraction
5972        // ========================================================================
5973
3
        ref = SvRV(data_sv);
5974
1
        if (SvTYPE(ref) == SVt_PVHV) {
5975
0
                HV*restrict hv = (HV*)ref;
5976
1
                if (hv_iterinit(hv) == 0) croak("aov: Data hash is empty");
5977                entry = hv_iternext(hv);
5978                if (entry) {
5979
1
                         SV*restrict val = hv_iterval(hv, entry);
5980
1
                         if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
5981                                  data_hoa = hv;
5982
4
                                  n = av_len((AV*)SvRV(val)) + 1;
5983
3
                                  Newx(row_names, n, char*);
5984
3
                                  for(i = 0; i < n; i++) {
5985
0
                                      char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i+1));
5986
3
                                      row_names[i] = savepv(buf);
5987                                  }
5988
1
                         } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
5989                                  n = hv_iterinit(hv);
5990
1
                                  Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
5991                                  i = 0;
5992
1
                                  while ((entry = hv_iternext(hv))) {
5993
1
                                      I32 len;
5994                                      row_names[i] = savepv(hv_iterkey(entry, &len));
5995                                      row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
5996
1
                                      i++;
5997
1
                                  }
5998
4
                         } else croak("aov: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
5999                }
6000
3
        } else if (SvTYPE(ref) == SVt_PVAV) {
6001
3
                AV*restrict av = (AV*)ref;
6002                n = av_len(av) + 1;
6003
3
                Newx(row_names, n, char*);
6004
3
                Newx(row_hashes, n, HV*);
6005
17
                for (i = 0; i < n; i++) {
6006
14
                         SV**restrict val = av_fetch(av, i, 0);
6007
14
                         if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
6008
14
                                  row_hashes[i] = (HV*)SvRV(*val);
6009
14
                                  char buf[32];
6010
14
                                  snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
6011                                  row_names[i] = savepv(buf);
6012                         } else {
6013
3
                                  for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
6014                                  Safefree(row_names); Safefree(row_hashes);
6015
1
                                  croak("aov: Array values must be HashRefs (AoH)");
6016                         }
6017                }
6018        } else croak("aov: Data must be an Array or Hash reference");
6019
6020        // ========================================================================
6021
2
        // PHASE 2: Formula Parsing & `.` Expansion
6022
0
        // ========================================================================
6023
2
        src = (char*)formula; dst = f_cpy;
6024
0
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
6025        *dst = '\0';
6026
6027
2
        tilde = strchr(f_cpy, '~');
6028
2
        if (!tilde) {
6029
2
                  for (i = 0; i < n; i++) Safefree(row_names[i]);
6030
2
                  Safefree(row_names); if (row_hashes) Safefree(row_hashes);
6031
2
                  croak("aov: invalid formula, missing '~'");
6032        }
6033
2
        *tilde = '\0';
6034
2
        lhs = f_cpy;
6035        rhs = tilde + 1;
6036
6037
2
        char *restrict p_idx;
6038
2
        while ((p_idx = strstr(rhs, "-1")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6039        while ((p_idx = strstr(rhs, "+0")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6040
30
        while ((p_idx = strstr(rhs, "0+")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6041
28
        if (rhs[0] == '0' && rhs[1] == '\0')        { has_intercept = FALSE; rhs[0] = '\0'; }
6042
28
        while ((p_idx = strstr(rhs, "+1")) != NULL) { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6043
28
        if (rhs[0] == '1' && rhs[1] == '\0')        { rhs[0] = '\0'; }
6044
28
        else if (rhs[0] == '1' && rhs[1] == '+')    { memmove(rhs, rhs + 2, strlen(rhs + 2) + 1); }
6045
6046
28
        while ((p_idx = strstr(rhs, "++")) != NULL) memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
6047
28
        if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
6048        size_t len_rhs = strlen(rhs);
6049
28
        if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
6050
6051        char rhs_expanded[2048] = "";
6052
6
        size_t rhs_len = 0;
6053
6
        chunk = strtok(rhs, "+");
6054
6
        while (chunk != NULL) {
6055                if (strcmp(chunk, ".") == 0) {
6056
28
                         AV *restrict cols = get_all_columns(data_hoa, row_hashes, n);
6057
28
                         for (size_t c = 0; c <= av_len(cols); c++) {
6058
28
                                  SV **restrict col_sv = av_fetch(cols, c, 0);
6059                                  if (col_sv && SvOK(*col_sv)) {
6060                                      const char *restrict col_name = SvPV_nolen(*col_sv);
6061
2
                                      if (strcmp(col_name, lhs) != 0) {
6062
2
                                          size_t slen = strlen(col_name);
6063                                          if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
6064                                              if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
6065                                              strcat(rhs_expanded, col_name);
6066                                              rhs_len += slen;
6067                                          }
6068
3
                                      }
6069
0
                                  }
6070
0
                         }
6071
0
                         SvREFCNT_dec(cols);
6072
0
                } else {
6073                         size_t slen = strlen(chunk);
6074
0
                         if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
6075
0
                                  if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
6076                                  strcat(rhs_expanded, chunk);
6077                                  rhs_len += slen;
6078                         }
6079
3
                }
6080
3
                chunk = strtok(NULL, "+");
6081        }
6082        // Setup arrays safely
6083
3
        Newx(terms, term_cap, char*);
6084
3
        Newx(uniq_terms, term_cap, char*);
6085
3
        Newx(exp_terms, exp_cap, char*); Newx(parent_term, exp_cap, char*);
6086        Newx(is_dummy, exp_cap, bool); Newx(is_interact, exp_cap, bool);
6087
45
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
6088
42
        Newx(term_map, exp_cap, int); Newx(left_idx, exp_cap, int); Newx(right_idx, exp_cap, int);
6089
42
        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
6090
42
        if (strlen(rhs_expanded) > 0) {
6091
42
                chunk = strtok(rhs_expanded, "+");
6092                while (chunk != NULL) {
6093                         if (num_terms >= term_cap - 3) {
6094                                  term_cap *= 2;
6095
3
                                  Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
6096
12
                         }
6097
9
                         char *restrict star = strchr(chunk, '*');
6098
9
                         if (star) {
6099
9
                                  *star = '\0';
6100                                  char *restrict left = chunk;
6101                                  char *right = star + 1;
6102
3
                                  char *restrict c_l = strchr(left, '^');
6103
3
                                  if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
6104
3
                                  char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
6105
0
                                  terms[num_terms++] = savepv(left);
6106
0
                                  terms[num_terms++] = savepv(right);
6107                                  size_t inter_len = strlen(left) + strlen(right) + 2;
6108
3
                                  terms[num_terms] = (char*)safemalloc(inter_len);
6109
3
                                  snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
6110                         } else {
6111                                  char *restrict c_chunk = strchr(chunk, '^');
6112
3
                                  if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
6113
3
                                  terms[num_terms++] = savepv(chunk);
6114
3
                         }
6115
3
                         chunk = strtok(NULL, "+");
6116
3
                }
6117
3
        }
6118
6119        for (i = 0; i < num_terms; i++) {
6120
3
                  bool found = FALSE;
6121
3
                  for (size_t k = 0; k < num_uniq; k++) {
6122
3
                        if (strcmp(terms[i], uniq_terms[k]) == 0) { found = TRUE; break; }
6123                  }
6124
12
                  if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
6125
9
        }
6126
9
        p = num_uniq;
6127
6128        Newxz(term_base_level, num_uniq, char*);
6129
6130
9
        /* PHASE 3: Categorical & Interaction Expansion */
6131        for (j = 0; j < p; j++) {
6132
9
                if (p_exp + 64 >= exp_cap) {
6133                        exp_cap *= 2;
6134                        Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
6135                        Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
6136
3
                        Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
6137
3
                        Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
6138
3
                }
6139
6140                if (strcmp(uniq_terms[j], "Intercept") == 0) {
6141
3
                        exp_terms[p_exp] = savepv("Intercept");
6142
3
                        parent_term[p_exp] = savepv("Intercept");
6143
3
                        is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
6144
3
                        term_map[p_exp] = j;
6145
3
                        p_exp++;
6146                        continue;
6147
3
                }
6148
6149                char *restrict colon = strchr(uniq_terms[j], ':');
6150                if (colon) {
6151                        char left[256], right[256];
6152                        strncpy(left, uniq_terms[j], colon - uniq_terms[j]);
6153                        left[colon - uniq_terms[j]] = '\0';
6154                        strcpy(right, colon + 1);
6155
6156
6
                        int *restrict l_indices = (int*)safemalloc(p_exp * sizeof(int)); int l_count = 0;
6157
6
                        int *restrict r_indices = (int*)safemalloc(p_exp * sizeof(int)); int r_count = 0;
6158
6
                        for (size_t e = 0; e < p_exp; e++) {
6159
6
                                if (strcmp(parent_term[e], left) == 0) l_indices[l_count++] = e;
6160                                if (strcmp(parent_term[e], right) == 0) r_indices[r_count++] = e;
6161                        }
6162
6163
6
                        if (l_count == 0 || r_count == 0) {
6164
6
                                Safefree(l_indices); Safefree(r_indices);
6165                                croak("aov: Interaction term '%s' requires its main effects to be explicitly included in the formula", uniq_terms[j]);
6166                        } else {
6167                                for (int li = 0; li < l_count; li++) {
6168
6
                                         for (int ri = 0; ri < r_count; ri++) {
6169
6
                                                  if (p_exp >= exp_cap) {
6170
6
                                                      exp_cap *= 2;
6171                                                      Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
6172                                                      Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
6173                                                      Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
6174
6
                                                      Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
6175
0
                                                  }
6176                                                  size_t t_len = strlen(exp_terms[l_indices[li]]) + strlen(exp_terms[r_indices[ri]]) + 2;
6177                                                  exp_terms[p_exp] = (char*)safemalloc(t_len);
6178                                                  snprintf(exp_terms[p_exp], t_len, "%s:%s", exp_terms[l_indices[li]], exp_terms[r_indices[ri]]);
6179
8
                                                  parent_term[p_exp] = savepv(uniq_terms[j]);
6180
2
                                                  is_dummy[p_exp] = FALSE; is_interact[p_exp] = TRUE;
6181
2
                                                  left_idx[p_exp] = l_indices[li];
6182                                                  right_idx[p_exp] = r_indices[ri];
6183
2
                                                  term_map[p_exp] = j;
6184
2
                                                  p_exp++;
6185
2
                                         }
6186
1
                                }
6187
0
                        }
6188
0
                        Safefree(l_indices); Safefree(r_indices);
6189                } else {
6190                        if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
6191                                char **restrict levels = NULL;
6192
6
                                unsigned int num_levels = 0, levels_cap = 8;
6193
0
                                Newx(levels, levels_cap, char*);
6194
6
                                for (i = 0; i < n; i++) {
6195
0
                                         char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
6196                                         if (str_val) {
6197
6
                                                  bool found = FALSE;
6198
0
                                                  for (size_t l = 0; l < num_levels; l++) {
6199
6
                                                      if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; }
6200
0
                                                  }
6201                                                  if (!found) {
6202
6
                                                      if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
6203
6
                                                      levels[num_levels++] = savepv(str_val);
6204
6
                                                  }
6205
6
                                                  Safefree(str_val);
6206                                         }
6207                                }
6208
6209
6
                                if (num_levels > 0) {
6210
32
                                         for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
6211
26
                                                  for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
6212
26
                                                      if (strcmp(levels[l1], levels[l2]) > 0) {
6213
26
                                                          char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
6214
26
                                                      }
6215
26
                                                  }
6216
26
                                         }
6217
6218
26
                                         term_base_level[j] = savepv(levels[0]);
6219
6220                                         for (size_t l = 1; l < num_levels; l++) {
6221                                                  if (p_exp >= exp_cap) {
6222                                                      exp_cap *= 2;
6223
6
                                                      Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
6224
6
                                                      Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
6225
27
                                                      Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
6226
21
                                                      Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
6227
21
                                                  }
6228
21
                                                  size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
6229
21
                                                  exp_terms[p_exp] = (char*)safemalloc(t_len);
6230
21
                                                  snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
6231
21
                                                  parent_term[p_exp] = savepv(uniq_terms[j]);
6232
21
                                                  is_dummy[p_exp] = TRUE; is_interact[p_exp] = FALSE;
6233
21
                                                  dummy_base[p_exp] = savepv(uniq_terms[j]);
6234                                                  dummy_level[p_exp] = savepv(levels[l]);
6235                                                  term_map[p_exp] = j;
6236                                                  p_exp++;
6237                                         }
6238
6
                                         for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
6239
5
                                         Safefree(levels);
6240                                } else {
6241
4
                                         Safefree(levels);
6242
4
                                         exp_terms[p_exp] = savepv(uniq_terms[j]);
6243
4
                                         parent_term[p_exp] = savepv(uniq_terms[j]);
6244
4
                                         is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
6245                                         term_map[p_exp] = j;
6246
4
                                         p_exp++;
6247                                }
6248                        } else {
6249
3
                                exp_terms[p_exp] = savepv(uniq_terms[j]);
6250
3
                                parent_term[p_exp] = savepv(uniq_terms[j]);
6251                                is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
6252
3
                                term_map[p_exp] = j;
6253
3
                                p_exp++;
6254                        }
6255
3
                }
6256
0
        }
6257
3
        X_mat = (double**)safemalloc(n * sizeof(double*));
6258
0
        for(i = 0; i < n; i++) X_mat[i] = (double*)safemalloc(p_exp * sizeof(double));
6259
0
        Newx(Y, n, double);
6260
6261        /* PHASE 4: Matrix Construction & Listwise Deletion */
6262
3
        for (i = 0; i < n; i++) {
6263
3
                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
6264
3
                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
6265                bool row_ok = TRUE;
6266
3
                double *restrict row_x = (double*)safemalloc(p_exp * sizeof(double));
6267
3
                for (j = 0; j < p_exp; j++) {
6268
3
                          if (strcmp(exp_terms[j], "Intercept") == 0) {
6269                                   row_x[j] = 1.0;
6270                          } else if (is_interact[j]) {
6271                                   row_x[j] = row_x[left_idx[j]] * row_x[right_idx[j]];
6272
3
                          } else if (is_dummy[j]) {
6273
3
                                   char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
6274                                   if (str_val) {
6275
3
                                       row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
6276
3
                                       Safefree(str_val);
6277
3
                                   } else { row_ok = FALSE; break; }
6278
3
                          } else {
6279                                   row_x[j] = evaluate_term(data_hoa, row_hashes, i, parent_term[j]);
6280
3
                                   if (isnan(row_x[j])) { row_ok = FALSE; break; }
6281                          }
6282
3
                }
6283
3
                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
6284
3
                Y[valid_n] = y_val;
6285
3
                for (j = 0; j < p_exp; j++) X_mat[valid_n][j] = row_x[j];
6286                valid_n++;
6287
3
                Safefree(row_x);
6288
3
                Safefree(row_names[i]);
6289
3
        }
6290
3
        Safefree(row_names);
6291        if (valid_n <= p_exp) {
6292
3
                // Full Clean Up
6293                for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
6294                for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
6295                for (j = 0; j < p_exp; j++) {
6296                         Safefree(exp_terms[j]); Safefree(parent_term[j]);
6297                         if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
6298                }
6299                Safefree(exp_terms); Safefree(parent_term);
6300                Safefree(is_dummy); Safefree(is_interact);
6301                Safefree(dummy_base); Safefree(dummy_level);
6302                Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
6303                for(i = 0; i < n; i++) Safefree(X_mat[i]);
6304                Safefree(X_mat); Safefree(Y);
6305                if (row_hashes) Safefree(row_hashes);
6306                for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); }
6307                Safefree(term_base_level);
6308                croak("aov: 0 degrees of freedom (too many NAs or parameters > observations)");
6309        }
6310        /* PHASE 5: Math & Output Formatting */
6311        bool *restrict aliased_qr = (bool*)safemalloc(p_exp * sizeof(bool));
6312        size_t *restrict rank_map = (size_t*)safemalloc(p_exp * sizeof(size_t));
6313        apply_householder_aov(X_mat, Y, valid_n, p_exp, aliased_qr, rank_map);
6314        double *restrict term_ss;
6315        int *restrict term_df;
6316        Newxz(term_ss, num_uniq, double);
6317        Newxz(term_df, num_uniq, int);
6318        for (i = 0; i < p_exp; i++) {
6319                if (strcmp(exp_terms[i], "Intercept") == 0) continue;
6320                if (aliased_qr[i]) continue;
6321                int t_idx = term_map[i];
6322                size_t r_k = rank_map[i];
6323                term_ss[t_idx] += Y[r_k] * Y[r_k];
6324                term_df[t_idx] += 1;
6325        }
6326        int rank = 0;
6327        for (i = 0; i < p_exp; i++) {
6328                  if (!aliased_qr[i]) rank++;
6329        }
6330        double rss_prev = 0.0;
6331        for (i = rank; i < valid_n; i++) {
6332                  rss_prev += Y[i] * Y[i];
6333        }
6334        int res_df = valid_n - rank;
6335        double ms_res = (res_df > 0) ? rss_prev / res_df : 0.0;
6336        HV*restrict ret_hash = newHV();
6337        for (j = 0; j < num_uniq; j++) {
6338                  if (strcmp(uniq_terms[j], "Intercept") == 0) continue;
6339                  HV*restrict term_stats = newHV();
6340                  double ss = term_ss[j];
6341                  int df = term_df[j];
6342                  double ms = (df > 0) ? ss / df : 0.0;
6343
6344                  hv_stores(term_stats, "Df", newSViv(df));
6345                  hv_stores(term_stats, "Sum Sq", newSVnv(ss));
6346                  hv_stores(term_stats, "Mean Sq", newSVnv(ms));
6347                  if (ms_res > 0.0 && df > 0) {
6348                        double f_val = ms / ms_res;
6349                        hv_stores(term_stats, "F value", newSVnv(f_val));
6350                        hv_stores(term_stats, "Pr(>F)", newSVnv(1.0 - pf(f_val, (double)df, (double)res_df)));
6351                  } else {
6352                        hv_stores(term_stats, "F value", newSVnv(NAN));
6353                        hv_stores(term_stats, "Pr(>F)", newSVnv(NAN));
6354                  }
6355                  hv_store(ret_hash, uniq_terms[j], strlen(uniq_terms[j]), newRV_noinc((SV*)term_stats), 0);
6356        }
6357        HV*restrict res_stats = newHV();
6358        hv_stores(res_stats, "Df", newSViv(res_df));
6359        hv_stores(res_stats, "Sum Sq", newSVnv(rss_prev));
6360        hv_stores(res_stats, "Mean Sq", newSVnv(ms_res));
6361        hv_stores(ret_hash, "Residuals", newRV_noinc((SV*)res_stats));
6362        {
6363                  HV *restrict tgt_hoa = data_hoa;
6364                  HV **restrict tgt_row_hashes = row_hashes;
6365                  size_t tgt_n = n;
6366                  // Route evaluation to the original unstacked HoA when a formula was implied
6367                  if (is_stacked) {
6368                      tgt_hoa = (HV*)SvRV(orig_data_sv);
6369                      tgt_row_hashes = NULL;
6370                      hv_iterinit(tgt_hoa);
6371                      HE *restrict e = hv_iternext(tgt_hoa);
6372                      if (e) {
6373                          SV *val = hv_iterval(tgt_hoa, e);
6374                          if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
6375                              tgt_n = av_len((AV*)SvRV(val)) + 1;
6376                          }
6377                      }
6378                  }
6379                  AV *restrict all_cols = get_all_columns(tgt_hoa, tgt_row_hashes, tgt_n);
6380                  HV *restrict mean_hv  = newHV();
6381                  HV *restrict size_hv  = newHV();
6382                  for (size_t c = 0; c <= (size_t)av_len(all_cols); c++) {
6383                      SV **restrict col_sv = av_fetch(all_cols, c, 0);
6384                      if (!col_sv || !SvOK(*col_sv)) continue;
6385                      const char *restrict col_name = SvPV_nolen(*col_sv);
6386
6387                      double col_sum = 0.0;
6388                      IV      col_count = 0;
6389                      for (i = 0; i < tgt_n; i++) {
6390                          double val = evaluate_term(tgt_hoa, tgt_row_hashes, i, col_name);
6391                          if (!isnan(val)) { col_sum += val; col_count++; }
6392                      }
6393
6394                      double col_mean = (col_count > 0) ? col_sum / col_count : NAN;
6395                      hv_store(mean_hv, col_name, strlen(col_name), newSVnv(col_mean), 0);
6396                      hv_store(size_hv, col_name, strlen(col_name), newSViv(col_count), 0);
6397                  }
6398                  SvREFCNT_dec(all_cols);
6399                  HV *restrict gs_hv = newHV();
6400                  hv_stores(gs_hv, "mean", newRV_noinc((SV*)mean_hv));
6401                  hv_stores(gs_hv, "size", newRV_noinc((SV*)size_hv));
6402                  hv_stores(ret_hash, "group_stats", newRV_noinc((SV*)gs_hv));
6403        }
6404        /* Deep Cleanup */
6405        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
6406        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
6407        for (j = 0; j < p_exp; j++) {
6408                  Safefree(exp_terms[j]); Safefree(parent_term[j]);
6409                  if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
6410        }
6411        Safefree(exp_terms); Safefree(parent_term);
6412        Safefree(is_dummy); Safefree(is_interact);
6413        Safefree(dummy_base); Safefree(dummy_level);
6414        Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
6415        Safefree(term_ss); Safefree(term_df);
6416        for (i = 0; i < n; i++) Safefree(X_mat[i]);
6417        Safefree(X_mat); Safefree(Y);
6418        Safefree(aliased_qr); Safefree(rank_map);
6419        for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); }
6420        Safefree(term_base_level);
6421        if (row_hashes) Safefree(row_hashes);
6422        RETVAL = newRV_noinc((SV*)ret_hash);
6423        }
6424OUTPUT:
6425    RETVAL
6426
6427PROTOTYPES: DISABLE
6428
6429SV* fisher_test(...)
6430CODE:
6431{
6432        if (items < 1) croak("fisher_test requires at least a data reference");
6433
6434        SV*restrict data_ref = ST(0);
6435        double conf_level = 0.95;
6436        const char*restrict alternative = "two.sided";
6437
6438        // Parse named arguments
6439        for (unsigned short int i = 1; i < items; i += 2) {
6440                if (i + 1 >= items) croak("fisher_test: odd number of arguments");
6441                const char*restrict key = SvPV_nolen(ST(i));
6442                SV*restrict val = ST(i + 1);
6443                if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) {
6444                        conf_level = SvNV(val);
6445                } else if (strEQ(key, "alternative")) {
6446                        alternative = SvPV_nolen(val);
6447                }
6448        }
6449
6450        if (!SvROK(data_ref)) croak("fisher_test requires a reference to an Array or Hash");
6451        SV*restrict deref = SvRV(data_ref);
6452        size_t a = 0, b = 0, c = 0, d = 0;
6453        // Extract Data
6454        if (SvTYPE(deref) == SVt_PVAV) {
6455          AV*restrict outer = (AV*)deref;
6456          if (av_len(outer) != 1) croak("Outer array must have exactly 2 rows");
6457          SV**restrict row1_ptr = av_fetch(outer, 0, 0);
6458          SV**restrict row2_ptr = av_fetch(outer, 1, 0);
6459          if (row1_ptr && row2_ptr && SvROK(*row1_ptr) && SvROK(*row2_ptr)) {
6460                   AV*restrict row1 = (AV*)SvRV(*row1_ptr);
6461                   AV*restrict row2 = (AV*)SvRV(*row2_ptr);
6462                   SV**restrict a_ptr = av_fetch(row1, 0, 0);
6463                   SV**restrict b_ptr = av_fetch(row1, 1, 0);
6464                   SV**restrict c_ptr = av_fetch(row2, 0, 0);
6465                   SV**restrict d_ptr = av_fetch(row2, 1, 0);
6466                   a = (a_ptr && SvOK(*a_ptr)) ? SvIV(*a_ptr) : 0;
6467                   b = (b_ptr && SvOK(*b_ptr)) ? SvIV(*b_ptr) : 0;
6468                   c = (c_ptr && SvOK(*c_ptr)) ? SvIV(*c_ptr) : 0;
6469                   d = (d_ptr && SvOK(*d_ptr)) ? SvIV(*d_ptr) : 0;
6470          } else {
6471                  croak("Invalid 2D Array structure");
6472          }
6473        } else if (SvTYPE(deref) == SVt_PVHV) {
6474                // Fixed 2D Hash Logic: Sort keys lexically to enforce structured rows/columns
6475                HV*restrict outer = (HV*)deref;
6476                if (hv_iterinit(outer) != 2) croak("Outer hash must have exactly 2 keys");
6477                HE*restrict he1 = hv_iternext(outer);
6478                HE*restrict he2 = hv_iternext(outer);
6479                if (!he1 || !he2) croak("Invalid outer hash");
6480                const char*restrict k1 = SvPV_nolen(hv_iterkeysv(he1));
6481                const char*restrict k2 = SvPV_nolen(hv_iterkeysv(he2));
6482                HE*restrict row1_he = (strcmp(k1, k2) < 0) ? he1 : he2;
6483                HE*restrict row2_he = (strcmp(k1, k2) < 0) ? he2 : he1;
6484                SV*restrict row1_sv = hv_iterval(outer, row1_he);
6485                SV*restrict row2_sv = hv_iterval(outer, row2_he);
6486                if (!SvROK(row1_sv) || SvTYPE(SvRV(row1_sv)) != SVt_PVHV ||
6487                        !SvROK(row2_sv) || SvTYPE(SvRV(row2_sv)) != SVt_PVHV) {
6488                        croak("Inner elements must be hashes");
6489                }
6490                HV*restrict in1 = (HV*)SvRV(row1_sv);
6491                HV*restrict in2 = (HV*)SvRV(row2_sv);
6492                if (hv_iterinit(in1) != 2 || hv_iterinit(in2) != 2) croak("Inner hashes must have exactly 2 keys");
6493                HE*restrict in1_he1 = hv_iternext(in1);
6494                HE*restrict in1_he2 = hv_iternext(in1);
6495                const char*restrict in1_k1 = SvPV_nolen(hv_iterkeysv(in1_he1));
6496                const char*restrict in1_k2 = SvPV_nolen(hv_iterkeysv(in1_he2));
6497                HE*restrict in1_c1 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he1 : in1_he2;
6498                HE*restrict in1_c2 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he2 : in1_he1;
6499                HE*restrict in2_he1 = hv_iternext(in2);
6500                HE*restrict in2_he2 = hv_iternext(in2);
6501                const char*restrict in2_k1 = SvPV_nolen(hv_iterkeysv(in2_he1));
6502                const char*restrict in2_k2 = SvPV_nolen(hv_iterkeysv(in2_he2));
6503                HE*restrict in2_c1 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he1 : in2_he2;
6504                HE*restrict in2_c2 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he2 : in2_he1;
6505                a = (hv_iterval(in1, in1_c1) && SvOK(hv_iterval(in1, in1_c1))) ? SvIV(hv_iterval(in1, in1_c1)) : 0;
6506                b = (hv_iterval(in1, in1_c2) && SvOK(hv_iterval(in1, in1_c2))) ? SvIV(hv_iterval(in1, in1_c2)) : 0;
6507                c = (hv_iterval(in2, in2_c1) && SvOK(hv_iterval(in2, in2_c1))) ? SvIV(hv_iterval(in2, in2_c1)) : 0;
6508                d = (hv_iterval(in2, in2_c2) && SvOK(hv_iterval(in2, in2_c2))) ? SvIV(hv_iterval(in2, in2_c2)) : 0;
6509        } else {
6510          croak("Input must be a 2D Array or 2D Hash");
6511        }
6512
6513        // Perform Calculations via Helpers
6514        double p_val = exact_p_value(a, b, c, d, alternative);
6515        double mle_or, ci_low, ci_high;
6516        calculate_exact_stats(a, b, c, d, conf_level, alternative, &mle_or, &ci_low, &ci_high);
6517
6518        // Construct the Return HashRef purely in C
6519        HV*restrict ret_hash = newHV();
6520        hv_stores(ret_hash, "method", newSVpv("Fisher's Exact Test for Count Data", 0));
6521        hv_stores(ret_hash, "alternative", newSVpv(alternative, 0));
6522        AV*restrict ci_array = newAV();
6523        av_push(ci_array, newSVnv(ci_low));
6524        av_push(ci_array, newSVnv(ci_high));
6525        hv_stores(ret_hash, "conf_int", newRV_noinc((SV*)ci_array));
6526        HV*restrict est_hash = newHV();
6527        hv_stores(ret_hash, "estimate", newRV_noinc((SV*)est_hash));
6528        hv_stores(est_hash, "odds ratio", newSVnv(mle_or));
6529        hv_stores(ret_hash, "p_value", newSVnv(p_val));
6530        // Return the HashRef
6531        RETVAL = newRV_noinc((SV*)ret_hash);
6532}
6533OUTPUT:
6534  RETVAL
6535
6536SV* power_t_test(...)
6537CODE:
6538{
6539        SV*restrict sv_n = NULL;
6540        SV*restrict sv_delta = NULL;
6541        SV*restrict sv_sd = NULL;
6542        SV*restrict sv_sig_level = NULL;
6543        SV*restrict sv_power = NULL;
6544
6545        const char* restrict type = "two.sample";
6546        const char* restrict alternative = "two.sided";
6547        bool strict = FALSE;
6548        double tol = pow(2.2204460492503131e-16, 0.25);
6549
6550        if (items % 2 != 0) croak("Usage: power_t_test(n => 30, delta => 0.5, sd => 1.0, ...)");
6551        for (unsigned short int i = 0; i < items; i += 2) {
6552          const char* restrict key = SvPV_nolen(ST(i));
6553          SV* restrict val = ST(i+1);
6554
6555          if      (strEQ(key, "n"))           sv_n = val;
6556          else if (strEQ(key, "delta"))       sv_delta = val;
6557          else if (strEQ(key, "sd"))          sv_sd = val;
6558          else if (strEQ(key, "sig.level") || strEQ(key, "sig_level")) sv_sig_level = val;
6559          else if (strEQ(key, "power"))       sv_power = val;
6560          else if (strEQ(key, "type"))        type = SvPV_nolen(val);
6561          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
6562          else if (strEQ(key, "strict"))      strict = SvTRUE(val);
6563          else if (strEQ(key, "tol"))         tol = SvNV(val);
6564          else croak("power_t_test: unknown argument '%s'", key);
6565        }
6566
6567        bool is_null_n = (!sv_n || !SvOK(sv_n));
6568        bool is_null_delta = (!sv_delta || !SvOK(sv_delta));
6569        bool is_null_power = (!sv_power || !SvOK(sv_power));
6570        bool is_null_sd = (sv_sd && !SvOK(sv_sd));
6571        bool is_null_sig_level = (sv_sig_level && !SvOK(sv_sig_level));
6572
6573        unsigned int missing_count = 0;
6574        if (is_null_n) missing_count++;
6575        if (is_null_delta) missing_count++;
6576        if (is_null_power) missing_count++;
6577        if (is_null_sd) missing_count++;
6578        if (is_null_sig_level) missing_count++;
6579
6580        if (missing_count != 1) {
6581          croak("power_t_test: exactly one of 'n', 'delta', 'sd', 'power', and 'sig_level' must be undef/NULL");
6582        }
6583
6584        double n = is_null_n ? 0.0 : SvNV(sv_n);
6585        double delta = is_null_delta ? 0.0 : SvNV(sv_delta);
6586        double sd = (!sv_sd || is_null_sd) ? 1.0 : SvNV(sv_sd);
6587        double sig_level = (!sv_sig_level || is_null_sig_level) ? 0.05 : SvNV(sv_sig_level);
6588        double power = is_null_power ? 0.0 : SvNV(sv_power);
6589        short int tsample = (strEQ(type, "one.sample") || strEQ(type, "paired")) ? 1 : 2;
6590        short int tside = (strEQ(alternative, "one.sided") || strEQ(alternative, "greater") || strEQ(alternative, "less")) ? 1 : 2;
6591        if (tside == 2 && !is_null_delta) delta = fabs(delta);
6592        if (is_null_power) {
6593          power = p_body(n, delta, sd, sig_level, tsample, tside, strict);
6594        } else if (is_null_n) {
6595          double low = 2.0, high = 1e7;
6596          while (p_body(high, delta, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
6597          while (high - low > tol) {
6598                   double mid = low + (high - low) / 2.0;
6599                   if (p_body(mid, delta, sd, sig_level, tsample, tside, strict) < power) low = mid;
6600                   else high = mid;
6601          }
6602          n = low + (high - low) / 2.0;
6603        } else if (is_null_sd) {
6604          double low = delta * 1e-7, high = delta * 1e7;
6605          while (high - low > tol) {
6606                   double mid = low + (high - low) / 2.0;
6607                   if (p_body(n, delta, mid, sig_level, tsample, tside, strict) > power) low = mid;
6608                   else high = mid;
6609          }
6610          sd = low + (high - low) / 2.0;
6611        } else if (is_null_delta) {
6612          double low = sd * 1e-7, high = sd * 1e7;
6613          while (p_body(n, high, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
6614          while (high - low > tol) {
6615                   double mid = low + (high - low) / 2.0;
6616                   if (p_body(n, mid, sd, sig_level, tsample, tside, strict) < power) low = mid;
6617                   else high = mid;
6618          }
6619          delta = low + (high - low) / 2.0;
6620        } else if (is_null_sig_level) {
6621          double low = 1e-10, high = 1.0 - 1e-10;
6622          while (high - low > tol) {
6623                   double mid = low + (high - low) / 2.0;
6624                   if (p_body(n, delta, sd, mid, tsample, tside, strict) < power) low = mid;
6625                   else high = mid;
6626          }
6627          sig_level = low + (high - low) / 2.0;
6628        }
6629        HV*restrict ret = newHV();
6630        hv_stores(ret, "n", newSVnv(n));
6631        hv_stores(ret, "delta", newSVnv(delta));
6632        hv_stores(ret, "sd", newSVnv(sd));
6633        hv_stores(ret, "sig.level", newSVnv(sig_level));
6634        hv_stores(ret, "power", newSVnv(power));
6635        hv_stores(ret, "alternative", newSVpv(alternative, 0));
6636        const char*restrict m_str = (tsample == 1) ? (strEQ(type, "paired") ? "Paired t test power calculation" : "One-sample t test power calculation") : "Two-sample t test power calculation";
6637        hv_stores(ret, "method", newSVpv(m_str, 0));
6638        const char*restrict n_str = (tsample == 2) ? "n is number in *each* group" : (strEQ(type, "paired") ? "n is number of *pairs*, sd is std.dev. of *differences* within pairs" : "");
6639        if (n_str[0] != '\0') hv_stores(ret, "note", newSVpv(n_str, 0));
6640        RETVAL = newRV_noinc((SV*)ret);
6641}
6642OUTPUT:
6643    RETVAL
6644
6645SV* kruskal_test(...)
6646CODE:
6647{
6648        SV *restrict x_sv = NULL, *restrict g_sv = NULL, *restrict h_sv = NULL;
6649        unsigned int arg_idx = 0;
6650
6651        // 1. Shift positional arguments
6652        //    Accept either: (arrayref, arrayref) or (hashref)
6653        if (arg_idx < items && SvROK(ST(arg_idx))) {
6654          svtype t = SvTYPE(SvRV(ST(arg_idx)));
6655          if (t == SVt_PVAV) {
6656                   x_sv = ST(arg_idx++);
6657          } else if (t == SVt_PVHV) {
6658                   h_sv = ST(arg_idx++);          /* hash-of-arrays shortcut */
6659          }
6660        }
6661        if (!h_sv && arg_idx < items
6662                     && SvROK(ST(arg_idx))
6663                     && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6664          g_sv = ST(arg_idx++);
6665        }
6666        // 2. Parse named arguments (fallback)
6667        for (; arg_idx < items; arg_idx += 2) {
6668          const char *restrict key = SvPV_nolen(ST(arg_idx));
6669          SV         *restrict val = ST(arg_idx + 1);
6670          if      (strEQ(key, "x")) x_sv = val;
6671          else if (strEQ(key, "g")) g_sv = val;
6672          else if (strEQ(key, "h")) h_sv = val;
6673          else croak("kruskal_test: unknown argument '%s'", key);
6674        }
6675        // 3. Mutual-exclusion guard
6676        if (h_sv && (x_sv || g_sv))
6677          croak("kruskal_test: cannot mix 'h' (hash-of-arrays) with 'x'/'g' inputs");
6678
6679        /* ------------------------------------------------------------------ */
6680        /* Shared state filled by whichever input branch runs                 */
6681        /* ------------------------------------------------------------------ */
6682        RankInfo *restrict ri = NULL;
6683        char **restrict group_names = NULL; /* Track names to build group_stats */
6684        size_t valid_n = 0;
6685        size_t k       = 0;
6686
6687        /* ------------------------------------------------------------------ */
6688        /* 4a. Hash-of-arrays input path                                      */
6689        /*     my %x = ( group1 => [...], group2 => [...], ... )              */
6690        /* ------------------------------------------------------------------ */
6691        if (h_sv) {
6692          if (!SvROK(h_sv) || SvTYPE(SvRV(h_sv)) != SVt_PVHV)
6693                   croak("kruskal_test: 'h' must be a HASH reference");
6694          HV *restrict h_hv = (HV*)SvRV(h_sv);
6695          // First pass – validate values and tally total elements
6696          size_t total = 0;
6697          hv_iterinit(h_hv);
6698          HE *restrict he;
6699          while ((he = hv_iternext(h_hv))) {
6700                   SV *restrict val = HeVAL(he);
6701                   if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
6702                       croak("kruskal_test: every value in 'h' must be an ARRAY reference");
6703                   total += (size_t)(av_len((AV*)SvRV(val)) + 1);
6704          }
6705          if (total < 2) croak("not enough observations");
6706
6707          ri = (RankInfo *)safemalloc(total * sizeof(RankInfo));
6708          size_t num_keys = HvKEYS(h_hv);
6709          group_names = (char **)safecalloc(num_keys, sizeof(char*));
6710          /* Second pass – fill ri[], assigning one group_id per hash key */
6711          size_t group_id = 0;
6712          hv_iterinit(h_hv);
6713          while ((he = hv_iternext(h_hv))) {
6714                   STRLEN klen;
6715                   const char *restrict key_str = HePV(he, klen);
6716                   group_names[group_id] = savepvn(key_str, klen); // Save string key
6717
6718                   AV *restrict av  = (AV*)SvRV(HeVAL(he));
6719                   size_t       n_g = (size_t)(av_len(av) + 1);
6720                   for (size_t i = 0; i < n_g; i++) {
6721                       SV **restrict el = av_fetch(av, i, 0);
6722                       if (el && SvOK(*el) && looks_like_number(*el)) {
6723                           ri[valid_n].val = SvNV(*el);
6724                           ri[valid_n].idx = group_id;   /* group identity */
6725                           valid_n++;
6726                       }
6727                   }
6728                   group_id++;
6729          }
6730          k = group_id;   /* number of unique groups = number of hash keys */
6731
6732        /* ------------------------------------------------------------------ */
6733        /* 4b. Original x / g array-pair input path                           */
6734        /* ------------------------------------------------------------------ */
6735        } else {
6736          if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6737                   croak("kruskal_test: 'x' is a required argument and must be an ARRAY reference");
6738          if (!g_sv || !SvROK(g_sv) || SvTYPE(SvRV(g_sv)) != SVt_PVAV)
6739                   croak("kruskal_test: 'g' is a required argument and must be an ARRAY reference");
6740
6741          AV *restrict x_av = (AV*)SvRV(x_sv);
6742          AV *restrict g_av = (AV*)SvRV(g_sv);
6743          size_t nx = (size_t)(av_len(x_av) + 1);
6744          size_t ng = (size_t)(av_len(g_av) + 1);
6745          if (nx != ng) croak("kruskal_test: 'x' and 'g' must have the same length");
6746          if (nx < 2)   croak("not enough observations");
6747
6748          ri = (RankInfo *)safemalloc(nx * sizeof(RankInfo));
6749          group_names = (char **)safecalloc(nx, sizeof(char*)); // Upper bound
6750
6751          // Map string group names → contiguous integer IDs
6752          HV *restrict group_map    = newHV();
6753          size_t          next_group_id = 0;
6754
6755          for (size_t i = 0; i < nx; i++) {
6756                   SV **restrict x_el = av_fetch(x_av, i, 0);
6757                   SV **restrict g_el = av_fetch(g_av, i, 0);
6758                   if (x_el && SvOK(*x_el) && looks_like_number(*x_el)
6759                            && g_el && SvOK(*g_el)) {
6760                       const char *restrict g_str = SvPV_nolen(*g_el);
6761                       STRLEN               glen  = strlen(g_str);
6762                       SV   **restrict id_sv = hv_fetch(group_map, g_str, glen, 0);
6763                       size_t group_id;
6764                       if (id_sv) {
6765                           group_id = SvUV(*id_sv);
6766                       } else {
6767                           group_id = next_group_id++;
6768                           hv_store(group_map, g_str, glen, newSVuv(group_id), 0);
6769                           group_names[group_id] = savepvn(g_str, glen); // Save string key
6770                       }
6771                       ri[valid_n].val = SvNV(*x_el);
6772                       ri[valid_n].idx = group_id;
6773                       valid_n++;
6774                   }
6775          }
6776          k = next_group_id;
6777          SvREFCNT_dec(group_map);
6778        }
6779
6780        /* ------------------------------------------------------------------ */
6781        /* 5. Shared post-extraction validation                               */
6782        /* ------------------------------------------------------------------ */
6783        if (valid_n < 2 || k < 2) {
6784          Safefree(ri);
6785          if (group_names) {
6786                   for (size_t i = 0; i < k; i++) { if (group_names[i]) Safefree(group_names[i]); }
6787                   Safefree(group_names);
6788          }
6789          if (valid_n < 2) croak("not enough observations");
6790          croak("all observations are in the same group");
6791        }
6792
6793        // 6. Ranking and Tie Accumulation (Reusing LikeR Helper)
6794        bool   has_ties = 0;
6795        double tie_adj  = rank_and_count_ties(ri, valid_n, &has_ties);
6796
6797        // 7. Aggregate Sum of Ranks AND Actual Values by Group
6798        double *restrict group_rank_sums = (double *)safecalloc(k, sizeof(double));
6799        double *restrict group_val_sums  = (double *)safecalloc(k, sizeof(double)); // For Mean
6800        size_t *restrict group_counts    = (size_t *)safecalloc(k, sizeof(size_t));
6801
6802        for (size_t i = 0; i < valid_n; i++) {
6803          size_t g_id = ri[i].idx;
6804          group_rank_sums[g_id] += ri[i].rank;
6805          group_val_sums[g_id]  += ri[i].val;
6806          group_counts[g_id]++;
6807        }
6808
6809        // 8. Calculate STATISTIC
6810        double stat_base = 0.0;
6811        for (size_t i = 0; i < k; i++) {
6812          if (group_counts[i] > 0)
6813                   stat_base += (group_rank_sums[i] * group_rank_sums[i])
6814                                / (double)group_counts[i];
6815        }
6816
6817        double n_d  = (double)valid_n;
6818        double stat = (12.0 * stat_base / (n_d * (n_d + 1.0))) - 3.0 * (n_d + 1.0);
6819        if (tie_adj > 0.0) {
6820          double tie_denom = 1.0 - (tie_adj / (n_d * n_d * n_d - n_d));
6821          stat /= tie_denom;
6822        }
6823        int    df    = (int)k - 1;
6824        double p_val = get_p_value(stat, df);
6825
6826        // 9. Return structured data exactly like R's htest
6827        HV *restrict res = newHV();
6828        hv_stores(res, "statistic", newSVnv(stat));
6829        hv_stores(res, "parameter", newSViv(df));
6830        hv_stores(res, "p_value",   newSVnv(p_val));
6831        hv_stores(res, "p.value",   newSVnv(p_val));
6832        hv_stores(res, "method",    newSVpv("Kruskal-Wallis rank sum test", 0));
6833
6834        // 10. Build the group_stats hash
6835        HV *restrict group_stats = newHV();
6836        HV *restrict stats_mean  = newHV();
6837        HV *restrict stats_size  = newHV();
6838
6839        for (size_t i = 0; i < k; i++) {
6840          if (group_counts[i] > 0 && group_names[i]) {
6841                   double mean = group_val_sums[i] / (double)group_counts[i];
6842                   size_t nlen = strlen(group_names[i]);
6843
6844                   hv_store(stats_mean, group_names[i], nlen, newSVnv(mean), 0);
6845                   hv_store(stats_size, group_names[i], nlen, newSVuv(group_counts[i]), 0);
6846          }
6847          if (group_names[i]) Safefree(group_names[i]); // Clean up name copy
6848        }
6849
6850        // Embed the nested hashes
6851        hv_stores(group_stats, "mean", newRV_noinc((SV*)stats_mean));
6852        hv_stores(group_stats, "size", newRV_noinc((SV*)stats_size));
6853        hv_stores(res, "group_stats",  newRV_noinc((SV*)group_stats));
6854
6855        // Memory Cleanup
6856        Safefree(group_names);    Safefree(group_rank_sums);
6857        Safefree(group_val_sums); Safefree(group_counts);
6858        Safefree(ri);
6859
6860        RETVAL = newRV_noinc((SV*)res);
6861}
6862OUTPUT:
6863    RETVAL
6864
6865SV* var_test(...)
6866CODE:
6867{
6868        SV* restrict x_sv = NULL;
6869        SV* restrict y_sv = NULL;
6870        double ratio = 1.0, conf_level = 0.95;
6871        const char* restrict alternative = "two.sided";
6872        unsigned int arg_idx = 0;
6873
6874        // 1. Shift positional argument 'x' if it's an array reference
6875        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6876          x_sv = ST(arg_idx);
6877          arg_idx++;
6878        }
6879
6880        // 2. Shift positional argument 'y' if it's an array reference
6881        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6882          y_sv = ST(arg_idx);
6883          arg_idx++;
6884        }
6885
6886        // Ensure the remaining arguments form complete key-value pairs
6887        if ((items - arg_idx) % 2 != 0) {
6888          croak("Usage: var_test(\\@x, \\@y, key => value, ...)");
6889        }
6890
6891        // --- Parse named arguments from the remaining flat stack ---
6892        for (; arg_idx < items; arg_idx += 2) {
6893          const char* restrict key = SvPV_nolen(ST(arg_idx));
6894          SV* restrict val = ST(arg_idx + 1);
6895
6896          if      (strEQ(key, "x"))           x_sv        = val;
6897          else if (strEQ(key, "y"))           y_sv        = val;
6898          else if (strEQ(key, "ratio"))       ratio       = SvNV(val);
6899          else if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) conf_level = SvNV(val);
6900          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
6901          else croak("var_test: unknown argument '%s'", key);
6902        }
6903
6904        // --- Validate required inputs / types ---
6905        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6906          croak("var_test: 'x' is a required argument and must be an ARRAY reference");
6907        if (!y_sv || !SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV)
6908          croak("var_test: 'y' is a required argument and must be an ARRAY reference");
6909
6910        if (ratio <= 0.0 || !isfinite(ratio))
6911          croak("var_test: 'ratio' must be a single positive number");
6912        if (conf_level <= 0.0 || conf_level >= 1.0 || !isfinite(conf_level))
6913          croak("var_test: 'conf.level' must be a single number between 0 and 1");
6914
6915        AV* restrict x_av = (AV*)SvRV(x_sv);
6916        AV* restrict y_av = (AV*)SvRV(y_sv);
6917        size_t nx_raw = av_len(x_av) + 1;
6918        size_t ny_raw = av_len(y_av) + 1;
6919
6920        // --- Computation via Welford's Algorithm (ignoring NaNs) ---
6921        double mean_x = 0.0, M2_x = 0.0;
6922        size_t nx = 0;
6923        for (size_t i = 0; i < nx_raw; i++) {
6924                SV** restrict tv = av_fetch(x_av, i, 0);
6925                if (tv && SvOK(*tv) && looks_like_number(*tv)) {
6926                        double val = SvNV(*tv);
6927                        if (!isnan(val) && isfinite(val)) {
6928                                nx++;
6929                                double delta = val - mean_x;
6930                                mean_x += delta / nx;
6931                                M2_x += delta * (val - mean_x);
6932                        }
6933                }
6934        }
6935
6936        double mean_y = 0.0, M2_y = 0.0;
6937        size_t ny = 0;
6938        for (size_t i = 0; i < ny_raw; i++) {
6939          SV** restrict tv = av_fetch(y_av, i, 0);
6940          if (tv && SvOK(*tv) && looks_like_number(*tv)) {
6941                   double val = SvNV(*tv);
6942                   if (!isnan(val) && isfinite(val)) {
6943                       ny++;
6944                       double delta = val - mean_y;
6945                       mean_y += delta / ny;
6946                       M2_y += delta * (val - mean_y);
6947                   }
6948          }
6949        }
6950
6951        if (nx < 2) croak("not enough 'x' observations");
6952        if (ny < 2) croak("not enough 'y' observations");
6953
6954        double df_x = (double)(nx - 1);
6955        double df_y = (double)(ny - 1);
6956        double var_x = M2_x / df_x;
6957        double var_y = M2_y / df_y;
6958
6959        if (var_y == 0.0) croak("var_test: variance of 'y' is zero (cannot divide by zero)");
6960
6961        // --- Statistics Math ---
6962        double estimate = var_x / var_y;
6963        double statistic = estimate / ratio;
6964
6965        double p_val = pf(statistic, df_x, df_y);
6966        double ci_lower = 0.0, ci_upper = INFINITY;
6967
6968        if (strcmp(alternative, "less") == 0) {
6969          ci_upper = estimate / qf_bisection(1.0 - conf_level, df_x, df_y);
6970        } else if (strcmp(alternative, "greater") == 0) {
6971          p_val = 1.0 - p_val;
6972          ci_lower = estimate / qf_bisection(conf_level, df_x, df_y);
6973        } else {
6974          // two.sided
6975          double p1 = p_val;
6976          double p2 = 1.0 - p_val;
6977          p_val = 2.0 * (p1 < p2 ? p1 : p2);
6978
6979          double beta = (1.0 - conf_level) / 2.0;
6980          ci_lower = estimate / qf_bisection(1.0 - beta, df_x, df_y);
6981          ci_upper = estimate / qf_bisection(beta, df_x, df_y);
6982        }
6983
6984        // --- Pack Results ---
6985        HV* restrict results = newHV();
6986        hv_store(results, "statistic", 9, newSVnv(statistic), 0);
6987
6988        AV* restrict param_av = newAV();
6989        av_push(param_av, newSVnv(df_x));
6990        av_push(param_av, newSVnv(df_y));
6991        hv_store(results, "parameter", 9, newRV_noinc((SV*)param_av), 0);
6992
6993        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
6994
6995        AV* restrict conf_int = newAV();
6996        av_push(conf_int, newSVnv(ci_lower));
6997        av_push(conf_int, newSVnv(ci_upper));
6998        hv_store(results, "conf_int", 8, newRV_noinc((SV*)conf_int), 0);
6999
7000        hv_store(results, "estimate", 8, newSVnv(estimate), 0);
7001        hv_store(results, "null_value", 10, newSVnv(ratio), 0);
7002        hv_store(results, "alternative", 11, newSVpv(alternative, 0), 0);
7003        hv_store(results, "method", 6, newSVpv("F test to compare two variances", 0), 0);
7004
7005        RETVAL = newRV_noinc((SV*)results);
7006}
7007OUTPUT:
7008    RETVAL
7009
7010SV *sample(ref, n = 1)
7011    SV *ref
7012    IV n
7013PREINIT:
7014    SV *restrict ret = &PL_sv_undef;
7015CODE:
7016        if (!PL_srand_called) {
7017          (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX));
7018          PL_srand_called = TRUE;
7019        }
7020        if (n < 0) n = 0;
7021        if (SvROK(ref)) {
7022                SV *restrict rv = SvRV(ref);
7023                /* --- HASH REFERENCE --- */
7024                if (SvTYPE(rv) == SVt_PVHV) {
7025                        HV *restrict hv    = (HV *)rv;
7026                        I32 count = hv_iterinit(hv);
7027                        I32 limit = (n < (IV)count) ? (I32)n : count;
7028                        HV *restrict ret_hv = newHV();
7029
7030                        if (count > 0 && limit > 0) {
7031                                 HE **restrict entries;
7032                                 HE  *restrict entry;
7033                                 unsigned i;
7034
7035                                 Newx(entries, count, HE *);
7036
7037                                 /* Collect all HE pointers in one pass */
7038                                 i = 0;
7039                                 while ((entry = hv_iternext(hv)))
7040                                     entries[i++] = entry;
7041
7042                                 /* Partial Fisher-Yates (only 'limit' passes) */
7043                                 for (i = 0; i < limit; i++) {
7044                                     I32 j    = i + (I32)(Drand01() * (count - i));
7045                                     HE *restrict tmp  = entries[i];
7046                                     entries[i] = entries[j];
7047                                     entries[j] = tmp;
7048                                 }
7049
7050                                 /* Pre-size result hash to avoid rehashing during population */
7051                                 hv_ksplit(ret_hv, limit);
7052
7053                                 for (i = 0; i < limit; i++) {
7054                                     HEK *restrict hek = HeKEY_hek(entries[i]);
7055                                     /*
7056                                      * hv_store() with a precomputed hash skips the hash
7057                                      * computation entirely.  Negative klen signals UTF-8.
7058                                      */
7059                                     (void)hv_store(
7060                                         ret_hv,
7061                                         HEK_KEY(hek),
7062                                         HEK_UTF8(hek) ? -(I32)HEK_LEN(hek) : (I32)HEK_LEN(hek),
7063                                         SvREFCNT_inc(HeVAL(entries[i])),  /* HeVAL: direct macro, no call */
7064                                         HeHASH(entries[i])                /* reuse precomputed hash */
7065                                     );
7066                                 }
7067                                 Safefree(entries);
7068                        }
7069                        ret = newRV_noinc((SV *)ret_hv);
7070                } else if (SvTYPE(rv) == SVt_PVAV) {/* --- ARRAY REFERENCE --- */
7071                        AV    *restrict av    = (AV *)rv;
7072                        size_t count = av_top_index(av) + 1;  /* signed; 0 for empty AV */
7073                        size_t limit = (n < count) ? (size_t)n : count;
7074                        AV    *restrict ret_av = newAV();
7075
7076                        /* Pre-allocate the result array to avoid incremental reallocs */
7077                        if (n > 0)
7078                                 av_extend(ret_av, (size_t)n - 1);
7079
7080                        if (count > 0) {
7081                                 SV    **restrict src = AvARRAY(av);   /* direct pointer into AV's C array */
7082                                 size_t *restrict idx;
7083                                 size_t  i;
7084
7085                                 /* Shuffle indices rather than SV** to keep the original AV intact */
7086                                 Newx(idx, count, size_t);
7087                                 for (i = 0; i < count; i++)
7088                                     idx[i] = i;
7089
7090                                 /* Partial Fisher-Yates on the index array */
7091                                 for (i = 0; i < limit; i++) {
7092                                     size_t j   = i + (size_t)(Drand01() * (count - i));
7093                                     size_t tmp = idx[i];
7094                                     idx[i]  = idx[j];
7095                                     idx[j]  = tmp;
7096                                 }
7097
7098                                 for (i = 0; i < (size_t)n; i++) {
7099                                     if (i < limit) {
7100                                         SV *restrict sv = src[idx[i]];   /* AvARRAY direct access — no av_fetch call */
7101                                         SV *push_sv;
7102                                                        if (sv && sv != &PL_sv_undef)
7103                                                                 push_sv = SvREFCNT_inc(sv);
7104                                                        else
7105                                                                 push_sv = newSV(0);
7106                                                        av_push(ret_av, push_sv);
7107                                     } else {
7108                                         av_push(ret_av, newSV(0));
7109                                     }
7110                                 }
7111                                 Safefree(idx);
7112                        } else {
7113                                for (size_t i = 0; i < (size_t)n; i++)
7114                                    av_push(ret_av, newSV(0));
7115                        }
7116                        ret = newRV_noinc((SV *)ret_av);
7117                }
7118        }
7119        RETVAL = ret;
7120OUTPUT:
7121    RETVAL