File Coverage

File:LikeR.xs
Coverage:77.4%

linestmtbrancondsubtimecode
1/* --- C HELPER SECTION --- */
2#define PERL_NO_GET_CONTEXT
3#include "EXTERN.h"
4#include "perl.h"
5#include "XSUB.h"
6#include "ppport.h"
7#include <math.h>
8#include <ctype.h>
9#include <stdlib.h>
10#include <float.h>
11#include <string.h>
12/* Ensure Perl's PRNG is seeded, matching the lazy-evaluation of Perl's rand() */
13#define AUTO_SEED_PRNG() \
14    do { \
15        if (!PL_srand_called) { \
16            (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); \
17            PL_srand_called = TRUE; \
18        } \
19    } while (0)
20
21// ---------------------------------------
22//   Helpers for Random Number Generation
23// ---------------------------------------
24#ifndef M_PI
25#define M_PI 3.14159265358979323846
26#endif
27// C helper for EXACT Non-central T-distribution CDF via Numerical Integration.
28// This perfectly replicates R's pt(..., ncp) exactness without requiring complex Beta functions.
29
687
static double exact_pnt(double t, double df, double ncp) {
30
687
        if (df <= 0.0) return 0.0;
31
32
687
        unsigned short int n_steps = 30000;
33
687
        double step = 1.0 / n_steps;
34
687
        double integral = 0.0;
35
687
        double half_df = df / 2.0;
36
37
687
        double log_coef = log(2.0) + half_df * log(half_df) - lgamma(half_df);
38
687
        double root_half = 0.70710678118654752440; // 1 / sqrt(2)
39
40
20610000
        for (unsigned short i = 1; i < n_steps; i++) {
41
20609313
                double u = i * step;
42
20609313
                double w = u / (1.0 - u);
43
44                // Scaled Chi-distribution log-density
45
20609313
                double log_M = log_coef + (df - 1.0) * log(w) - half_df * w * w;
46
20609313
                double M = exp(log_M);
47
48                // Exact Normal CDF using the C standard library's erfc function
49
20609313
                double z = t * w - ncp;
50
20609313
                double pnorm_val = 0.5 * erfc(-z * root_half);
51
52
20609313
                double weight = (i % 2 != 0) ? 4.0 : 2.0;
53
20609313
                integral += weight * (pnorm_val * M / ((1.0 - u) * (1.0 - u)));
54        }
55
56
687
        return integral * (step / 3.0);
57}
58// --- Math Helpers for P-values and Confidence Intervals ---
59
60// Ranking helper with tie adjustment (matches R's tie handling)
61typedef struct { double val; size_t idx; double rank; } RankInfo;
62
33
static int compare_rank(const void *restrict a, const void *restrict b) {
63
33
        double diff = ((RankInfo*)a)->val - ((RankInfo*)b)->val;
64
33
        return (diff > 0) - (diff < 0);
65}
66
67
33
static int compare_index(const void *restrict a, const void *restrict b) {
68
33
        return ((RankInfo*)a)->idx - ((RankInfo*)b)->idx;
69}
70
71
6
static void compute_ranks(double *restrict data, double *restrict ranks, size_t n) {
72
6
        RankInfo *restrict items = safemalloc(n * sizeof(RankInfo));
73
36
        for (size_t i = 0; i < n; i++) {
74
30
                items[i].val = data[i];
75
30
                items[i].idx = i;
76        }
77
6
        qsort(items, n, sizeof(RankInfo), compare_rank);
78        // Handle ties by averaging ranks
79
36
        for (size_t i = 0; i < n; ) {
80
30
                size_t j = i + 1;
81
30
                while (j < n && items[j].val == items[i].val) j++;
82
30
                double avg_rank = (i + 1 + j) / 2.0;
83
60
                for (size_t k = i; k < j; k++) items[k].rank = avg_rank;
84
30
                i = j;
85        }
86
6
        qsort(items, n, sizeof(RankInfo), compare_index);
87
36
        for (size_t i = 0; i < n; i++) ranks[i] = items[i].rank;
88
6
        Safefree(items);
89
6
}
90// Generates a single binomial random variate.
91//Uses the standard Bernoulli trial loop. Drand01() taps into Perl's PRNG.
92
61497
static size_t generate_binomial(const size_t size, const double prob) {
93
61497
        if (prob <= 0.0) return 0;
94
61197
        if (prob >= 1.0) return size;
95
96
60897
        size_t successes = 0;
97
936870
        for (size_t i = 0; i < size; i++) {
98
875973
                if (Drand01() <= prob) successes++;
99        }
100
60897
        return successes;
101}
102// Helper: log combination
103
426
static double log_choose(size_t n, size_t k) {
104
426
        return lgamma((double)n + 1.0) - lgamma((double)k + 1.0) - lgamma((double)(n - k) + 1.0);
105}
106
107// Log-space tails for non-central hypergeometric
108
1212
static void calc_tails_logspace(size_t a, size_t min_x, size_t max_x, double omega, const double *logdc, double *restrict lower_tail, double *restrict upper_tail) {
109
1212
        double max_d = -1e300, log_omega = log(omega);
110
111
10056
        for(size_t k = 0; k <= max_x - min_x; ++k) {
112
8844
          double d_val = logdc[k] + log_omega * (min_x + k);
113
8844
          if (d_val > max_d) max_d = d_val;
114        }
115
116
1212
        double sum_d = 0.0;
117
10056
        for(size_t k = 0; k <= max_x - min_x; ++k) {
118
8844
          sum_d += exp(logdc[k] + log_omega * (min_x + k) - max_d);
119        }
120
121
1212
        *lower_tail = 0.0;
122
1212
        *upper_tail = 0.0;
123
124
10056
        for(size_t k = 0; k <= max_x - min_x; ++k) {
125
8844
          double p_prob = exp(logdc[k] + log_omega * (min_x + k) - max_d) / sum_d;
126
8844
          if (min_x + k <= a) *lower_tail += p_prob;
127
8844
          if (min_x + k >= a) *upper_tail += p_prob;
128        }
129
1212
}
130
131// Exact stats using log-space
132
15
static void calculate_exact_stats(size_t a, size_t b, size_t c, size_t d, double conf_level, const char*restrict alt, double *restrict mle_or, double *restrict ci_low, double *restrict ci_high) {
133
15
    double alpha = 1.0 - conf_level;
134
15
    size_t r1 = a + b, r2 = c + d, c1 = a + c;
135
15
    size_t min_x = (r2 > c1) ? 0 : c1 - r2;
136
15
    size_t max_x = (r1 < c1) ? r1 : c1;
137
138
15
    bool is_less = (strcmp(alt, "less") == 0);
139
15
    bool is_greater = (strcmp(alt, "greater") == 0);
140
141
15
    double *restrict logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
142
15
    double denom = log_choose(r1 + r2, c1);
143
114
    for(size_t x = min_x; x <= max_x; ++x) {
144
99
        logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
145    }
146
147    // MLE
148
15
    if (a == min_x && a == max_x) *mle_or = 1.0;
149
15
    else if (a == min_x) *mle_or = 0.0;
150
15
    else if (a == max_x) *mle_or = INFINITY;
151    else {
152
12
        double log_low = -100.0, log_high = 100.0;
153
696
        for (unsigned short int i = 0; i < 3000; i++) {
154
696
            double log_mid = 0.5 * (log_low + log_high);
155
696
            double max_d = -1e300;
156
5568
            for(size_t k = 0; k <= max_x - min_x; ++k) {
157
4872
                double d_val = logdc[k] + log_mid * (min_x + k);
158
4872
                if (d_val > max_d) max_d = d_val;
159            }
160
696
            double sum_d = 0.0, exp_val = 0.0;
161
5568
            for(size_t k = 0; k <= max_x - min_x; ++k) {
162
4872
                double p_prob = exp(logdc[k] + log_mid * (min_x + k) - max_d);
163
4872
                sum_d += p_prob;
164
4872
                exp_val += (min_x + k) * p_prob;
165            }
166
696
            exp_val /= sum_d;
167
168
696
            if (exp_val > a) log_high = log_mid;
169
276
            else log_low = log_mid;
170
696
            if (log_high - log_low < 1e-15) break;
171        }
172
12
        *mle_or = exp(0.5 * (log_low + log_high));
173    }
174
175
15
    *ci_low = 0.0;
176
15
    *ci_high = INFINITY;
177
178    // Lower CI
179
15
    if (!is_less) {
180
12
        double target_alpha = is_greater ? alpha : alpha / 2.0;
181
12
        if (a != min_x) {
182
12
            double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
183
696
            for (unsigned short int i = 0; i < 1000; i++) {
184
696
                double log_mid = 0.5 * (log_low + log_high);
185
696
                double mid = exp(log_mid);
186
696
                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
187
696
                double err = fabs(ut - target_alpha);
188
696
                if (err < best_err) { best_err = err; best = mid; }
189
696
                if (ut > target_alpha) log_high = log_mid;
190
318
                else log_low = log_mid;
191
696
                if (log_high - log_low < 1e-15) break;
192            }
193
12
            *ci_low = best;
194        }
195    }
196
197    // Upper CI
198
15
    if (!is_greater) {
199
12
        double target_alpha = is_less ? alpha : alpha / 2.0;
200
12
        if (a != max_x) {
201
9
            double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
202
516
            for (unsigned short int i = 0; i < 1000; i++) {
203
516
                double log_mid = 0.5 * (log_low + log_high);
204
516
                double mid = exp(log_mid);
205
516
                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
206
516
                double err = fabs(lt - target_alpha);
207
516
                if (err < best_err) { best_err = err; best = mid; }
208
516
                if (lt > target_alpha) log_low = log_mid;
209
264
                else log_high = log_mid;
210
516
                if (log_high - log_low < 1e-15) break;
211            }
212
9
            *ci_high = best;
213        }
214    }
215
15
    safefree(logdc);
216
15
}
217
218// Exact p-value using log-space
219
15
static double exact_p_value(size_t a, size_t b, size_t c, size_t d, const char* alt) {
220
15
    size_t r1 = a + b, r2 = c + d, c1 = a + c;
221
15
    size_t min_x = (r2 > c1) ? 0 : c1 - r2;
222
15
    size_t max_x = (r1 < c1) ? r1 : c1;
223
224
15
    double *logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
225
15
    double denom = log_choose(r1 + r2, c1);
226
114
    for(size_t x = min_x; x <= max_x; ++x) {
227
99
        logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
228    }
229
230
15
    double p_val = 0.0;
231
232
15
    if (strcmp(alt, "less") == 0) {
233
15
        for(size_t x = min_x; x <= a; ++x) p_val += exp(logdc[x - min_x]);
234
12
    } else if (strcmp(alt, "greater") == 0) {
235
9
        for(size_t x = a; x <= max_x; ++x) p_val += exp(logdc[x - min_x]);
236    } else {
237
9
        double p_obs = exp(logdc[a - min_x]);
238
9
        double relErr = 1.0 + 1e-7;
239
78
        for(size_t x = min_x; x <= max_x; ++x) {
240
69
            double p_cur = exp(logdc[x - min_x]);
241
69
            if (p_cur <= p_obs * relErr) p_val += p_cur;
242        }
243    }
244
245
15
    safefree(logdc);
246
15
    return (p_val > 1.0) ? 1.0 : p_val;
247}
248/* -----------------------------------------------------------------------
249 * Helpers for lm Linear Regression: OLS Matrix Math & Formula Parsing
250 * ----------------------------------------------------------------------- */
251
252/* Sweep operator for symmetric positive-definite matrices (e.g., XtX).
253 * This gracefully handles collinearity by bypassing aliased columns.
254 * Utilizes a relative tolerance check to prevent dropping micro-variance features.
255 */
256
132
static int sweep_matrix_ols(double *restrict A, size_t n, bool *restrict aliased) {
257
132
        int rank = 0;
258
132
        double *restrict orig_diag = (double*)safemalloc(n * sizeof(double));
259
260        // Save the original diagonal values to use as a baseline for relative variance
261
510
        for (size_t k = 0; k < n; k++) {
262
378
                aliased[k] = false;
263
378
                orig_diag[k] = A[k * n + k];
264        }
265
266
510
        for (size_t k = 0; k < n; k++) {
267                // Check pivot for collinearity using a RELATIVE tolerance
268                // (Fallback to a tiny absolute tolerance of 1e-24 to catch literal zero vectors)
269
378
                if (fabs(A[k * n + k]) <= 1e-10 * orig_diag[k] || fabs(A[k * n + k]) < 1e-24) {
270
3
                        aliased[k] = true;
271                        // Isolate this column so it doesn't affect the rest of the matrix
272
12
                        for (size_t i = 0; i < n; i++) {
273
9
                                A[k * n + i] = 0.0;
274
9
                                A[i * n + k] = 0.0;
275                        }
276
3
                        continue;
277                }
278
375
                rank++;
279
375
                double pivot = 1.0 / A[k * n + k];
280
375
                A[k * n + k] = 1.0;
281
1476
                for (size_t j = 0; j < n; j++) A[k * n + j] *= pivot;
282
1476
                for (size_t i = 0; i < n; i++) {
283
1101
                        if (i != k && A[i * n + k] != 0.0) {
284
708
                                  double factor = A[i * n + k];
285
708
                                  A[i * n + k] = 0.0;
286
2838
                                  for (size_t j = 0; j < n; j++) {
287
2130
                                       A[i * n + j] -= factor * A[k * n + j];
288                                  }
289                        }
290                }
291        }
292
132
        Safefree(orig_diag);
293
132
        return rank;
294}
295
296// Internal extractor resolving single data values. Returns NAN on missing or non-numeric.
297
4872
static double get_data_value(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict var) {
298
4872
    SV **restrict val = NULL;
299
4872
    if (row_hashes) {
300
3552
        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
301
3552
        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
302
3552
            AV*restrict av = (AV*)SvRV(*val);
303
3552
            val = av_fetch(av, 0, 0);
304        }
305
1320
    } else if (data_hoa) {
306
1320
        SV**restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
307
1320
        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
308
1320
            AV*restrict av = (AV*)SvRV(*col);
309
1320
            val = av_fetch(av, i, 0);
310        }
311    }
312
4872
    if (val && SvOK(*val)) {
313
4863
        if (looks_like_number(*val)) return SvNV(*val);
314
0
        return NAN; // Catch strings like "blue"
315    }
316
9
    return NAN; // Catch undef/missing keys
317}
318
319// Helper: Get all available columns for the '.' operator expansion
320
6
static AV* get_all_columns(HV *restrict data_hoa, HV **restrict row_hashes, size_t n) {
321
6
    AV *cols = newAV();
322
6
    if (data_hoa) {
323
6
        hv_iterinit(data_hoa);
324        HE *entry;
325
24
        while ((entry = hv_iternext(data_hoa))) {
326
18
            av_push(cols, newSVsv(hv_iterkeysv(entry)));
327        }
328
0
    } else if (row_hashes && n > 0 && row_hashes[0]) {
329
0
        hv_iterinit(row_hashes[0]);
330        HE *entry;
331
0
        while ((entry = hv_iternext(row_hashes[0]))) {
332
0
            av_push(cols, newSVsv(hv_iterkeysv(entry)));
333        }
334    }
335
6
    return cols;
336}
337
338// Recursive formula resolver with tightened NaN and Null handling
339
4968
static double evaluate_term(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict term) {
340
4968
    if (!term || term[0] == '\0') return NAN;
341
342
4968
    char *restrict term_cpy = savepv(term);
343
4968
    char *restrict colon = strchr(term_cpy, ':');
344
4968
    if (colon) {
345
96
        *colon = '\0';
346
96
        double left = evaluate_term(data_hoa, row_hashes, i, term_cpy);
347
96
        double right = evaluate_term(data_hoa, row_hashes, i, colon + 1);
348
96
        Safefree(term_cpy);
349
350
96
        if (isnan(left) || isnan(right)) return NAN;
351
96
        return left * right;
352    }
353
4872
    if (strncmp(term_cpy, "I(", 2) == 0) {
354
0
        char *restrict end = strrchr(term_cpy, ')');
355
0
        if (end) *end = '\0';
356
0
        char *restrict inner = term_cpy + 2;
357
0
        char *restrict caret = strchr(inner, '^');
358
0
        int power = 1;
359
0
        if (caret) {
360
0
            *caret = '\0';
361
0
            power = atoi(caret + 1);
362        }
363
0
        double v = get_data_value(data_hoa, row_hashes, i, inner);
364
0
        Safefree(term_cpy);
365
366
0
        if (isnan(v)) return NAN;
367
0
        return power == 1 ? v : pow(v, power);
368    }
369
4872
    double result = get_data_value(data_hoa, row_hashes, i, term_cpy);
370
4872
    Safefree(term_cpy);
371
4872
    return result;
372}
373
374// Helper to infer column type from its first valid element
375
162
static bool is_column_categorical(HV *restrict data_hoa, HV **restrict row_hashes, size_t n, const char *restrict var) {
376
258
        for (size_t i = 0; i < n; i++) {
377
255
                SV **restrict val = NULL;
378
255
                if (row_hashes) {
379
165
                        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
380
165
                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
381
69
                                 AV*restrict av = (AV*)SvRV(*val);
382
69
                                 val = av_fetch(av, 0, 0);
383                        }
384
90
                } else if (data_hoa) {
385
90
                        SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
386
90
                        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
387
90
                                 AV*restrict av = (AV*)SvRV(*col);
388
90
                                 val = av_fetch(av, i, 0);
389                        }
390                }
391
255
                if (val && SvOK(*val)) {
392
159
                        if (looks_like_number(*val)) return false; // First valid is number -> Numeric Column
393
27
                        return true; // First valid is string -> Categorical Column
394                }
395        }
396
3
        return false;
397}
398
399/* Internal extractor resolving single data string values using dynamic allocation. */
400
1041
static char* get_data_string_alloc(HV *restrict data_hoa, HV **restrict row_hashes, size_t i, const char *restrict var) {
401
1041
        SV **restrict val = NULL;
402
1041
        if (row_hashes) {
403
0
                val = hv_fetch(row_hashes[i], var, strlen(var), 0);
404
0
                if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
405
0
                        AV*restrict av = (AV*)SvRV(*val);
406
0
                        val = av_fetch(av, 0, 0);
407                }
408
1041
        } else if (data_hoa) {
409
1041
                SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
410
1041
                if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
411
1041
                        AV*restrict av = (AV*)SvRV(*col);
412
1041
                        val = av_fetch(av, i, 0);
413                }
414        }
415
1041
        if (val && SvOK(*val)) {
416
1041
          return savepv(SvPV_nolen(*val)); /* Allocates and returns string */
417        }
418
0
        return NULL;
419}
420
421// Struct for sorting p-values while remembering their original index
422typedef struct {
423        double p;
424        size_t orig_idx;
425} PVal;
426
427// Comparator for qsort
428
4557
static int cmp_pval(const void *restrict a, const void *restrict b) {
429
4557
        double diff = ((PVal*)a)->p - ((PVal*)b)->p;
430
4557
        if (diff < 0) return -1;
431
2436
        if (diff > 0) return 1;
432        /* Stabilize sort by falling back to original index */
433
0
        return ((PVal*)a)->orig_idx - ((PVal*)b)->orig_idx;
434}
435/* -----------------------------------------------------------------------
436 * Helpers for cor(): ranking (Spearman), Pearson r, Kendall tau-b
437 * ----------------------------------------------------------------------- */
438/* Item used to sort values while remembering their original index,
439 * needed for average-rank tie-breaking in Spearman correlation.        */
440typedef struct {
441        double val;
442        size_t idx;
443} RankItem;
444
445
111
static int cmp_rank_item(const void *restrict a, const void *restrict b) {
446
111
        double diff = ((RankItem*)a)->val - ((RankItem*)b)->val;
447
111
        if (diff < 0) return -1;
448
12
        if (diff > 0) return  1;
449
3
        return 0;
450}
451
452/* Compute 1-based average ranks with tie-breaking into out[].
453 * in[] is not modified.                                                 */
454
12
static void rank_data(const double *restrict in, double *restrict out, size_t n) {
455        RankItem *restrict ri;
456
12
        Newx(ri, n, RankItem);
457
96
        for (size_t i = 0; i < n; i++) { ri[i].val = in[i]; ri[i].idx = i; }
458
12
        qsort(ri, n, sizeof(RankItem), cmp_rank_item);
459
460
12
        size_t i = 0;
461
93
        while (i < n) {
462
81
                size_t j = i;
463                /* Find the full extent of this tie group */
464
84
                while (j + 1 < n && ri[j + 1].val == ri[j].val) j++;
465                /* All members get the average of ranks i+1 … j+1 (1-based) */
466
81
                double avg = (double)(i + j) / 2.0 + 1.0;
467
165
                for (size_t k = i; k <= j; k++) out[ri[k].idx] = avg;
468
81
                i = j + 1;
469        }
470
12
        Safefree(ri);
471
12
}
472
473/* Pearson product-moment r between two n-element arrays.
474 * Returns NAN when either variable has zero variance (matches R).       */
475
18
static double pearson_corr(const double *restrict x, const double *restrict y, size_t n) {
476
18
        double sx = 0, sy = 0, sxy = 0, sx2 = 0, sy2 = 0;
477
108
        for (size_t i = 0; i < n; i++) {
478
90
          sx  += x[i];     sy  += y[i];
479
90
          sxy += x[i]*y[i]; sx2 += x[i]*x[i]; sy2 += y[i]*y[i];
480        }
481
18
        double num = (double)n * sxy - sx * sy;
482
18
        double den = sqrt(((double)n * sx2 - sx*sx) * ((double)n * sy2 - sy*sy));
483
18
        if (den == 0.0) return NAN;
484
18
        return num / den;
485}
486
487/* Kendall's tau-b between two n-element arrays.
488 *
489 *   tau-b = (C − D) / sqrt((C + D + T_x)(C + D + T_y))
490 *
491 * where C = concordant pairs, D = discordant, T_x = pairs tied only on
492 * x, T_y = pairs tied only on y.  Joint ties (both zero) are excluded
493 * from numerator and denominator, matching R's cor(method="kendall").
494 * Returns NAN when the denominator is zero.                             */
495
3
static double kendall_tau_b(const double *restrict x, const double *restrict y, unsigned int n) {
496
3
        size_t C = 0, D = 0, tie_x = 0, tie_y = 0;
497
27
        for (size_t i = 0; i < n - 1; i++) {
498
132
          for (size_t j = i + 1; j < n; j++) {
499
108
                   int sx = (x[i] > x[j]) - (x[i] < x[j]);   /* sign of x[i]-x[j] */
500
108
                   int sy = (y[i] > y[j]) - (y[i] < y[j]);
501
108
                   if      (sx == 0 && sy == 0) { /* joint tie — not counted */ }
502
108
                   else if (sx == 0)            tie_x++;
503
105
                   else if (sy == 0)            tie_y++;
504
105
                   else if (sx == sy)           C++;
505
0
                   else                         D++;
506          }
507        }
508
3
        double denom = sqrt((double)(C + D + tie_x) * (double)(C + D + tie_y));
509
3
        if (denom == 0.0) return NAN;
510
3
        return (double)(C - D) / denom;
511}
512
513/* Single dispatch: compute correlation according to method string.
514 * Allocates and frees temporary rank arrays internally for Spearman.   */
515
21
static double compute_cor(const double *restrict x, const double *restrict y,
516                           size_t n, const char *restrict method) {
517
21
        if (strcmp(method, "spearman") == 0) {
518          double *restrict rx, *restrict ry;
519
3
          Newx(rx, n, double); Newx(ry, n, double);
520
3
          rank_data(x, rx, n);
521
3
          rank_data(y, ry, n);
522
3
          double r = pearson_corr(rx, ry, n);
523
3
          Safefree(rx); Safefree(ry);
524
3
          return r;
525        }
526
18
        if (strcmp(method, "kendall") == 0)
527
3
          return kendall_tau_b(x, y, n);
528        /* default: pearson */
529
15
        return pearson_corr(x, y, n);
530}
531
532// Math macros
533#define MAX_ITER 500
534#define EPS 3.0e-15
535#define FPMIN 1.0e-30
536
537
24459
static double _incbeta_cf(double a, double b, double x) {
538        int m;
539        double aa, c, d, del, h, qab, qam, qap;
540
24459
        qab = a + b; qap = a + 1.0; qam = a - 1.0;
541
24459
        c = 1.0; d = 1.0 - qab * x / qap;
542
24459
        if (fabs(d) < FPMIN) d = FPMIN;
543
24459
        d = 1.0 / d; h = d;
544
540939
        for (m = 1; m <= MAX_ITER; m++) {
545
540939
          int m2 = 2 * m;
546
540939
          aa = m * (b - m) * x / ((qam + m2) * (a + m2));
547
540939
          d = 1.0 + aa * d;
548
540939
          if (fabs(d) < FPMIN) d = FPMIN;
549
540939
          c = 1.0 + aa / c;
550
540939
          if (fabs(c) < FPMIN) c = FPMIN;
551
540939
          d = 1.0 / d; h *= d * c;
552
540939
          aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
553
540939
          d = 1.0 + aa * d;
554
540939
          if (fabs(d) < FPMIN) d = FPMIN;
555
540939
          c = 1.0 + aa / c;
556
540939
          if (fabs(c) < FPMIN) c = FPMIN;
557
540939
          d = 1.0 / d; del = d * c; h *= del;
558
540939
          if (fabs(del - 1.0) < EPS) break;
559        }
560
24459
        return h;
561}
562
563
24582
static double incbeta(double a, double b, double x) {
564
24582
        if (x <= 0.0) return 0.0;
565
24582
        if (x >= 1.0) return 1.0;
566
24459
        double bt = exp(lgamma(a + b) - lgamma(a) - lgamma(b) + a * log(x) + b * log(1.0 - x));
567
24459
        if (x < (a + 1.0) / (a + b + 2.0)) return bt * _incbeta_cf(a, b, x) / a;
568
4284
        return 1.0 - bt * _incbeta_cf(b, a, 1.0 - x) / b;
569}
570
571
24504
static double get_t_pvalue(double t, double df, const char*restrict alt) {
572
24504
        double x = df / (df + t * t);
573
24504
        double prob_2tail = incbeta(df / 2.0, 0.5, x);
574
24504
        if (strcmp(alt, "less") == 0) return (t < 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
575
24498
        if (strcmp(alt, "greater") == 0) return (t > 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
576
312
        return prob_2tail;
577}
578
579// Bisection algorithm to find the inverse t-distribution (Critical t-value)
580
813
static double qt_tail(double df, double p_tail) {
581
813
        double low = 0.0, high = 1.0;
582        // Find upper bound
583
1929
        while (get_t_pvalue(high, df, "greater") > p_tail) {
584
1116
          low = high;
585
1116
          high *= 2.0;
586
1116
          if (high > 1000000.0) break; /* Fallback limit */
587        }
588        // Bisect to find the root
589
22254
        for (unsigned short int i = 0; i < 100; i++) {
590
22254
          double mid = (low + high) / 2.0;
591
22254
          double p_mid = get_t_pvalue(mid, df, "greater");
592
22254
          if (p_mid > p_tail) {
593
10869
                   low = mid;
594          } else {
595
11385
                   high = mid;
596          }
597
22254
          if (high - low < 1e-8) break;
598        }
599
813
        return (low + high) / 2.0;
600}
601
602
8505
int compare_doubles(const void *restrict a, const void *restrict b) {
603
8505
        double da = *(const double*restrict)a;
604
8505
        double db = *(const double*restrict)b;
605
8505
        return (da > db) - (da < db);
606}
607/* Helper to calculate the number of bins using Sturges' formula: log2(n) + 1 */
608
0
static size_t calculate_sturges_bins(size_t n) {
609
0
        if (n == 0) return 1;
610
0
        return (size_t)(log((double)n) / log(2.0) + 1.0);
611}
612
613// Logic for distributing data into bins (Optimized to O(N))
614
15
static void compute_hist_logic(double *restrict x, size_t n, double *restrict breaks, size_t n_bins,
615                               size_t *restrict counts, double *restrict mids, double *restrict density) {
616
15
        double total_n = (double)n;
617
15
        double min_val = breaks[0];
618
15
        double step = (n_bins > 0) ? (breaks[1] - breaks[0]) : 0.0;
619        // Initialize counts and compute midpoints
620
69
        for (size_t i = 0; i < n_bins; i++) {
621
54
          counts[i] = 0;
622
54
          mids[i] = (breaks[i] + breaks[i+1]) / 2.0;
623        }
624        // Single O(N) pass to assign elements to bins
625
15
        if (step > 0.0) {
626
6051
          for (size_t j = 0; j < n; j++) {
627
6042
                   double val = x[j];
628                   // Ignore out-of-bounds or invalid values
629
6042
                   if (isnan(val) || isinf(val) || val < min_val) continue;
630                   // Calculate initial bin index mathematically
631
6042
                   size_t idx = (size_t)((val - min_val) / step);
632                   // Clamp to valid array bounds first to prevent overflow */
633
6042
                   if (idx >= n_bins) {
634
9
                       idx = n_bins - 1;
635                   }
636                   /* Adjust for exact boundaries (R's right-inclusive default: (a, b]) */
637                   /* If value is exactly on or slightly below the lower boundary of the assigned bin,
638                      it belongs in the previous bin. (First bin [a, b] is inclusive on both ends) */
639
6069
                   while (idx > 0 && val <= breaks[idx]) {
640
27
                       idx--;
641                   }
642                   // Conversely, if floating-point truncation placed it too low, push it up
643
6042
                   while (idx < n_bins - 1 && val > breaks[idx + 1]) {
644
0
                       idx++;
645                   }
646
6042
                   counts[idx]++;
647          }
648
6
        } else if (n_bins > 0) {
649          // Edge case: All data points have the exact same value (step == 0)
650
6
          counts[0] = n;
651        }
652        // Compute densities
653
69
        for (size_t i = 0; i < n_bins; i++) {
654
54
          double bin_width = breaks[i+1] - breaks[i];
655
54
          if (bin_width > 0) {
656
48
                   density[i] = (double)counts[i] / (total_n * bin_width);
657          } else {
658
6
                   density[i] = (n_bins == 1) ? 1.0 : 0.0;
659          }
660        }
661
15
}
662
663// Standard Normal CDF approximation
664
168
double approx_pnorm(double x) {
665
168
        return 0.5 * erfc(-x * 0.70710678118654752440); // 0.707... = 1/sqrt(2)
666}
667#ifndef M_SQRT1_2
668#define M_SQRT1_2 0.70710678118654752440
669#endif
670
671/* Macro for exact Wilcoxon 3D array indexing */
672#define DP_INDEX(i, j, k, n2, max_u) ((i) * ((n2) + 1) * ((max_u) + 1) + (j) * ((max_u) + 1) + (k))
673
84
static double inverse_normal_cdf(double p) {
674
84
        double a[4] = {2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637};
675
84
        double b[4] = {-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833};
676
84
        double c[9] = {0.3374754822726147, 0.9761690190917186, 0.1607979714918209,
677                          0.0276438810333863, 0.0038405729373609, 0.0003951896511919,
678                          0.0000321767881768, 0.0000002888167364, 0.0000003960315187};
679        double x, r, y;
680
84
        y = p - 0.5;
681
84
        if (fabs(y) < 0.42) {
682
66
          r = y * y;
683
66
          x = y * (((a[3]*r + a[2])*r + a[1])*r + a[0]) /
684
66
                       ((((b[3]*r + b[2])*r + b[1])*r + b[0])*r + 1.0);
685        } else {
686
18
          r = p;
687
18
          if (y > 0) r = 1.0 - p;
688
18
          r = log(-log(r));
689
18
          x = c[0] + r * (c[1] + r * (c[2] + r * (c[3] + r * (c[4] +
690
18
                   r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))));
691
18
          if (y < 0) x = -x;
692        }
693
84
        return x;
694}
695/* -----------------------------------------------------------------------
696 * Exact Spearman p-value via exhaustive permutation enumeration.
697 *
698 * Under H0, all n! orderings of ranks are equally probable.  We visit
699 * every permutation of {1..n} with Heap's algorithm (O(n!), no allocs
700 * inside the loop) and count how many yield S ≤ s_obs ("lower tail",
701 * i.e. rho ≥ rho_obs) and how many yield S ≥ s_obs ("upper tail").
702 *
703 * Mirrors R's default: exact = (n < 10) with no ties.
704 * Valid up to n = 9 (362 880 iterations — negligible cost).
705 * ----------------------------------------------------------------------- */
706
3
static double spearman_exact_pvalue(double s_obs, size_t n, const char *restrict alt) {
707
3
        int *restrict perm = (int*)safemalloc(n * sizeof(int));
708
3
        int *restrict c    = (int*)safemalloc(n * sizeof(int));
709
18
        for (size_t i = 0; i < n; i++) { perm[i] = i + 1; c[i] = 0; }
710
711
3
        long count_le = 0, count_ge = 0, total = 0;
712
713        #define TALLY_PERM() do {                                    \
714          double s_ = 0.0;                                     \
715          for (int ii = 0; ii < n; ii++) {                    \
716                   double d_ = (double)(ii + 1) - (double)perm[ii];\
717                   s_ += d_ * d_;                                   \
718          }                                                    \
719          if (s_ <= s_obs + 1e-9) count_le++;                 \
720          if (s_ >= s_obs - 1e-9) count_ge++;                 \
721          total++;                                             \
722        } while (0)
723
724
18
        TALLY_PERM();   /* initial permutation [1, 2, ..., n] */
725
726
3
        unsigned int k = 1;
727
618
        while (k < n) {
728
615
          if (c[k] < k) {
729                   int tmp;
730
357
                   if (k % 2 == 0) {
731
132
                       tmp = perm[0]; perm[0] = perm[k]; perm[k] = tmp;
732                   } else {
733
225
                       tmp = perm[c[k]]; perm[c[k]] = perm[k]; perm[k] = tmp;
734                   }
735
2142
                   TALLY_PERM();
736
357
                   c[k]++;
737
357
                   k = 1;
738          } else {
739
258
                   c[k] = 0;
740
258
                   k++;
741          }
742        }
743        #undef TALLY_PERM
744
745
3
        Safefree(perm); Safefree(c);
746        /* p_le = P(S ≤ s_obs) ≡ P(rho ≥ rho_obs)  â€” upper rho tail
747        * p_ge = P(S ≥ s_obs) ≡ P(rho ≤ rho_obs)  â€” lower rho tail  */
748
3
        double p_le = (double)count_le / (double)total;
749
3
        double p_ge = (double)count_ge / (double)total;
750
751
3
        if (strcmp(alt, "greater") == 0) return p_le;
752
3
        if (strcmp(alt, "less")    == 0) return p_ge;
753        /* two.sided: 2 × the smaller tail, clamped to 1 */
754
3
        double p = 2.0 * (p_le < p_ge ? p_le : p_ge);
755
3
        return (p > 1.0) ? 1.0 : p;
756}
757/* -----------------------------------------------------------------------
758 * Exact Kendall p-value via Mahonian Numbers (Inversions distribution)
759 * Matches R's behavior for N < 50 without ties.
760 * ----------------------------------------------------------------------- */
761
6
static double kendall_exact_pvalue(size_t n, double s_obs, const char *restrict alt) {
762
6
        long max_inv = (long)n * (n - 1) / 2;
763
6
        double *restrict dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
764
72
        for (long i = 0; i <= max_inv; i++) dp[i] = 0.0;
765
6
        dp[0] = 1.0;
766        /* Build the distribution of inversions via DP */
767
30
        for (size_t i = 2; i <= n; i++) {
768
24
          double *restrict next_dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
769
288
          for (long k = 0; k <= max_inv; k++) next_dp[k] = 0.0;
770
24
          int current_max_inv = i * (i - 1) / 2;
771
168
          for (int k = 0; k <= current_max_inv; k++) {
772
144
                   double sum = 0;
773
618
                   for (int j = 0; j <= i - 1 && k - j >= 0; j++) {
774
474
                       sum += dp[k - j];
775                   }
776                   // Divide by 'i' directly to keep array as pure probabilities and prevent overflow
777
144
                   next_dp[k] = sum / (double)i;
778          }
779
24
          Safefree(dp);
780
24
          dp = next_dp;
781        }
782        // Convert S statistic to target number of inversions
783
6
        long i_obs = (long)round((max_inv - s_obs) / 2.0);
784
6
        if (i_obs < 0) i_obs = 0;
785
6
        if (i_obs > max_inv) i_obs = max_inv;
786
6
        double p_le = 0.0; /* P(S <= S_obs) */
787
60
        for (long k = i_obs; k <= max_inv; k++) p_le += dp[k];
788
6
        double p_ge = 0.0; /* P(S >= S_obs) */
789
24
        for (long k = 0; k <= i_obs; k++) p_ge += dp[k];
790
6
        Safefree(dp);
791
6
        if (strcmp(alt, "greater") == 0) return p_ge;
792
6
        if (strcmp(alt, "less") == 0) return p_le;
793        // two.sided
794
3
        double p = 2.0 * (p_ge < p_le ? p_ge : p_le);
795
3
        return p > 1.0 ? 1.0 : p;
796}
797// F-distribution Cumulative Distribution Function P(F <= f)
798
78
static double pf(double f, double df1, double df2) {
799
78
        if (f <= 0.0) return 0.0;
800
78
        double x = (df1 * f) / (df1 * f + df2);
801
78
        return incbeta(df1 / 2.0, df2 / 2.0, x);
802}
803
804/* Householder QR Decomposition for Sequential Sums of Squares */
805/* Householder QR Decomposition for Sequential Sums of Squares */
806
18
static void apply_householder_aov(double** restrict X, double* restrict y, size_t n, size_t p, bool* restrict aliased, size_t* restrict rank_map) {
807
18
        size_t r = 0; // Rank/Row tracker
808
72
        for (size_t k = 0; k < p; k++) {
809
54
                aliased[k] = false;
810
54
                if (r >= n) {
811
0
                        aliased[k] = true;
812
0
                        continue;
813                }
814
815
54
                double max_val = 0;
816
489
                for (size_t i = r; i < n; i++) {
817
435
                        if (fabs(X[i][k]) > max_val) max_val = fabs(X[i][k]);
818                }
819
54
                if (max_val < 1e-10) {
820
3
                        aliased[k] = true;
821
3
                        continue;
822                } // Collinear or zero column
823
824
51
                double norm = 0;
825
477
                for (size_t i = r; i < n; i++) {
826
426
                        X[i][k] /= max_val;
827
426
                        norm += X[i][k] * X[i][k];
828                }
829
51
                norm = sqrt(norm);
830
51
                double s = (X[r][k] > 0) ? -norm : norm;
831
51
                double u1 = X[r][k] - s;
832
51
                X[r][k] = s * max_val;
833
834
108
                for (size_t j = k + 1; j < p; j++) {
835
57
                        double dot = u1 * X[r][j];
836
570
                        for (size_t i = r + 1; i < n; i++) dot += X[i][j] * X[i][k];
837
57
                        double tau = dot / (s * u1);
838
57
                        X[r][j] += tau * u1;
839
570
                        for (size_t i = r + 1; i < n; i++) X[i][j] += tau * X[i][k];
840                }
841
842                // Transform the response vector y
843
51
                double dot_y = u1 * y[r];
844
426
                for (size_t i = r + 1; i < n; i++) dot_y += y[i] * X[i][k];
845
51
                double tau_y = dot_y / (s * u1);
846
51
                y[r] += tau_y * u1;
847
426
                for (size_t i = r + 1; i < n; i++) y[i] += tau_y * X[i][k];
848
849
51
                rank_map[k] = r; // Map original column index to orthogonal row index
850
51
                r++;
851        }
852
18
}
853
854// --- write_table Helpers ---
855
856// Sorts string arrays alphabetically
857
75
static int cmp_string_wt(const void *a, const void *b) {
858
75
        return strcmp(*(const char**)a, *(const char**)b);
859}
860
861// Emulates Perl's /\D/ check
862
13
static bool contains_nondigit(SV *restrict sv) {
863
13
        if (!sv || !SvOK(sv)) return 0;
864        STRLEN len;
865
13
        const char *restrict s = SvPVbyte(sv, len);
866
26
        for (size_t i = 0; i < len; i++) {
867
13
          if (!isdigit(s[i])) return 1;
868        }
869
13
        return 0;
870}
871
872// Writes a properly quoted string dynamically
873
522
static void print_str_quoted(PerlIO *fh, const char *str, const char *sep) {
874
522
        if (!str) str = "";
875
522
        bool needs_quotes = 0;
876
522
        if (strstr(str, sep) != NULL || strchr(str, '"') != NULL || strchr(str, '\r') != NULL || strchr(str, '\n') != NULL) {
877
33
          needs_quotes = 1;
878        }
879
880
522
        if (needs_quotes) {
881
33
          PerlIO_putc(fh, '"');
882
354
          for (const char *restrict p = str; *p; p++) {
883
321
                   if (*p == '"') {
884
21
                       PerlIO_putc(fh, '"');
885
21
                       PerlIO_putc(fh, '"');
886                   } else {
887
300
                       PerlIO_putc(fh, *p);
888                   }
889          }
890
33
          PerlIO_putc(fh, '"');
891        } else {
892
489
          PerlIO_puts(fh, str);
893        }
894
522
}
895
896// Writes an array of strings joined by sep
897
141
static void print_string_row(PerlIO *fh, const char **row, size_t len, const char *sep) {
898
141
        size_t sep_len = strlen(sep);
899
663
        for (size_t i = 0; i < len; i++) {
900
522
          if (i > 0) PerlIO_write(fh, sep, sep_len);
901
522
          if (row[i]) {
902
522
                   print_str_quoted(fh, row[i], sep);
903          } else {
904
0
                   print_str_quoted(fh, "", sep);
905          }
906        }
907
141
        PerlIO_putc(fh, '\n');
908
141
}
909// Calculates the Regularized Upper Incomplete Gamma Function Q(a, x)
910// This perfectly replicates R's pchisq(..., lower.tail=FALSE)
911
17
double igamc(double a, double x) {
912
17
        if (x < 0.0 || a <= 0.0) return 1.0;
913
17
        if (x == 0.0) return 1.0;
914
915        // Series expansion for x < a + 1
916
17
        if (x < a + 1.0) {
917
8
                double sum = 1.0 / a;
918
8
                double term = 1.0 / a;
919
8
                double n = 1.0;
920
112
                while (fabs(term) > 1e-15) {
921
104
                        term *= x / (a + n);
922
104
                        sum += term;
923
104
                        n += 1.0;
924                }
925
8
                return 1.0 - (sum * exp(-x + a * log(x) - lgamma(a)));
926        }
927
928        // Continued fraction for x >= a + 1
929
9
        double b = x + 1.0 - a;
930
9
        double c = 1.0 / 1e-30;
931
9
        double d = 1.0 / b;
932
9
        double h = d, i = 1.0;
933
141
        while (i < 10000) { // Safety bound
934
141
                double an = -i * (i - a);
935
141
                b += 2.0;
936
141
                d = an * d + b;
937
141
                if (fabs(d) < 1e-30) d = 1e-30;
938
141
                c = b + an / c;
939
141
                if (fabs(c) < 1e-30) c = 1e-30;
940
141
                d = 1.0 / d;
941
141
                double del = d * c;
942
141
                h *= del;
943
141
                if (fabs(del - 1.0) < 1e-15) break;
944
132
                i += 1.0;
945        }
946
9
        return h * exp(-x + a * log(x) - lgamma(a));
947}
948
949// Chi-Squared p-value is simply the Incomplete Gamma of (df/2, stat/2)
950
17
double get_p_value(double stat, int df) {
951
17
        if (df <= 0) return 1.0;
952
17
        if (stat <= 0.0) return 1.0;
953
17
        return igamc((double)df / 2.0, stat / 2.0);
954}
955
956/* --- C HELPER SECTION --- */
957#ifndef M_SQRT1_2
958#define M_SQRT1_2 0.70710678118654752440
959#endif
960
961/* Robust Binomial Coefficient using long double */
962
6
static long double choose_comb(int n, int k) {
963
6
        if (k < 0 || k > n) return 0.0L;
964
6
        if (k > n / 2) k = n - k;
965
6
        long double res = 1.0L;
966
24
        for (int i = 1; i <= k; i++) {
967
18
          res = res * (long double)(n - i + 1) / (long double)i;
968        }
969
6
        return res;
970}
971
972/* Exact CDF for Mann-Whitney U: P(U <= q)
973   Mathematically identical to R's cwilcox generating function */
974
12
static double exact_pwilcox(double q, int m, int n) {
975
12
    int k = (int)floor(q + 1e-7); // R uses 1e-7 fuzz
976
12
    int max_u = m * n;
977
12
    if (k < 0) return 0.0;
978
6
    if (k >= max_u) return 1.0;
979
980
6
    long double *restrict w = (long double *)safecalloc(max_u + 1, sizeof(long double));
981
6
    w[0] = 1.0L;
982
983
24
    for (int j = 1; j <= n; j++) {
984
162
        for (int i = j; i <= max_u; i++) w[i] += w[i - j];
985
108
        for (int i = max_u; i >= j + m; i--) w[i] -= w[i - j - m];
986    }
987
988
6
    long double cum_p = 0.0L;
989
12
    for (int i = 0; i <= k; i++) cum_p += w[i];
990
991
6
    long double total = choose_comb(m + n, n);
992
6
    double result = (double)(cum_p / total);
993
994
6
    Safefree(w);
995
6
    return result;
996}
997
998/* Exact CDF for Wilcoxon Signed Rank: P(V <= q)
999   Mathematically identical to R's csignrank subset-sum DP */
1000
18
static double exact_psignrank(double q, int n) {
1001
18
        int k = (int)floor(q + 1e-7);
1002
18
        int max_v = n * (n + 1) / 2;
1003
18
        if (k < 0) return 0.0;
1004
18
        if (k >= max_v) return 1.0;
1005
1006
15
        long double *restrict w = (long double *)safecalloc(max_v + 1, sizeof(long double));
1007
15
        w[0] = 1.0L;
1008
1009
138
        for (int i = 1; i <= n; i++) {
1010
4746
          for (int j = max_v; j >= i; j--) w[j] += w[j - i];
1011        }
1012
1013
15
        long double cum_p = 0.0L;
1014
546
        for (int i = 0; i <= k; i++) cum_p += w[i];
1015
1016
15
        long double total = powl(2.0L, (long double)n);
1017
15
        double result = (double)(cum_p / total);
1018
1019
15
        Safefree(w);
1020
15
        return result;
1021}
1022
1023
860
static int cmp_rank_info(const void *a, const void *b) {
1024
860
        double da = ((const RankInfo*)a)->val;
1025
860
        double db = ((const RankInfo*)b)->val;
1026
860
        return (da > db) - (da < db);
1027}
1028
1029
32
static double rank_and_count_ties(RankInfo *restrict ri, size_t n, bool *restrict has_ties) {
1030
32
        if (n == 0) return 0.0;
1031
32
        qsort(ri, n, sizeof(RankInfo), cmp_rank_info);
1032
32
        size_t i = 0;
1033
32
        double tie_adj = 0.0;
1034
32
        *has_ties = 0;
1035
357
        while (i < n) {
1036
325
                size_t j = i + 1;
1037
349
                while (j < n && ri[j].val == ri[i].val) j++;
1038
325
                double r = (double)(i + 1 + j) / 2.0;
1039
674
                for (size_t k = i; k < j; k++) ri[k].rank = r;
1040
325
                size_t t = j - i;
1041
325
                if (t > 1) { *has_ties = 1; tie_adj += ((double)t * t * t - t); }
1042
325
                i = j;
1043        }
1044
32
        return tie_adj;
1045}
1046/* --- KS-TEST C HELPER SECTION --- */
1047#ifndef M_PI_2
1048#define M_PI_2 1.57079632679489661923
1049#endif
1050#ifndef M_PI_4
1051#define M_PI_4 0.78539816339744830962
1052#endif
1053#ifndef M_1_SQRT_2PI
1054#define M_1_SQRT_2PI 0.39894228040143267794
1055#endif
1056
1057// Scalar integer power used by K2x
1058
117
static double r_pow_di(double x, int n) {
1059
117
        if (n == 0) return 1.0;
1060
117
        if (n < 0) return 1.0 / r_pow_di(x, -n);
1061
117
        double val = 1.0;
1062
1314
        for (int i = 0; i < n; i++) val *= x;
1063
117
        return val;
1064}
1065
1066// Two-sample two-sided asymptotic distribution
1067
0
static double K2l(double x, int lower, double tol) {
1068        double s, z, p;
1069        int k;
1070
0
        if(x <= 0.) {
1071
0
          if(lower) p = 0.;
1072
0
          else p = 1.;
1073
0
        } else if(x < 1.) {
1074
0
          int k_max = (int) sqrt(2.0 - log(tol));
1075
0
          double w = log(x);
1076
0
          z = - (M_PI_2 * M_PI_4) / (x * x);
1077
0
          s = 0;
1078
0
          for(k = 1; k < k_max; k += 2) {
1079
0
                   s += exp(k * k * z - w);
1080          }
1081
0
          p = s / M_1_SQRT_2PI;
1082
0
          if(!lower) p = 1.0 - p;
1083        } else {
1084          double new_val, old_val;
1085
0
          z = -2.0 * x * x;
1086
0
          s = -1.0;
1087
0
          if(lower) {
1088
0
                   k = 1; old_val = 0.0; new_val = 1.0;
1089          } else {
1090
0
                   k = 2; old_val = 0.0; new_val = 2.0 * exp(z);
1091          }
1092
0
          while(fabs(old_val - new_val) > tol) {
1093
0
                   old_val = new_val;
1094
0
                   new_val += 2.0 * s * exp(z * k * k);
1095
0
                   s *= -1.0;
1096
0
                   k++;
1097          }
1098
0
          p = new_val;
1099        }
1100
0
        return p;
1101}
1102
1103// Auxiliary routines used by K2x() for matrix operations
1104
21
static void m_multiply(double *A, double *B, double *C, unsigned int m) {
1105
420
        for(unsigned int i = 0; i < m; i++) {
1106
7980
          for(unsigned int j = 0; j < m; j++) {
1107
7581
                   double s = 0.;
1108
151620
                   for(unsigned int k = 0; k < m; k++) s += A[i * m + k] * B[k * m + j];
1109
7581
                   C[i * m + j] = s;
1110          }
1111        }
1112
21
}
1113
1114
18
static void m_power(double *A, int eA, double *V, int *eV, int m, int n) {
1115
18
    if(n == 1) {
1116
1086
        for(int i = 0; i < m * m; i++) V[i] = A[i];
1117
3
        *eV = eA;
1118
3
        return;
1119    }
1120
15
    m_power(A, eA, V, eV, m, n / 2);
1121
15
    double *restrict B = (double*) safecalloc(m * m, sizeof(double));
1122
15
    m_multiply(V, V, B, m);
1123
15
    int eB = 2 * (*eV);
1124
15
    if((n % 2) == 0) {
1125
3258
        for(int i = 0; i < m * m; i++) V[i] = B[i];
1126
9
        *eV = eB;
1127    } else {
1128
6
        m_multiply(A, B, V, m);
1129
6
        *eV = eA + eB;
1130    }
1131
15
    if(V[(m / 2) * m + (m / 2)] > 1e140) {
1132
0
        for(int i = 0; i < m * m; i++) V[i] = V[i] * 1e-140;
1133
0
        *eV += 140;
1134    }
1135
15
    Safefree(B);
1136}
1137
1138// One-sample two-sided exact distribution
1139
3
static double K2x(int n, double d) {
1140
3
        int k = (int) (n * d) + 1;
1141
3
        int m = 2 * k - 1;
1142
3
        double h = k - n * d;
1143
3
        double *restrict H = (double*) safecalloc(m * m, sizeof(double));
1144
3
        double *restrict Q = (double*) safecalloc(m * m, sizeof(double));
1145
1146
60
        for(int i = 0; i < m; i++) {
1147
1140
          for(int j = 0; j < m; j++) {
1148
1083
                   if(i - j + 1 < 0) H[i * m + j] = 0;
1149
624
                   else H[i * m + j] = 1;
1150          }
1151        }
1152
60
        for(int i = 0; i < m; i++) {
1153
57
          H[i * m] -= r_pow_di(h, i + 1);
1154
57
          H[(m - 1) * m + i] -= r_pow_di(h, (m - i));
1155        }
1156
3
        H[(m - 1) * m] += ((2 * h - 1 > 0) ? r_pow_di(2 * h - 1, m) : 0);
1157
1158
60
        for(int i = 0; i < m; i++) {
1159
1140
          for(int j = 0; j < m; j++) {
1160
1083
                   if(i - j + 1 > 0) {
1161
4560
                       for(int g = 1; g <= i - j + 1; g++) H[i * m + j] /= g;
1162                   }
1163          }
1164        }
1165
1166
3
        int eH = 0, eQ;
1167
3
        m_power(H, eH, Q, &eQ, m, n);
1168
3
        double s = Q[(k - 1) * m + k - 1];
1169
1170
153
        for(int i = 1; i <= n; i++) {
1171
150
          s = s * (double)i / (double)n;
1172
150
          if(s < 1e-140) {
1173
0
                   s *= 1e140;
1174
0
                   eQ -= 140;
1175          }
1176        }
1177
3
        s *= pow(10.0, eQ);
1178
3
        Safefree(H);
1179
3
        Safefree(Q);
1180
3
        return s;
1181}
1182
1183// Calculate D (two-sided), D+ (greater), and D- (less) simultaneously
1184
9
static void calc_2sample_stats(double *x, size_t nx, double *y, size_t ny,
1185                               double *d, double *d_plus, double *d_minus) {
1186
9
        qsort(x, nx, sizeof(double), compare_doubles);
1187
9
        qsort(y, ny, sizeof(double), compare_doubles);
1188
9
        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1189
9
        size_t i = 0, j = 0;
1190
1191
729
        while(i < nx || j < ny) {
1192          double val;
1193
720
          if (i < nx && j < ny) val = (x[i] < y[j]) ? x[i] : y[j];
1194
117
          else if (i < nx) val = x[i];
1195
0
          else val = y[j];
1196
1197
1170
          while(i < nx && x[i] <= val) i++;
1198
990
          while(j < ny && y[j] <= val) j++;
1199
1200
720
          double cdf1 = (double)i / nx;
1201
720
          double cdf2 = (double)j / ny;
1202
720
          double diff = cdf1 - cdf2;
1203
1204
720
          if (diff > max_d_plus) max_d_plus = diff;
1205
720
          if (-diff > max_d_minus) max_d_minus = -diff;
1206
720
          if (fabs(diff) > max_d) max_d = fabs(diff);
1207        }
1208
9
        *d = max_d;
1209
9
        *d_plus = max_d_plus;
1210
9
        *d_minus = max_d_minus;
1211
9
}
1212
1213// Branch the DP boundary check based on the 'alternative'
1214
14220
static int psmirnov_exact_test(double q, double r, double s, int two_sided) {
1215
14220
        if (two_sided) return (fabs(r - s) >= q);
1216
9480
        return ((r - s) >= q); // Used for both D+ and D- via symmetry
1217}
1218
1219// Evaluate the exact 2-sample probability
1220
9
static double psmirnov_exact_uniq_upper(double q, int m, int n, int two_sided) {
1221
9
        double md = (double) m, nd = (double) n;
1222
9
        double *u = (double *) safecalloc(n + 1, sizeof(double));
1223
9
        u[0] = 0.;
1224
1225
279
        for(unsigned int j = 1; j <= n; j++) {
1226
270
          if(psmirnov_exact_test(q, 0., j / nd, two_sided)) u[j] = 1.;
1227
216
          else u[j] = u[j - 1];
1228        }
1229
459
        for(unsigned int i = 1; i <= m; i++) {
1230
450
          if(psmirnov_exact_test(q, i / md, 0., two_sided)) u[0] = 1.;
1231
13950
          for(int j = 1; j <= n; j++) {
1232
13500
                   if(psmirnov_exact_test(q, i / md, j / nd, two_sided)) u[j] = 1.;
1233                   else {
1234
10002
                       double v = (double)(i) / (double)(i + j);
1235
10002
                       double w = (double)(j) / (double)(i + j);
1236
10002
                       u[j] = v * u[j] + w * u[j - 1];
1237                   }
1238          }
1239        }
1240
9
        double res = u[n];
1241
9
        Safefree(u);
1242
9
        return res;
1243}
1244
1245
687
static double p_body(double n, double delta, double sd, double sig_level, int tsample, int tside, bool strict) {
1246
687
        double nu = (n - 1.0) * (double)tsample;
1247
687
        if (nu < 1e-7) nu = 1e-7;
1248
1249        // Ensure sig_level/tside is not truncated
1250
687
        double p_tail = sig_level / (double)tside;
1251
687
        double qu = qt_tail(nu, p_tail); // qt(p, df, lower.tail=FALSE)
1252
1253
687
        double ncp = sqrt(n / (double)tsample) * (delta / sd);
1254
1255
687
        if (strict && tside == 2) {
1256          // Use R-style tail calls: 1 - P(T < qu) + P(T < -qu)
1257
0
          return (1.0 - exact_pnt(qu, nu, ncp)) + exact_pnt(-qu, nu, ncp);
1258        } else {
1259          // Default: 1 - P(T < qu)
1260          // Ensure exact_pnt is using a convergence tolerance of at least 1e-15
1261
687
          return 1.0 - exact_pnt(qu, nu, ncp);
1262        }
1263}
1264
1265// Bisection algorithm to find the inverse F-distribution (Quantile function)
1266// Equivalent to R's qf(p, df1, df2)
1267static double qf_bisection(double p, double df1, double df2) {
1268        if (p <= 0.0) return 0.0;
1269        if (p >= 1.0) return INFINITY;
1270
12
        double low = 0.0, high = 1.0;
1271
12
        // Find upper bound
1272
12
        while (pf(high, df1, df2) < p) {
1273
12
          low = high;
1274          high *= 2.0;
1275          if (high > 1e100) break; /* Fallback limit */
1276
12
        }
1277
1278
12
        // Bisect to find the root
1279        for (unsigned short int i = 0; i < 150; i++) {
1280                double mid = low + (high - low) / 2.0;
1281
12
                double p_mid = pf(mid, df1, df2);
1282
1283
9
                if (p_mid < p) {
1284
9
                        low = mid;
1285
3
                } else {
1286
3
                        high = mid;
1287
3
                }
1288                if (high - low < 1e-12) break;
1289        }
1290        return (low + high) / 2.0;
1291}
1292
18
// --- XS SECTION ---
1293
6
MODULE = Stats::LikeR  PACKAGE = Stats::LikeR
1294
1295
6
SV* ks_test(...)
1296
6
CODE:
1297
6
{
1298
0
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1299
0
        short int exact = -1;
1300        const char *restrict alternative = "two.sided";
1301
6
        int arg_idx = 0;
1302
1303        // Shift arrays if provided positionally
1304        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1305
12
                x_sv = ST(arg_idx);
1306
0
                arg_idx++;
1307        }
1308        // Check if second argument is an array (2-sample) or a string representing a CDF (1-sample)
1309
12
        if (arg_idx < items) {
1310
12
                if (SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1311
12
                        y_sv = ST(arg_idx);
1312                        arg_idx++;
1313
12
                } else if (SvPOK(ST(arg_idx))) {
1314
0
                        y_sv = ST(arg_idx); // Save string (e.g., "pnorm") for 1-sample test logic
1315                        arg_idx++;
1316                }
1317
12
        }
1318
1319
12
        // Parse named arguments
1320        for (; arg_idx < items; arg_idx += 2) {
1321          const char *restrict key = SvPV_nolen(ST(arg_idx));
1322
12
          SV *restrict val = ST(arg_idx + 1);
1323
12
          if      (strEQ(key, "x"))           x_sv = val;
1324
612
          else if (strEQ(key, "y"))           y_sv = val;
1325
600
          else if (strEQ(key, "exact"))       {
1326
600
                   if (!SvOK(val)) exact = -1;
1327
600
                   else exact = SvTRUE(val) ? 1 : 0;
1328          }
1329          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1330          else croak("ks_test: unknown argument '%s'", key);
1331
12
        }
1332
1333        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
1334          croak("ks_test: 'x' is a required argument and must be an ARRAY reference");
1335
21
        }
1336
1337
9
        bool is_two_sided = strEQ(alternative, "two.sided") ? 1 : 0;
1338        bool is_greater   = strEQ(alternative, "greater") ? 1 : 0;
1339
9
        bool is_less      = strEQ(alternative, "less") ? 1 : 0;
1340
1341
279
        if (!is_two_sided && !is_greater && !is_less) {
1342
270
          croak("ks_test: alternative must be 'two.sided', 'less', or 'greater'");
1343
270
        }
1344
1345        AV *restrict x_av = (AV*)SvRV(x_sv);
1346        size_t nx = av_len(x_av) + 1;
1347        if (nx == 0) croak("Not enough 'x' observations");
1348
1349
0
        // Extract 'x' array to C-array
1350
0
        double *restrict x_data = (double *)safemalloc(nx * sizeof(double));
1351        size_t valid_nx = 0;
1352        for (size_t i = 0; i < nx; i++) {
1353          SV**restrict el = av_fetch(x_av, i, 0);
1354
9
          if (el && SvOK(*el) && looks_like_number(*el)) {
1355                   x_data[valid_nx++] = SvNV(*el);
1356          }
1357
9
        }
1358
1359
3
        double statistic = 0.0, p_value = 0.0;
1360        const char *restrict method_desc = "";
1361
1362
9
        // --- TWO SAMPLE ---
1363
9
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1364
9
          AV *restrict y_av = (AV*)SvRV(y_sv);
1365
9
          size_t ny = av_len(y_av) + 1;
1366
1367          double *restrict y_data = (double *)safemalloc(ny * sizeof(double));
1368
9
          size_t valid_ny = 0;
1369
9
          for (size_t i = 0; i < ny; i++) {
1370
459
                   SV**restrict el = av_fetch(y_av, i, 0);
1371
279
                   if (el && SvOK(*el) && looks_like_number(*el)) {
1372
9
                       y_data[valid_ny++] = SvNV(*el);
1373                   }
1374
9
          }
1375
1376
711
          if (valid_nx < 1 || valid_ny < 1) {
1377                   Safefree(x_data); Safefree(y_data);
1378
9
                   croak("Not enough non-missing observations for KS test");
1379
9
          }
1380
1381
0
          double d, d_plus, d_minus;
1382          calc_2sample_stats(x_data, valid_nx, y_data, valid_ny, &d, &d_plus, &d_minus);
1383
1384
9
          // Map alternative to the correct statistic
1385
9
          if (is_greater) statistic = d_plus;
1386
9
          else if (is_less) statistic = d_minus;
1387          else statistic = d;
1388
1389
0
          // Determine if exact or asymptotic
1390
0
          bool use_exact = false;
1391
0
          if (exact == 1) use_exact = true;
1392          else if (exact == 0) use_exact = false;
1393
0
          else use_exact = (valid_nx * valid_ny < 10000);
1394
1395          // Check for ties in combined set
1396
9
          size_t total_n = valid_nx + valid_ny;
1397          double *restrict comb = (double *)safemalloc(total_n * sizeof(double));
1398          for(size_t i=0; i<valid_nx; i++) comb[i] = x_data[i];
1399
6
          for(size_t i=0; i<valid_ny; i++) comb[valid_nx+i] = y_data[i];
1400
3
          qsort(comb, total_n, sizeof(double), compare_doubles);
1401
1402
3
          bool has_ties = false;
1403
3
          for(size_t i = 1; i < total_n; i++) {
1404
153
                   if(comb[i] == comb[i-1]) { has_ties = true; break; }
1405
150
          }
1406
150
          Safefree(comb);
1407
150
          if (use_exact && has_ties) {
1408                   warn("cannot compute exact p-value with ties; falling back to asymptotic");
1409
150
                   use_exact = false;
1410
150
          }
1411          if (use_exact) {
1412
150
                   method_desc = "Two-sample Kolmogorov-Smirnov exact test";
1413
150
                   double q = (0.5 + floor(statistic * valid_nx * valid_ny - 1e-7)) / ((double)valid_nx * valid_ny);
1414
150
                   p_value = psmirnov_exact_uniq_upper(q, valid_nx, valid_ny, is_two_sided);
1415
150
          } else {
1416                   method_desc = "Two-sample Kolmogorov-Smirnov test (asymptotic)";
1417
150
                   double z = statistic * sqrt((double)(valid_nx * valid_ny) / (valid_nx + valid_ny));
1418
150
                   if (is_two_sided) {
1419                       p_value = K2l(z, 0, 1e-9);
1420
3
                   } else {
1421
3
                       p_value = exp(-2.0 * z * z); // One-sided limit distribution
1422
3
                   }
1423
3
          }
1424
3
          Safefree(y_data);
1425
3
        }
1426
3
        // --- ONE SAMPLE (e.g. against pnorm) ---
1427
3
        else if (y_sv && SvPOK(y_sv)) {
1428          const char *restrict dist = SvPV_nolen(y_sv);
1429
0
          if (strEQ(dist, "pnorm")) {
1430
0
                   qsort(x_data, valid_nx, sizeof(double), compare_doubles);
1431
0
                   double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1432                   for(size_t i = 0; i < valid_nx; i++) {
1433                       double cdf_obs_low  = (double)i / valid_nx;
1434
0
                       double cdf_obs_high = (double)(i + 1) / valid_nx;
1435
0
                       double cdf_theor    = approx_pnorm(x_data[i]);
1436
1437
0
                       double diff1 = cdf_obs_low - cdf_theor;
1438                       double diff2 = cdf_obs_high - cdf_theor;
1439
1440
0
                       if (diff1 > max_d_plus) max_d_plus = diff1;
1441
0
                       if (diff2 > max_d_plus) max_d_plus = diff2;
1442                       if (-diff1 > max_d_minus) max_d_minus = -diff1;
1443                       if (-diff2 > max_d_minus) max_d_minus = -diff2;
1444
1445
0
                       if (fabs(diff1) > max_d) max_d = fabs(diff1);
1446                       if (fabs(diff2) > max_d) max_d = fabs(diff2);
1447
12
                   }
1448
12
                   if (is_greater) statistic = max_d_plus;
1449
12
                   else if (is_less) statistic = max_d_minus;
1450
12
                   else statistic = max_d;
1451
12
                   bool use_exact = (exact == -1) ? (valid_nx < 100) : (exact == 1);
1452
12
                   if (use_exact) {
1453
12
                       method_desc = "One-sample Kolmogorov-Smirnov exact test";
1454
12
                       if (is_two_sided) {
1455
12
                           p_value = 1.0 - K2x(valid_nx, statistic);
1456                       } else {
1457                           warn("exact 1-sample 1-sided KS test not implemented; using asymptotic");
1458                           double z = statistic * sqrt((double)valid_nx);
1459                           p_value = exp(-2.0 * z * z);
1460                       }
1461                   } else {
1462                       method_desc = "One-sample Kolmogorov-Smirnov test (asymptotic)";
1463
30
                       double z = statistic * sqrt((double)valid_nx);
1464
30
                       if (is_two_sided) p_value = K2l(z, 0, 1e-6);
1465
30
                       else p_value = exp(-2.0 * z * z);
1466
30
                   }
1467
30
          } else {
1468
30
                    Safefree(x_data);
1469                    croak("ks_test: Unsupported 1-sample distribution '%s'. Use arrays for 2-sample.", dist);
1470
30
          }
1471
6
        } else {
1472
6
          Safefree(x_data);
1473          croak("ks_test: Invalid arguments for 'y'.");
1474        }
1475
30
        Safefree(x_data);
1476
6
        if (p_value > 1.0) p_value = 1.0;
1477
6
        if (p_value < 0.0) p_value = 0.0;
1478        HV *restrict res = newHV();
1479        hv_stores(res, "statistic", newSVnv(statistic));
1480
30
        hv_stores(res, "p_value", newSVnv(p_value));
1481
0
        hv_stores(res, "method", newSVpv(method_desc, 0));
1482        hv_stores(res, "alternative", newSVpv(alternative, 0));
1483        RETVAL = newRV_noinc((SV*)res);
1484
90
}
1485
60
OUTPUT:
1486
60
    RETVAL
1487
1488
39
SV* wilcox_test(...)
1489
18
CODE:
1490
9
{
1491
9
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1492
6
        bool paired = false, correct = true;
1493
0
        double mu = 0.0;
1494
0
        short int exact = -1;
1495        const char *restrict alternative = "two.sided";
1496
6
        int arg_idx = 0;
1497
0
        // 1. Shift first positional argument as 'x' if it's an array reference
1498        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1499                x_sv = ST(arg_idx);
1500
30
                arg_idx++;
1501
3
        }
1502
27
        // 2. Shift second positional argument as 'y' if it's an array reference
1503
27
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1504
27
                y_sv = ST(arg_idx);
1505                arg_idx++;
1506
27
        }
1507
27
        // Ensure the remaining arguments form complete key-value pairs
1508
27
        if ((items - arg_idx) % 2 != 0) {
1509
24
                croak("Usage: wilcox_test(\\@x, [\\@y], key => value, ...)");
1510
24
        }
1511        // --- Parse named arguments from the remaining flat stack ---
1512
27
        for (; arg_idx < items; arg_idx += 2) {
1513
27
                const char *restrict key = SvPV_nolen(ST(arg_idx));
1514
27
                SV *restrict val = ST(arg_idx + 1);
1515                if      (strEQ(key, "x"))           x_sv = val;
1516
42
                else if (strEQ(key, "y"))           y_sv = val;
1517
15
                else if (strEQ(key, "paired"))      paired = SvTRUE(val);
1518
15
                else if (strEQ(key, "correct"))     correct = SvTRUE(val);
1519
99
                else if (strEQ(key, "mu"))          mu = SvNV(val);
1520
84
                else if (strEQ(key, "exact"))       {
1521
84
                        if (!SvOK(val)) exact = -1;
1522
84
                        else exact = SvTRUE(val) ? 1 : 0;
1523
84
                }
1524
84
                else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1525                else croak("wilcox_test: unknown argument '%s'", key);
1526        }
1527
99
        // --- Validate required / types ---
1528
84
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
1529
84
                croak("wilcox_test: 'x' is a required argument and must be an ARRAY reference");
1530
84
        AV *restrict x_av = (AV*)SvRV(x_sv);
1531
84
        size_t nx = av_len(x_av) + 1;
1532
84
        if (nx == 0) croak("Not enough 'x' observations");
1533
1534        AV *restrict y_av = NULL;
1535
15
        size_t ny = 0;
1536
15
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1537
15
                y_av = (AV*)SvRV(y_sv);
1538
15
                ny = av_len(y_av) + 1;
1539
15
        }
1540
15
        double p_value = 0.0, statistic = 0.0;
1541
183
        const char *restrict method_desc = "";
1542
15
        bool use_exact = false;
1543        // --- TWO SAMPLE (Mann-Whitney) ---
1544
15
        if (ny > 0 && !paired) {
1545
15
                RankInfo *restrict ri = (RankInfo *)safemalloc((nx + ny) * sizeof(RankInfo));
1546
15
                size_t valid_nx = 0, valid_ny = 0;
1547                for (size_t i = 0; i < nx; i++) {
1548
15
                        SV**restrict el = av_fetch(x_av, i, 0);
1549
0
                        if (el && SvOK(*el) && looks_like_number(*el)) {
1550
0
                                ri[valid_nx].val = SvNV(*el) - mu; // R subtracts mu from x
1551                                ri[valid_nx].idx = 1;
1552
15
                                valid_nx++;
1553
6
                        }
1554
6
                }
1555
6
                for (size_t i = 0; i < ny; i++) {
1556                        SV**restrict el = av_fetch(y_av, i, 0);
1557
6
                        if (el && SvOK(*el) && looks_like_number(*el)) {
1558
3
                                ri[valid_nx + valid_ny].val = SvNV(*el);
1559                                ri[valid_nx + valid_ny].idx = 2;
1560
0
                                valid_ny++;
1561
0
                        }
1562                }
1563                if (valid_nx == 0) { Safefree(ri); croak("not enough (non-missing) 'x' observations"); }
1564
9
                if (valid_ny == 0) { Safefree(ri); croak("not enough 'y' observations"); }
1565
9
                size_t total_n = valid_nx + valid_ny;
1566
9
                bool has_ties = 0;
1567
9
                double tie_adj = rank_and_count_ties(ri, total_n, &has_ties);
1568                double w_rank_sum = 0.0;
1569
9
                for (size_t i = 0; i < total_n; i++) if (ri[i].idx == 1) w_rank_sum += ri[i].rank;
1570
9
                statistic = w_rank_sum - (double)valid_nx * (valid_nx + 1.0) / 2.0;
1571
1572
0
                if (exact == 1) use_exact = true;
1573
0
                else if (exact == 0) use_exact = false;
1574                else use_exact = (valid_nx < 50 && valid_ny < 50 && !has_ties);
1575
1576                if (use_exact && has_ties) {
1577
9
                        warn("cannot compute exact p-value with ties; falling back to approximation");
1578
9
                        use_exact = false;
1579
9
                }
1580                if (use_exact) {
1581
15
                        method_desc = "Wilcoxon rank sum exact test";
1582                        double p_less = exact_pwilcox(statistic, valid_nx, valid_ny);
1583
12
                        double p_greater = 1.0 - exact_pwilcox(statistic - 1.0, valid_nx, valid_ny);
1584
1585
9
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
1586
9
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
1587
78
                        else {
1588
69
                                double p = (p_less < p_greater) ? p_less : p_greater;
1589
69
                                p_value = 2.0 * p;
1590
69
                        }
1591                } else {
1592
69
                        method_desc = correct ? "Wilcoxon rank sum test with continuity correction" : "Wilcoxon rank sum test";
1593
54
                        double exp = (double)valid_nx * valid_ny / 2.0;
1594
54
                        double var = ((double)valid_nx * valid_ny / 12.0) * ((total_n + 1.0) - tie_adj / (total_n * (total_n - 1.0)));
1595
54
                        double z = statistic - exp;
1596
1597
54
                        double CORRECTION = 0.0;
1598
54
                        if (correct) {
1599                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
1600
15
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
1601
15
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
1602
15
                        }
1603                        z = (z - CORRECTION) / sqrt(var);
1604
1605
9
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
1606
0
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
1607
0
                        else p_value = 2.0 * approx_pnorm(-fabs(z));
1608                }
1609
9
                Safefree(ri);
1610
78
        } else { // --- ONE SAMPLE / PAIRED ---
1611
69
                if (paired && (!y_av || nx != ny)) croak("'x' and 'y' must have the same length for paired test");
1612
69
                double *restrict diffs = (double *)safemalloc(nx * sizeof(double));
1613                size_t n_nz = 0;
1614
9
                bool has_zeroes = false;
1615
9
                for (size_t i = 0; i < nx; i++) {
1616
9
                        SV**restrict x_el = av_fetch(x_av, i, 0);
1617
78
                        if (!x_el || !SvOK(*x_el) || !looks_like_number(*x_el)) continue;
1618
69
                        double dx = SvNV(*x_el);
1619
1620
9
                        if (paired) {
1621
9
                                SV**restrict y_el = av_fetch(y_av, i, 0);
1622
9
                                if (!y_el || !SvOK(*y_el) || !looks_like_number(*y_el)) continue;
1623
9
                                double dy = SvNV(*y_el);
1624
0
                                double d = dx - dy - mu;
1625
0
                                if (d == 0.0) has_zeroes = true; // Drop exact zeroes
1626                                else diffs[n_nz++] = d;
1627
9
                        } else {
1628
0
                                double d = dx - mu;
1629
0
                                if (d == 0.0) has_zeroes = true;
1630                                else diffs[n_nz++] = d;
1631
9
                        }
1632
9
                }
1633
9
                if (n_nz == 0) {
1634
9
                        Safefree(diffs);
1635                        croak("not enough (non-missing) observations");
1636
9
                }
1637
9
                RankInfo *ri = (RankInfo *)safemalloc(n_nz * sizeof(RankInfo));
1638                for (size_t i = 0; i < n_nz; i++) {
1639
9
                        ri[i].val = fabs(diffs[i]);
1640
9
                        ri[i].idx = (diffs[i] > 0);
1641                }
1642                bool has_ties = 0;
1643
0
                double tie_adj = rank_and_count_ties(ri, n_nz, &has_ties);
1644
0
                statistic = 0.0;
1645
0
                for (size_t i = 0; i < n_nz; i++) {
1646
0
                        if (ri[i].idx) statistic += ri[i].rank;
1647
0
                }
1648
0
                if (exact == 1) use_exact = true;
1649
0
                else if (exact == 0) use_exact = false;
1650
0
                else use_exact = (n_nz < 50 && !has_ties);
1651
0
                if (use_exact && has_ties) {
1652                        warn("cannot compute exact p-value with ties; falling back to approximation");
1653
0
                        use_exact = false;
1654                }
1655
0
                if (use_exact && has_zeroes) {
1656
0
                        warn("cannot compute exact p-value with zeroes; falling back to approximation");
1657
0
                        use_exact = false;
1658                }
1659
9
                if (use_exact) {
1660                        method_desc = paired ? "Wilcoxon exact signed rank test" : "Wilcoxon exact signed rank test";
1661
24
                        double p_less = exact_psignrank(statistic, n_nz);
1662
24
                        double p_greater = 1.0 - exact_psignrank(statistic - 1.0, n_nz);
1663
1664
24
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
1665
24
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
1666
24
                        else {
1667
24
                                double p = (p_less < p_greater) ? p_less : p_greater;
1668                                p_value = 2.0 * p;
1669                        }
1670                } else {
1671                        method_desc = correct ? "Wilcoxon signed rank test with continuity correction" : "Wilcoxon signed rank test";
1672                        double exp = (double)n_nz * (n_nz + 1.0) / 4.0;
1673                        double var = (n_nz * (n_nz + 1.0) * (2.0 * n_nz + 1.0) / 24.0) - (tie_adj / 48.0);
1674                        double z = statistic - exp;
1675                        double CORRECTION = 0.0;
1676
9
                        if (correct) {
1677
9
                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
1678
9
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
1679
9
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
1680
9
                        }
1681
6
                        z = (z - CORRECTION) / sqrt(var);
1682
1683
6
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
1684                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
1685
3
                        else p_value = 2.0 * approx_pnorm(-fabs(z));
1686
3
                }
1687                Safefree(ri); Safefree(diffs);
1688        }
1689
9
        if (p_value > 1.0) p_value = 1.0;
1690
9
        HV *restrict res = newHV();
1691
9
        hv_stores(res, "statistic", newSVnv(statistic));
1692        hv_stores(res, "p_value", newSVnv(p_value));
1693
9
        hv_stores(res, "method", newSVpv(method_desc, 0));
1694
9
        hv_stores(res, "alternative", newSVpv(alternative, 0));
1695
6
        RETVAL = newRV_noinc((SV*)res);
1696
6
}
1697
18
OUTPUT:
1698
21
        RETVAL
1699
1700
12
SV* _chisq_c(data_ref)
1701
12
    SV* data_ref;
1702
42
CODE:
1703
30
{
1704
30
        AV*restrict obs_av = (AV*)SvRV(data_ref);
1705
30
        int r = av_top_index(obs_av) + 1, c = 0;
1706
30
        bool is_2d = 0;
1707
30
        SV**restrict first_elem = av_fetch(obs_av, 0, 0);
1708        if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
1709                is_2d = 1;
1710
18
                AV*restrict first_row = (AV*)SvRV(*first_elem);
1711
12
                c = av_top_index(first_row) + 1;
1712
12
        } else {
1713
12
                c = r;
1714
42
                r = 1;
1715
30
        }
1716
1717
30
        double stat = 0.0, grand_total = 0.0;
1718
30
        int df = 0;
1719
30
        int yates = (is_2d && r == 2 && c == 2) ? 1 : 0;
1720
1721
12
        AV*restrict expected_av = newAV();
1722
12
        if (is_2d) {
1723
12
                double *restrict row_sum = (double*)safemalloc(r * sizeof(double));
1724
12
                double *restrict col_sum = (double*)safemalloc(c * sizeof(double));
1725                for(unsigned int i=0; i<r; i++) row_sum[i] = 0.0;
1726
18
                for(unsigned int j=0; j<c; j++) col_sum[j] = 0.0;
1727                for (unsigned int i = 0; i < r; i++) {
1728                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
1729
12
                        AV*restrict row = (AV*)SvRV(*row_sv);
1730                        for (unsigned int j = 0; j < c; j++) {
1731
6
                                 SV**restrict val_sv = av_fetch(row, j, 0);
1732
6
                                 double val = SvNV(*val_sv);
1733                                 row_sum[i] += val;
1734
12
                                 col_sum[j] += val;
1735
9
                                 grand_total += val;
1736
9
                        }
1737                }
1738
3
                for (unsigned int i = 0; i < r; i++) {
1739
12
                        AV*restrict exp_row = newAV();
1740
9
                        SV**restrict row_sv = av_fetch(obs_av, i, 0);
1741
9
                        AV*restrict row = (AV*)SvRV(*row_sv);
1742
9
                        for (unsigned int j = 0; j < c; j++) {
1743
9
                                double E = (row_sum[i] * col_sum[j]) / grand_total;
1744                                SV**restrict val_sv = av_fetch(row, j, 0);
1745
3
                                double O = SvNV(*val_sv);
1746                                av_push(exp_row, newSVnv(E));
1747
9
                                if (yates) {
1748
9
                                  // Exact R logic: min(0.5, abs(O - E))
1749
9
                                  double abs_diff = fabs(O - E);
1750
9
                                  double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff;
1751
9
                                  double diff = abs_diff - y_corr;
1752
9
                                  stat += (diff * diff) / E;
1753
9
                                } else {
1754
6
                                  stat += ((O - E) * (O - E)) / E;
1755
3
                                }
1756                        }
1757
3
                        av_push(expected_av, newRV_noinc((SV*)exp_row));
1758                }
1759                safefree(row_sum); safefree(col_sum);
1760
3
                df = (r - 1) * (c - 1);
1761        } else {
1762
9
          for (unsigned int j = 0; j < c; j++) {
1763                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
1764                   grand_total += SvNV(*val_sv);
1765          }
1766          double E = grand_total / (double)c;
1767          for (unsigned int j = 0; j < c; j++) {
1768                   SV**restrict val_sv = av_fetch(obs_av, j, 0);
1769                   double O = SvNV(*val_sv);
1770                   av_push(expected_av, newSVnv(E));
1771                   stat += ((O - E) * (O - E)) / E;
1772
37
          }
1773
37
          df = c - 1;
1774
37
        }
1775        double p_val = get_p_value(stat, df);
1776        HV*restrict results = newHV();
1777
37
        hv_store(results, "statistic", 9, newSVnv(stat), 0);
1778
37
        hv_store(results, "df", 2, newSViv(df), 0);
1779
37
        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
1780
37
        hv_store(results, "expected", 8, newRV_noinc((SV*)expected_av), 0);
1781
37
        if (is_2d) {
1782                if (yates) {
1783                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test with Yates' continuity correction", 0), 0);
1784
37
                } else {
1785
37
                        hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test", 0), 0);
1786
37
                }
1787        } else {
1788          hv_store(results, "method", 6, newSVpv("Chi-squared test for given probabilities", 0), 0);
1789
37
        }
1790
37
        RETVAL = newRV_noinc((SV*)results);
1791
37
}
1792OUTPUT:
1793        RETVAL
1794
1795
67
PROTOTYPES: ENABLE
1796
1797
67
void write_table(...)
1798
67
PPCODE:
1799
67
{
1800
55
        SV *restrict data_sv = NULL;
1801
55
        SV *restrict file_sv = NULL;
1802
31
        unsigned int arg_idx = 0;
1803
1804        // Mimic the Perl shift logic
1805        if (arg_idx < items && SvROK(ST(arg_idx))) {
1806
37
          int type = SvTYPE(SvRV(ST(arg_idx)));
1807
0
          if (type == SVt_PVHV || type == SVt_PVAV) {
1808                   data_sv = ST(arg_idx);
1809
37
                   arg_idx++;
1810
37
          }
1811
0
        }
1812        if (arg_idx < items) {
1813          file_sv = ST(arg_idx);
1814
37
          arg_idx++;
1815
37
        }
1816
1817
37
        const char *restrict sep = ",";
1818
12
        SV *restrict row_names_sv = sv_2mortal(newSViv(1));
1819
3
        SV *restrict col_names_sv = NULL;
1820
1821        // Read the remaining Hash-style arguments
1822        for (; arg_idx < items; arg_idx += 2) {
1823
34
          if (arg_idx + 1 >= items) croak("write_table: Odd number of arguments passed");
1824
34
          const char *restrict key = SvPV_nolen(ST(arg_idx));
1825          SV *restrict val = ST(arg_idx + 1);
1826          if (strEQ(key, "data")) data_sv = val;
1827
34
          else if (strEQ(key, "col.names")) col_names_sv = val;
1828
25
          else if (strEQ(key, "file")) file_sv = val;
1829
25
          else if (strEQ(key, "row.names")) row_names_sv = val;
1830          else if (strEQ(key, "sep")) sep = SvPV_nolen(val);
1831
25
          else croak("write_table: Unknown arguments passed: %s", key);
1832
25
        }
1833
1834
0
        if (!data_sv || !SvROK(data_sv)) {
1835          croak("write_table: 'data' must be a HASH or ARRAY reference\n");
1836
25
        }
1837
25
        SV *restrict data_ref = SvRV(data_sv);
1838
0
        if (SvTYPE(data_ref) != SVt_PVHV && SvTYPE(data_ref) != SVt_PVAV) {
1839          croak("write_table: 'data' must be a HASH or ARRAY reference\n");
1840        }
1841
1842
25
        if (!file_sv || !SvOK(file_sv)) croak("write_table: file name missing\n");
1843        const char *restrict file = SvPV_nolen(file_sv);
1844
1845
88
        if (col_names_sv && SvOK(col_names_sv)) {
1846
63
          if (!SvROK(col_names_sv) || SvTYPE(SvRV(col_names_sv)) != SVt_PVAV) {
1847
63
                   croak("write_table: 'col.names' must be an ARRAY reference\n");
1848
0
          }
1849        }
1850
1851        bool is_hoh = 0, is_hoa = 0, is_aoh = 0;
1852
25
        AV *restrict rows_av = NULL;
1853
1854
12
        // Validate Input Structures & Homogeneity
1855
36
        if (SvTYPE(data_ref) == SVt_PVHV) {
1856
24
                HV *restrict hv = (HV*)data_ref;
1857                if (hv_iterinit(hv) == 0) XSRETURN_EMPTY;
1858
1859                HE *restrict entry = hv_iternext(hv);
1860
9
                SV *restrict first_val = hv_iterval(hv, entry);
1861
9
                if (!first_val || !SvROK(first_val)) {
1862
9
                        croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
1863
9
                }
1864
0
                int first_type = SvTYPE(SvRV(first_val));
1865                if (first_type != SVt_PVHV && first_type != SVt_PVAV) {
1866                        croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
1867
30
                }
1868
21
                is_hoh = (first_type == SVt_PVHV);
1869
21
                is_hoa = (first_type == SVt_PVAV);
1870
0
                hv_iterinit(hv);
1871                while ((entry = hv_iternext(hv))) {
1872                        SV *restrict val = hv_iterval(hv, entry);
1873
9
                        if (!val || !SvROK(val) || SvTYPE(SvRV(val)) != first_type) {
1874                                 croak("write_table: Mixed data types detected. Ensure all values are %s references.\n", is_hoh ? "HASH" : "ARRAY");
1875                        }
1876
34
                }
1877
34
                if (is_hoh) {
1878                        rows_av = newAV();
1879
34
                        hv_iterinit(hv);
1880
34
                        while ((entry = hv_iternext(hv))) {
1881
34
                                 av_push(rows_av, newSVsv(hv_iterkeysv(entry)));
1882                        }
1883                }
1884
34
        } else {
1885
15
                AV *restrict av = (AV*)data_ref;
1886
3
                if (av_len(av) < 0) XSRETURN_EMPTY;
1887
12
                SV **restrict first_ptr = av_fetch(av, 0, 0);
1888
9
                if (!first_ptr || !*first_ptr || !SvROK(*first_ptr) || SvTYPE(SvRV(*first_ptr)) != SVt_PVHV) {
1889
9
                        croak("write_table: For ARRAY data, all elements must be HASH references (Array of Hashes)\n");
1890                }
1891
1892
9
                for (size_t i = 0; i <= av_len(av); i++) {
1893
9
                        SV **restrict ptr = av_fetch(av, i, 0);
1894                        if (!ptr || !*ptr || !SvROK(*ptr) || SvTYPE(SvRV(*ptr)) != SVt_PVHV) {
1895
27
                                 croak("write_table: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n");
1896
18
                        }
1897
18
                }
1898                is_aoh = 1;
1899
54
        }
1900
1901        PerlIO *restrict fh = PerlIO_open(file, "w");
1902        if (!fh) croak("write_table: Could not open '%s' for writing", file);
1903
1904
9
        AV *restrict headers_av = newAV();
1905
36
        bool inc_rownames = (row_names_sv && SvTRUE(row_names_sv)) ? 1 : 0;
1906
27
        const char *restrict rownames_col = NULL;
1907
1908        // ----- Hash of Hashes -----
1909
9
        if (is_hoh) {
1910
36
                if (col_names_sv && SvOK(col_names_sv)) {
1911
9
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
1912
9
                        for(size_t i=0; i<=av_len(c_av); i++) {
1913                                SV **restrict c = av_fetch(c_av, i, 0);
1914
12
                                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
1915
12
                        }
1916                } else {
1917
12
                        HV *restrict col_map = newHV();
1918
12
                        hv_iterinit((HV*)data_ref);
1919
48
                        HE *restrict entry;
1920
36
                        while((entry = hv_iternext((HV*)data_ref))) {
1921
36
                                HV *restrict inner = (HV*)SvRV(hv_iterval((HV*)data_ref, entry));
1922                                hv_iterinit(inner);
1923
12
                                HE *restrict inner_entry;
1924
12
                                while((inner_entry = hv_iternext(inner))) {
1925                                        hv_store_ent(col_map, hv_iterkeysv(inner_entry), newSViv(1), 0);
1926
12
                                }
1927
12
                        }
1928
36
                        unsigned num_cols = hv_iterinit(col_map);
1929
24
                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
1930                        for(unsigned i=0; i<num_cols; i++) {
1931
12
                                HE *restrict ce = hv_iternext(col_map);
1932                                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
1933
12
                        }
1934
12
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
1935                        for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
1936
33
                        safefree(col_array);
1937
24
                        SvREFCNT_dec(col_map);
1938
24
          }
1939          size_t num_headers = av_len(headers_av) + 1;
1940
24
          const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
1941
1942          size_t h_idx = 0;
1943
99
          if (inc_rownames) header_row[h_idx++] = "";
1944
78
          for(unsigned short int i=0; i<num_headers; i++) {
1945
78
                   SV**restrict h_ptr = av_fetch(headers_av, i, 0);
1946
78
                   header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
1947
78
          }
1948
45
          print_string_row(fh, header_row, h_idx, sep);
1949
3
          safefree(header_row);
1950
1951
3
          size_t num_rows = av_len(rows_av) + 1;
1952
3
          const char **restrict row_array = safemalloc(num_rows * sizeof(char*));
1953
3
          for(size_t i=0; i<num_rows; i++) {
1954
3
                row_array[i] = SvPV_nolen(*av_fetch(rows_av, i, 0));
1955          }
1956
42
          qsort(row_array, num_rows, sizeof(char*), cmp_string_wt);
1957
1958
33
          HV *restrict data_hv = (HV*)data_ref;
1959          const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
1960
1961
21
          for(size_t i=0; i<num_rows; i++) {
1962                   size_t d_idx = 0;
1963
9
                   if (inc_rownames) row_data[d_idx++] = row_array[i];
1964
1965
22
                   SV **restrict inner_hv_ptr = hv_fetch(data_hv, row_array[i], strlen(row_array[i]), 0);
1966
13
                   HV *restrict inner_hv = inner_hv_ptr ? (HV*)SvRV(*inner_hv_ptr) : NULL;
1967
1968
13
                   for(size_t j=0; j<num_headers; j++) {
1969                       SV**restrict h_ptr = av_fetch(headers_av, j, 0);
1970
52
                       const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
1971
39
                       SV **restrict cell_ptr = inner_hv ? hv_fetch(inner_hv, col_name, strlen(col_name), 0) : NULL;
1972
39
                       if (cell_ptr && SvOK(*cell_ptr)) {
1973
39
                           if (SvROK(*cell_ptr)) {
1974                               PerlIO_close(fh);
1975                               safefree(row_array);
1976
16
                     safefree(row_data);
1977
3
                       if (headers_av) SvREFCNT_dec(headers_av);
1978
12
                       if (rows_av) SvREFCNT_dec(rows_av);
1979
9
                               croak("write_table: Cannot write nested reference types to table\n");
1980
9
                           }
1981                           row_data[d_idx++] = SvPV_nolen(*cell_ptr);
1982                       } else {
1983
10
                           row_data[d_idx++] = "NA";
1984
10
                       }
1985
40
                   }
1986
30
                   print_string_row(fh, row_data, d_idx, sep);
1987
30
          }
1988          safefree(row_array); safefree(row_data);
1989
1990
40
        } else if (is_hoa) { // ----- Hash of Arrays -----
1991
10
                HV *restrict data_hv = (HV*)data_ref;
1992                size_t max_rows = 0;
1993
13
                hv_iterinit(data_hv);
1994
13
                HE *restrict entry;
1995
0
                while((entry = hv_iternext(data_hv))) {
1996
0
                        AV *restrict arr = (AV*)SvRV(hv_iterval(data_hv, entry));
1997                        size_t len = av_len(arr) + 1;
1998
0
                        if (len > max_rows) max_rows = len;
1999
0
                }
2000
2001
0
                if (col_names_sv && SvOK(col_names_sv)) {
2002
0
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
2003
0
                        for(size_t i=0; i<=av_len(c_av); i++) {
2004                                 SV **restrict c = av_fetch(c_av, i, 0);
2005                                 if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2006
0
                        }
2007
0
                } else {
2008                        unsigned int num_cols = hv_iterinit(data_hv);
2009
13
                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2010
13
                        for(unsigned int i=0; i<num_cols; i++) {
2011
13
                                 HE *restrict ce = hv_iternext(data_hv);
2012
13
                                 col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2013
52
                        }
2014
39
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2015
39
                        for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2016                        safefree(col_array);
2017
13
                }
2018
13
                if (av_len(headers_av) < 0) croak("Could not get headers in write_table");
2019
13
                if (inc_rownames && contains_nondigit(row_names_sv)) {
2020
78
                        rownames_col = SvPV_nolen(row_names_sv);
2021
65
                        AV *restrict filtered_headers = (AV*)sv_2mortal((SV*)newAV());
2022
2023
53
                        for(size_t i=0; i<=av_len(headers_av); i++) {
2024
0
                                 SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2025
0
                                 if (!h_ptr || !*h_ptr) continue;
2026
0
                                 SV *restrict h_sv = *h_ptr;
2027
0
                                 if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2028
0
                                     av_push(filtered_headers, newSVsv(h_sv));
2029
0
                                 }
2030
0
                        }
2031
0
                        SvREFCNT_dec(headers_av);
2032
0
                        headers_av = filtered_headers;
2033
0
                }
2034                size_t num_headers = av_len(headers_av) + 1;
2035
0
                const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2036                size_t h_idx = 0;
2037
0
                if (inc_rownames) header_row[h_idx++] = "";
2038                for(size_t i=0; i<num_headers; i++) {
2039                        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2040
0
                        header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2041                }
2042                print_string_row(fh, header_row, h_idx, sep);
2043                safefree(header_row);
2044
53
                const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2045
53
                for(size_t i=0; i<max_rows; i++) {
2046                        size_t d_idx = 0;
2047                        if (inc_rownames) {
2048
260
                                 if (rownames_col) {
2049
195
                                     SV **restrict rn_arr_ptr = hv_fetch(data_hv, rownames_col, strlen(rownames_col), 0);
2050
195
                                     if (rn_arr_ptr && SvROK(*rn_arr_ptr)) {
2051
195
                                         AV *restrict rn_arr = (AV*)SvRV(*rn_arr_ptr);
2052
390
                                         SV **restrict rn_val_ptr = av_fetch(rn_arr, i, 0);
2053
195
                                         if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2054
195
                                             if (SvROK(*rn_val_ptr)) {
2055
195
                                                 PerlIO_close(fh);
2056
104
                                                 safefree(row_data);
2057
0
                                                 if (headers_av) SvREFCNT_dec(headers_av);
2058
0
                                                 croak("write_table: Cannot write nested reference types to table\n");
2059
0
                                             }
2060
0
                                             row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2061                                         } else {
2062
104
                                             row_data[d_idx++] = "NA";
2063                                         }
2064
91
                                     } else {
2065                                         row_data[d_idx++] = "NA";
2066                                     }
2067
0
                                 } else {
2068                                     char buf[32];
2069                                     snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2070
65
                                     row_data[d_idx++] = savepv(buf);
2071
65
                                 }
2072                        }
2073
13
                        for(size_t j=0; j<num_headers; j++) {
2074
9
                                 SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2075
9
                                 const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2076
9
                                 SV **restrict arr_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0);
2077
12
                                 if (arr_ptr && SvROK(*arr_ptr)) {
2078
3
                                     AV *restrict arr = (AV*)SvRV(*arr_ptr);
2079
9
                                     SV **restrict cell_ptr = av_fetch(arr, i, 0);
2080
6
                                     if (cell_ptr && SvOK(*cell_ptr)) {
2081
6
                                         if (SvROK(*cell_ptr)) {
2082                                             PerlIO_close(fh);
2083                                             safefree(row_data);
2084
6
                                             if (headers_av) SvREFCNT_dec(headers_av);
2085
21
                                             croak("write_table: Cannot write nested reference types to table\n");
2086
15
                                         }
2087
15
                                         row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2088
15
                                     } else {
2089
15
                                         row_data[d_idx++] = "NA";
2090                                     }
2091
42
                                 } else {
2092
27
                                     row_data[d_idx++] = "NA";
2093                                 }
2094                        }
2095                        print_string_row(fh, row_data, d_idx, sep);
2096
6
                        if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2097
6
                }
2098
21
                safefree(row_data);
2099
15
        } else if (is_aoh) {// ----- Array of Hashes -----
2100
15
                AV *restrict data_av = (AV*)data_ref;
2101                size_t num_rows = av_len(data_av) + 1;
2102
6
                if (col_names_sv && SvOK(col_names_sv)) {
2103
21
                        AV *restrict c_av = (AV*)SvRV(col_names_sv);
2104
6
                        for(size_t i=0; i<=av_len(c_av); i++) {
2105
6
                                 SV **restrict c = av_fetch(c_av, i, 0);
2106                                 if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2107
9
                        }
2108
0
                } else {
2109
0
                        HV *restrict col_map = newHV();
2110
0
                        for(size_t i=0; i<num_rows; i++) {
2111
0
                                 SV **restrict row_ptr = av_fetch(data_av, i, 0);
2112
0
                                 if (row_ptr && SvROK(*row_ptr)) {
2113
0
                                     HV *restrict row_hv = (HV*)SvRV(*row_ptr);
2114
0
                                     hv_iterinit(row_hv);
2115
0
                                     HE *restrict entry;
2116                                     while((entry = hv_iternext(row_hv))) {
2117                                         hv_store_ent(col_map, hv_iterkeysv(entry), newSViv(1), 0);
2118
0
                                     }
2119
0
                                 }
2120                        }
2121
9
                        unsigned num_cols = hv_iterinit(col_map);
2122
9
                        const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2123
9
                        for(unsigned int i=0; i<num_cols; i++) {
2124
9
                                 HE *restrict ce = hv_iternext(col_map);
2125
30
                                 col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2126
21
                        }
2127
21
                        qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2128                        for(unsigned int i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2129
9
                        safefree(col_array);
2130
9
                        SvREFCNT_dec(col_map);
2131
9
                }
2132
30
                if (inc_rownames && contains_nondigit(row_names_sv)) {
2133
21
                        rownames_col = SvPV_nolen(row_names_sv);
2134
21
                        AV *restrict filtered_headers = newAV();
2135
21
                        for(size_t i=0; i<=av_len(headers_av); i++) {
2136
21
                                 SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2137
9
                                 if (!h_ptr || !*h_ptr) continue;
2138
0
                                 SV *restrict h_sv = *h_ptr;
2139
0
                                 if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2140
0
                                     av_push(filtered_headers, newSVsv(h_sv));
2141
0
                                 }
2142
0
                        }
2143
0
                        SvREFCNT_dec(headers_av);
2144
0
                        headers_av = filtered_headers;
2145                }
2146
0
                size_t num_headers = av_len(headers_av) + 1;
2147                const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2148
0
                size_t h_idx = 0;
2149                if (inc_rownames) header_row[h_idx++] = "";
2150                for(size_t i=0; i<num_headers; i++) {
2151                        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2152
9
                        header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2153
9
                }
2154                print_string_row(fh, header_row, h_idx, sep);
2155                safefree(header_row);
2156                const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2157
72
                for(size_t i=0; i<num_rows; i++) {
2158
51
                        size_t d_idx = 0;
2159
51
                        SV **restrict row_ptr = av_fetch(data_av, i, 0);
2160
51
                        HV *restrict row_hv = (row_ptr && SvROK(*row_ptr)) ? (HV*)SvRV(*row_ptr) : NULL;
2161
51
                        if (inc_rownames) {
2162
39
                                if (rownames_col) {
2163
0
                                  SV **restrict rn_val_ptr = row_hv ? hv_fetch(row_hv, rownames_col, strlen(rownames_col), 0) : NULL;
2164
0
                                  if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2165
0
                                                if (SvROK(*rn_val_ptr)) {
2166
0
                                                         PerlIO_close(fh);
2167                                                                  safefree(row_data);
2168
39
                                                                  if (headers_av) SvREFCNT_dec(headers_av);
2169                                                         croak("write_table: Cannot write nested reference types to table\n");
2170
12
                                                }
2171                                                row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2172                                  } else {
2173
21
                                                row_data[d_idx++] = "NA";
2174
21
                                  }
2175                                } else {
2176
9
                                  char buf[32];
2177                                  snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2178
31
                                  row_data[d_idx++] = savepv(buf);
2179
31
                                }
2180
31
                        }
2181
2182                        for(size_t j=0; j<num_headers; j++) {
2183                                 SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2184                                 const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2185                                 SV **restrict cell_ptr = row_hv ? hv_fetch(row_hv, col_name, strlen(col_name), 0) : NULL;
2186                                 if (cell_ptr && SvOK(*cell_ptr)) {
2187                                     if (SvROK(*cell_ptr)) {
2188
39
                                         PerlIO_close(fh);
2189
39
                         safefree(row_data);
2190
39
                         if (headers_av) SvREFCNT_dec(headers_av);
2191
39
                                         croak("write_table: Cannot write nested reference types to table\n");
2192                                     }
2193                                     row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2194
39
                                 } else {
2195                                     row_data[d_idx++] = "NA";
2196
39
                                 }
2197
39
                        }
2198                        print_string_row(fh, row_data, d_idx, sep);
2199
0
                        if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2200                }
2201
39
                safefree(row_data);
2202
39
        }
2203        if (headers_av) SvREFCNT_dec(headers_av);
2204
39
        if (rows_av) SvREFCNT_dec(rows_av);
2205
39
        PerlIO_close(fh);
2206
0
        XSRETURN_EMPTY;
2207}
2208
2209SV*
2210
11925
_parse_csv_file(char* file, const char* sep_str, const char* comment_str, SV* callback = &PL_sv_undef)
2211
11886
INIT:
2212
11886
        PerlIO *restrict fp;
2213        AV *restrict data = NULL;
2214
11886
        AV *restrict current_row = newAV();
2215
11880
        SV *restrict field = newSVpvs("");
2216
11880
        bool in_quotes = 0, post_quote = 0;
2217
11088
        size_t sep_len, comment_len;
2218        SV *restrict line_sv;
2219        bool use_cb = 0;
2220
11886
CODE:
2221        if (SvOK(callback) && SvROK(callback) && SvTYPE(SvRV(callback)) == SVt_PVCV) {
2222
11883
                use_cb = 1;
2223
11886
        } else {
2224
11886
                data = newAV();
2225        }
2226
11883
        sep_len = sep_str ? strlen(sep_str) : 0;
2227        comment_len = comment_str ? strlen(comment_str) : 0;
2228
2229
11883
        fp = PerlIO_open(file, "r");
2230
0
        if (!fp) {
2231                croak("Could not open file '%s'", file);
2232        }
2233        line_sv = newSV_type(SVt_PV);
2234
880764
        // Read line by line using PerlIO
2235
868878
        while (sv_gets(line_sv, fp, 0) != NULL) {
2236
868878
                char *restrict line = SvPV_nolen(line_sv);
2237
868875
                size_t len = SvCUR(line_sv);
2238
66975
                // chomp \r\n (Handles Windows invisible \r natively)
2239
9
                if (len > 0 && line[len-1] == '\n') {
2240
9
                        len--;
2241
66966
                        if (len > 0 && line[len-1] == '\r') {
2242
33480
                                len--;
2243
33480
                        }
2244
33486
                }
2245
33483
                if (!in_quotes) {
2246                        // Skip completely empty lines (\h*[\r\n]+$ equivalent)
2247
801900
                        bool is_empty = 1;
2248
154845
                        for (size_t i = 0; i < len; i++) {
2249
154845
                                if (line[i] != ' ' && line[i] != '\t') { is_empty = 0; break; }
2250
154845
                        }
2251
154845
                        if (is_empty) continue;
2252
2253
647055
                        // Skip comments
2254                        if (comment_len > 0 && len >= comment_len && strncmp(line, comment_str, comment_len) == 0) {
2255                                continue;
2256
11886
                        }
2257                }
2258
6
                // --- CORE PARSING MACHINE ---
2259                for (size_t i = 0; i < len; i++) {
2260
11880
                        const char ch = line[i];
2261                        if (ch == '\r') continue;
2262
11880
                        if (ch == '"') {
2263
11880
                                if (in_quotes && (i + 1 < len) && line[i+1] == '"') {
2264                                        sv_catpvn(field, "\"", 1);
2265
11880
                                        i++; // Skip the escaped second quote
2266
11880
                                } else if (in_quotes) {
2267
11880
                                        in_quotes = 0;  // Close quotes
2268
11880
                                        post_quote = 1;
2269
11880
                                } else if (!post_quote) {
2270
11880
                                        in_quotes = 1; // Open quotes (only when not in post-quote state)
2271
11880
                                }
2272
11880
                        } else if (!in_quotes && sep_len > 0 && (len - i) >= sep_len && strncmp(line + i, sep_str, sep_len) == 0) {
2273
11880
                                av_push(current_row, newSVsv(field));
2274
11880
                                sv_setpvs(field, ""); // Reset for next field
2275
11880
                                i += sep_len - 1;     // Advance past multi-char separators
2276                                post_quote = 0;
2277
0
                        } else {
2278                                sv_catpvn(field, &ch, 1);
2279
11880
                        }
2280                }
2281                if (in_quotes) {
2282
39
                        // Line ended but quotes are still open! Append newline and fetch next
2283
39
                        sv_catpvn(field, "\n", 1);
2284                } else {
2285
39
                        post_quote = 0; // Reset post-quote state at row boundary
2286
3
                        // Push the final field of the record
2287
3
                        av_push(current_row, newSVsv(field));
2288
3
                        sv_setpvs(field, "");
2289
3
                        // If a callback is provided, invoke it in a streaming fashion
2290
3
                        if (use_cb) {
2291
3
                                dSP;
2292
3
                                ENTER;
2293
3
                                SAVETMPS;
2294
3
                                PUSHMARK(SP);
2295
3
                                XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2296
3
                                PUTBACK;
2297
3
                                call_sv(callback, G_DISCARD);
2298                                FREETMPS;
2299
0
                                LEAVE;
2300                                SvREFCNT_dec(current_row); // Frees the row from C memory if Perl didn't keep it
2301
3
                        } else {
2302                                av_push(data, newRV_noinc((SV*)current_row));
2303
39
                        }
2304
39
                        current_row = newAV();
2305                }
2306
39
        }
2307
39
        PerlIO_close(fp);
2308        SvREFCNT_dec(line_sv);
2309
2310        if (in_quotes) {
2311                av_push(current_row, newSVsv(field));
2312                if (use_cb) {
2313                        dSP;
2314                        ENTER;
2315                        SAVETMPS;
2316                        PUSHMARK(SP);
2317                        XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2318
12
                        PUTBACK;
2319
0
                        call_sv(callback, G_DISCARD);
2320                        FREETMPS;
2321
12
                        LEAVE;
2322
0
                        SvREFCNT_dec(current_row);
2323                } else {
2324                        av_push(data, newRV_noinc((SV*)current_row));
2325                }
2326
12
                current_row = newAV();
2327
6
        }
2328
3
        SvREFCNT_dec(field);
2329
0
        SvREFCNT_dec(current_row);
2330        if (use_cb) {
2331                RETVAL = &PL_sv_undef; // Memory was fully handled by callback stream
2332
12
        } else {
2333
12
                RETVAL = newRV_noinc((SV*)data);
2334
12
        }
2335
12
OUTPUT:
2336        RETVAL
2337
2338
0
SV* cov(SV* x_sv, SV* y_sv, const char* method = "pearson")
2339        CODE:
2340        {
2341                // 1. Validate inputs are Array References
2342                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
2343                        croak("cov: first argument 'x' must be an ARRAY reference");
2344
12
                }
2345
12
                if (!SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) {
2346
12
                        croak("cov: second argument 'y' must be an ARRAY reference");
2347                }
2348
2349
60
                // 2. Validate method argument
2350
60
                if (strcmp(method, "pearson") != 0 &&
2351                        strcmp(method, "spearman") != 0 &&
2352                        strcmp(method, "kendall") != 0) {
2353
60
                        croak("cov: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", method);
2354
60
                }
2355
2356                AV *restrict x_av = (AV*)SvRV(x_sv);
2357
60
                AV *restrict y_av = (AV*)SvRV(y_sv);
2358
60
                size_t nx = av_len(x_av) + 1;
2359
60
                size_t ny = av_len(y_av) + 1;
2360
2361                if (nx != ny) {
2362                        croak("cov: incompatible dimensions (x has %lu, y has %lu)",
2363                                   (unsigned long)nx, (unsigned long)ny);
2364                }
2365
2366
0
                // 3. Extract Valid Pairwise Data
2367
0
                // Allocate temporary C arrays for numeric processing
2368
0
                double *restrict x_val = (double*)safemalloc(nx * sizeof(double));
2369                double *restrict y_val = (double*)safemalloc(nx * sizeof(double));
2370
12
                size_t n = 0;
2371
2372                for (size_t i = 0; i < nx; i++) {
2373
12
                        SV **restrict x_tv = av_fetch(x_av, i, 0);
2374                        SV **restrict y_tv = av_fetch(y_av, i, 0);
2375
2376
90
                        // Extract numeric values, defaulting to NAN for missing/invalid data
2377
75
                        double xv = (x_tv && SvOK(*x_tv) && looks_like_number(*x_tv)) ? SvNV(*x_tv) : NAN;
2378
75
                        double yv = (y_tv && SvOK(*y_tv) && looks_like_number(*y_tv)) ? SvNV(*y_tv) : NAN;
2379
2380                        // Pairwise complete observations (skips NAs seamlessly like R)
2381                        if (!isnan(xv) && !isnan(yv)) {
2382                                 x_val[n] = xv;
2383
9
                                 y_val[n] = yv;
2384                                 n++;
2385
9
                        }
2386                }
2387
2388
3
                // 4. Handle edge cases where data is too sparse
2389                if (n < 2) {
2390                        Safefree(x_val);        Safefree(y_val);
2391
3
                        RETVAL = newSVnv(NAN);
2392
3
                } else {
2393                        double ans = 0.0;                       
2394
18
                        // 5. Algorithm routing
2395
15
                        if (strcmp(method, "kendall") == 0) {
2396
15
                                 // R's default cov(..., method="kendall") iterates the full n x n space
2397
15
                                 for (size_t i = 0; i < n; i++) {
2398
15
                                     for (size_t j = 0; j < n; j++) {
2399
15
                                         int sx = (x_val[i] > x_val[j]) - (x_val[i] < x_val[j]);
2400                                         int sy = (y_val[i] > y_val[j]) - (y_val[i] < y_val[j]);
2401                                         ans += (double)(sx * sy);
2402
3
                                     }
2403
3
                                 }
2404                        } else {
2405                                 double mean_x = 0.0, mean_y = 0.0, cov_sum = 0.0;
2406
36
                                 if (strcmp(method, "spearman") == 0) {
2407
30
                                     // Spearman: Rank the data first, then run standard covariance
2408
30
                                     double *restrict rx = (double*)safemalloc(n * sizeof(double));
2409
30
                                     double *restrict ry = (double*)safemalloc(n * sizeof(double));
2410
2411
30
                                     // Uses your existing rank_data() helper from LikeR.xs
2412                                     rank_data(x_val, rx, n);
2413                                     rank_data(y_val, ry, n);
2414
2415                                     for (size_t i = 0; i < n; i++) {
2416
9
                                         double dx = rx[i] - mean_x;
2417                                         mean_x += dx / (i + 1);
2418                                         double dy = ry[i] - mean_y;
2419
12
                                         mean_y += dy / (i + 1);
2420
12
                                         cov_sum += dx * (ry[i] - mean_y);
2421
12
                                     }
2422
2423                                     Safefree(rx);
2424                                     Safefree(ry);
2425                                 } else {
2426                                     // Pearson: Welford's Single-Pass Covariance Algorithm
2427                                     for (size_t i = 0; i < n; i++) {
2428                                         double dx = x_val[i] - mean_x;
2429                                         mean_x += dx / (i + 1);
2430
21
                                         double dy = y_val[i] - mean_y;
2431
21
                                         mean_y += dy / (i + 1);
2432
21
                                         cov_sum += dx * (y_val[i] - mean_y);
2433                                     }
2434                                 }
2435
2436                                 // Unbiased Sample Covariance (N - 1) for Pearson & Spearman
2437
21
                                 ans = cov_sum / (n - 1);
2438
21
                        }
2439
2440
21
                        Safefree(x_val);
2441
21
                        Safefree(y_val);
2442
21
                        RETVAL = newSVnv(ans);
2443
21
                }
2444
21
        }
2445
21
        OUTPUT:
2446          RETVAL
2447
2448
21
SV* glm(...)
2449
21
CODE:
2450
21
{
2451
21
        const char *restrict formula  = NULL;
2452        SV *restrict data_sv = NULL;
2453
21
        const char *restrict family_str = "gaussian";
2454
21
        char f_cpy[512];
2455
21
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
2456
2457        // Dynamic Term Arrays
2458        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
2459        bool *restrict is_dummy = NULL;
2460        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
2461        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
2462
21
        size_t n = 0, valid_n = 0, i;
2463        bool has_intercept = true, converged = false, boundary = false;
2464
78
        int iter = 0, max_iter = 25, final_rank = 0, df_res = 0;
2465
57
        double deviance_old = 0.0, deviance_new = 0.0, null_dev = 0.0, aic = 0.0;
2466
57
        double dispersion = 0.0, epsilon = 1e-8;
2467
2468
36
        char **restrict row_names = NULL;
2469
15
        char **restrict valid_row_names = NULL;
2470
0
        HV **restrict row_hashes = NULL;
2471        HV *restrict data_hoa = NULL;
2472
21
        SV *restrict ref = NULL;
2473
2474        double *restrict X = NULL, *restrict Y = NULL, *restrict mu = NULL, *restrict eta = NULL;
2475
21
        double *restrict W = NULL, *restrict Z = NULL, *restrict beta = NULL, *restrict beta_old = NULL;
2476
21
        bool *restrict aliased = NULL;
2477
21
        double *restrict XtWX = NULL, *restrict XtWZ = NULL;
2478
2479        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
2480
21
        AV *restrict terms_av;
2481
21
        HE *restrict entry;
2482
2483        if (items % 2 != 0) croak("Usage: glm(formula => 'am ~ wt + hp', data => \\%mtcars)");
2484
2485
294
        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
2486
21
          const char *restrict key = SvPV_nolen(ST(i_arg));
2487          SV *restrict val = ST(i_arg + 1);
2488
21
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
2489
21
          else if (strEQ(key, "data"))    data_sv = val;
2490
21
          else if (strEQ(key, "family"))  family_str = SvPV_nolen(val);
2491
21
          else croak("glm: unknown argument '%s'", key);
2492        }        
2493
21
        if (!formula) croak("glm: formula is required");
2494
21
        if (!data_sv || !SvROK(data_sv)) croak("glm: data is required and must be a reference");
2495
2496
21
        bool is_binomial = (strcmp(family_str, "binomial") == 0);
2497
60
        bool is_gaussian = (strcmp(family_str, "gaussian") == 0);
2498
39
        if (!is_binomial && !is_gaussian) croak("glm: unsupported family '%s'", family_str);
2499
2500
0
        // --- Formula Parsing & Expansion ---
2501        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
2502
39
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
2503
0
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
2504
2505        src = (char*)formula; dst = f_cpy;
2506
39
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
2507
39
        *dst = '\0';
2508
2509
0
        tilde = strchr(f_cpy, '~');
2510
0
        if (!tilde) croak("glm: invalid formula, missing '~'");
2511
0
        *tilde = '\0';
2512        lhs = f_cpy; rhs = tilde + 1;
2513
2514
0
        if (strstr(rhs, "-1")) has_intercept = false;
2515
0
        if (has_intercept) terms[num_terms++] = savepv("Intercept");
2516
2517
0
        chunk = strtok(rhs, "+");
2518        while (chunk != NULL) {
2519
39
          if (num_terms >= term_cap - 3) {
2520
39
                   term_cap *= 2;
2521
39
                   Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
2522          }
2523
39
          if (strcmp(chunk, "1") == 0 || strcmp(chunk, "-1") == 0) {
2524                   chunk = strtok(NULL, "+");
2525                   continue;
2526
81
          }
2527
60
          char *restrict star = strchr(chunk, '*');
2528
117
          if (star) {
2529
57
                   *star = '\0';
2530                   char *restrict left = chunk; char *restrict right = star + 1;
2531
60
                   char *restrict c_l = strchr(left, '^'); if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
2532                   char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
2533
2534                   terms[num_terms++] = savepv(left);
2535                   terms[num_terms++] = savepv(right);
2536
21
                   size_t inter_len = strlen(left) + strlen(right) + 2;
2537
21
                   terms[num_terms] = (char*)safemalloc(inter_len);
2538
21
                   snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
2539
21
          } else {
2540
21
                   char *restrict c_chunk = strchr(chunk, '^');
2541
21
                   if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
2542
21
                   terms[num_terms++] = savepv(chunk);
2543
21
          }
2544
6
          chunk = strtok(NULL, "+");
2545
6
        }
2546
2547
366
        for (i = 0; i < num_terms; i++) {
2548
360
          bool found = false;
2549
360
          for (size_t j = 0; j < num_uniq; j++) {
2550                   if (strcmp(terms[i], uniq_terms[j]) == 0) { found = true; break; }
2551
15
          }
2552
15
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
2553
15
        }
2554
15
        p = num_uniq;
2555
2556        // --- Data Extraction ---
2557
480
        ref = SvRV(data_sv);
2558
480
        if (SvTYPE(ref) == SVt_PVHV) {
2559
480
                HV*restrict hv = (HV*)ref;
2560                if (hv_iterinit(hv) == 0) croak("glm: Data hash is empty");
2561
0
                entry = hv_iternext(hv);
2562                if (entry) {
2563
0
                        SV*restrict val = hv_iterval(hv, entry);
2564
0
                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
2565
0
                                 data_hoa = hv;
2566
0
                                 n = av_len((AV*)SvRV(val)) + 1;
2567
0
                                 Newx(row_names, n, char*);
2568
0
                                 for(i = 0; i < n; i++) {
2569
0
                                     char buf[32]; snprintf(buf, sizeof(buf), "%lu", i+1);
2570
0
                                     row_names[i] = savepv(buf);
2571
0
                                 }
2572
0
                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
2573                                 n = hv_iterinit(hv);
2574
0
                                 Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
2575
0
                                 i = 0;
2576
0
                                 while ((entry = hv_iternext(hv))) {
2577                                     unsigned int len;
2578                                     row_names[i] = savepv(hv_iterkey(entry, &len));
2579
0
                                     row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
2580                                     i++;
2581                                 }
2582
81
                        } else croak("glm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
2583
60
                }
2584
0
        } else if (SvTYPE(ref) == SVt_PVAV) {
2585
0
          AV*restrict av = (AV*)ref;
2586
0
          n = av_len(av) + 1;
2587          Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
2588
60
          for (i = 0; i < n; i++) {
2589
21
                   SV**restrict val = av_fetch(av, i, 0);
2590                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
2591
39
                       row_hashes[i] = (HV*)SvRV(*val);
2592
3
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", i + 1);
2593
3
                       row_names[i] = savepv(buf);
2594
183
                   } else {
2595
180
                       for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
2596
180
                       Safefree(row_names); Safefree(row_hashes);
2597
180
                       croak("glm: Array values must be HashRefs (AoH)");
2598
270
                   }
2599
264
          }
2600        } else croak("glm: Data must be an Array or Hash reference");
2601
2602
6
        // --- Categorical Expansion ---
2603
6
        for (size_t j = 0; j < p; j++) {
2604          if (p_exp + 32 >= exp_cap) {
2605
180
                   exp_cap *= 2;
2606                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
2607                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
2608
3
          }
2609
6
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
2610
6
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = false; p_exp++; continue;
2611
3
          }
2612
3
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
2613                   char **restrict levels = NULL; size_t num_levels = 0, levels_cap = 8;
2614                   Newx(levels, levels_cap, char*);
2615                   for (i = 0; i < n; i++) {
2616
6
                       char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
2617
3
                       if (str_val) {
2618
0
                           bool found = false;
2619
0
                           for (size_t l = 0; l < num_levels; l++) {
2620
0
                               if (strcmp(levels[l], str_val) == 0) { found = true; break; }
2621                           }
2622
3
                           if (!found) {
2623
3
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
2624
3
                               levels[num_levels++] = savepv(str_val);
2625
3
                           }
2626
3
                           Safefree(str_val);
2627                       }
2628
9
                   }
2629
3
                   if (num_levels > 0) {
2630                       for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
2631
0
                           for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
2632                               if (strcmp(levels[l1], levels[l2]) > 0) {
2633                                   char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
2634
36
                               }
2635                           }
2636                       }
2637
21
                       for (size_t l = 1; l < num_levels; l++) {
2638                           if (p_exp >= exp_cap) {
2639
21
                               exp_cap *= 2;
2640
21
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
2641                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
2642                           }
2643
861
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
2644
840
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
2645
840
                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
2646                           is_dummy[p_exp] = true; dummy_base[p_exp] = savepv(uniq_terms[j]); dummy_level[p_exp] = savepv(levels[l]);
2647
840
                           p_exp++;
2648
840
                       }
2649
3180
                       for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
2650
2340
                       Safefree(levels);
2651
840
                   } else {
2652
1500
                       Safefree(levels); exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
2653
180
                   }
2654
180
          } else {
2655
180
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
2656
180
          }
2657
0
        }
2658        p = p_exp;
2659
2660
1320
        Newx(X, n * p, double); Newx(Y, n, double);
2661        Newx(valid_row_names, n, char*);
2662
2663
840
        // --- Listwise Deletion ---
2664
840
        for (size_t i = 0; i < n; i++) {
2665
3180
                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
2666
840
                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
2667
2668
840
                bool row_ok = true;
2669                double *restrict row_x = (double*)safemalloc(p * sizeof(double));
2670
21
                for (size_t j = 0; j < p; j++) {
2671
21
                        if (strcmp(exp_terms[j], "Intercept") == 0) {
2672
0
                                 row_x[j] = 1.0;
2673
0
                        } else if (is_dummy[j]) {
2674                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
2675                                 if (str_val) {
2676
21
                                     row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
2677
21
                                     Safefree(str_val);
2678
21
                                 } else { row_ok = false; break; }
2679
21
                        } else {
2680
21
                                 row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
2681
81
                                 if (isnan(row_x[j])) { row_ok = false; break; }
2682                        }
2683
21
                }
2684
861
                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
2685
21
                Y[valid_n] = y_val;
2686
861
                for (size_t j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
2687
840
                valid_row_names[valid_n] = row_names[i];
2688
96
                valid_n++;
2689
96
                Safefree(row_x);
2690
96
        }
2691
96
        Safefree(row_names);
2692
96
        if (valid_n <= p) {
2693
39
          Safefree(X); Safefree(Y); Safefree(valid_row_names); if (row_hashes) Safefree(row_hashes);
2694
0
          croak("glm: 0 degrees of freedom (too many NAs or parameters > observations)");
2695
96
        }
2696        // --- R glm.fit IRLS Implementation ---
2697
744
        mu = (double*)safemalloc(valid_n * sizeof(double)); eta = (double*)safemalloc(valid_n * sizeof(double));
2698
744
        W = (double*)safemalloc(valid_n * sizeof(double)); Z = (double*)safemalloc(valid_n * sizeof(double));
2699        beta = (double*)safemalloc(p * sizeof(double)); beta_old = (double*)safemalloc(p * sizeof(double));
2700        aliased = (bool*)safemalloc(p * sizeof(bool));
2701        XtWX = (double*)safemalloc(p * p * sizeof(double)); XtWZ = (double*)safemalloc(p * sizeof(double));
2702
63
        for (i = 0; i < p; i++) { beta[i] = 0.0; beta_old[i] = 0.0; }
2703
2415
        // Initialize (mustart / etastart equivalent)
2704
2352
        double sum_y = 0.0;
2705
864
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
2706
864
        double mean_y = sum_y / valid_n;
2707
864
        for (i = 0; i < valid_n; i++) {
2708
864
          if (is_binomial) {
2709
864
                   if (Y[i] < 0.0 || Y[i] > 1.0) croak("glm: binomial family requires response between 0 and 1");
2710                   mu[i] = (Y[i] + 0.5) / 2.0;
2711
1488
                   eta[i] = log(mu[i] / (1.0 - mu[i]));
2712
1488
                   double dev = 0.0;
2713                   if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
2714                   else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
2715                   else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
2716
783
                   deviance_old += dev;
2717
2415
          } else {
2718
2352
                   mu[i] = mean_y; // R gaussian init
2719
9048
                   eta[i] = mu[i];
2720
6696
          }
2721
6696
        }
2722
26064
        // IRLS Loop
2723        for (iter = 1; iter <= max_iter; iter++) {
2724                for (i = 0; i < valid_n; i++) {
2725
63
                        if (is_binomial) {
2726
246
                                 double varmu = mu[i] * (1.0 - mu[i]);
2727
183
                                 double mu_eta = varmu; // Link derivative for logit
2728
183
                                 if (varmu < 1e-10) varmu = 1e-10;
2729
720
                                 Z[i] = eta[i] + (Y[i] - mu[i]) / mu_eta;
2730
183
                                 W[i] = (mu_eta * mu_eta) / varmu;
2731                        } else {
2732                                 W[i] = 1.0;
2733                                 Z[i] = Y[i];
2734
63
                        }
2735
693
                }
2736
630
                // Formulate XtWX and XtWZ
2737
24150
                for (i = 0; i < p; i++) { XtWZ[i] = 0.0; for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
2738
23520
                for (size_t k = 0; k < valid_n; k++) {
2739
90480
                        double w = W[k], z = Z[k];
2740
23520
                        for (i = 0; i < p; i++) {
2741
23520
                                 XtWZ[i] += X[k * p + i] * w * z;
2742
8640
                                 double xw = X[k * p + i] * w;
2743                                 for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
2744
8640
                        }
2745
8640
                }
2746                final_rank = sweep_matrix_ols(XtWX, p, aliased);
2747
8640
                for (i = 0; i < p; i++) {
2748
8640
                        if (aliased[i]) { beta[i] = NAN; } else {
2749
3510
                                 double sum = 0.0;
2750
0
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) sum += XtWX[i * p + j] * XtWZ[j];
2751
8640
                                 beta[i] = sum;
2752                        }
2753
14880
                }
2754
14880
                // Calculate updated ETA, MU, and Deviance (with Step-Halving)
2755
14880
                boundary = false;
2756                for (unsigned short int half = 0; half < 10; half++) {
2757                        deviance_new = 0.0;
2758                        for (i = 0; i < valid_n; i++) {
2759
630
                                 double linear_pred = 0.0;
2760
600
                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) linear_pred += X[i * p + j] * beta[j];
2761                                 eta[i] = linear_pred;
2762                                 if (is_binomial) {
2763
30
                                     mu[i] = 1.0 / (1.0 + exp(-eta[i]));
2764
120
                                     // Boundary enforcement
2765                                     if (mu[i] < 10 * DBL_EPSILON) mu[i] = 10 * DBL_EPSILON;
2766                                     if (mu[i] > 1.0 - 10 * DBL_EPSILON) mu[i] = 1.0 - 10 * DBL_EPSILON;
2767
2768
21
                                     double dev = 0.0;
2769                                     if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
2770
42
                                     else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
2771
165
                                     else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
2772                                     deviance_new += dev;
2773                                 } else {
2774
255
                                     mu[i] = eta[i];
2775
861
                                     double res = Y[i] - mu[i];
2776
840
                                     deviance_new += res * res;
2777
840
                                 }
2778
3180
                        }
2779
2340
                        // Step halving divergence check
2780
9000
                        if (!is_binomial || deviance_new <= deviance_old + 1e-7 || !isfinite(deviance_new)) {
2781                                 continue;
2782                        }
2783
2784                        boundary = true;
2785
21
                        for (size_t j = 0; j < p; j++) beta[j] = (beta[j] + beta_old[j]) / 2.0;
2786
861
                }
2787
840
                // Convergence Check
2788
96
                if (fabs(deviance_new - deviance_old) / (0.1 + fabs(deviance_new)) < epsilon) {
2789
39
                        converged = true; break;
2790
0
                }
2791                deviance_old = deviance_new;
2792
744
                for (size_t j = 0; j < p; j++) beta_old[j] = beta[j];
2793
744
        }
2794        // Final accurate calculation of W for standard errors
2795        for (i = 0; i < p; i++) { for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
2796        for (size_t k = 0; k < valid_n; k++) {
2797
21
          double w = is_binomial ? (mu[k] * (1.0 - mu[k])) : 1.0;
2798
18
          if (w < 1e-10) w = 1e-10;
2799
18
          for (i = 0; i < p; i++) {
2800
3
                   double xw = X[k * p + i] * w;
2801
3
                   for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
2802          }
2803        }
2804
21
        final_rank = sweep_matrix_ols(XtWX, p, aliased);
2805
21
        // --- Null Deviance Calculation ---
2806
21
        double wtdmu = mean_y; // Since weights are 1.0 initially
2807
861
        for (i = 0; i < valid_n; i++) {
2808
840
          if (is_binomial) {
2809
840
                   if (Y[i] == 0.0)      null_dev += -2.0 * log(1.0 - wtdmu);
2810                   else if (Y[i] == 1.0) null_dev += -2.0 * log(wtdmu);
2811
96
                   else null_dev += 2.0 * (Y[i] * log(Y[i] / wtdmu) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - wtdmu)));
2812
96
          } else {
2813
39
                   double diff = Y[i] - wtdmu;
2814
0
                   null_dev += diff * diff;
2815
96
          }
2816        }
2817
840
        // --- AIC Calculation ---
2818
840
        if (is_gaussian) {
2819
840
          double n_f = (double)valid_n;
2820          aic = n_f * (log(2.0 * M_PI) + 1.0 + log(deviance_new / n_f)) + 2.0 * (final_rank + 1.0);
2821
21
        } else if (is_binomial) {
2822          aic = deviance_new + 2.0 * final_rank;
2823
21
        }
2824
81
        // --- Return Structures ---
2825
60
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
2826
60
        df_res = valid_n - final_rank;
2827        dispersion = is_binomial ? 1.0 : ((df_res > 0) ? (deviance_new / df_res) : NAN);
2828
60
        for (size_t i = 0; i < valid_n; i++) {
2829
60
                double res = Y[i] - mu[i];
2830
0
                if (is_binomial) {
2831
0
                        // Deviance residuals for binomial
2832
0
                        double d_res = 0.0;
2833
0
                        if (Y[i] == 0.0)      d_res = sqrt(-2.0 * log(1.0 - mu[i]));
2834                        else if (Y[i] == 1.0) d_res = sqrt(-2.0 * log(mu[i]));
2835
60
                        else d_res = sqrt(2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))));
2836
60
                        res = (Y[i] > mu[i]) ? d_res : -d_res;
2837
60
                }
2838                hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(mu[i]), 0);
2839
60
                hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0);
2840
60
                Safefree(valid_row_names[i]);
2841
60
        }
2842
60
        Safefree(valid_row_names);
2843
2844
60
        summary_hv = newHV(); terms_av = newAV();
2845        for (size_t j = 0; j < p; j++) {
2846                hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
2847
21
                av_push(terms_av, newSVpv(exp_terms[j], 0));
2848
2849
21
                HV *restrict row_hv = newHV();
2850
21
                if (aliased[j]) {
2851
21
                        hv_store(row_hv, "Estimate",   8, newSVpv("NaN", 0), 0);
2852
21
                        hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
2853
21
                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVpv("NaN", 0), 0);
2854
21
                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVpv("NaN", 0), 0);
2855
21
                } else {
2856
21
                        double se = sqrt(dispersion * XtWX[j * p + j]);
2857
21
                        double val_stat = beta[j] / se;
2858
21
                        double p_val = is_binomial ? 2.0 * (1.0 - approx_pnorm(fabs(val_stat))) : get_t_pvalue(val_stat, df_res, "two.sided");
2859
2860
21
                        hv_store(row_hv, "Estimate",   8, newSVnv(beta[j]), 0);
2861
21
                        hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0);
2862                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVnv(val_stat), 0);
2863                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVnv(p_val), 0);
2864
81
                }
2865
21
                hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
2866
81
        }
2867
2868
81
        hv_store(res_hv, "aic",            3, newSVnv(aic), 0);
2869
60
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv), 0);
2870
60
        hv_store(res_hv, "converged",      9, newSVuv(converged ? 1 : 0), 0);
2871        hv_store(res_hv, "boundary",       8, newSVuv(boundary ? 1 : 0), 0);
2872
21
        hv_store(res_hv, "deviance",       8, newSVnv(deviance_new), 0);
2873        hv_store(res_hv, "deviance.resid", 14, newRV_noinc((SV*)resid_hv), 0);
2874
21
        hv_store(res_hv, "df.null",        7, newSVuv(valid_n - has_intercept), 0);
2875
21
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res), 0);
2876
21
        hv_store(res_hv, "family",         6, newSVpv(family_str, 0), 0);
2877
21
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
2878        hv_store(res_hv, "iter",           4, newSVuv(iter > max_iter ? max_iter : iter), 0);
2879
21
        hv_store(res_hv, "null.deviance", 13, newSVnv(null_dev), 0);
2880        hv_store(res_hv, "rank",           4, newSVuv(final_rank), 0);
2881        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv), 0);
2882        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av), 0);
2883
2884        // --- Cleanup ---
2885        for (i = 0; i < num_terms; i++) Safefree(terms[i]);
2886        Safefree(terms);
2887
21
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]);
2888
0
        Safefree(uniq_terms);
2889        for (size_t j = 0; j < p_exp; j++) {
2890
21
                Safefree(exp_terms[j]);
2891                if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
2892
21
        }
2893
21
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
2894
2895
21
        Safefree(mu); Safefree(eta); Safefree(Z); Safefree(W);
2896
21
        Safefree(beta); Safefree(beta_old); Safefree(aliased);
2897        Safefree(XtWX); Safefree(XtWZ); Safefree(X); Safefree(Y);
2898        if (row_hashes) Safefree(row_hashes);
2899
2900
84
        RETVAL = newRV_noinc((SV*)res_hv);
2901
84
}
2902OUTPUT:
2903
84
    RETVAL
2904
2905
42
SV* cor_test(...)
2906
42
CODE:
2907
21
{
2908
0
        if (items < 2 || items % 2 != 0)
2909                croak("Usage: cor_test(\\@x, \\@y, method => 'pearson', ...)");
2910
2911        SV *restrict x_ref = ST(0), *restrict y_ref = ST(1);
2912
2913
21
        const char *restrict alternative = "two.sided";
2914        const char *restrict method = "pearson";
2915
21
        SV *restrict exact_sv = NULL;
2916
21
        double conf_level = 0.95;
2917
21
        bool continuity = 0;
2918
2919        /* Parse named arguments from the flat stack starting at index 2 */
2920
21
        for (unsigned short int i = 2; i < items; i += 2) {
2921
21
          const char *restrict key = SvPV_nolen(ST(i));
2922
0
          SV *restrict val = ST(i + 1);
2923
2924          if      (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
2925
21
          else if (strEQ(key, "method"))      method = SvPV_nolen(val);
2926
21
          else if (strEQ(key, "exact"))       exact_sv = val;
2927          else if (strEQ(key, "conf.level") || strEQ(key, "conf_level")) conf_level = SvNV(val);
2928
21
          else if (strEQ(key, "continuity"))  continuity = SvTRUE(val);
2929
21
          else croak("cor_test: unknown argument '%s'", key);
2930        }
2931
2932
21
        AV *restrict x_av, *restrict y_av;
2933        double *restrict x, *restrict y;
2934
21
        double estimate = 0, p_value = 0, statistic = 0, df = 0, ci_lower = 0, ci_upper = 0;
2935
2936
117
        bool is_pearson  = (strcmp(method, "pearson") == 0);
2937
117
        bool is_kendall  = (strcmp(method, "kendall") == 0);
2938        bool is_spearman = (strcmp(method, "spearman") == 0);
2939
117
        HV *restrict rhv;
2940
2941        if (!SvOK(x_ref) || !SvROK(x_ref) || SvTYPE(SvRV(x_ref)) != SVt_PVAV ||
2942            !SvOK(y_ref) || !SvROK(y_ref) || SvTYPE(SvRV(y_ref)) != SVt_PVAV) {
2943
117
          croak("cor_test: x and y must be array references");
2944
105
        }
2945
2946
105
        x_av = (AV*)SvRV(x_ref);
2947        y_av = (AV*)SvRV(y_ref);
2948
2949        size_t n_raw = av_len(x_av) + 1;
2950
21
        if (n_raw != av_len(y_av) + 1) croak("incompatible dimensions");
2951
2952
0
        x = safemalloc(n_raw * sizeof(double));
2953
0
        y = safemalloc(n_raw * sizeof(double));
2954
2955        size_t n = 0; /* Final count of pairwise complete observations */
2956
21
        for (size_t i = 0; i < n_raw; i++) {
2957          SV **restrict x_val = av_fetch(x_av, i, 0);
2958
12
          SV **restrict y_val = av_fetch(y_av, i, 0);
2959
2960
60
          double xv = (x_val && SvOK(*x_val) && looks_like_number(*x_val)) ? SvNV(*x_val) : NAN;
2961
60
          double yv = (y_val && SvOK(*y_val) && looks_like_number(*y_val)) ? SvNV(*y_val) : NAN;
2962
2963
60
          /* Pairwise complete observations (skips NAs seamlessly like R) */
2964
60
          if (!isnan(xv) && !isnan(yv)) {
2965
60
              x[n] = xv;
2966
60
              y[n] = yv;
2967              n++;
2968
12
          }
2969
12
        }
2970
2971        if (n < 3) {
2972          Safefree(x);
2973
12
          Safefree(y);
2974
12
          croak("not enough finite observations");
2975
12
        }
2976
2977
12
        if (is_pearson) {
2978
12
          // Welford's Method for Pearson Correlation
2979          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
2980          for (size_t i = 0; i < n; i++) {
2981
12
                   double dx = x[i] - mean_x;
2982
9
                   mean_x += dx / (i + 1);
2983
6
                   double dy = y[i] - mean_y;
2984
30
                   mean_y += dy / (i + 1);
2985
84
                   M2_x += dx * (x[i] - mean_x);
2986
60
                   M2_y += dy * (y[i] - mean_y);
2987
60
                   cov  += dx * (y[i] - mean_y);
2988          }
2989
60
          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
2990
60
          df = n - 2;
2991
60
          statistic = estimate * sqrt(df / (1.0 - estimate * estimate));
2992
2993
12
          // Confidence interval using Fisher's Z transform
2994          double z = 0.5 * log((1.0 + estimate) / (1.0 - estimate));
2995          double se = 1.0 / sqrt(n - 3);
2996
6
          double alpha = 1.0 - conf_level;
2997
6
          double q = inverse_normal_cdf(1.0 - alpha/2.0);
2998          ci_lower = tanh(z - q * se);
2999
6
          ci_upper = tanh(z + q * se);
3000
3001          // HIGH-PRECISION P-VALUE USING INCOMPLETE BETA
3002          p_value = get_t_pvalue(statistic, df, alternative);
3003
6
        } else if (is_kendall) {
3004
6
          int c = 0, d = 0, tie_x = 0, tie_y = 0;
3005          for (size_t i = 0; i < n - 1; i++) {
3006
0
                   for (size_t j = i + 1; j < n; j++) {
3007                       double sign_x = (x[i] - x[j] > 0) - (x[i] - x[j] < 0);
3008                       double sign_y = (y[i] - y[j] > 0) - (y[i] - y[j] < 0);
3009
3010                       if (sign_x == 0 && sign_y == 0) { /* Joint tie, ignore */ }
3011
6
                       else if (sign_x == 0) tie_x++;
3012
6
                       else if (sign_y == 0) tie_y++;
3013
6
                       else if (sign_x * sign_y > 0) c++;
3014
6
                       else d++;
3015                   }
3016          }
3017
0
          double denom = sqrt((double)(c + d + tie_x) * (double)(c + d + tie_y));
3018
0
          estimate = (denom == 0.0) ? (0.0/0.0) : (double)(c - d) / denom;
3019
3020
0
          bool has_ties = (tie_x > 0 || tie_y > 0);
3021          bool do_exact;
3022
3023
0
          /* Mirror R: exact defaults to TRUE if N < 50 and NO ties */
3024
0
          if (!exact_sv || !SvOK(exact_sv)) {
3025
0
                   do_exact = (n < 50) && !has_ties;
3026          } else {
3027
0
                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3028          }
3029          // If forced exact but ties exist, R overrides and falls back to approximation anyway
3030
3
          if (do_exact && has_ties) do_exact = 0;
3031
3032
3
          if (do_exact) {
3033
3
                   double S_stat = c - d;
3034
3
                   statistic = c;
3035                   p_value = kendall_exact_pvalue(n, S_stat, alternative);
3036          } else {
3037
3
                   // Normal approximation for large N or ties
3038
18
                   double var_S = n * (n - 1) * (2.0 * n + 5.0) / 18.0;
3039
15
                   double S = c - d;
3040
15
                   if (continuity) S -= (S > 0 ? 1 : -1);
3041
15
                   statistic = S / sqrt(var_S);
3042
3043
15
                   if (strcmp(alternative, "two.sided") == 0) {
3044
15
                       p_value = 2.0 * (1.0 - approx_pnorm(fabs(statistic)));
3045
15
                   } else if (strcmp(alternative, "less") == 0) {
3046                       p_value = approx_pnorm(statistic);
3047
3
                   } else {
3048                       p_value = 1.0 - approx_pnorm(statistic);
3049                   }
3050
3
          }
3051
18
        } else if (is_spearman) {
3052
15
          double *restrict rank_x = safemalloc(n * sizeof(double));
3053
15
          double *restrict rank_y = safemalloc(n * sizeof(double));
3054          compute_ranks(x, rank_x, n);
3055          compute_ranks(y, rank_y, n);
3056
3057
3
          // Spearman rho = Pearson r of the ranks (Welford's Method)
3058
18
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3059
15
          for (size_t i = 0; i < n; i++) {
3060
0
                   double dx = rank_x[i] - mean_x;
3061
0
                   mean_x += dx / (i + 1);
3062                   double dy = rank_y[i] - mean_y;
3063                   mean_y += dy / (i + 1);
3064
3
                   M2_x += dx * (rank_x[i] - mean_x);
3065
3
                   M2_y += dy * (rank_y[i] - mean_y);
3066                   cov  += dx * (rank_y[i] - mean_y);
3067
0
          }
3068          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3069
3070
3
          // S = sum of squared rank differences (R's reported statistic)
3071
3
          double S_stat = 0.0;
3072
3
          for (size_t i = 0; i < n; i++) {
3073                   double diff = rank_x[i] - rank_y[i];
3074
0
                   S_stat += diff * diff;
3075
0
          }
3076
3077
0
          // Ties produce fractional (averaged) ranks — detect them
3078
0
          bool has_ties = 0, do_exact;
3079          for (size_t i = 0; i < n; i++) {
3080
3
                   if (rank_x[i] != floor(rank_x[i]) || rank_y[i] != floor(rank_y[i])) {
3081                       has_ties = 1;
3082
0
                       break;
3083
0
                   }
3084          }
3085
21
          if (!exact_sv || !SvOK(exact_sv)) {
3086
21
                   do_exact = (n < 10) && !has_ties;
3087
21
          } else {
3088
21
                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3089
21
          }
3090
3091
21
          if (do_exact) {
3092
21
                   statistic = S_stat;
3093
12
                   p_value   = spearman_exact_pvalue(S_stat, n, alternative);
3094
12
          } else {
3095
12
                   double r = estimate;
3096
12
                   if (continuity)
3097
12
                       r *= (1.0 - 1.0 / (2.0 * (n - 1)));
3098                   statistic = r * sqrt((n - 2.0) / (1.0 - r * r));
3099                   p_value = get_t_pvalue(statistic, (double)(n - 2), alternative);
3100
21
          }
3101          Safefree(rank_x); Safefree(rank_y);
3102        } else {
3103          Safefree(x); Safefree(y);
3104          croak("Unknown method");
3105        }
3106        Safefree(x); Safefree(y);
3107        rhv = newHV();
3108        hv_stores(rhv, "estimate", newSVnv(estimate));
3109        hv_stores(rhv, "p.value", newSVnv(p_value));
3110        hv_stores(rhv, "statistic", newSVnv(statistic));
3111
6
        hv_stores(rhv, "method", newSVpv(method, 0));
3112
6
        hv_stores(rhv, "alternative", newSVpv(alternative, 0));
3113        if (is_pearson) {
3114
6
          hv_stores(rhv, "parameter", newSVnv(df));
3115
0
          AV *restrict ci_av = newAV();
3116          av_push(ci_av, newSVnv(ci_lower));
3117          av_push(ci_av, newSVnv(ci_upper));
3118
6
          hv_stores(rhv, "conf.int", newRV_noinc((SV*)ci_av));
3119
6
        }
3120
3121
6
        RETVAL = newRV_noinc((SV*)rhv);
3122}
3123OUTPUT:
3124
78
    RETVAL
3125
3126
72
void shapiro_test(data)
3127
72
        SV *data
3128
72
PREINIT:
3129
72
        AV *restrict av;
3130
72
        HV *restrict ret_hash;
3131
72
        size_t n_raw, n = 0;
3132        double *restrict x, w = 0.0, p_val = 0.0, mean = 0.0, ssq = 0.0;
3133PPCODE:
3134        if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVAV) {
3135          croak("Expected an array reference");
3136
6
        }
3137
3138
0
        av = (AV *)SvRV(data);
3139        n_raw = av_len(av) + 1;
3140
3141
6
        Newx(x, n_raw, double);
3142
3143
78
        // Extract variables and calculate mean (skipping undefined/NaN values)
3144
72
        for (size_t i = 0; i < n_raw; i++) {
3145          SV **restrict elem = av_fetch(av, i, 0);
3146
6
          if (elem && SvOK(*elem)) {
3147
0
                   double val = SvNV(*elem);
3148
0
                   if (!isnan(val)) {
3149                       x[n] = val;
3150
6
                       mean += val;
3151                       n++;
3152                   }
3153
6
          }
3154
0
        }
3155
3156
0
        if (n < 3 || n > 5000) {
3157
0
          Safefree(x);
3158          croak("Sample size must be between 3 and 5000 (R's limit)");
3159
0
        }
3160
3161        mean /= n;
3162
6
        // Calculate Sum of Squares */
3163
6
        for (size_t i = 0; i < n; i++) {
3164
6
          ssq += (x[i] - mean) * (x[i] - mean);
3165
78
        }
3166
72
        if (ssq == 0.0) {
3167
72
          Safefree(x);
3168          croak("Data is perfectly constant; cannot compute Shapiro-Wilk test");
3169
6
        }
3170
6
        qsort(x, n, sizeof(double), compare_doubles);
3171
3172
6
        // --- Core AS R94 Algorithm: Weights and Statistic W ---
3173
9
        if (n == 3) {
3174
3
          double a_val = 0.7071067811865475; /* sqrt(1/2) */
3175
12
          double b_val = a_val * (x[2] - x[0]);
3176
9
          w = (b_val * b_val) / ssq;
3177          if (w < 0.75) w = 0.75;
3178          // Exact P-value for n=3
3179
3
          p_val = 1.90985931710274 * (asin(sqrt(w)) - 1.04719755119660);
3180
3
        } else {
3181
3
          double *restrict m, *restrict a;
3182
3
          double sum_m2 = 0.0, b_val = 0.0;
3183
48
          Newx(m, n, double);
3184
45
          Newx(a, n, double);
3185          for (size_t i = 0; i < n; i++) {
3186                   m[i] = inverse_normal_cdf((i + 1.0 - 0.375) / (n + 0.25));
3187
78
                   sum_m2 += m[i] * m[i];
3188
72
          }
3189          double u = 1.0 / sqrt((double)n);
3190
6
          double a_n = -2.706056*pow(u,5) + 4.434685*pow(u,4) - 2.071190*pow(u,3) - 0.147981*pow(u,2) + 0.221157*u + m[n-1]/sqrt(sum_m2);
3191          a[n-1] = a_n;
3192          a[0]   = -a_n;
3193          if (n == 4 || n == 5) {
3194                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1]) / (1.0 - 2.0 * a_n*a_n);
3195
6
                   for (unsigned int i = 1; i < n-1; i++) {
3196                       a[i] = m[i] / sqrt(eps);
3197
6
                   }
3198          } else {
3199                   double a_n1 = -3.582633*pow(u,5) + 5.682633*pow(u,4) - 1.752461*pow(u,3) - 0.293762*pow(u,2) + 0.042981*u + m[n-2]/sqrt(sum_m2);
3200                   a[n-2] = a_n1;
3201
3
                   a[1]   = -a_n1;
3202
3
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1] - 2.0 * m[n-2]*m[n-2]) / (1.0 - 2.0 * a_n*a_n - 2.0 * a_n1*a_n1);
3203
3
                   for (unsigned int i = 2; i < n-2; i++) {
3204
0
                       a[i] = m[i] / sqrt(eps);
3205                   }
3206          }
3207
3
          for (size_t i = 0; i < n; i++) {
3208
3
                   b_val += a[i] * x[i];
3209
3
          }
3210
3
          w = (b_val * b_val) / ssq;
3211        // --- AS R94 P-Value Calculation: High Precision Refinement ---
3212          /* NOTE: p_val is declared in PREINIT above;
3213
3
                * do NOT shadow it with a local 'double p_val' here or the result will never reach the caller.
3214                */
3215          double y = log(1.0 - w);
3216          double z;
3217
3
          if (n <= 11) {
3218                   // Royston's branch for 4 <= n <= 11 (AS R94, small-sample path).
3219
3
                   // gamma is the upper bound on y = log(1-W);
3220
3
                   // if y reaches gamma the p-value is essentially zero
3221
3
                   double nn = (double)n;
3222
3
                   double gamma = 0.459 * nn - 2.273;
3223
3
                   if (y >= gamma) {
3224                       p_val = 1e-19;
3225                   } else {
3226
6
                       // Horner-form polynomials in n for mu and log(sigma)
3227
6
                       double mu     = 0.544  + nn * (-0.39978  + nn * ( 0.025054  - nn * 0.0006714));
3228                       double sig_val= 1.3822 + nn * (-0.77857  + nn * ( 0.062767  - nn * 0.0020322));
3229
6
                       double sigma  = exp(sig_val);
3230                       z = (-log(gamma - y) - mu) / sigma;
3231
6
                       /* Upper-tail probability P(Z > z): small W → large z → small p-value.
3232
6
                       */
3233
6
                       p_val = 0.5 * erfc(z * M_SQRT1_2);
3234
6
                   }
3235
6
          } else {
3236
6
                   // Royston's branch for n >= 12 (AS R94, large-sample path)
3237
6
                   double ln_n   = log((double)n);
3238
6
                   // Horner-form polynomials in log(n) for mu and log(sigma). */
3239                   double mu     = -1.5861 + ln_n * (-0.31082 + ln_n * (-0.083751 + ln_n * 0.0038915));
3240                   double sig_val= -0.4803 + ln_n * (-0.082676 + ln_n * 0.0030302);
3241                   double sigma  = exp(sig_val);
3242                   z = (y - mu) / sigma;
3243
30
                   p_val = 0.5 * erfc(z * M_SQRT1_2);
3244
30
          }
3245
30
          // Clamp the p-value
3246          if (p_val > 1.0) p_val = 1.0;
3247
30102
          if (p_val < 0.0) p_val = 0.0;
3248
3249
30081
          Safefree(m); m = NULL;  Safefree(a); a = NULL;
3250
9
        }
3251
9
        Safefree(x); x = NULL;
3252
642
        ret_hash = newHV();
3253
633
        hv_stores(ret_hash, "statistic", newSVnv(w));
3254
633
        hv_stores(ret_hash, "W",         newSVnv(w));
3255
633
        hv_stores(ret_hash, "p_value",   newSVnv(p_val));
3256
633
        hv_stores(ret_hash, "p.value",   newSVnv(p_val));
3257
9
        EXTEND(SP, 1);
3258
9
        PUSHs(sv_2mortal(newRV_noinc((SV *)ret_hash)));
3259
3260
633
double min(...)
3261        PROTOTYPE: @
3262        INIT:
3263
30063
                double min_val = 0.0;
3264
30063
                size_t count = 0;
3265
30063
                bool first = TRUE;
3266
48
        CODE:
3267
48
                for (unsigned short int i = 0; i < items; i++) {
3268                        SV* restrict arg = ST(i);
3269
30063
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3270                                AV* restrict av = (AV*)SvRV(arg);
3271                                size_t len = av_len(av) + 1;
3272
30
                                for (size_t j = 0; j < len; j++) {
3273
27
                                     SV** restrict tv = av_fetch(av, j, 0);
3274                                     if (tv && SvOK(*tv)) {
3275                                         double val = SvNV(*tv);
3276                                         if (first || val < min_val) {
3277                                             min_val = val;
3278                                             first = FALSE;
3279                                         }
3280
33
                                         count++;
3281
33
                                     } else {
3282
33
                                         croak("min: undefined value at array ref index %zu (argument %d)", j, (int)i);
3283                                     }
3284
30105
                                 }
3285
30072
                        } else if (SvOK(arg)) {
3286
30084
                                 double val = SvNV(arg);
3287
12
                                 if (first || val < min_val) {
3288
12
                                     min_val = val;
3289
945
                                     first = FALSE;
3290
933
                                 }
3291
933
                                 count++;
3292
933
                        } else {
3293
933
                                 croak("min: undefined value at argument index %d", (int)i);
3294
42
                        }
3295
42
                }
3296                if (count == 0) croak("min needs >= 1 numeric element");
3297
933
                RETVAL = min_val;
3298        OUTPUT:
3299          RETVAL
3300
3301
30060
double max(...)
3302
30060
        PROTOTYPE: @
3303
89
        INIT:
3304
89
                double max_val = 0.0;
3305                size_t count = 0;
3306
30060
                bool first = TRUE;
3307        CODE:
3308                for (size_t i = 0; i < items; i++) {
3309
33
                   SV* restrict arg = ST(i);
3310
30
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3311                       AV* restrict av = (AV*)SvRV(arg);
3312                       size_t len = av_len(av) + 1;
3313                       for (size_t j = 0; j < len; j++) {
3314                           SV** restrict tv = av_fetch(av, j, 0);
3315                           if (tv && SvOK(*tv)) {
3316                               double val = SvNV(*tv);
3317
12
                               if (first || val > max_val) {
3318
12
                                   max_val = val;
3319                                   first = FALSE;
3320                               }
3321
12
                               count++;
3322                           } else {
3323
12
                               croak("max: undefined value at array ref index %zu (argument %zu)", j, i);
3324                           }
3325
12
                       }
3326
0
                   } else if (SvOK(arg)) {
3327                       double val = SvNV(arg);
3328                       if (first || val > max_val) {
3329
42
                           max_val = val;
3330                           first = FALSE;
3331
30
                       }
3332
18
                       count++;
3333
18
                   } else {
3334
6
                       croak("max: undefined value at argument index %zu", i);
3335
6
                   }
3336
6
          }
3337
6
          if (count == 0) croak("max needs >= 1 numeric element");
3338
12
          RETVAL = max_val;
3339
6
        OUTPUT:
3340
6
                RETVAL
3341
3342
6
SV* runif(...)
3343
6
CODE:
3344
6
{
3345
6
        size_t n = 0;
3346
6
        double min = 0.0, max = 1.0;
3347
3348        // Flags to track what has been assigned
3349        bool n_set = 0, min_set = 0, max_set = 0;
3350
3351        unsigned int i = 0;
3352
3353
6
        if (items == 0) {
3354
6
          croak("Usage: runif(n, [min=0], [max=1]) or runif(n => $n, ...)");
3355
6
        }
3356
3357
3
        while (i < items) {
3358
3
                // 1. Check if the current argument is a string key for a named parameter
3359
3
                if (i + 1 < items && SvPOK(ST(i))) {
3360
3
                        char *restrict key = SvPV_nolen(ST(i));
3361                        if (strEQ(key, "n")) {
3362
0
                                n = (size_t)SvUV(ST(i+1));
3363                                n_set = 1;
3364
12
                                i += 2;
3365                                continue;
3366
12
                        } else if (strEQ(key, "min")) {
3367
0
                                min = SvNV(ST(i+1));
3368                                min_set = 1;
3369                                i += 2;
3370
12
                                continue;
3371
12
                        } else if (strEQ(key, "max")) {
3372
12
                                max = SvNV(ST(i+1));
3373
12
                                max_set = 1;
3374                                i += 2;
3375
12
                                continue;
3376
60060
                        }
3377                }
3378
3379
0
                // 2. Fallback to positional parsing if it's not a recognized key
3380                if (!n_set) {
3381
60048
                        n = (size_t)SvUV(ST(i));
3382                        n_set = 1;
3383
60048
                } else if (!min_set) {
3384                        min = SvNV(ST(i));
3385
12
                        min_set = 1;
3386                } else if (!max_set) {
3387                        max = SvNV(ST(i));
3388                        max_set = 1;
3389                } else {
3390                        croak("Too many arguments or unrecognized parameter passed to runif()");
3391                }
3392                i++;
3393        }
3394
36
        if (!n_set) {
3395
36
                croak("runif() requires at least the 'n' parameter");
3396
3
        }
3397        // Ensure PRNG is seeded
3398
33
        AUTO_SEED_PRNG();
3399
33
        AV *restrict results = newAV();
3400        if (n > 0) {
3401
33
                av_extend(results, n - 1);
3402        }
3403
126
        const double range = max - min;
3404
93
        for (size_t j = 0; j < n; j++) {
3405
93
                double r;
3406                if (max < min) {
3407
93
                        r = NAN; // R behavior for inverted ranges
3408
60
                } else {
3409
30
                        r = min + range * Drand01();
3410
0
                }
3411                av_push(results, newSVnv(r));
3412        }
3413        RETVAL = newRV_noinc((SV*)results);
3414
33
}
3415
27
OUTPUT:
3416    RETVAL
3417
3418
21
SV* rbinom(...)
3419
21
        CODE:
3420
61518
        {
3421
61497
          // Auto-seed the PRNG if the Perl script hasn't done so yet
3422          AUTO_SEED_PRNG();
3423          if (items % 2 != 0)
3424                   croak("Usage: rbinom(n => 10, size => 100, prob => 0.5)");
3425
21
          //Parse named arguments
3426          size_t n = 0, size = 0;
3427          double prob = 0.5;
3428
3429          bool size_set = false, prob_set = false;
3430
3431          for (unsigned short i = 0; i < items; i += 2) {
3432                   const char* restrict key = SvPV_nolen(ST(i));
3433                   SV* restrict val = ST(i + 1);
3434
3435
27
                   if      (strEQ(key, "n"))      n    = (unsigned int)SvUV(val);
3436
6
                   else if (strEQ(key, "size")) { size = (unsigned int)SvUV(val); size_set = true; }
3437                   else if (strEQ(key, "prob")) { prob = SvNV(val); prob_set = true; }
3438
21
                   else croak("rbinom: unknown argument '%s'", key);
3439
21
          }
3440
3441          // R requires size and prob to be explicitly passed in rbinom
3442          if (!size_set || !prob_set) croak("rbinom: 'size' and 'prob' are required arguments");
3443          if (prob < 0.0 || prob > 1.0) croak("rbinom: prob must be between 0 and 1");
3444
3445
18
          AV *restrict result_av = newAV();
3446
18
          if (n > 0) {
3447                   av_extend(result_av, n - 1);
3448
6078
                   for (unsigned int i = 0; i < n; i++) {
3449
6063
                       av_store(result_av, i, newSVuv(generate_binomial(size, prob)));
3450
6063
                   }
3451
6063
          }
3452
3453
6060
          RETVAL = newRV_noinc((SV*)result_av);
3454
6060
        }
3455        OUTPUT:
3456          RETVAL
3457
3458
0
SV*
3459
0
hist(SV* x_sv, ...)
3460        CODE:
3461        {
3462
15
                // 1. Validate Input
3463                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3464
15
                        croak("hist: first argument must be an array reference");
3465
3466
0
                AV*restrict x_av = (AV*)SvRV(x_sv);
3467
15
                size_t n_raw = av_len(x_av) + 1;
3468                if (n_raw == 0) croak("hist: input array is empty");
3469
3470                // 2. Extract Data & Find Range
3471
15
                double *restrict x;
3472
15
                Newx(x, n_raw, double);
3473
15
                size_t n = 0;
3474                double min_val = DBL_MAX, max_val = -DBL_MAX;
3475
3476                for (size_t i = 0; i < n_raw; i++) {
3477
15
                        SV**restrict tv = av_fetch(x_av, i, 0);
3478
0
                        if (tv && SvOK(*tv)) {
3479                                 double val = SvNV(*tv);
3480                                 x[n++] = val;
3481
15
                                 if (val < min_val) min_val = val;
3482                                 if (val > max_val) max_val = val;
3483                        }
3484                }
3485
15
                if (n == 0) {
3486
15
                        Safefree(x);
3487
15
                        croak("hist: input contains no valid numeric data");
3488
15
                }
3489                // 3. Determine Bin Count (Sturges default or user-provided)
3490                size_t n_bins = 0;
3491
3492
84
                if (items == 2) {
3493
69
                        // Support pure positional argument: hist($data, 22)
3494                        n_bins = (size_t)SvIV(ST(1));
3495                } else if (items > 2) {
3496                        /* Support named parameters even if mixed with positional arguments */
3497
15
                        for (unsigned short i = 1; i < items - 1; i++) {
3498                                 /* Make sure the SV holds a string before doing string comparison */
3499                                 if (SvPOK(ST(i)) && strEQ(SvPV_nolen(ST(i)), "breaks")) {
3500
15
                                     n_bins = (size_t)SvIV(ST(i+1));
3501
15
                                     break;
3502
15
                                 }
3503
15
                        }
3504
15
                        /* Fallback: if 'breaks' wasn't found but a positional number was given first */
3505
84
                        if (n_bins == 0 && looks_like_number(ST(1))) {
3506
69
                                 n_bins = (size_t)SvIV(ST(1));
3507
69
                        }
3508
54
                }
3509
54
                if (n_bins == 0) n_bins = calculate_sturges_bins(n);
3510
54
                // 4. Allocate Result Arrays
3511                double *restrict breaks, *restrict mids, *restrict density;
3512                size_t *restrict counts;
3513
15
                Newx(breaks,  n_bins + 1, double);
3514
15
                Newx(mids,    n_bins,     double);
3515
15
                Newx(density, n_bins,     double);
3516
15
                Newx(counts,  n_bins,     size_t);
3517
3518                // Generate simple linear breaks
3519
15
                double step = (max_val - min_val) / (double)n_bins;
3520
15
                for (size_t i = 0; i <= n_bins; i++) {
3521                        breaks[i] = min_val + (double)i * step;
3522
15
                }
3523
3524                // 5. Compute Statistics
3525                compute_hist_logic(x, n, breaks, n_bins, counts, mids, density);
3526
3527                // 6. Build Return HashRef
3528                HV*restrict res_hv = newHV();
3529                AV*restrict av_breaks  = newAV();
3530
12
                AV*restrict av_counts  = newAV();
3531
12
                AV*restrict av_mids    = newAV();
3532
12
                AV*restrict av_density = newAV();
3533                for (size_t i = 0; i <= n_bins; i++) {
3534                        av_push(av_breaks, newSVnv(breaks[i]));
3535
12
                        if (i < n_bins) {
3536
9
                                 av_push(av_counts,  newSViv(counts[i]));
3537
9
                                 av_push(av_mids,    newSVnv(mids[i]));
3538                                 av_push(av_density, newSVnv(density[i]));
3539                        }
3540                }
3541
12
                hv_stores(res_hv, "breaks",  newRV_noinc((SV*)av_breaks));
3542
0
                hv_stores(res_hv, "counts",  newRV_noinc((SV*)av_counts));
3543                hv_stores(res_hv, "mids",    newRV_noinc((SV*)av_mids));
3544
27
                hv_stores(res_hv, "density", newRV_noinc((SV*)av_density));
3545
3546
15
                // Clean
3547                Safefree(x); Safefree(breaks); Safefree(mids);
3548
15
                Safefree(density); Safefree(counts);
3549
3550
0
                RETVAL = newRV_noinc((SV*)res_hv);
3551        }
3552
12
        OUTPUT:
3553
0
          RETVAL
3554
3555
12
SV* quantile(...)
3556
12
        CODE:
3557        {
3558                SV *restrict x_sv = NULL;
3559                SV *restrict probs_sv = NULL;
3560
12
                int arg_idx = 0;
3561
3562
624
                /* --- 1. Consume first positional arg as 'x' if it's an array ref --- */
3563
612
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3564
612
                         x_sv = ST(arg_idx);
3565
612
                         arg_idx++;
3566                }
3567
3568
12
                /* --- 2. Remaining args must be key-value pairs --- */
3569
0
                if ((items - arg_idx) % 2 != 0)
3570
0
                         croak("Usage: quantile(\\@data, probs => \\@probs)  OR  quantile(x => \\@data, probs => \\@probs)");
3571
3572                for (; arg_idx < items; arg_idx += 2) {
3573
12
                         const char *restrict key = SvPV_nolen(ST(arg_idx));
3574                         SV *restrict val = ST(arg_idx + 1);
3575
3576
12
                         if      (strEQ(key, "x"))     x_sv     = val;
3577                         else if (strEQ(key, "probs")) probs_sv = val;
3578                         else croak("quantile: unknown argument '%s'", key);
3579
24
                }
3580
12
                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3581
12
                        croak("quantile: 'x' must be an array reference");
3582
12
                AV *restrict x_av = (AV*)SvRV(x_sv);
3583
39
                size_t n_raw = av_len(x_av) + 1;
3584
27
                if (n_raw == 0) croak("quantile: 'x' is empty");
3585
3586
27
                /* --- Extract valid numeric data & drop NAs --- */
3587
0
                double *restrict x;
3588
0
                Newx(x, n_raw, double);
3589                size_t n = 0;
3590                for (size_t i = 0; i < n_raw; i++) {
3591                        SV **restrict tv = av_fetch(x_av, i, 0);
3592
0
                        if (tv && SvOK(*tv)) {
3593
0
                                 x[n++] = SvNV(*tv);
3594                        }
3595                }
3596                if (n == 0) {
3597
12
                        Safefree(x);
3598                        croak("quantile: 'x' contains no valid numbers");
3599
39
                }
3600
27
                // --- Sort Data for Quantile Math ---
3601                qsort(x, n, sizeof(double), compare_doubles);
3602
27
                // --- Parse Probabilities (Default matches R's c(0, .25, .5, .75, 1)) ---
3603
3
                double default_probs[] = {0.0, 0.25, 0.50, 0.75, 1.0};
3604
24
                unsigned int n_probs = 5;
3605
3
                double *restrict probs;
3606
3607
3
                if (probs_sv && SvROK(probs_sv) && SvTYPE(SvRV(probs_sv)) == SVt_PVAV) {
3608                        AV *restrict p_av = (AV*)SvRV(probs_sv);
3609                        n_probs = av_len(p_av) + 1;
3610
18
                        Newx(probs, n_probs, double);
3611
18
                        for (unsigned int i = 0; i < n_probs; i++) {
3612
18
                                 SV **tv = av_fetch(p_av, i, 0);
3613
18
                                 probs[i] = (tv && SvOK(*tv)) ? SvNV(*tv) : 0.0;
3614                                 if (probs[i] < 0.0 || probs[i] > 1.0) {
3615                                     Safefree(x); Safefree(probs);
3616                                     croak("quantile: probabilities must be between 0 and 1");
3617                                 }
3618
27
                        }
3619                } else {
3620
27
                        Newx(probs, n_probs, double);
3621
27
                        for (unsigned int i = 0; i < n_probs; i++) probs[i] = default_probs[i];
3622                }
3623
3624                /* --- Calculate Quantiles (R Type 7 Algorithm) --- */
3625                HV *restrict res_hv = newHV();
3626
3627                for (size_t i = 0; i < n_probs; i++) {
3628                        double p = probs[i], q = 0.0;
3629
3630
12
                        if (n == 1) {
3631                                 q = x[0];
3632
12
                        } else if (p == 1.0) {
3633                                 q = x[n - 1]; /* Prevent out-of-bounds mapping */
3634                        } else if (p == 0.0) {
3635                                 q = x[0];
3636                        } else {
3637                                 /* Continuous sample quantile interpolation (Type 7) */
3638                                 double h = (n - 1) * p;
3639                                 unsigned int j = (unsigned int)h; /* floor via cast */
3640                                 double gamma = h - j;
3641
117
                                 q = (1.0 - gamma) * x[j] + gamma * x[j + 1];
3642
117
                        }
3643
3644
267
                        /* Format hash key to exactly match R's naming convention ("25%", "33.3%") */
3645
150
                        char key[32];
3646
261
                        double pct = p * 100.0;
3647
3648
111
                        if (pct == (unsigned int)pct) {
3649
60915
                                 snprintf(key, sizeof(key), "%.0f%%", pct);
3650
60804
                        } else {
3651
60804
                                 snprintf(key, sizeof(key), "%.1f%%", pct);
3652                        }
3653
3654
39
                        hv_store(res_hv, key, strlen(key), newSVnv(q), 0);
3655                }
3656
3657
117
                Safefree(x);
3658
114
                Safefree(probs);
3659
3660                RETVAL = newRV_noinc((SV*)res_hv);
3661        }
3662        OUTPUT:
3663          RETVAL
3664
3665
3666
15
double mean(...)
3667        PROTOTYPE: @
3668        INIT:
3669
66
          double total = 0;
3670
51
          size_t count = 0;
3671
54
        CODE:
3672
3
                for (size_t i = 0; i < items; i++) {
3673
3
                        SV* restrict arg = ST(i);
3674
30000
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3675
29997
                                AV* restrict av = (AV*)SvRV(arg);
3676
29997
                                size_t len = av_len(av) + 1;
3677
29997
                                for (size_t j = 0; j < len; j++) {
3678
29997
                                     SV** restrict tv = av_fetch(av, j, 0);
3679
29997
                                     if (tv && SvOK(*tv)) {
3680
29997
                                         total += SvNV(*tv);
3681
29997
                                         count++;
3682                                     } else {
3683                                         croak("mean: undefined value at array ref index %zu (argument %zu)", j, i);
3684
48
                                     }
3685
48
                                }
3686
48
                        } else if (SvOK(arg)) {
3687
48
                                 total += SvNV(arg);
3688
48
                                 count++;
3689
48
                        } else {
3690                                 croak("mean: undefined value at argument index %zu", i);
3691                        }
3692
15
                }
3693
12
                if (count == 0) croak("mean needs >= 1 element");
3694                RETVAL = total / count;
3695        OUTPUT:
3696          RETVAL
3697
3698double sum(...)
3699        PROTOTYPE: @
3700
18
        INIT:
3701
18
          double total = 0;
3702          size_t count = 0;
3703        CODE:
3704
51
          for (size_t i = 0; i < items; i++) {
3705
33
                   SV* restrict arg = ST(i);
3706
42
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3707
9
                       AV* restrict av = (AV*)SvRV(arg);
3708
9
                       size_t len = av_len(av) + 1;
3709
30039
                       for (size_t j = 0; j < len; j++) {
3710
30030
                           SV** restrict tv = av_fetch(av, j, 0);
3711
30030
                           if (tv && SvOK(*tv)) {
3712
30030
                               total += SvNV(*tv);
3713
30030
                               count++;
3714
30030
                           } else {
3715
30030
                               croak("sum: undefined value at array ref index %zu (argument %zu)", j, i);
3716
30030
                           }
3717                       }
3718                   } else if (SvOK(arg)) {
3719
24
                       total += SvNV(arg);
3720
24
                       count++;
3721
24
                   } else {
3722
24
                       croak("sum: undefined value at argument index %zu", i);
3723
24
                   }
3724
24
          }
3725          if (count == 0) croak("sum needs >= 1 element");
3726          RETVAL = total;
3727
18
        OUTPUT:
3728
15
          RETVAL
3729
3730double sd(...)
3731    PROTOTYPE: @
3732    INIT:
3733        double mean = 0.0, M2 = 0.0;
3734        size_t count = 0;
3735
141
    CODE:
3736
141
        /* Single Pass Standard Deviation via Welford's Algorithm */
3737
141
        for (size_t i = 0; i < items; i++) {
3738
141
            SV* restrict arg = ST(i);
3739
141
            if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3740                AV* restrict av = (AV*)SvRV(arg);
3741
141
                size_t len = av_len(av) + 1;
3742                for (size_t j = 0; j < len; j++) {
3743                    SV** restrict tv = av_fetch(av, j, 0);
3744
141
                    if (tv && SvOK(*tv)) {
3745
63
                        count++;
3746
63
                        double val = SvNV(*tv);
3747                        double delta = val - mean;
3748                        mean += delta / count;
3749                        M2 += delta * (val - mean);
3750
141
                    } else {
3751
12
                        croak("sd: undefined value at array ref index %zu (argument %zu)", j, i);
3752
12
                    }
3753                }
3754            } else if (SvOK(arg)) {
3755                count++;
3756
141
                double val = SvNV(arg);
3757
0
                double delta = val - mean;
3758                mean += delta / count;
3759                M2 += delta * (val - mean);
3760            } else {
3761
369
                croak("sd: undefined value at argument index %zu", i);
3762
228
            }
3763
228
        }
3764        if (count < 2) croak("sd needs >= 2 elements");
3765
228
        RETVAL = sqrt(M2 / (count - 1));
3766
153
    OUTPUT:
3767
138
        RETVAL
3768
3769
3770
12
double var(...)
3771
6
        PROTOTYPE: @
3772
0
        INIT:
3773          double mean = 0.0, M2 = 0.0;
3774          size_t count = 0;
3775        CODE:
3776
141
          /* Single Pass Variance via Welford's Algorithm */
3777
3
          for (size_t i = 0; i < items; i++) {
3778
138
                   SV* restrict arg = ST(i);
3779
138
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3780
138
                       AV* restrict av = (AV*)SvRV(arg);
3781
138
                       size_t len = av_len(av) + 1;
3782
138
                       for (size_t j = 0; j < len; j++) {
3783
24
                           SV** restrict tv = av_fetch(av, j, 0);
3784                           if (tv && SvOK(*tv)) {
3785
138
                               count++;
3786
3
                               double val = SvNV(*tv);
3787                               double delta = val - mean;
3788
135
                               mean += delta / count;
3789
135
                               M2 += delta * (val - mean);
3790
1233
                           } else {
3791
1098
                               croak("var: undefined value at array ref index %zu (argument %zu)", j, i);
3792
1098
                           }
3793
1098
                       }
3794
1098
                   } else if (SvOK(arg)) {
3795
1098
                       count++;
3796                       double val = SvNV(arg);
3797
135
                       double delta = val - mean;
3798
135
                       mean += delta / count;
3799                       M2 += delta * (val - mean);
3800
153
                   } else {
3801
27
                       croak("var: undefined value at argument index %zu", i);
3802
24
                   }
3803
24
          }
3804
21
          if (count < 2) croak("var needs >= 2 elements");
3805
312
          RETVAL = M2 / (count - 1);
3806
291
        OUTPUT:
3807
291
          RETVAL
3808
3809
291
SV* t_test(...)
3810
291
        CODE:
3811        {
3812
21
                SV*restrict x_sv = NULL;
3813
21
                SV*restrict y_sv = NULL;
3814
6
                double mu = 0.0, conf_level = 0.95;
3815
42
                bool paired = FALSE, var_equal = FALSE;
3816
36
                const char*restrict alternative = "two.sided";
3817
3818
36
                int arg_idx = 0;
3819
3820
36
                // 1. Shift first positional argument as 'x' if it's an array reference
3821
36
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3822
36
                  x_sv = ST(arg_idx);
3823
36
                  arg_idx++;
3824                }
3825
3826
6
                // 2. Shift second positional argument as 'y' if it's an array reference
3827
6
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
3828
6
                  y_sv = ST(arg_idx);
3829
6
                  arg_idx++;
3830
6
                }
3831
3832
15
                // Ensure the remaining arguments form complete key-value pairs
3833
6
                if ((items - arg_idx) % 2 != 0) {
3834
6
                  croak("Usage: t_test(\\@x, [\\@y], key => value, ...)");
3835
6
                }
3836
3837
6
                // --- Parse named arguments from the remaining flat stack ---
3838
6
                for (; arg_idx < items; arg_idx += 2) {
3839
6
                        const char*restrict key = SvPV_nolen(ST(arg_idx));
3840
6
                        SV*restrict val = ST(arg_idx + 1);
3841
3842
9
                        if      (strEQ(key, "x"))           x_sv        = val;
3843
9
                        else if (strEQ(key, "y"))           y_sv        = val;
3844
9
                        else if (strEQ(key, "mu"))          mu          = SvNV(val);
3845
9
                        else if (strEQ(key, "paired"))      paired      = SvTRUE(val);
3846
9
                        else if (strEQ(key, "var_equal"))   var_equal   = SvTRUE(val);
3847
9
                        else if (strEQ(key, "conf_level"))  conf_level  = SvNV(val);
3848
9
                        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
3849
9
                        else croak("t_test: unknown argument '%s'", key);
3850
9
                }
3851
3852                // --- Validate required / types ---
3853                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
3854
105
                        croak("t_test: 'x' is a required argument and must be an ARRAY reference");
3855
105
                AV*restrict x_av = (AV*)SvRV(x_sv);
3856
105
                size_t nx = av_len(x_av) + 1;
3857
105
                if (nx < 2) croak("t_test: 'x' needs at least 2 elements");
3858
105
                AV*restrict y_av = NULL;
3859                if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV)
3860
126
                        y_av = (AV*)SvRV(y_sv);
3861
3862
126
                if (conf_level <= 0.0 || conf_level >= 1.0)
3863
3
                        croak("t_test: 'conf_level' must be between 0 and 1");
3864
3
                // --- Computation via Welford's Algorithm --- */
3865
3
                double mean_x = 0.0, M2_x = 0.0, var_x, t_stat, df, p_val, std_err, cint_est;
3866
123
                HV*restrict results = newHV();
3867
3
                for (size_t i = 0; i < nx; i++) {
3868
3
                        SV**restrict tv = av_fetch(x_av, i, 0);
3869
3
                        double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
3870                        double delta = val - mean_x;
3871
120
                        mean_x += delta / (i + 1);
3872
120
                        M2_x += delta * (val - mean_x);
3873
120
                }
3874                var_x = M2_x / (nx - 1);
3875
126
                if (var_x == 0.0 && !y_av) croak("t_test: data are essentially constant");
3876
3877
126
                if (paired || y_av) {
3878
126
                        if (!y_av) croak("t_test: 'y' must be provided for paired or two-sample tests");
3879
126
                        size_t ny = av_len(y_av) + 1;
3880
126
                        if (paired && ny != nx) croak("t_test: Paired arrays must be same length");
3881
126
                        double mean_y = 0.0, M2_y = 0.0, var_y;
3882
126
                        for (size_t i = 0; i < ny; i++) {
3883                                 SV**restrict tv = av_fetch(y_av, i, 0);
3884                                 double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
3885                                 double delta = val - mean_y;
3886                                 mean_y += delta / (i + 1);
3887                                 M2_y += delta * (val - mean_y);
3888                        }
3889
45
                        var_y = M2_y / (ny - 1);
3890
3
                        if (paired) {
3891                                 double mean_d = 0.0, M2_d = 0.0;
3892
42
                                 for (size_t i = 0; i < nx; i++) {
3893
42
                                          SV**restrict dx_ptr = av_fetch(x_av, i, 0);
3894                                          SV**restrict dy_ptr = av_fetch(y_av, i, 0);
3895
42
                                     double dx = (dx_ptr && SvOK(*dx_ptr)) ? SvNV(*dx_ptr) : 0.0;
3896
3
                                     double dy = (dy_ptr && SvOK(*dy_ptr)) ? SvNV(*dy_ptr) : 0.0;
3897                                     double val = dx - dy;
3898                                     double delta = val - mean_d;
3899                                     mean_d += delta / (i + 1);
3900
39
                                     M2_d += delta * (val - mean_d);
3901
471
                                 }
3902                                 double var_d = M2_d / (nx - 1);
3903
39
                                 if (var_d == 0.0) croak("t_test: data are essentially constant");
3904
39
                                 cint_est = mean_d;
3905
39
                                 std_err  = sqrt(var_d / nx);
3906                                 t_stat   = (cint_est - mu) / std_err;
3907                                 df       = nx - 1;
3908                                 hv_store(results, "estimate", 8, newSVnv(mean_d), 0);
3909
39
                        } else if (var_equal) {
3910
39
                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
3911                                 double pooled_var = ((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2);
3912
1107
                                 cint_est = mean_x - mean_y;
3913
1068
                                 std_err  = sqrt(pooled_var * (1.0 / nx + 1.0 / ny));
3914
1068
                                 t_stat   = (cint_est - mu) / std_err;
3915
1068
                                 df       = nx + ny - 2;
3916                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
3917                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
3918
39
                        } else {
3919                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
3920
39
                                 cint_est         = mean_x - mean_y;
3921
159
                                 double stderr_x2 = var_x / nx;
3922
153
                                 double stderr_y2 = var_y / ny;
3923
153
                                 std_err          = sqrt(stderr_x2 + stderr_y2);
3924                                 t_stat           = (cint_est - mu) / std_err;
3925
33
                                 df = pow(stderr_x2 + stderr_y2, 2) /
3926
6
                                      (pow(stderr_x2, 2) / (nx - 1) + pow(stderr_y2, 2) / (ny - 1));
3927
159
                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
3928
153
                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
3929
153
                        }
3930
153
                } else {
3931                        cint_est = mean_x;
3932
27
                        std_err  = sqrt(var_x / nx);
3933
6
                        t_stat   = (cint_est - mu) / std_err;
3934
159
                        df       = nx - 1;
3935
153
                        hv_store(results, "estimate", 8, newSVnv(mean_x), 0);
3936
153
                }
3937
153
                p_val = get_t_pvalue(t_stat, df, alternative);
3938                double alpha = 1.0 - conf_level, t_crit, ci_lower, ci_upper;
3939
21
                if (strcmp(alternative, "less") == 0) {
3940
6
                        t_crit   = qt_tail(df, alpha);
3941
159
                        ci_lower = -INFINITY;
3942
153
                        ci_upper = cint_est + t_crit * std_err;
3943
153
                } else if (strcmp(alternative, "greater") == 0) {
3944
153
                        t_crit   = qt_tail(df, alpha);
3945                        ci_lower = cint_est - t_crit * std_err;
3946
15
                        ci_upper = INFINITY;
3947
6
                } else {
3948
159
                        t_crit   = qt_tail(df, alpha / 2.0);
3949
6
                        ci_lower = cint_est - t_crit * std_err;
3950
159
                        ci_upper = cint_est + t_crit * std_err;
3951
153
                }
3952
153
                AV*restrict conf_int = newAV();
3953
153
                av_push(conf_int, newSVnv(ci_lower));
3954                av_push(conf_int, newSVnv(ci_upper));
3955
9
                hv_store(results, "statistic", 9, newSVnv(t_stat), 0);
3956                hv_store(results, "df",        2, newSVnv(df),     0);
3957
6
                hv_store(results, "p_value",   7, newSVnv(p_val),  0);
3958
6
                hv_store(results, "conf_int",  8, newRV_noinc((SV*)conf_int), 0);
3959                RETVAL = newRV_noinc((SV*)results);
3960
6
        }
3961
153
        OUTPUT:
3962
147
          RETVAL
3963
3964
0
void p_adjust(SV* p_sv, const char* method = "holm")
3965        INIT:
3966                if (!SvROK(p_sv) || SvTYPE(SvRV(p_sv)) != SVt_PVAV) {
3967                        croak("p_adjust: first argument must be an ARRAY reference of p-values");
3968
159
                }
3969
153
                AV *restrict p_av = (AV*)SvRV(p_sv);
3970
153
                size_t n = av_len(p_av) + 1;
3971                // Handle empty input
3972
150
                if (n == 0) {
3973
144
                        XSRETURN_EMPTY;
3974
144
                }
3975                // Normalize method string
3976
144
                char meth[64];
3977
3528
                strncpy(meth, method, 63); meth[63] = '\0';
3978
3384
                for(unsigned short int i = 0; meth[i]; i++) meth[i] = tolower(meth[i]);
3979
3384
                // Resolve aliases
3980
798
                if (strstr(meth, "benjamini") && strstr(meth, "hochberg")) strcpy(meth, "bh");
3981                if (strstr(meth, "benjamini") && strstr(meth, "yekutieli")) strcpy(meth, "by");
3982                if (strcmp(meth, "fdr") == 0) strcpy(meth, "bh");
3983                // Allocate C memory
3984
3816
                PVal *restrict arr;
3985
3672
                double *restrict adj;
3986
3672
                Newx(arr, n, PVal);
3987                Newx(adj, n, double);
3988
3989
3672
                for (size_t i = 0; i < n; i++) {
3990
3528
                        SV**restrict tv = av_fetch(p_av, i, 0);
3991                        arr[i].p = (tv && SvOK(*tv)) ? SvNV(*tv) : 1.0;
3992                        arr[i].orig_idx = i;
3993
7344
                }
3994
7200
                // Sort ascending (Stable sort using original index)
3995
4203
                qsort(arr, n, sizeof(PVal), cmp_pval);
3996        PPCODE:
3997                if (strcmp(meth, "bonferroni") == 0) {
3998                        for (size_t i = 0; i < n; i++) {
3999                                double v = arr[i].p * n;
4000
159
                                adj[arr[i].orig_idx] = (v < 1.0) ? v : 1.0;
4001
153
                        }
4002
153
                } else if (strcmp(meth, "holm") == 0) {
4003
153
                        double cummax = 0.0;
4004                        for (size_t i = 0; i < n; i++) {
4005
6
                                 double v = arr[i].p * (n - i);
4006
3
                                 if (v > cummax) cummax = v;
4007
0
                                 adj[arr[i].orig_idx] = (cummax < 1.0) ? cummax : 1.0;
4008
0
                        }
4009                } else if (strcmp(meth, "hochberg") == 0) {
4010                        double cummin = 1.0;
4011
3
                        for (ssize_t i = n - 1; i >= 0; i--) {
4012
3
                                 double v = arr[i].p * (n - i);
4013                                 if (v < cummin) cummin = v;
4014                                 adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4015
36
                        }
4016
954
                } else if (strcmp(meth, "bh") == 0) {
4017
918
                        double cummin = 1.0;
4018                        for (ssize_t i = n - 1; i >= 0; i--) {
4019
36
                                double v = arr[i].p * n / (i + 1.0);
4020
36
                                if (v < cummin) cummin = v;
4021                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4022                        }
4023                } else if (strcmp(meth, "by") == 0) {
4024                        double q = 0.0;
4025
18
                        for (size_t i = 1; i <= n; i++) q += 1.0 / i;
4026                        double cummin = 1.0;
4027
18
                        for (ssize_t i = n - 1; i >= 0; i--) {
4028                                double v = arr[i].p * n / (i + 1.0) * q;
4029                                if (v < cummin) cummin = v;
4030
42
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4031
24
                        }
4032
36
                } else if (strcmp(meth, "hommel") == 0) {
4033
12
                        double *restrict pa, *restrict q_arr;
4034
12
                        Newx(pa, n, double);
4035
129
                        Newx(q_arr, n, double);
4036
117
                        // Initial: min(n * p[i] / (i + 1))
4037
117
                        double min_val = n * arr[0].p;
4038                        for (size_t i = 1; i < n; i++) {
4039
12
                                double temp = (n * arr[i].p) / (i + 1.0);
4040
12
                                if (temp < min_val) {
4041                                   min_val = temp;
4042                                }
4043
18
                        }
4044                        // pa <- q <- rep(min, n)
4045
15
                        for (size_t i = 0; i < n; i++) {
4046                                 pa[i] = min_val;
4047
39
                                 q_arr[i] = min_val;
4048
24
                        }
4049
36
                        for (size_t j = n - 1; j >= 2; j--) {
4050
12
                                 ssize_t n_mj = n - j;       // Max index for 'ij'. Length is n_mj + 1
4051
12
                                 ssize_t i2_len = j - 1;     // Length of 'i2
4052
129
                                 // Calculate q1 = min(j * p[i2] / (2:j))
4053
117
                                 double q1 = (j * arr[n_mj + 1].p) / 2.0;
4054
117
                                 for (size_t k = 1; k < i2_len; k++) {
4055
117
                                     double temp_q1 = (j * arr[n_mj + 1 + k].p) / (2.0 + k);
4056                                     if (temp_q1 < q1) {
4057                                         q1 = temp_q1;
4058
12
                                     }
4059
12
                                 }
4060                                 // q[ij] <- pmin(j * p[ij], q1)
4061                                 for (size_t i = 0; i <= n_mj; i++) {
4062                                     double v = j * arr[i].p;
4063
15
                                     q_arr[i] = (v < q1) ? v : q1;
4064
15
                                 }
4065
12
                                 // q[i2] <- q[n - j]
4066                                 for (size_t i = 0; i < i2_len; i++) {
4067
3
                                     q_arr[n_mj + 1 + i] = q_arr[n_mj];
4068                                }
4069
15
                                 // pa <- pmax(pa, q)
4070
15
                                for (size_t i = 0; i < n; i++) {
4071                                    if (pa[i] < q_arr[i]) {
4072                                       pa[i] = q_arr[i];
4073                                    }
4074                                }
4075                        }
4076                        // pmin(1, pmax(pa, p))[ro] — map sorted results back to original indices
4077
18
                        for (size_t i = 0; i < n; i++) {
4078
9
                                double v = (pa[i] > arr[i].p) ? pa[i] : arr[i].p;
4079
6
                                if (v > 1.0) v = 1.0;
4080
3
                                adj[arr[i].orig_idx] = v;
4081                        }
4082                        Safefree(pa);  Safefree(q_arr);
4083                } else if (strcmp(meth, "none") == 0) {
4084
15
                        for (size_t i = 0; i < n; i++) {
4085
0
                                adj[arr[i].orig_idx] = arr[i].p;
4086                        }
4087
15
                } else {
4088
15
                        Safefree(arr); Safefree(adj);
4089
15
                        croak("Unknown p-value adjustment method: %s", method);
4090                }
4091                // Push values onto the Perl stack as a flat list
4092
15
                EXTEND(SP, n);
4093                for (size_t i = 0; i < n; i++) {
4094
15
                        PUSHs(sv_2mortal(newSVnv(adj[i])));
4095
15
                }
4096
3
                Safefree(arr); arr = NULL;
4097                Safefree(adj); adj = NULL;
4098
4099double median(...)
4100
30
        PROTOTYPE: @
4101
15
        INIT:
4102          size_t total_count = 0, k = 0;
4103
15
          double* restrict nums;
4104
15
          double median_val = 0.0;
4105        CODE:
4106
15
          /* Pass 1: Count valid elements — die immediately on any undef */
4107
15
          for (size_t i = 0; i < items; i++) {
4108
15
                   SV* restrict arg = ST(i);
4109
15
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4110
3
                       AV* restrict av = (AV*)SvRV(arg);
4111                       size_t len = av_len(av) + 1;
4112                       for (size_t j = 0; j < len; j++) {
4113                           SV** restrict tv = av_fetch(av, j, 0);
4114                           if (tv && SvOK(*tv)) {
4115
15
                               total_count++;
4116
12
                           } else {
4117                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4118
0
                           }
4119                       }
4120
12
                   } else if (SvOK(arg)) {
4121
3
                       total_count++;
4122                   } else {
4123                       croak("median: undefined value at argument index %zu", i);
4124
9
                   }
4125
0
          }
4126          if (total_count == 0) croak("median needs >= 1 element");
4127
4128
9
          /* Allocate C array now that we know the exact size */
4129
9
          Newx(nums, total_count, double);
4130
4131
90
          /* Pass 2: Populate the C array — Safefree before any croak */
4132
81
          for (size_t i = 0; i < items; i++) {
4133
81
                   SV* restrict arg = ST(i);
4134                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4135
90
                       AV* restrict av = (AV*)SvRV(arg);
4136
81
                       size_t len = av_len(av) + 1;
4137
81
                       for (size_t j = 0; j < len; j++) {
4138                           SV** restrict tv = av_fetch(av, j, 0);
4139                           if (tv && SvOK(*tv)) {
4140
9
                               nums[k++] = SvNV(*tv);
4141
9
                           } else {
4142
9
                               Safefree(nums);
4143                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4144                           }
4145                       }
4146
3
                   } else if (SvOK(arg)) {
4147
0
                       nums[k++] = SvNV(arg);
4148                   } else {
4149                       Safefree(nums);
4150
3
                       croak("median: undefined value at argument index %zu", i);
4151
3
                   }
4152
0
          }
4153
4154
3
          /* Sort and calculate median */
4155
3
          qsort(nums, total_count, sizeof(double), compare_doubles);
4156          if (total_count % 2 == 0) {
4157
3
                   median_val = (nums[total_count / 2 - 1] + nums[total_count / 2]) / 2.0;
4158          } else {
4159                   median_val = nums[total_count / 2];
4160
12
          }
4161
9
          Safefree(nums);
4162
9
          nums = NULL;
4163
0
          RETVAL = median_val;
4164        OUTPUT:
4165          RETVAL
4166
4167
3
SV* cor(SV* x_sv, SV* y_sv = &PL_sv_undef, const char* method = "pearson")
4168
12
        INIT:
4169
9
          // --- validate method -------------------------------------------
4170
9
          if (strcmp(method, "pearson")  != 0 &&
4171
0
                   strcmp(method, "spearman") != 0 &&
4172                   strcmp(method, "kendall")  != 0)
4173                   croak("cor: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')",
4174                         method);
4175
4176          // --- validate x ------------------------------------------------
4177
3
          if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4178                   croak("cor: x must be an ARRAY reference");
4179
4180
6
          AV*restrict x_av = (AV*)SvRV(x_sv);
4181
24
          size_t nx   = av_len(x_av) + 1;
4182
18
          if (nx == 0) croak("cor: x is empty");
4183
4184
18
          // --- detect whether x is a flat vector or a matrix (AoA) -------
4185
18
          bool x_is_matrix = 0;
4186          {
4187                   SV**restrict fp = av_fetch(x_av, 0, 0);
4188                   if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4189                       x_is_matrix = 1;
4190          }
4191
4192
3
          // --- detect y ----------------------------
4193          bool has_y = (SvOK(y_sv) && SvROK(y_sv) &&
4194                            SvTYPE(SvRV(y_sv)) == SVt_PVAV);
4195
4196          AV*restrict y_av = has_y ? (AV*)SvRV(y_sv) : NULL;
4197
3
          size_t ny = has_y ? av_len(y_av) + 1 : 0;
4198
4199
3
          bool y_is_matrix = 0;
4200          if (has_y && ny > 0) {
4201
3
                   SV**restrict fp = av_fetch(y_av, 0, 0);
4202
9
                   if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4203
6
                       y_is_matrix = 1;
4204
24
          }
4205
4206
18
        CODE:
4207
18
      // Branch 1: both inputs are flat vectors  â†’  scalar result
4208
18
          if (!x_is_matrix && !y_is_matrix) {
4209                   if (!has_y) {
4210                       /* cor(vector) == 1 by definition */
4211                       RETVAL = newSVnv(1.0);
4212
0
                   } else {
4213
0
                       if (nx != ny)
4214
0
                           croak("cor: x and y must have the same length (%lu vs %lu)",
4215                                 nx, ny);
4216
4217
0
                       if (nx < 2)
4218                           croak("cor: need at least 2 observations");
4219
4220
3
                       double *restrict xd, *restrict yd;
4221                       Newx(xd, nx, double);
4222                       Newx(yd, ny, double);
4223
4224
9
                       for (size_t i = 0; i < nx; i++) {
4225
6
                           SV**restrict tv = av_fetch(x_av, i, 0);
4226
6
                           xd[i] = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4227                       }
4228
3
                       for (size_t i = 0; i < ny; i++) {
4229                           SV**restrict tv = av_fetch(y_av, i, 0);
4230                           yd[i] = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4231
0
                       }
4232
4233
0
                       double r = compute_cor(xd, yd, nx, method);
4234                       Safefree(xd); Safefree(yd);
4235
0
                       RETVAL = newSVnv(r);
4236
0
                   }
4237
0
          } else {//Branch 2: x is a matrix (or y is a matrix)  â†’  AoA result
4238
0
                   // -- resolve x matrix dimensions
4239
0
                   if (!x_is_matrix)
4240
0
                       croak("cor: x must be a matrix (array ref of array refs) "
4241                             "when y is a matrix");
4242
4243                   SV**restrict xr0 = av_fetch(x_av, 0, 0);
4244
0
                   if (!xr0 || !SvROK(*xr0) || SvTYPE(SvRV(*xr0)) != SVt_PVAV)
4245
0
                       croak("cor: each row of x must be an ARRAY reference");
4246
4247                   size_t ncols_x = av_len((AV*)SvRV(*xr0)) + 1;
4248
0
                   if (ncols_x == 0) croak("cor: x matrix has zero columns");
4249
4250                   size_t nrows   = nx;   /* observations */
4251
4252
9
                   // PRE-VALIDATION PASS: Ensure all rows are arrays to prevent memory leaks on croak
4253
18
                   for (size_t i = 0; i < nrows; i++) {
4254
12
                       SV**restrict rv = av_fetch(x_av, i, 0);
4255                       if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4256                           croak("cor: x row %lu is not an array ref", i);
4257
9
                   }
4258
4259
3
           if (has_y && y_is_matrix) {
4260                       if (ny != nrows) croak("cor: x and y must have the same number of rows (%lu vs %lu)", nrows, ny);
4261
9
               for (size_t i = 0; i < nrows; i++) {
4262
3
                           SV**restrict rv = av_fetch(y_av, i, 0);
4263
3
                           if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4264
9
                               croak("cor: y row %lu is not an array ref", i);
4265
3
                       }
4266           }
4267
4268                   // -- extract x columns
4269                   double **restrict col_x;
4270                   Newx(col_x, ncols_x, double*);
4271
4272                   for (size_t j = 0; j < ncols_x; j++) {
4273                       Newx(col_x[j], nrows, double);
4274                       for (size_t i = 0; i < nrows; i++) {
4275                           SV**restrict rv = av_fetch(x_av, i, 0);
4276
15
                           AV*restrict  row = (AV*)SvRV(*rv);
4277
15
                           SV**restrict cv  = av_fetch(row, j, 0);
4278
15
                           col_x[j][i] = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4279                       }
4280
15
                   }
4281
4282
15
                   // -- resolve y: separate matrix or re-use x (symmetric)
4283
6
                   size_t ncols_y;
4284
6
                   double **restrict col_y   = NULL;
4285                   bool symmetric = 0;
4286
4287
6
                   // 1 = cor(X) — result is symmetric
4288
6
                   if (has_y && y_is_matrix) {
4289
6
                       // cross-correlation: X (nrows × p) vs Y (nrows × q)
4290
0
                       SV**restrict yr0 = av_fetch(y_av, 0, 0);
4291                       ncols_y = av_len((AV*)SvRV(*yr0)) + 1;
4292
6
                       if (ncols_y == 0) croak("cor: y matrix has zero columns");
4293
4294
6
                       Newx(col_y, ncols_y, double*);
4295
3
                       for (size_t j = 0; j < ncols_y; j++) {
4296
3
                           Newx(col_y[j], nrows, double);
4297
3
                           for (size_t i = 0; i < nrows; i++) {
4298
0
                               SV**restrict  rv = av_fetch(y_av, i, 0);
4299
0
                               AV*restrict  row = (AV*)SvRV(*rv);
4300
0
                               SV**restrict cv  = av_fetch(row, j, 0);
4301
0
                               col_y[j][i] = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
4302                           }
4303
0
                       }
4304                   } else { // cor(X) — symmetric p×p result; share column arrays
4305                       ncols_y  = ncols_x;
4306                       col_y    = col_x;
4307                       symmetric = 1;
4308
6
                   }
4309
6
                   if (nrows < 2)
4310
3
                       croak("cor: need at least 2 observations (got %lu)", nrows);
4311
3
                   // -- build cache for symmetric case: compute upper triangle, store results, mirror to lower triangle
4312
0
                   AV*restrict result_av = newAV();
4313                   av_extend(result_av, ncols_x - 1);
4314
3
                   // Allocate per-row AVs up front so we can fill them in order
4315
3
                   AV **restrict rows_out;
4316
0
                   Newx(rows_out, ncols_x, AV*);
4317
3
                   for (size_t i = 0; i < ncols_x; i++) {
4318
3
                       rows_out[i] = newAV();
4319
0
                       av_extend(rows_out[i], ncols_y - 1);
4320
0
                   }
4321
0
                   if (symmetric) {
4322
0
                       /* Upper triangle + diagonal, then mirror. r_cache[i][j] (j >= i) holds the computed value. */
4323
0
                       double **restrict r_cache;
4324                       Newx(r_cache, ncols_x, double*);
4325
0
                       for (size_t i = 0; i < ncols_x; i++)
4326                           Newx(r_cache[i], ncols_x, double);
4327
4328                       for (size_t i = 0; i < ncols_x; i++) {
4329                           r_cache[i][i] = 1.0; // diagonal
4330                           for (size_t j = i + 1; j < ncols_x; j++) {
4331                               double r = compute_cor(col_x[i], col_x[j], nrows, method);
4332
15
                               r_cache[i][j] = r;
4333
15
                               r_cache[j][i] = r; // symmetry
4334
6
                           }
4335
6
                       }
4336
3
                       // fill output AoA from cache
4337
3
                       for (size_t i = 0; i < ncols_x; i++)
4338
3
                           for (size_t j = 0; j < ncols_x; j++)
4339
3
                               av_store(rows_out[i], j, newSVnv(r_cache[i][j]));
4340
4341                       for (size_t i = 0; i < ncols_x; i++) Safefree(r_cache[i]);
4342                       Safefree(r_cache); r_cache = NULL;
4343                   } else {
4344                       // cross-correlation: every (i,j) pair is independent
4345
15
                       for (size_t i = 0; i < ncols_x; i++)
4346                           for (size_t j = 0; j < ncols_y; j++)
4347                               av_store(rows_out[i], j, newSVnv(compute_cor(col_x[i], col_y[j], nrows, method)));
4348                   }
4349
3
                   // push row AVs into result
4350
3
                   for (size_t i = 0; i < ncols_x; i++)
4351                       av_store(result_av, i, newRV_noinc((SV*)rows_out[i]));
4352
3
                   Safefree(rows_out); rows_out = NULL;
4353
3
                   // -- free column arrays -------------------------------------
4354                   for (size_t j = 0; j < ncols_x; j++) Safefree(col_x[j]);
4355
3
                   Safefree(col_x); col_x = NULL;
4356                   if (!symmetric) {
4357                       for (size_t j = 0; j < ncols_y; j++) Safefree(col_y[j]);
4358
3
                       Safefree(col_y);
4359
3
                   }
4360
3
                   RETVAL = newRV_noinc((SV*)result_av);
4361
12
          }
4362
9
        OUTPUT:
4363
9
                RETVAL
4364
4365void scale(...)
4366        PROTOTYPE: @
4367
9
        PPCODE:
4368
6
        {
4369                bool do_center_mean = true, do_scale_sd = true;
4370
6
                double center_val = 0.0, scale_val = 1.0;
4371                size_t data_items = items;
4372
24
                // 1. Parse Options Hash (if it exists as the last argument)
4373
18
                if (items > 0) {
4374
18
                        SV*restrict last_arg = ST(items - 1);
4375
18
                        if (SvROK(last_arg) && SvTYPE(SvRV(last_arg)) == SVt_PVHV) {
4376
18
                                 data_items = items - 1; // Exclude hash from data processing
4377
18
                                 HV*restrict opt_hv = (HV*)SvRV(last_arg);
4378                                 // --- Parse 'center'
4379
0
                                 SV**restrict center_sv = hv_fetch(opt_hv, "center", 6, 0);
4380                                 if (center_sv) {
4381
18
                                     SV*restrict val_sv = *center_sv;
4382                                     if (!SvOK(val_sv)) {
4383                                         do_center_mean = false; center_val = 0.0;
4384
6
                                     } else {
4385
6
                                         char *restrict str = SvPV_nolen(val_sv);
4386                                         /* Trap booleans and empty strings before numeric checks */
4387
6
                                         if (strcasecmp(str, "mean") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
4388
6
                                             do_center_mean = true;
4389
0
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
4390
0
                                             do_center_mean = false; center_val = 0.0;
4391
0
                                         } else if (looks_like_number(val_sv)) {
4392                                             do_center_mean = false; center_val = SvNV(val_sv);
4393
6
                                         } else if (SvTRUE(val_sv)) {
4394
24
                                             do_center_mean = true;
4395
18
                                         } else {
4396
18
                                             do_center_mean = false; center_val = 0.0;
4397                                         }
4398
6
                                     }
4399                                 }
4400                                 // --- Parse 'scale' ---
4401
24
                                 SV**restrict scale_sv = hv_fetch(opt_hv, "scale", 5, 0);
4402
18
                                 if (scale_sv) {
4403
18
                                     SV*restrict val_sv = *scale_sv;
4404
18
                                     if (!SvOK(val_sv)) {
4405                                         do_scale_sd = false; scale_val = 1.0;
4406
6
                                     } else {
4407                                         char *restrict str = SvPV_nolen(val_sv);
4408
3
                                         if (strcasecmp(str, "sd") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
4409                                             do_scale_sd = true;
4410
3
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
4411
3
                                             do_scale_sd = false; scale_val = 1.0;
4412                                         } else if (looks_like_number(val_sv)) {
4413                                             do_scale_sd = false; scale_val = SvNV(val_sv);
4414                                             if (scale_val == 0.0) scale_val = 1.0; /* Prevent Division By Zero */
4415                                         } else if (SvTRUE(val_sv)) {
4416
12
                                             do_scale_sd = true;
4417                                         } else {
4418
12
                                             do_scale_sd = false; scale_val = 1.0;
4419
60
                                         }
4420
48
                                     }
4421
48
                                 }
4422
0
                        }
4423
0
                }
4424
0
                // 2. Detect if the input is a Matrix (Array of Arrays)
4425
0
                bool is_matrix = false;
4426
0
                if (data_items == 1) {
4427                        SV*restrict first_arg = ST(0);
4428
48
                        if (SvROK(first_arg) && SvTYPE(SvRV(first_arg)) == SVt_PVAV) {
4429
48
                                 AV*restrict av = (AV*)SvRV(first_arg);
4430                                 if (av_len(av) >= 0) {
4431                                     SV**restrict first_elem = av_fetch(av, 0, 0);
4432
12
                                     if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
4433
12
                                         is_matrix = true;
4434
60
                                     }
4435
48
                                 }
4436
48
                        }
4437
0
                }
4438
0
                if (is_matrix) {
4439
0
                        //=========================================================
4440
0
                        // MATRIX MODE: Scale columns independently (Just like R)
4441
0
                        //=========================================================
4442
0
                        AV*restrict mat_av = (AV*)SvRV(ST(0));
4443
0
                        size_t nrow = av_len(mat_av) + 1, ncol = 0;
4444
4445                        SV**restrict first_row = av_fetch(mat_av, 0, 0);
4446
48
                        ncol = av_len((AV*)SvRV(*first_row)) + 1;
4447
4448
48
                        if (nrow == 0 || ncol == 0) croak("scale requires non-empty matrix");
4449
4450                        // Create a new matrix for the scaled output
4451                        AV*restrict result_av = newAV();
4452
12
                        av_extend(result_av, nrow - 1);
4453                        AV**restrict row_ptrs = (AV**)safemalloc(nrow * sizeof(AV*));
4454
12
                        for (size_t r = 0; r < nrow; r++) {
4455
9
                                 row_ptrs[r] = newAV();
4456
3
                                 av_extend(row_ptrs[r], ncol - 1);
4457
3
                                 av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
4458                        }
4459
6
                        // Calculate and apply scale per column
4460
36
                        for (size_t c = 0; c < ncol; c++) {
4461
30
                                 double col_sum = 0.0;
4462
30
                                 double *restrict col_data;
4463                                 Newx(col_data, nrow, double);
4464
6
                                 // Extract the column data
4465                                 for (size_t r = 0; r < nrow; r++) {
4466
9
                                     SV**restrict row_sv = av_fetch(mat_av, r, 0);
4467
54
                                     if (row_sv && SvROK(*row_sv)) {
4468
45
                                         AV*restrict row_av = (AV*)SvRV(*row_sv);
4469
45
                                         SV**restrict cell_sv = av_fetch(row_av, c, 0);
4470
45
                                         col_data[r] = (cell_sv && SvOK(*cell_sv)) ? SvNV(*cell_sv) : 0.0;
4471                                     } else {
4472
9
                                         col_data[r] = 0.0;
4473                                     }
4474                                     col_sum += col_data[r];
4475                                 }
4476
4477                                 double col_center = do_center_mean ? (col_sum / nrow) : center_val;
4478                                 double col_scale = scale_val;
4479
12
                                 // Calculate Standard Deviation for this specific column if needed
4480
0
                                 if (do_scale_sd) {
4481                                     if (nrow <= 1) {
4482
12
                                         Safefree(col_data);
4483
12
                                         safefree(row_ptrs);
4484
12
                                         croak("scale needs >= 2 rows to calculate standard deviation for a matrix column");
4485                                     }
4486
36
                                     double sum_sq = 0.0;
4487
24
                                     for (size_t r = 0; r < nrow; r++) {
4488
24
                                         double diff = col_data[r] - col_center;
4489
24
                                         sum_sq += diff * diff;
4490
12
                                     }
4491
12
                                     col_scale = sqrt(sum_sq / (nrow - 1));
4492
9
                                 }
4493
9
                                 // Store scaled values back into the new matrix rows
4494
3
                                 for (size_t r = 0; r < nrow; r++) {
4495
3
                                     double centered = col_data[r] - col_center;
4496
3
                                     double final_val = (col_scale == 0.0) ? (0.0 / 0.0) : (centered / col_scale);
4497
0
                                     av_store(row_ptrs[r], c, newSVnv(final_val));
4498
0
                                 }
4499                                 Safefree(col_data);
4500
0
                        }
4501                        safefree(row_ptrs);
4502                        // Push the resulting matrix as a single Reference onto the Perl stack
4503                        EXTEND(SP, 1);
4504
12
                        PUSHs(sv_2mortal(newRV_noinc((SV*)result_av)));
4505
3
                } else {
4506                        // ======================================
4507
9
                        // FLAT LIST MODE: Original functionality
4508
9
                        // ======================================
4509
9
                        size_t total_count = 0, k = 0;
4510
3
                        double *restrict nums;
4511                        double sum = 0.0;
4512                        for (size_t i = 0; i < data_items; i++) {
4513
6
                                SV*restrict arg = ST(i);
4514
0
                                if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4515
0
                                        AV*restrict av = (AV*)SvRV(arg);
4516
6
                                        size_t len = av_len(av) + 1;
4517
3
                                        for (unsigned int j = 0; j < len; j++) {
4518
3
                                                SV**restrict tv = av_fetch(av, j, 0);
4519
0
                                                if (tv && SvOK(*tv)) { total_count++; }
4520                                        }
4521                                } else if (SvOK(arg)) {
4522
6
                                        total_count++;
4523
3
                                }
4524                        }
4525                        if (total_count == 0) croak("scale requires at least 1 numeric element");
4526
3
                        Newx(nums, total_count, double);
4527
3
                        for (size_t i = 0; i < data_items; i++) {
4528                                 SV*restrict arg = ST(i);
4529
3
                                 if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4530
9
                                     AV*restrict av = (AV*)SvRV(arg);
4531
6
                                     size_t len = av_len(av) + 1;
4532
6
                                     for (size_t j = 0; j < len; j++) {
4533
6
                                         SV**restrict tv = av_fetch(av, j, 0);
4534                                         if (tv && SvOK(*tv)) {
4535                                             double val = SvNV(*tv);
4536
3
                                             nums[k++] = val; sum += val;
4537
21
                                         }
4538                                     }
4539
18
                                 } else if (SvOK(arg)) {
4540
18
                                     double val = SvNV(arg);
4541
18
                                     nums[k++] = val; sum += val;
4542
0
                                 }
4543
0
                        }
4544                        if (do_center_mean) center_val = sum / total_count;
4545
18
                        if (do_scale_sd) {
4546
18
                                 if (total_count <= 1) {
4547                                     Safefree(nums);
4548
18
                                     croak("scale needs >= 2 elements to calculate SD");
4549                                 }
4550
3
                                 double sum_sq = 0.0;
4551
3
                                 for (size_t i = 0; i < total_count; i++) {
4552                                     double diff = nums[i] - center_val;
4553                                     sum_sq += diff * diff;
4554                                 }
4555                                 scale_val = sqrt(sum_sq / (total_count - 1));
4556                        }
4557                        EXTEND(SP, total_count);
4558
66
                        for (size_t i = 0; i < total_count; i++) {
4559
66
                                double centered = nums[i] - center_val;
4560                                double final_val = (scale_val == 0.0) ? (0.0 / 0.0) : (centered / scale_val);
4561                                PUSHs(sv_2mortal(newSVnv(final_val)));
4562                        }
4563
66
                        Safefree(nums); nums = NULL;
4564
66
                }
4565
66
        }
4566
4567
66
SV* matrix(...)
4568
66
CODE:
4569        // Basic check: must have an even number of arguments for key => value
4570
66
        if (items % 2 != 0) {
4571
66
          croak("Usage: matrix(data => [...], nrow => $n, ncol => $m, byrow => $bool)");
4572
66
        }
4573
66
        SV*restrict data_sv = NULL;
4574        size_t nrow = 0, ncol = 0;
4575
66
        bool byrow = FALSE, nrow_set = FALSE, ncol_set = FALSE;
4576
66
        // Parse named arguments
4577
66
        for (size_t i = 0; i < items; i += 2) {
4578
66
          char*restrict key = SvPV_nolen(ST(i));
4579          SV*restrict val   = ST(i + 1);
4580          if (strEQ(key, "data")) {
4581
66
                   data_sv = val;
4582          } else if (strEQ(key, "nrow")) {
4583                   nrow = (size_t)SvUV(val);
4584
66
                   nrow_set = TRUE;
4585          } else if (strEQ(key, "ncol")) {
4586
192
                   ncol = (size_t)SvUV(val);
4587
126
                   ncol_set = TRUE;
4588
126
          } else if (strEQ(key, "byrow")) {
4589
126
                   byrow = SvTRUE(val);
4590
63
          } else {
4591
0
                   croak("Unknown option: %s", key);
4592          }
4593
66
        }
4594
63
        // Validate data input
4595        if (!data_sv || !SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVAV) {
4596          croak("The 'data' option must be an array reference (e.g. data => [1..6])");
4597        }
4598        AV*restrict data_av = (AV*)SvRV(data_sv);
4599
57
        size_t data_len = (UV)(av_top_index(data_av) + 1);
4600
57
        if (data_len == 0) {
4601
57
          croak("Data array cannot be empty");
4602
57
        }
4603
57
        // R-style dimension inference
4604
57
        if (!nrow_set && !ncol_set) {
4605
57
          nrow = data_len;
4606
57
          ncol = 1;
4607
36
        } else if (nrow_set && !ncol_set) {
4608
36
          ncol = (data_len + nrow - 1) / nrow;
4609
36
        } else if (!nrow_set && ncol_set) {
4610
246
          nrow = (data_len + ncol - 1) / ncol;
4611        }
4612
210
        // Final safety check for dimensions
4613
210
        if (nrow == 0 || ncol == 0) {
4614          croak("Dimensions must be greater than 0");
4615
21
        }
4616
21
        // Create the matrix (Array of Arrays)
4617
21
        AV*restrict result_av = newAV();
4618
21
        av_extend(result_av, nrow - 1);
4619
693
        size_t r, c;// Use unsigned types for counters to prevent negative indexing
4620        AV**restrict row_ptrs = (AV**restrict)safemalloc(nrow * sizeof(AV*)); /* Pre-allocate row pointers */
4621
672
        for (r = 0; r < nrow; r++) {
4622
672
          row_ptrs[r] = newAV();
4623
672
          av_extend(row_ptrs[r], ncol - 1);
4624          av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
4625
0
        }
4626        // Fill the matrix
4627
0
        size_t total_cells = nrow * ncol;
4628
0
        for (size_t i = 0; i < total_cells; i++) {
4629
0
          // Vector recycling logic
4630
0
          SV**restrict fetched = av_fetch(data_av, i % data_len, 0);
4631
0
          SV*restrict val = fetched ? newSVsv(*fetched) : newSV(0);
4632
0
          if (byrow) {
4633
0
                   r = i / ncol;
4634
0
                   c = i % ncol;
4635
0
          } else {
4636
0
                   r = i % nrow;
4637                   c = i / nrow;
4638
0
          }
4639
0
          av_store(row_ptrs[r], c, val);
4640
0
        }
4641        safefree(row_ptrs);
4642        RETVAL = newRV_noinc((SV*)result_av);
4643
0
        OUTPUT:
4644        RETVAL
4645
4646SV* lm(...)
4647CODE:
4648
57
{
4649
645
        const char *restrict formula  = NULL;
4650
57
        SV *restrict data_sv = NULL;
4651        char f_cpy[512];
4652
57
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
4653
4654
9
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
4655
3
        bool *restrict is_dummy = NULL;
4656
3
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
4657        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
4658
54
        size_t n = 0, valid_n = 0, i, j, k, l, l1, l2;
4659
54
        bool has_intercept = true;
4660
4661        char **restrict row_names = NULL, **restrict valid_row_names = NULL;
4662        HV **restrict row_hashes = NULL;
4663        HV *restrict data_hoa = NULL;
4664        SV *restrict ref = NULL;
4665
4666
54
        double *restrict X = NULL, *restrict Y = NULL, *restrict XtX = NULL, *restrict XtY = NULL;
4667
267
        bool *restrict aliased = NULL;
4668        double *restrict beta = NULL;
4669
213
        int final_rank = 0, df_res = 0;
4670
0
        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
4671
0
        AV *restrict terms_av;
4672
0
        double rss = 0.0, rse_sq = 0.0;
4673        HE *restrict entry;
4674
4675
213
        if (items % 2 != 0) croak("Usage: lm(formula => 'mpg ~ wt * hp', data => \\%%mtcars)");
4676
4677
3
        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
4678
3
          const char *restrict key = SvPV_nolen(ST(i_arg));
4679
3
          SV *restrict val = ST(i_arg + 1);
4680          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
4681          else if (strEQ(key, "data"))    data_sv = val;
4682
210
          else croak("lm: unknown argument '%s'", key);
4683
0
        }
4684
0
        if (!formula) croak("lm: formula is required");
4685
0
        if (!data_sv || !SvROK(data_sv)) croak("lm: data is required and must be a reference");
4686
4687        // ========================================================================
4688        // PHASE 1: Data Extraction
4689
210
        // ========================================================================
4690
0
        ref = SvRV(data_sv);
4691
0
        if (SvTYPE(ref) == SVt_PVHV) {
4692
0
          HV *restrict hv = (HV*)ref;
4693          if (hv_iterinit(hv) == 0) croak("lm: Data hash is empty");
4694          entry = hv_iternext(hv);
4695
210
          if (entry) {
4696
0
                   SV *restrict val = hv_iterval(hv, entry);
4697                   if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
4698                       data_hoa = hv;
4699
210
                       n = av_len((AV*)SvRV(val)) + 1;
4700
0
                       Newx(row_names, n, char*);
4701
0
                       for (i = 0; i < n; i++) {
4702
0
                           char buf[32];
4703                           snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
4704                           row_names[i] = savepv(buf);
4705
210
                       }
4706
54
                   } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
4707
54
                       n = hv_iterinit(hv);
4708                       Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
4709
210
                       i = 0;
4710                       while ((entry = hv_iternext(hv))) {
4711                           unsigned int len;
4712                           row_names[i] = savepv(hv_iterkey(entry, &len));
4713                           row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
4714                           i++;
4715                       }
4716
54
                   } else croak("lm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
4717
0
          }
4718
54
        } else if (SvTYPE(ref) == SVt_PVAV) {
4719
54
          AV *restrict av = (AV*)ref; n = av_len(av) + 1;
4720
54
          Newx(row_names, n, char*);
4721          Newx(row_hashes, n, HV*);
4722          for (i = 0; i < n; i++) {
4723                   SV **restrict val = av_fetch(av, i, 0);
4724
54
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
4725
54
                       row_hashes[i] = (HV*)SvRV(*val);
4726
54
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
4727
132
                       row_names[i] = savepv(buf);
4728
78
                   } else {
4729
3
                       for (k = 0; k < i; k++) Safefree(row_names[k]);
4730
12
                       Safefree(row_names); Safefree(row_hashes);
4731
9
                       croak("lm: Array values must be HashRefs (AoH)");
4732
9
                   }
4733
9
          }
4734
9
        } else croak("lm: Data must be an Array or Hash reference");
4735
4736
6
        // ========================================================================
4737
6
        // PHASE 2: Formula Parsing & `.` Expansion
4738
6
        // ========================================================================
4739
6
        src = (char*)formula; dst = f_cpy;
4740        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
4741        *dst = '\0';
4742
4743        tilde = strchr(f_cpy, '~');
4744
3
        if (!tilde) {
4745          for (i = 0; i < n; i++) Safefree(row_names[i]);
4746
75
          Safefree(row_names); if (row_hashes) Safefree(row_hashes);
4747
75
          croak("lm: invalid formula, missing '~'");
4748
75
        }
4749
75
        *tilde = '\0';
4750
75
        lhs = f_cpy;
4751        rhs = tilde + 1;
4752
4753
78
        // Remove intercept-suppression markers from RHS.
4754        // IMPORTANT: skip tokens that appear inside I(...) wrappers so that
4755        // expressions like I(x^-1) are never mistakenly treated as "-1".
4756
54
        {
4757
54
          char *restrict p_idx = rhs;
4758
54
          while (*p_idx) {
4759                   // Skip over I(...) sub-expressions entirely
4760
54
                   if (p_idx[0] == 'I' && p_idx[1] == '(') {
4761                       int depth = 0;
4762
54
                       while (*p_idx) { if (*p_idx == '(') depth++; else if (*p_idx == ')') { depth--; if (depth == 0) { p_idx++; break; } } p_idx++; }
4763
54
                       continue;
4764
135
                   }
4765
81
                   // Match bare -1
4766
0
                   if (p_idx[0] == '-' && p_idx[1] == '1' &&
4767
0
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4768                       has_intercept = false;
4769
81
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4770
81
                       continue; // re-examine same position
4771
3
                   }
4772
3
                   // Match +0
4773
3
                   if (p_idx[0] == '+' && p_idx[1] == '0' &&
4774
3
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4775
3
                       has_intercept = false;
4776
3
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4777
3
                       continue;
4778
3
                   }
4779
3
                   // Match leading 0+
4780
3
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '+') {
4781
3
                       has_intercept = false;
4782
3
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4783                       continue;
4784
78
                   }
4785
78
                   // Match bare 0 (entire rhs)
4786
78
                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '\0') {
4787                       has_intercept = false; p_idx[0] = '\0'; break;
4788
81
                   }
4789                   // Strip redundant +1 (keep intercept, just remove marker)
4790                   if (p_idx[0] == '+' && p_idx[1] == '1' &&
4791                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
4792
192
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
4793
138
                       continue;
4794
258
                   }
4795
138
                   // Strip leading bare 1 or 1+
4796                   if (p_idx == rhs) {
4797
54
                       if (p_idx[0] == '1' && p_idx[1] == '\0') { p_idx[0] = '\0'; break; }
4798                       if (p_idx[0] == '1' && p_idx[1] == '+') { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); continue; }
4799                   }
4800                   p_idx++;
4801          }
4802
192
        }
4803
4804
0
        // Clean up stray `++`, leading `+`, trailing `+`
4805
0
        {
4806
0
          char *restrict p_idx;
4807          while ((p_idx = strstr(rhs, "++")) != NULL)
4808
138
                   memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
4809
51
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
4810          size_t len_rhs = strlen(rhs);
4811
87
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
4812
15
        }
4813
4814
15
        // Expand `.` Operator
4815
141
        char rhs_expanded[2048] = "";
4816
126
        size_t rhs_len = 0;
4817
126
        chunk = strtok(rhs, "+");
4818
126
        while (chunk != NULL) {
4819
243
          if (strcmp(chunk, ".") == 0) {
4820
126
                   AV *cols = get_all_columns(data_hoa, row_hashes, n);
4821
42
                   for (size_t c = 0; c <= (size_t)av_len(cols); c++) {
4822
42
                       SV **col_sv = av_fetch(cols, c, 0);
4823                       if (col_sv && SvOK(*col_sv)) {
4824
126
                           const char *col_name = SvPV_nolen(*col_sv);
4825                           if (strcmp(col_name, lhs) != 0) {
4826                               size_t slen = strlen(col_name);
4827
15
                               if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
4828
42
                                   if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
4829
66
                                   strcat(rhs_expanded, col_name);
4830
39
                                   rhs_len += slen;
4831
42
                               }
4832
27
                           }
4833
0
                       }
4834
0
                   }
4835
0
                   SvREFCNT_dec(cols);
4836          } else {
4837
27
                   size_t slen = strlen(chunk);
4838
27
                   if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
4839
27
                       if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
4840
27
                       strcat(rhs_expanded, chunk);
4841
27
                       rhs_len += slen;
4842
27
                   }
4843
27
          }
4844          chunk = strtok(NULL, "+");
4845
57
        }
4846
4847        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
4848
0
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
4849
0
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
4850
4851        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
4852
4853        if (strlen(rhs_expanded) > 0) {
4854          chunk = strtok(rhs_expanded, "+");
4855
54
          while (chunk != NULL) {
4856
54
                   if (num_terms >= term_cap - 3) {
4857
54
                       term_cap *= 2;
4858                       Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
4859                   }
4860                   char *restrict star = strchr(chunk, '*');
4861                   if (star) {
4862
930
                       *star = '\0';
4863
876
                       char *restrict left = chunk;
4864
876
                       char *restrict right = star + 1;
4865                       char *restrict c_l = strchr(left, '^');
4866
867
                       if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
4867
867
                       char *restrict c_r = strchr(right, '^');
4868
3336
                       if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
4869
2469
                       terms[num_terms++] = savepv(left);
4870
771
                       terms[num_terms++] = savepv(right);
4871
1698
                       size_t inter_len = strlen(left) + strlen(right) + 2;
4872
234
                       terms[num_terms] = (char*)safemalloc(inter_len);
4873
234
                       snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
4874
234
                   } else {
4875
234
                       char *restrict c_chunk = strchr(chunk, '^');
4876
0
                       if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
4877                       terms[num_terms++] = savepv(chunk);
4878
1464
                   }
4879
1464
                   chunk = strtok(NULL, "+");
4880          }
4881        }
4882
4883        for (i = 0; i < num_terms; i++) {
4884
867
          bool found = false;
4885
3336
          for (j = 0; j < num_uniq; j++) { if (strcmp(terms[i], uniq_terms[j]) == 0) { found = true; break; } }
4886
867
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
4887
867
        }
4888
867
        p = num_uniq;
4889
4890
54
        // ========================================================================
4891        // PHASE 3: Categorical Expansion
4892
54
        // ========================================================================
4893
21
        for (j = 0; j < p; j++) {
4894
21
          if (p_exp + 32 >= exp_cap) {
4895
21
                   exp_cap *= 2;
4896
15
                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
4897
15
                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
4898          }
4899
6
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
4900
6
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = false; p_exp++; continue;
4901
6
          }
4902
6
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
4903                   char **restrict levels = NULL;
4904                   unsigned int num_levels = 0, levels_cap = 8;
4905                   Newx(levels, levels_cap, char*);
4906                   for (i = 0; i < n; i++) {
4907                       char *str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
4908
48
                       if (str_val) {
4909
183
                           bool found = false;
4910
534
                           for (l = 0; l < num_levels; l++) { if (strcmp(levels[l], str_val) == 0) { found = true; break; } }
4911
399
                           if (!found) {
4912
7860
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
4913
399
                               levels[num_levels++] = savepv(str_val);
4914                           }
4915
48
                           Safefree(str_val);
4916
183
                       }
4917
135
                   }
4918
2580
                   if (num_levels > 0) {
4919
135
                       for (l1 = 0; l1 < num_levels - 1; l1++)
4920                           for (l2 = l1 + 1; l2 < num_levels; l2++)
4921
48
                               if (strcmp(levels[l1], levels[l2]) > 0) { char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; }
4922
48
                       for (l = 1; l < num_levels; l++) {
4923
48
                           if (p_exp >= exp_cap) {
4924
183
                               exp_cap *= 2;
4925
135
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
4926                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
4927
132
                           }
4928
522
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
4929
132
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
4930                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
4931                           is_dummy[p_exp] = true;
4932                           dummy_base[p_exp]  = savepv(uniq_terms[j]);
4933                           dummy_level[p_exp] = savepv(levels[l]);
4934                           p_exp++;
4935                       }
4936
48
                       for (l = 0; l < num_levels; l++) Safefree(levels[l]);
4937
48
                       Safefree(levels);
4938                   } else {
4939
48
                       Safefree(levels);
4940                       exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
4941                   }
4942
48
          } else {
4943
906
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = false; p_exp++;
4944
48
          }
4945        }
4946
906
        p = p_exp;
4947
858
        Newx(X, n * p, double); Newx(Y, n, double);
4948
3303
        Newx(valid_row_names, n, char*);
4949
4950
858
        // ========================================================================
4951
858
        // PHASE 4: Matrix Construction & Listwise Deletion
4952
858
        // ========================================================================
4953
858
        for (i = 0; i < n; i++) {
4954
858
          double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
4955
858
          if (isnan(y_val)) { Safefree(row_names[i]); continue; }
4956
4957
48
          bool row_ok = true;
4958          double *restrict row_x = (double*)safemalloc(p * sizeof(double));
4959          for (j = 0; j < p; j++) {
4960
48
                   if (strcmp(exp_terms[j], "Intercept") == 0) {
4961                       row_x[j] = 1.0;
4962
48
                   } else if (is_dummy[j]) {
4963
48
                       char *restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
4964
48
                       if (str_val) {
4965                           row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
4966
48
                           Safefree(str_val);
4967
48
                       } else { row_ok = false; break; }
4968
48
                   } else {
4969
48
                       row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
4970
48
                       if (isnan(row_x[j])) { row_ok = false; break; }
4971
48
                   }
4972
0
          }
4973
0
          if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
4974
4975          Y[valid_n] = y_val;
4976
0
          for (j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
4977
0
          valid_row_names[valid_n] = row_names[i];
4978          valid_n++;
4979          Safefree(row_x);
4980
183
        }
4981
135
        Safefree(row_names);
4982
4983
135
        if (valid_n <= p) {
4984
135
          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
4985
3
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
4986
3
          for (j = 0; j < p_exp; j++) {
4987
3
                   Safefree(exp_terms[j]);
4988
3
                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
4989          }
4990
132
          Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
4991
132
          Safefree(X); Safefree(Y); Safefree(valid_row_names);
4992
132
          if (row_hashes) Safefree(row_hashes);
4993
132
          croak("lm: 0 degrees of freedom (too many NAs or parameters > observations)");
4994
132
        }
4995
4996
132
        // ========================================================================
4997        // PHASE 5: OLS Math
4998
135
        // ========================================================================
4999        Newxz(XtX, p * p, double);
5000        for (i = 0; i < p; i++)
5001
48
          for (j = 0; j < p; j++) {
5002
48
                   double sum = 0.0;
5003
48
                   for (k = 0; k < valid_n; k++) sum += X[k * p + i] * X[k * p + j];
5004
48
                   XtX[i * p + j] = sum;
5005
48
          }
5006
48
        Newxz(XtY, p, double);
5007
48
        for (i = 0; i < p; i++) {
5008
48
          double sum = 0.0;
5009
48
          for (k = 0; k < valid_n; k++) sum += X[k * p + i] * Y[k];
5010
48
          XtY[i] = sum;
5011
48
        }
5012
48
        Newx(aliased, p, bool);
5013
48
        final_rank = sweep_matrix_ols(XtX, p, aliased);
5014
48
        Newxz(beta, p, double);
5015
48
        for (i = 0; i < p; i++) {
5016
48
          if (aliased[i]) { beta[i] = NAN; }
5017
48
          else {
5018                   double sum = 0.0;
5019                   for (j = 0; j < p; j++) if (!aliased[j]) sum += XtX[i * p + j] * XtY[j];
5020                   beta[i] = sum;
5021
171
          }
5022
171
        }
5023
5024
135
        // ========================================================================
5025
135
        // PHASE 6: Metrics & Cleanup
5026        // ========================================================================
5027
48
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
5028
48
        summary_hv = newHV(); terms_av = newAV();
5029
5030
48
        df_res = (int)valid_n - final_rank;
5031
5032
48
        // rss / mss accumulated here — rse_sq computed AFTER this loop (not before)
5033        double sum_y = 0.0, mss = 0.0;
5034        for (i = 0; i < valid_n; i++) sum_y += Y[i];
5035        double mean_y = sum_y / (double)valid_n;
5036
5037        for (i = 0; i < valid_n; i++) {
5038          double y_hat = 0.0;
5039          for (j = 0; j < p; j++) if (!aliased[j]) y_hat += X[i * p + j] * beta[j];
5040          double res   = Y[i] - y_hat;
5041          rss          += res * res;
5042          double diff_m = has_intercept ? (y_hat - mean_y) : y_hat;
5043          mss          += diff_m * diff_m;
5044
18
          hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(y_hat), 0);
5045
0
          hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res),   0);
5046
0
          Safefree(valid_row_names[i]);
5047
0
        }
5048
0
        Safefree(valid_row_names);
5049
5050
0
        // Single, authoritative rse_sq calculation
5051        rse_sq = (df_res > 0) ? (rss / (double)df_res) : NAN;
5052
5053        int df_int = has_intercept ? 1 : 0;
5054
18
        double r_squared = 0.0, adj_r_squared = 0.0, f_stat = NAN, f_pvalue = NAN;
5055
0
        int numdf = final_rank - df_int;
5056
5057        if (final_rank != df_int && (mss + rss) > 0.0) {
5058          r_squared     = mss / (mss + rss);
5059          adj_r_squared = 1.0 - (1.0 - r_squared) * ((valid_n - df_int) / (double)df_res);
5060          if (rse_sq > 0.0 && numdf > 0) {
5061
18
                   f_stat   = (mss / (double)numdf) / rse_sq;
5062
18
                   f_pvalue = 1.0 - pf(f_stat, (double)numdf, (double)df_res);
5063
18
          } else if (rse_sq == 0.0) {
5064                   f_stat   = INFINITY;
5065
18
                   f_pvalue = 0.0;
5066
9099
          }
5067
9081
        } else if (final_rank == df_int) {
5068          r_squared = 0.0; adj_r_squared = 0.0;
5069
18
        }
5070
5071        for (j = 0; j < p; j++) {
5072          hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
5073          av_push(terms_av, newSVpv(exp_terms[j], 0));
5074          HV *restrict row_hv = newHV();
5075          if (aliased[j]) {
5076
6
                   hv_store(row_hv, "Estimate",   8,  newSVpv("NaN", 0), 0);
5077                   hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
5078
6
                   hv_store(row_hv, "t value",    7,  newSVpv("NaN", 0), 0);
5079
6
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVpv("NaN", 0), 0);
5080
6
          } else {
5081                   double se    = sqrt(rse_sq * XtX[j * p + j]);
5082                   double t_val = (se > 0.0) ? (beta[j] / se) : (INFINITY * (beta[j] >= 0.0 ? 1.0 : -1.0));
5083
6
                   double p_val = get_t_pvalue(t_val, df_res, "two.sided");
5084
0
                   hv_store(row_hv, "Estimate",   8,  newSVnv(beta[j]), 0);
5085
0
                   hv_store(row_hv, "Std. Error", 10, newSVnv(se),      0);
5086                   hv_store(row_hv, "t value",    7,  newSVnv(t_val),   0);
5087                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVnv(p_val),   0);
5088          }
5089
6
          hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
5090
0
        }
5091
5092        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv),   0);
5093
21
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
5094
15
        hv_store(res_hv, "residuals",      9, newRV_noinc((SV*)resid_hv),  0);
5095
15
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res),             0);
5096        hv_store(res_hv, "rank",           4, newSVuv(final_rank),         0);
5097
15
        hv_store(res_hv, "rss",            3, newSVnv(rss),                0);
5098
9
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv),0);
5099
6
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av),  0);
5100
0
        hv_store(res_hv, "r.squared",      9, newSVnv(r_squared),          0);
5101        hv_store(res_hv, "adj.r.squared", 13, newSVnv(adj_r_squared),      0);
5102        if (!isnan(f_stat)) {
5103
6
          AV *fstat_av = newAV();
5104          av_push(fstat_av, newSVnv(f_stat));
5105
3
          av_push(fstat_av, newSViv(numdf));
5106
3
          av_push(fstat_av, newSViv(df_res));
5107
3
          hv_store(res_hv, "fstatistic", 10, newRV_noinc((SV*)fstat_av), 0);
5108          hv_store(res_hv, "f.pvalue",    8, newSVnv(f_pvalue),          0);
5109
15006
        }
5110
5111        // Deep Cleanup
5112        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5113
19147
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5114
19147
        for (j = 0; j < p_exp; j++) {
5115
19147
          Safefree(exp_terms[j]);
5116
19147
          if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5117        }
5118
15000
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5119        Safefree(X); Safefree(Y); Safefree(XtX); Safefree(XtY);
5120
15000
        Safefree(beta); Safefree(aliased);
5121
15000
        if (row_hashes) Safefree(row_hashes);
5122
5123        RETVAL = newRV_noinc((SV*)res_hv);
5124}
5125OUTPUT:
5126
3
    RETVAL
5127
5128void seq(from, to, by = 1.0)
5129        double from
5130        double to
5131        double by
5132PPCODE:
5133        {
5134                //Handle the zero 'by' case
5135                if (by == 0.0) {
5136
27
                        if (from == to) {
5137                                 EXTEND(SP, 1);
5138                                 mPUSHn(from);
5139                                 XSRETURN(1);
5140
27
                        } else {
5141
27
                                 croak("invalid 'by' argument: cannot be zero when from != to");
5142
27
                        }
5143
27
                }
5144
27
                // Check for wrong direction / infinite loop
5145
27
                if ((from < to && by < 0.0) || (from > to && by > 0.0)) {
5146
27
                        croak("wrong sign in 'by' argument");
5147                }
5148
27
                /* * Calculate number of elements.
5149
27
                * R uses a small epsilon (like 1e-10) to avoid dropping the last
5150
27
                * element due to floating point inaccuracies.
5151
27
                */
5152                double n_elements_d = (to - from) / by;
5153
27
                if (n_elements_d < 0.0) n_elements_d = 0.0;
5154
27
                size_t n_elements = (n_elements_d + 1e-10) + 1;
5155                // Pre-extend the stack to avoid reallocating inside the loop
5156
27
                EXTEND(SP, n_elements);
5157                for (size_t i = 0; i < n_elements; i++) {
5158                        mPUSHn(from + i * by);
5159                }
5160                XSRETURN(n_elements);
5161
27
        }
5162
5163
27
SV* rnorm(...)
5164
27
    CODE:
5165
27
    {
5166
27
        // Auto-seed the PRNG if the Perl script hasn't done so yet
5167
27
        AUTO_SEED_PRNG();
5168
5169
27
        size_t n = 0;
5170
27
        double mean = 0.0, sd = 1.0;
5171
27
        int arg_start = 0;
5172
5173
174
        // Check if the first argument is a simple integer (rnorm(33))
5174
174
        if (items > 0 && SvIOK(ST(0)) && (items == 1 || items % 2 != 0)) {
5175            n = (unsigned int)SvUV(ST(0));
5176
0
            arg_start = 1; // Start parsing named arguments from the second element
5177
0
        }
5178
5179
0
        // --- Parse remaining named arguments from the flat stack ---
5180
0
        if ((items - arg_start) % 2 != 0) {
5181            croak("Usage: rnorm(n), rnorm(n => 10, mean => 0, sd => 1), or rnorm(33, mean => 0)");
5182
0
        }
5183
5184
0
        for (int i = arg_start; i < items; i += 2) {
5185            const char* restrict key = SvPV_nolen(ST(i));
5186
0
            SV* restrict val = ST(i + 1);
5187
5188
0
            if      (strEQ(key, "n"))    n    = (unsigned int)SvUV(val);
5189
0
            else if (strEQ(key, "mean")) mean = SvNV(val);
5190
0
            else if (strEQ(key, "sd"))   sd   = SvNV(val);
5191
0
            else croak("rnorm: unknown argument '%s'", key);
5192
0
        }
5193
5194
0
        if (sd < 0.0) croak("rnorm: standard deviation must be non-negative");
5195
5196
0
        AV *restrict result_av = newAV();
5197        if (n > 0) {
5198
0
            av_extend(result_av, n - 1);
5199
0
            // Generate random normals using the Box-Muller transform
5200            for (size_t i = 0; i < n; ) {
5201
0
                 double u, v, s;
5202
0
                 do {
5203
0
                     // Drand01() hooks into Perl's internal PRNG, respecting Perl's srand()
5204                     u = 2.0 * Drand01() - 1.0;
5205                     v = 2.0 * Drand01() - 1.0;
5206
0
                     s = u * u + v * v;
5207                 } while (s >= 1.0 || s == 0.0);
5208
5209                 double mul = sqrt(-2.0 * log(s) / s);
5210                 // Box-Muller generates two independent values per iteration
5211
27
                 av_store(result_av, i++, newSVnv(mean + sd * u * mul));
5212
333
                 if (i < n) {
5213
27
                     av_store(result_av, i++, newSVnv(mean + sd * v * mul));
5214                 }
5215
27
            }
5216
27
        }
5217
9
        RETVAL = newRV_noinc((SV*)result_av);
5218
3
    }
5219
3
    OUTPUT:
5220      RETVAL
5221
5222
24
SV* aov(data_sv, formula_sv)
5223
24
        SV* data_sv
5224        SV* formula_sv
5225        CODE:
5226
24
        {
5227
24
          const char *restrict formula = SvPV_nolen(formula_sv);
5228
24
          char f_cpy[512];
5229
24
          char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
5230
5231
24
          char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL, **restrict parent_term = NULL;
5232
24
          bool *restrict is_dummy = NULL, *is_interact = NULL;
5233          char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
5234
24
          int *restrict term_map = NULL, *restrict left_idx = NULL, *restrict right_idx = NULL;
5235
24
          unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
5236
24
          size_t n = 0, valid_n = 0, i, j;
5237
24
          bool has_intercept = true;
5238
5239
24
          char **restrict row_names = NULL;
5240
24
          HV **restrict row_hashes = NULL;
5241
24
          HV *restrict data_hoa = NULL;
5242
57
          SV *restrict ref = NULL;
5243
33
          HE *restrict entry;
5244
3
          double **restrict X_mat = NULL;
5245
12
          double *restrict Y = NULL;
5246
5247
9
          char **restrict term_base_level = NULL;  /* reference level for each uniq_term (NULL if not categorical) */
5248
9
          if (!SvROK(data_sv)) croak("aov: data is required and must be a reference");
5249
9
          // ========================================================================
5250
6
          // PHASE 1: Data Extraction
5251
6
          // ========================================================================
5252
6
          ref = SvRV(data_sv);
5253
6
          if (SvTYPE(ref) == SVt_PVHV) {
5254
6
                   HV*restrict hv = (HV*)ref;
5255                   if (hv_iterinit(hv) == 0) croak("aov: Data hash is empty");
5256                   entry = hv_iternext(hv);
5257                   if (entry) {
5258                        SV*restrict val = hv_iterval(hv, entry);
5259
3
                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
5260                            data_hoa = hv;
5261
30
                            n = av_len((AV*)SvRV(val)) + 1;
5262
30
                            Newx(row_names, n, char*);
5263
30
                            for(i = 0; i < n; i++) {
5264
30
                                char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i+1));
5265
30
                                row_names[i] = savepv(buf);
5266                            }
5267                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
5268
33
                            n = hv_iterinit(hv);
5269                            Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
5270                            i = 0;
5271                            while ((entry = hv_iternext(hv))) {
5272
24
                                unsigned int len;
5273
24
                                row_names[i] = savepv(hv_iterkey(entry, &len));
5274
24
                                row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
5275
24
                                i++;
5276
24
                            }
5277
24
                        } else croak("aov: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
5278                   }
5279
24
          } else if (SvTYPE(ref) == SVt_PVAV) {
5280                   AV*restrict av = (AV*)ref;
5281
24
                   n = av_len(av) + 1;
5282
24
                   Newx(row_names, n, char*);
5283
60
                   Newx(row_hashes, n, HV*);
5284
36
                   for (i = 0; i < n; i++) {
5285
0
                        SV**restrict val = av_fetch(av, i, 0);
5286
0
                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
5287                            row_hashes[i] = (HV*)SvRV(*val);
5288
36
                            char buf[32];
5289
36
                            snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5290
3
                            row_names[i] = savepv(buf);
5291
3
                        } else {
5292
3
                            for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
5293
3
                            Safefree(row_names); Safefree(row_hashes);
5294
3
                            croak("aov: Array values must be HashRefs (AoH)");
5295
3
                        }
5296
3
                   }
5297
3
          } else croak("aov: Data must be an Array or Hash reference");
5298
5299
3
          // ========================================================================
5300
3
          // PHASE 2: Formula Parsing & `.` Expansion
5301          // ========================================================================
5302
33
          src = (char*)formula; dst = f_cpy;
5303
33
          while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
5304
33
          *dst = '\0';
5305
5306
36
          tilde = strchr(f_cpy, '~');
5307          if (!tilde) {
5308                   for (i = 0; i < n; i++) Safefree(row_names[i]);
5309                   Safefree(row_names); if (row_hashes) Safefree(row_hashes);
5310
90
                   croak("aov: invalid formula, missing '~'");
5311
66
          }
5312
129
          *tilde = '\0';
5313
63
          lhs = f_cpy;
5314          rhs = tilde + 1;
5315
5316          char *restrict p_idx;
5317
24
          while ((p_idx = strstr(rhs, "-1")) != NULL) { has_intercept = false; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5318          while ((p_idx = strstr(rhs, "+0")) != NULL) { has_intercept = false; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5319          while ((p_idx = strstr(rhs, "0+")) != NULL) { has_intercept = false; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5320          if (rhs[0] == '0' && rhs[1] == '\0')        { has_intercept = false; rhs[0] = '\0'; }
5321          while ((p_idx = strstr(rhs, "+1")) != NULL) { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
5322
87
          if (rhs[0] == '1' && rhs[1] == '\0')        { rhs[0] = '\0'; }
5323
66
          else if (rhs[0] == '1' && rhs[1] == '+')    { memmove(rhs, rhs + 2, strlen(rhs + 2) + 1); }
5324
5325
24
          while ((p_idx = strstr(rhs, "++")) != NULL) memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
5326
24
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
5327
24
          size_t len_rhs = strlen(rhs);
5328
24
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
5329
5330          char rhs_expanded[2048] = "";
5331
66
          size_t rhs_len = 0;
5332
24
          chunk = strtok(rhs, "+");
5333
24
          while (chunk != NULL) {
5334
24
                   if (strcmp(chunk, ".") == 0) {
5335
24
                        AV *restrict cols = get_all_columns(data_hoa, row_hashes, n);
5336
24
                        for (size_t c = 0; c <= av_len(cols); c++) {
5337
24
                            SV **restrict col_sv = av_fetch(cols, c, 0);
5338                            if (col_sv && SvOK(*col_sv)) {
5339                                const char *restrict col_name = SvPV_nolen(*col_sv);
5340
42
                                if (strcmp(col_name, lhs) != 0) {
5341
42
                                    size_t slen = strlen(col_name);
5342                                    if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5343
6
                                        if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5344
6
                                        strcat(rhs_expanded, col_name);
5345
6
                                        rhs_len += slen;
5346                                    }
5347
6
                                }
5348
6
                            }
5349
18
                        }
5350
12
                        SvREFCNT_dec(cols);
5351
12
                   } else {
5352                        size_t slen = strlen(chunk);
5353                        if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5354
6
                            if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5355
3
                            strcat(rhs_expanded, chunk);
5356
3
                            rhs_len += slen;
5357                        }
5358
6
                   }
5359
6
                   chunk = strtok(NULL, "+");
5360
3
          }
5361
5362
0
          // Setup arrays safely
5363
0
          Newx(terms, term_cap, char*);
5364
0
          Newx(uniq_terms, term_cap, char*);
5365
0
          Newx(exp_terms, exp_cap, char*); Newx(parent_term, exp_cap, char*);
5366          Newx(is_dummy, exp_cap, bool); Newx(is_interact, exp_cap, bool);
5367
3
          Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
5368
3
          Newx(term_map, exp_cap, int); Newx(left_idx, exp_cap, int); Newx(right_idx, exp_cap, int);
5369
5370
3
          if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
5371
5372
3
          if (strlen(rhs_expanded) > 0) {
5373
3
                   chunk = strtok(rhs_expanded, "+");
5374
3
                   while (chunk != NULL) {
5375
3
                        if (num_terms >= term_cap - 3) {
5376                            term_cap *= 2;
5377                            Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
5378                        }
5379
3
                        char *restrict star = strchr(chunk, '*');
5380                        if (star) {
5381
36
                            *star = '\0';
5382
9
                            char *restrict left = chunk;
5383
9
                            char *right = star + 1;
5384
9
                            char *restrict c_l = strchr(left, '^');
5385
156
                            if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
5386
147
                            char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
5387
147
                            terms[num_terms++] = savepv(left);
5388
147
                            terms[num_terms++] = savepv(right);
5389
234
                            size_t inter_len = strlen(left) + strlen(right) + 2;
5390
213
                            terms[num_terms] = (char*)safemalloc(inter_len);
5391                            snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
5392
147
                        } else {
5393
21
                            char *restrict c_chunk = strchr(chunk, '^');
5394
21
                            if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
5395                            terms[num_terms++] = savepv(chunk);
5396
147
                        }
5397                        chunk = strtok(NULL, "+");
5398                   }
5399          }
5400
5401
21
          for (i = 0; i < num_terms; i++) {
5402
27
                   bool found = false;
5403
15
                   for (size_t k = 0; k < num_uniq; k++) {
5404
3
                         if (strcmp(terms[i], uniq_terms[k]) == 0) { found = true; break; }
5405                   }
5406                   if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
5407          }
5408
21
          p = num_uniq;
5409
5410
0
          /* ---- NEW: allocate one slot per unique term, zero-initialised so
5411
0
                           non-categorical terms stay NULL                          ---- */
5412
0
          Newxz(term_base_level, num_uniq, char*);
5413
0
          /* ----------------------------------------------------------------------- */
5414
5415          // ========================================================================
5416
12
          // PHASE 3: Categorical & Interaction Expansion
5417
12
          // ========================================================================
5418
12
          for (j = 0; j < p; j++) {
5419
12
                   if (p_exp + 64 >= exp_cap) {
5420
12
                         exp_cap *= 2;
5421
12
                         Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5422
12
                         Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5423
12
                         Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5424
12
                         Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5425                   }
5426
5427
9
                   if (strcmp(uniq_terms[j], "Intercept") == 0) {
5428                         exp_terms[p_exp] = savepv("Intercept");
5429
0
                         parent_term[p_exp] = savepv("Intercept");
5430
0
                         is_dummy[p_exp] = false; is_interact[p_exp] = false;
5431
0
                         term_map[p_exp] = j;
5432
0
                         p_exp++;
5433
0
                         continue;
5434
0
                   }
5435
5436                   char *restrict colon = strchr(uniq_terms[j], ':');
5437
27
                   if (colon) {
5438
27
                         char left[256], right[256];
5439
27
                         strncpy(left, uniq_terms[j], colon - uniq_terms[j]);
5440
27
                         left[colon - uniq_terms[j]] = '\0';
5441
27
                         strcpy(right, colon + 1);
5442
5443                         int *restrict l_indices = (int*)safemalloc(p_exp * sizeof(int)); int l_count = 0;
5444                         int *restrict r_indices = (int*)safemalloc(p_exp * sizeof(int)); int r_count = 0;
5445
21
                         for (size_t e = 0; e < p_exp; e++) {
5446
177
                             if (strcmp(parent_term[e], left) == 0) l_indices[l_count++] = e;
5447
21
                             if (strcmp(parent_term[e], right) == 0) r_indices[r_count++] = e;
5448                         }
5449
5450                         if (l_count == 0 || r_count == 0) {
5451
177
                             Safefree(l_indices); Safefree(r_indices);
5452
156
                             croak("aov: Interaction term '%s' requires its main effects to be explicitly included in the formula", uniq_terms[j]);
5453
156
                         } else {
5454
156
                             for (int li = 0; li < l_count; li++) {
5455
156
                                 for (int ri = 0; ri < r_count; ri++) {
5456
666
                                     if (p_exp >= exp_cap) {
5457
510
                                         exp_cap *= 2;
5458
156
                                         Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5459
354
                                         Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5460
60
                                         Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5461
294
                                         Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5462
174
                                     }
5463
174
                                     size_t t_len = strlen(exp_terms[l_indices[li]]) + strlen(exp_terms[r_indices[ri]]) + 2;
5464
174
                                     exp_terms[p_exp] = (char*)safemalloc(t_len);
5465
174
                                     snprintf(exp_terms[p_exp], t_len, "%s:%s", exp_terms[l_indices[li]], exp_terms[r_indices[ri]]);
5466
0
                                     parent_term[p_exp] = savepv(uniq_terms[j]);
5467                                     is_dummy[p_exp] = false; is_interact[p_exp] = true;
5468
120
                                     left_idx[p_exp] = l_indices[li];
5469
120
                                     right_idx[p_exp] = r_indices[ri];
5470                                     term_map[p_exp] = j;
5471                                     p_exp++;
5472
156
                                 }
5473                             }
5474
156
                         }
5475
666
                         Safefree(l_indices); Safefree(r_indices);
5476
156
                   } else {
5477
156
                         if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
5478
156
                             char **restrict levels = NULL;
5479                             unsigned int num_levels = 0, levels_cap = 8;
5480
21
                             Newx(levels, levels_cap, char*);
5481
21
                             for (i = 0; i < n; i++) {
5482                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
5483
12
                                 if (str_val) {
5484
12
                                     bool found = false;
5485
12
                                     for (size_t l = 0; l < num_levels; l++) {
5486
9
                                         if (strcmp(levels[l], str_val) == 0) { found = true; break; }
5487
9
                                     }
5488                                     if (!found) {
5489
3
                                         if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
5490
3
                                         levels[num_levels++] = savepv(str_val);
5491
3
                                     }
5492
3
                                     Safefree(str_val);
5493
9
                                 }
5494
3
                             }
5495
5496
3
                             if (num_levels > 0) {
5497                                 for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
5498                                     for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
5499                                         if (strcmp(levels[l1], levels[l2]) > 0) {
5500                                             char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
5501
18
                                         }
5502
18
                                     }
5503
18
                                 }
5504
5505                                 /* ---- NEW: record the reference (base) level for this term ---- */
5506
18
                                 term_base_level[j] = savepv(levels[0]);
5507
18
                                 /* --------------------------------------------------------------- */
5508
5509
72
                                 for (size_t l = 1; l < num_levels; l++) {
5510
54
                                     if (p_exp >= exp_cap) {
5511
36
                                         exp_cap *= 2;
5512
33
                                         Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
5513
33
                                         Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
5514
33
                                         Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5515
33
                                         Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
5516                                     }
5517
18
                                     size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
5518
72
                                     exp_terms[p_exp] = (char*)safemalloc(t_len);
5519
54
                                     snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
5520                                     parent_term[p_exp] = savepv(uniq_terms[j]);
5521
18
                                     is_dummy[p_exp] = true; is_interact[p_exp] = false;
5522
117
                                     dummy_base[p_exp] = savepv(uniq_terms[j]);
5523
99
                                     dummy_level[p_exp] = savepv(levels[l]);
5524                                     term_map[p_exp] = j;
5525
18
                                     p_exp++;
5526
18
                                 }
5527                                 for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
5528
18
                                 Safefree(levels);
5529
69
                             } else {
5530
51
                                 Safefree(levels);
5531
33
                                 exp_terms[p_exp] = savepv(uniq_terms[j]);
5532
33
                                 parent_term[p_exp] = savepv(uniq_terms[j]);
5533
33
                                 is_dummy[p_exp] = false; is_interact[p_exp] = false;
5534
33
                                 term_map[p_exp] = j;
5535                                 p_exp++;
5536
33
                             }
5537
33
                         } else {
5538
33
                             exp_terms[p_exp] = savepv(uniq_terms[j]);
5539
63
                             parent_term[p_exp] = savepv(uniq_terms[j]);
5540
30
                             is_dummy[p_exp] = false; is_interact[p_exp] = false;
5541
30
                             term_map[p_exp] = j;
5542
30
                             p_exp++;
5543                         }
5544
3
                   }
5545
3
          }
5546          X_mat = (double**)safemalloc(n * sizeof(double*));
5547
33
          for(i = 0; i < n; i++) X_mat[i] = (double*)safemalloc(p_exp * sizeof(double));
5548          Newx(Y, n, double);
5549          // ========================================================================
5550
18
          // PHASE 4: Matrix Construction & Listwise Deletion
5551
18
          // ========================================================================
5552
18
          for (i = 0; i < n; i++) {
5553
18
                   double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
5554
18
                   if (isnan(y_val)) { Safefree(row_names[i]); continue; }
5555                   bool row_ok = true;
5556                   double *restrict row_x = (double*)safemalloc(p_exp * sizeof(double));
5557
69
                   for (j = 0; j < p_exp; j++) {
5558
69
                         if (strcmp(exp_terms[j], "Intercept") == 0) {
5559
72
                             row_x[j] = 1.0;
5560
54
                         } else if (is_interact[j]) {
5561
54
                             row_x[j] = row_x[left_idx[j]] * row_x[right_idx[j]];
5562                         } else if (is_dummy[j]) {
5563
18
                             char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
5564
18
                             if (str_val) {
5565
18
                                 row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
5566
18
                                 Safefree(str_val);
5567
18
                             } else { row_ok = false; break; }
5568
168
                         } else {
5569
18
                             row_x[j] = evaluate_term(data_hoa, row_hashes, i, parent_term[j]);
5570
18
                             if (isnan(row_x[j])) { row_ok = false; break; }
5571
18
                         }
5572
18
                   }
5573                   if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
5574
5575                   Y[valid_n] = y_val;
5576                   for (j = 0; j < p_exp; j++) X_mat[valid_n][j] = row_x[j];
5577                   valid_n++;
5578                   Safefree(row_x);
5579                   Safefree(row_names[i]);
5580          }
5581          Safefree(row_names);
5582
18
          if (valid_n <= p_exp) {
5583                   // Full Clean Up
5584
15
                   for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5585
15
                   for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5586
15
                   for (j = 0; j < p_exp; j++) {
5587                        Safefree(exp_terms[j]); Safefree(parent_term[j]);
5588                        if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5589
21
                   }
5590
6
                   Safefree(exp_terms); Safefree(parent_term);
5591
6
                   Safefree(is_dummy); Safefree(is_interact);
5592
6
                   Safefree(dummy_base); Safefree(dummy_level);
5593
6
                   Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
5594
0
                   for(i = 0; i < n; i++) Safefree(X_mat[i]);
5595
6
                   Safefree(X_mat); Safefree(Y);
5596
6
                   if (row_hashes) Safefree(row_hashes);
5597                   /* ---- NEW ---- */
5598                   for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); }
5599                   Safefree(term_base_level);
5600
15
                   /* ------------ */
5601
15
                   croak("aov: 0 degrees of freedom (too many NAs or parameters > observations)");
5602
15
          }
5603        // ========================================================================
5604
15
          // PHASE 5: Math & Output Formatting
5605
6
          // ========================================================================
5606
6
          bool *restrict aliased_qr = (bool*)safemalloc(p_exp * sizeof(bool));
5607
6
          size_t *restrict rank_map = (size_t*)safemalloc(p_exp * sizeof(size_t));
5608
6
          apply_householder_aov(X_mat, Y, valid_n, p_exp, aliased_qr, rank_map);
5609
6
          double *restrict term_ss;
5610
6
          int *restrict term_df;
5611
6
          Newxz(term_ss, num_uniq, double);
5612
6
          Newxz(term_df, num_uniq, int);
5613
5614
6
          for (i = 0; i < p_exp; i++) {
5615
6
                   if (strcmp(exp_terms[i], "Intercept") == 0) continue;
5616
6
                   if (aliased_qr[i]) continue;
5617
6
                   int t_idx = term_map[i];
5618
6
                   size_t r_k = rank_map[i];
5619
6
                   term_ss[t_idx] += Y[r_k] * Y[r_k];
5620                   term_df[t_idx] += 1;
5621
0
          }
5622          int rank = 0;
5623
9
          for (i = 0; i < p_exp; i++) {
5624                   if (!aliased_qr[i]) rank++;
5625
9
          }
5626
9
          double rss_prev = 0.0;
5627
9
          for (i = rank; i < valid_n; i++) {
5628
9
                   rss_prev += Y[i] * Y[i];
5629
9
          }
5630
9
          int res_df = valid_n - rank;
5631
9
          double ms_res = (res_df > 0) ? rss_prev / res_df : 0.0;
5632
5633
9
          HV*restrict ret_hash = newHV();
5634
9
          for (j = 0; j < num_uniq; j++) {
5635
9
                   if (strcmp(uniq_terms[j], "Intercept") == 0) continue;
5636
9
                   HV*restrict term_stats = newHV();
5637
9
                   double ss = term_ss[j];
5638
0
                   int df = term_df[j];
5639                   double ms = (df > 0) ? ss / df : 0.0;
5640
5641
9
                   hv_stores(term_stats, "Df", newSViv(df));
5642
9
                   hv_stores(term_stats, "Sum Sq", newSVnv(ss));
5643
9
                   hv_stores(term_stats, "Mean Sq", newSVnv(ms));
5644
9
                   if (ms_res > 0.0 && df > 0) {
5645
9
                         double f_val = ms / ms_res;
5646
9
                         hv_stores(term_stats, "F value", newSVnv(f_val));
5647
9
                         hv_stores(term_stats, "Pr(>F)", newSVnv(1.0 - pf(f_val, (double)df, (double)res_df)));
5648
9
                   } else {
5649
9
                         hv_stores(term_stats, "F value", newSVnv(NAN));
5650
9
                         hv_stores(term_stats, "Pr(>F)", newSVnv(NAN));
5651
9
                   }
5652
9
                   hv_store(ret_hash, uniq_terms[j], strlen(uniq_terms[j]), newRV_noinc((SV*)term_stats), 0);
5653
9
          }
5654
5655
9
          HV*restrict res_stats = newHV();
5656
9
          hv_stores(res_stats, "Df", newSViv(res_df));
5657
9
          hv_stores(res_stats, "Sum Sq", newSVnv(rss_prev));
5658
9
          hv_stores(res_stats, "Mean Sq", newSVnv(ms_res));
5659          hv_stores(ret_hash, "Residuals", newRV_noinc((SV*)res_stats));
5660
5661          /* ---- NEW: group_stats => { mean => {...}, size => {...} } ---- */
5662          {
5663                   AV *restrict all_cols = get_all_columns(data_hoa, row_hashes, n);
5664
15
                   HV *restrict mean_hv  = newHV();
5665                   HV *restrict size_hv  = newHV();
5666
15
                   for (size_t c = 0; c <= (size_t)av_len(all_cols); c++) {
5667                       SV **restrict col_sv = av_fetch(all_cols, c, 0);
5668                       if (!col_sv || !SvOK(*col_sv)) continue;
5669
15
                       const char *restrict col_name = SvPV_nolen(*col_sv);
5670
15
                       double col_sum = 0.0;
5671
15
                       IV     col_count = 0;
5672
15
                       for (i = 0; i < n; i++) {
5673
15
                           double val = evaluate_term(data_hoa, row_hashes, i, col_name);
5674
15
                           if (!isnan(val)) { col_sum += val; col_count++; }
5675
15
                       }
5676
15
                       double col_mean = (col_count > 0) ? col_sum / col_count : NAN;
5677
15
                       hv_store(mean_hv, col_name, strlen(col_name), newSVnv(col_mean), 0);
5678
15
                       hv_store(size_hv, col_name, strlen(col_name), newSViv(col_count), 0);
5679
15
                   }
5680                   SvREFCNT_dec(all_cols);
5681
15
                   HV *restrict gs_hv = newHV();
5682                   hv_stores(gs_hv, "mean", newRV_noinc((SV*)mean_hv));
5683                   hv_stores(gs_hv, "size", newRV_noinc((SV*)size_hv));
5684                   hv_stores(ret_hash, "group_stats", newRV_noinc((SV*)gs_hv));
5685          }
5686          /* -------------------------------------------------------------- */
5687
5688          // Deep Cleanup
5689
21
          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5690
21
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5691
21
          for (j = 0; j < p_exp; j++) {
5692
21
                   Safefree(exp_terms[j]); Safefree(parent_term[j]);
5693
21
                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5694          }
5695
21
          Safefree(exp_terms); Safefree(parent_term);
5696
21
          Safefree(is_dummy); Safefree(is_interact);
5697
21
          Safefree(dummy_base); Safefree(dummy_level);
5698
21
          Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
5699          Safefree(term_ss); Safefree(term_df);
5700
21
          for (i = 0; i < n; i++) Safefree(X_mat[i]);
5701
102
          Safefree(X_mat); Safefree(Y);
5702
81
          Safefree(aliased_qr); Safefree(rank_map);
5703
81
          if (row_hashes) Safefree(row_hashes);
5704          RETVAL = newRV_noinc((SV*)ret_hash);
5705
81
        }
5706
78
OUTPUT:
5707
57
    RETVAL
5708
5709
33
PROTOTYPES: DISABLE
5710
5711
6
SV* fisher_test(...)
5712
0
CODE:
5713
0
{
5714
0
        if (items < 1) croak("fisher_test requires at least a data reference");
5715
5716        SV*restrict data_ref = ST(0);
5717
21
        double conf_level = 0.95;
5718
21
        const char*restrict alternative = "two.sided";
5719
5720
21
        // Parse named arguments
5721
21
        for (unsigned short int i = 1; i < items; i += 2) {
5722                if (i + 1 >= items) croak("fisher_test: odd number of arguments");
5723
21
                const char*restrict key = SvPV_nolen(ST(i));
5724
21
                SV*restrict val = ST(i + 1);
5725
21
                if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) {
5726
21
                        conf_level = SvNV(val);
5727
21
                } else if (strEQ(key, "alternative")) {
5728
21
                        alternative = SvPV_nolen(val);
5729                }
5730
21
        }
5731
5732        if (!SvROK(data_ref)) croak("fisher_test requires a reference to an Array or Hash");
5733        SV*restrict deref = SvRV(data_ref);
5734
21
        size_t a = 0, b = 0, c = 0, d = 0;
5735
21
        // Extract Data
5736
21
        if (SvTYPE(deref) == SVt_PVAV) {
5737
21
          AV*restrict outer = (AV*)deref;
5738
21
          if (av_len(outer) != 1) croak("Outer array must have exactly 2 rows");
5739
21
          SV**restrict row1_ptr = av_fetch(outer, 0, 0);
5740
21
          SV**restrict row2_ptr = av_fetch(outer, 1, 0);
5741
21
          if (row1_ptr && row2_ptr && SvROK(*row1_ptr) && SvROK(*row2_ptr)) {
5742
21
                   AV*restrict row1 = (AV*)SvRV(*row1_ptr);
5743
3
                   AV*restrict row2 = (AV*)SvRV(*row2_ptr);
5744
18
                   SV**restrict a_ptr = av_fetch(row1, 0, 0);
5745
18
                   SV**restrict b_ptr = av_fetch(row1, 1, 0);
5746
18
                   SV**restrict c_ptr = av_fetch(row2, 0, 0);
5747
684
                   SV**restrict d_ptr = av_fetch(row2, 1, 0);
5748
666
                   a = (a_ptr && SvOK(*a_ptr)) ? SvIV(*a_ptr) : 0;
5749
666
                   b = (b_ptr && SvOK(*b_ptr)) ? SvIV(*b_ptr) : 0;
5750
519
                   c = (c_ptr && SvOK(*c_ptr)) ? SvIV(*c_ptr) : 0;
5751                   d = (d_ptr && SvOK(*d_ptr)) ? SvIV(*d_ptr) : 0;
5752
18
          } else {
5753
0
                  croak("Invalid 2D Array structure");
5754
0
          }
5755
0
        } else if (SvTYPE(deref) == SVt_PVHV) {
5756
0
                // Fixed 2D Hash Logic: Sort keys lexically to enforce structured rows/columns
5757
0
                HV*restrict outer = (HV*)deref;
5758
0
                if (hv_iterinit(outer) != 2) croak("Outer hash must have exactly 2 keys");
5759                HE*restrict he1 = hv_iternext(outer);
5760
0
                HE*restrict he2 = hv_iternext(outer);
5761
0
                if (!he1 || !he2) croak("Invalid outer hash");
5762
0
                const char*restrict k1 = SvPV_nolen(hv_iterkeysv(he1));
5763
0
                const char*restrict k2 = SvPV_nolen(hv_iterkeysv(he2));
5764
0
                HE*restrict row1_he = (strcmp(k1, k2) < 0) ? he1 : he2;
5765
0
                HE*restrict row2_he = (strcmp(k1, k2) < 0) ? he2 : he1;
5766
0
                SV*restrict row1_sv = hv_iterval(outer, row1_he);
5767
0
                SV*restrict row2_sv = hv_iterval(outer, row2_he);
5768                if (!SvROK(row1_sv) || SvTYPE(SvRV(row1_sv)) != SVt_PVHV ||
5769
0
                        !SvROK(row2_sv) || SvTYPE(SvRV(row2_sv)) != SVt_PVHV) {
5770
0
                        croak("Inner elements must be hashes");
5771
0
                }
5772
0
                HV*restrict in1 = (HV*)SvRV(row1_sv);
5773
0
                HV*restrict in2 = (HV*)SvRV(row2_sv);
5774
0
                if (hv_iterinit(in1) != 2 || hv_iterinit(in2) != 2) croak("Inner hashes must have exactly 2 keys");
5775
0
                HE*restrict in1_he1 = hv_iternext(in1);
5776                HE*restrict in1_he2 = hv_iternext(in1);
5777
0
                const char*restrict in1_k1 = SvPV_nolen(hv_iterkeysv(in1_he1));
5778                const char*restrict in1_k2 = SvPV_nolen(hv_iterkeysv(in1_he2));
5779
21
                HE*restrict in1_c1 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he1 : in1_he2;
5780
21
                HE*restrict in1_c2 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he2 : in1_he1;
5781
21
                HE*restrict in2_he1 = hv_iternext(in2);
5782
21
                HE*restrict in2_he2 = hv_iternext(in2);
5783
21
                const char*restrict in2_k1 = SvPV_nolen(hv_iterkeysv(in2_he1));
5784
21
                const char*restrict in2_k2 = SvPV_nolen(hv_iterkeysv(in2_he2));
5785
21
                HE*restrict in2_c1 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he1 : in2_he2;
5786
21
                HE*restrict in2_c2 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he2 : in2_he1;
5787
21
                a = (hv_iterval(in1, in1_c1) && SvOK(hv_iterval(in1, in1_c1))) ? SvIV(hv_iterval(in1, in1_c1)) : 0;
5788
21
                b = (hv_iterval(in1, in1_c2) && SvOK(hv_iterval(in1, in1_c2))) ? SvIV(hv_iterval(in1, in1_c2)) : 0;
5789
21
                c = (hv_iterval(in2, in2_c1) && SvOK(hv_iterval(in2, in2_c1))) ? SvIV(hv_iterval(in2, in2_c1)) : 0;
5790
21
                d = (hv_iterval(in2, in2_c2) && SvOK(hv_iterval(in2, in2_c2))) ? SvIV(hv_iterval(in2, in2_c2)) : 0;
5791        } else {
5792          croak("Input must be a 2D Array or 2D Hash");
5793        }
5794
5795        // Perform Calculations via Helpers
5796        double p_val = exact_p_value(a, b, c, d, alternative);
5797        double mle_or, ci_low, ci_high;
5798
8
        calculate_exact_stats(a, b, c, d, conf_level, alternative, &mle_or, &ci_low, &ci_high);
5799
5800        // Construct the Return HashRef purely in C
5801        HV*restrict ret_hash = newHV();
5802        hv_stores(ret_hash, "method", newSVpv("Fisher's Exact Test for Count Data", 0));
5803
8
        hv_stores(ret_hash, "alternative", newSVpv(alternative, 0));
5804
6
        AV*restrict ci_array = newAV();
5805
6
        av_push(ci_array, newSVnv(ci_low));
5806
3
        av_push(ci_array, newSVnv(ci_high));
5807
3
        hv_stores(ret_hash, "conf_int", newRV_noinc((SV*)ci_array));
5808
3
        HV*restrict est_hash = newHV();
5809        hv_stores(ret_hash, "estimate", newRV_noinc((SV*)est_hash));
5810        hv_stores(est_hash, "odds ratio", newSVnv(mle_or));
5811
8
        hv_stores(ret_hash, "p_value", newSVnv(p_val));
5812
5
        // Return the HashRef
5813
3
        RETVAL = newRV_noinc((SV*)ret_hash);
5814
3
}
5815OUTPUT:
5816  RETVAL
5817
5818
4
SV* power_t_test(...)
5819
4
CODE:
5820
4
{
5821
2
    SV* sv_n = NULL;
5822
0
    SV* sv_delta = NULL;
5823
0
    SV* sv_sd = NULL;
5824    SV* sv_sig_level = NULL;
5825    SV* sv_power = NULL;
5826
5827
0
    const char* restrict type = "two.sample";
5828    const char* restrict alternative = "two.sided";
5829    bool strict = false;
5830    double tol = pow(2.2204460492503131e-16, 0.25);
5831
5832
8
    if (items % 2 != 0) croak("Usage: power_t_test(n => 30, delta => 0.5, sd => 1.0, ...)");
5833
8
    for (unsigned short int i = 0; i < items; i += 2) {
5834        const char* restrict key = SvPV_nolen(ST(i));
5835        SV* restrict val = ST(i+1);
5836
5837        if      (strEQ(key, "n"))           sv_n = val;
5838
8
        else if (strEQ(key, "delta"))       sv_delta = val;
5839
3
        else if (strEQ(key, "sd"))          sv_sd = val;
5840
0
        else if (strEQ(key, "sig.level") || strEQ(key, "sig_level")) sv_sig_level = val;
5841
3
        else if (strEQ(key, "power"))       sv_power = val;
5842        else if (strEQ(key, "type"))        type = SvPV_nolen(val);
5843
3
        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
5844
3
        else if (strEQ(key, "strict"))      strict = SvTRUE(val);
5845        else if (strEQ(key, "tol"))         tol = SvNV(val);
5846
12
        else croak("power_t_test: unknown argument '%s'", key);
5847
9
    }
5848
5849
0
    bool is_null_n = (!sv_n || !SvOK(sv_n));
5850
9
    bool is_null_delta = (!sv_delta || !SvOK(sv_delta));
5851    bool is_null_power = (!sv_power || !SvOK(sv_power));
5852
3
    bool is_null_sd = (sv_sd && !SvOK(sv_sd));
5853    bool is_null_sig_level = (sv_sig_level && !SvOK(sv_sig_level));
5854
5855    int missing_count = 0;
5856    if (is_null_n) missing_count++;
5857
3
    if (is_null_delta) missing_count++;
5858
3
    if (is_null_power) missing_count++;
5859
12
    if (is_null_sd) missing_count++;
5860
9
    if (is_null_sig_level) missing_count++;
5861
5862
51
    if (missing_count != 1) {
5863
42
        croak("power_t_test: exactly one of 'n', 'delta', 'sd', 'power', and 'sig_level' must be undef/NULL");
5864
42
    }
5865
5866
42
    double n = is_null_n ? 0.0 : SvNV(sv_n);
5867
42
    double delta = is_null_delta ? 0.0 : SvNV(sv_delta);
5868    double sd = (!sv_sd || is_null_sd) ? 1.0 : SvNV(sv_sd);
5869    double sig_level = (!sv_sig_level || is_null_sig_level) ? 0.05 : SvNV(sv_sig_level);
5870
9
    double power = is_null_power ? 0.0 : SvNV(sv_power);
5871    short int tsample = (strEQ(type, "one.sample") || strEQ(type, "paired")) ? 1 : 2;
5872
3
    short int tside = (strEQ(alternative, "one.sided") || strEQ(alternative, "greater") || strEQ(alternative, "less")) ? 1 : 2;
5873    if (tside == 2 && !is_null_delta) delta = fabs(delta);
5874    if (is_null_power) {
5875        power = p_body(n, delta, sd, sig_level, tsample, tside, strict);
5876    } else if (is_null_n) {
5877        double low = 2.0, high = 1e7;
5878
5
        while (p_body(high, delta, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
5879
0
        while (high - low > tol) {
5880
5
            double mid = low + (high - low) / 2.0;
5881
0
            if (p_body(mid, delta, sd, sig_level, tsample, tside, strict) < power) low = mid;
5882            else high = mid;
5883
5
        }
5884
5
        n = low + (high - low) / 2.0;
5885
5
    } else if (is_null_sd) {
5886
5
        double low = delta * 1e-7, high = delta * 1e7;
5887
5
        while (high - low > tol) {
5888
5
            double mid = low + (high - low) / 2.0;
5889            if (p_body(n, delta, mid, sig_level, tsample, tside, strict) > power) low = mid;
5890
5
            else high = mid;
5891        }
5892        sd = low + (high - low) / 2.0;
5893
5
    } else if (is_null_delta) {
5894
5
        double low = sd * 1e-7, high = sd * 1e7;
5895        while (p_body(n, high, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
5896
75
        while (high - low > tol) {
5897
70
            double mid = low + (high - low) / 2.0;
5898
70
            if (p_body(n, mid, sd, sig_level, tsample, tside, strict) < power) low = mid;
5899
70
            else high = mid;
5900
70
        }
5901
70
        delta = low + (high - low) / 2.0;
5902
70
    } else if (is_null_sig_level) {
5903
70
        double low = 1e-10, high = 1.0 - 1e-10;
5904        while (high - low > tol) {
5905
70
            double mid = low + (high - low) / 2.0;
5906
55
            if (p_body(n, delta, sd, mid, tsample, tside, strict) < power) low = mid;
5907            else high = mid;
5908
15
        }
5909
15
        sig_level = low + (high - low) / 2.0;
5910    }
5911
70
    HV*restrict ret = newHV();
5912
70
    hv_stores(ret, "n", newSVnv(n));
5913
70
    hv_stores(ret, "delta", newSVnv(delta));
5914    hv_stores(ret, "sd", newSVnv(sd));
5915    hv_stores(ret, "sig.level", newSVnv(sig_level));
5916
5
    hv_stores(ret, "power", newSVnv(power));
5917
5
    hv_stores(ret, "alternative", newSVpv(alternative, 0));
5918    const char*restrict m_str = (tsample == 1) ? (strEQ(type, "paired") ? "Paired t test power calculation" : "One-sample t test power calculation") : "Two-sample t test power calculation";
5919    hv_stores(ret, "method", newSVpv(m_str, 0));
5920    const char*restrict n_str = (tsample == 2) ? "n is number in *each* group" : (strEQ(type, "paired") ? "n is number of *pairs*, sd is std.dev. of *differences* within pairs" : "");
5921    if (n_str[0] != '\0') hv_stores(ret, "note", newSVpv(n_str, 0));
5922    RETVAL = newRV_noinc((SV*)ret);
5923
8
}
5924
8
OUTPUT:
5925    RETVAL
5926
5927
8
SV* kruskal_test(...)
5928
8
CODE:
5929{
5930    SV *restrict x_sv = NULL, *restrict g_sv = NULL, *restrict h_sv = NULL;
5931
8
    unsigned int arg_idx = 0;
5932
5933
120
    // 1. Shift positional arguments
5934
112
    //    Accept either: (arrayref, arrayref) or (hashref)
5935
112
    if (arg_idx < items && SvROK(ST(arg_idx))) {
5936
112
        svtype t = SvTYPE(SvRV(ST(arg_idx)));
5937        if (t == SVt_PVAV) {
5938            x_sv = ST(arg_idx++);
5939        } else if (t == SVt_PVHV) {
5940
8
            h_sv = ST(arg_idx++);          /* hash-of-arrays shortcut */
5941
32
        }
5942
24
    }
5943
24
    if (!h_sv && arg_idx < items
5944
24
              && SvROK(ST(arg_idx))
5945              && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
5946
8
        g_sv = ST(arg_idx++);
5947    }
5948
8
    // 2. Parse named arguments (fallback)
5949
8
    for (; arg_idx < items; arg_idx += 2) {
5950
8
        const char *restrict key = SvPV_nolen(ST(arg_idx));
5951
0
        SV         *restrict val = ST(arg_idx + 1);
5952
0
        if      (strEQ(key, "x")) x_sv = val;
5953        else if (strEQ(key, "g")) g_sv = val;
5954
8
        else if (strEQ(key, "h")) h_sv = val;
5955
8
        else croak("kruskal_test: unknown argument '%s'", key);
5956    }
5957    // 3. Mutual-exclusion guard
5958
8
    if (h_sv && (x_sv || g_sv))
5959
8
        croak("kruskal_test: cannot mix 'h' (hash-of-arrays) with 'x'/'g' inputs");
5960
5961
8
    /* ------------------------------------------------------------------ */
5962
8
    /* Shared state filled by whichever input branch runs                 */
5963
8
    /* ------------------------------------------------------------------ */
5964
8
    RankInfo *restrict ri = NULL;
5965    char **restrict group_names = NULL; /* Track names to build group_stats */
5966    size_t valid_n = 0;
5967    size_t k       = 0;
5968
5969    /* ------------------------------------------------------------------ */
5970    /* 4a. Hash-of-arrays input path                                      */
5971    /*     my %x = ( group1 => [...], group2 => [...], ... )              */
5972    /* ------------------------------------------------------------------ */
5973    if (h_sv) {
5974        if (!SvROK(h_sv) || SvTYPE(SvRV(h_sv)) != SVt_PVHV)
5975            croak("kruskal_test: 'h' must be a HASH reference");
5976        HV *restrict h_hv = (HV*)SvRV(h_sv);
5977
5978        // First pass – validate values and tally total elements
5979        size_t total = 0;
5980        hv_iterinit(h_hv);
5981        HE *restrict he;
5982        while ((he = hv_iternext(h_hv))) {
5983            SV *restrict val = HeVAL(he);
5984            if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
5985                croak("kruskal_test: every value in 'h' must be an ARRAY reference");
5986            total += (size_t)(av_len((AV*)SvRV(val)) + 1);
5987        }
5988        if (total < 2) croak("not enough observations");
5989
5990        ri = (RankInfo *)safemalloc(total * sizeof(RankInfo));
5991
5992        size_t num_keys = HvKEYS(h_hv);
5993        group_names = (char **)safecalloc(num_keys, sizeof(char*));
5994
5995        /* Second pass – fill ri[], assigning one group_id per hash key */
5996        size_t group_id = 0;
5997        hv_iterinit(h_hv);
5998        while ((he = hv_iternext(h_hv))) {
5999            STRLEN klen;
6000            const char *key_str = HePV(he, klen);
6001            group_names[group_id] = savepvn(key_str, klen); // Save string key
6002
6003            AV    *restrict av  = (AV*)SvRV(HeVAL(he));
6004            size_t          n_g = (size_t)(av_len(av) + 1);
6005            for (size_t i = 0; i < n_g; i++) {
6006                SV **restrict el = av_fetch(av, i, 0);
6007                if (el && SvOK(*el) && looks_like_number(*el)) {
6008                    ri[valid_n].val = SvNV(*el);
6009                    ri[valid_n].idx = group_id;   /* group identity */
6010                    valid_n++;
6011                }
6012            }
6013            group_id++;
6014        }
6015        k = group_id;   /* number of unique groups = number of hash keys */
6016
6017    /* ------------------------------------------------------------------ */
6018    /* 4b. Original x / g array-pair input path                           */
6019    /* ------------------------------------------------------------------ */
6020    } else {
6021        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6022            croak("kruskal_test: 'x' is a required argument and must be an ARRAY reference");
6023        if (!g_sv || !SvROK(g_sv) || SvTYPE(SvRV(g_sv)) != SVt_PVAV)
6024            croak("kruskal_test: 'g' is a required argument and must be an ARRAY reference");
6025
6026        AV *restrict x_av = (AV*)SvRV(x_sv);
6027        AV *restrict g_av = (AV*)SvRV(g_sv);
6028        size_t nx = (size_t)(av_len(x_av) + 1);
6029        size_t ng = (size_t)(av_len(g_av) + 1);
6030        if (nx != ng) croak("kruskal_test: 'x' and 'g' must have the same length");
6031        if (nx < 2)   croak("not enough observations");
6032
6033        ri = (RankInfo *)safemalloc(nx * sizeof(RankInfo));
6034        group_names = (char **)safecalloc(nx, sizeof(char*)); // Upper bound
6035
6036        // Map string group names → contiguous integer IDs
6037        HV    *restrict group_map    = newHV();
6038        size_t          next_group_id = 0;
6039
6040        for (size_t i = 0; i < nx; i++) {
6041            SV **restrict x_el = av_fetch(x_av, i, 0);
6042            SV **restrict g_el = av_fetch(g_av, i, 0);
6043            if (x_el && SvOK(*x_el) && looks_like_number(*x_el)
6044                     && g_el && SvOK(*g_el)) {
6045                const char *restrict g_str = SvPV_nolen(*g_el);
6046                STRLEN               glen  = strlen(g_str);
6047                SV   **restrict id_sv = hv_fetch(group_map, g_str, glen, 0);
6048                size_t group_id;
6049                if (id_sv) {
6050                    group_id = SvUV(*id_sv);
6051                } else {
6052                    group_id = next_group_id++;
6053                    hv_store(group_map, g_str, glen, newSVuv(group_id), 0);
6054                    group_names[group_id] = savepvn(g_str, glen); // Save string key
6055                }
6056                ri[valid_n].val = SvNV(*x_el);
6057                ri[valid_n].idx = group_id;
6058                valid_n++;
6059            }
6060        }
6061        k = next_group_id;
6062        SvREFCNT_dec(group_map);
6063    }
6064
6065    /* ------------------------------------------------------------------ */
6066    /* 5. Shared post-extraction validation                               */
6067    /* ------------------------------------------------------------------ */
6068    if (valid_n < 2 || k < 2) {
6069        Safefree(ri);
6070        if (group_names) {
6071            for (size_t i = 0; i < k; i++) { if (group_names[i]) Safefree(group_names[i]); }
6072            Safefree(group_names);
6073        }
6074        if (valid_n < 2) croak("not enough observations");
6075        croak("all observations are in the same group");
6076    }
6077
6078    // 6. Ranking and Tie Accumulation (Reusing LikeR Helper)
6079    bool   has_ties = 0;
6080    double tie_adj  = rank_and_count_ties(ri, valid_n, &has_ties);
6081
6082    // 7. Aggregate Sum of Ranks AND Actual Values by Group
6083    double *restrict group_rank_sums = (double *)safecalloc(k, sizeof(double));
6084    double *restrict group_val_sums  = (double *)safecalloc(k, sizeof(double)); // For Mean
6085    size_t *restrict group_counts    = (size_t *)safecalloc(k, sizeof(size_t));
6086
6087    for (size_t i = 0; i < valid_n; i++) {
6088        size_t g_id = ri[i].idx;
6089        group_rank_sums[g_id] += ri[i].rank;
6090        group_val_sums[g_id]  += ri[i].val;
6091        group_counts[g_id]++;
6092    }
6093
6094    // 8. Calculate STATISTIC
6095    double stat_base = 0.0;
6096    for (size_t i = 0; i < k; i++) {
6097        if (group_counts[i] > 0)
6098            stat_base += (group_rank_sums[i] * group_rank_sums[i])
6099                         / (double)group_counts[i];
6100    }
6101
6102    double n_d  = (double)valid_n;
6103    double stat = (12.0 * stat_base / (n_d * (n_d + 1.0))) - 3.0 * (n_d + 1.0);
6104    if (tie_adj > 0.0) {
6105        double tie_denom = 1.0 - (tie_adj / (n_d * n_d * n_d - n_d));
6106        stat /= tie_denom;
6107    }
6108    int    df    = (int)k - 1;
6109    double p_val = get_p_value(stat, df);
6110
6111    // 9. Return structured data exactly like R's htest
6112    HV *restrict res = newHV();
6113    hv_stores(res, "statistic", newSVnv(stat));
6114    hv_stores(res, "parameter", newSViv(df));
6115    hv_stores(res, "p_value",   newSVnv(p_val));
6116    hv_stores(res, "p.value",   newSVnv(p_val));
6117    hv_stores(res, "method",    newSVpv("Kruskal-Wallis rank sum test", 0));
6118
6119    // 10. Build the group_stats hash
6120    HV *group_stats = newHV();
6121    HV *stats_mean  = newHV();
6122    HV *stats_size  = newHV();
6123
6124    for (size_t i = 0; i < k; i++) {
6125        if (group_counts[i] > 0 && group_names[i]) {
6126            double mean = group_val_sums[i] / (double)group_counts[i];
6127            size_t nlen = strlen(group_names[i]);
6128
6129            hv_store(stats_mean, group_names[i], nlen, newSVnv(mean), 0);
6130            hv_store(stats_size, group_names[i], nlen, newSVuv(group_counts[i]), 0);
6131        }
6132        if (group_names[i]) Safefree(group_names[i]); // Clean up name copy
6133    }
6134
6135    // Embed the nested hashes
6136    hv_stores(group_stats, "mean", newRV_noinc((SV*)stats_mean));
6137    hv_stores(group_stats, "size", newRV_noinc((SV*)stats_size));
6138    hv_stores(res, "group_stats",  newRV_noinc((SV*)group_stats));
6139
6140    // Memory Cleanup
6141    Safefree(group_names);
6142    Safefree(group_rank_sums);
6143    Safefree(group_val_sums);
6144    Safefree(group_counts);
6145    Safefree(ri);
6146
6147    RETVAL = newRV_noinc((SV*)res);
6148}
6149OUTPUT:
6150    RETVAL
6151
6152SV* var_test(...)
6153CODE:
6154{
6155        SV* restrict x_sv = NULL;
6156        SV* restrict y_sv = NULL;
6157        double ratio = 1.0, conf_level = 0.95;
6158        const char* restrict alternative = "two.sided";
6159        unsigned int arg_idx = 0;
6160
6161        // 1. Shift positional argument 'x' if it's an array reference
6162        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6163          x_sv = ST(arg_idx);
6164          arg_idx++;
6165        }
6166
6167        // 2. Shift positional argument 'y' if it's an array reference
6168        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6169          y_sv = ST(arg_idx);
6170          arg_idx++;
6171        }
6172
6173        // Ensure the remaining arguments form complete key-value pairs
6174        if ((items - arg_idx) % 2 != 0) {
6175          croak("Usage: var_test(\\@x, \\@y, key => value, ...)");
6176        }
6177
6178        // --- Parse named arguments from the remaining flat stack ---
6179        for (; arg_idx < items; arg_idx += 2) {
6180          const char* restrict key = SvPV_nolen(ST(arg_idx));
6181          SV* restrict val = ST(arg_idx + 1);
6182
6183          if      (strEQ(key, "x"))           x_sv        = val;
6184          else if (strEQ(key, "y"))           y_sv        = val;
6185          else if (strEQ(key, "ratio"))       ratio       = SvNV(val);
6186          else if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) conf_level = SvNV(val);
6187          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
6188          else croak("var_test: unknown argument '%s'", key);
6189        }
6190
6191        // --- Validate required inputs / types ---
6192        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6193          croak("var_test: 'x' is a required argument and must be an ARRAY reference");
6194        if (!y_sv || !SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV)
6195          croak("var_test: 'y' is a required argument and must be an ARRAY reference");
6196
6197        if (ratio <= 0.0 || !isfinite(ratio))
6198          croak("var_test: 'ratio' must be a single positive number");
6199        if (conf_level <= 0.0 || conf_level >= 1.0 || !isfinite(conf_level))
6200          croak("var_test: 'conf.level' must be a single number between 0 and 1");
6201
6202        AV* restrict x_av = (AV*)SvRV(x_sv);
6203        AV* restrict y_av = (AV*)SvRV(y_sv);
6204        size_t nx_raw = av_len(x_av) + 1;
6205        size_t ny_raw = av_len(y_av) + 1;
6206
6207        // --- Computation via Welford's Algorithm (ignoring NaNs) ---
6208        double mean_x = 0.0, M2_x = 0.0;
6209        size_t nx = 0;
6210        for (size_t i = 0; i < nx_raw; i++) {
6211                SV** restrict tv = av_fetch(x_av, i, 0);
6212                if (tv && SvOK(*tv) && looks_like_number(*tv)) {
6213                        double val = SvNV(*tv);
6214                        if (!isnan(val) && isfinite(val)) {
6215                                nx++;
6216                                double delta = val - mean_x;
6217                                mean_x += delta / nx;
6218                                M2_x += delta * (val - mean_x);
6219                        }
6220                }
6221        }
6222
6223        double mean_y = 0.0, M2_y = 0.0;
6224        size_t ny = 0;
6225        for (size_t i = 0; i < ny_raw; i++) {
6226          SV** restrict tv = av_fetch(y_av, i, 0);
6227          if (tv && SvOK(*tv) && looks_like_number(*tv)) {
6228                   double val = SvNV(*tv);
6229                   if (!isnan(val) && isfinite(val)) {
6230                       ny++;
6231                       double delta = val - mean_y;
6232                       mean_y += delta / ny;
6233                       M2_y += delta * (val - mean_y);
6234                   }
6235          }
6236        }
6237
6238        if (nx < 2) croak("not enough 'x' observations");
6239        if (ny < 2) croak("not enough 'y' observations");
6240
6241        double df_x = (double)(nx - 1);
6242        double df_y = (double)(ny - 1);
6243        double var_x = M2_x / df_x;
6244        double var_y = M2_y / df_y;
6245
6246        if (var_y == 0.0) croak("var_test: variance of 'y' is zero (cannot divide by zero)");
6247
6248        // --- Statistics Math ---
6249        double estimate = var_x / var_y;
6250        double statistic = estimate / ratio;
6251
6252        double p_val = pf(statistic, df_x, df_y);
6253        double ci_lower = 0.0, ci_upper = INFINITY;
6254
6255        if (strcmp(alternative, "less") == 0) {
6256          ci_upper = estimate / qf_bisection(1.0 - conf_level, df_x, df_y);
6257        } else if (strcmp(alternative, "greater") == 0) {
6258          p_val = 1.0 - p_val;
6259          ci_lower = estimate / qf_bisection(conf_level, df_x, df_y);
6260        } else {
6261          // two.sided
6262          double p1 = p_val;
6263          double p2 = 1.0 - p_val;
6264          p_val = 2.0 * (p1 < p2 ? p1 : p2);
6265
6266          double beta = (1.0 - conf_level) / 2.0;
6267          ci_lower = estimate / qf_bisection(1.0 - beta, df_x, df_y);
6268          ci_upper = estimate / qf_bisection(beta, df_x, df_y);
6269        }
6270
6271        // --- Pack Results ---
6272        HV* restrict results = newHV();
6273        hv_store(results, "statistic", 9, newSVnv(statistic), 0);
6274
6275        AV* restrict param_av = newAV();
6276        av_push(param_av, newSVnv(df_x));
6277        av_push(param_av, newSVnv(df_y));
6278        hv_store(results, "parameter", 9, newRV_noinc((SV*)param_av), 0);
6279
6280        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
6281
6282        AV* restrict conf_int = newAV();
6283        av_push(conf_int, newSVnv(ci_lower));
6284        av_push(conf_int, newSVnv(ci_upper));
6285        hv_store(results, "conf_int", 8, newRV_noinc((SV*)conf_int), 0);
6286
6287        hv_store(results, "estimate", 8, newSVnv(estimate), 0);
6288        hv_store(results, "null_value", 10, newSVnv(ratio), 0);
6289        hv_store(results, "alternative", 11, newSVpv(alternative, 0), 0);
6290        hv_store(results, "method", 6, newSVpv("F test to compare two variances", 0), 0);
6291
6292        RETVAL = newRV_noinc((SV*)results);
6293}
6294OUTPUT:
6295    RETVAL